@@ -393,74 +393,91 @@ def test_groupby_drop_nan_with_multi_index():
393393 tm .assert_frame_equal (result , expected )
394394
395395
396+ # sequence_index enumerates all strings made up of x, y, z of length 4
397+ @pytest .mark .parametrize ("sequence_index" , range (3 ** 4 ))
396398@pytest .mark .parametrize (
397- "values, dtype" ,
399+ "dtype" ,
398400 [
399- ([2 , np .nan , 1 , 2 ], None ),
400- ([2 , np .nan , 1 , 2 ], "UInt8" ),
401- ([2 , np .nan , 1 , 2 ], "Int8" ),
402- ([2 , np .nan , 1 , 2 ], "UInt16" ),
403- ([2 , np .nan , 1 , 2 ], "Int16" ),
404- ([2 , np .nan , 1 , 2 ], "UInt32" ),
405- ([2 , np .nan , 1 , 2 ], "Int32" ),
406- ([2 , np .nan , 1 , 2 ], "UInt64" ),
407- ([2 , np .nan , 1 , 2 ], "Int64" ),
408- ([2 , np .nan , 1 , 2 ], "Float32" ),
409- ([2 , np .nan , 1 , 2 ], "Int64" ),
410- ([2 , np .nan , 1 , 2 ], "Float64" ),
401+ None ,
402+ "UInt8" ,
403+ "Int8" ,
404+ "UInt16" ,
405+ "Int16" ,
406+ "UInt32" ,
407+ "Int32" ,
408+ "UInt64" ,
409+ "Int64" ,
410+ "Float32" ,
411+ "Int64" ,
412+ "Float64" ,
413+ "category" ,
414+ "string" ,
411415 pytest .param (
412- ["y" , None , "x" , "y" ],
413- "category" ,
414- marks = pytest .mark .xfail (
415- reason = "dropna=False not correct for categorical, GH#48645"
416- ),
417- ),
418- (["y" , pd .NA , "x" , "y" ], "string" ),
419- pytest .param (
420- ["y" , pd .NA , "x" , "y" ],
421416 "string[pyarrow]" ,
422417 marks = pytest .mark .skipif (
423418 pa_version_under1p01 , reason = "pyarrow is not installed"
424419 ),
425420 ),
426- (
427- ["2016-01-01" , np .datetime64 ("NaT" ), "2017-01-01" , "2016-01-01" ],
428- "datetime64[ns]" ,
429- ),
430- (
431- [
432- pd .Period ("2012-02-01" , freq = "D" ),
433- pd .NaT ,
434- pd .Period ("2012-01-01" , freq = "D" ),
435- pd .Period ("2012-02-01" , freq = "D" ),
436- ],
437- None ,
438- ),
439- (pd .arrays .SparseArray ([2 , np .nan , 1 , 2 ]), None ),
421+ "datetime64[ns]" ,
422+ "period[d]" ,
423+ "Sparse[float]" ,
440424 ],
441425)
442426@pytest .mark .parametrize ("test_series" , [True , False ])
443- def test_no_sort_keep_na (values , dtype , test_series ):
444- # GH#46584
445- key = pd .Series (values , dtype = dtype )
446- df = pd .DataFrame ({"key" : key , "a" : [1 , 2 , 3 , 4 ]})
427+ def test_no_sort_keep_na (request , sequence_index , dtype , test_series ):
428+ # GH#46584, GH#48794
429+
430+ # Convert sequence_index into a string sequence, e.g. 5 becomes "zyxx"
431+ # This sequence is used for the grouper.
432+ sequence = "" .join (
433+ [{0 : "x" , 1 : "y" , 2 : "z" }[sequence_index // (3 ** k ) % 3 ] for k in range (4 )]
434+ )
435+
436+ if dtype == "category" and "z" in sequence :
437+ # Only xfail when nulls are present
438+ msg = "dropna=False not correct for categorical, GH#48645"
439+ request .node .add_marker (pytest .mark .xfail (reason = msg ))
440+
441+ # Unique values to use for grouper, depends on dtype
442+ if dtype in ("string" , "string[pyarrow]" ):
443+ uniques = {"x" : "x" , "y" : "y" , "z" : pd .NA }
444+ elif dtype in ("datetime64[ns]" , "period[d]" ):
445+ uniques = {"x" : "2016-01-01" , "y" : "2017-01-01" , "z" : pd .NA }
446+ else :
447+ uniques = {"x" : 1 , "y" : 2 , "z" : np .nan }
448+
449+ df = pd .DataFrame (
450+ {
451+ "key" : pd .Series ([uniques [label ] for label in sequence ], dtype = dtype ),
452+ "a" : [0 , 1 , 2 , 3 ],
453+ }
454+ )
447455 gb = df .groupby ("key" , dropna = False , sort = False )
448456 if test_series :
449457 gb = gb ["a" ]
458+ result = gb .sum ()
450459
451- warn = None
452- if isinstance (values , pd .arrays .SparseArray ):
453- warn = FutureWarning
454- msg = "passing a SparseArray to pd.Index will store that array directly"
455- with tm .assert_produces_warning (warn , match = msg ):
456- result = gb .sum ()
457- expected = pd .DataFrame ({"a" : [5 , 2 , 3 ]}, index = key [:- 1 ].rename ("key" ))
460+ # Manually compute the groupby sum; use the labels "x", "y", and "z" to avoid
461+ # issues with hashing np.nan
462+ summed = {}
463+ for idx , label in enumerate (sequence ):
464+ summed [label ] = summed .get (label , 0 ) + idx
465+ if dtype == "category" :
466+ index = pd .CategoricalIndex (
467+ [uniques [e ] for e in summed ],
468+ list ({uniques [k ]: 0 for k in sequence if not pd .isnull (uniques [k ])}),
469+ name = "key" ,
470+ )
471+ elif isinstance (dtype , str ) and dtype .startswith ("Sparse" ):
472+ index = pd .Index (
473+ pd .array ([uniques [label ] for label in summed ], dtype = dtype ), name = "key"
474+ )
475+ else :
476+ index = pd .Index ([uniques [label ] for label in summed ], dtype = dtype , name = "key" )
477+ expected = pd .Series (summed .values (), index = index , name = "a" , dtype = None )
478+ if not test_series :
479+ expected = expected .to_frame ()
458480
459- if test_series :
460- expected = expected ["a" ]
461- if expected .index .is_categorical ():
462- # TODO: Slicing reorders categories?
463- expected .index = expected .index .reorder_categories (["y" , "x" ])
464481 tm .assert_equal (result , expected )
465482
466483
0 commit comments