[ArrayManager] TST: run (+fix/skip) pandas/tests/indexing tests (#40325)

jorisvandenbossche · web-flow · commit 8945a4267588 · 2021-03-17T13:02:49.000+01:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -163,10 +163,7 @@ jobs:
         pytest pandas/tests/resample/
         pytest pandas/tests/reshape/merge
         pytest pandas/tests/series/
-
-        # indexing subset (temporary since other tests don't pass yet)
-        pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_astype_assignment_with_dups
-        pytest pandas/tests/indexing/multiindex/test_setitem.py::TestMultiIndexSetItem::test_frame_setitem_multi_column
+        pytest pandas/tests/indexing/
 
         pytest pandas/tests/api/
         pytest pandas/tests/apply/
diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py
@@ -342,6 +342,8 @@ def length_of_indexer(indexer, target=None) -> int:
             # GH#25774
             return indexer.sum()
         return len(indexer)
+    elif isinstance(indexer, range):
+        return (indexer.stop - indexer.start) // indexer.step
     elif not is_list_like_indexer(indexer):
         return 1
     raise AssertionError("cannot find the length of the indexer")
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
@@ -228,6 +228,11 @@ def _verify_integrity(self) -> None:
                     "Passed arrays should be np.ndarray or ExtensionArray instances, "
                     f"got {type(arr)} instead"
                 )
+            if not arr.ndim == 1:
+                raise ValueError(
+                    "Passed arrays should be 1-dimensional, got array with "
+                    f"{arr.ndim} dimensions instead."
+                )
 
     def reduce(
         self: T, func: Callable, ignore_failures: bool = False
@@ -1040,6 +1045,9 @@ def take(self: T, indexer, axis: int = 1, verify: bool = True) -> T:
             else np.asanyarray(indexer, dtype="int64")
         )
 
+        if not indexer.ndim == 1:
+            raise ValueError("indexer should be 1-dimensional")
+
         n = self.shape_proper[axis]
         indexer = maybe_convert_indices(indexer, n, verify=verify)
 
diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 from pandas import (
     DataFrame,
     MultiIndex,
@@ -32,6 +34,7 @@ def test_detect_chained_assignment():
         zed["eyes"]["right"].fillna(value=555, inplace=True)
 
 
+@td.skip_array_manager_invalid_test  # with ArrayManager df.loc[0] is not a view
 def test_cache_updating():
     # 5216
     # make sure that we don't try to set a dead cache
diff --git a/pandas/tests/indexing/multiindex/test_partial.py b/pandas/tests/indexing/multiindex/test_partial.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 from pandas import (
     DataFrame,
     Float64Index,
@@ -114,6 +116,9 @@ def test_getitem_partial_column_select(self):
         with pytest.raises(KeyError, match=r"\('a', 'foo'\)"):
             df.loc[("a", "foo"), :]
 
+    # TODO(ArrayManager) rewrite test to not use .values
+    # exp.loc[2000, 4].values[:] select multiple columns -> .values is not a view
+    @td.skip_array_manager_invalid_test
     def test_partial_set(self, multiindex_year_month_day_dataframe_random_data):
         # GH #397
         ymd = multiindex_year_month_day_dataframe_random_data
diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -119,6 +121,9 @@ def test_setitem_multiindex3(self):
             expected=copy,
         )
 
+    # TODO(ArrayManager) df.loc["bar"] *= 2 doesn't raise an error but results in
+    # all NaNs -> doesn't work in the "split" path (also for BlockManager actually)
+    @td.skip_array_manager_not_yet_implemented
     def test_multiindex_setitem(self):
 
         # GH 3738
@@ -457,6 +462,8 @@ def test_setitem_new_column_all_na(self):
         assert df["new"].isna().all()
 
 
+@td.skip_array_manager_invalid_test  # df["foo"] select multiple columns -> .values
+# is not a view
 def test_frame_setitem_view_direct(multiindex_dataframe_random_data):
     # this works because we are modifying the underlying array
     # really a no-no
diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -169,7 +171,7 @@ def test_detect_chained_assignment(self):
         tm.assert_frame_equal(df, expected)
 
     @pytest.mark.arm_slow
-    def test_detect_chained_assignment_raises(self):
+    def test_detect_chained_assignment_raises(self, using_array_manager):
 
         # test with the chaining
         df = DataFrame(
@@ -180,13 +182,23 @@ def test_detect_chained_assignment_raises(self):
         )
         assert df._is_copy is None
 
-        with pytest.raises(com.SettingWithCopyError, match=msg):
-            df["A"][0] = -5
+        if not using_array_manager:
+            with pytest.raises(com.SettingWithCopyError, match=msg):
+                df["A"][0] = -5
 
-        with pytest.raises(com.SettingWithCopyError, match=msg):
-            df["A"][1] = np.nan
+            with pytest.raises(com.SettingWithCopyError, match=msg):
+                df["A"][1] = np.nan
+
+            assert df["A"]._is_copy is None
 
-        assert df["A"]._is_copy is None
+        else:
+            # INFO(ArrayManager) for ArrayManager it doesn't matter that it's
+            # a mixed dataframe
+            df["A"][0] = -5
+            df["A"][1] = -6
+            expected = DataFrame([[-5, 2], [-6, 3]], columns=list("AB"))
+            expected["B"] = expected["B"].astype("float64")
+            tm.assert_frame_equal(df, expected)
 
     @pytest.mark.arm_slow
     def test_detect_chained_assignment_fails(self):
@@ -219,18 +231,24 @@ def test_detect_chained_assignment_doc_example(self):
             df[indexer]["c"] = 42
 
     @pytest.mark.arm_slow
-    def test_detect_chained_assignment_object_dtype(self):
+    def test_detect_chained_assignment_object_dtype(self, using_array_manager):
 
         expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]})
         df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})
 
-        with pytest.raises(com.SettingWithCopyError, match=msg):
-            df["A"][0] = 111
-
         with pytest.raises(com.SettingWithCopyError, match=msg):
             df.loc[0]["A"] = 111
 
-        df.loc[0, "A"] = 111
+        if not using_array_manager:
+            with pytest.raises(com.SettingWithCopyError, match=msg):
+                df["A"][0] = 111
+
+            df.loc[0, "A"] = 111
+        else:
+            # INFO(ArrayManager) for ArrayManager it doesn't matter that it's
+            # a mixed dataframe
+            df["A"][0] = 111
+
         tm.assert_frame_equal(df, expected)
 
     @pytest.mark.arm_slow
@@ -347,7 +365,7 @@ def test_detect_chained_assignment_undefined_column(self):
             df.iloc[0:5]["group"] = "a"
 
     @pytest.mark.arm_slow
-    def test_detect_chained_assignment_changing_dtype(self):
+    def test_detect_chained_assignment_changing_dtype(self, using_array_manager):
 
         # Mixed type setting but same dtype & changing dtype
         df = DataFrame(
@@ -365,8 +383,14 @@ def test_detect_chained_assignment_changing_dtype(self):
         with pytest.raises(com.SettingWithCopyError, match=msg):
             df.loc[2]["C"] = "foo"
 
-        with pytest.raises(com.SettingWithCopyError, match=msg):
+        if not using_array_manager:
+            with pytest.raises(com.SettingWithCopyError, match=msg):
+                df["C"][2] = "foo"
+        else:
+            # INFO(ArrayManager) for ArrayManager it doesn't matter if it's
+            # changing the dtype or not
             df["C"][2] = "foo"
+            assert df.loc[2, "C"] == "foo"
 
     def test_setting_with_copy_bug(self):
 
@@ -411,6 +435,8 @@ def test_detect_chained_assignment_warnings_filter_and_dupe_cols(self):
             )
             tm.assert_frame_equal(df, expected)
 
+    # TODO(ArrayManager) fast_xs with array-like scalars is not yet working
+    @td.skip_array_manager_not_yet_implemented
     def test_chained_getitem_with_lists(self):
 
         # GH6394
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
@@ -10,6 +10,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 from pandas import (
     Categorical,
     CategoricalDtype,
@@ -63,26 +65,30 @@ class TestiLocBaseIndependent:
         ],
     )
     @pytest.mark.parametrize("indexer", [tm.loc, tm.iloc])
-    def test_iloc_setitem_fullcol_categorical(self, indexer, key):
+    def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manager):
         frame = DataFrame({0: range(3)}, dtype=object)
 
         cat = Categorical(["alpha", "beta", "gamma"])
 
-        assert frame._mgr.blocks[0]._can_hold_element(cat)
+        if not using_array_manager:
+            assert frame._mgr.blocks[0]._can_hold_element(cat)
 
         df = frame.copy()
         orig_vals = df.values
         indexer(df)[key, 0] = cat
 
         overwrite = isinstance(key, slice) and key == slice(None)
 
-        if overwrite:
+        if overwrite or using_array_manager:
+            # TODO(ArrayManager) we always overwrite because ArrayManager takes
+            #  the "split" path, which still overwrites
             # TODO: GH#39986 this probably shouldn't behave differently
             expected = DataFrame({0: cat})
             assert not np.shares_memory(df.values, orig_vals)
         else:
             expected = DataFrame({0: cat}).astype(object)
-            assert np.shares_memory(df.values, orig_vals)
+            if not using_array_manager:
+                assert np.shares_memory(df[0].values, orig_vals)
 
         tm.assert_frame_equal(df, expected)
 
@@ -93,13 +99,27 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key):
         else:
             assert cat[0] != "gamma"
 
+        # TODO with mixed dataframe ("split" path), we always overwrite the column
+        frame = DataFrame({0: np.array([0, 1, 2], dtype=object), 1: range(3)})
+        df = frame.copy()
+        orig_vals = df.values
+        indexer(df)[key, 0] = cat
+        expected = DataFrame({0: cat, 1: range(3)})
+        tm.assert_frame_equal(df, expected)
+
+    # TODO(ArrayManager) does not yet update parent
+    @td.skip_array_manager_not_yet_implemented
     @pytest.mark.parametrize("box", [array, Series])
-    def test_iloc_setitem_ea_inplace(self, frame_or_series, box):
+    def test_iloc_setitem_ea_inplace(self, frame_or_series, box, using_array_manager):
         # GH#38952 Case with not setting a full column
         #  IntegerArray without NAs
         arr = array([1, 2, 3, 4])
         obj = frame_or_series(arr.to_numpy("i8"))
-        values = obj.values
+
+        if frame_or_series is Series or not using_array_manager:
+            values = obj.values
+        else:
+            values = obj[0].values
 
         obj.iloc[:2] = box(arr[2:])
         expected = frame_or_series(np.array([3, 4, 3, 4], dtype="i8"))
@@ -109,7 +129,10 @@ def test_iloc_setitem_ea_inplace(self, frame_or_series, box):
         if frame_or_series is Series:
             assert obj.values is values
         else:
-            assert obj.values.base is values.base and values.base is not None
+            if using_array_manager:
+                assert obj[0].values is values
+            else:
+                assert obj.values.base is values.base and values.base is not None
 
     def test_is_scalar_access(self):
         # GH#32085 index with duplicates doesn't matter for _is_scalar_access
@@ -481,13 +504,16 @@ def test_iloc_setitem_dups(self):
         df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True)
         tm.assert_frame_equal(df, expected)
 
-    def test_iloc_setitem_frame_duplicate_columns_multiple_blocks(self):
+    def test_iloc_setitem_frame_duplicate_columns_multiple_blocks(
+        self, using_array_manager
+    ):
         # Same as the "assign back to self" check in test_iloc_setitem_dups
         #  but on a DataFrame with multiple blocks
         df = DataFrame([[0, 1], [2, 3]], columns=["B", "B"])
 
         df.iloc[:, 0] = df.iloc[:, 0].astype("f8")
-        assert len(df._mgr.blocks) == 2
+        if not using_array_manager:
+            assert len(df._mgr.blocks) == 2
         expected = df.copy()
 
         # assign back to self
@@ -577,7 +603,7 @@ def test_iloc_getitem_labelled_frame(self):
         with pytest.raises(ValueError, match=msg):
             df.iloc["j", "D"]
 
-    def test_iloc_getitem_doc_issue(self):
+    def test_iloc_getitem_doc_issue(self, using_array_manager):
 
         # multi axis slicing issue with single block
         # surfaced in GH 6059
@@ -612,7 +638,8 @@ def test_iloc_getitem_doc_issue(self):
         columns = list(range(0, 8, 2))
         df = DataFrame(arr, index=index, columns=columns)
 
-        df._mgr.blocks[0].mgr_locs
+        if not using_array_manager:
+            df._mgr.blocks[0].mgr_locs
         result = df.iloc[1:5, 2:4]
         str(result)
         result.dtypes
@@ -793,15 +820,20 @@ def test_iloc_empty_list_indexer_is_ok(self):
             df.iloc[[]], df.iloc[:0, :], check_index_type=True, check_column_type=True
         )
 
-    def test_identity_slice_returns_new_object(self):
+    def test_identity_slice_returns_new_object(self, using_array_manager):
         # GH13873
         original_df = DataFrame({"a": [1, 2, 3]})
         sliced_df = original_df.iloc[:]
         assert sliced_df is not original_df
 
         # should be a shallow copy
         original_df["a"] = [4, 4, 4]
-        assert (sliced_df["a"] == 4).all()
+        if using_array_manager:
+            # TODO(ArrayManager) verify it is expected that the original didn't change
+            # setitem is replacing full column, so doesn't update "viewing" dataframe
+            assert not (sliced_df["a"] == 4).all()
+        else:
+            assert (sliced_df["a"] == 4).all()
 
         original_series = Series([1, 2, 3, 4, 5, 6])
         sliced_series = original_series.iloc[:]
@@ -932,6 +964,9 @@ def test_iloc_getitem_readonly_key(self):
         expected = df["data"].loc[[1, 3, 6]]
         tm.assert_series_equal(result, expected)
 
+    # TODO(ArrayManager) setting single item with an iterable doesn't work yet
+    # in the "split" path
+    @td.skip_array_manager_not_yet_implemented
     def test_iloc_assign_series_to_df_cell(self):
         # GH 37593
         df = DataFrame(columns=["a"], index=[0])
@@ -1088,6 +1123,8 @@ def test_iloc_getitem_setitem_fancy_exceptions(self, float_frame):
             # GH#32257 we let numpy do validation, get their exception
             float_frame.iloc[:, :, :] = 1
 
+    # TODO(ArrayManager) "split" path doesn't properly implement DataFrame indexer
+    @td.skip_array_manager_not_yet_implemented
     def test_iloc_frame_indexer(self):
         # GH#39004
         df = DataFrame({"a": [1, 2, 3]})
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py