[ArrayManager] TST: get tests running for /tests/frame

jorisvandenbossche · jorisvandenbossche · commit b0d8ff861efe · 2021-02-23T11:26:31.000+01:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -153,6 +153,7 @@ jobs:
       run: |
         source activate pandas-dev
         pytest pandas/tests/frame/methods --array-manager
+        pytest pandas/tests/frame/test_* --array-manager -k "not test_reductions"
         pytest pandas/tests/arithmetic/ --array-manager
         pytest pandas/tests/reshape/merge --array-manager
 
diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -407,11 +407,14 @@ def __len__(self):
 # Indices
 # ----------------------------------------------------------------
 @pytest.fixture
-def multiindex_year_month_day_dataframe_random_data():
+def multiindex_year_month_day_dataframe_random_data(using_array_manager):
     """
     DataFrame with 3 level MultiIndex (year, month, day) covering
     first 100 business days from 2000-01-01 with random data
     """
+    if using_array_manager:
+        # TODO(ArrayManager) groupby
+        pytest.skip("Not yet implemented for ArrayManager")
     tdf = tm.makeTimeDataFrame(100)
     ymd = tdf.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day]).sum()
     # use Int64Index, to make sure things work
diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py
@@ -741,7 +741,13 @@ def insert(self, loc: int, item: Hashable, value, allow_duplicates: bool = False
 
         value = extract_array(value, extract_numpy=True)
         if value.ndim == 2:
-            value = value[0, :]
+            if value.shape[0] == 1:
+                value = value[0, :]
+            else:
+                raise ValueError(
+                    f"expected 1D array, got array with shape {value.shape}"
+                )
+
         # TODO self.arrays can be empty
         # assert len(value) == len(self.arrays[0])
 
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
@@ -7,6 +7,8 @@
 import pytest
 import pytz
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -686,6 +688,7 @@ def test_df_add_2d_array_collike_broadcasts(self):
         result = collike + df
         tm.assert_frame_equal(result, expected)
 
+    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) decide on dtypes
     def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators):
         # GH#23000
         opname = all_arithmetic_operators
@@ -707,6 +710,7 @@ def test_df_arith_2d_array_rowlike_broadcasts(self, all_arithmetic_operators):
         result = getattr(df, opname)(rowlike)
         tm.assert_frame_equal(result, expected)
 
+    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) decide on dtypes
     def test_df_arith_2d_array_collike_broadcasts(self, all_arithmetic_operators):
         # GH#23000
         opname = all_arithmetic_operators
@@ -1351,7 +1355,7 @@ def test_strings_to_numbers_comparisons_raises(self, compare_operators_no_eq_ne)
 
     def test_comparison_protected_from_errstate(self):
         missing_df = tm.makeDataFrame()
-        missing_df.iloc[0]["A"] = np.nan
+        missing_df.loc[missing_df.index[0], "A"] = np.nan
         with np.errstate(invalid="ignore"):
             expected = missing_df.values < 0
         with np.errstate(invalid="raise"):
diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
@@ -9,6 +9,7 @@
 import pytest
 
 from pandas.errors import PerformanceWarning
+import pandas.util._test_decorators as td
 
 import pandas as pd
 from pandas import (
@@ -30,6 +31,11 @@
 # structure
 
 
+# TODO(ArrayManager) check which of those tests need to be rewritten to test the
+# equivalent for ArrayManager
+pytestmark = td.skip_array_manager_invalid_test
+
+
 class TestDataFrameBlockInternals:
     def test_setitem_invalidates_datetime_index_freq(self):
         # GH#24096 altering a datetime64tz column inplace invalidates the
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -18,6 +18,7 @@
 import pytz
 
 from pandas.compat import np_version_under1p19
+import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.common import is_integer_dtype
 from pandas.core.dtypes.dtypes import (
@@ -159,7 +160,10 @@ def test_constructor_cast_failure(self):
         df["foo"] = np.ones((4, 2)).tolist()
 
         # this is not ok
-        msg = "Wrong number of items passed 2, placement implies 1"
+        msg = (
+            "Wrong number of items passed 2, placement implies 1"
+            "|expected 1D array, got array"
+        )
         with pytest.raises(ValueError, match=msg):
             df["test"] = np.ones((4, 2))
 
@@ -174,12 +178,15 @@ def test_constructor_dtype_copy(self):
         new_df["col1"] = 200.0
         assert orig_df["col1"][0] == 1.0
 
-    def test_constructor_dtype_nocast_view(self):
+    def test_constructor_dtype_nocast_view_dataframe(self):
         df = DataFrame([[1, 2]])
         should_be_view = DataFrame(df, dtype=df[0].dtype)
         should_be_view[0][0] = 99
         assert df.values[0, 0] == 99
 
+    @td.skip_array_manager_invalid_test  # TODO(ArrayManager) keep view on 2D array?
+    def test_constructor_dtype_nocast_view_2d_array(self):
+        df = DataFrame([[1, 2]])
         should_be_view = DataFrame(df.values, dtype=df[0].dtype)
         should_be_view[0][0] = 97
         assert df.values[0, 0] == 97
@@ -1931,6 +1938,7 @@ def test_constructor_frame_copy(self, float_frame):
         assert (cop["A"] == 5).all()
         assert not (float_frame["A"] == 5).all()
 
+    @td.skip_array_manager_invalid_test  # TODO(ArrayManager) keep view on 2D array?
     def test_constructor_ndarray_copy(self, float_frame):
         df = DataFrame(float_frame.values)
 
diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -18,6 +20,9 @@ def check(result, expected=None):
 
 
 class TestDataFrameNonuniqueIndexes:
+
+    # TODO(ArrayManager) iset with multiple elements not yet implemented
+    @td.skip_array_manager_not_yet_implemented
     def test_setattr_columns_vs_construct_with_columns(self):
 
         # assignment
@@ -234,7 +239,59 @@ def test_column_dups_dropna(self):
         result = df.dropna(subset=["A", "C"], how="all")
         tm.assert_frame_equal(result, expected)
 
+<<<<<<< HEAD
     def test_dup_columns_comparisons(self):
+=======
+    def test_getitem_boolean_series_with_duplicate_columns(self):
+        # boolean indexing
+        # GH 4879
+        dups = ["A", "A", "C", "D"]
+        df = DataFrame(
+            np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
+        )
+        expected = df[df.C > 6]
+        expected.columns = dups
+        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
+        result = df[df.C > 6]
+        check(result, expected)
+
+    def test_getitem_boolean_frame_with_duplicate_columns(self):
+        dups = ["A", "A", "C", "D"]
+
+        # where
+        df = DataFrame(
+            np.arange(12).reshape(3, 4), columns=["A", "B", "C", "D"], dtype="float64"
+        )
+        # `df > 6` is a DataFrame with the same shape+alignment as df
+        expected = df[df > 6]
+        expected.columns = dups
+        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
+        result = df[df > 6]
+        check(result, expected)
+
+    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) duplicate indices
+    # fix error message
+    def test_getitem_boolean_frame_unaligned_with_duplicate_columns(self):
+        # `df.A > 6` is a DataFrame with a different shape from df
+        dups = ["A", "A", "C", "D"]
+
+        # boolean with the duplicate raises
+        df = DataFrame(np.arange(12).reshape(3, 4), columns=dups, dtype="float64")
+        msg = "cannot reindex from a duplicate axis"
+        with pytest.raises(ValueError, match=msg):
+            df[df.A > 6]
+
+    def test_column_dups_indexing(self):
+
+        # dup aligning operations should work
+        # GH 5185
+        df1 = DataFrame([1, 2, 3, 4, 5], index=[1, 2, 1, 2, 3])
+        df2 = DataFrame([1, 2, 3], index=[1, 2, 3])
+        expected = DataFrame([0, 2, 0, 2, 2], index=[1, 1, 2, 2, 3])
+        result = df1.sub(df2)
+        tm.assert_frame_equal(result, expected)
+
+>>>>>>> ec83091284... [ArrayManager] TST: get tests running for /tests/frame
         # equality
         df1 = DataFrame([[1, 2], [2, np.nan], [3, 4], [4, 4]], columns=["A", "B"])
         df2 = DataFrame([[0, 1], [2, 4], [2, np.nan], [4, 5]], columns=["A", "A"])
@@ -286,7 +343,7 @@ def test_multi_axis_dups(self):
         result = z.loc[["a", "c", "a"]]
         check(result, expected)
 
-    def test_columns_with_dups(self):
+    def test_columns_with_dups(self, using_array_manager):
         # GH 3468 related
 
         # basic
@@ -341,8 +398,9 @@ def test_dups_across_blocks(self):
         )
         df = pd.concat([df_float, df_int, df_bool, df_object, df_dt], axis=1)
 
-        assert len(df._mgr.blknos) == len(df.columns)
-        assert len(df._mgr.blklocs) == len(df.columns)
+        if not using_array_manager:
+            assert len(df._mgr.blknos) == len(df.columns)
+            assert len(df._mgr.blklocs) == len(df.columns)
 
         # testing iloc
         for i in range(len(df.columns)):
diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py
@@ -26,14 +26,16 @@
 
 
 class TestDataFrameReprInfoEtc:
-    def test_repr_bytes_61_lines(self):
+    def test_repr_bytes_61_lines(self, using_array_manager):
         # GH#12857
         lets = list("ACDEFGHIJKLMNOP")
         slen = 50
         nseqs = 1000
         words = [[np.random.choice(lets) for x in range(slen)] for _ in range(nseqs)]
         df = DataFrame(words).astype("U1")
-        assert (df.dtypes == object).all()
+        # TODO(ArrayManager) astype("U1") actually gives this dtype instead of object
+        if not using_array_manager:
+            assert (df.dtypes == object).all()
 
         # smoke tests; at one point this raised with 61 but not 60
         repr(df)
diff --git a/pandas/tests/frame/test_stack_unstack.py b/pandas/tests/frame/test_stack_unstack.py
@@ -5,6 +5,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     DataFrame,
@@ -60,12 +62,13 @@ def test_stack_mixed_level(self):
         expected = expected[["a", "b"]]
         tm.assert_frame_equal(result, expected)
 
-    def test_unstack_not_consolidated(self):
+    def test_unstack_not_consolidated(self, using_array_manager):
         # Gh#34708
         df = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]})
         df2 = df[["x"]]
         df2["y"] = df["y"]
-        assert len(df2._mgr.blocks) == 2
+        if not using_array_manager:
+            assert len(df2._mgr.blocks) == 2
 
         res = df2.unstack()
         expected = df.unstack()
@@ -118,6 +121,8 @@ def test_unstack_fill(self):
         expected = unstacked["w"]
         tm.assert_frame_equal(result, expected)
 
+    # TODO(ArrayManager) iset with multiple elements not yet implemented
+    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) iset
     def test_unstack_fill_frame(self):
 
         # From a dataframe
@@ -747,7 +752,8 @@ def test_unstack_multi_level_rows_and_cols(self):
         expected = df.unstack(["i3"]).unstack(["i2"])
         tm.assert_frame_equal(result, expected)
 
-    def test_unstack_nan_index(self):  # GH7466
+    def test_unstack_nan_index1(self):
+        # GH7466
         def cast(val):
             val_str = "" if val != val else val
             return f"{val_str:1}"
@@ -833,6 +839,7 @@ def verify(df):
                 for col in ["4th", "5th"]:
                     verify(udf[col])
 
+    def test_unstack_nan_index2(self):
         # GH7403
         df = DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)})
         df.iloc[3, 1] = np.NaN
@@ -875,6 +882,7 @@ def verify(df):
         right = DataFrame(vals, columns=cols, index=idx)
         tm.assert_frame_equal(left, right)
 
+    def test_unstack_nan_index3(self, using_array_manager):
         # GH7401
         df = DataFrame(
             {
@@ -896,8 +904,13 @@ def verify(df):
         )
 
         right = DataFrame(vals, columns=cols, index=idx)
+        if using_array_manager:
+            # with ArrayManager preserve dtype where possible
+            cols = right.columns[[1, 2, 3, 5]]
+            right[cols] = right[cols].astype("int64")
         tm.assert_frame_equal(left, right)
 
+    def test_unstack_nan_index4(self):
         # GH4862
         vals = [
             ["Hg", np.nan, np.nan, 680585148],
@@ -938,6 +951,8 @@ def verify(df):
         left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"])
         tm.assert_frame_equal(left.unstack(), right)
 
+    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) MultiIndex bug
+    def test_unstack_nan_index5(self):
         # GH9497 - multiple unstack with nulls
         df = DataFrame(
             {
@@ -1453,6 +1468,7 @@ def test_stack_mixed_dtype(self, multiindex_dataframe_random_data):
         assert result.name is None
         assert stacked["bar"].dtype == np.float_
 
+    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) groupby
     def test_unstack_bug(self):
         df = DataFrame(
             {
@@ -1689,6 +1705,7 @@ def test_unstack_period_frame(self):
 
         tm.assert_frame_equal(result3, expected)
 
+    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) groupby
     def test_stack_multiple_bug(self):
         # bug when some uniques are not present in the data GH#3170
         id_col = ([1] * 3) + ([2] * 3)
@@ -1887,7 +1904,7 @@ def test_unstack_group_index_overflow(self):
         result = s.unstack(4)
         assert result.shape == (500, 2)
 
-    def test_unstack_with_missing_int_cast_to_float(self):
+    def test_unstack_with_missing_int_cast_to_float(self, using_array_manager):
         # https://github.com/pandas-dev/pandas/issues/37115
         df = DataFrame(
             {
@@ -1899,7 +1916,8 @@ def test_unstack_with_missing_int_cast_to_float(self):
 
         # add another int column to get 2 blocks
         df["is_"] = 1
-        assert len(df._mgr.blocks) == 2
+        if not using_array_manager:
+            assert len(df._mgr.blocks) == 2
 
         result = df.unstack("b")
         result[("is_", "ca")] = result[("is_", "ca")].fillna(0)
@@ -1912,6 +1930,10 @@ def test_unstack_with_missing_int_cast_to_float(self):
                 names=[None, "b"],
             ),
         )
+        if using_array_manager:
+            # with ArrayManager preserve dtype where possible
+            expected[("v", "cb")] = expected[("v", "cb")].astype("int64")
+            expected[("is_", "cb")] = expected[("is_", "cb")].astype("int64")
         tm.assert_frame_equal(result, expected)
 
     def test_unstack_with_level_has_nan(self):
diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py
@@ -702,6 +702,7 @@ def test_idxmax_preserves_subclass(self):
         result = df.idxmax()
         assert isinstance(result, tm.SubclassedSeries)
 
+    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) equals
     def test_equals_subclass(self):
         # https://github.com/pandas-dev/pandas/pull/34402
         # allow subclass in both directions