@@ -1545,6 +1545,36 @@ def test_isin_empty_datetimelike(self):
1545
1545
# ----------------------------------------------------------------------
1546
1546
# Row deduplication
1547
1547
1548
@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(self, subset):
    """Check that an unknown column label in ``subset`` raises KeyError.

    The frame only has the columns 'A', 'B' and 'C', so every
    parametrized ``subset`` contains the label 'a', which is not a column.
    """
    # GH 19730
    values = [0, 0, 1]
    frame = pd.DataFrame({'A': values, 'B': values, 'C': values})

    # Both deduplication entry points are expected to reject the subset.
    for dedup in (frame.duplicated, frame.drop_duplicates):
        with pytest.raises(KeyError):
            dedup(subset)
1560
+
1561
@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes(self):
    """Check that ``duplicated`` completes on a very wide DataFrame.

    Only the type and dtype of the result are asserted; the randomly
    generated values themselves are not checked.
    """
    # gh-21524
    # Given the wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    # Transposing turns the 30000 generated rows into 30000 columns.
    df = pd.DataFrame(data).T
    result = df.duplicated()

    # Then duplicated() produces a bool pd.Series as a result
    # and doesn't fail during calculation.
    # Actual values don't matter here, though usually
    # it's all False in this case
    assert isinstance(result, pd.Series)
    # Use np.bool_ rather than the np.bool alias: the alias was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24.
    assert result.dtype == np.bool_
1577
+
1548
1578
def test_drop_duplicates (self ):
1549
1579
df = DataFrame ({'AAA' : ['foo' , 'bar' , 'foo' , 'bar' ,
1550
1580
'foo' , 'bar' , 'bar' , 'foo' ],
@@ -1640,36 +1670,6 @@ def test_drop_duplicates(self):
1640
1670
for keep in ['first' , 'last' , False ]:
1641
1671
assert df .duplicated (keep = keep ).sum () == 0
1642
1672
1643
@pytest.mark.parametrize('subset', ['a', ['a'], ['a', 'B']])
def test_duplicated_with_misspelled_column_name(self, subset):
    """Check that an unknown column label in ``subset`` raises KeyError.

    The frame only has the columns 'A', 'B' and 'C', so every
    parametrized ``subset`` contains the label 'a', which is not a column.
    """
    # GH 19730
    values = [0, 0, 1]
    frame = pd.DataFrame({'A': values, 'B': values, 'C': values})

    # Both deduplication entry points are expected to reject the subset.
    for dedup in (frame.duplicated, frame.drop_duplicates):
        with pytest.raises(KeyError):
            dedup(subset)
1655
-
1656
@pytest.mark.slow
def test_duplicated_do_not_fail_on_wide_dataframes(self):
    """Check that ``duplicated`` completes on a very wide DataFrame.

    Only the type and dtype of the result are asserted; the randomly
    generated values themselves are not checked.
    """
    # gh-21524
    # Given the wide dataframe with a lot of columns
    # with different (important!) values
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    # Transposing turns the 30000 generated rows into 30000 columns.
    df = pd.DataFrame(data).T
    result = df.duplicated()

    # Then duplicated() produces a bool pd.Series as a result
    # and doesn't fail during calculation.
    # Actual values don't matter here, though usually
    # it's all False in this case
    assert isinstance(result, pd.Series)
    # Use np.bool_ rather than the np.bool alias: the alias was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24.
    assert result.dtype == np.bool_
1672
-
1673
1673
def test_drop_duplicates_with_duplicate_column_names (self ):
1674
1674
# GH17836
1675
1675
df = DataFrame ([
0 commit comments