diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index 414794dd6a56e..1b5f9dd41eaeb 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -246,6 +246,7 @@ Other enhancements
 - Improved error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`)
 - Improved consistency of error messages when passing an invalid ``win_type`` argument in :ref:`Window methods <window.overview>` (:issue:`15969`)
 - :func:`read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`)
+- :func:`read_csv` now raises a ``ParserWarning`` if the length of the header or the given names does not match the length of the data when ``usecols`` is not specified (:issue:`21768`)
 - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`)
 - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`)
 - Added support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`)
diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py
index 2a86ff13a2edc..f914e0601fb89 100644
--- a/pandas/io/parsers/base_parser.py
+++ b/pandas/io/parsers/base_parser.py
@@ -23,6 +23,7 @@
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas._libs.tslibs import parsing
 from pandas._typing import (
+    ArrayLike,
     DtypeArg,
     FilePathOrBuffer,
     final,
@@ -803,6 +804,29 @@ def _do_date_conversions(self, names, data):

         return names, data

+    def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None:
+        """Checks if the length of the data matches the length of the column names.
+
+        One set of trailing commas is allowed. When self.index_col is not False,
+        a length mismatch raises a ParserError instead of this warning.
+
+        Parameters
+        ----------
+        columns : list of column names
+        data : list of array-likes containing the data column-wise.
+        """
+        if not self.index_col and len(columns) != len(data) and columns:
+            if len(columns) == len(data) - 1 and np.all(
+                (is_object_dtype(data[-1]) and data[-1] == "") | isna(data[-1])
+            ):
+                return
+            warnings.warn(
+                "Length of header or names does not match length of data. This leads "
+                "to a loss of data with index_col=False.",
+                ParserWarning,
+                stacklevel=6,
+            )
+
     def _evaluate_usecols(self, usecols, names):
         """
         Check whether or not the 'usecols' parameter
diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py
index 5c1f8f94a72da..110211125514e 100644
--- a/pandas/io/parsers/c_parser_wrapper.py
+++ b/pandas/io/parsers/c_parser_wrapper.py
@@ -300,6 +300,8 @@ def read(self, nrows=None):

             # columns as list
             alldata = [x[1] for x in data_tups]
+            if self.usecols is None:
+                self._check_data_length(names, alldata)

             data = {k: v for k, (i, v) in zip(names, data_tups)}

diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py
index 670868c6f4261..0d41d2972e799 100644
--- a/pandas/io/parsers/python_parser.py
+++ b/pandas/io/parsers/python_parser.py
@@ -292,6 +292,8 @@ def _exclude_implicit_index(self, alldata):
             offset = len(self.index_col)  # type: ignore[has-type]

         len_alldata = len(alldata)
+        self._check_data_length(names, alldata)
+
         return {
             name: alldata[i + offset] for i, name in enumerate(names) if i < len_alldata
         }, names
diff --git a/pandas/tests/io/parser/common/__init__.py b/pandas/tests/io/parser/common/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py
index ceb770ce72b78..5b7df02603b74 100644
--- a/pandas/tests/io/parser/common/test_chunksize.py
+++ b/pandas/tests/io/parser/common/test_chunksize.py
@@ -143,10 +143,7 @@ def test_read_chunksize_jagged_names(all_parsers):
     parser = all_parsers
     data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)])

-    # error: List item 0 has incompatible type "float"; expected "int"
-    expected = DataFrame(
-        [[0] + [np.nan] * 9] * 7 + [[0] * 10]  # type: ignore[list-item]
-    )
+    expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10])
     with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader:
         result = concat(reader)
     tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py
index 8fa2d7f7b8d65..a1c76e2740dbe 100644
--- a/pandas/tests/io/parser/common/test_common_basic.py
+++ b/pandas/tests/io/parser/common/test_common_basic.py
@@ -15,6 +15,7 @@
 from pandas.errors import (
     EmptyDataError,
     ParserError,
+    ParserWarning,
 )

 from pandas import (
@@ -685,7 +686,8 @@ def test_no_header_two_extra_columns(all_parsers):
     ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
     stream = StringIO("foo,bar,baz,bam,blah")
     parser = all_parsers
-    df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
+    with tm.assert_produces_warning(ParserWarning):
+        df = parser.read_csv(stream, header=None, names=column_names, index_col=False)
     tm.assert_frame_equal(df, ref)


diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py
index b86dc5ef85fc6..16649be5b8a58 100644
--- a/pandas/tests/io/parser/usecols/test_usecols_basic.py
+++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py
@@ -383,7 +383,9 @@ def test_usecols_indices_out_of_bounds(all_parsers, names):
 a,b
 1,2
 """
-    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+    with tm.assert_produces_warning(
+        FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False
+    ):
         result = parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0)
     expected = DataFrame({"a": [1], "b": [None]})
     if names is None and parser.engine == "python":