diff --git a/doc/source/io.rst b/doc/source/io.rst index 1d83e06a13567..00e86d971182d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1992,6 +1992,27 @@ indices to be parsed. read_excel('path_to_file.xls', 'Sheet1', parse_cols=[0, 2, 3]) +.. note:: + + It is possible to transform the contents of Excel cells via the ``converters`` + option. For instance, to convert a column to boolean: + + .. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) + + This option handles missing values and treats exceptions in the converters + as missing data. Transformations are applied cell by cell rather than to the + column as a whole, so the array dtype is not guaranteed. For instance, a + column of integers with missing values cannot be transformed to an array + with integer dtype, because NaN is strictly a float. You can manually mask + missing data to recover integer dtype: + + .. code-block:: python + + cfun = lambda x: int(x) if x else -1 + read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) + To write a DataFrame object to a sheet of an Excel file, you can use the ``to_excel`` instance method. The arguments are largely the same as ``to_csv`` described above, the first argument being the name of the excel file, and the diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 424518cbde4f8..2ece91b5dea11 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -83,6 +83,11 @@ def read_excel(io, sheetname=0, **kwds): Rows to skip at the beginning (0-indexed) skip_footer : int, default 0 Rows at the end to skip (0-indexed) + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the Excel cell content, and return the transformed + content. index_col : int, default None Column to use as the row labels of the DataFrame.
Pass None if there is no such column @@ -175,7 +180,7 @@ def __init__(self, io, **kwds): def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None, - convert_float=True, has_index_names=False, **kwds): + convert_float=True, has_index_names=False, converters=None, **kwds): """Read an Excel table into DataFrame Parameters @@ -188,6 +193,9 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, Rows to skip at the beginning (0-indexed) skip_footer : int, default 0 Rows at the end to skip (0-indexed) + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels index_col : int, default None Column to use as the row labels of the DataFrame. Pass None if there is no such column @@ -235,6 +243,7 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, thousands=thousands, chunksize=chunksize, skip_footer=skip_footer, convert_float=convert_float, + converters=converters, **kwds) def _should_parse(self, i, parse_cols): diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8f8e3151d56e6..b23aa017138e1 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -127,7 +127,7 @@ class ParserWarning(Warning): Return TextFileReader object for iteration skipfooter : int, default 0 Number of lines at bottom of file to skip (Unsupported with engine='c') -converters : dict. optional +converters : dict, default None Dict of functions for converting values in certain columns. 
Keys can either be integers or column labels verbose : boolean, default False @@ -983,8 +983,13 @@ def _convert_to_ndarrays(self, dct, na_values, na_fvalues, verbose=False, na_fvalues) coerce_type = True if conv_f is not None: - values = lib.map_infer(values, conv_f) + try: + values = lib.map_infer(values, conv_f) + except ValueError: + mask = lib.ismember(values, na_values).view(np.uint8) + values = lib.map_infer_mask(values, conv_f, mask) coerce_type = False + cvals, na_count = self._convert_types( values, set(col_na_values) | col_na_fvalues, coerce_type) result[c] = cvals @@ -1269,6 +1274,11 @@ def TextParser(*args, **kwds): Row numbers to skip skip_footer : int Number of line at bottom of file to skip + converters : dict, default None + Dict of functions for converting values in certain columns. Keys can + either be integers or column labels, values are functions that take one + input argument, the cell (not column) content, and return the + transformed content. encoding : string, default None Encoding to use for UTF when reading/writing (ex.
'utf-8') squeeze : boolean, default False diff --git a/pandas/io/tests/data/test_converters.xls b/pandas/io/tests/data/test_converters.xls new file mode 100644 index 0000000000000..c0aa9d903adad Binary files /dev/null and b/pandas/io/tests/data/test_converters.xls differ diff --git a/pandas/io/tests/data/test_converters.xlsx b/pandas/io/tests/data/test_converters.xlsx new file mode 100644 index 0000000000000..e21bc5fbf9ee2 Binary files /dev/null and b/pandas/io/tests/data/test_converters.xlsx differ diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 6d3f0b5475298..4f97cef3d46d3 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -399,6 +399,31 @@ def test_reader_special_dtypes(self): convert_float=False) tm.assert_frame_equal(actual, no_convert_float) + # GH8212 - support for converters and missing values + def test_reader_converters(self): + _skip_if_no_xlrd() + + expected = DataFrame.from_items([ + ("IntCol", [1, 2, -3, -1000, 0]), + ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), + ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']), + ("StrCol", ['1', np.nan, '3', '4', '5']), + ]) + + converters = {'IntCol': lambda x: int(x) if x != '' else -1000, + 'FloatCol': lambda x: 10 * x if x else np.nan, + 2: lambda x: 'Found' if x != '' else 'Not found', + 3: lambda x: str(x) if x else '', + } + + xlsx_path = os.path.join(self.dirpath, 'test_converters.xlsx') + xls_path = os.path.join(self.dirpath, 'test_converters.xls') + + # should read in correctly and set types of single cells (not array dtypes) + for path in (xls_path, xlsx_path): + actual = read_excel(path, 'Sheet1', converters=converters) + tm.assert_frame_equal(actual, expected) + def test_reader_seconds(self): # Test reading times with and without milliseconds. GH5945. _skip_if_no_xlrd()