Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,5 @@ Performance Improvements

Bug Fixes
~~~~~~~~~

- Bug in ``pd.read_csv()`` in which the ``dtype`` parameter was not being respected for empty data (:issue:`14712`)
25 changes: 17 additions & 8 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
is_float,
is_scalar)
from pandas.core.index import Index, MultiIndex, RangeIndex
from pandas.core.series import Series
from pandas.core.frame import DataFrame
from pandas.core.common import AbstractMethodError
from pandas.core.config import get_option
Expand Down Expand Up @@ -2791,27 +2792,35 @@ def _clean_index_names(columns, index_col):
def _get_empty_meta(columns, index_col, index_names, dtype=None):
columns = list(columns)

if dtype is None:
dtype = {}
# Convert `dtype` to a defaultdict of some kind.
# This will enable us to write `dtype[col_name]`
# without worrying about KeyError issues later on.
if not isinstance(dtype, dict):
# if dtype == None, default will be np.object.
default_dtype = dtype or np.object
dtype = defaultdict(lambda: default_dtype)
else:
if not isinstance(dtype, dict):
dtype = defaultdict(lambda: dtype)
# Save a copy of the dictionary.
_dtype = dtype.copy()
dtype = defaultdict(lambda: np.object)

# Convert column indexes to column names.
dtype = dict((columns[k] if is_integer(k) else k, v)
for k, v in compat.iteritems(dtype))
for k, v in compat.iteritems(_dtype):
col = columns[k] if is_integer(k) else k
dtype[col] = v

if index_col is None or index_col is False:
index = Index([])
else:
index = [np.empty(0, dtype=dtype.get(index_name, np.object))
index = [Series([], dtype=dtype[index_name])
for index_name in index_names]
index = MultiIndex.from_arrays(index, names=index_names)
index_col.sort()
for i, n in enumerate(index_col):
columns.pop(n - i)

col_dict = dict((col_name,
np.empty(0, dtype=dtype.get(col_name, np.object)))
Series([], dtype=dtype[col_name]))
for col_name in columns)

return index, columns, col_dict
Expand Down
46 changes: 46 additions & 0 deletions pandas/io/tests/parser/c_parser_only.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,3 +561,49 @@ def test_internal_null_byte(self):

result = self.read_csv(StringIO(data), names=names)
tm.assert_frame_equal(result, expected)

def test_empty_dtype(self):
    # see gh-14712
    #
    # A header-only input must still honour the `dtype` argument.
    data = 'a,b'

    # A single dtype applies to every column.
    expected = pd.DataFrame(columns=['a', 'b'], dtype=np.float64)
    result = self.read_csv(StringIO(data), header=0, dtype=np.float64)
    tm.assert_frame_equal(result, expected)

    # Review note: the category, datetime and timedelta cases below were
    # added for completeness at the reviewer's request.
    expected = pd.DataFrame({'a': pd.Categorical([]),
                             'b': pd.Categorical([])},
                            index=[])
    result = self.read_csv(StringIO(data), header=0,
                           dtype='category')
    tm.assert_frame_equal(result, expected)

    expected = pd.DataFrame(columns=['a', 'b'], dtype='datetime64[ns]')
    result = self.read_csv(StringIO(data), header=0,
                           dtype='datetime64[ns]')
    tm.assert_frame_equal(result, expected)

    expected = pd.DataFrame({'a': pd.Series([], dtype='timedelta64[ns]'),
                             'b': pd.Series([], dtype='timedelta64[ns]')},
                            index=[])
    result = self.read_csv(StringIO(data), header=0,
                           dtype='timedelta64[ns]')
    tm.assert_frame_equal(result, expected)

    # dtype dict keyed by column name.
    expected = pd.DataFrame(columns=['a', 'b'])
    expected['a'] = expected['a'].astype(np.float64)
    result = self.read_csv(StringIO(data), header=0,
                           dtype={'a': np.float64})
    tm.assert_frame_equal(result, expected)

    # dtype dict keyed by column position.
    expected = pd.DataFrame(columns=['a', 'b'])
    expected['a'] = expected['a'].astype(np.float64)
    result = self.read_csv(StringIO(data), header=0,
                           dtype={0: np.float64})
    tm.assert_frame_equal(result, expected)

    # dtype dict mixing a name key and a position key.
    expected = pd.DataFrame(columns=['a', 'b'])
    expected['a'] = expected['a'].astype(np.int32)
    expected['b'] = expected['b'].astype(np.float64)
    result = self.read_csv(StringIO(data), header=0,
                           dtype={'a': np.int32, 1: np.float64})
    tm.assert_frame_equal(result, expected)