From e693c3a196a96e65accbd4ec93c93185141baf6d Mon Sep 17 00:00:00 2001 From: Jeff Blackburne Date: Wed, 29 Apr 2015 12:31:06 -0700 Subject: [PATCH] Changed a condition in tokenize_delimited to account for data chunks that start with newline. Changed a condition in tokenize_delim_customterm to account for data chunks that start with terminator. Added a unit test that fails in master and passes in this branch. Moved new unit test in order to test all parser engines. Added GH issue number. Added release note. --- doc/source/whatsnew/v0.16.1.txt | 1 + pandas/io/tests/test_parsers.py | 6 ++++++ pandas/src/parser/tokenizer.c | 4 ++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt index 2ddf77d99d51d..16aab41cc3e88 100755 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -216,6 +216,7 @@ Bug Fixes - Bug ``GroupBy.size`` doesn't attach index name properly if grouped by ``TimeGrouper`` (:issue:`9925`) - Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`) - Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`) +- Bug in C csv parser causing spurious NaNs when data started with newline followed by whitespace. (:issue:`10022`) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 59fb3f14de8d2..7d52c6ad4cb3b 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -2287,6 +2287,12 @@ def test_single_char_leading_whitespace(self): result = self.read_csv(StringIO(data), skipinitialspace=True) tm.assert_frame_equal(result, expected) + def test_chunk_begins_with_newline_whitespace(self): + # GH 10022 + data = '\n hello\nworld\n' + result = self.read_csv(StringIO(data), header=None) + self.assertEqual(len(result), 2) + class TestPythonParser(ParserTests, tm.TestCase): def test_negative_skipfooter_raises(self): diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index e7b5db9c5e361..3be17f17d6afa 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -854,7 +854,7 @@ int tokenize_delimited(parser_t *self, size_t line_limit) --i; } while (i + 1 > self->datapos && *buf != '\n'); - if (i + 1 > self->datapos) // reached a newline rather than the beginning + if (*buf == '\n') // reached a newline rather than the beginning { ++buf; // move pointer to first char after newline ++i; @@ -1172,7 +1172,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit) --i; } while (i + 1 > self->datapos && *buf != self->lineterminator); - if (i + 1 > self->datapos) // reached a newline rather than the beginning + if (*buf == self->lineterminator) // reached a newline rather than the beginning { ++buf; // move pointer to first char after newline ++i;