diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py index c68b2bf064d97..6d476e326213e 100644 --- a/pandas/tests/io/parser/c_parser_only.py +++ b/pandas/tests/io/parser/c_parser_only.py @@ -290,11 +290,11 @@ def test_empty_header_read(count): test_empty_header_read(count) def test_parse_trim_buffers(self): - # This test is part of a bugfix for issue #13703. It attmepts to + # This test is part of a bugfix for issue #13703. It attempts to # to stress the system memory allocator, to cause it to move the # stream buffer and either let the OS reclaim the region, or let # other memory requests of parser otherwise modify the contents - # of memory space, where it was formely located. + # of memory space, where it was formally located. # This test is designed to cause a `segfault` with unpatched # `tokenizer.c`. Sometimes the test fails on `segfault`, other # times it fails due to memory corruption, which causes the @@ -346,7 +346,7 @@ def test_parse_trim_buffers(self): # Generate the expected output: manually create the dataframe # by splitting by comma and repeating the `n_lines` times. - row = tuple(val_ if val_ else float("nan") + row = tuple(val_ if val_ else np.nan for val_ in record_.split(",")) expected = pd.DataFrame([row for _ in range(n_lines)], dtype=object, columns=None, index=None) @@ -359,6 +359,15 @@ def test_parse_trim_buffers(self): # Check for data corruption if there was no segfault tm.assert_frame_equal(result, expected) + # This extra test was added to replicate the fault in gh-5291. + # Force 'utf-8' encoding, so that `_string_convert` would take + # a different execution branch. + chunks_ = self.read_csv(StringIO(csv_data), header=None, + dtype=object, chunksize=chunksize, + encoding='utf_8') + result = pd.concat(chunks_, axis=0, ignore_index=True) + tm.assert_frame_equal(result, expected) + def test_internal_null_byte(self): # see gh-14012 #