From 434f1e0728497eb3d460ce2e076af9377d1c7efb Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Mon, 25 Jul 2016 20:00:34 +0300 Subject: [PATCH 01/12] FIX: 'parser_trim_buffers' properly initializes word pointers --- pandas/src/parser/tokenizer.c | 44 ++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 14 deletions(-) diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 6091c79e2b4fc..7562b088efb8c 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1221,20 +1221,7 @@ int parser_trim_buffers(parser_t *self) { size_t new_cap; void *newptr; - /* trim stream */ - new_cap = _next_pow2(self->stream_len) + 1; - TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", - new_cap, self->stream_cap, self->lines_cap)); - if (new_cap < self->stream_cap) { - TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); - newptr = safe_realloc((void*) self->stream, new_cap); - if (newptr == NULL) { - return PARSER_OUT_OF_MEMORY; - } else { - self->stream = newptr; - self->stream_cap = new_cap; - } - } + int i; /* trim words, word_starts */ new_cap = _next_pow2(self->words_len) + 1; @@ -1255,6 +1242,35 @@ int parser_trim_buffers(parser_t *self) { } } + /* trim stream */ + new_cap = _next_pow2(self->stream_len) + 1; + TRACE(("parser_trim_buffers: new_cap = %zu, stream_cap = %zu, lines_cap = %zu\n", + new_cap, self->stream_cap, self->lines_cap)); + if (new_cap < self->stream_cap) { + TRACE(("parser_trim_buffers: new_cap < self->stream_cap, calling safe_realloc\n")); + newptr = safe_realloc((void*) self->stream, new_cap); + if (newptr == NULL) { + return PARSER_OUT_OF_MEMORY; + } else { + // realloc sets errno when moving buffer? 
+ if (self->stream != newptr) { + // uff + /* TRACE(("Moving word pointers\n")) */ + + self->pword_start = newptr + self->word_start; + + for (i = 0; i < self->words_len; ++i) + { + self->words[i] = newptr + self->word_starts[i]; + } + } + + self->stream = newptr; + self->stream_cap = new_cap; + + } + } + /* trim line_start, line_fields */ new_cap = _next_pow2(self->lines) + 1; if (new_cap < self->lines_cap) { From 21207198958738f6f583028c804551da6369ab25 Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Tue, 26 Jul 2016 11:57:32 +0300 Subject: [PATCH 02/12] A memory 'stress' test of parser.pyx to cause corruption or segfault --- pandas/tests/test_parser.py | 42 +++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 pandas/tests/test_parser.py diff --git a/pandas/tests/test_parser.py b/pandas/tests/test_parser.py new file mode 100644 index 0000000000000..1e8139569e96c --- /dev/null +++ b/pandas/tests/test_parser.py @@ -0,0 +1,42 @@ +import os +import subprocess + +import pandas.util.testing as tm + +class TestParser(tm.TestCase): + _multiprocess_can_split_ = True + + def test_parse_trim_buffers(self): + code_ = """\n +import pandas as pd +from cStringIO import StringIO +record_ = 
"9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,ZZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,999,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,ZZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-ZZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,," +csv_data = "\\n".join([record_]*173) + "\\n" +for n_lines in range(82, 90): + iterator_ = pd.read_csv(StringIO(csv_data), header=None, engine="c", + dtype=object, chunksize=n_lines, na_filter=True) + for chunk_ in iterator_: + print n_lines, chunk_.iloc[0, 0], chunk_.iloc[-1, 0] +exit(0) +""" + expected_ = "".join("%d 9999-9 9999-9\n"%(n_lines,) + for n_lines in range(82, 90) + for _ in range((173 + n_lines - 1) // n_lines)) 
+ + # Run the faulty code via ang explicit argumnet to python + proc_ = subprocess.Popen(("python", "-c", code_), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # Wait until the subprocess finishes and then collect the output + stdout_, stderr_ = proc_.communicate() + exit_code = proc_.poll() + + # Check whether a segfault or memory corruption occurred + # self.assertTrue(exit_code == -11 or (exit_code == 0 and stdout_ != expected_)) + + # Check for correct exit code and output + self.assertTrue(exit_code == 0 and stdout_ == expected_, msg="success") + +if __name__ == '__main__': + import nose + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) From 07b4647a28a65a884a596939b3a79fa3b795ebc4 Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Tue, 26 Jul 2016 12:29:34 +0300 Subject: [PATCH 03/12] praser_trim_fix: More stressful test --- pandas/tests/test_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/test_parser.py b/pandas/tests/test_parser.py index 1e8139569e96c..6b3593e82cd73 100644 --- a/pandas/tests/test_parser.py +++ b/pandas/tests/test_parser.py @@ -12,7 +12,7 @@ def test_parse_trim_buffers(self): from cStringIO import StringIO record_ = 
"9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,ZZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,999,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,ZZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-ZZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,," csv_data = "\\n".join([record_]*173) + "\\n" -for n_lines in range(82, 90): +for n_lines in range(60, 90): iterator_ = pd.read_csv(StringIO(csv_data), header=None, engine="c", dtype=object, chunksize=n_lines, na_filter=True) for chunk_ in iterator_: @@ -20,7 +20,7 @@ def test_parse_trim_buffers(self): exit(0) """ expected_ = "".join("%d 9999-9 9999-9\n"%(n_lines,) - for n_lines in range(82, 90) + for n_lines in range(60, 90) 
for _ in range((173 + n_lines - 1) // n_lines)) # Run the faulty code via ang explicit argumnet to python From a831dbb86e65f8aae0b63ffe9e098130fa3b57f3 Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Tue, 26 Jul 2016 13:08:12 +0300 Subject: [PATCH 04/12] Moved 'parser_trim_buffers' test to its proper place --- pandas/io/tests/parser/common.py | 33 +++++++++++++++++++++++++ pandas/tests/test_parser.py | 42 -------------------------------- 2 files changed, 33 insertions(+), 42 deletions(-) delete mode 100644 pandas/tests/test_parser.py diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 11eed79e03267..f8cc3c69fea5f 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -5,6 +5,8 @@ import platform import codecs +import subprocess + import re import sys from datetime import datetime @@ -1491,3 +1493,34 @@ def test_memory_map(self): out = self.read_csv(mmap_file, memory_map=True) tm.assert_frame_equal(out, expected) + + + def test_parse_trim_buffers(self): + code_ = """\n +import pandas as pd +from pandas.compat import StringIO +record_ = 
"9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,ZZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,999,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,ZZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-ZZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,," +csv_data = "\\n".join([record_]*173) + "\\n" +for n_lines in range(57, 90): + iterator_ = pd.read_csv(StringIO(csv_data), header=None, engine="c", + dtype=object, chunksize=n_lines, na_filter=True) + for chunk_ in iterator_: + print n_lines, chunk_.iloc[0, 0], chunk_.iloc[-1, 0] +exit(0) +""" + expected_ = "".join("%d 9999-9 9999-9\n"%(n_lines,) + for n_lines in range(57, 90) + for _ in range((173 + n_lines - 1) // n_lines)) 
+ + # Run the faulty code via ang explicit argumnet to python + proc_ = subprocess.Popen(("python", "-c", code_), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + # Wait until the subprocess finishes and then collect the output + stdout_, stderr_ = proc_.communicate() + exit_code = proc_.poll() + + # Check whether a segfault or memory corruption occurred + # tm.assertTrue(exit_code == -11 or (exit_code == 0 and stdout_ != expected_)) + + # Check for correct exit code and output + tm.assert_equal(exit_code == 0 and stdout_ == expected_, True) diff --git a/pandas/tests/test_parser.py b/pandas/tests/test_parser.py deleted file mode 100644 index 6b3593e82cd73..0000000000000 --- a/pandas/tests/test_parser.py +++ /dev/null @@ -1,42 +0,0 @@ -import os -import subprocess - -import pandas.util.testing as tm - -class TestParser(tm.TestCase): - _multiprocess_can_split_ = True - - def test_parse_trim_buffers(self): - code_ = """\n -import pandas as pd -from cStringIO import StringIO -record_ = "9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,ZZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,999,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,ZZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-ZZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99,,,9.99,9
.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,," -csv_data = "\\n".join([record_]*173) + "\\n" -for n_lines in range(60, 90): - iterator_ = pd.read_csv(StringIO(csv_data), header=None, engine="c", - dtype=object, chunksize=n_lines, na_filter=True) - for chunk_ in iterator_: - print n_lines, chunk_.iloc[0, 0], chunk_.iloc[-1, 0] -exit(0) -""" - expected_ = "".join("%d 9999-9 9999-9\n"%(n_lines,) - for n_lines in range(60, 90) - for _ in range((173 + n_lines - 1) // n_lines)) - - # Run the faulty code via ang explicit argumnet to python - proc_ = subprocess.Popen(("python", "-c", code_), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - # Wait until the subprocess finishes and then collect the output - stdout_, stderr_ = proc_.communicate() - exit_code = proc_.poll() - - # Check whether a segfault or memory corruption occurred - # self.assertTrue(exit_code == -11 or (exit_code == 0 and stdout_ != expected_)) - - # Check for correct exit code and output - self.assertTrue(exit_code == 0 and stdout_ == expected_, msg="success") - -if __name__ == '__main__': - import nose - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) From 5ab36363414ef0542932d7dbc174cdba07d60ed9 Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Tue, 26 Jul 2016 13:40:48 +0300 Subject: [PATCH 05/12] Expanded the explanation of the patch --- pandas/src/parser/tokenizer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 7562b088efb8c..ac909f2c8bfdb 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1252,11 +1252,11 @@ int parser_trim_buffers(parser_t *self) { if (newptr == NULL) { return PARSER_OUT_OF_MEMORY; } else { - // realloc sets errno when moving buffer? + // Update the pointers in the self->words array (char **) if `safe_realloc` + // moved the `self->stream` buffer. This block mirrors a similar block in + // `make_stream_space`. if (self->stream != newptr) { - // uff /* TRACE(("Moving word pointers\n")) */ - self->pword_start = newptr + self->word_start; for (i = 0; i < self->words_len; ++i) From bdba66f2d6c84b504c03a31634aa1e02edb4a71f Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Tue, 26 Jul 2016 14:47:58 +0300 Subject: [PATCH 06/12] Rewritten the 'parser_trim_buffers' test --- pandas/io/tests/parser/common.py | 59 +++++++++++++++++--------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index f8cc3c69fea5f..546a9adf7e9c3 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1496,31 +1496,34 @@ def test_memory_map(self): def test_parse_trim_buffers(self): - code_ = """\n -import pandas as pd -from pandas.compat import StringIO -record_ = 
"9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,ZZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,999,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,ZZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-ZZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,," -csv_data = "\\n".join([record_]*173) + "\\n" -for n_lines in range(57, 90): - iterator_ = pd.read_csv(StringIO(csv_data), header=None, engine="c", - dtype=object, chunksize=n_lines, na_filter=True) - for chunk_ in iterator_: - print n_lines, chunk_.iloc[0, 0], chunk_.iloc[-1, 0] -exit(0) -""" - expected_ = "".join("%d 9999-9 9999-9\n"%(n_lines,) - for n_lines in range(57, 90) - for _ in range((173 + n_lines - 1) // n_lines)) 
- - # Run the faulty code via ang explicit argumnet to python - proc_ = subprocess.Popen(("python", "-c", code_), stdout=subprocess.PIPE, stderr=subprocess.PIPE) - - # Wait until the subprocess finishes and then collect the output - stdout_, stderr_ = proc_.communicate() - exit_code = proc_.poll() - - # Check whether a segfault or memory corruption occurred - # tm.assertTrue(exit_code == -11 or (exit_code == 0 and stdout_ != expected_)) - - # Check for correct exit code and output - tm.assert_equal(exit_code == 0 and stdout_ == expected_, True) + # This test is designed to cause a `segfault` with unpatched `tokenizer.c`, + # Sometimes the test fails on `segfault`, other times it fails due to memory + # corruption, which causes the loaded DataFrame to differ from the expected + # one. + n_lines, chunksizes = 173, range(57, 90) + + # Create the expected output + expected_ = [(chunksize_, "9999-9", "9999-9") + for chunksize_ in chunksizes + for _ in range((n_lines + chunksize_ - 1) // chunksize_)] + expected = pd.DataFrame(expected_, columns=None, index=None) + + # Generate a large mixed-type CSV file on-the-fly (approx 272 KiB) + record_ = 
"9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,ZZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,999,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,ZZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-ZZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,," + csv_data = "\n".join([record_] * n_lines) + "\n" + + output_ = list() + for chunksize_ in chunksizes: + try: + iterator_ = self.read_csv(StringIO(csv_data), header=None, dtype=object, + chunksize=chunksize_, na_filter=True) + except ValueError, e: + # Ignore unsuported dtype=object by engine=python + pass + + for chunk_ in iterator_: + output_.append((chunksize_, chunk_.iloc[0, 0], chunk_.iloc[-1, 0])) + + df = 
pd.DataFrame(output_, columns=None, index=None) + + tm.assert_frame_equal(df, expected) \ No newline at end of file From 020d706259c45c98e04288970ce607f754341d9d Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Tue, 26 Jul 2016 15:00:49 +0300 Subject: [PATCH 07/12] Updated WHATSNEW with the bug fix information --- doc/source/whatsnew/v0.19.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 06625e09d70a1..cc69a293d5945 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -673,6 +673,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``pandas.parser.parser_trim_buffers()``, which did not update the word vectors (``parser->words``) when stream buffer was shrunk (:issue:`13788`, :issue:`13703`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) From e0b4c8320bfd48bdd0ba67fd19c5d2fcd044b3a4 Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Tue, 26 Jul 2016 15:44:26 +0300 Subject: [PATCH 08/12] flake8 style test correction --- pandas/io/tests/parser/common.py | 63 ++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 546a9adf7e9c3..a3497d0df0ff2 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -5,8 +5,6 @@ import platform import codecs -import subprocess - import re import sys from datetime import datetime @@ -1494,36 +1492,71 @@ def test_memory_map(self): out = self.read_csv(mmap_file, memory_map=True) tm.assert_frame_equal(out, expected) - def test_parse_trim_buffers(self): - # This test is designed to cause a `segfault` with unpatched 
`tokenizer.c`, - # Sometimes the test fails on `segfault`, other times it fails due to memory - # corruption, which causes the loaded DataFrame to differ from the expected - # one. + # This test is designed to cause a `segfault` with unpatched + # `tokenizer.c`, Sometimes the test fails on `segfault`, other + # times it fails due to memory corruption, which causes the + # loaded DataFrame to differ from the expected one. n_lines, chunksizes = 173, range(57, 90) # Create the expected output expected_ = [(chunksize_, "9999-9", "9999-9") - for chunksize_ in chunksizes - for _ in range((n_lines + chunksize_ - 1) // chunksize_)] + for chunksize_ in chunksizes + for _ in range((n_lines + chunksize_ - 1) // chunksize_)] expected = pd.DataFrame(expected_, columns=None, index=None) # Generate a large mixed-type CSV file on-the-fly (approx 272 KiB) - record_ = "9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,ZZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,999,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,ZZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-ZZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.
99,9.99,9999,,,,,,,,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,," + record_ = \ + """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.""" \ + """99,ZZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-""" \ + """ZZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-""" \ + """ZZZZ,ZZZ-ZZZZ,999,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-""" \ + """ZZZZZ,ZZZ-ZZZZ,,,9,9,9,9,99,99,999,999,ZZZZZ,ZZZ-""" \ + """ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.99,ZZ-ZZZZ,ZZ-""" \ + """ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.99,999.99,,,""" \ + """ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZZZZ,ZZZ-""" \ + """ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-""" \ + """ZZZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.""" \ + """99,,,,ZZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.""" \ + """99,9,9,9.99,9.99,,,,9.99,9.99,,99,,99,9.99,9.""" \ + """99,,,ZZZ,ZZZ,,999.99,,999.99,ZZZ,ZZZ-ZZZZ,ZZZ-""" \ + """ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,,,,,,ZZZ-""" \ + """ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999,9.""" \ + """999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \ + """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-""" \ + """ZZZZ,,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-""" \ + """ZZZZ,ZZZ-ZZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-""" \ + """ZZZZ,ZZ-ZZZZ,ZZ,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-""" \ + """ZZZZ,,,99.99,99.99,,,9.99,9.99,9.99,9.99,ZZZ-""" \ + """ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-9.99,-9.99,-9.""" \ + """99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9.99,-9.""" \ + """99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,,,,-9""" \ + """.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \ + """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-""" \ + 
"""ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.""" \ + """99,ZZ-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.""" \ + """99,,,ZZ-ZZZZZZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-""" \ + """ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ,9999,999.99,ZZZ-ZZZZ,-9.""" \ + """99,-9.99,ZZZ-ZZZZ,99:99:99,,99,99,,9.99,,-99.""" \ + """99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9.""" \ + """99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,""" csv_data = "\n".join([record_] * n_lines) + "\n" output_ = list() for chunksize_ in chunksizes: try: - iterator_ = self.read_csv(StringIO(csv_data), header=None, dtype=object, - chunksize=chunksize_, na_filter=True) - except ValueError, e: + iterator_ = self.read_csv(StringIO(csv_data), header=None, + dtype=object, chunksize=chunksize_, + na_filter=True) + except ValueError: # Ignore unsuported dtype=object by engine=python pass for chunk_ in iterator_: - output_.append((chunksize_, chunk_.iloc[0, 0], chunk_.iloc[-1, 0])) + output_.append((chunksize_, + chunk_.iloc[0, 0], + chunk_.iloc[-1, 0])) df = pd.DataFrame(output_, columns=None, index=None) - tm.assert_frame_equal(df, expected) \ No newline at end of file + tm.assert_frame_equal(df, expected) From 834c851007f4fe79c153d3ae9b6b3a414cda07da Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Tue, 26 Jul 2016 15:59:46 +0300 Subject: [PATCH 09/12] Improved readability of bugfix description; minor style fixes of the test --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/io/tests/parser/common.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index cc69a293d5945..c24ab1f5dcb08 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -673,7 +673,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Bug in ``pandas.parser.parser_trim_buffers()``, which did not update the word vectors (``parser->words``) when stream buffer was shrunk (:issue:`13788`, :issue:`13703`) +- Bug in ``pd.read_csv()`` causing a segfault when iterating over a 
large file in chunks (:issue:`13703`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result may have normal ``Index`` (:issue:`13144`) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index a3497d0df0ff2..af2ef38171844 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1542,7 +1542,7 @@ def test_parse_trim_buffers(self): """99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,""" csv_data = "\n".join([record_] * n_lines) + "\n" - output_ = list() + output_ = [] for chunksize_ in chunksizes: try: iterator_ = self.read_csv(StringIO(csv_data), header=None, From 629198d2c8058db10283fd6991e92602a64379ad Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Tue, 26 Jul 2016 17:44:25 +0300 Subject: [PATCH 10/12] Referenced issue in the test, rewrote the bugfix description --- doc/source/whatsnew/v0.19.0.txt | 2 +- pandas/io/tests/parser/common.py | 35 ++++++++++++++++++-------------- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index c24ab1f5dcb08..326e124c2f03e 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -673,7 +673,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ -- Bug in ``pd.read_csv()`` causing a segfault when iterating over a large file in chunks (:issue:`13703`) +- Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing result 
may have normal ``Index`` (:issue:`13144`) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index af2ef38171844..969b407529aca 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1493,8 +1493,13 @@ def test_memory_map(self): tm.assert_frame_equal(out, expected) def test_parse_trim_buffers(self): + # This test is part of a bugfix for issue #13703. It attmepts to + # to stress the system memory allocator, to cause it to move the + # stream buffer and either let the OS reclaim the region, or let + # other memory requests of parser otherwise modify the contents + # of memory space, where it was formely located. # This test is designed to cause a `segfault` with unpatched - # `tokenizer.c`, Sometimes the test fails on `segfault`, other + # `tokenizer.c`. Sometimes the test fails on `segfault`, other # times it fails due to memory corruption, which causes the # loaded DataFrame to differ from the expected one. n_lines, chunksizes = 173, range(57, 90) @@ -1543,20 +1548,20 @@ def test_parse_trim_buffers(self): csv_data = "\n".join([record_] * n_lines) + "\n" output_ = [] - for chunksize_ in chunksizes: - try: + try: + for chunksize_ in chunksizes: iterator_ = self.read_csv(StringIO(csv_data), header=None, dtype=object, chunksize=chunksize_, na_filter=True) - except ValueError: - # Ignore unsuported dtype=object by engine=python - pass - - for chunk_ in iterator_: - output_.append((chunksize_, - chunk_.iloc[0, 0], - chunk_.iloc[-1, 0])) - - df = pd.DataFrame(output_, columns=None, index=None) - - tm.assert_frame_equal(df, expected) + for chunk_ in iterator_: + output_.append((chunksize_, + chunk_.iloc[0, 0], + chunk_.iloc[-1, 0])) + except ValueError: + # Ignore unsuported dtype=object by engine=python + # in this case output_ list is empty + pass + + if output_: + df = pd.DataFrame(output_, columns=None, index=None) + tm.assert_frame_equal(df, expected) From 9b521f6eb4a73047bf026a5b0b5c6bde6136cd31 Mon 
Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Tue, 26 Jul 2016 19:32:01 +0300 Subject: [PATCH 11/12] Improved the clarity and logic of the test --- pandas/io/tests/parser/common.py | 106 ++++++++++++++++--------------- 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 969b407529aca..c17ca0f1700fa 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1502,66 +1502,72 @@ def test_parse_trim_buffers(self): # `tokenizer.c`. Sometimes the test fails on `segfault`, other # times it fails due to memory corruption, which causes the # loaded DataFrame to differ from the expected one. - n_lines, chunksizes = 173, range(57, 90) - # Create the expected output - expected_ = [(chunksize_, "9999-9", "9999-9") - for chunksize_ in chunksizes - for _ in range((n_lines + chunksize_ - 1) // chunksize_)] - expected = pd.DataFrame(expected_, columns=None, index=None) - - # Generate a large mixed-type CSV file on-the-fly (approx 272 KiB) + # Generate a large mixed-type CSV file on-the-fly (one record is + # approx 1.5KiB). 
record_ = \ - """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.""" \ - """99,ZZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-""" \ - """ZZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-""" \ - """ZZZZ,ZZZ-ZZZZ,999,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-""" \ - """ZZZZZ,ZZZ-ZZZZ,,,9,9,9,9,99,99,999,999,ZZZZZ,ZZZ-""" \ - """ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.99,ZZ-ZZZZ,ZZ-""" \ - """ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.99,999.99,,,""" \ - """ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZZZZ,ZZZ-""" \ - """ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-""" \ - """ZZZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.""" \ - """99,,,,ZZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.""" \ - """99,9,9,9.99,9.99,,,,9.99,9.99,,99,,99,9.99,9.""" \ - """99,,,ZZZ,ZZZ,,999.99,,999.99,ZZZ,ZZZ-ZZZZ,ZZZ-""" \ - """ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,,,,,,ZZZ-""" \ - """ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999,9.""" \ - """999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \ - """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-""" \ - """ZZZZ,,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-""" \ - """ZZZZ,ZZZ-ZZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-""" \ - """ZZZZ,ZZ-ZZZZ,ZZ,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-""" \ - """ZZZZ,,,99.99,99.99,,,9.99,9.99,9.99,9.99,ZZZ-""" \ - """ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-9.99,-9.99,-9.""" \ - """99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9.99,-9.""" \ - """99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,,,,-9""" \ - """.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \ - """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-""" \ - """ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.""" \ - """99,ZZ-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.""" \ - """99,,,ZZ-ZZZZZZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-""" \ - """ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ,9999,999.99,ZZZ-ZZZZ,-9.""" \ - """99,-9.99,ZZZ-ZZZZ,99:99:99,,99,99,,9.99,,-99.""" \ - """99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9.""" \ - """99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,""" + 
"""9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \ + """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \ + """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \ + """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \ + """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \ + """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \ + """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \ + """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \ + """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \ + """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \ + """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \ + """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \ + """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \ + """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \ + """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \ + """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \ + """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \ + """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \ + """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \ + """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \ + """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \ + """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \ + """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \ + """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \ + """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \ + """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \ + """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \ + """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \ + """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,""" + + # Set the number of line so that a call to `parser_trim_buffers` + # is trgiggered: a couple of full chunks and a 
relatively small + # 'residual' chunk. + chunksize, n_lines = 128, 2 * 128 + 15 csv_data = "\n".join([record_] * n_lines) + "\n" + # We will use StringIO to load the CSV from this text buffer. + # pd.read_csv() will iterate over the file in chunks and will + # finally read a residual chunk of really small size. + + # Create the expected output: maually create the dataframe + # by splitting by comma and repeating the `n_lines` number + # of times. + row = tuple(val_ if val_ else float("nan") + for val_ in record_.split(",")) + expected_ = [row for _ in range(n_lines)] + expected = pd.DataFrame(expected_, dtype=object, + columns=None, index=None) + + # Iterate over the CSV file in chunks of `chunksize` lines output_ = [] try: - for chunksize_ in chunksizes: - iterator_ = self.read_csv(StringIO(csv_data), header=None, - dtype=object, chunksize=chunksize_, - na_filter=True) - for chunk_ in iterator_: - output_.append((chunksize_, - chunk_.iloc[0, 0], - chunk_.iloc[-1, 0])) + iterator_ = self.read_csv(StringIO(csv_data), header=None, + dtype=object, chunksize=chunksize) + for chunk_ in iterator_: + output_.append(chunk_) except ValueError: # Ignore unsuported dtype=object by engine=python # in this case output_ list is empty pass + # Check for data corruption if there is any output. 
if output_: - df = pd.DataFrame(output_, columns=None, index=None) + df = pd.concat(output_, axis=0, ignore_index=True) tm.assert_frame_equal(df, expected) From d59624eddf063c9da12f1f95227d8e9c9b00cf6e Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Wed, 27 Jul 2016 02:24:48 +0300 Subject: [PATCH 12/12] Moved the test to 'c_parser_only' --- pandas/io/tests/parser/c_parser_only.py | 70 ++++++++++++++++++++++ pandas/io/tests/parser/common.py | 80 ------------------------- 2 files changed, 70 insertions(+), 80 deletions(-) diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index b6048051edc4d..103c9fa2b7ce8 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -381,3 +381,73 @@ def test_empty_header_read(count): for count in range(1, 101): test_empty_header_read(count) + + def test_parse_trim_buffers(self): + # This test is part of a bugfix for issue #13703. It attempts + # to stress the system memory allocator, to cause it to move the + # stream buffer and either let the OS reclaim the region, or let + # other memory requests of parser otherwise modify the contents + # of memory space, where it was formerly located. + # This test is designed to cause a `segfault` with unpatched + # `tokenizer.c`. Sometimes the test fails on `segfault`, other + # times it fails due to memory corruption, which causes the + # loaded DataFrame to differ from the expected one. + + # Generate a large mixed-type CSV file on-the-fly (one record is + # approx 1.5KiB). 
+ record_ = \ + """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \ + """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \ + """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \ + """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \ + """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \ + """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \ + """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \ + """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \ + """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \ + """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \ + """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \ + """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \ + """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \ + """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \ + """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \ + """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \ + """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \ + """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \ + """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \ + """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \ + """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \ + """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \ + """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \ + """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \ + """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \ + """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \ + """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \ + """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \ + """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,""" + + # Set the number of lines so that a call to `parser_trim_buffers` + # is triggered: after a 
couple of full chunks are consumed a + # relatively small 'residual' chunk would cause reallocation + # within the parser. + chunksize, n_lines = 128, 2 * 128 + 15 + csv_data = "\n".join([record_] * n_lines) + "\n" + + # We will use StringIO to load the CSV from this text buffer. + # pd.read_csv() will iterate over the file in chunks and will + # finally read a residual chunk of really small size. + + # Generate the expected output: manually create the dataframe + # by splitting by comma and repeating the `n_lines` times. + row = tuple(val_ if val_ else float("nan") + for val_ in record_.split(",")) + expected = pd.DataFrame([row for _ in range(n_lines)], + dtype=object, columns=None, index=None) + + # Iterate over the CSV file in chunks of `chunksize` lines + chunks_ = self.read_csv(StringIO(csv_data), header=None, + dtype=object, chunksize=chunksize) + result = pd.concat(chunks_, axis=0, ignore_index=True) + + # Check for data corruption if there was no segfault + tm.assert_frame_equal(result, expected) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index c17ca0f1700fa..11eed79e03267 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -1491,83 +1491,3 @@ def test_memory_map(self): out = self.read_csv(mmap_file, memory_map=True) tm.assert_frame_equal(out, expected) - - def test_parse_trim_buffers(self): - # This test is part of a bugfix for issue #13703. It attmepts to - # to stress the system memory allocator, to cause it to move the - # stream buffer and either let the OS reclaim the region, or let - # other memory requests of parser otherwise modify the contents - # of memory space, where it was formely located. - # This test is designed to cause a `segfault` with unpatched - # `tokenizer.c`. Sometimes the test fails on `segfault`, other - # times it fails due to memory corruption, which causes the - # loaded DataFrame to differ from the expected one. 
- - # Generate a large mixed-type CSV file on-the-fly (one record is - # approx 1.5KiB). - record_ = \ - """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z""" \ - """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,""" \ - """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9""" \ - """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,""" \ - """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9.""" \ - """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999.""" \ - """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ""" \ - """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ""" \ - """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z""" \ - """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,""" \ - """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,""" \ - """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,""" \ - """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999""" \ - """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9.""" \ - """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,""" \ - """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z""" \ - """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ""" \ - """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99""" \ - """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-""" \ - """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9""" \ - """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,""" \ - """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9.""" \ - """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ""" \ - """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ""" \ - """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ""" \ - """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ""" \ - """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99""" \ - """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9""" \ - """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,""" - - # Set 
the number of line so that a call to `parser_trim_buffers` - # is trgiggered: a couple of full chunks and a relatively small - # 'residual' chunk. - chunksize, n_lines = 128, 2 * 128 + 15 - csv_data = "\n".join([record_] * n_lines) + "\n" - - # We will use StringIO to load the CSV from this text buffer. - # pd.read_csv() will iterate over the file in chunks and will - # finally read a residual chunk of really small size. - - # Create the expected output: maually create the dataframe - # by splitting by comma and repeating the `n_lines` number - # of times. - row = tuple(val_ if val_ else float("nan") - for val_ in record_.split(",")) - expected_ = [row for _ in range(n_lines)] - expected = pd.DataFrame(expected_, dtype=object, - columns=None, index=None) - - # Iterate over the CSV file in chunks of `chunksize` lines - output_ = [] - try: - iterator_ = self.read_csv(StringIO(csv_data), header=None, - dtype=object, chunksize=chunksize) - for chunk_ in iterator_: - output_.append(chunk_) - except ValueError: - # Ignore unsuported dtype=object by engine=python - # in this case output_ list is empty - pass - - # Check for data corruption if there is any output. - if output_: - df = pd.concat(output_, axis=0, ignore_index=True) - tm.assert_frame_equal(df, expected)