From 973fe16828b7d3464b890121cefa8d55cdfebcf6 Mon Sep 17 00:00:00 2001 From: John Ward Date: Sat, 17 Aug 2019 13:20:14 -0500 Subject: [PATCH 01/13] Added auto-encode parameter and encoding parameter setting logic to read_csv function, for issue 27655. --- pandas/io/parsers.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a3ff837bc7f52..3e4b96bfa3ee5 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -422,6 +422,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding + elif encoding is None and kwds.get("auto_encode"): + kwds["encoding"] = 'auto' compression = kwds.get("compression", "infer") compression = _infer_compression(filepath_or_buffer, compression) @@ -579,6 +581,7 @@ def parser_f( escapechar=None, comment=None, encoding=None, + auto_encode=True, dialect=None, # Error Handling error_bad_lines=True, @@ -590,6 +593,8 @@ def parser_f( float_precision=None, ): + + # gh-23761 # # When a dialect is passed, it overrides any of the overlapping @@ -663,6 +668,7 @@ def parser_f( usecols=usecols, verbose=verbose, encoding=encoding, + auto_encode=auto_encode, squeeze=squeeze, memory_map=memory_map, float_precision=float_precision, From 73a750b908c9430305601ad6c55371cdc5298002 Mon Sep 17 00:00:00 2001 From: John Ward Date: Sat, 17 Aug 2019 18:27:32 -0500 Subject: [PATCH 02/13] added test_read_csv test for issue 27655, appears that encoding is already determined automatically if left empty. --- asv_bench/benchmarks/io/json.py | 2 + pandas/io/common.py | 1 - pandas/io/parsers.py | 4 - pandas/tests/io/parser/test_read_csv.py | 156 ++++++++++++++++++++++++ 4 files changed, 158 insertions(+), 5 deletions(-) create mode 100644 pandas/tests/io/parser/test_read_csv.py diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index fc07f2a484102..6774ba9224efa 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -4,6 +4,8 @@ from ..pandas_vb_common import BaseIO +def loads(*args, **kwargs): + print("LOADS") class ReadJSON(BaseIO): diff --git a/pandas/io/common.py b/pandas/io/common.py index e01e473047b88..d17ea15f599c7 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -34,7 +34,6 @@ from pandas.core.dtypes.common import is_file_like from pandas._typing import FilePathOrBuffer - # gh-12665: Alias for now and remove later. 
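Patch 01 above only wires a placeholder value ('auto') into _read; it does not implement any detection, and nothing in this series adds one. As a hedged illustration of what explicit encoding auto-detection could look like, the sketch below leans on the third-party chardet package, which is an assumption for this example only and not a pandas dependency:

    import chardet  # third-party detector; illustrative assumption, not a pandas dependency
    import pandas as pd

    def read_csv_auto(path, sample_size=64 * 1024, **kwargs):
        # Sniff the encoding from a leading byte sample, then defer to read_csv.
        with open(path, "rb") as fh:
            sample = fh.read(sample_size)
        guess = chardet.detect(sample)  # e.g. {'encoding': 'utf-8', 'confidence': 0.99}
        encoding = guess["encoding"] or "utf-8"  # fall back to utf-8 when undetected
        return pd.read_csv(path, encoding=encoding, **kwargs)
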
CParserError = ParserError diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 3e4b96bfa3ee5..0a224a1a698fa 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -422,8 +422,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding - elif encoding is None and kwds.get("auto_encode"): - kwds["encoding"] = 'auto' compression = kwds.get("compression", "infer") compression = _infer_compression(filepath_or_buffer, compression) @@ -581,7 +579,6 @@ def parser_f( escapechar=None, comment=None, encoding=None, - auto_encode=True, dialect=None, # Error Handling error_bad_lines=True, @@ -668,7 +665,6 @@ def parser_f( usecols=usecols, verbose=verbose, encoding=encoding, - auto_encode=auto_encode, squeeze=squeeze, memory_map=memory_map, float_precision=float_precision, diff --git a/pandas/tests/io/parser/test_read_csv.py b/pandas/tests/io/parser/test_read_csv.py new file mode 100644 index 0000000000000..4035d8f83de7d --- /dev/null +++ b/pandas/tests/io/parser/test_read_csv.py @@ -0,0 +1,156 @@ +import pytest +import csv +import os +import sys + + +rootpath = os.path.dirname(os.path.abspath(__file__)) + +sys.path.append(os.path.join(rootpath, "pandas/io")) +from pandas.io import parsers + + + +TMP_PATH = "tmp" + +""" +To run test, run 'python path/to/test_parsers.py' +""" + + +""" + test_read_csv_without_encoding_kwarg returns result of read_csv method. + - if exception is raised from method, the result returned is the exception. +""" +def test_read_csv_without_encoding_kwarg(file): + try: + result = parsers.read_csv(file) + except Exception as e: + result = e + return result + + +def write_csv_file(filename, data, encoding, delimiter=",", newline=""): + with open(filename, 'w', newline=newline, encoding=encoding) as csv_file: + writer = csv.writer(csv_file, delimiter=delimiter) + # for row in data: + writer.writerow(data) + return filename + + +def test(): + test_results = {} + test_dtypes = ['ascii', + 'big5', + 'big5hkscs', + 'cp037', + 'cp273', + 'cp424', + 'cp437', + 'cp500', + 'cp720', + 'cp737', + 'cp775', + 'cp850', + 'cp852', + 'cp855', + 'cp856', + 'cp857', + 'cp858', + 'cp860', + 'cp861', + 'cp862', + 'cp863', + 'cp864', + 'cp865', + 'cp866', + 'cp869', + 'cp874', + 'cp875', + 'cp932', + 'cp949', + 'cp950', + 'cp1006', + 'cp1026', + 'cp1125', + 'cp1140', + 'cp1250', + 'cp1251', + 'cp1252', + 'cp1253', + 'cp1254', + 'cp1255', + 'cp1256', + 'cp1257', + 'cp1258', + 'cp65001', + 'euc_jp', + 'euc_jis_2004', + 'euc_jisx0213', + 'euc_kr', + 'gb2312', + 'gbk', + 'gb18030', + 'hz', + 'iso2022_jp', + 'iso2022_jp_1', + 'iso2022_jp_2', + 'iso2022_jp_2004', + 'iso2022_jp_3', + 'iso2022_jp_ext', + 'iso2022_kr', + 'latin_1', + 'iso8859_2', + 'iso8859_3', + 'iso8859_4', + 'iso8859_5', + 'iso8859_6', + 'iso8859_7', + 'iso8859_8', + 'iso8859_9', + 'iso8859_10', + 'iso8859_11', + 'iso8859_13', + 'iso8859_14', + 'iso8859_15', + 'iso8859_16', + 'johab', + 'koi8_r', + 'koi8_t', + 'koi8_u', + 'kz1048', + 'mac_cyrillic', + 'mac_greek', + 'mac_iceland', + 'mac_latin2', + 'mac_roman', + 'mac_turkish', + 'ptcp154', + 'shift_jis', + 'shift_jis_2004', + 'shift_jisx0213', + 'utf_32', + 'utf_32_be', + 'utf_32_le', + 'utf_16', + 'utf_16_be', + 'utf_16_le', + 'utf_7', + 'utf_8', + 'utf_8_sig' + ] + data = """ + one,two,three + 1,2,3 + uno,dos,tres + """ + for i, dtype in enumerate(test_dtypes): + file = write_csv_file(f"test{i}.csv", data, dtype) + result = test_read_csv_without_encoding_kwarg(file) + 
test_results[dtype] = result + + print("test results: ", test_results) + + +if __name__ == '__main__': + test() From 877c0a7f939e99724603452d8f17882f76271cd7 Mon Sep 17 00:00:00 2001 From: John Ward Date: Sat, 17 Aug 2019 18:54:13 -0500 Subject: [PATCH 03/13] attempt to fix styling issues --- asv_bench/benchmarks/io/json.py | 3 - pandas/io/common.py | 1 + pandas/io/parsers.py | 2 - pandas/tests/io/parser/test_read_csv.py | 211 ++++++++++++------------ 4 files changed, 104 insertions(+), 113 deletions(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 6774ba9224efa..203994b500f2c 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -4,9 +4,6 @@ from ..pandas_vb_common import BaseIO -def loads(*args, **kwargs): - print("LOADS") - class ReadJSON(BaseIO): fname = "__test__.json" diff --git a/pandas/io/common.py b/pandas/io/common.py index d17ea15f599c7..e01e473047b88 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -34,6 +34,7 @@ from pandas.core.dtypes.common import is_file_like from pandas._typing import FilePathOrBuffer + # gh-12665: Alias for now and remove later. CParserError = ParserError diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 0a224a1a698fa..a3ff837bc7f52 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -590,8 +590,6 @@ def parser_f( float_precision=None, ): - - # gh-23761 # # When a dialect is passed, it overrides any of the overlapping diff --git a/pandas/tests/io/parser/test_read_csv.py b/pandas/tests/io/parser/test_read_csv.py index 4035d8f83de7d..9e13fc620707c 100644 --- a/pandas/tests/io/parser/test_read_csv.py +++ b/pandas/tests/io/parser/test_read_csv.py @@ -3,25 +3,20 @@ import os import sys - rootpath = os.path.dirname(os.path.abspath(__file__)) sys.path.append(os.path.join(rootpath, "pandas/io")) from pandas.io import parsers - - TMP_PATH = "tmp" """ To run test, run 'python path/to/test_parsers.py' -""" - -""" test_read_csv_without_encoding_kwarg returns result of read_csv method. - if exception is raised from method, the result returned is the exception. 
""" + def test_read_csv_without_encoding_kwarg(file): try: result = parsers.read_csv(file) @@ -31,7 +26,7 @@ def test_read_csv_without_encoding_kwarg(file): def write_csv_file(filename, data, encoding, delimiter=",", newline=""): - with open(filename, 'w', newline=newline, encoding=encoding) as csv_file: + with open(filename, "w", newline=newline, encoding=encoding) as csv_file: writer = csv.writer(csv_file, delimiter=delimiter) # for row in data: writer.writerow(data) @@ -40,105 +35,106 @@ def write_csv_file(filename, data, encoding, delimiter=",", newline=""): def test(): test_results = {} - test_dtypes = ['ascii', - 'big5', - 'big5hkscs', - 'cp037', - 'cp273', - 'cp424', - 'cp437', - 'cp500', - 'cp720', - 'cp737', - 'cp775', - 'cp850', - 'cp852', - 'cp855', - 'cp856', - 'cp857', - 'cp858', - 'cp860', - 'cp861', - 'cp862', - 'cp863', - 'cp864', - 'cp865', - 'cp866', - 'cp869', - 'cp874', - 'cp875', - 'cp932', - 'cp949', - 'cp950', - 'cp1006', - 'cp1026', - 'cp1125', - 'cp1140', - 'cp1250', - 'cp1251', - 'cp1252', - 'cp1253', - 'cp1254', - 'cp1255', - 'cp1256', - 'cp1257', - 'cp1258', - 'cp65001', - 'euc_jp', - 'euc_jis_2004', - 'euc_jisx0213', - 'euc_kr', - 'gb2312', - 'gbk', - 'gb18030', - 'hz', - 'iso2022_jp', - 'iso2022_jp_1', - 'iso2022_jp_2', - 'iso2022_jp_2004', - 'iso2022_jp_3', - 'iso2022_jp_ext', - 'iso2022_kr', - 'latin_1', - 'iso8859_2', - 'iso8859_3', - 'iso8859_4', - 'iso8859_5', - 'iso8859_6', - 'iso8859_7', - 'iso8859_8', - 'iso8859_9', - 'iso8859_10', - 'iso8859_11', - 'iso8859_13', - 'iso8859_14', - 'iso8859_15', - 'iso8859_16', - 'johab', - 'koi8_r', - 'koi8_t', - 'koi8_u', - 'kz1048', - 'mac_cyrillic', - 'mac_greek', - 'mac_iceland', - 'mac_latin2', - 'mac_roman', - 'mac_turkish', - 'ptcp154', - 'shift_jis', - 'shift_jis_2004', - 'shift_jisx0213', - 'utf_32', - 'utf_32_be', - 'utf_32_le', - 'utf_16', - 'utf_16_be', - 'utf_16_le', - 'utf_7', - 'utf_8', - 'utf_8_sig' - ] + test_dtypes = [ + "ascii", + "big5", + "big5hkscs", + "cp037", + "cp273", + "cp424", + "cp437", + "cp500", + "cp720", + "cp737", + "cp775", + "cp850", + "cp852", + "cp855", + "cp856", + "cp857", + "cp858", + "cp860", + "cp861", + "cp862", + "cp863", + "cp864", + "cp865", + "cp866", + "cp869", + "cp874", + "cp875", + "cp932", + "cp949", + "cp950", + "cp1006", + "cp1026", + "cp1125", + "cp1140", + "cp1250", + "cp1251", + "cp1252", + "cp1253", + "cp1254", + "cp1255", + "cp1256", + "cp1257", + "cp1258", + "cp65001", + "euc_jp", + "euc_jis_2004", + "euc_jisx0213", + "euc_kr", + "gb2312", + "gbk", + "gb18030", + "hz", + "iso2022_jp", + "iso2022_jp_1", + "iso2022_jp_2", + "iso2022_jp_2004", + "iso2022_jp_3", + "iso2022_jp_ext", + "iso2022_kr", + "latin_1", + "iso8859_2", + "iso8859_3", + "iso8859_4", + "iso8859_5", + "iso8859_6", + "iso8859_7", + "iso8859_8", + "iso8859_9", + "iso8859_10", + "iso8859_11", + "iso8859_13", + "iso8859_14", + "iso8859_15", + "iso8859_16", + "johab", + "koi8_r", + "koi8_t", + "koi8_u", + "kz1048", + "mac_cyrillic", + "mac_greek", + "mac_iceland", + "mac_latin2", + "mac_roman", + "mac_turkish", + "ptcp154", + "shift_jis", + "shift_jis_2004", + "shift_jisx0213", + "utf_32", + "utf_32_be", + "utf_32_le", + "utf_16", + "utf_16_be", + "utf_16_le", + "utf_7", + "utf_8", + "utf_8_sig"] + data = """ one,two,three 1,2,3 @@ -151,6 +147,5 @@ def test(): print("test results: ", test_results) - -if __name__ == '__main__': +if __name__ == "__main__": test() From 75737f5bf560dd08ab4663d3dce0b0b1000c32f9 Mon Sep 17 00:00:00 2001 From: John Ward Date: Sat, 17 Aug 2019 19:11:44 -0500 Subject: 
[PATCH 04/13] reformatted changes from black pandas --- pandas/tests/io/parser/test_read_csv.py | 199 ++++++++++++------------ 1 file changed, 101 insertions(+), 98 deletions(-) diff --git a/pandas/tests/io/parser/test_read_csv.py b/pandas/tests/io/parser/test_read_csv.py index 9e13fc620707c..cef1fda37d38a 100644 --- a/pandas/tests/io/parser/test_read_csv.py +++ b/pandas/tests/io/parser/test_read_csv.py @@ -17,6 +17,7 @@ - if exception is raised from method, the result returned is the exception. """ + def test_read_csv_without_encoding_kwarg(file): try: result = parsers.read_csv(file) @@ -36,104 +37,105 @@ def write_csv_file(filename, data, encoding, delimiter=",", newline=""): def test(): test_results = {} test_dtypes = [ - "ascii", - "big5", - "big5hkscs", - "cp037", - "cp273", - "cp424", - "cp437", - "cp500", - "cp720", - "cp737", - "cp775", - "cp850", - "cp852", - "cp855", - "cp856", - "cp857", - "cp858", - "cp860", - "cp861", - "cp862", - "cp863", - "cp864", - "cp865", - "cp866", - "cp869", - "cp874", - "cp875", - "cp932", - "cp949", - "cp950", - "cp1006", - "cp1026", - "cp1125", - "cp1140", - "cp1250", - "cp1251", - "cp1252", - "cp1253", - "cp1254", - "cp1255", - "cp1256", - "cp1257", - "cp1258", - "cp65001", - "euc_jp", - "euc_jis_2004", - "euc_jisx0213", - "euc_kr", - "gb2312", - "gbk", - "gb18030", - "hz", - "iso2022_jp", - "iso2022_jp_1", - "iso2022_jp_2", - "iso2022_jp_2004", - "iso2022_jp_3", - "iso2022_jp_ext", - "iso2022_kr", - "latin_1", - "iso8859_2", - "iso8859_3", - "iso8859_4", - "iso8859_5", - "iso8859_6", - "iso8859_7", - "iso8859_8", - "iso8859_9", - "iso8859_10", - "iso8859_11", - "iso8859_13", - "iso8859_14", - "iso8859_15", - "iso8859_16", - "johab", - "koi8_r", - "koi8_t", - "koi8_u", - "kz1048", - "mac_cyrillic", - "mac_greek", - "mac_iceland", - "mac_latin2", - "mac_roman", - "mac_turkish", - "ptcp154", - "shift_jis", - "shift_jis_2004", - "shift_jisx0213", - "utf_32", - "utf_32_be", - "utf_32_le", - "utf_16", - "utf_16_be", - "utf_16_le", - "utf_7", - "utf_8", - "utf_8_sig"] + "ascii", + "big5", + "big5hkscs", + "cp037", + "cp273", + "cp424", + "cp437", + "cp500", + "cp720", + "cp737", + "cp775", + "cp850", + "cp852", + "cp855", + "cp856", + "cp857", + "cp858", + "cp860", + "cp861", + "cp862", + "cp863", + "cp864", + "cp865", + "cp866", + "cp869", + "cp874", + "cp875", + "cp932", + "cp949", + "cp950", + "cp1006", + "cp1026", + "cp1125", + "cp1140", + "cp1250", + "cp1251", + "cp1252", + "cp1253", + "cp1254", + "cp1255", + "cp1256", + "cp1257", + "cp1258", + "cp65001", + "euc_jp", + "euc_jis_2004", + "euc_jisx0213", + "euc_kr", + "gb2312", + "gbk", + "gb18030", + "hz", + "iso2022_jp", + "iso2022_jp_1", + "iso2022_jp_2", + "iso2022_jp_2004", + "iso2022_jp_3", + "iso2022_jp_ext", + "iso2022_kr", + "latin_1", + "iso8859_2", + "iso8859_3", + "iso8859_4", + "iso8859_5", + "iso8859_6", + "iso8859_7", + "iso8859_8", + "iso8859_9", + "iso8859_10", + "iso8859_11", + "iso8859_13", + "iso8859_14", + "iso8859_15", + "iso8859_16", + "johab", + "koi8_r", + "koi8_t", + "koi8_u", + "kz1048", + "mac_cyrillic", + "mac_greek", + "mac_iceland", + "mac_latin2", + "mac_roman", + "mac_turkish", + "ptcp154", + "shift_jis", + "shift_jis_2004", + "shift_jisx0213", + "utf_32", + "utf_32_be", + "utf_32_le", + "utf_16", + "utf_16_be", + "utf_16_le", + "utf_7", + "utf_8", + "utf_8_sig", + ] data = """ one,two,three @@ -147,5 +149,6 @@ def test(): print("test results: ", test_results) + if __name__ == "__main__": test() From f1bc202544481afb7244dd6569f9ba4d577be03a Mon Sep 17 00:00:00 2001 
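The test file added in patches 02-04 drives a hand-rolled test() loop that prints outcomes instead of asserting them, so a regression would never actually fail the suite. A minimal sketch of the same round-trip check in idiomatic pytest, trimmed to a few representative codecs and using the standard tmp_path fixture, might look like the following; the all-ASCII payload keeps every listed codec byte-compatible with the parser's utf-8 default:

    import csv

    import pytest

    import pandas as pd

    # Representative subset; the patch enumerates roughly 100 codecs.
    ENCODINGS = ["ascii", "latin_1", "cp1252", "utf_8"]

    @pytest.mark.parametrize("encoding", ENCODINGS)
    def test_read_csv_roundtrip_without_encoding_kwarg(tmp_path, encoding):
        path = tmp_path / "data.csv"
        with open(path, "w", newline="", encoding=encoding) as fh:
            writer = csv.writer(fh)
            writer.writerow(["one", "two", "three"])
            writer.writerow(["1", "2", "3"])

        # The premise of GH-27655: read_csv should cope without encoding=...
        result = pd.read_csv(path)
        assert list(result.columns) == ["one", "two", "three"]
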
From: John Ward Date: Sun, 18 Aug 2019 11:12:17 -0500 Subject: [PATCH 05/13] Fixed docstrings for pandas.DataFrame.keys, pandas.read_clipboard and some of pandas.ExcelFile.parse (issue #27979) --- pandas/core/generic.py | 2 +- pandas/io/clipboards.py | 9 ++-- pandas/io/excel/_base.py | 4 +- pandas/tseries/offsets.py | 101 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 106 insertions(+), 10 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index ba1c516b9b444..90779baea32cb 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1875,7 +1875,7 @@ def __iter__(self): # can we get a better explanation of this? def keys(self): """ - Get the 'info axis' (see Indexing for more) + Get the 'info axis' (see Indexing for more). This is index for Series, columns for DataFrame. diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index d38221d784273..76c01535a26e7 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -9,8 +9,7 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover r""" - Read text from clipboard and pass to read_csv. See read_csv for the - full argument list + Read text from clipboard and pass to read_csv. Parameters ---------- @@ -18,9 +17,13 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover A string or regex delimiter. The default of '\s+' denotes one or more whitespace characters. + **kwargs + See read_csv for the full argument list. + Returns ------- - parsed : DataFrame + DataFrame + A parsed DataFrame object. """ encoding = kwargs.pop("encoding", "utf-8") diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 154656fbb250b..997edf49d9e8f 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -837,10 +837,10 @@ def parse( **kwds ): """ - Parse specified sheet(s) into a DataFrame + Parse specified sheet(s) into a DataFrame. Equivalent to read_excel(ExcelFile, ...) See the read_excel - docstring for more info on accepted parameters + docstring for more info on accepted parameters. Returns ------- diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index a208d5ad2fea9..f64c0248cd3d4 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -204,8 +204,7 @@ def __add__(date): normalize : bool, default False Whether to round the result of a DateOffset addition down to the previous midnight. - **kwds - Temporal parameter that add to or replace the offset value. + **kwds : Temporal parameter that add to or replace the offset value. Parameters that **add** to the offset (like Timedelta): @@ -231,18 +230,21 @@ def __add__(date): - microsecond - nanosecond + . + See Also -------- - dateutil.relativedelta.relativedelta + dateutil.relativedelta.relativedelta : The relativedelta type is designed to be applied to an existing datetime an can replace specific components of that datetime, or represents an interval of time. Examples -------- + >>> from pandas.tseries.offsets import DateOffset >>> ts = pd.Timestamp('2017-01-01 09:10:11') >>> ts + DateOffset(months=3) Timestamp('2017-04-01 09:10:11') >>> ts = pd.Timestamp('2017-01-01 09:10:11') - >>> ts + DateOffset(month=3) + >>> ts + DateOffset(months=2) Timestamp('2017-03-01 09:10:11') """ @@ -471,6 +473,97 @@ def nanos(self): class SingleConstructorOffset(DateOffset): + """ + Standard kind of date increment used for a date range. 
+ + Works exactly like relativedelta in terms of the keyword args you + pass in, use of the keyword n is discouraged-- you would be better + off specifying n in the keywords you use, but regardless it is + there for you. n is needed for DateOffset subclasses. + + DateOffset work as follows. Each offset specify a set of dates + that conform to the DateOffset. For example, Bday defines this + set to be the set of dates that are weekdays (M-F). To test if a + date is in the set of a DateOffset dateOffset we can use the + onOffset method: dateOffset.onOffset(date). + + If a date is not on a valid date, the rollback and rollforward + methods can be used to roll the date to the nearest valid date + before/after the date. + + DateOffsets can be created to move dates forward a given number of + valid dates. For example, Bday(2) can be added to a date to move + it two business days forward. If the date does not start on a + valid date, first it is moved to a valid date. Thus pseudo code + is: + + def __add__(date): + date = rollback(date) # does nothing if date is valid + return date + + + When a date offset is created for a negative number of periods, + the date is first rolled forward. The pseudo code is: + + def __add__(date): + date = rollforward(date) # does nothing is date is valid + return date + + + Zero presents a problem. Should it roll forward or back? We + arbitrarily have it rollforward: + + date + BDay(0) == BDay.rollforward(date) + + Since 0 is a bit weird, we suggest avoiding its use. + + Parameters + ---------- + n : int, default 1 + The number of time periods the offset represents. + normalize : bool, default False + Whether to round the result of a DateOffset addition down to the + previous midnight. + **kwds : Temporal parameter that add to or replace the offset value. + + Parameters that **add** to the offset (like Timedelta): + + - years + - months + - weeks + - days + - hours + - minutes + - seconds + - microseconds + - nanoseconds + + Parameters that **replace** the offset value: + + - year + - month + - day + - weekday + - hour + - minute + - second + - microsecond + - nanosecond + + . + + See Also + -------- + + Examples + -------- + >>> from pandas.tseries.offsets import DateOffset + >>> ts = pd.Timestamp('2017-01-01 09:10:11') + >>> ts + DateOffset(months=3) + Timestamp('2017-04-01 09:10:11') + + >>> ts = pd.Timestamp('2017-01-01 09:10:11') + >>> ts + DateOffset(months=2) + Timestamp('2017-03-01 09:10:11') + """ @classmethod def _from_name(cls, suffix=None): # default _from_name calls cls with no args From c087a4a76310ef252365ff9da8c6677f41d17bd0 Mon Sep 17 00:00:00 2001 From: John Ward Date: Sun, 18 Aug 2019 12:31:14 -0500 Subject: [PATCH 06/13] pandas.HDFStore.walk --- asv_bench/benchmarks/io/json.py | 4 ++ pandas/io/parsers.py | 4 ++ pandas/io/pytables.py | 90 ++++++++++++++++++--------------- 3 files changed, 58 insertions(+), 40 deletions(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 203994b500f2c..0241a56ad4733 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -4,6 +4,10 @@ from ..pandas_vb_common import BaseIO +<<<<<<< HEAD +======= + +>>>>>>> parent of 73a750b90... added test_read_csv test for issue 27655, appears that encoding is already determined automatically if left empty. 
class ReadJSON(BaseIO): fname = "__test__.json" diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index a3ff837bc7f52..4d69e72d845a9 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -422,6 +422,8 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding + elif encoding is None and kwds.get("auto_encode"): + kwds["encoding"] = 'auto' compression = kwds.get("compression", "infer") compression = _infer_compression(filepath_or_buffer, compression) @@ -579,6 +581,7 @@ def parser_f( escapechar=None, comment=None, encoding=None, + auto_encode=True, dialect=None, # Error Handling error_bad_lines=True, @@ -663,6 +666,7 @@ def parser_f( usecols=usecols, verbose=verbose, encoding=encoding, + auto_encode=auto_encode, squeeze=squeeze, memory_map=memory_map, float_precision=float_precision, diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 6af5dd6f1bf37..9c4ad31e70e27 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -431,8 +431,8 @@ def _is_metadata_of(group, parent_group): class HDFStore: """ - Dict-like IO interface for storing pandas objects in PyTables - either Fixed or Table format. + Dict-like IO interface for storing pandas objects in PyTables. + Either Fixed or Table format. Parameters ---------- @@ -564,13 +564,12 @@ def __exit__(self, exc_type, exc_value, traceback): def keys(self): """ - Return a (potentially unordered) list of the keys corresponding to the - objects stored in the HDFStore. These are ABSOLUTE path-names (e.g. - have the leading '/' + Return a (potentially unordered) list of the keys corresponding to the objects stored in the HDFStore. Returns ------- list + List of ABSOLUTE path-names (e.g. have the leading '/'). """ return [n._v_pathname for n in self.groups()] @@ -703,7 +702,7 @@ def flush(self, fsync=False): def get(self, key): """ - Retrieve pandas object stored in file + Retrieve pandas object stored in file. Parameters ---------- @@ -711,7 +710,8 @@ def get(self, key): Returns ------- - obj : same type as object stored in file + object + Same type as object stored in file. """ group = self.get_node(key) if group is None: @@ -731,25 +731,31 @@ def select( **kwargs ): """ - Retrieve pandas object stored in file, optionally based on where - criteria + Retrieve pandas object stored in file, optionally based on where criteria. Parameters ---------- key : object - where : list of Term (or convertible) objects, optional - start : integer (defaults to None), row number to start selection - stop : integer (defaults to None), row number to stop selection - columns : a list of columns that if not None, will limit the return - columns - iterator : boolean, return an iterator, default False - chunksize : nrows to include in iteration, return an iterator - auto_close : boolean, should automatically close the store when - finished, default is False + Object being retrieved from file. + where : list, default None + List of Term (or convertible) objects, optional. + start : int, default None + Row number to start selection. + stop : int, default None + Row number to stop selection. + columns : list, default None + A list of columns that if not None, will limit the return columns. + iterator : bool, default False + Returns an iterator. + chunksize : int, default None + Number or rows to include in iteration, return an iterator. + auto_close : bool, default False + Should automatically close the store when finished. 
Returns ------- - The selected object + object + Retrieved object from file. """ group = self.get_node(key) if group is None: @@ -929,28 +935,30 @@ def func(_start, _stop, _where): def put(self, key, value, format=None, append=False, **kwargs): """ - Store object in HDFStore + Store object in HDFStore. Parameters ---------- key : object value : {Series, DataFrame} - format : 'fixed(f)|table(t)', default is 'fixed' - fixed(f) : Fixed format - Fast writing/reading. Not-appendable, nor searchable - table(t) : Table format + format : 'Fixed(f)|Table(t)', default is 'fixed' + Fixed(f) : Fixed format + Fast writing/reading. Not-appendable, nor searchable. + Table(t) : Table format Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching - / selecting subsets of the data - append : boolean, default False + / selecting subsets of the data. + append : bool, default False This will force Table format, append the input data to the existing. - data_columns : list of columns to create as data columns, or True to + data_columns : list, default None + List of columns to create as data columns, or True to use all columns. See `here `__. - encoding : default None, provide an encoding for strings - dropna : boolean, default False, do not write an ALL nan row to - the store settable by the option 'io.hdf.dropna_table' + encoding : str, default None + Provide an encoding for strings. + dropna : bool, default False, do not write an ALL nan row to + The store settable by the option 'io.hdf.dropna_table'. """ if format is None: format = get_option("io.hdf.default_format") or "fixed" @@ -1165,12 +1173,13 @@ def create_table_index(self, key, **kwargs): s.create_index(**kwargs) def groups(self): - """return a list of all the top-level nodes (that are not themselves a - pandas storage object) + """ + Return a list of all the top-level nodes (that are not themselves a pandas storage object). Returns ------- list + List of objects. """ _tables() self._check_if_open() @@ -1188,10 +1197,12 @@ def groups(self): ] def walk(self, where="/"): - """ Walk the pytables group hierarchy for pandas objects + """ + Walk the pytables group hierarchy for pandas objects. This generator will yield the group path, subgroups and pandas object names for each group. + Any non-pandas PyTables objects that are not a group will be ignored. The `where` group itself is listed first (preorder), then each of its @@ -1202,18 +1213,17 @@ def walk(self, where="/"): Parameters ---------- - where : str, optional + where : str, optional, default "/" Group where to start walking. - If not supplied, the root group is used. Yields ------ path : str - Full path to a group (without trailing '/') - groups : list of str - names of the groups contained in `path` - leaves : list of str - names of the pandas objects contained in `path` + Full path to a group (without trailing '/'). + groups : list + Names (strings) of the groups contained in `path`. + leaves : list + Names (strings) of the pandas objects contained in `path`. 
""" _tables() self._check_if_open() From cce1b4618b6815fd223b122ea357bded9aa937c6 Mon Sep 17 00:00:00 2001 From: John Ward Date: Sun, 18 Aug 2019 12:34:26 -0500 Subject: [PATCH 07/13] fix --- asv_bench/benchmarks/io/json.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 0241a56ad4733..203994b500f2c 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -4,10 +4,6 @@ from ..pandas_vb_common import BaseIO -<<<<<<< HEAD -======= - ->>>>>>> parent of 73a750b90... added test_read_csv test for issue 27655, appears that encoding is already determined automatically if left empty. class ReadJSON(BaseIO): fname = "__test__.json" From be4543fa96af4cac7cfccaefcfcac2b7efee7c08 Mon Sep 17 00:00:00 2001 From: John Ward Date: Sun, 18 Aug 2019 12:37:40 -0500 Subject: [PATCH 08/13] deleted unnecessary test i previously created --- pandas/tests/io/parser/test_read_csv.py | 154 ------------------------ 1 file changed, 154 deletions(-) delete mode 100644 pandas/tests/io/parser/test_read_csv.py diff --git a/pandas/tests/io/parser/test_read_csv.py b/pandas/tests/io/parser/test_read_csv.py deleted file mode 100644 index cef1fda37d38a..0000000000000 --- a/pandas/tests/io/parser/test_read_csv.py +++ /dev/null @@ -1,154 +0,0 @@ -import pytest -import csv -import os -import sys - -rootpath = os.path.dirname(os.path.abspath(__file__)) - -sys.path.append(os.path.join(rootpath, "pandas/io")) -from pandas.io import parsers - -TMP_PATH = "tmp" - -""" -To run test, run 'python path/to/test_parsers.py' - - test_read_csv_without_encoding_kwarg returns result of read_csv method. - - if exception is raised from method, the result returned is the exception. -""" - - -def test_read_csv_without_encoding_kwarg(file): - try: - result = parsers.read_csv(file) - except Exception as e: - result = e - return result - - -def write_csv_file(filename, data, encoding, delimiter=",", newline=""): - with open(filename, "w", newline=newline, encoding=encoding) as csv_file: - writer = csv.writer(csv_file, delimiter=delimiter) - # for row in data: - writer.writerow(data) - return filename - - -def test(): - test_results = {} - test_dtypes = [ - "ascii", - "big5", - "big5hkscs", - "cp037", - "cp273", - "cp424", - "cp437", - "cp500", - "cp720", - "cp737", - "cp775", - "cp850", - "cp852", - "cp855", - "cp856", - "cp857", - "cp858", - "cp860", - "cp861", - "cp862", - "cp863", - "cp864", - "cp865", - "cp866", - "cp869", - "cp874", - "cp875", - "cp932", - "cp949", - "cp950", - "cp1006", - "cp1026", - "cp1125", - "cp1140", - "cp1250", - "cp1251", - "cp1252", - "cp1253", - "cp1254", - "cp1255", - "cp1256", - "cp1257", - "cp1258", - "cp65001", - "euc_jp", - "euc_jis_2004", - "euc_jisx0213", - "euc_kr", - "gb2312", - "gbk", - "gb18030", - "hz", - "iso2022_jp", - "iso2022_jp_1", - "iso2022_jp_2", - "iso2022_jp_2004", - "iso2022_jp_3", - "iso2022_jp_ext", - "iso2022_kr", - "latin_1", - "iso8859_2", - "iso8859_3", - "iso8859_4", - "iso8859_5", - "iso8859_6", - "iso8859_7", - "iso8859_8", - "iso8859_9", - "iso8859_10", - "iso8859_11", - "iso8859_13", - "iso8859_14", - "iso8859_15", - "iso8859_16", - "johab", - "koi8_r", - "koi8_t", - "koi8_u", - "kz1048", - "mac_cyrillic", - "mac_greek", - "mac_iceland", - "mac_latin2", - "mac_roman", - "mac_turkish", - "ptcp154", - "shift_jis", - "shift_jis_2004", - "shift_jisx0213", - "utf_32", - "utf_32_be", - "utf_32_le", - "utf_16", - "utf_16_be", - "utf_16_le", - "utf_7", - "utf_8", - "utf_8_sig", - ] - - data = 
""" - one,two,three - 1,2,3 - uno,dos,tres - """ - for i, dtype in enumerate(test_dtypes): - file = write_csv_file(f"test{i}.csv", data, dtype) - result = test_read_csv_without_encoding_kwarg(file) - test_results[dtype] = result - - print("test results: ", test_results) - - -if __name__ == "__main__": - test() From d647f0931550efd88dfed63c2ebf57ad16815a81 Mon Sep 17 00:00:00 2001 From: John Ward Date: Sun, 18 Aug 2019 12:41:15 -0500 Subject: [PATCH 09/13] removed summary for SingleConstructorOffset, which was basically a copied-over summary from its parent DateOffset --- pandas/tseries/offsets.py | 91 --------------------------------------- 1 file changed, 91 deletions(-) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index f64c0248cd3d4..e78efc517a65a 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -473,97 +473,6 @@ def nanos(self): class SingleConstructorOffset(DateOffset): - """ - Standard kind of date increment used for a date range. - - Works exactly like relativedelta in terms of the keyword args you - pass in, use of the keyword n is discouraged-- you would be better - off specifying n in the keywords you use, but regardless it is - there for you. n is needed for DateOffset subclasses. - - DateOffset work as follows. Each offset specify a set of dates - that conform to the DateOffset. For example, Bday defines this - set to be the set of dates that are weekdays (M-F). To test if a - date is in the set of a DateOffset dateOffset we can use the - onOffset method: dateOffset.onOffset(date). - - If a date is not on a valid date, the rollback and rollforward - methods can be used to roll the date to the nearest valid date - before/after the date. - - DateOffsets can be created to move dates forward a given number of - valid dates. For example, Bday(2) can be added to a date to move - it two business days forward. If the date does not start on a - valid date, first it is moved to a valid date. Thus pseudo code - is: - - def __add__(date): - date = rollback(date) # does nothing if date is valid - return date + - - When a date offset is created for a negative number of periods, - the date is first rolled forward. The pseudo code is: - - def __add__(date): - date = rollforward(date) # does nothing is date is valid - return date + - - Zero presents a problem. Should it roll forward or back? We - arbitrarily have it rollforward: - - date + BDay(0) == BDay.rollforward(date) - - Since 0 is a bit weird, we suggest avoiding its use. - - Parameters - ---------- - n : int, default 1 - The number of time periods the offset represents. - normalize : bool, default False - Whether to round the result of a DateOffset addition down to the - previous midnight. - **kwds : Temporal parameter that add to or replace the offset value. - - Parameters that **add** to the offset (like Timedelta): - - - years - - months - - weeks - - days - - hours - - minutes - - seconds - - microseconds - - nanoseconds - - Parameters that **replace** the offset value: - - - year - - month - - day - - weekday - - hour - - minute - - second - - microsecond - - nanosecond - - . 
- - See Also - -------- - - Examples - -------- - >>> from pandas.tseries.offsets import DateOffset - >>> ts = pd.Timestamp('2017-01-01 09:10:11') - >>> ts + DateOffset(months=3) - Timestamp('2017-04-01 09:10:11') - - >>> ts = pd.Timestamp('2017-01-01 09:10:11') - >>> ts + DateOffset(months=2) - Timestamp('2017-03-01 09:10:11') - """ @classmethod def _from_name(cls, suffix=None): # default _from_name calls cls with no args From 0273150d25146f22bdeec9127141618beae89a6d Mon Sep 17 00:00:00 2001 From: John Ward Date: Thu, 22 Aug 2019 12:01:08 -0500 Subject: [PATCH 10/13] fixed formatting issues. --- asv_bench/benchmarks/io/json.py | 1 + pandas/io/parsers.py | 2 -- pandas/io/pytables.py | 15 ++++++++------- pandas/tseries/offsets.py | 6 +++++- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py index 203994b500f2c..fc07f2a484102 100644 --- a/asv_bench/benchmarks/io/json.py +++ b/asv_bench/benchmarks/io/json.py @@ -4,6 +4,7 @@ from ..pandas_vb_common import BaseIO + class ReadJSON(BaseIO): fname = "__test__.json" diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4d69e72d845a9..4025e6d64a859 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -422,8 +422,6 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding - elif encoding is None and kwds.get("auto_encode"): - kwds["encoding"] = 'auto' compression = kwds.get("compression", "infer") compression = _infer_compression(filepath_or_buffer, compression) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 9c4ad31e70e27..057672f6f1be8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -432,6 +432,7 @@ class HDFStore: """ Dict-like IO interface for storing pandas objects in PyTables. + Either Fixed or Table format. Parameters @@ -564,7 +565,7 @@ def __exit__(self, exc_type, exc_value, traceback): def keys(self): """ - Return a (potentially unordered) list of the keys corresponding to the objects stored in the HDFStore. + Return a list of keys corresponding to objects stored in HDFStore. Returns ------- @@ -939,12 +940,12 @@ def put(self, key, value, format=None, append=False, **kwargs): Parameters ---------- - key : object - value : {Series, DataFrame} - format : 'Fixed(f)|Table(t)', default is 'fixed' - Fixed(f) : Fixed format + key : object + value : {Series, DataFrame} + format : 'fixed(f)|table(t)', default is 'fixed' + fixed(f) : Fixed format Fast writing/reading. Not-appendable, nor searchable. - Table(t) : Table format + table(t) : Table format Write as a PyTables Table structure which may perform worse but allow more flexible operations like searching / selecting subsets of the data. @@ -1213,7 +1214,7 @@ def walk(self, where="/"): Parameters ---------- - where : str, optional, default "/" + where : str, default "/" Group where to start walking. Yields diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index e78efc517a65a..156a3a1967cf5 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -234,7 +234,11 @@ def __add__(date): See Also -------- - dateutil.relativedelta.relativedelta : The relativedelta type is designed to be applied to an existing datetime an can replace specific components of that datetime, or represents an interval of time. 
+ dateutil.relativedelta.relativedelta : The relativedelta type is designed + + to be applied to an existing datetime an can replace specific components of + + that datetime, or represents an interval of time. Examples -------- From 700abba8ccc91a22a7995a3aebd451f9f9deb7ee Mon Sep 17 00:00:00 2001 From: John Ward Date: Thu, 22 Aug 2019 12:09:38 -0500 Subject: [PATCH 11/13] removed experimental changes unrelated to pr --- pandas/io/parsers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 4025e6d64a859..a3ff837bc7f52 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -579,7 +579,6 @@ def parser_f( escapechar=None, comment=None, encoding=None, - auto_encode=True, dialect=None, # Error Handling error_bad_lines=True, @@ -664,7 +663,6 @@ def parser_f( usecols=usecols, verbose=verbose, encoding=encoding, - auto_encode=auto_encode, squeeze=squeeze, memory_map=memory_map, float_precision=float_precision, From f8675ad9f5d524763231c5148b83803d68a7e7d3 Mon Sep 17 00:00:00 2001 From: John Ward Date: Thu, 22 Aug 2019 12:20:31 -0500 Subject: [PATCH 12/13] shortened line --- pandas/io/pytables.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 057672f6f1be8..576c45a2f8097 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -1175,7 +1175,9 @@ def create_table_index(self, key, **kwargs): def groups(self): """ - Return a list of all the top-level nodes (that are not themselves a pandas storage object). + Return a list of all the top-level nodes. + + Each node returned is not a pandas storage object. Returns ------- From d0d3f6cf184d547189262a408d2d9bfa570c9dd1 Mon Sep 17 00:00:00 2001 From: John Ward Date: Sat, 24 Aug 2019 10:08:57 -0500 Subject: [PATCH 13/13] removed blank & unnecessary lines --- pandas/tseries/offsets.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 156a3a1967cf5..edf58ba3850a1 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -230,15 +230,11 @@ def __add__(date): - microsecond - nanosecond - . - See Also -------- dateutil.relativedelta.relativedelta : The relativedelta type is designed - - to be applied to an existing datetime an can replace specific components of - - that datetime, or represents an interval of time. + to be applied to an existing datetime an can replace specific components of + that datetime, or represents an interval of time. Examples --------
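
The doctest corrected in patch 05 turns on a subtlety the diff only hints at: plural keywords (months=) add to a timestamp, while singular keywords (month=) replace the matching component. Both spellings happen to print 2017-03-01 for this input, which is what made the original broken example easy to miss. A short demonstration, using only behavior documented in the docstrings above:

    import pandas as pd
    from dateutil.relativedelta import relativedelta
    from pandas.tseries.offsets import DateOffset

    ts = pd.Timestamp("2017-01-01 09:10:11")

    # Plural keyword *adds* three months.
    print(ts + DateOffset(months=3))  # Timestamp('2017-04-01 09:10:11')

    # Singular keyword *replaces* the month component; nothing is added.
    print(ts + DateOffset(month=3))   # Timestamp('2017-03-01 09:10:11')

    # dateutil's relativedelta, cited in the See Also, follows the same convention.
    dt = ts.to_pydatetime()
    print(dt + relativedelta(months=3))  # 2017-04-01 09:10:11
    print(dt + relativedelta(month=3))   # 2017-03-01 09:10:11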
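
Returning to the HDFStore docstrings rewritten in patch 06: put, select, and walk compose as below. This is a minimal usage sketch, assuming the optional PyTables dependency is installed; the file name and keys are illustrative only:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"a": np.arange(10), "b": np.arange(10) * 2})

    with pd.HDFStore("demo.h5", mode="w") as store:
        store.put("fixed/df", df)                  # fixed format: fast, not queryable
        store.put("table/df", df, format="table")  # table format: supports where=
        subset = store.select("table/df", where="index >= 5")  # rows 5..9
        for path, groups, leaves in store.walk():  # preorder walk from "/"
            print(path, groups, leaves)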