From 3dff081a7656c7d882d18e5d5648523c0110302a Mon Sep 17 00:00:00 2001 From: "SiYoungOh(ohahohah)" Date: Sat, 10 Mar 2018 17:46:50 +0900 Subject: [PATCH 1/6] DOC: Improved the docstring of pd.DataFrame.memory_usage/empty --- pandas/core/frame.py | 40 ++++++++++++++++++++++++++++++++++++++-- pandas/core/generic.py | 12 ++++++++++-- 2 files changed, 48 insertions(+), 4 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a66d00fff9714..fc95e4f604461 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1943,7 +1943,11 @@ def _sizeof_fmt(num, size_qualifier): _put_lines(buf, lines) def memory_usage(self, index=True, deep=False): - """Memory usage of DataFrame columns. + """ + Memory usage of DataFrame columns. + + Memory usage of DataFrame is accessing pandas.DataFrame.info method. + A configuration option, `display.memory_usage` (see Parameters) Parameters ---------- @@ -1953,7 +1957,7 @@ def memory_usage(self, index=True, deep=False): the first index of the Series is `Index`. deep : bool Introspect the data deeply, interrogate - `object` dtypes for system-level memory consumption + `object` dtypes for system-level memory consumption. Returns ------- @@ -1969,6 +1973,38 @@ def memory_usage(self, index=True, deep=False): See Also -------- numpy.ndarray.nbytes + + Examples + -------- + >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] + >>> data = dict([(t, np.random.randint(100, size=5000).astype(t)) + ... 
for t in dtypes]) + >>> df = pd.DataFrame(data) + >>> df.memory_usage() + Index 80 + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + >>> df.memory_usage(index=False) + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + >>> df.memory_usage(index=True) + Index 80 + int64 40000 + float64 40000 + complex128 80000 + object 40000 + bool 5000 + dtype: int64 + >>> df.memory_usage(index=True).sum() + 205080 """ result = Series([c.memory_usage(index=False, deep=deep) for col, c in self.iteritems()], index=self.columns) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a893b2ba1a189..34835dc69bec0 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1436,12 +1436,20 @@ def __contains__(self, key): @property def empty(self): - """True if NDFrame is entirely empty [no items], meaning any of the + """ + True if DataFrame is empty. + + True if DataFrame is entirely empty [no items], meaning any of the axes are of length 0. + Returns + ------- + empty : boolean + if DataFrame is empty, return true, if not return false. + Notes ----- - If NDFrame contains only NaNs, it is still not considered empty. See + If DataFrame contains only NaNs, it is still not considered empty. See the example below. Examples From fc5b498ba8d2ad211f64e993e74c1901b09a3d53 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Mar 2018 14:56:16 -0500 Subject: [PATCH 2/6] Updates [ci skip] * Consistent with Series.memory_usage * Added Categorical notes [ci skip] --- pandas/core/frame.py | 64 +++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 19 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index fc95e4f604461..82e6847150344 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1944,42 +1944,52 @@ def _sizeof_fmt(num, size_qualifier): def memory_usage(self, index=True, deep=False): """ - Memory usage of DataFrame columns. 
+ Return the memory usage of each column in bytes. + + The memory usage can optionally include the contribution of + the index and elements of `object` dtype. - Memory usage of DataFrame is accessing pandas.DataFrame.info method. A configuration option, `display.memory_usage` (see Parameters) Parameters ---------- - index : bool - Specifies whether to include memory usage of DataFrame's - index in returned Series. If `index=True` (default is False) - the first index of the Series is `Index`. + index : bool, default False + Specifies whether to include the memory usage of the DataFrame's + index in returned Series. If ``index=True`` the memory usage of the + index the first item in the output. deep : bool - Introspect the data deeply, interrogate - `object` dtypes for system-level memory consumption. + If True, introspect the data deeply by interrogating + `object` dtypes for system-level memory consumption, and include + it in the returned values. Returns ------- sizes : Series - A series with column names as index and memory usage of - columns with units of bytes. - - Notes - ----- - Memory usage does not include memory consumed by elements that - are not components of the array if deep=False + A Series whose index is the original column names and whose values + are the memory usage of each column in bytes. See Also -------- - numpy.ndarray.nbytes + numpy.ndarray.nbytes : Total bytes consumed by the elements of an + ndarray. + Series.memory_usage : Bytes consumed by a Series. + pandas.Categorical : Memory-efficient array for string values with + many repeated values. Examples -------- >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool'] - >>> data = dict([(t, np.random.randint(100, size=5000).astype(t)) + >>> data = dict([(t, np.ones(shape=5000).astype(t)) ... 
for t in dtypes]) >>> df = pd.DataFrame(data) + >>> df.head() + int64 float64 complex128 object bool + 0 1 1.0 (1+0j) 1 True + 1 1 1.0 (1+0j) 1 True + 2 1 1.0 (1+0j) 1 True + 3 1 1.0 (1+0j) 1 True + 4 1 1.0 (1+0j) 1 True + >>> df.memory_usage() Index 80 int64 40000 @@ -1988,6 +1998,7 @@ def memory_usage(self, index=True, deep=False): object 40000 bool 5000 dtype: int64 + >>> df.memory_usage(index=False) int64 40000 float64 40000 @@ -1995,6 +2006,7 @@ def memory_usage(self, index=True, deep=False): object 40000 bool 5000 dtype: int64 + >>> df.memory_usage(index=True) Index 80 int64 40000 @@ -2003,8 +2015,22 @@ def memory_usage(self, index=True, deep=False): object 40000 bool 5000 dtype: int64 - >>> df.memory_usage(index=True).sum() - 205080 + + The memory footprint of `object` dtype columns is ignored by default: + >>> df.memory_usage(deep=True) + Index 80 + int64 40000 + float64 40000 + complex128 80000 + object 160000 + bool 5000 + dtype: int64 + + Use a Categorical for efficient storage of an object-dtype column with + many repeated values. + + >>> df['object'].astype('category').memory_usage(deep=True) + 5168 """ result = Series([c.memory_usage(index=False, deep=deep) for col, c in self.iteritems()], index=self.columns) From b033dc6950f1cd20578d2bce71c5d24431645edb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 15 Mar 2018 21:21:15 +0100 Subject: [PATCH 3/6] fix wrong default --- pandas/core/frame.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 82e6847150344..ece0e4819566e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1953,11 +1953,11 @@ def memory_usage(self, index=True, deep=False): Parameters ---------- - index : bool, default False + index : bool, default True Specifies whether to include the memory usage of the DataFrame's index in returned Series. If ``index=True`` the memory usage of the index the first item in the output. 
- deep : bool + deep : bool, default False If True, introspect the data deeply by interrogating `object` dtypes for system-level memory consumption, and include it in the returned values. @@ -2007,16 +2007,8 @@ def memory_usage(self, index=True, deep=False): bool 5000 dtype: int64 - >>> df.memory_usage(index=True) - Index 80 - int64 40000 - float64 40000 - complex128 80000 - object 40000 - bool 5000 - dtype: int64 - The memory footprint of `object` dtype columns is ignored by default: + >>> df.memory_usage(deep=True) Index 80 int64 40000 From bb7f341c71517bd8e351c314af6ff191f8ca1792 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 15 Mar 2018 21:23:39 +0100 Subject: [PATCH 4/6] Update generic.py --- pandas/core/generic.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 34835dc69bec0..c6a31a6e1c749 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1437,15 +1437,15 @@ def __contains__(self, key): @property def empty(self): """ - True if DataFrame is empty. + Indicator whether DataFrame is empty. - True if DataFrame is entirely empty [no items], meaning any of the - axes are of length 0. + True if DataFrame is entirely empty (no items), meaning any of the + axes are of length 0. O Returns ------- - empty : boolean - if DataFrame is empty, return true, if not return false. + bool + If DataFrame is empty, return True, if not return False. Notes ----- From 1585a0e09f6621a9e81be260269c5046a316db59 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 15 Mar 2018 21:24:30 +0100 Subject: [PATCH 5/6] Update generic.py --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c6a31a6e1c749..4a2698290166f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1440,7 +1440,7 @@ def empty(self): Indicator whether DataFrame is empty. 
True if DataFrame is entirely empty (no items), meaning any of the - axes are of length 0. O + axes are of length 0. Returns ------- From d4cc71d94be50e796863ad754fdd6220ffa56401 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 15 Mar 2018 15:49:43 -0500 Subject: [PATCH 6/6] info [ci skip] [ci skip] --- pandas/core/frame.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ece0e4819566e..af3d5a0f93cce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1949,7 +1949,8 @@ def memory_usage(self, index=True, deep=False): The memory usage can optionally include the contribution of the index and elements of `object` dtype. - A configuration option, `display.memory_usage` (see Parameters) + This value is displayed in `DataFrame.info` by default. This can be + suppressed by setting ``pandas.options.display.memory_usage`` to False. Parameters ---------- @@ -1975,6 +1976,7 @@ def memory_usage(self, index=True, deep=False): Series.memory_usage : Bytes consumed by a Series. pandas.Categorical : Memory-efficient array for string values with many repeated values. + DataFrame.info : Concise summary of a DataFrame. Examples --------