4
4
Method NDFrame.describe() delegates actual execution to function describe_ndframe().
5
5
"""
6
6
7
- from typing import TYPE_CHECKING , List , Optional , Sequence , Union
7
+ from typing import TYPE_CHECKING , List , Optional , Sequence , Union , cast
8
8
import warnings
9
9
10
10
import numpy as np
@@ -62,32 +62,14 @@ def describe_ndframe(
62
62
if obj .ndim == 2 and obj .columns .size == 0 :
63
63
raise ValueError ("Cannot describe a DataFrame without columns" )
64
64
65
- if percentiles is not None :
66
- # explicit conversion of `percentiles` to list
67
- percentiles = list (percentiles )
68
-
69
- # get them all to be in [0, 1]
70
- validate_percentile (percentiles )
71
-
72
- # median should always be included
73
- if 0.5 not in percentiles :
74
- percentiles .append (0.5 )
75
- percentiles = np .asarray (percentiles )
76
- else :
77
- percentiles = np .array ([0.25 , 0.5 , 0.75 ])
78
-
79
- # sort and check for duplicates
80
- unique_pcts = np .unique (percentiles )
81
- assert percentiles is not None
82
- if len (unique_pcts ) < len (percentiles ):
83
- raise ValueError ("percentiles cannot contain duplicates" )
84
- percentiles = unique_pcts
65
+ percentiles = _refine_percentiles (percentiles )
85
66
86
67
if obj .ndim == 1 :
68
+ series = cast ("Series" , obj )
87
69
# Incompatible return value type
88
70
# (got "Series", expected "FrameOrSeries") [return-value]
89
71
return describe_1d (
90
- obj ,
72
+ series ,
91
73
percentiles ,
92
74
datetime_is_numeric ,
93
75
is_series = True ,
@@ -125,14 +107,14 @@ def describe_ndframe(
125
107
return d
126
108
127
109
128
- def describe_numeric_1d (series , percentiles ) -> "Series" :
110
+ def describe_numeric_1d (series : "Series" , percentiles : Sequence [ float ] ) -> "Series" :
129
111
"""Describe series containing numerical data.
130
112
131
113
Parameters
132
114
----------
133
115
series : Series
134
116
Series to be described.
135
- percentiles : list-like of numbers, optional
117
+ percentiles : list-like of numbers
136
118
The percentiles to include in the output.
137
119
"""
138
120
from pandas import Series
@@ -148,7 +130,7 @@ def describe_numeric_1d(series, percentiles) -> "Series":
148
130
return Series (d , index = stat_index , name = series .name )
149
131
150
132
151
- def describe_categorical_1d (data , is_series ) -> "Series" :
133
+ def describe_categorical_1d (data : "Series" , is_series : bool ) -> "Series" :
152
134
"""Describe series containing categorical data.
153
135
154
136
Parameters
@@ -210,14 +192,14 @@ def describe_categorical_1d(data, is_series) -> "Series":
210
192
return Series (result , index = names , name = data .name , dtype = dtype )
211
193
212
194
213
- def describe_timestamp_1d (data , percentiles ) -> "Series" :
195
+ def describe_timestamp_1d (data : "Series" , percentiles : Sequence [ float ] ) -> "Series" :
214
196
"""Describe series containing datetime64 dtype.
215
197
216
198
Parameters
217
199
----------
218
200
data : Series
219
201
Series to be described.
220
- percentiles : list-like of numbers, optional
202
+ percentiles : list-like of numbers
221
203
The percentiles to include in the output.
222
204
"""
223
205
# GH-30164
@@ -234,14 +216,20 @@ def describe_timestamp_1d(data, percentiles) -> "Series":
234
216
return Series (d , index = stat_index , name = data .name )
235
217
236
218
237
- def describe_1d (data , percentiles , datetime_is_numeric , * , is_series ) -> "Series" :
219
+ def describe_1d (
220
+ data : "Series" ,
221
+ percentiles : Sequence [float ],
222
+ datetime_is_numeric : bool ,
223
+ * ,
224
+ is_series : bool ,
225
+ ) -> "Series" :
238
226
"""Describe series.
239
227
240
228
Parameters
241
229
----------
242
230
data : Series
243
231
Series to be described.
244
- percentiles : list-like of numbers, optional
232
+ percentiles : list-like of numbers
245
233
The percentiles to include in the output.
246
234
datetime_is_numeric : bool, default False
247
235
Whether to treat datetime dtypes as numeric.
@@ -263,3 +251,35 @@ def describe_1d(data, percentiles, datetime_is_numeric, *, is_series) -> "Series
263
251
return describe_numeric_1d (data , percentiles )
264
252
else :
265
253
return describe_categorical_1d (data , is_series )
254
+
255
+
256
+ def _refine_percentiles (percentiles : Optional [Sequence [float ]]) -> Sequence [float ]:
257
+ """Ensure that percentiles are unique and sorted.
258
+
259
+ Parameters
260
+ ----------
261
+ percentiles : list-like of numbers, optional
262
+ The percentiles to include in the output.
263
+ """
264
+ if percentiles is None :
265
+ return np .array ([0.25 , 0.5 , 0.75 ])
266
+
267
+ # explicit conversion of `percentiles` to list
268
+ percentiles = list (percentiles )
269
+
270
+ # get them all to be in [0, 1]
271
+ validate_percentile (percentiles )
272
+
273
+ # median should always be included
274
+ if 0.5 not in percentiles :
275
+ percentiles .append (0.5 )
276
+
277
+ percentiles = np .asarray (percentiles )
278
+
279
+ # sort and check for duplicates
280
+ unique_pcts = np .unique (percentiles )
281
+ assert percentiles is not None
282
+ if len (unique_pcts ) < len (percentiles ):
283
+ raise ValueError ("percentiles cannot contain duplicates" )
284
+
285
+ return unique_pcts
0 commit comments