Skip to content

Commit b0305f7

Browse files
authored
ENH: Add use_nullable_dtypes for read_html (#50286)
* ENH: Add use_nullable_dtypes for read_html
* Add gh ref
* Fix test
* Fix test
* Add whatsnew
* Address review
* Add backend
1 parent 029e098 commit b0305f7

File tree

6 files changed

+85
-1
lines changed

6 files changed

+85
-1
lines changed

doc/source/user_guide/io.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1149,7 +1149,7 @@ To completely override the default values that are recognized as missing, specif
11491149
.. _io.navaluesconst:
11501150

11511151
The default ``NaN`` recognized values are ``['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A',
1152-
'n/a', 'NA', '<NA>', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']``.
1152+
'n/a', 'NA', '<NA>', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', 'None', '']``.
11531153

11541154
Let us consider some examples:
11551155

doc/source/whatsnew/v2.0.0.rst

+3
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
3838
* :func:`read_csv`
3939
* :func:`read_fwf`
4040
* :func:`read_excel`
41+
* :func:`read_html`
4142
* :func:`read_sql`
4243
* :func:`read_sql_query`
4344
* :func:`read_sql_table`
@@ -47,6 +48,7 @@ to select the nullable dtypes implementation.
4748

4849
* :func:`read_csv` (with ``engine="pyarrow"`` or ``engine="python"``)
4950
* :func:`read_excel`
51+
* :func:`read_html`
5052
* :func:`read_parquet`
5153
* :func:`read_orc`
5254

@@ -482,6 +484,7 @@ Other API changes
482484
- :func:`read_stata` with parameter ``index_col`` set to ``None`` (the default) will now set the index on the returned :class:`DataFrame` to a :class:`RangeIndex` instead of a :class:`Int64Index` (:issue:`49745`)
483485
- Changed behavior of :class:`Index`, :class:`Series`, and :class:`DataFrame` arithmetic methods when working with object-dtypes, the results no longer do type inference on the result of the array operations, use ``result.infer_objects(copy=False)`` to do type inference on the result (:issue:`49999`, :issue:`49714`)
484486
- Changed behavior of :class:`Index` constructor with an object-dtype ``numpy.ndarray`` containing all-``bool`` values or all-complex values, this will now retain object dtype, consistent with the :class:`Series` behavior (:issue:`49594`)
487+
- Added ``"None"`` to default ``na_values`` in :func:`read_csv` (:issue:`50286`)
485488
- Changed behavior of :class:`Series` and :class:`DataFrame` constructors when given an integer dtype and floating-point data that is not round numbers, this now raises ``ValueError`` instead of silently retaining the float dtype; do ``Series(data)`` or ``DataFrame(data)`` to get the old behavior, and ``Series(data).astype(dtype)`` or ``DataFrame(data).astype(dtype)`` to get the specified dtype (:issue:`49599`)
486489
- Changed behavior of :meth:`DataFrame.shift` with ``axis=1``, an integer ``fill_value``, and homogeneous datetime-like dtype, this now fills new columns with integer dtypes instead of casting to datetimelike (:issue:`49842`)
487490
- Files are now closed when encountering an exception in :func:`read_json` (:issue:`49921`)

pandas/_libs/parsers.pyx

+1
Original file line numberDiff line numberDiff line change
@@ -1384,6 +1384,7 @@ STR_NA_VALUES = {
13841384
"nan",
13851385
"-nan",
13861386
"",
1387+
"None",
13871388
}
13881389
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
13891390

pandas/io/html.py

+15
Original file line numberDiff line numberDiff line change
@@ -1043,6 +1043,7 @@ def read_html(
10431043
keep_default_na: bool = True,
10441044
displayed_only: bool = True,
10451045
extract_links: Literal[None, "header", "footer", "body", "all"] = None,
1046+
use_nullable_dtypes: bool = False,
10461047
) -> list[DataFrame]:
10471048
r"""
10481049
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -1143,6 +1144,19 @@ def read_html(
11431144
11441145
.. versionadded:: 1.5.0
11451146
1147+
use_nullable_dtypes : bool, default False
1148+
Whether to use nullable dtypes as default when reading data. If
1149+
set to True, nullable dtypes are used for all dtypes that have a nullable
1150+
implementation, even if no nulls are present.
1151+
1152+
The nullable dtype implementation can be configured by calling
1153+
``pd.set_option("mode.dtype_backend", "pandas")`` to use
1154+
numpy-backed nullable dtypes or
1155+
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
1156+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
1157+
1158+
.. versionadded:: 2.0
1159+
11461160
Returns
11471161
-------
11481162
dfs
@@ -1218,4 +1232,5 @@ def read_html(
12181232
keep_default_na=keep_default_na,
12191233
displayed_only=displayed_only,
12201234
extract_links=extract_links,
1235+
use_nullable_dtypes=use_nullable_dtypes,
12211236
)

pandas/tests/io/parser/test_na_values.py

+1
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ def test_default_na_values(all_parsers):
110110
"-nan",
111111
"#N/A N/A",
112112
"",
113+
"None",
113114
}
114115
assert _NA_VALUES == STR_NA_VALUES
115116

pandas/tests/io/test_html.py

+64
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717
from pandas.compat import is_platform_windows
1818
import pandas.util._test_decorators as td
1919

20+
import pandas as pd
2021
from pandas import (
22+
NA,
2123
DataFrame,
2224
MultiIndex,
2325
Series,
@@ -27,6 +29,10 @@
2729
to_datetime,
2830
)
2931
import pandas._testing as tm
32+
from pandas.core.arrays import (
33+
ArrowStringArray,
34+
StringArray,
35+
)
3036

3137
from pandas.io.common import file_path_to_url
3238
import pandas.io.html
@@ -132,6 +138,64 @@ def test_to_html_compat(self):
132138
res = self.read_html(out, attrs={"class": "dataframe"}, index_col=0)[0]
133139
tm.assert_frame_equal(res, df)
134140

141+
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
142+
@pytest.mark.parametrize("storage", ["python", "pyarrow"])
143+
def test_use_nullable_dtypes(self, storage, dtype_backend):
144+
# GH#50286
145+
df = DataFrame(
146+
{
147+
"a": Series([1, np.nan, 3], dtype="Int64"),
148+
"b": Series([1, 2, 3], dtype="Int64"),
149+
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
150+
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
151+
"e": [True, False, None],
152+
"f": [True, False, True],
153+
"g": ["a", "b", "c"],
154+
"h": ["a", "b", None],
155+
}
156+
)
157+
158+
if storage == "python":
159+
string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
160+
string_array_na = StringArray(np.array(["a", "b", NA], dtype=np.object_))
161+
162+
else:
163+
pa = pytest.importorskip("pyarrow")
164+
string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
165+
string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
166+
167+
out = df.to_html(index=False)
168+
with pd.option_context("mode.string_storage", storage):
169+
with pd.option_context("mode.dtype_backend", dtype_backend):
170+
result = self.read_html(out, use_nullable_dtypes=True)[0]
171+
172+
expected = DataFrame(
173+
{
174+
"a": Series([1, np.nan, 3], dtype="Int64"),
175+
"b": Series([1, 2, 3], dtype="Int64"),
176+
"c": Series([1.5, np.nan, 2.5], dtype="Float64"),
177+
"d": Series([1.5, 2.0, 2.5], dtype="Float64"),
178+
"e": Series([True, False, NA], dtype="boolean"),
179+
"f": Series([True, False, True], dtype="boolean"),
180+
"g": string_array,
181+
"h": string_array_na,
182+
}
183+
)
184+
185+
if dtype_backend == "pyarrow":
186+
import pyarrow as pa
187+
188+
from pandas.arrays import ArrowExtensionArray
189+
190+
expected = DataFrame(
191+
{
192+
col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
193+
for col in expected.columns
194+
}
195+
)
196+
197+
tm.assert_frame_equal(result, expected)
198+
135199
@pytest.mark.network
136200
@tm.network(
137201
url=(

0 commit comments

Comments
 (0)