From 13dff555a03f752e43d383bb6350c5b80326c73d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 28 Apr 2021 11:54:18 +0200 Subject: [PATCH 1/4] ENH: use native filesystem (if available) for read_parquet with pyarrow engine --- pandas/io/parquet.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 3801a29fec39e..f7f71ac947330 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -172,9 +172,24 @@ def write( table = self.api.Table.from_pandas(df, **from_pandas_kwargs) + filesystem = kwargs.pop("filesystem", None) + if ( + isinstance(path, str) + and storage_options is None + and filesystem is None + and LooseVersion(self.api.__version__) >= "5.0.0" + ): + try: + from pyarrow.fs import FileSystem + + filesystem, path = FileSystem.from_uri(path) + except Exception: + # fallback to use fsspec for filesystems that pyarrow doesn't support + pass + path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle( path, - kwargs.pop("filesystem", None), + filesystem, storage_options=storage_options, mode="wb", is_dir=partition_cols is not None, From 26866418611da571e4f0b6cd25498027409a6885 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 28 Apr 2021 12:01:29 +0200 Subject: [PATCH 2/4] update comment --- pandas/io/parquet.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f7f71ac947330..75e191f60c8f0 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -184,7 +184,8 @@ def write( filesystem, path = FileSystem.from_uri(path) except Exception: - # fallback to use fsspec for filesystems that pyarrow doesn't support + # fallback to use get_handle / fsspec for filesystems + # that pyarrow doesn't support pass path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle( From d5eab50993aa0957eb49bcacc1e975f9be0044d8 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche 
Date: Tue, 14 Sep 2021 13:59:54 +0200 Subject: [PATCH 3/4] fixup version comparison --- pandas/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f4949089929bc..7a1c583cb5746 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -177,7 +177,7 @@ def write( isinstance(path, str) and storage_options is None and filesystem is None - and LooseVersion(self.api.__version__) >= "5.0.0" + and Version(self.api.__version__) >= Version("5.0.0") ): try: from pyarrow.fs import FileSystem From 6791f9e6242a24741d30c3c3d79393c94591dd2b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 7 Dec 2021 08:30:44 +0100 Subject: [PATCH 4/4] add docstring --- pandas/io/parquet.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 4d2b15c15ad13..3b8077c531996 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -486,6 +486,12 @@ def read_parquet( .. versionadded:: 1.3.0 + When using the 'pyarrow' engine and no storage options are provided + and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec`` + (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is preferred. + Provide the instantiated fsspec filesystem using the ``filesystem`` + keyword if you wish to use its implementation. + use_nullable_dtypes : bool, default False If True, use dtypes that use ``pd.NA`` as missing value indicator for the resulting DataFrame. (only applicable for the ``pyarrow``