diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 3b24310014ff8..839ab8d50df43 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -103,7 +103,7 @@ Performance improvements - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`) - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) - Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`) -- +- Performance improvement in :func:`read_orc` when reading a remote URI file path (:issue:`51609`) .. --------------------------------------------------------------------------- .. _whatsnew_210.bug_fixes: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 4623539a19413..3999fc5840f02 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -24,13 +24,17 @@ from pandas.core.arrays import ArrowExtensionArray from pandas.core.frame import DataFrame -from pandas.io.common import get_handle +from pandas.io.common import ( + get_handle, + is_fsspec_url, +) def read_orc( path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, use_nullable_dtypes: bool | lib.NoDefault = lib.no_default, + filesystem=None, **kwargs, ) -> DataFrame: """ @@ -64,6 +68,11 @@ def read_orc( .. versionadded:: 2.0 + filesystem : fsspec or pyarrow filesystem, default None + Filesystem object to use when reading the ORC file. + + .. versionadded:: 2.1.0 + **kwargs Any additional kwargs are passed to pyarrow. @@ -75,6 +84,11 @@ def read_orc( ----- Before using this function you should read the :ref:`user guide about ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`. + + If ``path`` is a URI pointing to a local or remote file (e.g. "s3://"), + pandas will first attempt to read the file with a ``pyarrow.fs`` filesystem. 
You can also pass a + pyarrow or fsspec filesystem object into the filesystem keyword to override this + behavior. """ # we require a newer version of pyarrow than we support for parquet @@ -87,8 +101,18 @@ def read_orc( ) with get_handle(path, "rb", is_text=False) as handles: - orc_file = orc.ORCFile(handles.handle) - pa_table = orc_file.read(columns=columns, **kwargs) + source = handles.handle + if is_fsspec_url(path) and filesystem is None: + pa = import_optional_dependency("pyarrow") + pa_fs = import_optional_dependency("pyarrow.fs") + try: + filesystem, source = pa_fs.FileSystem.from_uri(path) + except (TypeError, pa.ArrowInvalid): + pass + + pa_table = orc.read_table( + source=source, columns=columns, filesystem=filesystem, **kwargs + ) if use_nullable_dtypes: dtype_backend = get_option("mode.dtype_backend") if dtype_backend == "pyarrow": diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 2a95240a5f83d..9db19d4eb8448 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -3,6 +3,7 @@ from decimal import Decimal from io import BytesIO import os +import pathlib import numpy as np import pytest @@ -396,3 +397,12 @@ def test_orc_use_nullable_dtypes_option(): expected = pd.DataFrame({"int": pd.Series([1, 2, 3], dtype="Int64")}) tm.assert_frame_equal(result, expected) + + +def test_orc_uri_path(): + expected = pd.DataFrame({"int": list(range(1, 4))}) + with tm.ensure_clean("tmp.orc") as path: + expected.to_orc(path) + uri = pathlib.Path(path).as_uri() + result = read_orc(uri) + tm.assert_frame_equal(result, expected)