Skip to content

Commit 55b75ca

Browse files
authored
Add Hugging Face filesystem support to fsspec (#1997)
# Rationale for this change

Add support for the Hugging Face filesystem in `fsspec`, which uses `hf://` paths. This allows importing [HF datasets](https://huggingface.co/datasets). Authentication is done using the `"hf.token"` property.

# Are these changes tested?

I tried this locally but haven't added tests in test_fsspec.py (let me know if that's a requirement).

# Are there any user-facing changes?

No breaking changes; it simply adds support for `hf://` URLs.
1 parent 62ad2ca commit 55b75ca

File tree

5 files changed

+90
-19
lines changed

5 files changed

+90
-19
lines changed

mkdocs/docs/configuration.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ Iceberg works with the concept of a FileIO which is a pluggable module for readi
9595
- **hdfs**: `PyArrowFileIO`
9696
- **abfs**, **abfss**: `FsspecFileIO`
9797
- **oss**: `PyArrowFileIO`
98+
- **hf**: `FsspecFileIO`
9899

99100
You can also set the FileIO explicitly:
100101

@@ -193,6 +194,17 @@ PyIceberg uses [S3FileSystem](https://arrow.apache.org/docs/python/generated/pya
193194

194195
<!-- markdown-link-check-enable-->
195196

197+
### Hugging Face
198+
199+
<!-- markdown-link-check-disable -->
200+
201+
| Key | Example | Description |
202+
| ----------- | ------------------------ | --------------------------------------------------------- |
203+
| hf.endpoint | <https://huggingface.co> | Configure the endpoint for Hugging Face |
204+
| hf.token | hf_xxx | The Hugging Face token to access HF Datasets repositories |
205+
206+
<!-- markdown-link-check-enable-->
207+
196208
### PyArrow
197209

198210
<!-- markdown-link-check-disable -->

poetry.lock

Lines changed: 57 additions & 19 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyiceberg/io/__init__.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@
9292
GCS_SERVICE_HOST = "gcs.service.host"
9393
GCS_DEFAULT_LOCATION = "gcs.default-bucket-location"
9494
GCS_VERSION_AWARE = "gcs.version-aware"
95+
HF_ENDPOINT = "hf.endpoint"
96+
HF_TOKEN = "hf.token"
9597
PYARROW_USE_LARGE_TYPES_ON_READ = "pyarrow.use-large-types-on-read"
9698

9799

@@ -306,6 +308,7 @@ def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
306308
"viewfs": [ARROW_FILE_IO],
307309
"abfs": [FSSPEC_FILE_IO],
308310
"abfss": [FSSPEC_FILE_IO],
311+
"hf": [FSSPEC_FILE_IO],
309312
}
310313

311314

pyiceberg/io/fsspec.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@
5959
GCS_SESSION_KWARGS,
6060
GCS_TOKEN,
6161
GCS_VERSION_AWARE,
62+
HF_ENDPOINT,
63+
HF_TOKEN,
6264
S3_ACCESS_KEY_ID,
6365
S3_CONNECT_TIMEOUT,
6466
S3_ENDPOINT,
@@ -209,6 +211,15 @@ def _adls(properties: Properties) -> AbstractFileSystem:
209211
)
210212

211213

214+
def _hf(properties: Properties) -> AbstractFileSystem:
    """Create the fsspec filesystem used for the ``hf`` scheme.

    Args:
        properties: Configuration mapping; the values stored under
            ``HF_ENDPOINT`` and ``HF_TOKEN`` are looked up with ``.get`` and
            handed to the filesystem constructor unchanged.

    Returns:
        An ``HfFileSystem`` built from those two settings.
    """
    # Imported inside the function so huggingface_hub is only loaded when an
    # hf filesystem is actually requested.
    from huggingface_hub import HfFileSystem

    hf_endpoint = properties.get(HF_ENDPOINT)
    hf_token = properties.get(HF_TOKEN)
    return HfFileSystem(endpoint=hf_endpoint, token=hf_token)
221+
222+
212223
SCHEME_TO_FS = {
213224
"": _file,
214225
"file": _file,
@@ -219,6 +230,7 @@ def _adls(properties: Properties) -> AbstractFileSystem:
219230
"abfss": _adls,
220231
"gs": _gs,
221232
"gcs": _gs,
233+
"hf": _hf,
222234
}
223235

224236

pyproject.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ boto3 = { version = ">=1.24.59", optional = true }
7676
s3fs = { version = ">=2023.1.0", optional = true }
7777
adlfs = { version = ">=2023.1.0", optional = true }
7878
gcsfs = { version = ">=2023.1.0", optional = true }
79+
huggingface-hub = { version = ">=0.24.0", optional = true }
7980
psycopg2-binary = { version = ">=2.9.6", optional = true }
8081
sqlalchemy = { version = "^2.0.18", optional = true }
8182
getdaft = { version = ">=0.2.12", optional = true }
@@ -306,6 +307,7 @@ sql-postgres = ["sqlalchemy", "psycopg2-binary"]
306307
sql-sqlite = ["sqlalchemy"]
307308
gcsfs = ["gcsfs"]
308309
rest-sigv4 = ["boto3"]
310+
hf = ["huggingface-hub"]
309311
pyiceberg-core = ["pyiceberg-core"]
310312

311313
[tool.pytest.ini_options]
@@ -427,6 +429,10 @@ ignore_missing_imports = true
427429
module = "gcsfs.*"
428430
ignore_missing_imports = true
429431

432+
[[tool.mypy.overrides]]
433+
module = "huggingface_hub.*"
434+
ignore_missing_imports = true
435+
430436
[[tool.mypy.overrides]]
431437
module = "packaging.*"
432438
ignore_missing_imports = true

0 commit comments

Comments
 (0)