Skip to content

Commit 5cc0982

Browse files
committed
Use PyArrow dataset for metadata
1 parent c110f6e commit 5cc0982

File tree

1 file changed

+29
-9
lines changed

1 file changed

+29
-9
lines changed

dask_expr/io/parquet.py

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -395,13 +395,17 @@ def filesystem(self):
395395
import boto3
396396
from pyarrow.fs import S3FileSystem
397397

398+
bucket = self.path[5:].split("/")[0]
398399
session = boto3.session.Session()
399400
credentials = session.get_credentials()
401+
region = session.client("s3").get_bucket_location(Bucket=bucket)[
402+
"LocationConstraint"
403+
]
400404

401405
return S3FileSystem(
402406
secret_key=credentials.secret_key,
403407
access_key=credentials.access_key,
404-
region="us-east-2", # TODO
408+
region=region,
405409
session_token=credentials.token,
406410
)
407411
else:
@@ -510,8 +514,27 @@ def meta_and_filenames(path):
510514
if str(path).startswith("s3://"):
511515
import s3fs
512516

513-
s3 = s3fs.S3FileSystem()
514-
filenames = s3.ls(path)
517+
filenames = s3fs.S3FileSystem().ls(path)
518+
519+
import boto3
520+
from pyarrow.fs import S3FileSystem
521+
522+
session = boto3.session.Session()
523+
credentials = session.get_credentials()
524+
525+
bucket = path[5:].split("/")[0]
526+
region = session.client("s3").get_bucket_location(Bucket=bucket)[
527+
"LocationConstraint"
528+
]
529+
530+
filesystem = S3FileSystem(
531+
secret_key=credentials.secret_key,
532+
access_key=credentials.access_key,
533+
region=region,
534+
session_token=credentials.token,
535+
)
536+
path = path[5:]
537+
515538
else:
516539
import glob
517540
import os
@@ -521,14 +544,11 @@ def meta_and_filenames(path):
521544
else:
522545
filenames = [path] # TODO: split by row group
523546

524-
<<<<<<< HEAD
525-
import dask.dataframe as dd
526-
meta = dd.read_parquet(path)._meta
527-
=======
528-
ds = pq.ParquetDataset(path)
547+
filesystem = None
548+
549+
ds = pq.ParquetDataset(path, filesystem=filesystem)
529550
t = pa.Table.from_pylist([], schema=ds.schema)
530551
meta = t.to_pandas(types_mapper=types_mapper)
531-
>>>>>>> d037c82 (Grab meta from arrow rather than dask.dataframe)
532552

533553
return meta, filenames
534554

0 commit comments

Comments
 (0)