Closed
Description
Apache Iceberg version
0.6.0 (latest release)
Please describe the bug 🐞
Given a table like so:
In [36]: table
Out[36]:
matches(
...
14: player_last_session: optional timestamptz,
...
30: subject_last_session: optional timestamptz,
),
partition by: [run_date, player_agg_cluster_name, initiating_at],
sort order: [],
snapshot: Operation.APPEND: id=6595288807809068528, schema_id=0
I get the following error when scanning the table into Arrow:
In [25]: table.scan().to_arrow()
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[25], line 1
----> 1 table.scan().to_arrow()
File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/table/__init__.py:1418, in DataScan.to_arrow(self)
1415 def to_arrow(self) -> pa.Table:
1416 from pyiceberg.io.pyarrow import project_table
-> 1418 return project_table(
1419 self.plan_files(),
1420 self.table,
1421 self.row_filter,
1422 self.projection(),
1423 case_sensitive=self.case_sensitive,
1424 limit=self.limit,
1425 )
File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:1114, in project_table(tasks, table, row_filter, projected_schema, case_sensitive, limit)
1111 if limit is not None:
1112 _ = [f.cancel() for f in futures if not f.done()]
-> 1114 tables = [f.result() for f in completed_futures if f.result()]
1116 if len(tables) < 1:
1117 return pa.Table.from_batches([], schema=schema_to_pyarrow(projected_schema))
File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:1114, in <listcomp>(.0)
1111 if limit is not None:
1112 _ = [f.cancel() for f in futures if not f.done()]
-> 1114 tables = [f.result() for f in completed_futures if f.result()]
1116 if len(tables) < 1:
1117 return pa.Table.from_batches([], schema=schema_to_pyarrow(projected_schema))
File ~/.pyenv/versions/3.11.7/lib/python3.11/concurrent/futures/_base.py:449, in Future.result(self, timeout)
447 raise CancelledError()
448 elif self._state == FINISHED:
--> 449 return self.__get_result()
451 self._condition.wait(timeout)
453 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
File ~/.pyenv/versions/3.11.7/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self)
399 if self._exception:
400 try:
--> 401 raise self._exception
402 finally:
403 # Break a reference cycle with the exception in self._exception
404 self = None
File ~/.pyenv/versions/3.11.7/lib/python3.11/concurrent/futures/thread.py:58, in _WorkItem.run(self)
55 return
57 try:
---> 58 result = self.fn(*self.args, **self.kwargs)
59 except BaseException as exc:
60 self.future.set_exception(exc)
File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:957, in _task_to_table(fs, task, bound_row_filter, projected_schema, projected_field_ids, positional_deletes, case_sensitive, row_counts, limit, name_mapping)
954 if metadata := physical_schema.metadata:
955 schema_raw = metadata.get(ICEBERG_SCHEMA)
956 file_schema = (
--> 957 Schema.model_validate_json(schema_raw) if schema_raw is not None else pyarrow_to_schema(physical_schema, name_mapping)
958 )
960 pyarrow_filter = None
961 if bound_row_filter is not AlwaysTrue():
File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:655, in pyarrow_to_schema(schema, name_mapping)
651 else:
652 raise ValueError(
653 "Parquet file does not have field-ids and the Iceberg table does not have 'schema.name-mapping.default' defined"
654 )
--> 655 return visit_pyarrow(schema, visitor)
File ~/.pyenv/versions/3.11.7/lib/python3.11/functools.py:909, in singledispatch.<locals>.wrapper(*args, **kw)
905 if not args:
906 raise TypeError(f'{funcname} requires at least '
907 '1 positional argument')
--> 909 return dispatch(args[0].__class__)(*args, **kw)
File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:676, in _(obj, visitor)
674 @visit_pyarrow.register(pa.Schema)
675 def _(obj: pa.Schema, visitor: PyArrowSchemaVisitor[T]) -> T:
--> 676 return visitor.schema(obj, visit_pyarrow(pa.struct(obj), visitor))
File ~/.pyenv/versions/3.11.7/lib/python3.11/functools.py:909, in singledispatch.<locals>.wrapper(*args, **kw)
905 if not args:
906 raise TypeError(f'{funcname} requires at least '
907 '1 positional argument')
--> 909 return dispatch(args[0].__class__)(*args, **kw)
File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:685, in _(obj, visitor)
683 for field in obj:
684 visitor.before_field(field)
--> 685 result = visit_pyarrow(field.type, visitor)
686 results.append(visitor.field(field, result))
687 visitor.after_field(field)
File ~/.pyenv/versions/3.11.7/lib/python3.11/functools.py:909, in singledispatch.<locals>.wrapper(*args, **kw)
905 if not args:
906 raise TypeError(f'{funcname} requires at least '
907 '1 positional argument')
--> 909 return dispatch(args[0].__class__)(*args, **kw)
File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:718, in _(obj, visitor)
716 if pa.types.is_nested(obj):
717 raise TypeError(f"Expected primitive type, got: {type(obj)}")
--> 718 return visitor.primitive(obj)
File ~/.pyenv/versions/3.11.7/envs/ml/lib/python3.11/site-packages/pyiceberg/io/pyarrow.py:891, in _ConvertToIceberg.primitive(self, primitive)
888 primitive = cast(pa.FixedSizeBinaryType, primitive)
889 return FixedType(primitive.byte_width)
--> 891 raise TypeError(f"Unsupported type: {primitive}")
TypeError: Unsupported type: timestamp[ns]
After some debugging at this line, I find that the file's physical schema uses nanosecond-precision timestamps (`timestamp[ns]`):
ipdb> physical_schema
player_last_session: timestamp[ns]
...
subject_last_session: timestamp[ns]
I imagine the fix is to do something like this on this line, but currently those overrides are not exposed. Am I on the right track?
I believe that this issue is somewhat similar to #520.
Metadata
Assignees
Labels
No labels