|
71 | 71 | ManifestEntry,
|
72 | 72 | ManifestEntryStatus,
|
73 | 73 | ManifestFile,
|
| 74 | + PartitionFieldSummary, |
74 | 75 | write_manifest,
|
75 | 76 | write_manifest_list,
|
76 | 77 | )
|
@@ -3547,6 +3548,94 @@ def update_partitions_map(
|
3547 | 3548 | schema=table_schema,
|
3548 | 3549 | )
|
3549 | 3550 |
|
def manifests(self) -> "pa.Table":
    """Return the manifests of the table's current snapshot as a PyArrow table.

    One row is produced per manifest yielded by ``snapshot.manifests(self.tbl.io)``.
    A manifest's added/existing/deleted file counts are reported in the
    ``*_data_files_count`` columns when its content is ``ManifestContent.DATA`` and
    in the ``*_delete_files_count`` columns when it is ``ManifestContent.DELETES``;
    the columns of the other group are set to 0.

    ``partition_summaries`` holds one struct per partition field summary of the
    manifest, with the lower/upper bounds decoded from bytes and rendered through
    the partition field's transform.

    Returns:
        A ``pa.Table`` following ``manifest_schema`` below. It has no rows when
        the table has no current snapshot.
    """
    import pyarrow as pa

    from pyiceberg.conversions import from_bytes

    partition_summary_schema = pa.struct([
        pa.field("contains_null", pa.bool_(), nullable=False),
        pa.field("contains_nan", pa.bool_(), nullable=True),
        pa.field("lower_bound", pa.string(), nullable=True),
        pa.field("upper_bound", pa.string(), nullable=True),
    ])

    manifest_schema = pa.schema([
        pa.field('content', pa.int8(), nullable=False),
        pa.field('path', pa.string(), nullable=False),
        pa.field('length', pa.int64(), nullable=False),
        pa.field('partition_spec_id', pa.int32(), nullable=False),
        pa.field('added_snapshot_id', pa.int64(), nullable=False),
        pa.field('added_data_files_count', pa.int32(), nullable=False),
        pa.field('existing_data_files_count', pa.int32(), nullable=False),
        pa.field('deleted_data_files_count', pa.int32(), nullable=False),
        pa.field('added_delete_files_count', pa.int32(), nullable=False),
        pa.field('existing_delete_files_count', pa.int32(), nullable=False),
        pa.field('deleted_delete_files_count', pa.int32(), nullable=False),
        pa.field('partition_summaries', pa.list_(partition_summary_schema), nullable=False),
    ])

    def _partition_summaries_to_rows(
        spec: PartitionSpec, partition_summaries: List[PartitionFieldSummary]
    ) -> List[Dict[str, Any]]:
        """Convert a manifest's partition field summaries into plain dict rows."""
        # The partition struct depends only on the spec and the table schema, so
        # build it once instead of once per partition field.
        partition_fields = spec.partition_type(self.tbl.schema()).fields

        rows = []
        for i, field_summary in enumerate(partition_summaries):
            transform = spec.fields[i].transform
            partition_field_type = partition_fields[i].field_type

            # Compare against None explicitly: a present-but-empty bound (b"",
            # e.g. an empty string value) is a real bound and must not be
            # reported as missing.
            lower_bound, upper_bound = [
                None
                if raw_bound is None
                else transform.to_human_string(partition_field_type, from_bytes(partition_field_type, raw_bound))
                for raw_bound in (field_summary.lower_bound, field_summary.upper_bound)
            ]

            rows.append({
                'contains_null': field_summary.contains_null,
                'contains_nan': field_summary.contains_nan,
                'lower_bound': lower_bound,
                'upper_bound': upper_bound,
            })
        return rows

    specs = self.tbl.metadata.specs()
    manifests = []
    if snapshot := self.tbl.metadata.current_snapshot():
        for manifest in snapshot.manifests(self.tbl.io):
            is_data_file = manifest.content == ManifestContent.DATA
            is_delete_file = manifest.content == ManifestContent.DELETES
            # NOTE(review): the count columns are declared non-nullable; confirm the
            # manifest counts can never be None for the manifest lists read here.
            manifests.append({
                'content': manifest.content,
                'path': manifest.manifest_path,
                'length': manifest.manifest_length,
                'partition_spec_id': manifest.partition_spec_id,
                'added_snapshot_id': manifest.added_snapshot_id,
                'added_data_files_count': manifest.added_files_count if is_data_file else 0,
                'existing_data_files_count': manifest.existing_files_count if is_data_file else 0,
                'deleted_data_files_count': manifest.deleted_files_count if is_data_file else 0,
                'added_delete_files_count': manifest.added_files_count if is_delete_file else 0,
                'existing_delete_files_count': manifest.existing_files_count if is_delete_file else 0,
                'deleted_delete_files_count': manifest.deleted_files_count if is_delete_file else 0,
                # Manifests without partition summaries get an empty list because
                # the column itself is non-nullable.
                'partition_summaries': _partition_summaries_to_rows(specs[manifest.partition_spec_id], manifest.partitions)
                if manifest.partitions
                else [],
            })

    return pa.Table.from_pylist(
        manifests,
        schema=manifest_schema,
    )
| 3638 | + |
3550 | 3639 |
|
3551 | 3640 | @dataclass(frozen=True)
|
3552 | 3641 | class TablePartition:
|
|
0 commit comments