112
112
SnapshotLogEntry ,
113
113
SnapshotSummaryCollector ,
114
114
Summary ,
115
+ ancestors_between ,
116
+ is_parent_ancestor_of ,
115
117
update_snapshot_summaries ,
116
118
)
117
119
from pyiceberg .table .sorting import UNSORTED_SORT_ORDER , SortOrder
@@ -1679,10 +1681,6 @@ def snapshot(self) -> Optional[Snapshot]:
1679
1681
return self .table_metadata .snapshot_by_id (self .snapshot_id )
1680
1682
return self .table_metadata .current_snapshot ()
1681
1683
1682
- def _build_manifest_evaluator (self , spec_id : int ) -> Callable [[ManifestFile ], bool ]:
1683
- spec = self .table_metadata .specs ()[spec_id ]
1684
- return manifest_evaluator (spec , self .table_metadata .schema (), self .partition_filters [spec_id ], self .case_sensitive )
1685
-
1686
1684
def projection (self ) -> Schema :
1687
1685
current_schema = self .table_metadata .schema ()
1688
1686
if self .snapshot_id is not None :
@@ -1703,41 +1701,6 @@ def projection(self) -> Schema:
1703
1701
1704
1702
return current_schema .select (* self .selected_fields , case_sensitive = self .case_sensitive )
1705
1703
1706
- def _build_partition_evaluator (self , spec_id : int ) -> Callable [[DataFile ], bool ]:
1707
- spec = self .table_metadata .specs ()[spec_id ]
1708
- partition_type = spec .partition_type (self .table_metadata .schema ())
1709
- partition_schema = Schema (* partition_type .fields )
1710
- partition_expr = self .partition_filters [spec_id ]
1711
-
1712
- # The lambda created here is run in multiple threads.
1713
- # So we avoid creating _EvaluatorExpression methods bound to a single
1714
- # shared instance across multiple threads.
1715
- return lambda data_file : expression_evaluator (partition_schema , partition_expr , self .case_sensitive )(data_file .partition )
1716
-
1717
- def _check_sequence_number (self , min_data_sequence_number : int , manifest : ManifestFile ) -> bool :
1718
- """Ensure that no manifests are loaded that contain deletes that are older than the data.
1719
-
1720
- Args:
1721
- min_data_sequence_number (int): The minimal sequence number.
1722
- manifest (ManifestFile): A ManifestFile that can be either data or deletes.
1723
-
1724
- Returns:
1725
- Boolean indicating if it is either a data file, or a relevant delete file.
1726
- """
1727
- return manifest .content == ManifestContent .DATA or (
1728
- # Not interested in deletes that are older than the data
1729
- manifest .content == ManifestContent .DELETES
1730
- and (manifest .sequence_number or INITIAL_SEQUENCE_NUMBER ) >= min_data_sequence_number
1731
- )
1732
-
1733
- def use_ref (self : S , name : str ) -> S :
1734
- if self .snapshot_id : # type: ignore
1735
- raise ValueError (f"Cannot override ref, already set snapshot id={ self .snapshot_id } " ) # type: ignore
1736
- if snapshot := self .table_metadata .snapshot_by_name (name ):
1737
- return self .update (snapshot_id = snapshot .snapshot_id )
1738
-
1739
- raise ValueError (f"Cannot scan unknown ref={ name } " )
1740
-
1741
1704
def plan_files (self ) -> Iterable [FileScanTask ]:
1742
1705
"""Plans the relevant files by filtering on the PartitionSpecs.
1743
1706
@@ -1825,6 +1788,14 @@ def to_arrow(self) -> pa.Table:
1825
1788
def to_pandas (self , ** kwargs : Any ) -> pd .DataFrame :
1826
1789
return self .to_arrow ().to_pandas (** kwargs )
1827
1790
1791
+ def use_ref (self : S , name : str ) -> S :
1792
+ if self .snapshot_id : # type: ignore
1793
+ raise ValueError (f"Cannot override ref, already set snapshot id={ self .snapshot_id } " ) # type: ignore
1794
+ if snapshot := self .table_metadata .snapshot_by_name (name ):
1795
+ return self .update (snapshot_id = snapshot .snapshot_id )
1796
+
1797
+ raise ValueError (f"Cannot scan unknown ref={ name } " )
1798
+
1828
1799
def to_duckdb (self , table_name : str , connection : Optional [DuckDBPyConnection ] = None ) -> DuckDBPyConnection :
1829
1800
import duckdb
1830
1801
@@ -1840,6 +1811,13 @@ def to_ray(self) -> ray.data.dataset.Dataset:
1840
1811
1841
1812
1842
1813
class BaseIncrementalScan (TableScan ):
1814
+ """Base class for incremental scans.
1815
+
1816
+ Args:
1817
+ to_snapshot_id: The end snapshot ID (inclusive).
1818
+ from_snapshot_id_exclusive: The start snapshot ID (exclusive).
1819
+ """
1820
+
1843
1821
to_snapshot_id : Optional [int ]
1844
1822
from_snapshot_id_exclusive : Optional [int ]
1845
1823
@@ -3913,35 +3891,3 @@ def _determine_partitions(spec: PartitionSpec, schema: Schema, arrow_table: pa.T
3913
3891
table_partitions : list [TablePartition ] = _get_table_partitions (arrow_table , spec , schema , slice_instructions )
3914
3892
3915
3893
return table_partitions
3916
-
3917
-
3918
- def ancestors_between (to_snapshot : int , from_snapshot : Optional [int ], table_metadata : TableMetadata ) -> Iterable [Snapshot ]:
3919
- if from_snapshot is not None :
3920
- for snapshot in ancestors_of (table_metadata .snapshot_by_id (to_snapshot ), table_metadata ): # type: ignore
3921
- if snapshot .snapshot_id == from_snapshot :
3922
- break
3923
- yield snapshot
3924
- else :
3925
- yield from ancestors_of (table_metadata .snapshot_by_id (to_snapshot ), table_metadata ) # type: ignore
3926
-
3927
-
3928
- def is_parent_ancestor_of (snapshot_id : int , ancestor_parent_snapshot_id : int , table_metadata : TableMetadata ) -> bool :
3929
- for snapshot in ancestors_of (table_metadata .snapshot_by_id (snapshot_id ), table_metadata ): # type: ignore
3930
- if snapshot .parent_snapshot_id and snapshot .parent_snapshot_id == ancestor_parent_snapshot_id :
3931
- return True
3932
- return False
3933
-
3934
-
3935
- def oldest_ancestor_of (snapshot_id : int , table_metadata : TableMetadata ) -> Optional [int ]:
3936
- last_snapshot = None
3937
- for snapshot in ancestors_of (table_metadata .snapshot_by_id (snapshot_id ), table_metadata ): # type: ignore
3938
- last_snapshot = snapshot .snapshot_id
3939
- return last_snapshot
3940
-
3941
-
3942
- def ancestors_of (latest_snapshot : Snapshot , table_metadata : TableMetadata ) -> Iterable [Snapshot ]:
3943
- if latest_snapshot :
3944
- yield latest_snapshot
3945
- if latest_snapshot .parent_snapshot_id :
3946
- if parent := table_metadata .snapshot_by_id (latest_snapshot .parent_snapshot_id ):
3947
- yield from ancestors_of (parent , table_metadata )
0 commit comments