@@ -69,14 +69,18 @@
     _read_deletes,
     _to_requested_schema,
     bin_pack_arrow_table,
+    compute_statistics_plan,
+    data_file_statistics_from_parquet_metadata,
     expression_to_pyarrow,
+    parquet_path_to_id_mapping,
     schema_to_pyarrow,
 )
 from pyiceberg.manifest import DataFile, DataFileContent, FileFormat
 from pyiceberg.partitioning import PartitionField, PartitionSpec
 from pyiceberg.schema import Schema, make_compatible_name, visit
 from pyiceberg.table import FileScanTask, TableProperties
 from pyiceberg.table.metadata import TableMetadataV2
+from pyiceberg.table.name_mapping import create_mapping_from_schema
 from pyiceberg.transforms import IdentityTransform
 from pyiceberg.typedef import UTF8, Properties, Record
 from pyiceberg.types import (
@@ -99,6 +103,7 @@
     TimestamptzType,
     TimeType,
 )
+from tests.catalog.test_base import InMemoryCatalog
 from tests.conftest import UNIFIED_AWS_SESSION_PROPERTIES
 
 
@@ -1127,6 +1132,133 @@ def test_projection_concat_files(schema_int: Schema, file_int: str) -> None:
     assert repr(result_table.schema) == "id: int32"
 
 
+def test_identity_transform_column_projection(tmp_path: str, catalog: InMemoryCatalog) -> None:
+    # Test by adding a non-partitioned data file to a partitioned table, verifying partition value projection from manifest metadata.
+    # TODO: Update to use a data file created by writing data to an unpartitioned table once add_files supports field IDs.
+    # (context: https://github.com/apache/iceberg-python/pull/1443#discussion_r1901374875)
+
+    schema = Schema(
+        NestedField(1, "other_field", StringType(), required=False), NestedField(2, "partition_id", IntegerType(), required=False)
+    )
+
+    partition_spec = PartitionSpec(
+        PartitionField(2, 1000, IdentityTransform(), "partition_id"),
+    )
+
+    table = catalog.create_table(
+        "default.test_projection_partition",
+        schema=schema,
+        partition_spec=partition_spec,
+        properties={TableProperties.DEFAULT_NAME_MAPPING: create_mapping_from_schema(schema).model_dump_json()},
+    )
+
+    file_data = pa.array(["foo"], type=pa.string())
+    file_loc = f"{tmp_path}/test.parquet"
+    pq.write_table(pa.table([file_data], names=["other_field"]), file_loc)
+
+    statistics = data_file_statistics_from_parquet_metadata(
+        parquet_metadata=pq.read_metadata(file_loc),
+        stats_columns=compute_statistics_plan(table.schema(), table.metadata.properties),
+        parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
+    )
+
+    unpartitioned_file = DataFile(
+        content=DataFileContent.DATA,
+        file_path=file_loc,
+        file_format=FileFormat.PARQUET,
+        # projected value
+        partition=Record(partition_id=1),
+        file_size_in_bytes=os.path.getsize(file_loc),
+        sort_order_id=None,
+        spec_id=table.metadata.default_spec_id,
+        equality_ids=None,
+        key_metadata=None,
+        **statistics.to_serialized_dict(),
+    )
+
+    with table.transaction() as transaction:
+        with transaction.update_snapshot().overwrite() as update:
+            update.append_data_file(unpartitioned_file)
+
+    assert (
+        str(table.scan().to_arrow())
+        == """pyarrow.Table
+other_field: large_string
+partition_id: int64
+----
+other_field: [["foo"]]
+partition_id: [[1]]"""
+    )
+
+
+def test_identity_transform_columns_projection(tmp_path: str, catalog: InMemoryCatalog) -> None:
+    # Test by adding a non-partitioned data file to a multi-partitioned table, verifying partition value projection from manifest metadata.
+    # TODO: Update to use a data file created by writing data to an unpartitioned table once add_files supports field IDs.
+    # (context: https://github.com/apache/iceberg-python/pull/1443#discussion_r1901374875)
+    schema = Schema(
+        NestedField(1, "field_1", StringType(), required=False),
+        NestedField(2, "field_2", IntegerType(), required=False),
+        NestedField(3, "field_3", IntegerType(), required=False),
+    )
+
+    partition_spec = PartitionSpec(
+        PartitionField(2, 1000, IdentityTransform(), "field_2"),
+        PartitionField(3, 1001, IdentityTransform(), "field_3"),
+    )
+
+    table = catalog.create_table(
+        "default.test_projection_partitions",
+        schema=schema,
+        partition_spec=partition_spec,
+        properties={TableProperties.DEFAULT_NAME_MAPPING: create_mapping_from_schema(schema).model_dump_json()},
+    )
+
+    file_data = pa.array(["foo"], type=pa.string())
+    file_loc = f"{tmp_path}/test.parquet"
+    pq.write_table(pa.table([file_data], names=["field_1"]), file_loc)
+
+    statistics = data_file_statistics_from_parquet_metadata(
+        parquet_metadata=pq.read_metadata(file_loc),
+        stats_columns=compute_statistics_plan(table.schema(), table.metadata.properties),
+        parquet_column_mapping=parquet_path_to_id_mapping(table.schema()),
+    )
+
+    unpartitioned_file = DataFile(
+        content=DataFileContent.DATA,
+        file_path=file_loc,
+        file_format=FileFormat.PARQUET,
+        # projected value
+        partition=Record(field_2=2, field_3=3),
+        file_size_in_bytes=os.path.getsize(file_loc),
+        sort_order_id=None,
+        spec_id=table.metadata.default_spec_id,
+        equality_ids=None,
+        key_metadata=None,
+        **statistics.to_serialized_dict(),
+    )
+
+    with table.transaction() as transaction:
+        with transaction.update_snapshot().overwrite() as update:
+            update.append_data_file(unpartitioned_file)
+
+    assert (
+        str(table.scan().to_arrow())
+        == """pyarrow.Table
+field_1: large_string
+field_2: int64
+field_3: int64
+----
+field_1: [["foo"]]
+field_2: [[2]]
+field_3: [[3]]"""
+    )
+
+
+@pytest.fixture
+def catalog() -> InMemoryCatalog:
+    return InMemoryCatalog("test.in_memory.catalog", **{"test.key": "test.value"})
+
+
 def test_projection_filter(schema_int: Schema, file_int: str) -> None:
     result_table = project(schema_int, [file_int], GreaterThan("id", 4))
     assert len(result_table.columns[0]) == 0