
Commit e24541b

kevinjqliu authored and HonahX committed
Cast data to Iceberg Table's pyarrow schema (apache#523)
Backport to 0.6.1
1 parent: b9362ee

4 files changed: +70 −9 lines

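The change is easiest to see at the Arrow level: incoming data can be compatible with the table's schema in Iceberg terms (for example, large_string vs. string) without the two Arrow schemas being equal. Below is a minimal pyarrow-only sketch of the cast this commit introduces; the target schema is a hypothetical stand-in for the table's Arrow schema.

import pyarrow as pa

# Incoming data uses large_string; the table's Arrow schema uses string.
# Iceberg treats both as its `string` type, so they are compatible.
df = pa.table({"foo": pa.array(["A", "B", None], type=pa.large_string())})
table_arrow_schema = pa.schema([pa.field("foo", pa.string(), nullable=True)])

# The commit's core idea: once compatibility is established, cast the
# data to the table's schema when the two are not already equal.
if df.schema != table_arrow_schema:
    df = df.cast(table_arrow_schema)

print(df.schema)  # foo: string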

pyiceberg/io/pyarrow.py

Lines changed: 1 addition & 1 deletion
@@ -1721,7 +1721,7 @@ def write_file(table: Table, tasks: Iterator[WriteTask]) -> Iterator[DataFile]:
     parquet_writer_kwargs = _get_parquet_writer_kwargs(table.properties)

     file_path = f'{table.location()}/data/{task.generate_data_file_filename("parquet")}'
-    file_schema = schema_to_pyarrow(table.schema())
+    file_schema = table.schema().as_arrow()

     fo = table.io.new_output(file_path)
     row_group_size = PropertyUtil.property_as_int(
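This one-line change matters because pyarrow's ParquetWriter validates each table written against the schema the file was opened with, so a compatible-but-unequal Arrow schema fails at write time rather than at the compatibility check. A short sketch of the failure mode this avoids, together with the cast in append()/overwrite(); the file path is arbitrary:

import pyarrow as pa
import pyarrow.parquet as pq

file_schema = pa.schema([pa.field("foo", pa.string(), nullable=True)])
df = pa.table({"foo": pa.array(["A"], type=pa.large_string())})

with pq.ParquetWriter("/tmp/example.parquet", file_schema) as writer:
    # writer.write_table(df) would raise ValueError, since the table's
    # schema differs from the schema the file was opened with.
    writer.write_table(df.cast(file_schema))  # casting first succeeds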

pyiceberg/table/__init__.py

Lines changed: 17 additions & 3 deletions
@@ -132,7 +132,15 @@
 _JAVA_LONG_MAX = 9223372036854775807


-def _check_schema(table_schema: Schema, other_schema: "pa.Schema") -> None:
+def _check_schema_compatible(table_schema: Schema, other_schema: "pa.Schema") -> None:
+    """
+    Check if the `table_schema` is compatible with `other_schema`.
+
+    Two schemas are considered compatible when they are equal in terms of the Iceberg Schema type.
+
+    Raises:
+        ValueError: If the schemas are not compatible.
+    """
     from pyiceberg.io.pyarrow import _pyarrow_to_schema_without_ids, pyarrow_to_schema

     name_mapping = table_schema.name_mapping

@@ -1044,7 +1052,10 @@ def append(self, df: pa.Table) -> None:
         if len(self.spec().fields) > 0:
             raise ValueError("Cannot write to partitioned tables")

-        _check_schema(self.schema(), other_schema=df.schema)
+        _check_schema_compatible(self.schema(), other_schema=df.schema)
+        # cast if the two schemas are compatible but not equal
+        if self.schema().as_arrow() != df.schema:
+            df = df.cast(self.schema().as_arrow())

         merge = _MergingSnapshotProducer(operation=Operation.APPEND, table=self)

@@ -1079,7 +1090,10 @@ def overwrite(self, df: pa.Table, overwrite_filter: BooleanExpression = ALWAYS_TRUE) -> None:
         if len(self.spec().fields) > 0:
             raise ValueError("Cannot write to partitioned tables")

-        _check_schema(self.schema(), other_schema=df.schema)
+        _check_schema_compatible(self.schema(), other_schema=df.schema)
+        # cast if the two schemas are compatible but not equal
+        if self.schema().as_arrow() != df.schema:
+            df = df.cast(self.schema().as_arrow())

         merge = _MergingSnapshotProducer(
             operation=Operation.OVERWRITE if self.current_snapshot() is not None else Operation.APPEND,
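The guard added to both append() and overwrite() reads the same way in isolation. A standalone sketch follows; align_to_table_schema is a name invented for illustration, not part of the PyIceberg API:

import pyarrow as pa

def align_to_table_schema(df: pa.Table, table_arrow_schema: pa.Schema) -> pa.Table:
    # _check_schema_compatible() has already validated compatibility at
    # the Iceberg level, so this cast is expected to succeed; pyarrow
    # still raises if a value cannot be represented in the target type.
    if df.schema != table_arrow_schema:
        return df.cast(table_arrow_schema)
    return df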

tests/catalog/test_sql.py

Lines changed: 33 additions & 0 deletions
@@ -191,6 +191,39 @@ def test_create_table_with_pyarrow_schema(
     catalog.drop_table(random_identifier)


+@pytest.mark.parametrize(
+    'catalog',
+    [
+        lazy_fixture('catalog_memory'),
+        # lazy_fixture('catalog_sqlite'),
+    ],
+)
+def test_write_pyarrow_schema(catalog: SqlCatalog, random_identifier: Identifier) -> None:
+    import pyarrow as pa
+
+    pyarrow_table = pa.Table.from_arrays(
+        [
+            pa.array([None, "A", "B", "C"]),  # 'foo' column
+            pa.array([1, 2, 3, 4]),  # 'bar' column
+            pa.array([True, None, False, True]),  # 'baz' column
+            pa.array([None, "A", "B", "C"]),  # 'large' column
+        ],
+        schema=pa.schema([
+            pa.field('foo', pa.string(), nullable=True),
+            pa.field('bar', pa.int32(), nullable=False),
+            pa.field('baz', pa.bool_(), nullable=True),
+            pa.field('large', pa.large_string(), nullable=True),
+        ]),
+    )
+    database_name, _table_name = random_identifier
+    catalog.create_namespace(database_name)
+    table = catalog.create_table(random_identifier, pyarrow_table.schema)
+    print(pyarrow_table.schema)
+    print(table.schema().as_struct())
+    print()
+    table.overwrite(pyarrow_table)
+
+
 @pytest.mark.parametrize(
     'catalog',
     [
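To reproduce this test outside the suite, a comparable catalog can be stood up directly. The sketch below is modeled on the module's fixtures; the catalog name and warehouse path are illustrative choices, not values from the commit:

from pathlib import Path

from pyiceberg.catalog.sql import SqlCatalog

warehouse = Path("/tmp/warehouse")  # illustrative location for data files
warehouse.mkdir(parents=True, exist_ok=True)

# SQLite metadata in memory, data files under a local warehouse directory.
catalog = SqlCatalog(
    "test_sql_catalog",
    uri="sqlite:///:memory:",
    warehouse=f"file://{warehouse}",
)
catalog.create_tables()  # create the catalog's backing tables, as the fixtures do

The test body above can then run against this catalog as-is.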

tests/table/test_init.py

Lines changed: 19 additions & 5 deletions
@@ -58,7 +58,7 @@
     Table,
     UpdateSchema,
     _apply_table_update,
-    _check_schema,
+    _check_schema_compatible,
     _generate_snapshot_id,
     _match_deletes_to_data_file,
     _TableMetadataUpdateContext,

@@ -1004,7 +1004,7 @@ def test_schema_mismatch_type(table_schema_simple: Schema) -> None:
     """

     with pytest.raises(ValueError, match=expected):
-        _check_schema(table_schema_simple, other_schema)
+        _check_schema_compatible(table_schema_simple, other_schema)


 def test_schema_mismatch_nullability(table_schema_simple: Schema) -> None:

@@ -1025,7 +1025,7 @@ def test_schema_mismatch_nullability(table_schema_simple: Schema) -> None:
     """

     with pytest.raises(ValueError, match=expected):
-        _check_schema(table_schema_simple, other_schema)
+        _check_schema_compatible(table_schema_simple, other_schema)


 def test_schema_mismatch_missing_field(table_schema_simple: Schema) -> None:

@@ -1045,7 +1045,7 @@ def test_schema_mismatch_missing_field(table_schema_simple: Schema) -> None:
     """

     with pytest.raises(ValueError, match=expected):
-        _check_schema(table_schema_simple, other_schema)
+        _check_schema_compatible(table_schema_simple, other_schema)


 def test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None:

@@ -1059,4 +1059,18 @@ def test_schema_mismatch_additional_field(table_schema_simple: Schema) -> None:
     expected = r"PyArrow table contains more columns: new_field. Update the schema first \(hint, use union_by_name\)."

     with pytest.raises(ValueError, match=expected):
-        _check_schema(table_schema_simple, other_schema)
+        _check_schema_compatible(table_schema_simple, other_schema)
+
+
+def test_schema_downcast(table_schema_simple: Schema) -> None:
+    # large_string type is compatible with string type
+    other_schema = pa.schema((
+        pa.field("foo", pa.large_string(), nullable=True),
+        pa.field("bar", pa.int32(), nullable=False),
+        pa.field("baz", pa.bool_(), nullable=True),
+    ))
+
+    try:
+        _check_schema_compatible(table_schema_simple, other_schema)
+    except Exception:
+        pytest.fail("Unexpected Exception raised when calling `_check_schema`")
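test_schema_downcast passes because the compatibility check compares schemas after converting both sides to Iceberg types, where string and large_string collapse into the same type. A sketch of that normalization using the private helper the check itself imports (private API, subject to change):

import pyarrow as pa

from pyiceberg.io.pyarrow import _pyarrow_to_schema_without_ids

small = pa.schema([pa.field("foo", pa.string(), nullable=True)])
large = pa.schema([pa.field("foo", pa.large_string(), nullable=True)])

# Both Arrow schemas normalize to an Iceberg schema with one optional
# `string` field, so _check_schema_compatible treats them as equal.
assert _pyarrow_to_schema_without_ids(small) == _pyarrow_to_schema_without_ids(large)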
