Skip to content

Commit 0508667

Browse files
authored
Add support for categorical type (#693)
1 parent 990ce80 commit 0508667

File tree

3 files changed

+48
-0
lines changed

3 files changed

+48
-0
lines changed

pyiceberg/io/pyarrow.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -731,6 +731,16 @@ def _(obj: pa.MapType, visitor: PyArrowSchemaVisitor[T]) -> T:
731731
return visitor.map(obj, key_result, value_result)
732732

733733

734+
@visit_pyarrow.register(pa.DictionaryType)
def _(obj: pa.DictionaryType, visitor: PyArrowSchemaVisitor[T]) -> T:
    """Visit an Arrow dictionary type by delegating to its value type.

    Parquet has no dictionary type: dictionary-encoding is handled as an
    encoding detail, not as a separate type. We follow this approach in
    determining the Iceberg type, as we only support Parquet in PyIceberg
    for now.
    """
    # Warn the user that the dictionary wrapper is dropped and the column
    # will come back as the plain value type on read.
    logger.warning(f"Iceberg does not have a dictionary type. {type(obj)} will be inferred as {obj.value_type} on read.")
    # Recurse into the underlying value type to produce the Iceberg type.
    return visit_pyarrow(obj.value_type, visitor)
742+
743+
734744
@visit_pyarrow.register(pa.DataType)
735745
def _(obj: pa.DataType, visitor: PyArrowSchemaVisitor[T]) -> T:
736746
if pa.types.is_nested(obj):

tests/integration/test_writes/test_writes.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,30 @@ def test_python_writes_special_character_column_with_spark_reads(
315315
assert spark_df.equals(pyiceberg_df)
316316

317317

318+
@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
def test_python_writes_dictionary_encoded_column_with_spark_reads(
    spark: SparkSession, session_catalog: Catalog, format_version: int
) -> None:
    """Write dictionary-encoded columns with PyIceberg and read them back via Spark.

    The dictionary encoding is an Arrow-level detail, so the table written by
    PyIceberg must look identical whether scanned by Spark or by PyIceberg itself.
    """
    identifier = "default.python_writes_dictionary_encoded_column_with_spark_reads"

    # Repeated values make the dictionary encoding meaningful.
    data = {
        'id': [1, 2, 3, 1, 1],
        'name': ['AB', 'CD', 'EF', 'CD', 'EF'],
    }
    fields = [
        pa.field('id', pa.dictionary(pa.int32(), pa.int32(), False)),
        pa.field('name', pa.dictionary(pa.int32(), pa.string(), False)),
    ]
    schema = pa.schema(fields)
    arrow_table = pa.Table.from_pydict(data, schema=schema)

    tbl = _create_table(session_catalog, identifier, {"format-version": format_version}, schema=schema)
    tbl.overwrite(arrow_table)

    # Spark's view of the table must match PyIceberg's own scan.
    spark_df = spark.sql(f"SELECT * FROM {identifier}").toPandas()
    pyiceberg_df = tbl.scan().to_pandas()
    assert spark_df.equals(pyiceberg_df)
340+
341+
318342
@pytest.mark.integration
319343
def test_write_bin_pack_data_files(spark: SparkSession, session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
320344
identifier = "default.write_bin_pack_data_files"

tests/io/test_pyarrow_visitor.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
DoubleType,
4040
FixedType,
4141
FloatType,
42+
IcebergType,
4243
IntegerType,
4344
ListType,
4445
LongType,
@@ -280,6 +281,19 @@ def test_pyarrow_map_to_iceberg() -> None:
280281
assert visit_pyarrow(pyarrow_map, _ConvertToIceberg()) == expected
281282

282283

284+
@pytest.mark.parametrize(
    "value_type, expected_result",
    [
        (pa.string(), StringType()),
        (pa.int32(), IntegerType()),
        (pa.float64(), DoubleType()),
    ],
)
def test_pyarrow_dictionary_encoded_type_to_iceberg(value_type: pa.DataType, expected_result: IcebergType) -> None:
    """A dictionary type converts to the Iceberg type of its value type."""
    dict_type = pa.dictionary(pa.int32(), value_type)
    converted = visit_pyarrow(dict_type, _ConvertToIceberg())
    assert converted == expected_result
295+
296+
283297
def test_round_schema_conversion_simple(table_schema_simple: Schema) -> None:
284298
actual = str(pyarrow_to_schema(schema_to_pyarrow(table_schema_simple)))
285299
expected = """table {

0 commit comments

Comments (0)