Closed
Description
Apache Iceberg version
0.7.0
Please describe the bug 🐞
There is a regression introduced in version 0.7.0 where Arrow tables written with a "string" data type get cast to "large_string" when read back from Iceberg.
The code below reproduces the bug. The assertion succeeds in v0.6.1, but fails in 0.7.0 because the schema is being changed from "string" to "large_string".
from tempfile import TemporaryDirectory
import pyarrow
from pyiceberg.catalog.sql import SqlCatalog
def main():
    """Reproduce the "string" -> "large_string" round-trip change.

    Creates a SQLite-backed ``SqlCatalog`` inside a temporary directory,
    appends a one-row Arrow table with a single nullable "string" column
    named "foo" to a new Iceberg table, scans the table back into Arrow,
    and asserts that the result equals what was written.
    """
    with TemporaryDirectory() as warehouse_path:
        catalog_properties = {
            "uri": f"sqlite:///{warehouse_path}/pyiceberg_catalog.db",
            "warehouse": f"file://{warehouse_path}",
        }
        catalog = SqlCatalog("default", **catalog_properties)
        catalog.create_namespace("default")

        # The input table is explicitly typed as "string" (not "large_string").
        string_field = pyarrow.field("foo", pyarrow.string(), nullable=True)
        written = pyarrow.table(
            data={"foo": ["bar"]},
            schema=pyarrow.schema([string_field]),
        )

        iceberg_table = catalog.create_table(
            "default.test_table",
            schema=written.schema,
        )
        iceberg_table.append(written)

        # Read the Arrow table back from Iceberg.
        read_back = iceberg_table.scan().to_arrow()

        # Per the report, this assertion passes on 0.6.1 but fails on 0.7.0
        # because the column type comes back as "large_string" instead of
        # "string".
        assert written.equals(read_back)
# Run the reproduction only when this file is executed as a script.
if __name__ == "__main__":
    main()