Skip to content

Feature: Write to branches #941

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 33 commits into from
Jul 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
d4ca653
s/"main"/MAIN_BRANCH
kevinjqliu Jan 27, 2024
0b7aaaf
replace string literals
kevinjqliu Jan 27, 2024
23f04ec
default writes to main branch
kevinjqliu Jan 27, 2024
af6ff9a
Added some more methods for branches
vinjai Jul 18, 2024
6fbf3f1
s/"main"/MAIN_BRANCH
kevinjqliu Jan 27, 2024
8ce1509
replace string literals
kevinjqliu Jan 27, 2024
6daf29e
default writes to main branch
kevinjqliu Jan 27, 2024
09321cd
Added some more methods for branches
vinjai Jul 18, 2024
60fef31
Merged with master
vinjai Oct 12, 2024
45b01a6
Updated entries for branches
vinjai Oct 12, 2024
917108b
Resolved Merge Conflict
vinjai Oct 12, 2024
917b044
Fixed some bugs
vinjai Oct 14, 2024
398f6c0
Fixed bugs in delete and overwrite
vinjai Oct 15, 2024
b7b8ba0
Added tests and some refactoring
vinjai Oct 16, 2024
ee591b4
Added another integration test
vinjai Oct 16, 2024
e81907d
Fixed bug: concurrent same name branch and tag writes
vinjai Oct 16, 2024
4cf9198
Merge with main branch
vinjai Nov 13, 2024
bc6fb68
Added integration tests with spark
vinjai Nov 14, 2024
82e65e1
Fixed comments for AssertSnapshotRef
vinjai Feb 23, 2025
82e5b90
Fixed comments and linter issues
vinjai Feb 23, 2025
84d0971
Fixed comments
vinjai Feb 23, 2025
3efe53c
Fixed comments
vinjai Feb 23, 2025
dfedc63
Fixed a bug in tests
vinjai Feb 24, 2025
076a6d5
Fixed some more tests
vinjai Feb 24, 2025
53a7f84
Merge branch 'main' into feature/write-to-branch
vinjai May 25, 2025
e4463df
Fixed linter and code errors
vinjai May 25, 2025
49f75b4
Fixed bug for empty tables
vinjai May 26, 2025
4ed0607
Fixed bugs and added more tests
vinjai May 27, 2025
958aac4
changed design context for branch writes
vinjai May 27, 2025
a0aae4d
Merge branch 'main' into feature/write-to-branch
vinjai Jun 3, 2025
76249e9
Merge branch 'main' into feature/write-to-branch
vinjai Jun 23, 2025
079802a
Fixed linter, comments and other bugs
vinjai Jun 24, 2025
f45df8b
Usage of builder pattern
vinjai Jun 24, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pyiceberg/cli/console.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from pyiceberg.cli.output import ConsoleOutput, JsonOutput, Output
from pyiceberg.exceptions import NoSuchNamespaceError, NoSuchPropertyException, NoSuchTableError
from pyiceberg.table import TableProperties
from pyiceberg.table.refs import SnapshotRef
from pyiceberg.table.refs import SnapshotRef, SnapshotRefType
from pyiceberg.utils.properties import property_as_int


Expand Down Expand Up @@ -417,7 +417,7 @@ def list_refs(ctx: Context, identifier: str, type: str, verbose: bool) -> None:
refs = table.refs()
if type:
type = type.lower()
if type not in {"branch", "tag"}:
if type not in {SnapshotRefType.BRANCH, SnapshotRefType.TAG}:
raise ValueError(f"Type must be either branch or tag, got: {type}")

relevant_refs = [
Expand All @@ -431,7 +431,7 @@ def list_refs(ctx: Context, identifier: str, type: str, verbose: bool) -> None:

def _retention_properties(ref: SnapshotRef, table_properties: Dict[str, str]) -> Dict[str, str]:
retention_properties = {}
if ref.snapshot_ref_type == "branch":
if ref.snapshot_ref_type == SnapshotRefType.BRANCH:
default_min_snapshots_to_keep = property_as_int(
table_properties,
TableProperties.MIN_SNAPSHOTS_TO_KEEP,
Expand Down
97 changes: 72 additions & 25 deletions pyiceberg/table/__init__.py

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions pyiceberg/table/update/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
from pyiceberg.partitioning import PARTITION_FIELD_ID_START, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table.metadata import SUPPORTED_TABLE_FORMAT_VERSION, TableMetadata, TableMetadataUtil
from pyiceberg.table.refs import MAIN_BRANCH, SnapshotRef
from pyiceberg.table.refs import MAIN_BRANCH, SnapshotRef, SnapshotRefType
from pyiceberg.table.snapshots import (
MetadataLogEntry,
Snapshot,
Expand Down Expand Up @@ -139,7 +139,7 @@ class AddSnapshotUpdate(IcebergBaseModel):
class SetSnapshotRefUpdate(IcebergBaseModel):
action: Literal["set-snapshot-ref"] = Field(default="set-snapshot-ref")
ref_name: str = Field(alias="ref-name")
type: Literal["tag", "branch"]
type: Literal[SnapshotRefType.TAG, SnapshotRefType.BRANCH]
snapshot_id: int = Field(alias="snapshot-id")
max_ref_age_ms: Annotated[Optional[int], Field(alias="max-ref-age-ms", default=None)]
max_snapshot_age_ms: Annotated[Optional[int], Field(alias="max-snapshot-age-ms", default=None)]
Expand Down Expand Up @@ -702,6 +702,10 @@ class AssertRefSnapshotId(ValidatableTableRequirement):
def validate(self, base_metadata: Optional[TableMetadata]) -> None:
if base_metadata is None:
raise CommitFailedException("Requirement failed: current table metadata is missing")
elif len(base_metadata.snapshots) == 0 and self.ref != MAIN_BRANCH:
raise CommitFailedException(
f"Requirement failed: Table has no snapshots and can only be written to the {MAIN_BRANCH} BRANCH."
)
elif snapshot_ref := base_metadata.refs.get(self.ref):
ref_type = snapshot_ref.snapshot_ref_type
if self.snapshot_id is None:
Expand Down
178 changes: 112 additions & 66 deletions pyiceberg/table/update/snapshot.py

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions pyiceberg/utils/concurrent.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
class ExecutorFactory:
_instance: Optional[Executor] = None

@staticmethod
def max_workers() -> Optional[int]:
"""Return the max number of workers configured."""
return Config().get_int("max-workers")

@staticmethod
def get_or_create() -> Executor:
"""Return the same executor in each call."""
Expand All @@ -33,8 +38,3 @@ def get_or_create() -> Executor:
ExecutorFactory._instance = ThreadPoolExecutor(max_workers=max_workers)

return ExecutorFactory._instance

@staticmethod
def max_workers() -> Optional[int]:
"""Return the max number of workers configured."""
return Config().get_int("max-workers")
29 changes: 29 additions & 0 deletions tests/integration/test_deletes.py
Original file line number Diff line number Diff line change
Expand Up @@ -894,3 +894,32 @@ def test_overwrite_with_filter_case_insensitive(test_table: Table) -> None:
test_table.overwrite(df=new_table, overwrite_filter=f"Idx == {record_to_overwrite['idx']}", case_sensitive=False)
assert record_to_overwrite not in test_table.scan().to_arrow().to_pylist()
assert new_record_to_insert in test_table.scan().to_arrow().to_pylist()


@pytest.mark.integration
@pytest.mark.parametrize("format_version", [1, 2])
@pytest.mark.filterwarnings("ignore:Delete operation did not match any records")
def test_delete_on_empty_table(spark: SparkSession, session_catalog: RestCatalog, format_version: int) -> None:
    """A delete on an empty table matches nothing and must not create a snapshot."""
    identifier = f"default.test_delete_on_empty_table_{format_version}"

    setup_statements = [
        f"DROP TABLE IF EXISTS {identifier}",
        f"""
        CREATE TABLE {identifier} (
            volume int
        )
        USING iceberg
        TBLPROPERTIES('format-version' = {format_version})
    """,
    ]
    run_spark_commands(spark, setup_statements)

    empty_table = session_catalog.load_table(identifier)

    # Delete everything: the table has no rows, so nothing can match.
    empty_table.delete(AlwaysTrue())

    # No rows were removed, hence no new snapshot should have been committed.
    assert len(empty_table.snapshots()) == 0
160 changes: 159 additions & 1 deletion tests/integration/test_writes/test_writes.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,13 @@
from pyiceberg.catalog import Catalog, load_catalog
from pyiceberg.catalog.hive import HiveCatalog
from pyiceberg.catalog.sql import SqlCatalog
from pyiceberg.exceptions import NoSuchTableError
from pyiceberg.exceptions import CommitFailedException, NoSuchTableError
from pyiceberg.expressions import And, EqualTo, GreaterThanOrEqual, In, LessThan, Not
from pyiceberg.io.pyarrow import _dataframe_to_data_files
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table import TableProperties
from pyiceberg.table.refs import MAIN_BRANCH
from pyiceberg.table.sorting import SortDirection, SortField, SortOrder
from pyiceberg.transforms import DayTransform, HourTransform, IdentityTransform
from pyiceberg.types import (
Expand Down Expand Up @@ -1856,3 +1857,160 @@ def test_avro_compression_codecs(session_catalog: Catalog, arrow_table_with_null
with tbl.io.new_input(current_snapshot.manifest_list).open() as f:
reader = fastavro.reader(f)
assert reader.codec == "null"


@pytest.mark.integration
def test_append_to_non_existing_branch(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
    """Appending to a non-existent branch of a snapshot-less table must fail."""
    table = _create_table(session_catalog, "default.test_non_existing_branch", {"format-version": "2"}, [])

    # An empty table only accepts writes to main, so this commit is rejected.
    expected = f"Table has no snapshots and can only be written to the {MAIN_BRANCH} BRANCH."
    with pytest.raises(CommitFailedException, match=expected):
        table.append(arrow_table_with_null, branch="non_existing_branch")


@pytest.mark.integration
def test_append_to_existing_branch(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
    """Append to an existing branch and verify main is left untouched.

    The branch head must be parented on the main snapshot the branch was
    created from.
    """
    identifier = "default.test_existing_branch_append"
    branch = "existing_branch"
    tbl = _create_table(session_catalog, identifier, {"format-version": "2"}, [arrow_table_with_null])

    assert tbl.metadata.current_snapshot_id is not None

    tbl.manage_snapshots().create_branch(snapshot_id=tbl.metadata.current_snapshot_id, branch_name=branch).commit()
    tbl.append(arrow_table_with_null, branch=branch)

    # Branch sees the original 3 rows plus the 3 appended; main still sees 3.
    assert len(tbl.scan().use_ref(branch).to_arrow()) == 6
    assert len(tbl.scan().to_arrow()) == 3
    branch_snapshot = tbl.metadata.snapshot_by_name(branch)
    assert branch_snapshot is not None
    # Use the MAIN_BRANCH constant instead of the "main" literal, consistent
    # with the rest of this change set (s/"main"/MAIN_BRANCH).
    main_snapshot = tbl.metadata.snapshot_by_name(MAIN_BRANCH)
    assert main_snapshot is not None
    assert branch_snapshot.parent_snapshot_id == main_snapshot.snapshot_id


@pytest.mark.integration
def test_delete_to_existing_branch(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
    """Delete on an existing branch and verify main is left untouched.

    The branch head must be parented on the main snapshot the branch was
    created from.
    """
    identifier = "default.test_existing_branch_delete"
    branch = "existing_branch"
    tbl = _create_table(session_catalog, identifier, {"format-version": "2"}, [arrow_table_with_null])

    assert tbl.metadata.current_snapshot_id is not None

    tbl.manage_snapshots().create_branch(snapshot_id=tbl.metadata.current_snapshot_id, branch_name=branch).commit()
    tbl.delete(delete_filter="int = 9", branch=branch)

    # One of the 3 rows matched the filter on the branch; main still sees 3.
    assert len(tbl.scan().use_ref(branch).to_arrow()) == 2
    assert len(tbl.scan().to_arrow()) == 3
    branch_snapshot = tbl.metadata.snapshot_by_name(branch)
    assert branch_snapshot is not None
    # Use the MAIN_BRANCH constant instead of the "main" literal, consistent
    # with the rest of this change set (s/"main"/MAIN_BRANCH).
    main_snapshot = tbl.metadata.snapshot_by_name(MAIN_BRANCH)
    assert main_snapshot is not None
    assert branch_snapshot.parent_snapshot_id == main_snapshot.snapshot_id


@pytest.mark.integration
def test_overwrite_to_existing_branch(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
    """Overwrite on an existing branch and verify main is left untouched.

    An overwrite is currently committed as a delete snapshot followed by an
    append snapshot, so the branch head's grandparent is the main snapshot.
    """
    identifier = "default.test_existing_branch_overwrite"
    branch = "existing_branch"
    tbl = _create_table(session_catalog, identifier, {"format-version": "2"}, [arrow_table_with_null])

    assert tbl.metadata.current_snapshot_id is not None

    tbl.manage_snapshots().create_branch(snapshot_id=tbl.metadata.current_snapshot_id, branch_name=branch).commit()
    tbl.overwrite(arrow_table_with_null, branch=branch)

    # Overwrite replaced the branch contents with the same 3 rows; main still sees 3.
    assert len(tbl.scan().use_ref(branch).to_arrow()) == 3
    assert len(tbl.scan().to_arrow()) == 3
    branch_snapshot = tbl.metadata.snapshot_by_name(branch)
    assert branch_snapshot is not None and branch_snapshot.parent_snapshot_id is not None
    delete_snapshot = tbl.metadata.snapshot_by_id(branch_snapshot.parent_snapshot_id)
    assert delete_snapshot is not None
    # Use the MAIN_BRANCH constant instead of the "main" literal, consistent
    # with the rest of this change set (s/"main"/MAIN_BRANCH).
    main_snapshot = tbl.metadata.snapshot_by_name(MAIN_BRANCH)
    assert main_snapshot is not None
    assert (
        delete_snapshot.parent_snapshot_id == main_snapshot.snapshot_id
    )  # Currently overwrite is a delete followed by an append operation


@pytest.mark.integration
def test_intertwined_branch_writes(session_catalog: Catalog, arrow_table_with_null: pa.Table) -> None:
    """Interleave writes to two branches and main; each ref keeps its own view."""
    identifier = "default.test_intertwined_branch_operations"
    branch1 = "existing_branch_1"
    branch2 = "existing_branch_2"

    table = _create_table(session_catalog, identifier, {"format-version": "2"}, [arrow_table_with_null])
    assert table.metadata.current_snapshot_id is not None

    # Branch off main, then delete a row only on that branch.
    table.manage_snapshots().create_branch(snapshot_id=table.metadata.current_snapshot_id, branch_name=branch1).commit()
    table.delete("int = 9", branch=branch1)

    # Grow main, branch off the new head, and overwrite only the second branch.
    table.append(arrow_table_with_null)
    table.manage_snapshots().create_branch(snapshot_id=table.metadata.current_snapshot_id, branch_name=branch2).commit()
    table.overwrite(arrow_table_with_null, branch=branch2)

    # branch1: 3 rows minus the deleted one; branch2: overwritten back to 3; main: 3 + 3 appended.
    assert len(table.scan().use_ref(branch1).to_arrow()) == 2
    assert len(table.scan().use_ref(branch2).to_arrow()) == 3
    assert len(table.scan().to_arrow()) == 6


@pytest.mark.integration
def test_branch_spark_write_py_read(session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table) -> None:
    """A branch written by Spark must be readable from PyIceberg."""
    identifier = "default.test_branch_spark_write_py_read"
    branch = "existing_spark_branch"
    table = _create_table(session_catalog, identifier, {"format-version": "2"}, [arrow_table_with_null])

    # Create the branch and delete one row on it, both through Spark.
    spark.sql(f"ALTER TABLE {identifier} CREATE BRANCH {branch}")
    spark.sql(
        f"""
        DELETE FROM {identifier}.branch_{branch}
        WHERE int = 9
    """
    )

    # Pick up the refs Spark just committed.
    table.refresh()

    # Python read: main keeps all 3 rows, the branch lost the matching row.
    assert len(table.scan().to_arrow()) == 3
    assert len(table.scan().use_ref(branch).to_arrow()) == 2


@pytest.mark.integration
def test_branch_py_write_spark_read(session_catalog: Catalog, spark: SparkSession, arrow_table_with_null: pa.Table) -> None:
    """A branch written by PyIceberg must be readable from Spark."""
    identifier = "default.test_branch_py_write_spark_read"
    branch = "existing_py_branch"
    table = _create_table(session_catalog, identifier, {"format-version": "2"}, [arrow_table_with_null])

    assert table.metadata.current_snapshot_id is not None

    # Branch off the current snapshot, then delete one row on the branch only.
    table.manage_snapshots().create_branch(snapshot_id=table.metadata.current_snapshot_id, branch_name=branch).commit()
    table.delete("int = 9", branch=branch)

    # Spark read: main keeps all 3 rows, the branch lost the matching row.
    main_df = spark.sql(
        f"""
        SELECT *
        FROM {identifier}
    """
    )
    branch_df = spark.sql(
        f"""
        SELECT *
        FROM {identifier}.branch_{branch}
    """
    )
    assert main_df.count() == 3
    assert branch_df.count() == 2
32 changes: 23 additions & 9 deletions tests/table/test_init.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
_match_deletes_to_data_file,
)
from pyiceberg.table.metadata import INITIAL_SEQUENCE_NUMBER, TableMetadataUtil, TableMetadataV2, _generate_snapshot_id
from pyiceberg.table.refs import SnapshotRef
from pyiceberg.table.refs import MAIN_BRANCH, SnapshotRef, SnapshotRefType
from pyiceberg.table.snapshots import (
MetadataLogEntry,
Operation,
Expand Down Expand Up @@ -1000,28 +1000,42 @@ def test_assert_table_uuid(table_v2: Table) -> None:

def test_assert_ref_snapshot_id(table_v2: Table) -> None:
    """Exercise AssertRefSnapshotId validation for branches and tags.

    Covers: matching snapshot id, missing metadata, concurrent branch
    creation, stale branch id, missing ref, and stale tag id.
    """
    base_metadata = table_v2.metadata
    # Happy path: the ref points at the current snapshot.
    AssertRefSnapshotId(ref=MAIN_BRANCH, snapshot_id=base_metadata.current_snapshot_id).validate(base_metadata)

    with pytest.raises(CommitFailedException, match="Requirement failed: current table metadata is missing"):
        AssertRefSnapshotId(ref=MAIN_BRANCH, snapshot_id=1).validate(None)

    with pytest.raises(
        CommitFailedException,
        match=f"Requirement failed: branch {MAIN_BRANCH} was created concurrently",
    ):
        AssertRefSnapshotId(ref=MAIN_BRANCH, snapshot_id=None).validate(base_metadata)

    with pytest.raises(
        CommitFailedException,
        match=f"Requirement failed: branch {MAIN_BRANCH} has changed: expected id 1, found 3055729675574597004",
    ):
        AssertRefSnapshotId(ref=MAIN_BRANCH, snapshot_id=1).validate(base_metadata)

    non_existing_ref = "not_exist_branch_or_tag"
    # Reuse the variable rather than repeating the literal so the lookup and
    # the expected error message below cannot drift apart.
    assert table_v2.refs().get(non_existing_ref) is None

    with pytest.raises(
        CommitFailedException,
        match=f"Requirement failed: branch or tag {non_existing_ref} is missing, expected 1",
    ):
        AssertRefSnapshotId(ref=non_existing_ref, snapshot_id=1).validate(base_metadata)

    # existing Tag in metadata: test
    ref_tag = table_v2.refs().get("test")
    assert ref_tag is not None
    assert ref_tag.snapshot_ref_type == SnapshotRefType.TAG, "TAG test should be present in table to be tested"

    with pytest.raises(
        CommitFailedException,
        match="Requirement failed: tag test has changed: expected id 3055729675574597004, found 3051729675574597004",
    ):
        AssertRefSnapshotId(ref="test", snapshot_id=3055729675574597004).validate(base_metadata)


def test_assert_last_assigned_field_id(table_v2: Table) -> None:
Expand Down