Skip to content

Commit 45d3eaf

Browse files
committed
Fix memory issues
1 parent 155cdc8 commit 45d3eaf

File tree

2 files changed

+48
-49
lines changed

2 files changed

+48
-49
lines changed

airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py

Lines changed: 44 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import logging
77
import threading
88
import time
9-
from collections import OrderedDict
9+
from collections import deque, OrderedDict
1010
from copy import deepcopy
1111
from datetime import timedelta
1212
from typing import Any, Callable, Iterable, Mapping, MutableMapping, Optional
@@ -99,9 +99,13 @@ def __init__(
9999
self._semaphore_per_partition: OrderedDict[str, threading.Semaphore] = OrderedDict()
100100

101101
# Parent-state tracking: store each partition’s parent state in creation order
102-
self._partition_parent_state_map: OrderedDict[str, Mapping[str, Any]] = OrderedDict()
102+
self._partition_parent_state_map: OrderedDict[str, tuple[Mapping[str, Any], int]] = OrderedDict()
103103

104104
self._finished_partitions: set[str] = set()
105+
self._open_seqs: deque[int] = deque()
106+
self._next_seq: int = 0
107+
self._seq_by_partition: dict[str, int] = {}
108+
105109
self._lock = threading.Lock()
106110
self._timer = Timer()
107111
self._new_global_cursor: Optional[StreamState] = None
@@ -162,55 +166,28 @@ def close_partition(self, partition: Partition) -> None:
162166
):
163167
self._update_global_cursor(cursor.state[self.cursor_field.cursor_field_key])
164168

169+
# Clean up the partition if it is fully processed
170+
self._cleanup_if_done(partition_key)
171+
165172
self._check_and_update_parent_state()
166173

167174
self._emit_state_message()
168175

169176
def _check_and_update_parent_state(self) -> None:
170-
"""
171-
Pop the leftmost partition state from _partition_parent_state_map only if
172-
*all partitions* up to (and including) that partition key in _semaphore_per_partition
173-
are fully finished (i.e. in _finished_partitions and semaphore._value == 0).
174-
Additionally, delete finished semaphores with a value of 0 to free up memory,
175-
as they are only needed to track errors and completion status.
176-
"""
177177
last_closed_state = None
178178

179179
while self._partition_parent_state_map:
180-
# Look at the earliest partition key in creation order
181-
earliest_key = next(iter(self._partition_parent_state_map))
182-
183-
# Verify ALL partitions from the left up to earliest_key are finished
184-
all_left_finished = True
185-
for p_key, sem in list(
186-
self._semaphore_per_partition.items()
187-
): # Use list to allow modification during iteration
188-
# If any earlier partition is still not finished, we must stop
189-
if p_key not in self._finished_partitions or sem._value != 0:
190-
all_left_finished = False
191-
break
192-
# Once we've reached earliest_key in the semaphore order, we can stop checking
193-
if p_key == earliest_key:
194-
break
195-
196-
# If the partitions up to earliest_key are not all finished, break the while-loop
197-
if not all_left_finished:
198-
break
180+
earliest_key, (candidate_state, candidate_seq) = \
181+
next(iter(self._partition_parent_state_map.items()))
199182

200-
# Pop the leftmost entry from parent-state map
201-
_, closed_parent_state = self._partition_parent_state_map.popitem(last=False)
202-
last_closed_state = closed_parent_state
183+
# if any partition that started <= candidate_seq is still open, we must wait
184+
if self._open_seqs and self._open_seqs[0] <= candidate_seq:
185+
break
203186

204-
# Clean up finished semaphores with value 0 up to and including earliest_key
205-
for p_key in list(self._semaphore_per_partition.keys()):
206-
sem = self._semaphore_per_partition[p_key]
207-
if p_key in self._finished_partitions and sem._value == 0:
208-
del self._semaphore_per_partition[p_key]
209-
logger.debug(f"Deleted finished semaphore for partition {p_key} with value 0")
210-
if p_key == earliest_key:
211-
break
187+
# safe to pop
188+
self._partition_parent_state_map.popitem(last=False)
189+
last_closed_state = candidate_state
212190

213-
# Update _parent_state if we popped at least one partition
214191
if last_closed_state is not None:
215192
self._parent_state = last_closed_state
216193

@@ -293,14 +270,19 @@ def _generate_slices_from_partition(
293270
self._semaphore_per_partition[partition_key] = threading.Semaphore(0)
294271

295272
with self._lock:
273+
seq = self._next_seq
274+
self._next_seq += 1
275+
self._open_seqs.append(seq)
276+
self._seq_by_partition[partition_key] = seq
277+
296278
if (
297279
len(self._partition_parent_state_map) == 0
298280
or self._partition_parent_state_map[
299281
next(reversed(self._partition_parent_state_map))
300-
]
282+
][0]
301283
!= parent_state
302284
):
303-
self._partition_parent_state_map[partition_key] = deepcopy(parent_state)
285+
self._partition_parent_state_map[partition_key] = (deepcopy(parent_state), seq)
304286

305287
for cursor_slice, is_last_slice, _ in iterate_with_last_flag_and_state(
306288
cursor.stream_slices(),
@@ -338,10 +320,7 @@ def _ensure_partition_limit(self) -> None:
338320
while len(self._cursor_per_partition) > self.DEFAULT_MAX_PARTITIONS_NUMBER - 1:
339321
# Try removing finished partitions first
340322
for partition_key in list(self._cursor_per_partition.keys()):
341-
if partition_key in self._finished_partitions and (
342-
partition_key not in self._semaphore_per_partition
343-
or self._semaphore_per_partition[partition_key]._value == 0
344-
):
323+
if partition_key not in self._seq_by_partition:
345324
oldest_partition = self._cursor_per_partition.pop(
346325
partition_key
347326
) # Remove the oldest partition
@@ -474,6 +453,25 @@ def _update_global_cursor(self, value: Any) -> None:
474453
):
475454
self._new_global_cursor = {self.cursor_field.cursor_field_key: copy.deepcopy(value)}
476455

456+
def _cleanup_if_done(self, partition_key: str) -> None:
457+
"""
458+
Free every in-memory structure that belonged to a completed partition:
459+
cursor, semaphore, flag inside `_finished_partitions`
460+
"""
461+
if not (
462+
partition_key in self._finished_partitions
463+
and self._semaphore_per_partition[partition_key]._value == 0
464+
):
465+
return
466+
467+
self._semaphore_per_partition.pop(partition_key, None)
468+
self._finished_partitions.discard(partition_key)
469+
470+
seq = self._seq_by_partition.pop(partition_key)
471+
self._open_seqs.remove(seq)
472+
473+
logger.debug(f"Partition {partition_key} fully processed and cleaned up.")
474+
477475
def _to_partition_key(self, partition: Mapping[str, Any]) -> str:
478476
return self._partition_serializer.to_partition_key(partition)
479477

unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3436,8 +3436,8 @@ def test_given_unfinished_first_parent_partition_no_parent_state_update():
34363436
}
34373437
assert mock_cursor_1.stream_slices.call_count == 1 # Called once for each partition
34383438
assert mock_cursor_2.stream_slices.call_count == 1 # Called once for each partition
3439-
assert len(cursor._semaphore_per_partition) == 2
3440-
3439+
assert len(cursor._semaphore_per_partition) == 1 # Semaphore cleaned after partiton is completed
3440+
# ToDo: Add check for other interal values
34413441

34423442
def test_given_unfinished_last_parent_partition_with_partial_parent_state_update():
34433443
# Create two mock cursors with different states for each partition
@@ -3520,7 +3520,8 @@ def test_given_unfinished_last_parent_partition_with_partial_parent_state_update
35203520
}
35213521
assert mock_cursor_1.stream_slices.call_count == 1 # Called once for each partition
35223522
assert mock_cursor_2.stream_slices.call_count == 1 # Called once for each partition
3523-
assert len(cursor._semaphore_per_partition) == 1
3523+
assert len(cursor._semaphore_per_partition) == 0
3524+
# ToDo: Add check for other interal values
35243525

35253526

35263527
def test_given_all_partitions_finished_when_close_partition_then_final_state_emitted():

0 commit comments

Comments
 (0)