
Commit 9f743f2

Author: Oleksandr Bazarnov (committed)
Merge remote-tracking branch 'origin/main' into baz/cdk/add-deprecations-module
2 parents: c94363e + b148ca5

File tree: 8 files changed (+201, -111 lines)


.github/workflows/connector-tests.yml

Lines changed: 3 additions & 3 deletions
@@ -75,11 +75,11 @@ jobs:
           # Chargebee is being flaky:
           # - connector: source-chargebee
           #   cdk_extra: n/a
-          # These two are behind in CDK updates and can't be used as tests until they are updated:
-          # - connector: source-s3
-          #   cdk_extra: file-based
+          # This one is behind in CDK updates and can't be used as tests until it is updated:
           # - connector: destination-pinecone
           #   cdk_extra: vector-db-based
+          - connector: source-google-drive
+            cdk_extra: file-based
           - connector: destination-motherduck
             cdk_extra: sql
           # ZenDesk currently failing (as of 2024-12-02)

airbyte_cdk/sources/declarative/concurrent_declarative_source.py

Lines changed: 6 additions & 2 deletions
@@ -19,7 +19,10 @@
 from airbyte_cdk.sources.declarative.extractors.record_filter import (
     ClientSideIncrementalRecordFilterDecorator,
 )
-from airbyte_cdk.sources.declarative.incremental import ConcurrentPerPartitionCursor
+from airbyte_cdk.sources.declarative.incremental import (
+    ConcurrentPerPartitionCursor,
+    GlobalSubstreamCursor,
+)
 from airbyte_cdk.sources.declarative.incremental.datetime_based_cursor import DatetimeBasedCursor
 from airbyte_cdk.sources.declarative.incremental.per_partition_with_global import (
     PerPartitionWithGlobalCursor,
@@ -361,7 +364,8 @@ def _group_streams(
                 == DatetimeBasedCursorModel.__name__
                 and hasattr(declarative_stream.retriever, "stream_slicer")
                 and isinstance(
-                    declarative_stream.retriever.stream_slicer, PerPartitionWithGlobalCursor
+                    declarative_stream.retriever.stream_slicer,
+                    (GlobalSubstreamCursor, PerPartitionWithGlobalCursor),
                 )
             ):
                 stream_state = self._connector_state_manager.get_stream_state(
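
The broadened isinstance check means streams whose retriever slicer is either a GlobalSubstreamCursor or a PerPartitionWithGlobalCursor are now routed onto the concurrent per-partition cursor path. A minimal sketch of that dispatch, using hypothetical stand-in classes rather than the real airbyte_cdk types:

# Sketch only: stand-in classes, not the real airbyte_cdk cursor/slicer types.
class GlobalSubstreamCursor: ...


class PerPartitionWithGlobalCursor: ...


class SomeOtherSlicer: ...


def takes_concurrent_perpartition_path(stream_slicer: object) -> bool:
    # Passing a tuple of types to isinstance matches either one,
    # so GlobalSubstreamCursor streams are now included as well.
    return isinstance(stream_slicer, (GlobalSubstreamCursor, PerPartitionWithGlobalCursor))


print(takes_concurrent_perpartition_path(GlobalSubstreamCursor()))         # True (newly included)
print(takes_concurrent_perpartition_path(PerPartitionWithGlobalCursor()))  # True (as before)
print(takes_concurrent_perpartition_path(SomeOtherSlicer()))               # False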

airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

Lines changed: 3 additions & 1 deletion
@@ -1471,7 +1471,9 @@ def create_concurrent_cursor_from_perpartition_cursor(
         stream_state = self.apply_stream_state_migrations(stream_state_migrations, stream_state)
 
         # Per-partition state doesn't make sense for GroupingPartitionRouter, so force the global state
-        use_global_cursor = isinstance(partition_router, GroupingPartitionRouter)
+        use_global_cursor = isinstance(
+            partition_router, GroupingPartitionRouter
+        ) or component_definition.get("global_substream_cursor", False)
 
         # Return the concurrent cursor and state converter
         return ConcurrentPerPartitionCursor(
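
Together with the import change above, the factory now also forces the global cursor when the component definition carries a global_substream_cursor flag, not only when a GroupingPartitionRouter is used. A hedged sketch of that lookup on a manifest fragment written as a plain Python dict; the fragment mirrors what the new unit test below sets on post_comments_stream, while the other field values are illustrative and the real factory receives the full parsed component definition:

# Sketch only: an incremental_sync fragment as a plain dict.
component_definition = {
    "type": "DatetimeBasedCursor",
    "cursor_field": "updated_at",
    "global_substream_cursor": True,  # the opt-in flag read by the factory
}

# Stand-in for `isinstance(partition_router, GroupingPartitionRouter)` in the real method.
is_grouping_partition_router = False

use_global_cursor = is_grouping_partition_router or component_definition.get(
    "global_substream_cursor", False
)
print(use_global_cursor)  # True: the flag alone is enough to force global state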

airbyte_cdk/sources/declarative/requesters/query_properties/property_chunking.py

Lines changed: 2 additions & 0 deletions
@@ -52,8 +52,10 @@ def get_request_property_chunks(
         chunk_size = 0
         for property_field in property_fields:
             # If property_limit_type is not defined, we default to property_count which is just an incrementing count
+            # todo: Add ability to specify parameter delimiter representation and take into account in property_field_size
             property_field_size = (
                 len(property_field)
+                + 3  # The +3 represents the extra characters for encoding the delimiter in between properties
                 if self.property_limit_type == PropertyLimitType.characters
                 else 1
             )
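
The new + 3 accounts for the URL-encoded comma (%2C) that separates properties in the request, so character-based limits are measured against the encoded query value rather than the raw field names alone. A standalone sketch of that accounting (not the CDK's PropertyChunking class itself), reproducing the new test case further down where a limit of 17 forces one property per chunk:

from typing import Iterable, List

DELIMITER_OVERHEAD = 3  # "%2C", the URL-encoded comma placed between properties


def chunk_properties(properties: Iterable[str], character_limit: int) -> List[List[str]]:
    """Greedy character-based chunking that counts the encoded delimiter per field (sketch)."""
    chunks: List[List[str]] = []
    current: List[str] = []
    size = 0
    for prop in properties:
        field_size = len(prop) + DELIMITER_OVERHEAD
        if current and size + field_size > character_limit:
            chunks.append(current)
            current, size = [], 0
        current.append(prop)
        size += field_size
    if current:
        chunks.append(current)
    return chunks


# "laurie%2Cjaclyn%2C" already counts as 18 characters, so a limit of 17
# yields one field per chunk, matching test_property_chunking_includes_extra_delimiter.
print(chunk_properties(["laurie", "jaclyn", "kaitlin"], 17))
# [['laurie'], ['jaclyn'], ['kaitlin']]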

airbyte_cdk/sources/declarative/retrievers/simple_retriever.py

Lines changed: 106 additions & 101 deletions
@@ -10,6 +10,7 @@
 from typing import (
     Any,
     Callable,
+    Dict,
     Iterable,
     List,
     Mapping,
@@ -367,14 +368,65 @@ def _read_pages(
             {"next_page_token": initial_token} if initial_token is not None else None
         )
         while not pagination_complete:
-            response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
+            property_chunks: List[List[str]] = (
+                list(
+                    self.additional_query_properties.get_request_property_chunks(
+                        stream_slice=stream_slice
+                    )
+                )
+                if self.additional_query_properties
+                else [
+                    []
+                ]  # A single empty property chunk represents the case where property chunking is not configured
+            )
 
+            merged_records: MutableMapping[str, Any] = defaultdict(dict)
             last_page_size = 0
             last_record: Optional[Record] = None
-            for record in records_generator_fn(response):
-                last_page_size += 1
-                last_record = record
-                yield record
+            response: Optional[requests.Response] = None
+            for properties in property_chunks:
+                if len(properties) > 0:
+                    stream_slice = StreamSlice(
+                        partition=stream_slice.partition or {},
+                        cursor_slice=stream_slice.cursor_slice or {},
+                        extra_fields={"query_properties": properties},
+                    )
+
+                response = self._fetch_next_page(stream_state, stream_slice, next_page_token)
+                for current_record in records_generator_fn(response):
+                    if (
+                        current_record
+                        and self.additional_query_properties
+                        and self.additional_query_properties.property_chunking
+                    ):
+                        merge_key = (
+                            self.additional_query_properties.property_chunking.get_merge_key(
+                                current_record
+                            )
+                        )
+                        if merge_key:
+                            _deep_merge(merged_records[merge_key], current_record)
+                        else:
+                            # We should still emit records even if the record did not have a merge key
+                            last_page_size += 1
+                            last_record = current_record
+                            yield current_record
+                    else:
+                        last_page_size += 1
+                        last_record = current_record
+                        yield current_record
+
+            if (
+                self.additional_query_properties
+                and self.additional_query_properties.property_chunking
+            ):
+                for merged_record in merged_records.values():
+                    record = Record(
+                        data=merged_record, stream_name=self.name, associated_slice=stream_slice
+                    )
+                    last_page_size += 1
+                    last_record = record
+                    yield record
 
             if not response:
                 pagination_complete = True
@@ -449,110 +501,43 @@ def read_records(
         :param stream_slice: The stream slice to read data for
         :return: The records read from the API source
         """
-
-        property_chunks = (
-            list(
-                self.additional_query_properties.get_request_property_chunks(
-                    stream_slice=stream_slice
-                )
-            )
-            if self.additional_query_properties
-            else []
-        )
-        records_without_merge_key = []
-        merged_records: MutableMapping[str, Any] = defaultdict(dict)
-
         _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
+
         most_recent_record_from_slice = None
+        record_generator = partial(
+            self._parse_records,
+            stream_slice=stream_slice,
+            stream_state=self.state or {},
+            records_schema=records_schema,
+        )
 
-        if self.additional_query_properties:
-            for properties in property_chunks:
-                _slice = StreamSlice(
-                    partition=_slice.partition or {},
-                    cursor_slice=_slice.cursor_slice or {},
-                    extra_fields={"query_properties": properties},
-                )  # None-check
-
-                record_generator = partial(
-                    self._parse_records,
-                    stream_slice=_slice,
-                    stream_state=self.state or {},
-                    records_schema=records_schema,
-                )
+        if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
+            stream_state = self.state
 
-                for stream_data in self._read_pages(record_generator, self.state, _slice):
-                    current_record = self._extract_record(stream_data, _slice)
-                    if self.cursor and current_record:
-                        self.cursor.observe(_slice, current_record)
+            # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
+            # fetch more records. The platform deletes stream state for full refresh streams before starting a
+            # new job, so we don't need to worry about this value existing for the initial attempt
+            if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
+                return
 
-                    # Latest record read, not necessarily within slice boundaries.
-                    # TODO Remove once all custom components implement `observe` method.
-                    # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
-                    most_recent_record_from_slice = self._get_most_recent_record(
-                        most_recent_record_from_slice, current_record, _slice
-                    )
+            yield from self._read_single_page(record_generator, stream_state, _slice)
+        else:
+            for stream_data in self._read_pages(record_generator, self.state, _slice):
+                current_record = self._extract_record(stream_data, _slice)
+                if self.cursor and current_record:
+                    self.cursor.observe(_slice, current_record)
+
+                # Latest record read, not necessarily within slice boundaries.
+                # TODO Remove once all custom components implement `observe` method.
+                # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
+                most_recent_record_from_slice = self._get_most_recent_record(
+                    most_recent_record_from_slice, current_record, _slice
+                )
+                yield stream_data
 
-                    if current_record and self.additional_query_properties.property_chunking:
-                        merge_key = (
-                            self.additional_query_properties.property_chunking.get_merge_key(
-                                current_record
-                            )
-                        )
-                        if merge_key:
-                            merged_records[merge_key].update(current_record)
-                        else:
-                            # We should still emit records even if the record did not have a merge key
-                            records_without_merge_key.append(current_record)
-                    else:
-                        yield stream_data
             if self.cursor:
                 self.cursor.close_slice(_slice, most_recent_record_from_slice)
-
-            if len(merged_records) > 0:
-                yield from [
-                    Record(data=merged_record, stream_name=self.name, associated_slice=stream_slice)
-                    for merged_record in merged_records.values()
-                ]
-            if len(records_without_merge_key) > 0:
-                yield from records_without_merge_key
-        else:
-            _slice = stream_slice or StreamSlice(partition={}, cursor_slice={})  # None-check
-
-            most_recent_record_from_slice = None
-            record_generator = partial(
-                self._parse_records,
-                stream_slice=stream_slice,
-                stream_state=self.state or {},
-                records_schema=records_schema,
-            )
-
-            if self.cursor and isinstance(self.cursor, ResumableFullRefreshCursor):
-                stream_state = self.state
-
-                # Before syncing the RFR stream, we check if the job's prior attempt was successful and don't need to
-                # fetch more records. The platform deletes stream state for full refresh streams before starting a
-                # new job, so we don't need to worry about this value existing for the initial attempt
-                if stream_state.get(FULL_REFRESH_SYNC_COMPLETE_KEY):
-                    return
-
-                yield from self._read_single_page(record_generator, stream_state, _slice)
-            else:
-                for stream_data in self._read_pages(record_generator, self.state, _slice):
-                    current_record = self._extract_record(stream_data, _slice)
-                    if self.cursor and current_record:
-                        self.cursor.observe(_slice, current_record)
-
-                    # Latest record read, not necessarily within slice boundaries.
-                    # TODO Remove once all custom components implement `observe` method.
-                    # https://github.com/airbytehq/airbyte-internal-issues/issues/6955
-                    most_recent_record_from_slice = self._get_most_recent_record(
-                        most_recent_record_from_slice, current_record, _slice
-                    )
-                    yield stream_data
-
-                if self.cursor:
-                    self.cursor.close_slice(_slice, most_recent_record_from_slice)
-            return
+        return
 
     def _get_most_recent_record(
         self,
@@ -639,6 +624,26 @@ def _to_partition_key(to_serialize: Any) -> str:
     return json.dumps(to_serialize, indent=None, separators=(",", ":"), sort_keys=True)
 
 
+def _deep_merge(
+    target: MutableMapping[str, Any], source: Union[Record, MutableMapping[str, Any]]
+) -> None:
+    """
+    Recursively merge two dictionaries, combining nested dictionaries instead of overwriting them.
+
+    :param target: The dictionary to merge into (modified in place)
+    :param source: The dictionary to merge from
+    """
+    for key, value in source.items():
+        if (
+            key in target
+            and isinstance(target[key], MutableMapping)
+            and isinstance(value, MutableMapping)
+        ):
+            _deep_merge(target[key], value)
+        else:
+            target[key] = value
+
+
 @dataclass
 class SimpleRetrieverTestReadDecorator(SimpleRetriever):
     """
unit_tests/sources/declarative/incremental/test_concurrent_perpartitioncursor.py

Lines changed: 45 additions & 0 deletions
@@ -3449,3 +3449,48 @@ def test_semaphore_cleanup():
     assert '{"id":"2"}' not in cursor._semaphore_per_partition
     assert len(cursor._partition_parent_state_map) == 0  # All parent states should be popped
     assert cursor._parent_state == {"parent": {"state": "state2"}}  # Last parent state
+
+
+def test_given_global_state_when_read_then_state_is_not_per_partition() -> None:
+    manifest = deepcopy(SUBSTREAM_MANIFEST)
+    manifest["definitions"]["post_comments_stream"]["incremental_sync"][
+        "global_substream_cursor"
+    ] = True
+    manifest["streams"].remove({"$ref": "#/definitions/post_comment_votes_stream"})
+    record = {
+        "id": 9,
+        "post_id": 1,
+        "updated_at": COMMENT_10_UPDATED_AT,
+    }
+    mock_requests = [
+        (
+            f"https://api.example.com/community/posts?per_page=100&start_time={START_DATE}",
+            {
+                "posts": [
+                    {"id": 1, "updated_at": POST_1_UPDATED_AT},
+                ],
+            },
+        ),
+        # Fetch the first page of comments for post 1
+        (
+            "https://api.example.com/community/posts/1/comments?per_page=100",
+            {
+                "comments": [record],
+            },
+        ),
+    ]
+
+    run_mocked_test(
+        mock_requests,
+        manifest,
+        CONFIG,
+        "post_comments",
+        {},
+        [record],
+        {
+            "lookback_window": 1,
+            "parent_state": {"posts": {"updated_at": "2024-01-30T00:00:00Z"}},
+            "state": {"updated_at": "2024-01-25T00:00:00Z"},
+            "use_global_cursor": True,  # ensures that it is running the Concurrent CDK version as this is not populated in the declarative implementation
+        },  # this state does not have per-partition entries, which would be under `states`
+    )
unit_tests/sources/declarative/requesters/query_properties/test_property_chunking.py

Lines changed: 9 additions & 1 deletion
@@ -43,10 +43,18 @@
             ["kate", "laurie", "jaclyn"],
             None,
             PropertyLimitType.characters,
-            10,
+            20,
             [["kate", "laurie"], ["jaclyn"]],
             id="test_property_chunking_limit_characters",
         ),
+        pytest.param(
+            ["laurie", "jaclyn", "kaitlin"],
+            None,
+            PropertyLimitType.characters,
+            17,  # laurie%2Cjaclyn%2C == 18, so this will create separate chunks
+            [["laurie"], ["jaclyn"], ["kaitlin"]],
+            id="test_property_chunking_includes_extra_delimiter",
+        ),
         pytest.param(
             [],
             None,