
Commit 8a79834

Merge branch 'main' into brian/missing_stream_emit_incomplete_status

2 parents: a0ece6b + 9a075a1

13 files changed: +591 -62 lines

airbyte_cdk/sources/declarative/declarative_component_schema.yaml

Lines changed: 0 additions & 1 deletion
```diff
@@ -3797,7 +3797,6 @@ definitions:
         - polling_requester
         - download_requester
         - status_extractor
-        - download_target_extractor
       properties:
         type:
           type: string
```
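Dropping `download_target_extractor` from the `required` list means a manifest can omit it and still validate against the component schema. A minimal sketch of what that change means for validation (using `jsonschema` directly is an assumption for illustration; the CDK actually generates pydantic models from this YAML):

```python
from jsonschema import validate

# Toy schema mirroring the edit above: download_target_extractor is a known
# property but no longer listed in `required`.
schema = {
    "type": "object",
    "required": ["status_extractor"],
    "properties": {
        "status_extractor": {"type": "object"},
        "download_target_extractor": {"type": "object"},
    },
}

validate({"status_extractor": {}}, schema)  # passes without download_target_extractor
```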

airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py

Lines changed: 17 additions & 3 deletions
```diff
@@ -189,6 +189,7 @@ def __init__(
         # FIXME this is a temporary field the time of the migration from declarative cursors to concurrent ones
         self._attempt_to_create_cursor_if_not_provided = attempt_to_create_cursor_if_not_provided
         self._synced_some_data = False
+        self._logged_regarding_datetime_format_error = False

     @property
     def cursor_field(self) -> CursorField:
@@ -518,10 +519,23 @@ def observe(self, record: Record) -> None:
         except ValueError:
             return

+        try:
+            record_cursor = self._connector_state_converter.output_format(
+                self._connector_state_converter.parse_value(record_cursor_value)
+            )
+        except ValueError as exception:
+            if not self._logged_regarding_datetime_format_error:
+                logger.warning(
+                    "Skipping cursor update for stream '%s': failed to parse cursor field '%s' value %r: %s",
+                    self._stream_name,
+                    self._cursor_field.cursor_field_key,
+                    record_cursor_value,
+                    exception,
+                )
+                self._logged_regarding_datetime_format_error = True
+            return
+
         self._synced_some_data = True
-        record_cursor = self._connector_state_converter.output_format(
-            self._connector_state_converter.parse_value(record_cursor_value)
-        )
         self._update_global_cursor(record_cursor)
         if not self._use_global_cursor:
             self._cursor_per_partition[
```
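The change above makes `observe()` skip a record's cursor update when the cursor value cannot be parsed, warning only once per cursor instance instead of failing the sync or flooding the logs. A minimal standalone sketch of that log-once-and-skip pattern (names and the `float` parse are illustrative stand-ins, not the CDK API):

```python
import logging
from typing import Optional

logger = logging.getLogger(__name__)


class CursorSketch:
    """Illustrative stand-in: parse each observed cursor value, skip the
    cursor update (and warn only once) when parsing fails."""

    def __init__(self) -> None:
        self._logged_parse_error = False
        self.latest_cursor: Optional[float] = None

    def observe(self, raw_value: str) -> None:
        try:
            parsed = float(raw_value)  # stand-in for the parse_value/output_format round trip
        except ValueError as exception:
            if not self._logged_parse_error:
                logger.warning("Skipping cursor update for value %r: %s", raw_value, exception)
                self._logged_parse_error = True
            return  # the record still syncs; only the cursor update is skipped
        if self.latest_cursor is None or parsed > self.latest_cursor:
            self.latest_cursor = parsed


cursor = CursorSketch()
cursor.observe("not-a-timestamp")  # warns once
cursor.observe("still-bad")        # skipped silently, no second warning
cursor.observe("1700000000")       # cursor advances
```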

airbyte_cdk/sources/declarative/models/declarative_component_schema.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -2852,8 +2852,8 @@ class AsyncRetriever(BaseModel):
     status_extractor: Union[DpathExtractor, CustomRecordExtractor] = Field(
         ..., description="Responsible for fetching the actual status of the async job."
     )
-    download_target_extractor: Union[DpathExtractor, CustomRecordExtractor] = Field(
-        ...,
+    download_target_extractor: Optional[Union[DpathExtractor, CustomRecordExtractor]] = Field(
+        None,
         description="Responsible for fetching the final result `urls` provided by the completed / finished / ready async job.",
     )
     download_extractor: Optional[
```
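For context: in these generated pydantic models an ellipsis default (`...`) marks a field as required, while a `None` default makes it optional, so the hunk above is what turns `download_target_extractor` into an optional field. A minimal sketch of the two defaults (hypothetical model, v1-style `Field` usage as in the generated code):

```python
from typing import Optional
from pydantic import BaseModel, Field


class Example(BaseModel):
    required_extractor: str = Field(..., description="`...` means validation fails if absent.")
    optional_extractor: Optional[str] = Field(None, description="`None` makes the field optional.")


Example(required_extractor="DpathExtractor")  # ok; optional_extractor defaults to None
# Example()                                   # would raise ValidationError: field required
```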

airbyte_cdk/sources/declarative/parsers/model_to_component_factory.py

Lines changed: 53 additions & 12 deletions
```diff
@@ -1976,7 +1976,10 @@ def create_default_stream(
         primary_key = model.primary_key.__root__ if model.primary_key else None

         partition_router = self._build_stream_slicer_from_partition_router(
-            model.retriever, config, stream_name=model.name
+            model.retriever,
+            config,
+            stream_name=model.name,
+            **kwargs,
         )
         concurrent_cursor = self._build_concurrent_cursor(model, partition_router, config)
         if model.incremental_sync and isinstance(model.incremental_sync, DatetimeBasedCursorModel):
@@ -2155,10 +2158,11 @@ def _build_stream_slicer_from_partition_router(
         ],
         config: Config,
         stream_name: Optional[str] = None,
+        **kwargs: Any,
     ) -> PartitionRouter:
         if (
             hasattr(model, "partition_router")
-            and isinstance(model, SimpleRetrieverModel | AsyncRetrieverModel)
+            and isinstance(model, (SimpleRetrieverModel, AsyncRetrieverModel, CustomRetrieverModel))
             and model.partition_router
         ):
             stream_slicer_model = model.partition_router
@@ -2172,6 +2176,23 @@
                 ],
                 parameters={},
             )
+            elif isinstance(stream_slicer_model, dict):
+                # partition router comes from CustomRetrieverModel therefore has not been parsed as a model
+                params = stream_slicer_model.get("$parameters")
+                if not isinstance(params, dict):
+                    params = {}
+                    stream_slicer_model["$parameters"] = params
+
+                if stream_name is not None:
+                    params["stream_name"] = stream_name
+
+                return self._create_nested_component(  # type: ignore[no-any-return] # There is no guarantee that this will return a stream slicer. If not, we expect an AttributeError during the call to `stream_slices`
+                    model,
+                    "partition_router",
+                    stream_slicer_model,
+                    config,
+                    **kwargs,
+                )
             else:
                 return self._create_component_from_model(  # type: ignore[no-any-return] # Will be created PartitionRouter as stream_slicer_model is model.partition_router
                     model=stream_slicer_model, config=config, stream_name=stream_name or ""
@@ -2886,7 +2907,7 @@ def create_page_increment(
         )

     def create_parent_stream_config(
-        self, model: ParentStreamConfigModel, config: Config, stream_name: str, **kwargs: Any
+        self, model: ParentStreamConfigModel, config: Config, *, stream_name: str, **kwargs: Any
     ) -> ParentStreamConfig:
         declarative_stream = self._create_component_from_model(
             model.stream,
@@ -3446,6 +3467,11 @@ def create_async_retriever(
         transformations: List[RecordTransformation],
         **kwargs: Any,
     ) -> AsyncRetriever:
+        if model.download_target_requester and not model.download_target_extractor:
+            raise ValueError(
+                f"`download_target_extractor` required if using a `download_target_requester`"
+            )
+
         def _get_download_retriever(
             requester: Requester, extractor: RecordExtractor, _decoder: Decoder
         ) -> SimpleRetriever:
@@ -3603,11 +3629,15 @@ def _get_job_timeout() -> datetime.timedelta:
         status_extractor = self._create_component_from_model(
             model=model.status_extractor, decoder=decoder, config=config, name=name
         )
-        download_target_extractor = self._create_component_from_model(
-            model=model.download_target_extractor,
-            decoder=decoder,
-            config=config,
-            name=name,
+        download_target_extractor = (
+            self._create_component_from_model(
+                model=model.download_target_extractor,
+                decoder=decoder,
+                config=config,
+                name=name,
+            )
+            if model.download_target_extractor
+            else None
         )

         job_repository: AsyncJobRepository = AsyncHttpJobRepository(
@@ -3693,14 +3723,19 @@ def create_spec(self, model: SpecModel, config: Config, **kwargs: Any) -> Spec:
         )

     def create_substream_partition_router(
-        self, model: SubstreamPartitionRouterModel, config: Config, **kwargs: Any
+        self,
+        model: SubstreamPartitionRouterModel,
+        config: Config,
+        *,
+        stream_name: str,
+        **kwargs: Any,
     ) -> SubstreamPartitionRouter:
         parent_stream_configs = []
         if model.parent_stream_configs:
             parent_stream_configs.extend(
                 [
                     self.create_parent_stream_config_with_substream_wrapper(
-                        model=parent_stream_config, config=config, **kwargs
+                        model=parent_stream_config, config=config, stream_name=stream_name, **kwargs
                     )
                     for parent_stream_config in model.parent_stream_configs
                 ]
@@ -3720,7 +3755,7 @@ def create_parent_stream_config_with_substream_wrapper(

         # This flag will be used exclusively for StateDelegatingStream when a parent stream is created
         has_parent_state = bool(
-            self._connector_state_manager.get_stream_state(kwargs.get("stream_name", ""), None)
+            self._connector_state_manager.get_stream_state(stream_name, None)
             if model.incremental_dependency
             else False
         )
@@ -4113,11 +4148,17 @@ def set_api_budget(self, component_definition: ComponentDefinition, config: Conf
         )

     def create_grouping_partition_router(
-        self, model: GroupingPartitionRouterModel, config: Config, **kwargs: Any
+        self,
+        model: GroupingPartitionRouterModel,
+        config: Config,
+        *,
+        stream_name: str,
+        **kwargs: Any,
     ) -> GroupingPartitionRouter:
         underlying_router = self._create_component_from_model(
             model=model.underlying_partition_router,
             config=config,
+            stream_name=stream_name,
             **kwargs,
         )
         if model.group_size < 1:
```
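One subtlety in the new `elif isinstance(stream_slicer_model, dict)` branch above: a partition router nested under a `CustomRetriever` arrives as a plain mapping rather than a parsed model, so the factory threads `stream_name` through `$parameters` by hand before building the nested component. A standalone sketch of just that manipulation (the helper and the example definition are illustrative, not the factory API):

```python
from typing import Any, MutableMapping, Optional


def inject_stream_name(
    stream_slicer_model: MutableMapping[str, Any], stream_name: Optional[str]
) -> MutableMapping[str, Any]:
    """Ensure the raw component definition has a $parameters dict carrying the stream name."""
    params = stream_slicer_model.get("$parameters")
    if not isinstance(params, dict):
        params = {}
        stream_slicer_model["$parameters"] = params
    if stream_name is not None:
        params["stream_name"] = stream_name
    return stream_slicer_model


definition = {"type": "ListPartitionRouter", "values": ["a", "b"], "cursor_field": "section"}
inject_stream_name(definition, "issues")
assert definition["$parameters"] == {"stream_name": "issues"}
```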

airbyte_cdk/sources/declarative/requesters/README.md

Lines changed: 14 additions & 3 deletions
````diff
@@ -1,8 +1,19 @@
+# Download Target and Download Requester
+
+- The `creation_response` and `polling_response` interpolation contexts are always available during the job download step of the process.
+
+- The `download_target` interpolation context is generated by the `download_target_extractor` and made available to the job download step as well.
+  - if `download_target_requester` is not provided, `download_target_extractor` will get urls from the `polling_response`
+  - if `download_target_requester` is provided, an additional request will be made to fetch job download targets and `download_target_extractor` will operate on that response
+
+## Some important considerations
+
+- **Note:** If the `download_target_extractor` and `download_target_requester` are not defined, a single job download request will be made without the `download_target` context.
+- **Note:** The `download_target_extractor` is required (not optional) if using a `download_target_requester`.
+
 # AsyncHttpJobRepository sequence diagram

 - Components marked as optional are not required and can be ignored.
-- if `download_target_requester` is not provided, `download_target_extractor` will get urls from the `polling_response`
-- interpolation_context, e.g. `creation_response` or `polling_response` can be obtained from stream_slice

 ```mermaid
 ---
@@ -37,7 +48,7 @@ sequenceDiagram
         UrlRequester -->> AsyncHttpJobRepository: Download URLs

         AsyncHttpJobRepository ->> DownloadRetriever: Download reports
-        DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `url`)
+        DownloadRetriever ->> Reporting Server: Retrieve report data (interpolation_context: `download_target`, `creation_response`, `polling_response`)
         Reporting Server -->> DownloadRetriever: Report data
         DownloadRetriever -->> AsyncHttpJobRepository: Report data
     else Status: Failed
````
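Since the README now enumerates three interpolation contexts, a quick illustration may help. The declarative framework's interpolation is Jinja-based, so a download requester's URL template can reference any of them; this sketch uses plain Jinja2 with hypothetical context values (the CDK wraps this in its own `InterpolatedString` machinery):

```python
from jinja2 import Template

# Hypothetical values the job download step would expose:
context = {
    "download_target": "https://api.example.com/exports/123",
    "creation_response": {"job_id": "123"},
    "polling_response": {"status": "completed"},
}

template = Template("{{ download_target }}?job={{ creation_response['job_id'] }}")
print(template.render(context))  # https://api.example.com/exports/123?job=123
```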

airbyte_cdk/sources/declarative/requesters/http_job_repository.py

Lines changed: 29 additions & 6 deletions
```diff
@@ -43,7 +43,7 @@ class AsyncHttpJobRepository(AsyncJobRepository):
     delete_requester: Optional[Requester]
     status_extractor: DpathExtractor
     status_mapping: Mapping[str, AsyncJobStatus]
-    download_target_extractor: DpathExtractor
+    download_target_extractor: Optional[DpathExtractor]

     # timeout for the job to be completed, passed from `polling_job_timeout`
     job_timeout: Optional[timedelta] = None
@@ -213,14 +213,16 @@ def fetch_records(self, job: AsyncJob) -> Iterable[Mapping[str, Any]]:
         """

-        for target_url in self._get_download_targets(job):
+        for download_target in self._get_download_targets(job):
             job_slice = job.job_parameters()
             stream_slice = StreamSlice(
                 partition=job_slice.partition,
                 cursor_slice=job_slice.cursor_slice,
                 extra_fields={
                     **job_slice.extra_fields,
-                    "download_target": target_url,
+                    "download_target": download_target,
+                    "creation_response": self._get_creation_response_interpolation_context(job),
+                    "polling_response": self._get_polling_response_interpolation_context(job),
                 },
             )
             for message in self.download_retriever.read_records({}, stream_slice):
@@ -330,9 +332,27 @@ def _get_create_job_stream_slice(self, job: AsyncJob) -> StreamSlice:
         )

     def _get_download_targets(self, job: AsyncJob) -> Iterable[str]:
-        if not self.download_target_requester:
-            url_response = self._polling_job_response_by_id[job.api_job_id()]
-        else:
+        """Returns an iterable of strings to help target requests for downloading async jobs."""
+        # If neither download_target_extractor nor download_target_requester are provided, yield a single empty string
+        # to express the need to make a single download request without any download_target value
+        if not self.download_target_extractor:
+            if not self.download_target_requester:
+                lazy_log(
+                    LOGGER,
+                    logging.DEBUG,
+                    lambda: "No download_target_extractor or download_target_requester provided. Will attempt a single download request without a `download_target`.",
+                )
+                yield ""
+                return
+            else:
+                raise AirbyteTracedException(
+                    internal_message="Must define a `download_target_extractor` when using a `download_target_requester`.",
+                    failure_type=FailureType.config_error,
+                )
+
+        # We have a download_target_extractor, use it to extract the download_target
+        if self.download_target_requester:
+            # if a download_target_requester is defined, we extract from the response of a request specifically for download targets.
             stream_slice: StreamSlice = StreamSlice(
                 partition={},
                 cursor_slice={},
@@ -346,5 +366,8 @@ def _get_download_targets(self, job: AsyncJob) -> Iterable[str]:
                 internal_message="Always expect a response or an exception from download_target_requester",
                 failure_type=FailureType.system_error,
             )
+        else:
+            # if no download_target_requester is defined, we extract from the polling response
+            url_response = self._polling_job_response_by_id[job.api_job_id()]

         yield from self.download_target_extractor.extract_records(url_response)  # type: ignore # we expect download_target_extractor to always return list of strings
```
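The rewritten `_get_download_targets` now has three outcomes: no extractor and no requester yields one empty target, a requester without an extractor is a config error, and otherwise targets are extracted from either the dedicated requester's response or the stored polling response. A condensed standalone sketch of that decision tree (stand-in callables, not the repository's real collaborators):

```python
from typing import Callable, Iterable, List, Optional


def get_download_targets(
    extract: Optional[Callable[[dict], List[str]]],
    fetch_targets_response: Optional[Callable[[], dict]],
    polling_response: dict,
) -> Iterable[str]:
    if extract is None:
        if fetch_targets_response is None:
            yield ""  # one download request, no download_target context
            return
        raise ValueError("a download_target_requester requires a download_target_extractor")
    # With an extractor, targets come from the dedicated requester's response if
    # one is configured, otherwise from the stored polling response.
    response = fetch_targets_response() if fetch_targets_response else polling_response
    yield from extract(response)


# Neither configured: a single blank target drives one download request.
print(list(get_download_targets(None, None, {"urls": []})))  # ['']

# Extractor only: targets come straight from the polling response.
print(list(get_download_targets(lambda r: r["urls"], None, {"urls": ["u1", "u2"]})))  # ['u1', 'u2']
```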

bin/generate-component-manifest-dagger.sh

Lines changed: 0 additions & 1 deletion
```diff
@@ -7,5 +7,4 @@

 set -e

-pip install dagger-io==0.13.3
 python bin/generate_component_manifest_files.py
```

debug_manifest/README.md

Lines changed: 9 additions & 1 deletion
```diff
@@ -22,11 +22,19 @@ To configure the debugger in VSCode to run the `debug_manifest`, follow these st
       "request": "launch",
       "console": "integratedTerminal",
       "cwd": "${workspaceFolder}/debug_manifest",
-      "python": "<PATH_TO_CDK_ENV>/bin/python",
+      "python": "<PATH_TO_CDK_ENV>/bin/python", // REPLACE ME
       "module": "debug_manifest",
       "args": [
         // SPECIFY THE COMMAND: [spec, check, discover, read]
         "read",
+        // SPECIFY THE MANIFEST FILE
+        "--manifest-path",
+        // PATH TO THE MANIFEST FILE
+        "resources/manifest.yaml",
+        // SPECIFY A COMPONENTS.PY FILE (OPTIONAL)
+        "--components-path",
+        // PATH TO THE COMPONENTS FILE
+        "resources/components.py",
         // SPECIFY THE CONFIG
         "--config",
         // PATH TO THE CONFIG FILE
```

debug_manifest/debug_manifest.py

Lines changed: 42 additions & 6 deletions
```diff
@@ -3,17 +3,12 @@
 #

 import sys
-from typing import Any, Mapping

 from airbyte_cdk.entrypoint import AirbyteEntrypoint, launch
 from airbyte_cdk.sources.declarative.yaml_declarative_source import (
     YamlDeclarativeSource,
 )

-configuration: Mapping[str, Any] = {
-    "path_to_yaml": "resources/manifest.yaml",
-}
-

 def debug_manifest(source: YamlDeclarativeSource, args: list[str]) -> None:
     """
@@ -22,15 +17,56 @@ def debug_manifest(source: YamlDeclarativeSource, args: list[str]) -> None:
     launch(source, args)


+def _register_components_from_file(filepath: str) -> None:
+    """
+    Dynamically load a Python file containing custom component definitions and register it
+    under specific module names in sys.modules to ensure that these classes can be properly
+    resolved during hydration of the manifest yaml file.
+
+    This is a somewhat hacky replacement for the file structure manipulation we do when building
+    connector images to ensure the custom components can be imported.
+    """
+    import importlib.util
+    import sys
+    from pathlib import Path
+
+    components_path = Path(filepath)
+    if not components_path.exists():
+        raise FileNotFoundError(f"Components file not found: {components_path}")
+
+    module_name = "components"
+    sdm_module_name = "source_declarative_manifest.components"
+
+    spec = importlib.util.spec_from_file_location(module_name, components_path)
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Could not load module from {components_path}")
+
+    # Create module and execute code
+    module = importlib.util.module_from_spec(spec)
+
+    # Register then execute the module
+    # we dual-register the module to mirror what is done elsewhere in the CDK
+    sys.modules[module_name] = module
+    sys.modules[sdm_module_name] = module
+
+    spec.loader.exec_module(module)
+
+
 if __name__ == "__main__":
     args = sys.argv[1:]
+    parsed_args = AirbyteEntrypoint.parse_args(args)
+
+    manifest_path = getattr(parsed_args, "manifest_path", None) or "resources/manifest.yaml"
+    components_path = getattr(parsed_args, "components_path", None)
+    if components_path:
+        _register_components_from_file(components_path)
     catalog_path = AirbyteEntrypoint.extract_catalog(args)
     config_path = AirbyteEntrypoint.extract_config(args)
     state_path = AirbyteEntrypoint.extract_state(args)

     debug_manifest(
         YamlDeclarativeSource(
-            path_to_yaml="resources/manifest.yaml",
+            path_to_yaml=manifest_path,
             catalog=YamlDeclarativeSource.read_catalog(catalog_path) if catalog_path else None,
             config=YamlDeclarativeSource.read_config(config_path) if config_path else None,
             state=YamlDeclarativeSource.read_state(state_path) if state_path else None,
```
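After `_register_components_from_file` runs, the same module object is reachable under both names, which is what lets a manifest's `class_name` references resolve during local debugging. A hedged usage sketch (the file path and the component class in the comment are hypothetical):

```python
import sys

_register_components_from_file("resources/components.py")

# Both aliases now point at the same loaded module object:
assert sys.modules["components"] is sys.modules["source_declarative_manifest.components"]

# So a manifest reference like
#   class_name: source_declarative_manifest.components.MyCustomExtractor  # hypothetical class
# can be resolved while debugging locally, just as it is in a built connector image.
```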
