Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
987f454
fix: update jsonschema constraint to allow 4.20.0+ for fastmcp compat…
devin-ai-integration[bot] Aug 29, 2025
8c84b6d
fix: migrate deprecated RefResolver to referencing library for jsonsc…
devin-ai-integration[bot] Aug 29, 2025
42e2d7f
style: apply formatting fixes from format-fix command
devin-ai-integration[bot] Aug 29, 2025
fc37c17
Apply suggestion from @aaronsteers
aaronsteers Aug 29, 2025
3de3785
fix: add referencing dependency to resolve deptry CI failure
devin-ai-integration[bot] Aug 29, 2025
cffad9f
feat: simplify jsonschema migration to require 4.18.0+ minimum
devin-ai-integration[bot] Aug 29, 2025
5ccaca9
Apply suggestion from @aaronsteers
aaronsteers Aug 29, 2025
5e485bd
fix: regenerate poetry.lock after jsonschema constraint update
devin-ai-integration[bot] Aug 29, 2025
def67f7
feat: broaden jsonschema constraint to >=4.17.3,<5.0 based on referen…
devin-ai-integration[bot] Aug 29, 2025
1f55214
fix: implement hybrid RefResolver approach for backward compatibility
devin-ai-integration[bot] Aug 29, 2025
f49761b
style: apply formatting fixes to transform.py after hybrid RefResolve…
devin-ai-integration[bot] Aug 29, 2025
b8fa937
resolve: merge conflict in pyproject.toml referencing dependency
devin-ai-integration[bot] Aug 29, 2025
edc20d6
fix: improve hybrid RefResolver robustness for CI compatibility
devin-ai-integration[bot] Aug 29, 2025
164afb6
cherry-pick-me: unrelated fix of breaking change in create_source()
aaronsteers Aug 29, 2025
d14cd84
format cleanup
aaronsteers Aug 29, 2025
5d75886
style: apply formatting fixes to hybrid RefResolver implementation
devin-ai-integration[bot] Aug 29, 2025
061de78
Merge branch 'devin/1756425696-jsonschema-version-pin' of https://git…
devin-ai-integration[bot] Aug 29, 2025
8f047b9
Apply suggestion from @aaronsteers
aaronsteers Aug 29, 2025
a84f50a
Apply suggestion from @aaronsteers
aaronsteers Aug 29, 2025
3f19659
revert formatting changes
aaronsteers Aug 29, 2025
96eabbf
revert changes moved to #738
aaronsteers Aug 29, 2025
aa5f6a4
clean up validator implementation
aaronsteers Aug 29, 2025
bb13c89
fix imports
aaronsteers Aug 29, 2025
43ee692
try making DRY
aaronsteers Aug 29, 2025
d2c7a51
revert and retry
aaronsteers Aug 29, 2025
35dda44
Update pyproject.toml
aaronsteers Aug 29, 2025
c137302
chore: relock dependencies with referencing >=0.36.2 constraint
devin-ai-integration[bot] Aug 29, 2025
cb69302
fix: update Poetry to 2.0.1 in Dockerfile and fix referencing usage i…
devin-ai-integration[bot] Aug 29, 2025
93aeb9f
Merge remote-tracking branch 'origin/main' into devin/1756425696-json…
aaronsteers Aug 29, 2025
f325a11
poe lock
aaronsteers Aug 29, 2025
98d3296
fixes, simplification
aaronsteers Aug 29, 2025
c742f80
clean up
aaronsteers Aug 29, 2025
5a6aea4
fix unused stray ref
aaronsteers Aug 29, 2025
7b284d5
remove commented-out code (confirmed working)
aaronsteers Aug 29, 2025
749106b
fix format
aaronsteers Aug 29, 2025
51a5ddf
resolve mypy
aaronsteers Aug 29, 2025
09916a9
remove comment
aaronsteers Aug 29, 2025
5f0ac1d
vector db work (split)
aaronsteers Aug 29, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions airbyte_cdk/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,6 @@
from .utils.constants import ENV_REQUEST_CACHE_PATH
from .utils.event_timing import create_timer
from .utils.oneof_option_config import OneOfOptionConfig
from .utils.spec_schema_transformations import resolve_refs
from .utils.stream_status_utils import as_airbyte_message

__all__ = [
Expand Down Expand Up @@ -340,7 +339,6 @@
"ENV_REQUEST_CACHE_PATH",
"create_timer",
"OneOfOptionConfig",
"resolve_refs",
"as_airbyte_message",
# Types
"Config",
Expand Down
4 changes: 2 additions & 2 deletions airbyte_cdk/destinations/vector_db_based/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@
import dpath
from pydantic.v1 import BaseModel, Field

from airbyte_cdk.sources.utils.schema_helpers import expand_refs
from airbyte_cdk.utils.oneof_option_config import OneOfOptionConfig
from airbyte_cdk.utils.spec_schema_transformations import resolve_refs


class SeparatorSplitterConfigModel(BaseModel):
Expand Down Expand Up @@ -293,6 +293,6 @@ def remove_discriminator(schema: Dict[str, Any]) -> None:
def schema(cls, by_alias: bool = True, ref_template: str = "") -> Dict[str, Any]:
"""we're overriding the schema classmethod to enable some post-processing"""
schema: Dict[str, Any] = super().schema()
Copy link
Preview

Copilot AI Sep 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function call should match the pattern used elsewhere in the codebase. The original code assigned the result back to schema, but expand_refs modifies the schema in-place and returns None. Consider documenting this behavior or maintaining consistency with the assignment pattern for clarity.

Suggested change
schema: Dict[str, Any] = super().schema()
schema: Dict[str, Any] = super().schema()
# expand_refs modifies the schema in-place and returns None.

Copilot uses AI. Check for mistakes.

schema = resolve_refs(schema)
expand_refs(schema)
cls.remove_discriminator(schema)
return schema
4 changes: 2 additions & 2 deletions airbyte_cdk/manifest_server/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ FROM python:3.12-slim-bookworm
RUN apt-get update && \
apt-get install -y git && \
rm -rf /var/lib/apt/lists/* && \
pip install poetry==1.8.3
pip install poetry==2.0.1

# Configure poetry to not create virtual environments and disable interactive mode
ENV POETRY_NO_INTERACTION=1 \
Expand Down Expand Up @@ -42,4 +42,4 @@ USER airbyte:airbyte

EXPOSE 8080

CMD ["uvicorn", "airbyte_cdk.manifest_server.app:app", "--host", "0.0.0.0", "--port", "8080"]
CMD ["uvicorn", "airbyte_cdk.manifest_server.app:app", "--host", "0.0.0.0", "--port", "8080"]
38 changes: 29 additions & 9 deletions airbyte_cdk/sources/utils/schema_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,16 @@
import json
import os
import pkgutil
from typing import Any, ClassVar, Dict, List, Mapping, MutableMapping, Optional, Tuple
from copy import deepcopy
from typing import TYPE_CHECKING, Any, ClassVar, Dict, List, Mapping, MutableMapping, Tuple, cast

import jsonref
from jsonschema import RefResolver, validate
from jsonschema import validate
from jsonschema.exceptions import ValidationError
from pydantic.v1 import BaseModel, Field
from referencing import Registry, Resource
from referencing._core import Resolver # used for type hints
from referencing.jsonschema import DRAFT7

from airbyte_cdk.models import ConnectorSpecification, FailureType
from airbyte_cdk.utils.traced_exception import AirbyteTracedException
Expand Down Expand Up @@ -63,18 +67,30 @@ def resolve_ref_links(obj: Any) -> Any:
return obj


def _expand_refs(schema: Any, ref_resolver: Optional[RefResolver] = None) -> None:
def get_ref_resolver_registry(schema: dict[str, Any]) -> Registry:
    """Build a ``referencing`` registry that holds ``schema`` as its only resource.

    The schema is wrapped as a resource whose default specification is DRAFT7
    and is registered under the empty URI.

    :param schema: schema to register
    :return: registry containing the schema under the URI ""
    """
    root_resource: Resource = Resource.from_contents(
        contents=schema,
        default_specification=DRAFT7,
    )
    registry = Registry().with_resource(uri="", resource=root_resource)
    # The explicit cast is kept because mypy has a hard time detecting this return type.
    return cast("Registry", registry)


def _expand_refs(schema: Any, ref_resolver: Resolver) -> None:
"""Internal function to iterate over schema and replace all occurrences of $ref with their definitions. Recursive.

:param schema: schema that will be patched
:param ref_resolver: resolver to get definition from $ref, if None pass it will be instantiated
"""
ref_resolver = ref_resolver or RefResolver.from_schema(schema)

if isinstance(schema, MutableMapping):
if "$ref" in schema:
ref_url = schema.pop("$ref")
_, definition = ref_resolver.resolve(ref_url)
definition = ref_resolver.lookup(ref_url).contents
_expand_refs(
definition, ref_resolver=ref_resolver
) # expand refs in definitions as well
Expand All @@ -90,10 +106,14 @@ def _expand_refs(schema: Any, ref_resolver: Optional[RefResolver] = None) -> Non
def expand_refs(schema: Any) -> Any:
    """Iterate over schema and replace all occurrences of $ref with their definitions.

    The schema is patched in place. If a "definitions" section is present at the root
    of the schema, it will be removed after $ref resolution is complete.

    :param schema: schema that will be patched
    :return: the same (now patched) schema object, so that both ``expand_refs(schema)``
        and ``schema = expand_refs(schema)`` call styles work
    """
    # Build the resolver once, from the complete schema, and hand it to the
    # recursive helper, which requires it as an argument.
    ref_resolver = get_ref_resolver_registry(schema).resolver()
    _expand_refs(schema, ref_resolver)
    # Drop "definitions" only after expansion has finished.
    schema.pop("definitions", None)
    return schema

Comment on lines 106 to 117
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Return the schema for ergonomic use and back-compat

Some call sites (tests, etc.) assign schema = expand_refs(schema). Since this mutates in place, could we also return schema to keep both styles working, wdyt?

-def expand_refs(schema: Any) -> None:
+def expand_refs(schema: Any) -> Any:
@@
-    schema.pop("definitions", None)
+    schema.pop("definitions", None)
+    return schema

Run to find assignment-style usages that would otherwise break:


🏁 Script executed:

#!/bin/bash
# Find assignment-style expand_refs usage
rg -nP --type=py '^\s*\w+\s*=\s*expand_refs\s*\(' -C1

# Find all expand_refs call sites for a quick audit
rg -nP --type=py '\bexpand_refs\s*\(' -C2

Length of output: 3461


Return schema from expand_refs for ergonomic use and back-compat?

One assignment-style caller was found at unit_tests/destinations/vector_db_based/config_test.py:67 — change the signature to return the schema so both assignment and in-place usages work.

-def expand_refs(schema: Any) -> None:
+def expand_refs(schema: Any) -> Any:
@@
-    schema.pop("definitions", None)
+    schema.pop("definitions", None)
+    return schema

wdyt?

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
def expand_refs(schema: Any) -> None:
"""Iterate over schema and replace all occurrences of $ref with their definitions.
If a "definitions" section is present at the root of the schema, it will be removed
after $ref resolution is complete.
:param schema: schema that will be patched
"""
_expand_refs(schema)
schema.pop("definitions", None) # remove definitions created by $ref
ref_resolver = get_ref_resolver_registry(schema).resolver()
_expand_refs(schema, ref_resolver)
schema.pop("definitions", None)
def expand_refs(schema: Any) -> Any:
"""Iterate over schema and replace all occurrences of $ref with their definitions.
If a "definitions" section is present at the root of the schema, it will be removed
after $ref resolution is complete.
:param schema: schema that will be patched
"""
ref_resolver = get_ref_resolver_registry(schema).resolver()
_expand_refs(schema, ref_resolver)
schema.pop("definitions", None)
return schema
🤖 Prompt for AI Agents
In airbyte_cdk/sources/utils/schema_helpers.py around lines 106 to 117, update
expand_refs to return the schema after performing in-place $ref expansion so
callers that assign its result continue to work; change the function signature
to return the schema (keep existing in-place behavior), update the docstring to
mention the return value, and ensure the function ends with "return schema"
after popping "definitions"; adjust the type hint to reflect the returned value
if desired and run/update callers (e.g.,
unit_tests/destinations/vector_db_based/config_test.py) that expect an
assignment to use the returned schema.


def rename_key(schema: Any, old_key: str, new_key: str) -> None:
Expand Down
38 changes: 25 additions & 13 deletions airbyte_cdk/sources/utils/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,25 @@
#

import logging
from copy import deepcopy
from enum import Flag, auto
from typing import Any, Callable, Dict, Generator, Mapping, Optional, cast
from typing import TYPE_CHECKING, Any, Callable, Dict, Generator, Mapping, Optional, cast

from jsonschema import Draft7Validator, ValidationError, validators
from referencing import Registry, Resource
from referencing._core import Resolver
from referencing.exceptions import Unresolvable
from referencing.jsonschema import DRAFT7

from airbyte_cdk.sources.utils.schema_helpers import expand_refs

from .schema_helpers import get_ref_resolver_registry

try:
from jsonschema.validators import Validator
except:
Copy link
Preview

Copilot AI Sep 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using bare except: clause is discouraged. Specify the expected exception type (likely ImportError) to avoid catching unintended exceptions.

Suggested change
except:
except ImportError:

Copilot uses AI. Check for mistakes.

from jsonschema import Validator

from jsonschema import Draft7Validator, RefResolver, ValidationError, Validator, validators

MAX_NESTING_DEPTH = 3
json_to_python_simple = {
Expand Down Expand Up @@ -191,30 +206,27 @@ def normalizator(
validators parameter for detailed description.
:
"""
# Very first step is to expand $refs in the schema itself:
expand_refs(schema)

# Now we can expand $refs in the property value:
if isinstance(property_value, dict):
expand_refs(property_value)
Comment on lines +210 to +214
Copy link
Preview

Copilot AI Sep 11, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The expand_refs function is called multiple times and mutates the schema in-place. Consider creating a deep copy of the schema before the first call to avoid unintended side effects on the original schema object.

Copilot uses AI. Check for mistakes.


def resolve(subschema: dict[str, Any]) -> dict[str, Any]:
if "$ref" in subschema:
_, resolved = cast(
RefResolver,
validator_instance.resolver,
).resolve(subschema["$ref"])
return cast(dict[str, Any], resolved)
return subschema
# Now we can validate and normalize the values:

Comment on lines +209 to 217
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Don’t expand refs on every validator callback (perf + possible Unresolvable)

Calling expand_refs(schema) per-visit mutates the schema repeatedly and can raise Unresolvable when expanding dict-valued fragments without root context. Can we remove in-callback expansion and do a single pre-expansion before validator creation, wdyt?

Apply:

-            # Very first step is to expand $refs in the schema itself:
-            expand_refs(schema)
-
-            # Now we can expand $refs in the property value:
-            if isinstance(property_value, dict):
-                expand_refs(property_value)
+            # $refs are pre-expanded once before validator construction.
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Very first step is to expand $refs in the schema itself:
expand_refs(schema)
# Now we can expand $refs in the property value:
if isinstance(property_value, dict):
expand_refs(property_value)
def resolve(subschema: dict[str, Any]) -> dict[str, Any]:
if "$ref" in subschema:
_, resolved = cast(
RefResolver,
validator_instance.resolver,
).resolve(subschema["$ref"])
return cast(dict[str, Any], resolved)
return subschema
# Now we can validate and normalize the values:
# $refs are pre-expanded once before validator construction.
# Now we can validate and normalize the values:

# Transform object and array values before running json schema type checking for each element.
# Recursively normalize every value of the "instance" sub-object,
# if "instance" is an incorrect type - skip recursive normalization of "instance"
if schema_key == "properties" and isinstance(instance, dict):
for k, subschema in property_value.items():
if k in instance:
subschema = resolve(subschema)
instance[k] = self.__normalize(instance[k], subschema)
# Recursively normalize every item of the "instance" sub-array,
# if "instance" is an incorrect type - skip recursive normalization of "instance"
elif schema_key == "items" and isinstance(instance, list):
subschema = resolve(property_value)
for index, item in enumerate(instance):
instance[index] = self.__normalize(item, subschema)
instance[index] = self.__normalize(item, property_value)

# Running native jsonschema traverse algorithm after field normalization is done.
yield from original_validator(
Expand Down
26 changes: 0 additions & 26 deletions airbyte_cdk/utils/spec_schema_transformations.py

This file was deleted.

Loading