From eb42b369acc3b86ac4ea429555d0366b0abd438b Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Fri, 24 Feb 2023 05:14:15 +0000 Subject: [PATCH 1/9] Update to jsonschema>=4.18 and 'referencing' There's a new implementation of ref resolution in newer versions of 'jsonschema'. Upgrade to use this, plus add support for loading YAML, JSON5, and TOML schemas as references (so long as they have the filetype suffixes which make them easily detectable). --- setup.cfg | 2 +- src/check_jsonschema/checker.py | 7 +- src/check_jsonschema/identify_filetype.py | 7 +- src/check_jsonschema/parsers/__init__.py | 11 ++- src/check_jsonschema/schema_loader/main.py | 17 ++-- src/check_jsonschema/schema_loader/readers.py | 4 +- .../schema_loader/resolver.py | 99 +++++++++++-------- tests/acceptance/conftest.py | 24 ++++- .../test_nonjson_schema_handling.py | 14 +-- tox.ini | 1 - 10 files changed, 116 insertions(+), 70 deletions(-) diff --git a/setup.cfg b/setup.cfg index 8d75970a5..0d674f735 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,7 +20,7 @@ python_requires = >=3.7 install_requires = importlib-resources>=1.4.0;python_version<"3.9" ruamel.yaml==0.17.32 - jsonschema>=4.5.1,<5.0 + jsonschema>=4.18.0,<5.0 requests<3.0 click>=8,<9 package_dir= diff --git a/src/check_jsonschema/checker.py b/src/check_jsonschema/checker.py index 1ecb9c86a..bf1aea0a8 100644 --- a/src/check_jsonschema/checker.py +++ b/src/check_jsonschema/checker.py @@ -5,6 +5,7 @@ import click import jsonschema +import referencing.exceptions from . import utils from .formats import FormatOptions @@ -75,7 +76,11 @@ def _build_result(self) -> CheckResult: def _run(self) -> None: try: result = self._build_result() - except jsonschema.RefResolutionError as e: + except ( + referencing.exceptions.NoSuchResource, + referencing.exceptions.Unretrievable, + referencing.exceptions.Unresolvable, + ) as e: self._fail("Failure resolving $ref within schema\n", e) self._reporter.report_result(result) diff --git a/src/check_jsonschema/identify_filetype.py b/src/check_jsonschema/identify_filetype.py index c81922eac..7fd054758 100644 --- a/src/check_jsonschema/identify_filetype.py +++ b/src/check_jsonschema/identify_filetype.py @@ -18,8 +18,11 @@ } -def path_to_type(path: pathlib.Path, *, default_type: str = "json") -> str: - ext = path.suffix.lstrip(".") +def path_to_type(path: str | pathlib.Path, *, default_type: str = "json") -> str: + if isinstance(path, str): + ext = path.rpartition(".")[2] + else: + ext = path.suffix.lstrip(".") if ext in _EXTENSION_MAP: return _EXTENSION_MAP[ext] diff --git a/src/check_jsonschema/parsers/__init__.py b/src/check_jsonschema/parsers/__init__.py index eedb406ff..edce01feb 100644 --- a/src/check_jsonschema/parsers/__init__.py +++ b/src/check_jsonschema/parsers/__init__.py @@ -82,10 +82,15 @@ def get( + ",".join(self._by_tag.keys()) ) - def parse_file(self, path: pathlib.Path, default_filetype: str) -> t.Any: + def parse_data_with_path( + self, data: t.BinaryIO, path: pathlib.Path | str, default_filetype: str + ) -> t.Any: loadfunc = self.get(path, default_filetype) try: - with open(path, "rb") as fp: - return loadfunc(fp) + return loadfunc(data) except LOADING_FAILURE_ERROR_TYPES as e: raise FailedFileLoadError(f"Failed to parse {path}") from e + + def parse_file(self, path: pathlib.Path | str, default_filetype: str) -> t.Any: + with open(path, "rb") as fp: + return self.parse_data_with_path(fp, path, default_filetype) diff --git a/src/check_jsonschema/schema_loader/main.py b/src/check_jsonschema/schema_loader/main.py index 8f7a49336..3b10c3b01 100644 --- a/src/check_jsonschema/schema_loader/main.py +++ b/src/check_jsonschema/schema_loader/main.py @@ -9,10 +9,11 @@ from ..builtin_schemas import get_builtin_schema from ..formats import FormatOptions, make_format_checker +from ..parsers import ParserSet from ..utils import is_url_ish from .errors import UnsupportedUrlScheme from .readers import HttpSchemaReader, LocalSchemaReader -from .resolver import make_ref_resolver +from .resolver import make_reference_registry def _extend_with_default( @@ -71,6 +72,9 @@ def __init__( if is_url_ish(self.schemafile): self.url_info = urllib.parse.urlparse(self.schemafile) + # setup a parser collection + self._parsers = ParserSet() + # setup a schema reader lazily, when needed self._reader: LocalSchemaReader | HttpSchemaReader | None = None @@ -117,11 +121,9 @@ def get_validator( # format checker (which may be None) format_checker = make_format_checker(format_opts, schema_dialect) - # ref resolver which may be built from the schema path - # if the location is a URL, there's no change, but if it's a file path - # it's made absolute and URI-ized - # the resolver should use `$id` if there is one present in the schema - ref_resolver = make_ref_resolver(schema_uri, schema) + # reference resolution + # with support for YAML, TOML, and other formats from the parsers + reference_registry = make_reference_registry(self._parsers, schema_uri, schema) # get the correct validator class and check the schema under its metaschema validator_cls = jsonschema.validators.validator_for(schema) @@ -134,7 +136,7 @@ def get_validator( # now that we know it's safe to try to create the validator instance, do it validator = validator_cls( schema, - resolver=ref_resolver, + registry=reference_registry, format_checker=format_checker, ) return t.cast(jsonschema.Validator, validator) @@ -143,6 +145,7 @@ def get_validator( class BuiltinSchemaLoader(SchemaLoader): def __init__(self, schema_name: str) -> None: self.schema_name = schema_name + self._parsers = ParserSet() def get_schema_ref_base(self) -> str | None: return None diff --git a/src/check_jsonschema/schema_loader/readers.py b/src/check_jsonschema/schema_loader/readers.py index 0f9ec9185..98e24c011 100644 --- a/src/check_jsonschema/schema_loader/readers.py +++ b/src/check_jsonschema/schema_loader/readers.py @@ -25,12 +25,10 @@ def _run_load_callback(schema_location: str, callback: t.Callable) -> dict: class LocalSchemaReader: - FORMATS = ("json", "json5", "yaml") - def __init__(self, filename: str) -> None: self.path = filename2path(filename) self.filename = str(self.path) - self.parsers = ParserSet(supported_formats=self.FORMATS) + self.parsers = ParserSet() def get_ref_base(self) -> str: return self.path.as_uri() diff --git a/src/check_jsonschema/schema_loader/resolver.py b/src/check_jsonschema/schema_loader/resolver.py index 5833896c8..ec4c7af54 100644 --- a/src/check_jsonschema/schema_loader/resolver.py +++ b/src/check_jsonschema/schema_loader/resolver.py @@ -1,42 +1,63 @@ from __future__ import annotations +import pathlib import typing as t - -import click -import jsonschema - - -class _CliRefResolver(jsonschema.RefResolver): - def resolve_remote(self, uri: str) -> t.Any: - if uri.endswith(".yaml") or uri.endswith(".yml"): - click.secho( - """\ -WARNING: You appear to be using a schema which references a YAML file. - -This is not supported by check-jsonschema and may result in errors. -""", - err=True, - fg="yellow", - ) - elif uri.endswith(".json5"): - click.secho( - """\ -WARNING: You appear to be using a schema which references a JSON5 file. - -This is not supported by check-jsonschema and may result in errors. -""", - err=True, - fg="yellow", - ) - return super().resolve_remote(uri) - - -def make_ref_resolver( - schema_uri: str | None, schema: dict -) -> jsonschema.RefResolver | None: - if not schema_uri: - return None - - base_uri = schema.get("$id", schema_uri) - # FIXME: temporary type-ignore because typeshed has the type wrong - return _CliRefResolver(base_uri, schema) # type: ignore[arg-type] +import urllib.parse + +import referencing +import requests +from referencing.jsonschema import DRAFT202012 + +from ..parsers import ParserSet +from ..utils import filename2path + + +def make_reference_registry( + parsers: ParserSet, schema_uri: str | None, schema: dict +) -> referencing.Registry: + schema_resource = referencing.Resource.from_contents( + schema, default_specification=DRAFT202012 + ) + registry = referencing.Registry( + retrieve=create_retrieve_callable(parsers, schema_uri) + ) + + if schema_uri is not None: + registry = registry.with_resource(uri=schema_uri, resource=schema_resource) + + id_attribute = schema.get("$id") + if id_attribute is not None: + registry = registry.with_resource(uri=id_attribute, resource=schema_resource) + + return registry + + +def create_retrieve_callable( + parser_set: ParserSet, schema_uri: str | None +) -> t.Callable[[str], referencing.Resource]: + def get_local_file(uri: str): + path = pathlib.Path(uri) + if not path.is_absolute(): + if schema_uri is None: + raise referencing.exceptions.Unretrievable( + f"Cannot retrieve schema reference data for '{uri}' from " + "local filesystem. " + "The path appears relative, but there is no known local base path." + ) + schema_path = filename2path(schema_uri) + path = schema_path.parent / path + return parser_set.parse_file(path, "json") + + def retrieve_reference(uri: str) -> referencing.Resource: + scheme = urllib.parse.urlsplit(uri).scheme + if scheme in ("http", "https"): + data = requests.get(uri, stream=True) + parsed_object = parser_set.parse_data_with_path(data.raw, uri, "json") + else: + parsed_object = get_local_file(uri) + + return referencing.Resource.from_contents( + parsed_object, default_specification=DRAFT202012 + ) + + return retrieve_reference diff --git a/tests/acceptance/conftest.py b/tests/acceptance/conftest.py index ee7ada8a4..e47b088a1 100644 --- a/tests/acceptance/conftest.py +++ b/tests/acceptance/conftest.py @@ -1,9 +1,21 @@ +import textwrap + import pytest from click.testing import CliRunner from check_jsonschema import main as cli_main +def _render_result(result): + return f""" +output: +{textwrap.indent(result.output, " ")} + +stderr: +{textwrap.indent(result.stderr, " ")} +""" + + @pytest.fixture def cli_runner(): return CliRunner(mix_stderr=False) @@ -22,8 +34,14 @@ def func(cli_args, *args, **kwargs): @pytest.fixture def run_line_simple(run_line): - def func(cli_args, *args, **kwargs): - res = run_line(["check-jsonschema"] + cli_args, *args, **kwargs) - assert res.exit_code == 0 + def func(cli_args, *args, full_traceback: bool = True, **kwargs): + res = run_line( + ["check-jsonschema"] + + (["--traceback-mode", "full"] if full_traceback else []) + + cli_args, + *args, + **kwargs, + ) + assert res.exit_code == 0, _render_result(res) return func diff --git a/tests/acceptance/test_nonjson_schema_handling.py b/tests/acceptance/test_nonjson_schema_handling.py index 26dffde46..d64ac42e7 100644 --- a/tests/acceptance/test_nonjson_schema_handling.py +++ b/tests/acceptance/test_nonjson_schema_handling.py @@ -31,9 +31,10 @@ @pytest.mark.parametrize("passing_data", [True, False]) -def test_warning_on_yaml_reference_passes(run_line, tmp_path, passing_data): +def test_yaml_reference(run_line, tmp_path, passing_data): main_schemafile = tmp_path / "main_schema.json" main_schemafile.write_text(json.dumps(YAML_REF_MAIN_SCHEMA)) + # JSON is a subset of YAML, so this works for generated YAML ref_schema = tmp_path / "title_schema.yaml" ref_schema.write_text(json.dumps(TITLE_SCHEMA)) @@ -47,14 +48,11 @@ def test_warning_on_yaml_reference_passes(run_line, tmp_path, passing_data): ["check-jsonschema", "--schemafile", str(main_schemafile), str(doc)] ) assert result.exit_code == (0 if passing_data else 1) - assert ( - "WARNING: You appear to be using a schema which references a YAML file" - in result.stderr - ) +@pytest.mark.skipif(not JSON5_ENABLED, reason="test requires json5") @pytest.mark.parametrize("passing_data", [True, False]) -def test_warning_on_json5_reference(run_line, tmp_path, passing_data): +def test_json5_reference(run_line, tmp_path, passing_data): main_schemafile = tmp_path / "main_schema.json" main_schemafile.write_text(json.dumps(JSON5_REF_MAIN_SCHEMA)) ref_schema = tmp_path / "title_schema.json5" @@ -70,10 +68,6 @@ def test_warning_on_json5_reference(run_line, tmp_path, passing_data): ["check-jsonschema", "--schemafile", str(main_schemafile), str(doc)] ) assert result.exit_code == (0 if passing_data else 1) - assert ( - "WARNING: You appear to be using a schema which references a JSON5 file" - in result.stderr - ) @pytest.mark.skipif(not JSON5_ENABLED, reason="test requires json5") diff --git a/tox.ini b/tox.ini index b6804e0d7..524f83d3f 100644 --- a/tox.ini +++ b/tox.ini @@ -87,4 +87,3 @@ commands = python ./scripts/generate-hooks-config.py [pytest] filterwarnings = error - ignore:jsonschema\.(RefResolver|exceptions\.RefResolutionError) is deprecated as of (version |v)4\.18\.0:DeprecationWarning From 93edec3a66796701ba80822f6e95ed6233b12587 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Thu, 27 Jul 2023 23:22:48 -0500 Subject: [PATCH 2/9] Drop support for python3.7 `referencing` does not support 3.7, which means that supporting 3.7 would start to require separate compatibility code precisely when it has just reached EOL (4 weeks ago at time of writing). Drop 3.7 and move the 'mindeps' builds to 3.8 --- .github/workflows/build.yaml | 4 ++-- tox.ini | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 612904473..d674fb38a 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -11,7 +11,7 @@ jobs: py: ["3.x"] include: - toxenv: py-mindeps - py: "3.7" + py: "3.8" runs-on: ubuntu-latest name: "Run '${{ matrix.toxenv }}' on python ${{ matrix.py }}" @@ -40,7 +40,7 @@ jobs: strategy: matrix: os: [ubuntu-latest, windows-latest, macos-latest] - py: ['3.7', '3.8', '3.9', '3.10', '3.11'] + py: ['3.8', '3.9', '3.10', '3.11'] name: "Run tests on ${{ matrix.os }}, py${{ matrix.py }}" runs-on: ${{ matrix.os }} steps: diff --git a/tox.ini b/tox.ini index 524f83d3f..1e5ae016b 100644 --- a/tox.ini +++ b/tox.ini @@ -2,10 +2,10 @@ envlist = mypy cov_clean - py37-mindeps - py{311,310,39,38,37} + py38-mindeps + py{311,310,39,38} py310-{notoml,tomli-format} - py{37,311}-{json5,pyjson5} + py{38,311}-{json5,pyjson5} cov skip_missing_interpreters = true minversion = 4.0.0 @@ -18,7 +18,7 @@ description = "run tests with pytest" usedevelop = true extras = dev deps = - mindeps: jsonschema==4.5.1 + mindeps: jsonschema==4.18.0 mindeps: click==8.0.0 mindeps: requests==2.0.0 mindeps: importlib-resources==1.4.0 From 5134dd935fa051884fc86fddd344e173561068ec Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Sat, 29 Jul 2023 12:51:10 -0500 Subject: [PATCH 3/9] Resolve typing issues with referencing+mypy Also setup a pyright tox environment for testing and usage. --- src/check_jsonschema/parsers/__init__.py | 2 +- src/check_jsonschema/schema_loader/resolver.py | 12 +++++++----- tox.ini | 7 +++++++ 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/src/check_jsonschema/parsers/__init__.py b/src/check_jsonschema/parsers/__init__.py index edce01feb..88a66761f 100644 --- a/src/check_jsonschema/parsers/__init__.py +++ b/src/check_jsonschema/parsers/__init__.py @@ -65,7 +65,7 @@ def __init__( } def get( - self, path: pathlib.Path, default_filetype: str + self, path: pathlib.Path | str, default_filetype: str ) -> t.Callable[[t.BinaryIO], t.Any]: filetype = path_to_type(path, default_type=default_filetype) diff --git a/src/check_jsonschema/schema_loader/resolver.py b/src/check_jsonschema/schema_loader/resolver.py index ec4c7af54..6207b30cf 100644 --- a/src/check_jsonschema/schema_loader/resolver.py +++ b/src/check_jsonschema/schema_loader/resolver.py @@ -6,7 +6,7 @@ import referencing import requests -from referencing.jsonschema import DRAFT202012 +from referencing.jsonschema import DRAFT202012, Schema from ..parsers import ParserSet from ..utils import filename2path @@ -18,7 +18,9 @@ def make_reference_registry( schema_resource = referencing.Resource.from_contents( schema, default_specification=DRAFT202012 ) - registry = referencing.Registry( + # mypy does not recognize that Registry is an `attrs` class and has `retrieve` as an + # argument to its implicit initializer + registry: referencing.Registry = referencing.Registry( # type: ignore[call-arg] retrieve=create_retrieve_callable(parsers, schema_uri) ) @@ -34,8 +36,8 @@ def make_reference_registry( def create_retrieve_callable( parser_set: ParserSet, schema_uri: str | None -) -> t.Callable[[str], referencing.Resource]: - def get_local_file(uri: str): +) -> t.Callable[[str], referencing.Resource[Schema]]: + def get_local_file(uri: str) -> t.Any: path = pathlib.Path(uri) if not path.is_absolute(): if schema_uri is None: @@ -48,7 +50,7 @@ def get_local_file(uri: str): path = schema_path.parent / path return parser_set.parse_file(path, "json") - def retrieve_reference(uri: str) -> referencing.Resource: + def retrieve_reference(uri: str) -> referencing.Resource[Schema]: scheme = urllib.parse.urlsplit(uri).scheme if scheme in ("http", "https"): data = requests.get(uri, stream=True) diff --git a/tox.ini b/tox.ini index 1e5ae016b..24ca226f6 100644 --- a/tox.ini +++ b/tox.ini @@ -54,6 +54,13 @@ deps = mypy click==8.1.3 commands = mypy src/ {posargs} +[testenv:pyright] +description = "check type annotations with pyright" +deps = pyright + types-jsonschema + types-requests +commands = pyright src/ {posargs} + [testenv:docs] description = "build docs with sphinx" basepython = python3.10 From ff740f82c4802e1b7ed977e4825ad719386a6860 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Sat, 29 Jul 2023 12:53:21 -0500 Subject: [PATCH 4/9] Update changelog for '-py3.7' and 'referencing' --- CHANGELOG.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b6a645dc7..a8df1f583 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -11,6 +11,15 @@ Unreleased .. vendor-insert-here - Update vendored schemas (2023-07-18) +- Remove support for python3.7 +- The minimum supported version of the `jsonschema` library is now `4.18.0`, + which introduces new `$ref` resolution behavior and fixes. That behavior is + used in all cases, which should result in faster evaluation especially on + large schemas. +- `$ref` usage may now refer to YAML, TOML, or JSON5 files, or any other + non-JSON format supported by `check-jsonschema`. The file type is inferred + only from the file extension in these cases and defaults to JSON if there is + no recognizable extension. 0.23.3 ------ From 9481d2a725c0354cf8a3c7eca8b4d87076bf9c8c Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Sat, 29 Jul 2023 22:22:09 -0500 Subject: [PATCH 5/9] Introduce remote $ref tests and fix URL join logic URL joining needs to be done between any relative ref paths and the base URI. This was not correct, as revealed by a new testcase. After fixing, this revealed that the local file loading was somewhat indirect in a way that it no longer needs to be, so local file handling in the ref resolver has been refactored as well. --- .../schema_loader/resolver.py | 26 +++---- .../acceptance/test_remote_ref_resolution.py | 76 +++++++++++++++++++ 2 files changed, 87 insertions(+), 15 deletions(-) create mode 100644 tests/acceptance/test_remote_ref_resolution.py diff --git a/src/check_jsonschema/schema_loader/resolver.py b/src/check_jsonschema/schema_loader/resolver.py index 6207b30cf..5d658eadc 100644 --- a/src/check_jsonschema/schema_loader/resolver.py +++ b/src/check_jsonschema/schema_loader/resolver.py @@ -1,6 +1,5 @@ from __future__ import annotations -import pathlib import typing as t import urllib.parse @@ -38,25 +37,22 @@ def create_retrieve_callable( parser_set: ParserSet, schema_uri: str | None ) -> t.Callable[[str], referencing.Resource[Schema]]: def get_local_file(uri: str) -> t.Any: - path = pathlib.Path(uri) - if not path.is_absolute(): - if schema_uri is None: - raise referencing.exceptions.Unretrievable( - f"Cannot retrieve schema reference data for '{uri}' from " - "local filesystem. " - "The path appears relative, but there is no known local base path." - ) - schema_path = filename2path(schema_uri) - path = schema_path.parent / path + path = filename2path(uri) return parser_set.parse_file(path, "json") def retrieve_reference(uri: str) -> referencing.Resource[Schema]: scheme = urllib.parse.urlsplit(uri).scheme - if scheme in ("http", "https"): - data = requests.get(uri, stream=True) - parsed_object = parser_set.parse_data_with_path(data.raw, uri, "json") + if scheme == "" and schema_uri is not None: + full_uri = urllib.parse.urljoin(schema_uri, uri) else: - parsed_object = get_local_file(uri) + full_uri = uri + + full_uri_scheme = urllib.parse.urlsplit(full_uri).scheme + if full_uri_scheme in ("http", "https"): + data = requests.get(full_uri, stream=True) + parsed_object = parser_set.parse_data_with_path(data.raw, full_uri, "json") + else: + parsed_object = get_local_file(full_uri) return referencing.Resource.from_contents( parsed_object, default_specification=DRAFT202012 diff --git a/tests/acceptance/test_remote_ref_resolution.py b/tests/acceptance/test_remote_ref_resolution.py new file mode 100644 index 000000000..626b93bf9 --- /dev/null +++ b/tests/acceptance/test_remote_ref_resolution.py @@ -0,0 +1,76 @@ +import json + +import pytest +import responses + +from check_jsonschema import cachedownloader + +CASES = { + "case1": { + "main_schema": { + "$schema": "http://json-schema.org/draft-07/schema", + "properties": { + "title": {"$ref": "./title_schema.json"}, + }, + "additionalProperties": False, + }, + "other_schemas": {"title_schema": {"type": "string"}}, + "passing_document": {"title": "doc one"}, + "failing_document": {"title": 2}, + }, + "case2": { + "main_schema": { + "$schema": "http://json-schema.org/draft-07/schema", + "type": "object", + "required": ["test"], + "properties": {"test": {"$ref": "./values.json#/$defs/test"}}, + }, + "other_schemas": { + "values": { + "$schema": "http://json-schema.org/draft-07/schema", + "$defs": {"test": {"type": "string"}}, + } + }, + "passing_document": {"test": "some data"}, + "failing_document": {"test": {"foo": "bar"}}, + }, +} + + +@pytest.mark.parametrize("check_passes", (True, False)) +@pytest.mark.parametrize("casename", ("case1", "case2")) +def test_remote_ref_resolution_simple_case( + run_line, check_passes, casename, tmp_path, monkeypatch +): + def _fake_compute_default_cache_dir(self): + return str(tmp_path) + + monkeypatch.setattr( + cachedownloader.CacheDownloader, + "_compute_default_cache_dir", + _fake_compute_default_cache_dir, + ) + + main_schema_loc = "https://example.com/main.json" + responses.add("GET", main_schema_loc, json=CASES[casename]["main_schema"]) + for name, subschema in CASES[casename]["other_schemas"].items(): + other_schema_loc = f"https://example.com/{name}.json" + responses.add("GET", other_schema_loc, json=subschema) + + instance_path = tmp_path / "instance.json" + instance_path.write_text( + json.dumps( + CASES[casename]["passing_document"] + if check_passes + else CASES[casename]["failing_document"] + ) + ) + + result = run_line( + ["check-jsonschema", "--schemafile", main_schema_loc, str(instance_path)] + ) + output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}" + if check_passes: + assert result.exit_code == 0, output + else: + assert result.exit_code == 1, output From 90f545dc16f15d543c3d7ffc478729bd76984b45 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Wed, 2 Aug 2023 22:50:16 -0500 Subject: [PATCH 6/9] Add test case for explicit use of 2020 metaschema Relative dyanicRef usage was resolved incorrectly in the prior implementation, but it has been fixed by correct use of `referencing`. Test to ensure that this case is satisfied. See also: #293 --- .../positive/2020-meta/instance.json | 1 + .../positive/2020-meta/schema.json | 80 +++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 tests/example-files/explicit-schema/positive/2020-meta/instance.json create mode 100644 tests/example-files/explicit-schema/positive/2020-meta/schema.json diff --git a/tests/example-files/explicit-schema/positive/2020-meta/instance.json b/tests/example-files/explicit-schema/positive/2020-meta/instance.json new file mode 100644 index 000000000..0967ef424 --- /dev/null +++ b/tests/example-files/explicit-schema/positive/2020-meta/instance.json @@ -0,0 +1 @@ +{} diff --git a/tests/example-files/explicit-schema/positive/2020-meta/schema.json b/tests/example-files/explicit-schema/positive/2020-meta/schema.json new file mode 100644 index 000000000..605a83375 --- /dev/null +++ b/tests/example-files/explicit-schema/positive/2020-meta/schema.json @@ -0,0 +1,80 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://json-schema.org/draft/2020-12/schema", + "$vocabulary": { + "https://json-schema.org/draft/2020-12/vocab/core": true, + "https://json-schema.org/draft/2020-12/vocab/applicator": true, + "https://json-schema.org/draft/2020-12/vocab/unevaluated": true, + "https://json-schema.org/draft/2020-12/vocab/validation": true, + "https://json-schema.org/draft/2020-12/vocab/meta-data": true, + "https://json-schema.org/draft/2020-12/vocab/format-annotation": true, + "https://json-schema.org/draft/2020-12/vocab/content": true + }, + "$dynamicAnchor": "meta", + "title": "Core and Validation specifications meta-schema", + "allOf": [ + { + "$ref": "meta/core" + }, + { + "$ref": "meta/applicator" + }, + { + "$ref": "meta/unevaluated" + }, + { + "$ref": "meta/validation" + }, + { + "$ref": "meta/meta-data" + }, + { + "$ref": "meta/format-annotation" + }, + { + "$ref": "meta/content" + } + ], + "type": [ + "object", + "boolean" + ], + "$comment": "This meta-schema also defines keywords that have appeared in previous drafts in order to prevent incompatible extensions as they remain in common use.", + "properties": { + "definitions": { + "$comment": "\"definitions\" has been replaced by \"$defs\".", + "type": "object", + "additionalProperties": { + "$dynamicRef": "#meta" + }, + "deprecated": true, + "default": {} + }, + "dependencies": { + "$comment": "\"dependencies\" has been split and replaced by \"dependentSchemas\" and \"dependentRequired\" in order to serve their differing semantics.", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "$dynamicRef": "#meta" + }, + { + "$ref": "meta/validation#/$defs/stringArray" + } + ] + }, + "deprecated": true, + "default": {} + }, + "$recursiveAnchor": { + "$comment": "\"$recursiveAnchor\" has been replaced by \"$dynamicAnchor\".", + "$ref": "meta/core#/$defs/anchorString", + "deprecated": true + }, + "$recursiveRef": { + "$comment": "\"$recursiveRef\" has been replaced by \"$dynamicRef\".", + "$ref": "meta/core#/$defs/uriReferenceString", + "deprecated": true + } + } +} From b7857460f6a3ee84753559a6cfbb0fd900c64663 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Thu, 3 Aug 2023 09:41:54 -0500 Subject: [PATCH 7/9] Add more tests for ref resolution Ensure that the `$id` and retrieval URI handling are both done correctly. JSON Schema specifies that `$id` has precedence, so one of the tests confirms that preference. --- .../acceptance/test_remote_ref_resolution.py | 77 +++++++++++++++++-- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/tests/acceptance/test_remote_ref_resolution.py b/tests/acceptance/test_remote_ref_resolution.py index 626b93bf9..08b74733e 100644 --- a/tests/acceptance/test_remote_ref_resolution.py +++ b/tests/acceptance/test_remote_ref_resolution.py @@ -37,11 +37,8 @@ } -@pytest.mark.parametrize("check_passes", (True, False)) -@pytest.mark.parametrize("casename", ("case1", "case2")) -def test_remote_ref_resolution_simple_case( - run_line, check_passes, casename, tmp_path, monkeypatch -): +@pytest.fixture(autouse=True) +def _mock_schema_cache_dir(monkeypatch, tmp_path): def _fake_compute_default_cache_dir(self): return str(tmp_path) @@ -51,6 +48,10 @@ def _fake_compute_default_cache_dir(self): _fake_compute_default_cache_dir, ) + +@pytest.mark.parametrize("check_passes", (True, False)) +@pytest.mark.parametrize("casename", ("case1", "case2")) +def test_remote_ref_resolution_simple_case(run_line, check_passes, casename, tmp_path): main_schema_loc = "https://example.com/main.json" responses.add("GET", main_schema_loc, json=CASES[casename]["main_schema"]) for name, subschema in CASES[casename]["other_schemas"].items(): @@ -74,3 +75,69 @@ def _fake_compute_default_cache_dir(self): assert result.exit_code == 0, output else: assert result.exit_code == 1, output + + +# this test ensures that `$id` is preferred for the base URI over +# the retrieval URI +@pytest.mark.parametrize("check_passes", (True, False)) +def test_ref_resolution_prefers_id_over_retrieval_uri(run_line, tmp_path, check_passes): + main_schema = { + "$id": "https://example.org/schemas/main.json", + "$schema": "http://json-schema.org/draft-07/schema", + "properties": { + "title": {"$ref": "./title_schema.json"}, + }, + "additionalProperties": False, + } + title_schema = {"type": "string"} + + retrieval_uri = "https://example.org/alternate-path-retrieval-only/schemas/main" + responses.add("GET", retrieval_uri, json=main_schema) + responses.add( + "GET", "https://example.org/schemas/title_schema.json", json=title_schema + ) + + instance_path = tmp_path / "instance.json" + instance_path.write_text(json.dumps({"title": "doc one" if check_passes else 2})) + + result = run_line( + ["check-jsonschema", "--schemafile", retrieval_uri, str(instance_path)] + ) + output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}" + if check_passes: + assert result.exit_code == 0, output + else: + assert result.exit_code == 1, output + + +@pytest.mark.parametrize("check_passes", (True, False)) +def test_ref_resolution_does_not_callout_for_absolute_ref_to_retrieval_uri( + run_line, tmp_path, check_passes +): + retrieval_uri = "https://example.net/schemas/main" + + main_schema = { + "$id": "https://example.net/schemas/some-uri-which-will-never-be-used/main.json", + "$schema": "http://json-schema.org/draft-07/schema", + "$defs": {"title": {"type": "string"}}, + "properties": { + "title": {"$ref": f"{retrieval_uri}#/$defs/title"}, + }, + "additionalProperties": False, + } + + # exactly one GET to the retrieval URI will work + responses.add("GET", retrieval_uri, json=main_schema) + responses.add("GET", retrieval_uri, json={"error": "permafrost melted"}, status=500) + + instance_path = tmp_path / "instance.json" + instance_path.write_text(json.dumps({"title": "doc one" if check_passes else 2})) + + result = run_line( + ["check-jsonschema", "--schemafile", retrieval_uri, str(instance_path)] + ) + output = f"\nstdout:\n{result.stdout}\n\nstderr:\n{result.stderr}" + if check_passes: + assert result.exit_code == 0, output + else: + assert result.exit_code == 1, output From 9294f4612d2642b4f8b1e43084fa9aa7d339ae80 Mon Sep 17 00:00:00 2001 From: Stephen Rosen Date: Thu, 3 Aug 2023 09:53:36 -0500 Subject: [PATCH 8/9] Improve internal terminology for retrieval URI The retrieval URI was internally called the 'ref base' or 'schema URI', both of which suggest that it is definitively the base URI used for reference resolution. This makes it confusing that adjusting this URI might not change the base URI used. Instead, call it properly 'retrieval URI'. Base URI is newly introduced inside of the retrieve callback, after receiving both the `$id` (if any) and the retrieval URI. --- src/check_jsonschema/schema_loader/main.py | 10 +++---- src/check_jsonschema/schema_loader/readers.py | 4 +-- .../schema_loader/resolver.py | 26 ++++++++++++------- 3 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/check_jsonschema/schema_loader/main.py b/src/check_jsonschema/schema_loader/main.py index 3b10c3b01..63c471934 100644 --- a/src/check_jsonschema/schema_loader/main.py +++ b/src/check_jsonschema/schema_loader/main.py @@ -100,8 +100,8 @@ def _get_schema_reader(self) -> LocalSchemaReader | HttpSchemaReader: f"detected parsed URL had an unrecognized scheme: {self.url_info}" ) - def get_schema_ref_base(self) -> str | None: - return self.reader.get_ref_base() + def get_schema_retrieval_uri(self) -> str | None: + return self.reader.get_retrieval_uri() def get_schema(self) -> dict[str, t.Any]: return self.reader.read_schema() @@ -113,7 +113,7 @@ def get_validator( format_opts: FormatOptions, fill_defaults: bool, ) -> jsonschema.Validator: - schema_uri = self.get_schema_ref_base() + retrieval_uri = self.get_schema_retrieval_uri() schema = self.get_schema() schema_dialect = schema.get("$schema") @@ -123,7 +123,7 @@ def get_validator( # reference resolution # with support for YAML, TOML, and other formats from the parsers - reference_registry = make_reference_registry(self._parsers, schema_uri, schema) + reference_registry = make_reference_registry(self._parsers, retrieval_uri, schema) # get the correct validator class and check the schema under its metaschema validator_cls = jsonschema.validators.validator_for(schema) @@ -147,7 +147,7 @@ def __init__(self, schema_name: str) -> None: self.schema_name = schema_name self._parsers = ParserSet() - def get_schema_ref_base(self) -> str | None: + def get_schema_retrieval_uri(self) -> str | None: return None def get_schema(self) -> dict[str, t.Any]: diff --git a/src/check_jsonschema/schema_loader/readers.py b/src/check_jsonschema/schema_loader/readers.py index 98e24c011..c2e469781 100644 --- a/src/check_jsonschema/schema_loader/readers.py +++ b/src/check_jsonschema/schema_loader/readers.py @@ -30,7 +30,7 @@ def __init__(self, filename: str) -> None: self.filename = str(self.path) self.parsers = ParserSet() - def get_ref_base(self) -> str: + def get_retrieval_uri(self) -> str: return self.path.as_uri() def _read_impl(self) -> t.Any: @@ -55,7 +55,7 @@ def __init__( validation_callback=json.loads, ) - def get_ref_base(self) -> str: + def get_retrieval_uri(self) -> str: return self.url def _read_impl(self) -> t.Any: diff --git a/src/check_jsonschema/schema_loader/resolver.py b/src/check_jsonschema/schema_loader/resolver.py index 5d658eadc..b227b5da9 100644 --- a/src/check_jsonschema/schema_loader/resolver.py +++ b/src/check_jsonschema/schema_loader/resolver.py @@ -12,21 +12,25 @@ def make_reference_registry( - parsers: ParserSet, schema_uri: str | None, schema: dict + parsers: ParserSet, retrieval_uri: str | None, schema: dict ) -> referencing.Registry: + id_attribute_: t.Any = schema.get("$id") + if isinstance(id_attribute_, str): + id_attribute: str | None = id_attribute_ + else: + id_attribute = None + schema_resource = referencing.Resource.from_contents( schema, default_specification=DRAFT202012 ) # mypy does not recognize that Registry is an `attrs` class and has `retrieve` as an # argument to its implicit initializer registry: referencing.Registry = referencing.Registry( # type: ignore[call-arg] - retrieve=create_retrieve_callable(parsers, schema_uri) + retrieve=create_retrieve_callable(parsers, retrieval_uri, id_attribute) ) - if schema_uri is not None: - registry = registry.with_resource(uri=schema_uri, resource=schema_resource) - - id_attribute = schema.get("$id") + if retrieval_uri is not None: + registry = registry.with_resource(uri=retrieval_uri, resource=schema_resource) if id_attribute is not None: registry = registry.with_resource(uri=id_attribute, resource=schema_resource) @@ -34,16 +38,20 @@ def make_reference_registry( def create_retrieve_callable( - parser_set: ParserSet, schema_uri: str | None + parser_set: ParserSet, retrieval_uri: str | None, id_attribute: str | None ) -> t.Callable[[str], referencing.Resource[Schema]]: + base_uri = id_attribute + if base_uri is None: + base_uri = retrieval_uri + def get_local_file(uri: str) -> t.Any: path = filename2path(uri) return parser_set.parse_file(path, "json") def retrieve_reference(uri: str) -> referencing.Resource[Schema]: scheme = urllib.parse.urlsplit(uri).scheme - if scheme == "" and schema_uri is not None: - full_uri = urllib.parse.urljoin(schema_uri, uri) + if scheme == "" and base_uri is not None: + full_uri = urllib.parse.urljoin(base_uri, uri) else: full_uri = uri From 547f86b2bd1311db1ead679898927a1f3f6ffd30 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 3 Aug 2023 14:58:56 +0000 Subject: [PATCH 9/9] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/check_jsonschema/schema_loader/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/check_jsonschema/schema_loader/main.py b/src/check_jsonschema/schema_loader/main.py index 63c471934..c8c9045a1 100644 --- a/src/check_jsonschema/schema_loader/main.py +++ b/src/check_jsonschema/schema_loader/main.py @@ -123,7 +123,9 @@ def get_validator( # reference resolution # with support for YAML, TOML, and other formats from the parsers - reference_registry = make_reference_registry(self._parsers, retrieval_uri, schema) + reference_registry = make_reference_registry( + self._parsers, retrieval_uri, schema + ) # get the correct validator class and check the schema under its metaschema validator_cls = jsonschema.validators.validator_for(schema)