From 73fd93f898966b266c47ac4f23283fd824c2cf34 Mon Sep 17 00:00:00 2001
From: Stephen Rosen
Date: Mon, 25 Nov 2024 10:48:57 -0600
Subject: [PATCH 1/2] Internal refactor: no default cache dirname

The two cache dirs are currently "refs" and "downloads". Make these always
explicit, rather than `"downloads"` being an internal default baked into the
downloader.
---
 src/check_jsonschema/cachedownloader.py        | 11 +++--------
 src/check_jsonschema/schema_loader/readers.py  |  2 +-
 src/check_jsonschema/schema_loader/resolver.py |  2 +-
 tests/unit/test_cachedownloader.py             | 17 +++++++++--------
 4 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/src/check_jsonschema/cachedownloader.py b/src/check_jsonschema/cachedownloader.py
index 6b9865126..40f1d2cbb 100644
--- a/src/check_jsonschema/cachedownloader.py
+++ b/src/check_jsonschema/cachedownloader.py
@@ -33,7 +33,7 @@ def _base_cache_dir() -> str | None:
     return cache_dir
 
 
-def _resolve_cache_dir(dirname: str = "downloads") -> str | None:
+def _resolve_cache_dir(dirname: str) -> str | None:
     cache_dir = _base_cache_dir()
     if cache_dir:
         cache_dir = os.path.join(cache_dir, "check_jsonschema", dirname)
@@ -100,13 +100,8 @@ class FailedDownloadError(Exception):
 
 
 class CacheDownloader:
-    def __init__(
-        self, cache_dir: str | None = None, disable_cache: bool = False
-    ) -> None:
-        if cache_dir is None:
-            self._cache_dir = _resolve_cache_dir()
-        else:
-            self._cache_dir = _resolve_cache_dir(cache_dir)
+    def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None:
+        self._cache_dir = _resolve_cache_dir(cache_dir)
         self._disable_cache = disable_cache
 
     def _download(
diff --git a/src/check_jsonschema/schema_loader/readers.py b/src/check_jsonschema/schema_loader/readers.py
index 65280808b..f496d42eb 100644
--- a/src/check_jsonschema/schema_loader/readers.py
+++ b/src/check_jsonschema/schema_loader/readers.py
@@ -79,7 +79,7 @@ def __init__(
         self.url = url
         self.parsers = ParserSet()
         self.downloader = CacheDownloader(
-            disable_cache=disable_cache,
+            "downloads", disable_cache=disable_cache
         ).bind(url, cache_filename, validation_callback=self._parse)
         self._parsed_schema: dict | _UnsetType = _UNSET
 
diff --git a/src/check_jsonschema/schema_loader/resolver.py b/src/check_jsonschema/schema_loader/resolver.py
index 5084328a5..a82ae31c9 100644
--- a/src/check_jsonschema/schema_loader/resolver.py
+++ b/src/check_jsonschema/schema_loader/resolver.py
@@ -66,7 +66,7 @@ def create_retrieve_callable(
         base_uri = retrieval_uri
 
     cache = ResourceCache()
-    downloader = CacheDownloader("refs", disable_cache)
+    downloader = CacheDownloader("refs", disable_cache=disable_cache)
 
     def get_local_file(uri: str) -> t.Any:
         path = filename2path(uri)
diff --git a/tests/unit/test_cachedownloader.py b/tests/unit/test_cachedownloader.py
index 30d47052b..99382e921 100644
--- a/tests/unit/test_cachedownloader.py
+++ b/tests/unit/test_cachedownloader.py
@@ -30,7 +30,7 @@ def default_response():
 
 def test_default_filename_from_uri(default_response):
-    cd = CacheDownloader().bind("https://example.com/schema1.json")
+    cd = CacheDownloader("downloads").bind("https://example.com/schema1.json")
     assert cd._filename == "schema1.json"
 
 
 @pytest.mark.parametrize(
@@ -76,7 +76,7 @@ def fake_expanduser(path):
     monkeypatch.setattr(platform, "system", fakesystem)
     monkeypatch.setattr(os.path, "expanduser", fake_expanduser)
 
-    cd = CacheDownloader()
+    cd = CacheDownloader("downloads")
     assert cd._cache_dir == expect_value
 
     if sysname == "Darwin":
@@ -114,7 +114,7 @@ def test_cachedownloader_cached_file(tmp_path, monkeypatch, default_response):
     f.write_text("{}")
 
     # set the cache_dir to the tmp dir (so that cache_dir will always be set)
-    cd = CacheDownloader(cache_dir=tmp_path).bind(str(f))
+    cd = CacheDownloader(tmp_path).bind(str(f))
     # patch the downloader to skip any download "work"
     monkeypatch.setattr(
         cd._downloader, "_download", lambda file_uri, filename, response_ok: str(f)
@@ -128,7 +128,7 @@ def test_cachedownloader_cached_file(tmp_path, monkeypatch, default_response):
 def test_cachedownloader_on_success(get_download_cache_loc, disable_cache):
     add_default_response()
     f = get_download_cache_loc("schema1.json")
-    cd = CacheDownloader(disable_cache=disable_cache).bind(
+    cd = CacheDownloader("downloads", disable_cache=disable_cache).bind(
         "https://example.com/schema1.json"
     )
@@ -171,7 +171,7 @@ def test_cachedownloader_succeeds_after_few_errors(
     )
     add_default_response()
     f = get_download_cache_loc("schema1.json")
-    cd = CacheDownloader(disable_cache=disable_cache).bind(
+    cd = CacheDownloader("downloads", disable_cache=disable_cache).bind(
         "https://example.com/schema1.json"
     )
@@ -205,7 +205,7 @@ def test_cachedownloader_fails_after_many_errors(
     )
     add_default_response()  # never reached, the 11th response
     f = get_download_cache_loc("schema1.json")
-    cd = CacheDownloader(disable_cache=disable_cache).bind(
+    cd = CacheDownloader("downloads", disable_cache=disable_cache).bind(
         "https://example.com/schema1.json"
     )
     with pytest.raises(FailedDownloadError):
@@ -226,6 +226,7 @@ def test_cachedownloader_retries_on_bad_data(get_download_cache_loc, disable_cac
     add_default_response()
     f = get_download_cache_loc("schema1.json")
     cd = CacheDownloader(
+        "downloads",
         disable_cache=disable_cache,
     ).bind(
         "https://example.com/schema1.json",
@@ -281,7 +282,7 @@ def fake_mktime(*args):
     if file_exists:
         inject_cached_download(uri, original_file_contents)
 
-    cd = CacheDownloader().bind(uri)
+    cd = CacheDownloader("downloads").bind(uri)
 
     # if the file already existed, it will not be overwritten by the cachedownloader
     # so the returned value for both the downloader and a direct file read should be the
@@ -326,7 +327,7 @@ def dummy_validate_bytes(data):
 
     # construct a downloader pointed at the schema and file, expecting a cache hit
     # and use the above validation method
-    cd = CacheDownloader().bind(
+    cd = CacheDownloader("downloads").bind(
         "https://example.com/schema1.json",
         validation_callback=dummy_validate_bytes,
     )

From 6e943027eb1d941132ca6efb2eb85136f4176461 Mon Sep 17 00:00:00 2001
From: Stephen Rosen
Date: Fri, 29 Nov 2024 07:54:47 -0600
Subject: [PATCH 2/2] Caching URLs always uses hashes

1. When caching downloaded schemas, hash their URLs to generate the cache
   filenames, just as is done in the `refs/` cache. This fixes bugs caused by
   cache collisions when two distinct URLs end in the same filename. The
   schema download cache also moves from `downloads/` to `schemas/`.

2. Deprecate (but do not, at this time, remove) the `--cache-filename`
   option, making it non-functional. This also removes the `cache_filename`
   internal arg from several components.

Minimal test and doc cleanup is included to handle these changes.
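
For illustration, a minimal sketch of the new naming behavior (the URLs are
illustrative and the arrow comments are placeholders, not real digests):

    from check_jsonschema.cachedownloader import url_to_cache_filename

    # the extension, if any, is preserved after the hash, so extension-based
    # parsing keeps working and cached files stay recognizable
    url_to_cache_filename("https://example.com/schema1.json")
    # -> "<sha256 hexdigest of the URL>.json"

    # a URL whose last path segment has no extension gets a bare hash
    url_to_cache_filename("https://example.com/schemas/v1")
    # -> "<sha256 hexdigest of the URL>"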
---
 docs/usage.rst                                |  7 +-
 src/check_jsonschema/cachedownloader.py       | 26 +++++-
 src/check_jsonschema/cli/main_command.py      |  9 +--
 src/check_jsonschema/schema_loader/main.py    |  8 +-
 src/check_jsonschema/schema_loader/readers.py |  7 +-
 .../schema_loader/resolver.py                 | 20 +----
 tests/conftest.py                             | 24 ++++--
 tests/unit/test_cachedownloader.py            | 80 ++++++++++---------
 8 files changed, 91 insertions(+), 90 deletions(-)

diff --git a/docs/usage.rst b/docs/usage.rst
index 7c8d79c4d..56a5bb12a 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -118,6 +118,10 @@ Downloading and Caching
 By default, when ``--schemafile`` is used to refer to an ``http://`` or
 ``https://`` location, the schema is downloaded and cached based on the
 schema's Last-Modified time.
+
+Additionally, when ``$ref``\s are looked up during schema resolution, they are
+similarly cached.
+
 The following options control caching behaviors.
 
 .. list-table:: Caching Options
@@ -128,9 +132,6 @@ The following options control caching behaviors.
     - Description
   * - ``--no-cache``
     - Disable caching.
-  * - ``--cache-filename``
-    - The name to use for caching a remote schema.
-      Defaults to using the last slash-delimited part of the URI.
 
 "format" Validation Options
 ---------------------------
diff --git a/src/check_jsonschema/cachedownloader.py b/src/check_jsonschema/cachedownloader.py
index 40f1d2cbb..41cf2aae3 100644
--- a/src/check_jsonschema/cachedownloader.py
+++ b/src/check_jsonschema/cachedownloader.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import contextlib
+import hashlib
 import io
 import os
 import platform
@@ -95,6 +96,25 @@ def _cache_hit(cachefile: str, response: requests.Response) -> bool:
     return local_mtime >= remote_mtime
 
 
+def url_to_cache_filename(ref_url: str) -> str:
+    """
+    Given a schema URL, convert it to a filename for caching in a cache dir.
+
+    Rules are as follows:
+    - the base filename is a sha256 hash of the URL
+    - if the filename ends in an extension (.json, .yaml, etc) that extension
+      is appended to the hash
+
+    Preserving the file extension keeps the extension-based parsing logic
+    working, and it also helps anyone browsing the cache identify filetypes.
+    """
+    filename = hashlib.sha256(ref_url.encode()).hexdigest()
+    if "." in (last_part := ref_url.rpartition("/")[-1]):
+        _, _, extension = last_part.rpartition(".")
+        filename = f"{filename}.{extension}"
+    return filename
+
+
 class FailedDownloadError(Exception):
     pass
 
@@ -155,7 +175,7 @@ def bind(
         validation_callback: t.Callable[[bytes], t.Any] | None = None,
     ) -> BoundCacheDownloader:
         return BoundCacheDownloader(
-            file_url, filename, self, validation_callback=validation_callback
+            file_url, self, filename=filename, validation_callback=validation_callback
         )
 
 
@@ -163,13 +183,13 @@ class BoundCacheDownloader:
     def __init__(
         self,
         file_url: str,
-        filename: str | None,
         downloader: CacheDownloader,
         *,
+        filename: str | None = None,
         validation_callback: t.Callable[[bytes], t.Any] | None = None,
     ) -> None:
         self._file_url = file_url
-        self._filename = filename or file_url.split("/")[-1]
+        self._filename = filename or url_to_cache_filename(file_url)
         self._downloader = downloader
         self._validation_callback = validation_callback
 
diff --git a/src/check_jsonschema/cli/main_command.py b/src/check_jsonschema/cli/main_command.py
index 3145019e8..3a7aae643 100644
--- a/src/check_jsonschema/cli/main_command.py
+++ b/src/check_jsonschema/cli/main_command.py
@@ -130,11 +130,7 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
     help="Disable schema caching. Always download remote schemas.",
 )
 @click.option(
-    "--cache-filename",
-    help=(
-        "The name to use for caching a remote schema. "
-        "Defaults to the last slash-delimited part of the URI."
-    ),
+    "--cache-filename", help="Deprecated. This option no longer has any effect."
 )
 @click.option(
     "--disable-formats",
@@ -271,8 +267,6 @@ def main(
     args.disable_cache = no_cache
     args.default_filetype = default_filetype
     args.fill_defaults = fill_defaults
-    if cache_filename is not None:
-        args.cache_filename = cache_filename
     if data_transform is not None:
         args.data_transform = TRANSFORM_LIBRARY[data_transform]
 
@@ -300,7 +294,6 @@ def build_schema_loader(args: ParseResult) -> SchemaLoaderBase:
     assert args.schema_path is not None
     return SchemaLoader(
         args.schema_path,
-        cache_filename=args.cache_filename,
         disable_cache=args.disable_cache,
         base_uri=args.base_uri,
         validator_class=args.validator_class,
diff --git a/src/check_jsonschema/schema_loader/main.py b/src/check_jsonschema/schema_loader/main.py
index 88ff6a0f2..099107455 100644
--- a/src/check_jsonschema/schema_loader/main.py
+++ b/src/check_jsonschema/schema_loader/main.py
@@ -64,14 +64,12 @@ def __init__(
         self,
         schemafile: str,
         *,
-        cache_filename: str | None = None,
         base_uri: str | None = None,
         validator_class: type[jsonschema.protocols.Validator] | None = None,
         disable_cache: bool = True,
     ) -> None:
         # record input parameters (these are not to be modified)
         self.schemafile = schemafile
-        self.cache_filename = cache_filename
         self.disable_cache = disable_cache
         self.base_uri = base_uri
         self.validator_class = validator_class
@@ -105,11 +103,7 @@ def _get_schema_reader(
             return LocalSchemaReader(self.schemafile)
 
         if self.url_info.scheme in ("http", "https"):
-            return HttpSchemaReader(
-                self.schemafile,
-                self.cache_filename,
-                self.disable_cache,
-            )
+            return HttpSchemaReader(self.schemafile, self.disable_cache)
         else:
             raise UnsupportedUrlScheme(
                 "check-jsonschema only supports http, https, and local files. "
" diff --git a/src/check_jsonschema/schema_loader/readers.py b/src/check_jsonschema/schema_loader/readers.py index f496d42eb..61299350a 100644 --- a/src/check_jsonschema/schema_loader/readers.py +++ b/src/check_jsonschema/schema_loader/readers.py @@ -73,14 +73,13 @@ class HttpSchemaReader: def __init__( self, url: str, - cache_filename: str | None, disable_cache: bool, ) -> None: self.url = url self.parsers = ParserSet() - self.downloader = CacheDownloader( - "downloads", disable_cache=disable_cache - ).bind(url, cache_filename, validation_callback=self._parse) + self.downloader = CacheDownloader("schemas", disable_cache=disable_cache).bind( + url, validation_callback=self._parse + ) self._parsed_schema: dict | _UnsetType = _UNSET def _parse(self, schema_bytes: bytes) -> t.Any: diff --git a/src/check_jsonschema/schema_loader/resolver.py b/src/check_jsonschema/schema_loader/resolver.py index a82ae31c9..15344d6bd 100644 --- a/src/check_jsonschema/schema_loader/resolver.py +++ b/src/check_jsonschema/schema_loader/resolver.py @@ -1,6 +1,5 @@ from __future__ import annotations -import hashlib import typing as t import urllib.parse @@ -12,21 +11,6 @@ from ..utils import filename2path -def ref_url_to_cache_filename(ref_url: str) -> str: - """ - Given a $ref URL, convert it to the filename in the refs/ cache dir. - Rules are as follows: - - the base filename is an md5 hash of the URL - - if the filename ends in an extension (.json, .yaml, etc) that extension - is appended to the hash - """ - filename = hashlib.md5(ref_url.encode()).hexdigest() - if "." in (last_part := ref_url.rpartition("/")[-1]): - _, _, extension = last_part.rpartition(".") - filename = f"{filename}.{extension}" - return filename - - def make_reference_registry( parsers: ParserSet, retrieval_uri: str | None, schema: dict, disable_cache: bool ) -> referencing.Registry: @@ -89,9 +73,7 @@ def validation_callback(content: bytes) -> None: parser_set.parse_data_with_path(content, full_uri, "json") bound_downloader = downloader.bind( - full_uri, - ref_url_to_cache_filename(full_uri), - validation_callback, + full_uri, validation_callback=validation_callback ) with bound_downloader.open() as fp: data = fp.read() diff --git a/tests/conftest.py b/tests/conftest.py index 9179f6eed..b0022a8d3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -62,15 +62,25 @@ def patch_cache_dir(monkeypatch, cache_dir): yield m +@pytest.fixture +def url2cachepath(): + from check_jsonschema.cachedownloader import url_to_cache_filename + + def _get(cache_dir, url): + return cache_dir / url_to_cache_filename(url) + + return _get + + @pytest.fixture def downloads_cache_dir(tmp_path): return tmp_path / ".cache" / "check_jsonschema" / "downloads" @pytest.fixture -def get_download_cache_loc(downloads_cache_dir): - def _get(uri): - return downloads_cache_dir / uri.split("/")[-1] +def get_download_cache_loc(downloads_cache_dir, url2cachepath): + def _get(url): + return url2cachepath(downloads_cache_dir, url) return _get @@ -94,11 +104,9 @@ def refs_cache_dir(tmp_path): @pytest.fixture -def get_ref_cache_loc(refs_cache_dir): - from check_jsonschema.schema_loader.resolver import ref_url_to_cache_filename - - def _get(uri): - return refs_cache_dir / ref_url_to_cache_filename(uri) +def get_ref_cache_loc(refs_cache_dir, url2cachepath): + def _get(url): + return url2cachepath(refs_cache_dir, url) return _get diff --git a/tests/unit/test_cachedownloader.py b/tests/unit/test_cachedownloader.py index 99382e921..1a0949aed 100644 --- 
+++ b/tests/unit/test_cachedownloader.py
@@ -11,13 +11,16 @@
     CacheDownloader,
     FailedDownloadError,
     _cache_hit,
+    url_to_cache_filename,
 )
 
+DEFAULT_RESPONSE_URL = "https://example.com/schema1.json"
+
 
 def add_default_response():
     responses.add(
         "GET",
-        "https://example.com/schema1.json",
+        DEFAULT_RESPONSE_URL,
         headers={"Last-Modified": "Sun, 01 Jan 2000 00:00:01 GMT"},
         json={},
         match_querystring=None,
@@ -30,8 +33,8 @@ def default_response():
 
 
 def test_default_filename_from_uri(default_response):
-    cd = CacheDownloader("downloads").bind("https://example.com/schema1.json")
-    assert cd._filename == "schema1.json"
+    cd = CacheDownloader("downloads").bind(DEFAULT_RESPONSE_URL)
+    assert cd._filename == url_to_cache_filename(DEFAULT_RESPONSE_URL)
 
 
 @pytest.mark.parametrize(
@@ -94,7 +97,7 @@ def test_cache_hit_by_mtime(monkeypatch, default_response):
     monkeypatch.setattr(os.path, "getmtime", lambda x: time.time())
     assert _cache_hit(
         "/tmp/schema1.json",
-        requests.get("https://example.com/schema1.json", stream=True),
+        requests.get(DEFAULT_RESPONSE_URL, stream=True),
     )
 
     # local mtime = 0, cache miss
@@ -102,7 +105,7 @@ def test_cache_hit_by_mtime(monkeypatch, default_response):
     assert (
         _cache_hit(
             "/tmp/schema1.json",
-            requests.get("https://example.com/schema1.json", stream=True),
+            requests.get(DEFAULT_RESPONSE_URL, stream=True),
         )
         is False
     )
@@ -114,7 +117,7 @@ def test_cachedownloader_cached_file(tmp_path, monkeypatch, default_response):
     f.write_text("{}")
 
     # set the cache_dir to the tmp dir (so that cache_dir will always be set)
-    cd = CacheDownloader(tmp_path).bind(str(f))
+    cd = CacheDownloader(tmp_path).bind(str(f), filename="foo.json")
     # patch the downloader to skip any download "work"
     monkeypatch.setattr(
         cd._downloader, "_download", lambda file_uri, filename, response_ok: str(f)
@@ -125,11 +128,12 @@ def test_cachedownloader_cached_file(tmp_path, monkeypatch, default_response):
 
 
 @pytest.mark.parametrize("disable_cache", (True, False))
-def test_cachedownloader_on_success(get_download_cache_loc, disable_cache):
-    add_default_response()
-    f = get_download_cache_loc("schema1.json")
+def test_cachedownloader_on_success(
+    get_download_cache_loc, disable_cache, default_response
+):
+    f = get_download_cache_loc(DEFAULT_RESPONSE_URL)
     cd = CacheDownloader("downloads", disable_cache=disable_cache).bind(
-        "https://example.com/schema1.json"
+        DEFAULT_RESPONSE_URL
     )
 
     with cd.open() as fp:
@@ -140,10 +144,12 @@ def test_cachedownloader_on_success(get_download_cache_loc, disable_cache):
     assert f.exists()
 
 
-def test_cachedownloader_using_alternate_target_dir(cache_dir):
-    add_default_response()
-    f = cache_dir / "check_jsonschema" / "otherdir" / "schema1.json"
-    cd = CacheDownloader("otherdir").bind("https://example.com/schema1.json")
+def test_cachedownloader_using_alternate_target_dir(
+    cache_dir, default_response, url2cachepath
+):
+    cache_dir = cache_dir / "check_jsonschema" / "otherdir"
+    f = url2cachepath(cache_dir, DEFAULT_RESPONSE_URL)
+    cd = CacheDownloader("otherdir").bind(DEFAULT_RESPONSE_URL)
     with cd.open() as fp:
         assert fp.read() == b"{}"
     assert f.exists()
@@ -158,21 +164,21 @@ def test_cachedownloader_succeeds_after_few_errors(
         for _i in range(failures):
             responses.add(
                 "GET",
-                "https://example.com/schema1.json",
+                DEFAULT_RESPONSE_URL,
                 status=500,
                 match_querystring=None,
             )
     else:
         responses.add(
             "GET",
-            "https://example.com/schema1.json",
+            DEFAULT_RESPONSE_URL,
             body=failures(),
             match_querystring=None,
         )
     add_default_response()
-    f = get_download_cache_loc("schema1.json")
get_download_cache_loc("schema1.json") + f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader("downloads", disable_cache=disable_cache).bind( - "https://example.com/schema1.json" + DEFAULT_RESPONSE_URL ) with cd.open() as fp: @@ -192,21 +198,21 @@ def test_cachedownloader_fails_after_many_errors( if connection_error: responses.add( "GET", - "https://example.com/schema1.json", + DEFAULT_RESPONSE_URL, body=requests.ConnectionError(), match_querystring=None, ) else: responses.add( "GET", - "https://example.com/schema1.json", + DEFAULT_RESPONSE_URL, status=500, match_querystring=None, ) add_default_response() # never reached, the 11th response - f = get_download_cache_loc("schema1.json") + f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader("downloads", disable_cache=disable_cache).bind( - "https://example.com/schema1.json" + DEFAULT_RESPONSE_URL ) with pytest.raises(FailedDownloadError): with cd.open(): @@ -218,19 +224,18 @@ def test_cachedownloader_fails_after_many_errors( def test_cachedownloader_retries_on_bad_data(get_download_cache_loc, disable_cache): responses.add( "GET", - "https://example.com/schema1.json", + DEFAULT_RESPONSE_URL, status=200, body="{", match_querystring=None, ) add_default_response() - f = get_download_cache_loc("schema1.json") + f = get_download_cache_loc(DEFAULT_RESPONSE_URL) cd = CacheDownloader( "downloads", disable_cache=disable_cache, ).bind( - "https://example.com/schema1.json", - filename=str(f), + DEFAULT_RESPONSE_URL, validation_callback=json.loads, ) @@ -254,13 +259,14 @@ def test_cachedownloader_handles_bad_lastmod_header( file_exists, failure_mode, ): - uri = "https://example.com/schema1.json" if failure_mode == "header_missing": - responses.add("GET", uri, headers={}, json={}, match_querystring=None) + responses.add( + "GET", DEFAULT_RESPONSE_URL, headers={}, json={}, match_querystring=None + ) elif failure_mode == "header_malformed": responses.add( "GET", - uri, + DEFAULT_RESPONSE_URL, headers={"Last-Modified": "Jan 2000 00:00:01"}, json={}, match_querystring=None, @@ -276,13 +282,13 @@ def fake_mktime(*args): raise NotImplementedError original_file_contents = b'{"foo": "bar"}' - file_path = get_download_cache_loc(uri) + file_path = get_download_cache_loc(DEFAULT_RESPONSE_URL) assert not file_path.exists() if file_exists: - inject_cached_download(uri, original_file_contents) + inject_cached_download(DEFAULT_RESPONSE_URL, original_file_contents) - cd = CacheDownloader("downloads").bind(uri) + cd = CacheDownloader("downloads").bind(DEFAULT_RESPONSE_URL) # if the file already existed, it will not be overwritten by the cachedownloader # so the returned value for both the downloader and a direct file read should be the @@ -303,7 +309,7 @@ def fake_mktime(*args): def test_cachedownloader_validation_is_not_invoked_on_hit( - monkeypatch, inject_cached_download + monkeypatch, default_response, inject_cached_download ): """ Regression test for https://github.com/python-jsonschema/check-jsonschema/issues/453 @@ -311,12 +317,10 @@ def test_cachedownloader_validation_is_not_invoked_on_hit( This was a bug in which the validation callback was invoked eagerly, even on a cache hit. As a result, cache hits did not demonstrate their expected performance gain. 
""" - uri = "https://example.com/schema1.json" - # 1: construct some perfectly good data (it doesn't really matter what it is) - add_default_response() + # <> # 2: put equivalent data on disk - inject_cached_download(uri, "{}") + inject_cached_download(DEFAULT_RESPONSE_URL, "{}") # 3: construct a validator which marks that it ran in a variable validator_ran = False @@ -328,7 +332,7 @@ def dummy_validate_bytes(data): # construct a downloader pointed at the schema and file, expecting a cache hit # and use the above validation method cd = CacheDownloader("downloads").bind( - "https://example.com/schema1.json", + DEFAULT_RESPONSE_URL, validation_callback=dummy_validate_bytes, )