Skip to content

Commit

Permalink
Merge pull request #503 from python-jsonschema/fix-caching
Browse files Browse the repository at this point in the history
Fix caching behavior to always use hashed URLs for the cache filenames
  • Loading branch information
sirosen authored Nov 29, 2024
2 parents 85aa1cc + 6e94302 commit c52714b
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 104 deletions.
7 changes: 4 additions & 3 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@ Downloading and Caching
By default, when ``--schemafile`` is used to refer to an ``http://`` or
``https://`` location, the schema is downloaded and cached based on the
schema's Last-Modified time.

Additionally, when ``$ref``\s are looked up during schema resolution, they are
similarly cached.

The following options control caching behaviors.

.. list-table:: Caching Options
Expand All @@ -128,9 +132,6 @@ The following options control caching behaviors.
- Description
* - ``--no-cache``
- Disable caching.
* - ``--cache-filename``
- The name to use for caching a remote schema.
Defaults to using the last slash-delimited part of the URI.

"format" Validation Options
---------------------------
Expand Down
37 changes: 26 additions & 11 deletions src/check_jsonschema/cachedownloader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import contextlib
import hashlib
import io
import os
import platform
Expand Down Expand Up @@ -33,7 +34,7 @@ def _base_cache_dir() -> str | None:
return cache_dir


def _resolve_cache_dir(dirname: str = "downloads") -> str | None:
def _resolve_cache_dir(dirname: str) -> str | None:
cache_dir = _base_cache_dir()
if cache_dir:
cache_dir = os.path.join(cache_dir, "check_jsonschema", dirname)
Expand Down Expand Up @@ -95,18 +96,32 @@ def _cache_hit(cachefile: str, response: requests.Response) -> bool:
return local_mtime >= remote_mtime


def url_to_cache_filename(ref_url: str) -> str:
    """
    Given a schema URL, convert it to a filename for caching in a cache dir.

    Rules are as follows:

    - the base filename is a SHA-256 hash of the URL
    - if the URL's last path segment has an extension (.json, .yaml, etc),
      that extension is appended to the hash

    Preserving file extensions preserves the extension-based logic used for
    parsing, and it also helps a local editor (browsing the cache) identify
    filetypes.

    :param ref_url: the URL of the schema or ref being cached
    :returns: the cache filename, e.g. ``"<sha256-hex>.json"``
    """
    filename = hashlib.sha256(ref_url.encode()).hexdigest()
    # append the original extension (text after the final ".") so that the
    # hash-named cache file is still recognizable by filetype
    if "." in (last_part := ref_url.rpartition("/")[-1]):
        _, _, extension = last_part.rpartition(".")
        filename = f"{filename}.{extension}"
    return filename


class FailedDownloadError(Exception):
    """Error raised when downloading remote data does not succeed."""


class CacheDownloader:
def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None:
    """
    A downloader which caches remote content under a named cache subdirectory.

    :param cache_dir: name of the subdirectory (under the app cache dir) in
        which downloads are stored, e.g. ``"schemas"`` or ``"refs"``
    :param disable_cache: when true, disable use of the cache
    """
    # NOTE: the page-dump of this hunk interleaved the removed and added
    # __init__ definitions; this is the post-merge (added) version.
    # _resolve_cache_dir may return None when no base cache dir is available
    self._cache_dir = _resolve_cache_dir(cache_dir)
    self._disable_cache = disable_cache

def _download(
Expand Down Expand Up @@ -160,21 +175,21 @@ def bind(
validation_callback: t.Callable[[bytes], t.Any] | None = None,
) -> BoundCacheDownloader:
return BoundCacheDownloader(
file_url, filename, self, validation_callback=validation_callback
file_url, self, filename=filename, validation_callback=validation_callback
)


class BoundCacheDownloader:
def __init__(
    self,
    file_url: str,
    downloader: CacheDownloader,
    *,
    filename: str | None = None,
    validation_callback: t.Callable[[bytes], t.Any] | None = None,
) -> None:
    """
    Bind a CacheDownloader to a specific URL.

    :param file_url: the URL to download
    :param downloader: the CacheDownloader providing cache dir and policy
    :param filename: explicit cache filename; when omitted, a hashed name is
        derived from the URL
    :param validation_callback: optional callback invoked on downloaded bytes
    """
    # NOTE: the page-dump of this hunk interleaved removed and added lines;
    # this is the post-merge (added) version.
    self._file_url = file_url
    # default the cache filename to the hashed-URL form
    self._filename = filename or url_to_cache_filename(file_url)
    self._downloader = downloader
    self._validation_callback = validation_callback

Expand Down
9 changes: 1 addition & 8 deletions src/check_jsonschema/cli/main_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,7 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
help="Disable schema caching. Always download remote schemas.",
)
@click.option(
"--cache-filename",
help=(
"The name to use for caching a remote schema. "
"Defaults to the last slash-delimited part of the URI."
),
"--cache-filename", help="Deprecated. This option no longer has any effect."
)
@click.option(
"--disable-formats",
Expand Down Expand Up @@ -271,8 +267,6 @@ def main(
args.disable_cache = no_cache
args.default_filetype = default_filetype
args.fill_defaults = fill_defaults
if cache_filename is not None:
args.cache_filename = cache_filename
if data_transform is not None:
args.data_transform = TRANSFORM_LIBRARY[data_transform]

Expand Down Expand Up @@ -300,7 +294,6 @@ def build_schema_loader(args: ParseResult) -> SchemaLoaderBase:
assert args.schema_path is not None
return SchemaLoader(
args.schema_path,
cache_filename=args.cache_filename,
disable_cache=args.disable_cache,
base_uri=args.base_uri,
validator_class=args.validator_class,
Expand Down
8 changes: 1 addition & 7 deletions src/check_jsonschema/schema_loader/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,12 @@ def __init__(
self,
schemafile: str,
*,
cache_filename: str | None = None,
base_uri: str | None = None,
validator_class: type[jsonschema.protocols.Validator] | None = None,
disable_cache: bool = True,
) -> None:
# record input parameters (these are not to be modified)
self.schemafile = schemafile
self.cache_filename = cache_filename
self.disable_cache = disable_cache
self.base_uri = base_uri
self.validator_class = validator_class
Expand Down Expand Up @@ -105,11 +103,7 @@ def _get_schema_reader(
return LocalSchemaReader(self.schemafile)

if self.url_info.scheme in ("http", "https"):
return HttpSchemaReader(
self.schemafile,
self.cache_filename,
self.disable_cache,
)
return HttpSchemaReader(self.schemafile, self.disable_cache)
else:
raise UnsupportedUrlScheme(
"check-jsonschema only supports http, https, and local files. "
Expand Down
7 changes: 3 additions & 4 deletions src/check_jsonschema/schema_loader/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,13 @@ class HttpSchemaReader:
def __init__(
    self,
    url: str,
    disable_cache: bool,
) -> None:
    """
    A reader which fetches a schema over http(s), with download caching.

    :param url: the http(s) URL of the schema
    :param disable_cache: when true, disable download caching
    """
    # NOTE: the page-dump of this hunk interleaved removed and added lines;
    # this is the post-merge (added) version.
    self.url = url
    self.parsers = ParserSet()
    # cache under the "schemas" subdirectory; the cache filename is derived
    # from the URL by the downloader
    self.downloader = CacheDownloader("schemas", disable_cache=disable_cache).bind(
        url, validation_callback=self._parse
    )
    self._parsed_schema: dict | _UnsetType = _UNSET

def _parse(self, schema_bytes: bytes) -> t.Any:
Expand Down
22 changes: 2 additions & 20 deletions src/check_jsonschema/schema_loader/resolver.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import hashlib
import typing as t
import urllib.parse

Expand All @@ -12,21 +11,6 @@
from ..utils import filename2path


def ref_url_to_cache_filename(ref_url: str) -> str:
    """
    Given a $ref URL, convert it to the filename in the refs/ cache dir.

    Rules are as follows:

    - the base filename is an md5 hash of the URL
    - if the filename ends in an extension (.json, .yaml, etc) that extension
      is appended to the hash

    :param ref_url: the URL of the $ref being cached
    :returns: the cache filename, e.g. ``"<md5-hex>.json"``
    """
    filename = hashlib.md5(ref_url.encode()).hexdigest()
    # append the original extension so the cache file is recognizable by type
    if "." in (last_part := ref_url.rpartition("/")[-1]):
        _, _, extension = last_part.rpartition(".")
        filename = f"{filename}.{extension}"
    return filename


def make_reference_registry(
parsers: ParserSet, retrieval_uri: str | None, schema: dict, disable_cache: bool
) -> referencing.Registry:
Expand Down Expand Up @@ -66,7 +50,7 @@ def create_retrieve_callable(
base_uri = retrieval_uri

cache = ResourceCache()
downloader = CacheDownloader("refs", disable_cache)
downloader = CacheDownloader("refs", disable_cache=disable_cache)

def get_local_file(uri: str) -> t.Any:
path = filename2path(uri)
Expand All @@ -89,9 +73,7 @@ def validation_callback(content: bytes) -> None:
parser_set.parse_data_with_path(content, full_uri, "json")

bound_downloader = downloader.bind(
full_uri,
ref_url_to_cache_filename(full_uri),
validation_callback,
full_uri, validation_callback=validation_callback
)
with bound_downloader.open() as fp:
data = fp.read()
Expand Down
24 changes: 16 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,25 @@ def patch_cache_dir(monkeypatch, cache_dir):
yield m


@pytest.fixture
def url2cachepath():
    """Fixture providing a helper which maps (cache_dir, url) to the cached file path."""
    from check_jsonschema.cachedownloader import url_to_cache_filename

    def _lookup(cache_dir, url):
        return cache_dir / url_to_cache_filename(url)

    return _lookup


@pytest.fixture
def downloads_cache_dir(tmp_path):
    """Fixture for the path of the 'downloads' cache dir under tmp_path."""
    return tmp_path.joinpath(".cache", "check_jsonschema", "downloads")


@pytest.fixture
def get_download_cache_loc(downloads_cache_dir, url2cachepath):
    """Fixture providing a helper which maps a URL to its downloads-cache path."""
    # NOTE: the page-dump of this hunk interleaved the removed and added
    # fixture definitions; this is the post-merge (added) version.

    def _get(url):
        return url2cachepath(downloads_cache_dir, url)

    return _get

Expand All @@ -94,11 +104,9 @@ def refs_cache_dir(tmp_path):


@pytest.fixture
def get_ref_cache_loc(refs_cache_dir, url2cachepath):
    """Fixture providing a helper which maps a $ref URL to its refs-cache path."""
    # NOTE: the page-dump of this hunk interleaved the removed and added
    # fixture definitions; this is the post-merge (added) version.

    def _get(url):
        return url2cachepath(refs_cache_dir, url)

    return _get

Expand Down
Loading

0 comments on commit c52714b

Please sign in to comment.