Skip to content

Commit

Permalink
Merge pull request #503 from python-jsonschema/fix-caching
Browse files Browse the repository at this point in the history
Fix caching behavior to always use hashed URLs for the cache filenames
  • Loading branch information
sirosen authored Nov 29, 2024
2 parents 85aa1cc + 6e94302 commit c52714b
Show file tree
Hide file tree
Showing 8 changed files with 101 additions and 104 deletions.
7 changes: 4 additions & 3 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,10 @@ Downloading and Caching
By default, when ``--schemafile`` is used to refer to an ``http://`` or
``https://`` location, the schema is downloaded and cached based on the
schema's Last-Modified time.

Additionally, when ``$ref``\s are looked up during schema resolution, they are
similarly cached.

The following options control caching behaviors.

.. list-table:: Caching Options
Expand All @@ -128,9 +132,6 @@ The following options control caching behaviors.
- Description
* - ``--no-cache``
- Disable caching.
* - ``--cache-filename``
- The name to use for caching a remote schema.
Defaults to using the last slash-delimited part of the URI.

"format" Validation Options
---------------------------
Expand Down
37 changes: 26 additions & 11 deletions src/check_jsonschema/cachedownloader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

import contextlib
import hashlib
import io
import os
import platform
Expand Down Expand Up @@ -33,7 +34,7 @@ def _base_cache_dir() -> str | None:
return cache_dir


def _resolve_cache_dir(dirname: str = "downloads") -> str | None:
def _resolve_cache_dir(dirname: str) -> str | None:
cache_dir = _base_cache_dir()
if cache_dir:
cache_dir = os.path.join(cache_dir, "check_jsonschema", dirname)
Expand Down Expand Up @@ -95,18 +96,32 @@ def _cache_hit(cachefile: str, response: requests.Response) -> bool:
return local_mtime >= remote_mtime


def url_to_cache_filename(ref_url: str) -> str:
    """
    Given a schema URL, convert it to a filename for caching in a cache dir.

    Rules are as follows:

    - the base filename is a SHA-256 hash of the URL
    - if the URL's last path segment has an extension (.json, .yaml, etc),
      that extension is appended to the hash

    Preserving file extensions preserves the extension-based logic used for
    parsing, and it also helps a local editor (browsing the cache) identify
    filetypes.

    :param ref_url: the URL of the schema or ref being cached
    :returns: the cache filename, e.g. ``"<sha256-hex>.json"``
    """
    filename = hashlib.sha256(ref_url.encode()).hexdigest()
    # append the original extension (text after the final ".") so that the
    # hash-named cache file is still recognizable by filetype
    if "." in (last_part := ref_url.rpartition("/")[-1]):
        _, _, extension = last_part.rpartition(".")
        filename = f"{filename}.{extension}"
    return filename


class FailedDownloadError(Exception):
    """Error raised when downloading remote data does not succeed."""


class CacheDownloader:
def __init__(self, cache_dir: str, *, disable_cache: bool = False) -> None:
    """
    A downloader which caches remote content under a named cache subdirectory.

    :param cache_dir: name of the subdirectory (under the app cache dir) in
        which downloads are stored, e.g. ``"schemas"`` or ``"refs"``
    :param disable_cache: when true, disable use of the cache
    """
    # NOTE: the page-dump of this hunk interleaved the removed and added
    # __init__ definitions; this is the post-merge (added) version.
    # _resolve_cache_dir may return None when no base cache dir is available
    self._cache_dir = _resolve_cache_dir(cache_dir)
    self._disable_cache = disable_cache

def _download(
Expand Down Expand Up @@ -160,21 +175,21 @@ def bind(
validation_callback: t.Callable[[bytes], t.Any] | None = None,
) -> BoundCacheDownloader:
return BoundCacheDownloader(
file_url, filename, self, validation_callback=validation_callback
file_url, self, filename=filename, validation_callback=validation_callback
)


class BoundCacheDownloader:
def __init__(
    self,
    file_url: str,
    downloader: CacheDownloader,
    *,
    filename: str | None = None,
    validation_callback: t.Callable[[bytes], t.Any] | None = None,
) -> None:
    """
    Bind a CacheDownloader to a specific URL.

    :param file_url: the URL to download
    :param downloader: the CacheDownloader providing cache dir and policy
    :param filename: explicit cache filename; when omitted, a hashed name is
        derived from the URL
    :param validation_callback: optional callback invoked on downloaded bytes
    """
    # NOTE: the page-dump of this hunk interleaved removed and added lines;
    # this is the post-merge (added) version.
    self._file_url = file_url
    # default the cache filename to the hashed-URL form
    self._filename = filename or url_to_cache_filename(file_url)
    self._downloader = downloader
    self._validation_callback = validation_callback

Expand Down
9 changes: 1 addition & 8 deletions src/check_jsonschema/cli/main_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,7 @@ def pretty_helptext_list(values: list[str] | tuple[str, ...]) -> str:
help="Disable schema caching. Always download remote schemas.",
)
@click.option(
"--cache-filename",
help=(
"The name to use for caching a remote schema. "
"Defaults to the last slash-delimited part of the URI."
),
"--cache-filename", help="Deprecated. This option no longer has any effect."
)
@click.option(
"--disable-formats",
Expand Down Expand Up @@ -271,8 +267,6 @@ def main(
args.disable_cache = no_cache
args.default_filetype = default_filetype
args.fill_defaults = fill_defaults
if cache_filename is not None:
args.cache_filename = cache_filename
if data_transform is not None:
args.data_transform = TRANSFORM_LIBRARY[data_transform]

Expand Down Expand Up @@ -300,7 +294,6 @@ def build_schema_loader(args: ParseResult) -> SchemaLoaderBase:
assert args.schema_path is not None
return SchemaLoader(
args.schema_path,
cache_filename=args.cache_filename,
disable_cache=args.disable_cache,
base_uri=args.base_uri,
validator_class=args.validator_class,
Expand Down
8 changes: 1 addition & 7 deletions src/check_jsonschema/schema_loader/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,12 @@ def __init__(
self,
schemafile: str,
*,
cache_filename: str | None = None,
base_uri: str | None = None,
validator_class: type[jsonschema.protocols.Validator] | None = None,
disable_cache: bool = True,
) -> None:
# record input parameters (these are not to be modified)
self.schemafile = schemafile
self.cache_filename = cache_filename
self.disable_cache = disable_cache
self.base_uri = base_uri
self.validator_class = validator_class
Expand Down Expand Up @@ -105,11 +103,7 @@ def _get_schema_reader(
return LocalSchemaReader(self.schemafile)

if self.url_info.scheme in ("http", "https"):
return HttpSchemaReader(
self.schemafile,
self.cache_filename,
self.disable_cache,
)
return HttpSchemaReader(self.schemafile, self.disable_cache)
else:
raise UnsupportedUrlScheme(
"check-jsonschema only supports http, https, and local files. "
Expand Down
7 changes: 3 additions & 4 deletions src/check_jsonschema/schema_loader/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,14 +73,13 @@ class HttpSchemaReader:
def __init__(
    self,
    url: str,
    disable_cache: bool,
) -> None:
    """
    A reader which fetches a schema over http(s), with download caching.

    :param url: the http(s) URL of the schema
    :param disable_cache: when true, disable download caching
    """
    # NOTE: the page-dump of this hunk interleaved removed and added lines;
    # this is the post-merge (added) version.
    self.url = url
    self.parsers = ParserSet()
    # cache under the "schemas" subdirectory; the cache filename is derived
    # from the URL by the downloader
    self.downloader = CacheDownloader("schemas", disable_cache=disable_cache).bind(
        url, validation_callback=self._parse
    )
    self._parsed_schema: dict | _UnsetType = _UNSET

def _parse(self, schema_bytes: bytes) -> t.Any:
Expand Down
22 changes: 2 additions & 20 deletions src/check_jsonschema/schema_loader/resolver.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import hashlib
import typing as t
import urllib.parse

Expand All @@ -12,21 +11,6 @@
from ..utils import filename2path


def ref_url_to_cache_filename(ref_url: str) -> str:
    """
    Given a $ref URL, convert it to the filename in the refs/ cache dir.

    Rules are as follows:

    - the base filename is an md5 hash of the URL
    - if the filename ends in an extension (.json, .yaml, etc) that extension
      is appended to the hash

    :param ref_url: the URL of the $ref being cached
    :returns: the cache filename, e.g. ``"<md5-hex>.json"``
    """
    filename = hashlib.md5(ref_url.encode()).hexdigest()
    # append the original extension so the cache file is recognizable by type
    if "." in (last_part := ref_url.rpartition("/")[-1]):
        _, _, extension = last_part.rpartition(".")
        filename = f"{filename}.{extension}"
    return filename


def make_reference_registry(
parsers: ParserSet, retrieval_uri: str | None, schema: dict, disable_cache: bool
) -> referencing.Registry:
Expand Down Expand Up @@ -66,7 +50,7 @@ def create_retrieve_callable(
base_uri = retrieval_uri

cache = ResourceCache()
downloader = CacheDownloader("refs", disable_cache)
downloader = CacheDownloader("refs", disable_cache=disable_cache)

def get_local_file(uri: str) -> t.Any:
path = filename2path(uri)
Expand All @@ -89,9 +73,7 @@ def validation_callback(content: bytes) -> None:
parser_set.parse_data_with_path(content, full_uri, "json")

bound_downloader = downloader.bind(
full_uri,
ref_url_to_cache_filename(full_uri),
validation_callback,
full_uri, validation_callback=validation_callback
)
with bound_downloader.open() as fp:
data = fp.read()
Expand Down
24 changes: 16 additions & 8 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,25 @@ def patch_cache_dir(monkeypatch, cache_dir):
yield m


@pytest.fixture
def url2cachepath():
    """Fixture providing a helper which maps (cache_dir, url) to the cached file path."""
    from check_jsonschema.cachedownloader import url_to_cache_filename

    def _lookup(cache_dir, url):
        return cache_dir / url_to_cache_filename(url)

    return _lookup


@pytest.fixture
def downloads_cache_dir(tmp_path):
    """Fixture for the path of the 'downloads' cache dir under tmp_path."""
    return tmp_path.joinpath(".cache", "check_jsonschema", "downloads")


@pytest.fixture
def get_download_cache_loc(downloads_cache_dir, url2cachepath):
    """Fixture providing a helper which maps a URL to its downloads-cache path."""
    # NOTE: the page-dump of this hunk interleaved the removed and added
    # fixture definitions; this is the post-merge (added) version.

    def _get(url):
        return url2cachepath(downloads_cache_dir, url)

    return _get

Expand All @@ -94,11 +104,9 @@ def refs_cache_dir(tmp_path):


@pytest.fixture
def get_ref_cache_loc(refs_cache_dir, url2cachepath):
    """Fixture providing a helper which maps a $ref URL to its refs-cache path."""
    # NOTE: the page-dump of this hunk interleaved the removed and added
    # fixture definitions; this is the post-merge (added) version.

    def _get(url):
        return url2cachepath(refs_cache_dir, url)

    return _get

Expand Down
Loading

0 comments on commit c52714b

Please sign in to comment.