diff --git a/.gitignore b/.gitignore index 81db975..4b2a4ba 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ share/python-wheels/ .installed.cfg *.egg MANIFEST +.vscode/ # PyInstaller # Usually these files are written by a python script from a template diff --git a/.python-version b/.python-version new file mode 100644 index 0000000..ac957df --- /dev/null +++ b/.python-version @@ -0,0 +1 @@ +3.10.6 diff --git a/Dockerfile b/Dockerfile index 9013a1c..fe767c9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,12 +1,15 @@ FROM python:3.12.4-slim + +RUN apt-get update -qq && apt-get install -y curl && rm -rf /var/lib/apt/lists/* + # Allow statements and log messages to immediately appear in the Knative logs -ENV PYTHONUNBUFFERED True -ENV APP_HOME /app -ENV PORT 8080 -ENV HOST 0.0.0.0 -ENV STORAGE_BASE / -ENV STORAGE_DIR storage +ENV PYTHONUNBUFFERED=True +ENV APP_HOME=/app +ENV PORT=8080 +ENV HOST=0.0.0.0 +ENV STORAGE_BASE=/ +ENV STORAGE_DIR=storage # Python app installation WORKDIR $APP_HOME diff --git a/build.sh b/build.sh new file mode 100644 index 0000000..e3eb081 --- /dev/null +++ b/build.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +# Define image details +IMAGE_NAME="gcp-storage-emulator" +IMAGE_TAG="latest" +DOCKERFILE_PATH="." +DOCKERHUB_USERNAME="jamesmtc" + +# Build the image for linux, x86, and arm64 +docker buildx build --platform linux/amd64,linux/arm64 --tag $DOCKERHUB_USERNAME/$IMAGE_NAME:$IMAGE_TAG --push $DOCKERFILE_PATH diff --git a/globby.py b/globby.py new file mode 100644 index 0000000..13e7ae5 --- /dev/null +++ b/globby.py @@ -0,0 +1,42 @@ +from wcmatch import glob + +def glob_with_array(file_paths, pattern): + matched_files = [ + path for path in file_paths if + glob.globmatch(path, pattern, flags=glob.GLOBSTAR | glob.BRACE) + or + glob.globmatch(path, pattern.replace('**', '*/*'), flags=glob.GLOBSTAR | glob.BRACE | glob.EXTGLOB) + ] + return matched_files + +def test_match_glob(): + # File paths with distinct patterns + + matcher = glob.globmatch("foo/bar", "foo*bar", flags=glob.GLOBSTAR | glob.BRACE | glob.EXTGLOB) + print(f"{matcher}") + + blob_names = ["foo/bar", "foo/baz", "foo/foobar", "foobar"] + + match_glob_results = { + "foo*bar": ["foobar"], + "foo**bar": ["foo/bar", "foo/foobar", "foobar"], + "**/foobar": ["foo/foobar", "foobar"], + "*/ba[rz]": ["foo/bar", "foo/baz"], + "*/ba[!a-y]": ["foo/baz"], + "**/{foobar,baz}": ["foo/baz", "foo/foobar", "foobar"], + "foo/{foo*,*baz}": ["foo/baz", "foo/foobar"], + } + + # Iterate through the match glob patterns and expected results + for match_glob, expected_names in match_glob_results.items(): + glob_results = glob_with_array(blob_names, match_glob) + print(f"\n") + if glob_results == expected_names: + print(f"Matched: {match_glob}") + else: + print(f"Not matched: {match_glob}") + print(f"Expected: {expected_names}") + print(f"Actual: {glob_results}") + print(f"\n") + +test_match_glob() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d117391..f3c0e2b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,9 @@ requires = [ ] [tool.pytest.ini_options] +pythonpath = [ + "." +] addopts = "--cov=src --cov-report=xml --cov-branch" testpaths = [ "tests" diff --git a/setup.py b/setup.py index 46bb18c..fc488a0 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,7 @@ install_requires=[ "fs", "google-crc32c", + "wcmatch" ], python_requires=">=3.7", ) diff --git a/src/gcp_storage_emulator/exceptions.py b/src/gcp_storage_emulator/exceptions.py index 3990e12..36b5b61 100644 --- a/src/gcp_storage_emulator/exceptions.py +++ b/src/gcp_storage_emulator/exceptions.py @@ -4,3 +4,7 @@ class NotFound(Exception): class Conflict(Exception): pass + + +class BadRequest(Exception): + pass diff --git a/src/gcp_storage_emulator/handlers/buckets.py b/src/gcp_storage_emulator/handlers/buckets.py index 99df939..072c91a 100644 --- a/src/gcp_storage_emulator/handlers/buckets.py +++ b/src/gcp_storage_emulator/handlers/buckets.py @@ -3,7 +3,7 @@ from http import HTTPStatus from gcp_storage_emulator import settings -from gcp_storage_emulator.exceptions import Conflict, NotFound +from gcp_storage_emulator.exceptions import Conflict, NotFound, BadRequest logger = logging.getLogger("api.bucket") @@ -110,5 +110,7 @@ def delete(request, response, storage, *args, **kwargs): storage.delete_bucket(name) except NotFound: response.status = HTTPStatus.NOT_FOUND + except BadRequest: + response.status = HTTPStatus.BAD_REQUEST except Conflict: response.status = HTTPStatus.CONFLICT diff --git a/src/gcp_storage_emulator/handlers/objects.py b/src/gcp_storage_emulator/handlers/objects.py index bbd523a..0cb7a00 100644 --- a/src/gcp_storage_emulator/handlers/objects.py +++ b/src/gcp_storage_emulator/handlers/objects.py @@ -15,7 +15,7 @@ import google_crc32c -from gcp_storage_emulator.exceptions import Conflict, NotFound +from gcp_storage_emulator.exceptions import Conflict, NotFound, BadRequest logger = logging.getLogger("api.object") @@ -354,13 +354,16 @@ def get(request, response, storage, *args, **kwargs): def ls(request, response, storage, *args, **kwargs): bucket_name = request.params["bucket_name"] prefix = request.query.get("prefix")[0] if request.query.get("prefix") else None + matchGlob = request.query.get("matchGlob")[0] if request.query.get("matchGlob") else None delimiter = ( request.query.get("delimiter")[0] if request.query.get("delimiter") else None ) try: - files, prefixes = storage.get_file_list(bucket_name, prefix, delimiter) + files, prefixes = storage.get_file_list(bucket_name, prefix, delimiter, matchGlob) except NotFound: response.status = HTTPStatus.NOT_FOUND + except BadRequest: + response.status = HTTPStatus.BAD_REQUEST else: response.json({"kind": "storage#object", "prefixes": prefixes, "items": files}) diff --git a/src/gcp_storage_emulator/server.py b/src/gcp_storage_emulator/server.py index 428ad24..3660263 100644 --- a/src/gcp_storage_emulator/server.py +++ b/src/gcp_storage_emulator/server.py @@ -207,9 +207,12 @@ def __init__(self, request_handler, method): self._path = request_handler.path self._request_handler = request_handler self._server_address = request_handler.server.server_address - self._base_url = "http://{}:{}".format( - self._server_address[0], self._server_address[1] - ) + if request_handler.hostname: + self._base_url = "http://{}".format(request_handler.hostname) + else: + self._base_url = "http://{}:{}".format( + self._request_handler.server_address[0], self._request_handler.server_address[1] + ) self._full_url = self._base_url + self._path self._parsed_url = urlparse(self._full_url) self._query = parse_qs(self._parsed_url.query) @@ -351,22 +354,27 @@ def __init__(self, storage, *args, **kwargs): super().__init__(*args, **kwargs) def do_GET(self): + self.hostname = self.headers.get('Host') router = Router(self) router.handle(GET) def do_POST(self): + self.hostname = self.headers.get('Host') router = Router(self) router.handle(POST) def do_DELETE(self): + self.hostname = self.headers.get('Host') router = Router(self) router.handle(DELETE) def do_PUT(self): + self.hostname = self.headers.get('Host') router = Router(self) router.handle(PUT) def do_PATCH(self): + self.hostname = self.headers.get('Host') router = Router(self) router.handle(PATCH) diff --git a/src/gcp_storage_emulator/storage.py b/src/gcp_storage_emulator/storage.py index ae11bd9..d673797 100644 --- a/src/gcp_storage_emulator/storage.py +++ b/src/gcp_storage_emulator/storage.py @@ -3,11 +3,12 @@ import logging import os from hashlib import sha256 - +from wcmatch import glob +import re import fs from fs.errors import FileExpected, ResourceNotFound -from gcp_storage_emulator.exceptions import Conflict, NotFound +from gcp_storage_emulator.exceptions import Conflict, NotFound, BadRequest from gcp_storage_emulator.settings import STORAGE_BASE, STORAGE_DIR # Real buckets can't start with an underscore @@ -92,7 +93,7 @@ def get_bucket(self, bucket_name): return self.buckets.get(bucket_name) - def get_file_list(self, bucket_name, prefix=None, delimiter=None): + def get_file_list(self, bucket_name, prefix=None, delimiter=None, match_glob=None): """Lists all the blobs in the bucket that begin with the prefix. This can be used to list all blobs in a "folder", e.g. "public/". @@ -123,29 +124,52 @@ def get_file_list(self, bucket_name, prefix=None, delimiter=None): if bucket_name not in self.buckets: raise NotFound - prefix_len = 0 - prefixes = [] bucket_objects = self.objects.get(bucket_name, {}) - if prefix: - prefix_len = len(prefix) - objs = list( - file_object - for file_name, file_object in bucket_objects.items() - if file_name.startswith(prefix) - and (not delimiter or delimiter not in file_name[prefix_len:]) - ) + objs = [] + prefixes = set() + + + + # If matchGlob is provided, filter objects using the glob pattern + if match_glob: + # Requests that use the matchGlob parameter fail if they also include a delimiter parameter set to a value other than /. + if delimiter: + if delimiter != "/": + raise BadRequest("When listing with a glob pattern, the only supported delimiter is '/'.",) + else: + objs = [ + file_object for file_name, file_object in bucket_objects.items() + if '/' not in file_name + if not re.search(r'/[^*]\*\//gm', match_glob) + if glob.globmatch(file_name.split("/")[-1], match_glob.replace('**/', '*',).replace('**', '*'), flags=glob.GLOBSTAR | glob.BRACE | glob.EXTGLOB) + ] + + else: + objs = [ + file_object for file_name, file_object in bucket_objects.items() + if glob.globmatch(file_name, match_glob, flags=glob.GLOBSTAR | glob.BRACE | glob.EXTGLOB) + or glob.globmatch(file_name, match_glob.replace('**', '*/*'), flags=glob.GLOBSTAR | glob.BRACE | glob.EXTGLOB) + ] + + # If matchGlob is not provided, apply the prefix and delimiter filtering else: - objs = list(bucket_objects.values()) - if delimiter: - prefixes = list( - file_name[:prefix_len] - + file_name[prefix_len:].split(delimiter, 1)[0] - + delimiter - for file_name in list(bucket_objects) - if file_name.startswith(prefix or "") - and delimiter in file_name[prefix_len:] - ) + for file_name, file_object in bucket_objects.items(): + if prefix is None or file_name.startswith(prefix): + prefix_len = len(prefix) if prefix else 0 + if delimiter: + if delimiter in file_name[prefix_len:]: + prefix_end_index = file_name.find(delimiter, prefix_len) + len(delimiter) + prefixes.add(file_name[:prefix_end_index]) + else: + objs.append(file_object) + else: + objs.append(file_object) + + # Convert prefixes set to a sorted list + prefixes = sorted(list(prefixes)) + return objs, prefixes + def create_bucket(self, bucket_name, bucket_obj): """Create a bucket object representation and save it to the current fs diff --git a/tests/test_server.py b/tests/test_server.py index 31afc4d..8be873c 100644 --- a/tests/test_server.py +++ b/tests/test_server.py @@ -3,7 +3,7 @@ from io import BytesIO from tempfile import NamedTemporaryFile from unittest import TestCase as BaseTestCase - +import sys import fs import requests from google.api_core.exceptions import BadRequest, Conflict, NotFound @@ -597,6 +597,71 @@ def test_list_blobs_with_prefix_and_delimiter(self): self._assert_blob_list(blobs, [blob_1, blob_2]) self.assertEqual(blobs.prefixes, {"a/b/"}) + + def test_bucket_list_blobs_w_match_glob(self): + bucket = self._client.create_bucket("bucket_name") + + # File names with distinct patterns + blob_names = ["foo/bar", "foo/baz", "foo/foobar", "foobar"] + for name in blob_names: + blob = bucket.blob(name) + blob.upload_from_string("helloworld") + + match_glob_results = { + "foo*bar": ["foobar"], + "foo**bar": ["foo/bar", "foo/foobar", "foobar"], + "**/foobar": ["foo/foobar", "foobar"], + "*/ba[rz]": ["foo/bar", "foo/baz"], + "*/ba[!a-y]": ["foo/baz"], + "**/{foobar,baz}": ["foo/baz", "foo/foobar", "foobar"], + "foo/{foo*,*baz}": ["foo/baz", "foo/foobar"], + } + + # Iterate through the match glob patterns and expected results + for match_glob, expected_names in match_glob_results.items(): + file_objs = self._client.list_blobs(bucket, match_glob=match_glob) + filtered_names = [obj.name for obj in file_objs if obj] + self.assertEqual(filtered_names, expected_names) + + def test_bucket_list_blobs_w_match_glob_and_delimiter(self): + bucket = self._client.create_bucket("bucket_name") + + # File names with distinct patterns + blob_names = ["all/foo/bar", "foo/baz", "foo/389_bar", "bar", "baz"] + for name in blob_names: + blob = bucket.blob(name) + blob.upload_from_string("helloworld") + + match_glob_results = { + "foo*bar": [], + "foo**bar": [], + "**/bar": ["bar"], + "*/bar": [], + "*/ba[rz]": [], + "**ba[rz]": ["bar","baz"], + "*/ba[!a-y]": [], + "*ba[!a-y]": ["baz"], + "**/{foobar,baz}": ["baz"], + "foo/{foo*,*baz}": [], + "*{foo*,*baz}": ["baz"], + } + + # Iterate through the match glob patterns and expected results + for match_glob, expected_names in match_glob_results.items(): + file_objs = self._client.list_blobs(bucket, match_glob=match_glob, delimiter="/") + filtered_names = [obj.name for obj in file_objs if obj] + self.assertEqual(filtered_names, expected_names) + + + def test_wrong_delimiter_with_matchGlob(self): + bucket = self._client.create_bucket("bucket_name") + + try: + self._client.list_blobs(bucket, delimiter="*", match_glob="*.pdf") + except Exception as ex: + exc_type, exc_obj, exc_tb = sys.exc_info() + self.assertEqual(BadRequest, exc_type) + def test_bucket_copy_existing(self): bucket = self._client.create_bucket("bucket_name") diff --git a/tests/test_storage.py b/tests/test_storage.py index 669b2d3..78b77c9 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -2,7 +2,7 @@ import os from unittest import TestCase as BaseTestCase -from gcp_storage_emulator.exceptions import NotFound +from gcp_storage_emulator.exceptions import NotFound, BadRequest from gcp_storage_emulator.settings import STORAGE_BASE, STORAGE_DIR from gcp_storage_emulator.storage import Storage @@ -120,6 +120,68 @@ def test_file_ids_dont_clash(self): ) self.assertNotEqual(file_id_1, file_id_2) + + def test_get_file_ids_by_matchglob(self): + # File names with distinct patterns + blob_names = ["foo/bar", "foo/baz", "foo/foobar", "foobar"] + content = "Helloworld".encode("utf8") + for name in blob_names: + file_obj = {"bucket": "a_bucket_name", "name": name} + self.storage.create_file("a_bucket_name", name, content, file_obj) + + match_glob_results = { + "foo*bar": ["foobar"], + "foo**bar": ["foo/bar", "foo/foobar", "foobar"], + "**/foobar": ["foo/foobar", "foobar"], + "*/ba[rz]": ["foo/bar", "foo/baz"], + "*/ba[!a-y]": ["foo/baz"], + "**/{foobar,baz}": ["foo/baz", "foo/foobar", "foobar"], + "foo/{foo*,*baz}": ["foo/baz", "foo/foobar"], + } + + # Iterate through the match glob patterns and expected results + for match_glob, expected_names in match_glob_results.items(): + file_objs,prefixes = self.storage.get_file_list("a_bucket_name", match_glob=match_glob) + filtered_names = [obj['name'] for obj in file_objs if obj] + self.assertEqual(filtered_names, expected_names) + + + def test_get_file_ids_by_wrong_delimiter_and_matchglob(self): + content = "Helloworld".encode("utf8") + self.storage.create_file("a_bucket_name", 'file.png', content, {"bucket": "a_bucket_name", "name": "file.png"}) + + # Iterate through the match glob patterns and expected results + with self.assertRaises(BadRequest): + self.storage.get_file_list("a_bucket_name", delimiter="*", match_glob="*.png") + + def test_get_file_ids_by_slash_delimiter_and_matchglob(self): + # File names with distinct patterns + blob_names = ["all/foo/bar", "foo/baz", "foo/389_bar", "bar", "baz"] + content = "Helloworld".encode("utf8") + for name in blob_names: + file_obj = {"bucket": "a_bucket_name", "name": name} + self.storage.create_file("a_bucket_name", name, content, file_obj) + + match_glob_results = { + "foo*bar": [], + "foo**bar": [], + "**/bar": ["bar"], + "*/bar": [], + "*/ba[rz]": [], + "**ba[rz]": ["bar","baz"], + "*/ba[!a-y]": [], + "*ba[!a-y]": ["baz"], + "**/{foobar,baz}": ["baz"], + "foo/{foo*,*baz}": [], + "*{foo*,*baz}": ["baz"], + } + + # Iterate through the match glob patterns and expected results + for match_glob, expected_names in match_glob_results.items(): + file_objs,prefixes = self.storage.get_file_list("a_bucket_name", match_glob=match_glob, delimiter="/") + filtered_names = [obj['name'] for obj in file_objs if obj] + self.assertEqual(filtered_names, expected_names) + def test_create_file_for_resumable_upload(self): test_file = os.path.join( os.getcwd(), STORAGE_BASE, STORAGE_DIR, "a_bucket_name", "file_name.png"