Skip to content

Commit

Permalink
fix: sidecars download permissions TDE-1007 (#806)
Browse files Browse the repository at this point in the history
* wip: optional inputs for downloading files

* fix: download sidecars at same time as tiffs

* fix: optional parameter default value

* fix: formatting

* fix: download sidecar files, ignore 404

* fix: appease pylint

* fix: remove temporary sidecar test file

* fix: handle local exception

* fix: handle local exception better

* fix: remove testing array

* refactor: Use generic error for S3 & local file read

Simplifies users of `read`.

* fix: update translate_ascii script

* docs: update for new functions

* test: for fs functions

* fix: formatting

* test: fix list

* fix: rename test file for pylint issue

* fix: rename test file for pylint issue

* test: minor changes

* fix: formatting

* fix: Enable importing `scripts.stac`

* fix: Disable duplicate code test (#817)

* fix: Disable duplicate code test

It's not particularly useful for test code.

* fix: Run a single pylint command for the repo

Ensures that any checks which deal with multiple files (such as
duplicate code) can detect these.

* fix: disable pylint duplicate check at the individual test level

* fix: remove duplicate check

---------

Co-authored-by: Victor Engmark <[email protected]>
Co-authored-by: Paul Fouquet <[email protected]>
  • Loading branch information
3 people authored Jan 24, 2024
1 parent cde875d commit 16b5b04
Show file tree
Hide file tree
Showing 9 changed files with 115 additions and 41 deletions.
74 changes: 41 additions & 33 deletions scripts/files/fs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import Future, ThreadPoolExecutor, as_completed
from typing import List, Optional

from boto3 import resource
from linz_logger import get_log

from scripts.aws.aws_helper import is_s3
Expand Down Expand Up @@ -33,9 +34,15 @@ def read(path: str) -> bytes:
bytes: The bytes content of the file.
"""
if is_s3(path):
return fs_s3.read(path)
try:
return fs_s3.read(path)
except resource("s3").meta.client.exceptions.NoSuchKey as error:
raise NoSuchFileError from error

return fs_local.read(path)
try:
return fs_local.read(path)
except FileNotFoundError as error:
raise NoSuchFileError from error


def exists(path: str) -> bool:
Expand Down Expand Up @@ -65,10 +72,7 @@ def write_all(inputs: List[str], target: str, concurrency: Optional[int] = 4) ->
"""
written_tiffs: List[str] = []
with ThreadPoolExecutor(max_workers=concurrency) as executor:
futuress = {
executor.submit(write, os.path.join(target, f"{os.path.basename(input_)}"), read(input_)): input_
for input_ in inputs
}
futuress = {write_file(executor, input_, target): input_ for input_ in inputs}
for future in as_completed(futuress):
if future.exception():
get_log().warn("Failed Read-Write", error=future.exception())
Expand All @@ -77,40 +81,44 @@ def write_all(inputs: List[str], target: str, concurrency: Optional[int] = 4) ->

if len(inputs) != len(written_tiffs):
get_log().error("Missing Files", count=len(inputs) - len(written_tiffs))
raise Exception("Not all source files were written")
raise Exception("Not all mandatory source files were written")

return written_tiffs


def find_sidecars(inputs: List[str], extensions: List[str], concurrency: Optional[int] = 4) -> List[str]:
"""Searches for sidecar files.
A sidecar file is a file with the same name as the input file but with a different extension.
def write_sidecars(inputs: List[str], target: str, concurrency: Optional[int] = 4) -> None:
"""Writes list of files to target destination using multithreading.
Args:
inputs: list of input files to search for extensions
extensions: the sidecar file extensions
inputs: list of files to read
target: target folder to write to
concurrency: max thread pool workers
Returns:
list of existing sidecar files
"""
with ThreadPoolExecutor(max_workers=concurrency) as executor:
try:
results = {write_file(executor, input_, target): input_ for input_ in inputs}
for future in as_completed(results):
get_log().info("wrote_sidecar_file", path=future.result())
except NoSuchFileError:
get_log().info("No sidecar file found; skipping")

def _validate_path(path: str) -> Optional[str]:
"""Helper inner function to re-return the path if it exists rather than a boolean."""
if exists(path):
return path
return None

sidecars: List[str] = []
with ThreadPoolExecutor(max_workers=concurrency) as executor:
for extension in extensions:
futuress = {
executor.submit(_validate_path, f"{os.path.splitext(input_)[0]}{extension}"): input_ for input_ in inputs
}
for future in as_completed(futuress):
if future.exception():
get_log().warn("Find sidecar failed", error=future.exception())
else:
result = future.result()
if result:
sidecars.append(result)
return sidecars
def write_file(executor: ThreadPoolExecutor, input_: str, target: str) -> Future[str]:
    """Schedule writing the contents of `input_` into the `target` folder.

    Note that the source file is read eagerly on the calling thread (the
    `read(input_)` argument is evaluated before `submit` runs); only the
    write itself is performed on the executor. Callers rely on this: a
    `NoSuchFileError` from `read` is raised immediately at submission time.

    Args:
        executor: A ThreadPoolExecutor instance.
        input_: A path to a file to read.
        target: A path to write the file to.

    Returns:
        Future[str]: The result of the execution.
    """
    destination = os.path.join(target, os.path.basename(input_))
    return executor.submit(write, destination, read(input_))


class NoSuchFileError(Exception):
    """Raised when a requested file does not exist, locally or in S3.

    Used as a single generic "missing file" error so callers of `read` do
    not have to distinguish `FileNotFoundError` from the S3 `NoSuchKey`
    client exception.
    """
58 changes: 58 additions & 0 deletions scripts/files/tests/fs_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import json

from boto3 import resource
from moto import mock_s3
from moto.s3.responses import DEFAULT_REGION_NAME
from pytest import CaptureFixture, raises

from scripts.files.fs import NoSuchFileError, read, write_all, write_sidecars


def test_read_key_not_found_local() -> None:
    """`read` wraps a local FileNotFoundError in the generic NoSuchFileError."""
    missing_path = "test_dir/test.file"
    with raises(NoSuchFileError):
        read(missing_path)


@mock_s3  # type: ignore
def test_read_key_not_found_s3(capsys: CaptureFixture[str]) -> None:
    """`read` raises NoSuchFileError for a missing S3 key and logs `s3_key_not_found`."""
    mock_resource = resource("s3", region_name=DEFAULT_REGION_NAME)
    mock_resource.create_bucket(Bucket="testbucket")

    with raises(NoSuchFileError):
        read("s3://testbucket/test.file")

    log_record = json.loads(capsys.readouterr().out)
    assert log_record["msg"] == "s3_key_not_found"


def test_write_all_file_not_found_local() -> None:
    """`write_all` propagates NoSuchFileError when a mandatory local source file is missing."""
    missing_inputs = ["/test.prj"]
    with raises(NoSuchFileError):
        write_all(missing_inputs, "/tmp")


def test_write_sidecars_file_not_found_local(capsys: CaptureFixture[str]) -> None:
    """`write_sidecars` skips a missing local sidecar file, logging instead of raising."""
    write_sidecars(["/test.prj"], "/tmp")

    log_record = json.loads(capsys.readouterr().out.strip())
    assert log_record["msg"] == "No sidecar file found; skipping"


@mock_s3  # type: ignore
def test_write_all_key_not_found_s3() -> None:
    """`write_all` propagates NoSuchFileError when a mandatory S3 object is missing."""
    mock_resource = resource("s3", region_name=DEFAULT_REGION_NAME)
    mock_resource.create_bucket(Bucket="testbucket")

    with raises(NoSuchFileError):
        write_all(["s3://testbucket/test.tif"], "/tmp")


@mock_s3  # type: ignore
def test_write_sidecars_key_not_found_s3(capsys: CaptureFixture[str]) -> None:
    """`write_sidecars` skips a missing S3 sidecar object, logging instead of raising."""
    mock_resource = resource("s3", region_name=DEFAULT_REGION_NAME)
    mock_resource.create_bucket(Bucket="testbucket")

    write_sidecars(["s3://testbucket/test.prj"], "/tmp")

    # The captured output is not a single valid JSON document, so search the
    # raw text for the expected message instead of parsing it.
    assert "No sidecar file found; skipping" in capsys.readouterr().out
Empty file added scripts/stac/__init__.py
Empty file.
1 change: 1 addition & 0 deletions scripts/stac/tests/collection_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from scripts.stac.imagery.provider import Provider, ProviderRole


# pylint: disable=duplicate-code
@pytest.fixture(name="metadata", autouse=True)
def setup() -> Generator[CollectionTitleMetadata, None, None]:
metadata: CollectionTitleMetadata = {
Expand Down
1 change: 1 addition & 0 deletions scripts/stac/tests/generate_description_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from scripts.stac.imagery.metadata_constants import CollectionTitleMetadata


# pylint: disable=duplicate-code
@pytest.fixture(name="metadata", autouse=True)
def setup() -> Generator[Tuple[CollectionTitleMetadata, CollectionTitleMetadata], None, None]:
metadata_auck: CollectionTitleMetadata = {
Expand Down
1 change: 1 addition & 0 deletions scripts/stac/tests/generate_title_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from scripts.stac.imagery.metadata_constants import CollectionTitleMetadata


# pylint: disable=duplicate-code
@pytest.fixture(name="metadata", autouse=True)
def setup() -> Generator[Tuple[CollectionTitleMetadata, CollectionTitleMetadata], None, None]:
metadata_auck: CollectionTitleMetadata = {
Expand Down
1 change: 1 addition & 0 deletions scripts/stac/tests/item_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def test_imagery_stac_item(mocker) -> None: # type: ignore
assert {"rel": "self", "href": f"./{id_}.json", "type": "application/json"} in item.stac["links"]


# pylint: disable=duplicate-code
def test_imagery_add_collection(mocker) -> None: # type: ignore
metadata: CollectionTitleMetadata = {
"category": "Urban Aerial Photos",
Expand Down
10 changes: 7 additions & 3 deletions scripts/standardising.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from scripts.cli.cli_helper import TileFiles
from scripts.files.file_tiff import FileTiff, FileTiffType
from scripts.files.files_helper import ContentType, is_tiff
from scripts.files.fs import exists, find_sidecars, read, write, write_all
from scripts.files.fs import exists, read, write, write_all, write_sidecars
from scripts.gdal.gdal_bands import get_gdal_band_offset
from scripts.gdal.gdal_helper import get_gdal_version, run_gdal
from scripts.gdal.gdal_preset import (
Expand Down Expand Up @@ -135,8 +135,12 @@ def standardising(
# Download any needed file from S3 ["/foo/bar.tiff", "s3://foo"] => "/tmp/bar.tiff", "/tmp/foo.tiff"
with tempfile.TemporaryDirectory() as tmp_path:
standardized_working_path = os.path.join(tmp_path, standardized_file_name)
sidecars = find_sidecars(files.inputs, [".prj", ".tfw"])
source_files = write_all(files.inputs + sidecars, f"{tmp_path}/source/")
sidecars: List[str] = []
for extension in [".prj", ".tfw"]:
for file_input in files.inputs:
sidecars.append(f"{os.path.splitext(file_input)[0]}{extension}")
source_files = write_all(files.inputs, f"{tmp_path}/source/")
write_sidecars(sidecars, f"{tmp_path}/source/")
source_tiffs = [file for file in source_files if is_tiff(file)]

vrt_add_alpha = True
Expand Down
10 changes: 5 additions & 5 deletions scripts/translate_ascii.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from scripts.cli.cli_helper import is_argo
from scripts.files.files_helper import get_file_name_from_path
from scripts.files.fs import find_sidecars, read, write_all
from scripts.files.fs import read, write_all, write_sidecars
from scripts.gdal.gdal_helper import run_gdal
from scripts.gdal.gdal_preset import get_ascii_translate_command
from scripts.logging.time_helper import time_in_ms
Expand Down Expand Up @@ -56,10 +56,10 @@ def main() -> None:

# copy any sidecar files to target
sidecars = []
start_time = time_in_ms()
for ls in asc_files:
sidecars += find_sidecars(ls, [".prj", ".tfw"])
write_all(inputs=sidecars, target=arguments.target)
for extension in [".prj", ".tfw"]:
for ls in asc_files:
sidecars.append(f"{os.path.splitext(ls)[0]}{extension}")
write_sidecars(sidecars, f"{tmp_path}/source/")
get_log().info("sidecar files copied", duration=time_in_ms() - start_time, count=len(sidecars))


Expand Down

0 comments on commit 16b5b04

Please sign in to comment.