From af4880e0fef0631d30615bffdcb8d1cbf9752364 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 23 Sep 2019 06:35:05 +0200 Subject: [PATCH 1/7] :bug: ocrd zip spill: directory -> dest pretty sure @bertsky fixed this before but cannot find commit --- ocrd/ocrd/cli/zip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/cli/zip.py b/ocrd/ocrd/cli/zip.py index 561a420645..b4be23951e 100644 --- a/ocrd/ocrd/cli/zip.py +++ b/ocrd/ocrd/cli/zip.py @@ -77,7 +77,7 @@ def spill(dest, src): """ resolver = Resolver() workspace_bagger = WorkspaceBagger(resolver) - workspace = workspace_bagger.spill(src, directory) + workspace = workspace_bagger.spill(src, dest) print(workspace) # ---------------------------------------------------------------------- From 774af2ad8e4db024008eeba6458eb153b30b3cba Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Mon, 23 Sep 2019 06:40:13 +0200 Subject: [PATCH 2/7] expose imagefilename workspace validation option to cli, #309 --- ocrd/ocrd/cli/workspace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index fc983eda33..6581a21c2b 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -47,7 +47,7 @@ def workspace_cli(ctx, directory, mets_basename, backup): ''') @pass_workspace @click.option('-a', '--download', is_flag=True, help="Download all files") -@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(['mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'pixel_density', 'page', 'url'])) +@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(['imagefilename', 'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'pixel_density', 'page', 'url'])) @click.option('--page-strictness', help="How strict to check PAGE consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict') @click.argument('mets_url') def validate_workspace(ctx, mets_url, download, skip, page_strictness): From 1389bd4aeabb001fafc21916337f0e12f2aae260 Mon Sep 17 00:00:00 2001 From: "Gerber, Mike" Date: Wed, 9 Oct 2019 17:25:28 +0200 Subject: [PATCH 3/7] Add CLI option to skip image dimension checking --- ocrd/ocrd/cli/workspace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ocrd/ocrd/cli/workspace.py b/ocrd/ocrd/cli/workspace.py index 6581a21c2b..32075ceb95 100644 --- a/ocrd/ocrd/cli/workspace.py +++ b/ocrd/ocrd/cli/workspace.py @@ -47,7 +47,7 @@ def workspace_cli(ctx, directory, mets_basename, backup): ''') @pass_workspace @click.option('-a', '--download', is_flag=True, help="Download all files") -@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(['imagefilename', 'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'pixel_density', 'page', 'url'])) +@click.option('-s', '--skip', help="Tests to skip", default=[], multiple=True, type=click.Choice(['imagefilename', 'dimension', 'mets_unique_identifier', 'mets_file_group_names', 'mets_files', 'pixel_density', 'page', 'url'])) @click.option('--page-strictness', help="How strict to check PAGE consistency", type=click.Choice(['strict', 'lax', 'fix', 'off']), default='strict') @click.argument('mets_url') def validate_workspace(ctx, mets_url, download, skip, page_strictness): From 9740532001319867aa7af0df6a6104380b2b3838 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 16 Oct 2019 16:50:15 +0200 Subject: [PATCH 4/7] workspace bagger: update PAGE imageFilenames, #176 --- ocrd/ocrd/workspace_bagger.py | 36 ++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/ocrd/ocrd/workspace_bagger.py b/ocrd/ocrd/workspace_bagger.py index c1021cc4b8..cba60872f1 100644 --- a/ocrd/ocrd/workspace_bagger.py +++ b/ocrd/ocrd/workspace_bagger.py @@ -1,5 +1,5 @@ from datetime import datetime -from os import makedirs, chdir, walk, getcwd +from os import makedirs, chdir, walk from os.path import join, isdir, basename, exists, relpath from shutil import make_archive, rmtree, copyfile, move from tempfile import mkdtemp @@ -16,9 +16,12 @@ is_local_filename, unzip_file_to_dir, + MIMETYPE_PAGE, VERSION, ) from ocrd_validators.constants import BAGIT_TXT, TMP_BAGIT_PREFIX, OCRD_BAGIT_PROFILE_URL +from ocrd_modelfactory import page_from_file +from ocrd_models.ocrd_page import to_xml from .workspace import Workspace @@ -58,9 +61,11 @@ def _log_or_raise(self, msg): def _bag_mets_files(self, workspace, bagdir, ocrd_manifestation_depth, ocrd_mets, processes): mets = workspace.mets + changed_urls = {} # TODO allow filtering by fileGrp@USE and such with pushd_popd(workspace.directory): + # URLs of the files before changing for f in mets.find_files(): log.info("Resolving %s (%s)", f.url, ocrd_manifestation_depth) if is_local_filename(f.url): @@ -77,15 +82,40 @@ def _bag_mets_files(self, workspace, bagdir, ocrd_manifestation_depth, ocrd_mets file_grp_dir = join(bagdir, 'data', f.fileGrp) if not isdir(file_grp_dir): makedirs(file_grp_dir) - self.resolver.download_to_directory(file_grp_dir, f.url, basename="%s%s" % (f.ID, f.extension)) - f.url = join(f.fileGrp, f.ID + f.extension) + + _basename = "%s%s" % (f.ID, f.extension) + _relpath = join(f.fileGrp, _basename) + self.resolver.download_to_directory(file_grp_dir, f.url, basename=_basename) + changed_urls[f.url] = _relpath + f.url = _relpath # save mets.xml with open(join(bagdir, 'data', ocrd_mets), 'wb') as f: f.write(workspace.mets.to_xml()) + # Walk through bagged workspace and fix the PAGE + # Page/@imageFilename and + # AlternativeImage/@filename + bag_workspace = Workspace(self.resolver, directory=join(bagdir, 'data')) + with pushd_popd(bag_workspace.directory): + for page_file in bag_workspace.mets.find_files(mimetype=MIMETYPE_PAGE): + pcgts = page_from_file(page_file) + changed = False + # page_doc.set(imageFileName + # for old, new in changed_urls: + for old, new in changed_urls.items(): + if pcgts.get_Page().imageFilename == old: + pcgts.get_Page().imageFilename = new + changed = True + # TODO replace AlternativeImage, recursively... + if changed: + with open(page_file.url, 'w') as out: + out.write(to_xml(pcgts)) + # log.info("Replace %s -> %s in %s" % (old, new, page_file)) + chdir(bagdir) total_bytes, total_files = make_manifests('data', processes, algorithms=['sha512']) + log.info("New vs. old: %s" % changed_urls) return total_bytes, total_files def _set_bag_info(self, bag, total_bytes, total_files, ocrd_identifier, ocrd_manifestation_depth, ocrd_base_version_checksum): From 9e05d7f3ab8d62572199602b71f1843ea6658384 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Wed, 16 Oct 2019 18:38:02 +0200 Subject: [PATCH 5/7] [WIP] update docker --- .dockerignore | 6 ++---- Dockerfile | 11 ++++++----- Makefile | 6 +++++- 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.dockerignore b/.dockerignore index efc07b377b..ea893ac36c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,7 +1,5 @@ * -!ocrd +!ocrd* !Makefile -!setup.py -!requirements.txt !LICENSE -!README.rst +!README.md diff --git a/Dockerfile b/Dockerfile index 84b9c41885..a273605326 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,10 +7,11 @@ ENV LANG C.UTF-8 WORKDIR /build-ocrd COPY ocrd ./ocrd +COPY ocrd_modelfactory ./ocrd_modelfactory/ +COPY ocrd_models ./ocrd_models +COPY ocrd_utils ./ocrd_utils COPY Makefile . -COPY setup.py . -COPY requirements.txt . -COPY README.rst . +COPY README.md . COPY LICENSE . RUN apt-get update && \ apt-get -y install --no-install-recommends \ @@ -19,9 +20,9 @@ RUN apt-get update && \ sudo \ git \ libglib2.0.0 \ - libsm6 \ - libxrender1 \ libxext6 + # libsm6 \ + # libxrender1 \ RUN make deps-ubuntu RUN pip3 install --upgrade pip RUN make deps install diff --git a/Makefile b/Makefile index 6914e37538..bbf7b8cb2b 100644 --- a/Makefile +++ b/Makefile @@ -16,6 +16,9 @@ BUILD_ORDER = ocrd_utils ocrd_models ocrd_modelfactory ocrd_validators ocrd FIND_VERSION = grep version= ocrd_utils/setup.py|grep -Po "([0-9ab]+\.?)+" +# Additional arguments to docker build. Default: '$(DOCKER_ARGS)' +DOCKER_ARGS = + # BEGIN-EVAL makefile-parser --make-help Makefile help: @@ -44,6 +47,7 @@ help: @echo " Variables" @echo "" @echo " PAGE_VERSION PAGE schema version to use. Default: '$(PAGE_VERSION)'" + @echo " DOCKER_ARGS Additional arguments to docker build. Default: '$(DOCKER_ARGS)'" @echo " DOCKER_TAG Docker tag." @echo " PIP_INSTALL pip install command. Default: $(PIP_INSTALL)" @@ -181,7 +185,7 @@ pyclean: # Build docker image docker: - docker build -t $(DOCKER_TAG) . + docker build -t $(DOCKER_TAG) $(DOCKER_ARGS) . # # bash library From 3b2101718d8d69b3868ffe352c0f059c179a480a Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 18 Oct 2019 16:29:06 +0200 Subject: [PATCH 6/7] update docker: base on 19.10 --- Dockerfile | 9 +++++---- Makefile | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index a273605326..b50d5a5916 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM ubuntu:18.04 +FROM ubuntu:19.10 MAINTAINER OCR-D ENV DEBIAN_FRONTEND noninteractive ENV PYTHONIOENCODING utf8 @@ -10,6 +10,7 @@ COPY ocrd ./ocrd COPY ocrd_modelfactory ./ocrd_modelfactory/ COPY ocrd_models ./ocrd_models COPY ocrd_utils ./ocrd_utils +COPY ocrd_validators/ ./ocrd_validators COPY Makefile . COPY README.md . COPY LICENSE . @@ -19,12 +20,12 @@ RUN apt-get update && \ make \ sudo \ git \ - libglib2.0.0 \ - libxext6 + libglib2.0.0 + # libxext6 # libsm6 \ # libxrender1 \ RUN make deps-ubuntu RUN pip3 install --upgrade pip -RUN make deps install +RUN make deps-ubuntu install ENTRYPOINT ["/usr/local/bin/ocrd"] diff --git a/Makefile b/Makefile index bbf7b8cb2b..d6daef6951 100644 --- a/Makefile +++ b/Makefile @@ -61,7 +61,7 @@ PIP_INSTALL = pip install # Dependencies for deployment in an ubuntu/debian linux deps-ubuntu: - sudo apt install -y python3 python3-pip + sudo apt-get install -y python3 python3-pip # Install test python deps via pip deps-test: From 08ddbb2e54e34c5f21b515d29b0483d55e12bde8 Mon Sep 17 00:00:00 2001 From: Konstantin Baierer Date: Fri, 18 Oct 2019 16:58:41 +0200 Subject: [PATCH 7/7] :package: v1.0.0 --- CHANGELOG.md | 5 +++++ ocrd_utils/setup.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6f15bb46a..40691ca0d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [1.0.0] - 2019-10-18 + * Workspace validation: Validate that files mentioned in pc:Page/@imageFilename exist in METS and on FS, #309 * `ocrd ocrd-tool parse-params` has the string-or-filepath logic for -p/--parameter as for the [CLI](https://ocr-d.github.io/cli#-p---parameter-param_json) @@ -587,6 +589,9 @@ Fixed Initial Release +[1.0.0]: ../../compare/v1.0.0...v1.0.0b19 +[1.0.0b19]: ../../compare/v1.0.0b19...v1.0.0b18 +[1.0.0b18]: ../../compare/v1.0.0b18...v1.0.0b17 [1.0.0b17]: ../../compare/v1.0.0b17...v1.0.0b16 [1.0.0b16]: ../../compare/v1.0.0b16...v1.0.0b15 [1.0.0b15]: ../../compare/v1.0.0b15...v1.0.0b14 diff --git a/ocrd_utils/setup.py b/ocrd_utils/setup.py index b0bac10c6f..7b7e56868c 100644 --- a/ocrd_utils/setup.py +++ b/ocrd_utils/setup.py @@ -5,7 +5,7 @@ setup( name='ocrd_utils', - version='1.0.0b19', + version='1.0.0', description='OCR-D framework - shared code, helpers, constants', long_description=open('README.md').read(), long_description_content_type='text/markdown',