diff --git a/Dockerfile b/Dockerfile
new file mode 100644
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,32 @@
+FROM python:3.12.3-alpine
+
+# PYTHONDONTWRITEBYTECODE / PYTHONUNBUFFERED: no .pyc files, unbuffered output.
+# POETRY_VIRTUALENVS_CREATE=false: install dependencies into the interpreter
+# that ENTRYPOINT runs; by default Poetry would put them in a separate
+# virtualenv that a plain `python3` cannot import from.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    POETRY_VIRTUALENVS_CREATE=false
+
+# Toolchain for dependencies that must be compiled on Alpine. pip and the
+# Python headers already ship with the base image, so py3-pip and python3-dev
+# (which would add a second, distro-packaged interpreter) are not installed.
+RUN apk add --no-cache \
+    build-base \
+    gfortran \
+    lapack-dev \
+    libffi-dev
+
+WORKDIR /app
+
+# Install dependencies from the manifests alone so this layer stays cached
+# until pyproject.toml or poetry.lock changes.
+COPY pyproject.toml poetry.lock /app/
+RUN pip install --no-cache-dir poetry==1.8.3 \
+    && poetry install --no-root --no-interaction
+
+COPY . /app/
+
+# ENTRYPOINT rather than CMD: arguments given to `docker run pdftext ...` are
+# appended to this command instead of replacing it.
+ENTRYPOINT ["python3", "extract_text.py"]
diff --git a/README.md b/README.md
--- a/README.md
+++ b/README.md
@@ -89,6 +89,31 @@ text = dictionary_output(pdf, sort=False, page_range=[1,2,3]) # Optional argumen
 
 If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper. pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well.
 
+# Run on Docker
+
+Clone the repository:
+
+```
+git clone https://github.com/VikParuchuri/pdftext.git
+cd pdftext
+```
+
+Build the Docker image:
+
+```
+docker build -t pdftext .
+```
+
+Run it. The container only sees files that are mounted into it, so mount the directory that holds the PDF (here the current directory) at `/data` and use `/data/...` paths for both the input and the output:
+
+```
+# write out a text file
+docker run --rm -v "$(pwd)":/data pdftext /data/PDF_PATH --out_path /data/output.txt
+
+# write out a json file
+docker run --rm -v "$(pwd)":/data pdftext /data/PDF_PATH --out_path /data/output.json --json
+```
+
 # Benchmarks
 
 I benchmarked extraction speed and accuracy of [pymupdf](https://pymupdf.readthedocs.io/en/latest/), [pdfplumber](https://github.com/jsvine/pdfplumber), and pdftext. I chose pymupdf because it extracts blocks and lines. Pdfplumber extracts words and bboxes. I did not benchmark pypdf, even though it is a great library, because it doesn't provide individual character/line/block and bbox information.
diff --git a/poetry.lock b/poetry.lock
index eb9abc9..3c80cad 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1911,4 +1911,4 @@ multidict = ">=4.0"
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.13,!=3.9.7"
-content-hash = "52cfc286016e488015d31fb2f8b9b92a715b81a352dfbd6dbbacb88808fb0294"
+content-hash = "52cfc286016e488015d31fb2f8b9b92a715b81a352dfbd6dbbacb88808fb0294"
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index a71f9de..9a94791 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,4 +35,4 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
-pdftext = "extract_text:main"
+pdftext = "extract_text:main"
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8ea5bed
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+joblib==1.4.0
+numpy==1.26.4
+pydantic==2.7.1
+pydantic-settings==2.2.1
+pypdfium2==4.29.0
+scikit-learn==1.4.2
+