datalab-to · maen08 · May 1, 2024 · May 1, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,36 @@
+# Image that runs the pdftext command-line script (extract_text.py).
+FROM python:3.12.3-alpine
+
+# PYTHONDONTWRITEBYTECODE / PYTHONUNBUFFERED: no .pyc files, unbuffered output.
+# POETRY_VIRTUALENVS_CREATE=false: make Poetry install into the interpreter the
+# entrypoint uses rather than into a separate virtualenv that a plain
+# `python3` invocation would not see.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    POETRY_VIRTUALENVS_CREATE=false
+
+# Compiler toolchain and headers; presumably required to build native
+# dependencies from source on Alpine -- confirm before trimming this list.
+RUN apk add --no-cache \
+    build-base \
+    gfortran \
+    lapack-dev \
+    libffi-dev \
+    py3-pip \
+    python3-dev
+
+WORKDIR /app
+
+RUN pip install --no-cache-dir poetry
+
+# Copy only the dependency manifests first so the dependency layer is reused
+# until they change.
+COPY pyproject.toml poetry.lock /app/
+RUN poetry install --no-root
+
+# Application source goes last because it changes most often.
+COPY . /app/
+
+# ENTRYPOINT (not CMD) so arguments given to `docker run pdftext ...` are
+# appended to the script invocation instead of replacing the command.
+ENTRYPOINT ["python3", "extract_text.py"]
diff --git a/README.md b/README.md
@@ -89,6 +89,30 @@ text = dictionary_output(pdf, sort=False, page_range=[1,2,3]) # Optional argumen
 
 If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper.  pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well.
 
+# Run on Docker
+Clone the repository:
+```
+git clone <repository-url>
+
+```
+
+Build the Docker image:
+```
+cd pdftext
+docker build -t pdftext .
+
+```
+
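+Run the container, mounting the directory that holds your PDF so the container can read it and the output is written back to the host:
+```
+# write out a text file (PDF_PATH is relative to the current directory)
+docker run -v "$(pwd)":/data pdftext /data/PDF_PATH --out_path /data/output.txt
+
+# write out a json file
+docker run -v "$(pwd)":/data pdftext /data/PDF_PATH --out_path /data/output.json --json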
+
+```
+
 # Benchmarks
 
 I benchmarked extraction speed and accuracy of [pymupdf](https://pymupdf.readthedocs.io/en/latest/), [pdfplumber](https://github.com/jsvine/pdfplumber), and pdftext.  I chose pymupdf because it extracts blocks and lines.  Pdfplumber extracts words and bboxes.  I did not benchmark pypdf, even though it is a great library, because it doesn't provide individual character/line/block and bbox information.

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -35,4 +35,4 @@ requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry.scripts]
-pdftext = "extract_text:main"
+pdftext = "extract_text:main"
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,7 @@
+joblib==1.4.0
+numpy==1.26.4
+pydantic==2.7.1
+pydantic-settings==2.2.1
+pypdfium2==4.29.0
+scikit-learn==1.4.2
+