This repository has been archived by the owner on Nov 9, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Copied tika config from main lodestone project
- Loading branch information
Showing
2 changed files
with
56 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# Base stage shared by every later stage: Ubuntu bionic with a fetched apt package index | ||
# and the Tika release number that the download and runtime stages read. | ||
# taken from https://raw.githubusercontent.com/LogicalSpark/docker-tikaserver/master/Dockerfile | ||
FROM ubuntu:bionic AS base | ||
# Stages built FROM base inherit this package index for their apt-get install steps. | ||
RUN apt-get update | ||
|
||
# key=value form; the space-separated `ENV key value` syntax is legacy. | ||
ENV TIKA_VERSION=1.23 | ||
# MAINTAINER is deprecated; a label carries the same contact metadata. | ||
# NOTE(review): the address appears redacted in this copy - restore the real one. | ||
LABEL maintainer="[email protected]" | ||
|
||
# Dependencies stage: headless Java 11, GDAL tools, Tesseract OCR with language data, curl and fonts. | ||
FROM base AS dependencies | ||
|
||
# Refresh the package index in the same layer as the install, so a cached index layer | ||
# can never be stale relative to the mirror when this package list changes. | ||
# The apt lists are left in place because stages derived from this one run further apt-get install steps. | ||
RUN apt-get update \ | ||
    && DEBIAN_FRONTEND=noninteractive apt-get -y install \ | ||
        curl \ | ||
        gdal-bin \ | ||
        openjdk-11-jre-headless \ | ||
        tesseract-ocr \ | ||
        tesseract-ocr-deu \ | ||
        tesseract-ocr-eng \ | ||
        tesseract-ocr-fra \ | ||
        tesseract-ocr-ita \ | ||
        tesseract-ocr-spa | ||
|
||
# Pre-answer the mscorefonts EULA prompt through debconf so the installer can run non-interactively. | ||
RUN echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \ | ||
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \ | ||
        cabextract \ | ||
        fonts-freefont-ttf \ | ||
        fonts-liberation \ | ||
        ttf-mscorefonts-installer \ | ||
        wget \ | ||
        xfonts-utils | ||
|
||
# Download stage: fetches the Tika server jar and its detached signature, then verifies the jar with GnuPG. | ||
FROM dependencies AS fetch_tika | ||
|
||
# Download locations: the Apache mirror redirector / main dist site first, archive.apache.org as fallback. | ||
ENV NEAREST_TIKA_SERVER_URL="https://www.apache.org/dyn/closer.cgi/tika/tika-server-${TIKA_VERSION}.jar?filename=tika/tika-server-${TIKA_VERSION}.jar&action=download" \ | ||
    ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar" \ | ||
    DEFAULT_TIKA_SERVER_ASC_URL="https://www.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar.asc" \ | ||
    ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar.asc" \ | ||
    TIKA_VERSION=$TIKA_VERSION | ||
|
||
# sh gives && and || equal precedence, so each "primary || fallback" download is wrapped in { ...; } | ||
# to make it a single step of the && chain; any failing step (install, key import, both downloads, verify) aborts the build. | ||
# The KEYS file is saved before import instead of piped, so a failed download is not hidden by the pipe. | ||
# URLs are quoted because the mirror URL contains ? and & characters. | ||
RUN apt-get update \ | ||
    && DEBIAN_FRONTEND=noninteractive apt-get -y install gnupg2 \ | ||
    && wget -t 10 --max-redirect 1 --retry-connrefused -q -O /tmp/tika-KEYS https://www.apache.org/dist/tika/KEYS \ | ||
    && gpg --import /tmp/tika-KEYS \ | ||
    && { wget -t 10 --max-redirect 1 --retry-connrefused "$NEAREST_TIKA_SERVER_URL" -O "/tika-server-${TIKA_VERSION}.jar" \ | ||
        || wget "$ARCHIVE_TIKA_SERVER_URL" -O "/tika-server-${TIKA_VERSION}.jar"; } \ | ||
    && { wget -t 10 --max-redirect 1 --retry-connrefused "$DEFAULT_TIKA_SERVER_ASC_URL" -O "/tika-server-${TIKA_VERSION}.jar.asc" \ | ||
        || wget "$ARCHIVE_TIKA_SERVER_ASC_URL" -O "/tika-server-${TIKA_VERSION}.jar.asc"; } \ | ||
    && gpg --verify "/tika-server-${TIKA_VERSION}.jar.asc" "/tika-server-${TIKA_VERSION}.jar" | ||
|
||
# Runtime stage: the dependency layers plus the Tika server jar from fetch_tika and the Tika configuration file. | ||
FROM dependencies AS runtime | ||
# Removes apt caches and temp files from the final filesystem; layers inherited from earlier stages keep their size. | ||
RUN apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* | ||
ENV TIKA_VERSION=$TIKA_VERSION | ||
COPY --from=fetch_tika /tika-server-${TIKA_VERSION}.jar /tika-server-${TIKA_VERSION}.jar | ||
COPY tika-config.xml /tika-config.xml | ||
|
||
# NOTE(review): no USER instruction is set in any stage shown here, so the server runs as the base image's default user - consider a dedicated non-root user. | ||
EXPOSE 9998 | ||
# Exec form with an explicit `exec`: the shell is only used to expand TIKA_VERSION and is then replaced by java, | ||
# so the JVM is PID 1 and receives the stop signal sent by `docker stop`. | ||
ENTRYPOINT ["/bin/sh", "-c", "exec java -jar /tika-server-${TIKA_VERSION}.jar -h 0.0.0.0 --config /tika-config.xml"] | ||
|
||
# Probe the /tika endpoint on the exposed port; curl --fail turns HTTP error statuses into a non-zero exit. | ||
HEALTHCHECK --interval=5s --timeout=5s --retries=5 CMD curl --silent --fail localhost:9998/tika || exit 1 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!-- Tika configuration: DefaultParser with the PDF parser excluded, plus a separately declared PDF parser with its own parameters. --> | ||
<properties> | ||
  <parsers> | ||
    <!-- Default parser set, minus PDFParser. --> | ||
    <parser class="org.apache.tika.parser.DefaultParser"> | ||
      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> | ||
    </parser> | ||
    <!-- PDFParser declared on its own so that ocrStrategy can be set to ocr_and_text. --> | ||
    <parser class="org.apache.tika.parser.pdf.PDFParser"> | ||
      <params> | ||
        <param name="ocrStrategy" type="string">ocr_and_text</param> | ||
      </params> | ||
    </parser> | ||
  </parsers> | ||
</properties> |