Skip to content
This repository has been archived by the owner on Nov 9, 2024. It is now read-only.

Commit

Permalink
Copied tika config from main lodestone project
Browse files Browse the repository at this point in the history
  • Loading branch information
dskaggs committed Feb 4, 2021
1 parent bdd6bd9 commit d2fbc38
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 0 deletions.
43 changes: 43 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Adapted from https://raw.githubusercontent.com/LogicalSpark/docker-tikaserver/master/Dockerfile
#
# Stage "base": the Ubuntu image that every other stage in this file builds on.
FROM ubuntu:bionic AS base

# Fetch the apt package index in the shared base layer so that stages built on
# top of it can install packages.
RUN apt-get update

# Tika release to download and run. This is an ENV rather than an ARG because
# the container's ENTRYPOINT expands it when the container starts.
ENV TIKA_VERSION=1.23

# MAINTAINER is deprecated; the author is recorded as an OCI image label instead.
LABEL org.opencontainers.image.authors="[email protected]"

# Stage "dependencies": OS packages shared by the download stage and the final
# image -- a headless Java 11 runtime, GDAL tools, Tesseract OCR with English,
# Italian, French, Spanish and German language data, curl, wget and fonts.
FROM base AS dependencies

# Refresh the package index in the same layer as the install. If the index
# comes only from an older cached layer, it can point at package versions the
# mirrors no longer serve and the install fails.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get -y install \
        curl \
        gdal-bin \
        openjdk-11-jre-headless \
        tesseract-ocr \
        tesseract-ocr-deu \
        tesseract-ocr-eng \
        tesseract-ocr-fra \
        tesseract-ocr-ita \
        tesseract-ocr-spa

# Pre-seed debconf with the Microsoft core fonts EULA acceptance so that
# ttf-mscorefonts-installer can be installed without an interactive prompt.
RUN echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        cabextract \
        fonts-freefont-ttf \
        fonts-liberation \
        ttf-mscorefonts-installer \
        wget \
        xfonts-utils

# Stage "fetch_tika": downloads the Tika server jar and its detached signature,
# then verifies the jar with GnuPG. Only the verified jar is meant to leave this
# stage (via COPY --from), so gnupg2 and the download URLs stay out of the
# final image.
FROM dependencies AS fetch_tika

# Download locations. Each artifact has a primary URL and an archive.apache.org
# fallback. TIKA_VERSION is inherited from the base stage.
ENV NEAREST_TIKA_SERVER_URL="https://www.apache.org/dyn/closer.cgi/tika/tika-server-${TIKA_VERSION}.jar?filename=tika/tika-server-${TIKA_VERSION}.jar&action=download" \
    ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar" \
    DEFAULT_TIKA_SERVER_ASC_URL="https://www.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar.asc" \
    ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/tika-server-${TIKA_VERSION}.jar.asc"

# `set -eu` aborts the build at the first failing step or unset variable, so
# each step's success no longer depends on how a long chain of `&&` and `||`
# happens to associate. The only tolerated failures are the explicit `||`
# fallbacks below. URLs are quoted because they contain `?` and `&`.
RUN set -eu; \
    jar="/tika-server-${TIKA_VERSION}.jar"; \
    apt-get update; \
    DEBIAN_FRONTEND=noninteractive apt-get -y install gnupg2; \
    # Download KEYS to a file instead of piping it into gpg, so that a failed
    # download is detected rather than hidden behind the pipeline's status.
    wget -t 10 --max-redirect 1 --retry-connrefused -q -O /tmp/tika-KEYS https://www.apache.org/dist/tika/KEYS; \
    # A non-zero status from the import is tolerated (as it effectively was
    # before); the signature check at the end is what gates the build.
    gpg --import /tmp/tika-KEYS || true; \
    # Jar: try the mirror redirector first, then the archive. `-O` truncates
    # the target, so a partial first attempt is overwritten by the fallback.
    wget -t 10 --max-redirect 1 --retry-connrefused "$NEAREST_TIKA_SERVER_URL" -O "$jar" \
        || wget "$ARCHIVE_TIKA_SERVER_URL" -O "$jar"; \
    # Detached signature: same primary-then-archive strategy.
    wget -t 10 --max-redirect 1 --retry-connrefused "$DEFAULT_TIKA_SERVER_ASC_URL" -O "$jar.asc" \
        || wget "$ARCHIVE_TIKA_SERVER_ASC_URL" -O "$jar.asc"; \
    # Fail the build unless the jar matches its signature.
    gpg --verify "$jar.asc" "$jar"

# Stage "runtime": the final image. It starts from the shared dependencies and
# adds the jar from the fetch_tika stage plus the Tika configuration file.
# TIKA_VERSION is inherited from the base stage, so it is not re-declared here.
FROM dependencies AS runtime

# Remove apt caches and temporary files from the final filesystem. The files
# still exist in the inherited layers, so this tidies the container's view of
# the filesystem but does not reduce the image size.
RUN apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

COPY --from=fetch_tika /tika-server-${TIKA_VERSION}.jar /tika-server-${TIKA_VERSION}.jar
COPY tika-config.xml /tika-config.xml

# Port the health check below probes (documentation only; it publishes nothing).
EXPOSE 9998

# Exec form with an explicit `exec`: the shell is still needed to expand
# TIKA_VERSION at start-up, but it is then replaced by the JVM, so java runs as
# PID 1 and receives the stop signal from `docker stop` directly. The previous
# shell form left /bin/sh as PID 1 with java as its child.
ENTRYPOINT ["/bin/sh", "-c", "exec java -jar /tika-server-${TIKA_VERSION}.jar -h 0.0.0.0 --config /tika-config.xml"]

# Healthy when the /tika endpoint answers with a non-error HTTP status.
HEALTHCHECK --interval=5s --timeout=5s --retries=5 CMD curl --silent --fail localhost:9998/tika || exit 1
13 changes: 13 additions & 0 deletions tika-config.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Tika configuration: PDFs are handled by an explicitly configured PDFParser;
     every other format goes through the default parser set. -->
<properties>
  <parsers>
    <!-- Default parsers, with the PDF parser excluded so that it is only
         registered once, by the entry below. -->
    <parser class="org.apache.tika.parser.DefaultParser">
      <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
    </parser>
    <!-- PDF parser with ocrStrategy set to "ocr_and_text"
         (presumably OCR in addition to regular text extraction; confirm
         against the Tika PDFParserConfig documentation). -->
    <parser class="org.apache.tika.parser.pdf.PDFParser">
      <params>
        <param name="ocrStrategy" type="string">ocr_and_text</param>
      </params>
    </parser>
  </parsers>
</properties>

0 comments on commit d2fbc38

Please sign in to comment.