diff --git a/projects/apache-tika/.gitignore b/projects/apache-tika/.gitignore new file mode 100644 index 000000000000..6a51a5b3fba0 --- /dev/null +++ b/projects/apache-tika/.gitignore @@ -0,0 +1,3 @@ +project-parent/tika +project-parent/fuzz-targets/target +project-parent/fuzz-targets/pom.xml.versionsBackup \ No newline at end of file diff --git a/projects/apache-tika/Dockerfile b/projects/apache-tika/Dockerfile new file mode 100644 index 000000000000..49a4e2af8afc --- /dev/null +++ b/projects/apache-tika/Dockerfile @@ -0,0 +1,35 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +FROM ghcr.io/aixcc-finals/base-builder-jvm:v1.3.0 + +# Install a pinned Maven release under $SRC/maven; build.sh invokes it through $MVN. curl -f makes an HTTP error fail this step instead of saving an error page as maven.zip. +RUN curl -fL https://archive.apache.org/dist/maven/maven-3/3.9.11/binaries/apache-maven-3.9.11-bin.zip -o maven.zip && \ + unzip maven.zip -d "$SRC/maven" && \ + rm -f maven.zip + +ENV MVN=$SRC/maven/apache-maven-3.9.11/bin/mvn + +COPY project-parent $SRC/project-parent/ +# Shallow-clone Tika into the parent project so build.sh can build it alongside fuzz-targets. +RUN git clone --depth 1 https://github.com/apache/tika/ $SRC/project-parent/tika + +COPY build.sh build_seeds.sh $SRC/ +# Package the seed corpora while the image is being built. +RUN cd $SRC && ./build_seeds.sh + +WORKDIR $SRC/project-parent/tika + diff --git a/projects/apache-tika/build.sh b/projects/apache-tika/build.sh new file mode 100755 index 000000000000..3e4b209ddc91 --- /dev/null +++ b/projects/apache-tika/build.sh @@ -0,0 +1,65 @@ +#!/bin/bash -eu +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +################################################################################ + +PROJECT=tika +MAIN_REPOSITORY=https://github.com/apache/tika/ + +MAVEN_ARGS="-Djavac.src.version=17 -Djavac.target.version=17 -DskipTests -Dcheckstyle.skip -Dossindex.skip -am -pl :tika-app" +# Point the fuzz-targets dependencies (Jazzer excluded) at the version reported by the Tika checkout. +function set_project_version_in_fuzz_targets_dependency { + PROJECT_VERSION=$(cd "$PROJECT" && $MVN org.apache.maven.plugins:maven-help-plugin:3.2.0:evaluate -Dexpression=project.version -q -DforceStdout) + # set dependency project version in fuzz-targets + (cd fuzz-targets && $MVN versions:use-dep-version -Dexcludes=com.code-intelligence:jazzer -DdepVersion="$PROJECT_VERSION" -DforceVersion=true) +} + +cd "$SRC/project-parent" + +set_project_version_in_fuzz_targets_dependency + +# Install tika-app plus the modules it needs (-am), then fuzz-targets, into the Maven repo at $OUT/m2. MAVEN_ARGS is left unquoted on purpose so it splits into separate options. +(cd "$PROJECT" && $MVN install $MAVEN_ARGS -Dmaven.repo.local="$OUT/m2") +$MVN -pl fuzz-targets install -Dmaven.repo.local="$OUT/m2" + +# build classpath +cp "$SRC/project-parent/fuzz-targets/target/fuzz-targets-0.0.1-SNAPSHOT.jar" "$OUT/fuzz-targets.jar" +RUNTIME_CLASSPATH_ABSOLUTE="$OUT/fuzz-targets.jar" +# replace $OUT with placeholder $this_dir that will be dissolved at runtime +RUNTIME_CLASSPATH=$(echo "$RUNTIME_CLASSPATH_ABSOLUTE" | sed "s|$OUT|\$this_dir|g") + +cp "${SRC}"/seeds/*_seed_corpus.zip "${OUT}/" + +for fuzzer in $(find "$SRC/project-parent" -name '*Fuzzer.java'); do + fuzzer_basename=$(basename -s .java "$fuzzer") + + # Create an execution wrapper for every fuzztarget; the wrapper forwards its own arguments quoted ("$@") so paths with spaces survive. + # This bumps memory to > 2gb to get around new byte[Integer.MAX_VALUE] single + # allocation issues that plague audio, video, image and other parsers. + # if we're able to get an oom > 2gb, we should really fix that. + echo "#!/bin/bash + # LLVMFuzzerTestOneInput comment for fuzzer detection by infrastructure. 
+ this_dir=\$(dirname \"\$0\") + mem_settings='-Xmx3000m:-Xss1024k' + LD_LIBRARY_PATH=\"$JVM_LD_LIBRARY_PATH\":\$this_dir \ + \$this_dir/jazzer_driver --agent_path=\$this_dir/jazzer_agent_deploy.jar \ + --cp=$RUNTIME_CLASSPATH \ + --target_class=com.example.$fuzzer_basename \ + -rss_limit_mb=3600 \ + --jvm_args=\"\$mem_settings\" \ + --instrumentation_includes=\"com.**:org.**\" \ + \"\$@\"" > "$OUT/$fuzzer_basename" + chmod u+x "$OUT/$fuzzer_basename" +done \ No newline at end of file diff --git a/projects/apache-tika/build_seeds.sh b/projects/apache-tika/build_seeds.sh new file mode 100755 index 000000000000..8b88a015c88b --- /dev/null +++ b/projects/apache-tika/build_seeds.sh @@ -0,0 +1,101 @@ +#!/bin/bash -eu +# Copyright 2025 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +################################################################################ + +mkdir -p "${SRC}/seeds" +#This packages the unit test files based on file extension from within the Tika project +#we could also pull in other seeds from other parser projects. 
+# xargs -r: skip zip when find matches nothing; zip with no input files exits non-zero and would abort this -e script. +find ${SRC}/project-parent/tika -name "*-webm.noext" -print0 | xargs -0 -r zip -u ${SRC}/seeds/AudioVideoParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*-mkv.noext" -print0 | xargs -0 -r zip -u ${SRC}/seeds/AudioVideoParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.aif" -print0 | xargs -0 -r zip -u ${SRC}/seeds/AudioVideoParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.au" -print0 | xargs -0 -r zip -u ${SRC}/seeds/AudioVideoParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.flv" -print0 | xargs -0 -r zip -u ${SRC}/seeds/AudioVideoParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.m4a" -print0 | xargs -0 -r zip -u ${SRC}/seeds/AudioVideoParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.mkv" -print0 | xargs -0 -r zip -u ${SRC}/seeds/AudioVideoParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.mp3" -print0 | xargs -0 -r zip -u ${SRC}/seeds/AudioVideoParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.wav" -print0 | xargs -0 -r zip -u ${SRC}/seeds/AudioVideoParsersFuzzer_seed_corpus.zip + + +find ${SRC}/project-parent/tika -name "*.Z" -print0 | xargs -0 -r zip -u ${SRC}/seeds/CompressorParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.bz2" -print0 | xargs -0 -r zip -u ${SRC}/seeds/CompressorParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.gz" -print0 | xargs -0 -r zip -u ${SRC}/seeds/CompressorParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.tbz2" -print0 | xargs -0 -r zip -u ${SRC}/seeds/CompressorParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.tgz" -print0 | xargs -0 -r zip -u ${SRC}/seeds/CompressorParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.zst" -print0 | xargs -0 -r zip -u ${SRC}/seeds/CompressorParserFuzzer_seed_corpus.zip + +find ${SRC}/project-parent/tika -name "*.html" -print0 | xargs -0 -r zip 
${SRC}/seeds/HtmlParserFuzzer_seed_corpus.zip + +find ${SRC}/project-parent/tika -name "*.avif" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.bmp" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.bpg" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.gif" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.heic" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.icns" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.jp2" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.jb2" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.jpg" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.jxl" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.png" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.psd" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.tif" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.webp" -print0 | xargs -0 -r zip -u ${SRC}/seeds/ImageParsersFuzzer_seed_corpus.zip + + +find ${SRC}/project-parent/tika -name "*.mdb" -print0 | xargs -0 -r zip ${SRC}/seeds/JackcessParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.accdb" -print0 | xargs -0 -r zip ${SRC}/seeds/JackcessParserFuzzer_seed_corpus.zip + +find 
${SRC}/project-parent/tika -name "*.one" -print0 | xargs -0 -r zip ${SRC}/seeds/OneNoteParserFuzzer_seed_corpus.zip + +#we could get more seeds by cloning POI +find ${SRC}/project-parent/tika -name "*.msg" -print0 | xargs -0 -r zip -u ${SRC}/seeds/OfficeParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.doc" -print0 | xargs -0 -r zip -u ${SRC}/seeds/OfficeParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.ppt" -print0 | xargs -0 -r zip -u ${SRC}/seeds/OfficeParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.xls" -print0 | xargs -0 -r zip -u ${SRC}/seeds/OfficeParserFuzzer_seed_corpus.zip + +find ${SRC}/project-parent/tika -name "*.docm" -print0 | xargs -0 -r zip -u ${SRC}/seeds/OOXMLParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.docx" -print0 | xargs -0 -r zip -u ${SRC}/seeds/OOXMLParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.pptm" -print0 | xargs -0 -r zip -u ${SRC}/seeds/OOXMLParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.pptx" -print0 | xargs -0 -r zip -u ${SRC}/seeds/OOXMLParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.xlsm" -print0 | xargs -0 -r zip -u ${SRC}/seeds/OOXMLParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.xlsx" -print0 | xargs -0 -r zip -u ${SRC}/seeds/OOXMLParserFuzzer_seed_corpus.zip + +find ${SRC}/project-parent/tika -name "*.7z" -print0 | xargs -0 -r zip -u ${SRC}/seeds/PackageParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.ar" -print0 | xargs -0 -r zip -u ${SRC}/seeds/PackageParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.jar" -print0 | xargs -0 -r zip -u ${SRC}/seeds/PackageParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.rar" -print0 | xargs -0 -r zip -u ${SRC}/seeds/PackageParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.tar" -print0 | xargs -0 -r zip -u ${SRC}/seeds/PackageParserFuzzer_seed_corpus.zip +find 
${SRC}/project-parent/tika -name "*.zip" -print0 | xargs -0 -r zip -u ${SRC}/seeds/PackageParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.zlib" -print0 | xargs -0 -r zip -u ${SRC}/seeds/PackageParserFuzzer_seed_corpus.zip + + +#we could get more seeds by cloning PDFBox or...? +find ${SRC}/project-parent/tika -name "*.pdf" -print0 | xargs -0 -r zip ${SRC}/seeds/PDFParserFuzzer_seed_corpus.zip + +find ${SRC}/project-parent/tika -name "*.eml" -print0 | xargs -0 -r zip ${SRC}/seeds/RFC822ParserFuzzer_seed_corpus.zip + +find ${SRC}/project-parent/tika -name "*.rtf" -print0 | xargs -0 -r zip ${SRC}/seeds/RTFParserFuzzer_seed_corpus.zip + +find ${SRC}/project-parent/tika -name "*.txt" -print0 | xargs -0 -r zip ${SRC}/seeds/TextAndCSVParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.tsv" -print0 | xargs -0 -r zip -u ${SRC}/seeds/TextAndCSVParserFuzzer_seed_corpus.zip +find ${SRC}/project-parent/tika -name "*.csv" -print0 | xargs -0 -r zip -u ${SRC}/seeds/TextAndCSVParserFuzzer_seed_corpus.zip + +find ${SRC}/project-parent/tika -name "*.xml" -print0 | xargs -0 -r zip ${SRC}/seeds/XMLReaderUtilsFuzzer_seed_corpus.zip + +find ${SRC}/project-parent/tika -path '*/test-documents/*' -type f -print0 | xargs -0 -r zip ${SRC}/seeds/AutoDetectParserFuzzer_seed_corpus.zip + +cp ${SRC}/seeds/*_seed_corpus.zip ${OUT}/ + diff --git a/projects/apache-tika/project-parent/fuzz-targets/pom.xml b/projects/apache-tika/project-parent/fuzz-targets/pom.xml new file mode 100644 index 000000000000..9cfd2606039e --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/pom.xml @@ -0,0 +1,106 @@ + + + + 4.0.0 + com.fuzzer + fuzz-targets + 0.0.1-SNAPSHOT + fuzz + fuzz + + + 17 + 17 + 17 + + + + + com.code-intelligence + jazzer + 0.24.0 + provided + + + org.apache.tika + tika-core + Fuzzing-SNAPSHOT + + + org.apache.tika + tika-parsers-standard-package + Fuzzing-SNAPSHOT + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.6.1 + + + package + + shade + + + + false + + 
+ + org.apache.tika:tika-parsers-standard-package:jar: + + + + + *:* + + META-INF/maven/plugin.xml + module-info.class + META-INF/* + LICENSE.txt + NOTICE.txt + CHANGES + README + builddef.lst + + javax/**/* + + + + + + + + + true + + + + + META-INF/LICENSE + target/classes/META-INF/LICENSE + + + META-INF/NOTICE + target/classes/META-INF/NOTICE + + + META-INF/DEPENDENCIES + target/classes/META-INF/DEPENDENCIES + + + META-INF/cxf/bus-extensions.txt + + + + + + + + + + diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/AudioVideoParsersFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/AudioVideoParsersFuzzer.java new file mode 100644 index 000000000000..43b5707f7850 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/AudioVideoParsersFuzzer.java @@ -0,0 +1,57 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; + +import org.apache.tika.parser.audio.AudioParser; +import org.apache.tika.parser.audio.MidiParser; +import org.apache.tika.parser.mp3.Mp3Parser; +import org.apache.tika.parser.mp4.MP4Parser; +import org.apache.tika.parser.video.FLVParser; + +import org.apache.tika.sax.ToTextContentHandler; + +/** Jazzer target: parses each input with an AutoDetectParser built from only the audio/video parsers listed in fuzzerTestOneInput. */ +class AudioVideoParsersFuzzer { + // NOTE(review): the parser set is rebuilt for every input; AutoDetectParserFuzzer keeps one static instance - consider whether the same would work here. + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser[] parsers = new Parser[] { + new AudioParser(), + new MidiParser(), + new Mp3Parser(), + new MP4Parser(), + new FLVParser() + }; + Parser p = new AutoDetectParser(parsers); + try { + ParserFuzzer.parseOne(p, bytes); + } catch (TikaException | SAXException | IOException e) { + //swallow: these three exception types are ignored rather than reported; any other Throwable propagates to the caller + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/AutoDetectParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/AutoDetectParserFuzzer.java new file mode 100644 index 000000000000..40abd21964b7 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/AutoDetectParserFuzzer.java @@ -0,0 +1,52 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; + +import org.apache.tika.sax.ToTextContentHandler; + +/** Jazzer target: feeds each input to one shared default AutoDetectParser, first through ParserFuzzer.parseOne and then through ParserFuzzer.parseRMetaFile. */ +class AutoDetectParserFuzzer { + private static final Parser AUTO_DETECT_PARSER = new AutoDetectParser(); + + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + try { + ParserFuzzer.parseOne(AUTO_DETECT_PARSER, bytes); + } catch (AssertionError | RuntimeException | IOException | TikaException | SAXException e) { + //swallow: AssertionError and every RuntimeException are ignored here too, so only other Throwables propagate to the caller + } + //now try rmeta (ParserFuzzer.parseRMetaFile is defined outside this file); it runs even when the first parse threw a swallowed exception + try { + ParserFuzzer.parseRMetaFile(AUTO_DETECT_PARSER, bytes); + } catch (AssertionError | RuntimeException | IOException | TikaException | SAXException e) { + //swallow: same set of ignored exception types as the first parse + } + } + +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/CompressorParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/CompressorParserFuzzer.java new file mode 100644 index 000000000000..b85ea28501db --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/CompressorParserFuzzer.java @@ -0,0 +1,43 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache 
License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.parser.pkg.CompressorParser; +import org.apache.tika.parser.Parser; + +/** Jazzer target: parses each input with a new CompressorParser through ParserFuzzer.parseOne (defined outside this file). */ +class CompressorParserFuzzer { + + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser p = new CompressorParser(); + try { + ParserFuzzer.parseOne(p, bytes); + } catch (NullPointerException | TikaException | SAXException | IOException e) { + //swallow. NOTE(review): NullPointerException is ignored as well - confirm this is intended and not hiding a parser bug + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/HtmlParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/HtmlParserFuzzer.java new file mode 100644 index 000000000000..2ac5f1e91c8e --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/HtmlParserFuzzer.java @@ -0,0 +1,37 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.html.JSoupParser; +import org.xml.sax.SAXException; + +final class HtmlParserFuzzer { + // Jazzer target: parses each input with a new JSoupParser through ParserFuzzer.parseOne (defined outside this file). + public static void fuzzerTestOneInput(final byte[] bytes) throws Throwable { + Parser p = new JSoupParser(); + try { + ParserFuzzer.parseOne(p, bytes); + } catch (TikaException | SAXException | IOException e) { + // swallow: these three exception types are ignored rather than reported; any other Throwable propagates to the caller + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/ImageParsersFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/ImageParsersFuzzer.java new file mode 100644 index 000000000000..6b0e9f68c1e3 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/ImageParsersFuzzer.java @@ -0,0 +1,63 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.Parser; + +import org.apache.tika.parser.image.BPGParser; +import org.apache.tika.parser.image.ImageParser; +import org.apache.tika.parser.image.PSDParser; +import org.apache.tika.parser.image.TiffParser; +import org.apache.tika.parser.image.WebPParser; +import org.apache.tika.parser.image.JpegParser; +import org.apache.tika.parser.image.HeifParser; +import org.apache.tika.parser.image.ICNSParser; +import org.apache.tika.parser.image.JXLParser; + +/** Jazzer target: parses each input with an AutoDetectParser built from only the image parsers listed in fuzzerTestOneInput. */ +class ImageParsersFuzzer { + // NOTE(review): the parser set is rebuilt for every input; AutoDetectParserFuzzer keeps one static instance - consider whether the same would work here. + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser[] parsers = new Parser[] { + new BPGParser(), + new ImageParser(), + new PSDParser(), + new TiffParser(), + new WebPParser(), + new JpegParser(), + new HeifParser(), + new ICNSParser(), + new JXLParser() + }; + Parser p = new AutoDetectParser(parsers); + try { + ParserFuzzer.parseOne(p, bytes); + } catch (TikaException | SAXException | IOException e) { + //swallow: these three exception types are ignored rather than reported; any other Throwable propagates to the caller + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/JackcessParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/JackcessParserFuzzer.java new file mode 100644 index 000000000000..f8102dc6d512 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/JackcessParserFuzzer.java @@ -0,0 +1,43 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you 
may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.parser.microsoft.JackcessParser; +import org.apache.tika.parser.Parser; + +/** Jazzer target: parses each input with a new JackcessParser through ParserFuzzer.parseOne (defined outside this file). */ +class JackcessParserFuzzer { + + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser p = new JackcessParser(); + try { + ParserFuzzer.parseOne(p, bytes); + } catch (IllegalArgumentException | TikaException | SAXException | IOException e) { + //swallow: IllegalArgumentException is ignored along with the checked parse exceptions; any other Throwable propagates to the caller + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/OOXMLParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/OOXMLParserFuzzer.java new file mode 100644 index 000000000000..ef5b922ec776 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/OOXMLParserFuzzer.java @@ -0,0 +1,53 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.microsoft.OfficeParserConfig; +import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.ToTextContentHandler; + +/** Jazzer target: parses each input with a new OOXMLParser, with macro extraction switched on through an OfficeParserConfig in the ParseContext. */ +class OOXMLParserFuzzer { + + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser p = new OOXMLParser(); + ParseContext parseContext = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); + parseContext.set(OfficeParserConfig.class, officeParserConfig); + + try { + ParserFuzzer.parseOne(p, bytes, parseContext); + } catch (org.apache.poi.ooxml.POIXMLException | org.apache.poi.util.RecordFormatException | + AssertionError | IllegalStateException | TikaException | SAXException | IOException e) { + //swallow: every type listed in the catch above is ignored rather than reported; any other Throwable propagates to the caller + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/OfficeParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/OfficeParserFuzzer.java new file mode 100644 index 
000000000000..a5263381e8c3 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/OfficeParserFuzzer.java @@ -0,0 +1,59 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.microsoft.OfficeParser; +import org.apache.tika.parser.microsoft.OfficeParserConfig; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.ToTextContentHandler; + +/** Jazzer target: parses each input with a new OfficeParser, with macro extraction switched on through an OfficeParserConfig in the ParseContext. */ +class OfficeParserFuzzer { + + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser p = new OfficeParser(); + ParseContext parseContext = new ParseContext(); + OfficeParserConfig officeParserConfig = new OfficeParserConfig(); + officeParserConfig.setExtractMacros(true); + parseContext.set(OfficeParserConfig.class, officeParserConfig); + + try { + ParserFuzzer.parseOne(p, bytes, parseContext); + } catch ( org.apache.poi.util.RecordFormatException | + org.apache.poi.hssf.record.RecordInputStream.LeftoverDataException 
| + org.apache.poi.hslf.exceptions.HSLFException | + IndexOutOfBoundsException | + AssertionError | IllegalArgumentException | IllegalStateException | java.util.NoSuchElementException | + java.nio.BufferUnderflowException | NegativeArraySizeException | NullPointerException | + TikaException | SAXException | IOException e) { + //swallow: every type listed in the catch above is ignored rather than reported; any other Throwable propagates to the caller + //org.apache.poi.hssf.OldExcelFormatException subclasses IllegalArgumentException + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/OneNoteParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/OneNoteParserFuzzer.java new file mode 100644 index 000000000000..cafe457a80e6 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/OneNoteParserFuzzer.java @@ -0,0 +1,42 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.microsoft.onenote.OneNoteParser; +import org.apache.tika.parser.Parser; + + +class OneNoteParserFuzzer { + + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser p = new OneNoteParser(); + try { + ParserFuzzer.parseOne(p, bytes); + } catch (TikaException | SAXException | IOException e) { + //swallow + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/PDFParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/PDFParserFuzzer.java new file mode 100644 index 000000000000..1f5abca04090 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/PDFParserFuzzer.java @@ -0,0 +1,49 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.pdf.PDFParser; +import org.apache.tika.parser.pdf.PDFParserConfig; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; + + +class PDFParserFuzzer { + + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser p = new PDFParser(); + PDFParserConfig config = new PDFParserConfig(); + //what else do we want to exercise? + config.setExtractActions(true); + ParseContext parseContext = new ParseContext(); + parseContext.set(PDFParserConfig.class, config); + try { + ParserFuzzer.parseOne(p, bytes, parseContext); + } catch (TikaException | SAXException | IOException e) { + //swallow + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/PackageParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/PackageParserFuzzer.java new file mode 100644 index 000000000000..64e1a3940ffa --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/PackageParserFuzzer.java @@ -0,0 +1,43 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.parser.pkg.PackageParser; +import org.apache.tika.parser.Parser; + + +class PackageParserFuzzer { + + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser p = new PackageParser(); + try { + ParserFuzzer.parseOne(p, bytes); + } catch (IllegalArgumentException | TikaException | SAXException | IOException e) { + //swallow + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/ParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/ParserFuzzer.java new file mode 100644 index 000000000000..034e086be7d0 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/ParserFuzzer.java @@ -0,0 +1,76 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.InputStream; + +import org.xml.sax.ContentHandler; + +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.BasicContentHandlerFactory; +import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.apache.tika.sax.ToTextContentHandler; + + +class ParserFuzzer { + + public static void parseOne(Parser parser, byte[] bytes, ParseContext parseContext) throws Throwable { + parseBytes(parser, bytes, parseContext); + parseFile(parser, bytes, parseContext); + } + + + public static void parseOne(Parser parser, byte[] bytes) throws Throwable { + parseBytes(parser, bytes, new ParseContext()); + parseFile(parser, bytes, new ParseContext()); + } + + public static void parseRMetaFile(Parser parser, byte[] bytes) throws Throwable { + RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); + RecursiveParserWrapperHandler rpwh = new RecursiveParserWrapperHandler( + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + tis.getPath(); + wrapper.parse(tis, rpwh, new Metadata(), new ParseContext()); + } + } + + public static void parseBytes(Parser parser, byte[] bytes, ParseContext parseContext) throws Throwable { + ContentHandler handler = new ToTextContentHandler(); + //make sure that other parsers cannot be invoked + parseContext.set(Parser.class, parser); + //try first with bytes + try (InputStream is = TikaInputStream.get(bytes)) { + parser.parse(is, handler, new Metadata(), parseContext); + } + } + + public static void parseFile(Parser parser, byte[] bytes, ParseContext parseContext) throws Throwable { + ContentHandler handler = new 
ToTextContentHandler(); + //make sure that other parsers cannot be invoked + parseContext.set(Parser.class, parser); + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + //force writing to tmp file + tis.getPath(); + parser.parse(tis, handler, new Metadata(), parseContext); + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/RFC822ParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/RFC822ParserFuzzer.java new file mode 100644 index 000000000000..6dfc6bbecb46 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/RFC822ParserFuzzer.java @@ -0,0 +1,42 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.mail.RFC822Parser; +import org.apache.tika.parser.Parser; + + +class RFC822ParserFuzzer { + + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser p = new RFC822Parser(); + try { + ParserFuzzer.parseOne(p, bytes); + } catch (TikaException | SAXException | IOException e) { + //swallow + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/RTFParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/RTFParserFuzzer.java new file mode 100644 index 000000000000..84c7e73828fd --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/RTFParserFuzzer.java @@ -0,0 +1,47 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.microsoft.rtf.RTFParser; +import org.apache.tika.sax.ToTextContentHandler; + + +class RTFParserFuzzer { + + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser p = new RTFParser(); + try { + ParserFuzzer.parseOne(p, bytes); + } catch (AssertionError | TikaException | SAXException | IOException | + org.apache.tika.metadata.PropertyTypeException e) { + //swallow + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/TextAndCSVParserFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/TextAndCSVParserFuzzer.java new file mode 100644 index 000000000000..fe574553f68b --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/TextAndCSVParserFuzzer.java @@ -0,0 +1,47 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; + +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.csv.TextAndCSVParser; +import org.apache.tika.sax.ToTextContentHandler; + + +class TextAndCSVParserFuzzer { + + public static void fuzzerTestOneInput(byte[] bytes) throws Throwable { + Parser p = new TextAndCSVParser(); + try { + ParserFuzzer.parseOne(p, bytes); + } catch (TikaException | SAXException | IOException e) { + //swallow + } + } + +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/XMLReaderUtilsFuzzer.java b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/XMLReaderUtilsFuzzer.java new file mode 100644 index 000000000000..3c8c0d15f247 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/java/com/example/XMLReaderUtilsFuzzer.java @@ -0,0 +1,85 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//////////////////////////////////////////////////////////////////////////////// + +package com.example; + +import java.io.IOException; +import java.io.InputStream; +import java.io.File; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Comparator; +import java.util.stream.Stream; + +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.XMLStreamReader; + +import org.w3c.dom.Document; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.SAXParseException; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.sax.ToTextContentHandler; +import org.apache.tika.utils.XMLReaderUtils; + + +class XMLReaderUtilsFuzzer { + + public static void fuzzerTestOneInput(byte[] bytes) throws Exception { + try { + parseOne(bytes); + } catch (java.io.FileNotFoundException e) { + //this should be rethrown because it could signal an XMLParser looking for a DTD + throw e; + } catch (TikaException | IOException | SAXException e) { + e.printStackTrace(); + } + } + + private static void parseOne(byte[] bytes) throws TikaException, IOException, SAXException { + + //dom + try (InputStream is = TikaInputStream.get(bytes)) { + Document doc = XMLReaderUtils.buildDOM(is, new ParseContext()); + } catch (SAXParseException e) { + //swallow + } + //sax + try (InputStream is = TikaInputStream.get(bytes)) { + ToTextContentHandler toTextContentHandler = new ToTextContentHandler(); + XMLReaderUtils.parseSAX(is, toTextContentHandler, new ParseContext()); + } catch (SAXException e) { + //swallow + } + + //stax + try (InputStream is = TikaInputStream.get(bytes)) { + XMLStreamReader reader = XMLReaderUtils.getXMLInputFactory(new ParseContext()) + .createXMLStreamReader(is); + while (reader.hasNext()) { + reader.next(); + } + } catch 
(java.util.MissingResourceException | XMLStreamException e) { + //MissingResourceException can be thrown when an internal DTD has an InvalidCharInDTD + //throw new TikaException("xml stream", e); + } + } +} diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/resources/log4j2.xml b/projects/apache-tika/project-parent/fuzz-targets/src/main/resources/log4j2.xml new file mode 100644 index 000000000000..a6cffe2202d0 --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/resources/log4j2.xml @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/projects/apache-tika/project-parent/fuzz-targets/src/main/resources/tika-config.xml b/projects/apache-tika/project-parent/fuzz-targets/src/main/resources/tika-config.xml new file mode 100755 index 000000000000..1247a234edfe --- /dev/null +++ b/projects/apache-tika/project-parent/fuzz-targets/src/main/resources/tika-config.xml @@ -0,0 +1,35 @@ + + + + + + + + + + + + false + + false + + + + diff --git a/projects/apache-tika/project-parent/pom.xml b/projects/apache-tika/project-parent/pom.xml new file mode 100644 index 000000000000..092c64c4cd7e --- /dev/null +++ b/projects/apache-tika/project-parent/pom.xml @@ -0,0 +1,16 @@ + + + 4.0.0 + + com.fuzzer + project-parent + 0.1.0 + pom + + + tika + fuzz-targets + + + \ No newline at end of file diff --git a/projects/apache-tika/project.yaml b/projects/apache-tika/project.yaml new file mode 100644 index 000000000000..0fdb0b378a62 --- /dev/null +++ b/projects/apache-tika/project.yaml @@ -0,0 +1,9 @@ +homepage: "https://tika.apache.org/" +language: jvm +fuzzing_engines: + - libfuzzer +main_repo: "https://github.com/apache/tika/" +sanitizers: + - address +vendor_ccs: + - "dev-owner@tika.apache.org"