Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions projects/apache-tika/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
project-parent/tika
project-parent/fuzz-targets/target
project-parent/fuzz-targets/pom.xml.versionsBackup
35 changes: 35 additions & 0 deletions projects/apache-tika/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################

# Build image for the Tika fuzz targets, based on the base-builder-jvm image
# pinned to tag v1.3.0.
FROM ghcr.io/aixcc-finals/base-builder-jvm:v1.3.0


# Download the Maven 3.9.11 binary distribution, unpack it under $SRC/maven
# and delete the downloaded archive.
RUN curl -L https://archive.apache.org/dist/maven/maven-3/3.9.11/binaries/apache-maven-3.9.11-bin.zip -o maven.zip && \
unzip maven.zip -d $SRC/maven && \
rm -rf maven.zip

# Path of the mvn launcher unpacked above; build.sh runs Maven through $MVN.
ENV MVN=$SRC/maven/apache-maven-3.9.11/bin/mvn

# Copy the local project-parent directory (it contains the fuzz-targets module).
COPY project-parent $SRC/project-parent/

# Shallow-clone (latest commit only) the Tika sources into the directory that
# build.sh builds from.
RUN git clone --depth 1 https://github.com/apache/tika/ $SRC/project-parent/tika

# Build entry point and the seed-corpus packaging script.
COPY build.sh build_seeds.sh $SRC/

# Package the seed corpora at image build time; build_seeds.sh zips files found
# in the Tika checkout into $SRC/seeds/*_seed_corpus.zip.
RUN cd $SRC && ./build_seeds.sh

WORKDIR $SRC/project-parent/tika

65 changes: 65 additions & 0 deletions projects/apache-tika/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash -eu
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################

# Name of the checkout directory (relative to $SRC/project-parent) to build.
PROJECT='tika'
# Upstream repository of the project under test; not referenced again in this
# script.
MAIN_REPOSITORY='https://github.com/apache/tika/'

# Flags for the project's `mvn install`, kept as one whitespace-separated
# string: Java 17 source/target, no tests/checkstyle/ossindex, and build only
# tika-app together with the modules it depends on (-am -pl :tika-app).
MAVEN_ARGS='-Djavac.src.version=17 -Djavac.target.version=17'
MAVEN_ARGS+=' -DskipTests -Dcheckstyle.skip -Dossindex.skip'
MAVEN_ARGS+=' -am -pl :tika-app'

#######################################
# Points the dependencies declared in fuzz-targets/pom.xml at the version of
# the project checkout, so the fuzz targets compile against the freshly built
# artifacts.
# Must be called from the directory that contains both $PROJECT and
# fuzz-targets.
# Globals:
#   PROJECT         (read)    - directory name of the project checkout
#   MVN             (read)    - path of the Maven launcher
#   PROJECT_VERSION (written) - project.version reported by Maven
# Arguments:
#   None
# Returns:
#   0 on success; non-zero if Maven fails or reports an empty version.
#######################################
function set_project_version_in_fuzz_targets_dependency {
  # -q together with -DforceStdout makes Maven print only the evaluated value.
  PROJECT_VERSION=$(
    cd "$PROJECT" &&
      "$MVN" org.apache.maven.plugins:maven-help-plugin:3.2.0:evaluate \
        -Dexpression=project.version -q -DforceStdout
  ) || return

  # An empty version would be forced into the pom below and only surface much
  # later as an unresolvable dependency, so stop here instead.
  if [[ -z "$PROJECT_VERSION" ]]; then
    echo "error: could not determine project.version of $PROJECT" >&2
    return 1
  fi

  # set dependency project version in fuzz-targets (Jazzer keeps its own version)
  (
    cd fuzz-targets &&
      "$MVN" versions:use-dep-version \
        -Dexcludes=com.code-intelligence:jazzer \
        "-DdepVersion=$PROJECT_VERSION" \
        -DforceVersion=true
  )
}

# Relative paths used below ($PROJECT, fuzz-targets) are resolved from here.
cd "$SRC/project-parent"

set_project_version_in_fuzz_targets_dependency

#install
# The project is installed first, into the same local repository ($OUT/m2)
# that the fuzz-targets build uses afterwards.
# MAVEN_ARGS is a whitespace-separated flag list and has to be word-split.
# shellcheck disable=SC2086
(cd "$PROJECT" && "$MVN" install $MAVEN_ARGS "-Dmaven.repo.local=$OUT/m2")
"$MVN" -pl fuzz-targets install "-Dmaven.repo.local=$OUT/m2"

# build classpath
cp "$SRC/project-parent/fuzz-targets/target/fuzz-targets-0.0.1-SNAPSHOT.jar" \
  "$OUT/fuzz-targets.jar"
RUNTIME_CLASSPATH_ABSOLUTE="$OUT/fuzz-targets.jar"
# replace $OUT with placeholder $this_dir that will be dissolved at runtime
RUNTIME_CLASSPATH=$(
  printf '%s\n' "$RUNTIME_CLASSPATH_ABSOLUTE" | sed "s|$OUT|\$this_dir|g"
)

# Ship the seed corpora that build_seeds.sh staged under $SRC/seeds.
cp "${SRC}"/seeds/*_seed_corpus.zip "${OUT}/"

#######################################
# Writes the executable launcher script $OUT/<fuzzer class> that starts
# jazzer_driver for one fuzz target.
# This bumps memory to > 2gb to get around new byte[Integer.MAX_VALUE] single
# allocation issues that plague audio, video, image and other parsers.
# if we're able to get an oom > 2gb, we should really fix that.
# Globals:
#   OUT                 (read) - directory the launcher is written to
#   JVM_LD_LIBRARY_PATH (read) - expanded now and baked into the launcher
#   RUNTIME_CLASSPATH   (read) - classpath using the $this_dir placeholder
# Arguments:
#   $1 - simple class name of the fuzz target (package com.example)
# Returns:
#   0 on success, non-zero if the launcher cannot be written or made executable
#######################################
write_fuzzer_wrapper() {
  local fuzzer_basename=$1
  local wrapper="$OUT/$fuzzer_basename"

  # Unquoted delimiter: $VAR is expanded now, \$ defers expansion to the
  # launcher's run time and \\ emits a line-continuation backslash.
  # "\$@" is quoted so that the launcher forwards its arguments unchanged
  # (no re-splitting, no glob expansion of values such as com.**).
  # -rss_limit_mb is given as a plain number of megabytes.
  cat > "$wrapper" <<EOF || return
#!/bin/bash
# LLVMFuzzerTestOneInput comment for fuzzer detection by infrastructure.
this_dir=\$(dirname "\$0")
mem_settings='-Xmx3000m:-Xss1024k'
LD_LIBRARY_PATH="$JVM_LD_LIBRARY_PATH":\$this_dir \\
\$this_dir/jazzer_driver --agent_path=\$this_dir/jazzer_agent_deploy.jar \\
--cp=$RUNTIME_CLASSPATH \\
--target_class=com.example.$fuzzer_basename \\
-rss_limit_mb=3600 \\
--jvm_args="\$mem_settings" \\
--instrumentation_includes="com.**:org.**" \\
"\$@"
EOF
  chmod u+x "$wrapper"
}

# Create an execution wrapper for every fuzztarget
# NUL-delimited so that paths containing whitespace stay intact.
# NOTE(review): the pattern matches every *Fuzzer.java below project-parent,
# including the cloned project and helper classes whose name ends in Fuzzer
# (the fuzz targets call ParserFuzzer.parseOne) - confirm none of those should
# be excluded.
while IFS= read -r -d '' fuzzer; do
  write_fuzzer_wrapper "$(basename -s .java "$fuzzer")"
done < <(find "$SRC/project-parent" -name '*Fuzzer.java' -print0)
101 changes: 101 additions & 0 deletions projects/apache-tika/build_seeds.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/bin/bash -eu
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
################################################################################

# Staging directory for the corpora; -p keeps a re-run from failing.
mkdir -p "${SRC}/seeds"
#This packages the unit test files based on file extension from within the Tika project
#we could also pull in other seeds from other parser projects.

#######################################
# Adds every entry of the Tika checkout whose name matches one of the given
# patterns to the seed corpus archive of one fuzzer.
# The checkout is scanned once per call, and zip is only started when at
# least one entry matched, so a pattern without matches is not an error.
# Globals:
#   SRC (read) - root that contains project-parent/tika and seeds/
# Arguments:
#   $1   - fuzzer class name; the archive is seeds/<name>_seed_corpus.zip
#   $2.. - one or more `find -name` patterns
# Returns:
#   0 on success; 2 if no pattern was given; otherwise the non-zero status of
#   find (which also reports a failing zip run).
#######################################
add_seeds() {
  local corpus_name=$1
  shift
  if (( $# == 0 )); then
    echo "add_seeds: no name patterns given for ${corpus_name}" >&2
    return 2
  fi

  # Build: -name p1 -o -name p2 ...
  local -a name_tests=()
  local pattern
  for pattern in "$@"; do
    if (( ${#name_tests[@]} > 0 )); then
      name_tests+=(-o)
    fi
    name_tests+=(-name "$pattern")
  done

  # `-exec ... {} +` batches the matches like xargs but never runs zip with
  # an empty file list, and passes every path as a single argument.
  find "${SRC}/project-parent/tika" \( "${name_tests[@]}" \) \
    -exec zip "${SRC}/seeds/${corpus_name}_seed_corpus.zip" {} +
}

add_seeds AudioVideoParsersFuzzer \
  '*-webm.noext' '*-mkv.noext' '*.aif' '*.au' '*.flv' '*.m4a' '*.mkv' \
  '*.mp3' '*.wav'

add_seeds CompressorParserFuzzer \
  '*.Z' '*.bz2' '*.gz' '*.tbz2' '*.tgz' '*.zst'

add_seeds HtmlParserFuzzer '*.html'

add_seeds ImageParsersFuzzer \
  '*.avif' '*.bmp' '*.bpg' '*.gif' '*.heic' '*.icns' '*.jp2' '*.jb2' \
  '*.jpg' '*.jxl' '*.png' '*.psd' '*.tif' '*.webp'

add_seeds JackcessParserFuzzer '*.mdb' '*.accdb'

add_seeds OneNoteParserFuzzer '*.one'

#we could get more seeds by cloning POI
add_seeds OfficeParserFuzzer '*.msg' '*.doc' '*.ppt' '*.xls'

add_seeds OOXMLParserFuzzer \
  '*.docm' '*.docx' '*.pptm' '*.pptx' '*.xlsm' '*.xlsx'

add_seeds PackageParserFuzzer \
  '*.7z' '*.ar' '*.jar' '*.rar' '*.tar' '*.zip' '*.zlib'

#we could get more seeds by cloning PDFBox or...?
add_seeds PDFParserFuzzer '*.pdf'

add_seeds RFC822ParserFuzzer '*.eml'

add_seeds RTFParserFuzzer '*.rtf'

add_seeds TextAndCSVParserFuzzer '*.txt' '*.tsv' '*.csv'

add_seeds XMLReaderUtilsFuzzer '*.xml'

# The auto-detect fuzzer is seeded with every regular file that lives below a
# test-documents directory, whatever its extension. The files are handed to
# zip in batches instead of one zip process per file.
find "${SRC}/project-parent/tika" -path '*/test-documents/*' -type f \
  -exec zip "${SRC}/seeds/AutoDetectParserFuzzer_seed_corpus.zip" {} +

cp "${SRC}"/seeds/*_seed_corpus.zip "${OUT}/"

106 changes: 106 additions & 0 deletions projects/apache-tika/project-parent/fuzz-targets/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">

<!-- Module holding the fuzz targets. The shade plugin configured below packages
them together with their dependencies into one jar. -->
<modelVersion>4.0.0</modelVersion>
<groupId>com.fuzzer</groupId>
<artifactId>fuzz-targets</artifactId>
<!-- build.sh copies target/fuzz-targets-0.0.1-SNAPSHOT.jar, so artifactId and
version have to stay in sync with that path. -->
<version>0.0.1-SNAPSHOT</version>
<name>fuzz</name>
<description>fuzz</description>

<properties>
<java.version>17</java.version>
<maven.compiler.source>17</maven.compiler.source>
<maven.compiler.target>17</maven.compiler.target>
</properties>

<dependencies>
<!-- Scope "provided": available for compilation only.
NOTE(review): presumably supplied at run time by the jazzer_agent_deploy.jar
that the launchers written by build.sh pass as agent_path; confirm. -->
<dependency>
<groupId>com.code-intelligence</groupId>
<artifactId>jazzer</artifactId>
<version>0.24.0</version>
<scope>provided</scope>
</dependency>
<!-- Fuzzing-SNAPSHOT is a placeholder. build.sh runs versions:use-dep-version
with forceVersion=true to replace it with the project.version of the Tika
checkout; the Jazzer dependency is excluded from that rewrite. -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>Fuzzing-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers-standard-package</artifactId>
<version>Fuzzing-SNAPSHOT</version>
</dependency>
</dependencies>
<build>
<plugins>
<!-- Runs the shade goal in the package phase. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<version>3.6.1</version>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<createDependencyReducedPom>
false
</createDependencyReducedPom>
<!-- Leaves the tika-parsers-standard-package artifact itself out of the shaded
jar. NOTE(review): presumably its transitive dependencies are still bundled;
confirm. -->
<artifactSet>
<excludes>
<exclude>org.apache.tika:tika-parsers-standard-package:jar:</exclude>
</excludes>
</artifactSet>
<!-- Entries dropped from every bundled artifact (artifact pattern *:*). -->
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/maven/plugin.xml</exclude>
<exclude>module-info.class</exclude>
<exclude>META-INF/*</exclude>
<exclude>LICENSE.txt</exclude>
<exclude>NOTICE.txt</exclude>
<exclude>CHANGES</exclude>
<exclude>README</exclude>
<exclude>builddef.lst</exclude>
<!-- https://issues.apache.org/jira/browse/TIKA-3650 -->
<exclude>javax/**/*</exclude>

</excludes>
</filter>
</filters>
<transformers>
<!-- Adds Multi-Release: true to the manifest of the shaded jar. -->
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<!--<mainClass>org.apache.tika.cli.TikaCLI</mainClass>-->
<manifestEntries>
<Multi-Release>true</Multi-Release>
</manifestEntries>
</transformer>
<!-- Merges the META-INF/services provider files of the bundled artifacts. -->
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
<!-- The three IncludeResourceTransformers add LICENSE, NOTICE and DEPENDENCIES
from target/classes/META-INF to the shaded jar.
NOTE(review): nothing shown here creates those files for this module; verify
that they exist when the shade goal runs. -->
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
<resource>META-INF/LICENSE</resource>
<file>target/classes/META-INF/LICENSE</file>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
<resource>META-INF/NOTICE</resource>
<file>target/classes/META-INF/NOTICE</file>
</transformer>
<transformer implementation="org.apache.maven.plugins.shade.resource.IncludeResourceTransformer">
<resource>META-INF/DEPENDENCIES</resource>
<file>target/classes/META-INF/DEPENDENCIES</file>
</transformer>
<!-- Concatenates the bus-extensions.txt files of all bundled artifacts. -->
<transformer implementation="org.apache.maven.plugins.shade.resource.AppendingTransformer">
<resource>META-INF/cxf/bus-extensions.txt</resource>
</transformer>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
////////////////////////////////////////////////////////////////////////////////

package com.example;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;

import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;

import org.apache.tika.parser.audio.AudioParser;
import org.apache.tika.parser.audio.MidiParser;
import org.apache.tika.parser.mp3.Mp3Parser;
import org.apache.tika.parser.mp4.MP4Parser;
import org.apache.tika.parser.video.FLVParser;

import org.apache.tika.sax.ToTextContentHandler;


/**
 * Fuzz target that feeds the input to Tika's audio and video parsers
 * (audio, MIDI, MP3, MP4 and FLV) through an {@link AutoDetectParser}
 * restricted to exactly those parsers.
 */
class AudioVideoParsersFuzzer {

    /**
     * Fuzzer entry point.
     *
     * @param bytes raw fuzzer input, handed to the parser as the document
     * @throws Throwable anything other than the parse failures swallowed below
     */
    public static void fuzzerTestOneInput(byte[] bytes) throws Throwable {
        Parser[] audioVideoParsers = {
            new AudioParser(),
            new MidiParser(),
            new Mp3Parser(),
            new MP4Parser(),
            new FLVParser()
        };
        Parser detectingParser = new AutoDetectParser(audioVideoParsers);

        try {
            ParserFuzzer.parseOne(detectingParser, bytes);
        } catch (TikaException | SAXException | IOException ignored) {
            // These three exception types are deliberately swallowed; every
            // other Throwable propagates to the caller.
        }
    }
}
Loading