Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion CHANGES.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
Release 3.2.1 - 06/20/2025
Release 3.2.1 - 6/25/2025

* Fix POIFSContainerDetector regression when wrapping an InputStream in
a TikaInputStream (TIKA-4441).

* Important bug fix for zip-based detection on a non-TikaInputStream (TIKA-4424).

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.hssf.model.InternalWorkbook;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
Expand All @@ -44,6 +45,7 @@

import org.apache.tika.config.Field;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
Expand Down Expand Up @@ -254,7 +256,7 @@ public class POIFSContainerDetector implements Detector {


@Field
private int markLimit = -1;
private int markLimit = 128 * 1024 * 1024;

/**
* Internal detection of the specific kind of OLE2 document, based on the
Expand Down Expand Up @@ -608,11 +610,42 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException
return handleTikaStream(tis, metadata);
}
if (isOleHeader(input)) {
return OLE;
if (markLimit < 0) {
return OLE;
}
return handleInputStream(input, metadata);
}
return MediaType.OCTET_STREAM;
}

/**
 * Attempts to identify the specific OLE2 document type from a plain
 * {@link InputStream} by opening it as a {@link POIFSFileSystem}.
 * <p>
 * The stream is wrapped in a close shield and a {@link BoundedInputStream}
 * constructed with {@code markLimit}; the bounded stream is marked before
 * parsing and is always reset and closed before this method returns.
 * NOTE(review): this presumably requires {@code input} to support
 * mark/reset -- confirm against callers.
 *
 * @param input    the stream to inspect
 * @param metadata not read by this method
 * @return {@code OLE} if {@code markLimit} is negative, if the bound was hit
 *         while opening the file system, or if opening/detection failed with
 *         an {@link IOException} or a {@link RuntimeException}; otherwise the
 *         type detected from the top-level entry names and the root node
 * @throws IOException       if resetting or closing the bounded stream fails
 * @throws SecurityException rethrown unchanged rather than swallowed
 */
private MediaType handleInputStream(InputStream input, Metadata metadata) throws IOException {
    if (markLimit < 0) {
        return OLE;
    }
    final BoundedInputStream bounded =
            new BoundedInputStream(markLimit, CloseShieldInputStream.wrap(input));
    try {
        bounded.mark(markLimit);
        // Shield again so that closing the POIFSFileSystem cannot close the bounded
        // stream before it is reset in the finally block.
        try (POIFSFileSystem fileSystem =
                     new POIFSFileSystem(CloseShieldInputStream.wrap(bounded))) {
            if (bounded.hasHitBound()) {
                // The bound was reached while reading; fall back to the generic type.
                return OLE;
            }
            final DirectoryNode root = fileSystem.getRoot();
            Set<String> topLevelNames = getTopLevelNames(root);
            return detect(topLevelNames, root);
        } catch (SecurityException securityException) {
            // Never hide security failures behind the generic fallback.
            throw securityException;
        } catch (IOException | RuntimeException ignored) {
            // Any other failure means no specific type could be determined.
            return OLE;
        }
    } finally {
        bounded.reset();
        bounded.close();
    }
}

private MediaType handleTikaStream(TikaInputStream tis, Metadata metadata) throws IOException {
//try for an open container
Set<String> names = tryOpenContainerOnTikaInputStream(tis, metadata);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,14 @@
import java.util.Objects;
import java.util.Random;

import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;

import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.Tika;
Expand Down Expand Up @@ -606,4 +610,81 @@ public void testBPList() throws Exception {
assertTypeByData("testWEBARCHIVE.webarchive", "application/x-bplist-webarchive");
assertTypeByData("testBPList.bplist", "application/x-bplist-itunes");
}

/**
 * Checks POIFS container detection on testWORD.doc for several markLimit
 * configurations, first on a plain in-memory stream and then on the same
 * bytes wrapped in a {@link TikaInputStream}. After every detection the
 * stream must still yield all of the original bytes.
 */
@Test
public void testPOIFSContainerDetector() throws Exception {
    UnsynchronizedByteArrayOutputStream buffer = UnsynchronizedByteArrayOutputStream.builder().get();
    try (InputStream resource = getResourceAsStream("/test-documents/testWORD.doc")) {
        IOUtils.copy(resource, buffer);
    }
    final byte[] bytes = buffer.toByteArray();
    final long expectedLength = bytes.length;

    // Each row: { config file name (null = default config), expected media type }.
    final String[][] plainStreamCases = {
            {null, "application/msword"},
            {"tika-4441-neg1.xml", "application/x-tika-msoffice"},
            {"tika-4441-120.xml", "application/x-tika-msoffice"},
            {"tika-4441-12000000.xml", "application/msword"},
    };
    for (String[] testCase : plainStreamCases) {
        Detector detector = testCase[0] == null
                ? TikaConfig.getDefaultConfig().getDetector()
                : loadDetector(testCase[0]);
        try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
            assertEquals(testCase[1],
                    detector.detect(is, new Metadata()).toString());
            assertEquals(expectedLength, countBytes(is));
        }
    }

    //now try wrapping in a TikaInputStream
    final String[][] tikaStreamCases = {
            {"tika-4441-neg1.xml", "application/msword"},
            {"tika-4441-120.xml", "application/x-tika-msoffice"},
            {"tika-4441-12000000.xml", "application/msword"},
    };
    for (String[] testCase : tikaStreamCases) {
        Detector detector = loadDetector(testCase[0]);
        try (InputStream is = TikaInputStream.get(
                UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) {
            assertEquals(testCase[1],
                    detector.detect(is, new Metadata()).toString());
            assertEquals(expectedLength, countBytes(is));
        }
    }
}

/**
 * Reads the stream one byte at a time until {@code read()} returns a
 * negative value and reports how many bytes were read.
 *
 * @param is the stream to drain; it is not closed here
 * @return the number of bytes read before end of stream
 * @throws IOException if reading from the stream fails
 */
private long countBytes(InputStream is) throws IOException {
    long total = 0;
    for (int next = is.read(); next > -1; next = is.read()) {
        total++;
    }
    return total;
}

/**
 * Builds a {@link TikaConfig} from a classpath resource under
 * {@code /configs/} and returns its detector.
 *
 * @param tikaConfigName file name of the config, relative to {@code /configs/}
 * @return the detector produced by the loaded configuration
 * @throws IOException   if the resource is not on the classpath or cannot be read
 * @throws TikaException if the configuration cannot be loaded
 * @throws SAXException  if the configuration XML cannot be parsed
 */
private Detector loadDetector(String tikaConfigName) throws IOException, TikaException, SAXException {
    final String resourcePath = "/configs/" + tikaConfigName;
    try (InputStream is = TestContainerAwareDetector.class.getResourceAsStream(resourcePath)) {
        // getResourceAsStream returns null for a missing resource; fail with the
        // offending path instead of handing null to TikaConfig.
        if (is == null) {
            throw new IOException("Test config not found on classpath: " + resourcePath);
        }
        return new TikaConfig(is).getDetector();
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<properties>
<detectors>
<detector class="org.gagravarr.tika.OggDetector"/>
<detector class="org.apache.tika.detect.apple.BPListDetector"/>
<detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
<!-- POIFSContainerDetector is given a very small markLimit (120) here.
     NOTE(review): presumably this is the config for the case where the limit is
     too small to read the whole OLE2 container, so detection falls back to the
     generic OLE type; confirm against the test that loads this file. -->
<detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
<params>
<param name="markLimit" type="int">120</param>
</params>
</detector>
<detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
<detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
<params>
<param name="markLimit" type="int">16777216</param>
</params>
</detector>
<detector class="org.apache.tika.mime.MimeTypes"/>
</detectors>
</properties>
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<properties>
<detectors>
<detector class="org.gagravarr.tika.OggDetector"/>
<detector class="org.apache.tika.detect.apple.BPListDetector"/>
<detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
<!-- POIFSContainerDetector is given a markLimit of 12000000 here.
     NOTE(review): presumably large enough to read the whole test document so the
     specific OLE2 type can be detected; confirm against the test that loads this
     file. -->
<detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
<params>
<param name="markLimit" type="int">12000000</param>
</params>
</detector>
<detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
<detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
<params>
<param name="markLimit" type="int">16777216</param>
</params>
</detector>
<detector class="org.apache.tika.mime.MimeTypes"/>
</detectors>
</properties>
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<properties>
<detectors>
<detector class="org.gagravarr.tika.OggDetector"/>
<detector class="org.apache.tika.detect.apple.BPListDetector"/>
<detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
<!-- POIFSContainerDetector is given a negative markLimit (-1) here.
     NOTE(review): presumably this exercises the path where a plain InputStream
     is not parsed and only the generic OLE type is reported; confirm against the
     test that loads this file. -->
<detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
<params>
<param name="markLimit" type="int">-1</param>
</params>
</detector>
<detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
<detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
<params>
<param name="markLimit" type="int">16777216</param>
</params>
</detector>
<detector class="org.apache.tika.mime.MimeTypes"/>
</detectors>
</properties>