Skip to content

Commit f8869e3

Browse files
authored
TIKA-4441 -- revert markLimit and add unit tests (#2261)
1 parent 586e361 commit f8869e3

File tree

6 files changed

+228
-3
lines changed

6 files changed

+228
-3
lines changed

CHANGES.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,7 @@
1-
Release 3.2.1 - 06/20/2025
1+
Release 3.2.1 - 6/25/2025
2+
3+
* Fix POIFSContainerDetector regression when wrapping an InputStream in
4+
a TikaInputStream (TIKA-4441).
25

36
* Important bug fix for zip-based detection on a non-TikaInputStream (TIKA-4424).
47

tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/detect/microsoft/POIFSContainerDetector.java

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import java.util.regex.Pattern;
3333

3434
import org.apache.commons.io.IOUtils;
35+
import org.apache.commons.io.input.CloseShieldInputStream;
3536
import org.apache.poi.hssf.model.InternalWorkbook;
3637
import org.apache.poi.poifs.filesystem.DirectoryEntry;
3738
import org.apache.poi.poifs.filesystem.DirectoryNode;
@@ -44,6 +45,7 @@
4445

4546
import org.apache.tika.config.Field;
4647
import org.apache.tika.detect.Detector;
48+
import org.apache.tika.io.BoundedInputStream;
4749
import org.apache.tika.io.TikaInputStream;
4850
import org.apache.tika.metadata.Metadata;
4951
import org.apache.tika.mime.MediaType;
@@ -254,7 +256,7 @@ public class POIFSContainerDetector implements Detector {
254256

255257

256258
@Field
257-
private int markLimit = -1;
259+
private int markLimit = 128 * 1024 * 1024;
258260

259261
/**
260262
* Internal detection of the specific kind of OLE2 document, based on the
@@ -608,11 +610,42 @@ public MediaType detect(InputStream input, Metadata metadata) throws IOException
608610
return handleTikaStream(tis, metadata);
609611
}
610612
if (isOleHeader(input)) {
611-
return OLE;
613+
if (markLimit < 0) {
614+
return OLE;
615+
}
616+
return handleInputStream(input, metadata);
612617
}
613618
return MediaType.OCTET_STREAM;
614619
}
615620

621+
private MediaType handleInputStream(InputStream input, Metadata metadata) throws IOException {
622+
if (markLimit < 0) {
623+
return OLE;
624+
}
625+
BoundedInputStream bis = null;
626+
try {
627+
bis = new BoundedInputStream(markLimit, CloseShieldInputStream.wrap(input));
628+
bis.mark(markLimit);
629+
try (POIFSFileSystem poifs = new POIFSFileSystem(CloseShieldInputStream.wrap(bis))) {
630+
if (bis.hasHitBound()) {
631+
return OLE;
632+
}
633+
Set<String> names = getTopLevelNames(poifs.getRoot());
634+
return detect(names, poifs.getRoot());
635+
} catch (SecurityException e) {
636+
throw e;
637+
} catch (IOException | RuntimeException e) {
638+
//swallow
639+
return OLE;
640+
}
641+
} finally {
642+
if (bis != null) {
643+
bis.reset();
644+
bis.close();
645+
}
646+
}
647+
}
648+
616649
private MediaType handleTikaStream(TikaInputStream tis, Metadata metadata) throws IOException {
617650
//try for an open container
618651
Set<String> names = tryOpenContainerOnTikaInputStream(tis, metadata);

tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/detect/TestContainerAwareDetector.java

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,14 @@
3030
import java.util.Objects;
3131
import java.util.Random;
3232

33+
import org.apache.commons.io.IOUtils;
34+
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
35+
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
3336
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
3437
import org.junit.jupiter.api.AfterEach;
3538
import org.junit.jupiter.api.Disabled;
3639
import org.junit.jupiter.api.Test;
40+
import org.xml.sax.SAXException;
3741

3842
import org.apache.tika.MultiThreadedTikaTest;
3943
import org.apache.tika.Tika;
@@ -606,4 +610,81 @@ public void testBPList() throws Exception {
606610
assertTypeByData("testWEBARCHIVE.webarchive", "application/x-bplist-webarchive");
607611
assertTypeByData("testBPList.bplist", "application/x-bplist-itunes");
608612
}
613+
614+
@Test
615+
public void testPOIFSContainerDetector() throws Exception {
616+
UnsynchronizedByteArrayOutputStream baos = UnsynchronizedByteArrayOutputStream.builder().get();
617+
try (InputStream is = getResourceAsStream("/test-documents/testWORD.doc")) {
618+
IOUtils.copy(is, baos);
619+
}
620+
byte[] bytes = baos.toByteArray();
621+
long len = bytes.length;
622+
623+
//test default
624+
Detector detector = TikaConfig.getDefaultConfig().getDetector();
625+
try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
626+
assertEquals("application/msword",
627+
detector.detect(is, new Metadata()).toString());
628+
assertEquals(len, countBytes(is));
629+
}
630+
631+
detector = loadDetector("tika-4441-neg1.xml");
632+
try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
633+
assertEquals("application/x-tika-msoffice",
634+
detector.detect(is, new Metadata()).toString());
635+
assertEquals(len, countBytes(is));
636+
}
637+
638+
detector = loadDetector("tika-4441-120.xml");
639+
try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
640+
assertEquals("application/x-tika-msoffice",
641+
detector.detect(is, new Metadata()).toString());
642+
assertEquals(len, countBytes(is));
643+
}
644+
645+
detector = loadDetector("tika-4441-12000000.xml");
646+
try (InputStream is = UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get()) {
647+
assertEquals("application/msword",
648+
detector.detect(is, new Metadata()).toString());
649+
assertEquals(len, countBytes(is));
650+
}
651+
652+
//now try wrapping in a TikaInputStream
653+
detector = loadDetector("tika-4441-neg1.xml");
654+
try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) {
655+
assertEquals("application/msword",
656+
detector.detect(is, new Metadata()).toString());
657+
assertEquals(len, countBytes(is));
658+
}
659+
660+
detector = loadDetector("tika-4441-120.xml");
661+
try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) {
662+
assertEquals("application/x-tika-msoffice",
663+
detector.detect(is, new Metadata()).toString());
664+
assertEquals(len, countBytes(is));
665+
}
666+
667+
detector = loadDetector("tika-4441-12000000.xml");
668+
try (InputStream is = TikaInputStream.get(UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get())) {
669+
assertEquals("application/msword",
670+
detector.detect(is, new Metadata()).toString());
671+
assertEquals(len, countBytes(is));
672+
}
673+
}
674+
675+
private long countBytes(InputStream is) throws IOException {
676+
int b = is.read();
677+
long len = 0;
678+
while (b > -1) {
679+
len++;
680+
b = is.read();
681+
}
682+
return len;
683+
}
684+
685+
private Detector loadDetector(String tikaConfigName) throws IOException, TikaException, SAXException {
686+
try (InputStream is = TestContainerAwareDetector.class.getResourceAsStream("/configs/" + tikaConfigName)) {
687+
return new TikaConfig(is).getDetector();
688+
}
689+
}
609690
}
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2+
<!--
3+
Licensed to the Apache Software Foundation (ASF) under one or more
4+
contributor license agreements. See the NOTICE file distributed with
5+
this work for additional information regarding copyright ownership.
6+
The ASF licenses this file to You under the Apache License, Version 2.0
7+
(the "License"); you may not use this file except in compliance with
8+
the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
-->
18+
<properties>
19+
<detectors>
20+
<detector class="org.gagravarr.tika.OggDetector"/>
21+
<detector class="org.apache.tika.detect.apple.BPListDetector"/>
22+
<detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
23+
<detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
24+
<params>
25+
<param name="markLimit" type="int">120</param>
26+
</params>
27+
</detector>
28+
<detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
29+
<detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
30+
<params>
31+
<param name="markLimit" type="int">16777216</param>
32+
</params>
33+
</detector>
34+
<detector class="org.apache.tika.mime.MimeTypes"/>
35+
</detectors>
36+
</properties>
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2+
<!--
3+
Licensed to the Apache Software Foundation (ASF) under one or more
4+
contributor license agreements. See the NOTICE file distributed with
5+
this work for additional information regarding copyright ownership.
6+
The ASF licenses this file to You under the Apache License, Version 2.0
7+
(the "License"); you may not use this file except in compliance with
8+
the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
-->
18+
<properties>
19+
<detectors>
20+
<detector class="org.gagravarr.tika.OggDetector"/>
21+
<detector class="org.apache.tika.detect.apple.BPListDetector"/>
22+
<detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
23+
<detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
24+
<params>
25+
<param name="markLimit" type="int">12000000</param>
26+
</params>
27+
</detector>
28+
<detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
29+
<detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
30+
<params>
31+
<param name="markLimit" type="int">16777216</param>
32+
</params>
33+
</detector>
34+
<detector class="org.apache.tika.mime.MimeTypes"/>
35+
</detectors>
36+
</properties>
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
2+
<!--
3+
Licensed to the Apache Software Foundation (ASF) under one or more
4+
contributor license agreements. See the NOTICE file distributed with
5+
this work for additional information regarding copyright ownership.
6+
The ASF licenses this file to You under the Apache License, Version 2.0
7+
(the "License"); you may not use this file except in compliance with
8+
the License. You may obtain a copy of the License at
9+
10+
http://www.apache.org/licenses/LICENSE-2.0
11+
12+
Unless required by applicable law or agreed to in writing, software
13+
distributed under the License is distributed on an "AS IS" BASIS,
14+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
See the License for the specific language governing permissions and
16+
limitations under the License.
17+
-->
18+
<properties>
19+
<detectors>
20+
<detector class="org.gagravarr.tika.OggDetector"/>
21+
<detector class="org.apache.tika.detect.apple.BPListDetector"/>
22+
<detector class="org.apache.tika.detect.gzip.GZipSpecializationDetector"/>
23+
<detector class="org.apache.tika.detect.microsoft.POIFSContainerDetector">
24+
<params>
25+
<param name="markLimit" type="int">-1</param>
26+
</params>
27+
</detector>
28+
<detector class="org.apache.tika.detect.ole.MiscOLEDetector"/>
29+
<detector class="org.apache.tika.detect.zip.DefaultZipContainerDetector">
30+
<params>
31+
<param name="markLimit" type="int">16777216</param>
32+
</params>
33+
</detector>
34+
<detector class="org.apache.tika.mime.MimeTypes"/>
35+
</detectors>
36+
</properties>

0 commit comments

Comments
 (0)