apache · lewismc · Sep 23, 2020
diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties
@@ -72,3 +72,7 @@ any23.extraction.head.meta=on
 # Allows to specify a CSV file separator and comment delimeter
 any23.extraction.csv.field=,
 any23.extraction.csv.comment=#
+
+# Optimize SingleDocumentExtraction extractor matching and mimetype detection
+# by trusting the input DocumentSource content type
+any23.extraction.extractor.mimetype.optimization=on
diff --git a/api/src/test/java/org/apache/any23/configuration/SettingsTest.java b/api/src/test/java/org/apache/any23/configuration/SettingsTest.java
@@ -34,7 +34,6 @@
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
-@SuppressWarnings("ResultOfMethodCallIgnored")
 public class SettingsTest {
 
     @Test

diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java
@@ -55,7 +55,6 @@
 import java.io.InputStream;
 import java.io.PrintStream;
 import java.net.URISyntaxException;
-import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -132,6 +131,8 @@ public SingleDocumentExtraction(
         tripleHandlers.add(new CountingTripleHandler());
         this.output = new CompositeTripleHandler(tripleHandlers);
         this.encoderDetector = new TikaEncodingDetector();
+        if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null)
+            optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
     }
 
     /**
@@ -153,6 +154,8 @@ public SingleDocumentExtraction(
                 output
         );
         this.setMIMETypeDetector(null);
+        if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null)
+            optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
     }
 
     /**
@@ -174,6 +177,20 @@ public SingleDocumentExtraction(
                 output
         );
         this.setMIMETypeDetector(null);
+        if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null)
+            optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
+    }
+
+    /**
+     * Pre-populates {@code detectedMIMEType} and {@code matchingExtractors} from a declared content type;
+     * does nothing when {@code contentType} is {@code null}, leaving both fields untouched.
+     * @param contentType content type reported by {@link DocumentSource#getContentType()}, may be {@code null}
+     * @see <a href="https://issues.apache.org/jira/browse/ANY23-43">https://issues.apache.org/jira/browse/ANY23-43</a>
+     */
+    private void optimizeExtractorMatchingAndMimetypeDetection(String contentType) {
+        if (contentType == null) { return; } // nothing to trust: skip both parsing and filtering
+        detectedMIMEType = MIMEType.parse(contentType);
+        matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
     }
 
     /**
@@ -225,7 +242,9 @@ public SingleDocumentExtractionReport run(ExtractionParameters extractionParamet
         if (log.isDebugEnabled()) {
             log.debug("Processing " + this.documentIRI);
         }
-        filterExtractorsByMIMEType();
+        if (matchingExtractors != null && detectedMIMEType != null) {
+          filterExtractorsByMIMEType();
+        }
 
         if(log.isDebugEnabled()) {
             StringBuilder sb = new StringBuilder("Extractors ");
@@ -252,22 +271,22 @@ public SingleDocumentExtractionReport run(ExtractionParameters extractionParamet
             );
         }
         try {
-	        output.setContentLength(in.getContentLength());
-	        // Create the document context.
+            output.setContentLength(in.getContentLength());
+            // Create the document context.
             final String documentLanguage;
-	        try {
-	            documentLanguage = extractDocumentLanguage(extractionParameters);
-	            ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
+            try {
+                documentLanguage = extractDocumentLanguage(extractionParameters);
+                ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
                 final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType);
                 ArrayList<String> intersectionOfRdfMimetypes = null;
                 for (ExtractorFactory<?> factory : matchingExtractors) {
-	                final Extractor<?> extractor = factory.createExtractor();
-	                final SingleExtractionReport er = runExtractor(
-	                        extractionParameters,
-	                        documentLanguage,
-	                        extractor
-	                );
-	                // Fix for ANY23-415:
+                    final Extractor<?> extractor = factory.createExtractor();
+                    final SingleExtractionReport er = runExtractor(
+                            extractionParameters,
+                            documentLanguage,
+                            extractor
+                    );
+                    // Fix for ANY23-415:
                     if (mimeTypeIsTooGeneric) {
                         List<String> rdfMimetypes = factory.getSupportedMIMETypes().stream()
                                 .filter(mt -> !isTooGeneric(mt))

diff --git a/core/src/main/java/org/apache/any23/source/FileDocumentSource.java b/core/src/main/java/org/apache/any23/source/FileDocumentSource.java
@@ -34,16 +34,26 @@ public class FileDocumentSource implements DocumentSource {
 
     private final String uri;
 
+    private final String contentType;
+
     public FileDocumentSource(File file) {
         this.file = file;
         this.uri = file.toURI().toString();
+        this.contentType = null; // no content type is declared by this constructor
     }
 
     public FileDocumentSource(File file, String baseIRI) {
         this.file = file;
         this.uri = baseIRI;
+        this.contentType = null; // no content type is declared by this constructor
     }
 
+    public FileDocumentSource(File file, String baseIRI, String contentType) {
+        this.file = file;
+        this.uri = baseIRI;
+        this.contentType = contentType; // stored unvalidated and returned as-is by getContentType(); may be null
+    }
+
     public InputStream openInputStream() throws IOException {
         return new BufferedInputStream( new FileInputStream(file) );
     }
@@ -57,7 +67,7 @@ public String getDocumentIRI() {
     }
 
     public String getContentType() {
-        return null;
+        return contentType; // null unless one was passed to the three-argument constructor
     }
 
     public boolean isLocal() {

diff --git a/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java b/core/src/test/java/org/apache/any23/extractor/SingleDocumentExtractionTest.java
@@ -116,7 +116,24 @@ public void tearDown() throws SailException, RepositoryException, TripleHandlerE
      */
     @Test
     public void testMicroformatDomains() throws IOException, ExtractionException, RepositoryException {
-        singleDocumentExtraction = getInstance("/microformats/microformat-domains.html");
+        singleDocumentExtraction = getInstance("/microformats/microformat-domains.html", false); // false: source built without a declared content type
+        singleDocumentExtraction.run();
+        logStorageContent();
+        assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
+    }
+
+    /**
+     * Tests the existence of the domain triples using the SingleDocumentExtraction 
+     * extractor matching and mimetype detection optimization implemented in
+     * <a href="https://issues.apache.org/jira/projects/ANY23/issues/ANY23-43">ANY23-43</a>
+     *
+     * @throws IOException if there is an error loading input data
+     * @throws ExtractionException if an exception is raised during extraction
+     * @throws RepositoryException if an error is encountered whilst loading content from a storage connection
+     */
+    @Test
+    public void testMicroformatDomainsAny2343Optimization() throws IOException, ExtractionException, RepositoryException {
+        singleDocumentExtraction = getInstance("/microformats/microformat-domains.html", true);
         singleDocumentExtraction.run();
         logStorageContent();
         assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
@@ -137,7 +154,7 @@ public void testMicroformatDomains() throws IOException, ExtractionException, Re
      */
     @Test
     public void testNestedMicroformats() throws IOException, ExtractionException, RepositoryException {
-        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html");
+        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html", false);
         singleDocumentExtraction.run();
 
         logStorageContent();
@@ -160,7 +177,7 @@ public void testNestedMicroformats() throws IOException, ExtractionException, Re
      */
     @Test
     public void testNestedVCardAdr() throws IOException, ExtractionException, RepositoryException {
-        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html");
+        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html", false);
         singleDocumentExtraction.run();
 
         logStorageContent();
@@ -187,7 +204,7 @@ public void testNestedVCardAdr() throws IOException, ExtractionException, Reposi
      */
     @Test
     public void testNestedMicroformatsInduced() throws IOException, ExtractionException, RepositoryException {
-        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html");
+        singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html", false);
         singleDocumentExtraction.run();
 
         logStorageContent();
@@ -214,7 +231,7 @@ public void testNestedMicroformatsInduced() throws IOException, ExtractionExcept
      *       show the triple property as double. Despite this the model contains it just once.
      */
     public void testNestedMicroformatsManaged() throws IOException, ExtractionException, RepositoryException {
-        singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html");
+        singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html", false);
         singleDocumentExtraction.run();
 
         logStorageContent();
@@ -229,7 +246,7 @@ public void testNestedMicroformatsManaged() throws IOException, ExtractionExcept
         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL)  , vREVIEW.hasReview, 1);
     }
 
-    private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
+    private SingleDocumentExtraction getInstance(String file, Boolean optimizeMimeTypeAndExtractorSelection) throws FileNotFoundException, IOException {
         baos = new ByteArrayOutputStream();
         rdfxmlWriter = new RDFXMLWriter(baos);
         repositoryWriter = new RepositoryWriter(conn);
@@ -238,18 +255,18 @@ private SingleDocumentExtraction getInstance(String file) throws FileNotFoundExc
         cth.addChild(rdfxmlWriter);
         cth.addChild(repositoryWriter);
 
-        final ModifiableConfiguration configuration = DefaultConfiguration.copy();
-        configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
-        SingleDocumentExtraction instance =  new SingleDocumentExtraction(
-                configuration,
-                new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
-                extractorGroup,
-                cth
-        );
-        instance.setMIMETypeDetector( new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
-        return instance;
-    }
-
+      final ModifiableConfiguration configuration = DefaultConfiguration.copy();
+      configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
+      SingleDocumentExtraction instance =  new SingleDocumentExtraction(
+              configuration,
+              new HTMLFixture(copyResourceToTempFile(file))
+                  .getOpener("http://nested.test.com", optimizeMimeTypeAndExtractorSelection),
+              extractorGroup,
+              cth
+      );
+      instance.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
+      return instance;
+  }
     /**
      * Logs the storage content.
      * @throws RepositoryException if an error is encountered whilst loading content from a storage connection

diff --git a/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java b/core/src/test/java/org/apache/any23/extractor/html/AbstractExtractorTestCase.java
@@ -189,7 +189,7 @@ protected void extract(String resource) throws ExtractionException,
   IOException {
     SingleDocumentExtraction ex = new SingleDocumentExtraction(
             new HTMLFixture(copyResourceToTempFile(resource)).getOpener(baseIRI
-                    .toString()), getExtractorFactory(),
+                    .toString(), false), getExtractorFactory(),
             new RepositoryWriter(conn));
     ex.setMIMETypeDetector(null);
     report = ex.run();

diff --git a/core/src/test/java/org/apache/any23/extractor/html/HTMLFixture.java b/core/src/test/java/org/apache/any23/extractor/html/HTMLFixture.java
@@ -27,7 +27,7 @@
 import java.io.IOException;
 
 /**
- * This class is a wrapper around an HTML document providing a simply facade.
+ * This class is a wrapper around an HTML document providing a simple facade.
  */
 public class HTMLFixture {
 
@@ -43,8 +43,21 @@ private File getFile() {
         return file;
     }
 
-    public DocumentSource getOpener(String baseIRI) {
-        return new FileDocumentSource(getFile(), baseIRI);
+    /**
+     * Builds a {@link DocumentSource} for the file wrapped by this fixture.
+     * @param baseIRI the base IRI to use for the DocumentSource
+     * @param optimizeMimeTypeAndExtractorSelection when {@code true} the source declares
+     * the {@code text/html} content type so that SingleDocumentExtraction can trust it;
+     * {@code false} or {@code null} yields a source with no content type. See
+     * https://issues.apache.org/jira/projects/ANY23/issues/ANY23-43
+     * @return the document source which is actually a {@link FileDocumentSource}
+     */
+    public DocumentSource getOpener(String baseIRI, Boolean optimizeMimeTypeAndExtractorSelection) {
+        // Boolean.TRUE.equals(...) treats a null flag as "off" instead of throwing on auto-unboxing.
+        if (Boolean.TRUE.equals(optimizeMimeTypeAndExtractorSelection)) {
+            return new FileDocumentSource(getFile(), baseIRI, "text/html");
+        }
+        return new FileDocumentSource(getFile(), baseIRI);
     }
 
     /**