Skip to content
This repository has been archived by the owner on Jul 3, 2023. It is now read-only.

WIP ANY23-43 Add configuration flag to drive the MIME type enforcement by the MIMETypeDetector #161

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions api/src/main/resources/default-configuration.properties
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,7 @@ any23.extraction.head.meta=on
# Allows specifying a CSV file separator and comment delimiter
any23.extraction.csv.field=,
any23.extraction.csv.comment=#

# Optimize SingleDocumentExtraction extractor matching and mimetype detection
# by trusting the input DocumentSource content type
any23.extraction.extractor.mimetype.optimization=on
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

@SuppressWarnings("ResultOfMethodCallIgnored")
public class SettingsTest {

@Test
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@
import java.io.InputStream;
import java.io.PrintStream;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
Expand Down Expand Up @@ -132,6 +131,8 @@ public SingleDocumentExtraction(
tripleHandlers.add(new CountingTripleHandler());
this.output = new CompositeTripleHandler(tripleHandlers);
this.encoderDetector = new TikaEncodingDetector();
if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null)
optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
}

/**
Expand All @@ -153,6 +154,8 @@ public SingleDocumentExtraction(
output
);
this.setMIMETypeDetector(null);
if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null)
optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
}

/**
Expand All @@ -174,6 +177,20 @@ public SingleDocumentExtraction(
output
);
this.setMIMETypeDetector(null);
if (configuration.getFlagProperty("any23.extraction.extractor.mimetype.optimization") && in.getContentType() != null)
optimizeExtractorMatchingAndMimetypeDetection(in.getContentType());
}

/**
 * Pre-populates the detected MIME type and the matching extractors from a
 * content type that the input {@link DocumentSource} already declares.
 *
 * <p>When {@code contentType} is {@code null} this method does nothing:
 * neither {@code detectedMIMEType} nor {@code matchingExtractors} is touched.
 *
 * @param contentType String content type obtained from
 *        {@link DocumentSource#getContentType()}; may be {@code null}
 * @see <a href="https://issues.apache.org/jira/browse/ANY23-43">https://issues.apache.org/jira/browse/ANY23-43</a>
 */
private void optimizeExtractorMatchingAndMimetypeDetection(String contentType) {
    if (contentType == null) {
        // No declared content type to trust. Previously the unbraced "if" guarded
        // only the parse step, so the extractor filter below still ran with
        // whatever value detectedMIMEType happened to hold.
        return;
    }
    detectedMIMEType = MIMEType.parse(contentType);
    matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
}

/**
Expand Down Expand Up @@ -225,7 +242,9 @@ public SingleDocumentExtractionReport run(ExtractionParameters extractionParamet
if (log.isDebugEnabled()) {
log.debug("Processing " + this.documentIRI);
}
filterExtractorsByMIMEType();
if (matchingExtractors != null && detectedMIMEType != null) {
filterExtractorsByMIMEType();
}

if(log.isDebugEnabled()) {
StringBuilder sb = new StringBuilder("Extractors ");
Expand All @@ -252,22 +271,22 @@ public SingleDocumentExtractionReport run(ExtractionParameters extractionParamet
);
}
try {
output.setContentLength(in.getContentLength());
// Create the document context.
output.setContentLength(in.getContentLength());
// Create the document context.
final String documentLanguage;
try {
documentLanguage = extractDocumentLanguage(extractionParameters);
ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
try {
documentLanguage = extractDocumentLanguage(extractionParameters);
ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType);
ArrayList<String> intersectionOfRdfMimetypes = null;
for (ExtractorFactory<?> factory : matchingExtractors) {
final Extractor<?> extractor = factory.createExtractor();
final SingleExtractionReport er = runExtractor(
extractionParameters,
documentLanguage,
extractor
);
// Fix for ANY23-415:
final Extractor<?> extractor = factory.createExtractor();
final SingleExtractionReport er = runExtractor(
extractionParameters,
documentLanguage,
extractor
);
// Fix for ANY23-415:
if (mimeTypeIsTooGeneric) {
List<String> rdfMimetypes = factory.getSupportedMIMETypes().stream()
.filter(mt -> !isTooGeneric(mt))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,26 @@ public class FileDocumentSource implements DocumentSource {

private final String uri;

private final String contentType;

/**
 * Creates a source for {@code file} that uses the file's own URI string as
 * its URI and declares no content type.
 *
 * @param file the file backing this source
 */
public FileDocumentSource(File file) {
    this(file, file.toURI().toString(), null);
}

/**
 * Creates a source for {@code file} with an explicit base IRI and no
 * declared content type.
 *
 * @param file the file backing this source
 * @param baseIRI the value stored as this source's URI
 */
public FileDocumentSource(File file, String baseIRI) {
    this(file, baseIRI, null);
}

/**
 * Creates a source for {@code file} with an explicit base IRI and an
 * explicit content type.
 *
 * @param file the file backing this source
 * @param baseIRI the value stored as this source's URI
 * @param contentType the content type this source reports; may be {@code null}
 */
public FileDocumentSource(File file, String baseIRI, String contentType) {
    this.contentType = contentType;
    this.uri = baseIRI;
    this.file = file;
}

/**
 * Opens the backing file for reading.
 *
 * @return a new buffered stream over the file; the caller is responsible for closing it
 * @throws IOException if the file cannot be opened
 */
public InputStream openInputStream() throws IOException {
    final InputStream rawFileStream = new FileInputStream(file);
    return new BufferedInputStream(rawFileStream);
}
Expand All @@ -57,7 +67,7 @@ public String getDocumentIRI() {
}

/**
 * Returns the content type stored in this source's {@code contentType} field.
 *
 * @return the stored content type, or {@code null} when none was supplied
 */
public String getContentType() {
    return contentType;
}

public boolean isLocal() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,24 @@ public void tearDown() throws SailException, RepositoryException, TripleHandlerE
*/
@Test
public void testMicroformatDomains() throws IOException, ExtractionException, RepositoryException {
    // Pass "false" so the fixture supplies a source without a declared content type.
    singleDocumentExtraction = getInstance("/microformats/microformat-domains.html", false);
    singleDocumentExtraction.run();
    logStorageContent();
    assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
}

/**
* Tests the existence of the domain triples using the SingleDocumentExtraction
* extractor matching and mimetype detection optimization implemented in
* <a href="https://issues.apache.org/jira/projects/ANY23/issues/ANY23-43">ANY23-43</a>
*
* @throws IOException if there is an error loading input data
* @throws ExtractionException if an exception is raised during extraction
* @throws RepositoryException if an error is encountered whilst loading content from a storage connection
*/
@Test
public void testMicroformatDomainsAny2343Optimization() throws IOException, ExtractionException, RepositoryException {
singleDocumentExtraction = getInstance("/microformats/microformat-domains.html", true);
singleDocumentExtraction.run();
logStorageContent();
assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
Expand All @@ -137,7 +154,7 @@ public void testMicroformatDomains() throws IOException, ExtractionException, Re
*/
@Test
public void testNestedMicroformats() throws IOException, ExtractionException, RepositoryException {
singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html");
singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html", false);
singleDocumentExtraction.run();

logStorageContent();
Expand All @@ -160,7 +177,7 @@ public void testNestedMicroformats() throws IOException, ExtractionException, Re
*/
@Test
public void testNestedVCardAdr() throws IOException, ExtractionException, RepositoryException {
singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html");
singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html", false);
singleDocumentExtraction.run();

logStorageContent();
Expand All @@ -187,7 +204,7 @@ public void testNestedVCardAdr() throws IOException, ExtractionException, Reposi
*/
@Test
public void testNestedMicroformatsInduced() throws IOException, ExtractionException, RepositoryException {
singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html");
singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html", false);
singleDocumentExtraction.run();

logStorageContent();
Expand All @@ -214,7 +231,7 @@ public void testNestedMicroformatsInduced() throws IOException, ExtractionExcept
* show the triple property as double. Despite this the model contains it just once.
*/
public void testNestedMicroformatsManaged() throws IOException, ExtractionException, RepositoryException {
singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html");
singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html", false);
singleDocumentExtraction.run();

logStorageContent();
Expand All @@ -229,7 +246,7 @@ public void testNestedMicroformatsManaged() throws IOException, ExtractionExcept
assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL) , vREVIEW.hasReview, 1);
}

private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
private SingleDocumentExtraction getInstance(String file, Boolean optimizeMimeTypeAndExtractorSelection) throws FileNotFoundException, IOException {
baos = new ByteArrayOutputStream();
rdfxmlWriter = new RDFXMLWriter(baos);
repositoryWriter = new RepositoryWriter(conn);
Expand All @@ -238,18 +255,18 @@ private SingleDocumentExtraction getInstance(String file) throws FileNotFoundExc
cth.addChild(rdfxmlWriter);
cth.addChild(repositoryWriter);

final ModifiableConfiguration configuration = DefaultConfiguration.copy();
configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
SingleDocumentExtraction instance = new SingleDocumentExtraction(
configuration,
new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
extractorGroup,
cth
);
instance.setMIMETypeDetector( new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
return instance;
}

final ModifiableConfiguration configuration = DefaultConfiguration.copy();
configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
SingleDocumentExtraction instance = new SingleDocumentExtraction(
configuration,
new HTMLFixture(copyResourceToTempFile(file))
.getOpener("http://nested.test.com", optimizeMimeTypeAndExtractorSelection),
extractorGroup,
cth
);
instance.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
return instance;
}
/**
* Logs the storage content.
* @throws RepositoryException if an error is encountered whilst loading content from a storage connection
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,7 @@ protected void extract(String resource) throws ExtractionException,
IOException {
SingleDocumentExtraction ex = new SingleDocumentExtraction(
new HTMLFixture(copyResourceToTempFile(resource)).getOpener(baseIRI
.toString()), getExtractorFactory(),
.toString(), false), getExtractorFactory(),
new RepositoryWriter(conn));
ex.setMIMETypeDetector(null);
report = ex.run();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
import java.io.IOException;

/**
* This class is a wrapper around an HTML document providing a simply facade.
* This class is a wrapper around an HTML document providing a simple facade.
*/
public class HTMLFixture {

Expand All @@ -43,8 +43,21 @@ private File getFile() {
return file;
}

public DocumentSource getOpener(String baseIRI) {
return new FileDocumentSource(getFile(), baseIRI);
/**
 * Builds a {@link FileDocumentSource} over the wrapped file.
 *
 * @param baseIRI the base IRI to use for the DocumentSource
 * @param optimizeMimeTypeAndExtractorSelection if you wish to optimize
 * SingleDocumentExtraction extractor matching and mimetype detection by
 * trusting the input DocumentSource content type. When {@code true} the
 * returned source declares {@code "text/html"}; when {@code false} or
 * {@code null} it declares no content type. See
 * https://issues.apache.org/jira/projects/ANY23/issues/ANY23-43
 * @return the document source which is actually a {@link FileDocumentSource}
 */
public DocumentSource getOpener(String baseIRI, Boolean optimizeMimeTypeAndExtractorSelection) {
    // Boolean.TRUE.equals(...) is null-safe; auto-unboxing a null Boolean in the
    // condition would throw a NullPointerException.
    if (Boolean.TRUE.equals(optimizeMimeTypeAndExtractorSelection)) {
        return new FileDocumentSource(getFile(), baseIRI, "text/html");
    }
    return new FileDocumentSource(getFile(), baseIRI);
}

/**
Expand Down