Merged GERDAQ implementation into the version1.2.5 branch. Updated the GERDAQ implementation.
MichaelRoeder committed Jan 23, 2017
2 parents 3c07564 + 9759133 commit 8425da1
Showing 4 changed files with 281 additions and 10 deletions.
179 changes: 179 additions & 0 deletions src/main/java/org/aksw/gerbil/dataset/impl/gerdaq/GERDAQDataset.java
@@ -0,0 +1,179 @@
/**
* This file is part of General Entity Annotator Benchmark.
*
* General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* General Entity Annotator Benchmark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
*/
package org.aksw.gerbil.dataset.impl.gerdaq;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;

import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.aksw.gerbil.dataset.InitializableDataset;
import org.aksw.gerbil.dataset.impl.AbstractDataset;
import org.aksw.gerbil.datatypes.ErrorTypes;
import org.aksw.gerbil.exceptions.GerbilException;
import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

public class GERDAQDataset extends AbstractDataset implements InitializableDataset {

private static final Logger LOGGER = LoggerFactory.getLogger(GERDAQDataset.class);

private static final String WIKIPEDIA_URI = "http://en.wikipedia.org/wiki/";
private static final String DBPEDIA_URI = "http://dbpedia.org/resource/";
private static final String ANNOTATION_TAG = "annotation";
private static final String DOCUMENT_TAG = "instance";

private String file;
private List<Document> documents;

public GERDAQDataset(String file) {
this.file = file;
}

@Override
public int size() {
return documents.size();
}

@Override
public List<Document> getInstances() {
return documents;
}

@Override
public void init() throws GerbilException {
this.documents = loadDocuments(new File(file));
}

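/**
 * Builds the common URI prefix for all documents of this dataset, e.g.
 * "http://GERDAQ-Dev/gerdaq_devel.xml_"; the running document index is
 * appended later when the documents are created.
 */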
protected static String generateDocumentUri(String datasetName, String fileName) {
StringBuilder builder = new StringBuilder();
builder.append("http://");
builder.append(datasetName.replace(' ', '_'));
builder.append('/');
builder.append(fileName);
builder.append('_');
return builder.toString();
}

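/**
 * Loads documents from the given file or, if a directory is given, from
 * every file directly inside that directory.
 */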
private List<Document> loadDocuments(File filePath) throws GerbilException {
List<Document> docs = new ArrayList<>();
if (!filePath.exists()) {
throw new GerbilException("The given file (" + filePath.getAbsolutePath() + ") does not exist.",
ErrorTypes.DATASET_LOADING_ERROR);
}

if (filePath.isDirectory()) {

String directoryPath = filePath.getAbsolutePath();
if (!directoryPath.endsWith(File.separator)) {
directoryPath = directoryPath + File.separator;
}

for (File tmpFile : new File(directoryPath).listFiles()) {
docs.addAll(createDocument(tmpFile));
}

} else {
docs.addAll(createDocument(filePath));
}

return docs;

}

private List<Document> createDocument(File file) throws GerbilException {
List<Document> documents = new ArrayList<Document>();
String documentUriStart = generateDocumentUri(name, file.getName());
InputStream inputStream = null;
InputSource is = null;
try {
inputStream = new BufferedInputStream(new FileInputStream(file));
is = new InputSource(inputStream);
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = factory.newSAXParser();

saxParser.parse(is, new DefaultHandler() {

private StringBuilder text = new StringBuilder();
private int markingStart;
private String markingTitle;
private List<Marking> markings;

@Override
public void startDocument() throws SAXException {
super.startDocument();
}

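/*
 * An <annotation> element marks the start of an entity mention: remember the
 * current text offset and the linked Wikipedia title (rank_0_title). An
 * <instance> element starts a new document, so a fresh marking list is created.
 */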
@Override
public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
throws SAXException {

if (qName.equals(ANNOTATION_TAG)) {
markingTitle = atts.getValue("rank_0_title");
if (markingTitle != null) {
markingStart = text.length();
markingTitle = markingTitle.replace(' ', '_');
} else {
LOGGER.error("Found a marking without the necessary \"rank_0_title\" attribute.");
}
} else if (qName.equals(DOCUMENT_TAG)) {
this.markings = new ArrayList<>();
}
}

@Override
public void characters(char[] ch, int start, int length) {
text.append(ch, start, length);
}

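/*
 * Closing an <annotation> turns the buffered span into a NamedEntity pointing
 * to both the DBpedia and the Wikipedia URI of the title. Closing an
 * <instance> emits the accumulated text as a new Document and clears the buffer.
 */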
@Override
public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
if (qName.equals(DOCUMENT_TAG)) {
documents.add(new DocumentImpl(text.toString(), documentUriStart + documents.size(), markings));
text.delete(0, text.length());
} else if (qName.equals(ANNOTATION_TAG) && (markingTitle != null)) {
markings.add(new NamedEntity(markingStart, text.length() - markingStart, new HashSet<String>(
Arrays.asList(DBPEDIA_URI + markingTitle, WIKIPEDIA_URI + markingTitle))));
}
}
});
} catch (Exception e) {
throw new GerbilException("Exception while reading dataset.", e, ErrorTypes.DATASET_LOADING_ERROR);
} finally {
IOUtils.closeQuietly(inputStream);
}

return documents;
}

}
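A minimal usage sketch for the class above, assuming a local copy of one of the GERDAQ XML files configured in datasets.properties below; the path and dataset name are placeholders, and the getDocumentURI()/getMarkings() accessors come from the NIF Document interface as used in the test further down.

import java.util.List;

import org.aksw.gerbil.dataset.impl.gerdaq.GERDAQDataset;
import org.aksw.gerbil.transfer.nif.Document;

public class GERDAQDatasetUsageSketch {

    public static void main(String[] args) throws Exception {
        // Hypothetical path; point this at gerdaq_devel.xml (or a directory of GERDAQ files).
        GERDAQDataset dataset = new GERDAQDataset("gerbil_data/datasets/gerdaq/gerdaq_devel.xml");
        try {
            dataset.setName("GERDAQ-Dev");
            dataset.init(); // parses the XML and builds the Document list
            List<Document> docs = dataset.getInstances();
            System.out.println("Loaded " + docs.size() + " documents");
            for (Document doc : docs) {
                System.out.println(doc.getDocumentURI() + " -> " + doc.getMarkings().size() + " markings");
            }
        } finally {
            dataset.close(); // the dataset is closeable, as the unit test below shows
        }
    }
}

The datasets.properties entries below wire up exactly this: a dataset name plus the XML file path passed as the single constructor argument.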
44 changes: 35 additions & 9 deletions src/main/properties/datasets.properties
@@ -69,14 +69,31 @@ org.aksw.gerbil.datasets.definition.DBPEDIA_SPOTLIGHT.cacheable=true
org.aksw.gerbil.datasets.definition.DBPEDIA_SPOTLIGHT.experimentType=A2KB
org.aksw.gerbil.datasets.definition.DBPEDIA_SPOTLIGHT.constructorArgs=${org.aksw.gerbil.datasets.KnownNIFFileDatasetConfig.DBPEDIA_SPOTLIGHT},${org.aksw.gerbil.datasets.definition.DBPEDIA_SPOTLIGHT.name}

### IITB
org.aksw.gerbil.datasets.IITBDatasetConfig.crawledDocs=${org.aksw.gerbil.DataPath}/datasets/iitb/crawledDocs
org.aksw.gerbil.datasets.IITBDatasetConfig.annotations=${org.aksw.gerbil.DataPath}/datasets/iitb/CSAW_Annotations.xml
org.aksw.gerbil.datasets.definition.IITB.name=IITB
org.aksw.gerbil.datasets.definition.IITB.class=org.aksw.gerbil.dataset.impl.iitb.IITBDataset
org.aksw.gerbil.datasets.definition.IITB.cacheable=true
org.aksw.gerbil.datasets.definition.IITB.experimentType=A2KB
org.aksw.gerbil.datasets.definition.IITB.constructorArgs=${org.aksw.gerbil.datasets.IITBDatasetConfig.crawledDocs},${org.aksw.gerbil.datasets.IITBDatasetConfig.annotations}
### GERDAQ
org.aksw.gerbil.datasets.gerdaq.devFile=${org.aksw.gerbil.DataPath}/datasets/gerdaq/gerdaq_devel.xml
org.aksw.gerbil.datasets.gerdaq.trainingAFile=${org.aksw.gerbil.DataPath}/datasets/gerdaq/gerdaq_trainingA.xml
org.aksw.gerbil.datasets.gerdaq.trainingBFile=${org.aksw.gerbil.DataPath}/datasets/gerdaq/gerdaq_trainingB.xml
org.aksw.gerbil.datasets.gerdaq.testFile=${org.aksw.gerbil.DataPath}/datasets/gerdaq/gerdaq_test.xml
org.aksw.gerbil.datasets.definition.gerdaq_dev.name=GERDAQ-Dev
org.aksw.gerbil.datasets.definition.gerdaq_dev.class=org.aksw.gerbil.dataset.impl.gerdaq.GERDAQDataset
org.aksw.gerbil.datasets.definition.gerdaq_dev.cacheable=true
org.aksw.gerbil.datasets.definition.gerdaq_dev.experimentType=A2KB
org.aksw.gerbil.datasets.definition.gerdaq_dev.constructorArgs=${org.aksw.gerbil.datasets.gerdaq.devFile}
org.aksw.gerbil.datasets.definition.gerdaq_trainingA.name=GERDAQ-TrainingA
org.aksw.gerbil.datasets.definition.gerdaq_trainingA.class=org.aksw.gerbil.dataset.impl.gerdaq.GERDAQDataset
org.aksw.gerbil.datasets.definition.gerdaq_trainingA.cacheable=true
org.aksw.gerbil.datasets.definition.gerdaq_trainingA.experimentType=A2KB
org.aksw.gerbil.datasets.definition.gerdaq_trainingA.constructorArgs=${org.aksw.gerbil.datasets.gerdaq.trainingAFile}
org.aksw.gerbil.datasets.definition.gerdaq_trainingB.name=GERDAQ-TrainingB
org.aksw.gerbil.datasets.definition.gerdaq_trainingB.class=org.aksw.gerbil.dataset.impl.gerdaq.GERDAQDataset
org.aksw.gerbil.datasets.definition.gerdaq_trainingB.cacheable=true
org.aksw.gerbil.datasets.definition.gerdaq_trainingB.experimentType=A2KB
org.aksw.gerbil.datasets.definition.gerdaq_trainingB.constructorArgs=${org.aksw.gerbil.datasets.gerdaq.trainingBFile}
org.aksw.gerbil.datasets.definition.gerdaq_test.name=GERDAQ-Test
org.aksw.gerbil.datasets.definition.gerdaq_test.class=org.aksw.gerbil.dataset.impl.gerdaq.GERDAQDataset
org.aksw.gerbil.datasets.definition.gerdaq_test.cacheable=true
org.aksw.gerbil.datasets.definition.gerdaq_test.experimentType=A2KB
org.aksw.gerbil.datasets.definition.gerdaq_test.constructorArgs=${org.aksw.gerbil.datasets.gerdaq.testFile}

### ERD2014
org.aksw.gerbil.datasets.ERD2014.texts=${org.aksw.gerbil.DataPath}/datasets/erd2014/Trec_beta.query.txt
@@ -87,6 +104,15 @@ org.aksw.gerbil.datasets.definition.ERD2014.cacheable=true
org.aksw.gerbil.datasets.definition.ERD2014.experimentType=A2KB
org.aksw.gerbil.datasets.definition.ERD2014.constructorArgs=${org.aksw.gerbil.datasets.ERD2014.texts},${org.aksw.gerbil.datasets.ERD2014.annotations}

### IITB
org.aksw.gerbil.datasets.IITBDatasetConfig.crawledDocs=${org.aksw.gerbil.DataPath}/datasets/iitb/crawledDocs
org.aksw.gerbil.datasets.IITBDatasetConfig.annotations=${org.aksw.gerbil.DataPath}/datasets/iitb/CSAW_Annotations.xml
org.aksw.gerbil.datasets.definition.IITB.name=IITB
org.aksw.gerbil.datasets.definition.IITB.class=org.aksw.gerbil.dataset.impl.iitb.IITBDataset
org.aksw.gerbil.datasets.definition.IITB.cacheable=true
org.aksw.gerbil.datasets.definition.IITB.experimentType=A2KB
org.aksw.gerbil.datasets.definition.IITB.constructorArgs=${org.aksw.gerbil.datasets.IITBDatasetConfig.crawledDocs},${org.aksw.gerbil.datasets.IITBDatasetConfig.annotations}

### Kore50
org.aksw.gerbil.datasets.KORE50.file=${org.aksw.gerbil.DataPath}/datasets/KORE50/kore50-nif.ttl
org.aksw.gerbil.datasets.definition.KORE50.name=KORE50
@@ -204,7 +230,7 @@ org.aksw.gerbil.datasets.definition.Micro2013_2.constructorArgs=${org.aksw.gerbi
org.aksw.gerbil.datasets.definition.Micro2013_2.check.class=org.aksw.gerbil.web.config.check.FileChecker
org.aksw.gerbil.datasets.definition.Micro2013_2.check.args=${org.aksw.gerbil.datasets.Microposts2013DatasetConfig.test}

### N³ collection
### N\u00b3 collection
org.aksw.gerbil.datasets.N3_NEWS_100.file=${org.aksw.gerbil.DataPath}/datasets/N3/News-100.ttl
org.aksw.gerbil.datasets.N3_REUTERS_128.file=${org.aksw.gerbil.DataPath}/datasets/N3/Reuters-128.ttl
org.aksw.gerbil.datasets.N3_RSS_500.file=${org.aksw.gerbil.DataPath}/datasets/N3/RSS-500.ttl
@@ -0,0 +1,66 @@
/**
* This file is part of General Entity Annotator Benchmark.
*
* General Entity Annotator Benchmark is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* General Entity Annotator Benchmark is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with General Entity Annotator Benchmark. If not, see <http://www.gnu.org/licenses/>.
*/
package org.aksw.gerbil.dataset.impl.gerdaq;

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.aksw.gerbil.transfer.nif.Document;
import org.aksw.gerbil.transfer.nif.Marking;
import org.aksw.gerbil.transfer.nif.data.DocumentImpl;
import org.aksw.gerbil.transfer.nif.data.NamedEntity;
import org.apache.commons.io.Charsets;
import org.apache.commons.io.FileUtils;
import org.junit.Assert;
import org.junit.Test;

public class GERDAQDatasetTest {

private static final String DATASET_NAME = "test";

@Test
public void checkLoadDatasets() throws Exception {
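// Write a small GERDAQ-style XML file to a temporary location, load it with
// GERDAQDataset and check that texts, document URIs and entity markings match.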
File file = File.createTempFile("GERDAQ", ".xml");
FileUtils.write(file,
"<?xml version='1.0' encoding='UTF-8'?>" + String.format("%n")
+ "<dataset><instance>loris <annotation rank_0_id=\"44017\" rank_0_score=\"0.925555555556\" rank_0_title=\"Candle\">candle</annotation> sampler</instance><instance><annotation rank_0_id=\"230699\" rank_0_score=\"0.666666666667\" rank_0_title=\"Conveyancing\">buying land</annotation> and <annotation rank_0_id=\"21883824\" rank_0_score=\"1.0\" rank_0_title=\"Arizona\">arizona</annotation></instance><instance>hip gry pl</instance></dataset>",
Charsets.UTF_8);
String docUriStart = GERDAQDataset.generateDocumentUri(DATASET_NAME, file.getName());

List<Document> expectedDocuments = Arrays.asList(
new DocumentImpl("loris candle sampler", docUriStart + 0,
Arrays.asList(new NamedEntity(6, 6, "http://dbpedia.org/resource/Candle"))),
new DocumentImpl("buying land and arizona", docUriStart + 1,
Arrays.asList(new NamedEntity(0, 11, "http://dbpedia.org/resource/Conveyancing"),
new NamedEntity(16, 7, "http://dbpedia.org/resource/Arizona"))),
new DocumentImpl("hip gry pl", docUriStart + 2, new ArrayList<Marking>(0)));

GERDAQDataset dataset = new GERDAQDataset(file.getAbsolutePath());
try {
dataset.setName(DATASET_NAME);
dataset.init();

Assert.assertArrayEquals(expectedDocuments.toArray(new Document[3]),
dataset.getInstances().toArray(new Document[3]));
} finally {
dataset.close();
}
}

}
Expand Up @@ -61,7 +61,7 @@ public void runTest(int experimentTaskId, ExperimentDAO experimentDAO, SameAsRet
} catch (InterruptedException e) {
e.printStackTrace();
}
Assert.assertNull("Got an exception: " + testError, testError);
Assert.assertNull("Got an exception: " + testError + " " + configuration.toString(), testError);
SameAsRetrieverSingleton4Tests.storeCache();
}

