-
Notifications
You must be signed in to change notification settings - Fork 37
/
Copy pathTripleIndexCreator.java
executable file
·208 lines (188 loc) · 7.63 KB
/
TripleIndexCreator.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
package org.aksw.agdistis.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.MMapDirectory;
import org.apache.lucene.util.Version;
import org.openrdf.model.Statement;
import org.openrdf.model.URI;
import org.openrdf.rio.RDFHandlerException;
import org.openrdf.rio.RDFParseException;
import org.openrdf.rio.RDFParser;
import org.openrdf.rio.helpers.RDFHandlerBase;
import org.openrdf.rio.turtle.TurtleParser;
import org.slf4j.LoggerFactory;
import info.aduna.io.FileUtil;
public class TripleIndexCreator {
private static org.slf4j.Logger log = LoggerFactory.getLogger(TripleIndexCreator.class);
public static final String N_TRIPLES = "NTriples";
public static final String TTL = "ttl";
public static final String TSV = "tsv";
public static final Version LUCENE_VERSION = Version.LUCENE_44;
private Analyzer urlAnalyzer;
private Analyzer literalAnalyzer;
private DirectoryReader ireader;
private IndexWriter iwriter;
private MMapDirectory directory;
public static void main(String args[]) {
if (args.length > 0) {
log.error("TripleIndexCreator works without parameters. Please use agdistis.properties File");
return;
}
try {
log.info("For using DBpedia we suggest you downlaod the following file: " + "labels_<LANG>.ttl, "
+ "redirects_transitive_<LANG>.ttl, " + "instance_types_<LANG>.ttl, "
+ "mappingbased_properties_<LANG>.ttl, " + "specific_mappingbased_properties_<LANG>.ttl,"
+ "disambiguations_<LANG>.ttl." + ""
+ "Please download them into one folder and configure it in the agdistis.properties File."
+ "For further information have a look at our wiki: https://github.com/AKSW/AGDISTIS/wiki");
Properties prop = new Properties();
InputStream input = new FileInputStream("src/main/resources/config/agdistis.properties");
prop.load(input);
String envIndex = System.getenv("AGDISTIS_INDEX");
String index = envIndex != null ? envIndex : prop.getProperty("index");
log.info("The index will be here: " + index);
String envFolderWithTtlFiles = System.getenv("AGDISTIS_FOLDER_WITH_TTL_FILES");
String folder = envFolderWithTtlFiles != null ? envFolderWithTtlFiles
: prop.getProperty("folderWithTTLFiles");
log.info("Getting triple data from: " + folder);
List<File> listOfFiles = new ArrayList<File>();
for (File file : new File(folder).listFiles()) {
if (file.getName().endsWith("ttl")) {
listOfFiles.add(file);
}
}
String envSurfaceFormTsv = System.getenv("AGDISTIS_SURFACE_FORM_TSV");
String surfaceFormTSV = envSurfaceFormTsv != null ? envSurfaceFormTsv : prop.getProperty("surfaceFormTSV");
log.info("Getting surface forms from: " + surfaceFormTSV);
File file = new File(surfaceFormTSV);
if (file.exists()) {
listOfFiles.add(file);
}
String envBaseUri = System.getenv("AGDISTIS_BASE_URI");
String baseURI = envBaseUri != null ? envBaseUri : prop.getProperty("baseURI");
log.info("Setting Base URI to: " + baseURI);
TripleIndexCreator ic = new TripleIndexCreator();
ic.createIndex(listOfFiles, index, baseURI);
ic.close();
} catch (IOException e) {
log.error("Error while creating index. Maybe the index is corrupt now.", e);
}
}
public void createIndex(List<File> files, String idxDirectory, String baseURI) {
try {
urlAnalyzer = new SimpleAnalyzer(LUCENE_VERSION);
literalAnalyzer = new LiteralAnalyzer(LUCENE_VERSION);
Map<String, Analyzer> mapping = new HashMap<String, Analyzer>();
mapping.put(TripleIndex.FIELD_NAME_SUBJECT, urlAnalyzer);
mapping.put(TripleIndex.FIELD_NAME_PREDICATE, urlAnalyzer);
mapping.put(TripleIndex.FIELD_NAME_OBJECT_URI, urlAnalyzer);
mapping.put(TripleIndex.FIELD_NAME_OBJECT_LITERAL, literalAnalyzer);
PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(urlAnalyzer, mapping);
File indexDirectory = new File(idxDirectory);
indexDirectory.mkdir();
directory = new MMapDirectory(indexDirectory);
IndexWriterConfig config = new IndexWriterConfig(LUCENE_VERSION, perFieldAnalyzer);
iwriter = new IndexWriter(directory, config);
iwriter.commit();
for (File file : files) {
String type = FileUtil.getFileExtension(file.getName());
if (type.equals(TTL))
indexTTLFile(file, baseURI);
if (type.equals(TSV))
indexTSVFile(file);
iwriter.commit();
}
iwriter.close();
ireader = DirectoryReader.open(directory);
} catch (Exception e) {
log.error("Error while creating TripleIndex.", e);
}
}
private void indexTTLFile(File file, String baseURI)
throws RDFParseException, RDFHandlerException, FileNotFoundException, IOException {
log.info("Start parsing: " + file);
RDFParser parser = new TurtleParser();
OnlineStatementHandler osh = new OnlineStatementHandler();
parser.setRDFHandler(osh);
parser.setStopAtFirstError(false);
if (baseURI == null) {
parser.parse(new FileReader(file), "");
} else {
parser.parse(new FileReader(file), baseURI);
}
log.info("Finished parsing: " + file);
}
private void indexTSVFile(File file) throws IOException {
log.info("Start parsing: " + file);
BufferedReader br = new BufferedReader(new FileReader(file));
while (br.ready()) {
String[] line = br.readLine().split("\t");
String subject = line[0];
for (int i = 1; i < line.length; ++i) {
String object = line[i];
Document doc = new Document();
doc.add(new StringField(TripleIndex.FIELD_NAME_SUBJECT, subject, Store.YES));
doc.add(new StringField(TripleIndex.FIELD_NAME_PREDICATE,
"http://www.w3.org/2004/02/skos/core#altLabel", Store.YES));
doc.add(new TextField(TripleIndex.FIELD_NAME_OBJECT_LITERAL, object, Store.YES));
iwriter.addDocument(doc);
}
}
br.close();
log.info("Finished parsing: " + file);
}
private void addDocumentToIndex(IndexWriter iwriter, String subject, String predicate, String object, boolean isUri)
throws IOException {
Document doc = new Document();
log.debug(subject + " " + predicate + " " + object);
doc.add(new StringField(TripleIndex.FIELD_NAME_SUBJECT, subject, Store.YES));
doc.add(new StringField(TripleIndex.FIELD_NAME_PREDICATE, predicate, Store.YES));
if (isUri) {
doc.add(new StringField(TripleIndex.FIELD_NAME_OBJECT_URI, object, Store.YES));
} else {
doc.add(new TextField(TripleIndex.FIELD_NAME_OBJECT_LITERAL, object, Store.YES));
}
iwriter.addDocument(doc);
}
public void close() throws IOException {
if (ireader != null) {
ireader.close();
}
if (directory != null) {
directory.close();
}
}
private class OnlineStatementHandler extends RDFHandlerBase {
@Override
public void handleStatement(Statement st) {
String subject = st.getSubject().stringValue();
String predicate = st.getPredicate().stringValue();
String object = st.getObject().stringValue();
try {
addDocumentToIndex(iwriter, subject, predicate, object, st.getObject() instanceof URI);
} catch (IOException e) {
e.printStackTrace();
}
}
}
}