
Commit b05c4e2

Adding last changes; nothing that would have made a positive difference.
1 parent 2795086 commit b05c4e2

21 files changed, +430 -4 lines changed


pom.xml (+20)

@@ -153,6 +153,26 @@
 
     <dependencies>
         <dependency>
+            <groupId>ch.qos.logback</groupId>
+            <artifactId>logback-classic</artifactId>
+            <version>1.2.3</version>
+        </dependency>
+        <dependency>
+            <groupId>de.julielab</groupId>
+            <artifactId>jcore-topic-indexing-ae</artifactId>
+            <version>2.3.0-SNAPSHOT</version>
+            <exclusions>
+                <exclusion>
+                    <groupId>commons-beanutils</groupId>
+                    <artifactId>commons-beanutils-core</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
+        <dependency>
+            <groupId>de.julielab</groupId>
+            <artifactId>java-stdio-ipc</artifactId>
+            <version>1.0.0-SNAPSHOT</version>
+        </dependency><dependency>
             <groupId>com.wcohen</groupId>
             <artifactId>com.wcohen.secondstring</artifactId>
             <version>0.1</version>

resources/gs2017DocsJson.zip (856 KB)

Binary file not shown.

scripts/doc2VecInferer.py (+36)

@@ -0,0 +1,36 @@
+import os
+import sys
+import zipfile
+import gzip
+from struct import *
+
+import gensim
+from gensim.test.utils import common_texts
+from gensim.models.doc2vec import Doc2Vec, TaggedDocument
+
+import time
+
+def infervector(text):
+    tokens = list(gensim.utils.tokenize(text, lower=True))
+    return model.infer_vector(tokens)
+
+print("Reading doc2vec model...", file=sys.stderr)
+model = Doc2Vec.load(sys.argv[1])
+vectorSize = model.vector_size
+
+print("Waiting for input on STDIN, one text document batch per line", file=sys.stderr)
+for line in sys.stdin:
+    alltime = time.time()
+    if line.strip() == "quit":
+        sys.exit(0)
+    classifytime = time.time()
+    vector = infervector(line)
+    classifytime = time.time() - classifytime
+    bytes = pack('>%sd' % len(vector), *vector)
+    sys.stdout.buffer.write(pack('>i', len(bytes)))
+    sys.stdout.buffer.write(bytes)
+    print(end='')
+    alltime = time.time() - alltime
+
+    #print("Timing: vector inference time: ", classifytime, file=sys.stderr)
+    #print("Timing: allover time: ", alltime, file=sys.stderr)

src/main/java/at/medunigraz/imi/bst/pmclassifier/Document.java (+30 -2)

@@ -1,5 +1,6 @@
 package at.medunigraz.imi.bst.pmclassifier;
 
+import com.fasterxml.jackson.annotation.JsonIgnore;
 import com.fasterxml.jackson.annotation.JsonProperty;
 
 import java.util.List;
@@ -14,7 +15,36 @@ public class Document {
     private List<String> organisms;
     private List<String> meshTags;
     private List<String> meshTagsMajor;
+    private List<String> keywords;
     private String pmLabel;
+    private List<String> meshMinor;
+    private List<String> publicationTypes;
+    @JsonIgnore
+    private double[] topicWeight;
+
+    public double[] getTopicWeight() {
+        return topicWeight;
+    }
+
+    public void setTopicWeight(double[] topicWeight) {
+        this.topicWeight = topicWeight;
+    }
+
+    public List<String> getPublicationTypes() {
+        return publicationTypes;
+    }
+
+    public void setPublicationTypes(List<String> publicationTypes) {
+        this.publicationTypes = publicationTypes;
+    }
+
+    public List<String> getKeywords() {
+        return keywords;
+    }
+
+    public void setKeywords(List<String> keywords) {
+        this.keywords = keywords;
+    }
 
     public List<String> getMeshTags() {
         return meshTags;
@@ -40,8 +70,6 @@ public void setMeshMinor(List<String> meshMinor) {
         this.meshMinor = meshMinor;
     }
 
-    private List<String> meshMinor;
-
     public String getId() {
         return id;
     }
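
The new keywords and publicationTypes fields are picked up from the gold-standard JSON by Jackson, while topicWeight is marked @JsonIgnore and only set programmatically (see CrossVal.inferTopics below). A minimal sketch of that distinction, assuming the JSON keys simply match the field names (not code from this commit):

    import com.fasterxml.jackson.databind.ObjectMapper;

    class DocumentJsonSketch {
        // Illustrative only: the JSON key names are assumed, not taken from the commit.
        static Document parse(String json) throws java.io.IOException {
            Document doc = new ObjectMapper().readValue(json, Document.class);
            // keywords and publicationTypes are populated by Jackson;
            // topicWeight stays null here because it is @JsonIgnore and is
            // filled later via setTopicWeight(...) from the topic indexer.
            return doc;
        }
    }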

src/main/java/at/medunigraz/imi/bst/pmclassifier/InstancePreparator.java (+5)

@@ -112,6 +112,11 @@ private Collection<Pipe> getTfIdfPipes() {
         pipes.add(new TfIdfPipe());
         pipes.add(new HasGenesPipe());
         pipes.add(new MeshTagsForTokenPipe());
+        //pipes.add(new HasDiseasePipe());
+        //pipes.add(new HasKeywordPipe());
+        //pipes.add(new Doc2VecPipe());
+        //pipes.add(new HasPubTypePipe());
+        pipes.add(new TopicModelPipe());
         pipes.add(new Token2FeatureVector());
         return pipes;
     }
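
For context, a pipe list like the one built here is normally chained into a SerialPipes and used to feed an InstanceList. The sketch below shows that standard Mallet pattern under the assumption, matching the pipes above, that the instance data is a Token and the source is the Document; it is not code from this commit, and getPmLabel() is an assumed accessor for the gold label.

    import at.medunigraz.imi.bst.pmclassifier.Document;
    import cc.mallet.pipe.Pipe;
    import cc.mallet.pipe.SerialPipes;
    import cc.mallet.types.Instance;
    import cc.mallet.types.InstanceList;
    import cc.mallet.types.Token;

    import java.util.Collection;

    class PipeUsageSketch {
        static InstanceList toInstances(Collection<Pipe> pipes, Iterable<Document> documents) {
            // Chain the feature pipes into a single pipe and attach it to the instance list.
            Pipe chain = new SerialPipes(pipes.toArray(new Pipe[0]));
            InstanceList instances = new InstanceList(chain);
            for (Document d : documents) {
                // data = Token over the text, source = Document, as the pipes above expect;
                // getPmLabel() is assumed to expose the class label.
                Token token = new Token(d.getTitle() + " " + d.getAbstractText());
                instances.addThruPipe(new Instance(token, d.getPmLabel(), d.getId(), d));
            }
            return instances;
        }
    }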

src/main/java/at/medunigraz/imi/bst/pmclassifier/apps/CrossVal.java (+63)

@@ -2,8 +2,21 @@
 
 import at.medunigraz.imi.bst.pmclassifier.*;
 import cc.mallet.types.InstanceList;
+import de.julielab.jcore.ae.topicindexing.TopicIndexer;
+import de.julielab.jcore.ae.topicindexing.TopicModelProvider;
+import de.julielab.jcore.types.DocumentTopics;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.ExternalResourceFactory;
+import org.apache.uima.fit.factory.JCasFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.DoubleArray;
 
 import java.io.File;
 import java.io.IOException;
@@ -21,6 +34,7 @@ public static void main(String args[]) throws DataReadingException, IOException,
         int numFolds = 10;
 
         Map<String, Document> documents = DataReader.readDocuments(new File("resources/gs2017DocsJson.zip"));
+        inferTopics(documents.values());
         InstancePreparator ip = InstancePreparator.getInstance();
         classifier.setInstancePreparator(ip);
 
@@ -69,6 +83,55 @@ public static void main(String args[]) throws DataReadingException, IOException,
 
     }
 
+    private static void inferTopics(Collection<Document> values) {
+        try {
+            AnalysisEngine sentenceDetector = AnalysisEngineFactory.createEngine(
+                    "de.julielab.jcore.ae.jsbd.desc.jcore-jsbd-ae-biomedical-english");
+            AnalysisEngine tokenizer = AnalysisEngineFactory.createEngine(
+                    "de.julielab.jcore.ae.jtbd.desc.jcore-jtbd-ae-biomedical-english");
+            AnalysisEngine posTagger = AnalysisEngineFactory.createEngine(
+                    "de.julielab.jcore.ae.opennlp.postag.desc.jcore-opennlp"
+                            + "-postag-ae-biomedical-english");
+            AnalysisEngine bioLemmatizer = AnalysisEngineFactory.createEngine(
+                    "de.julielab.jcore.ae.biolemmatizer.desc.jcore-biolemmatizer-ae");
+            AnalysisEngineDescription desc = AnalysisEngineFactory.createEngineDescription("de.julielab.jcore.ae.topicindexing.desc.jcore-topic-indexing-ae",
+                    TopicIndexer.PARAM_TOPIC_MODEL_CONFIG, "uima/topicmodels/nt100-a1.0-b0.1-genedocs1m.xml",
+                    TopicIndexer.PARAM_NUM_DISPLAYED_TOPIC_WORDS, 0,
+                    TopicIndexer.PARAM_STORE_IN_MODEL_INDEX, false);
+            ExternalResourceFactory.createDependencyAndBind(desc, TopicIndexer.RESOURCE_KEY_MODEL_FILE_NAME, TopicModelProvider.class, new File("uima/topicmodels/nt100-a1.0-b0.1-genedocs1m.mod.gz").toURI().toURL().toString());
+            AnalysisEngine topicIndexer = AnalysisEngineFactory.createEngine(desc);
+            JCas jCas = JCasFactory.createJCas("de.julielab.jcore.types.jcore-document-meta-pubmed-types",
+                    "de.julielab.jcore.types.jcore-xmi-splitter-types",
+                    "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types",
+                    "de.julielab.jcore.types.jcore-document-structure-pubmed-types",
+                    "de.julielab.jcore.types.jcore-morpho-syntax-types");
+
+            values.parallelStream().forEach(d -> {
+                jCas.setDocumentText(d.getTitle() + " " + d.getAbstractText());
+                try {
+                    sentenceDetector.process(jCas);
+                    tokenizer.process(jCas);
+                    posTagger.process(jCas);
+                    bioLemmatizer.process(jCas);
+                    topicIndexer.process(jCas);
+                    DocumentTopics documentTopics = JCasUtil.selectSingle(jCas, DocumentTopics.class);
+                    DoubleArray weights = documentTopics.getWeights();
+                    double[] doubles = weights.toArray();
+                    d.setTopicWeight(doubles);
+                    jCas.reset();
+                } catch (AnalysisEngineProcessException e) {
+                    e.printStackTrace();
+                }
+
+
+            });
+        } catch (IOException e) {
+            e.printStackTrace();
+        } catch (UIMAException e) {
+            e.printStackTrace();
+        }
+    }
+
     private static List<List<Document>> makeStratifiedPartitions(List<Document> pmList, List<Document> notPmList, int numFolds) {
         List<List<Document>> ret = new ArrayList<>(numFolds);
         for (int i = 0; i < numFolds; i++)
src/main/java/at/medunigraz/imi/bst/pmclassifier/featurepipes/Doc2VecPipe.java (+52)

@@ -0,0 +1,52 @@
+package at.medunigraz.imi.bst.pmclassifier.featurepipes;
+
+import cc.mallet.pipe.Pipe;
+import cc.mallet.types.Instance;
+import cc.mallet.types.Token;
+import de.julielab.ipc.javabridge.Options;
+import de.julielab.ipc.javabridge.StdioBridge;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.DoubleBuffer;
+import java.util.Optional;
+import java.util.stream.Stream;
+
+public class Doc2VecPipe extends Pipe {
+
+    private final StdioBridge<byte[]> bridge;
+
+    public Doc2VecPipe() {
+        Options<byte[]> options = new Options(byte[].class);
+        options.setExecutable("python");
+        options.setExternalProgramTerminationSignal("quit");
+        bridge = new StdioBridge(options, "-u", "scripts/doc2VecInferer.py", "/Users/faessler/Research/trecpm2018/doc2vec/doc2vec_vs300_w15_500k.mod");
+        try {
+            bridge.start();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+    }
+    static int docnum = 0;
+    @Override
+    public Instance pipe(Instance inst) {
+        Token token = (Token) inst.getData();
+        String text = token.getText();
+        try {
+            long time = System.nanoTime();
+            Stream<byte[]> stream = bridge.sendAndReceive(text);
+            time = System.nanoTime() - time;
+            System.out.println("Doc " + docnum++ + ": " + time + " " + text.length());
+            Optional<byte[]> any = stream.findAny();
+            if (!any.isPresent())
+                throw new IllegalStateException("The STDIO bridge to the Gensim Doc2vec program didn't return a value.");
+            DoubleBuffer db = ByteBuffer.wrap(any.get()).asDoubleBuffer();
+            for (int i = 0; i < db.capacity(); i++) {
+                token.setFeatureValue("doc2vec_pos_" + i, db.get(i));
+            }
+        } catch (InterruptedException e) {
+            e.printStackTrace();
+        }
+        return inst;
+    }
+}

src/main/java/at/medunigraz/imi/bst/pmclassifier/featurepipes/HasDiseasePipe.java (+1 -1)

@@ -26,7 +26,7 @@ public HasDiseasePipe() {
     @Override
     public Instance pipe(Instance inst) {
         Token token = (Token) inst.getData();
-        ac.match(token.getText(), (start,end,matched) -> token.setFeatureValue("hasDisease=", 1));
+        ac.match(token.getText(), (start,end,matched) -> token.setFeatureValue("hasDisease="+matched, 1));
         return inst;
     }
 }
src/main/java/at/medunigraz/imi/bst/pmclassifier/featurepipes/HasKeywordPipe.java (+29)

@@ -0,0 +1,29 @@
+package at.medunigraz.imi.bst.pmclassifier.featurepipes;
+
+import at.medunigraz.imi.bst.pmclassifier.AhoCorasickOptimized;
+import at.medunigraz.imi.bst.pmclassifier.Document;
+import cc.mallet.pipe.Pipe;
+import cc.mallet.types.Instance;
+import cc.mallet.types.Token;
+import de.julielab.java.utilities.FileUtilities;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.stream.Collectors;
+
+public class HasKeywordPipe extends Pipe {
+
+    @Override
+    public Instance pipe(Instance inst) {
+        Token token = (Token) inst.getData();
+        Document document = (Document) inst.getSource();
+        if (document.getKeywords() != null) {
+            for (String kw : document.getKeywords()) {
+                token.setFeatureValue("hasKeyword="+kw, 1);
+            }
+        }
+        return inst;
+    }
+}
src/main/java/at/medunigraz/imi/bst/pmclassifier/featurepipes/HasPubTypePipe.java (+21)

@@ -0,0 +1,21 @@
+package at.medunigraz.imi.bst.pmclassifier.featurepipes;
+
+import at.medunigraz.imi.bst.pmclassifier.Document;
+import cc.mallet.pipe.Pipe;
+import cc.mallet.types.Instance;
+import cc.mallet.types.Token;
+
+public class HasPubTypePipe extends Pipe {
+
+    @Override
+    public Instance pipe(Instance inst) {
+        Token token = (Token) inst.getData();
+        Document document = (Document) inst.getSource();
+        if (document.getPublicationTypes() != null) {
+            for (String pt : document.getPublicationTypes()) {
+                token.setFeatureValue("hasPubType="+pt, 1);
+            }
+        }
+        return inst;
+    }
+}
src/main/java/at/medunigraz/imi/bst/pmclassifier/featurepipes/TopicModelPipe.java (+63)

@@ -0,0 +1,63 @@
+package at.medunigraz.imi.bst.pmclassifier.featurepipes;
+
+import at.medunigraz.imi.bst.pmclassifier.Document;
+import cc.mallet.pipe.Pipe;
+import cc.mallet.types.Instance;
+import cc.mallet.types.InstanceList;
+import cc.mallet.types.Token;
+import cc.mallet.types.TokenSequence;
+import de.julielab.ipc.javabridge.Options;
+import de.julielab.ipc.javabridge.StdioBridge;
+import de.julielab.jcore.ae.topicindexing.TopicIndexer;
+import de.julielab.jcore.ae.topicindexing.TopicModelProvider;
+import de.julielab.jcore.types.DocumentTopics;
+import de.julielab.topicmodeling.businessobjects.Model;
+import de.julielab.topicmodeling.businessobjects.Topic;
+import de.julielab.topicmodeling.services.MalletTopicModeling;
+import org.apache.commons.configuration2.XMLConfiguration;
+import org.apache.commons.configuration2.ex.ConfigurationException;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.ExternalResourceFactory;
+import org.apache.uima.fit.factory.JCasFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.DoubleArray;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.InvalidXMLException;
+import scala.annotation.bridge;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.DoubleBuffer;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+import java.util.stream.Stream;
+
+public class TopicModelPipe extends Pipe {
+
+
+    @Override
+    public Instance pipe(Instance inst) {
+        Token token = (Token) inst.getData();
+        Document document = (Document) inst.getSource();
+        String text = token.getText();
+
+
+        double[] topicWeight = document.getTopicWeight();
+        for (int i = 0; i < topicWeight.length; ++i) {
+            double w = topicWeight[i];
+            token.setFeatureValue("topic_" + i, w);
+            //System.out.println("topic_" + i + ": " + w);
+        }
+
+
+        return inst;
+    }
+}
