2
2
3
3
import at .medunigraz .imi .bst .pmclassifier .*;
4
4
import cc .mallet .types .InstanceList ;
5
+ import de .julielab .jcore .ae .topicindexing .TopicIndexer ;
6
+ import de .julielab .jcore .ae .topicindexing .TopicModelProvider ;
7
+ import de .julielab .jcore .types .DocumentTopics ;
5
8
import org .apache .logging .log4j .LogManager ;
6
9
import org .apache .logging .log4j .Logger ;
10
+ import org .apache .uima .UIMAException ;
11
+ import org .apache .uima .analysis_engine .AnalysisEngine ;
12
+ import org .apache .uima .analysis_engine .AnalysisEngineDescription ;
13
+ import org .apache .uima .analysis_engine .AnalysisEngineProcessException ;
14
+ import org .apache .uima .fit .factory .AnalysisEngineFactory ;
15
+ import org .apache .uima .fit .factory .ExternalResourceFactory ;
16
+ import org .apache .uima .fit .factory .JCasFactory ;
17
+ import org .apache .uima .fit .util .JCasUtil ;
18
+ import org .apache .uima .jcas .JCas ;
19
+ import org .apache .uima .jcas .cas .DoubleArray ;
7
20
8
21
import java .io .File ;
9
22
import java .io .IOException ;
@@ -21,6 +34,7 @@ public static void main(String args[]) throws DataReadingException, IOException,
21
34
int numFolds = 10 ;
22
35
23
36
Map <String , Document > documents = DataReader .readDocuments (new File ("resources/gs2017DocsJson.zip" ));
37
+ inferTopics (documents .values ());
24
38
InstancePreparator ip = InstancePreparator .getInstance ();
25
39
classifier .setInstancePreparator (ip );
26
40
@@ -69,6 +83,55 @@ public static void main(String args[]) throws DataReadingException, IOException,
69
83
70
84
}
71
85
86
+ private static void inferTopics (Collection <Document > values ) {
87
+ try {
88
+ AnalysisEngine sentenceDetector = AnalysisEngineFactory .createEngine (
89
+ "de.julielab.jcore.ae.jsbd.desc.jcore-jsbd-ae-biomedical-english" );
90
+ AnalysisEngine tokenizer = AnalysisEngineFactory .createEngine (
91
+ "de.julielab.jcore.ae.jtbd.desc.jcore-jtbd-ae-biomedical-english" );
92
+ AnalysisEngine posTagger = AnalysisEngineFactory .createEngine (
93
+ "de.julielab.jcore.ae.opennlp.postag.desc.jcore-opennlp"
94
+ + "-postag-ae-biomedical-english" );
95
+ AnalysisEngine bioLemmatizer = AnalysisEngineFactory .createEngine (
96
+ "de.julielab.jcore.ae.biolemmatizer.desc.jcore-biolemmatizer-ae" );
97
+ AnalysisEngineDescription desc = AnalysisEngineFactory .createEngineDescription ("de.julielab.jcore.ae.topicindexing.desc.jcore-topic-indexing-ae" ,
98
+ TopicIndexer .PARAM_TOPIC_MODEL_CONFIG , "uima/topicmodels/nt100-a1.0-b0.1-genedocs1m.xml" ,
99
+ TopicIndexer .PARAM_NUM_DISPLAYED_TOPIC_WORDS , 0 ,
100
+ TopicIndexer .PARAM_STORE_IN_MODEL_INDEX , false );
101
+ ExternalResourceFactory .createDependencyAndBind (desc , TopicIndexer .RESOURCE_KEY_MODEL_FILE_NAME , TopicModelProvider .class , new File ("uima/topicmodels/nt100-a1.0-b0.1-genedocs1m.mod.gz" ).toURI ().toURL ().toString ());
102
+ AnalysisEngine topicIndexer = AnalysisEngineFactory .createEngine (desc );
103
+ JCas jCas = JCasFactory .createJCas ("de.julielab.jcore.types.jcore-document-meta-pubmed-types" ,
104
+ "de.julielab.jcore.types.jcore-xmi-splitter-types" ,
105
+ "de.julielab.jcore.types.extensions.jcore-document-meta-extension-types" ,
106
+ "de.julielab.jcore.types.jcore-document-structure-pubmed-types" ,
107
+ "de.julielab.jcore.types.jcore-morpho-syntax-types" );
108
+
109
+ values .parallelStream ().forEach (d -> {
110
+ jCas .setDocumentText (d .getTitle () + " " + d .getAbstractText ());
111
+ try {
112
+ sentenceDetector .process (jCas );
113
+ tokenizer .process (jCas );
114
+ posTagger .process (jCas );
115
+ bioLemmatizer .process (jCas );
116
+ topicIndexer .process (jCas );
117
+ DocumentTopics documentTopics = JCasUtil .selectSingle (jCas , DocumentTopics .class );
118
+ DoubleArray weights = documentTopics .getWeights ();
119
+ double [] doubles = weights .toArray ();
120
+ d .setTopicWeight (doubles );
121
+ jCas .reset ();
122
+ } catch (AnalysisEngineProcessException e ) {
123
+ e .printStackTrace ();
124
+ }
125
+
126
+
127
+ });
128
+ } catch (IOException e ) {
129
+ e .printStackTrace ();
130
+ } catch (UIMAException e ) {
131
+ e .printStackTrace ();
132
+ }
133
+ }
134
+
72
135
private static List <List <Document >> makeStratifiedPartitions (List <Document > pmList , List <Document > notPmList , int numFolds ) {
73
136
List <List <Document >> ret = new ArrayList <>(numFolds );
74
137
for (int i = 0 ; i < numFolds ; i ++)
0 commit comments