-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfrog.nf
executable file
·165 lines (139 loc) · 5.49 KB
/
frog.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/env nextflow
/*
vim: syntax=groovy
-*- mode: groovy;-*-
*/
// Startup banner
def bannerrule = "----------------------------------"
log.info bannerrule
log.info "Frog pipeline"
log.info bannerrule

// Default parameter values.
def env = System.getenv()

// Input selection
params.extension = "txt"
params.inputclass = "current"
params.sentenceperline = false

// Output
params.outputdir = "frog_output"
params.outputclass = "current"

// Execution: default to the VIRTUAL_ENV of the calling shell (empty string when
// there is none) and to one worker per available processor.
params.virtualenv = env['VIRTUAL_ENV'] ?: ""
params.workers = Runtime.runtime.availableProcessors()

// Frog modules to skip (empty string: skip nothing)
params.skip = ""
// Print the usage text and stop (exit code 2) when help is requested or when the
// mandatory --inputdir parameter was not given.
def helprequested = params.containsKey('help')
def inputdirmissing = !params.containsKey('inputdir')
if (helprequested || inputdirmissing) {
    def usagelines = [
        "Usage:",
        " frog.nf",
        "",
        "Mandatory parameters:",
        " --inputdir DIRECTORY Path to the corpus directory",
        "",
        "Optional parameters:",
        " --extension EXTENSION Extension of input documents (default: txt, suggestion: folia.xml)",
        " --inputformat STR Set to 'text' or 'folia', automatically determined from extension if possible",
        " --virtualenv PATH Path to Python Virtual Environment to load (usually path to LaMachine)",
        " --sentenceperline Indicates that the input (plain text only) is already in a one sentence per line format, skips sentence detection (default: false)",
        " --outputdir DIRECTORY Output directory (FoLiA documents)",
        " --inputclass CLASS Set the FoLiA text class to use as input (default: current)",
        " --outputclass CLASS Set the FoLiA text class to use as output (default: current)",
        " --skip=[mptncla] Skip Tokenizer (t), Lemmatizer (l), Morphological Analyzer (a), Chunker (c), Multi-Word Units (m), Named Entity Recognition (n), or Parser (p)",
        " --workers NUMBER The number of workers (Frogs in parallel)"
    ]
    usagelines.each { line -> log.info line }
    exit 2
}
// Derive the input format from the extension: an extension that mentions 'xml'
// or 'folia' selects FoLiA input, anything else is treated as plain text.
def looksfolia = params.extension.find('xml') || params.extension.find('folia')
params.inputformat = looksfolia ? "folia" : "text"
// Recursively collect every file with the requested extension below the input
// directory, leaving out files whose base name is "trace". Two channels are built
// over the same glob: one carries the documents to the Frog processes, the other
// is only consumed to count them.
def inputglob = params.inputdir + "/**." + params.extension
def nottrace = { it.baseName != "trace" }
inputdocuments = Channel.fromPath(inputglob).filter(nottrace)
inputdocuments_counter = Channel.fromPath(inputglob).filter(nottrace)
if (params.inputformat == "folia") {
// Group the documents into at most params.workers batches; every batch is handed
// to one Frog instance.
foliainput_batched = Channel.create()

// count() yields a dataflow value; reading .val waits until all input files have
// been enumerated.
def foliadoccount = inputdocuments_counter.count().val
if (foliadoccount == 0) {
    log.warn "No input documents with extension '" + params.extension + "' found in " + params.inputdir
}

// Clamp both the worker count and the batch size to at least 1: a zero or negative
// --workers value would break the division, and an empty corpus would otherwise
// produce a buffer size of 0.
def foliaworkers = Math.max(1, params.workers as int)
def foliabatchsize = Math.max(1, Math.ceil(foliadoccount / foliaworkers).toInteger())

inputdocuments
    .buffer( size: foliabatchsize, remainder: true)
    .into(foliainput_batched)
// Runs Frog over one batch of FoLiA documents and emits the enriched documents as
// <name>.frogged.folia.xml, which are also copied to params.outputdir.
process frog_folia2folia {
    publishDir params.outputdir, pattern: "*.xml", mode: 'copy', overwrite: true

    input:
    file foliadocuments from foliainput_batched  // one batch of FoLiA documents
    val skip from params.skip                    // Frog modules to skip (may be empty)
    val inputclass from params.inputclass        // FoLiA text class to read
    val outputclass from params.outputclass      // FoLiA text class to write
    val virtualenv from params.virtualenv        // virtualenv to activate (may be empty)

    output:
    file "*.xml" into foliadocuments_output mode flatten

    script:
    """
    # Activation is wrapped in set +u / set -u so that it runs with nounset disabled
    set +u
    if [ -n "${virtualenv}" ]; then
        source "${virtualenv}/bin/activate"
    fi
    set -u

    opts=""
    if [ -n "${skip}" ]; then
        opts="--skip=${skip}"
    fi

    # move input files to a separate staging directory; output will be in output/
    mkdir -p input output
    mv -- *.xml input/

    # opts is deliberately left unquoted so that an empty value expands to nothing
    frog \$opts --inputclass "${inputclass}" --outputclass "${outputclass}" --xmldir "output" --nostdout --testdir input/ -x

    # rename NAME.xml and NAME.folia.xml alike to NAME.frogged.folia.xml and move
    # the result up into the task directory where the output glob picks it up
    cd output
    for f in *.xml; do
        base="\${f%.xml}"
        base="\${base%.folia}"
        mv -- "\$f" "../\${base}.frogged.folia.xml"
    done
    cd ..
    """
}
} else {
// Group the documents into at most params.workers batches; every batch is handed
// to one Frog instance.
textinput_batched = Channel.create()

// count() yields a dataflow value; reading .val waits until all input files have
// been enumerated.
def textdoccount = inputdocuments_counter.count().val
if (textdoccount == 0) {
    log.warn "No input documents with extension '" + params.extension + "' found in " + params.inputdir
}

// Clamp both the worker count and the batch size to at least 1: a zero or negative
// --workers value would break the division, and an empty corpus would otherwise
// produce a buffer size of 0.
def textworkers = Math.max(1, params.workers as int)
def textbatchsize = Math.max(1, Math.ceil(textdoccount / textworkers).toInteger())

inputdocuments
    .buffer( size: textbatchsize, remainder: true)
    .into(textinput_batched)
// Runs Frog over one batch of plain-text documents and emits the resulting FoLiA
// documents as <name>.frogged.folia.xml, which are also copied to params.outputdir.
process frog_text2folia {
    publishDir params.outputdir, pattern: "*.xml", mode: 'copy', overwrite: true

    input:
    file textdocuments from textinput_batched      // one batch of plain-text documents
    val skip from params.skip                      // Frog modules to skip (may be empty)
    val outputclass from params.outputclass        // FoLiA text class to write
    val extension from params.extension            // extension of the staged input files
    val sentenceperline from params.sentenceperline // true: input is one sentence per line
    val virtualenv from params.virtualenv          // virtualenv to activate (may be empty)

    output:
    file "*.xml" into foliadocuments_output mode flatten

    script:
    """
    # Activation is wrapped in set +u / set -u so that it runs with nounset disabled
    set +u
    if [ -n "${virtualenv}" ]; then
        source "${virtualenv}/bin/activate"
    fi
    set -u

    opts=""
    if [[ "${sentenceperline}" == "true" ]]; then
        opts="\$opts -n"
    fi
    if [ -n "${skip}" ]; then
        opts="\$opts --skip=${skip}"
    fi

    # move input files to a separate staging directory; output will be in output/
    mkdir -p input output
    mv -- *.${extension} input/

    # opts is deliberately left unquoted so that it splits into separate arguments
    frog \$opts --outputclass "${outputclass}" --xmldir "output" --nostdout --testdir input/

    # rename NAME.xml and NAME.folia.xml alike to NAME.frogged.folia.xml and move
    # the result up into the task directory where the output glob picks it up
    cd output
    for f in *.xml; do
        base="\${f%.xml}"
        base="\${base%.folia}"
        mv -- "\$f" "../\${base}.frogged.folia.xml"
    done
    cd ..
    """
}
}
// Report every document emitted by the Frog process, using its name under the output directory.
foliadocuments_output.subscribe { doc ->
    println "Frog output document written to ${params.outputdir}/${doc.name}"
}