forked from c-amr/camr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_golds.sh
executable file
·179 lines (162 loc) · 4.11 KB
/
prepare_golds.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
#!/bin/bash
# prepare_golds.sh - concatenate the per-corpus AMR files of an LDC release
# into one file per split inside a destination directory.

# Destination directory used when none is given on the command line.
destdir=golds

# Name of the unpacked release directory, and of the archive it ships in.
srcname="LDC2015E86_DEFT_Phase_2_AMR_Annotation_R1"
tgz_file="${srcname}.tgz"

# Location of the split files, relative to the release directory.
rpath="data/amrs/split"

# Use the first release directory that really contains the split files.
# srcdir is deliberately left unassigned when neither location exists.
if [[ -d "data/${srcname}/${rpath}" ]]; then
  srcdir="data/${srcname}"
elif [[ -d "${srcname}/${rpath}" ]]; then
  srcdir="${srcname}"
fi

# Every split and corpus known to the script; both lists are the defaults
# when the command line selects nothing.
splits=(training dev test)
corpuses=(proxy bolt dfa mt09sdl xinhua wb cctv guidelines consensus)
# Print the command-line help on stdout.
# Reads the globals srcname, tgz_file, destdir and corpuses; takes no
# arguments (any that are passed are ignored) and does not exit.
function usage {
  # ${corpuses[*]} joins the names with single spaces (default IFS).
  # The --tgz line states where extract_tgz really unpacks: it runs tar with
  # -C set to the archive's own directory, not the current directory.
  cat <<EOF
Extract training, dev and test splits from $srcname data

usage: $0 [--srcdir <src dir>] [--tgz <src.tgz>] [--train|--training] [--dev] [--test] [dest dir] [--] [corpus #1] [corpus #2] ...

--srcdir DIR path to $srcname (default: [data/]$srcname)
--tgz FILE path to $tgz_file file (will be extracted into the directory containing the archive)

Select splits:
--train prepare training split
--dev prepare development split
--test prepare test split

Default destination directory: $destdir

If no source directory found, will search for tgz archive

List of corpuses: ${corpuses[*]}

By default if no splits or corpuses selected, all will be used

EOF
}
# Succeed when one of the given arguments is exactly -h or --help.
# Iterating over "$@" (quoted) keeps every argument intact, so a value such
# as "out -h" or a glob character is never split or expanded into a false
# help request.
function _help_requested {
  local arg
  for arg in "$@"; do
    case "$arg" in
      -h | --help) return 0 ;;
    esac
  done
  return 1
}

# Help wins over every other option: show it and stop before doing any work.
if _help_requested "$@"; then
  usage
  exit 0
fi
# Unpack a source-data archive into the directory that contains it and point
# srcdir at the extracted release.
# Globals:   srcname (read), srcdir (written)
# Arguments: $1 - path to the archive
# Outputs:   progress and error messages on stdout
# Exits:     status 1 when tar fails or <archive dir>/$srcname is missing
#            after extraction
function extract_tgz {
  local archive="$1"
  local target
  # Quoted throughout so archive paths containing spaces or glob characters
  # reach dirname and tar as a single argument.
  target="$(dirname -- "$archive")"

  echo -n "Extracting $archive to $target ... "
  if ! tar -xf "$archive" -C "$target"; then
    echo "error: extracting source data from $archive"
    exit 1
  fi
  if [ ! -d "$target/$srcname" ]; then
    echo "error: directory $target/$srcname not found after extracting archive $archive"
    exit 1
  fi
  echo "ok"
  srcdir="$target/$srcname"
}
# State filled in by the command-line parser below.
selected_splits=()
selected_corpuses=()
default_destdir=1
_remaining_args=()

# Parse the command line.
# Globals:   corpuses (read); srcdir, destdir, default_destdir,
#            selected_splits, selected_corpuses, _remaining_args (written)
# Arguments: the script's command-line arguments
# Outputs:   error messages on stdout
# Exits:     status 1 when --srcdir / --tgz is given a missing path, or when
#            a corpus name is not in the corpuses list. Previously the two
#            option errors were only printed and processing continued with
#            whatever source had been found by default.
function _parse_args {
  local corpus found
  while [ $# -gt 0 ]; do
    case "$1" in
      --)
        # Everything after "--" is handed back to the caller untouched.
        shift
        break
        ;;
      --srcdir)
        shift
        if [ ! -d "$1" ]; then
          echo "error: invalid source directory: $1"
          exit 1
        fi
        srcdir="$1"
        shift
        ;;
      --tgz)
        shift
        if [ ! -f "$1" ]; then
          echo "error: invalid source data archive: $1"
          exit 1
        fi
        extract_tgz "$1"
        shift
        ;;
      --training | --train)
        selected_splits+=(training)
        shift
        ;;
      --dev)
        selected_splits+=(dev)
        shift
        ;;
      --test)
        selected_splits+=(test)
        shift
        ;;
      *)
        if [ "$default_destdir" -eq 1 ]; then
          # The first free-standing argument is the destination directory.
          destdir="$1"
          default_destdir=0
        else
          # Every later free-standing argument must be a known corpus name.
          found=0
          for corpus in "${corpuses[@]}"; do
            if [ "$corpus" == "$1" ]; then
              selected_corpuses+=("$1")
              found=1
            fi
          done
          if [ "$found" -eq 0 ]; then
            echo "error: corpus $1 not found"
            exit 1
          fi
        fi
        shift
        ;;
    esac
  done
  _remaining_args=("$@")
}

_parse_args "$@"
# Leave only the arguments that followed "--" as the positional parameters,
# exactly as the in-line loop used to.
set -- "${_remaining_args[@]}"
# Narrow splits and corpuses down to what the command line asked for.
# Globals:   selected_splits, selected_corpuses (read; the latter is also
#            appended to), splits, corpuses (overwritten when a selection
#            exists)
# Arguments: corpus names that followed "--" on the command line; names that
#            are not in the corpuses list are silently ignored
function _apply_selection {
  local arg corpus
  if [ "${#selected_splits[@]}" -gt 0 ]; then
    splits=("${selected_splits[@]}")
  fi
  # ${#selected_corpuses[@]} is the element count; without [@] the test
  # would measure the string length of the first element instead.
  if [ "${#selected_corpuses[@]}" -gt 0 ] || [ $# -gt 0 ]; then
    # "$@" keeps each argument whole, so "a b" is one (unknown) name rather
    # than two separate corpus names.
    for arg in "$@"; do
      for corpus in "${corpuses[@]}"; do
        if [ "$corpus" == "$arg" ]; then
          selected_corpuses+=("$arg")
        fi
      done
    done
    corpuses=("${selected_corpuses[@]}")
  fi
}

_apply_selection "$@"
# No source directory known yet: fall back to unpacking the release archive,
# looking under data/ first and then in the current directory.
if [[ -z "$srcdir" ]]; then
  if [[ -f "data/$tgz_file" ]]; then
    extract_tgz "data/$tgz_file"
  elif [[ -f "$tgz_file" ]]; then
    extract_tgz "$tgz_file"
  else
    echo "error: no source data found."
    exit 1
  fi
fi

# The split files must exist below the release directory; from here on
# srcdir points straight at them.
if [[ ! -d "$srcdir/$rpath" ]]; then
  echo "Error: invalid source data directory: $srcdir"
  exit 1
fi
srcdir="$srcdir/$rpath"
# An empty destination cannot be written to: report it, show the help text
# and stop with a failure status.
[[ -n "$destdir" ]] || {
  echo "error: output directory not specified"
  echo
  usage 1
  exit 1
}
# Report what is about to happen and make sure the destination exists.
echo "Selected splits: ${splits[@]}"
echo "Selected corpuses: ${corpuses[@]}"
mkdir -p "$destdir"
echo "Will write splits to $destdir"

# Build one output file per split by appending every selected corpus file
# that exists for that split.
for split in "${splits[@]}"; do
  # The "training" split is written to a file named "train".
  case "$split" in
    training) out_name=train ;;
    *) out_name="$split" ;;
  esac
  out_file="$destdir/$out_name"

  # Start from scratch so repeated runs do not append to stale output.
  rm -f "$out_file" 2>/dev/null

  (( ${#corpuses[@]} == 0 )) || echo "Writing $out_file"

  for corpus in "${corpuses[@]}"; do
    src_file="$srcdir/$split/deft-p2-amr-r1-amrs-$split-$corpus.txt"
    # Corpus/split combinations without a file are skipped silently.
    [[ -f "$src_file" ]] || continue
    echo "* $split-$corpus"
    cat "$src_file" >> "$out_file"
  done
done