-
Notifications
You must be signed in to change notification settings - Fork 3
/
experiment.py
84 lines (72 loc) · 2.68 KB
/
experiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/python3
import os
import re
from subprocess import call, check_output
# Sentence cap for a debugging limit; the check that used it is currently
# disabled, so this value is not read anywhere below.
MAX = 5

# Evaluation data. An alternative pair used previously:
# 'mya-input.txt' / 'mya-gold.txt'.
inputF = 'test.input'
goldF = 'test.gold'

# When True, score the whitespace-split input line (baseline) instead of
# running the external segmenter.
USE_BASELINE = False

# Compiled once; every match is wrapped in "|" so it becomes its own item.
_PUNC_RE = re.compile(r"(?P<punc>[\(\)\-\"\'])")
_ENG_RE = re.compile(r"(?P<eng>[0-9a-zA-Z]+)")
_BUR_DIGITS_RE = re.compile(r"(?P<bur_digits>[၀-၉,\.]+)")


def read_nonblank_lines(path):
    """Return the lines of *path*, skipping lines that are exactly a newline.

    Lines keep their trailing newline. The file is read as UTF-8 so the
    result does not depend on the locale's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [line for line in f if line != '\n']


def run_segmenter(line):
    """Run ``./segment burmese.fst <line>`` and return its stdout as text.

    The sentence is passed as a single argv entry without going through a
    shell, so quotes or shell metacharacters in the input cannot break or
    alter the command. The line is passed unchanged (including any trailing
    newline), matching what the previously shell-quoted command received.

    Raises:
        subprocess.CalledProcessError: if the segmenter exits non-zero.
    """
    output = check_output(["./segment", "burmese.fst", line])
    return output.decode('utf-8')


def split_segmenter_output(out_str):
    """Split raw segmenter output into a list of non-empty items.

    Spaces become item boundaries; the characters "၊" and "။", the
    punctuation ``( ) - " '``, runs of ASCII letters/digits, and runs of
    Burmese digits (with "," and ".") are each split off as separate items.
    Newlines are removed before splitting.
    """
    out_str = out_str.replace(" ", "|")
    out_str = out_str.replace("၊", "|၊|")
    out_str = out_str.replace("။", "|။")
    out_str = _PUNC_RE.sub(r"|\g<punc>|", out_str)
    out_str = _ENG_RE.sub(r"|\g<eng>|", out_str)
    out_str = _BUR_DIGITS_RE.sub(r"|\g<bur_digits>|", out_str)
    out_str = out_str.replace("\n", '')
    # Dropping empty pieces also takes care of doubled "|" separators.
    return [item for item in out_str.split("|") if item]


def split_on_spaces(line):
    """Return the non-empty space-separated items of *line*, newlines removed.

    Used for gold lines and for the baseline segmentation.
    """
    return [item for item in line.replace("\n", '').split(" ") if item]


def count_correct(out_items, gold_items):
    """Count the items of *out_items* that occur anywhere in *gold_items*.

    NOTE: matching ignores position and multiplicity, so an output item that
    is repeated is counted each time even if the gold line has it once; the
    derived percentages can therefore exceed 100.
    """
    gold_set = set(gold_items)
    return sum(1 for item in out_items if item in gold_set)


def percent(part, whole):
    """Return ``part / whole * 100``, or 0.0 when *whole* is zero."""
    if not whole:
        return 0.0
    return part / whole * 100


def main():
    """Segment every input sentence, compare with gold, and print scores."""
    gold_lines = read_nonblank_lines(goldF)
    total_gold = 0
    total_correct = 0
    total_out = 0

    # Non-blank input lines are paired with non-blank gold lines by position.
    for cnt, line in enumerate(read_nonblank_lines(inputF)):
        print("Sentence:", line)

        if USE_BASELINE:
            out_items = split_on_spaces(line)
        else:
            out_str = run_segmenter(line)
            print(out_str)
            out_items = split_segmenter_output(out_str)

        if cnt >= len(gold_lines):
            raise IndexError(
                "gold file %r has only %d sentences but input file %r has "
                "at least %d" % (goldF, len(gold_lines), inputF, cnt + 1)
            )
        gold_items = split_on_spaces(gold_lines[cnt])
        print("GOLD:", gold_items)

        correct = count_correct(out_items, gold_items)

        # Sentences with no segmenter output are left out of the totals.
        if out_items:
            total_gold += len(gold_items)
            total_correct += correct
            total_out += len(out_items)
            print("Gold cnt:", len(gold_items))
            print("Segmentation:", out_items)
            print("Segmented items cnt:", len(out_items))
            print("Correct items cnt:", correct)
            print("Recall:", percent(correct, len(gold_items)))
            print("Precision:", percent(correct, len(out_items)))
            print("===========================\n")

    recall = round(percent(total_correct, total_gold), 2)
    precision = round(percent(total_correct, total_out), 2)
    print("TotalOut:", total_out)
    print("totalGold:", total_gold)
    print("totalCorrect:", total_correct)
    print("Recall:", recall)
    print("Precision:", precision)


if __name__ == "__main__":
    main()