Commit e6c596f

Merge pull request #69 from DeNeutoy/mn-benchmark
🏎 ⚡️ 💯 Benchmark across Segmentation Tools, Libraries and Algorithms
2 parents 4652f40 + 65240ff commit e6c596f

7 files changed: +399 −2 lines

benchmarks/__init__.py

Whitespace-only changes.

benchmarks/benchmark.py

+84 lines
@@ -0,0 +1,84 @@
import blingfire
import nltk
import pysbd
import spacy
import stanza

import syntok
from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter

from english_golden_rules import GOLDEN_EN_RULES

pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
# stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

syntok_tokenizer = Tokenizer()

def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')

def nltk_tokenize(text):
    return nltk.sent_tokenize(text)

def pysbd_tokenize(text):
    return pysbd_segmenter.segment(text)

def spacy_tokenize(text):
    return [sent.text for sent in nlp(text).sents]

def spacy_dep_tokenize(text):
    return [sent.text for sent in nlp_dep(text).sents]

def stanza_tokenize(text):
    return [e.text for e in stanza_nlp(text).sentences]

def make_sentences(segmented_tokens):
    for sentence in segmented_tokens:
        yield "".join(str(token) for token in sentence).strip()

def syntok_tokenize(text):
    tokens = syntok_tokenizer.split(text)
    result = syntok_segmenter.split(iter(tokens))
    segments = [sent for sent in make_sentences(result)]
    return segments


total_rules = len(GOLDEN_EN_RULES)

def benchmark(golden_rules, tokenize_func):
    score = 0
    for rule in golden_rules:
        text, expected = rule
        segments = tokenize_func(text)
        if segments == expected:
            score += 1
    percent_score = (score / total_rules) * 100.0

    return percent_score

if __name__ == "__main__":
    import time
    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize)
    for tokenize_func in libraries:
        t = time.time()
        for i in range(100):
            percent_score = benchmark(GOLDEN_EN_RULES, tokenize_func)

        time_taken = time.time() - t
        print()
        print(tokenize_func.__name__)
        print('GRS score: {:0.2f}%'.format(percent_score))
        print('Speed(Avg over 100 runs): {:>10.2f} ms'.format(time_taken*1000/100))
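
The harness scores each Golden Rule by exact list equality, so a stray space or newline in any one sentence fails the whole rule. A minimal sketch of a single comparison, reusing the pysbd_tokenize helper defined above (the sample pair is rule #4 from english_golden_rules.py):

text = "My name is Jonas E. Smith."
expected = ["My name is Jonas E. Smith."]
# benchmark() increments the score only on exact equality of the two lists.
print(pysbd_tokenize(text) == expected)

Note that the reported speed is the wall-clock time for 100 full passes over all 48 rules divided by 100, i.e. an average per-pass time, not a per-sentence latency.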

benchmarks/english_golden_rules.py

+210 lines
@@ -0,0 +1,210 @@
# -*- coding: utf-8 -*-

GOLDEN_EN_RULES = [
    # 1) Simple period to end sentence
    ("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]),
    # 2) Question mark to end sentence
    ("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]),
    # 3) Exclamation point to end sentence
    ("There it is! I found it.", ["There it is!", "I found it."]),
    # 4) One letter upper case abbreviations
    ("My name is Jonas E. Smith.", ["My name is Jonas E. Smith."]),
    # 5) One letter lower case abbreviations
    ("Please turn to p. 55.", ["Please turn to p. 55."]),
    # 6) Two letter lower case abbreviations in the middle of a sentence
    ("Were Jane and co. at the party?", ["Were Jane and co. at the party?"]),
    # 7) Two letter upper case abbreviations in the middle of a sentence
    ("They closed the deal with Pitt, Briggs & Co. at noon.",
     ["They closed the deal with Pitt, Briggs & Co. at noon."]),
    # 8) Two letter lower case abbreviations at the end of a sentence
    ("Let's ask Jane and co. They should know.",
     ["Let's ask Jane and co.", "They should know."]),
    # 9) Two letter upper case abbreviations at the end of a sentence
    ("They closed the deal with Pitt, Briggs & Co. It closed yesterday.",
     ["They closed the deal with Pitt, Briggs & Co.", "It closed yesterday."]),
    # 10) Two letter (prepositive) abbreviations
    ("I can see Mt. Fuji from here.", ["I can see Mt. Fuji from here."]),
    # 11) Two letter (prepositive & postpositive) abbreviations
    ("St. Michael's Church is on 5th st. near the light.",
     ["St. Michael's Church is on 5th st. near the light."]),
    # 12) Possessive two letter abbreviations
    ("That is JFK Jr.'s book.", ["That is JFK Jr.'s book."]),
    # 13) Multi-period abbreviations in the middle of a sentence
    ("I visited the U.S.A. last year.", ["I visited the U.S.A. last year."]),
    # 14) Multi-period abbreviations at the end of a sentence
    ("I live in the E.U. How about you?", ["I live in the E.U.", "How about you?"]),
    # 15) U.S. as sentence boundary
    ("I live in the U.S. How about you?", ["I live in the U.S.", "How about you?"]),
    # 16) U.S. as non sentence boundary with next word capitalized
    ("I work for the U.S. Government in Virginia.",
     ["I work for the U.S. Government in Virginia."]),
    # 17) U.S. as non sentence boundary
    ("I have lived in the U.S. for 20 years.",
     ["I have lived in the U.S. for 20 years."]),
    # Most difficult sentence to crack
    # 18) A.M. / P.M. as non sentence boundary and sentence boundary
    ("At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.",
     ["At 5 a.m. Mr. Smith went to the bank.",
      "He left the bank at 6 P.M.",
      "Mr. Smith then went to the store."]),
    # 19) Number as non sentence boundary
    ("She has $100.00 in her bag.", ["She has $100.00 in her bag."]),
    # 20) Number as sentence boundary
    ("She has $100.00. It is in her bag.", ["She has $100.00.", "It is in her bag."]),
    # 21) Parenthetical inside sentence
    ("He teaches science (He previously worked for 5 years as an engineer.) at the local University.",
     ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]),
    # 22) Email addresses
    ("Her email is jane.doe@example.com. I sent her an email.",
     ["Her email is jane.doe@example.com.", "I sent her an email."]),
    # 23) Web addresses
    ("The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.",
     ["The site is: https://www.example.50.com/new-site/awesome_content.html.",
      "Please check it out."]),
    # 24) Single quotations inside sentence
    ("She turned to him, 'This is great.' she said.",
     ["She turned to him, 'This is great.' she said."]),
    # 25) Double quotations inside sentence
    ('She turned to him, "This is great." she said.',
     ['She turned to him, "This is great." she said.']),
    # 26) Double quotations at the end of a sentence
    ('She turned to him, "This is great." She held the book out to show him.',
     ['She turned to him, "This is great."',
      "She held the book out to show him."]),
    # 27) Double punctuation (exclamation point)
    ("Hello!! Long time no see.", ["Hello!!", "Long time no see."]),
    # 28) Double punctuation (question mark)
    ("Hello?? Who is there?", ["Hello??", "Who is there?"]),
    # 29) Double punctuation (exclamation point / question mark)
    ("Hello!? Is that you?", ["Hello!?", "Is that you?"]),
    # 30) Double punctuation (question mark / exclamation point)
    ("Hello?! Is that you?", ["Hello?!", "Is that you?"]),
    # 31) List (period followed by parens and no period to end item)
    ("1.) The first item 2.) The second item",
     ["1.) The first item", "2.) The second item"]),
    # 32) List (period followed by parens and period to end item)
    ("1.) The first item. 2.) The second item.",
     ["1.) The first item.", "2.) The second item."]),
    # 33) List (parens and no period to end item)
    ("1) The first item 2) The second item",
     ["1) The first item", "2) The second item"]),
    # 34) List (parens and period to end item)
    ("1) The first item. 2) The second item.",
     ["1) The first item.", "2) The second item."]),
    # 35) List (period to mark list and no period to end item)
    ("1. The first item 2. The second item",
     ["1. The first item", "2. The second item"]),
    # 36) List (period to mark list and period to end item)
    ("1. The first item. 2. The second item.",
     ["1. The first item.", "2. The second item."]),
    # 37) List with bullet
    ("• 9. The first item • 10. The second item",
     ["• 9. The first item", "• 10. The second item"]),
    # 38) List with hyphen
    ("⁃9. The first item ⁃10. The second item",
     ["⁃9. The first item", "⁃10. The second item"]),
    # 39) Alphabetical list
    ("a. The first item b. The second item c. The third list item",
     ["a. The first item", "b. The second item", "c. The third list item"]),
    # 40) Geo Coordinates
    ("You can find it at N°. 1026.253.553. That is where the treasure is.",
     ["You can find it at N°. 1026.253.553.",
      "That is where the treasure is."]),
    # 41) Named entities with an exclamation point
    ("She works at Yahoo! in the accounting department.",
     ["She works at Yahoo! in the accounting department."]),
    # 42) I as a sentence boundary and I as an abbreviation
    ("We make a good team, you and I. Did you see Albert I. Jones yesterday?",
     ["We make a good team, you and I.",
      "Did you see Albert I. Jones yesterday?"]),
    # 43) Ellipsis at end of quotation
    ("Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”",
     ["Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"]),
    # 44) Ellipsis with square brackets
    (""""Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).""",
     ['"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).']),
    # 45) Ellipsis as sentence boundary (standard ellipsis rules)
    ("If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.",
     ["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .",
      "Next sentence."]),
    # 46) Ellipsis as sentence boundary (non-standard ellipsis rules)
    ("I never meant that.... She left the store.",
     ["I never meant that....", "She left the store."]),
    # 47) Ellipsis as non sentence boundary
    ("I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.",
     ["I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."]),
    # 48) 4-dot ellipsis
    ("One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .",
     ["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.",
      ". . . The practice was not abandoned. . . ."])
]
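
Each entry pairs an input string with the exact sentence list a segmenter should produce; benchmark.py counts one point per exactly matched pair. To see why the rules are non-trivial, here is a deliberately naive baseline (hypothetical code, not part of this PR) that treats every ". " as a boundary; it passes only a handful of the simplest period-delimited rules and fails abbreviation cases such as #4-#17:

from english_golden_rules import GOLDEN_EN_RULES

def naive_tokenize(text):
    # Hypothetical baseline: split on ". " and restore the stripped periods.
    parts = text.split(". ")
    return [p + "." for p in parts[:-1]] + [parts[-1]]

hits = sum(naive_tokenize(text) == expected for text, expected in GOLDEN_EN_RULES)
print("{}/{} rules passed".format(hits, len(GOLDEN_EN_RULES)))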

benchmarks/genia_benchmark.py

+100 lines
@@ -0,0 +1,100 @@
import blingfire
import nltk
import pysbd
import spacy
import stanza

import syntok
from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter

from pathlib import Path

pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
# stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

syntok_tokenizer = Tokenizer()

def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')

def nltk_tokenize(text):
    return nltk.sent_tokenize(text)

def pysbd_tokenize(text):
    return pysbd_segmenter.segment(text)

def spacy_tokenize(text):
    return [sent.text.strip("\n") for sent in nlp(text).sents]

def spacy_dep_tokenize(text):
    return [sent.text.strip("\n") for sent in nlp_dep(text).sents]

def stanza_tokenize(text):
    return [e.text for e in stanza_nlp(text).sentences]

def make_sentences(segmented_tokens):
    for sentence in segmented_tokens:
        yield "".join(str(token) for token in sentence).strip()

def syntok_tokenize(text):
    tokens = syntok_tokenizer.split(text)
    result = syntok_segmenter.split(iter(tokens))
    segments = [sent for sent in make_sentences(result)]
    return segments

def load_genia_corpus(genia_raw_dir):
    txtfiles = Path(genia_raw_dir).glob("**/*.txt")
    txtfiles = list(txtfiles)
    all_docs = []
    for ind, txtfile in enumerate(txtfiles, start=1):
        with open(txtfile) as f:
            geniatext = f.read().strip()
        expected = geniatext.split('\n')
        all_docs.append((geniatext, expected))

    return all_docs

def benchmark(docs, tokenize_func):
    correct = 0
    for (text, expected) in docs:
        segments = tokenize_func(text)
        if segments == expected:
            correct += 1
    return correct


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--genia',
        help="Path to the directory containing genia data."
    )

    args = parser.parse_args()

    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize
    )

    docs = load_genia_corpus(args.genia)
    total = len(docs)
    for tokenize_func in libraries:
        correct = benchmark(docs, tokenize_func)
        percent_score = correct/total * 100
        print()
        print(tokenize_func.__name__)
        print('GENIA abstract acc: {:0.2f}%'.format(percent_score))
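
load_genia_corpus expects a directory tree of *.txt files, one GENIA abstract per file with one gold sentence per line, and benchmark credits an abstract only when the tokenizer reproduces that line split exactly. A usage sketch with a placeholder path (the helpers are the ones defined above):

# From the repo root, with the raw GENIA text in a local directory:
#   python benchmarks/genia_benchmark.py --genia /path/to/genia_raw
#
# Or programmatically:
docs = load_genia_corpus("/path/to/genia_raw")  # [(abstract_text, [gold sentences]), ...]
correct = benchmark(docs, pysbd_tokenize)
print("{}/{} abstracts segmented exactly".format(correct, len(docs)))

Because scoring is whole-document exact match, one boundary error anywhere in an abstract marks the entire document wrong, a stricter criterion than per-boundary accuracy.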
