Commit e6c596f

Merge pull request #69 from DeNeutoy/mn-benchmark
🏎 ⚡️ 💯 Benchmark across Segmentation Tools, Libraries and Algorithms
2 parents 4652f40 + 65240ff commit e6c596f

7 files changed: +399 −2 lines

benchmarks/__init__.py

Whitespace-only changes.

benchmarks/benchmark.py

+84 lines
@@ -0,0 +1,84 @@
import blingfire
import nltk
import pysbd
import spacy
import stanza

import syntok
from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter

from english_golden_rules import GOLDEN_EN_RULES

pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
# stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

syntok_tokenizer = Tokenizer()

def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')

def nltk_tokenize(text):
    return nltk.sent_tokenize(text)

def pysbd_tokenize(text):
    return pysbd_segmenter.segment(text)

def spacy_tokenize(text):
    return [sent.text for sent in nlp(text).sents]

def spacy_dep_tokenize(text):
    return [sent.text for sent in nlp_dep(text).sents]

def stanza_tokenize(text):
    return [e.text for e in stanza_nlp(text).sentences]

def make_sentences(segmented_tokens):
    for sentence in segmented_tokens:
        yield "".join(str(token) for token in sentence).strip()

def syntok_tokenize(text):
    tokens = syntok_tokenizer.split(text)
    result = syntok_segmenter.split(iter(tokens))
    segments = [sent for sent in make_sentences(result)]
    return segments


total_rules = len(GOLDEN_EN_RULES)

def benchmark(golden_rules, tokenize_func):
    score = 0
    for rule in golden_rules:
        text, expected = rule
        segments = tokenize_func(text)
        if segments == expected:
            score += 1
    percent_score = (score / total_rules) * 100.0

    return percent_score

if __name__ == "__main__":
    import time
    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize)
    for tokenize_func in libraries:
        t = time.time()
        for i in range(100):
            percent_score = benchmark(GOLDEN_EN_RULES, tokenize_func)

        time_taken = time.time() - t
        print()
        print(tokenize_func.__name__)
        print('GRS score: {:0.2f}%'.format(percent_score))
        print('Speed(Avg over 100 runs): {:>10.2f} ms'.format(time_taken*1000/100))
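
The harness scores each Golden Rule by exact list equality, so a stray space or newline in any one sentence fails the whole rule. A minimal sketch of a single comparison, reusing the pysbd_tokenize helper defined above (the sample pair is rule #4 from english_golden_rules.py):

text = "My name is Jonas E. Smith."
expected = ["My name is Jonas E. Smith."]
# benchmark() increments the score only on exact equality of the two lists.
print(pysbd_tokenize(text) == expected)

Note that the reported speed is the wall-clock time for 100 full passes over all 48 rules divided by 100, i.e. an average per-pass time, not a per-sentence latency.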

benchmarks/english_golden_rules.py

+210 lines
@@ -0,0 +1,210 @@
# -*- coding: utf-8 -*-

GOLDEN_EN_RULES = [
    # 1) Simple period to end sentence
    ("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]),
    # 2) Question mark to end sentence
    ("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]),
    # 3) Exclamation point to end sentence
    ("There it is! I found it.", ["There it is!", "I found it."]),
    # 4) One letter upper case abbreviations
    ("My name is Jonas E. Smith.", ["My name is Jonas E. Smith."]),
    # 5) One letter lower case abbreviations
    ("Please turn to p. 55.", ["Please turn to p. 55."]),
    # 6) Two letter lower case abbreviations in the middle of a sentence
    ("Were Jane and co. at the party?", ["Were Jane and co. at the party?"]),
    # 7) Two letter upper case abbreviations in the middle of a sentence
    ("They closed the deal with Pitt, Briggs & Co. at noon.",
     ["They closed the deal with Pitt, Briggs & Co. at noon."]),
    # 8) Two letter lower case abbreviations at the end of a sentence
    ("Let's ask Jane and co. They should know.",
     ["Let's ask Jane and co.", "They should know."]),
    # 9) Two letter upper case abbreviations at the end of a sentence
    ("They closed the deal with Pitt, Briggs & Co. It closed yesterday.",
     ["They closed the deal with Pitt, Briggs & Co.", "It closed yesterday."]),
    # 10) Two letter (prepositive) abbreviations
    ("I can see Mt. Fuji from here.", ["I can see Mt. Fuji from here."]),
    # 11) Two letter (prepositive & postpositive) abbreviations
    ("St. Michael's Church is on 5th st. near the light.",
     ["St. Michael's Church is on 5th st. near the light."]),
    # 12) Possessive two letter abbreviations
    ("That is JFK Jr.'s book.", ["That is JFK Jr.'s book."]),
    # 13) Multi-period abbreviations in the middle of a sentence
    ("I visited the U.S.A. last year.", ["I visited the U.S.A. last year."]),
    # 14) Multi-period abbreviations at the end of a sentence
    ("I live in the E.U. How about you?", ["I live in the E.U.", "How about you?"]),
    # 15) U.S. as sentence boundary
    ("I live in the U.S. How about you?", ["I live in the U.S.", "How about you?"]),
    # 16) U.S. as non sentence boundary with next word capitalized
    ("I work for the U.S. Government in Virginia.",
     ["I work for the U.S. Government in Virginia."]),
    # 17) U.S. as non sentence boundary
    ("I have lived in the U.S. for 20 years.",
     ["I have lived in the U.S. for 20 years."]),
    # Most difficult sentence to crack
    # 18) A.M. / P.M. as non sentence boundary and sentence boundary
    ("At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.",
     ["At 5 a.m. Mr. Smith went to the bank.",
      "He left the bank at 6 P.M.",
      "Mr. Smith then went to the store."]),
    # 19) Number as non sentence boundary
    ("She has $100.00 in her bag.", ["She has $100.00 in her bag."]),
    # 20) Number as sentence boundary
    ("She has $100.00. It is in her bag.", ["She has $100.00.", "It is in her bag."]),
    # 21) Parenthetical inside sentence
    ("He teaches science (He previously worked for 5 years as an engineer.) at the local University.",
     ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]),
    # 22) Email addresses
    ("Her email is jane.doe@example.com. I sent her an email.",
     ["Her email is jane.doe@example.com.", "I sent her an email."]),
    # 23) Web addresses
    ("The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.",
     ["The site is: https://www.example.50.com/new-site/awesome_content.html.",
      "Please check it out."]),
    # 24) Single quotations inside sentence
    ("She turned to him, 'This is great.' she said.",
     ["She turned to him, 'This is great.' she said."]),
    # 25) Double quotations inside sentence
    ('She turned to him, "This is great." she said.',
     ['She turned to him, "This is great." she said.']),
    # 26) Double quotations at the end of a sentence
    ('She turned to him, "This is great." She held the book out to show him.',
     ['She turned to him, "This is great."',
      "She held the book out to show him."]),
    # 27) Double punctuation (exclamation point)
    ("Hello!! Long time no see.", ["Hello!!", "Long time no see."]),
    # 28) Double punctuation (question mark)
    ("Hello?? Who is there?", ["Hello??", "Who is there?"]),
    # 29) Double punctuation (exclamation point / question mark)
    ("Hello!? Is that you?", ["Hello!?", "Is that you?"]),
    # 30) Double punctuation (question mark / exclamation point)
    ("Hello?! Is that you?", ["Hello?!", "Is that you?"]),
    # 31) List (period followed by parens and no period to end item)
    ("1.) The first item 2.) The second item",
     ["1.) The first item", "2.) The second item"]),
    # 32) List (period followed by parens and period to end item)
    ("1.) The first item. 2.) The second item.",
     ["1.) The first item.", "2.) The second item."]),
    # 33) List (parens and no period to end item)
    ("1) The first item 2) The second item",
     ["1) The first item", "2) The second item"]),
    # 34) List (parens and period to end item)
    ("1) The first item. 2) The second item.",
     ["1) The first item.", "2) The second item."]),
    # 35) List (period to mark list and no period to end item)
    ("1. The first item 2. The second item",
     ["1. The first item", "2. The second item"]),
    # 36) List (period to mark list and period to end item)
    ("1. The first item. 2. The second item.",
     ["1. The first item.", "2. The second item."]),
    # 37) List with bullet
    ("• 9. The first item • 10. The second item",
     ["• 9. The first item", "• 10. The second item"]),
    # 38) List with hyphen
    ("⁃9. The first item ⁃10. The second item",
     ["⁃9. The first item", "⁃10. The second item"]),
    # 39) Alphabetical list
    ("a. The first item b. The second item c. The third list item",
     ["a. The first item", "b. The second item", "c. The third list item"]),
    # 40) Geo Coordinates
    ("You can find it at N°. 1026.253.553. That is where the treasure is.",
     ["You can find it at N°. 1026.253.553.",
      "That is where the treasure is."]),
    # 41) Named entities with an exclamation point
    ("She works at Yahoo! in the accounting department.",
     ["She works at Yahoo! in the accounting department."]),
    # 42) I as a sentence boundary and I as an abbreviation
    ("We make a good team, you and I. Did you see Albert I. Jones yesterday?",
     ["We make a good team, you and I.",
      "Did you see Albert I. Jones yesterday?"]),
    # 43) Ellipsis at end of quotation
    ("Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”",
     ["Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”"]),
    # 44) Ellipsis with square brackets
    (""""Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).""",
     ['"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).']),
    # 45) Ellipsis as sentence boundary (standard ellipsis rules)
    ("If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.",
     ["If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .",
      "Next sentence."]),
    # 46) Ellipsis as sentence boundary (non-standard ellipsis rules)
    ("I never meant that.... She left the store.",
     ["I never meant that....", "She left the store."]),
    # 47) Ellipsis as non sentence boundary
    ("I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.",
     ["I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it."]),
    # 48) 4-dot ellipsis
    ("One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .",
     ["One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.",
      ". . . The practice was not abandoned. . . ."])
]
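
Each entry pairs an input string with the exact sentence list a segmenter should produce; benchmark.py counts one point per exactly matched pair. To see why the rules are non-trivial, here is a deliberately naive baseline (hypothetical code, not part of this PR) that treats every ". " as a boundary; it passes only a handful of the simplest period-delimited rules and fails abbreviation cases such as #4-#17:

from english_golden_rules import GOLDEN_EN_RULES

def naive_tokenize(text):
    # Hypothetical baseline: split on ". " and restore the stripped periods.
    parts = text.split(". ")
    return [p + "." for p in parts[:-1]] + [parts[-1]]

hits = sum(naive_tokenize(text) == expected for text, expected in GOLDEN_EN_RULES)
print("{}/{} rules passed".format(hits, len(GOLDEN_EN_RULES)))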

benchmarks/genia_benchmark.py

+100 lines
@@ -0,0 +1,100 @@
import blingfire
import nltk
import pysbd
import spacy
import stanza

import syntok
from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter

from pathlib import Path

pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
# stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

syntok_tokenizer = Tokenizer()

def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')

def nltk_tokenize(text):
    return nltk.sent_tokenize(text)

def pysbd_tokenize(text):
    return pysbd_segmenter.segment(text)

def spacy_tokenize(text):
    return [sent.text.strip("\n") for sent in nlp(text).sents]

def spacy_dep_tokenize(text):
    return [sent.text.strip("\n") for sent in nlp_dep(text).sents]

def stanza_tokenize(text):
    return [e.text for e in stanza_nlp(text).sentences]

def make_sentences(segmented_tokens):
    for sentence in segmented_tokens:
        yield "".join(str(token) for token in sentence).strip()

def syntok_tokenize(text):
    tokens = syntok_tokenizer.split(text)
    result = syntok_segmenter.split(iter(tokens))
    segments = [sent for sent in make_sentences(result)]
    return segments

def load_genia_corpus(genia_raw_dir):
    txtfiles = Path(genia_raw_dir).glob("**/*.txt")
    txtfiles = list(txtfiles)
    all_docs = []
    for ind, txtfile in enumerate(txtfiles, start=1):
        with open(txtfile) as f:
            geniatext = f.read().strip()
        expected = geniatext.split('\n')
        all_docs.append((geniatext, expected))

    return all_docs

def benchmark(docs, tokenize_func):
    correct = 0
    for (text, expected) in docs:
        segments = tokenize_func(text)
        if segments == expected:
            correct += 1
    return correct


if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--genia',
        help="Path to the directory containing genia data."
    )

    args = parser.parse_args()

    libraries = (
        blingfire_tokenize,
        nltk_tokenize,
        pysbd_tokenize,
        spacy_tokenize,
        spacy_dep_tokenize,
        stanza_tokenize,
        syntok_tokenize
    )

    docs = load_genia_corpus(args.genia)
    total = len(docs)
    for tokenize_func in libraries:
        correct = benchmark(docs, tokenize_func)
        percent_score = correct/total * 100
        print()
        print(tokenize_func.__name__)
        print('GENIA abstract acc: {:0.2f}%'.format(percent_score))
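
load_genia_corpus expects a directory tree of *.txt files, one GENIA abstract per file with one gold sentence per line, and benchmark credits an abstract only when the tokenizer reproduces that line split exactly. A usage sketch with a placeholder path (the helpers are the ones defined above):

# From the repo root, with the raw GENIA text in a local directory:
#   python benchmarks/genia_benchmark.py --genia /path/to/genia_raw
#
# Or programmatically:
docs = load_genia_corpus("/path/to/genia_raw")  # [(abstract_text, [gold sentences]), ...]
correct = benchmark(docs, pysbd_tokenize)
print("{}/{} abstracts segmented exactly".format(correct, len(docs)))

Because scoring is whole-document exact match, one boundary error anywhere in an abstract marks the entire document wrong, a stricter criterion than per-boundary accuracy.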
