|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +GOLDEN_EN_RULES = [ |
| 4 | + # 1) Simple period to end sentence -- NOTE: every entry in this list is a (text, [sentence, ...]) tuple, and the numbered comment above each entry names the case it covers |
| 5 | + ("Hello World. My name is Jonas.", ["Hello World.", "My name is Jonas."]), |
| 6 | + # 2) Question mark to end sentence |
| 7 | + ("What is your name? My name is Jonas.", ["What is your name?", "My name is Jonas."]), |
| 8 | + # 3) Exclamation point to end sentence |
| 9 | + ("There it is! I found it.", ["There it is!", "I found it."]), |
| 10 | + # 4) One letter upper case abbreviations |
| 11 | + ("My name is Jonas E. Smith.", ["My name is Jonas E. Smith."]), |
| 12 | + # 5) One letter lower case abbreviations |
| 13 | + ("Please turn to p. 55.", ["Please turn to p. 55."]), |
| 14 | + # 6) Two letter lower case abbreviations in the middle of a sentence |
| 15 | + ("Were Jane and co. at the party?", ["Were Jane and co. at the party?"]), |
| 16 | + # 7) Two letter upper case abbreviations in the middle of a sentence |
| 17 | + ("They closed the deal with Pitt, Briggs & Co. at noon.", |
| 18 | + ["They closed the deal with Pitt, Briggs & Co. at noon."]), |
| 19 | + # 8) Two letter lower case abbreviations at the end of a sentence |
| 20 | + ( |
| 21 | + "Let's ask Jane and co. They should know.", |
| 22 | + ["Let's ask Jane and co.", "They should know."]), |
| 23 | + # 9) Two letter upper case abbreviations at the end of a sentence |
| 24 | + ( |
| 25 | + "They closed the deal with Pitt, Briggs & Co. It closed yesterday.", [ |
| 26 | + "They closed the deal with Pitt, Briggs & Co.", |
| 27 | + "It closed yesterday." |
| 28 | + ], |
| 29 | + ), |
| 30 | + # 10) Two letter (prepositive) abbreviations |
| 31 | + ("I can see Mt. Fuji from here.", ["I can see Mt. Fuji from here."]), |
| 32 | + # 11) Two letter (prepositive & postpositive) abbreviations |
| 33 | + ( |
| 34 | + "St. Michael's Church is on 5th st. near the light.", |
| 35 | + ["St. Michael's Church is on 5th st. near the light."], |
| 36 | + ), |
| 37 | + # 12) Possessive two letter abbreviations |
| 38 | + ("That is JFK Jr.'s book.", ["That is JFK Jr.'s book."]), |
| 39 | + # 13) Multi-period abbreviations in the middle of a sentence |
| 40 | + ("I visited the U.S.A. last year.", ["I visited the U.S.A. last year."]), |
| 41 | + # 14) Multi-period abbreviations at the end of a sentence |
| 42 | + ( |
| 43 | + "I live in the E.U. How about you?", |
| 44 | + ["I live in the E.U.", "How about you?"], |
| 45 | + ), |
| 46 | + # 15) U.S. as sentence boundary |
| 47 | + ( |
| 48 | + "I live in the U.S. How about you?", |
| 49 | + ["I live in the U.S.", "How about you?"], |
| 50 | + ), |
| 51 | + # 16) U.S. as non sentence boundary with next word capitalized |
| 52 | + ("I work for the U.S. Government in Virginia.", |
| 53 | + ["I work for the U.S. Government in Virginia."]), |
| 54 | + # 17) U.S. as non sentence boundary |
| 55 | + ("I have lived in the U.S. for 20 years.", |
| 56 | + ["I have lived in the U.S. for 20 years."]), |
| 57 | + # The next case (18) is flagged as the most difficult sentence to crack |
| 58 | + # 18) A.M. / P.M. as non sentence boundary and sentence boundary |
| 59 | + ( |
| 60 | + "At 5 a.m. Mr. Smith went to the bank. He left the bank at 6 P.M. Mr. Smith then went to the store.", |
| 61 | + [ |
| 62 | + "At 5 a.m. Mr. Smith went to the bank.", |
| 63 | + "He left the bank at 6 P.M.", "Mr. Smith then went to the store." |
| 64 | + ] |
| 65 | + ), |
| 66 | + # 19) Number as non sentence boundary |
| 67 | + ("She has $100.00 in her bag.", ["She has $100.00 in her bag."]), |
| 68 | + # 20) Number as sentence boundary |
| 69 | + ("She has $100.00. It is in her bag.", ["She has $100.00.", "It is in her bag."]), |
| 70 | + # 21) Parenthetical inside sentence |
| 71 | + ("He teaches science (He previously worked for 5 years as an engineer.) at the local University.", |
| 72 | + ["He teaches science (He previously worked for 5 years as an engineer.) at the local University."]), |
| 73 | + # 22) Email addresses -- NOTE(review): the "[email protected]" text in the strings below looks like a redacted address (no '@' is present), so this case may not exercise a real email; confirm against the original fixture |
| 74 | + ( "Her email is [email protected]. I sent her an email.", |
| 75 | + [ "Her email is [email protected].", "I sent her an email."]), |
| 76 | + # 23) Web addresses |
| 77 | + ("The site is: https://www.example.50.com/new-site/awesome_content.html. Please check it out.", |
| 78 | + ["The site is: https://www.example.50.com/new-site/awesome_content.html.", |
| 79 | + "Please check it out."]), |
| 80 | + # 24) Single quotations inside sentence |
| 81 | + ( |
| 82 | + "She turned to him, 'This is great.' she said.", |
| 83 | + ["She turned to him, 'This is great.' she said."], |
| 84 | + ), |
| 85 | + # 25) Double quotations inside sentence |
| 86 | + ( |
| 87 | + 'She turned to him, "This is great." she said.', |
| 88 | + ['She turned to him, "This is great." she said.'], |
| 89 | + ), |
| 90 | + # 26) Double quotations at the end of a sentence |
| 91 | + ( |
| 92 | + 'She turned to him, "This is great." She held the book out to show him.', |
| 93 | + [ |
| 94 | + 'She turned to him, "This is great."', |
| 95 | + "She held the book out to show him." |
| 96 | + ], |
| 97 | + ), |
| 98 | + # 27) Double punctuation (exclamation point) |
| 99 | + ("Hello!! Long time no see.", ["Hello!!", "Long time no see."]), |
| 100 | + # 28) Double punctuation (question mark) |
| 101 | + ("Hello?? Who is there?", ["Hello??", "Who is there?"]), |
| 102 | + # 29) Double punctuation (exclamation point / question mark) |
| 103 | + ("Hello!? Is that you?", ["Hello!?", "Is that you?"]), |
| 104 | + # 30) Double punctuation (question mark / exclamation point) |
| 105 | + ("Hello?! Is that you?", ["Hello?!", "Is that you?"]), |
| 106 | + # 31) List (period followed by parens and no period to end item) |
| 107 | + ( |
| 108 | + "1.) The first item 2.) The second item", |
| 109 | + ["1.) The first item", "2.) The second item"], |
| 110 | + ), |
| 111 | + # 32) List (period followed by parens and period to end item) |
| 112 | + ( |
| 113 | + "1.) The first item. 2.) The second item.", |
| 114 | + ["1.) The first item.", "2.) The second item."], |
| 115 | + ), |
| 116 | + # 33) List (parens and no period to end item) |
| 117 | + ( |
| 118 | + "1) The first item 2) The second item", |
| 119 | + ["1) The first item", "2) The second item"], |
| 120 | + ), |
| 121 | + # 34) List (parens and period to end item) |
| 122 | + ("1) The first item. 2) The second item.", |
| 123 | + ["1) The first item.", "2) The second item."]), |
| 124 | + # 35) List (period to mark list and no period to end item) |
| 125 | + ( |
| 126 | + "1. The first item 2. The second item", |
| 127 | + ["1. The first item", "2. The second item"], |
| 128 | + ), |
| 129 | + # 36) List (period to mark list and period to end item) |
| 130 | + ( |
| 131 | + "1. The first item. 2. The second item.", |
| 132 | + ["1. The first item.", "2. The second item."], |
| 133 | + ), |
| 134 | + # 37) List with bullet |
| 135 | + ( |
| 136 | + "• 9. The first item • 10. The second item", |
| 137 | + ["• 9. The first item", "• 10. The second item"], |
| 138 | + ), |
| 139 | + # 38) List with hyphen |
| 140 | + ( |
| 141 | + "⁃9. The first item ⁃10. The second item", |
| 142 | + ["⁃9. The first item", "⁃10. The second item"], |
| 143 | + ), |
| 144 | + # 39) Alphabetical list |
| 145 | + ( |
| 146 | + "a. The first item b. The second item c. The third list item", |
| 147 | + ["a. The first item", "b. The second item", "c. The third list item"], |
| 148 | + ), |
| 149 | + # 40) Geo Coordinates |
| 150 | + ( |
| 151 | + "You can find it at N°. 1026.253.553. That is where the treasure is.", |
| 152 | + [ |
| 153 | + "You can find it at N°. 1026.253.553.", |
| 154 | + "That is where the treasure is." |
| 155 | + ], |
| 156 | + ), |
| 157 | + # 41) Named entities with an exclamation point |
| 158 | + ( |
| 159 | + "She works at Yahoo! in the accounting department.", |
| 160 | + ["She works at Yahoo! in the accounting department."], |
| 161 | + ), |
| 162 | + # 42) I as a sentence boundary and I as an abbreviation |
| 163 | + ( |
| 164 | + "We make a good team, you and I. Did you see Albert I. Jones yesterday?", |
| 165 | + [ |
| 166 | + "We make a good team, you and I.", |
| 167 | + "Did you see Albert I. Jones yesterday?" |
| 168 | + ], |
| 169 | + ), |
| 170 | + # 43) Ellipsis at end of quotation |
| 171 | + ( |
| 172 | + "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”", |
| 173 | + [ |
| 174 | + "Thoreau argues that by simplifying one’s life, “the laws of the universe will appear less complex. . . .”" |
| 175 | + ], |
| 176 | + ), |
| 177 | + # 44) Ellipsis with square brackets |
| 178 | + ( |
| 179 | + """"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).""", |
| 180 | + [ |
| 181 | + '"Bohr [...] used the analogy of parallel stairways [...]" (Smith 55).' |
| 182 | + ], |
| 183 | + ), |
| 184 | + # 45) Ellipsis as sentence boundary (standard ellipsis rules) |
| 185 | + ("If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . . Next sentence.", |
| 186 | + [ |
| 187 | + "If words are left off at the end of a sentence, and that is all that is omitted, indicate the omission with ellipsis marks (preceded and followed by a space) and then indicate the end of the sentence with a period . . . .", |
| 188 | + "Next sentence." |
| 189 | + ]), |
| 190 | + # 46) Ellipsis as sentence boundary (non-standard ellipsis rules) |
| 191 | + ( |
| 192 | + "I never meant that.... She left the store.", |
| 193 | + ["I never meant that....", "She left the store."], |
| 194 | + ), |
| 195 | + # 47) Ellipsis as non sentence boundary |
| 196 | + ( |
| 197 | + "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it.", |
| 198 | + [ |
| 199 | + "I wasn’t really ... well, what I mean...see . . . what I'm saying, the thing is . . . I didn’t mean it." |
| 200 | + ], |
| 201 | + ), |
| 202 | + # 48) 4-dot ellipsis |
| 203 | + ( |
| 204 | + "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds. . . . The practice was not abandoned. . . .", |
| 205 | + [ |
| 206 | + "One further habit which was somewhat weakened . . . was that of combining words into self-interpreting compounds.", |
| 207 | + ". . . The practice was not abandoned. . . ." |
| 208 | + ], |
| 209 | + ) |
| 210 | +] |
0 commit comments