1
1
# -*- coding: utf-8 -*-
2
2
import re
3
3
from pysbd .utils import Text
4
+
4
5
# TODO: SENTENCE_STARTERS should be lang specific
5
6
from pysbd .lang .standard import Abbreviation , SENTENCE_STARTERS
6
- from pysbd .lang .common .numbers import (Common , SingleLetterAbbreviationRules ,
7
- AmPmRules )
7
+ from pysbd .lang .common .numbers import Common , SingleLetterAbbreviationRules , AmPmRules
8
8
9
9
10
10
def replace_pre_number_abbr(txt, abbr):
    """Mask the period that follows a number-introducing abbreviation.

    The period directly after ``abbr`` is replaced with "∯" when the
    abbreviation is preceded by whitespace (or opens the text) and the period
    is followed either by one whitespace character and a digit, or by
    whitespace and an opening parenthesis.

    Args:
        txt: Text to scan.
        abbr: Abbreviation as it appears in the text; surrounding whitespace
            is stripped before matching.

    Returns:
        ``txt`` with every qualifying period replaced by "∯".
    """
    # Escape the abbreviation so characters such as "." or "+" are matched
    # literally instead of being interpreted as regex syntax.
    token = re.escape(abbr.strip())
    pattern = r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(abbr=token)
    # A temporary leading space lets the single ``\s`` look-behind also cover
    # an abbreviation at the very start of the text; it is sliced off again.
    return re.sub(pattern, "∯", " " + txt)[1:]
14
17
15
18
16
19
def replace_prepositive_abbr(txt, abbr):
    """Mask the period that follows a prepositive abbreviation.

    The period directly after ``abbr`` is replaced with "∯" when the
    abbreviation is preceded by whitespace (or opens the text) and the period
    is followed by whitespace or by a colon and at least one digit.

    Args:
        txt: Text to scan.
        abbr: Abbreviation as it appears in the text; surrounding whitespace
            is stripped before matching.

    Returns:
        ``txt`` with every qualifying period replaced by "∯".
    """
    # Escape the abbreviation so regex metacharacters in it match literally.
    token = re.escape(abbr.strip())
    pattern = r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(abbr=token)
    # A temporary leading space lets the single ``\s`` look-behind also cover
    # an abbreviation at the very start of the text; it is sliced off again.
    return re.sub(pattern, "∯", " " + txt)[1:]
20
26
21
27
22
28
def replace_period_of_abbr(txt, abbr):
    """Mask the period after an abbreviation that does not end a sentence.

    The period directly after ``abbr`` is replaced with "∯" when the
    abbreviation is preceded by whitespace (or opens the text) and the period
    is followed by one of:

    * another punctuation mark: ``.``, ``:``, ``-``, ``?`` or ``,``;
    * whitespace and then a lowercase ASCII letter, ``I`` plus whitespace,
      ``I'm``, ``I'll``, a digit, or an opening parenthesis.

    Args:
        txt: Text to scan.
        abbr: Abbreviation as it appears in the text; surrounding whitespace
            is stripped before matching.

    Returns:
        ``txt`` with every qualifying period replaced by "∯".
    """
    # Escape the abbreviation so regex metacharacters in it match literally.
    token = re.escape(abbr.strip())
    pattern = (
        r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))"
    ).format(abbr=token)
    # A temporary leading space lets the single ``\s`` look-behind also cover
    # an abbreviation at the very start of the text; it is sliced off again.
    return re.sub(pattern, "∯", " " + txt)[1:]
26
41
27
42
28
43
def replace_abbreviation_as_sentence_boundary(txt):
    """Turn a masked abbreviation period back into a real sentence boundary.

    When one of a fixed set of abbreviations (U.S, U.K, E.U, U.S.A, I, i.v,
    I.V, in plain or "∯"-masked spelling) is followed by "∯" and then by a
    whitespace-delimited word from ``SENTENCE_STARTERS``, the "∯" is replaced
    with ".".

    Args:
        txt: Text in which abbreviation periods were previously masked.

    Returns:
        ``txt`` with the qualifying "∯" characters replaced by ".".
    """
    # Escape every starter so it is matched literally inside the pattern.
    starters = "|".join(re.escape(word) for word in SENTENCE_STARTERS)
    if not starters:
        # With no starters the look-ahead would be empty and match everywhere.
        return txt
    # NOTE(review): the dots in "i.v" / "I.V" are left unescaped as in the
    # original pattern; they also match the masked spellings "i∯v" / "I∯V".
    regex = (
        r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s(?:{})\s)"
    ).format(starters)
    return re.sub(regex, r"\1.", txt)
34
48
35
49
36
50
class AbbreviationReplacer (object ):
37
-
38
- def __init__ (self , text , language = 'en' ):
51
+ def __init__ (self , text , language = "en" ):
39
52
self .text = text
40
53
self .language = language
41
54
42
55
def replace (self ):
43
- self .text = Text (self .text ).apply (Common .PossessiveAbbreviationRule ,
44
- Common .KommanditgesellschaftRule ,
45
- * SingleLetterAbbreviationRules .All )
56
+ self .text = Text (self .text ).apply (
57
+ Common .PossessiveAbbreviationRule ,
58
+ Common .KommanditgesellschaftRule ,
59
+ * SingleLetterAbbreviationRules .All
60
+ )
46
61
self .text = self .search_for_abbreviations_in_string ()
47
62
self .replace_multi_period_abbreviations ()
48
63
self .text = Text (self .text ).apply (* AmPmRules .All )
@@ -52,9 +67,15 @@ def replace(self):
52
67
def replace_multi_period_abbreviations(self):
    """Mask every period inside multi-period abbreviations in ``self.text``.

    Each case-insensitive match of ``Common.MULTI_PERIOD_ABBREVIATION_REGEX``
    has all of its "." characters replaced with "∯"; the result is written
    back to ``self.text``.
    """

    def _mask_periods(hit):
        # Plain string replacement of the literal dot within the matched text.
        return hit.group().replace(".", "∯")

    self.text = re.sub(
        Common.MULTI_PERIOD_ABBREVIATION_REGEX,
        _mask_periods,
        self.text,
        flags=re.IGNORECASE,
    )
58
79
59
80
def search_for_abbreviations_in_string (self ):
60
81
original = self .text
@@ -64,25 +85,27 @@ def search_for_abbreviations_in_string(self):
64
85
if stripped not in lowered :
65
86
continue
66
87
abbrev_match = re .findall (
67
- r' (?:^|\s|\r|\n){}' .format (stripped ), original ,
68
- flags = re . IGNORECASE )
88
+ r" (?:^|\s|\r|\n){}" .format (stripped ), original , flags = re . IGNORECASE
89
+ )
69
90
if not abbrev_match :
70
91
continue
71
92
next_word_start = r"(?<={" + str (re .escape (stripped )) + "} ).{1}"
72
93
char_array = re .findall (next_word_start , self .text )
73
94
for ind , match in enumerate (abbrev_match ):
74
- self .text = self .scan_for_replacements (self .text , match , ind , char_array )
95
+ self .text = self .scan_for_replacements (
96
+ self .text , match , ind , char_array
97
+ )
75
98
return self .text
76
99
77
100
def scan_for_replacements (self , txt , am , ind , char_array ):
78
101
try :
79
102
char = char_array [ind ]
80
103
except IndexError :
81
- char = ''
104
+ char = ""
82
105
prepositive = Abbreviation .PREPOSITIVE_ABBREVIATIONS
83
106
number_abbr = Abbreviation .NUMBER_ABBREVIATIONS
84
107
upper = str (char ).isupper ()
85
- if ( not upper or am .strip ().lower () in prepositive ) :
108
+ if not upper or am .strip ().lower () in prepositive :
86
109
if am .strip ().lower () in prepositive :
87
110
txt = replace_prepositive_abbr (txt , am )
88
111
elif am .strip ().lower () in number_abbr :
0 commit comments