Skip to content

Commit f7c640f

Browse files
Reduce some calls to re.sub (#50)
Reduce some calls to re.sub
2 parents 16e8683 + 1ba5f71 commit f7c640f

File tree

4 files changed

+73
-25
lines changed

4 files changed

+73
-25
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -40,3 +40,7 @@
4040
# v0.2.2
4141

4242
- 🐛 Fix unbalanced parenthesis - \#47
43+
44+
# v0.2.3
45+
46+
- 🐛 Performance improvement in `abbreviation_replacer` - \#50

examples/test_timing_script.py

+21
Large diffs are not rendered by default.

pysbd/abbreviation_replacer.py

+47-24
Original file line numberDiff line numberDiff line change
@@ -1,48 +1,63 @@
11
# -*- coding: utf-8 -*-
22
import re
33
from pysbd.utils import Text
4+
45
# TODO: SENTENCE_STARTERS should be lang specific
56
from pysbd.lang.standard import Abbreviation, SENTENCE_STARTERS
6-
from pysbd.lang.common.numbers import (Common, SingleLetterAbbreviationRules,
7-
AmPmRules)
7+
from pysbd.lang.common.numbers import Common, SingleLetterAbbreviationRules, AmPmRules
88

99

1010
def replace_pre_number_abbr(txt, abbr):
11-
txt = re.sub(r'(?<=\s{abbr})\.(?=\s\d)|(?<=^{abbr})\.(?=\s\d)'.format(abbr=abbr.strip()), "∯", txt)
12-
txt = re.sub(r'(?<=\s{abbr})\.(?=\s+\()|(?<=^{abbr})\.(?=\s+\()'.format(abbr=abbr.strip()), "∯", txt)
11+
# prepend a space to avoid needing another regex for start of string
12+
txt = " " + txt
13+
txt = re.sub(r"(?<=\s{abbr})\.(?=(\s\d|\s+\())".format(abbr=abbr.strip()), "∯", txt)
14+
# remove the prepended space
15+
txt = txt[1:]
1316
return txt
1417

1518

1619
def replace_prepositive_abbr(txt, abbr):
17-
txt = re.sub(r'(?<=\s{abbr})\.(?=\s)|(?<=^{abbr})\.(?=\s)'.format(abbr=abbr.strip()), "∯", txt)
18-
txt = re.sub(r'(?<=\s{abbr})\.(?=:\d+)|(?<=^{abbr})\.(?=:\d+)'.format(abbr=abbr.strip()), "∯", txt)
20+
# prepend a space to avoid needing another regex for start of string
21+
txt = " " + txt
22+
txt = re.sub(r"(?<=\s{abbr})\.(?=(\s|:\d+))".format(abbr=abbr.strip()), "∯", txt)
23+
# remove the prepended space
24+
txt = txt[1:]
1925
return txt
2026

2127

2228
def replace_period_of_abbr(txt, abbr):
23-
txt = re.sub(r"(?<=\s{abbr})\.(?=((\.|\:|-|\?)|(\s([a-z]|I\s|I'm|I'll|\d|\())))|(?<=^{abbr})\.(?=((\.|\:|\?)|(\s([a-z]|I\s|I'm|I'll|\d))))".format(abbr=abbr.strip()), '∯', txt)
24-
txt = re.sub(r"(?<=\s{abbr})\.(?=,)|(?<=^{abbr})\.(?=,)".format(abbr=abbr.strip()), '∯', txt)
29+
# prepend a space to avoid needing another regex for start of string
30+
txt = " " + txt
31+
txt = re.sub(
32+
r"(?<=\s{abbr})\.(?=((\.|\:|-|\?|,)|(\s([a-z]|I\s|I'm|I'll|\d|\())))".format(
33+
abbr=abbr.strip()
34+
),
35+
"∯",
36+
txt,
37+
)
38+
# remove the prepended space
39+
txt = txt[1:]
2540
return txt
2641

2742

2843
def replace_abbreviation_as_sentence_boundary(txt):
29-
for word in SENTENCE_STARTERS:
30-
escaped = re.escape(word)
31-
regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯(?=\s{}\s)".format(escaped)
32-
txt = re.sub(regex, '\\1.', txt)
44+
sent_starters = "|".join((r"(?=\s{}\s)".format(word) for word in SENTENCE_STARTERS))
45+
regex = r"(U∯S|U\.S|U∯K|E∯U|E\.U|U∯S∯A|U\.S\.A|I|i.v|I.V)∯({})".format(sent_starters)
46+
txt = re.sub(regex, '\\1.', txt)
3347
return txt
3448

3549

3650
class AbbreviationReplacer(object):
37-
38-
def __init__(self, text, language='en'):
51+
def __init__(self, text, language="en"):
3952
self.text = text
4053
self.language = language
4154

4255
def replace(self):
43-
self.text = Text(self.text).apply(Common.PossessiveAbbreviationRule,
44-
Common.KommanditgesellschaftRule,
45-
*SingleLetterAbbreviationRules.All)
56+
self.text = Text(self.text).apply(
57+
Common.PossessiveAbbreviationRule,
58+
Common.KommanditgesellschaftRule,
59+
*SingleLetterAbbreviationRules.All
60+
)
4661
self.text = self.search_for_abbreviations_in_string()
4762
self.replace_multi_period_abbreviations()
4863
self.text = Text(self.text).apply(*AmPmRules.All)
@@ -52,9 +67,15 @@ def replace(self):
5267
def replace_multi_period_abbreviations(self):
5368
def mpa_replace(match):
5469
match = match.group()
55-
match = re.sub(re.escape(r'.'), '∯', match)
70+
match = re.sub(re.escape(r"."), "∯", match)
5671
return match
57-
self.text = re.sub(Common.MULTI_PERIOD_ABBREVIATION_REGEX, mpa_replace, self.text, flags=re.IGNORECASE)
72+
73+
self.text = re.sub(
74+
Common.MULTI_PERIOD_ABBREVIATION_REGEX,
75+
mpa_replace,
76+
self.text,
77+
flags=re.IGNORECASE,
78+
)
5879

5980
def search_for_abbreviations_in_string(self):
6081
original = self.text
@@ -64,25 +85,27 @@ def search_for_abbreviations_in_string(self):
6485
if stripped not in lowered:
6586
continue
6687
abbrev_match = re.findall(
67-
r'(?:^|\s|\r|\n){}'.format(stripped), original,
68-
flags=re.IGNORECASE)
88+
r"(?:^|\s|\r|\n){}".format(stripped), original, flags=re.IGNORECASE
89+
)
6990
if not abbrev_match:
7091
continue
7192
next_word_start = r"(?<={" + str(re.escape(stripped)) + "} ).{1}"
7293
char_array = re.findall(next_word_start, self.text)
7394
for ind, match in enumerate(abbrev_match):
74-
self.text = self.scan_for_replacements(self.text, match, ind, char_array)
95+
self.text = self.scan_for_replacements(
96+
self.text, match, ind, char_array
97+
)
7598
return self.text
7699

77100
def scan_for_replacements(self, txt, am, ind, char_array):
78101
try:
79102
char = char_array[ind]
80103
except IndexError:
81-
char = ''
104+
char = ""
82105
prepositive = Abbreviation.PREPOSITIVE_ABBREVIATIONS
83106
number_abbr = Abbreviation.NUMBER_ABBREVIATIONS
84107
upper = str(char).isupper()
85-
if (not upper or am.strip().lower() in prepositive):
108+
if not upper or am.strip().lower() in prepositive:
86109
if am.strip().lower() in prepositive:
87110
txt = replace_prepositive_abbr(txt, am)
88111
elif am.strip().lower() in number_abbr:

pysbd/about.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/
33

44
__title__ = "pysbd"
5-
__version__ = "0.2.2"
5+
__version__ = "0.2.3"
66
__summary__ = "pysbd (Python Sentence Boundary Disambiguation) is a rule-based sentence boundary detection that works out-of-the-box across many languages."
77
__uri__ = "http://nipunsadvilkar.github.io/"
88
__author__ = "Nipun Sadvilkar"

0 commit comments

Comments
 (0)