Skip to content

Commit fc02f7a

Browse files
committed
Add regexp-engine
1 parent cc3856c commit fc02f7a

File tree

1 file changed

+150
-0
lines changed

1 file changed

+150
-0
lines changed

regexp-engine/regexp.py

+150
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
from typing import Optional, Iterator, Tuple
2+
import unittest
3+
4+
5+
class Matcher:
    """Base interface shared by every node of a compiled pattern.

    Subclasses override :meth:`match` to report how many leading characters
    of the input they are able to consume.
    """

    def match(self, string: str) -> Iterator[int]:
        """Yield each candidate number of leading characters of ``string`` matched.

        Raises:
            NotImplementedError: always; concrete subclasses supply the logic.
        """
        raise NotImplementedError
8+
9+
10+
class ZeroMatcher(Matcher):
    """Matcher for the empty pattern: always succeeds and consumes nothing."""

    def match(self, string: str) -> Iterator[int]:
        """Yield the single length ``0`` regardless of what ``string`` holds."""
        yield from (0,)
13+
14+
15+
class CharacterMatcher(Matcher):
    """Matcher that accepts one specific character at the start of the input."""

    def __init__(self, ch: str) -> None:
        """Remember ``ch``, the character the input has to begin with."""
        self._ch = ch

    def match(self, string: str) -> Iterator[int]:
        """Yield ``1`` if ``string`` is non-empty and starts with the stored character."""
        # Bail out without yielding anything when there is no match.
        if not string or string[0] != self._ch:
            return
        yield 1
22+
23+
24+
class AnyCharacterMatcher(Matcher):
    """Matcher for ``.``: accepts whatever single character comes first."""

    def match(self, string: str) -> Iterator[int]:
        """Yield ``1`` when ``string`` has at least one character; otherwise nothing."""
        if not string:
            return
        yield 1
28+
29+
30+
class RepeatMatcher(Matcher):
    """Matcher for ``inner*``: zero or more consecutive matches of ``inner``."""

    def __init__(self, inner: Matcher) -> None:
        """Remember ``inner``, the matcher that is applied repeatedly."""
        self._inner = inner

    def match(self, string: str) -> Iterator[int]:
        """Yield every total length that repeated matches of ``inner`` can consume.

        Each chain of repetitions is followed as far as it goes before a
        shorter alternative is reported, so longer totals come out before
        the totals of their own prefixes, and ``0`` (no repetition at all)
        is always the final value.

        The search keeps an explicit stack instead of recursing once per
        repetition, so an input with a very long run of repetitions does not
        exhaust the interpreter's recursion limit.
        """
        # Each entry pairs the number of characters consumed so far with the
        # partially consumed iterator of inner matches that start there.
        # ``iter`` guarantees that resuming the ``for`` loop continues where
        # it stopped instead of restarting.
        pending = [(0, iter(self._inner.match(string)))]
        while pending:
            consumed, steps = pending[-1]
            for step in steps:
                if step == 0:
                    # An empty inner match makes no progress; report the
                    # current position instead of descending, which would
                    # otherwise loop forever.
                    yield consumed
                else:
                    position = consumed + step
                    pending.append(
                        (position, iter(self._inner.match(string[position:])))
                    )
                    break
            else:
                # Every way of adding one more repetition at this position
                # has been explored; stopping here is the last option.
                pending.pop()
                yield consumed
42+
43+
44+
class ConcatenationMatcher(Matcher):
    """Matcher for ``head`` immediately followed by ``tail``."""

    def __init__(self, head: Matcher, tail: Matcher) -> None:
        """Store the two matchers in the order they must match."""
        self._head = head
        self._tail = tail

    def match(self, string: str) -> Iterator[int]:
        """Yield the combined length of each ``head`` match followed by a ``tail`` match.

        For every length produced by ``head``, ``tail`` is tried on the text
        that follows it; all of its results are reported before moving on to
        the next ``head`` candidate.
        """
        for head_length in self._head.match(string):
            remainder = string[head_length:]
            yield from (
                head_length + tail_length
                for tail_length in self._tail.match(remainder)
            )
53+
54+
55+
class AlternationMatcher(Matcher):
    """Matcher for ``left|right``: either alternative may match."""

    def __init__(self, left: Matcher, right: Matcher) -> None:
        """Store both alternatives; ``left`` is the one tried first."""
        self._left = left
        self._right = right

    def match(self, string: str) -> Iterator[int]:
        """Yield every length matched by ``left``, then every length matched by ``right``."""
        yield from self._left.match(string)
        yield from self._right.match(string)
65+
66+
67+
def _compile_character(pattern: str) -> Tuple[Matcher, str]:
    """Parse a single atom from the front of ``pattern``.

    An atom is a parenthesised group, ``.``, a backslash-escaped character,
    or a plain character. When ``pattern`` is empty or starts with ``|`` or
    ``)``, nothing is consumed and an empty matcher is returned.

    Returns:
        The matcher for the atom and the part of ``pattern`` left unparsed.

    Raises:
        ValueError: if a ``(`` group is not followed by a closing ``)``.
    """
    first = pattern[:1]  # '' when the pattern is exhausted

    # End of input, or a delimiter that belongs to an enclosing rule.
    if first in ('', '|', ')'):
        return ZeroMatcher(), pattern

    if first == '(':
        group, remainder = _compile_alternation(pattern[1:])
        if remainder[:1] != ')':
            raise ValueError("カッコが対応していません。")
        return group, remainder[1:]

    if first == '.':
        return AnyCharacterMatcher(), pattern[1:]

    # A backslash escapes the next character; a lone trailing backslash
    # falls through and is matched literally.
    if first == '\\' and len(pattern) > 1:
        return CharacterMatcher(pattern[1]), pattern[2:]

    return CharacterMatcher(first), pattern[1:]
82+
83+
84+
def _compile_quantifier(pattern: str) -> Tuple[Matcher, str]:
    """Parse one atom together with an optional ``*`` that follows it.

    Returns:
        The matcher (wrapped in a repeat when ``*`` was present) and the
        part of ``pattern`` left unparsed.
    """
    atom, remainder = _compile_character(pattern)
    if remainder[:1] != '*':
        return atom, remainder
    return RepeatMatcher(atom), remainder[1:]
89+
90+
91+
def _compile_concatenation(pattern: str) -> Tuple[Matcher, str]:
    """Parse a run of quantified atoms that must match one after another.

    Parsing stops at the end of ``pattern`` or at the first ``|`` or ``)``.

    Returns:
        The matcher for the whole run and the part of ``pattern`` left
        unparsed.
    """
    sequence, remainder = _compile_quantifier(pattern)
    # ``remainder[:1]`` is '' once the pattern is exhausted.
    while remainder[:1] not in ('', '|', ')'):
        following, remainder = _compile_quantifier(remainder)
        sequence = ConcatenationMatcher(sequence, following)
    return sequence, remainder
97+
98+
99+
def _compile_alternation(pattern: str) -> Tuple[Matcher, str]:
    """Parse one or more concatenations separated by ``|``.

    Returns:
        The matcher combining all alternatives (earlier ones are tried
        first) and the part of ``pattern`` left unparsed.
    """
    union, remainder = _compile_concatenation(pattern)
    while remainder[:1] == '|':
        branch, remainder = _compile_concatenation(remainder[1:])
        union = AlternationMatcher(union, branch)
    return union, remainder
105+
106+
107+
def _compile(pattern: str) -> Matcher:
    """Compile an entire pattern string into a matcher tree.

    Raises:
        ValueError: if part of ``pattern`` is left over after parsing
            (for example an unmatched ``)``).
    """
    matcher, leftover = _compile_alternation(pattern)
    if leftover:
        raise ValueError("なんかおかしい")
    return matcher
112+
113+
114+
def regexp_match(pattern: str, string: str) -> Optional[str]:
    """Match ``pattern`` against the beginning of ``string``.

    Returns:
        The prefix of ``string`` covered by the first match the compiled
        matcher produces, or ``None`` when it produces no match.

    Raises:
        ValueError: if ``pattern`` cannot be compiled.
    """
    candidates = iter(_compile(pattern).match(string))
    first_length = next(candidates, None)
    if first_length is None:
        return None
    return string[:first_length]
119+
120+
121+
class TestRegexp(unittest.TestCase):
    """Table-driven tests for ``regexp_match``."""

    def test_regexp(self) -> None:
        """Check each ``(pattern, input, expected prefix or None)`` case."""
        cases = [
            ("", "abc", ""),
            ("", "", ""),
            ("hello", "hello", "hello"),
            ("hello", "world", None),
            ("...", "Beer", "Bee"),
            ("...", "He", None),
            ("foo|bar", "barxxx", "bar"),
            ("foo|bar", "buzzxxx", None),
            ("a*", "aaaaa", "aaaaa"),
            ("a*", "bbbbb", ""),
            ("c(abc)*", "cabcabcd", "cabcabc"),
            ("c(abc)*", "cabacaabcd", "c"),
            ("(hello|world)*", "hellohelloworldhelloheywww", "hellohelloworldhello"),
            (".*Foo.*Bar", "This is Foo and that is Bar.", "This is Foo and that is Bar"),
            (".*Foo.*Bar", "This is Bar and that is Foo.", None),
            ("(0|1|2|3|4|5|6|7|8|9)* yen", "972 yen.", "972 yen"),
            ("(0|1|2|3|4|5|6|7|8|9)* yen", "972 dollers.", None),
            ("c(a*b*)*d", "caaabbbbbbabaaabdaaaaa", "caaabbbbbbabaaabd"),
            ("(a|b)(a|b)*|c", "cabab", "c"),
            (r"\(foo\|bar\)\\\\", r"(foo|bar)\\", r"(foo|bar)\\"),
        ]
        for pattern, string, expected in cases:
            # subTest lets the remaining cases run after a failure and
            # labels each failure with the pattern and input that caused it.
            with self.subTest(pattern=pattern, string=string):
                self.assertEqual(expected, regexp_match(pattern, string))
147+
148+
149+
# Run the test suite only when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()

0 commit comments

Comments
 (0)