-
Notifications
You must be signed in to change notification settings - Fork 141
/
test_thefuzz_hypothesis.py
153 lines (124 loc) · 5.06 KB
/
test_thefuzz_hypothesis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
from itertools import product
from functools import partial
from string import ascii_letters, digits, punctuation
from hypothesis import given, assume, settings, HealthCheck
import hypothesis.strategies as st
import pytest
from thefuzz import fuzz, process, utils
# Characters Hypothesis may use when generating test strings: ASCII letters,
# digits and punctuation only. Passed as the ``alphabet`` of every ``st.text``
# strategy in this module.
HYPOTHESIS_ALPHABET = ascii_letters + digits + punctuation
def scorers_processors():
    """
    Build the (scorer, processor) pairs used to parametrize the tests.

    ``fuzz.ratio`` and ``fuzz.partial_ratio`` are each combined with all three
    processors (identity, ``full_process`` without and with ``force_ascii``);
    every other scorer is paired with exactly one ``full_process`` variant.
    :return: [(scorer, processor), ...]
    """
    basic_scorers = (fuzz.ratio, fuzz.partial_ratio)
    basic_processors = (
        lambda x: x,
        partial(utils.full_process, force_ascii=False),
        partial(utils.full_process, force_ascii=True),
    )
    # Cartesian product, scorer-major: every basic scorer with every processor.
    pairs = [*product(basic_scorers, basic_processors)]

    # Remaining scorers, each with the force_ascii flag it is tested under.
    dedicated = (
        (fuzz.WRatio, True),
        (fuzz.QRatio, True),
        (fuzz.UWRatio, False),
        (fuzz.UQRatio, False),
        (fuzz.token_set_ratio, True),
        (fuzz.token_sort_ratio, True),
        (fuzz.partial_token_set_ratio, True),
        (fuzz.partial_token_sort_ratio, True),
    )
    for scorer, ascii_only in dedicated:
        pairs.append((scorer, partial(utils.full_process, force_ascii=ascii_only)))

    return pairs
def full_scorers_processors():
    """
    Build the (scorer, processor) pairs for scorers that compare the full string only.

    ``fuzz.ratio`` is combined with all three processors (identity,
    ``full_process`` without and with ``force_ascii``); the W/Q ratio scorers
    are each paired with exactly one ``full_process`` variant.
    :return: [(scorer, processor), ...]
    """
    basic_scorers = (fuzz.ratio,)
    basic_processors = (
        lambda x: x,
        partial(utils.full_process, force_ascii=False),
        partial(utils.full_process, force_ascii=True),
    )
    # Cartesian product: the single basic scorer with every processor.
    pairs = [*product(basic_scorers, basic_processors)]

    # Remaining scorers, each with the force_ascii flag it is tested under.
    dedicated = (
        (fuzz.WRatio, True),
        (fuzz.QRatio, True),
        (fuzz.UWRatio, False),
        (fuzz.UQRatio, False),
    )
    for scorer, ascii_only in dedicated:
        pairs.append((scorer, partial(utils.full_process, force_ascii=ascii_only)))

    return pairs
@pytest.mark.parametrize('scorer,processor',
                         scorers_processors())
@given(data=st.data())
@settings(max_examples=20, deadline=5000, suppress_health_check=[HealthCheck.data_too_large])
def test_identical_strings_extracted(scorer, processor, data):
    """
    Check that a string always scores a perfect match against itself.

    :param scorer: scoring function handed to ``process.extractBests``
    :param processor: pre-processing function handed to ``process.extractBests``
    :param data: Hypothesis data object used to draw the inputs interactively
    :return:
    """
    # Strategy for the pool of candidate strings
    candidate_lists = st.lists(
        st.text(alphabet=HYPOTHESIS_ALPHABET, min_size=10, max_size=100),
        min_size=1,
        max_size=10,
    )
    candidates = data.draw(candidate_lists)

    # Pick one of the candidates (by index) to use as the query
    last_index = len(candidates) - 1
    picked = data.draw(st.integers(min_value=0, max_value=last_index))
    query = candidates[picked]

    # Discard examples where processing reduces the query to the empty string
    assume(processor(query) != '')

    # Ask only for matches scoring 100, with no cap on how many are returned
    matches = process.extractBests(
        query,
        candidates,
        scorer=scorer,
        processor=processor,
        score_cutoff=100,
        limit=None,
    )

    # Something must come back ...
    assert matches != []

    # ... and the query itself must be among the perfect matches
    assert (query, 100) in matches
@pytest.mark.parametrize('scorer,processor',
                         full_scorers_processors())
@given(data=st.data())
@settings(max_examples=20, deadline=5000)
def test_only_identical_strings_extracted(scorer, processor, data):
    """
    Check that a score of 100 is given only to strings identical after processing.

    With full-string comparison scorers, two strings whose processed forms
    differ must never be reported as a perfect (100) match.
    :param scorer: scoring function handed to ``process.extractBests``
    :param processor: pre-processing function handed to ``process.extractBests``
    :param data: Hypothesis data object used to draw the inputs interactively
    :return:
    """
    # Strategy for the pool of candidate strings
    candidate_lists = st.lists(
        st.text(alphabet=HYPOTHESIS_ALPHABET, min_size=10, max_size=100),
        min_size=1,
        max_size=10,
    )
    candidates = data.draw(candidate_lists)

    # Pick one of the candidates (by index) to use as the query
    last_index = len(candidates) - 1
    picked = data.draw(st.integers(min_value=0, max_value=last_index))
    query = candidates[picked]

    # Discard examples where processing reduces the query to the empty string
    assume(processor(query) != '')

    # Ask only for matches scoring 100, with no cap on how many are returned
    matches = process.extractBests(
        query,
        candidates,
        scorer=scorer,
        processor=processor,
        score_cutoff=100,
        limit=None,
    )

    # Something must come back
    assert matches != []

    # Every returned match must equal the query once both are processed
    expected = processor(query)
    for match in matches:
        assert expected == processor(match[0])