diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index 576b9b6..ba23a22 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ '3.5', '3.6', '3.7', '3.8']
+        python-version: [ '3.6', '3.7', '3.8', '3.9', '3.10']
 
     steps:
     - uses: actions/checkout@v2
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 4e1ef42..dad184b 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -21,7 +21,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install setuptools wheel twine
+        pip install setuptools wheel twine mypy
    - name: Build and publish
      env:
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
diff --git a/.gitignore b/.gitignore
index 285a359..d486441 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,8 @@
 env/
 cenv/
 senv/
+mypyenv/
 **/__pycache__/**
-*.egg-info/
\ No newline at end of file
+*.egg-info/
+build/
+dist/
\ No newline at end of file
diff --git a/dev/benchmark.py b/dev/benchmark.py
index 0e1c08e..f188f77 100644
--- a/dev/benchmark.py
+++ b/dev/benchmark.py
@@ -1,14 +1,22 @@
 # coding: utf-8
+"""Module benchmarking
+
+This is code to benchmark the performance of the module.
+
+Requires benchmarker as an additional dependency. Run from the main folder with 'python dev/benchmark.py'.
+"""
+
 import os, sys
 sys.path.append(os.getcwd())
 
 from benchmarker import Benchmarker
 
 from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
 from simstring.measure.cosine import CosineMeasure
-from simstring.database.mongo import MongoDatabase
 from simstring.database.dict import DictDatabase
 from simstring.searcher import Searcher
+from time import time
 
 SEARCH_COUNT_LIMIT = 10**4
@@ -35,6 +43,15 @@ def _(bm):
         result = searcher.search(strings, 0.8)
 
 print('benchmark for using dict as database')
+start = time()
 output_similar_strings_of_each_line('./dev/data/company_names.txt', DictDatabase)
-print('benchmark for using Mongo as database')
-output_similar_strings_of_each_line('./dev/data/company_names.txt', MongoDatabase)
+print(f"Benchmark took {time()-start:.2f}s.")
+
+try:
+    from simstring.database.mongo import MongoDatabase
+    print('benchmark for using Mongo as database')
+    start = time()
+    output_similar_strings_of_each_line('./dev/data/company_names.txt', MongoDatabase)
+    print(f"Benchmark took {time()-start:.2f}s.")
+except ModuleNotFoundError:
+    print("Pymongo not installed, won't benchmark against MongoDB")
diff --git a/setup.py b/setup.py
index 6d87b38..fe31dff 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,5 @@
 import setuptools
+from mypyc.build import mypycify
 
 with open("README.md", "r") as fh:
     long_description = fh.read()
@@ -8,22 +9,42 @@
     version="0.0.1",
     author="Ruben Menke",
     author_email="ruben.m.menke@gmail.com",
-    description="A fork of the Python implementation of the SimString by (Katsuma Narisawa), a simple and efficient algorithm for approximate string matching.",
+    description="A fork of the Python implementation of the SimString by (Katsuma Narisawa), a simple and efficient algorithm for approximate string matching. Uses mypyc to improve speed.",
     long_description=long_description,
     long_description_content_type="text/markdown",
     url="https://github.com/icfly2/simstring-fast",
     packages=setuptools.find_packages(exclude=("tests",)),
-    classifiers=(
+    classifiers=[
         "Development Status :: 5 - Production/Stable",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.7",
         "Programming Language :: Python :: 3.8",
+        "Programming Language :: Python :: 3.9",
+        "Programming Language :: Python :: 3.10",
         "License :: OSI Approved :: MIT License",
         "Operating System :: OS Independent",
-    ),
+    ],
     extras_require = {
         "mongo" : ["pymongo",],
         "mecab" : ["MeCab"],
-    }
+    },
+    ext_modules=mypycify([
+        'simstring/__init__.py',
+        'simstring/searcher.py',
+
+        'simstring/feature_extractor/base.py',
+        'simstring/feature_extractor/character_ngram.py',
+        'simstring/feature_extractor/word_ngram.py',
+
+        'simstring/database/base.py',
+        'simstring/database/dict.py',
+        # 'simstring/database/mongo.py',
+
+        'simstring/measure/base.py',
+        'simstring/measure/cosine.py',
+        'simstring/measure/dice.py',
+        'simstring/measure/jaccard.py',
+    ]),
 )
diff --git a/simstring/database/base.py b/simstring/database/base.py
index 53020ea..7210fe8 100644
--- a/simstring/database/base.py
+++ b/simstring/database/base.py
@@ -1,15 +1,15 @@
 class BaseDatabase:
     def __init__(self, feature_extractor):
-        raise 'Not Implemented'
+        raise NotImplementedError
 
     def add(self, string):
-        raise 'Not Implemented'
+        raise NotImplementedError
 
     def min_feature_size(self):
-        raise 'Not Implemented'
+        raise NotImplementedError
 
     def max_feature_size(self):
-        raise 'Not Implemented'
+        raise NotImplementedError
 
     def lookup_strings_by_feature_set_size_and_feature(self, size, feature):
-        raise 'Not Implemented'
+        raise NotImplementedError
diff --git a/simstring/database/dict.py b/simstring/database/dict.py
index 14205fd..f6fc3ce 100644
--- a/simstring/database/dict.py
+++ b/simstring/database/dict.py
@@ -1,4 +1,5 @@
 from collections import defaultdict
+from typing import List
 from .base import BaseDatabase
 
 def defaultdict_set():
@@ -7,11 +8,11 @@
 class DictDatabase(BaseDatabase):
     def __init__(self, feature_extractor):
         self.feature_extractor = feature_extractor
-        self.strings = []
-        self.feature_set_size_to_string_map = defaultdict(set)
-        self.feature_set_size_and_feature_to_string_map = defaultdict(defaultdict_set)
+        self.strings: List[str] = []
+        self.feature_set_size_to_string_map: dict = defaultdict(set)
+        self.feature_set_size_and_feature_to_string_map: dict = defaultdict(defaultdict_set)
 
-    def add(self, string):
+    def add(self, string: str):
         features = self.feature_extractor.features(string)
         size = len(features)
diff --git a/simstring/feature_extractor/base.py b/simstring/feature_extractor/base.py
index 6b3389b..46414b1 100644
--- a/simstring/feature_extractor/base.py
+++ b/simstring/feature_extractor/base.py
@@ -1,11 +1,13 @@
+from typing import List
+
 SENTINAL_CHAR = " " # non breaking space
 
 class BaseFeatureExtractor:
-    def features(self, _string):
+    def features(self, _string) -> List[str]:
         raise NotImplementedError()
 
-    def _each_cons(self, xs, n):
+    def _each_cons(self, xs, n:int) -> List[str]:
         return [xs[i:i+n] for i in range(len(xs)-n+1)]
 
-    def _words_ngram(self, words, n, SENTINAL_CHAR):
+    def _words_ngram(self, words: List[str], n:int, SENTINAL_CHAR: str):
         return [tuple(x) for x in self._each_cons([SENTINAL_CHAR] + words + [SENTINAL_CHAR], n)]
diff --git a/simstring/feature_extractor/character_ngram.py b/simstring/feature_extractor/character_ngram.py
index 588b8b4..f5a708b 100644
--- a/simstring/feature_extractor/character_ngram.py
+++ b/simstring/feature_extractor/character_ngram.py
@@ -1,8 +1,9 @@
 from .base import BaseFeatureExtractor, SENTINAL_CHAR
+from typing import List
 
 class CharacterNgramFeatureExtractor(BaseFeatureExtractor):
-    def __init__(self, n=2):
+    def __init__(self, n:int=2):
         self.n = n
 
-    def features(self, string):
+    def features(self, string:str) -> List[str]:
         return self._each_cons(SENTINAL_CHAR + string + SENTINAL_CHAR, self.n)
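
For orientation: with the annotations above, CharacterNgramFeatureExtractor still pads the input with SENTINAL_CHAR on both ends and slices it into overlapping n-grams. A minimal sketch of the default bigram behaviour (illustrative only, the values follow from the code above):

    from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor

    # "abc" is padded to SENTINAL_CHAR + "abc" + SENTINAL_CHAR and sliced into
    # overlapping pairs, giving four features of the form
    # [SENTINAL_CHAR + "a", "ab", "bc", "c" + SENTINAL_CHAR]
    features = CharacterNgramFeatureExtractor(2).features("abc")
    print(len(features))  # 4
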
diff --git a/simstring/feature_extractor/word_ngram.py b/simstring/feature_extractor/word_ngram.py
index ebb9bff..48cc5f4 100644
--- a/simstring/feature_extractor/word_ngram.py
+++ b/simstring/feature_extractor/word_ngram.py
@@ -1,12 +1,12 @@
 from .base import BaseFeatureExtractor, SENTINAL_CHAR
-
+from typing import List
 
 class WordNgramFeatureExtractor(BaseFeatureExtractor):
     def __init__(self, n=2, splitter=" "):
         self.n = n
         self.splitter = splitter
 
-    def features(self, text):
+    def features(self, text: str) -> List[str]:
         # Split text by white space.
         # If you want to extract words from text in more complicated way or using your favorite library like NLTK, please implement in your own.
         words = text.split(self.splitter)
diff --git a/simstring/measure/base.py b/simstring/measure/base.py
index d23fc19..2caeb91 100644
--- a/simstring/measure/base.py
+++ b/simstring/measure/base.py
@@ -1,12 +1,12 @@
 class BaseMeasure:
-    def min_feature_size(self, _query_size, _alpha):
-        raise 'Not Implemented'
+    def min_feature_size(self, _query_size, _alpha) -> int:
+        raise NotImplementedError
 
-    def max_feature_size(self, _query_size, _alpha):
-        raise 'Not Implemented'
+    def max_feature_size(self, _query_size, _alpha) -> int:
+        raise NotImplementedError
 
-    def minimum_common_feature_count(self, _query_size, _y_size, _alpha):
-        raise 'Not Implemented'
-
-    def similarity(self, X, Y):
-        raise 'Not Implemented'
+    def minimum_common_feature_count(self, _query_size, _y_size, _alpha) -> int:
+        raise NotImplementedError
+
+    def similarity(self, X, Y) -> float:
+        raise NotImplementedError
diff --git a/simstring/measure/cosine.py b/simstring/measure/cosine.py
index 0475258..bb9eefc 100644
--- a/simstring/measure/cosine.py
+++ b/simstring/measure/cosine.py
@@ -1,15 +1,16 @@
 import math
+from typing import Iterable
 from .base import BaseMeasure
 
 class CosineMeasure(BaseMeasure):
-    def min_feature_size(self, query_size, alpha):
+    def min_feature_size(self, query_size:int, alpha:float) -> int:
         return int(math.ceil(alpha * alpha * query_size))
 
-    def max_feature_size(self, query_size, alpha):
-        return int(math.floor(query_size * 1.0 / (alpha * alpha)))
+    def max_feature_size(self, query_size:int, alpha:float) -> int:
+        return int(math.floor(query_size / (alpha * alpha)))
 
-    def minimum_common_feature_count(self, query_size, y_size, alpha):
+    def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int:
         return int(math.ceil(alpha * math.sqrt(query_size * y_size)))
 
-    def similarity(self, X, Y):
-        return len(set(X) & set(Y)) * 1.0 / math.sqrt(len(set(X)) * len(set(Y)))
+    def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
+        return len(set(X) & set(Y)) / math.sqrt(len(set(X)) * len(set(Y)))
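
These bounds are what lets the search prune candidates: for a query with n features at threshold alpha, only candidate feature-set sizes between ceil(alpha^2 * n) and floor(n / alpha^2) can possibly reach the threshold. A small worked sketch against the class above (the numbers follow directly from the formulas):

    from simstring.measure.cosine import CosineMeasure

    m = CosineMeasure()
    # query with 5 features at alpha = 0.7
    print(m.min_feature_size(5, 0.7))  # ceil(0.49 * 5)  -> 3
    print(m.max_feature_size(5, 0.7))  # floor(5 / 0.49) -> 10
    # similarity works on the *sets* of features: |X & Y| / sqrt(|X| * |Y|)
    print(m.similarity(["ab", "bc"], ["ab", "bc", "cd"]))  # 2 / sqrt(6), about 0.816
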
diff --git a/simstring/measure/dice.py b/simstring/measure/dice.py
index 97b0fba..3389193 100644
--- a/simstring/measure/dice.py
+++ b/simstring/measure/dice.py
@@ -1,15 +1,18 @@
 import math
+from typing import Iterable
+
 from .base import BaseMeasure
 
-class DiceMeasure(BaseMeasure):
-    def min_feature_size(self, query_size, alpha):
+class DiceMeasure(BaseMeasure):
+
+    def min_feature_size(self, query_size:int, alpha:float) -> int:
         return int(math.ceil(alpha * 1.0 / (2 - alpha) * query_size))
 
-    def max_feature_size(self, query_size, alpha):
+    def max_feature_size(self, query_size:int, alpha:float) -> int:
         return int(math.floor((2 - alpha) * query_size * 1.0 / alpha))
 
-    def minimum_common_feature_count(self, query_size, y_size, alpha):
+    def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int:
         return int(math.ceil(0.5 * alpha * query_size * y_size))
 
-    def similarity(self, X, Y):
+    def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
         return len(set(X) & set(Y)) * 2.0 / (len(set(X)) + len(set(Y)))
diff --git a/simstring/measure/jaccard.py b/simstring/measure/jaccard.py
index d12abb7..4918d2d 100644
--- a/simstring/measure/jaccard.py
+++ b/simstring/measure/jaccard.py
@@ -1,15 +1,16 @@
 import math
+from typing import Iterable
 from .base import BaseMeasure
 
 class JaccardMeasure(BaseMeasure):
-    def min_feature_size(self, query_size, alpha):
+    def min_feature_size(self, query_size:int, alpha:float) -> int:
         return int(math.ceil(alpha * query_size))
 
-    def max_feature_size(self, query_size, alpha):
+    def max_feature_size(self, query_size:int, alpha:float) -> int:
         return int(math.floor(query_size / alpha))
 
-    def minimum_common_feature_count(self, query_size, y_size, alpha):
+    def minimum_common_feature_count(self, query_size: int, y_size: int, alpha: float) -> int:
         return int(math.ceil(alpha * (query_size + y_size) * 1.0 / (1 + alpha)))
 
-    def similarity(self, X, Y):
+    def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
         return len(set(X) & set(Y)) * 1.0 / len(set(X) | set(Y))
diff --git a/simstring/searcher.py b/simstring/searcher.py
index 0c289e9..e3d8c9c 100644
--- a/simstring/searcher.py
+++ b/simstring/searcher.py
@@ -1,8 +1,7 @@
 # -*- coding:utf-8 -*-
 
 from collections import defaultdict
-from operator import itemgetter
-from typing import List
+from typing import List, Tuple
 
 
 class Searcher:
@@ -10,7 +9,7 @@ def __init__(self, db, measure) -> None:
         self.db = db
         self.measure = measure
         self.feature_extractor = db.feature_extractor
-        self.lookup_strings_result = defaultdict(dict)
+        self.lookup_strings_result: dict = defaultdict(dict)
 
     def search(self, query_string: str, alpha: float) -> List[str]:
         features = self.feature_extractor.features(query_string)
@@ -24,19 +23,19 @@
         results.extend(self.__overlap_join(features, tau, candidate_feature_size))
         return results
 
-    def ranked_search(self, query_string: str, alpha: float) -> List[str]:
+    def ranked_search(self, query_string: str, alpha: float) -> List[Tuple[float, str]]:
         results = self.search(query_string, alpha)
-        features = self.feature_extractor.features(query_string)
-        results_with_score = list(map(lambda x: [self.measure.similarity(features, self.feature_extractor.features(x)), x], results))
+        features: List[str] = self.feature_extractor.features(query_string)
+        results_with_score = [(self.measure.similarity(features, self.feature_extractor.features(result)), result) for result in results]
         return sorted(results_with_score, key=lambda x: (-x[0], x[1]))
 
     def __min_overlap(self, query_size: int, candidate_feature_size: int, alpha: float) -> int:
         return self.measure.minimum_common_feature_count(query_size, candidate_feature_size, alpha)
 
-    def __overlap_join(self, features, tau, candidate_feature_size: int) -> List[str]:
+    def __overlap_join(self, features: List[str], tau: int, candidate_feature_size: int) -> List[str]:
         query_feature_size = len(features)
         features.sort(key=lambda x: len(self.__lookup_strings_by_feature_set_size_and_feature(candidate_feature_size, x)))
-        candidate_string_to_matched_count = defaultdict(int)
+        candidate_string_to_matched_count: dict = defaultdict(int)
         results = []
         for feature in features[0:query_feature_size - tau + 1]:
             for s in self.__lookup_strings_by_feature_set_size_and_feature(candidate_feature_size, feature):
@@ -55,7 +54,7 @@
                         break
         return results
 
-    def __lookup_strings_by_feature_set_size_and_feature(self, feature_size, feature):
+    def __lookup_strings_by_feature_set_size_and_feature(self, feature_size: int, feature: str):
         if feature not in self.lookup_strings_result[feature_size]:
             self.lookup_strings_result[feature_size][feature] = self.db.lookup_strings_by_feature_set_size_and_feature(feature_size, feature)
         return self.lookup_strings_result[feature_size][feature]
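
With ranked_search now returning (score, string) tuples, end-to-end usage looks like the sketch below. It mirrors the test setup further down; the sample strings are only illustrative:

    from simstring.database.dict import DictDatabase
    from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
    from simstring.measure.cosine import CosineMeasure
    from simstring.searcher import Searcher

    db = DictDatabase(CharacterNgramFeatureExtractor(2))
    for word in ("foo", "bar", "fooo"):
        db.add(word)

    searcher = Searcher(db, CosineMeasure())
    print(searcher.search("fo", 0.6))         # plain list of matching strings
    print(searcher.ranked_search("fo", 0.6))  # [(score, string), ...], best match first
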
diff --git a/tests/measure/test_cosine.py b/tests/measure/test_cosine.py
index 195edc2..af77811 100644
--- a/tests/measure/test_cosine.py
+++ b/tests/measure/test_cosine.py
@@ -20,10 +20,18 @@ def test_minimum_common_feature_count(self):
         self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 0.5), 3)
 
     def test_similarity(self):
-        x = [1, 2, 3]
-        y = [1, 2, 3, 4]
+        x = ["a", "ab", "bc", "c"]
+        y = ["a", "ab", "bc", "cd", "e"]
         self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0)
-        self.assertEqual(round(self.measure.similarity(x, y), 2), 0.87)
-
-        z = [1, 1, 2, 3]
+        self.assertEqual(round(self.measure.similarity(x, y), 2), 0.67)
+
+        z = ["a", "ab", "ba", "ab", "a"]
         self.assertEqual(round(self.measure.similarity(z, z), 2), 1.0)
+        self.assertEqual(round(self.measure.similarity(x, z), 2), 0.58)
+        self.assertEqual(round(self.measure.similarity(x, y), 2), 0.67)
+
+        # Test as per the paper: trigrams (including quotes) of "methyl sulphone" and "methyl sulfone"
+        a = [' "m', '"me', 'met', 'eth', 'thy', 'hyl', 'yl ', 'l s', ' su', 'sul', 'ulf', 'lfo', 'fon', 'one', 'ne"', 'e" ']
+        b = [' "m', '"me', 'met', 'eth', 'thy', 'hyl', 'yl ', 'l s', ' su', 'sul', 'ulp', 'lph', 'pho', 'hon', 'one', 'ne"', 'e" ']
+        self.assertEqual(round(self.measure.similarity(a, b), 3), 0.788)  # matches the 0.788 reported in the paper
+
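
The 0.788 expectation can be checked by hand: the two trigram lists share 13 features and contain 16 and 17 distinct trigrams respectively, so the cosine score is 13 / sqrt(16 * 17), which rounds to 0.788, the value the paper reports. A standalone check of that arithmetic:

    import math

    shared, size_a, size_b = 13, 16, 17
    print(round(shared / math.sqrt(size_a * size_b), 3))  # 0.788
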
diff --git a/tests/measure/test_dice.py b/tests/measure/test_dice.py
index e835f70..26d9174 100644
--- a/tests/measure/test_dice.py
+++ b/tests/measure/test_dice.py
@@ -3,7 +3,7 @@
 from unittest import TestCase
 from simstring.measure.dice import DiceMeasure
 
-class TestCosine(TestCase):
+class TestDice(TestCase):
     measure = DiceMeasure()
 
     def test_min_feature_size(self):
@@ -20,7 +20,12 @@ def test_minimum_common_feature_count(self):
         self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 0.5), 7)
 
     def test_similarity(self):
-        x = [1, 2, 3]
-        y = [1, 2, 3, 4]
+        x = ["1", "2", "3"]
+        y = ["1", "2", "3", "4"]
         self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0)
         self.assertEqual(round(self.measure.similarity(x, y), 2), 0.86)
+
+
+        x = ["ni","ig","gh","ht"]
+        y = ["na","ac","ch","ht"]
+        self.assertEqual(round(self.measure.similarity(x, y), 2), 0.25)
\ No newline at end of file
diff --git a/tests/measure/test_jaccard.py b/tests/measure/test_jaccard.py
index 53ecd9d..0a51321 100644
--- a/tests/measure/test_jaccard.py
+++ b/tests/measure/test_jaccard.py
@@ -3,7 +3,7 @@
 from unittest import TestCase
 from simstring.measure.jaccard import JaccardMeasure
 
-class TestCosine(TestCase):
+class TestJaccard(TestCase):
     measure = JaccardMeasure()
 
     def test_min_feature_size(self):
@@ -20,7 +20,11 @@ def test_minimum_common_feature_count(self):
         self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 0.5), 4)
 
     def test_similarity(self):
-        x = [1, 2, 3]
-        y = [1, 2, 3, 4]
+        x = ["1", "2", "3"]
+        y = ["1", "2", "3", "4"]
         self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0)
         self.assertEqual(round(self.measure.similarity(x, y), 2), 0.75)
+
+        a = ["A" , "AB", "BC", "C"]
+        b = ["B" , "BC", "CD", "DE", "E"]
+        self.assertEqual(round(self.measure.similarity(a, b), 3), 0.125)
\ No newline at end of file
diff --git a/tests/test_searcher.py b/tests/test_searcher.py
index 9ff4412..039c70f 100644
--- a/tests/test_searcher.py
+++ b/tests/test_searcher.py
@@ -29,4 +29,31 @@
 
     def test_search4(self):
         self.assertEqual(self.searcher.search('abcd', 1.0), ['abcd'])
-        self.assertEqual(self.searcher.search('abcd', 0.9), ['abcd'])
\ No newline at end of file
+        self.assertEqual(self.searcher.search('abcd', 0.9), ['abcd'])
+
+
+    def test_ranked_search(self):
+        self.assertEqual(self.searcher.ranked_search('abcd', 1.0), [(1.0, 'abcd')])
+        self.assertEqual(self.searcher.ranked_search('ab', 0.4), [(1.0, 'ab'), (0.5773502691896258, 'abc'), (0.5163977794943222, 'abcd'), (0.47140452079103173, 'abcde')])
+
+
+class TestRankedSearch(TestCase):
+    def setUp(self) -> None:
+        db = DictDatabase(CharacterNgramFeatureExtractor(2))
+        db.add('foo')
+        db.add('bar')
+        db.add('fooo')
+        db.add('food')
+        db.add('fool')
+        db.add('follow')
+        self.searcher = Searcher(db, CosineMeasure())
+
+    def test_ranked_search_example1(self):
+        results = self.searcher.ranked_search('fo', 0.5)
+        goal = [(0.8660254037844387, 'foo'), (0.8660254037844387, 'fooo'), (0.5163977794943222, 'food'), (0.5163977794943222, 'fool')]
+        self.assertEqual(results, goal)
+
+    def test_ranked_search_example2(self):
+        results = self.searcher.ranked_search('fo', 0.6)
+        goal = [(0.8660254037844387, 'foo'), (0.8660254037844387, 'fooo')]
+        self.assertEqual(results, goal)
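
The new Dice and Jaccard expectations are also easy to verify by hand: the bigram sets of 'night' and 'nacht' share only 'ht', so Dice gives 2 * 1 / (4 + 4) = 0.25, and the Jaccard example shares only 'BC' out of 8 distinct features, so 1 / 8 = 0.125. A standalone check of both:

    x, y = {"ni", "ig", "gh", "ht"}, {"na", "ac", "ch", "ht"}
    print(2 * len(x & y) / (len(x) + len(y)))  # Dice: 0.25

    a, b = {"A", "AB", "BC", "C"}, {"B", "BC", "CD", "DE", "E"}
    print(len(a & b) / len(a | b))             # Jaccard: 0.125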