Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ '3.5', '3.6', '3.7', '3.8']
python-version: [ '3.6', '3.7', '3.8', '3.9', '3.10']

steps:
- uses: actions/checkout@v2
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install setuptools wheel twine
pip install setuptools wheel twine mypy
- name: Build and publish
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
Expand Down
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
env/
cenv/
senv/
mypyenv/
**/__pycache__/**
*.egg-info/
*.egg-info/
build/
dist/
23 changes: 20 additions & 3 deletions dev/benchmark.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
# coding: utf-8

"""Module benchmarking

This is code to benchmark the performance of the module.

Requires benchmarker as an additional dependency. Run from main folder with 'python dev/benchmark.py'

"""

import os, sys
sys.path.append(os.getcwd())
from benchmarker import Benchmarker

from simstring.feature_extractor.character_ngram import CharacterNgramFeatureExtractor
from simstring.measure.cosine import CosineMeasure
from simstring.database.mongo import MongoDatabase
from simstring.database.dict import DictDatabase
from simstring.searcher import Searcher
from time import time

SEARCH_COUNT_LIMIT = 10**4

Expand All @@ -35,6 +43,15 @@ def _(bm):
result = searcher.search(strings, 0.8)

print('benchmark for using dict as database')
start = time()
output_similar_strings_of_each_line('./dev/data/company_names.txt', DictDatabase)
print('benchmark for using Mongo as database')
output_similar_strings_of_each_line('./dev/data/company_names.txt', MongoDatabase)
print(f"Benchmark took {time()-start:.2f}s.")

try:
from simstring.database.mongo import MongoDatabase
print('benchmark for using Mongo as database')
start = time()
output_similar_strings_of_each_line('./dev/data/company_names.txt', MongoDatabase)
print(f"Benchmark took {time()-start:.2f}s.")
except ModuleNotFoundError:
print("Pymongo not installed, won't benchmark against MongoDB")
29 changes: 25 additions & 4 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import setuptools
from mypyc.build import mypycify

with open("README.md", "r") as fh:
long_description = fh.read()
Expand All @@ -8,22 +9,42 @@
version="0.0.1",
author="Ruben Menke",
author_email="[email protected]",
description="A fork of the Python implementation of the SimString by (Katsuma Narisawa), a simple and efficient algorithm for approximate string matching.",
description="A fork of the Python implementation of the SimString by (Katsuma Narisawa), a simple and efficient algorithm for approximate string matching. Uses mypyc to improve speed",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/icfly2/simstring-fast",
packages=setuptools.find_packages(exclude=("tests",)),
classifiers=(
classifiers=[
"Development Status :: 5 - Production/Stable",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
),
],
extras_require = {
"mongo" : ["pymongo",],
"mecab" : ["MeCab"],
}
},
ext_modules=mypycify([
'simstring/__init__.py',
'simstring/searcher.py',

'simstring/feature_extractor/base.py',
'simstring/feature_extractor/character_ngram.py',
'simstring/feature_extractor/word_ngram.py',

'simstring/database/base.py',
'simstring/database/dict.py',
# 'simstring/database/mongo.py',

'simstring/measure/base.py',
'simstring/measure/cosine.py',
'simstring/measure/dice.py',
'simstring/measure/jaccard.py',

]),

)
10 changes: 5 additions & 5 deletions simstring/database/base.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
class BaseDatabase:
def __init__(self, feature_extractor):
raise 'Not Implemented'
raise NotImplementedError

def add(self, string):
raise 'Not Implemented'
raise NotImplementedError

def min_feature_size(self):
raise 'Not Implemented'
raise NotImplementedError

def max_feature_size(self):
raise 'Not Implemented'
raise NotImplementedError

def lookup_strings_by_feature_set_size_and_feature(self, size, feature):
raise 'Not Implemented'
raise NotImplementedError
9 changes: 5 additions & 4 deletions simstring/database/dict.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from collections import defaultdict
from typing import List
from .base import BaseDatabase

def defaultdict_set():
Expand All @@ -7,11 +8,11 @@ def defaultdict_set():
class DictDatabase(BaseDatabase):
def __init__(self, feature_extractor):
self.feature_extractor = feature_extractor
self.strings = []
self.feature_set_size_to_string_map = defaultdict(set)
self.feature_set_size_and_feature_to_string_map = defaultdict(defaultdict_set)
self.strings: List[str] = []
self.feature_set_size_to_string_map: dict = defaultdict(set)
self.feature_set_size_and_feature_to_string_map: dict = defaultdict(defaultdict_set)

def add(self, string):
def add(self, string: str):
features = self.feature_extractor.features(string)
size = len(features)

Expand Down
8 changes: 5 additions & 3 deletions simstring/feature_extractor/base.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
from typing import List

SENTINAL_CHAR = " " # non breaking space

class BaseFeatureExtractor:
def features(self, _string):
def features(self, _string) -> List[str]:
raise NotImplementedError()

def _each_cons(self, xs, n):
def _each_cons(self, xs, n:int) -> List[str]:
return [xs[i:i+n] for i in range(len(xs)-n+1)]

def _words_ngram(self, words, n, SENTINAL_CHAR):
def _words_ngram(self, words: List[str], n:int, SENTINAL_CHAR: str):
return [tuple(x) for x in self._each_cons([SENTINAL_CHAR] + words + [SENTINAL_CHAR], n)]
5 changes: 3 additions & 2 deletions simstring/feature_extractor/character_ngram.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from .base import BaseFeatureExtractor, SENTINAL_CHAR
from typing import List

class CharacterNgramFeatureExtractor(BaseFeatureExtractor):
def __init__(self, n=2):
def __init__(self, n:int=2):
self.n = n

def features(self, string):
def features(self, string:str) -> List[str]:
return self._each_cons(SENTINAL_CHAR + string + SENTINAL_CHAR, self.n)
4 changes: 2 additions & 2 deletions simstring/feature_extractor/word_ngram.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from .base import BaseFeatureExtractor, SENTINAL_CHAR

from typing import List

class WordNgramFeatureExtractor(BaseFeatureExtractor):
def __init__(self, n=2, splitter=" "):
self.n = n
self.splitter = splitter

def features(self, text):
def features(self, text: str) -> List[str]:
# Split text by white space.
# If you want to extract words from text in more complicated way or using your favorite library like NLTK, please implement in your own.
words = text.split(self.splitter)
Expand Down
18 changes: 9 additions & 9 deletions simstring/measure/base.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
class BaseMeasure:
def min_feature_size(self, _query_size, _alpha):
raise 'Not Implemented'
def min_feature_size(self, _query_size, _alpha) -> int:
raise NotImplementedError

def max_feature_size(self, _query_size, _alpha):
raise 'Not Implemented'
def max_feature_size(self, _query_size, _alpha)-> int:
raise NotImplementedError

def minimum_common_feature_count(self, _query_size, _y_size, _alpha):
raise 'Not Implemented'

def similarity(self, X, Y):
raise 'Not Implemented'
def minimum_common_feature_count(self, _query_size, _y_size, _alpha)-> int:
raise NotImplementedError
def similarity(self, X, Y) -> float:
raise NotImplementedError
13 changes: 7 additions & 6 deletions simstring/measure/cosine.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
import math
from typing import Iterable
from .base import BaseMeasure

class CosineMeasure(BaseMeasure):
def min_feature_size(self, query_size, alpha):
def min_feature_size(self, query_size:int, alpha:float) -> int:
return int(math.ceil(alpha * alpha * query_size))

def max_feature_size(self, query_size, alpha):
return int(math.floor(query_size * 1.0 / (alpha * alpha)))
def max_feature_size(self, query_size:int, alpha:float) -> int:
return int(math.floor(query_size / (alpha * alpha)))

def minimum_common_feature_count(self, query_size, y_size, alpha):
def minimum_common_feature_count(self, query_size: int, y_size:int , alpha: float) -> int:
return int(math.ceil(alpha * math.sqrt(query_size * y_size)))

def similarity(self, X, Y):
return len(set(X) & set(Y)) * 1.0 / math.sqrt(len(set(X)) * len(set(Y)))
def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
return len(set(X) & set(Y)) / math.sqrt(len(set(X)) * len(set(Y)))
13 changes: 8 additions & 5 deletions simstring/measure/dice.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import math
from typing import Iterable

from .base import BaseMeasure

class DiceMeasure(BaseMeasure):
def min_feature_size(self, query_size, alpha):
class DiceMeasure(BaseMeasure):

def min_feature_size(self, query_size:int, alpha:float) -> int:
return int(math.ceil(alpha * 1.0 / (2 - alpha) * query_size))

def max_feature_size(self, query_size, alpha):
def max_feature_size(self, query_size:int, alpha:float) -> int:
return int(math.floor((2 - alpha) * query_size * 1.0 / alpha))

def minimum_common_feature_count(self, query_size, y_size, alpha):
def minimum_common_feature_count(self, query_size: int, y_size:int , alpha: float) -> int:
return int(math.ceil(0.5 * alpha * query_size * y_size))

def similarity(self, X, Y):
def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
return len(set(X) & set(Y)) * 2.0 / (len(set(X)) + len(set(Y)))
9 changes: 5 additions & 4 deletions simstring/measure/jaccard.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
import math
from typing import Iterable
from .base import BaseMeasure

class JaccardMeasure(BaseMeasure):
def min_feature_size(self, query_size, alpha):
def min_feature_size(self, query_size:int, alpha:float) -> int:
return int(math.ceil(alpha * query_size))

def max_feature_size(self, query_size, alpha):
def max_feature_size(self, query_size:int, alpha:float) -> int:
return int(math.floor(query_size / alpha))

def minimum_common_feature_count(self, query_size, y_size, alpha):
def minimum_common_feature_count(self, query_size: int, y_size:int , alpha: float) -> int:
return int(math.ceil(alpha * (query_size + y_size) * 1.0 / (1 + alpha)))

def similarity(self, X, Y):
def similarity(self, X: Iterable[str], Y: Iterable[str]) -> float:
return len(set(X) & set(Y)) * 1.0 / len(set(X) | set(Y))
17 changes: 8 additions & 9 deletions simstring/searcher.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
# -*- coding:utf-8 -*-

from collections import defaultdict
from operator import itemgetter
from typing import List
from typing import List, Tuple


class Searcher:
def __init__(self, db, measure) -> None:
self.db = db
self.measure = measure
self.feature_extractor = db.feature_extractor
self.lookup_strings_result = defaultdict(dict)
self.lookup_strings_result: dict = defaultdict(dict)

def search(self, query_string: str, alpha: float) -> List[str]:
features = self.feature_extractor.features(query_string)
Expand All @@ -24,19 +23,19 @@ def search(self, query_string: str, alpha: float) -> List[str]:
results.extend(self.__overlap_join(features, tau, candidate_feature_size))
return results

def ranked_search(self, query_string: str, alpha: float) -> List[str]:
def ranked_search(self, query_string: str, alpha: float) -> List[Tuple[float, str]]:
results = self.search(query_string, alpha)
features = self.feature_extractor.features(query_string)
results_with_score = list(map(lambda x: [self.measure.similarity(features, self.feature_extractor.features(x)), x], results))
features: List[str] = self.feature_extractor.features(query_string)
results_with_score = [(self.measure.similarity(features, self.feature_extractor.features(result)), result) for result in results]
return sorted(results_with_score, key=lambda x: (-x[0], x[1]))

def __min_overlap(self, query_size: int, candidate_feature_size: int, alpha: float) -> int:
return self.measure.minimum_common_feature_count(query_size, candidate_feature_size, alpha)

def __overlap_join(self, features, tau, candidate_feature_size: int) -> List[str]:
def __overlap_join(self, features: List[str], tau:int, candidate_feature_size: int) -> List[str]:
query_feature_size = len(features)
features.sort(key=lambda x: len(self.__lookup_strings_by_feature_set_size_and_feature(candidate_feature_size, x)))
candidate_string_to_matched_count = defaultdict(int)
candidate_string_to_matched_count: dict = defaultdict(int)
results = []
for feature in features[0:query_feature_size - tau + 1]:
for s in self.__lookup_strings_by_feature_set_size_and_feature(candidate_feature_size, feature):
Expand All @@ -55,7 +54,7 @@ def __overlap_join(self, features, tau, candidate_feature_size: int) -> List[str
break
return results

def __lookup_strings_by_feature_set_size_and_feature(self, feature_size, feature):
def __lookup_strings_by_feature_set_size_and_feature(self, feature_size: int, feature: str):
if feature not in self.lookup_strings_result[feature_size]:
self.lookup_strings_result[feature_size][feature] = self.db.lookup_strings_by_feature_set_size_and_feature(feature_size, feature)
return self.lookup_strings_result[feature_size][feature]
18 changes: 13 additions & 5 deletions tests/measure/test_cosine.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,18 @@ def test_minimum_common_feature_count(self):
self.assertEqual(self.measure.minimum_common_feature_count(5, 5, 0.5), 3)

def test_similarity(self):
x = [1, 2, 3]
y = [1, 2, 3, 4]
x = ["a", "ab", "bc", "c"]
y = ["a", "ab", "bc", "cd", "e"]
self.assertEqual(round(self.measure.similarity(x, x), 2), 1.0)
self.assertEqual(round(self.measure.similarity(x, y), 2), 0.87)

z = [1, 1, 2, 3]
self.assertEqual(round(self.measure.similarity(x, y), 2), 0.67)
z = ["a", "ab", "ba", "ab", "a"]
self.assertEqual(round(self.measure.similarity(z, z), 2), 1.0)
self.assertEqual(round(self.measure.similarity(x, z), 2), 0.58)
self.assertEqual(round(self.measure.similarity(x, y), 2), 0.67)

# Test as per paper trigrams with quotes of methyl sulphone and methyl sulfone
a = [' "m', '"me', 'met', 'eth', 'thy', 'hyl', 'yl ', 'l s', ' su', 'sul', 'ulf', 'lfo', 'fon', 'one', 'ne"', 'e" ']
b = [' "m', '"me', 'met', 'eth', 'thy', 'hyl', 'yl ', 'l s', ' su', 'sul', 'ulp', 'lph', 'pho', 'hon', 'one', 'ne"', 'e" ']
self.assertEqual(round(self.measure.similarity(a, b), 3), 0.788) # Matches the 0.788 value claimed in the paper

Loading