Skip to content

Commit

Permalink
Improved distance calculation speed when a maximum cost is set.
Browse files Browse the repository at this point in the history
- Reduced complexity from O(|s1|*|s2|) to O(max(|s1|,|s2|)*maxCost).
- Ignore maxCost when it would not help shorten the calculation.
  • Loading branch information
alex-alrux committed Apr 8, 2016
1 parent 999f565 commit 84e2e47
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 32 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ This package implements distance and similarity metrics for strings, based on th

[![Build Status](https://travis-ci.org/agext/lev.svg?branch=master)](https://travis-ci.org/agext/lev)

v1.0 Stable: Guaranteed no breaking changes to the API in future v1.x releases. No known bugs or performance issues. Probably safe to use in production, though provided on an "AS IS" basis.
v1.1 Stable: Guaranteed no breaking changes to the API in future v1.x releases. No known bugs or performance issues. Probably safe to use in production, though provided on an "AS IS" basis.

## Overview

Expand Down
92 changes: 61 additions & 31 deletions lev.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,6 @@ package lev
// to be greater than maxCost. Therefore, any return value higher than maxCost is a
// lower bound for the actual distance.
func Calculate(str1, str2 []rune, maxCost, insCost, subCost, delCost int) (dist, prefixLen, suffixLen int) {
// ToDo: This is O(l1*l2) time and O(min(l1,l2)) space; investigate whether it is
// worth implementing the diagonal approach - O(l1*(1+dist)) time, up to O(l1*l2) space
// http://www.csse.monash.edu.au/~lloyd/tildeStrings/Alignment/92.IPL.html
l1, l2 := len(str1), len(str2)
// trim common prefix, if any, as it doesn't affect the distance
for ; prefixLen < l1 && prefixLen < l2; prefixLen++ {
Expand Down Expand Up @@ -74,51 +71,84 @@ func Calculate(str1, str2 []rune, maxCost, insCost, subCost, delCost int) (dist,
if l1 > l2 {
str1, str2, l1, l2, insCost, delCost = str2, str1, l2, l1, delCost, insCost
}

d := make([]int, l1+1)
for y := 1; y <= l1; y++ {
d[y] = y * delCost

// variables used in inner "for" loops
var y, dy, c, l int

// if maxCost is higher than the maximum possible distance, it's equivalent to 'unlimited'
if maxCost > 0 {
if subCost < delCost+insCost {
if maxCost > l1*subCost+(l2-l1)*insCost {
maxCost = 0
}
} else {
if maxCost > l1*delCost+l2*insCost {
maxCost = 0
}
}
}

if maxCost > 0 {
overMax := true
for x := 1; x <= l2; x++ {
d[0] = x * insCost
overMax = true
for y, minCost := 1, (x-1)*insCost; y <= l1; y++ {
if str1[y-1] != str2[x-1] {
minCost += subCost
// offset and length of d in the current row
do, dl := 0, 1
for y, dy = 1, delCost; y <= l1 && dy <= maxCost; dl++ {
d[y] = dy
y++
dy = y * delCost
}

for x := 0; x < l2; x++ {
dy, d[do] = d[do], d[do]+insCost
if l = do + dl; l > l1 {
l = l1
}
for y = do; y < l; dy, d[y] = d[y], dy {
if str1[y] != str2[x] {
dy += subCost
}
if cd := d[y-1] + delCost; cd < minCost {
minCost = cd
if c = d[y] + delCost; c < dy {
dy = c
}
if ci := d[y] + insCost; ci < minCost {
minCost = ci
y++
if c = d[y] + insCost; c < dy {
dy = c
}
if minCost <= maxCost {
overMax = false
if dy > maxCost {
dl = y - do
break
}
minCost, d[y] = d[y], minCost
}
if overMax {
for d[do] > maxCost {
do++
dl--
}
if dl == 0 {
dist = maxCost + 1
return
}
}
} else {
for x := 1; x <= l2; x++ {
d[0] = x * insCost
for y, minCost := 1, (x-1)*insCost; y <= l1; y++ {
if str1[y-1] != str2[x-1] {
minCost += subCost
// ToDo: This is O(l1*l2) time and O(min(l1,l2)) space; investigate whether it is
// worth implementing the diagonal approach - O(l1*(1+dist)) time, up to O(l1*l2) space
// http://www.csse.monash.edu.au/~lloyd/tildeStrings/Alignment/92.IPL.html

for y = 1; y <= l1; y++ {
d[y] = y * delCost
}
for x := 0; x < l2; x++ {
dy, d[0] = d[0], d[0]+insCost
for y = 0; y < l1; dy, d[y] = d[y], dy {
if str1[y] != str2[x] {
dy += subCost
}
if cd := d[y-1] + delCost; cd < minCost {
minCost = cd
if c = d[y] + delCost; c < dy {
dy = c
}
if ci := d[y] + insCost; ci < minCost {
minCost = ci
y++
if c = d[y] + insCost; c < dy {
dy = c
}
minCost, d[y] = d[y], minCost
}
}
}
Expand Down
5 changes: 5 additions & 0 deletions lev_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,11 @@ func Test_Metrics(t *testing.T) {

// When setting a maxCost (should not affect Similarity() and Match())...
{"password", "pass1", "(maxCost=1)", NewParams().MaxCost(1), e{2, 4, 0, 4. / 8, 4. / 8}},
{"pass1word", "passwords1", "(maxCost=2)", NewParams().MaxCost(2), e{3, 4, 0, 7. / 10, 8.2 / 10}},
{"password", "1234", " (D=2,maxCost=1)", NewParams().DelCost(2).MaxCost(1), e{2, 0, 0, 0, 0}},
{"pwd", "password", " (I=0,maxCost=0)", NewParams().InsCost(0).MaxCost(0), e{0, 1, 1, 1, 1}},
{"passXword", "password", "(maxCost=10)", NewParams().MaxCost(10), e{1, 4, 4, 8. / 9, 8.4 / 9}},
{"passXord", "password", "(S=3,maxCost=17)", NewParams().SubCost(3).MaxCost(17), e{2, 4, 3, 14. / 16, 14.8 / 16}},
// ... no change because the distance is calculated without getting into the main algorithm:
{"password", "pass", "(maxCost=1)", NewParams().MaxCost(1), e{4, 4, 0, 4. / 8, 4. / 8}},

Expand Down

0 comments on commit 84e2e47

Please sign in to comment.