diff --git a/.gitignore b/.gitignore index b69a3e1..4f3dae1 100644 --- a/.gitignore +++ b/.gitignore @@ -243,3 +243,4 @@ ModelManifest.xml # FAKE - F# Make .fake/ *.DS_Store +.idea/ diff --git a/src/F23.StringSimilarity/Cosine.cs b/src/F23.StringSimilarity/Cosine.cs index a4cc92e..324378f 100644 --- a/src/F23.StringSimilarity/Cosine.cs +++ b/src/F23.StringSimilarity/Cosine.cs @@ -32,7 +32,7 @@ namespace F23.StringSimilarity public class Cosine : ShingleBased, INormalizedStringSimilarity, INormalizedStringDistance { /// - /// Implements Cosine Similarity between strings.The strings are first + /// Implements Cosine Similarity between strings. The strings are first /// transformed in vectors of occurrences of k-shingles(sequences of k /// characters). In this n-dimensional space, the similarity between the two /// strings is the cosine of their respective vectors. @@ -41,7 +41,7 @@ public class Cosine : ShingleBased, INormalizedStringSimilarity, INormalizedStri public Cosine(int k) : base(k) { } /// - /// Implements Cosine Similarity between strings.The strings are first + /// Implements Cosine Similarity between strings. The strings are first /// transformed in vectors of occurrences of k-shingles(sequences of k /// characters). In this n-dimensional space, the similarity between the two /// strings is the cosine of their respective vectors. diff --git a/src/F23.StringSimilarity/Damerau.cs b/src/F23.StringSimilarity/Damerau.cs index d693762..dd6138e 100644 --- a/src/F23.StringSimilarity/Damerau.cs +++ b/src/F23.StringSimilarity/Damerau.cs @@ -41,7 +41,7 @@ namespace F23.StringSimilarity /// This is not to be confused with the optimal string alignment distance, which /// is an extension where no substring can be edited more than once. /// - public class Damerau : IMetricStringDistance + public class Damerau : IMetricStringDistance, IMetricSpanDistance { /// /// Compute the distance between strings: the minimum number of operations @@ -54,6 +54,10 @@ public class Damerau : IMetricStringDistance /// The computed distance. /// If s1 or s2 is null. public double Distance(string s1, string s2) + => Distance(s1.AsSpan(), s2.AsSpan()); + + public double Distance(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable { if (s1 == null) { @@ -65,7 +69,7 @@ public double Distance(string s1, string s2) throw new ArgumentNullException(nameof(s2)); } - if (s1.Equals(s2)) + if (s1.SequenceEqual(s2)) { return 0; } @@ -74,7 +78,7 @@ public double Distance(string s1, string s2) int inf = s1.Length + s2.Length; // Create and initialize the character array indices - var da = new Dictionary(); + var da = new Dictionary(); for (int d = 0; d < s1.Length; d++) { @@ -115,7 +119,7 @@ public double Distance(string s1, string s2) int j1 = db; int cost = 1; - if (s1[i - 1] == s2[j - 1]) + if (s1[i - 1].Equals(s2[j - 1])) { cost = 0; db = j; diff --git a/src/F23.StringSimilarity/F23.StringSimilarity.csproj b/src/F23.StringSimilarity/F23.StringSimilarity.csproj index 7115879..8ef85dd 100644 --- a/src/F23.StringSimilarity/F23.StringSimilarity.csproj +++ b/src/F23.StringSimilarity/F23.StringSimilarity.csproj @@ -1,6 +1,6 @@ - netstandard2.0 + netstandard2.0 F23.StringSimilarity string;similarity;distance;levenshtein;jaro-winkler;lcs;cosine StringSimilarity.NET diff --git a/src/F23.StringSimilarity/Interfaces/IMetricSpanDistance.cs b/src/F23.StringSimilarity/Interfaces/IMetricSpanDistance.cs new file mode 100644 index 0000000..65876e8 --- /dev/null +++ b/src/F23.StringSimilarity/Interfaces/IMetricSpanDistance.cs @@ -0,0 +1,23 @@ +using System; + +namespace F23.StringSimilarity.Interfaces +{ + /// + /// Span distances that implement this interface are metrics, which means: + /// - d(x, y) ≥ 0 (non-negativity, or separation axiom) + /// - d(x, y) = 0 if and only if x = y (identity, or coincidence axiom) + /// - d(x, y) = d(y, x) (symmetry) + /// - d(x, z) ≤ d(x, y) + d(y, z) (triangle inequality). + /// + public interface IMetricSpanDistance : ISpanDistance + { + /// + /// Compute and return the metric distance. + /// + /// The first span. + /// The second span. + /// The metric distance. + new double Distance(ReadOnlySpan b1, ReadOnlySpan b2) + where T : IEquatable; + } +} \ No newline at end of file diff --git a/src/F23.StringSimilarity/Interfaces/INormalizedSpanDistance.cs b/src/F23.StringSimilarity/Interfaces/INormalizedSpanDistance.cs new file mode 100644 index 0000000..c23f17b --- /dev/null +++ b/src/F23.StringSimilarity/Interfaces/INormalizedSpanDistance.cs @@ -0,0 +1,6 @@ +namespace F23.StringSimilarity.Interfaces +{ + public interface INormalizedSpanDistance : ISpanDistance + { + } +} \ No newline at end of file diff --git a/src/F23.StringSimilarity/Interfaces/INormalizedSpanSimilarity.cs b/src/F23.StringSimilarity/Interfaces/INormalizedSpanSimilarity.cs new file mode 100644 index 0000000..c5a46ed --- /dev/null +++ b/src/F23.StringSimilarity/Interfaces/INormalizedSpanSimilarity.cs @@ -0,0 +1,6 @@ +namespace F23.StringSimilarity.Interfaces +{ + public interface INormalizedSpanSimilarity : ISpanSimilarity + { + } +} \ No newline at end of file diff --git a/src/F23.StringSimilarity/Interfaces/ISpanDistance.cs b/src/F23.StringSimilarity/Interfaces/ISpanDistance.cs new file mode 100644 index 0000000..a832ce2 --- /dev/null +++ b/src/F23.StringSimilarity/Interfaces/ISpanDistance.cs @@ -0,0 +1,23 @@ +using System; + +namespace F23.StringSimilarity.Interfaces +{ + public interface ISpanDistance + { + /// + /// Compute and return a measure of distance. + /// Must be >= 0. + /// + /// This method operates on spans such as byte arrays. + /// Note that, when used on bytes, string encodings that + /// use more than one byte per codepoint (such as UTF-8) + /// are not supported and will most likely return + /// incorrect results. + /// + /// The first span. + /// The second span. + /// The measure of distance between the spans. + double Distance(ReadOnlySpan b1, ReadOnlySpan b2) + where T : IEquatable; + } +} \ No newline at end of file diff --git a/src/F23.StringSimilarity/Interfaces/ISpanSimilarity.cs b/src/F23.StringSimilarity/Interfaces/ISpanSimilarity.cs new file mode 100644 index 0000000..b5ab92a --- /dev/null +++ b/src/F23.StringSimilarity/Interfaces/ISpanSimilarity.cs @@ -0,0 +1,16 @@ +using System; + +namespace F23.StringSimilarity.Interfaces +{ + public interface ISpanSimilarity + { + /// + /// Compute and return a measure of similarity between 2 spans. + /// + /// The first span + /// The second span + /// Similarity (0 means both spans are completely different) + double Similarity(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable; + } +} \ No newline at end of file diff --git a/src/F23.StringSimilarity/JaroWinkler.cs b/src/F23.StringSimilarity/JaroWinkler.cs index 528a0a7..88ad93a 100644 --- a/src/F23.StringSimilarity/JaroWinkler.cs +++ b/src/F23.StringSimilarity/JaroWinkler.cs @@ -23,6 +23,7 @@ */ using System; +using System.Data.SqlTypes; using System.Linq; using F23.StringSimilarity.Interfaces; // ReSharper disable SuggestVarOrType_Elsewhere @@ -38,7 +39,7 @@ namespace F23.StringSimilarity /// Jaro-Winkler was developed in the area of record linkage (duplicate /// detection) (Winkler, 1990). It returns a value in the interval [0.0, 1.0]. /// The distance is computed as 1 - Jaro-Winkler similarity. - public class JaroWinkler : INormalizedStringSimilarity, INormalizedStringDistance + public class JaroWinkler : INormalizedStringSimilarity, INormalizedStringDistance, INormalizedSpanSimilarity, INormalizedSpanDistance { private const double DEFAULT_THRESHOLD = 0.7; private const int THREE = 3; @@ -75,6 +76,10 @@ public JaroWinkler(double threshold) /// The Jaro-Winkler similarity in the range [0, 1] /// If s1 or s2 is null. public double Similarity(string s1, string s2) + => Similarity(s1.AsSpan(), s2.AsSpan()); + + public double Similarity(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable { if (s1 == null) { @@ -86,7 +91,7 @@ public double Similarity(string s1, string s2) throw new ArgumentNullException(nameof(s2)); } - if (s1.Equals(s2)) + if (s1.SequenceEqual(s2)) { return 1f; } @@ -117,10 +122,15 @@ public double Similarity(string s1, string s2) /// If s1 or s2 is null. public double Distance(string s1, string s2) => 1.0 - Similarity(s1, s2); + + public double Distance(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable + => 1.0 - Similarity(s1, s2); - private static int[] Matches(string s1, string s2) + private static int[] Matches(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable { - string max, min; + ReadOnlySpan max, min; if (s1.Length > s2.Length) { max = s1; @@ -141,11 +151,11 @@ private static int[] Matches(string s1, string s2) int matches = 0; for (int mi = 0; mi < min.Length; mi++) { - char c1 = min[mi]; + var c1 = min[mi]; for (int xi = Math.Max(mi - range, 0), xn = Math.Min(mi + range + 1, max.Length); xi < xn; xi++) { - if (!match_flags[xi] && c1 == max[xi]) + if (!match_flags[xi] && c1.Equals(max[xi])) { match_indexes[mi] = xi; match_flags[xi] = true; @@ -154,8 +164,8 @@ private static int[] Matches(string s1, string s2) } } } - char[] ms1 = new char[matches]; - char[] ms2 = new char[matches]; + T[] ms1 = new T[matches]; + T[] ms2 = new T[matches]; for (int i = 0, si = 0; i < min.Length; i++) { if (match_indexes[i] != -1) @@ -175,7 +185,7 @@ private static int[] Matches(string s1, string s2) int transpositions = 0; for (int mi = 0; mi < ms1.Length; mi++) { - if (ms1[mi] != ms2[mi]) + if (!ms1[mi].Equals(ms2[mi])) { transpositions++; } @@ -183,7 +193,7 @@ private static int[] Matches(string s1, string s2) int prefix = 0; for (int mi = 0; mi < min.Length; mi++) { - if (s1[mi] == s2[mi]) + if (s1[mi].Equals(s2[mi])) { prefix++; } diff --git a/src/F23.StringSimilarity/Levenshtein.cs b/src/F23.StringSimilarity/Levenshtein.cs index 682161b..32cfe2c 100644 --- a/src/F23.StringSimilarity/Levenshtein.cs +++ b/src/F23.StringSimilarity/Levenshtein.cs @@ -32,7 +32,7 @@ namespace F23.StringSimilarity /// The Levenshtein distance between two words is the Minimum number of /// single-character edits (insertions, deletions or substitutions) required to /// change one string into the other. - public class Levenshtein : IMetricStringDistance + public class Levenshtein : IMetricStringDistance, IMetricSpanDistance { /// /// Equivalent to Distance(s1, s2, Int32.MaxValue). @@ -40,10 +40,7 @@ public class Levenshtein : IMetricStringDistance /// The first string to compare. /// The second string to compare. /// The Levenshtein distance between strings - public double Distance(string s1, string s2) - { - return Distance(s1, s2, int.MaxValue); - } + public double Distance(string s1, string s2) => Distance(s1, s2, int.MaxValue); /// /// The Levenshtein distance, or edit distance, between two words is the @@ -75,6 +72,14 @@ public double Distance(string s1, string s2) /// The Levenshtein distance between strings /// If s1 or s2 is null. public double Distance(string s1, string s2, int limit) + => Distance(s1.AsSpan(), s2.AsSpan(), limit); + + public double Distance(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable + => Distance(s1, s2, int.MaxValue); + + public double Distance(ReadOnlySpan s1, ReadOnlySpan s2, int limit) + where T : IEquatable { if (s1 == null) { @@ -86,7 +91,7 @@ public double Distance(string s1, string s2, int limit) throw new ArgumentNullException(nameof(s2)); } - if (s1.Equals(s2)) + if (s1.SequenceEqual(s2)) { return 0; } @@ -127,15 +132,16 @@ public double Distance(string s1, string s2, int limit) for (int j = 0; j < s2.Length; j++) { int cost = 1; - if (s1[i] == s2[j]) + if (s1[i].Equals(s2[j])) { cost = 0; } + v1[j + 1] = Math.Min( - v1[j] + 1, // Cost of insertion - Math.Min( - v0[j + 1] + 1, // Cost of remove - v0[j] + cost)); // Cost of substitution + v1[j] + 1, // Cost of insertion + Math.Min( + v0[j + 1] + 1, // Cost of remove + v0[j] + cost)); // Cost of substitution minv1 = Math.Min(minv1, v1[j + 1]); } diff --git a/src/F23.StringSimilarity/LongestCommonSubsequence.cs b/src/F23.StringSimilarity/LongestCommonSubsequence.cs index e07f434..ce5d4d0 100644 --- a/src/F23.StringSimilarity/LongestCommonSubsequence.cs +++ b/src/F23.StringSimilarity/LongestCommonSubsequence.cs @@ -44,7 +44,7 @@ namespace F23.StringSimilarity /// /// ! This class currently implements the dynamic programming approach, which has /// a space requirement O(m * n)! - public class LongestCommonSubsequence : IStringDistance + public class LongestCommonSubsequence : IStringDistance, ISpanDistance { /// /// Return the LCS distance between strings s1 and s2, computed as |s1| + @@ -58,6 +58,10 @@ public class LongestCommonSubsequence : IStringDistance /// /// If s1 or s2 is null. public double Distance(string s1, string s2) + => Distance(s1.AsSpan(), s2.AsSpan()); + + public double Distance(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable { if (s1 == null) { @@ -69,7 +73,7 @@ public double Distance(string s1, string s2) throw new ArgumentNullException(nameof(s2)); } - if (s1.Equals(s2)) + if (s1.SequenceEqual(s2)) { return 0; } @@ -86,6 +90,10 @@ public double Distance(string s1, string s2) /// The length of LCS(s2, s2) /// If s1 or s2 is null. public int Length(string s1, string s2) + => Length(s1.AsSpan(), s2.AsSpan()); + + internal static int Length(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable { if (s1 == null) { @@ -113,8 +121,6 @@ public int Length(string s1, string s2) */ int s1_length = s1.Length; int s2_length = s2.Length; - char[] x = s1.ToCharArray(); - char[] y = s2.ToCharArray(); int[,] c = new int[s1_length + 1, s2_length + 1]; @@ -132,10 +138,9 @@ public int Length(string s1, string s2) { for (int j = 1; j <= s2_length; j++) { - if (x[i - 1] == y[j - 1]) + if (s1[i - 1].Equals(s2[j - 1])) { c[i, j] = c[i - 1, j - 1] + 1; - } else { diff --git a/src/F23.StringSimilarity/MetricLCS.cs b/src/F23.StringSimilarity/MetricLCS.cs index 77c4764..1d33ae6 100644 --- a/src/F23.StringSimilarity/MetricLCS.cs +++ b/src/F23.StringSimilarity/MetricLCS.cs @@ -31,10 +31,8 @@ namespace F23.StringSimilarity /// Distance metric based on Longest Common Subsequence, from the notes "An /// LCS-based string metric" by Daniel Bakkelund. /// - public class MetricLCS : IMetricStringDistance, INormalizedStringDistance + public class MetricLCS : IMetricStringDistance, INormalizedStringDistance, IMetricSpanDistance { - private readonly LongestCommonSubsequence lcs = new LongestCommonSubsequence(); - /// /// Distance metric based on Longest Common Subsequence, computed as /// 1 - |LCS(s1, s2)| / max(|s1|, |s2|). @@ -44,6 +42,10 @@ public class MetricLCS : IMetricStringDistance, INormalizedStringDistance /// LCS distance metric /// If s1 or s2 is null. public double Distance(string s1, string s2) + => Distance(s1.AsSpan(), s2.AsSpan()); + + public double Distance(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable { if (s1 == null) { @@ -55,7 +57,7 @@ public double Distance(string s1, string s2) throw new ArgumentNullException(nameof(s2)); } - if (s1.Equals(s2)) + if (s1.SequenceEqual(s2)) { return 0; } @@ -65,8 +67,8 @@ public double Distance(string s1, string s2) if (m_len == 0) return 0.0; return 1.0 - - (1.0 * lcs.Length(s1, s2)) - / m_len; + - (1.0 * LongestCommonSubsequence.Length(s1, s2)) + / m_len; } } } diff --git a/src/F23.StringSimilarity/NormalizedLevenshtein.cs b/src/F23.StringSimilarity/NormalizedLevenshtein.cs index 8b14a65..208e451 100644 --- a/src/F23.StringSimilarity/NormalizedLevenshtein.cs +++ b/src/F23.StringSimilarity/NormalizedLevenshtein.cs @@ -31,7 +31,7 @@ namespace F23.StringSimilarity /// the longest string. The resulting value is always in the interval [0.0 1.0] /// but it is not a metric anymore! The similarity is computed as 1 - normalized /// distance. - public class NormalizedLevenshtein : INormalizedStringDistance, INormalizedStringSimilarity + public class NormalizedLevenshtein : INormalizedStringDistance, INormalizedStringSimilarity, INormalizedSpanDistance, INormalizedSpanSimilarity { private readonly Levenshtein l = new Levenshtein(); @@ -43,6 +43,10 @@ public class NormalizedLevenshtein : INormalizedStringDistance, INormalizedStrin /// The computed distance in the range [0, 1] /// If s1 or s2 is null. public double Distance(string s1, string s2) + => Distance(s1.AsSpan(), s2.AsSpan()); + + public double Distance(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable { if (s1 == null) { @@ -54,7 +58,7 @@ public double Distance(string s1, string s2) throw new ArgumentNullException(nameof(s2)); } - if (s1.Equals(s2)) + if (s1.SequenceEqual(s2)) { return 0.0; } @@ -78,5 +82,9 @@ public double Distance(string s1, string s2) /// If s1 or s2 is null. public double Similarity(string s1, string s2) => 1.0 - Distance(s1, s2); + + public double Similarity(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable + => 1.0 - Distance(s1, s2); } } diff --git a/src/F23.StringSimilarity/OptimalStringAlignment.cs b/src/F23.StringSimilarity/OptimalStringAlignment.cs index f8a8dbb..13af7a1 100644 --- a/src/F23.StringSimilarity/OptimalStringAlignment.cs +++ b/src/F23.StringSimilarity/OptimalStringAlignment.cs @@ -29,7 +29,7 @@ namespace F23.StringSimilarity { - public sealed class OptimalStringAlignment : IStringDistance + public sealed class OptimalStringAlignment : IStringDistance, ISpanDistance { /// /// Compute the distance between strings: the minimum number of operations @@ -42,6 +42,10 @@ public sealed class OptimalStringAlignment : IStringDistance /// the OSA distance /// If s1 or s2 is null. public double Distance(string s1, string s2) + => Distance(s1.AsSpan(), s2.AsSpan()); + + public double Distance(ReadOnlySpan s1, ReadOnlySpan s2) + where T : IEquatable { if (s1 == null) { @@ -53,7 +57,7 @@ public double Distance(string s1, string s2) throw new ArgumentNullException(nameof(s2)); } - if (s1.Equals(s2)) + if (s1.SequenceEqual(s2)) { return 0; } @@ -93,7 +97,7 @@ public double Distance(string s1, string s2) //if s1[i - 1] = s2[j - 1] then cost = 0, else cost = 1 cost = 1; - if (s1[i - 1] == s2[j - 1]) + if (s1[i - 1].Equals(s2[j - 1])) { cost = 0; } @@ -106,8 +110,8 @@ public double Distance(string s1, string s2) //transposition check if (i > 1 && j > 1 - && s1[i - 1] == s2[j - 2] - && s1[i - 2] == s2[j - 1] + && s1[i - 1].Equals(s2[j - 2]) + && s1[i - 2].Equals(s2[j - 1]) ) { d[i, j] = Math.Min(d[i, j], d[i - 2, j - 2] + cost); diff --git a/test/F23.StringSimilarity.Tests/DamerauTest.cs b/test/F23.StringSimilarity.Tests/DamerauTest.cs index 0e9cf65..2faced5 100644 --- a/test/F23.StringSimilarity.Tests/DamerauTest.cs +++ b/test/F23.StringSimilarity.Tests/DamerauTest.cs @@ -22,7 +22,9 @@ * THE SOFTWARE. */ +using System; using System.Diagnostics.CodeAnalysis; +using System.Text; using F23.StringSimilarity.Tests.TestUtil; using Xunit; @@ -33,15 +35,30 @@ namespace F23.StringSimilarity.Tests [SuppressMessage("ReSharper", "ArgumentsStyleOther")] public class DamerauTest { - [Fact] - public void TestDistance() + [InlineData("ABCDEF", "ABDCEF", 1.0)] + [InlineData("ABCDEF", "BACDFE", 2.0)] + [InlineData("ABCDEF", "ABCDE", 1.0)] + [Theory] + public void TestDistance(string s1, string s2, double expected) { var instance = new Damerau(); - Assert.Equal(expected: 1.0, actual: instance.Distance("ABCDEF", "ABDCEF")); - Assert.Equal(expected: 2.0, actual: instance.Distance("ABCDEF", "BACDFE")); - Assert.Equal(expected: 1.0, actual: instance.Distance("ABCDEF", "ABCDE")); - + // test string version + Assert.Equal(expected, actual: instance.Distance(s1, s2)); + + // test char span version + Assert.Equal(expected, actual: instance.Distance(s1.AsSpan(), s2.AsSpan())); + + // test byte span version + Assert.Equal(expected, actual: instance.Distance( + Encoding.Latin1.GetBytes(s1).AsSpan(), + Encoding.Latin1.GetBytes(s2).AsSpan())); + } + + [Fact] + public void NullEmptyDistanceTest() + { + var instance = new Damerau(); NullEmptyTests.TestDistance(instance); } } diff --git a/test/F23.StringSimilarity.Tests/JaroWinklerTest.cs b/test/F23.StringSimilarity.Tests/JaroWinklerTest.cs index fc1cca8..9fc5792 100644 --- a/test/F23.StringSimilarity.Tests/JaroWinklerTest.cs +++ b/test/F23.StringSimilarity.Tests/JaroWinklerTest.cs @@ -22,6 +22,7 @@ * THE SOFTWARE. */ +using System; using System.Diagnostics.CodeAnalysis; using F23.StringSimilarity.Tests.TestUtil; using Xunit; @@ -33,33 +34,51 @@ namespace F23.StringSimilarity.Tests [SuppressMessage("ReSharper", "ArgumentsStyleOther")] public class JaroWinklerTest { - [Fact] - public void TestSimilarity() + [InlineData("My string", "My tsring", 0.974074)] + [InlineData("My string", "My ntrisg", 0.896296)] + [Theory] + public void TestSimilarity(string s1, string s2, double expected) { var instance = new JaroWinkler(); + // test string version Assert.Equal( - expected: 0.974074, - actual: instance.Similarity("My string", "My tsring"), + expected, + actual: instance.Similarity(s1, s2), precision: 6 // 0.000001 ); - + + // test char span version Assert.Equal( - expected: 0.896296, - actual: instance.Similarity("My string", "My ntrisg"), + expected, + actual: instance.Similarity(s1.AsSpan(), s2.AsSpan()), + precision: 6 // 0.000001 + ); + + // test byte span version + Assert.Equal( + expected, + actual: instance.Similarity( + System.Text.Encoding.Latin1.GetBytes(s1).AsSpan(), + System.Text.Encoding.Latin1.GetBytes(s2).AsSpan()), precision: 6 // 0.000001 ); - - NullEmptyTests.TestSimilarity(instance); } [Fact] - public void TestDistance() + public void NullEmptyDistanceTest() { var instance = new JaroWinkler(); NullEmptyTests.TestDistance(instance); // TODO: regular (non-null/empty) distance tests } + + [Fact] + public void NullEmptySimilarityTest() + { + var instance = new JaroWinkler(); + NullEmptyTests.TestSimilarity(instance); + } } } diff --git a/test/F23.StringSimilarity.Tests/LevenshteinTest.cs b/test/F23.StringSimilarity.Tests/LevenshteinTest.cs index 83bc5f5..304fd7e 100644 --- a/test/F23.StringSimilarity.Tests/LevenshteinTest.cs +++ b/test/F23.StringSimilarity.Tests/LevenshteinTest.cs @@ -22,6 +22,7 @@ * THE SOFTWARE. */ +using System; using System.Diagnostics.CodeAnalysis; using F23.StringSimilarity.Tests.TestUtil; using Xunit; @@ -33,20 +34,51 @@ namespace F23.StringSimilarity.Tests [SuppressMessage("ReSharper", "ArgumentsStyleOther")] public class LevenshteinTest { - [Fact] - public void TestDistance() + [InlineData("My string", "My tring", 1.0)] + [InlineData("My string", "M string2", 2.0)] + [InlineData("My string", "My $tring", 1.0)] + [Theory] + public void TestDistance(string s1, string s2, double expected) { var instance = new Levenshtein(); - Assert.Equal(expected: 1.0, actual: instance.Distance("My string", "My tring")); - Assert.Equal(expected: 2.0, actual: instance.Distance("My string", "M string2")); - Assert.Equal(expected: 1.0, actual: instance.Distance("My string", "My $tring")); - - // With limits. - Assert.Equal(2.0, instance.Distance("My string", "M string2", 4)); - Assert.Equal(2.0, instance.Distance("My string", "M string2", 2)); - Assert.Equal(1.0, instance.Distance("My string", "M string2", 1)); + // test string version + Assert.Equal(expected, actual: instance.Distance(s1, s2)); + + // test char span version + Assert.Equal(expected, actual: instance.Distance(s1.AsSpan(), s2.AsSpan())); + + // test byte span version + Assert.Equal(expected, actual: instance.Distance( + System.Text.Encoding.Latin1.GetBytes(s1).AsSpan(), + System.Text.Encoding.Latin1.GetBytes(s2).AsSpan())); + } + [InlineData("My string", "M string2", 4, 2.0)] + [InlineData("My string", "M string2", 2, 2.0)] + [InlineData("My string", "M string2", 1, 1.0)] + [Theory] + public void TestDistanceWithLimits(string s1, string s2, int limit, double expected) + { + var instance = new Levenshtein(); + + // test string version + Assert.Equal(expected, actual: instance.Distance(s1, s2, limit)); + + // test char span version + Assert.Equal(expected, actual: instance.Distance(s1.AsSpan(), s2.AsSpan(), limit)); + + // test byte span version + Assert.Equal(expected, actual: instance.Distance( + System.Text.Encoding.Latin1.GetBytes(s1).AsSpan(), + System.Text.Encoding.Latin1.GetBytes(s2).AsSpan(), + limit)); + } + + [Fact] + public void NullEmptyDistanceTest() + { + var instance = new Levenshtein(); NullEmptyTests.TestDistance(instance); } } diff --git a/test/F23.StringSimilarity.Tests/LongestCommonSubsequenceTest.cs b/test/F23.StringSimilarity.Tests/LongestCommonSubsequenceTest.cs index 8c860d3..55fdcf2 100644 --- a/test/F23.StringSimilarity.Tests/LongestCommonSubsequenceTest.cs +++ b/test/F23.StringSimilarity.Tests/LongestCommonSubsequenceTest.cs @@ -22,6 +22,7 @@ * THE SOFTWARE. */ +using System; using System.Diagnostics.CodeAnalysis; using F23.StringSimilarity.Tests.TestUtil; using Xunit; @@ -33,16 +34,31 @@ namespace F23.StringSimilarity.Tests [SuppressMessage("ReSharper", "ArgumentsStyleOther")] public class LongestCommonSubsequenceTest { - [Fact] - public void TestDistance() + [InlineData("AGCAT", "GAC", 4)] + [InlineData("AGCAT", "AGCT", 1)] + [Theory] + public void TestDistance(string s1, string s2, double expected) { var instance = new LongestCommonSubsequence(); // LCS = GA or GC => distance = 4 (remove 3 letters and add 1) - Assert.Equal(expected: 4, actual: instance.Distance("AGCAT", "GAC")); - Assert.Equal(expected: 1, actual: instance.Distance("AGCAT", "AGCT")); - + // test string version + Assert.Equal(expected, actual: instance.Distance(s1, s2)); + + // test char span version + Assert.Equal(expected, actual: instance.Distance(s1.AsSpan(), s2.AsSpan())); + + // test byte span version + Assert.Equal(expected, actual: instance.Distance( + System.Text.Encoding.Latin1.GetBytes(s1).AsSpan(), + System.Text.Encoding.Latin1.GetBytes(s2).AsSpan())); + } + + [Fact] + public void NullEmptyDistanceTest() + { + var instance = new LongestCommonSubsequence(); NullEmptyTests.TestDistance(instance); } } diff --git a/test/F23.StringSimilarity.Tests/MetricLCSTest.cs b/test/F23.StringSimilarity.Tests/MetricLCSTest.cs index 387977e..01c6216 100644 --- a/test/F23.StringSimilarity.Tests/MetricLCSTest.cs +++ b/test/F23.StringSimilarity.Tests/MetricLCSTest.cs @@ -30,7 +30,7 @@ namespace F23.StringSimilarity.Tests public class MetricLCSTest { [Fact] - public void TestDistance() + public void NullEmptyDistanceTest() { var instance = new MetricLCS(); NullEmptyTests.TestDistance(instance); diff --git a/test/F23.StringSimilarity.Tests/NormalizedLevenshteinTest.cs b/test/F23.StringSimilarity.Tests/NormalizedLevenshteinTest.cs index 502026e..2c295c5 100644 --- a/test/F23.StringSimilarity.Tests/NormalizedLevenshteinTest.cs +++ b/test/F23.StringSimilarity.Tests/NormalizedLevenshteinTest.cs @@ -30,7 +30,7 @@ namespace F23.StringSimilarity.Tests public class NormalizedLevenshteinTest { [Fact] - public void TestDistance() + public void NullEmptyDistanceTest() { var instance = new NormalizedLevenshtein(); NullEmptyTests.TestDistance(instance); @@ -39,7 +39,7 @@ public void TestDistance() } [Fact] - public void TestSimilarity() + public void NullEmptySimilarityTest() { var instance = new NormalizedLevenshtein(); NullEmptyTests.TestSimilarity(instance); diff --git a/test/F23.StringSimilarity.Tests/OptimalStringAlignmentTest.cs b/test/F23.StringSimilarity.Tests/OptimalStringAlignmentTest.cs index 490ff73..91f8704 100644 --- a/test/F23.StringSimilarity.Tests/OptimalStringAlignmentTest.cs +++ b/test/F23.StringSimilarity.Tests/OptimalStringAlignmentTest.cs @@ -22,6 +22,7 @@ * THE SOFTWARE. */ +using System; using System.Diagnostics.CodeAnalysis; using F23.StringSimilarity.Tests.TestUtil; using Xunit; @@ -33,79 +34,51 @@ namespace F23.StringSimilarity.Tests [SuppressMessage("ReSharper", "ArgumentsStyleOther")] public class OptimalStringAlignmentTest { - [Fact] - public void TestDistance() + [InlineData("", "ABCDEF", 6.0)] + [InlineData("ABCDEF", "", 6.0)] + [InlineData("", "", 0.0)] + [InlineData("ABCDEF", "ABCDEF", 0.0)] + [InlineData("ABDCFE", "ABDCEF", 1.0)] + [InlineData("BBDCEF", "ABDCEF", 1.0)] + [InlineData("BDCEF", "ABDCEF", 1.0)] + [InlineData("ABDCEF", "ADCEF", 1.0)] + [InlineData("CA", "ABC", 3.0)] + [InlineData("BAC", "CAB", 2.0)] + [InlineData("abcde", "awxyz", 4.0)] + [InlineData("abcde", "vwxyz", 5.0)] + [Theory] + public void TestDistance(string s1, string s2, double expected) { var instance = new OptimalStringAlignment(); - // zero length - Assert.Equal( - expected: 6.0, - actual: instance.Distance("", "ABCDEF"), - precision: 0 // 0.0 - ); - Assert.Equal( - expected: 6.0, - actual: instance.Distance("ABCDEF", ""), - precision: 0 // 0.0 - ); - Assert.Equal( - expected: 0.0, - actual: instance.Distance("", ""), - precision: 0 // 0.0 - ); - - // equality - Assert.Equal( - expected: 0.0, - actual: instance.Distance("ABCDEF", "ABCDEF"), - precision: 0 // 0.0 - ); - - // single operation - Assert.Equal( - expected: 1.0, - actual: instance.Distance("ABDCFE", "ABDCEF"), - precision: 0 // 0.0 - ); - Assert.Equal( - expected: 1.0, - actual: instance.Distance("BBDCEF", "ABDCEF"), - precision: 0 // 0.0 - ); - Assert.Equal( - expected: 1.0, - actual: instance.Distance("BDCEF", "ABDCEF"), - precision: 0 // 0.0 - ); - Assert.Equal( - expected: 1.0, - actual: instance.Distance("ABDCEF", "ADCEF"), - precision: 0 // 0.0 - ); - - // other + // test string version Assert.Equal( - expected: 3.0, - actual: instance.Distance("CA", "ABC"), + expected: expected, + actual: instance.Distance(s1, s2), precision: 0 // 0.0 ); + + // test char span version Assert.Equal( - expected: 2.0, - actual: instance.Distance("BAC", "CAB"), + expected: expected, + actual: instance.Distance(s1.AsSpan(), s2.AsSpan()), precision: 0 // 0.0 ); + + // test byte span version Assert.Equal( - expected: 4.0, - actual: instance.Distance("abcde", "awxyz"), + expected: expected, + actual: instance.Distance( + System.Text.Encoding.Latin1.GetBytes(s1).AsSpan(), + System.Text.Encoding.Latin1.GetBytes(s2).AsSpan()), precision: 0 // 0.0 ); - Assert.Equal( - expected: 5.0, - actual: instance.Distance("abcde", "vwxyz"), - precision: 0 // 0.0 - ); - + } + + [Fact] + public void NullEmptyDistanceTest() + { + var instance = new OptimalStringAlignment(); NullEmptyTests.TestDistance(instance); } }