Renamed fuzzy feature to strsim.

leontoeides · Nov 15, 2023 · e075ab3 · e075ab3
1 parent ab55808
commit e075ab3
Show file tree

Hide file tree

Showing 30 changed files with 1,644 additions and 19 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "indicium"
-version = "0.5.2"
+version = "0.5.3"
 authors = ["Dylan Bowker <[email protected]>"]
 edition = "2021"
 categories = [ "database-implementations" ]
@@ -16,9 +16,9 @@ repository = "https://github.com/leontoeides/indicium"
 maintenance = { status = "actively-developed" }
 
 [features]
-default = [ "simple", "fuzzy", "ahash" ]
+default = [ "simple", "ahash" ]
 simple = []
-fuzzy = [ "strsim" ]
+strsim = [ "dep:strsim" ]
 ahash = [ "dep:ahash" ]
 select2 = [ "simple", "serde" ]
 

diff --git a/src/simple/autocomplete/context.rs b/src/simple/autocomplete/context.rs
@@ -154,9 +154,9 @@ impl<K: Hash + Ord> SearchIndex<K> {
                 // Collect all keyword autocompletions into a `Vec`:
                 .collect();
 
-            // If fuzzy string searching enabled, examine the resulting
+            // If `strsim` string searching enabled, examine the resulting
             // auto-complete options before using them:
-            #[cfg(feature = "fuzzy")]
+            #[cfg(feature = "strsim")]
             if autocompletions.is_empty() {
                 // No autocomplete options were found for the user's last
                 // (partial) keyword. Attempt to use fuzzy string search to find

diff --git a/src/simple/autocomplete/global.rs b/src/simple/autocomplete/global.rs
@@ -150,9 +150,9 @@ impl<K: Hash + Ord> SearchIndex<K> {
                 // Collect all keyword autocompletions into a `Vec`:
                 .collect();
 
-            // If fuzzy string searching enabled, examine the resulting
+            // If `strsim` string searching enabled, examine the resulting
             // auto-complete options before using them:
-            #[cfg(feature = "fuzzy")]
+            #[cfg(feature = "strsim")]
             if autocompletions.is_empty() {
                 // No autocomplete options were found for the user's last
                 // (partial) keyword. Attempt to use fuzzy string search to find

diff --git a/src/simple/autocomplete/keyword.rs b/src/simple/autocomplete/keyword.rs
@@ -125,9 +125,9 @@ impl<K: Hash + Ord> SearchIndex<K> {
             // Collect all keyword autocompletions into a `Vec`:
             .collect();
 
-        // If fuzzy string searching enabled, examine the resulting
+        // If `strsim` string searching enabled, examine the resulting
         // auto-complete options before returning them:
-        #[cfg(feature = "fuzzy")]
+        #[cfg(feature = "strsim")]
         if autocomplete_options.is_empty() {
             // No autocomplete options were found for the user's last
             // (partial) keyword. Attempt to use fuzzy string search to find
@@ -150,7 +150,7 @@ impl<K: Hash + Ord> SearchIndex<K> {
 
         // If `strsim` string searching disabled, return the resulting
         // auto-complete options without further processing:
-        #[cfg(not(feature = "fuzzy"))]
+        #[cfg(not(feature = "strsim"))]
         autocomplete_options.into_iter().map(|kstring| kstring.as_str()).collect()
 
     } // fn

diff --git a/src/simple/builder.rs b/src/simple/builder.rs
@@ -131,7 +131,7 @@ impl<K: Clone + Ord> SearchIndexBuilder<K> {
     /// **Default:** `StrSimType::Levenshtein`
     ///
     /// [`StrSimType`]: enum.StrSimType.html
-    #[cfg(feature = "fuzzy")]
+    #[cfg(feature = "strsim")]
     pub fn strsim_type(mut self, strsim_type: Option<StrSimType>) -> Self {
         self.strsim_type = strsim_type;
         self
@@ -162,7 +162,7 @@ impl<K: Clone + Ord> SearchIndexBuilder<K> {
     /// be cripplingly slow on very large search indices.
     ///
     /// **Default:** `3` characters
-    #[cfg(feature = "fuzzy")]
+    #[cfg(feature = "strsim")]
     pub fn strsim_length(mut self, strsim_length: usize) -> Self {
         self.strsim_length = strsim_length;
         self
@@ -181,7 +181,7 @@ impl<K: Clone + Ord> SearchIndexBuilder<K> {
     /// be returned to the user.
     ///
     /// **Default:** `0.3`
-    #[cfg(feature = "fuzzy")]
+    #[cfg(feature = "strsim")]
     pub fn strsim_minimum_score(mut self, strsim_minimum_score: f64) -> Self {
         self.strsim_minimum_score = strsim_minimum_score;
         self

diff --git a/src/simple/internal/eddie/autocomplete/context_damerau_levenshtein.rs b/src/simple/internal/eddie/autocomplete/context_damerau_levenshtein.rs
@@ -0,0 +1,83 @@
+use crate::simple::internal::StrsimTopScores;
+use crate::simple::search_index::SearchIndex;
+use kstring::KString;
+use std::{cmp::Ord, collections::BTreeSet, hash::Hash};
+use strsim::normalized_damerau_levenshtein;
+
+// -----------------------------------------------------------------------------
+
+impl<K: Hash + Ord> SearchIndex<K> {
+
+    // -------------------------------------------------------------------------
+    //
+    /// Scans the search index keywords that begin with `index_range` for the
+    /// closest matches to `user_keyword`, scored with the normalized
+    /// Damerau-Levenshtein metric from Danny Guo's
+    /// [strsim](https://crates.io/crates/strsim) crate.
+    ///
+    /// When the user's last (partial) keyword that is meant to be autocompleted
+    /// returns no matches, these `strsim_autocomplete_*` methods can be used to
+    /// find the best match for substitution.
+    ///
+    /// * `index_range` limits which keywords are scored. For example, if the
+    /// `index_range` is "super", only search index keywords beginning with
+    /// "super" (such as "superalloy" or "supergiant") are scored.
+    ///
+    /// * `key_set` makes fuzzy matching contextual: a search index keyword is
+    /// only scored if at least one of its keys is in this key set. An empty
+    /// `key_set` disables this filter and every keyword in range is scored.
+    ///
+    /// * `user_keyword` is the keyword each index keyword is scored against.
+    //
+    // Note: these `strsim_autocomplete_*` methods are very similar and may seem
+    // repetitive with a lot of boilerplate. These were intentionally made more
+    // "concrete" and less modular in order to be more efficient.
+
+    pub(crate) fn strsim_autocomplete_context_damerau_levenshtein(
+        &self,
+        index_range: &str,
+        key_set: &BTreeSet<&K>,
+        user_keyword: &str,
+    ) -> impl Iterator<Item = (&KString, &BTreeSet<K>)> {
+
+        // Tracks the top scoring keywords (created with capacity `maximum_autocomplete_options`):
+        let mut top_scores: StrsimTopScores<K, f64> =
+            StrsimTopScores::with_capacity(self.maximum_autocomplete_options);
+
+        // Scan the search index for the highest scoring keywords:
+        self.b_tree_map
+            // Start at the first keyword that is >= the (partial) keyword string:
+            .range(KString::from_ref(index_range)..)
+            // We did not specify an end bound for our `range` function (see
+            // above.) `range` will return _every_ keyword greater than or equal
+            // to the supplied keyword. The below `take_while` will effectively
+            // break iteration when we reach a keyword that does not start with
+            // our supplied (partial) keyword.
+            .take_while(|(index_keyword, _keys)| index_keyword.starts_with(index_range))
+            // Only examine search index keywords that intersect with the caller
+            // provided key-set. This ensures contextual fuzzy matching. This
+            // will filter out search index keywords that don't contain any keys
+            // from the caller provided key set (an empty key set keeps them all):
+            .filter(|(_index_keyword, index_keys)|
+                key_set.is_empty() ||
+                    index_keys.iter().any(|index_key| key_set.contains(index_key))
+            ) // filter
+            // For each remaining keyword in the search index:
+            .for_each(|(index_keyword, index_keys)| {
+                // Using this keyword from the search index, calculate its
+                // similarity to the user's keyword:
+                let score = normalized_damerau_levenshtein(index_keyword, user_keyword);
+                // Keep the score only if it is a normal float (`is_normal`
+                // rejects zero, NaN, infinities and subnormals) and high enough:
+                if score.is_normal() && score >= self.strsim_minimum_score {
+                    top_scores.insert(index_keyword, index_keys, score)
+                } // if
+            }); // for_each
+
+        // Return the top scoring keywords that could be used as autocomplete
+        // options, and their keys, to the caller:
+        top_scores.results()
+
+    } // fn
+
+} // impl
diff --git a/src/simple/internal/eddie/autocomplete/context_jaro.rs b/src/simple/internal/eddie/autocomplete/context_jaro.rs
@@ -0,0 +1,83 @@
+use crate::simple::internal::StrsimTopScores;
+use crate::simple::search_index::SearchIndex;
+use kstring::KString;
+use std::{cmp::Ord, collections::BTreeSet, hash::Hash};
+use strsim::jaro;
+
+// -----------------------------------------------------------------------------
+
+impl<K: Hash + Ord> SearchIndex<K> {
+
+    // -------------------------------------------------------------------------
+    //
+    /// Scans the search index keywords that begin with `index_range` for the
+    /// closest matches to `user_keyword`, scored with the Jaro string
+    /// similarity metric from Danny Guo's
+    /// [strsim](https://crates.io/crates/strsim) crate.
+    ///
+    /// When the user's last (partial) keyword that is meant to be autocompleted
+    /// returns no matches, these `strsim_autocomplete_*` methods can be used to
+    /// find the best match for substitution.
+    ///
+    /// * `index_range` limits which keywords are scored. For example, if the
+    /// `index_range` is "super", only search index keywords beginning with
+    /// "super" (such as "superalloy" or "supergiant") are scored.
+    ///
+    /// * `key_set` makes fuzzy matching contextual: a search index keyword is
+    /// only scored if at least one of its keys is in this key set. An empty
+    /// `key_set` disables this filter and every keyword in range is scored.
+    ///
+    /// * `user_keyword` is the keyword each index keyword is scored against.
+    //
+    // Note: these `strsim_autocomplete_*` methods are very similar and may seem
+    // repetitive with a lot of boilerplate. These were intentionally made more
+    // "concrete" and less modular in order to be more efficient.
+
+    pub(crate) fn strsim_autocomplete_context_jaro(
+        &self,
+        index_range: &str,
+        key_set: &BTreeSet<&K>,
+        user_keyword: &str,
+    ) -> impl Iterator<Item = (&KString, &BTreeSet<K>)> {
+
+        // Tracks the top scoring keywords (created with capacity `maximum_autocomplete_options`):
+        let mut top_scores: StrsimTopScores<K, f64> =
+            StrsimTopScores::with_capacity(self.maximum_autocomplete_options);
+
+        // Scan the search index for the highest scoring keywords:
+        self.b_tree_map
+            // Start at the first keyword that is >= the (partial) keyword string:
+            .range(KString::from_ref(index_range)..)
+            // We did not specify an end bound for our `range` function (see
+            // above.) `range` will return _every_ keyword greater than or equal
+            // to the supplied keyword. The below `take_while` will effectively
+            // break iteration when we reach a keyword that does not start with
+            // our supplied (partial) keyword.
+            .take_while(|(index_keyword, _keys)| index_keyword.starts_with(index_range))
+            // Only examine search index keywords that intersect with the caller
+            // provided key-set. This ensures contextual fuzzy matching. This
+            // will filter out search index keywords that don't contain any keys
+            // from the caller provided key set (an empty key set keeps them all):
+            .filter(|(_index_keyword, index_keys)|
+                key_set.is_empty() ||
+                    index_keys.iter().any(|index_key| key_set.contains(index_key))
+            ) // filter
+            // For each remaining keyword in the search index:
+            .for_each(|(index_keyword, index_keys)| {
+                // Using this keyword from the search index, calculate its
+                // similarity to the user's keyword:
+                let score = jaro(index_keyword, user_keyword);
+                // Keep the score only if it is a normal float (`is_normal`
+                // rejects zero, NaN, infinities and subnormals) and high enough:
+                if score.is_normal() && score >= self.strsim_minimum_score {
+                    top_scores.insert(index_keyword, index_keys, score)
+                } // if
+            }); // for_each
+
+        // Return the top scoring keywords that could be used as autocomplete
+        // options, and their keys, to the caller:
+        top_scores.results()
+
+    } // fn
+
+} // impl
diff --git a/src/simple/internal/eddie/autocomplete/context_jaro_winkler.rs b/src/simple/internal/eddie/autocomplete/context_jaro_winkler.rs
@@ -0,0 +1,83 @@
+use crate::simple::internal::StrsimTopScores;
+use crate::simple::search_index::SearchIndex;
+use kstring::KString;
+use std::{cmp::Ord, collections::BTreeSet, hash::Hash};
+use strsim::jaro_winkler;
+
+// -----------------------------------------------------------------------------
+
+impl<K: Hash + Ord> SearchIndex<K> {
+
+    // -------------------------------------------------------------------------
+    //
+    /// Scans the search index keywords that begin with `index_range` for the
+    /// closest matches to `user_keyword`, scored with the Jaro-Winkler string
+    /// similarity metric from Danny Guo's
+    /// [strsim](https://crates.io/crates/strsim) crate.
+    ///
+    /// When the user's last (partial) keyword that is meant to be autocompleted
+    /// returns no matches, these `strsim_autocomplete_*` methods can be used to
+    /// find the best match for substitution.
+    ///
+    /// * `index_range` limits which keywords are scored. For example, if the
+    /// `index_range` is "super", only search index keywords beginning with
+    /// "super" (such as "superalloy" or "supergiant") are scored.
+    ///
+    /// * `key_set` makes fuzzy matching contextual: a search index keyword is
+    /// only scored if at least one of its keys is in this key set. An empty
+    /// `key_set` disables this filter and every keyword in range is scored.
+    ///
+    /// * `user_keyword` is the keyword each index keyword is scored against.
+    //
+    // Note: these `strsim_autocomplete_*` methods are very similar and may seem
+    // repetitive with a lot of boilerplate. These were intentionally made more
+    // "concrete" and less modular in order to be more efficient.
+
+    pub(crate) fn strsim_autocomplete_context_jaro_winkler(
+        &self,
+        index_range: &str,
+        key_set: &BTreeSet<&K>,
+        user_keyword: &str,
+    ) -> impl Iterator<Item = (&KString, &BTreeSet<K>)> {
+
+        // Tracks the top scoring keywords (created with capacity `maximum_autocomplete_options`):
+        let mut top_scores: StrsimTopScores<K, f64> =
+            StrsimTopScores::with_capacity(self.maximum_autocomplete_options);
+
+        // Scan the search index for the highest scoring keywords:
+        self.b_tree_map
+            // Start at the first keyword that is >= the (partial) keyword string:
+            .range(KString::from_ref(index_range)..)
+            // We did not specify an end bound for our `range` function (see
+            // above.) `range` will return _every_ keyword greater than or equal
+            // to the supplied keyword. The below `take_while` will effectively
+            // break iteration when we reach a keyword that does not start with
+            // our supplied (partial) keyword.
+            .take_while(|(index_keyword, _keys)| index_keyword.starts_with(index_range))
+            // Only examine search index keywords that intersect with the caller
+            // provided key-set. This ensures contextual fuzzy matching. This
+            // will filter out search index keywords that don't contain any keys
+            // from the caller provided key set (an empty key set keeps them all):
+            .filter(|(_index_keyword, index_keys)|
+                key_set.is_empty() ||
+                    index_keys.iter().any(|index_key| key_set.contains(index_key))
+            ) // filter
+            // For each remaining keyword in the search index:
+            .for_each(|(index_keyword, index_keys)| {
+                // Using this keyword from the search index, calculate its
+                // similarity to the user's keyword:
+                let score = jaro_winkler(index_keyword, user_keyword);
+                // Keep the score only if it is a normal float (`is_normal`
+                // rejects zero, NaN, infinities and subnormals) and high enough:
+                if score.is_normal() && score >= self.strsim_minimum_score {
+                    top_scores.insert(index_keyword, index_keys, score)
+                } // if
+            }); // for_each
+
+        // Return the top scoring keywords that could be used as autocomplete
+        // options, and their keys, to the caller:
+        top_scores.results()
+
+    } // fn
+
+} // impl