Skip to content

Commit

Permalink
Renamed fuzzy feature to strsim.
Browse files Browse the repository at this point in the history
  • Loading branch information
leontoeides committed Nov 15, 2023
1 parent ab55808 commit e075ab3
Show file tree
Hide file tree
Showing 30 changed files with 1,644 additions and 19 deletions.
6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "indicium"
version = "0.5.2"
version = "0.5.3"
authors = ["Dylan Bowker <[email protected]>"]
edition = "2021"
categories = [ "database-implementations" ]
Expand All @@ -16,9 +16,9 @@ repository = "https://github.com/leontoeides/indicium"
maintenance = { status = "actively-developed" }

[features]
default = [ "simple", "fuzzy", "ahash" ]
default = [ "simple", "ahash" ]
simple = []
fuzzy = [ "strsim" ]
strsim = [ "dep:strsim" ]
ahash = [ "dep:ahash" ]
select2 = [ "simple", "serde" ]

Expand Down
4 changes: 2 additions & 2 deletions src/simple/autocomplete/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,9 @@ impl<K: Hash + Ord> SearchIndex<K> {
// Collect all keyword autocompletions into a `Vec`:
.collect();

// If fuzzy string searching enabled, examine the resulting
// If `strsim` string searching enabled, examine the resulting
// auto-complete options before using them:
#[cfg(feature = "fuzzy")]
#[cfg(feature = "strsim")]
if autocompletions.is_empty() {
// No autocomplete options were found for the user's last
// (partial) keyword. Attempt to use fuzzy string search to find
Expand Down
4 changes: 2 additions & 2 deletions src/simple/autocomplete/global.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,9 @@ impl<K: Hash + Ord> SearchIndex<K> {
// Collect all keyword autocompletions into a `Vec`:
.collect();

// If fuzzy string searching enabled, examine the resulting
// If `strsim` string searching enabled, examine the resulting
// auto-complete options before using them:
#[cfg(feature = "fuzzy")]
#[cfg(feature = "strsim")]
if autocompletions.is_empty() {
// No autocomplete options were found for the user's last
// (partial) keyword. Attempt to use fuzzy string search to find
Expand Down
6 changes: 3 additions & 3 deletions src/simple/autocomplete/keyword.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,9 @@ impl<K: Hash + Ord> SearchIndex<K> {
// Collect all keyword autocompletions into a `Vec`:
.collect();

// If fuzzy string searching enabled, examine the resulting
// If `strsim` string searching enabled, examine the resulting
// auto-complete options before returning them:
#[cfg(feature = "fuzzy")]
#[cfg(feature = "strsim")]
if autocomplete_options.is_empty() {
// No autocomplete options were found for the user's last
// (partial) keyword. Attempt to use fuzzy string search to find
Expand All @@ -150,7 +150,7 @@ impl<K: Hash + Ord> SearchIndex<K> {

// If fuzzy string searching is disabled, return the resulting
// auto-complete options without further processing:
#[cfg(not(feature = "fuzzy"))]
#[cfg(not(feature = "strsim"))]
autocomplete_options.into_iter().map(|kstring| kstring.as_str()).collect()

} // fn
Expand Down
6 changes: 3 additions & 3 deletions src/simple/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ impl<K: Clone + Ord> SearchIndexBuilder<K> {
/// **Default:** `StrSimType::Levenshtein`
///
/// [`StrSimType`]: enum.StrSimType.html
#[cfg(feature = "fuzzy")]
#[cfg(feature = "strsim")]
pub fn strsim_type(mut self, strsim_type: Option<StrSimType>) -> Self {
self.strsim_type = strsim_type;
self
Expand Down Expand Up @@ -162,7 +162,7 @@ impl<K: Clone + Ord> SearchIndexBuilder<K> {
/// be cripplingly slow on very large search indices.
///
/// **Default:** `3` characters
#[cfg(feature = "fuzzy")]
#[cfg(feature = "strsim")]
pub fn strsim_length(mut self, strsim_length: usize) -> Self {
self.strsim_length = strsim_length;
self
Expand All @@ -181,7 +181,7 @@ impl<K: Clone + Ord> SearchIndexBuilder<K> {
/// be returned to the user.
///
/// **Default:** `0.3`
#[cfg(feature = "fuzzy")]
#[cfg(feature = "strsim")]
pub fn strsim_minimum_score(mut self, strsim_minimum_score: f64) -> Self {
self.strsim_minimum_score = strsim_minimum_score;
self
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
use crate::simple::internal::StrsimTopScores;
use crate::simple::search_index::SearchIndex;
use kstring::KString;
use std::{cmp::Ord, collections::BTreeSet, hash::Hash};
use strsim::normalized_damerau_levenshtein;

// -----------------------------------------------------------------------------

impl<K: Hash + Ord> SearchIndex<K> {

    /// Finds the search index keywords that most closely resemble the user's
    /// keyword, scored with the normalized Damerau-Levenshtein similarity
    /// metric from Danny Guo's [strsim](https://crates.io/crates/strsim)
    /// crate.
    ///
    /// Intended as a fallback for autocompletion: when the user's last
    /// (partial) keyword produced no autocomplete options, the
    /// `strsim_autocomplete_*` methods can supply a substitute keyword.
    ///
    /// * `index_range` is a prefix that bounds the scan. Only search index
    /// keywords that start with this string are scored. For example, with an
    /// `index_range` of "super" and a user keyword of "supersonic", the
    /// keyword "supersonic" is compared against "superalloy", "supergiant",
    /// and so on, but never against keywords outside the "super" prefix.
    ///
    /// * `key_set` makes the match contextual. A search index keyword is only
    /// scored if at least one of its keys is a member of `key_set`. An empty
    /// `key_set` disables this restriction.
    ///
    /// * `user_keyword` is the keyword that each candidate is compared
    /// against.
    ///
    /// Returns whatever the `StrsimTopScores` accumulator yields from
    /// `results()`: the retained keywords together with their key sets.
    //
    // Note: the `strsim_autocomplete_*` methods are deliberately near-copies
    // of one another. They were kept "concrete" rather than modular for the
    // sake of efficiency.
    pub(crate) fn strsim_autocomplete_context_damerau_levenshtein(
        &self,
        index_range: &str,
        key_set: &BTreeSet<&K>,
        user_keyword: &str,
    ) -> impl Iterator<Item = (&KString, &BTreeSet<K>)> {

        // Accumulator for the best scoring keywords, created with capacity
        // for the configured maximum number of autocomplete options:
        let mut best_matches: StrsimTopScores<K, f64> =
            StrsimTopScores::with_capacity(self.maximum_autocomplete_options);

        // The range has no upper bound, so on its own it would walk every
        // keyword greater than or equal to `index_range`. The `take_while`
        // ends the walk at the first keyword that no longer carries the
        // prefix:
        let candidates = self
            .b_tree_map
            .range(KString::from_ref(index_range)..)
            .take_while(|(candidate, _)| candidate.starts_with(index_range));

        for (candidate, candidate_keys) in candidates {

            // Contextual restriction: skip keywords that share no key with
            // the caller's key set (unless the key set is empty):
            let in_context = key_set.is_empty()
                || candidate_keys.iter().any(|key| key_set.contains(key));

            if !in_context {
                continue;
            } // if

            // How similar is this search index keyword to the user's keyword?
            let similarity = normalized_damerau_levenshtein(candidate, user_keyword);

            // Keep the keyword only when the score is a normal float (this
            // rejects zero, subnormal, infinite and NaN values) and it meets
            // the configured minimum:
            if similarity.is_normal() && similarity >= self.strsim_minimum_score {
                best_matches.insert(candidate, candidate_keys, similarity);
            } // if

        } // for

        // Hand the retained keywords, and their keys, back to the caller:
        best_matches.results()

    } // fn

} // impl
83 changes: 83 additions & 0 deletions src/simple/internal/eddie/autocomplete/context_jaro.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
use crate::simple::internal::StrsimTopScores;
use crate::simple::search_index::SearchIndex;
use kstring::KString;
use std::{cmp::Ord, collections::BTreeSet, hash::Hash};
use strsim::jaro;

// -----------------------------------------------------------------------------

impl<K: Hash + Ord> SearchIndex<K> {

    /// Finds the search index keywords that most closely resemble the user's
    /// keyword, scored with the Jaro similarity metric from Danny Guo's
    /// [strsim](https://crates.io/crates/strsim) crate.
    ///
    /// Intended as a fallback for autocompletion: when the user's last
    /// (partial) keyword produced no autocomplete options, the
    /// `strsim_autocomplete_*` methods can supply a substitute keyword.
    ///
    /// * `index_range` is a prefix that bounds the scan. Only search index
    /// keywords that start with this string are scored. For example, with an
    /// `index_range` of "super" and a user keyword of "supersonic", the
    /// keyword "supersonic" is compared against "superalloy", "supergiant",
    /// and so on, but never against keywords outside the "super" prefix.
    ///
    /// * `key_set` makes the match contextual. A search index keyword is only
    /// scored if at least one of its keys is a member of `key_set`. An empty
    /// `key_set` disables this restriction.
    ///
    /// * `user_keyword` is the keyword that each candidate is compared
    /// against.
    ///
    /// Returns whatever the `StrsimTopScores` accumulator yields from
    /// `results()`: the retained keywords together with their key sets.
    //
    // Note: the `strsim_autocomplete_*` methods are deliberately near-copies
    // of one another. They were kept "concrete" rather than modular for the
    // sake of efficiency.
    pub(crate) fn strsim_autocomplete_context_jaro(
        &self,
        index_range: &str,
        key_set: &BTreeSet<&K>,
        user_keyword: &str,
    ) -> impl Iterator<Item = (&KString, &BTreeSet<K>)> {

        // Accumulator for the best scoring keywords, created with capacity
        // for the configured maximum number of autocomplete options:
        let mut best_matches: StrsimTopScores<K, f64> =
            StrsimTopScores::with_capacity(self.maximum_autocomplete_options);

        // The range has no upper bound, so on its own it would walk every
        // keyword greater than or equal to `index_range`. The `take_while`
        // ends the walk at the first keyword that no longer carries the
        // prefix:
        let candidates = self
            .b_tree_map
            .range(KString::from_ref(index_range)..)
            .take_while(|(candidate, _)| candidate.starts_with(index_range));

        for (candidate, candidate_keys) in candidates {

            // Contextual restriction: skip keywords that share no key with
            // the caller's key set (unless the key set is empty):
            let in_context = key_set.is_empty()
                || candidate_keys.iter().any(|key| key_set.contains(key));

            if !in_context {
                continue;
            } // if

            // How similar is this search index keyword to the user's keyword?
            let similarity = jaro(candidate, user_keyword);

            // Keep the keyword only when the score is a normal float (this
            // rejects zero, subnormal, infinite and NaN values) and it meets
            // the configured minimum:
            if similarity.is_normal() && similarity >= self.strsim_minimum_score {
                best_matches.insert(candidate, candidate_keys, similarity);
            } // if

        } // for

        // Hand the retained keywords, and their keys, back to the caller:
        best_matches.results()

    } // fn

} // impl
83 changes: 83 additions & 0 deletions src/simple/internal/eddie/autocomplete/context_jaro_winkler.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
use crate::simple::internal::StrsimTopScores;
use crate::simple::search_index::SearchIndex;
use kstring::KString;
use std::{cmp::Ord, collections::BTreeSet, hash::Hash};
use strsim::jaro_winkler;

// -----------------------------------------------------------------------------

impl<K: Hash + Ord> SearchIndex<K> {

    /// Finds the search index keywords that most closely resemble the user's
    /// keyword, scored with the Jaro-Winkler similarity metric from Danny
    /// Guo's [strsim](https://crates.io/crates/strsim) crate.
    ///
    /// Intended as a fallback for autocompletion: when the user's last
    /// (partial) keyword produced no autocomplete options, the
    /// `strsim_autocomplete_*` methods can supply a substitute keyword.
    ///
    /// * `index_range` is a prefix that bounds the scan. Only search index
    /// keywords that start with this string are scored. For example, with an
    /// `index_range` of "super" and a user keyword of "supersonic", the
    /// keyword "supersonic" is compared against "superalloy", "supergiant",
    /// and so on, but never against keywords outside the "super" prefix.
    ///
    /// * `key_set` makes the match contextual. A search index keyword is only
    /// scored if at least one of its keys is a member of `key_set`. An empty
    /// `key_set` disables this restriction.
    ///
    /// * `user_keyword` is the keyword that each candidate is compared
    /// against.
    ///
    /// Returns whatever the `StrsimTopScores` accumulator yields from
    /// `results()`: the retained keywords together with their key sets.
    //
    // Note: the `strsim_autocomplete_*` methods are deliberately near-copies
    // of one another. They were kept "concrete" rather than modular for the
    // sake of efficiency.
    pub(crate) fn strsim_autocomplete_context_jaro_winkler(
        &self,
        index_range: &str,
        key_set: &BTreeSet<&K>,
        user_keyword: &str,
    ) -> impl Iterator<Item = (&KString, &BTreeSet<K>)> {

        // Accumulator for the best scoring keywords, created with capacity
        // for the configured maximum number of autocomplete options:
        let mut best_matches: StrsimTopScores<K, f64> =
            StrsimTopScores::with_capacity(self.maximum_autocomplete_options);

        // The range has no upper bound, so on its own it would walk every
        // keyword greater than or equal to `index_range`. The `take_while`
        // ends the walk at the first keyword that no longer carries the
        // prefix:
        let candidates = self
            .b_tree_map
            .range(KString::from_ref(index_range)..)
            .take_while(|(candidate, _)| candidate.starts_with(index_range));

        for (candidate, candidate_keys) in candidates {

            // Contextual restriction: skip keywords that share no key with
            // the caller's key set (unless the key set is empty):
            let in_context = key_set.is_empty()
                || candidate_keys.iter().any(|key| key_set.contains(key));

            if !in_context {
                continue;
            } // if

            // How similar is this search index keyword to the user's keyword?
            let similarity = jaro_winkler(candidate, user_keyword);

            // Keep the keyword only when the score is a normal float (this
            // rejects zero, subnormal, infinite and NaN values) and it meets
            // the configured minimum:
            if similarity.is_normal() && similarity >= self.strsim_minimum_score {
                best_matches.insert(candidate, candidate_keys, similarity);
            } // if

        } // for

        // Hand the retained keywords, and their keys, back to the caller:
        best_matches.results()

    } // fn

} // impl
Loading

0 comments on commit e075ab3

Please sign in to comment.