Skip to content

Commit

Permalink
eddie support complete.
Browse files Browse the repository at this point in the history
  • Loading branch information
leontoeides committed Nov 18, 2023
1 parent ac094fa commit 8de387a
Show file tree
Hide file tree
Showing 51 changed files with 461 additions and 579 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ repository = "https://github.com/leontoeides/indicium"
maintenance = { status = "actively-developed" }

[features]
default = [ "simple", "strsim", "ahash" ]
default = [ "simple", "strsim", "eddie", "ahash" ]
simple = []
select2 = [ "simple", "serde" ]
ahash = [ "dep:ahash" ]
Expand Down
30 changes: 28 additions & 2 deletions src/simple/autocomplete/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,35 @@ impl<K: Hash + Ord> SearchIndex<K> {
// Collect all keyword autocompletions into a `Vec`:
.collect();

// If `strsim` string searching enabled, examine the resulting
// If `eddie` fuzzy matching enabled, examine the resulting
// auto-complete options before using them:
#[cfg(feature = "strsim")]
#[cfg(feature = "eddie")]
if autocompletions.is_empty() {
// No autocomplete options were found for the user's last
// (partial) keyword. Attempt to use fuzzy string search to find
// other autocomplete options:
autocompletions = self.eddie_context_autocomplete(
&search_results,
&last_keyword,
) // eddie_context_autocomplete
.into_iter()
// Only keep this autocompletion if it hasn't already been used
// as a keyword:
.filter(|(keyword, _keys)| !keywords.contains(keyword))
// Only return `maximum_autocomplete_options` number of
// keywords:
.take(*maximum_autocomplete_options)
// `eddie_context_autocomplete` returns both the keyword and keys.
// We're autocompleting the last (partial) keyword, so
// discard the keys:
.map(|(keyword, _keys)| keyword)
// Collect all keyword autocompletions into a `Vec`:
.collect()
} // if

// If `strsim` fuzzy matching enabled, examine the resulting
// auto-complete options before using them:
#[cfg(all(feature = "strsim", not(feature = "eddie")))]
if autocompletions.is_empty() {
// No autocomplete options were found for the user's last
// (partial) keyword. Attempt to use fuzzy string search to find
Expand Down
27 changes: 25 additions & 2 deletions src/simple/autocomplete/global.rs
Original file line number Diff line number Diff line change
Expand Up @@ -150,9 +150,32 @@ impl<K: Hash + Ord> SearchIndex<K> {
// Collect all keyword autocompletions into a `Vec`:
.collect();

// If `strsim` string searching enabled, examine the resulting
// If `eddie` fuzzy matching enabled, examine the resulting
// auto-complete options before using them:
#[cfg(feature = "strsim")]
#[cfg(feature = "eddie")]
if autocompletions.is_empty() {
// No autocomplete options were found for the user's last
// (partial) keyword. Attempt to use fuzzy string search to find
// other autocomplete options:
autocompletions = self.eddie_global_autocomplete(&last_keyword)
.into_iter()
// Only keep this autocompletion if it hasn't already been used
// as a keyword:
.filter(|(keyword, _keys)| !keywords.contains(keyword))
// Only return `maximum_autocomplete_options` number of
// keywords:
.take(*maximum_autocomplete_options)
// `eddie_global_autocomplete` returns both the keyword and keys.
// We're autocompleting the last (partial) keyword, so
// discard the keys:
.map(|(keyword, _keys)| keyword)
// Collect all keyword autocompletions into a `Vec`:
.collect()
} // if

// If `strsim` fuzzy matching enabled, examine the resulting
// auto-complete options before using them:
#[cfg(all(feature = "strsim", not(feature = "eddie")))]
if autocompletions.is_empty() {
// No autocomplete options were found for the user's last
// (partial) keyword. Attempt to use fuzzy string search to find
Expand Down
29 changes: 26 additions & 3 deletions src/simple/autocomplete/keyword.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,9 +125,32 @@ impl<K: Hash + Ord> SearchIndex<K> {
// Collect all keyword autocompletions into a `Vec`:
.collect();

// If `strsim` string searching enabled, examine the resulting
// If `eddie` fuzzy matching enabled, examine the resulting
// auto-complete options before returning them:
#[cfg(feature = "strsim")]
#[cfg(feature = "eddie")]
if autocomplete_options.is_empty() {
// No autocomplete options were found for the user's last
// (partial) keyword. Attempt to use fuzzy string search to find
// other autocomplete options:
self.eddie_global_autocomplete(&keyword)
.into_iter()
// Only return `maximum_autocomplete_options` number of
// keywords:
.take(*maximum_autocomplete_options)
// `eddie_global_autocomplete` returns both the keyword and keys.
// We're autocompleting the last (partial) keyword, so discard
// the keys:
.map(|(keyword, _keys)| keyword.as_str())
// Collect all keyword autocompletions into a `Vec`:
.collect()
} else {
// There were some matches. Return the results without processing:
autocomplete_options.into_iter().map(|kstring| kstring.as_str()).collect()
} // if

// If `strsim` fuzzy matching enabled, examine the resulting
// auto-complete options before returning them:
#[cfg(all(feature = "strsim", not(feature = "eddie")))]
if autocomplete_options.is_empty() {
// No autocomplete options were found for the user's last
// (partial) keyword. Attempt to use fuzzy string search to find
Expand All @@ -150,7 +173,7 @@ impl<K: Hash + Ord> SearchIndex<K> {

// If fuzzy string searching disabled, return the resulting
// auto-complete options without further processing:
#[cfg(not(feature = "strsim"))]
#[cfg(not(any(feature = "strsim", feature = "eddie")))]
autocomplete_options.into_iter().map(|kstring| kstring.as_str()).collect()

} // fn
Expand Down
18 changes: 16 additions & 2 deletions src/simple/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,20 @@ impl<K: Clone + Ord> SearchIndexBuilder<K> {
self
} // fn

/// Selects the string similarity metric, provided by Ilia Schelokov's
/// [eddie](https://crates.io/crates/eddie) crate, that is used to fuzzy
/// match the user's keywords when no exact matches were found. See
/// [`EddieMetric`] for more information.
///
/// **Default:** `EddieMetric::Levenshtein`
///
/// [`EddieMetric`]: enum.EddieMetric.html
#[cfg(feature = "eddie")]
pub fn eddie_metric(self, eddie_metric: Option<EddieMetric>) -> Self {
    // Take ownership of the builder, overwrite the one setting, and hand
    // the builder back so that calls can be chained:
    let mut builder = self;
    builder.eddie_metric = eddie_metric;
    builder
} // fn

/// String's minimum length (in chars or codepoints) to use "approximate
/// string matching" or "fuzzy matching."
///
Expand All @@ -165,7 +179,7 @@ impl<K: Clone + Ord> SearchIndexBuilder<K> {
/// be cripplingly slow on very large search indices.
///
/// **Default:** `3` characters
#[cfg(feature = "strsim")]
#[cfg(any(feature = "eddie", feature = "strsim"))]
pub fn fuzzy_length(mut self, fuzzy_length: usize) -> Self {
self.fuzzy_length = fuzzy_length;
self
Expand All @@ -184,7 +198,7 @@ impl<K: Clone + Ord> SearchIndexBuilder<K> {
/// be returned to the user.
///
/// **Default:** `0.3`
#[cfg(feature = "strsim")]
#[cfg(any(feature = "eddie", feature = "strsim"))]
pub fn fuzzy_minimum_score(mut self, fuzzy_minimum_score: f64) -> Self {
self.fuzzy_minimum_score = fuzzy_minimum_score;
self
Expand Down
6 changes: 2 additions & 4 deletions src/simple/eddie_metric.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
// -----------------------------------------------------------------------------
//
/// This is used to select a string similarity metric implemented by the
/// Ilia Schelokov's [eddie](https://crates.io/crates/eddie) crate.
/// This is used to select a string similarity metric implemented by Ilia
/// Schelokov's [eddie](https://crates.io/crates/eddie) crate.

#[derive(Clone, Debug, Default, Eq, Hash, Ord, PartialEq, PartialOrd)]
pub enum EddieMetric {
/// See [the detailed description](https://en.wikipedia.org/wiki/Levenshtein_distance).
#[default] Levenshtein,
/// See [the detailed description](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance).
DamerauLevenshtein,
/// See [the detailed description](https://en.wikipedia.org/wiki/Hamming_distance).
Hamming,
/// See [the detailed description](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance#Jaro_Similarity).
Jaro,
/// Like Jaro similarity but gives a higher score to the strings that start
Expand Down
20 changes: 0 additions & 20 deletions src/simple/eddie_type.rs

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
use crate::simple::internal::StrsimTopScores;
use crate::simple::search_index::SearchIndex;
use crate::simple::internal::FuzzyTopScores;
use kstring::KString;
use std::{cmp::Ord, collections::BTreeSet, hash::Hash};
use strsim::normalized_damerau_levenshtein;
use std::collections::BTreeSet;

// -----------------------------------------------------------------------------

impl<K: Hash + Ord> SearchIndex<K> {
impl<K: std::hash::Hash + std::cmp::Ord> crate::simple::search_index::SearchIndex<K> {

// -------------------------------------------------------------------------
//
/// Scans the entire search index for the closest matching _n_ keywords
/// using the Damerau-Levenshtein string similarity metric from Danny Guo's
/// [strsim](https://crates.io/crates/strsim) crate.
/// using the Damerau-Levenshtein string distance metric from Ilia
/// Schelokov's [eddie](https://crates.io/crates/eddie) crate.
///
/// When the user's last (partial) keyword that is meant to be autocompleted
/// returns no matches, these `strsim_autocomplete_*` methods can be used to
/// returns no matches, these `eddie_autocomplete_*` methods can be used to
/// find the best match for substitution.
///
/// * `index_range` limits which keywords to compare the user's keyword
Expand All @@ -29,20 +27,23 @@ impl<K: Hash + Ord> SearchIndex<K> {
/// it must contain at least one key that is in this key set. This is how
/// fuzzy matching is made contextual.
//
// Note: these `strsim_autocomplete_*` methods are very similar and may seem
// Note: these `eddie_autocomplete_*` methods are very similar and may seem
// repetitive with a lot of boilerplate. These were intentionally made more
// "concrete" and less modular in order to be more efficient.

pub(crate) fn strsim_autocomplete_context_damerau_levenshtein(
pub(crate) fn eddie_autocomplete_context_damerau_levenshtein(
&self,
index_range: &str,
key_set: &BTreeSet<&K>,
user_keyword: &str,
) -> impl Iterator<Item = (&KString, &BTreeSet<K>)> {

// Instantiate eddie's Damerau-Levenshtein distance struct:
let damerau_levenshtein = eddie::DamerauLevenshtein::new();

// This structure will track the top scoring keywords:
let mut top_scores: StrsimTopScores<K, f64> =
StrsimTopScores::with_capacity(self.maximum_autocomplete_options);
let mut top_scores: FuzzyTopScores<K, f64> =
FuzzyTopScores::with_capacity(self.maximum_autocomplete_options);

// Scan the search index for the highest scoring keywords:
self.b_tree_map
Expand All @@ -66,7 +67,7 @@ impl<K: Hash + Ord> SearchIndex<K> {
.for_each(|(index_keyword, index_keys)| {
// Using this keyword from the search index, calculate its
// similarity to the user's keyword:
let score = normalized_damerau_levenshtein(index_keyword, user_keyword);
let score = damerau_levenshtein.similarity(index_keyword, user_keyword);
// Insert the score into the top scores (if it's normal and high
// enough):
if score.is_normal() && score >= self.fuzzy_minimum_score {
Expand Down
27 changes: 14 additions & 13 deletions src/simple/internal/eddie/autocomplete/context_jaro.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,19 @@
use crate::simple::internal::StrsimTopScores;
use crate::simple::search_index::SearchIndex;
use crate::simple::internal::FuzzyTopScores;
use kstring::KString;
use std::{cmp::Ord, collections::BTreeSet, hash::Hash};
use strsim::jaro;
use std::collections::BTreeSet;

// -----------------------------------------------------------------------------

impl<K: Hash + Ord> SearchIndex<K> {
impl<K: std::hash::Hash + std::cmp::Ord> crate::simple::search_index::SearchIndex<K> {

// -------------------------------------------------------------------------
//
/// Scans the entire search index for the closest matching _n_ keywords
/// using the Jaro string similarity metric from Danny Guo's
/// [strsim](https://crates.io/crates/strsim) crate.
/// using the Jaro string similarity metric from Ilia Schelokov's
/// [eddie](https://crates.io/crates/eddie) crate.
///
/// When the user's last (partial) keyword that is meant to be autocompleted
/// returns no matches, these `strsim_autocomplete_*` methods can be used to
/// returns no matches, these `eddie_autocomplete_*` methods can be used to
/// find the best match for substitution.
///
/// * `index_range` limits which keywords to compare the user's keyword
Expand All @@ -29,20 +27,23 @@ impl<K: Hash + Ord> SearchIndex<K> {
/// it must contain at least one key that is in this key set. This is how
/// fuzzy matching is made contextual.
//
// Note: these `strsim_autocomplete_*` methods are very similar and may seem
// Note: these `eddie_autocomplete_*` methods are very similar and may seem
// repetitive with a lot of boilerplate. These were intentionally made more
// "concrete" and less modular in order to be more efficient.

pub(crate) fn strsim_autocomplete_context_jaro(
pub(crate) fn eddie_autocomplete_context_jaro(
&self,
index_range: &str,
key_set: &BTreeSet<&K>,
user_keyword: &str,
) -> impl Iterator<Item = (&KString, &BTreeSet<K>)> {

// Instantiate eddie's Jaro similarity struct:
let jaro = eddie::Jaro::new();

// This structure will track the top scoring keywords:
let mut top_scores: StrsimTopScores<K, f64> =
StrsimTopScores::with_capacity(self.maximum_autocomplete_options);
let mut top_scores: FuzzyTopScores<K, f64> =
FuzzyTopScores::with_capacity(self.maximum_autocomplete_options);

// Scan the search index for the highest scoring keywords:
self.b_tree_map
Expand All @@ -66,7 +67,7 @@ impl<K: Hash + Ord> SearchIndex<K> {
.for_each(|(index_keyword, index_keys)| {
// Using this keyword from the search index, calculate its
// similarity to the user's keyword:
let score = jaro(index_keyword, user_keyword);
let score = jaro.similarity(index_keyword, user_keyword);
// Insert the score into the top scores (if it's normal and high
// enough):
if score.is_normal() && score >= self.fuzzy_minimum_score {
Expand Down
Loading

0 comments on commit 8de387a

Please sign in to comment.