Skip to content
This repository was archived by the owner on Apr 4, 2023. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from 20 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions benchmarks/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,7 @@ harness = false
[[bench]]
name = "indexing"
harness = false

[[bench]]
name = "formatting"
harness = false
68 changes: 68 additions & 0 deletions benchmarks/benches/formatting.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
use criterion::{criterion_group, criterion_main};
use milli::tokenizer::{Analyzer, AnalyzerConfig};
use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords};

// On Linux builds only, route every heap allocation made by this benchmark
// binary through jemalloc instead of the default system allocator.
#[cfg(target_os = "linux")]
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

/// One formatting benchmark scenario: a text to format together with the
/// matcher configuration used to format it.
struct Conf<'a> {
/// Identifier passed to `bench_function` for this scenario.
name: &'a str,
/// Raw text that is analyzed into tokens and then formatted.
text: &'a str,
/// Builder from which a matcher is created for `text` and its tokens.
matching_words: MatcherBuilder,
}

/// Benchmarks text formatting for every combination of highlight / crop
/// options against each scenario in `confs`.
///
/// One Criterion group is created per `FormatOptions` value (named after the
/// option, e.g. `highlight-crop(10)`), and inside each group one benchmark is
/// registered per `Conf`, identified by `Conf::name`.
///
/// Each timed iteration builds an analyzer, tokenizes the text, builds a
/// matcher from the tokens and formats the text.
fn bench_formatting(c: &mut criterion::Criterion) {
    #[rustfmt::skip]
    let confs = &[
        Conf {
            name: "'the door d'",
            text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#,
            matching_words: MatcherBuilder::from_matching_words(MatchingWords::new(vec![
                (vec![MatchingWord::new("t".to_string(), 0, false), MatchingWord::new("he".to_string(), 0, false)], vec![0]),
                (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]),
                (vec![MatchingWord::new("door".to_string(), 1, false)], vec![1]),
                (vec![MatchingWord::new("do".to_string(), 0, false), MatchingWord::new("or".to_string(), 0, false)], vec![0]),
                (vec![MatchingWord::new("thedoor".to_string(), 1, false)], vec![0, 1]),
                (vec![MatchingWord::new("d".to_string(), 0, true)], vec![2]),
                (vec![MatchingWord::new("thedoord".to_string(), 1, true)], vec![0, 1, 2]),
                (vec![MatchingWord::new("doord".to_string(), 1, true)], vec![1, 2]),
            ])),
        },
    ];

    let format_options = &[
        FormatOptions { highlight: false, crop: None },
        FormatOptions { highlight: true, crop: None },
        FormatOptions { highlight: false, crop: Some(10) },
        FormatOptions { highlight: true, crop: Some(10) },
        FormatOptions { highlight: false, crop: Some(20) },
        FormatOptions { highlight: true, crop: Some(20) },
    ];

    for option in format_options {
        let highlight = if option.highlight { "highlight" } else { "no-highlight" };

        let name = match option.crop {
            Some(size) => format!("{}-crop({})", highlight, size),
            None => format!("{}-no-crop", highlight),
        };

        // `name` is not needed afterwards, so hand the owned `String` over
        // instead of borrowing it and forcing a copy.
        let mut group = c.benchmark_group(name);
        for conf in confs {
            group.bench_function(conf.name, |b| {
                b.iter(|| {
                    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
                    let analyzed = analyzer.analyze(conf.text);
                    let tokens: Vec<_> = analyzed.tokens().collect();
                    let mut matcher = conf.matching_words.build(&tokens[..], conf.text);
                    // The formatted value borrows from locals of this closure
                    // and therefore cannot be returned to Criterion; pass it
                    // through `black_box` so the optimizer cannot discard the
                    // formatting work being measured.
                    criterion::black_box(matcher.format(option.clone()));
                })
            });
        }
        group.finish();
    }
}

// Register `bench_formatting` in the `benches` group and let Criterion
// generate the entry point that runs it.
criterion_group!(benches, bench_formatting);
criterion_main!(benches);
47 changes: 17 additions & 30 deletions http-ui/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ use milli::update::{
ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
};
use milli::{
obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatchingWords,
SearchResult, SortError,
obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index,
MatcherBuilder, SearchResult, SortError,
};
use once_cell::sync::OnceCell;
use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -152,43 +152,27 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
Self { analyzer }
}

fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value {
fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value {
match value {
Value::Null => Value::Null,
Value::Bool(boolean) => Value::Bool(boolean),
Value::Number(number) => Value::Number(number),
Value::String(old_string) => {
let mut string = String::new();
let analyzed = self.analyzer.analyze(&old_string);
for (word, token) in analyzed.reconstruct() {
if token.is_word() {
match matching_words.matching_bytes(&token) {
Some(chars_to_highlight) => {
let mut chars = word.chars();

string.push_str("<mark>");
// push the part to highlight
string.extend(chars.by_ref().take(chars_to_highlight));
string.push_str("</mark>");
// push the suffix after highlight
string.extend(chars);
}
// no highlight
None => string.push_str(word),
}
} else {
string.push_str(word);
}
}
Value::String(string)
let analyzed: Vec<_> = analyzed.tokens().collect();
let mut matcher = matcher_builder.build(&analyzed[..], &old_string);

let format_options = FormatOptions { highlight: true, crop: Some(10) };

Value::String(matcher.format(format_options).to_string())
}
Value::Array(values) => Value::Array(
values.into_iter().map(|v| self.highlight_value(v, matching_words)).collect(),
values.into_iter().map(|v| self.highlight_value(v, matcher_builder)).collect(),
),
Value::Object(object) => Value::Object(
object
.into_iter()
.map(|(k, v)| (k, self.highlight_value(v, matching_words)))
.map(|(k, v)| (k, self.highlight_value(v, matcher_builder)))
.collect(),
),
}
Expand All @@ -197,14 +181,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
fn highlight_record(
&self,
object: &mut Map<String, Value>,
matching_words: &MatchingWords,
matcher_builder: &MatcherBuilder,
attributes_to_highlight: &HashSet<String>,
) {
// TODO do we need to create a string for element that are not and needs to be highlight?
for (key, value) in object.iter_mut() {
if attributes_to_highlight.contains(key) {
let old_value = mem::take(value);
*value = self.highlight_value(old_value, matching_words);
*value = self.highlight_value(old_value, matcher_builder);
}
}
}
Expand Down Expand Up @@ -819,12 +803,15 @@ async fn main() -> anyhow::Result<()> {
let stop_words = fst::Set::default();
let highlighter = Highlighter::new(&stop_words);

let mut matcher_builder = MatcherBuilder::from_matching_words(matching_words);
matcher_builder.highlight_prefix("<mark>".to_string());
matcher_builder.highlight_suffix("</mark>".to_string());
for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
if !disable_highlighting {
highlighter.highlight_record(
&mut object,
&matching_words,
&matcher_builder,
&attributes_to_highlight,
);
}
Expand Down
5 changes: 4 additions & 1 deletion milli/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,10 @@ pub use self::heed_codec::{
RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec,
};
pub use self::index::Index;
pub use self::search::{FacetDistribution, Filter, MatchingWords, Search, SearchResult};
pub use self::search::{
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord,
MatchingWords, Search, SearchResult,
};

pub type Result<T> = std::result::Result<T, error::Error>;

Expand Down
Loading