Skip to content

Commit

Permalink
Handle dictionaries not generated by dictfmt
Browse files Browse the repository at this point in the history
Closes #121.
  • Loading branch information
baskerville committed Jul 6, 2020
1 parent 3264ae6 commit 27570e6
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 11 deletions.
55 changes: 51 additions & 4 deletions src/dictionary/indexing.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use std::io::{BufRead, BufReader};

use levenshtein::levenshtein;

use super::Metadata;
use super::errors::DictError;
use super::errors::DictError::*;

Expand All @@ -41,15 +42,61 @@ pub struct Entry {
}

pub trait IndexReader {
fn load_and_find(&mut self, headword: &str, fuzzy: bool) -> Vec<Entry>;
fn load_and_find(&mut self, headword: &str, fuzzy: bool, metadata: &Metadata) -> Vec<Entry>;
fn find(&self, headword: &str, fuzzy: bool) -> Vec<Entry>;
}

/// Builds a normalized, sorted copy of `entries`.
///
/// Each headword is rewritten the same way `Dictionary::lookup` rewrites a
/// query, so that the two can be compared directly:
///
/// * when `metadata.all_chars` is `false`, every character that is neither
///   alphanumeric nor whitespace is dropped;
/// * when `metadata.case_sensitive` is `false`, the headword is lowercased.
///
/// If the rewritten headword differs from the source headword, the source
/// spelling is preserved in `original`; otherwise `original` is `None`.
/// `offset` and `size` are copied through unchanged.
///
/// The returned entries are ordered by their normalized headword (plain
/// `String` ordering). Entries whose normalized headwords compare equal keep
/// their relative input order.
fn normalize(entries: &[Entry], metadata: &Metadata) -> Vec<Entry> {
    let mut result: Vec<Entry> = entries
        .iter()
        .map(|entry| {
            // Filter straight from the borrowed headword instead of cloning
            // first and then rebuilding the string.
            let mut headword: String = if metadata.all_chars {
                entry.headword.clone()
            } else {
                entry
                    .headword
                    .chars()
                    .filter(|c| c.is_alphanumeric() || c.is_whitespace())
                    .collect()
            };

            if !metadata.case_sensitive {
                headword = headword.to_lowercase();
            }

            let original = if headword != entry.headword {
                Some(entry.headword.clone())
            } else {
                None
            };

            Entry {
                headword,
                offset: entry.offset,
                size: entry.size,
                original,
            }
        })
        .collect();

    // `sort_by` is stable, so ties stay in input order. That matches inserting
    // each entry after every element that is not greater than it, without the
    // quadratic element shifting of repeated `Vec::insert` calls.
    result.sort_by(|a, b| a.headword.cmp(&b.headword));

    result
}

impl<R: BufRead> IndexReader for Index<R> {
fn load_and_find(&mut self, headword: &str, fuzzy: bool) -> Vec<Entry> {
fn load_and_find(&mut self, headword: &str, fuzzy: bool, metadata: &Metadata) -> Vec<Entry> {
if let Some(br) = self.state.take() {
if let Ok(mut index) = parse_index(br, false) {
self.entries.append(&mut index.entries);
let has_dictfmt = self.entries.iter()
.any(|e| e.headword.contains("dictfmt"));
if let Ok(index) = parse_index(br, false) {
let mut entries = if has_dictfmt {
index.entries
} else {
normalize(&index.entries, metadata)
};
self.entries.append(&mut entries);
}
}
self.find(headword, fuzzy)
Expand Down
21 changes: 14 additions & 7 deletions src/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,15 @@ use self::indexing::IndexReader;
pub struct Dictionary {
content: Box<dyn DictReader>,
index: Box<dyn IndexReader>,
all_chars: bool,
case_sensitive: bool,
metadata: Metadata,
}

/// The special metadata entries that we care about.
///
/// These entries should appear close to the beginning of the index file.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Metadata {
    /// When `false`, characters that are neither alphanumeric nor whitespace
    /// are stripped from queries (and from headwords during normalization)
    /// before comparison.
    pub all_chars: bool,
    /// When `false`, queries (and headwords during normalization) are
    /// lowercased before comparison.
    pub case_sensitive: bool,
}

impl Dictionary {
Expand All @@ -32,13 +39,13 @@ impl Dictionary {
/// found, the returned vector is empty. Errors result from the parsing of the underlying files.
pub fn lookup(&mut self, word: &str, fuzzy: bool) -> Result<Vec<[String; 2]>, errors::DictError> {
let mut query = word.to_string();
if !self.case_sensitive {
if !self.metadata.case_sensitive {
query = query.to_lowercase();
}
if !self.all_chars {
if !self.metadata.all_chars {
query = query.chars().filter(|c| c.is_alphanumeric() || c.is_whitespace()).collect();
}
let entries = self.index.load_and_find(&query, fuzzy);
let entries = self.index.load_and_find(&query, fuzzy, &self.metadata);
let mut results = Vec::new();
for entry in entries.into_iter() {
results.push([entry.original.unwrap_or(entry.headword),
Expand All @@ -52,7 +59,7 @@ impl Dictionary {
/// The metadata headwords start with `00-database-` or `00database`.
pub fn metadata(&mut self, name: &str) -> Result<String, errors::DictError> {
let mut query = format!("00-database-{}", name);
if !self.all_chars {
if !self.metadata.all_chars {
query = query.replace(|c: char| !c.is_alphanumeric(), "");
}
let entries = self.index.find(&query, false);
Expand Down Expand Up @@ -107,5 +114,5 @@ pub fn load_dictionary(content: Box<dyn DictReader>, index: Box<dyn IndexReader>
"00databasecasesensitive"
};
let case_sensitive = !index.find(word, false).is_empty();
Dictionary { content, index, all_chars, case_sensitive }
Dictionary { content, index, metadata: Metadata { all_chars, case_sensitive } }
}

0 comments on commit 27570e6

Please sign in to comment.