Skip to content

Commit

Permalink
Refine Pagefind library interface for long-term maintenance
Browse files Browse the repository at this point in the history
  • Loading branch information
bglw committed Dec 15, 2024
1 parent 5f94a48 commit 5ecb1fd
Show file tree
Hide file tree
Showing 12 changed files with 514 additions and 353 deletions.
33 changes: 27 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 5 additions & 1 deletion pagefind/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ pagefind_stem = { version = "0.2.0", features = [
"yiddish",
] }
convert_case = "0.6.0"
charabia = { version = "0.8.8", optional = true, default-features = false, features = ["chinese", "japanese"] }
charabia = { version = "0.8.8", optional = true, default-features = false, features = [
"chinese",
"japanese",
] }
unicode-segmentation = "1.10.1"
emojis = "0.6.1"
hashbrown = { version = "0.13.1", features = ["serde"] }
Expand Down Expand Up @@ -81,6 +84,7 @@ actix-files = "0.6"
lexical-core = "0.8.5"
path-slash = "0.2"
rust-patch = "0.1.3"
typed-builder = "0.20.0"

[features]

Expand Down
15 changes: 3 additions & 12 deletions pagefind/src/fossick/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use anyhow::{bail, Result};
use async_compression::tokio::bufread::GzipDecoder;
#[cfg(feature = "extended")]
use charabia::Segment;
Expand Down Expand Up @@ -64,16 +65,6 @@ pub struct Fossicker {
}

impl Fossicker {
/// Creates a `Fossicker` for the file at `file_path`.
///
/// Only `file_path` is populated; `root_path`, `page_url`,
/// `synthetic_content` and `data` all start out as `None`.
// NOTE(review): the enclosing hunk header (`-64,16 +65,6`) shrinks by ten lines, which
// matches this constructor plus one blank line — presumably the commit deletes it; confirm.
pub fn new(file_path: PathBuf) -> Self {
Self {
file_path: Some(file_path),
root_path: None,
page_url: None,
synthetic_content: None,
data: None,
}
}

pub fn new_relative_to(file_path: PathBuf, root_path: PathBuf) -> Self {
Self {
file_path: Some(file_path),
Expand Down Expand Up @@ -459,7 +450,7 @@ impl Fossicker {
}
}

pub async fn fossick(mut self, options: &SearchOptions) -> Result<FossickedData, ()> {
pub async fn fossick(mut self, options: &SearchOptions) -> Result<FossickedData> {
if (self.file_path.is_some() || self.synthetic_content.is_some()) && self.data.is_none() {
self.fossick_html(options).await;
};
Expand All @@ -480,7 +471,7 @@ impl Fossicker {
options
.logger
.error("Tried to index file with no specified URL or file path, ignoring.");
return Err(());
bail!("Tried to index file with no specified URL or file path, ignoring.");
};

Ok(FossickedData {
Expand Down
12 changes: 8 additions & 4 deletions pagefind/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::{
utils::full_hash,
SearchOptions,
};
use anyhow::{bail, Result};
use index_filter::{FilterIndex, PackedValue};
use index_metadata::{MetaChunk, MetaIndex, MetaPage};
use index_words::{PackedPage, PackedWord, WordIndex};
Expand Down Expand Up @@ -44,7 +45,7 @@ pub async fn build_indexes(
mut pages: Vec<FossickedData>,
language: String,
options: &SearchOptions,
) -> PagefindIndexes {
) -> Result<PagefindIndexes> {
let mut meta = MetaIndex {
version: options.version.into(),
pages: Vec::new(),
Expand Down Expand Up @@ -265,7 +266,10 @@ pub async fn build_indexes(
language,
u32::MAX
));
std::process::exit(1);
bail!(
"Language {language} has too many documents to index, must be < {}",
u32::MAX
);
}

// TODO: Parameterize these chunk sizes via options
Expand Down Expand Up @@ -306,7 +310,7 @@ pub async fn build_indexes(
&full_hash(&meta_index)[0..=(language.len() + 7)]
);

PagefindIndexes {
Ok(PagefindIndexes {
word_indexes,
filter_indexes,
sorts,
Expand All @@ -317,7 +321,7 @@ pub async fn build_indexes(
.collect(),
language,
word_count,
}
})
}

fn chunk_index(word_map: HashMap<String, PackedWord>, chunk_size: usize) -> Vec<Vec<PackedWord>> {
Expand Down
57 changes: 34 additions & 23 deletions pagefind/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
use std::{cmp::Ordering, path::PathBuf};

pub use fossick::{FossickedData, Fossicker};
use anyhow::{bail, Result};
use fossick::{FossickedData, Fossicker};
use futures::future::join_all;
use hashbrown::HashMap;
use index::PagefindIndexes;
pub use options::{PagefindInboundConfig, SearchOptions};
use options::{PagefindInboundConfig, SearchOptions};
use output::SyntheticFile;
pub use service::api;
use wax::{Glob, WalkEntry};
Expand All @@ -16,16 +17,17 @@ mod fragments;
mod index;
#[macro_use]
mod logging;
mod options;
pub mod options;
mod output;
pub mod serve;
pub mod service;
pub mod runner;
mod serve;
mod service;
mod utils;

// NOTE(review): this is a flattened diff hunk (the +/- markers were lost), so two versions
// of the struct appear back to back: one with `pub` on the struct and every field, and one
// without. Presumably the non-`pub` version is the post-change one — confirm.
/// State carried across an indexing run: the options in use, the pages fossicked so far,
/// and the indexes built from them.
pub struct SearchState {
pub options: SearchOptions,
pub fossicked_pages: Vec<FossickedData>,
pub built_indexes: Vec<PagefindIndexes>,
struct SearchState {
// Read by the methods below for the logger and passed on to `fossick` / `build_indexes`.
options: SearchOptions,
// Appended to (or replaced by matching `url`) in `fossick_one`.
fossicked_pages: Vec<FossickedData>,
// Assigned from the joined per-language `build_indexes` futures.
built_indexes: Vec<PagefindIndexes>,
}

impl SearchState {
Expand All @@ -37,28 +39,31 @@ impl SearchState {
}
}

// NOTE(review): this is a flattened diff hunk (the +/- markers were lost), so pre-change and
// post-change lines sit side by side: two signatures, two `walk` chains, and
// `std::process::exit(1)` next to `bail!`. Presumably the `Result` / `Ok(...)` / `bail!`
// lines are the post-change ones — confirm against the commit.
/// Walks `dir` for entries matching `glob` and wraps each resulting path in a
/// `Fossicker::new_relative_to(path, dir)`.
///
/// Walk entries that come back as errors are dropped by `filter_map(Result::ok)`.
/// If `glob` does not parse, the error is logged and then either the process exits
/// (`std::process::exit(1)` line) or an error is returned (`bail!` line).
pub async fn walk_for_files(&mut self, dir: PathBuf, glob: String) -> Vec<Fossicker> {
pub async fn walk_for_files(&mut self, dir: PathBuf, glob: String) -> Result<Vec<Fossicker>> {
let log = &self.options.logger;

log.status("[Walking source directory]");
// The parsed `Glob` shadows the `glob: String` argument inside this branch only.
if let Ok(glob) = Glob::new(&glob) {
glob.walk(&dir)
Ok(glob
.walk(&dir)
.filter_map(Result::ok)
.map(WalkEntry::into_path)
.map(|file_path| Fossicker::new_relative_to(file_path, dir.clone()))
.collect()
.collect())
} else {
// NOTE(review): both messages below print `self.options.glob`, while the value that
// failed to parse is the `glob` argument — confirm the two are always the same string.
log.error(format!(
"Error: Provided glob \"{}\" did not parse as a valid glob.",
self.options.glob
));
// TODO: Bubble this error back to the Node API if applicable
std::process::exit(1);
bail!(
"Error: Provided glob \"{}\" did not parse as a valid glob.",
self.options.glob
);
}
}

pub async fn fossick_many(&mut self, dir: PathBuf, glob: String) -> Result<usize, ()> {
let files = self.walk_for_files(dir.clone(), glob).await;
pub async fn fossick_many(&mut self, dir: PathBuf, glob: String) -> Result<usize> {
let files = self.walk_for_files(dir.clone(), glob).await?;
let log = &self.options.logger;

log.info(format!(
Expand All @@ -81,23 +86,23 @@ impl SearchState {
Ok(self.fossicked_pages.len() - existing_page_count)
}

// NOTE(review): this is a flattened diff hunk (the +/- markers were lost), so pre-change and
// post-change lines sit side by side: two signatures, two `if let` headers, and two variants
// of each store into `fossicked_pages`. Presumably the `result.as_ref().ok()` /
// `result.clone()` lines are the post-change ones — confirm against the commit.
/// Fossicks a single file and records a successful result in `self.fossicked_pages`.
///
/// A page whose `url` equals the new result's `url` is overwritten in place; otherwise the
/// result is pushed. The value returned by `fossick` is handed back to the caller either
/// way, and nothing is stored when it is an `Err`.
pub async fn fossick_one(&mut self, file: Fossicker) -> Result<FossickedData, ()> {
pub async fn fossick_one(&mut self, file: Fossicker) -> Result<FossickedData> {
let result = file.fossick(&self.options).await;
if let Ok(result) = result.clone() {
if let Some(result) = result.as_ref().ok() {
// Look for an already-fossicked page with the same URL.
let existing = self
.fossicked_pages
.iter()
.position(|page| page.url == result.url);
if let Some(existing) = existing {
// `existing` was just produced by `position` on this same vector, so `get_mut`
// returns `Some` and the `unwrap` cannot fail here.
*self.fossicked_pages.get_mut(existing).unwrap() = result;
*self.fossicked_pages.get_mut(existing).unwrap() = result.clone();
} else {
self.fossicked_pages.push(result);
self.fossicked_pages.push(result.clone());
}
}
result
}

pub async fn build_indexes(&mut self) {
pub async fn build_indexes(&mut self) -> Result<()> {
let log = &self.options.logger;

let used_custom_body = self.fossicked_pages.iter().any(|page| page.has_custom_body);
Expand Down Expand Up @@ -211,7 +216,8 @@ impl SearchState {
.into_iter()
.map(|(language, pages)| async { build_indexes(pages, language, &self.options).await })
.collect();
self.built_indexes = join_all(indexes).await;
let built_indexes = join_all(indexes).await;
self.built_indexes = built_indexes.into_iter().flat_map(|i| i.ok()).collect();

let stats = self.built_indexes.iter().fold((0, 0, 0, 0), |mut stats, index| {
log.v_info(format!(
Expand Down Expand Up @@ -267,8 +273,13 @@ impl SearchState {
Most likely, the directory passed to Pagefind was empty \
or did not contain any html files.",
);
std::process::exit(1);
bail!(
"Error: Pagefind wasn't able to build an index. \n\
Most likely, the directory passed to Pagefind was empty \
or did not contain any html files."
);
}
Ok(())
}

pub async fn write_files(&self, custom_outdir: Option<PathBuf>) -> PathBuf {
Expand Down
2 changes: 2 additions & 0 deletions pagefind/src/logging.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub enum LogLevel {
Verbose,
}

#[allow(dead_code)]
#[derive(Debug, Clone)]
pub enum LogStyle {
Info,
Expand Down Expand Up @@ -55,6 +56,7 @@ lazy_static! {
static ref SUCCESS: Style = Style::new().green();
}

#[allow(dead_code)]
impl Logger {
pub fn new(log_level: LogLevel, use_terminal: bool, logfile: Option<PathBuf>) -> Self {
if let Some(filename) = &logfile {
Expand Down
Loading

0 comments on commit 5ecb1fd

Please sign in to comment.