Skip to content

Commit

Permalink
feature: update pagefind-worker to latest pagefind
Browse files: browse the repository at this point in the history
  • Loading branch information
cdxker committed Dec 16, 2024
1 parent 3a6a3fa commit 6c34cdc
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 39 deletions.
21 changes: 21 additions & 0 deletions server/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

60 changes: 21 additions & 39 deletions server/src/operators/pagefind_operator.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::path::PathBuf;

use actix_web::web;
use base64::{engine::general_purpose, Engine};
use pagefind::service::PagefindIndex;
use pagefind::{api::PagefindIndex, options::PagefindServiceConfig};

use crate::{
data::models::{DatasetConfiguration, EventType, Pool, QdrantChunkMetadata, WorkerEvent},
Expand All @@ -18,23 +18,11 @@ pub async fn build_index_for_dataset_id(
pool: web::Data<Pool>,
event_queue: &web::Data<crate::EventQueue>,
) -> Result<(), ServiceError> {
let mut search_index = PagefindIndex::new(pagefind::PagefindInboundConfig {
source: "source".into(),
site: "site".into(),
bundle_dir: None,
output_subdir: None,
output_path: None,
root_selector: "root_selector".into(),
exclude_selectors: vec![],
glob: "**/*.{html}".into(),
force_language: None,
serve: false,
verbose: false,
logfile: None,
keep_index_url: false,
service: false,
})
.expect("config is valid");
let options = PagefindServiceConfig::builder()
.keep_index_url(true)
.force_language("en".to_string())
.build();
let mut search_index = PagefindIndex::new(Some(options)).expect("config is valid");

let filter = assemble_qdrant_filter(None, None, None, dataset_id, pool.clone()).await?;

Expand All @@ -59,7 +47,7 @@ pub async fn build_index_for_dataset_id(
let payload: QdrantChunkMetadata = result.clone().into();

let _ = search_index
.add_record(
.add_custom_record(
payload.link.unwrap_or_default().to_string(),
payload.chunk_html.unwrap_or_default().to_string(),
"en".to_string(),
Expand All @@ -78,9 +66,13 @@ pub async fn build_index_for_dataset_id(
first_iteration = false;
}

search_index.build_indexes().await;
search_index.build_indexes().await.map_err(|e| {
ServiceError::BadRequest(format!("Could not build pagefind index {:?}", e))
})?;

let files = search_index.get_files().await;
let files = search_index.get_files().await.map_err(|e| {
ServiceError::BadRequest(format!("Could not get files from pagefind index {:?}", e))
})?;
let total_files = files.len();
log::info!("Uploading {:?} pagefind indexed files to S3", total_files);

Expand All @@ -91,24 +83,14 @@ pub async fn build_index_for_dataset_id(
// the user does not want their data to be public.
let mut filename = PathBuf::from("/pagefind");
filename.push(dataset_id.to_string());
filename.push(file.path.clone());

let try_decode = general_purpose::STANDARD.decode(file.content.clone());
match try_decode {
Ok(decoded_contents) => bucket
.put_object(filename.to_string_lossy().to_string(), &decoded_contents)
.await
.map_err(|e| {
ServiceError::BadRequest(format!("Could not upload file to S3 {:?}", e))
})?,
Err(e) => {
return Err(ServiceError::BadRequest(format!(
"Could not decode file {:} content {:?}",
filename.to_string_lossy(),
e
)));
}
};
filename.push(file.filename.clone());

bucket
.put_object(filename.to_string_lossy().to_string(), &file.contents.clone())
.await
.map_err(|e| {
ServiceError::BadRequest(format!("Could not upload file to S3 {:?}", e))
})?;

event_queue
.send(ClickHouseEvent::WorkerEvent(
Expand Down

0 comments on commit 6c34cdc

Please sign in to comment.