@@ -1,11 +1,10 @@
 use std::path::PathBuf;
 
 use actix_web::web;
-use base64::{engine::general_purpose, Engine};
 use pagefind::{api::PagefindIndex, options::PagefindServiceConfig};
 
 use crate::{
-    data::models::{DatasetConfiguration, EventType, Pool, QdrantChunkMetadata, WorkerEvent},
+    data::models::{self, DatasetConfiguration, Pool, QdrantChunkMetadata, WorkerEvent},
     errors::ServiceError,
     operators::{clickhouse_operator::ClickHouseEvent, file_operator::get_pagefind_aws_bucket},
 };
@@ -76,36 +75,42 @@ pub async fn build_index_for_dataset_id(
     let total_files = files.len();
     log::info!("Uploading {:?} pagefind indexed files to S3", total_files);
 
-    for (i, file) in files.iter().enumerate() {
-        let bucket = get_pagefind_aws_bucket()?;
-
-        // WARNING This s3 bucket cannot be default public. put ACL's on this somehow in case
-        // the user does not want their data to be public.
+    let futures = files.into_iter().enumerate().map(|(i, file)| -> tokio::task::JoinHandle<Result<(), ServiceError>> {
         let mut filename = PathBuf::from("/pagefind");
         filename.push(dataset_id.to_string());
         filename.push(file.filename.clone());
 
-        bucket
-            .put_object(filename.to_string_lossy().to_string(), &file.contents.clone())
-            .await
-            .map_err(|e| {
-                ServiceError::BadRequest(format!("Could not upload file to S3 {:?}", e))
-            })?;
-
-        event_queue
-            .send(ClickHouseEvent::WorkerEvent(
-                WorkerEvent::from_details(
-                    dataset_id,
-                    EventType::PagefindIndexingProgress {
-                        files_indexed: i + 1,
-                        total_files,
-                    },
-                )
-                .into(),
-            ))
-            .await;
-        log::info!("Uploaded file to s3 {:?}/{:?}", i, total_files);
-    }
+
+        // WARNING This s3 bucket cannot be default public. put ACL's on this somehow in case
+        // the user does not want their data to be public.
+        tokio::task::spawn(async move {
+            let bucket = get_pagefind_aws_bucket()?;
+            bucket
+                .put_object(filename.to_string_lossy().to_string(), &file.contents.clone())
+                .await
+                .map_err(|e| {
+                    ServiceError::BadRequest(format!("Could not upload file to S3 {:?}", e))
+                })?;
+
+            log::info!("Uploaded file {:?} to S3", i);
+            Ok(())
+        })
+    });
+
+    futures::future::join_all(futures).await;
+
+    event_queue
+        .send(ClickHouseEvent::WorkerEvent(
+            WorkerEvent::from_details(
+                dataset_id,
+                models::EventType::PagefindIndexingFinished {
+                    total_files,
+                },
+            )
+            .into(),
+        ))
+        .await;
+
 
     Ok(())
 }
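One note on the new upload path: `tokio::task::spawn` returns a `JoinHandle`, so `futures::future::join_all(futures).await` yields a `Vec<Result<Result<(), ServiceError>, JoinError>>` that this change discards. A failed or panicked upload never reaches the caller, and the `PagefindIndexingFinished` event fires regardless. Below is a minimal standalone sketch of draining those handles and surfacing both error layers; the `ServiceError` stand-in and the fake tasks are hypothetical, not the worker's real types.

use futures::future::join_all;

// Hypothetical stand-in for the worker's ServiceError.
#[derive(Debug)]
struct ServiceError(String);

#[tokio::main]
async fn main() {
    // Spawn a few fallible tasks, mirroring the per-file upload tasks in the diff.
    let handles = (0..3usize).map(|i| {
        tokio::task::spawn(async move {
            if i == 1 {
                return Err(ServiceError(format!("upload {} failed", i)));
            }
            Ok::<(), ServiceError>(())
        })
    });

    // Outer Result: the task panicked or was cancelled (JoinError).
    // Inner Result: the upload itself returned an error.
    for result in join_all(handles).await {
        match result {
            Ok(Ok(())) => {}
            Ok(Err(e)) => eprintln!("upload failed: {:?}", e),
            Err(join_err) => eprintln!("upload task panicked: {:?}", join_err),
        }
    }
}

In the worker itself, the same loop could map either failure into the function's `Result<(), ServiceError>` return, so the "finished" event is only sent once every put succeeded.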
|
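A second design note: the change spawns one task per indexed file, so a large index opens an unbounded number of concurrent S3 puts at once. If that ever becomes a problem, `futures::stream::StreamExt::buffer_unordered` gives the same concurrency with a cap, and hands back plain `Result`s instead of `JoinHandle`s. A sketch under hypothetical names (`upload_one` stands in for the real put_object call):

use futures::stream::{self, StreamExt};

#[derive(Debug)]
struct ServiceError(String);

// Hypothetical stand-in for one put_object call.
async fn upload_one(i: usize) -> Result<(), ServiceError> {
    tokio::time::sleep(std::time::Duration::from_millis(10)).await;
    println!("uploaded file {}", i);
    Ok(())
}

#[tokio::main]
async fn main() {
    // At most 8 uploads in flight at a time; results arrive in completion order.
    let results: Vec<Result<(), ServiceError>> = stream::iter(0..100usize)
        .map(upload_one)
        .buffer_unordered(8)
        .collect()
        .await;

    let failed = results.iter().filter(|r| r.is_err()).count();
    println!("{} of {} uploads failed", failed, results.len());
}

Unlike spawned tasks, `buffer_unordered` drives the futures on the current task, which is concurrent but not multi-threaded; for network-bound S3 uploads that is usually all the parallelism needed.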