Commit 0eee2f8

skeptrunedev authored and densumesh committed
feature: finish implementing QDRANT_ONLY mode with chunk_count and clear functionality preserved
1 parent de775d2 commit 0eee2f8

13 files changed, +679 -267 lines
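With QDRANT_ONLY enabled in a dataset's server configuration, the ingestion workers skip writing chunk metadata to Postgres and derive each chunk's Qdrant point ID deterministically from its tracking ID via UUIDv5; the dataset's chunk_count is kept current through the new update_dataset_chunk_count call, and the delete worker clears such datasets with an unfiltered bulk delete.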

.vscode/launch.json (+1 -1)

@@ -7,7 +7,7 @@
       "name": "Debug executable 'trieve-server'",
       "cargo": {
         "args": [
-          "+nightly",
+          "+default",
           "build",
           "--manifest-path=./server/Cargo.toml",
           "--bin=trieve-server",

server/Cargo.lock (+1)

Some generated files are not rendered by default.

server/Cargo.toml (+1 -1)

@@ -104,7 +104,7 @@ rust-argon2 = "2"
 serde_json = { version = "1" }
 serde = { version = "1" }
 time = { version = "0.3" }
-uuid = { version = "1", features = ["v4", "serde"] }
+uuid = { version = "1", features = ["v4", "serde", "v5"] }
 diesel_migrations = { version = "2.0" }
 regex = "1.7.3"
 openai_dive = { git = "https://github.com/devflowinc/openai-client.git", branch = "bugfix/parallel-tool-calls-public", features = ["stream"] }
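The v5 feature added to the uuid crate here is what the ingestion worker below uses to derive deterministic Qdrant point IDs from chunk tracking IDs. A minimal sketch of the property being relied on (the tracking ID value is hypothetical):

// Minimal sketch, assuming the uuid crate with the "v5" feature enabled
// as in the Cargo.toml change above. v5 UUIDs are name-based: hashing the
// same namespace and bytes always yields the same UUID, so a chunk's
// tracking ID maps to a stable point ID across re-ingests.
fn main() {
    let tracking_id = "example-tracking-id"; // hypothetical value
    let first = uuid::Uuid::new_v5(&uuid::Uuid::NAMESPACE_OID, tracking_id.as_bytes());
    let second = uuid::Uuid::new_v5(&uuid::Uuid::NAMESPACE_OID, tracking_id.as_bytes());
    assert_eq!(first, second); // deterministic, unlike Uuid::new_v4()
    println!("{first}");
}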

server/src/bin/delete-worker.rs (+24 -1)

@@ -314,6 +314,29 @@ pub async fn delete_or_clear_dataset(
 
     if delete_worker_message.empty_dataset {
         log::info!("Clearing dataset {:?}", delete_worker_message.dataset_id);
+
+        if dataset_config.QDRANT_ONLY {
+            bulk_delete_chunks_query(
+                None,
+                delete_worker_message.deleted_at,
+                delete_worker_message.dataset_id,
+                dataset_config.clone(),
+                web_pool.clone(),
+            )
+            .await
+            .map_err(|err| {
+                log::error!("Failed to bulk delete chunks: {:?}", err);
+                err
+            })?;
+
+            log::info!(
+                "Bulk deleted chunks for dataset: {:?}",
+                delete_worker_message.dataset_id
+            );
+
+            return Ok(());
+        }
+
         clear_dataset_query(
             delete_worker_message.dataset_id,
             delete_worker_message.deleted_at,

@@ -412,7 +435,7 @@ pub async fn bulk_delete_chunks(
     let dataset_config = DatasetConfiguration::from_json(dataset.server_configuration);
 
     bulk_delete_chunks_query(
-        chunk_delete_message.filter,
+        Some(chunk_delete_message.filter),
         chunk_delete_message.deleted_at,
         chunk_delete_message.dataset_id,
         dataset_config,
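Clearing a QDRANT_ONLY dataset reuses the chunk bulk-delete path, which is why the second hunk widens the filter argument of bulk_delete_chunks_query to an Option: None now means "match every chunk in the dataset". A toy sketch of that pattern, with hypothetical types rather than Trieve's:

// Toy sketch of the Option-filter pattern (hypothetical types): Some
// narrows the delete to matching chunks, None deletes everything, so a
// dataset "clear" becomes an unfiltered bulk delete.
struct Chunk {
    tag: String,
}

fn bulk_delete(chunks: &mut Vec<Chunk>, filter: Option<&str>) {
    match filter {
        Some(tag) => chunks.retain(|c| c.tag != tag), // targeted delete
        None => chunks.clear(),                       // full dataset clear
    }
}

fn main() {
    let mut chunks = vec![Chunk { tag: "a".into() }, Chunk { tag: "b".into() }];
    bulk_delete(&mut chunks, None); // QDRANT_ONLY clear: no filter
    assert!(chunks.is_empty());
}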

server/src/bin/ingestion-worker.rs (+88 -62)

@@ -21,7 +21,7 @@ use trieve_server::handlers::group_handler::dataset_owns_group;
 use trieve_server::operators::chunk_operator::{
     bulk_insert_chunk_metadata_query, bulk_revert_insert_chunk_metadata_query,
     get_row_count_for_organization_id_query, insert_chunk_boost, insert_chunk_metadata_query,
-    update_chunk_boost_query, update_chunk_metadata_query,
+    update_chunk_boost_query, update_chunk_metadata_query, update_dataset_chunk_count,
 };
 use trieve_server::operators::clickhouse_operator::{ClickHouseEvent, EventQueue};
 use trieve_server::operators::dataset_operator::{

@@ -567,9 +567,9 @@ pub async fn bulk_upload_chunks(
         "calling_BULK_insert_chunk_metadata_query",
     );
 
-    let only_insert_qdrant = std::env::var("ONLY_INSERT_QDRANT").unwrap_or("false".to_string());
+    let only_insert_qdrant = dataset_config.QDRANT_ONLY;
 
-    let inserted_chunk_metadatas = if only_insert_qdrant == "true" {
+    let inserted_chunk_metadatas = if only_insert_qdrant {
         ingestion_data
             .clone()
             .into_iter()
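Swapping the ONLY_INSERT_QDRANT environment variable for dataset_config.QDRANT_ONLY turns a process-wide string comparison into a typed, per-dataset flag, so a single worker can serve both QDRANT_ONLY and regular datasets. A sketch of the difference, with a hypothetical struct standing in for DatasetConfiguration:

// Sketch of the old vs. new gate (the struct here is hypothetical). The
// env var was one switch for the whole worker process, parsed from a
// string; the config flag is a plain bool that can differ per dataset.
struct DatasetConfiguration {
    qdrant_only: bool, // spelled QDRANT_ONLY in the actual config
}

fn old_gate() -> bool {
    std::env::var("ONLY_INSERT_QDRANT").unwrap_or("false".to_string()) == "true"
}

fn new_gate(config: &DatasetConfiguration) -> bool {
    config.qdrant_only
}

fn main() {
    let config = DatasetConfiguration { qdrant_only: true };
    println!("old: {}, new: {}", old_gate(), new_gate(&config));
}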
@@ -733,7 +733,13 @@ pub async fn bulk_upload_chunks(
         ))
         .then(
             |(chunk_data, embedding_vector, splade_vector, bm25_vector)| async {
-                let qdrant_point_id = chunk_data.chunk_metadata.qdrant_point_id;
+                let mut qdrant_point_id = chunk_data.chunk_metadata.qdrant_point_id;
+                if only_insert_qdrant {
+                    if let Some(tracking_id) = chunk_data.clone().chunk_metadata.tracking_id {
+                        qdrant_point_id =
+                            uuid::Uuid::new_v5(&uuid::Uuid::NAMESPACE_OID, tracking_id.as_bytes());
+                    }
+                }
 
                 let chunk_tags: Option<Vec<Option<String>>> =
                     if let Some(ref group_ids) = chunk_data.group_ids {
@@ -789,7 +795,6 @@ pub async fn bulk_upload_chunks(
                     );
                 }
 
-                // If qdrant_point_id does not exist, does not get written to qdrant
                 Ok(PointStruct::new(
                     qdrant_point_id.to_string(),
                     vector_payload,

@@ -816,18 +821,31 @@ pub async fn bulk_upload_chunks(
         "calling_BULK_create_new_qdrant_points_query",
     );
 
-    let create_point_result =
+    let create_point_result: Result<(), ServiceError> =
         bulk_upsert_qdrant_points_query(qdrant_points, dataset_config.clone()).await;
 
     insert_tx.finish();
 
-    if let Err(err) = create_point_result {
-        if !upsert_by_tracking_id_being_used {
-            bulk_revert_insert_chunk_metadata_query(inserted_chunk_metadata_ids, web_pool.clone())
+    if !only_insert_qdrant {
+        if let Err(err) = create_point_result {
+            if !upsert_by_tracking_id_being_used {
+                bulk_revert_insert_chunk_metadata_query(
+                    inserted_chunk_metadata_ids,
+                    web_pool.clone(),
+                )
                 .await?;
-        }
+            }
 
-        return Err(err);
+            return Err(err);
+        }
+    } else {
+        create_point_result?;
+        update_dataset_chunk_count(
+            payload.dataset_id,
+            inserted_chunk_metadata_ids.len() as i32,
+            web_pool.clone(),
+        )
+        .await?;
     }
 
     Ok(inserted_chunk_metadata_ids)
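The branch split above encodes the QDRANT_ONLY invariant: no Postgres rows were written, so there is nothing to revert when the Qdrant upsert fails and the error simply propagates; on success, the dataset's chunk_count is incremented by the batch size, since it can no longer be derived by counting Postgres rows. A condensed sketch of that control flow with hypothetical helpers (the real code additionally gates the revert on upsert_by_tracking_id_being_used):

// Condensed sketch of the new branching; helper functions are
// hypothetical stand-ins for the queries used in the diff.
fn handle_upsert_result(
    qdrant_only: bool,
    result: Result<(), String>,
    inserted_rows: usize,
) -> Result<(), String> {
    if !qdrant_only {
        if let Err(err) = result {
            revert_postgres_rows(); // undo the metadata insert
            return Err(err);
        }
    } else {
        result?; // nothing in Postgres to roll back
        bump_chunk_count(inserted_rows as i32); // keep usage accounting alive
    }
    Ok(())
}

fn revert_postgres_rows() {}
fn bump_chunk_count(_delta: i32) {}

fn main() {
    // QDRANT_ONLY path: the error propagates and no revert is attempted.
    assert!(handle_upsert_result(true, Err("qdrant down".into()), 3).is_err());
}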
@@ -841,14 +859,16 @@ async fn upload_chunk(
     web_pool: actix_web::web::Data<models::Pool>,
     reqwest_client: reqwest::Client,
 ) -> Result<uuid::Uuid, ServiceError> {
-    let tx_ctx = sentry::TransactionContext::new(
-        "ingestion worker upload_chunk",
-        "ingestion worker upload_chunk",
-    );
-    let transaction = sentry::start_transaction(tx_ctx);
-    sentry::configure_scope(|scope| scope.set_span(Some(transaction.clone().into())));
-
+    let dataset_id = payload.dataset_id;
+    let qdrant_only = dataset_config.QDRANT_ONLY;
     let mut qdrant_point_id = uuid::Uuid::new_v4();
+    if qdrant_only {
+        if let Some(tracking_id) = payload.chunk.tracking_id.clone() {
+            qdrant_point_id =
+                uuid::Uuid::new_v5(&uuid::Uuid::NAMESPACE_OID, tracking_id.as_bytes());
+        }
+    }
+
     let content = match payload.chunk.convert_html_to_text.unwrap_or(true) {
         true => convert_html_to_text(&(payload.chunk.chunk_html.clone().unwrap_or_default())),
         false => payload.chunk.chunk_html.clone().unwrap_or_default(),
@@ -1015,44 +1035,50 @@ async fn upload_chunk(
 
     //if collision is not nil, insert chunk with collision
     let chunk_metadata_id = {
+        let original_id = payload.ingest_specific_chunk_metadata.id;
+        let mut inserted_chunk_id = original_id;
         payload.ingest_specific_chunk_metadata.qdrant_point_id = qdrant_point_id;
 
-        let insert_tx = transaction.start_child(
-            "calling_insert_chunk_metadata_query",
-            "calling_insert_chunk_metadata_query",
-        );
-
-        let inserted_chunk = insert_chunk_metadata_query(
-            chunk_metadata.clone(),
-            payload.chunk.group_ids.clone(),
-            payload.dataset_id,
-            payload.upsert_by_tracking_id,
-            web_pool.clone(),
-        )
-        .await?;
-
-        if payload.chunk.fulltext_boost.is_some() || payload.chunk.semantic_boost.is_some() {
-            insert_chunk_boost(
-                ChunkBoost {
-                    chunk_id: inserted_chunk.id,
-                    fulltext_boost_phrase: payload.chunk.fulltext_boost.clone().map(|x| x.phrase),
-                    fulltext_boost_factor: payload.chunk.fulltext_boost.map(|x| x.boost_factor),
-                    semantic_boost_phrase: payload.chunk.semantic_boost.clone().map(|x| x.phrase),
-                    semantic_boost_factor: payload
-                        .chunk
-                        .semantic_boost
-                        .map(|x| x.distance_factor as f64),
-                },
+        let group_tag_set = if qdrant_only {
+            None
+        } else {
+            let inserted_chunk = insert_chunk_metadata_query(
+                chunk_metadata.clone(),
+                payload.chunk.group_ids.clone(),
+                payload.dataset_id,
+                payload.upsert_by_tracking_id,
                 web_pool.clone(),
             )
             .await?;
-        }
-
-        insert_tx.finish();
+            inserted_chunk_id = inserted_chunk.id;
+
+            if payload.chunk.fulltext_boost.is_some() || payload.chunk.semantic_boost.is_some() {
+                insert_chunk_boost(
+                    ChunkBoost {
+                        chunk_id: inserted_chunk.id,
+                        fulltext_boost_phrase: payload
+                            .chunk
+                            .fulltext_boost
+                            .clone()
+                            .map(|x| x.phrase),
+                        fulltext_boost_factor: payload.chunk.fulltext_boost.map(|x| x.boost_factor),
+                        semantic_boost_phrase: payload
+                            .chunk
+                            .semantic_boost
+                            .clone()
+                            .map(|x| x.phrase),
+                        semantic_boost_factor: payload
+                            .chunk
+                            .semantic_boost
+                            .map(|x| x.distance_factor as f64),
+                    },
+                    web_pool.clone(),
+                )
+                .await?;
+            }
 
-        qdrant_point_id = inserted_chunk.qdrant_point_id;
+            qdrant_point_id = inserted_chunk.qdrant_point_id;
 
-        let chunk_tags: Option<Vec<Option<String>>> =
             if let Some(ref group_ids) = payload.chunk.group_ids {
                 Some(
                     get_groups_from_group_ids_query(group_ids.clone(), web_pool.clone())
@@ -1065,10 +1091,11 @@ async fn upload_chunk(
                 )
             } else {
                 None
-            };
+            }
+        };
 
         let qdrant_payload =
-            QdrantPayload::new(chunk_metadata, payload.chunk.group_ids, None, chunk_tags);
+            QdrantPayload::new(chunk_metadata, payload.chunk.group_ids, None, group_tag_set);
 
         let vector_name = match &embedding_vector {
             Some(embedding_vector) => match embedding_vector.len() {
@@ -1109,28 +1136,27 @@ async fn upload_chunk(
             vector_payload,
             qdrant_payload,
         );
-        let insert_tx = transaction.start_child(
-            "calling_bulk_create_new_qdrant_points_query",
-            "calling_bulk_create_new_qdrant_points_query",
-        );
 
-        if let Err(e) = bulk_upsert_qdrant_points_query(vec![point], dataset_config).await {
+        let upsert_qdrant_point_result =
+            bulk_upsert_qdrant_points_query(vec![point], dataset_config).await;
+
+        if let Err(e) = upsert_qdrant_point_result {
             log::error!("Failed to create qdrant point: {:?}", e);
 
-            if payload.upsert_by_tracking_id {
-                bulk_revert_insert_chunk_metadata_query(vec![inserted_chunk.id], web_pool.clone())
+            if !qdrant_only && (payload.upsert_by_tracking_id || original_id == inserted_chunk_id) {
+                bulk_revert_insert_chunk_metadata_query(vec![inserted_chunk_id], web_pool.clone())
                     .await?;
             }
 
             return Err(e);
         };
+        if qdrant_only {
+            update_dataset_chunk_count(dataset_id, 1_i32, web_pool.clone()).await?;
+        }
 
-        insert_tx.finish();
-
-        inserted_chunk.id
+        inserted_chunk_id
     };
 
-    transaction.finish();
     Ok(chunk_metadata_id)
 }
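A subtlety in the last hunk: the revert guard grows from payload.upsert_by_tracking_id alone to !qdrant_only && (payload.upsert_by_tracking_id || original_id == inserted_chunk_id). Since inserted_chunk_id starts out as the freshly generated original_id and is only overwritten by whatever ID the metadata insert returns, equality implies the insert created a brand-new row, so a revert on Qdrant failure can only delete the row this call just created. A minimal sketch of that inference, with integers standing in for UUIDs:

// Minimal sketch of the revert guard (integer IDs instead of UUIDs): the
// proposed ID only survives the insert when a brand-new row was created,
// so "returned == proposed" means reverting deletes our own row and never
// a pre-existing one.
fn may_revert(qdrant_only: bool, upsert_by_tracking_id: bool, proposed: u64, returned: u64) -> bool {
    !qdrant_only && (upsert_by_tracking_id || proposed == returned)
}

fn main() {
    assert!(may_revert(false, false, 7, 7)); // fresh row: safe to revert
    assert!(!may_revert(false, false, 7, 3)); // upsert matched an old row: keep it
    assert!(!may_revert(true, true, 7, 7)); // QDRANT_ONLY: nothing in Postgres
}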
