feature: add scoring options req param to search chunks route

skeptrunedev · cdxker · commit 7b91c10a84d7 · 2024-09-02T20:32:06.000-05:00
diff --git a/server/src/data/models.rs b/server/src/data/models.rs
@@ -5,8 +5,8 @@ use crate::errors::ServiceError;
 use crate::get_env;
 use crate::handlers::analytics_handler::CTRDataRequestBody;
 use crate::handlers::chunk_handler::{
-    AutocompleteReqPayload, ChunkFilter, FullTextBoost, ParsedQuery, SearchChunksReqPayload,
-    SemanticBoost,
+    AutocompleteReqPayload, ChunkFilter, FullTextBoost, ParsedQuery, ScoringOptions,
+    SearchChunksReqPayload, SemanticBoost,
 };
 use crate::handlers::file_handler::UploadFileReqPayload;
 use crate::handlers::group_handler::{SearchOverGroupsReqPayload, SearchWithinGroupReqPayload};
@@ -5455,6 +5455,7 @@ impl<'de> Deserialize<'de> for SearchChunksReqPayload {
             get_total_pages: Option<bool>,
             filters: Option<ChunkFilter>,
             sort_options: Option<SortOptions>,
+            scoring_options: Option<ScoringOptions>,
             highlight_options: Option<HighlightOptions>,
             score_threshold: Option<f32>,
             slim_chunks: Option<bool>,
@@ -5486,6 +5487,7 @@ impl<'de> Deserialize<'de> for SearchChunksReqPayload {
             get_total_pages: helper.get_total_pages,
             filters: helper.filters,
             sort_options,
+            scoring_options: helper.scoring_options,
             highlight_options,
             score_threshold: helper.score_threshold,
             slim_chunks: helper.slim_chunks,
@@ -5511,6 +5513,7 @@ impl<'de> Deserialize<'de> for AutocompleteReqPayload {
             page_size: Option<u64>,
             filters: Option<ChunkFilter>,
             sort_options: Option<SortOptions>,
+            scoring_options: Option<ScoringOptions>,
             highlight_options: Option<HighlightOptions>,
             score_threshold: Option<f32>,
             slim_chunks: Option<bool>,
@@ -5541,6 +5544,7 @@ impl<'de> Deserialize<'de> for AutocompleteReqPayload {
             page_size: helper.page_size,
             filters: helper.filters,
             sort_options,
+            scoring_options: helper.scoring_options,
             highlight_options,
             score_threshold: helper.score_threshold,
             slim_chunks: helper.slim_chunks,
diff --git a/server/src/handlers/chunk_handler.rs b/server/src/handlers/chunk_handler.rs
@@ -59,6 +59,15 @@ pub struct SemanticBoost {
     pub distance_factor: f32,
 }
 
+/// Scoring options provides ways to modify the sparse or dense vector created for the query in order to change how potential matches are scored. If not specified, this defaults to no modifications.
+#[derive(Serialize, Deserialize, Debug, Clone, ToSchema)]
+pub struct ScoringOptions {
+    /// Full text boost is useful for when you want to boost certain phrases in the fulltext (SPLADE) and BM25 search results. I.e. making sure that the listing for AirBNB itself ranks higher than companies who make software for AirBNB hosts by boosting the in-document-frequency of the AirBNB token (AKA word) for its official listing. Conceptually it multiplies the in-document-importance second value in the tuples of the SPLADE or BM25 sparse vector of the chunk_html innerText for all tokens present in the boost phrase by the boost factor like so: (token, in-document-importance) -> (token, in-document-importance*boost_factor).
+    pub fulltext_boost: Option<FullTextBoost>,
+    /// Semantic boost is useful for moving the embedding vector of the chunk in the direction of the distance phrase. I.e. you can push a chunk with a chunk_html of "iphone" 25% closer to the term "flagship" by using the distance phrase "flagship" and a distance factor of 0.25. Conceptually it's drawing a line (euclidean/L2 distance) between the vector for the innerText of the chunk_html and distance_phrase then moving the vector of the chunk_html distance_factor*L2Distance closer to or away from the distance_phrase point along the line between the two points.
+    pub semantic_boost: Option<SemanticBoost>,
+}
+
 #[derive(Serialize, Deserialize, Debug, ToSchema, Clone)]
 #[schema(example = json!({
     "chunk_html": "<p>Some HTML content</p>",
@@ -946,12 +955,11 @@ pub struct SearchChunksReqPayload {
     pub filters: Option<ChunkFilter>,
     /// Sort Options lets you specify different methods to rerank the chunks in the result set. If not specified, this defaults to the score of the chunks.
     pub sort_options: Option<SortOptions>,
+    /// Scoring options provides ways to modify the sparse or dense vector created for the query in order to change how potential matches are scored. If not specified, this defaults to no modifications.
+    pub scoring_options: Option<ScoringOptions>,
     /// Highlight Options lets you specify different methods to highlight the chunks in the result set. If not specified, this defaults to the score of the chunks.
     pub highlight_options: Option<HighlightOptions>,
-    /// Set score_threshold to a float to filter out chunks with a score below the threshold for cosine distance metric
-    /// For Manhattan Distance, Euclidean Distance, and Dot Product, it will filter out scores above the threshold distance
-    /// This threshold applies before weight and bias modifications. If not specified, this defaults to no threshold
-    /// A threshold of 0 will default to no threshold
+    /// Set score_threshold to a float to filter out chunks with a score below the threshold for cosine distance metric. For Manhattan Distance, Euclidean Distance, and Dot Product, it will filter out scores above the threshold distance. This threshold applies before weight and bias modifications. If not specified, this defaults to no threshold. A threshold of 0 will default to no threshold.
     pub score_threshold: Option<f32>,
     /// Set slim_chunks to true to avoid returning the content and chunk_html of the chunks. This is useful for when you want to reduce amount of data over the wire for latency improvement (typically 10-50ms). Default is false.
     pub slim_chunks: Option<bool>,
@@ -977,6 +985,7 @@ impl Default for SearchChunksReqPayload {
             page_size: Some(10),
             filters: None,
             sort_options: None,
+            scoring_options: None,
             highlight_options: None,
             score_threshold: None,
             slim_chunks: None,
@@ -1329,6 +1338,8 @@ pub struct AutocompleteReqPayload {
     pub filters: Option<ChunkFilter>,
     /// Sort Options lets you specify different methods to rerank the chunks in the result set. If not specified, this defaults to the score of the chunks.
     pub sort_options: Option<SortOptions>,
+    /// Scoring options provides ways to modify the sparse or dense vector created for the query in order to change how potential matches are scored. If not specified, this defaults to no modifications.
+    pub scoring_options: Option<ScoringOptions>,
     /// Highlight Options lets you specify different methods to highlight the chunks in the result set. If not specified, this defaults to the score of the chunks.
     pub highlight_options: Option<HighlightOptions>,
     /// Set score_threshold to a float to filter out chunks with a score below the threshold. This threshold applies before weight and bias modifications. If not specified, this defaults to 0.0.
@@ -1356,6 +1367,7 @@ impl From<AutocompleteReqPayload> for SearchChunksReqPayload {
             page_size: autocomplete_data.page_size,
             filters: autocomplete_data.filters,
             sort_options: autocomplete_data.sort_options,
+            scoring_options: autocomplete_data.scoring_options,
             highlight_options: autocomplete_data.highlight_options,
             score_threshold: autocomplete_data.score_threshold,
             slim_chunks: autocomplete_data.slim_chunks,
@@ -1653,6 +1665,7 @@ impl From<CountChunksReqPayload> for SearchChunksReqPayload {
             page_size: count_data.limit,
             filters: count_data.filters,
             sort_options: None,
+            scoring_options: None,
             highlight_options: None,
             score_threshold: count_data.score_threshold,
             slim_chunks: None,
diff --git a/server/src/handlers/group_handler.rs b/server/src/handlers/group_handler.rs
@@ -1426,6 +1426,7 @@ impl From<SearchWithinGroupReqPayload> for SearchChunksReqPayload {
             filters: search_within_group_data.filters,
             search_type: search_within_group_data.search_type,
             sort_options: search_within_group_data.sort_options,
+            scoring_options: None,
             highlight_options: search_within_group_data.highlight_options,
             score_threshold: search_within_group_data.score_threshold,
             slim_chunks: search_within_group_data.slim_chunks,
diff --git a/server/src/lib.rs b/server/src/lib.rs
@@ -140,7 +140,7 @@ impl Modify for SecurityAddon {
             name = "BSL",
             url = "https://github.com/devflowinc/trieve/blob/main/LICENSE.txt",
         ),
-        version = "0.11.7",
+        version = "0.11.8",
     ),
     servers(
         (url = "https://api.trieve.ai",
@@ -277,6 +277,7 @@ impl Modify for SecurityAddon {
             handlers::chunk_handler::GetChunksData,
             handlers::chunk_handler::GetTrackingChunksData,
             handlers::chunk_handler::SemanticBoost,
+            handlers::chunk_handler::ScoringOptions,
             handlers::chunk_handler::ChunkReturnTypes,
             handlers::chunk_handler::ScrollChunksReqPayload,
             handlers::chunk_handler::ScrollChunksResponseBody,
diff --git a/server/src/operators/model_operator.rs b/server/src/operators/model_operator.rs
@@ -28,7 +28,7 @@ pub struct EmbeddingParameters {
 #[tracing::instrument]
 pub async fn get_dense_vector(
     message: String,
-    distance_phrase: Option<SemanticBoost>,
+    semantic_boost: Option<SemanticBoost>,
     embed_type: &str,
     dataset_config: DatasetConfiguration,
 ) -> Result<Vec<f32>, ServiceError> {
@@ -91,32 +91,23 @@ pub async fn get_dense_vector(
         };
 
     let clipped_message: String = message.chars().take(20000).collect();
+    let mut messages = vec![format!(
+        "{}{}",
+        dataset_config.EMBEDDING_QUERY_PREFIX, &clipped_message
+    )
+    .to_string()];
+    if let Some(semantic_boost) = semantic_boost.as_ref() {
+        if semantic_boost.distance_factor == 0.0 || semantic_boost.phrase.is_empty() {
+            return Err(ServiceError::BadRequest(
+                "Semantic boost phrase is empty or distance factor is 0. Boost phrase must not be empty and distance factor must be greater than 0".to_string(),
+            ));
+        }
 
-    let mut messages = vec![clipped_message.clone()];
-
-    if distance_phrase.is_some() {
-        let clipped_boost: String = distance_phrase
-            .as_ref()
-            .unwrap()
-            .phrase
-            .chars()
-            .take(20000)
-            .collect();
+        let clipped_boost: String = semantic_boost.phrase.chars().take(20000).collect();
         messages.push(clipped_boost);
     }
 
-    let input = match embed_type {
-        "doc" => EmbeddingInput::StringArray(messages),
-        "query" => EmbeddingInput::String(
-            format!(
-                "{}{}",
-                dataset_config.EMBEDDING_QUERY_PREFIX, &clipped_message
-            )
-            .to_string(),
-        ),
-        _ => EmbeddingInput::StringArray(messages),
-    };
-
+    let input = EmbeddingInput::StringArray(messages);
     let parameters = EmbeddingParameters {
         model: dataset_config.EMBEDDING_MODEL_NAME.to_string(),
         input,
@@ -139,8 +130,8 @@ pub async fn get_dense_vector(
         ))
     })?;
 
-    let embeddings: EmbeddingResponse = format_response(embeddings_resp.into_string().unwrap())
-        .map_err(|e| {
+    let embeddings: EmbeddingResponse =
+        format_response(embeddings_resp.into_string().unwrap_or("".to_string())).map_err(|e| {
             log::error!("Failed to format response from embeddings server {:?}", e);
             ServiceError::InternalServerError(
                 "Failed to format response from embeddings server".to_owned(),
@@ -165,10 +156,24 @@ pub async fn get_dense_vector(
         ));
     }
 
-    if distance_phrase.is_some() {
-        let distance_factor = distance_phrase.unwrap().distance_factor;
-        let boost_vector = vectors.pop().unwrap();
-        let embedding_vector = vectors.pop().unwrap();
+    if let Some(semantic_boost) = semantic_boost {
+        let distance_factor = semantic_boost.distance_factor;
+        let boost_vector = match vectors.pop() {
+            Some(v) => v,
+            None => {
+                return Err(ServiceError::InternalServerError(
+                    "No dense embedding returned from server for boost_vector".to_owned(),
+                ))
+            }
+        };
+        let embedding_vector = match vectors.pop() {
+            Some(v) => v,
+            None => {
+                return Err(ServiceError::InternalServerError(
+                    "No dense embedding returned from server for embedding_vector".to_owned(),
+                ))
+            }
+        };
 
         return Ok(embedding_vector
             .iter()
@@ -190,6 +195,7 @@ pub async fn get_dense_vector(
 #[tracing::instrument]
 pub async fn get_sparse_vector(
     message: String,
+    fulltext_boost: Option<FullTextBoost>,
     embed_type: &str,
 ) -> Result<Vec<(u32, f32)>, ServiceError> {
     let origin_key = match embed_type {
@@ -206,11 +212,22 @@ pub async fn get_sparse_vector(
             origin_key
         )))?;
 
-    let clipped_message = message.chars().take(128000).collect();
+    let clipped_message: String = message.chars().take(20000).collect();
+    let mut inputs = vec![clipped_message.clone()];
+    if let Some(fulltext_boost) = fulltext_boost.as_ref() {
+        if fulltext_boost.phrase.is_empty() {
+            return Err(ServiceError::BadRequest(
+                "Fulltext boost phrase is empty. Non-empty phrase must be specified.".to_string(),
+            ));
+        }
+
+        let clipped_boost: String = fulltext_boost.phrase.chars().take(20000).collect();
+        inputs.push(clipped_boost);
+    }
 
     let embedding_server_call = format!("{}/embed_sparse", server_origin);
 
-    let sparse_vectors = ureq::post(&embedding_server_call)
+    let mut sparse_vectors = ureq::post(&embedding_server_call)
         .set("Content-Type", "application/json")
         .set(
             "Authorization",
@@ -220,7 +237,7 @@ pub async fn get_sparse_vector(
             ),
         )
         .send_json(CustomSparseEmbedData {
-            inputs: vec![clipped_message],
+            inputs,
             encode_type: embed_type.to_string(),
             truncate: true,
         })
@@ -242,6 +259,50 @@ pub async fn get_sparse_vector(
             )
         })?;
 
+    if let Some(fulltext_boost) = fulltext_boost {
+        let boost_amt = fulltext_boost.boost_factor;
+        let boost_vector = match sparse_vectors.pop() {
+            Some(v) => v,
+            None => {
+                return Err(ServiceError::InternalServerError(
+                    "No sparse vector returned from server for boost_vector".to_owned(),
+                ))
+            }
+        };
+        let query_vector = match sparse_vectors.pop() {
+            Some(v) => v,
+            None => {
+                return Err(ServiceError::InternalServerError(
+                    "No sparse vector returned from server for embedding_vector".to_owned(),
+                ))
+            }
+        };
+
+        let boosted_query_vector = query_vector
+            .iter()
+            .map(|splade_indice| {
+                if boost_vector
+                    .iter()
+                    .any(|boost_splade_indice| boost_splade_indice.index == splade_indice.index)
+                {
+                    SpladeIndicies {
+                        index: splade_indice.index,
+                        value: splade_indice.value * (boost_amt as f32),
+                    }
+                    .into_tuple()
+                } else {
+                    SpladeIndicies {
+                        index: splade_indice.index,
+                        value: splade_indice.value,
+                    }
+                    .into_tuple()
+                }
+            })
+            .collect();
+
+        return Ok(boosted_query_vector);
+    }
+
     match sparse_vectors.first() {
         Some(v) => Ok(v
             .iter()
diff --git a/server/src/operators/search_operator.rs b/server/src/operators/search_operator.rs