From 7d220a64ce1ca52f262dcfc49fc25a55bdfa4f8b Mon Sep 17 00:00:00 2001
From: skeptrune
Date: Wed, 18 Dec 2024 14:36:17 -0800
Subject: [PATCH] bugfix: youtube crawl options not saving

---
 .../src/pages/dataset/CrawlingSettings.tsx |  14 +-
 server/src/operators/crawl_operator.rs     | 190 ++++++++++--------
 2 files changed, 114 insertions(+), 90 deletions(-)

diff --git a/frontends/dashboard/src/pages/dataset/CrawlingSettings.tsx b/frontends/dashboard/src/pages/dataset/CrawlingSettings.tsx
index 08d4073d0..e0ec75a0e 100644
--- a/frontends/dashboard/src/pages/dataset/CrawlingSettings.tsx
+++ b/frontends/dashboard/src/pages/dataset/CrawlingSettings.tsx
@@ -455,7 +455,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
             {
               setOptions("ignore_sitemap", e.currentTarget.checked);
@@ -472,7 +472,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
             {
               setOptions("allow_external_links", e.currentTarget.checked);
@@ -531,7 +531,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
             {
               setOptions("limit", parseInt(e.currentTarget.value));
@@ -585,7 +585,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
             />
             {
             />
             {
             />
             {
             />
diff --git a/server/src/operators/crawl_operator.rs b/server/src/operators/crawl_operator.rs
--- a/server/src/operators/crawl_operator.rs
+++ b/server/src/operators/crawl_operator.rs
     broccoli_queue: web::Data<BroccoliQueue>,
 ) -> Result<(), ServiceError> {
+    use crate::data::schema::crawl_requests::dsl as crawl_requests_table;
+
+    let interval = match crawl_options.interval {
+        Some(CrawlInterval::Daily) => std::time::Duration::from_secs(60 * 60 * 24),
+        Some(CrawlInterval::Weekly) => std::time::Duration::from_secs(60 * 60 * 24 * 7),
+        Some(CrawlInterval::Monthly) => std::time::Duration::from_secs(60 * 60 * 24 * 30),
+        None => std::time::Duration::from_secs(60 * 60 * 24),
+    };
+
+    let new_crawl_request: CrawlRequestPG = CrawlRequest {
+        id: uuid::Uuid::new_v4(),
+        url: crawl_options.site_url.clone().unwrap_or_default(),
+        status: CrawlStatus::Pending,
+        interval,
+        next_crawl_at: chrono::Utc::now().naive_utc(),
+        crawl_options,
+        scrape_id: uuid::Uuid::default(),
+        dataset_id,
+        created_at: chrono::Utc::now().naive_utc(),
+        attempt_number: 0,
+    }
+    .into();
+
+    let mut conn = pool
+        .get()
+        .await
+        .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
+
+    diesel::insert_into(crawl_requests_table::crawl_requests)
+        .values(&new_crawl_request)
+        .execute(&mut conn)
+        .await
+        .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
+
     let message = VideoCrawlMessage {
-        channel_url: crawl_options.site_url.clone().unwrap_or_default(),
+        channel_url: new_crawl_request.url.clone(),
         dataset_id,
     };
     broccoli_queue
@@ -375,100 +408,91 @@ pub async fn update_crawl_settings_for_dataset(
     redis_pool: web::Data<RedisPool>,
 ) -> Result<(), ServiceError> {
     use crate::data::schema::crawl_requests::dsl as crawl_requests_table;
-    let mut merged_options = crawl_options.clone();
-    if crawl_options
-        .scrape_options
-        .as_ref()
-        .is_some_and(|f| matches!(f, &ScrapeOptions::Youtube(_)))
-    {
-        let mut conn = pool
-            .get()
-            .await
-            .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
-
-        let prev_crawl_req = crawl_requests_table::crawl_requests
-            .select((
-                crawl_requests_table::id,
-                crawl_requests_table::url,
-                crawl_requests_table::status,
-                crawl_requests_table::next_crawl_at,
-                crawl_requests_table::interval,
-                crawl_requests_table::crawl_options,
-                crawl_requests_table::scrape_id,
-                crawl_requests_table::dataset_id,
-                crawl_requests_table::created_at,
-            ))
-            .filter(crawl_requests_table::dataset_id.eq(dataset_id))
-            .first::<CrawlRequestPG>(&mut conn)
-            .await
-            .optional()?;
+    let mut conn = pool
+        .get()
+        .await
+        .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
-        if let Some(ref url) = crawl_options.site_url {
-            diesel::update(
-                crawl_requests_table::crawl_requests
-                    .filter(crawl_requests_table::dataset_id.eq(dataset_id)),
-            )
-            .set(crawl_requests_table::url.eq(url))
-            .execute(&mut conn)
-            .await
-            .map_err(|e| {
-                log::error!("Error updating url on crawl_requests: {:?}", e);
-                ServiceError::InternalServerError(
-                    "Error updating url on crawl_requests".to_string(),
-                )
-            })?;
-        }
+    let prev_crawl_req = crawl_requests_table::crawl_requests
+        .select((
+            crawl_requests_table::id,
+            crawl_requests_table::url,
+            crawl_requests_table::status,
+            crawl_requests_table::next_crawl_at,
+            crawl_requests_table::interval,
+            crawl_requests_table::crawl_options,
+            crawl_requests_table::scrape_id,
+            crawl_requests_table::dataset_id,
+            crawl_requests_table::created_at,
+        ))
+        .filter(crawl_requests_table::dataset_id.eq(dataset_id))
+        .first::<CrawlRequestPG>(&mut conn)
+        .await
+        .optional()?;
-        if let Some(interval) = crawl_options.interval.clone() {
-            let interval = match interval {
-                CrawlInterval::Daily => std::time::Duration::from_secs(60 * 60 * 24),
-                CrawlInterval::Weekly => std::time::Duration::from_secs(60 * 60 * 24 * 7),
-                CrawlInterval::Monthly => std::time::Duration::from_secs(60 * 60 * 24 * 30),
-            };
-            diesel::update(
-                crawl_requests_table::crawl_requests
-                    .filter(crawl_requests_table::dataset_id.eq(dataset_id)),
-            )
-            .set(crawl_requests_table::interval.eq(interval.as_secs() as i32))
-            .execute(&mut conn)
-            .await
-            .map_err(|e| {
-                log::error!("Error updating interval on crawl_requests: {:?}", e);
-                ServiceError::InternalServerError(
-                    "Error updating interval on crawl_requests".to_string(),
-                )
-            })?;
-        }
+    if let Some(ref url) = crawl_options.site_url {
+        diesel::update(
+            crawl_requests_table::crawl_requests
+                .filter(crawl_requests_table::dataset_id.eq(dataset_id)),
+        )
+        .set(crawl_requests_table::url.eq(url))
+        .execute(&mut conn)
+        .await
+        .map_err(|e| {
+            log::error!("Error updating url on crawl_requests: {:?}", e);
+            ServiceError::InternalServerError("Error updating url on crawl_requests".to_string())
+        })?;
+    }
-        merged_options = if let Some(prev_crawl_req) = prev_crawl_req {
-            let previous_crawl_options: CrawlOptions =
-                serde_json::from_value(prev_crawl_req.crawl_options)
-                    .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
-            crawl_options.merge(previous_crawl_options)
-        } else {
-            crawl_options
+    if let Some(interval) = crawl_options.interval.clone() {
+        let interval = match interval {
+            CrawlInterval::Daily => std::time::Duration::from_secs(60 * 60 * 24),
+            CrawlInterval::Weekly => std::time::Duration::from_secs(60 * 60 * 24 * 7),
+            CrawlInterval::Monthly => std::time::Duration::from_secs(60 * 60 * 24 * 30),
         };
-        diesel::update(
             crawl_requests_table::crawl_requests
                 .filter(crawl_requests_table::dataset_id.eq(dataset_id)),
         )
-        .set(crawl_requests_table::crawl_options.eq(
-            serde_json::to_value(merged_options.clone()).map_err(|e| {
-                log::error!("Failed to serialize crawl options: {:?}", e);
-                ServiceError::BadRequest("Failed to serialize crawl options".to_string())
-            })?,
-        ))
+        .set(crawl_requests_table::interval.eq(interval.as_secs() as i32))
         .execute(&mut conn)
         .await
         .map_err(|e| {
-            log::error!("Error updating crawl options on crawl_requests: {:?}", e);
+            log::error!("Error updating interval on crawl_requests: {:?}", e);
             ServiceError::InternalServerError(
-                "Error updating crawl options on crawl_requests".to_string(),
+                "Error updating interval on crawl_requests".to_string(),
             )
         })?;
     }
+    let merged_options = if let Some(prev_crawl_req) = prev_crawl_req {
+        let previous_crawl_options: CrawlOptions =
+            serde_json::from_value(prev_crawl_req.crawl_options)
+                .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
+        crawl_options.merge(previous_crawl_options)
+    } else {
+        crawl_options
+    };
+
+    diesel::update(
+        crawl_requests_table::crawl_requests
+            .filter(crawl_requests_table::dataset_id.eq(dataset_id)),
+    )
+    .set(crawl_requests_table::crawl_options.eq(
+        serde_json::to_value(merged_options.clone()).map_err(|e| {
+            log::error!("Failed to serialize crawl options: {:?}", e);
+            ServiceError::BadRequest("Failed to serialize crawl options".to_string())
+        })?,
+    ))
+    .execute(&mut conn)
+    .await
+    .map_err(|e| {
+        log::error!("Error updating crawl options on crawl_requests: {:?}", e);
+        ServiceError::InternalServerError(
+            "Error updating crawl options on crawl_requests".to_string(),
+        )
+    })?;
+
     create_crawl_query(
         merged_options.clone(),
         pool.clone(),
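
The net effect of the patch: the YouTube path now persists a `crawl_requests` row before the `VideoCrawlMessage` is queued, and `update_crawl_settings_for_dataset` always loads any previous row, merges the incoming options with it via `CrawlOptions::merge`, and writes the merged value back before `create_crawl_query` runs. The sketch below only illustrates the merge direction this flow appears to rely on (incoming values win, stored values fill the gaps); the struct and field names are illustrative stand-ins, not the real `CrawlOptions` definition, and the precedence is inferred from the call `crawl_options.merge(previous_crawl_options)` rather than stated in the patch.

```rust
// Hedged sketch, not part of the patch: assumed merge semantics for crawl options.
// Field names are hypothetical; the real CrawlOptions struct has more fields.
#[derive(Clone, Debug, PartialEq)]
struct CrawlOptionsSketch {
    site_url: Option<String>,
    limit: Option<i32>,
    ignore_sitemap: Option<bool>,
}

impl CrawlOptionsSketch {
    // Assumption: values set on the incoming options (`self`) take precedence,
    // and anything left unset falls back to the previously stored options.
    fn merge(self, previous: CrawlOptionsSketch) -> CrawlOptionsSketch {
        CrawlOptionsSketch {
            site_url: self.site_url.or(previous.site_url),
            limit: self.limit.or(previous.limit),
            ignore_sitemap: self.ignore_sitemap.or(previous.ignore_sitemap),
        }
    }
}

fn main() {
    // Options stored on the existing crawl_requests row.
    let previous = CrawlOptionsSketch {
        site_url: Some("https://www.youtube.com/@example".into()),
        limit: Some(100),
        ignore_sitemap: Some(true),
    };
    // Incoming update that only changes the page limit.
    let incoming = CrawlOptionsSketch {
        site_url: None,
        limit: Some(250),
        ignore_sitemap: None,
    };

    let merged = incoming.merge(previous);
    assert_eq!(merged.limit, Some(250));           // incoming value wins
    assert_eq!(merged.ignore_sitemap, Some(true)); // stored value preserved
    println!("{merged:?}");
}
```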