bugfix: youtube crawl options not saving
skeptrunedev committed Dec 19, 2024
1 parent f9db9d6 commit bb8045a
Showing 2 changed files with 114 additions and 90 deletions.
14 changes: 7 additions & 7 deletions frontends/dashboard/src/pages/dataset/CrawlingSettings.tsx
@@ -455,7 +455,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
<input
class="h-3 w-3 rounded border border-neutral-300 bg-neutral-100 p-1 accent-magenta-400 dark:border-neutral-900 dark:bg-neutral-800"
type="checkbox"
disabled={isShopify()}
disabled={isShopify() || isYoutube()}
checked={options.ignore_sitemap ?? true}
onChange={(e) => {
setOptions("ignore_sitemap", e.currentTarget.checked);
@@ -472,7 +472,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
<input
class="h-3 w-3 rounded border border-neutral-300 bg-neutral-100 p-1 accent-magenta-400 dark:border-neutral-900 dark:bg-neutral-800"
type="checkbox"
disabled={isShopify()}
disabled={isShopify() || isYoutube()}
checked={options.allow_external_links ?? false}
onChange={(e) => {
setOptions("allow_external_links", e.currentTarget.checked);
@@ -531,7 +531,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
<input
class="block max-w-[100px] rounded border border-neutral-300 px-3 py-1.5 shadow-sm placeholder:text-neutral-400 focus:outline-magenta-500 sm:text-sm sm:leading-6"
type="number"
disabled={isShopify()}
disabled={isShopify() || isYoutube()}
value={options.limit || "0"}
onInput={(e) => {
setOptions("limit", parseInt(e.currentTarget.value));
@@ -585,7 +585,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
/>
</div>
<MultiStringInput
disabled={isShopify()}
disabled={isShopify() || isYoutube()}
placeholder="https://example.com/include/*"
addClass="bg-magenta-100/40 px-2 rounded text-sm border border-magenta-300/40"
inputClass="w-full"
@@ -606,7 +606,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
/>
</div>
<MultiStringInput
disabled={isShopify()}
disabled={isShopify() || isYoutube()}
placeholder="https://example.com/exclude/*"
addClass="bg-magenta-100/40 px-2 text-sm rounded border border-magenta-300/40"
addLabel="Add Path"
@@ -626,7 +626,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
/>
</div>
<MultiStringInput
disabled={isShopify()}
disabled={isShopify() || isYoutube()}
placeholder="h1..."
addClass="bg-magenta-100/40 text-sm px-2 rounded border border-magenta-300/40"
addLabel="Add Selector"
@@ -646,7 +646,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
/>
</div>
<MultiStringInput
disabled={isShopify()}
disabled={isShopify() || isYoutube()}
placeholder="button..."
addClass="bg-magenta-100/40 px-2 text-sm rounded border border-magenta-300/40"
addLabel="Add Selector"
190 changes: 107 additions & 83 deletions server/src/operators/crawl_operator.rs
@@ -157,16 +157,14 @@ pub async fn create_crawl_query(
.as_ref()
.is_some_and(|f| matches!(f, &ScrapeOptions::Youtube(_)))
{
create_youtube_crawl_request(crawl_options, dataset_id, broccoli_queue)
create_youtube_crawl_request(crawl_options, dataset_id, pool, broccoli_queue)
.await
.map_err(|err| ServiceError::BadRequest(format!("Could not crawl site: {}", err)))?;
Ok(None)
} else {
let webhook_url = format!(
"{}/api/file/html_page",
std::env::var("FOO_BAR").unwrap_or(
"https://5b0f-2600-1700-460-1070-f5b9-429e-fb2-70d5.ngrok-free.app".to_string()
)
std::env::var("BASE_SERVER_URL").unwrap_or("https://api.trieve.ai".to_string())
);
let webhook_metadata = serde_json::json!({
"dataset_id": dataset_id,
@@ -307,10 +305,45 @@ pub async fn create_crawl_request(
pub async fn create_youtube_crawl_request(
crawl_options: CrawlOptions,
dataset_id: uuid::Uuid,
pool: web::Data<Pool>,
broccoli_queue: web::Data<BroccoliQueue>,
) -> Result<(), ServiceError> {
use crate::data::schema::crawl_requests::dsl as crawl_requests_table;

let interval = match crawl_options.interval {
Some(CrawlInterval::Daily) => std::time::Duration::from_secs(60 * 60 * 24),
Some(CrawlInterval::Weekly) => std::time::Duration::from_secs(60 * 60 * 24 * 7),
Some(CrawlInterval::Monthly) => std::time::Duration::from_secs(60 * 60 * 24 * 30),
None => std::time::Duration::from_secs(60 * 60 * 24),
};

let new_crawl_request: CrawlRequestPG = CrawlRequest {
id: uuid::Uuid::new_v4(),
url: crawl_options.site_url.clone().unwrap_or_default(),
status: CrawlStatus::Pending,
interval,
next_crawl_at: chrono::Utc::now().naive_utc(),
crawl_options,
scrape_id: uuid::Uuid::default(),
dataset_id,
created_at: chrono::Utc::now().naive_utc(),
attempt_number: 0,
}
.into();

let mut conn = pool
.get()
.await
.map_err(|e| ServiceError::InternalServerError(e.to_string()))?;

diesel::insert_into(crawl_requests_table::crawl_requests)
.values(&new_crawl_request)
.execute(&mut conn)
.await
.map_err(|e| ServiceError::InternalServerError(e.to_string()))?;

let message = VideoCrawlMessage {
channel_url: crawl_options.site_url.clone().unwrap_or_default(),
channel_url: new_crawl_request.url.clone(),
dataset_id,
};
broccoli_queue
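
The persistence added above derives a Duration from the optional CrawlInterval, defaulting to daily when none is supplied, before the request row is written; the same match reappears in update_crawl_settings_for_dataset below, where a provided interval is stored as whole seconds. A self-contained sketch of that mapping, on the assumption it could be factored out (the helper name is hypothetical, not part of this commit):

use std::time::Duration;

// Hypothetical helper mirroring the mapping used in both functions in this file.
// The resulting value is persisted to crawl_requests.interval as whole seconds.
fn crawl_interval_duration(interval: Option<CrawlInterval>) -> Duration {
    match interval {
        Some(CrawlInterval::Weekly) => Duration::from_secs(60 * 60 * 24 * 7),
        Some(CrawlInterval::Monthly) => Duration::from_secs(60 * 60 * 24 * 30),
        Some(CrawlInterval::Daily) | None => Duration::from_secs(60 * 60 * 24),
    }
}
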
@@ -375,100 +408,91 @@ pub async fn update_crawl_settings_for_dataset(
redis_pool: web::Data<RedisPool>,
) -> Result<(), ServiceError> {
use crate::data::schema::crawl_requests::dsl as crawl_requests_table;
let mut merged_options = crawl_options.clone();
if crawl_options
.scrape_options
.as_ref()
.is_some_and(|f| matches!(f, &ScrapeOptions::Youtube(_)))
{
let mut conn = pool
.get()
.await
.map_err(|e| ServiceError::InternalServerError(e.to_string()))?;

let prev_crawl_req = crawl_requests_table::crawl_requests
.select((
crawl_requests_table::id,
crawl_requests_table::url,
crawl_requests_table::status,
crawl_requests_table::next_crawl_at,
crawl_requests_table::interval,
crawl_requests_table::crawl_options,
crawl_requests_table::scrape_id,
crawl_requests_table::dataset_id,
crawl_requests_table::created_at,
))
.filter(crawl_requests_table::dataset_id.eq(dataset_id))
.first::<CrawlRequestPG>(&mut conn)
.await
.optional()?;
let mut conn = pool
.get()
.await
.map_err(|e| ServiceError::InternalServerError(e.to_string()))?;

if let Some(ref url) = crawl_options.site_url {
diesel::update(
crawl_requests_table::crawl_requests
.filter(crawl_requests_table::dataset_id.eq(dataset_id)),
)
.set(crawl_requests_table::url.eq(url))
.execute(&mut conn)
.await
.map_err(|e| {
log::error!("Error updating url on crawl_requests: {:?}", e);
ServiceError::InternalServerError(
"Error updating url on crawl_requests".to_string(),
)
})?;
}
let prev_crawl_req = crawl_requests_table::crawl_requests
.select((
crawl_requests_table::id,
crawl_requests_table::url,
crawl_requests_table::status,
crawl_requests_table::next_crawl_at,
crawl_requests_table::interval,
crawl_requests_table::crawl_options,
crawl_requests_table::scrape_id,
crawl_requests_table::dataset_id,
crawl_requests_table::created_at,
))
.filter(crawl_requests_table::dataset_id.eq(dataset_id))
.first::<CrawlRequestPG>(&mut conn)
.await
.optional()?;

if let Some(interval) = crawl_options.interval.clone() {
let interval = match interval {
CrawlInterval::Daily => std::time::Duration::from_secs(60 * 60 * 24),
CrawlInterval::Weekly => std::time::Duration::from_secs(60 * 60 * 24 * 7),
CrawlInterval::Monthly => std::time::Duration::from_secs(60 * 60 * 24 * 30),
};
diesel::update(
crawl_requests_table::crawl_requests
.filter(crawl_requests_table::dataset_id.eq(dataset_id)),
)
.set(crawl_requests_table::interval.eq(interval.as_secs() as i32))
.execute(&mut conn)
.await
.map_err(|e| {
log::error!("Error updating interval on crawl_requests: {:?}", e);
ServiceError::InternalServerError(
"Error updating interval on crawl_requests".to_string(),
)
})?;
}
if let Some(ref url) = crawl_options.site_url {
diesel::update(
crawl_requests_table::crawl_requests
.filter(crawl_requests_table::dataset_id.eq(dataset_id)),
)
.set(crawl_requests_table::url.eq(url))
.execute(&mut conn)
.await
.map_err(|e| {
log::error!("Error updating url on crawl_requests: {:?}", e);
ServiceError::InternalServerError("Error updating url on crawl_requests".to_string())
})?;
}

merged_options = if let Some(prev_crawl_req) = prev_crawl_req {
let previous_crawl_options: CrawlOptions =
serde_json::from_value(prev_crawl_req.crawl_options)
.map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
crawl_options.merge(previous_crawl_options)
} else {
crawl_options
if let Some(interval) = crawl_options.interval.clone() {
let interval = match interval {
CrawlInterval::Daily => std::time::Duration::from_secs(60 * 60 * 24),
CrawlInterval::Weekly => std::time::Duration::from_secs(60 * 60 * 24 * 7),
CrawlInterval::Monthly => std::time::Duration::from_secs(60 * 60 * 24 * 30),
};

diesel::update(
crawl_requests_table::crawl_requests
.filter(crawl_requests_table::dataset_id.eq(dataset_id)),
)
.set(crawl_requests_table::crawl_options.eq(
serde_json::to_value(merged_options.clone()).map_err(|e| {
log::error!("Failed to serialize crawl options: {:?}", e);
ServiceError::BadRequest("Failed to serialize crawl options".to_string())
})?,
))
.set(crawl_requests_table::interval.eq(interval.as_secs() as i32))
.execute(&mut conn)
.await
.map_err(|e| {
log::error!("Error updating crawl options on crawl_requests: {:?}", e);
log::error!("Error updating interval on crawl_requests: {:?}", e);
ServiceError::InternalServerError(
"Error updating crawl options on crawl_requests".to_string(),
"Error updating interval on crawl_requests".to_string(),
)
})?;
}

let merged_options = if let Some(prev_crawl_req) = prev_crawl_req {
let previous_crawl_options: CrawlOptions =
serde_json::from_value(prev_crawl_req.crawl_options)
.map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
crawl_options.merge(previous_crawl_options)
} else {
crawl_options
};

diesel::update(
crawl_requests_table::crawl_requests
.filter(crawl_requests_table::dataset_id.eq(dataset_id)),
)
.set(crawl_requests_table::crawl_options.eq(
serde_json::to_value(merged_options.clone()).map_err(|e| {
log::error!("Failed to serialize crawl options: {:?}", e);
ServiceError::BadRequest("Failed to serialize crawl options".to_string())
})?,
))
.execute(&mut conn)
.await
.map_err(|e| {
log::error!("Error updating crawl options on crawl_requests: {:?}", e);
ServiceError::InternalServerError(
"Error updating crawl options on crawl_requests".to_string(),
)
})?;

create_crawl_query(
merged_options.clone(),
pool.clone(),
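
Because the point of the fix is that YouTube crawl options now survive a save, a read-back sketch may be useful when verifying the behavior. It assumes the same diesel-async setup, Pool type, and crawl_requests schema used above; the function name and its existence are illustrative, not part of the commit:

// Hypothetical helper: load the crawl options actually persisted for a dataset,
// e.g. so the dashboard can re-render what was saved.
pub async fn get_saved_crawl_options(
    dataset_id: uuid::Uuid,
    pool: web::Data<Pool>,
) -> Result<Option<CrawlOptions>, ServiceError> {
    use crate::data::schema::crawl_requests::dsl as crawl_requests_table;

    let mut conn = pool
        .get()
        .await
        .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;

    let request = crawl_requests_table::crawl_requests
        .select((
            crawl_requests_table::id,
            crawl_requests_table::url,
            crawl_requests_table::status,
            crawl_requests_table::next_crawl_at,
            crawl_requests_table::interval,
            crawl_requests_table::crawl_options,
            crawl_requests_table::scrape_id,
            crawl_requests_table::dataset_id,
            crawl_requests_table::created_at,
        ))
        .filter(crawl_requests_table::dataset_id.eq(dataset_id))
        .first::<CrawlRequestPG>(&mut conn)
        .await
        .optional()?;

    // The crawl_options column holds JSON; deserialize it back into CrawlOptions.
    request
        .map(|r| serde_json::from_value(r.crawl_options))
        .transpose()
        .map_err(|e| ServiceError::InternalServerError(e.to_string()))
}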