bugfix: youtube crawl options not saving #2976

Merged
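From the diff, this PR appears to do three things: the dashboard disables the generic crawl-option inputs (sitemap, external links, page limit, include/exclude paths, and selectors) for YouTube datasets the same way it already does for Shopify; `create_youtube_crawl_request` now takes a database `Pool` and inserts a `crawl_requests` row before enqueueing the `VideoCrawlMessage`, so the options for YouTube crawls actually get persisted; and `update_crawl_settings_for_dataset` applies the url/interval/options updates unconditionally rather than behind a scrape-options check. The `html_page` webhook URL also switches from a hard-coded ngrok tunnel to `BASE_SERVER_URL` with an `https://api.trieve.ai` fallback.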
14 changes: 7 additions & 7 deletions frontends/dashboard/src/pages/dataset/CrawlingSettings.tsx
@@ -455,7 +455,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
<input
class="h-3 w-3 rounded border border-neutral-300 bg-neutral-100 p-1 accent-magenta-400 dark:border-neutral-900 dark:bg-neutral-800"
type="checkbox"
- disabled={isShopify()}
+ disabled={isShopify() || isYoutube()}
checked={options.ignore_sitemap ?? true}
onChange={(e) => {
setOptions("ignore_sitemap", e.currentTarget.checked);
@@ -472,7 +472,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
<input
class="h-3 w-3 rounded border border-neutral-300 bg-neutral-100 p-1 accent-magenta-400 dark:border-neutral-900 dark:bg-neutral-800"
type="checkbox"
- disabled={isShopify()}
+ disabled={isShopify() || isYoutube()}
checked={options.allow_external_links ?? false}
onChange={(e) => {
setOptions("allow_external_links", e.currentTarget.checked);
@@ -531,7 +531,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
<input
class="block max-w-[100px] rounded border border-neutral-300 px-3 py-1.5 shadow-sm placeholder:text-neutral-400 focus:outline-magenta-500 sm:text-sm sm:leading-6"
type="number"
- disabled={isShopify()}
+ disabled={isShopify() || isYoutube()}
value={options.limit || "0"}
onInput={(e) => {
setOptions("limit", parseInt(e.currentTarget.value));
@@ -585,7 +585,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
/>
</div>
<MultiStringInput
- disabled={isShopify()}
+ disabled={isShopify() || isYoutube()}
placeholder="https://example.com/include/*"
addClass="bg-magenta-100/40 px-2 rounded text-sm border border-magenta-300/40"
inputClass="w-full"
@@ -606,7 +606,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
/>
</div>
<MultiStringInput
- disabled={isShopify()}
+ disabled={isShopify() || isYoutube()}
placeholder="https://example.com/exclude/*"
addClass="bg-magenta-100/40 px-2 text-sm rounded border border-magenta-300/40"
addLabel="Add Path"
@@ -626,7 +626,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
/>
</div>
<MultiStringInput
- disabled={isShopify()}
+ disabled={isShopify() || isYoutube()}
placeholder="h1..."
addClass="bg-magenta-100/40 text-sm px-2 rounded border border-magenta-300/40"
addLabel="Add Selector"
@@ -646,7 +646,7 @@ const RealCrawlingSettings = (props: RealCrawlingSettingsProps) => {
/>
</div>
<MultiStringInput
- disabled={isShopify()}
+ disabled={isShopify() || isYoutube()}
placeholder="button..."
addClass="bg-magenta-100/40 px-2 text-sm rounded border border-magenta-300/40"
addLabel="Add Selector"
190 changes: 107 additions & 83 deletions server/src/operators/crawl_operator.rs
@@ -157,16 +157,14 @@ pub async fn create_crawl_query(
.as_ref()
.is_some_and(|f| matches!(f, &ScrapeOptions::Youtube(_)))
{
- create_youtube_crawl_request(crawl_options, dataset_id, broccoli_queue)
+ create_youtube_crawl_request(crawl_options, dataset_id, pool, broccoli_queue)
.await
.map_err(|err| ServiceError::BadRequest(format!("Could not crawl site: {}", err)))?;
Ok(None)
} else {
let webhook_url = format!(
"{}/api/file/html_page",
std::env::var("FOO_BAR").unwrap_or(
"https://5b0f-2600-1700-460-1070-f5b9-429e-fb2-70d5.ngrok-free.app".to_string()
)
std::env::var("BASE_SERVER_URL").unwrap_or("https://api.trieve.ai".to_string())
);
let webhook_metadata = serde_json::json!({
"dataset_id": dataset_id,
@@ -307,10 +305,45 @@ pub async fn create_crawl_request(
pub async fn create_youtube_crawl_request(
crawl_options: CrawlOptions,
dataset_id: uuid::Uuid,
+ pool: web::Data<Pool>,
broccoli_queue: web::Data<BroccoliQueue>,
) -> Result<(), ServiceError> {
+ use crate::data::schema::crawl_requests::dsl as crawl_requests_table;
+
+ let interval = match crawl_options.interval {
+ Some(CrawlInterval::Daily) => std::time::Duration::from_secs(60 * 60 * 24),
+ Some(CrawlInterval::Weekly) => std::time::Duration::from_secs(60 * 60 * 24 * 7),
+ Some(CrawlInterval::Monthly) => std::time::Duration::from_secs(60 * 60 * 24 * 30),
+ None => std::time::Duration::from_secs(60 * 60 * 24),
+ };
+
+ let new_crawl_request: CrawlRequestPG = CrawlRequest {
+ id: uuid::Uuid::new_v4(),
+ url: crawl_options.site_url.clone().unwrap_or_default(),
+ status: CrawlStatus::Pending,
+ interval,
+ next_crawl_at: chrono::Utc::now().naive_utc(),
+ crawl_options,
+ scrape_id: uuid::Uuid::default(),
+ dataset_id,
+ created_at: chrono::Utc::now().naive_utc(),
+ attempt_number: 0,
+ }
+ .into();
+
+ let mut conn = pool
+ .get()
+ .await
+ .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
+
+ diesel::insert_into(crawl_requests_table::crawl_requests)
+ .values(&new_crawl_request)
+ .execute(&mut conn)
+ .await
+ .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
+
let message = VideoCrawlMessage {
- channel_url: crawl_options.site_url.clone().unwrap_or_default(),
+ channel_url: new_crawl_request.url.clone(),
dataset_id,
};
broccoli_queue
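The fix above relies on `CrawlRequest { .. }.into()` producing a `CrawlRequestPG` row that Diesel can insert, which is also why the interval is read back as whole seconds (`interval.as_secs() as i32`) elsewhere in this PR. The conversion itself is not part of the diff; a simplified sketch of what it plausibly looks like (the field list and types here are assumptions, not the real `models.rs` definitions):

```rust
use std::time::Duration;

// Assumed, simplified stand-ins for the real Trieve structs (not the actual
// definitions); only the fields relevant to the conversion are shown.
struct CrawlRequest {
    url: String,
    interval: Duration,
}

struct CrawlRequestPG {
    url: String,
    // Stored as whole seconds, matching the `interval.as_secs() as i32`
    // update seen in update_crawl_settings_for_dataset.
    interval: i32,
}

impl From<CrawlRequest> for CrawlRequestPG {
    fn from(req: CrawlRequest) -> Self {
        CrawlRequestPG {
            url: req.url,
            interval: req.interval.as_secs() as i32,
        }
    }
}

fn main() {
    let pg: CrawlRequestPG = CrawlRequest {
        url: "https://www.youtube.com/@example".to_string(),
        interval: Duration::from_secs(60 * 60 * 24),
    }
    .into();
    assert_eq!(pg.interval, 86_400);
    println!("{} every {}s", pg.url, pg.interval);
}
```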
@@ -375,100 +408,91 @@ pub async fn update_crawl_settings_for_dataset(
redis_pool: web::Data<RedisPool>,
) -> Result<(), ServiceError> {
use crate::data::schema::crawl_requests::dsl as crawl_requests_table;
- let mut merged_options = crawl_options.clone();
- if crawl_options
- .scrape_options
- .as_ref()
- .is_some_and(|f| matches!(f, &ScrapeOptions::Youtube(_)))
- {
- let mut conn = pool
- .get()
- .await
- .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;

- let prev_crawl_req = crawl_requests_table::crawl_requests
- .select((
- crawl_requests_table::id,
- crawl_requests_table::url,
- crawl_requests_table::status,
- crawl_requests_table::next_crawl_at,
- crawl_requests_table::interval,
- crawl_requests_table::crawl_options,
- crawl_requests_table::scrape_id,
- crawl_requests_table::dataset_id,
- crawl_requests_table::created_at,
- ))
- .filter(crawl_requests_table::dataset_id.eq(dataset_id))
- .first::<CrawlRequestPG>(&mut conn)
- .await
- .optional()?;
+ let mut conn = pool
+ .get()
+ .await
+ .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;

- if let Some(ref url) = crawl_options.site_url {
- diesel::update(
- crawl_requests_table::crawl_requests
- .filter(crawl_requests_table::dataset_id.eq(dataset_id)),
- )
- .set(crawl_requests_table::url.eq(url))
- .execute(&mut conn)
- .await
- .map_err(|e| {
- log::error!("Error updating url on crawl_requests: {:?}", e);
- ServiceError::InternalServerError(
- "Error updating url on crawl_requests".to_string(),
- )
- })?;
- }
+ let prev_crawl_req = crawl_requests_table::crawl_requests
+ .select((
+ crawl_requests_table::id,
+ crawl_requests_table::url,
+ crawl_requests_table::status,
+ crawl_requests_table::next_crawl_at,
+ crawl_requests_table::interval,
+ crawl_requests_table::crawl_options,
+ crawl_requests_table::scrape_id,
+ crawl_requests_table::dataset_id,
+ crawl_requests_table::created_at,
+ ))
+ .filter(crawl_requests_table::dataset_id.eq(dataset_id))
+ .first::<CrawlRequestPG>(&mut conn)
+ .await
+ .optional()?;

- if let Some(interval) = crawl_options.interval.clone() {
- let interval = match interval {
- CrawlInterval::Daily => std::time::Duration::from_secs(60 * 60 * 24),
- CrawlInterval::Weekly => std::time::Duration::from_secs(60 * 60 * 24 * 7),
- CrawlInterval::Monthly => std::time::Duration::from_secs(60 * 60 * 24 * 30),
- };
- diesel::update(
- crawl_requests_table::crawl_requests
- .filter(crawl_requests_table::dataset_id.eq(dataset_id)),
- )
- .set(crawl_requests_table::interval.eq(interval.as_secs() as i32))
- .execute(&mut conn)
- .await
- .map_err(|e| {
- log::error!("Error updating interval on crawl_requests: {:?}", e);
- ServiceError::InternalServerError(
- "Error updating interval on crawl_requests".to_string(),
- )
- })?;
- }
+ if let Some(ref url) = crawl_options.site_url {
+ diesel::update(
+ crawl_requests_table::crawl_requests
+ .filter(crawl_requests_table::dataset_id.eq(dataset_id)),
+ )
+ .set(crawl_requests_table::url.eq(url))
+ .execute(&mut conn)
+ .await
+ .map_err(|e| {
+ log::error!("Error updating url on crawl_requests: {:?}", e);
+ ServiceError::InternalServerError("Error updating url on crawl_requests".to_string())
+ })?;
+ }

- merged_options = if let Some(prev_crawl_req) = prev_crawl_req {
- let previous_crawl_options: CrawlOptions =
- serde_json::from_value(prev_crawl_req.crawl_options)
- .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
- crawl_options.merge(previous_crawl_options)
- } else {
- crawl_options
+ if let Some(interval) = crawl_options.interval.clone() {
+ let interval = match interval {
+ CrawlInterval::Daily => std::time::Duration::from_secs(60 * 60 * 24),
+ CrawlInterval::Weekly => std::time::Duration::from_secs(60 * 60 * 24 * 7),
+ CrawlInterval::Monthly => std::time::Duration::from_secs(60 * 60 * 24 * 30),
};

diesel::update(
crawl_requests_table::crawl_requests
.filter(crawl_requests_table::dataset_id.eq(dataset_id)),
)
- .set(crawl_requests_table::crawl_options.eq(
- serde_json::to_value(merged_options.clone()).map_err(|e| {
- log::error!("Failed to serialize crawl options: {:?}", e);
- ServiceError::BadRequest("Failed to serialize crawl options".to_string())
- })?,
- ))
+ .set(crawl_requests_table::interval.eq(interval.as_secs() as i32))
.execute(&mut conn)
.await
.map_err(|e| {
log::error!("Error updating crawl options on crawl_requests: {:?}", e);
log::error!("Error updating interval on crawl_requests: {:?}", e);
ServiceError::InternalServerError(
"Error updating crawl options on crawl_requests".to_string(),
"Error updating interval on crawl_requests".to_string(),
)
})?;
}

+ let merged_options = if let Some(prev_crawl_req) = prev_crawl_req {
+ let previous_crawl_options: CrawlOptions =
+ serde_json::from_value(prev_crawl_req.crawl_options)
+ .map_err(|e| ServiceError::InternalServerError(e.to_string()))?;
+ crawl_options.merge(previous_crawl_options)
+ } else {
+ crawl_options
+ };

+ diesel::update(
+ crawl_requests_table::crawl_requests
+ .filter(crawl_requests_table::dataset_id.eq(dataset_id)),
+ )
+ .set(crawl_requests_table::crawl_options.eq(
+ serde_json::to_value(merged_options.clone()).map_err(|e| {
+ log::error!("Failed to serialize crawl options: {:?}", e);
+ ServiceError::BadRequest("Failed to serialize crawl options".to_string())
+ })?,
+ ))
+ .execute(&mut conn)
+ .await
+ .map_err(|e| {
+ log::error!("Error updating crawl options on crawl_requests: {:?}", e);
+ ServiceError::InternalServerError(
+ "Error updating crawl options on crawl_requests".to_string(),
+ )
+ })?;

create_crawl_query(
merged_options.clone(),
pool.clone(),
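Since the bug was about options not being persisted and then merged, note that `crawl_options.merge(previous_crawl_options)` is used above but not defined in this diff; it presumably fills in fields the caller left as `None` from the previously stored options, letting explicit new values win. A minimal field-wise sketch under that assumption (the struct here is cut down; the real `CrawlOptions` has more fields):

```rust
// Cut-down stand-in for CrawlOptions; the merge semantics shown here are an
// assumption (new values take precedence, stored values fill the gaps).
#[derive(Clone, Debug, PartialEq)]
struct CrawlOptions {
    site_url: Option<String>,
    limit: Option<i32>,
    ignore_sitemap: Option<bool>,
}

impl CrawlOptions {
    fn merge(self, previous: CrawlOptions) -> CrawlOptions {
        CrawlOptions {
            site_url: self.site_url.or(previous.site_url),
            limit: self.limit.or(previous.limit),
            ignore_sitemap: self.ignore_sitemap.or(previous.ignore_sitemap),
        }
    }
}

fn main() {
    let previous = CrawlOptions {
        site_url: Some("https://www.youtube.com/@example".to_string()),
        limit: Some(100),
        ignore_sitemap: Some(true),
    };
    // An update that only changes the limit keeps the previously saved fields.
    let update = CrawlOptions {
        site_url: None,
        limit: Some(500),
        ignore_sitemap: None,
    };
    let merged = update.merge(previous);
    assert_eq!(merged.limit, Some(500));
    assert_eq!(merged.ignore_sitemap, Some(true));
    println!("{:?}", merged);
}
```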