
Commit

wip: can't deserialize firecrawl response in crawl_operator from crawl-worker
skeptrunedev committed Dec 13, 2024
1 parent 5a8f9a7 commit bf9c846
Showing 10 changed files with 380 additions and 35 deletions.
16 changes: 15 additions & 1 deletion .vscode/launch.json
@@ -7,7 +7,7 @@
"name": "Debug executable 'trieve-server'",
"cargo": {
"args": [
"+default",
"+nightly",
"build",
"--manifest-path=./server/Cargo.toml",
"--bin=trieve-server",
@@ -35,6 +35,20 @@
"args": [],
"cwd": "${workspaceFolder}/server"
},
{
"type": "lldb",
"request": "launch",
"name": "Debug executable 'crawl-worker'",
"cargo": {
"args": [
"build",
"--manifest-path=./server/Cargo.toml",
"--bin=crawl-worker"
]
},
"args": [],
"cwd": "${workspaceFolder}/server"
},
{
"type": "lldb",
"request": "launch",
35 changes: 35 additions & 0 deletions clients/ts-sdk/openapi.json
@@ -8460,6 +8460,15 @@
"type": "string",
"description": "The URL to crawl",
"nullable": true
},
"webhook_metadata": {
"description": "Metadata to send back with the webhook call for each successful page scrape",
"nullable": true
},
"webhook_url": {
"type": "string",
"description": "Host to call back on the webhook for each successful page scrape",
"nullable": true
}
},
"example": {
@@ -10261,12 +10270,26 @@
"event_type"
],
"properties": {
"detected_hallucinations": {
"type": "array",
"items": {
"type": "string"
},
"description": "The detected hallucinations of the RAG event",
"nullable": true
},
"event_type": {
"type": "string",
"enum": [
"rag"
]
},
"hallucination_score": {
"type": "number",
"format": "double",
"description": "The hallucination score of the RAG event",
"nullable": true
},
"llm_response": {
"type": "string",
"description": "The response from the LLM",
@@ -12646,6 +12669,8 @@
"results",
"dataset_id",
"llm_response",
"hallucination_score",
"detected_hallucinations",
"created_at",
"user_id"
],
@@ -12657,6 +12682,16 @@
"type": "string",
"format": "uuid"
},
"detected_hallucinations": {
"type": "array",
"items": {
"type": "string"
}
},
"hallucination_score": {
"type": "number",
"format": "double"
},
"id": {
"type": "string",
"format": "uuid"
18 changes: 18 additions & 0 deletions clients/ts-sdk/src/types.gen.ts
@@ -624,6 +624,14 @@ export type CrawlOptions = {
* The URL to crawl
*/
site_url?: (string) | null;
/**
* Metadata to send back with the webhook call for each successful page scrape
*/
webhook_metadata?: unknown;
/**
* Host to call back on the webhook for each successful page scrape
*/
webhook_url?: (string) | null;
};

/**
@@ -1351,7 +1359,15 @@ export type EventTypes = {
*/
user_id?: (string) | null;
} | {
/**
* The detected hallucinations of the RAG event
*/
detected_hallucinations?: Array<(string)> | null;
event_type: 'rag';
/**
* The hallucination score of the RAG event
*/
hallucination_score?: (number) | null;
/**
* The response from the LLM
*/
@@ -2127,6 +2143,8 @@ export type RAGUsageResponse = {
export type RagQueryEvent = {
created_at: string;
dataset_id: string;
detected_hallucinations: Array<(string)>;
hallucination_score: number;
id: string;
llm_response: string;
query_rating?: ((SearchQueryRating) | null);
8 changes: 3 additions & 5 deletions docker-compose-firecrawl.yml
@@ -2,7 +2,7 @@ name: firecrawl
services:
# Firecrawl services
playwright-service:
image: trieve/puppeteer-service-ts:v0.0.7
image: trieve/puppeteer-service-ts:v0.0.8
environment:
- PORT=3000
- PROXY_SERVER=${PROXY_SERVER}
@@ -15,7 +15,7 @@ services:
- backend

firecrawl-api:
image: trieve/firecrawl:v0.0.47
image: trieve/firecrawl:v0.0.48
networks:
- backend
environment:
@@ -27,7 +27,6 @@ services:
- BULL_AUTH_KEY=${BULL_AUTH_KEY}
- TEST_API_KEY=${TEST_API_KEY}
- HOST=${HOST:-0.0.0.0}
- SELF_HOSTED_WEBHOOK_URL=${SELF_HOSTED_WEBHOOK_URL}
- LOGGING_LEVEL=${LOGGING_LEVEL}
extra_hosts:
- "host.docker.internal:host-gateway"
@@ -38,7 +37,7 @@ services:
command: ["pnpm", "run", "start:production"]

firecrawl-worker:
image: trieve/firecrawl:v0.0.46
image: trieve/firecrawl:v0.0.48
networks:
- backend
environment:
@@ -51,7 +50,6 @@ services:
- TEST_API_KEY=${TEST_API_KEY}
- SCRAPING_BEE_API_KEY=${SCRAPING_BEE_API_KEY}
- HOST=${HOST:-0.0.0.0}
- SELF_HOSTED_WEBHOOK_URL=${SELF_HOSTED_WEBHOOK_URL}
- LOGGING_LEVEL=${LOGGING_LEVEL}
extra_hosts:
- "host.docker.internal:host-gateway"
48 changes: 26 additions & 22 deletions server/src/bin/crawl-worker.rs
@@ -296,17 +296,17 @@ async fn get_chunks_with_firecrawl(
let page_count = data.len();

for page in data {
let page = match page {
let crawl_doc = match page {
Some(page) => page,
None => continue,
};

if page.metadata.status_code != Some(200) {
log::error!("Error getting metadata for page: {:?}", page.metadata);
if crawl_doc.metadata.status_code != Some(200) {
log::error!("Error getting metadata for page: {:?}", crawl_doc.metadata);
continue;
}

let page_link = page
let page_link = crawl_doc
.metadata
.source_url
.clone()
@@ -316,14 +316,18 @@
if page_link.is_empty() {
println!(
"Error page source_url is not present for page_metadata: {:?}",
page.metadata
crawl_doc.metadata
);
continue;
}

let page_title = page.metadata.og_title.clone().unwrap_or_default();
let page_description = page.metadata.og_description.clone().unwrap_or_default();
let page_html = page.html.clone().unwrap_or_default();
let page_title = crawl_doc.metadata.og_title.clone().unwrap_or_default();
let page_description = crawl_doc
.metadata
.og_description
.clone()
.unwrap_or(crawl_doc.metadata.description.unwrap_or_default().clone());
let page_html = crawl_doc.html.clone().unwrap_or_default();
let page_tags = get_tags(page_link.clone());

if let Some(spec) = &spec {
@@ -588,20 +592,20 @@

#[allow(clippy::print_stdout)]
async fn crawl(
scrape_request: CrawlRequest,
crawl_request: CrawlRequest,
pool: web::Data<Pool>,
redis_pool: web::Data<RedisPool>,
) -> Result<ScrapeReport, ServiceError> {
log::info!("Starting crawl for scrape_id: {}", scrape_request.id);
log::info!("Starting crawl for scrape_id: {}", crawl_request.id);
let (page_count, chunks_created) = if let Some(ScrapeOptions::Shopify(_)) =
scrape_request.crawl_options.scrape_options.clone()
crawl_request.crawl_options.scrape_options.clone()
{
let mut cur_page = 1;
let mut chunk_count = 0;

loop {
let mut chunks: Vec<ChunkReqPayload> = Vec::new();
let cleaned_url = scrape_request.url.trim_end_matches("/");
let cleaned_url = crawl_request.url.trim_end_matches("/");
let url = format!("{}/products.json?page={}", cleaned_url, cur_page);

let response: ShopifyResponse = ureq::get(&url)
@@ -622,15 +626,15 @@ async fn crawl(
&product,
&product.variants[0],
cleaned_url,
&scrape_request,
&crawl_request,
)?);
} else {
for variant in &product.variants {
chunks.push(create_chunk_req_payload(
&product,
variant,
cleaned_url,
&scrape_request,
&crawl_request,
)?);
}
}
@@ -640,7 +644,7 @@

for chunk in chunks_to_upload {
let (chunk_ingestion_message, chunk_metadatas) =
create_chunk_metadata(chunk.to_vec(), scrape_request.dataset_id).await?;
create_chunk_metadata(chunk.to_vec(), crawl_request.dataset_id).await?;

let mut redis_conn = redis_pool
.get()
@@ -671,12 +675,12 @@ async fn crawl(
(cur_page, chunk_count)
} else {
let (chunks, page_count) =
get_chunks_with_firecrawl(scrape_request.clone(), pool.clone()).await?;
get_chunks_with_firecrawl(crawl_request.clone(), pool.clone()).await?;
let chunks_to_upload = chunks.chunks(120);

for chunk in chunks_to_upload {
for batch in chunks_to_upload {
let (chunk_ingestion_message, chunk_metadatas) =
create_chunk_metadata(chunk.to_vec(), scrape_request.dataset_id).await?;
create_chunk_metadata(batch.to_vec(), crawl_request.dataset_id).await?;

let mut redis_conn = redis_pool
.get()
@@ -703,21 +707,21 @@ async fn crawl(
};

update_crawl_status(
scrape_request.scrape_id,
crawl_request.scrape_id,
CrawlStatus::Completed,
pool.clone(),
)
.await?;

update_next_crawl_at(
scrape_request.scrape_id,
scrape_request.next_crawl_at + scrape_request.interval,
crawl_request.scrape_id,
crawl_request.next_crawl_at + crawl_request.interval,
pool.clone(),
)
.await?;

Ok(ScrapeReport {
request_id: scrape_request.id,
request_id: crawl_request.id,
pages_scraped: page_count,
chunks_created,
})
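Note: the hunks above only rename `page`/`scrape_request` to `crawl_doc`/`crawl_request` and add a fallback from `og_description` to `description`, so the deserialization failure named in the commit title is not resolved here. As a rough sketch only, with struct and JSON key names inferred from the field accesses above rather than taken from the real crawl-worker types, making every metadata field optional lets serde accept pages where Firecrawl omits or nulls a value:

use serde::Deserialize;

// Minimal sketch, not the worker's actual types: a crawl-document shape
// inferred from the accesses above (html, metadata.status_code,
// metadata.source_url, metadata.og_title, metadata.og_description,
// metadata.description). The camelCase JSON keys are an assumption about
// Firecrawl's payload; every field is Option so a page with missing or
// null metadata still deserializes instead of failing the whole batch.
#[derive(Debug, Deserialize)]
struct CrawlDocMetadata {
    #[serde(rename = "statusCode")]
    status_code: Option<u16>,
    #[serde(rename = "sourceURL")]
    source_url: Option<String>,
    #[serde(rename = "ogTitle")]
    og_title: Option<String>,
    #[serde(rename = "ogDescription")]
    og_description: Option<String>,
    description: Option<String>,
}

#[derive(Debug, Deserialize)]
struct CrawlDoc {
    html: Option<String>,
    metadata: CrawlDocMetadata,
}

fn main() -> Result<(), serde_json::Error> {
    // A page that only reports a status code and source URL still parses.
    let raw = r#"{"html": null, "metadata": {"statusCode": 200, "sourceURL": "https://example.com"}}"#;
    let doc: CrawlDoc = serde_json::from_str(raw)?;
    println!("{:?}", doc);
    Ok(())
}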
14 changes: 13 additions & 1 deletion server/src/data/models.rs
@@ -7688,6 +7688,10 @@ pub struct CrawlOptions {
pub body_remove_strings: Option<Vec<String>>,
/// Options for including an openapi spec in the crawl
pub scrape_options: Option<ScrapeOptions>,
/// Host to call back on the webhook for each successful page scrape
pub webhook_url: Option<String>,
/// Metadata to send back with the webhook call for each successful page scrape
pub webhook_metadata: Option<serde_json::Value>,
}

#[derive(Serialize, Deserialize, Debug, ToSchema, Clone)]
@@ -7733,6 +7737,8 @@ impl CrawlOptions {
.body_remove_strings
.clone()
.or(other.body_remove_strings.clone()),
webhook_url: None,
webhook_metadata: None,
}
}
}
@@ -7753,6 +7759,10 @@ pub struct FirecrawlCrawlRequest {
pub allow_external_links: Option<bool>,
#[serde(skip_serializing_if = "Option::is_none")]
pub scrape_options: Option<FirecrawlScraperOptions>,
#[serde(skip_serializing_if = "Option::is_none")]
pub webhook_url: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
pub webhook_metadata: Option<serde_json::Value>,
}

#[derive(Debug, Serialize, Deserialize, Clone, ToSchema, Default)]
@@ -7780,9 +7790,11 @@ impl From<CrawlOptions> for FirecrawlCrawlRequest {
scrape_options: Some(FirecrawlScraperOptions {
include_tags: crawl_options.include_tags,
exclude_tags: crawl_options.exclude_tags,
formats: Some(vec!["html".to_string(), "rawHtml".to_string()]),
formats: Some(vec!["rawHtml".to_string()]),
wait_for: Some(1000),
}),
webhook_url: crawl_options.webhook_url,
webhook_metadata: crawl_options.webhook_metadata,
}
}
}
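Note: the new webhook_url and webhook_metadata fields on FirecrawlCrawlRequest use skip_serializing_if = "Option::is_none", so they are omitted from the request body sent to Firecrawl when unset, and the merge path shown earlier resets both to None. A minimal standalone sketch of that serde behavior, with hypothetical example values (only the two field names come from this commit):

use serde::Serialize;

// Sketch of the skip_serializing_if pattern applied to the two new fields;
// the rest of FirecrawlCrawlRequest is elided.
#[derive(Serialize)]
struct WebhookFields {
    #[serde(skip_serializing_if = "Option::is_none")]
    webhook_url: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    webhook_metadata: Option<serde_json::Value>,
}

fn main() {
    let set = WebhookFields {
        webhook_url: Some("https://example.com/hooks/page-scraped".into()),
        webhook_metadata: Some(serde_json::json!({ "tenant": "acme" })),
    };
    let unset = WebhookFields {
        webhook_url: None,
        webhook_metadata: None,
    };

    // Prints both keys when values are present...
    println!("{}", serde_json::to_string(&set).unwrap());
    // ...and "{}" when they are None: the keys are dropped, not sent as null.
    println!("{}", serde_json::to_string(&unset).unwrap());
}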
4 changes: 2 additions & 2 deletions server/src/handlers/dataset_handler.rs
@@ -9,7 +9,7 @@ use crate::{
middleware::auth_middleware::{verify_admin, verify_owner},
operators::{
crawl_operator::{
crawl, get_crawl_request_by_dataset_id_query, update_crawl_settings_for_dataset,
create_crawl_query, get_crawl_request_by_dataset_id_query, update_crawl_settings_for_dataset,
validate_crawl_options,
},
dataset_operator::{
@@ -175,7 +175,7 @@ pub async fn create_dataset(
let d = create_dataset_query(dataset.clone(), pool.clone()).await?;

if let Some(crawl_options) = data.crawl_options.clone() {
crawl(
create_crawl_query(
crawl_options.clone(),
pool.clone(),
redis_pool.clone(),
