diff --git a/src/utils.rs b/src/utils.rs
index afa2aef..915eeb3 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -37,9 +37,9 @@ pub use self::{
         WebPageResourceContentData, WebPageResourceDiffStatus, WebPageResourcesData,
         WebPageResourcesTrackerGetHistoryParams, WebPageResourcesTrackerTag, WebPageTracker,
         WebPageTrackerCreateParams, WebPageTrackerKind, WebPageTrackerSettings, WebPageTrackerTag,
-        WebPageTrackerUpdateParams, WebScraperContentRequest, WebScraperContentRequestScripts,
-        WebScraperContentResponse, WebScraperResource, WebScraperResourcesRequest,
-        WebScraperResourcesRequestScripts, WebScraperResourcesResponse,
+        WebPageTrackerUpdateParams, WebScraperContentError, WebScraperContentRequest,
+        WebScraperContentRequestScripts, WebScraperContentResponse, WebScraperResource,
+        WebScraperResourcesRequest, WebScraperResourcesRequestScripts, WebScraperResourcesResponse,
         WEB_PAGE_CONTENT_TRACKER_EXTRACT_SCRIPT_NAME,
         WEB_PAGE_RESOURCES_TRACKER_FILTER_SCRIPT_NAME,
     },
diff --git a/src/utils/web_scraping.rs b/src/utils/web_scraping.rs
index 8e9d59f..7ba7bf4 100644
--- a/src/utils/web_scraping.rs
+++ b/src/utils/web_scraping.rs
@@ -15,9 +15,10 @@ pub use self::{
         WebPageResource, WebPageResourceContent, WebPageResourceContentData,
         WebPageResourceDiffStatus, WebPageResourcesData, WebPageResourcesTrackerTag,
         WebPageTracker, WebPageTrackerKind, WebPageTrackerSettings, WebPageTrackerTag,
-        WebScraperContentRequest, WebScraperContentRequestScripts, WebScraperContentResponse,
-        WebScraperResource, WebScraperResourcesRequest, WebScraperResourcesRequestScripts,
-        WebScraperResourcesResponse, MAX_WEB_PAGE_TRACKER_DELAY, MAX_WEB_PAGE_TRACKER_REVISIONS,
+        WebScraperContentError, WebScraperContentRequest, WebScraperContentRequestScripts,
+        WebScraperContentResponse, WebScraperResource, WebScraperResourcesRequest,
+        WebScraperResourcesRequestScripts, WebScraperResourcesResponse, MAX_WEB_PAGE_TRACKER_DELAY,
+        MAX_WEB_PAGE_TRACKER_REVISIONS,
     },
 };
 use crate::{
diff --git a/src/utils/web_scraping/api_ext.rs b/src/utils/web_scraping/api_ext.rs
index ce77440..b358c08 100644
--- a/src/utils/web_scraping/api_ext.rs
+++ b/src/utils/web_scraping/api_ext.rs
@@ -22,9 +22,9 @@ use crate::{
             WebScraperResource, MAX_WEB_PAGE_TRACKER_DELAY, MAX_WEB_PAGE_TRACKER_REVISIONS,
         },
         WebPageContentTrackerTag, WebPageDataRevision, WebPageResource, WebPageResourcesData,
-        WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerTag, WebScraperContentRequest,
-        WebScraperContentRequestScripts, WebScraperContentResponse, WebScraperResourcesRequest,
-        WebScraperResourcesRequestScripts, WebScraperResourcesResponse,
+        WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerTag, WebScraperContentError,
+        WebScraperContentRequest, WebScraperContentRequestScripts, WebScraperContentResponse,
+        WebScraperResourcesRequest, WebScraperResourcesRequestScripts, WebScraperResourcesResponse,
     },
 };
 use anyhow::{anyhow, bail};
@@ -413,17 +413,47 @@ impl<'a, DR: DnsResolver, ET: EmailTransport> WebScrapingApiExt<'a, DR, ET> {
             ))
             .json(&scraper_request)
             .send()
-            .await?
-            .json::<WebScraperContentResponse>()
             .await
             .map_err(|err| {
-                log::error!(
-                    "Cannot fetch content for `{}` ('{}'): {:?}",
-                    tracker.url,
+                anyhow!(
+                    "Could not connect to the web scraper service to extract content for the web tracker ('{}'): {:?}",
                     tracker.id,
                     err
+                )
+            })?;
+
+        if !scraper_response.status().is_success() {
+            let is_client_error = scraper_response.status().is_client_error();
+            let scraper_error_response = scraper_response
+                .json::<WebScraperContentError>()
+                .await
+                .map_err(|err| {
+                    anyhow!(
+                        "Could not deserialize scraper error response for the web tracker ('{}'): {:?}",
+                        tracker.id,
+                        err
+                    )
+                })?;
+            if is_client_error {
+                bail!(SecutilsError::client(scraper_error_response.message));
+            } else {
+                bail!(
+                    "Unexpected scraper error for the web tracker ('{}'): {:?}",
+                    tracker.id,
+                    scraper_error_response.message
                 );
-                anyhow!("Web page tracker cannot fetch content due to unexpected error")
+            }
+        }
+
+        let scraper_response = scraper_response
+            .json::<WebScraperContentResponse>()
+            .await
+            .map_err(|err| {
+                anyhow!(
+                    "Could not deserialize scraper response for the web tracker ('{}'): {:?}",
+                    tracker.id,
+                    err
+                )
             })?;
 
         // Check if there is a revision with the same timestamp. If so, drop newly fetched revision.
@@ -868,12 +898,13 @@ mod tests {
         WebPageContentTrackerTag, WebPageResource, WebPageResourceDiffStatus,
         WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerCreateParams,
         WebPageTrackerKind, WebPageTrackerSettings, WebPageTrackerUpdateParams,
-        WebScraperContentRequest, WebScraperContentResponse, WebScraperResource,
-        WebScraperResourcesRequest, WebScraperResourcesResponse,
+        WebScraperContentError, WebScraperContentRequest, WebScraperContentResponse,
+        WebScraperResource, WebScraperResourcesRequest, WebScraperResourcesResponse,
         WEB_PAGE_CONTENT_TRACKER_EXTRACT_SCRIPT_NAME,
         WEB_PAGE_RESOURCES_TRACKER_FILTER_SCRIPT_NAME,
         },
     };
+    use actix_web::ResponseError;
     use futures::StreamExt;
     use httpmock::MockServer;
     use insta::assert_debug_snapshot;
@@ -2711,6 +2742,76 @@ mod tests {
         Ok(())
     }
 
+    #[tokio::test]
+    async fn properly_forwards_error_if_web_page_content_extraction_fails() -> anyhow::Result<()> {
+        let server = MockServer::start();
+        let mut config = mock_config()?;
+        config.components.web_scraper_url = Url::parse(&server.base_url())?;
+
+        let api = mock_api_with_config(config).await?;
+        let mock_user = mock_user()?;
+        api.db.insert_user(&mock_user).await?;
+
+        let web_scraping = WebScrapingApiExt::new(&api);
+        let tracker = web_scraping
+            .create_content_tracker(
+                mock_user.id,
+                WebPageTrackerCreateParams {
+                    name: "name_one".to_string(),
+                    url: Url::parse("https://secutils.dev/one")?,
+                    settings: WebPageTrackerSettings {
+                        revisions: 3,
+                        delay: Duration::from_millis(2000),
+                        enable_notifications: true,
+                        schedule: Some("0 0 * * * *".to_string()),
+                        scripts: Default::default(),
+                    },
+                },
+            )
+            .await?;
+
+        let content_mock = server.mock(|when, then| {
+            when.method(httpmock::Method::POST)
+                .path("/api/web_page/content")
+                .json_body(
+                    serde_json::to_value(
+                        WebScraperContentRequest::with_default_parameters(&tracker.url)
+                            .set_delay(Duration::from_millis(2000)),
+                    )
+                    .unwrap(),
+                );
+            then.status(400)
+                .header("Content-Type", "application/json")
+                .json_body_obj(&WebScraperContentError {
+                    message: "some client-error".to_string(),
+                });
+        });
+
+        let scraper_error = web_scraping
+            .get_content_tracker_history(
+                mock_user.id,
+                tracker.id,
+                WebPageContentTrackerGetHistoryParams { refresh: true },
+            )
+            .await
+            .unwrap_err()
+            .downcast::<SecutilsError>()
+            .unwrap();
+        assert_eq!(scraper_error.status_code(), 400);
+        assert_debug_snapshot!(
+            scraper_error,
+            @r###""some client-error""###
+        );
+
+        let tracker_content = web_scraping
+            .get_content_tracker_history(mock_user.id, tracker.id, Default::default())
+            .await?;
+        assert!(tracker_content.is_empty());
+        content_mock.assert();
+
+        Ok(())
+    }
+
     #[tokio::test]
     async fn properly_ignores_web_page_resources_with_the_same_timestamp() -> anyhow::Result<()> {
         let server = MockServer::start();
diff --git a/src/utils/web_scraping/web_page_trackers.rs b/src/utils/web_scraping/web_page_trackers.rs
index 6bd8a81..53b9cae 100644
--- a/src/utils/web_scraping/web_page_trackers.rs
+++ b/src/utils/web_scraping/web_page_trackers.rs
@@ -8,8 +8,8 @@ mod web_page_tracker_tag;
 
 pub use self::{
     web_page_content::{
-        WebPageContentTrackerTag, WebScraperContentRequest, WebScraperContentRequestScripts,
-        WebScraperContentResponse,
+        WebPageContentTrackerTag, WebScraperContentError, WebScraperContentRequest,
+        WebScraperContentRequestScripts, WebScraperContentResponse,
     },
     web_page_data_revision::WebPageDataRevision,
     web_page_resources::{
diff --git a/src/utils/web_scraping/web_page_trackers/web_page_content.rs b/src/utils/web_scraping/web_page_trackers/web_page_content.rs
index 337fa70..22beb5e 100644
--- a/src/utils/web_scraping/web_page_trackers/web_page_content.rs
+++ b/src/utils/web_scraping/web_page_trackers/web_page_content.rs
@@ -1,9 +1,11 @@
 mod web_page_content_tracker_tag;
+mod web_scraper_content_error;
 mod web_scraper_content_request;
 mod web_scraper_content_response;
 
 pub use self::{
     web_page_content_tracker_tag::WebPageContentTrackerTag,
+    web_scraper_content_error::WebScraperContentError,
     web_scraper_content_request::{WebScraperContentRequest, WebScraperContentRequestScripts},
     web_scraper_content_response::WebScraperContentResponse,
 };
diff --git a/src/utils/web_scraping/web_page_trackers/web_page_content/web_scraper_content_error.rs b/src/utils/web_scraping/web_page_trackers/web_page_content/web_scraper_content_error.rs
new file mode 100644
index 0000000..23d5166
--- /dev/null
+++ b/src/utils/web_scraping/web_page_trackers/web_page_content/web_scraper_content_error.rs
@@ -0,0 +1,46 @@
+use serde::{Deserialize, Serialize};
+
+/// Represents the error response returned when the scraper fails to extract content.
+#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)]
+#[serde(rename_all = "camelCase")]
+pub struct WebScraperContentError {
+    /// Error message.
+    pub message: String,
+}
+
+#[cfg(test)]
+mod tests {
+    use super::WebScraperContentError;
+    use insta::assert_json_snapshot;
+
+    #[test]
+    fn deserialization() -> anyhow::Result<()> {
+        assert_eq!(
+            serde_json::from_str::<WebScraperContentError>(
+                r#"
+{
+    "message": "some-error"
+}
+                "#
+            )?,
+            WebScraperContentError {
+                message: "some-error".to_string(),
+            }
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn serialization() -> anyhow::Result<()> {
+        assert_json_snapshot!(WebScraperContentError {
+            message: "some-error".to_string(),
+        }, @r###"
+        {
+          "message": "some-error"
+        }
+        "###);
+
+        Ok(())
+    }
+}
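
Note: the error-forwarding flow introduced in the api_ext.rs hunk above boils down to branching on the HTTP status before parsing the success payload, so that 4xx messages can be surfaced to the user while 5xx details stay internal. Below is a minimal standalone sketch of that pattern, assuming a reqwest client with the json feature enabled; the ScraperError/ScraperResult types and the fetch_content helper are hypothetical stand-ins for WebScraperContentError, WebScraperContentResponse, and the SecutilsError-based handling in the real code.

use anyhow::{anyhow, bail};
use serde::Deserialize;

// Hypothetical stand-in for WebScraperContentError.
#[derive(Deserialize)]
struct ScraperError {
    message: String,
}

// Hypothetical stand-in for WebScraperContentResponse.
#[derive(Deserialize)]
struct ScraperResult {
    content: String,
}

// Sketch of the new flow: send the request, then branch on the HTTP status
// before deserializing the body.
async fn fetch_content(client: &reqwest::Client, url: &str) -> anyhow::Result<ScraperResult> {
    let response = client
        .post(url)
        .send()
        .await
        .map_err(|err| anyhow!("Could not connect to the web scraper service: {err:?}"))?;

    if !response.status().is_success() {
        // Capture the status class before `json()` consumes the response.
        let is_client_error = response.status().is_client_error();
        let error = response.json::<ScraperError>().await?;
        if is_client_error {
            // 4xx: the scraper's message is safe to forward to the end user.
            bail!("{}", error.message);
        }
        // 5xx: wrap as an opaque internal error instead.
        bail!("Unexpected scraper error: {:?}", error.message);
    }

    // Only a successful response is parsed as the content payload.
    response
        .json::<ScraperResult>()
        .await
        .map_err(|err| anyhow!("Could not deserialize scraper response: {err:?}"))
}

Checking the status first matters because reqwest's Response::json consumes the body: the old code's single `.json::<WebScraperContentResponse>()` call could not distinguish a malformed success payload from a structured error response.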