Skip to content

Commit

Permalink
fix(web-scraping): surface web page content tracker errors in the API…
Browse files Browse the repository at this point in the history
… responses
  • Loading branch information
azasypkin committed Nov 17, 2023
1 parent b879bf1 commit 888c8ac
Show file tree
Hide file tree
Showing 6 changed files with 169 additions and 19 deletions.
6 changes: 3 additions & 3 deletions src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ pub use self::{
WebPageResourceContentData, WebPageResourceDiffStatus, WebPageResourcesData,
WebPageResourcesTrackerGetHistoryParams, WebPageResourcesTrackerTag, WebPageTracker,
WebPageTrackerCreateParams, WebPageTrackerKind, WebPageTrackerSettings, WebPageTrackerTag,
WebPageTrackerUpdateParams, WebScraperContentRequest, WebScraperContentRequestScripts,
WebScraperContentResponse, WebScraperResource, WebScraperResourcesRequest,
WebScraperResourcesRequestScripts, WebScraperResourcesResponse,
WebPageTrackerUpdateParams, WebScraperContentError, WebScraperContentRequest,
WebScraperContentRequestScripts, WebScraperContentResponse, WebScraperResource,
WebScraperResourcesRequest, WebScraperResourcesRequestScripts, WebScraperResourcesResponse,
WEB_PAGE_CONTENT_TRACKER_EXTRACT_SCRIPT_NAME,
WEB_PAGE_RESOURCES_TRACKER_FILTER_SCRIPT_NAME,
},
Expand Down
7 changes: 4 additions & 3 deletions src/utils/web_scraping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,10 @@ pub use self::{
WebPageResource, WebPageResourceContent, WebPageResourceContentData,
WebPageResourceDiffStatus, WebPageResourcesData, WebPageResourcesTrackerTag,
WebPageTracker, WebPageTrackerKind, WebPageTrackerSettings, WebPageTrackerTag,
WebScraperContentRequest, WebScraperContentRequestScripts, WebScraperContentResponse,
WebScraperResource, WebScraperResourcesRequest, WebScraperResourcesRequestScripts,
WebScraperResourcesResponse, MAX_WEB_PAGE_TRACKER_DELAY, MAX_WEB_PAGE_TRACKER_REVISIONS,
WebScraperContentError, WebScraperContentRequest, WebScraperContentRequestScripts,
WebScraperContentResponse, WebScraperResource, WebScraperResourcesRequest,
WebScraperResourcesRequestScripts, WebScraperResourcesResponse, MAX_WEB_PAGE_TRACKER_DELAY,
MAX_WEB_PAGE_TRACKER_REVISIONS,
},
};
use crate::{
Expand Down
123 changes: 112 additions & 11 deletions src/utils/web_scraping/api_ext.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ use crate::{
WebScraperResource, MAX_WEB_PAGE_TRACKER_DELAY, MAX_WEB_PAGE_TRACKER_REVISIONS,
},
WebPageContentTrackerTag, WebPageDataRevision, WebPageResource, WebPageResourcesData,
WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerTag, WebScraperContentRequest,
WebScraperContentRequestScripts, WebScraperContentResponse, WebScraperResourcesRequest,
WebScraperResourcesRequestScripts, WebScraperResourcesResponse,
WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerTag, WebScraperContentError,
WebScraperContentRequest, WebScraperContentRequestScripts, WebScraperContentResponse,
WebScraperResourcesRequest, WebScraperResourcesRequestScripts, WebScraperResourcesResponse,
},
};
use anyhow::{anyhow, bail};
Expand Down Expand Up @@ -413,17 +413,47 @@ impl<'a, DR: DnsResolver, ET: EmailTransport> WebScrapingApiExt<'a, DR, ET> {
))
.json(&scraper_request)
.send()
.await?
.json::<WebScraperContentResponse>()
.await
.map_err(|err| {
log::error!(
"Cannot fetch content for `{}` ('{}'): {:?}",
tracker.url,
anyhow!(
"Could not connect to the web scraper service to extract content for the web tracker ('{}'): {:?}",
tracker.id,
err
)
})?;

if !scraper_response.status().is_success() {
let is_client_error = scraper_response.status().is_client_error();
let scraper_error_response = scraper_response
.json::<WebScraperContentError>()
.await
.map_err(|err| {
anyhow!(
"Could not deserialize scraper error response for the web tracker ('{}'): {:?}",
tracker.id,
err
)
})?;
if is_client_error {
bail!(SecutilsError::client(scraper_error_response.message));
} else {
bail!(
"Unexpected scraper error for the web tracker ('{}'): {:?}",
tracker.id,
scraper_error_response.message
);
anyhow!("Web page tracker cannot fetch content due to unexpected error")
}
}

let scraper_response = scraper_response
.json::<WebScraperContentResponse>()
.await
.map_err(|err| {
anyhow!(
"Could not deserialize scraper response for the web tracker ('{}'): {:?}",
tracker.id,
err
)
})?;

// Check if there is a revision with the same timestamp. If so, drop newly fetched revision.
Expand Down Expand Up @@ -868,12 +898,13 @@ mod tests {
WebPageContentTrackerTag, WebPageResource, WebPageResourceDiffStatus,
WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerCreateParams,
WebPageTrackerKind, WebPageTrackerSettings, WebPageTrackerUpdateParams,
WebScraperContentRequest, WebScraperContentResponse, WebScraperResource,
WebScraperResourcesRequest, WebScraperResourcesResponse,
WebScraperContentError, WebScraperContentRequest, WebScraperContentResponse,
WebScraperResource, WebScraperResourcesRequest, WebScraperResourcesResponse,
WEB_PAGE_CONTENT_TRACKER_EXTRACT_SCRIPT_NAME,
WEB_PAGE_RESOURCES_TRACKER_FILTER_SCRIPT_NAME,
},
};
use actix_web::ResponseError;
use futures::StreamExt;
use httpmock::MockServer;
use insta::assert_debug_snapshot;
Expand Down Expand Up @@ -2711,6 +2742,76 @@ mod tests {
Ok(())
}

// Verifies that a client (4xx) error returned by the web scraper service
// during content extraction is surfaced to the API caller: the call fails
// with a `SecutilsError` carrying the scraper's status code and message,
// and no content revision is persisted for the tracker.
#[tokio::test]
async fn properly_forwards_error_if_web_page_content_extraction_fails() -> anyhow::Result<()> {
    // Point the web scraper component at a local mock HTTP server.
    let server = MockServer::start();
    let mut config = mock_config()?;
    config.components.web_scraper_url = Url::parse(&server.base_url())?;

    let api = mock_api_with_config(config).await?;
    let mock_user = mock_user()?;
    api.db.insert_user(&mock_user).await?;

    // Create a content tracker whose `refresh` will hit the mock scraper.
    let web_scraping = WebScrapingApiExt::new(&api);
    let tracker = web_scraping
        .create_content_tracker(
            mock_user.id,
            WebPageTrackerCreateParams {
                name: "name_one".to_string(),
                url: Url::parse("https://secutils.dev/one")?,
                settings: WebPageTrackerSettings {
                    revisions: 3,
                    delay: Duration::from_millis(2000),
                    enable_notifications: true,
                    schedule: Some("0 0 * * * *".to_string()),
                    scripts: Default::default(),
                },
            },
        )
        .await?;

    // The scraper endpoint answers the expected content request with
    // HTTP 400 and a JSON `WebScraperContentError` payload.
    let content_mock = server.mock(|when, then| {
        when.method(httpmock::Method::POST)
            .path("/api/web_page/content")
            .json_body(
                serde_json::to_value(
                    WebScraperContentRequest::with_default_parameters(&tracker.url)
                        .set_delay(Duration::from_millis(2000)),
                )
                .unwrap(),
            );
        then.status(400)
            .header("Content-Type", "application/json")
            .json_body_obj(&WebScraperContentError {
                message: "some client-error".to_string(),
            });
    });

    // Refreshing tracker history should fail, and the error should downcast
    // to `SecutilsError` preserving the scraper's 400 status and message.
    let scraper_error = web_scraping
        .get_content_tracker_history(
            mock_user.id,
            tracker.id,
            WebPageContentTrackerGetHistoryParams { refresh: true },
        )
        .await
        .unwrap_err()
        .downcast::<SecutilsError>()
        .unwrap();
    assert_eq!(scraper_error.status_code(), 400);
    assert_debug_snapshot!(
        scraper_error,
        @r###""some client-error""###
    );

    // A failed refresh must not leave a revision behind.
    let tracker_content = web_scraping
        .get_content_tracker_history(mock_user.id, tracker.id, Default::default())
        .await?;
    assert!(tracker_content.is_empty());
    // The mock must have been called exactly as specified.
    content_mock.assert();

    Ok(())
}

#[tokio::test]
async fn properly_ignores_web_page_resources_with_the_same_timestamp() -> anyhow::Result<()> {
let server = MockServer::start();
Expand Down
4 changes: 2 additions & 2 deletions src/utils/web_scraping/web_page_trackers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ mod web_page_tracker_tag;

pub use self::{
web_page_content::{
WebPageContentTrackerTag, WebScraperContentRequest, WebScraperContentRequestScripts,
WebScraperContentResponse,
WebPageContentTrackerTag, WebScraperContentError, WebScraperContentRequest,
WebScraperContentRequestScripts, WebScraperContentResponse,
},
web_page_data_revision::WebPageDataRevision,
web_page_resources::{
Expand Down
2 changes: 2 additions & 0 deletions src/utils/web_scraping/web_page_trackers/web_page_content.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
mod web_page_content_tracker_tag;
mod web_scraper_content_error;
mod web_scraper_content_request;
mod web_scraper_content_response;

pub use self::{
web_page_content_tracker_tag::WebPageContentTrackerTag,
web_scraper_content_error::WebScraperContentError,
web_scraper_content_request::{WebScraperContentRequest, WebScraperContentRequestScripts},
web_scraper_content_response::WebScraperContentResponse,
};
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
use serde::{Deserialize, Serialize};

/// Error payload returned by the web scraper service when it couldn't
/// extract content; deserialized from the scraper's JSON error response.
#[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct WebScraperContentError {
    /// Message describing why content extraction failed.
    pub message: String,
}

#[cfg(test)]
mod tests {
    use super::WebScraperContentError;
    use insta::assert_json_snapshot;

    // Deserializing the scraper's JSON error body should populate `message`.
    #[test]
    fn deserialization() -> anyhow::Result<()> {
        assert_eq!(
            serde_json::from_str::<WebScraperContentError>(
                r#"
        {
            "message": "some-error"
        }
                  "#
            )?,
            WebScraperContentError {
                message: "some-error".to_string(),
            }
        );

        Ok(())
    }

    // Serializing the error should produce the expected camelCase JSON shape.
    #[test]
    fn serialization() -> anyhow::Result<()> {
        assert_json_snapshot!(WebScraperContentError {
            message: "some-error".to_string(),
        }, @r###"
        {
          "message": "some-error"
        }
        "###);

        Ok(())
    }
}

0 comments on commit 888c8ac

Please sign in to comment.