From 473191e8f292701891192ed48f0d66ba13cd38f9 Mon Sep 17 00:00:00 2001 From: Aleh Zasypkin Date: Sun, 19 Nov 2023 18:48:52 +0100 Subject: [PATCH] feat(web-scraping): notify users about failed attempts to check changes in resources or content --- assets/templates/tracker_styles.hbs | 14 + ...web_page_content_tracker_changes_email.hbs | 19 +- ...ge_content_tracker_changes_error_email.hbs | 21 + ...b_page_resources_tracker_changes_email.hbs | 19 +- ..._resources_tracker_changes_error_email.hbs | 21 + .../notification_content_template.rs | 137 ++++- .../web_page_content_tracker_changes.rs | 50 +- .../web_page_resources_tracker_changes.rs | 58 ++- .../web_page_trackers_fetch_job.rs | 491 ++++++++++++++++-- src/utils.rs | 4 +- src/utils/web_scraping.rs | 4 +- src/utils/web_scraping/api_ext.rs | 146 +++++- src/utils/web_scraping/web_page_trackers.rs | 6 +- .../web_page_trackers/web_page_content.rs | 2 - .../web_page_trackers/web_scraper.rs | 3 + .../web_scraper_error_response.rs} | 12 +- tools/api/utils/web_scraping_content.http | 15 + 17 files changed, 863 insertions(+), 159 deletions(-) create mode 100644 assets/templates/tracker_styles.hbs create mode 100644 assets/templates/web_page_content_tracker_changes_error_email.hbs create mode 100644 assets/templates/web_page_resources_tracker_changes_error_email.hbs create mode 100644 src/utils/web_scraping/web_page_trackers/web_scraper.rs rename src/utils/web_scraping/web_page_trackers/{web_page_content/web_scraper_content_error.rs => web_scraper/web_scraper_error_response.rs} (71%) create mode 100644 tools/api/utils/web_scraping_content.http diff --git a/assets/templates/tracker_styles.hbs b/assets/templates/tracker_styles.hbs new file mode 100644 index 0000000..9bde122 --- /dev/null +++ b/assets/templates/tracker_styles.hbs @@ -0,0 +1,14 @@ + diff --git a/assets/templates/web_page_content_tracker_changes_email.hbs b/assets/templates/web_page_content_tracker_changes_email.hbs index 157e00e..3fb4508 100644 --- 
a/assets/templates/web_page_content_tracker_changes_email.hbs +++ b/assets/templates/web_page_content_tracker_changes_email.hbs @@ -1,28 +1,15 @@ - "{{tracker_name}}" content tracker detected changes + "{{tracker_name}}" tracker detected content changes {{> email_styles}} - + {{> tracker_styles}}
-

"{{tracker_name}}" content tracker detected changes

+

"{{tracker_name}}" tracker detected content changes

To learn more, visit the Content trackers page:

Web Scraping → Content trackers

If the button above doesn't work, you can navigate to the following URL directly:

diff --git a/assets/templates/web_page_content_tracker_changes_error_email.hbs b/assets/templates/web_page_content_tracker_changes_error_email.hbs new file mode 100644 index 0000000..18b2229 --- /dev/null +++ b/assets/templates/web_page_content_tracker_changes_error_email.hbs @@ -0,0 +1,21 @@ + + + + "{{tracker_name}}" tracker failed to check for content changes + + + {{> email_styles}} + {{> tracker_styles}} + + +
+

"{{tracker_name}}" tracker failed to check for content changes

+

There was an error while checking content: {{error_message}}.

+

To check the tracker configuration and re-try, visit the Content trackers page:

+ Web Scraping → Content trackers +

If the button above doesn't work, you can navigate to the following URL directly:

+

{{back_link}}

+ Secutils.dev logo +
+ + diff --git a/assets/templates/web_page_resources_tracker_changes_email.hbs b/assets/templates/web_page_resources_tracker_changes_email.hbs index 8beb3af..7e46a33 100644 --- a/assets/templates/web_page_resources_tracker_changes_email.hbs +++ b/assets/templates/web_page_resources_tracker_changes_email.hbs @@ -1,28 +1,15 @@ - "{{tracker_name}}" resources tracker detected {{changes_count}} changes + "{{tracker_name}}" tracker detected {{changes_count}} changes in resources {{> email_styles}} - + {{> tracker_styles}}
-

"{{tracker_name}}" resources tracker detected {{changes_count}} changes

+

"{{tracker_name}}" tracker detected {{changes_count}} changes in resources

To learn more, visit the Resources trackers page:

Web Scraping → Resources trackers

If the button above doesn't work, you can navigate to the following URL directly:

diff --git a/assets/templates/web_page_resources_tracker_changes_error_email.hbs b/assets/templates/web_page_resources_tracker_changes_error_email.hbs new file mode 100644 index 0000000..599a2c7 --- /dev/null +++ b/assets/templates/web_page_resources_tracker_changes_error_email.hbs @@ -0,0 +1,21 @@ + + + + "{{tracker_name}}" tracker failed to check for changes in resources + + + {{> email_styles}} + {{> tracker_styles}} + + +
+

"{{tracker_name}}" tracker failed to check for changes in resources

+

There was an error while checking resources: {{error_message}}.

+

To check the tracker configuration and re-try, visit the Resources trackers page:

+ Web Scraping → Resources trackers +

If the button above doesn't work, you can navigate to the following URL directly:

+

{{back_link}}

+ Secutils.dev logo +
+ + diff --git a/src/notifications/notification_content_template.rs b/src/notifications/notification_content_template.rs index 24814f2..a4f518c 100644 --- a/src/notifications/notification_content_template.rs +++ b/src/notifications/notification_content_template.rs @@ -22,9 +22,11 @@ pub enum NotificationContentTemplate { WebPageResourcesTrackerChanges { tracker_name: String, changes_count: usize, + error_message: Option, }, WebPageContentTrackerChanges { tracker_name: String, + error_message: Option, }, } @@ -44,16 +46,26 @@ impl NotificationContentTemplate { NotificationContentTemplate::WebPageResourcesTrackerChanges { tracker_name, changes_count, + error_message, } => { web_page_resources_tracker_changes::compile_to_email( api, tracker_name, *changes_count, + error_message.as_deref(), ) .await } - NotificationContentTemplate::WebPageContentTrackerChanges { tracker_name } => { - web_page_content_tracker_changes::compile_to_email(api, tracker_name).await + NotificationContentTemplate::WebPageContentTrackerChanges { + tracker_name, + error_message, + } => { + web_page_content_tracker_changes::compile_to_email( + api, + tracker_name, + error_message.as_deref(), + ) + .await } } } @@ -147,6 +159,61 @@ mod tests { let mut template = NotificationContentTemplate::WebPageResourcesTrackerChanges { tracker_name: "tracker".to_string(), changes_count: 10, + error_message: None, + } + .compile_to_email(&api) + .await?; + template + .attachments + .as_mut() + .unwrap() + .iter_mut() + .for_each(|a| { + a.content = a.content.len().to_be_bytes().iter().cloned().collect_vec(); + }); + + assert_debug_snapshot!(template, @r###" + EmailNotificationContent { + subject: "[Secutils.dev] Change detected: \"tracker\"", + text: "\"tracker\" tracker detected 10 changes in resources. Visit http://localhost:1234/ws/web_scraping__resources to learn more.", + html: Some( + "\n\n\n \"tracker\" tracker detected 10 changes in resources\n \n \n \n \n\n\n
\n

\"tracker\" tracker detected 10 changes in resources

\n

To learn more, visit the Resources trackers page:

\n Web Scraping → Resources trackers\n

If the button above doesn't work, you can navigate to the following URL directly:

\n

http://localhost:1234/ws/web_scraping__resources

\n \"Secutils.dev\n
\n\n\n", + ), + attachments: Some( + [ + EmailNotificationAttachment { + disposition: Inline( + "secutils-logo", + ), + content_type: "image/png", + content: [ + 0, + 0, + 0, + 0, + 0, + 0, + 15, + 165, + ], + }, + ], + ), + } + "### + ); + + Ok(()) + } + + #[tokio::test] + async fn can_compile_resources_tracker_changes_error_template_to_email() -> anyhow::Result<()> { + let api = mock_api().await?; + + let mut template = NotificationContentTemplate::WebPageResourcesTrackerChanges { + tracker_name: "tracker".to_string(), + changes_count: 0, + error_message: Some("Something went wrong".to_string()), } .compile_to_email(&api) .await?; @@ -161,10 +228,10 @@ mod tests { assert_debug_snapshot!(template, @r###" EmailNotificationContent { - subject: "Notification: \"tracker\" resources tracker detected 10 changes", - text: "\"tracker\" resources tracker detected 10 changes. Visit http://localhost:1234/ws/web_scraping__resources to learn more.", + subject: "[Secutils.dev] Check failed: \"tracker\"", + text: "\"tracker\" tracker failed to check for changes in resources due to the following error: Something went wrong. Visit http://localhost:1234/ws/web_scraping__resources to learn more.", html: Some( - "\n\n\n \"tracker\" resources tracker detected 10 changes\n \n \n \n \n\n\n
\n

\"tracker\" resources tracker detected 10 changes

\n

To learn more, visit the Resources trackers page:

\n Web Scraping → Resources trackers\n

If the button above doesn't work, you can navigate to the following URL directly:

\n

http://localhost:1234/ws/web_scraping__resources

\n \"Secutils.dev\n
\n\n\n", + "\n\n\n \"tracker\" tracker failed to check for changes in resources\n \n \n \n \n\n\n
\n

\"tracker\" tracker failed to check for changes in resources

\n

There was an error while checking resources: Something went wrong.

\n

To check the tracker configuration and re-try, visit the Resources trackers page:

\n Web Scraping → Resources trackers\n

If the button above doesn't work, you can navigate to the following URL directly:

\n

http://localhost:1234/ws/web_scraping__resources

\n \"Secutils.dev\n
\n\n\n", ), attachments: Some( [ @@ -199,6 +266,60 @@ mod tests { let mut template = NotificationContentTemplate::WebPageContentTrackerChanges { tracker_name: "tracker".to_string(), + error_message: None, + } + .compile_to_email(&api) + .await?; + template + .attachments + .as_mut() + .unwrap() + .iter_mut() + .for_each(|a| { + a.content = a.content.len().to_be_bytes().iter().cloned().collect_vec(); + }); + + assert_debug_snapshot!(template, @r###" + EmailNotificationContent { + subject: "[Secutils.dev] Change detected: \"tracker\"", + text: "\"tracker\" tracker detected content changes. Visit http://localhost:1234/ws/web_scraping__content to learn more.", + html: Some( + "\n\n\n \"tracker\" tracker detected content changes\n \n \n \n \n\n\n
\n

\"tracker\" tracker detected content changes

\n

To learn more, visit the Content trackers page:

\n Web Scraping → Content trackers\n

If the button above doesn't work, you can navigate to the following URL directly:

\n

http://localhost:1234/ws/web_scraping__content

\n \"Secutils.dev\n
\n\n\n", + ), + attachments: Some( + [ + EmailNotificationAttachment { + disposition: Inline( + "secutils-logo", + ), + content_type: "image/png", + content: [ + 0, + 0, + 0, + 0, + 0, + 0, + 15, + 165, + ], + }, + ], + ), + } + "### + ); + + Ok(()) + } + + #[tokio::test] + async fn can_compile_content_tracker_changes_error_template_to_email() -> anyhow::Result<()> { + let api = mock_api().await?; + + let mut template = NotificationContentTemplate::WebPageContentTrackerChanges { + tracker_name: "tracker".to_string(), + error_message: Some("Something went wrong".to_string()), } .compile_to_email(&api) .await?; @@ -213,10 +334,10 @@ mod tests { assert_debug_snapshot!(template, @r###" EmailNotificationContent { - subject: "Notification: \"tracker\" content tracker detected changes", - text: "\"tracker\" content tracker detected changes. Visit http://localhost:1234/ws/web_scraping__content to learn more.", + subject: "[Secutils.dev] Check failed: \"tracker\"", + text: "\"tracker\" tracker failed to check for content changes due to the following error: Something went wrong. Visit http://localhost:1234/ws/web_scraping__content to learn more.", html: Some( - "\n\n\n \"tracker\" content tracker detected changes\n \n \n \n \n\n\n
\n

\"tracker\" content tracker detected changes

\n

To learn more, visit the Content trackers page:

\n Web Scraping → Content trackers\n

If the button above doesn't work, you can navigate to the following URL directly:

\n

http://localhost:1234/ws/web_scraping__content

\n \"Secutils.dev\n
\n\n\n", + "\n\n\n \"tracker\" tracker failed to check for content changes\n \n \n \n \n\n\n
\n

\"tracker\" tracker failed to check for content changes

\n

There was an error while checking content: Something went wrong.

\n

To check the tracker configuration and re-try, visit the Content trackers page:

\n Web Scraping → Content trackers\n

If the button above doesn't work, you can navigate to the following URL directly:

\n

http://localhost:1234/ws/web_scraping__content

\n \"Secutils.dev\n
\n\n\n", ), attachments: Some( [ diff --git a/src/notifications/notification_content_template/web_page_content_tracker_changes.rs b/src/notifications/notification_content_template/web_page_content_tracker_changes.rs index 4c16ae8..46e93a8 100644 --- a/src/notifications/notification_content_template/web_page_content_tracker_changes.rs +++ b/src/notifications/notification_content_template/web_page_content_tracker_changes.rs @@ -12,25 +12,49 @@ pub const NOTIFICATION_LOGO_BYTES: &[u8] = pub async fn compile_to_email( api: &Api, tracker_name: &str, + error_message: Option<&str>, ) -> anyhow::Result { let back_link = format!("{}ws/web_scraping__content", api.config.public_url); - Ok(EmailNotificationContent::html_with_attachments( - format!( - "Notification: \"{}\" content tracker detected changes", - tracker_name - ), - format!( - "\"{}\" content tracker detected changes. Visit {} to learn more.", - tracker_name, back_link - ), - api.templates.render( - "web_page_content_tracker_changes_email", - &json!({ + + let (subject, text, html) = if let Some(error_message) = error_message { + ( + format!("[Secutils.dev] Check failed: \"{}\"", tracker_name), + format!( + "\"{}\" tracker failed to check for content changes due to the following error: {error_message}. Visit {} to learn more.", + tracker_name, back_link + ), + api.templates.render( + "web_page_content_tracker_changes_error_email", + &json!({ "tracker_name": tracker_name, + "error_message": error_message, "back_link": back_link, "home_link": api.config.public_url.as_str(), }), - )?, + )? + ) + } else { + ( + format!("[Secutils.dev] Change detected: \"{}\"", tracker_name), + format!( + "\"{}\" tracker detected content changes. 
Visit {} to learn more.", + tracker_name, back_link + ), + api.templates.render( + "web_page_content_tracker_changes_email", + &json!({ + "tracker_name": tracker_name, + "back_link": back_link, + "home_link": api.config.public_url.as_str(), + }), + )?, + ) + }; + + Ok(EmailNotificationContent::html_with_attachments( + subject, + text, + html, vec![EmailNotificationAttachment::inline( "secutils-logo", "image/png", diff --git a/src/notifications/notification_content_template/web_page_resources_tracker_changes.rs b/src/notifications/notification_content_template/web_page_resources_tracker_changes.rs index 8ba76a0..f249c0d 100644 --- a/src/notifications/notification_content_template/web_page_resources_tracker_changes.rs +++ b/src/notifications/notification_content_template/web_page_resources_tracker_changes.rs @@ -13,26 +13,50 @@ pub async fn compile_to_email( api: &Api, tracker_name: &str, changes_count: usize, + error_message: Option<&str>, ) -> anyhow::Result { let back_link = format!("{}ws/web_scraping__resources", api.config.public_url); + + let (subject, text, html) = if let Some(error_message) = error_message { + ( + format!("[Secutils.dev] Check failed: \"{}\"", tracker_name), + format!( + "\"{}\" tracker failed to check for changes in resources due to the following error: {error_message}. Visit {} to learn more.", + tracker_name, back_link + ), + api.templates.render( + "web_page_resources_tracker_changes_error_email", + &json!({ + "tracker_name": tracker_name, + "error_message": error_message, + "back_link": back_link, + "home_link": api.config.public_url.as_str(), + }), + )? + ) + } else { + ( + format!("[Secutils.dev] Change detected: \"{}\"", tracker_name), + format!( + "\"{}\" tracker detected {} changes in resources. 
Visit {} to learn more.", + tracker_name, changes_count, back_link + ), + api.templates.render( + "web_page_resources_tracker_changes_email", + &json!({ + "tracker_name": tracker_name, + "changes_count": changes_count, + "back_link": back_link, + "home_link": api.config.public_url.as_str(), + }), + )?, + ) + }; + Ok(EmailNotificationContent::html_with_attachments( - format!( - "Notification: \"{}\" resources tracker detected {} changes", - tracker_name, changes_count - ), - format!( - "\"{}\" resources tracker detected {} changes. Visit {} to learn more.", - tracker_name, changes_count, back_link - ), - api.templates.render( - "web_page_resources_tracker_changes_email", - &json!({ - "tracker_name": tracker_name, - "changes_count": changes_count, - "back_link": back_link, - "home_link": api.config.public_url.as_str(), - }), - )?, + subject, + text, + html, vec![EmailNotificationAttachment::inline( "secutils-logo", "image/png", diff --git a/src/scheduler/scheduler_jobs/web_page_trackers_fetch_job.rs b/src/scheduler/scheduler_jobs/web_page_trackers_fetch_job.rs index bf153b4..14923c5 100644 --- a/src/scheduler/scheduler_jobs/web_page_trackers_fetch_job.rs +++ b/src/scheduler/scheduler_jobs/web_page_trackers_fetch_job.rs @@ -1,5 +1,6 @@ use crate::{ api::Api, + error::Error as SecutilsError, network::{DnsResolver, EmailTransport, EmailTransportError}, notifications::{NotificationContent, NotificationContentTemplate, NotificationDestination}, scheduler::scheduler_job::SchedulerJob, @@ -118,20 +119,41 @@ impl WebPageTrackersFetchJob { // Create a new revision and retrieve a diff if any changes from the previous version are // detected. If there are any changes and the tracker hasn't opted out of notifications, // schedule a notification about the detected changes. 
- let new_revision_with_diff = - match web_scraping - .create_resources_tracker_revision(tracker.user_id, tracker.id) - .await - { - Ok(new_revision_with_diff) => new_revision_with_diff, - Err(err) => { - log::error!( + let new_revision_with_diff = match web_scraping + .create_resources_tracker_revision(tracker.user_id, tracker.id) + .await + { + Ok(new_revision_with_diff) => new_revision_with_diff, + Err(err) => { + log::error!( "Failed to create web page tracker ('{}') history revision, took {}: {:?}.", - tracker.id, humantime::format_duration(fetch_start.elapsed()), err + tracker.id, + humantime::format_duration(fetch_start.elapsed()), + err ); - continue; - } - }; + + // Notify user about the error and re-schedule the job. + let tracker_name = tracker.name.clone(); + Self::try_notify_user( + &api, + tracker, + NotificationContentTemplate::WebPageResourcesTrackerChanges { + tracker_name, + changes_count: 0, + error_message: Some( + err.downcast::() + .map(|err| format!("{}", err)) + .unwrap_or_else(|_| "Unknown error".to_string()), + ), + }, + ) + .await; + api.db + .set_scheduler_job_stopped_state(job_id, false) + .await?; + continue; + } + }; log::debug!( "Successfully created web page tracker ('{}') history revision, took {}.", tracker.id, @@ -153,26 +175,17 @@ impl WebPageTrackersFetchJob { .filter(|resource| resource.diff_status.is_some()), ) .count(); - let notification_schedule_result = api - .notifications() - .schedule_notification( - NotificationDestination::User(tracker.user_id), - NotificationContent::Template( - NotificationContentTemplate::WebPageResourcesTrackerChanges { - tracker_name: tracker.name, - changes_count, - }, - ), - OffsetDateTime::now_utc(), - ) - .await; - if let Err(err) = notification_schedule_result { - log::error!( - "Failed to schedule a notification for web page tracker ('{}'): {:?}.", - tracker.id, - err - ); - } + let tracker_name = tracker.name.clone(); + Self::try_notify_user( + &api, + tracker, + 
NotificationContentTemplate::WebPageResourcesTrackerChanges { + tracker_name, + changes_count, + error_message: None, + }, + ) + .await; } } @@ -218,6 +231,25 @@ impl WebPageTrackersFetchJob { humantime::format_duration(fetch_start.elapsed()), err ); + + // Notify user about the error and re-schedule the job. + let tracker_name = tracker.name.clone(); + Self::try_notify_user( + &api, + tracker, + NotificationContentTemplate::WebPageContentTrackerChanges { + tracker_name, + error_message: Some( + err.downcast::() + .map(|err| format!("{}", err)) + .unwrap_or_else(|_| "Unknown error".to_string()), + ), + }, + ) + .await; + api.db + .set_scheduler_job_stopped_state(job_id, false) + .await?; continue; } }; @@ -227,26 +259,17 @@ impl WebPageTrackersFetchJob { humantime::format_duration(fetch_start.elapsed()) ); - if tracker.settings.enable_notifications && new_revision.is_some() { - let notification_schedule_result = api - .notifications() - .schedule_notification( - NotificationDestination::User(tracker.user_id), - NotificationContent::Template( - NotificationContentTemplate::WebPageContentTrackerChanges { - tracker_name: tracker.name, - }, - ), - OffsetDateTime::now_utc(), - ) - .await; - if let Err(err) = notification_schedule_result { - log::error!( - "Failed to schedule a notification for web page tracker ('{}'): {:?}.", - tracker.id, - err - ); - } + if new_revision.is_some() { + let tracker_name = tracker.name.clone(); + Self::try_notify_user( + &api, + tracker, + NotificationContentTemplate::WebPageContentTrackerChanges { + tracker_name, + error_message: None, + }, + ) + .await; } api.db @@ -287,6 +310,34 @@ impl WebPageTrackersFetchJob { Ok(Some((tracker, job_id))) } + + async fn try_notify_user( + api: &Api, + tracker: WebPageTracker, + template: NotificationContentTemplate, + ) where + ET::Error: EmailTransportError, + { + if !tracker.settings.enable_notifications { + return; + } + + let notification_schedule_result = api + .notifications() + 
.schedule_notification( + NotificationDestination::User(tracker.user_id), + NotificationContent::Template(template), + OffsetDateTime::now_utc(), + ) + .await; + if let Err(err) = notification_schedule_result { + log::error!( + "Failed to schedule a notification for web page tracker ('{}'): {:?}.", + tracker.id, + err + ); + } + } } #[cfg(test)] @@ -303,8 +354,9 @@ mod tests { WebPageResourceContentData, WebPageResourcesData, WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerCreateParams, WebPageTrackerKind, WebPageTrackerSettings, WebScraperContentRequest, WebScraperContentRequestScripts, WebScraperContentResponse, - WebScraperResource, WebScraperResourcesRequest, WebScraperResourcesRequestScripts, - WebScraperResourcesResponse, WEB_PAGE_CONTENT_TRACKER_EXTRACT_SCRIPT_NAME, + WebScraperErrorResponse, WebScraperResource, WebScraperResourcesRequest, + WebScraperResourcesRequestScripts, WebScraperResourcesResponse, + WEB_PAGE_CONTENT_TRACKER_EXTRACT_SCRIPT_NAME, WEB_PAGE_RESOURCES_TRACKER_FILTER_SCRIPT_NAME, }, }; @@ -979,6 +1031,7 @@ mod tests { WebPageResourcesTrackerChanges { tracker_name: "tracker-one", changes_count: 2, + error_message: None, }, ), ), @@ -1002,6 +1055,172 @@ mod tests { Ok(()) } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn schedules_notification_when_resources_change_check_fails() -> anyhow::Result<()> { + let mut config = mock_config()?; + config.jobs.web_page_trackers_fetch = Schedule::try_from(mock_schedule_in_sec(3).as_str())?; + + let server = MockServer::start(); + config.components.web_scraper_url = Url::parse(&server.base_url())?; + + let user = mock_user()?; + let api = Arc::new(mock_api_with_config(config).await?); + let mut scheduler = JobScheduler::new_with_storage_and_code( + Box::new(SchedulerStore::new(api.db.clone())), + Box::::default(), + Box::::default(), + Box::::default(), + ) + .await?; + + // Make sure that the tracker is only run once during a single minute (2 seconds after the + // 
current second). + let tracker_schedule = mock_schedule_in_sec(1); + + // Create user, tracker and tracker job. + api.users().upsert(user.clone()).await?; + + let trigger_job_id = scheduler + .add( + WebPageTrackersTriggerJob::create( + api.clone(), + tracker_schedule.clone(), + WebPageTrackerKind::WebPageResources, + ) + .await?, + ) + .await?; + let tracker = WebPageTracker:: { + id: Uuid::now_v7(), + name: "tracker-one".to_string(), + url: "https://localhost:1234/my/app?q=2".parse()?, + settings: WebPageTrackerSettings { + revisions: 2, + schedule: Some(tracker_schedule), + delay: Duration::from_secs(2), + scripts: Default::default(), + enable_notifications: true, + }, + user_id: user.id, + job_id: Some(trigger_job_id), + // Preserve timestamp only up to seconds. + created_at: OffsetDateTime::from_unix_timestamp( + OffsetDateTime::now_utc().unix_timestamp(), + )?, + meta: None, + }; + + // Insert tracker directly to DB to bypass schedule validation. + api.db + .web_scraping(user.id) + .insert_web_page_tracker(&tracker) + .await?; + api.db + .web_scraping(user.id) + .insert_web_page_tracker_history_revision::( + &WebPageDataRevision { + id: uuid!("00000000-0000-0000-0000-000000000001"), + tracker_id: tracker.id, + created_at: OffsetDateTime::from_unix_timestamp(946720700)?, + data: WebPageResourcesData { + scripts: vec![], + styles: vec![], + }, + }, + ) + .await?; + + // Schedule fetch job + scheduler + .add(WebPageTrackersFetchJob::create(api.clone()).await?) 
+ .await?; + + let resources_mock = server.mock(|when, then| { + when.method(httpmock::Method::POST) + .path("/api/web_page/resources") + .json_body( + serde_json::to_value( + WebScraperResourcesRequest::with_default_parameters(&tracker.url) + .set_delay(Duration::from_millis(2000)), + ) + .unwrap(), + ); + then.status(400) + .header("Content-Type", "application/json") + .json_body_obj(&WebScraperErrorResponse { + message: "some client-error".to_string(), + }); + }); + + // Start scheduler and wait for a few seconds, then stop it. + scheduler.start().await?; + + while api + .db + .get_notification_ids( + OffsetDateTime::now_utc().add(Duration::from_secs(3600 * 24 * 365)), + 10, + ) + .collect::>() + .await + .is_empty() + { + thread::sleep(Duration::from_millis(100)); + } + + scheduler.shutdown().await?; + + resources_mock.assert(); + + let mut notification_ids = api + .db + .get_notification_ids( + OffsetDateTime::now_utc().add(Duration::from_secs(3600 * 24 * 365)), + 10, + ) + .collect::>() + .await; + assert_eq!(notification_ids.len(), 1); + + let notification = api.db.get_notification(notification_ids.remove(0)?).await?; + assert_debug_snapshot!(notification.map(|notification| (notification.destination, notification.content)), @r###" + Some( + ( + User( + UserId( + 1, + ), + ), + Template( + WebPageResourcesTrackerChanges { + tracker_name: "tracker-one", + changes_count: 0, + error_message: Some( + "some client-error", + ), + }, + ), + ), + ) + "###); + + assert_eq!( + api.web_scraping() + .get_resources_tracker_history(user.id, tracker.id, Default::default()) + .await? + .len(), + 1 + ); + assert!(!api + .db + .get_scheduler_job(trigger_job_id) + .await? 
+ .map(|job| job.stopped) + .unwrap_or_default()); + + Ok(()) + } + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn schedules_notification_when_content_change() -> anyhow::Result<()> { let mut config = mock_config()?; @@ -1143,6 +1362,7 @@ mod tests { Template( WebPageContentTrackerChanges { tracker_name: "tracker-one", + error_message: None, }, ), ), @@ -1165,4 +1385,167 @@ mod tests { Ok(()) } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn schedules_notification_when_content_change_check_fails() -> anyhow::Result<()> { + let mut config = mock_config()?; + config.jobs.web_page_trackers_fetch = Schedule::try_from(mock_schedule_in_sec(3).as_str())?; + + let server = MockServer::start(); + config.components.web_scraper_url = Url::parse(&server.base_url())?; + + let user = mock_user()?; + let api = Arc::new(mock_api_with_config(config).await?); + let mut scheduler = JobScheduler::new_with_storage_and_code( + Box::new(SchedulerStore::new(api.db.clone())), + Box::::default(), + Box::::default(), + Box::::default(), + ) + .await?; + + // Make sure that the tracker is only run once during a single minute (2 seconds after the + // current second). + let tracker_schedule = mock_schedule_in_sec(1); + + // Create user, tracker and tracker job. + api.users().upsert(user.clone()).await?; + + let trigger_job_id = scheduler + .add( + WebPageTrackersTriggerJob::create( + api.clone(), + tracker_schedule.clone(), + WebPageTrackerKind::WebPageContent, + ) + .await?, + ) + .await?; + let tracker = WebPageTracker:: { + id: Uuid::now_v7(), + name: "tracker-one".to_string(), + url: "https://localhost:1234/my/app?q=2".parse()?, + settings: WebPageTrackerSettings { + revisions: 2, + schedule: Some(tracker_schedule), + delay: Duration::from_secs(2), + scripts: Default::default(), + enable_notifications: true, + }, + user_id: user.id, + job_id: Some(trigger_job_id), + // Preserve timestamp only up to seconds. 
+ created_at: OffsetDateTime::from_unix_timestamp( + OffsetDateTime::now_utc().unix_timestamp(), + )?, + meta: None, + }; + + // Insert tracker directly to DB to bypass schedule validation. + api.db + .web_scraping(user.id) + .insert_web_page_tracker(&tracker) + .await?; + api.db + .web_scraping(user.id) + .insert_web_page_tracker_history_revision::( + &WebPageDataRevision { + id: uuid!("00000000-0000-0000-0000-000000000001"), + tracker_id: tracker.id, + created_at: OffsetDateTime::from_unix_timestamp(946720700)?, + data: "some-content".to_string(), + }, + ) + .await?; + + // Schedule fetch job + scheduler + .add(WebPageTrackersFetchJob::create(api.clone()).await?) + .await?; + + let content_mock = server.mock(|when, then| { + when.method(httpmock::Method::POST) + .path("/api/web_page/content") + .json_body( + serde_json::to_value( + WebScraperContentRequest::with_default_parameters(&tracker.url) + .set_delay(Duration::from_millis(2000)) + .set_previous_content("some-content"), + ) + .unwrap(), + ); + then.status(400) + .header("Content-Type", "application/json") + .json_body_obj(&WebScraperErrorResponse { + message: "some client-error".to_string(), + }); + }); + + // Start scheduler and wait for a few seconds, then stop it. 
+ scheduler.start().await?; + + while api + .db + .get_notification_ids( + OffsetDateTime::now_utc().add(Duration::from_secs(3600 * 24 * 365)), + 10, + ) + .collect::>() + .await + .is_empty() + { + thread::sleep(Duration::from_millis(100)); + } + + scheduler.shutdown().await?; + + content_mock.assert(); + + let mut notification_ids = api + .db + .get_notification_ids( + OffsetDateTime::now_utc().add(Duration::from_secs(3600 * 24 * 365)), + 10, + ) + .collect::>() + .await; + assert_eq!(notification_ids.len(), 1); + + let notification = api.db.get_notification(notification_ids.remove(0)?).await?; + assert_debug_snapshot!(notification.map(|notification| (notification.destination, notification.content)), @r###" + Some( + ( + User( + UserId( + 1, + ), + ), + Template( + WebPageContentTrackerChanges { + tracker_name: "tracker-one", + error_message: Some( + "some client-error", + ), + }, + ), + ), + ) + "###); + + assert_eq!( + api.web_scraping() + .get_content_tracker_history(user.id, tracker.id, Default::default()) + .await? + .len(), + 1 + ); + assert!(!api + .db + .get_scheduler_job(trigger_job_id) + .await? 
+ .map(|job| job.stopped) + .unwrap_or_default()); + + Ok(()) + } } diff --git a/src/utils.rs b/src/utils.rs index 915eeb3..f15d935 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -37,8 +37,8 @@ pub use self::{ WebPageResourceContentData, WebPageResourceDiffStatus, WebPageResourcesData, WebPageResourcesTrackerGetHistoryParams, WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerCreateParams, WebPageTrackerKind, WebPageTrackerSettings, WebPageTrackerTag, - WebPageTrackerUpdateParams, WebScraperContentError, WebScraperContentRequest, - WebScraperContentRequestScripts, WebScraperContentResponse, WebScraperResource, + WebPageTrackerUpdateParams, WebScraperContentRequest, WebScraperContentRequestScripts, + WebScraperContentResponse, WebScraperErrorResponse, WebScraperResource, WebScraperResourcesRequest, WebScraperResourcesRequestScripts, WebScraperResourcesResponse, WEB_PAGE_CONTENT_TRACKER_EXTRACT_SCRIPT_NAME, WEB_PAGE_RESOURCES_TRACKER_FILTER_SCRIPT_NAME, diff --git a/src/utils/web_scraping.rs b/src/utils/web_scraping.rs index 7ba7bf4..8d1c294 100644 --- a/src/utils/web_scraping.rs +++ b/src/utils/web_scraping.rs @@ -15,8 +15,8 @@ pub use self::{ WebPageResource, WebPageResourceContent, WebPageResourceContentData, WebPageResourceDiffStatus, WebPageResourcesData, WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerKind, WebPageTrackerSettings, WebPageTrackerTag, - WebScraperContentError, WebScraperContentRequest, WebScraperContentRequestScripts, - WebScraperContentResponse, WebScraperResource, WebScraperResourcesRequest, + WebScraperContentRequest, WebScraperContentRequestScripts, WebScraperContentResponse, + WebScraperErrorResponse, WebScraperResource, WebScraperResourcesRequest, WebScraperResourcesRequestScripts, WebScraperResourcesResponse, MAX_WEB_PAGE_TRACKER_DELAY, MAX_WEB_PAGE_TRACKER_REVISIONS, }, diff --git a/src/utils/web_scraping/api_ext.rs b/src/utils/web_scraping/api_ext.rs index b358c08..eef3d87 100644 --- a/src/utils/web_scraping/api_ext.rs 
+++ b/src/utils/web_scraping/api_ext.rs @@ -22,8 +22,8 @@ use crate::{ WebScraperResource, MAX_WEB_PAGE_TRACKER_DELAY, MAX_WEB_PAGE_TRACKER_REVISIONS, }, WebPageContentTrackerTag, WebPageDataRevision, WebPageResource, WebPageResourcesData, - WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerTag, WebScraperContentError, - WebScraperContentRequest, WebScraperContentRequestScripts, WebScraperContentResponse, + WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerTag, WebScraperContentRequest, + WebScraperContentRequestScripts, WebScraperContentResponse, WebScraperErrorResponse, WebScraperResourcesRequest, WebScraperResourcesRequestScripts, WebScraperResourcesResponse, }, }; @@ -270,17 +270,47 @@ impl<'a, DR: DnsResolver, ET: EmailTransport> WebScrapingApiExt<'a, DR, ET> { )) .json(&scraper_request) .send() - .await? - .json::<WebScraperResourcesResponse>() .await .map_err(|err| { - log::error!( - "Cannot fetch resources for `{}` ('{}'): {:?}", - tracker.url, + anyhow!( + "Could not connect to the web scraper service to extract resources for the web tracker ('{}'): {:?}", + tracker.id, + err + ) + })?; + + if !scraper_response.status().is_success() { + let is_client_error = scraper_response.status().is_client_error(); + let scraper_error_response = scraper_response + .json::<WebScraperErrorResponse>() + .await + .map_err(|err| { + anyhow!( + "Could not deserialize scraper error response for the web tracker ('{}'): {:?}", tracker.id, err + ) + })?; + if is_client_error { + bail!(SecutilsError::client(scraper_error_response.message)); + } else { + bail!( + "Unexpected scraper error for the web tracker ('{}'): {:?}", + tracker.id, + scraper_error_response.message ); - anyhow!("Web page tracker cannot fetch resources due to unexpected error") + } + } + + let scraper_response = scraper_response + .json::<WebScraperResourcesResponse>() + .await + .map_err(|err| { + anyhow!( + "Could not deserialize scraper response for the web tracker ('{}'): {:?}", + tracker.id, + err + ) })?; // Check if there is a revision with the same timestamp. 
If so, drop newly fetched revision. @@ -425,15 +455,15 @@ impl<'a, DR: DnsResolver, ET: EmailTransport> WebScrapingApiExt<'a, DR, ET> { if !scraper_response.status().is_success() { let is_client_error = scraper_response.status().is_client_error(); let scraper_error_response = scraper_response - .json::<WebScraperContentError>() + .json::<WebScraperErrorResponse>() .await .map_err(|err| { - anyhow!( - "Could not deserialize scraper error response for the web tracker ('{}'): {:?}", - tracker.id, - err - ) - })?; + anyhow!( + "Could not deserialize scraper error response for the web tracker ('{}'): {:?}", + tracker.id, + err + ) + })?; if is_client_error { bail!(SecutilsError::client(scraper_error_response.message)); } else { @@ -896,11 +926,11 @@ mod tests { utils::{ web_scraping::WebScrapingApiExt, WebPageContentTrackerGetHistoryParams, WebPageContentTrackerTag, WebPageResource, WebPageResourceDiffStatus, - WebPageResourcesTrackerTag, WebPageTracker, WebPageTrackerCreateParams, - WebPageTrackerKind, WebPageTrackerSettings, WebPageTrackerUpdateParams, - WebScraperContentError, WebScraperContentRequest, WebScraperContentResponse, - WebScraperResource, WebScraperResourcesRequest, WebScraperResourcesResponse, - WEB_PAGE_CONTENT_TRACKER_EXTRACT_SCRIPT_NAME, + WebPageResourcesTrackerGetHistoryParams, WebPageResourcesTrackerTag, WebPageTracker, + WebPageTrackerCreateParams, WebPageTrackerKind, WebPageTrackerSettings, + WebPageTrackerUpdateParams, WebScraperContentRequest, WebScraperContentResponse, + WebScraperErrorResponse, WebScraperResource, WebScraperResourcesRequest, + WebScraperResourcesResponse, WEB_PAGE_CONTENT_TRACKER_EXTRACT_SCRIPT_NAME, WEB_PAGE_RESOURCES_TRACKER_FILTER_SCRIPT_NAME, }, }; @@ -2591,6 +2621,80 @@ mod tests { Ok(()) } + #[tokio::test] + async fn properly_forwards_error_if_web_page_resources_extraction_fails() -> anyhow::Result<()> + { + let server = MockServer::start(); + let mut config = mock_config()?; + config.components.web_scraper_url = Url::parse(&server.base_url())?; + + let api = 
mock_api_with_config(config).await?; + let mock_user = mock_user()?; + api.db.insert_user(&mock_user).await?; + + let web_scraping = WebScrapingApiExt::new(&api); + let tracker = web_scraping + .create_resources_tracker( + mock_user.id, + WebPageTrackerCreateParams { + name: "name_one".to_string(), + url: Url::parse("https://secutils.dev/one")?, + settings: WebPageTrackerSettings { + revisions: 3, + delay: Duration::from_millis(2000), + enable_notifications: true, + schedule: Some("0 0 * * * *".to_string()), + scripts: Default::default(), + }, + }, + ) + .await?; + + let web_scraper_mock = server.mock(|when, then| { + when.method(httpmock::Method::POST) + .path("/api/web_page/resources") + .json_body( + serde_json::to_value( + WebScraperResourcesRequest::with_default_parameters(&tracker.url) + .set_delay(Duration::from_millis(2000)), + ) + .unwrap(), + ); + then.status(400) + .header("Content-Type", "application/json") + .json_body_obj(&WebScraperErrorResponse { + message: "some client-error".to_string(), + }); + }); + + let scraper_error = web_scraping + .get_resources_tracker_history( + mock_user.id, + tracker.id, + WebPageResourcesTrackerGetHistoryParams { + refresh: true, + calculate_diff: false, + }, + ) + .await + .unwrap_err() + .downcast::<SecutilsError>() + .unwrap(); + assert_eq!(scraper_error.status_code(), 400); + assert_debug_snapshot!( + scraper_error, + @r###""some client-error""### + ); + + let tracker_resources = web_scraping + .get_resources_tracker_history(mock_user.id, tracker.id, Default::default()) + .await?; + assert!(tracker_resources.is_empty()); + web_scraper_mock.assert(); + + Ok(()) + } + #[tokio::test] async fn properly_saves_web_page_content() -> anyhow::Result<()> { let server = MockServer::start(); @@ -2782,7 +2886,7 @@ mod tests { ); then.status(400) .header("Content-Type", "application/json") - .json_body_obj(&WebScraperContentError { + .json_body_obj(&WebScraperErrorResponse { message: "some client-error".to_string(), }); }); diff --git 
a/src/utils/web_scraping/web_page_trackers.rs b/src/utils/web_scraping/web_page_trackers.rs index 53b9cae..3cb82d3 100644 --- a/src/utils/web_scraping/web_page_trackers.rs +++ b/src/utils/web_scraping/web_page_trackers.rs @@ -5,11 +5,12 @@ mod web_page_tracker; mod web_page_tracker_kind; mod web_page_tracker_settings; mod web_page_tracker_tag; +mod web_scraper; pub use self::{ web_page_content::{ - WebPageContentTrackerTag, WebScraperContentError, WebScraperContentRequest, - WebScraperContentRequestScripts, WebScraperContentResponse, + WebPageContentTrackerTag, WebScraperContentRequest, WebScraperContentRequestScripts, + WebScraperContentResponse, }, web_page_data_revision::WebPageDataRevision, web_page_resources::{ @@ -24,6 +25,7 @@ pub use self::{ WebPageTrackerSettings, MAX_WEB_PAGE_TRACKER_DELAY, MAX_WEB_PAGE_TRACKER_REVISIONS, }, web_page_tracker_tag::WebPageTrackerTag, + web_scraper::WebScraperErrorResponse, }; pub(in crate::utils::web_scraping) use self::web_page_resources::{ diff --git a/src/utils/web_scraping/web_page_trackers/web_page_content.rs b/src/utils/web_scraping/web_page_trackers/web_page_content.rs index 22beb5e..337fa70 100644 --- a/src/utils/web_scraping/web_page_trackers/web_page_content.rs +++ b/src/utils/web_scraping/web_page_trackers/web_page_content.rs @@ -1,11 +1,9 @@ mod web_page_content_tracker_tag; -mod web_scraper_content_error; mod web_scraper_content_request; mod web_scraper_content_response; pub use self::{ web_page_content_tracker_tag::WebPageContentTrackerTag, - web_scraper_content_error::WebScraperContentError, web_scraper_content_request::{WebScraperContentRequest, WebScraperContentRequestScripts}, web_scraper_content_response::WebScraperContentResponse, }; diff --git a/src/utils/web_scraping/web_page_trackers/web_scraper.rs b/src/utils/web_scraping/web_page_trackers/web_scraper.rs new file mode 100644 index 0000000..996845d --- /dev/null +++ b/src/utils/web_scraping/web_page_trackers/web_scraper.rs @@ -0,0 +1,3 @@ +mod 
web_scraper_error_response; + +pub use web_scraper_error_response::WebScraperErrorResponse; diff --git a/src/utils/web_scraping/web_page_trackers/web_page_content/web_scraper_content_error.rs b/src/utils/web_scraping/web_page_trackers/web_scraper/web_scraper_error_response.rs similarity index 71% rename from src/utils/web_scraping/web_page_trackers/web_page_content/web_scraper_content_error.rs rename to src/utils/web_scraping/web_page_trackers/web_scraper/web_scraper_error_response.rs index 23d5166..26a1dda 100644 --- a/src/utils/web_scraping/web_page_trackers/web_page_content/web_scraper_content_error.rs +++ b/src/utils/web_scraping/web_page_trackers/web_scraper/web_scraper_error_response.rs @@ -1,29 +1,29 @@ use serde::{Deserialize, Serialize}; -/// Represents error response if scraper couldn't extract content. +/// Represents an error returned by the web scraper service. #[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] #[serde(rename_all = "camelCase")] -pub struct WebScraperContentError { +pub struct WebScraperErrorResponse { /// Error message. 
pub message: String, } #[cfg(test)] mod tests { - use super::WebScraperContentError; + use super::WebScraperErrorResponse; use insta::assert_json_snapshot; #[test] fn deserialization() -> anyhow::Result<()> { assert_eq!( - serde_json::from_str::<WebScraperContentError>( + serde_json::from_str::<WebScraperErrorResponse>( r#" { "message": "some-error" } "# )?, - WebScraperContentError { + WebScraperErrorResponse { message: "some-error".to_string(), } ); @@ -33,7 +33,7 @@ mod tests { #[test] fn serialization() -> anyhow::Result<()> { - assert_json_snapshot!(WebScraperContentError { + assert_json_snapshot!(WebScraperErrorResponse { message: "some-error".to_string(), }, @r###" { diff --git a/tools/api/utils/web_scraping_content.http b/tools/api/utils/web_scraping_content.http new file mode 100644 index 0000000..d9f30de --- /dev/null +++ b/tools/api/utils/web_scraping_content.http @@ -0,0 +1,15 @@ +### Create web page content tracker +POST {{host}}/api/utils/web_scraping/content +Authorization: {{api-credentials}} +Accept: application/json +Content-Type: application/json + +{ + "name": "HackerNewsDemo", + "url": "https://news.ycombinator.com/", + "settings": { + "revisions": 1, + "delay": 5000, + "enableNotifications": true + } +}