From cab143c72889c585adbf041e9c248e57d0c4c4ca Mon Sep 17 00:00:00 2001 From: Amaury <1293565+amaury1729@users.noreply.github.com> Date: Wed, 11 Oct 2023 16:00:57 +0200 Subject: [PATCH] feat(core): Add domain-specific rules as JSON file (#1347) * feat(core): Add domain-specific rules as JSON file * Remove timeout for now * Add has_rule() * log debug --- Cargo.lock | 1 + core/Cargo.toml | 7 ++-- core/src/lib.rs | 1 + core/src/rules.json | 19 +++++++++++ core/src/rules.rs | 71 ++++++++++++++++++++++++++++++++++++++++ core/src/smtp/connect.rs | 57 ++++++++++++++++++++++++-------- core/src/smtp/gmail.rs | 3 +- core/src/smtp/mod.rs | 18 ++++------ 8 files changed, 148 insertions(+), 29 deletions(-) create mode 100644 core/src/rules.json create mode 100644 core/src/rules.rs diff --git a/Cargo.lock b/Cargo.lock index 66244f615..5fc0090c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -402,6 +402,7 @@ dependencies = [ "log", "mailchecker", "md5", + "once_cell", "pwned", "rand", "regex", diff --git a/core/Cargo.toml b/core/Cargo.toml index 39f0b84c0..759ead594 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -21,17 +21,18 @@ async-std-resolver = "0.21.2" fantoccini = { version = "0.19.3", optional = true } futures = { version = "0.3.27", optional = true } fast-socks5 = "0.9.1" +levenshtein = "1.0.5" log = "0.4.20" mailchecker = "5.0.7" +md5 = "0.7.0" +once_cell = "1.17.1" +pwned = "0.5.0" rand = { version = "0.8.5", features = ["small_rng"] } regex = "1.9.4" reqwest = { version = "0.11.16", features = ["json", "socks"] } serde = { version = "1.0.157", features = ["derive"] } serde_json = "1.0.95" trust-dns-proto = "0.21.2" -md5 = "0.7.0" -levenshtein = "1.0.5" -pwned = "0.5.0" [dev-dependencies] tokio = { version = "1.28.2" } diff --git a/core/src/lib.rs b/core/src/lib.rs index e0748d62d..5f6cd5569 100644 --- a/core/src/lib.rs +++ b/core/src/lib.rs @@ -65,6 +65,7 @@ mod haveibeenpwned; pub mod misc; pub mod mx; +mod rules; pub mod smtp; pub mod syntax; mod util; diff --git 
a/core/src/rules.json b/core/src/rules.json new file mode 100644 index 000000000..d73590cad --- /dev/null +++ b/core/src/rules.json @@ -0,0 +1,19 @@ +{ + "by_domain": { + "gmail.com": { "rules": ["SkipCatchAll"] }, + "hotmail.com": { "rules": ["SkipCatchAll"] }, + "hotmail.fr": { "rules": ["SkipCatchAll"] }, + "hotmail.nl": { "rules": ["SkipCatchAll"] }, + "yahoo.com": { "rules": ["SkipCatchAll"] }, + "yahoo.fr": { "rules": ["SkipCatchAll"] } + }, + "by_mx_suffix": { + ".antispamcloud.com.": { + "rules": ["SkipCatchAll"], + "_comment": "Some take exactly 30s to respond, so we skip the catch-all one, and bump the timeout." + } + }, + "rules": { + "SkipCatchAll": { "_comment": "Don't perform catch-all check." } + } +} diff --git a/core/src/rules.rs b/core/src/rules.rs new file mode 100644 index 000000000..8a521a1a2 --- /dev/null +++ b/core/src/rules.rs @@ -0,0 +1,71 @@ +// check-if-email-exists +// Copyright (C) 2018-2022 Reacher + +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU Affero General Public License as published +// by the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU Affero General Public License for more details. + +// You should have received a copy of the GNU Affero General Public License +// along with this program. If not, see <http://www.gnu.org/licenses/>. + +//! Read provider- and domain-specific rules from a JSON, then match each +//! email verification to the domain/provider, and translate those rules into +//! code. +//! +//! IMPORTANT: This is still a beta feature, and probably needs refining.
+ +use once_cell::sync::Lazy; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; + +#[derive(Debug, Deserialize, Eq, Hash, PartialEq, Serialize)] +pub enum Rule { + /// Don't perform catch-all check. + SkipCatchAll, +} + +#[derive(Debug, Deserialize, Serialize)] +struct RulesByDomain { + rules: Vec<Rule>, +} + +#[derive(Debug, Deserialize, Serialize)] +struct AllRules { + /// Apply rules by domain name, i.e. after the @ symbol. + by_domain: HashMap<String, RulesByDomain>, + /// Apply rules by the MX host. Since each domain potentially has multiple + /// MX records, we match by their suffix. + by_mx_suffix: HashMap<String, RulesByDomain>, +} + +static ALL_RULES: Lazy<AllRules> = + Lazy::new(|| serde_json::from_str::<AllRules>(include_str!("rules.json")).unwrap()); + +/// Check if the exact domain (the part after the @) carries the given rule. +fn does_domain_have_rule(domain: &str, rule: &Rule) -> bool { + ALL_RULES + .by_domain + .get(domain) + .map_or(false, |v| v.rules.contains(rule)) +} + +/// Check if any MX suffix that the host ends with carries the given rule. +fn does_mx_have_rule(host: &str, rule: &Rule) -> bool { + // Test every matching suffix: HashMap iteration order is unspecified, so + // stopping at the first hit would be order-dependent for overlapping suffixes. + ALL_RULES + .by_mx_suffix + .iter() + .any(|(k, v)| host.ends_with(k) && v.rules.contains(rule)) +} + +/// Check if either the domain or the MX host has any given rule. +pub fn has_rule(domain: &str, host: &str, rule: &Rule) -> bool { + does_domain_have_rule(domain, rule) || does_mx_have_rule(host, rule) +} diff --git a/core/src/smtp/connect.rs b/core/src/smtp/connect.rs index 97a579dcd..fd6887f08 100644 --- a/core/src/smtp/connect.rs +++ b/core/src/smtp/connect.rs @@ -27,11 +27,12 @@ use std::iter; use std::str::FromStr; use std::time::Duration; -use trust_dns_proto::rr::Name; - -use super::{gmail::is_gmail, outlook::is_hotmail, parser, yahoo::is_yahoo}; +use super::parser; use super::{SmtpDetails, SmtpError}; -use crate::util::{constants::LOG_TARGET, input_output::CheckEmailInput}; +use crate::{ + rules::{has_rule, Rule}, + util::{constants::LOG_TARGET, input_output::CheckEmailInput}, +}; /// Try to send an smtp command, close and return Err if fails. macro_rules! try_smtp ( @@ -48,13 +49,12 @@ macro_rules!
try_smtp ( /// Attempt to connect to host via SMTP, and return SMTP client on success. async fn connect_to_host( - host: &Name, + host: &str, port: u16, input: &CheckEmailInput, ) -> Result { // hostname verification fails if it ends with '.', for example, using // SOCKS5 proxies we can `io: incomplete` error. - let host = host.to_string(); let host = host.trim_end_matches('.').to_string(); let security = { @@ -220,11 +220,16 @@ async fn email_deliverable( async fn smtp_is_catch_all( smtp_transport: &mut SmtpTransport, domain: &str, - host: &Name, + host: &str, + input: &CheckEmailInput, ) -> Result { // Skip catch-all check for known providers. - let host = host.to_string(); - if is_gmail(&host) || is_hotmail(&host) || is_yahoo(&host) { + if has_rule(domain, host, &Rule::SkipCatchAll) { + log::debug!( + target: LOG_TARGET, + "[email={}] Skipping catch-all check for [domain={domain}]", + input.to_email + ); return Ok(false); } @@ -247,7 +252,7 @@ async fn smtp_is_catch_all( async fn create_smtp_future( to_email: &EmailAddress, - host: &Name, + host: &str, port: u16, domain: &str, input: &CheckEmailInput, @@ -256,7 +261,7 @@ async fn create_smtp_future( // Ok(SmtpDetails { can_connect_smtp: false, ... }). let mut smtp_transport = connect_to_host(host, port, input).await?; - let is_catch_all = smtp_is_catch_all(&mut smtp_transport, domain, host) + let is_catch_all = smtp_is_catch_all(&mut smtp_transport, domain, host, input) .await .unwrap_or(false); let deliverability = if is_catch_all { @@ -278,7 +283,8 @@ async fn create_smtp_future( if parser::is_err_io_errors(e) { log::debug!( target: LOG_TARGET, - "Got `io: incomplete` error, reconnecting." + "[email={}] Got `io: incomplete` error, reconnecting.", + input.to_email ); let _ = smtp_transport.close().await; @@ -299,7 +305,7 @@ async fn create_smtp_future( /// retries. 
async fn check_smtp_without_retry( to_email: &EmailAddress, - host: &Name, + host: &str, port: u16, domain: &str, input: &CheckEmailInput, @@ -325,7 +331,7 @@ async fn check_smtp_without_retry( #[async_recursion] pub async fn check_smtp_with_retry( to_email: &EmailAddress, - host: &Name, + host: &str, port: u16, domain: &str, input: &CheckEmailInput, @@ -376,3 +382,26 @@ pub async fn check_smtp_with_retry( _ => result, } } + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn should_skip_catch_all() { + let smtp_client = SmtpClient::new("gmail.com".into()); + let mut smtp_transport = smtp_client.into_transport(); + + let r = smtp_is_catch_all( + &mut smtp_transport, + "gmail.com", + "alt4.aspmx.l.google.com.", + &CheckEmailInput::default(), + ) + .await; + + assert!(!smtp_transport.is_connected()); // We shouldn't connect to google servers. + assert!(r.is_ok()); + assert_eq!(false, r.unwrap()) + } +} diff --git a/core/src/smtp/gmail.rs b/core/src/smtp/gmail.rs index 8ec9c8bca..5e15d4d95 100644 --- a/core/src/smtp/gmail.rs +++ b/core/src/smtp/gmail.rs @@ -76,7 +76,8 @@ pub async fn check_gmail( }) } -/// Check if the MX host is from Gmail. +/// Check if the MX host is from Google, i.e. either a @gmail.com address, or +/// a Google Suite email. 
pub fn is_gmail(host: &str) -> bool { host.to_lowercase().ends_with(".google.com.") } diff --git a/core/src/smtp/mod.rs b/core/src/smtp/mod.rs index 6dc2518f5..32d3dde12 100644 --- a/core/src/smtp/mod.rs +++ b/core/src/smtp/mod.rs @@ -62,29 +62,25 @@ pub async fn check_smtp( domain: &str, input: &CheckEmailInput, ) -> Result { - let host_lowercase = host.to_lowercase().to_string(); + let host = host.to_string(); - if input - .skipped_domains - .iter() - .any(|d| host_lowercase.contains(d)) - { + if input.skipped_domains.iter().any(|d| host.contains(d)) { return Err(SmtpError::SkippedDomain(format!( "Reacher currently cannot verify emails from @{domain}" ))); } - if input.yahoo_use_api && is_yahoo(&host_lowercase) { + if input.yahoo_use_api && is_yahoo(&host) { return yahoo::check_yahoo(to_email, input) .await .map_err(|err| err.into()); } - if input.gmail_use_api && is_gmail(&host_lowercase) { + if input.gmail_use_api && is_gmail(&host) { return gmail::check_gmail(to_email, input) .await .map_err(|err| err.into()); } - if input.microsoft365_use_api && is_microsoft365(&host_lowercase) { + if input.microsoft365_use_api && is_microsoft365(&host) { match outlook::microsoft365::check_microsoft365_api(to_email, input).await { Ok(Some(smtp_details)) => return Ok(smtp_details), // Continue in the event of an error/ambiguous result. @@ -101,14 +97,14 @@ pub async fn check_smtp( } #[cfg(feature = "headless")] if let Some(webdriver) = &input.hotmail_use_headless { - if is_outlook(&host_lowercase) { + if is_outlook(&host) { return outlook::hotmail::check_password_recovery(to_email, webdriver) .await .map_err(|err| err.into()); } } - check_smtp_with_retry(to_email, host, port, domain, input, input.retries).await + check_smtp_with_retry(to_email, &host, port, domain, input, input.retries).await } #[cfg(test)]