Improve detection of crawlers/bots
Add support for Facebook crawler
IvanNardi committed May 6, 2023
1 parent 998bedb commit df21722
Showing 6 changed files with 295 additions and 5 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -53,7 +53,7 @@ jobs:
- name: Install Ubuntu Prerequisites
run: |
sudo apt-get update
-          sudo apt-get install python3-netaddr git
+          sudo apt-get install python3-netaddr git whois
- name: Run Scripts
run: |
echo 'Running ./utils/bitcoinnodes.sh'
260 changes: 260 additions & 0 deletions src/lib/inc_generated/ndpi_crawlers_match.c.inc

Large diffs are not rendered by default.
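Since the generated table is not rendered, here is a hedged sketch of its likely shape, inferred from how utils/ipaddr2list.py is invoked at the end of this commit (./ipaddr2list.py $LIST NDPI_HTTP_CRAWLER_BOT > $DEST). The array name and both sample prefixes are assumptions for illustration, not the committed contents:

/* Hypothetical excerpt of ndpi_crawlers_match.c.inc -- names and values assumed. */
static ndpi_network ndpi_http_crawler_bot_protocol_list[] = {
  { 0x42F94000 /* 66.249.64.0/27 (a published Googlebot range) */, 27, NDPI_HTTP_CRAWLER_BOT },
  { 0x1F0D1800 /* 31.13.24.0/21 (an AS32934/Facebook route) */, 21, NDPI_HTTP_CRAWLER_BOT },
  /* ... remaining generated entries ... */
  { 0x0, 0, 0 }
};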

4 changes: 2 additions & 2 deletions src/lib/protocols/http.c
@@ -607,8 +607,8 @@ static void ndpi_check_user_agent(struct ndpi_detection_module_struct *ndpi_stru
Amazon-Route53-Health-Check-Service (ref 68784dad-be98-49e4-a63c-9fbbe2816d7c; report http://amzn.to/1vsZADi)
Anonymous Crawler/1.0 (Webcrawler developed with StormCrawler; http://example.com/; [email protected])
*/
-    if((strstr(ua, "+http") != NULL)
-       || (strstr(ua, " http") != NULL)
+    if((strstr(ua, "+http:") != NULL)
+       || (strstr(ua, " http:") != NULL)
|| ndpi_strncasestr(ua, "Crawler", ua_len)
|| ndpi_strncasestr(ua, "Bot", ua_len) /* bot/robot */
) {
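The tightening from "+http"/" http" to "+http:"/" http:" matters because a benign user agent can contain a bare http token with no URL at all. A minimal standalone sketch (not nDPI code; looks_like_crawler is a hypothetical helper mirroring only the two strstr() checks above) using the "zbtls http" user agent from the test capture below:

#include <stdio.h>
#include <string.h>

/* Hypothetical helper: only the two URL-style checks, not the
   Crawler/Bot substring matches that follow them in http.c. */
static int looks_like_crawler(const char *ua) {
  return (strstr(ua, "+http:") != NULL) || (strstr(ua, " http:") != NULL);
}

int main(void) {
  /* Matched the old " http" pattern, but carries no URL: now rejected. */
  printf("%d\n", looks_like_crawler("zbtls http"));                                      /* 0 */
  /* A genuine crawler UA advertises a URL after "+http:" and still matches. */
  printf("%d\n", looks_like_crawler("Googlebot/2.1 (+http://www.google.com/bot.html)")); /* 1 */
  return 0;
}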
Binary file not shown.
25 changes: 25 additions & 0 deletions tests/cfgs/default/result/crawler_false_positive.pcapng.out
@@ -0,0 +1,25 @@
Guessed flow protos: 0

DPI Packets (TCP): 8 (8.00 pkts/flow)
Confidence DPI : 1 (flows)
Num dissector calls: 12 (12.00 diss/flow)
LRU cache ookla: 0/0/0 (insert/search/found)
LRU cache bittorrent: 0/0/0 (insert/search/found)
LRU cache zoom: 0/0/0 (insert/search/found)
LRU cache stun: 0/0/0 (insert/search/found)
LRU cache tls_cert: 0/0/0 (insert/search/found)
LRU cache mining: 0/0/0 (insert/search/found)
LRU cache msteams: 0/0/0 (insert/search/found)
LRU cache stun_zoom: 0/0/0 (insert/search/found)
Automa host: 0/0 (search/found)
Automa domain: 0/0 (search/found)
Automa tls cert: 0/0 (search/found)
Automa risk mask: 0/0 (search/found)
Automa common alpns: 0/0 (search/found)
Patricia risk mask: 2/0 (search/found)
Patricia risk: 0/0 (search/found)
Patricia protocols: 1/1 (search/found)

OCSP 12 1842 1

1 TCP 192.168.12.156:38291 <-> 93.184.220.29:80 [proto: 7.63/HTTP.OCSP][IP: 288/Edgecast][ClearText][Confidence: DPI][DPI packets: 8][cat: Web/5][7 pkts/705 bytes <-> 5 pkts/1137 bytes][Goodput ratio: 33/70][0.04 sec][Hostname/SNI: ocsp.digicert.com][bytes ratio: -0.235 (Download)][IAT c2s/s2c min/avg/max/stddev: 0/0 5/6 8/10 4/4][Pkt Len c2s/s2c min/avg/max/stddev: 66/66 101/227 284/865 75/319][StatusCode: 200][Req Content-Type: application/ocsp-request][Content-Type: application/ocsp-response][Server: ECS (mil/6CF7)][User-Agent: zbtls http][PLAIN TEXT (ConnectionTP/1.1)][Plen Bins: 33,0,0,0,0,0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,33,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
9 changes: 7 additions & 2 deletions utils/crawlers_ip_addresses_download.sh
@@ -9,6 +9,7 @@ TMP1=/tmp/bot_google_c1.json
TMP2=/tmp/bot_google_c2.json
TMP3=/tmp/bot_google_c3.json
TMP_BING=/tmp/bot_bing.json
+TMP_FB=/tmp/bot_fb.list
LIST=/tmp/bot.list
#Google Common crawlers
ORIGIN1="https://developers.google.com/static/search/apis/ipranges/googlebot.json"
@@ -18,7 +19,7 @@ ORIGIN2="https://developers.google.com/static/search/apis/ipranges/special-crawl
ORIGIN3="https://developers.google.com/static/search/apis/ipranges/user-triggered-fetchers.json"
#Bing Bot
ORIGIN_BING="https://www.bing.com/toolbox/bingbot.json"

+#Facebook Bot: https://developers.facebook.com/docs/sharing/webmasters/crawler/

echo "(1) Downloading file... ${ORIGIN1}"
http_response=$(curl -s -o $TMP1 -w "%{http_code}" ${ORIGIN1})
@@ -48,15 +49,19 @@ if [ "$http_response" != "200" ]; then
exit 1
fi

echo "(1) Downloading FB crawlers routes... "
whois -h whois.radb.net -- '-i origin AS32934' | grep ^route > $TMP_FB

echo "(2) Processing IP addresses..."
{
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP1 # TODO: ipv6
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP2 # TODO: ipv6
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP3 # TODO: ipv6
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP_BING # TODO: ipv6
+grep -v route6 $TMP_FB | tr -d 'route:^ ' # TODO: ipv6
} > $LIST
./ipaddr2list.py $LIST NDPI_HTTP_CRAWLER_BOT > $DEST
-rm -f $TMP1 $TMP2 $TMP3 $TMP_BING $LIST
+rm -f $TMP1 $TMP2 $TMP3 $TMP_BING $TMP_FB $LIST

echo "(3) Crawlers IPs are available in $DEST"
exit 0
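To make the two extraction steps above concrete, here is a self-contained sketch with assumed sample inputs: the JSON mirrors the shape the jq filter expects, and the RADb lines are illustrative. Note that tr -d deletes every occurrence of the characters r, o, u, t, e, ':', '^' and space, none of which can appear in an IPv4 prefix, so only the prefix survives:

#!/bin/sh
# Hedged illustration; the sample data below is assumed, not downloaded.

# 1) Google/Bing JSON: keep only the ipv4Prefix values.
printf '{"prefixes":[{"ipv4Prefix":"66.249.64.0/27"},{"ipv6Prefix":"2001:4860:4801:10::/64"}]}' \
  | jq -r '.prefixes | .[].ipv4Prefix | select( . != null )'
# -> 66.249.64.0/27

# 2) RADb route objects for AS32934: drop route6 lines, then strip the
#    "route:" label and whitespace character by character with tr -d.
printf 'route:      31.13.24.0/21\nroute6:     2a03:2880::/32\n' \
  | grep -v route6 | tr -d 'route:^ '
# -> 31.13.24.0/21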
