Improve detection of crawler/bot traffic
IvanNardi committed Apr 28, 2023
1 parent 8934f7b commit fbc2486
Showing 8 changed files with 680 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/include/ndpi_typedefs.h
@@ -1704,6 +1704,7 @@ typedef enum {
correct detection/classification.
See #1946 for other details */
ndpi_enable_tcp_ack_payload_heuristic = (1 << 17),
ndpi_dont_load_crawlers_list = (1 << 18),
} ndpi_prefs;

typedef struct {
592 changes: 592 additions & 0 deletions src/lib/inc_generated/ndpi_crawlers_match.c.inc

Large diffs are not rendered by default.

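The 592-line generated file is not rendered above, but its shape follows the other ipaddr2list.py outputs already included in ndpi_main.c; a hedged sketch (the prefix, CIDR, and entry count here are illustrative, not the committed values):

static ndpi_network ndpi_http_crawler_bot_protocol_list[] = {
  /* Illustrative entry: 66.249.64.0/19, a published Googlebot range */
  { 0x42F94000 /* 66.249.64.0/19 */, 19, NDPI_HTTP_CRAWLER_BOT },
  /* ... one entry per IPv4 prefix from the Google and Bing feeds ... */
  /* End */
  { 0x0, 0, 0 }
};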
3 changes: 3 additions & 0 deletions src/lib/ndpi_main.c
@@ -73,6 +73,7 @@
#include "inc_generated/ndpi_ms_skype_teams_match.c.inc"
#include "inc_generated/ndpi_google_match.c.inc"
#include "inc_generated/ndpi_google_cloud_match.c.inc"
#include "inc_generated/ndpi_crawlers_match.c.inc"
#include "inc_generated/ndpi_icloud_private_relay_match.c.inc"
#include "inc_generated/ndpi_asn_telegram.c.inc"
#include "inc_generated/ndpi_asn_apple.c.inc"
@@ -2836,6 +2837,8 @@ struct ndpi_detection_module_struct *ndpi_init_detection_module(ndpi_init_prefs
if((ndpi_str->ip_risk_ptree = ndpi_patricia_new(32 /* IPv4 */)) != NULL) {
  if(!(prefs & ndpi_dont_load_icloud_private_relay_list))
    ndpi_init_ptree_ipv4(ndpi_str, ndpi_str->ip_risk_ptree, ndpi_anonymous_subscriber_protocol_list);
  if(!(prefs & ndpi_dont_load_crawlers_list))
    ndpi_init_ptree_ipv4(ndpi_str, ndpi_str->ip_risk_ptree, ndpi_http_crawler_bot_protocol_list);
}
}
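For embedders, the new ndpi_dont_load_crawlers_list bit works like the existing ndpi_dont_load_icloud_private_relay_list knob. A minimal sketch (not part of this commit) of opting out of the crawlers list at init time:

#include "ndpi_api.h"

/* Sketch: initialize nDPI without loading the crawlers list into the
   IP-risk tree; every other default list is still loaded. */
struct ndpi_detection_module_struct *init_without_crawlers(void) {
  return ndpi_init_detection_module(ndpi_no_prefs | ndpi_dont_load_crawlers_list);
}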
20 changes: 20 additions & 0 deletions src/lib/ndpi_utils.c
@@ -2468,6 +2468,26 @@ void ndpi_set_risk(struct ndpi_detection_module_struct *ndpi_str,
}
}
}
} else if(risk_message) {
  u_int8_t i;

  for(i = 0; i < flow->num_risk_infos; i++)
    if(flow->risk_infos[i].id == r)
      return;

  /* Risk already set without any details, but now we have a specific risk_message
     that we want to save.
     This might happen with NDPI_HTTP_CRAWLER_BOT, which might have been set early via
     IP matching (no details) and only now via UA matching (with a message). */
  if(flow->num_risk_infos < MAX_NUM_RISK_INFOS) {
    char *s = ndpi_strdup(risk_message);

    if(s != NULL) {
      flow->risk_infos[flow->num_risk_infos].id = r;
      flow->risk_infos[flow->num_risk_infos].info = s;
      flow->num_risk_infos++;
    }
  }
}
}

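The scenario named in the new comment, as a hedged call sequence (module/flow setup omitted; the UA-derived message is made up for illustration):

/* First match: source IP found in the crawlers tree -> risk set, no details */
ndpi_set_risk(ndpi_str, flow, NDPI_HTTP_CRAWLER_BOT, NULL);

/* Second match: crawler User-Agent seen -> before this commit the message was
   silently dropped; now the new branch attaches it to the already-set risk */
ndpi_set_risk(ndpi_str, flow, NDPI_HTTP_CRAWLER_BOT, "UA Googlebot");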
2 changes: 1 addition & 1 deletion tests/cfgs/default/result/bot.pcap.out
@@ -17,7 +17,7 @@ Automa tls cert: 0/0 (search/found)
Automa risk mask: 0/0 (search/found)
Automa common alpns: 0/0 (search/found)
Patricia risk mask: 2/0 (search/found)
-Patricia risk: 2/0 (search/found)
+Patricia risk: 1/1 (search/found)
Patricia protocols: 2/1 (search/found)

HTTP 402 431124 1
2 changes: 1 addition & 1 deletion tests/cfgs/default/result/ssh.pcap.out
@@ -22,4 +22,4 @@ Patricia protocols: 2/0 (search/found)

SSH 258 35546 1

-1 TCP 172.16.238.1:58395 <-> 172.16.238.168:22 [proto: 92/SSH][IP: 0/Unknown][Encrypted][Confidence: DPI][DPI packets: 10][cat: RemoteAccess/12][159 pkts/15615 bytes <-> 99 pkts/19931 bytes][Goodput ratio: 33/67][248.48 sec][Hostname/SNI: SSH-2.0-OpenSSH_5.3][bytes ratio: -0.121 (Mixed)][IAT c2s/s2c min/avg/max/stddev: 0/0 1846/2934 166223/166224 14794/19692][Pkt Len c2s/s2c min/avg/max/stddev: 66/66 98/201 970/1346 83/283][Risk: ** SSH Obsolete Cli Vers/Cipher **** SSH Obsolete Ser Vers/Cipher **][Risk Score: 150][HASSH-C: 21B457A327CE7A2D4FCE5EF2C42400BD][Server: SSH-2.0-OpenSSH_5.6][HASSH-S: B1C6C0D56317555B85C7005A3DE29325][Plen Bins: 2,76,12,2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0]
+1 TCP 172.16.238.1:58395 <-> 172.16.238.168:22 [proto: 92/SSH][IP: 0/Unknown][Encrypted][Confidence: DPI][DPI packets: 10][cat: RemoteAccess/12][159 pkts/15615 bytes <-> 99 pkts/19931 bytes][Goodput ratio: 33/67][248.48 sec][Hostname/SNI: SSH-2.0-OpenSSH_5.3][bytes ratio: -0.121 (Mixed)][IAT c2s/s2c min/avg/max/stddev: 0/0 1846/2934 166223/166224 14794/19692][Pkt Len c2s/s2c min/avg/max/stddev: 66/66 98/201 970/1346 83/283][Risk: ** SSH Obsolete Cli Vers/Cipher **** SSH Obsolete Ser Vers/Cipher **][Risk Score: 150][Risk Info: Found cipher arcfour128 / Found cipher arcfour128][HASSH-C: 21B457A327CE7A2D4FCE5EF2C42400BD][Server: SSH-2.0-OpenSSH_5.6][HASSH-S: B1C6C0D56317555B85C7005A3DE29325][Plen Bins: 2,76,12,2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0]
60 changes: 60 additions & 0 deletions utils/crawlers_ip_addresses_download.sh
@@ -0,0 +1,60 @@
#!/bin/sh

set -e

cd "$(dirname "${0}")" || exit 1

DEST=../src/lib/inc_generated/ndpi_crawlers_match.c.inc
TMP1=/tmp/bot_google_c1.json
TMP2=/tmp/bot_google_c2.json
TMP3=/tmp/bot_google_c3.json
TMP_BING=/tmp/bot_bing.json
LIST=/tmp/bot.list
#Google Common crawlers
ORIGIN1="https://developers.google.com/static/search/apis/ipranges/googlebot.json"
#Google Special-case crawlers
ORIGIN2="https://developers.google.com/static/search/apis/ipranges/special-crawlers.json"
#Google User-triggered fetchers
ORIGIN3="https://developers.google.com/static/search/apis/ipranges/user-triggered-fetchers.json"
#Bing Bot
ORIGIN_BING="https://www.bing.com/toolbox/bingbot.json"


echo "(1) Downloading file... ${ORIGIN1}"
http_response=$(curl -s -o "$TMP1" -w "%{http_code}" "${ORIGIN1}")
if [ "$http_response" != "200" ]; then
    echo "Error $http_response: you probably need to update the list URL!"
    exit 1
fi

echo "(1) Downloading file... ${ORIGIN2}"
http_response=$(curl -s -o "$TMP2" -w "%{http_code}" "${ORIGIN2}")
if [ "$http_response" != "200" ]; then
    echo "Error $http_response: you probably need to update the list URL!"
    exit 1
fi

echo "(1) Downloading file... ${ORIGIN3}"
http_response=$(curl -s -o "$TMP3" -w "%{http_code}" "${ORIGIN3}")
if [ "$http_response" != "200" ]; then
    echo "Error $http_response: you probably need to update the list URL!"
    exit 1
fi

echo "(1) Downloading file... ${ORIGIN_BING}"
http_response=$(curl -s -o "$TMP_BING" -w "%{http_code}" "${ORIGIN_BING}")
if [ "$http_response" != "200" ]; then
    echo "Error $http_response: you probably need to update the list URL!"
    exit 1
fi

echo "(2) Processing IP addresses..."
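# Assumed feed format (same for the Google and Bing endpoints):
#   {"prefixes":[{"ipv4Prefix":"66.249.64.0/27"},{"ipv6Prefix":"2001:..."}]}
# The filters below keep only the ipv4Prefix values; select(. != null)
# drops the IPv6-only entries, hence the TODO markers.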
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' "$TMP1" > "$LIST"       # TODO: ipv6
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' "$TMP2" >> "$LIST"      # TODO: ipv6
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' "$TMP3" >> "$LIST"      # TODO: ipv6
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' "$TMP_BING" >> "$LIST"  # TODO: ipv6
./ipaddr2list.py "$LIST" NDPI_HTTP_CRAWLER_BOT > "$DEST"
rm -f "$TMP1" "$TMP2" "$TMP3" "$TMP_BING" "$LIST"

echo "(3) Crawler IPs are available in $DEST"
exit 0
2 changes: 2 additions & 0 deletions utils/update_every_lists.sh
@@ -28,6 +28,8 @@ RETVAL=$(( RETVAL + $? ))
RETVAL=$(( RETVAL + $? ))
./icloud_private_relay_ip_addresses_download.sh
RETVAL=$(( RETVAL + $? ))
./crawlers_ip_addresses_download.sh
RETVAL=$(( RETVAL + $? ))

./asn_update.sh
RETVAL=$(( RETVAL + $? ))
