Improve detection of crawler/bot traffic #1956

Merged: 1 commit, May 4, 2023
1 change: 1 addition & 0 deletions src/include/ndpi_typedefs.h
@@ -1704,6 +1704,7 @@ typedef enum {
      correct detection/classification.
      See #1946 for other details */
   ndpi_enable_tcp_ack_payload_heuristic = (1 << 17),
+  ndpi_dont_load_crawlers_list = (1 << 18),
 } ndpi_prefs;
 
 typedef struct {
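
The new pref bit follows the existing init-time opt-out scheme. A minimal usage sketch (assuming only the public API visible in this PR; ndpi_no_prefs is the empty pref set, and init_without_crawlers() is a made-up helper name):

    #include "ndpi_api.h"

    /* Initialize nDPI without loading the crawler/bot IP list. */
    struct ndpi_detection_module_struct *init_without_crawlers(void) {
      return ndpi_init_detection_module(ndpi_no_prefs | ndpi_dont_load_crawlers_list);
    }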
592 changes: 592 additions & 0 deletions src/lib/inc_generated/ndpi_crawlers_match.c.inc

Large diffs are not rendered by default.
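
The 592 generated lines are not shown, but judging from the other *_match.c.inc lists and the ipaddr2list.py invocation in the script below, the file presumably reduces to a terminated ndpi_network array, one entry per downloaded prefix (the prefix used here is only illustrative):

    /* Hypothetical excerpt of ndpi_crawlers_match.c.inc */
    static ndpi_network ndpi_http_crawler_bot_protocol_list[] = {
      { 0x42F94000 /* 66.249.64.0/27, a Googlebot-style range */, 27, NDPI_HTTP_CRAWLER_BOT },
      /* ... one entry per prefix from the Google/Bing JSON feeds ... */
      { 0x0, 0, 0 } /* terminator */
    };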

3 changes: 3 additions & 0 deletions src/lib/ndpi_main.c
@@ -73,6 +73,7 @@
 #include "inc_generated/ndpi_ms_skype_teams_match.c.inc"
 #include "inc_generated/ndpi_google_match.c.inc"
 #include "inc_generated/ndpi_google_cloud_match.c.inc"
+#include "inc_generated/ndpi_crawlers_match.c.inc"
 #include "inc_generated/ndpi_icloud_private_relay_match.c.inc"
 #include "inc_generated/ndpi_asn_telegram.c.inc"
 #include "inc_generated/ndpi_asn_apple.c.inc"
@@ -2836,6 +2837,8 @@ struct ndpi_detection_module_struct *ndpi_init_detection_module(ndpi_init_prefs
   if((ndpi_str->ip_risk_ptree = ndpi_patricia_new(32 /* IPv4 */)) != NULL) {
     if(!(prefs & ndpi_dont_load_icloud_private_relay_list)) {
       ndpi_init_ptree_ipv4(ndpi_str, ndpi_str->ip_risk_ptree, ndpi_anonymous_subscriber_protocol_list);
     }
+    if(!(prefs & ndpi_dont_load_crawlers_list))
+      ndpi_init_ptree_ipv4(ndpi_str, ndpi_str->ip_risk_ptree, ndpi_http_crawler_bot_protocol_list);
   }
 }
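
For context, this is roughly how a best-match lookup against ip_risk_ptree works, sketched with nDPI's exported patricia helpers (ip_on_risk_list() is a made-up name; the real call site in ndpi_main.c differs, and a hit only means the address is on one of the loaded risk lists, since the same tree also holds the iCloud Private Relay prefixes):

    #include <netinet/in.h>
    #include "ndpi_api.h"

    static int ip_on_risk_list(ndpi_patricia_tree_t *ip_risk_ptree, struct in_addr addr) {
      ndpi_prefix_t prefix;

      /* Build a /32 prefix for the address; the tree was created with 32 maxbits */
      ndpi_fill_prefix_v4(&prefix, &addr, 32, 32);
      return ndpi_patricia_search_best(ip_risk_ptree, &prefix) != NULL;
    }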
20 changes: 20 additions & 0 deletions src/lib/ndpi_utils.c
@@ -2468,6 +2468,26 @@ void ndpi_set_risk(struct ndpi_detection_module_struct *ndpi_str,
         }
       }
     }
+  } else if(risk_message) {
+    u_int8_t i;
+
+    for(i = 0; i < flow->num_risk_infos; i++)
+      if(flow->risk_infos[i].id == r)
+        return;
+
+    /* Risk already set without any details, but now we have a specific risk_message
+       that we want to save.
+       This might happen with NDPI_HTTP_CRAWLER_BOT which might have been set early via
+       IP matching (no details) and now via UA matching (with message). */
+    if(flow->num_risk_infos < MAX_NUM_RISK_INFOS) {
+      char *s = ndpi_strdup(risk_message);
+
+      if(s != NULL) {
+        flow->risk_infos[flow->num_risk_infos].id = r;
+        flow->risk_infos[flow->num_risk_infos].info = s;
+        flow->num_risk_infos++;
+      }
+    }
   }
 }

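The new else-branch handles a risk bit that is already set but carries no detail string. A sketch of the intended call sequence, using ndpi_set_risk() with the signature shown in the hunk above (the message text is made up):

    /* First sighting: the peer IP is on the crawler list -> risk set, no details */
    ndpi_set_risk(ndpi_str, flow, NDPI_HTTP_CRAWLER_BOT, NULL);

    /* Later the HTTP dissector matches a crawler User-Agent: same risk id, now
       with a message. Before this PR the message was dropped; now it is stored
       in flow->risk_infos. */
    ndpi_set_risk(ndpi_str, flow, NDPI_HTTP_CRAWLER_BOT, "UA bingbot");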
2 changes: 1 addition & 1 deletion tests/cfgs/default/result/bot.pcap.out
@@ -17,7 +17,7 @@ Automa tls cert: 0/0 (search/found)
 Automa risk mask: 0/0 (search/found)
 Automa common alpns: 0/0 (search/found)
 Patricia risk mask: 2/0 (search/found)
-Patricia risk: 2/0 (search/found)
+Patricia risk: 1/1 (search/found)
 Patricia protocols: 2/1 (search/found)
 
 HTTP 402 431124 1
2 changes: 1 addition & 1 deletion tests/cfgs/default/result/ssh.pcap.out
@@ -22,4 +22,4 @@ Patricia protocols: 2/0 (search/found)
 
 SSH 258 35546 1
 
-1 TCP 172.16.238.1:58395 <-> 172.16.238.168:22 [proto: 92/SSH][IP: 0/Unknown][Encrypted][Confidence: DPI][DPI packets: 10][cat: RemoteAccess/12][159 pkts/15615 bytes <-> 99 pkts/19931 bytes][Goodput ratio: 33/67][248.48 sec][Hostname/SNI: SSH-2.0-OpenSSH_5.3][bytes ratio: -0.121 (Mixed)][IAT c2s/s2c min/avg/max/stddev: 0/0 1846/2934 166223/166224 14794/19692][Pkt Len c2s/s2c min/avg/max/stddev: 66/66 98/201 970/1346 83/283][Risk: ** SSH Obsolete Cli Vers/Cipher **** SSH Obsolete Ser Vers/Cipher **][Risk Score: 150][HASSH-C: 21B457A327CE7A2D4FCE5EF2C42400BD][Server: SSH-2.0-OpenSSH_5.6][HASSH-S: B1C6C0D56317555B85C7005A3DE29325][Plen Bins: 2,76,12,2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0]
+1 TCP 172.16.238.1:58395 <-> 172.16.238.168:22 [proto: 92/SSH][IP: 0/Unknown][Encrypted][Confidence: DPI][DPI packets: 10][cat: RemoteAccess/12][159 pkts/15615 bytes <-> 99 pkts/19931 bytes][Goodput ratio: 33/67][248.48 sec][Hostname/SNI: SSH-2.0-OpenSSH_5.3][bytes ratio: -0.121 (Mixed)][IAT c2s/s2c min/avg/max/stddev: 0/0 1846/2934 166223/166224 14794/19692][Pkt Len c2s/s2c min/avg/max/stddev: 66/66 98/201 970/1346 83/283][Risk: ** SSH Obsolete Cli Vers/Cipher **** SSH Obsolete Ser Vers/Cipher **][Risk Score: 150][Risk Info: Found cipher arcfour128 / Found cipher arcfour128][HASSH-C: 21B457A327CE7A2D4FCE5EF2C42400BD][Server: SSH-2.0-OpenSSH_5.6][HASSH-S: B1C6C0D56317555B85C7005A3DE29325][Plen Bins: 2,76,12,2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0]
62 changes: 62 additions & 0 deletions utils/crawlers_ip_addresses_download.sh
@@ -0,0 +1,62 @@
#!/bin/sh

set -e

cd "$(dirname "${0}")" || exit 1

DEST=../src/lib/inc_generated/ndpi_crawlers_match.c.inc
TMP1=/tmp/bot_google_c1.json
TMP2=/tmp/bot_google_c2.json
TMP3=/tmp/bot_google_c3.json
TMP_BING=/tmp/bot_bing.json
LIST=/tmp/bot.list
#Google Common crawlers
ORIGIN1="https://developers.google.com/static/search/apis/ipranges/googlebot.json"
#Google Special-case crawlers
ORIGIN2="https://developers.google.com/static/search/apis/ipranges/special-crawlers.json"
#Google User-triggered fetchers
ORIGIN3="https://developers.google.com/static/search/apis/ipranges/user-triggered-fetchers.json"
#Bing Bot
ORIGIN_BING="https://www.bing.com/toolbox/bingbot.json"


echo "(1) Downloading file... ${ORIGIN1}"
http_response=$(curl -s -o $TMP1 -w "%{http_code}" ${ORIGIN1})
if [ "$http_response" != "200" ]; then
echo "Error $http_response: you probably need to update the list url!"
exit 1
fi

echo "(1) Downloading file... ${ORIGIN2}"
http_response=$(curl -s -o $TMP2 -w "%{http_code}" ${ORIGIN2})
if [ "$http_response" != "200" ]; then
echo "Error $http_response: you probably need to update the list url!"
exit 1
fi

echo "(1) Downloading file... ${ORIGIN3}"
http_response=$(curl -s -o $TMP3 -w "%{http_code}" ${ORIGIN3})
if [ "$http_response" != "200" ]; then
echo "Error $http_response: you probably need to update the list url!"
exit 1
fi

echo "(1) Downloading file... ${ORIGIN_BING}"
http_response=$(curl -s -o $TMP_BING -w "%{http_code}" ${ORIGIN_BING})
if [ "$http_response" != "200" ]; then
echo "Error $http_response: you probably need to update the list url!"
exit 1
fi

echo "(2) Processing IP addresses..."
{
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP1 # TODO: ipv6
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP2 # TODO: ipv6
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP3 # TODO: ipv6
jq -r '.prefixes | .[].ipv4Prefix | select( . != null )' $TMP_BING # TODO: ipv6
} > $LIST
./ipaddr2list.py $LIST NDPI_HTTP_CRAWLER_BOT > $DEST
rm -f $TMP1 $TMP2 $TMP3 $TMP_BING $LIST

echo "(3) Crawlers IPs are available in $DEST"
exit 0
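
ipaddr2list.py then turns each CIDR string into a C table entry. A worked example of the packing it presumably performs for one prefix (host-byte-order, matching the other generated lists; the prefix is illustrative):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* "66.249.64.0/27" -> the network field of an ndpi_network entry */
      uint32_t net = (66u << 24) | (249u << 16) | (64u << 8) | 0u;

      printf("0x%08X\n", net); /* 0x42F94000, i.e. { 0x42F94000, 27, NDPI_HTTP_CRAWLER_BOT } */
      return 0;
    }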
2 changes: 2 additions & 0 deletions utils/update_every_lists.sh
@@ -28,6 +28,8 @@ RETVAL=$(( RETVAL + $? ))
 RETVAL=$(( RETVAL + $? ))
 ./icloud_private_relay_ip_addresses_download.sh
 RETVAL=$(( RETVAL + $? ))
+./crawlers_ip_addresses_download.sh
+RETVAL=$(( RETVAL + $? ))
 
 ./asn_update.sh
 RETVAL=$(( RETVAL + $? ))