matomo-org · sanchezzzhak · Apr 23, 2024 · Apr 12, 2024 · Apr 20, 2024 · Apr 20, 2024
diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml
@@ -912,27 +912,27 @@
 -
   user_agent: facebookexternalhit/1.1 (+http://www.facebook.com/externalhit_uatext.php)
   bot:
-    name: Facebook External Hit
+    name: Facebook Crawler
     category: Social Media Agent
-    url: https://www.facebook.com/externalhit_uatext.php
+    url: https://developers.facebook.com/docs/sharing/webmasters/crawler/
     producer:
       name: Meta Platforms, Inc.
       url: https://www.meta.com/
 -
   user_agent: facebookexternalua
   bot:
-    name: Facebook External Hit
+    name: Facebook Crawler
     category: Social Media Agent
-    url: https://www.facebook.com/externalhit_uatext.php
+    url: https://developers.facebook.com/docs/sharing/webmasters/crawler/
     producer:
       name: Meta Platforms, Inc.
       url: https://www.meta.com/
 -
   user_agent: facebookplatform/1.0 (+http://developers.facebook.com)
   bot:
-    name: Facebook External Hit
+    name: Facebook Crawler
     category: Social Media Agent
-    url: https://www.facebook.com/externalhit_uatext.php
+    url: https://developers.facebook.com/docs/sharing/webmasters/crawler/
     producer:
       name: Meta Platforms, Inc.
       url: https://www.meta.com/
@@ -4568,9 +4568,9 @@
 -
   user_agent: facebookcatalog/1.0
   bot:
-    name: Facebook External Hit
+    name: Facebook Crawler
     category: Social Media Agent
-    url: https://www.facebook.com/externalhit_uatext.php
+    url: https://developers.facebook.com/docs/sharing/webmasters/crawler/
     producer:
       name: Meta Platforms, Inc.
       url: https://www.meta.com/
@@ -7472,3 +7472,80 @@
     producer:
       name: Google Inc.
       url: https://www.google.com/
+-
+  user_agent: KvshClient
+  bot:
+    name: Generic Bot
+-
+  user_agent: Mozilla/5.0 infrawatch/0.1
+  bot:
+    name: Generic Bot
+-
+  user_agent: InsytfulBot/1.0; https://www.insytful.com/about-our-bot
+  bot:
+    name: InsytfulBot
+    category: Crawler
+    url: https://www.insytful.com/
+    producer:
+      name: Zengenti Limited
+      url: https://www.zengenti.com/
+-
+  user_agent: statista.com PublicationFinder-Crawler 2.0
+  bot:
+    name: Statista
+    category: Crawler
+    url: https://www.statista.com/
+    producer:
+      name: Statista, Inc.
+      url: https://www.statista.com/
+-
+  user_agent: SubstackContentFetch/1.0; https://substack.com
+  bot:
+    name: Substack Content Fetch
+    category: Crawler
+    url: https://substack.com/
+    producer:
+      name: Substack, Inc.
+      url: https://substack.com/
+-
+  user_agent: ds9 2.000.ec2(+http://www.deepsearchnine.com/ds9.html)
+  bot:
+    name: Deep SEARCH 9
+    category: Crawler
+    url: https://www.copyright.com/blog/ccc-expands-corporate-solutions-offering-with-new-technology/
+    producer:
+      name: Copyright Clearance Center, Inc.
+      url: https://www.copyright.com/
+-
+  user_agent: ds9 2.000.ec2
+  bot:
+    name: Deep SEARCH 9
+    category: Crawler
+    url: https://www.copyright.com/blog/ccc-expands-corporate-solutions-offering-with-new-technology/
+    producer:
+      name: Copyright Clearance Center, Inc.
+      url: https://www.copyright.com/
+-
+  user_agent: LiveJournal.com (webmaster@livejournal.com; for https://www.livejournal.com/users/example/; 1 readers)
+  bot:
+    name: LiveJournal
+    url: https://www.livejournal.com/
+    category: Feed Fetcher
+    producer:
+      name: ООО "СИМ"
+      url: https://www.livejournal.com/
+-
+  user_agent: bitdiscovery-suggestions
+  bot:
+    name: Tenable.asm
+    category: Security Checker
+    url: https://bitdiscovery.com/
+    producer:
+      name: Tenable, Inc.
+      url: https://www.tenable.com/
+-
+  user_agent: Castopod/1.0
+  bot:
+    name: Castopod
+    category: Crawler
+    url: https://www.castopod.org/
diff --git a/regexes/bots.yml b/regexes/bots.yml
@@ -552,9 +552,9 @@
     url: 'http://moz.com/'
 
 - regex: 'facebookexternalhit|facebookplatform|facebookexternalua|facebookcatalog'
-  name: 'Facebook External Hit'
+  name: 'Facebook Crawler'
   category: 'Social Media Agent'
-  url: 'https://www.facebook.com/externalhit_uatext.php'
+  url: 'https://developers.facebook.com/docs/sharing/webmasters/crawler/'
   producer:
     name: 'Meta Platforms, Inc.'
     url: 'https://www.meta.com/'
@@ -4374,10 +4374,63 @@
     name: 'Library and Archives Canada'
     url: 'https://library-archives.canada.ca/'
 
+- regex: 'InsytfulBot/[\d.]+'
+  name: 'InsytfulBot'
+  category: 'Crawler'
+  url: 'https://www.insytful.com/'
+  producer:
+    name: 'Zengenti Limited'
+    url: 'https://www.zengenti.com/'
+
+- regex: 'statista\.com'
+  name: 'Statista'
+  category: 'Crawler'
+  url: 'https://www.statista.com/'
+  producer:
+    name: 'Statista, Inc.'
+    url: 'https://www.statista.com/'
+
+- regex: 'SubstackContentFetch/[\d.]+'
+  name: 'Substack Content Fetch'
+  category: 'Crawler'
+  url: 'https://substack.com/'
+  producer:
+    name: 'Substack, Inc.'
+    url: 'https://substack.com/'
+
+- regex: '^ds9'
+  name: 'Deep SEARCH 9'
+  category: 'Crawler'
+  url: 'https://www.copyright.com/blog/ccc-expands-corporate-solutions-offering-with-new-technology/'
+  producer:
+    name: 'Copyright Clearance Center, Inc.'
+    url: 'https://www.copyright.com/'
+
+- regex: 'LiveJournal\.com'
+  name: 'LiveJournal'
+  url: 'https://www.livejournal.com/'
+  category: 'Feed Fetcher'
+  producer:
+    name: 'ООО "СИМ"'
+    url: 'https://www.livejournal.com/'
+
+- regex: 'bitdiscovery'
+  name: 'Tenable.asm'
+  category: 'Security Checker'
+  url: 'https://bitdiscovery.com/'
+  producer:
+    name: 'Tenable, Inc.'
+    url: 'https://www.tenable.com/'
+
+- regex: 'Castopod/[\d.]+'
+  name: 'Castopod'
+  category: 'Crawler'
+  url: 'https://www.castopod.org/'
+
 # Generic bots
-- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|^xenu|^ZmEu|^(?:chrome|firefox|Zeus)$'
+- regex: 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\.o\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \(cow\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherweb|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|^xenu|^(?:chrome|firefox|KvshClient|Zeus|ZmEu)$'
   name: 'Generic Bot'
 
 # Generic detections
-- regex: '[a-z0-9_-]*(?:(?<!cu|power[ _]|m[ _])bot(?![ _]TAB|[ _]?5[0-9]|[ _]Senior|[ _]Junior)|analyzer|appengine|archiver?|checker|collector|crawl|crawler|fetcher|indexer|inspector|monitor|project(?!or)|(?<!Google Wap )proxy|research|resolver|robots|scanner|scraper|script|searcher|(?<!-)security|spider|study|transcoder|uptime|user[ _]?agent|validator)(?:[^a-z]|$)'
+- regex: '[a-z0-9_-]*(?:(?<!cu|power[ _]|m[ _])bot(?![ _]TAB|[ _]?5[0-9]|[ _]Senior|[ _]Junior)|analyzer|appengine|archiver?|checker|collector|crawl|crawler|fetch(?:er)?|indexer|inspector|monitor|project(?!or)|(?<!Google Wap )proxy|research|resolver|robots|scanner|scraper|script|searcher|(?<!-)security|spider|study|transcoder|uptime|user[ _]?agent|validator)(?:[^a-z]|$)'
   name: 'Generic Bot'