feat!: Rework bot detection rule with allow/deny configuration (#1437)

This reworks the bot detection rule to be configured like the sensitive info rule, which is to say you can provide either an `allow` list or a `deny` list of bots. I've also reworked the bot detection to look for almost 600 well-known bots. Closes #39 - we've changed the configuration format and have validation on `allow` and `deny`.
arcjet · Sep 4, 2024 · eef18e3 · eef18e3
1 parent 4cb8098
commit eef18e3
Show file tree

Hide file tree

Showing 31 changed files with 1,950 additions and 861 deletions.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -84,6 +84,63 @@ updates:
   #       patterns:
   #         - "*"
 
+  - package-ecosystem: npm
+    directory: /examples/express-bots
+    schedule:
+      # Our dependencies should be checked daily
+      interval: daily
+    assignees:
+      - blaine-arcjet
+      - e-moran
+    reviewers:
+      - blaine-arcjet
+      - e-moran
+    commit-message:
+      prefix: deps(example)
+      prefix-development: deps(example)
+    groups:
+      dependencies:
+        patterns:
+          - "*"
+
+  - package-ecosystem: npm
+    directory: /examples/express-newman
+    schedule:
+      # Our dependencies should be checked daily
+      interval: daily
+    assignees:
+      - blaine-arcjet
+      - e-moran
+    reviewers:
+      - blaine-arcjet
+      - e-moran
+    commit-message:
+      prefix: deps(example)
+      prefix-development: deps(example)
+    groups:
+      dependencies:
+        patterns:
+          - "*"
+
+  - package-ecosystem: npm
+    directory: /examples/express-sensitive-info
+    schedule:
+      # Our dependencies should be checked daily
+      interval: daily
+    assignees:
+      - blaine-arcjet
+      - e-moran
+    reviewers:
+      - blaine-arcjet
+      - e-moran
+    commit-message:
+      prefix: deps(example)
+      prefix-development: deps(example)
+    groups:
+      dependencies:
+        patterns:
+          - "*"
+
   - package-ecosystem: npm
     directory: /examples/nextjs-14-app-dir-rl
     schedule:
@@ -430,23 +487,6 @@ updates:
         patterns:
           - "*"
 
-  - package-ecosystem: npm
-    directory: /examples/express-sensitive-info
-    schedule:
-      # Our dependencies should be checked daily
-      interval: daily
-    assignees:
-      - blaine-arcjet
-    reviewers:
-      - blaine-arcjet
-    commit-message:
-      prefix: deps(example)
-      prefix-development: deps(example)
-    groups:
-      dependencies:
-        patterns:
-          - "*"
-
   - package-ecosystem: npm
     directory: /examples/nodejs-express-launchdarkly
     schedule:

diff --git a/analyze/edge-light.ts b/analyze/edge-light.ts
@@ -4,13 +4,13 @@ import { instantiate } from "./wasm/arcjet_analyze_js_req.component.js";
 import type {
   ImportObject,
   EmailValidationConfig,
-  BotDetectionResult,
-  BotType,
   EmailValidationResult,
   DetectedSensitiveInfoEntity,
   SensitiveInfoEntities,
   SensitiveInfoEntity,
   SensitiveInfoResult,
+  BotConfig,
+  BotResult,
 } from "./wasm/arcjet_analyze_js_req.component.js";
 import type { ArcjetJsReqSensitiveInformationIdentifier } from "./wasm/interfaces/arcjet-js-req-sensitive-information-identifier.js";
 
@@ -115,20 +115,7 @@ async function init(
 
 export {
   type EmailValidationConfig,
-  type BotType,
-  /**
-   * Represents the result of the bot detection.
-   *
-   * @property `botType` - What type of bot this is. This will be one of `BotType`.
-   * @property `botScore` - A score ranging from 0 to 99 representing the degree of
-   * certainty. The higher the number within the type category, the greater the
-   * degree of certainty. E.g. `BotType.Automated` with a score of 1 means we are
-   * sure the request was made by an automated bot. `BotType.LikelyNotABot` with a
-   * score of 30 means we don't think this request was a bot, but it's lowest
-   * confidence level. `BotType.LikelyNotABot` with a score of 99 means we are
-   * almost certain this request was not a bot.
-   */
-  type BotDetectionResult,
+  type BotConfig,
   type DetectedSensitiveInfoEntity,
   type SensitiveInfoEntity,
   type DetectSensitiveInfoFunction,
@@ -173,7 +160,7 @@ export async function isValidEmail(
   if (typeof analyze !== "undefined") {
     return analyze.isValidEmail(candidate, optionsOrDefault);
   } else {
-    // Skip the local evaluation of the rule if WASM is not available
+    // Skip the local evaluation of the rule if Wasm is not available
     return {
       validity: "valid",
       blocked: [],
@@ -183,22 +170,22 @@ export async function isValidEmail(
 
 export async function detectBot(
   context: AnalyzeContext,
-  headers: string,
-  patterns_add: string,
-  patterns_remove: string,
-): Promise<BotDetectionResult> {
+  request: AnalyzeRequest,
+  options: BotConfig,
+): Promise<BotResult> {
   const analyze = await init(context);
 
   if (typeof analyze !== "undefined") {
-    return analyze.detectBot(headers, patterns_add, patterns_remove);
+    return analyze.detectBot(JSON.stringify(request), options);
   } else {
-    // TODO: Fallback to JS if we don't have WASM?
+    // Skip the local evaluation of the rule if Wasm is not available
     return {
-      botType: "not-analyzed",
-      botScore: 0,
+      allowed: [],
+      denied: [],
     };
   }
 }
+
 export async function detectSensitiveInfo(
   context: AnalyzeContext,
   candidate: string,

diff --git a/analyze/index.ts b/analyze/index.ts
@@ -4,13 +4,13 @@ import { instantiate } from "./wasm/arcjet_analyze_js_req.component.js";
 import type {
   ImportObject,
   EmailValidationConfig,
-  BotDetectionResult,
-  BotType,
   EmailValidationResult,
   DetectedSensitiveInfoEntity,
   SensitiveInfoEntities,
   SensitiveInfoEntity,
   SensitiveInfoResult,
+  BotConfig,
+  BotResult,
 } from "./wasm/arcjet_analyze_js_req.component.js";
 import type { ArcjetJsReqSensitiveInformationIdentifier } from "./wasm/interfaces/arcjet-js-req-sensitive-information-identifier.js";
 
@@ -129,20 +129,7 @@ async function init(
 
 export {
   type EmailValidationConfig,
-  type BotType,
-  /**
-   * Represents the result of the bot detection.
-   *
-   * @property `botType` - What type of bot this is. This will be one of `BotType`.
-   * @property `botScore` - A score ranging from 0 to 99 representing the degree of
-   * certainty. The higher the number within the type category, the greater the
-   * degree of certainty. E.g. `BotType.Automated` with a score of 1 means we are
-   * sure the request was made by an automated bot. `BotType.LikelyNotABot` with a
-   * score of 30 means we don't think this request was a bot, but it's lowest
-   * confidence level. `BotType.LikelyNotABot` with a score of 99 means we are
-   * almost certain this request was not a bot.
-   */
-  type BotDetectionResult,
+  type BotConfig,
   type DetectedSensitiveInfoEntity,
   type SensitiveInfoEntity,
   type DetectSensitiveInfoFunction,
@@ -197,19 +184,18 @@ export async function isValidEmail(
 
 export async function detectBot(
   context: AnalyzeContext,
-  headers: string,
-  patterns_add: string,
-  patterns_remove: string,
-): Promise<BotDetectionResult> {
+  request: AnalyzeRequest,
+  options: BotConfig,
+): Promise<BotResult> {
   const analyze = await init(context);
 
   if (typeof analyze !== "undefined") {
-    return analyze.detectBot(headers, patterns_add, patterns_remove);
+    return analyze.detectBot(JSON.stringify(request), options);
   } else {
-    // TODO: Fallback to JS if we don't have WASM?
+    // Skip the local evaluation of the rule if Wasm is not available
     return {
-      botType: "not-analyzed",
-      botScore: 0,
+      allowed: [],
+      denied: [],
     };
   }
 }

diff --git a/analyze/wasm/arcjet_analyze_js_req.component.core.wasm b/analyze/wasm/arcjet_analyze_js_req.component.core.wasm
diff --git a/analyze/wasm/arcjet_analyze_js_req.component.core2.wasm b/analyze/wasm/arcjet_analyze_js_req.component.core2.wasm
diff --git a/analyze/wasm/arcjet_analyze_js_req.component.core3.wasm b/analyze/wasm/arcjet_analyze_js_req.component.core3.wasm
diff --git a/analyze/wasm/arcjet_analyze_js_req.component.d.ts b/analyze/wasm/arcjet_analyze_js_req.component.d.ts
@@ -3,26 +3,6 @@ export { SensitiveInfoEntity };
 /**
 * # Variants
 * 
-* ## `"unspecified"`
-* 
-* ## `"not-analyzed"`
-* 
-* ## `"automated"`
-* 
-* ## `"likely-automated"`
-* 
-* ## `"likely-not-a-bot"`
-* 
-* ## `"verified-bot"`
-*/
-export type BotType = 'unspecified' | 'not-analyzed' | 'automated' | 'likely-automated' | 'likely-not-a-bot' | 'verified-bot';
-export interface BotDetectionResult {
-  botType: BotType,
-  botScore: number,
-}
-/**
-* # Variants
-* 
 * ## `"valid"`
 * 
 * ## `"invalid"`
@@ -60,6 +40,28 @@ export interface SensitiveInfoResult {
   allowed: Array<DetectedSensitiveInfoEntity>,
   denied: Array<DetectedSensitiveInfoEntity>,
 }
+export type BotEntity = string;
+export interface AllowedBotConfig {
+  entities: Array<BotEntity>,
+  skipCustomDetect: boolean,
+}
+export interface DeniedBotConfig {
+  entities: Array<BotEntity>,
+  skipCustomDetect: boolean,
+}
+export type BotConfig = BotConfigAllowedBotConfig | BotConfigDeniedBotConfig;
+export interface BotConfigAllowedBotConfig {
+  tag: 'allowed-bot-config',
+  val: AllowedBotConfig,
+}
+export interface BotConfigDeniedBotConfig {
+  tag: 'denied-bot-config',
+  val: DeniedBotConfig,
+}
+export interface BotResult {
+  allowed: Array<BotEntity>,
+  denied: Array<BotEntity>,
+}
 import { ArcjetJsReqEmailValidatorOverrides } from './interfaces/arcjet-js-req-email-validator-overrides.js';
 import { ArcjetJsReqLogger } from './interfaces/arcjet-js-req-logger.js';
 import { ArcjetJsReqSensitiveInformationIdentifier } from './interfaces/arcjet-js-req-sensitive-information-identifier.js';
@@ -69,7 +71,7 @@ export interface ImportObject {
   'arcjet:js-req/sensitive-information-identifier': typeof ArcjetJsReqSensitiveInformationIdentifier,
 }
 export interface Root {
-  detectBot(headers: string, patternsAdd: string, patternsRemove: string): BotDetectionResult,
+  detectBot(request: string, options: BotConfig): BotResult,
   generateFingerprint(request: string, characteristics: Array<string>): string,
   isValidEmail(candidate: string, options: EmailValidationConfig): EmailValidationResult,
   detectSensitiveInfo(content: string, options: SensitiveInfoConfig): SensitiveInfoResult,