Skip to content

Commit d75cfb3

Browse files
authored
Update PI sys prompt and new eval (#16)
* Update PI sys prompt and new eval
* Removed last checked index
* Addressed Copilot comments
* Updated PI docs with results
* Extracted constants, added conversation util
1 parent b9b99b4 commit d75cfb3

File tree

9 files changed

+563
-264
lines changed

9 files changed

+563
-264
lines changed
54.2 KB
Loading

docs/ref/checks/prompt_injection_detection.md

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,14 @@ Returns a `GuardrailResult` with the following `info` dictionary:
6767
"confidence": 0.1,
6868
"threshold": 0.7,
6969
"user_goal": "What's the weather in Tokyo?",
70-
"action": "get_weather(location='Tokyo')",
71-
"checked_text": "Original input text"
70+
"action": [
71+
{
72+
"type": "function_call",
73+
"name": "get_weather",
74+
"arguments": "{\"location\": \"Tokyo\"}"
75+
}
76+
],
77+
"checked_text": "[{\"role\": \"user\", \"content\": \"What is the weather in Tokyo?\"}]"
7278
}
7379
```
7480

@@ -77,18 +83,18 @@ Returns a `GuardrailResult` with the following `info` dictionary:
7783
- **`confidence`**: Confidence score (0.0 to 1.0) that the action is misaligned
7884
- **`threshold`**: The confidence threshold that was configured
7985
- **`user_goal`**: The tracked user intent from conversation
80-
- **`action`**: The specific action being evaluated
81-
- **`checked_text`**: Original input text
86+
- **`action`**: The list of function calls or tool outputs analyzed for alignment
87+
- **`checked_text`**: Serialized conversation history inspected during analysis
8288

8389
## Benchmark Results
8490

8591
### Dataset Description
8692

87-
This benchmark evaluates model performance on a synthetic dataset of agent conversation traces:
93+
This benchmark evaluates model performance on agent conversation traces:
8894

89-
- **Dataset size**: 1,000 samples with 500 positive cases (50% prevalence)
90-
- **Data type**: Internal synthetic dataset simulating realistic agent traces
91-
- **Test scenarios**: Multi-turn conversations with function calls and tool outputs
95+
- **Synthetic dataset**: 1,000 samples with 500 positive cases (50% prevalence) simulating realistic agent traces
96+
- **AgentDojo dataset**: 1,046 samples from AgentDojo's workspace, travel, banking, and Slack suites combined with the "important_instructions" attack (949 positive cases, 97 negative cases)
97+
- **Test scenarios**: Multi-turn conversations with function calls and tool outputs across realistic workplace domains
9298
- **Misalignment examples**: Unrelated function calls, harmful operations, and data leakage
9399

94100
**Example of misaligned conversation:**
@@ -107,12 +113,12 @@ This benchmark evaluates model performance on a synthetic dataset of agent conve
107113

108114
| Model | ROC AUC | Prec@R=0.80 | Prec@R=0.90 | Prec@R=0.95 | Recall@FPR=0.01 |
109115
|---------------|---------|-------------|-------------|-------------|-----------------|
110-
| gpt-5 | 0.9997 | 1.000 | 1.000 | 1.000 | 0.998 |
111-
| gpt-5-mini | 0.9998 | 1.000 | 1.000 | 0.998 | 0.998 |
112-
| gpt-5-nano | 0.9987 | 0.996 | 0.996 | 0.996 | 0.996 |
113-
| gpt-4.1 | 0.9990 | 1.000 | 1.000 | 1.000 | 0.998 |
114-
| gpt-4.1-mini (default) | 0.9930 | 1.000 | 1.000 | 1.000 | 0.986 |
115-
| gpt-4.1-nano | 0.9431 | 0.982 | 0.845 | 0.695 | 0.000 |
116+
| gpt-5 | 0.9604 | 0.998 | 0.995 | 0.963 | 0.431 |
117+
| gpt-5-mini | 0.9796 | 0.999 | 0.999 | 0.966 | 0.000 |
118+
| gpt-5-nano | 0.8651 | 0.963 | 0.963 | 0.951 | 0.056 |
119+
| gpt-4.1 | 0.9846 | 0.998 | 0.998 | 0.998 | 0.000 |
120+
| gpt-4.1-mini (default) | 0.9728 | 0.995 | 0.995 | 0.995 | 0.000 |
121+
| gpt-4.1-nano | 0.8677 | 0.974 | 0.974 | 0.974 | 0.000 |
116122

117123
**Notes:**
118124

docs/ref/types-typescript.md

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ Context interface providing access to the OpenAI client used by guardrails.
1717
```typescript
1818
export interface GuardrailLLMContextWithHistory extends GuardrailLLMContext {
1919
getConversationHistory(): any[];
20-
getInjectionLastCheckedIndex(): number;
21-
updateInjectionLastCheckedIndex(index: number): void;
2220
}
2321
```
2422

@@ -60,4 +58,3 @@ export type TCfg = object;
6058

6159
For the full source, see [src/types.ts](https://github.com/openai/openai-guardrails-js/blob/main/src/types.ts) in the repository.
6260

63-

src/__tests__/unit/prompt_injection_detection.test.ts

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,6 @@ describe('Prompt Injection Detection Check', () => {
5252
output: '{"temperature": 22, "condition": "sunny"}',
5353
},
5454
],
55-
getInjectionLastCheckedIndex: () => 0,
56-
updateInjectionLastCheckedIndex: () => {},
5755
};
5856
});
5957

@@ -74,7 +72,6 @@ describe('Prompt Injection Detection Check', () => {
7472
const contextWithOnlyUserMessages = {
7573
...mockContext,
7674
getConversationHistory: () => [{ role: 'user', content: 'Hello there!' }],
77-
getInjectionLastCheckedIndex: () => 0,
7875
};
7976

8077
const result = await promptInjectionDetectionCheck(
@@ -84,14 +81,13 @@ describe('Prompt Injection Detection Check', () => {
8481
);
8582

8683
expect(result.tripwireTriggered).toBe(false);
87-
expect(result.info.observation).toBe('No function calls or function call outputs to evaluate');
84+
expect(result.info.observation).toBe('No actionable tool messages to evaluate');
8885
});
8986

9087
it('should return skip result when no LLM actions', async () => {
9188
const contextWithNoLLMActions = {
9289
...mockContext,
9390
getConversationHistory: () => [{ role: 'user', content: 'Hello there!' }],
94-
getInjectionLastCheckedIndex: () => 1, // Already checked all messages
9591
};
9692

9793
const result = await promptInjectionDetectionCheck(
@@ -101,7 +97,7 @@ describe('Prompt Injection Detection Check', () => {
10197
);
10298

10399
expect(result.tripwireTriggered).toBe(false);
104-
expect(result.info.observation).toBe('No function calls or function call outputs to evaluate');
100+
expect(result.info.observation).toBe('No actionable tool messages to evaluate');
105101
});
106102

107103
it('should extract user intent correctly', async () => {
@@ -123,6 +119,6 @@ describe('Prompt Injection Detection Check', () => {
123119
const result = await promptInjectionDetectionCheck(contextWithError, 'test data', config);
124120

125121
expect(result.tripwireTriggered).toBe(false);
126-
expect(result.info.observation).toContain('Error during prompt injection detection check');
122+
expect(result.info.observation).toBe('No conversation history available');
127123
});
128124
});

src/base-client.ts

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ export abstract class GuardrailsBaseClient {
109109
protected guardrails!: StageGuardrails;
110110
protected context!: GuardrailLLMContext;
111111
protected _resourceClient!: OpenAI;
112-
protected _injectionLastCheckedIndex: number = 0;
113112
public raiseGuardrailErrors: boolean = false;
114113

115114
/**
@@ -527,14 +526,8 @@ export abstract class GuardrailsBaseClient {
527526
guardrailLlm: this.context.guardrailLlm,
528527
// Add conversation history methods
529528
getConversationHistory: () => conversationHistory,
530-
getInjectionLastCheckedIndex: () => this._injectionLastCheckedIndex,
531-
updateInjectionLastCheckedIndex: (newIndex: number) => {
532-
this._injectionLastCheckedIndex = newIndex;
533-
},
534529
} as GuardrailLLMContext & {
535530
getConversationHistory(): any[];
536-
getInjectionLastCheckedIndex(): number;
537-
updateInjectionLastCheckedIndex(index: number): void;
538531
};
539532
}
540533

0 commit comments

Comments (0)