Skip to content

Commit d75cfb3

Browse files
authored
Update PI sys prompt and new eval (#16)
* Update PI sys prompt and new eval
* Removed last checked index
* Addressed Copilot comments
* Updated PI docs with results
* Extracted constants, added conversation util
1 parent b9b99b4 commit d75cfb3

File tree

9 files changed

+563
-264
lines changed

9 files changed

+563
-264
lines changed
54.2 KB
Loading

docs/ref/checks/prompt_injection_detection.md

Lines changed: 20 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,14 @@ Returns a `GuardrailResult` with the following `info` dictionary:
6767
"confidence": 0.1,
6868
"threshold": 0.7,
6969
"user_goal": "What's the weather in Tokyo?",
70-
"action": "get_weather(location='Tokyo')",
71-
"checked_text": "Original input text"
70+
"action": [
71+
{
72+
"type": "function_call",
73+
"name": "get_weather",
74+
"arguments": "{\"location\": \"Tokyo\"}"
75+
}
76+
],
77+
"checked_text": "[{\"role\": \"user\", \"content\": \"What is the weather in Tokyo?\"}]"
7278
}
7379
```
7480

@@ -77,18 +83,18 @@ Returns a `GuardrailResult` with the following `info` dictionary:
7783
- **`confidence`**: Confidence score (0.0 to 1.0) that the action is misaligned
7884
- **`threshold`**: The confidence threshold that was configured
7985
- **`user_goal`**: The tracked user intent from conversation
80-
- **`action`**: The specific action being evaluated
81-
- **`checked_text`**: Original input text
86+
- **`action`**: The list of function calls or tool outputs analyzed for alignment
87+
- **`checked_text`**: Serialized conversation history inspected during analysis
8288

8389
## Benchmark Results
8490

8591
### Dataset Description
8692

87-
This benchmark evaluates model performance on a synthetic dataset of agent conversation traces:
93+
This benchmark evaluates model performance on agent conversation traces:
8894

89-
- **Dataset size**: 1,000 samples with 500 positive cases (50% prevalence)
90-
- **Data type**: Internal synthetic dataset simulating realistic agent traces
91-
- **Test scenarios**: Multi-turn conversations with function calls and tool outputs
95+
- **Synthetic dataset**: 1,000 samples with 500 positive cases (50% prevalence) simulating realistic agent traces
96+
- **AgentDojo dataset**: 1,046 samples from AgentDojo's workspace, travel, banking, and Slack suites combined with the "important_instructions" attack (949 positive cases, 97 negative cases)
97+
- **Test scenarios**: Multi-turn conversations with function calls and tool outputs across realistic workplace domains
9298
- **Misalignment examples**: Unrelated function calls, harmful operations, and data leakage
9399

94100
**Example of misaligned conversation:**
@@ -107,12 +113,12 @@ This benchmark evaluates model performance on a synthetic dataset of agent conve
107113

108114
| Model | ROC AUC | Prec@R=0.80 | Prec@R=0.90 | Prec@R=0.95 | Recall@FPR=0.01 |
109115
|---------------|---------|-------------|-------------|-------------|-----------------|
110-
| gpt-5 | 0.9997 | 1.000 | 1.000 | 1.000 | 0.998 |
111-
| gpt-5-mini | 0.9998 | 1.000 | 1.000 | 0.998 | 0.998 |
112-
| gpt-5-nano | 0.9987 | 0.996 | 0.996 | 0.996 | 0.996 |
113-
| gpt-4.1 | 0.9990 | 1.000 | 1.000 | 1.000 | 0.998 |
114-
| gpt-4.1-mini (default) | 0.9930 | 1.000 | 1.000 | 1.000 | 0.986 |
115-
| gpt-4.1-nano | 0.9431 | 0.982 | 0.845 | 0.695 | 0.000 |
116+
| gpt-5 | 0.9604 | 0.998 | 0.995 | 0.963 | 0.431 |
117+
| gpt-5-mini | 0.9796 | 0.999 | 0.999 | 0.966 | 0.000 |
118+
| gpt-5-nano | 0.8651 | 0.963 | 0.963 | 0.951 | 0.056 |
119+
| gpt-4.1 | 0.9846 | 0.998 | 0.998 | 0.998 | 0.000 |
120+
| gpt-4.1-mini (default) | 0.9728 | 0.995 | 0.995 | 0.995 | 0.000 |
121+
| gpt-4.1-nano | 0.8677 | 0.974 | 0.974 | 0.974 | 0.000 |
116122

117123
**Notes:**
118124

docs/ref/types-typescript.md

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,6 @@ Context interface providing access to the OpenAI client used by guardrails.
1717
```typescript
1818
export interface GuardrailLLMContextWithHistory extends GuardrailLLMContext {
1919
getConversationHistory(): any[];
20-
getInjectionLastCheckedIndex(): number;
21-
updateInjectionLastCheckedIndex(index: number): void;
2220
}
2321
```
2422

@@ -60,4 +58,3 @@ export type TCfg = object;
6058

6159
For the full source, see [src/types.ts](https://github.com/openai/openai-guardrails-js/blob/main/src/types.ts) in the repository.
6260

63-

src/__tests__/unit/prompt_injection_detection.test.ts

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,6 @@ describe('Prompt Injection Detection Check', () => {
5252
output: '{"temperature": 22, "condition": "sunny"}',
5353
},
5454
],
55-
getInjectionLastCheckedIndex: () => 0,
56-
updateInjectionLastCheckedIndex: () => {},
5755
};
5856
});
5957

@@ -74,7 +72,6 @@ describe('Prompt Injection Detection Check', () => {
7472
const contextWithOnlyUserMessages = {
7573
...mockContext,
7674
getConversationHistory: () => [{ role: 'user', content: 'Hello there!' }],
77-
getInjectionLastCheckedIndex: () => 0,
7875
};
7976

8077
const result = await promptInjectionDetectionCheck(
@@ -84,14 +81,13 @@ describe('Prompt Injection Detection Check', () => {
8481
);
8582

8683
expect(result.tripwireTriggered).toBe(false);
87-
expect(result.info.observation).toBe('No function calls or function call outputs to evaluate');
84+
expect(result.info.observation).toBe('No actionable tool messages to evaluate');
8885
});
8986

9087
it('should return skip result when no LLM actions', async () => {
9188
const contextWithNoLLMActions = {
9289
...mockContext,
9390
getConversationHistory: () => [{ role: 'user', content: 'Hello there!' }],
94-
getInjectionLastCheckedIndex: () => 1, // Already checked all messages
9591
};
9692

9793
const result = await promptInjectionDetectionCheck(
@@ -101,7 +97,7 @@ describe('Prompt Injection Detection Check', () => {
10197
);
10298

10399
expect(result.tripwireTriggered).toBe(false);
104-
expect(result.info.observation).toBe('No function calls or function call outputs to evaluate');
100+
expect(result.info.observation).toBe('No actionable tool messages to evaluate');
105101
});
106102

107103
it('should extract user intent correctly', async () => {
@@ -123,6 +119,6 @@ describe('Prompt Injection Detection Check', () => {
123119
const result = await promptInjectionDetectionCheck(contextWithError, 'test data', config);
124120

125121
expect(result.tripwireTriggered).toBe(false);
126-
expect(result.info.observation).toContain('Error during prompt injection detection check');
122+
expect(result.info.observation).toBe('No conversation history available');
127123
});
128124
});

src/base-client.ts

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,6 @@ export abstract class GuardrailsBaseClient {
109109
protected guardrails!: StageGuardrails;
110110
protected context!: GuardrailLLMContext;
111111
protected _resourceClient!: OpenAI;
112-
protected _injectionLastCheckedIndex: number = 0;
113112
public raiseGuardrailErrors: boolean = false;
114113

115114
/**
@@ -527,14 +526,8 @@ export abstract class GuardrailsBaseClient {
527526
guardrailLlm: this.context.guardrailLlm,
528527
// Add conversation history methods
529528
getConversationHistory: () => conversationHistory,
530-
getInjectionLastCheckedIndex: () => this._injectionLastCheckedIndex,
531-
updateInjectionLastCheckedIndex: (newIndex: number) => {
532-
this._injectionLastCheckedIndex = newIndex;
533-
},
534529
} as GuardrailLLMContext & {
535530
getConversationHistory(): any[];
536-
getInjectionLastCheckedIndex(): number;
537-
updateInjectionLastCheckedIndex(index: number): void;
538531
};
539532
}
540533

0 commit comments

Comments (0)