Skip to content
5 changes: 5 additions & 0 deletions .changeset/young-dots-fry.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@browserbasehq/stagehand": minor
---

Added native Stagehand agentic loop functionality. This allows you to build agentic workflows with a single prompt without using a computer-use model. To try it out, call `stagehand.agent()` without passing in a provider.
47 changes: 47 additions & 0 deletions examples/operator-example.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import { LogLine, Stagehand } from "@/dist";
import dotenv from "dotenv";
import StagehandConfig from "@/stagehand.config";
import chalk from "chalk";

// Load environment variables
dotenv.config();

// Natural-language task passed to `agent.execute()` in main() and echoed to the console before the run.
const INSTRUCTION =
"Go to Google Japan and interact with it in Japanese. Tell me (in English) an authentic recipe that I can make with ingredients found in American grocery stores.";

/**
 * Runs the operator example end to end: prints a banner, initialises
 * Stagehand with a console logger, asks a provider-less agent to carry out
 * INSTRUCTION (capped at 20 steps), prints the outcome, and always closes
 * Stagehand afterwards.
 */
async function main() {
  console.log(`\n${chalk.bold("Stagehand 🤘 Operator Example")}\n`);

  // Forward only the level, message and timestamp of each log line to stdout.
  const logToConsole = ({ level, message, timestamp }: LogLine) => {
    console.log({ level, message, timestamp });
  };

  const stagehand = new Stagehand({
    ...StagehandConfig,
    logger: logToConsole,
  });

  await stagehand.init();

  try {
    // No provider is passed, so Stagehand falls back to its built-in agent loop.
    const operator = stagehand.agent();

    console.log(`${chalk.cyan("↳")} Instruction: ${INSTRUCTION}`);

    const outcome = await operator.execute({
      instruction: INSTRUCTION,
      maxSteps: 20,
    });

    console.log(`${chalk.green("✓")} Execution complete`);
    console.log(`${chalk.yellow("⤷")} Result:`);
    console.log(JSON.stringify(outcome, null, 2));
    console.log(chalk.white(outcome.message));
  } catch (error) {
    console.log(`${chalk.red("✗")} Error: ${error}`);
  } finally {
    // Release the browser session whether the run succeeded or failed.
    await stagehand.close();
  }
}

main();
221 changes: 221 additions & 0 deletions lib/handlers/operatorHandler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
import { AgentAction, AgentExecuteOptions, AgentResult } from "@/types/agent";
import { LogLine } from "@/types/log";
import {
OperatorResponse,
operatorResponseSchema,
OperatorSummary,
operatorSummarySchema,
} from "@/types/operator";
import { LLMParsedResponse } from "../inference";
import { ChatMessage, LLMClient } from "../llm/LLMClient";
import { buildOperatorSystemPrompt } from "../prompt";
import { StagehandPage } from "../StagehandPage";
import { ObserveResult } from "@/types/stagehand";

/**
 * Runs Stagehand's built-in agent loop, which is used when `stagehand.agent()`
 * is created without a provider.
 *
 * Each iteration shows the model the current page (or tells it that no page
 * is loaded), asks it for the next step, executes that step on the page, and
 * records it. The loop ends when the model answers with `close` or when the
 * step budget is used up; a final model call then summarises the run.
 */
export class StagehandOperatorHandler {
  private readonly stagehandPage: StagehandPage;
  private readonly logger: (message: LogLine) => void;
  private readonly llmClient: LLMClient;
  // Conversation sent to the model; reset at the start of every execute() call.
  private messages: ChatMessage[] = [];

  /**
   * @param stagehandPage Page wrapper the chosen steps are executed against.
   * @param logger Logger forwarded to every LLM call.
   * @param llmClient Client used to pick the next step and to write the summary.
   */
  constructor(
    stagehandPage: StagehandPage,
    logger: (message: LogLine) => void,
    llmClient: LLMClient,
  ) {
    this.stagehandPage = stagehandPage;
    this.logger = logger;
    this.llmClient = llmClient;
  }

  /**
   * Executes an instruction step by step until the model answers `close` or
   * the step budget is exhausted.
   *
   * @param instructionOrOptions Either the instruction text or an options
   *   object carrying `instruction` and an optional `maxSteps`.
   * @returns The summary message, the recorded actions, and whether the last
   *   recorded action reported the task as complete (`false` if no step ran).
   * @throws Error when a chosen step cannot be executed (see executeAction),
   *   or whatever the page / LLM client throws.
   */
  public async execute(
    instructionOrOptions: string | AgentExecuteOptions,
  ): Promise<AgentResult> {
    const options =
      typeof instructionOrOptions === "string"
        ? { instruction: instructionOrOptions }
        : instructionOrOptions;

    this.messages = [buildOperatorSystemPrompt(options.instruction)];
    // A missing (or zero) maxSteps falls back to 10 steps.
    const maxSteps = options.maxSteps || 10;
    const actions: AgentAction[] = [];
    let completed = false;

    for (
      let currentStep = 0;
      !completed && currentStep < maxSteps;
      currentStep++
    ) {
      const observation = await this.buildObservationMessage(actions);
      this.messages.push(observation);

      const step = await this.getNextStep(currentStep);

      if (step.method === "close") {
        completed = true;
      }

      // `act` is resolved to a concrete action via observe() first; `extract`
      // is evaluated here so its result can be recorded with the action.
      let playwrightArguments: ObserveResult | undefined;
      let extractionResult: unknown;
      if (step.method === "act") {
        [playwrightArguments] = await this.stagehandPage.page.observe(
          step.parameters,
        );
      } else if (step.method === "extract") {
        extractionResult = await this.stagehandPage.page.extract(
          step.parameters,
        );
      }

      await this.executeAction(step, playwrightArguments, extractionResult);

      actions.push({
        type: step.method,
        reasoning: step.reasoning,
        taskCompleted: step.taskComplete,
        parameters: step.parameters,
        playwrightArguments,
        extractionResult,
      });
    }

    // Guard against an empty history (e.g. a negative maxSteps) instead of
    // dereferencing a missing last element.
    const lastAction =
      actions.length > 0 ? actions[actions.length - 1] : undefined;

    return {
      success: true,
      message: await this.getSummary(options.instruction),
      actions,
      completed: Boolean(lastAction?.taskCompleted),
    };
  }

  /**
   * Builds the user message describing the current page state: a hint to
   * navigate when nothing is loaded, otherwise the action history (if any)
   * followed by a viewport screenshot.
   */
  private async buildObservationMessage(
    actions: readonly AgentAction[],
  ): Promise<ChatMessage> {
    const url = this.stagehandPage.page.url();

    if (!url || url === "about:blank") {
      return {
        role: "user",
        content: [
          {
            type: "text",
            text: "No page is currently loaded. The first step should be a 'goto' action to navigate to a URL.",
          },
        ],
      };
    }

    const screenshot = await this.stagehandPage.page.screenshot({
      type: "png",
      fullPage: false,
    });
    const base64Image = screenshot.toString("base64");

    const screenshotIntro = `Here is a screenshot of the current page (URL: ${url}):`;
    // Only mention previous actions when there are some; an empty header
    // carries no information for the model.
    const messageText =
      actions.length > 0
        ? `Previous actions were: ${actions
            .map((action) => this.describeAction(action))
            .join("\n")}\n\n${screenshotIntro}`
        : screenshotIntro;

    return {
      role: "user",
      content: [
        {
          type: "text",
          text: messageText,
        },
        {
          // NOTE(review): this is an `image_url` part. A reviewer pointed out
          // that Anthropic's format is instead
          //   { type: "image", source: { type: "base64", media_type, data } }
          // so the LLM client presumably has to translate this part for
          // Anthropic models; confirm against the client implementation.
          type: "image_url",
          image_url: { url: `data:image/png;base64,${base64Image}` },
        },
      ],
    };
  }

  /** Renders one recorded action as a single history line for the model. */
  private describeAction(action: AgentAction): string {
    let outcome = "";

    if (action.type === "act") {
      const args = action.playwrightArguments as ObserveResult | undefined;
      if (args) {
        // `arguments` may be absent on an observed action; treat it as empty.
        const callArguments = args.arguments ?? [];
        const argumentText =
          callArguments.length > 0
            ? `with arguments: ${callArguments.map((arg) => `"${arg}"`).join(", ")}`
            : "";
        outcome = `Performed a "${args.method}" action ${argumentText} on "${args.description}"`;
      }
    } else if (action.type === "extract") {
      outcome = `Extracted data: ${this.formatExtraction(action.extractionResult)}`;
    }

    return `[${action.type}] ${action.reasoning}. Result: ${outcome}`;
  }

  /**
   * Serialises an extraction result for the prompt. Objects are rendered as
   * JSON rather than "[object Object]"; values JSON cannot represent fall
   * back to String().
   */
  private formatExtraction(value: unknown): string {
    if (typeof value === "string") {
      return value;
    }
    try {
      return JSON.stringify(value) ?? String(value);
    } catch {
      return String(value);
    }
  }

  /**
   * Asks the model for the next step given the conversation so far.
   *
   * @param currentStep Zero-based step index, used to build the request id.
   */
  private async getNextStep(currentStep: number): Promise<OperatorResponse> {
    const { data: response } =
      (await this.llmClient.createChatCompletion<OperatorResponse>({
        options: {
          messages: this.messages,
          response_model: {
            name: "operatorResponseSchema",
            schema: operatorResponseSchema,
          },
          requestId: `operator-step-${currentStep}`,
        },
        logger: this.logger,
      })) as LLMParsedResponse<OperatorResponse>;

    return response;
  }

  /**
   * Asks the model to answer the original instruction from the conversation
   * so far. The extra prompt is sent with this request only; it is not stored
   * in `this.messages`.
   *
   * @param goal The original instruction.
   * @returns The `answer` field of the model's summary.
   */
  private async getSummary(goal: string): Promise<string> {
    const { data: response } =
      (await this.llmClient.createChatCompletion<OperatorSummary>({
        options: {
          messages: [
            ...this.messages,
            {
              role: "user",
              content: [
                {
                  type: "text",
                  text: `Now use the steps taken to answer the original instruction of ${goal}.`,
                },
              ],
            },
          ],
          response_model: {
            name: "operatorSummarySchema",
            schema: operatorSummarySchema,
          },
          requestId: "operator-summary",
        },
        logger: this.logger,
      })) as LLMParsedResponse<OperatorSummary>;

    return response.answer;
  }

  /**
   * Performs one step on the page.
   *
   * @param action The step chosen by the model.
   * @param playwrightArguments Observed action to run; required for `act`.
   * @param extractionResult Data already extracted; required for `extract`.
   * @returns The extraction result for `extract`, otherwise `undefined`.
   * @throws Error when `act` has no observed action, when `extract` has no
   *   result, or when the method is not recognised.
   */
  private async executeAction(
    action: OperatorResponse,
    playwrightArguments?: ObserveResult,
    extractionResult?: unknown,
  ): Promise<unknown> {
    const { method, parameters } = action;
    const page = this.stagehandPage.page;

    switch (method) {
      case "close":
        // Nothing to run; the caller ends the loop.
        return undefined;
      case "act":
        if (!playwrightArguments) {
          throw new Error("No playwright arguments provided");
        }
        await page.act(playwrightArguments);
        break;
      case "extract":
        // Only a missing result is an error; falsy values such as "" or 0 are
        // legitimate extraction results.
        if (extractionResult == null) {
          throw new Error("No extraction result provided");
        }
        return extractionResult;
      case "goto":
        await page.goto(parameters, { waitUntil: "load" });
        break;
      case "wait": {
        // Parse as base 10; an unparsable or negative duration means no wait
        // instead of handing NaN to the page.
        const requestedMs = Number.parseInt(parameters, 10);
        const waitMs =
          Number.isFinite(requestedMs) && requestedMs > 0 ? requestedMs : 0;
        await page.waitForTimeout(waitMs);
        break;
      }
      case "navback":
        await page.goBack();
        break;
      case "refresh":
        await page.reload();
        break;
      default:
        throw new Error(`Unknown action: ${method}`);
    }

    return undefined;
  }
}
23 changes: 21 additions & 2 deletions lib/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ import { logLineToString, isRunningInBun } from "./utils";
import { ApiResponse, ErrorResponse } from "@/types/api";
import { AgentExecuteOptions, AgentResult } from "../types/agent";
import { StagehandAgentHandler } from "./handlers/agentHandler";
import { StagehandOperatorHandler } from "./handlers/operatorHandler";

dotenv.config({ path: ".env" });

Expand Down Expand Up @@ -818,18 +819,35 @@ export class Stagehand {
* Create an agent instance that can be executed with different instructions
* @returns An agent instance with execute() method
*/
agent(options: AgentConfig): {
agent(options?: AgentConfig): {
execute: (
instructionOrOptions: string | AgentExecuteOptions,
) => Promise<AgentResult>;
} {
if (!options || !options.provider) {
// use open operator agent
return {
execute: async (instructionOrOptions: string | AgentExecuteOptions) => {
return new StagehandOperatorHandler(
this.stagehandPage,
this.logger,
this.llmClient,
).execute(instructionOrOptions);
},
};
}

const agentHandler = new StagehandAgentHandler(
this.stagehandPage,
this.logger,
{
modelName: options.model,
clientOptions: options.options,
userProvidedInstructions: options.instructions,
userProvidedInstructions:
options.instructions ??
`You are a helpful assistant that can use a web browser.
You are currently on the following page: ${this.stagehandPage.page.url()}.
Do not ask follow up questions, the user will trust your judgement.`,
agentType: options.provider,
},
);
Expand Down Expand Up @@ -889,5 +907,6 @@ export * from "../types/model";
export * from "../types/page";
export * from "../types/playwright";
export * from "../types/stagehand";
export * from "../types/operator";
export * from "../types/agent";
export * from "./llm/LLMClient";
2 changes: 1 addition & 1 deletion lib/inference.ts
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ interface LLMUsage {
/**
* For calls that use a schema: the LLMClient may return { data: T; usage?: LLMUsage }
*/
interface LLMParsedResponse<T> {
export interface LLMParsedResponse<T> {
// Parsed payload of the schema-based call, typed by the caller as T.
data: T;
// Optional usage information; absent when the client does not report it.
usage?: LLMUsage;
}
Expand Down
21 changes: 21 additions & 0 deletions lib/prompt.ts
Original file line number Diff line number Diff line change
Expand Up @@ -411,3 +411,24 @@ export function buildActObservePrompt(

return instruction;
}

/**
 * Builds the system message that opens the operator agent's conversation.
 *
 * The prompt states the agent's role, embeds the caller's goal under a
 * "Your current goal" heading, and lists the guidelines for keeping each
 * step atomic.
 *
 * @param goal The user's instruction, inserted verbatim into the prompt.
 * @returns A chat message with role "system" and the assembled prompt text.
 */
export function buildOperatorSystemPrompt(goal: string): ChatMessage {
  const promptLines = [
    "You are a general-purpose agent whose job is to accomplish the user's goal across multiple model calls by running actions on the page.",
    "",
    "You will be given a goal and a list of steps that have been taken so far. Your job is to determine if either the user's goal has been completed or if there are still steps that need to be taken.",
    "",
    "# Your current goal",
    goal,
    "",
    "# Important guidelines",
    "1. Break down complex actions into individual atomic steps",
    "2. For `act` commands, use only one action at a time, such as:",
    "- Single click on a specific element",
    "- Type into a single input field",
    "- Select a single option",
    "3. Avoid combining multiple actions in one instruction",
    "4. If multiple actions are needed, they should be separate steps",
  ];

  return {
    role: "system",
    content: promptLines.join("\n"),
  };
}
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"2048": "npm run build && tsx examples/2048.ts",
"popup": "npm run build && tsx examples/popup.ts",
"cua": "npm run build && tsx examples/cua-example.ts",
"operator": "npm run build && tsx examples/operator-example.ts",
"example": "npm run build && tsx examples/example.ts",
"langchain": "npm run build && tsx examples/langchain.ts",
"debug-url": "npm run build && tsx examples/debugUrl.ts",
Expand Down
Loading