diff --git a/.changeset/young-dots-fry.md b/.changeset/young-dots-fry.md
new file mode 100644
index 000000000..bea408cb2
--- /dev/null
+++ b/.changeset/young-dots-fry.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand": minor
+---
+
+Added native Stagehand agentic loop functionality. This allows you to build agentic workflows with a single prompt without using a computer-use model. To try it out, create a `stagehand.agent` without passing in a provider.
diff --git a/examples/operator-example.ts b/examples/operator-example.ts
new file mode 100644
index 000000000..5978e8d72
--- /dev/null
+++ b/examples/operator-example.ts
@@ -0,0 +1,57 @@
+import { LogLine, Stagehand } from "@/dist";
+import dotenv from "dotenv";
+import StagehandConfig from "@/stagehand.config";
+import chalk from "chalk";
+
+// Load environment variables
+dotenv.config();
+
+const INSTRUCTION =
+  "Go to Google Japan and interact with it in Japanese. Tell me (in English) an authentic recipe that I can make with ingredients found in American grocery stores.";
+
+/**
+ * Runs the native (provider-less) Stagehand agent against INSTRUCTION and
+ * prints the structured result followed by the agent's final message.
+ */
+async function main(): Promise<void> {
+  console.log(`\n${chalk.bold("Stagehand 🤘 Operator Example")}\n`);
+
+  // Initialize Stagehand
+  const stagehand = new Stagehand({
+    ...StagehandConfig,
+    logger: ({ level, message, timestamp }: LogLine) => {
+      console.log({ level, message, timestamp });
+    },
+  });
+
+  await stagehand.init();
+
+  try {
+    // No provider is passed, so agent() returns the native operator agent.
+    const agent = stagehand.agent();
+
+    // Execute the agent
+    console.log(`${chalk.cyan("↳")} Instruction: ${INSTRUCTION}`);
+
+    const result = await agent.execute({
+      instruction: INSTRUCTION,
+      maxSteps: 20,
+    });
+
+    console.log(`${chalk.green("✓")} Execution complete`);
+    console.log(`${chalk.yellow("⤷")} Result:`);
+    console.log(JSON.stringify(result, null, 2));
+    console.log(chalk.white(result.message));
+  } catch (error) {
+    console.log(`${chalk.red("✗")} Error: ${error}`);
+  } finally {
+    await stagehand.close();
+  }
+}
+
+// init() runs outside the try block, so surface its rejection instead of
+// leaving a floating promise.
+main().catch((error: unknown) => {
+  console.error(`${chalk.red("✗")} Error: ${error}`);
+  process.exitCode = 1;
+});
diff --git a/lib/handlers/operatorHandler.ts b/lib/handlers/operatorHandler.ts new
file mode 100644
index 000000000..fceae9a90
--- /dev/null
+++ b/lib/handlers/operatorHandler.ts
@@ -0,0 +1,277 @@
+import { AgentAction, AgentExecuteOptions, AgentResult } from "@/types/agent";
+import { LogLine } from "@/types/log";
+import {
+  OperatorResponse,
+  operatorResponseSchema,
+  OperatorSummary,
+  operatorSummarySchema,
+} from "@/types/operator";
+import { LLMParsedResponse } from "../inference";
+import { ChatMessage, LLMClient } from "../llm/LLMClient";
+import { buildOperatorSystemPrompt } from "../prompt";
+import { StagehandPage } from "../StagehandPage";
+import { ObserveResult } from "@/types/stagehand";
+
+/**
+ * Drives Stagehand's native agent loop: each step shows the LLM the current
+ * page, asks it for the next action, and runs that action on the page.
+ */
+export class StagehandOperatorHandler {
+  private stagehandPage: StagehandPage;
+  private logger: (message: LogLine) => void;
+  private llmClient: LLMClient;
+  private messages: ChatMessage[] = [];
+
+  constructor(
+    stagehandPage: StagehandPage,
+    logger: (message: LogLine) => void,
+    llmClient: LLMClient,
+  ) {
+    this.stagehandPage = stagehandPage;
+    this.logger = logger;
+    this.llmClient = llmClient;
+  }
+
+  /**
+   * Runs the loop until the model picks `close` or `maxSteps` is reached.
+   *
+   * @param instructionOrOptions - The goal as a plain string, or an options
+   *   object with `instruction` and an optional `maxSteps` (defaults to 10).
+   * @returns The actions taken, the model's final answer, and whether the
+   *   last action reported the task as complete.
+   * @throws If a step cannot be executed (see `executeAction`).
+   */
+  public async execute(
+    instructionOrOptions: string | AgentExecuteOptions,
+  ): Promise<AgentResult> {
+    const options: AgentExecuteOptions =
+      typeof instructionOrOptions === "string"
+        ? { instruction: instructionOrOptions }
+        : instructionOrOptions;
+
+    this.messages = [buildOperatorSystemPrompt(options.instruction)];
+    let completed = false;
+    let currentStep = 0;
+    const maxSteps = options.maxSteps || 10;
+    const actions: AgentAction[] = [];
+
+    while (!completed && currentStep < maxSteps) {
+      const url = this.stagehandPage.page.url();
+
+      if (!url || url === "about:blank") {
+        this.messages.push({
+          role: "user",
+          content: [
+            {
+              type: "text",
+              text: "No page is currently loaded. The first step should be a 'goto' action to navigate to a URL.",
+            },
+          ],
+        });
+      } else {
+        const screenshot = await this.stagehandPage.page.screenshot({
+          type: "png",
+          fullPage: false,
+        });
+
+        const base64Image = screenshot.toString("base64");
+
+        let messageText = `Here is a screenshot of the current page (URL: ${url}):`;
+
+        // Only prepend the history once there is something to report.
+        if (actions.length > 0) {
+          messageText = `Previous actions were: ${actions
+            .map((action) => this.describeAction(action))
+            .join("\n")}\n\n${messageText}`;
+        }
+
+        this.messages.push({
+          role: "user",
+          content: [
+            {
+              type: "text",
+              text: messageText,
+            },
+            {
+              type: "image_url",
+              image_url: { url: `data:image/png;base64,${base64Image}` },
+            },
+          ],
+        });
+      }
+
+      const result = await this.getNextStep(currentStep);
+
+      if (result.method === "close") {
+        completed = true;
+      }
+
+      let playwrightArguments: ObserveResult | undefined;
+      if (result.method === "act") {
+        [playwrightArguments] = await this.stagehandPage.page.observe(
+          result.parameters,
+        );
+      }
+      let extractionResult: unknown | undefined;
+      if (result.method === "extract") {
+        extractionResult = await this.stagehandPage.page.extract(
+          result.parameters,
+        );
+      }
+
+      await this.executeAction(result, playwrightArguments, extractionResult);
+
+      actions.push({
+        type: result.method,
+        reasoning: result.reasoning,
+        taskCompleted: result.taskComplete,
+        parameters: result.parameters,
+        playwrightArguments,
+        extractionResult,
+      });
+
+      currentStep++;
+    }
+
+    return {
+      success: true,
+      message: await this.getSummary(options.instruction),
+      actions,
+      // `actions` is empty if the loop never ran (e.g. a negative maxSteps).
+      completed: actions[actions.length - 1]?.taskCompleted === true,
+    };
+  }
+
+  /** Renders one past action as a single history line for the LLM. */
+  private describeAction(action: AgentAction): string {
+    let result = "";
+    if (action.type === "act") {
+      const args = action.playwrightArguments as ObserveResult | undefined;
+      if (args) {
+        // Treat a missing `arguments` list the same as an empty one.
+        const actArguments = args.arguments ?? [];
+        result = `Performed a "${args.method}" action ${actArguments.length > 0 ? `with arguments: ${actArguments.map((arg) => `"${arg}"`).join(", ")}` : ""} on "${args.description}"`;
+      }
+    } else if (action.type === "extract") {
+      // Serialize explicitly; interpolating an object yields "[object Object]".
+      result = `Extracted data: ${JSON.stringify(action.extractionResult)}`;
+    }
+    return `[${action.type}] ${action.reasoning}. Result: ${result}`;
+  }
+
+  /**
+   * Asks the LLM for the next action given the conversation so far.
+   *
+   * @param currentStep - Zero-based step index, used to build the request id.
+   */
+  private async getNextStep(currentStep: number): Promise<OperatorResponse> {
+    const { data: response } =
+      (await this.llmClient.createChatCompletion({
+        options: {
+          messages: this.messages,
+          response_model: {
+            name: "operatorResponseSchema",
+            schema: operatorResponseSchema,
+          },
+          requestId: `operator-step-${currentStep}`,
+        },
+        logger: this.logger,
+      })) as LLMParsedResponse<OperatorResponse>;
+
+    return response;
+  }
+
+  /**
+   * Asks the LLM to answer the original instruction from the steps taken.
+   *
+   * @param goal - The instruction originally passed to `execute`.
+   * @returns The model's final answer text.
+   */
+  private async getSummary(goal: string): Promise<string> {
+    const { data: response } =
+      (await this.llmClient.createChatCompletion({
+        options: {
+          messages: [
+            ...this.messages,
+            {
+              role: "user",
+              content: [
+                {
+                  type: "text",
+                  text: `Now use the steps taken to answer the original instruction of ${goal}.`,
+                },
+              ],
+            },
+          ],
+          response_model: {
+            name: "operatorSummarySchema",
+            schema: operatorSummarySchema,
+          },
+          requestId: "operator-summary",
+        },
+        logger: this.logger,
+      })) as LLMParsedResponse<OperatorSummary>;
+
+    return response.answer;
+  }
+
+  /**
+   * Executes a single action chosen by the LLM.
+   *
+   * @param action - The model's response for this step.
+   * @param playwrightArguments - Observed element to act on (`act` only).
+   * @param extractionResult - Data already extracted (`extract` only).
+   * @returns The extraction result for `extract`; otherwise nothing.
+   * @throws If a required input for the action is missing or invalid, or the
+   *   method is not recognized.
+   */
+  private async executeAction(
+    action: OperatorResponse,
+    playwrightArguments?: ObserveResult,
+    extractionResult?: unknown,
+  ): Promise<unknown> {
+    const { method, parameters } = action;
+    const page = this.stagehandPage.page;
+
+    if (method === "close") {
+      return;
+    }
+
+    switch (method) {
+      case "act":
+        if (!playwrightArguments) {
+          throw new Error("No playwright arguments provided");
+        }
+        await page.act(playwrightArguments);
+        break;
+      case "extract":
+        if (!extractionResult) {
+          throw new Error("No extraction result provided");
+        }
+        return extractionResult;
+      case "goto":
+        // `parameters` is optional in the schema, so check before navigating.
+        if (!parameters) {
+          throw new Error("No URL provided for goto action");
+        }
+        await page.goto(parameters, { waitUntil: "load" });
+        break;
+      case "wait": {
+        const timeoutMs = Number.parseInt(parameters ?? "", 10);
+        if (Number.isNaN(timeoutMs) || timeoutMs < 0) {
+          throw new Error(`Invalid wait duration: ${parameters}`);
+        }
+        await page.waitForTimeout(timeoutMs);
+        break;
+      }
+      case "navback":
+        await page.goBack();
+        break;
+      case "refresh":
+        await page.reload();
+        break;
+      default:
+        throw new Error(`Unknown action: ${method}`);
+    }
+  }
+}
diff --git a/lib/index.ts b/lib/index.ts
index b3f16337f..b026c19b0 100644
--- a/lib/index.ts
+++
b/lib/index.ts
@@ -39,6 +39,7 @@ import { logLineToString, isRunningInBun } from "./utils";
 import { ApiResponse, ErrorResponse } from "@/types/api";
 import { AgentExecuteOptions, AgentResult } from "../types/agent";
 import { StagehandAgentHandler } from "./handlers/agentHandler";
+import { StagehandOperatorHandler } from "./handlers/operatorHandler";
 
 dotenv.config({ path: ".env" });
 
@@ -818,18 +819,35 @@ export class Stagehand {
    * Create an agent instance that can be executed with different instructions
    * @returns An agent instance with execute() method
    */
-  agent(options: AgentConfig): {
+  agent(options?: AgentConfig): {
     execute: (
       instructionOrOptions: string | AgentExecuteOptions,
     ) => Promise<AgentResult>;
   } {
+    if (!options || !options.provider) {
+      // use open operator agent
+      return {
+        execute: async (instructionOrOptions: string | AgentExecuteOptions) => {
+          return new StagehandOperatorHandler(
+            this.stagehandPage,
+            this.logger,
+            this.llmClient,
+          ).execute(instructionOrOptions);
+        },
+      };
+    }
+
     const agentHandler = new StagehandAgentHandler(
       this.stagehandPage,
       this.logger,
       {
         modelName: options.model,
         clientOptions: options.options,
-        userProvidedInstructions: options.instructions,
+        userProvidedInstructions:
+          options.instructions ??
+          `You are a helpful assistant that can use a web browser.
+      You are currently on the following page: ${this.stagehandPage.page.url()}.
+      Do not ask follow up questions, the user will trust your judgement.`,
         agentType: options.provider,
       },
     );
@@ -889,5 +907,6 @@ export * from "../types/model";
 export * from "../types/page";
 export * from "../types/playwright";
 export * from "../types/stagehand";
+export * from "../types/operator";
 export * from "../types/agent";
 export * from "./llm/LLMClient";
diff --git a/lib/inference.ts b/lib/inference.ts
index 100d576b8..fbdefc0a8 100644
--- a/lib/inference.ts
+++ b/lib/inference.ts
@@ -48,7 +48,7 @@ interface LLMUsage {
 /**
  * For calls that use a schema: the LLMClient may return { data: T; usage?: LLMUsage }
  */
-interface LLMParsedResponse<T> {
+export interface LLMParsedResponse<T> {
   data: T;
   usage?: LLMUsage;
 }
diff --git a/lib/prompt.ts b/lib/prompt.ts
index 2f8cdfbbb..e59a3527f 100644
--- a/lib/prompt.ts
+++ b/lib/prompt.ts
@@ -411,3 +411,30 @@ export function buildActObservePrompt(
 
   return instruction;
 }
+
+/**
+ * Builds the system message for the native operator agent.
+ *
+ * @param goal - The user's instruction, embedded verbatim in the prompt.
+ * @returns A `system` chat message describing the agent's role and rules.
+ */
+export function buildOperatorSystemPrompt(goal: string): ChatMessage {
+  return {
+    role: "system",
+    content: `You are a general-purpose agent whose job is to accomplish the user's goal across multiple model calls by running actions on the page.
+
+You will be given a goal and a list of steps that have been taken so far. Your job is to determine if either the user's goal has been completed or if there are still steps that need to be taken.
+
+# Your current goal
+${goal}
+
+# Important guidelines
+1. Break down complex actions into individual atomic steps
+2. For \`act\` commands, use only one action at a time, such as:
+   - Single click on a specific element
+   - Type into a single input field
+   - Select a single option
+3. Avoid combining multiple actions in one instruction
+4. If multiple actions are needed, they should be separate steps`,
+  };
+}
diff --git a/package.json b/package.json
index 61f72ce2f..210f757ac 100644
--- a/package.json
+++ b/package.json
@@ -9,6 +9,7 @@
     "2048": "npm run build && tsx examples/2048.ts",
     "popup": "npm run build && tsx examples/popup.ts",
     "cua": "npm run build && tsx examples/cua-example.ts",
+    "operator": "npm run build && tsx examples/operator-example.ts",
     "example": "npm run build && tsx examples/example.ts",
     "langchain": "npm run build && tsx examples/langchain.ts",
     "debug-url": "npm run build && tsx examples/debugUrl.ts",
diff --git a/types/operator.ts b/types/operator.ts
new file mode 100644
index 000000000..08ee870f3
--- /dev/null
+++ b/types/operator.ts
@@ -0,0 +1,57 @@
+import { z } from "zod";
+
+/**
+ * Structured response the LLM returns on every step of the operator loop:
+ * the action to run, its parameter, and whether the task is finished.
+ */
+export const operatorResponseSchema = z.object({
+  reasoning: z
+    .string()
+    .describe(
+      "The reasoning for the step taken. If this step's method is `close`, the goal was to extract data, and the task was successful, state the data that was extracted.",
+    ),
+  method: z.enum([
+    "act",
+    "extract",
+    "goto",
+    "close",
+    "wait",
+    "navback",
+    "refresh",
+  ])
+    .describe(`The action to perform on the page based off of the goal and the current state of the page.
+      goto: Navigate to a specific URL.
+      act: Perform an action on the page.
+      extract: Extract data from the page.
+      close: The task is complete, close the browser.
+      wait: Wait for a period of time.
+      navback: Navigate back to the previous page. Do not navigate back if you are already on the first page.
+      refresh: Refresh the page.`),
+  parameters: z
+    .string()
+    .describe(
+      `The parameter for the action. Only pass in a parameter for the following methods:
+        - act: The action to perform. e.g. "click on the submit button" or "type [email] into the email input field and press enter"
+        - extract: The data to extract. e.g. "the title of the article". If you want to extract all of the text on the page, leave this undefined.
+        - wait: The amount of time to wait in milliseconds.
+        - goto: The URL to navigate to. e.g. "https://www.google.com"
+        The other methods do not require a parameter.`,
+    )
+    .optional(),
+  taskComplete: z
+    .boolean()
+    .describe(
+      "Whether the task is complete. If true, the task is complete and no more steps are needed. If you chose to close the task because the goal is not achievable, set this to false.",
+    ),
+});
+
+/** A single step decision, inferred from `operatorResponseSchema`. */
+export type OperatorResponse = z.infer<typeof operatorResponseSchema>;
+
+/** Final answer the LLM returns once the operator loop has ended. */
+export const operatorSummarySchema = z.object({
+  answer: z.string().describe("The final answer to the original instruction."),
+});
+
+/** The final answer payload, inferred from `operatorSummarySchema`. */
+export type OperatorSummary = z.infer<typeof operatorSummarySchema>;