diff --git a/examples/operator-example.ts b/examples/operator-example.ts index 141cd5039..5978e8d72 100644 --- a/examples/operator-example.ts +++ b/examples/operator-example.ts @@ -1,4 +1,4 @@ -import { Stagehand } from "@/dist"; +import { LogLine, Stagehand } from "@/dist"; import dotenv from "dotenv"; import StagehandConfig from "@/stagehand.config"; import chalk from "chalk"; @@ -6,78 +6,42 @@ import chalk from "chalk"; // Load environment variables dotenv.config(); +const INSTRUCTION = + "Go to Google Japan and interact with it in Japanese. Tell me (in English) an authentic recipe that I can make with ingredients found in American grocery stores."; + async function main() { - console.log(`\n${chalk.bold("Stagehand 🤘 Native Agent Example")}\n`); + console.log(`\n${chalk.bold("Stagehand 🤘 Operator Example")}\n`); // Initialize Stagehand - console.log(`${chalk.cyan("→")} Initializing Stagehand...`); const stagehand = new Stagehand({ ...StagehandConfig, + logger: ({ level, message, timestamp }: LogLine) => { + console.log({ level, message, timestamp }); + }, }); await stagehand.init(); - console.log(`${chalk.green("✓")} Stagehand initialized`); try { - const page = stagehand.page; - - console.log(`\n${chalk.magenta.bold("⚡ First Agent Execution")}`); - - const agent = stagehand.agent({ - instructions: `You are a helpful assistant that can use a web browser. - You are currently on the following page: ${page.url()}. 
- Do not ask follow up questions, the user will trust your judgement.`, - }); - - console.log(`${chalk.yellow("→")} Navigating to Google...`); - await stagehand.page.goto("https://www.google.com"); - console.log(`${chalk.green("✓")} Loaded: ${chalk.dim(page.url())}`); - - // Execute the agent again with a different instruction - const firstInstruction = - "Search for openai news on google and extract the name of the first 3 results"; - console.log( - `${chalk.cyan("↳")} Instruction: ${chalk.white(firstInstruction)}`, - ); - - const result1 = await agent.execute(firstInstruction); - - console.log(`${chalk.green("✓")} Execution complete`); - console.log(`${chalk.yellow("⤷")} Result:`); - console.log(chalk.white(JSON.stringify(result1, null, 2))); - - console.log(`\n${chalk.magenta.bold("⚡ Second Agent Execution")}`); - - console.log(`\n${chalk.yellow("→")} Navigating to Apple...`); - await page.goto("https://www.apple.com/shop/buy-mac/macbook-air"); - console.log(`${chalk.green("✓")} Loaded: ${chalk.dim(page.url())}`); + const agent = stagehand.agent(); - const instruction = - "Add a macbook air to the cart. 
Choose the most expensive configuration."; - console.log(`${chalk.cyan("↳")} Instruction: ${chalk.white(instruction)}`); + // Execute the agent + console.log(`${chalk.cyan("↳")} Instruction: ${INSTRUCTION}`); const result = await agent.execute({ - instruction, + instruction: INSTRUCTION, maxSteps: 20, }); console.log(`${chalk.green("✓")} Execution complete`); console.log(`${chalk.yellow("⤷")} Result:`); - console.log(chalk.white(JSON.stringify(result, null, 2))); + console.log(JSON.stringify(result, null, 2)); + console.log(chalk.white(result.message)); } catch (error) { console.log(`${chalk.red("✗")} Error: ${error}`); - if (error instanceof Error && error.stack) { - console.log(chalk.dim(error.stack.split("\n").slice(1).join("\n"))); - } } finally { - // Close the browser - console.log(`\n${chalk.yellow("→")} Closing browser...`); await stagehand.close(); - console.log(`${chalk.green("✓")} Browser closed\n`); } } -main().catch((error) => { - console.log(`${chalk.red("✗")} Unhandled error in main function`); - console.log(chalk.red(error)); -}); +main(); diff --git a/lib/handlers/operatorHandler.ts b/lib/handlers/operatorHandler.ts index cb7833a6d..fceae9a90 100644 --- a/lib/handlers/operatorHandler.ts +++ b/lib/handlers/operatorHandler.ts @@ -1,23 +1,22 @@ -import { - AgentAction, - AgentExecuteOptions, - AgentResult, - ActionExecutionResult, -} from "@/types/agent"; +import { AgentAction, AgentExecuteOptions, AgentResult } from "@/types/agent"; import { LogLine } from "@/types/log"; -import { OperatorResponse, operatorResponseSchema } from "@/types/operator"; +import { + OperatorResponse, + operatorResponseSchema, + OperatorSummary, + operatorSummarySchema, +} from "@/types/operator"; import { LLMParsedResponse } from "../inference"; import { ChatMessage, LLMClient } from "../llm/LLMClient"; import { buildOperatorSystemPrompt } from "../prompt"; import { StagehandPage } from "../StagehandPage"; +import { ObserveResult } from "@/types/stagehand"; 
export class StagehandOperatorHandler { private stagehandPage: StagehandPage; private logger: (message: LogLine) => void; private llmClient: LLMClient; private messages: ChatMessage[]; - private lastActionResult: ActionExecutionResult | null = null; - private lastMethod: string | null = null; constructor( stagehandPage: StagehandPage, @@ -66,21 +65,18 @@ export class StagehandOperatorHandler { let messageText = `Here is a screenshot of the current page (URL: ${url}):`; - if (this.lastMethod && this.lastActionResult) { - const statusMessage = this.lastActionResult.success - ? "was successful" - : `failed with error: ${this.lastActionResult.error}`; - - messageText = `Previous action '${this.lastMethod}' ${statusMessage}.\n\n${messageText}`; - - if ( - this.lastMethod === "extract" && - this.lastActionResult.success && - this.lastActionResult.data - ) { - messageText = `Previous extraction result: ${JSON.stringify(this.lastActionResult.data, null, 2)}\n\n${messageText}`; - } - } + messageText = `Previous actions were: ${actions + .map((action) => { + let result: string = ""; + if (action.type === "act") { + const args = action.playwrightArguments as ObserveResult; + result = `Performed a "${args.method}" action ${args.arguments.length > 0 ? `with arguments: ${args.arguments.map((arg) => `"${arg}"`).join(", ")}` : ""} on "${args.description}"`; + } else if (action.type === "extract") { + result = `Extracted data: ${action.extractionResult}`; + } + return `[${action.type}] ${action.reasoning}. 
Result: ${result}`; + }) + .join("\n")}\n\n${messageText}`; this.messages.push({ role: "user", @@ -103,33 +99,36 @@ export class StagehandOperatorHandler { completed = true; } + let playwrightArguments: ObserveResult | undefined; + if (result.method === "act") { + [playwrightArguments] = await this.stagehandPage.page.observe( + result.parameters, + ); + } + let extractionResult: unknown | undefined; + if (result.method === "extract") { + extractionResult = await this.stagehandPage.page.extract( + result.parameters, + ); + } + + await this.executeAction(result, playwrightArguments, extractionResult); + actions.push({ type: result.method, reasoning: result.reasoning, taskCompleted: result.taskComplete, + parameters: result.parameters, + playwrightArguments, + extractionResult, }); currentStep++; - - try { - const actionResult = await this.executeAction(result); - this.lastActionResult = { - success: true, - data: actionResult, - }; - } catch (error) { - this.lastActionResult = { - success: false, - error: error instanceof Error ? 
error.message : String(error), - }; - } - - this.lastMethod = result.method; } return { success: true, - message: actions[actions.length - 1].reasoning as string, + message: await this.getSummary(options.instruction), actions, completed: actions[actions.length - 1].taskCompleted as boolean, }; @@ -152,7 +151,38 @@ export class StagehandOperatorHandler { return response; } - private async executeAction(action: OperatorResponse): Promise<unknown> { + private async getSummary(goal: string): Promise<string> { + const { data: response } = + (await this.llmClient.createChatCompletion({ + options: { + messages: [ + ...this.messages, + { + role: "user", + content: [ + { + type: "text", + text: `Now use the steps taken to answer the original instruction of ${goal}.`, + }, + ], + }, + ], + response_model: { + name: "operatorSummarySchema", + schema: operatorSummarySchema, + }, + requestId: "operator-summary", + }, + logger: this.logger, + })) as LLMParsedResponse<OperatorSummary>; + + return response.answer; + } + private async executeAction( + action: OperatorResponse, + playwrightArguments?: ObserveResult, + extractionResult?: unknown, + ): Promise<unknown> { const { method, parameters } = action; const page = this.stagehandPage.page; @@ -162,14 +192,16 @@ export class StagehandOperatorHandler { switch (method) { case "act": - await page.act({ - action: parameters, - slowDomBasedAct: false, - timeoutMs: 5000, - }); + if (!playwrightArguments) { + throw new Error("No playwright arguments provided"); + } + await page.act(playwrightArguments); break; case "extract": - return await page.extract(parameters); + if (!extractionResult) { + throw new Error("No extraction result provided"); + } + return extractionResult; case "goto": await page.goto(parameters, { waitUntil: "load" }); break; diff --git a/lib/index.ts b/lib/index.ts index 929873542..b026c19b0 100644 --- a/lib/index.ts +++ b/lib/index.ts @@ -843,7 +843,11 @@ export class Stagehand { { modelName: options.model, clientOptions: options.options, - 
userProvidedInstructions: options.instructions, + userProvidedInstructions: + options.instructions ?? + `You are a helpful assistant that can use a web browser. + You are currently on the following page: ${this.stagehandPage.page.url()}. + Do not ask follow up questions, the user will trust your judgement.`, agentType: options.provider, }, ); diff --git a/types/operator.ts b/types/operator.ts index 344f532c2..08ee870f3 100644 --- a/types/operator.ts +++ b/types/operator.ts @@ -42,3 +42,9 @@ export const operatorResponseSchema = z.object({ }); export type OperatorResponse = z.infer<typeof operatorResponseSchema>; + +export const operatorSummarySchema = z.object({ + answer: z.string().describe("The final answer to the original instruction."), +}); + +export type OperatorSummary = z.infer<typeof operatorSummarySchema>;