Skip to content
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion knip.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
"project": ["**/*.ts"]
},
"webview-ui": {
"entry": ["src/index.tsx"],
"entry": ["src/index.tsx", "src/browser-panel.tsx"],
"project": ["src/**/*.{ts,tsx}", "../src/shared/*.ts"]
},
"packages/{build,cloud,evals,ipc,telemetry,types}": {
Expand Down
1 change: 1 addition & 0 deletions packages/types/src/message.ts
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ export const clineSays = [
"shell_integration_warning",
"browser_action",
"browser_action_result",
"browser_session_status",
"mcp_server_request_started",
"mcp_server_response",
"subtask_result",
Expand Down
35 changes: 30 additions & 5 deletions src/core/assistant-message/presentAssistantMessage.ts
Original file line number Diff line number Diff line change
Expand Up @@ -437,8 +437,32 @@ export async function presentAssistantMessage(cline: Task) {
return text.replace(tagRegex, "")
}

if (block.name !== "browser_action") {
await cline.browserSession.closeBrowser()
// Keep browser open during an active session so other tools can run.
// Session is active if we've seen any browser_action_result and the last browser_action is not "close".
try {
const messages = cline.clineMessages || []
const hasStarted = messages.some((m: any) => m.say === "browser_action_result")
let isClosed = false
for (let i = messages.length - 1; i >= 0; i--) {
const m = messages[i]
if (m.say === "browser_action") {
try {
const act = JSON.parse(m.text || "{}")
isClosed = act.action === "close"
} catch {}
break
}
}
const sessionActive = hasStarted && !isClosed
// Only auto-close when no active browser session is present, and this isn't a browser_action
if (!sessionActive && block.name !== "browser_action") {
await cline.browserSession.closeBrowser()
}
} catch {
// On any unexpected error, fall back to conservative behavior
if (block.name !== "browser_action") {
await cline.browserSession.closeBrowser()
}
}

if (!block.partial) {
Expand Down Expand Up @@ -645,13 +669,14 @@ export async function presentAssistantMessage(cline: Task) {
})
break
case "browser_action":
await browserActionTool.handle(cline, block as ToolUse<"browser_action">, {
await browserActionTool(
cline,
block as ToolUse<"browser_action">,
askApproval,
handleError,
pushToolResult,
removeClosingTag,
toolProtocol,
})
)
break
case "execute_command":
await executeCommandTool.handle(cline, block as ToolUse<"execute_command">, {
Expand Down
19 changes: 18 additions & 1 deletion src/core/environment/__tests__/getEnvironmentDetails.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,9 @@ describe("getEnvironmentDetails", () => {
deref: vi.fn().mockReturnValue(mockProvider),
[Symbol.toStringTag]: "WeakRef",
} as unknown as WeakRef<ClineProvider>,
browserSession: {
isSessionActive: vi.fn().mockReturnValue(false),
} as any,
}

// Mock other dependencies.
Expand Down Expand Up @@ -393,7 +396,6 @@ describe("getEnvironmentDetails", () => {
const result = await getEnvironmentDetails(cline as Task)
expect(result).toContain("REMINDERS")
})

it("should include git status when maxGitStatusFiles > 0", async () => {
;(getGitStatus as Mock).mockResolvedValue("## main\nM file1.ts")
mockProvider.getState.mockResolvedValue({
Expand Down Expand Up @@ -456,4 +458,19 @@ describe("getEnvironmentDetails", () => {

expect(getGitStatus).toHaveBeenCalledWith(mockCwd, 5)
})

it("should include Browser Session Status when inactive", async () => {
	// No session stubbing here: the test relies on whatever isSessionActive() the shared
	// mockCline fixture provides (presumably the `false` stub from the suite setup — verify there).
	const details = await getEnvironmentDetails(mockCline as Task)

	// The section header must always be emitted, followed by the "inactive" explanation.
	const expectedFragments = ["# Browser Session Status", "Inactive - Browser is not launched"]
	for (const fragment of expectedFragments) {
		expect(details).toContain(fragment)
	}
})

it("should include Browser Session Status with current viewport when active", async () => {
	// Switch the stubbed session to "active" and give it a viewport to report.
	const session = mockCline.browserSession as any
	session.isSessionActive = vi.fn().mockReturnValue(true)
	session.getViewportSize = vi.fn().mockReturnValue({ width: 1280, height: 720 })

	const details = await getEnvironmentDetails(mockCline as Task)

	// The status line flips to "Active" and the stubbed viewport is rendered as WIDTHxHEIGHT.
	expect(details).toContain("Active - A browser session is currently open and ready for browser_action commands")
	expect(details).toContain("Current viewport size: 1280x720 pixels.")
})
})
32 changes: 32 additions & 0 deletions src/core/environment/getEnvironmentDetails.ts
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,38 @@ export async function getEnvironmentDetails(cline: Task, includeFileDetails: boo
}
}

// Add browser session status - Always show to prevent LLM from trying browser actions when no session is active
const isBrowserActive = cline.browserSession.isSessionActive()

// Build viewport info for status (prefer actual viewport if available, else fallback to configured setting)
const configuredViewport = (state?.browserViewportSize as string | undefined) ?? "900x600"
let configuredWidth: number | undefined
let configuredHeight: number | undefined
if (configuredViewport.includes("x")) {
const parts = configuredViewport.split("x").map((v) => Number(v))
configuredWidth = parts[0]
configuredHeight = parts[1]
}

let actualWidth: number | undefined
let actualHeight: number | undefined
// Use optional chaining to avoid issues with tests that stub browserSession
const vp = isBrowserActive ? (cline.browserSession as any).getViewportSize?.() : undefined
if (vp) {
actualWidth = vp.width
actualHeight = vp.height
}

const width = actualWidth ?? configuredWidth
const height = actualHeight ?? configuredHeight
const viewportInfo = isBrowserActive && width && height ? `\nCurrent viewport size: ${width}x${height} pixels.` : ""

details += `\n# Browser Session Status\n${
isBrowserActive
? "Active - A browser session is currently open and ready for browser_action commands"
: "Inactive - Browser is not launched. Using any browser action except the browser_action with action='launch' to start a new session will result in an error."
}${viewportInfo}\n`

if (includeFileDetails) {
details += `\n\n# Current Workspace Directory (${cline.cwd.toPosix()}) Files\n`
const isDesktop = arePathsEqual(cline.cwd, path.join(os.homedir(), "Desktop"))
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

35 changes: 25 additions & 10 deletions src/core/prompts/tools/browser-action.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ export function getBrowserActionDescription(args: ToolArgs): string | undefined
}
return `## browser_action
Description: Request to interact with a Puppeteer-controlled browser. Every action, except \`close\`, will be responded to with a screenshot of the browser's current state, along with any new console logs. You may only perform one browser action per message, and wait for the user's response including a screenshot and logs to determine the next action.
- The sequence of actions **must always start with** launching the browser at a URL, and **must always end with** closing the browser. If you need to visit a new URL that is not possible to navigate to from the current webpage, you must first close the browser, then launch again at the new URL.
- While the browser is active, only the \`browser_action\` tool can be used. No other tools should be called during this time. You may proceed to use other tools only after closing the browser. For example if you run into an error and need to fix a file, you must close the browser, then use other tools to make the necessary changes, then re-launch the browser to verify the result.
- The browser window has a resolution of **${args.browserViewportSize}** pixels. When performing any click actions, ensure the coordinates are within this resolution range.
- Before clicking on any elements such as icons, links, or buttons, you must consult the provided screenshot of the page to determine the coordinates of the element. The click should be targeted at the **center of the element**, not on its edges.

**Browser Session Lifecycle:**
- Browser sessions **start** with \`launch\` and **end** with \`close\`
- The session remains active across multiple messages and tool uses
- You can use other tools while the browser session is active - it will stay open in the background

Parameters:
- action: (required) The action to perform. The available actions are:
* launch: Launch a new Puppeteer-controlled browser instance at the specified URL. This **must always be the first action**.
Expand All @@ -23,6 +25,12 @@ Parameters:
- Always click in the center of an element (icon, button, link, etc.) based on coordinates derived from a screenshot.
* type: Type a string of text on the keyboard. You might use this after clicking on a text field to input text.
- Use with the \`text\` parameter to provide the string to type.
* press: Press a single keyboard key or key combination (e.g., Enter, Tab, Escape, Cmd+K, Shift+Enter).
- Use with the \`text\` parameter to provide the key name or combination.
- For single keys: Enter, Tab, Escape, etc.
- For key combinations: Cmd+K, Ctrl+C, Shift+Enter, Alt+F4, etc.
- Supported modifiers: Cmd/Command/Meta, Ctrl/Control, Shift, Alt/Option
- Example: <text>Cmd+K</text> or <text>Shift+Enter</text>
* resize: Resize the viewport to a specific w,h size.
- Use with the \`size\` parameter to specify the new size.
* scroll_down: Scroll down the page by one page height.
Expand All @@ -31,17 +39,24 @@ Parameters:
- Example: \`<action>close</action>\`
- url: (optional) Use this for providing the URL for the \`launch\` action.
* Example: <url>https://example.com</url>
- coordinate: (optional) The X and Y coordinates for the \`click\` and \`hover\` actions. Coordinates should be within the **${args.browserViewportSize}** resolution.
* Example: <coordinate>450,300</coordinate>
- coordinate: (optional) The X and Y coordinates for the \`click\` and \`hover\` actions.
* **CRITICAL**: Screenshot dimensions are NOT the same as the browser viewport dimensions
* Format: <coordinate>x,y@widthxheight</coordinate>
* Measure x,y on the screenshot image you see in chat
* The widthxheight MUST be the EXACT pixel size of that screenshot image (never the browser viewport)
* Never use the browser viewport size for widthxheight - the viewport is only a reference and is often larger than the screenshot
* Images are often downscaled before you see them, so the screenshot's dimensions will likely be smaller than the viewport
* Example A: If the screenshot you see is 1094x1092 and you want to click (450,300) on that image, use: <coordinate>450,300@1094x1092</coordinate>
* Example B: If the browser viewport is 1280x800 but the screenshot is 1000x625 and you want to click (500,300) on the screenshot, use: <coordinate>500,300@1000x625</coordinate>
- size: (optional) The width and height for the \`resize\` action.
* Example: <size>1280,720</size>
- text: (optional) Use this for providing the text for the \`type\` action.
* Example: <text>Hello, world!</text>
Usage:
<browser_action>
<action>Action to perform (e.g., launch, click, type, scroll_down, scroll_up, close)</action>
<action>Action to perform (e.g., launch, click, type, press, scroll_down, scroll_up, close)</action>
<url>URL to launch the browser at (optional)</url>
<coordinate>x,y coordinates (optional)</coordinate>
<coordinate>x,y@widthxheight coordinates (optional)</coordinate>
<text>Text to type (optional)</text>
</browser_action>

Expand All @@ -51,9 +66,9 @@ Example: Requesting to launch a browser at https://example.com
<url>https://example.com</url>
</browser_action>

Example: Requesting to click on the element at coordinates 450,300
Example: Requesting to click on the element at coordinates 450,300 on a 1024x768 image
<browser_action>
<action>click</action>
<coordinate>450,300</coordinate>
<coordinate>450,300@1024x768</coordinate>
</browser_action>`
}
42 changes: 10 additions & 32 deletions src/core/prompts/tools/native-tools/browser_action.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,59 +5,37 @@ export default {
function: {
name: "browser_action",
description:
"Interact with a Puppeteer-controlled browser session. Always start by launching at a URL and always finish by closing the browser. While the browser is active, do not call any other tools. Use coordinates within the viewport to hover or click, provide text for typing, and ensure actions are grounded in the latest screenshot and console logs.",
"Interact with a browser session. Always start by launching at a URL and always finish by closing the browser. While the browser is active, do not call any other tools. Use coordinates within the viewport to hover or click, provide text for typing, and ensure actions are grounded in the latest screenshot and console logs.",
strict: true,
parameters: {
type: "object",
properties: {
action: {
type: "string",
description: "Browser action to perform",
enum: ["launch", "hover", "click", "type", "resize", "scroll_down", "scroll_up", "close"],
enum: ["launch", "click", "hover", "type", "press", "scroll_down", "scroll_up", "resize", "close"],
},
url: {
type: ["string", "null"],
description: "URL to open when performing the launch action; must include protocol",
},
coordinate: {
type: ["object", "null"],
type: ["string", "null"],
description:
"Screen coordinate for hover or click actions; target the center of the desired element",
properties: {
x: {
type: "number",
description: "Horizontal pixel position within the current viewport",
},
y: {
type: "number",
description: "Vertical pixel position within the current viewport",
},
},
required: ["x", "y"],
additionalProperties: false,
"Screen coordinate for hover or click actions in format 'x,y@WIDTHxHEIGHT' where x,y is the target position on the screenshot image and WIDTHxHEIGHT is the exact pixel dimensions of the screenshot image (not the browser viewport). Example: '450,203@900x600' means click at (450,203) on a 900x600 screenshot. The coordinates will be automatically scaled to match the actual viewport dimensions.",
},
size: {
type: ["object", "null"],
description: "Viewport dimensions to apply when performing the resize action",
properties: {
width: {
type: "number",
description: "Viewport width in pixels",
},
height: {
type: "number",
description: "Viewport height in pixels",
},
},
required: ["width", "height"],
additionalProperties: false,
type: ["string", "null"],
description:
"Viewport dimensions for the resize action in format 'WIDTHxHEIGHT' or 'WIDTH,HEIGHT'. Example: '1280x800' or '1280,800'",
},
text: {
type: ["string", "null"],
description: "Text to type when performing the type action",
description:
"Text to type when performing the type action, or key name to press when performing the press action (e.g., 'Enter', 'Tab', 'Escape')",
},
},
required: ["action", "url", "coordinate", "size", "text"],
required: ["action"],
additionalProperties: false,
},
},
Expand Down
Loading
Loading