Initial commit

2025-11-29 18:02:35 +08:00
commit 7cc317ecdf
7 changed files with 1196 additions and 0 deletions
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -0,0 +1,12 @@
+{
+  "name": "browser-automation",
+  "description": "Automate web browser interactions using natural language. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications.",
+  "version": "0.0.0-2025.11.28",
+  "author": {
+    "name": "Browserbase",
+    "email": "support@browserbase.com"
+  },
+  "skills": [
+    "./skills/browser-automation"
+  ]
+}
--- a/README.md
+++ b/README.md
@@ -0,0 +1,3 @@
+# browser-automation
+
+Automate web browser interactions using natural language. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications.
--- a/plugin.lock.json
+++ b/plugin.lock.json
@@ -0,0 +1,56 @@
+{
+  "$schema": "internal://schemas/plugin.lock.v1.json",
+  "pluginId": "gh:browserbase/agent-browse:browser-automation",
+  "normalized": {
+    "repo": null,
+    "ref": "refs/tags/v20251128.0",
+    "commit": "51ebd8dfffa00dff79f294c840530f8e3dbe612a",
+    "treeHash": "124bd1b56b4fb60f7a5e60221dd58f3f9d28a8e9be0096088790e750dc18b60d",
+    "generatedAt": "2025-11-28T10:14:25.143410Z",
+    "toolVersion": "publish_plugins.py@0.2.0"
+  },
+  "origin": {
+    "remote": "git@github.com:zhongweili/42plugin-data.git",
+    "branch": "master",
+    "commit": "aa1497ed0949fd50e99e70d6324a29c5b34f9390",
+    "repoRoot": "/Users/zhongweili/projects/openmind/42plugin-data"
+  },
+  "manifest": {
+    "name": "browser-automation",
+    "description": "Automate web browser interactions using natural language. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications."
+  },
+  "content": {
+    "files": [
+      {
+        "path": "README.md",
+        "sha256": "22a572f2b385308ad65c34d6dfc326220aec1e8efa59bffd6291790a9b50f488"
+      },
+      {
+        "path": ".claude-plugin/plugin.json",
+        "sha256": "35e10dc8d220c9b01ec0e9a057172bfca6b9f09ddf7c9e1d3c3b92e1b8e2795c"
+      },
+      {
+        "path": "skills/browser-automation/EXAMPLES.md",
+        "sha256": "3eabcc39097f44d2b645cb97ef85a2d9a47e6511f5d990588bf7394e6cce8884"
+      },
+      {
+        "path": "skills/browser-automation/REFERENCE.md",
+        "sha256": "c2ec0b9a4fe6fd26a6a49891fa32b3e56544a7c7a84654fd95a985f507b53424"
+      },
+      {
+        "path": "skills/browser-automation/setup.json",
+        "sha256": "757323736cdce64d28e39dc7d7a57df848e78091258726607aa7bfe59bf16f5b"
+      },
+      {
+        "path": "skills/browser-automation/SKILL.md",
+        "sha256": "6c37428c8bb563719e2cfb26162522bcda425ba6bf0abc1e813028f7035576d7"
+      }
+    ],
+    "dirSha256": "124bd1b56b4fb60f7a5e60221dd58f3f9d28a8e9be0096088790e750dc18b60d"
+  },
+  "security": {
+    "scannedAt": null,
+    "scannerVersion": null,
+    "flags": []
+  }
+}
--- a/skills/browser-automation/EXAMPLES.md
+++ b/skills/browser-automation/EXAMPLES.md
@@ -0,0 +1,308 @@
+# Browser Automation Examples
+
+This document provides detailed examples of common browser automation tasks using the CLI tool.
+
+## Example 1: Extract Product Information from E-commerce
+
+**User request**: "Go to example.com/product/123 and extract the product details"
+
+**Workflow**:
+
+1. **Navigate** to the product page:
+   ```bash
+   browser navigate https://example.com/product/123
+   ```
+
+2. **Extract** product data with schema:
+   ```bash
+   browser extract "Extract the product information" '{"productName": "string", "price": "number", "currency": "string", "inStock": "boolean", "rating": "number", "reviewCount": "number"}'
+   ```
+
+3. **Close** the browser:
+   ```bash
+   browser close
+   ```
+
+**Expected result**: JSON object with product details that can be analyzed or stored.
+
+---
+
+## Example 2: Fill Out and Submit a Contact Form
+
+**User request**: "Fill out the contact form on example.com with my information"
+
+**Workflow**:
+
+1. **Navigate** to contact page:
+   ```bash
+   browser navigate https://example.com/contact
+   ```
+
+2. **Act**: Fill in name field:
+   ```bash
+   browser act "Fill in the name field with 'John Doe'"
+   ```
+
+3. **Act**: Fill in email field:
+   ```bash
+   browser act "Fill in the email field with 'john.doe@example.com'"
+   ```
+
+4. **Act**: Fill in message field:
+   ```bash
+   browser act "Fill in the message field with 'I would like to inquire about your services'"
+   ```
+
+5. **Act**: Submit the form:
+   ```bash
+   browser act "Click the Submit button"
+   ```
+
+6. **Screenshot** to capture confirmation:
+   ```bash
+   browser screenshot
+   ```
+
+7. **Close** the browser:
+   ```bash
+   browser close
+   ```
+
+---
+
+## Example 3: Research and Summarize News Articles
+
+**User request**: "Check the latest tech news on techcrunch.com and summarize the top stories"
+
+**Workflow**:
+
+1. **Navigate** to news site:
+   ```bash
+   browser navigate https://techcrunch.com
+   ```
+
+2. **Extract** article headlines and summaries:
+   ```bash
+   browser extract "Extract the top 5 article headlines and their summaries" '{"headlines": "string", "summary": "string", "author": "string", "publishedDate": "string"}'
+   ```
+
+3. **Close** the browser:
+   ```bash
+   browser close
+   ```
+
+4. Analyze and summarize the extracted data using Claude's text analysis capabilities.
+
+---
+
+## Example 4: Login and Navigate Authenticated Area
+
+**User request**: "Log into example.com and navigate to my dashboard"
+
+**Workflow**:
+
+1. **Navigate** to login page:
+   ```bash
+   browser navigate https://example.com/login
+   ```
+
+2. **Act**: Fill in username:
+   ```bash
+   browser act "Fill in the username field with 'myusername'"
+   ```
+
+3. **Act**: Fill in password:
+   ```bash
+   browser act "Fill in the password field with 'mypassword'"
+   ```
+
+4. **Act**: Click login button:
+   ```bash
+   browser act "Click the Login button"
+   ```
+
+5. **Act**: Wait for page load:
+   ```bash
+   browser act "Wait for the page to fully load"
+   ```
+
+6. **Navigate** to dashboard:
+   ```bash
+   browser navigate https://example.com/dashboard
+   ```
+
+7. **Screenshot** the dashboard:
+   ```bash
+   browser screenshot
+   ```
+
+8. **Close** the browser:
+   ```bash
+   browser close
+   ```
+
+**Note**: This example uses Chrome's user profile (`.chrome-profile/`) which may preserve session cookies between runs.
+
+---
+
+## Example 5: Search and Collect Results
+
+**User request**: "Search Google for 'best TypeScript practices' and get the top 5 results"
+
+**Workflow**:
+
+1. **Navigate** to Google:
+   ```bash
+   browser navigate https://www.google.com
+   ```
+
+2. **Act**: Perform search:
+   ```bash
+   browser act "Type 'best TypeScript practices' in the search box and press Enter"
+   ```
+
+3. **Act**: Wait for results:
+   ```bash
+   browser act "Wait for search results to load"
+   ```
+
+4. **Extract** search results:
+   ```bash
+   browser extract "Extract the top 5 search results" '{"title": "string", "url": "string", "snippet": "string"}'
+   ```
+
+5. **Close** the browser:
+   ```bash
+   browser close
+   ```
+
+---
+
+## Example 6: Download a File
+
+**User request**: "Download the PDF file from example.com/documents/report.pdf"
+
+**Workflow**:
+
+1. **Navigate** to the file URL:
+   ```bash
+   browser navigate https://example.com/documents/report.pdf
+   ```
+
+2. **Act**: Wait for the download to complete:
+   ```bash
+   browser act "Wait for 5 seconds for the download to complete"
+   ```
+
+3. **Close** the browser:
+   ```bash
+   browser close
+   ```
+
+**Note**: Files are automatically saved to the `./agent/downloads/` directory because of the CDP download configuration.
+
+---
+
+## Example 7: Debugging a Page Issue
+
+**User request**: "Check why the submit button isn't working on example.com/form"
+
+**Workflow**:
+
+1. **Navigate** to the form page:
+   ```bash
+   browser navigate https://example.com/form
+   ```
+
+2. **Screenshot** initial state:
+   ```bash
+   browser screenshot
+   ```
+
+3. **Observe** available elements:
+   ```bash
+   browser observe "Find all buttons and their states"
+   ```
+
+4. **Observe** form fields:
+   ```bash
+   browser observe "Find all form input fields and their required status"
+   ```
+
+5. **Act**: Try filling required fields:
+   ```bash
+   browser act "Fill in all required fields with test data"
+   ```
+
+6. **Screenshot** after filling:
+   ```bash
+   browser screenshot
+   ```
+
+7. **Observe** button state again:
+   ```bash
+   browser observe "Check if the submit button is now enabled"
+   ```
+
+8. **Close** the browser:
+   ```bash
+   browser close
+   ```
+
+Analyze the screenshots and observations to determine the issue.
+
+---
+
+## Example 8: Multi-Page Data Collection
+
+**User request**: "Extract product information from the first 3 pages of results on example.com/products"
+
+**Workflow**:
+
+1. **Navigate** to products page:
+   ```bash
+   browser navigate https://example.com/products
+   ```
+
+2. **Extract** products from page 1:
+   ```bash
+   browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}'
+   ```
+
+3. **Act**: Click next page:
+   ```bash
+   browser act "Click the Next Page button"
+   ```
+
+4. **Extract** products from page 2:
+   ```bash
+   browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}'
+   ```
+
+5. **Act**: Click next page:
+   ```bash
+   browser act "Click the Next Page button"
+   ```
+
+6. **Extract** products from page 3:
+   ```bash
+   browser extract "Extract all products on this page" '{"name": "string", "price": "number", "imageUrl": "string"}'
+   ```
+
+7. **Close** the browser:
+   ```bash
+   browser close
+   ```
+
+Combine and process all extracted data.
+
+---
+
+## Tips for Success
+
+- **Be specific with natural language**: "Click the blue Submit button in the footer" is better than "click submit". This is **extremely important** because many websites contain several similar or ambiguous elements.
+- **Wait when needed**: After navigation or actions that trigger page changes, explicitly wait before continuing (e.g. `browser act "Wait for the page to fully load"`)
+- **Use observe for discovery**: When unsure what elements exist, use observe first
+- **Take screenshots for debugging**: Visual confirmation helps understand what the browser sees
+- **Handle errors gracefully**: If an action fails, try breaking it into smaller steps
+- **Clean up resources**: Always close the browser when done to free up system resources
--- a/skills/browser-automation/REFERENCE.md
+++ b/skills/browser-automation/REFERENCE.md
@@ -0,0 +1,536 @@
+# Browser Automation CLI Reference
+
+This document provides detailed technical reference for the CLI browser automation tool.
+
+## Architecture Overview
+
+The browser automation system consists of:
+
+- **Stagehand**: TypeScript library wrapping Playwright for AI-driven browser control. It uses an AI model to find and interact with the right elements, so be specific when describing actions
+- **Chrome CDP**: Chrome DevTools Protocol connection on port 9222
+- **CLI Tool**: Command-line interface in `src/cli.ts` for browser automation
+- **Local Chrome**: Chrome browser launched with remote debugging enabled
+
+### File Locations
+
+- **Chrome Profile**: `.chrome-profile/` - Persistent browser profile directory
+- **Screenshots**: `./agent/browser_screenshots/` - Screenshot output directory
+- **Downloads**: `./agent/downloads/` - File download directory
+
+## CLI Command Reference
+
+### navigate
+
+Navigate to a URL in the browser.
+
+**Usage**:
+```bash
+browser navigate <url>
+```
+
+**Parameters**:
+- `url` (string, required): The URL to navigate to. Must include protocol (http:// or https://)
+
+**Returns**:
+JSON output:
+```json
+{
+  "success": true,
+  "message": "Successfully navigated to <url>",
+  "screenshot": "/path/to/screenshot.png"
+}
+```
+
+**Implementation Details**:
+- Uses Playwright's `page.goto()` under the hood
+- Waits for network idle and DOM content loaded
+- Automatically takes a screenshot after navigation
+- Supports HTTPS upgrade for HTTP URLs
+
+**Example**:
+```bash
+browser navigate https://example.com
+```
+
+**Error Handling**:
+- Invalid URLs return error with `success: false`
+- Network timeouts return timeout error
+- SSL certificate errors may fail navigation
+
+---
+
+### act
+
+Perform an action on the page using natural language.
+
+**Usage**:
+```bash
+browser act "<action>"
+```
+
+**Parameters**:
+- `action` (string, required): Natural language description of the action to perform
+
+**Returns**:
+JSON output:
+```json
+{
+  "success": true,
+  "message": "Successfully performed action: <action>",
+  "screenshot": "/path/to/screenshot.png"
+}
+```
+
+Note: If the action description is not specific enough, the command may report success even though it acted on the wrong element!
+
+**Implementation Details**:
+- Uses Stagehand's `page.act()` which leverages Claude Haiku 4.5
+- AI model interprets natural language and executes corresponding browser actions
+- Supports: clicking, typing, selecting, scrolling, waiting, hovering, and more
+- Automatically handles element location and interaction
+- Automatically takes a screenshot after the action
+
+**Natural Language Examples**:
+```bash
+browser act "Click the login button"
+browser act "Fill in email field with test@example.com"
+browser act "Scroll to the bottom of the page"
+browser act "Select 'California' from the state dropdown"
+browser act "Hover over the menu icon"
+browser act "Wait for 3 seconds"
+browser act "Press the Enter key"
+browser act "Double-click the file icon"
+```
+
+**Best Practices**:
+- Be **specific** about which element to interact with
+- Include visual descriptors ("button next to the form", "top menu", "form at bottom")
+- For ambiguous elements, mention nearby context
+- Break complex actions into multiple simple actions
+
+**Error Handling**:
+- "Element not found" errors indicate that the natural language description could not be matched to any element on the page
+- Timeout errors occur when action takes too long
+- Action not possible errors indicate element state prevents action
+- All errors return JSON with `success: false`
+
+---
+
+### extract
+
+Extract structured data from the current page using a schema.
+
+**Usage**:
+```bash
+browser extract "<instruction>" '{"field": "type"}'
+```
+
+**Parameters**:
+- `instruction` (string, required): Natural language description of what to extract
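+- `schema` (JSON string, optional): Schema definition mapping field names to types. SKILL.md documents this argument as optional; if it is omitted, extraction proceeds without type validation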
+
+**Schema Types**:
+- `"string"`: Text content
+- `"number"`: Numeric values (integers or floats)
+- `"boolean"`: True/false values
+
+**Returns**:
+JSON output:
+```json
+{
+  "success": true,
+  "data": {
+    "field1": "value",
+    "field2": 123
+  }
+}
+```
+
+**Implementation Details**:
+- Uses Stagehand's `page.extract()` with Zod schema validation
+- AI model (Claude Haiku 4.5) identifies relevant page elements
+- Automatically handles pagination and dynamic content
+- Validates extracted data against schema
+
+**Schema Example**:
+```bash
+browser extract "Extract the product information" '{"productName": "string", "price": "number", "inStock": "boolean", "description": "string", "rating": "number"}'
+```
+
+**Complex Extraction Example**:
+```bash
+browser extract "Extract all items from the shopping cart" '{"itemName": "string", "quantity": "number", "unitPrice": "number", "totalPrice": "number", "imageUrl": "string"}'
+```
+
+**Best Practices**:
+- Use clear, descriptive field names
+- Match schema types to expected data types
+- Provide specific extraction instructions
+- Handle missing data by checking result properties
+
+**Error Handling**:
+- Schema validation errors indicate type mismatch
+- Extraction failures occur when data not found on page
+- Timeout errors for pages that take too long to analyze
+- All errors return JSON with `success: false`
+
+---
+
+### observe
+
+Discover available actions on the page.
+
+**Usage**:
+```bash
+browser observe "<query>"
+```
+
+**Parameters**:
+- `query` (string, required): Natural language query to discover elements
+
+**Returns**:
+JSON output:
+```json
+{
+  "success": true,
+  "data": [
+    {
+      "selector": "button.submit-btn",
+      "text": "Submit Form",
+      "type": "button",
+      "visible": true,
+      "enabled": true
+    }
+  ]
+}
+```
+
+**Implementation Details**:
+- Uses Stagehand's `page.observe()` to scan page elements
+- Returns actionable elements matching the query
+- Provides element properties, states, and available actions
+
+**Query Examples**:
+```bash
+browser observe "Find all buttons"
+browser observe "Find clickable links in the navigation"
+browser observe "Find form input fields"
+browser observe "Find all submit buttons"
+browser observe "Find elements with text 'Login'"
+browser observe "Find all images"
+```
+
+**Use Cases**:
+- Page exploration and discovery
+- Debugging action failures
+- Understanding page structure
+- Finding dynamic element selectors
+
+**Error Handling**:
+- Empty array returned when no elements match
+- Timeout for pages that take too long to scan
+- All errors return JSON with `success: false`
+
+---
+
+### screenshot
+
+Take a screenshot of the current page.
+
+**Usage**:
+```bash
+browser screenshot
+```
+
+**Parameters**: None
+
+**Returns**:
+JSON output:
+```json
+{
+  "success": true,
+  "screenshot": "/path/to/screenshot.png"
+}
+```
+
+**Implementation Details**:
+- Uses Chrome DevTools Protocol `Page.captureScreenshot`
+- Captures full viewport at current scroll position
+- Saves as PNG format with timestamp in filename
+- Automatically resizes images larger than 2000x2000 pixels using Sharp
+- Uses lossless PNG compression
+
+**Screenshot Path Format**:
+```
+./agent/browser_screenshots/screenshot-YYYY-MM-DDTHH-MM-SS-mmmZ.png
+```
+
+**Example**:
+```bash
+browser screenshot
+```
+
+**Image Processing**:
+- Original resolution preserved if ≤ 2000x2000
+- Larger images resized to fit within 2000x2000 while maintaining aspect ratio
+- Uses Sharp library for high-quality image processing
+
+**Best Practices**:
+- Take screenshots before and after important actions
+- Use for visual debugging and verification
+- Screenshot after navigation to confirm page loaded
+- Capture error states for troubleshooting
+
+**Error Handling**:
+- Directory creation errors if screenshots folder can't be created
+- CDP errors if Chrome DevTools Protocol connection fails
+- File write errors if disk space insufficient
+- All errors return JSON with `success: false`
+
+---
+
+### close
+
+Close the browser and cleanup resources.
+
+**Usage**:
+```bash
+browser close
+```
+
+**Parameters**: None
+
+**Returns**:
+JSON output:
+```json
+{
+  "success": true,
+  "message": "Browser closed"
+}
+```
+
+**Implementation Details**:
+- Calls `stagehand.close()` to clean up Playwright resources
+- Kills Chrome process if it was started by the CLI tool
+- Clears internal state variables
+- Does NOT delete `.chrome-profile/` directory (preserved for reuse)
+
+**Resource Cleanup**:
+- Closes all browser tabs and windows
+- Terminates Chrome process (only if started by this tool)
+- Releases CDP connection
+- Clears Stagehand instance
+
+**Best Practices**:
+- Always call at the end of browser automation tasks
+- Call even if errors occurred during automation
+- Don't call mid-workflow unless explicitly needed
+
+**Error Handling**:
+- Continues cleanup even if some steps fail
+- Safe to call multiple times
+- Gracefully handles already-closed browser
+- All errors return JSON with `success: false`
+
+---
+
+## Configuration Details
+
+### Stagehand Initialization
+
+The Stagehand instance is configured in `src/cli.ts` with:
+
+```typescript
+new Stagehand({
+  env: "LOCAL",
+  verbose: 0,
+  enableCaching: true,
+  modelName: "anthropic/claude-haiku-4-5-20251001",
+  localBrowserLaunchOptions: {
+    cdpUrl: `http://localhost:9222`,
+  },
+})
+```
+
+**Configuration Options**:
+- `env: "LOCAL"`: Uses local Chrome instead of remote browser
+- `verbose: 0`: Minimal logging output
+- `enableCaching: true`: Caches page analysis for better performance
+- `modelName`: Claude Haiku 4.5 for AI-driven actions and extraction
+- `cdpUrl`: Chrome DevTools Protocol endpoint
+
+### Chrome Launch Arguments
+
+Chrome is launched by `src/cli.ts` with:
+
+```bash
+--remote-debugging-port=9222
+--user-data-dir=.chrome-profile
+--window-position=-9999,-9999
+--window-size=1280,720
+```
+
+**Arguments**:
+- `--remote-debugging-port`: Enables CDP on port 9222
+- `--user-data-dir`: Persistent profile directory for session/cookie persistence
+- `--window-position`: Places the browser window off-screen (at -9999,-9999) so it does not disrupt the desktop
+- `--window-size`: Default window size
+
+### Download Configuration
+
+Downloads are configured via CDP:
+
+```typescript
+await client.send("Browser.setDownloadBehavior", {
+  behavior: "allow",
+  downloadPath: "./agent/downloads",
+  eventsEnabled: true,
+})
+```
+
+**Behavior**:
+- Downloads start automatically (no dialog)
+- Files saved to `./agent/downloads/`
+- Download events can be monitored via CDP
+
+---
+
+## Error Messages Reference
+
+### Common Errors
+
+**"Could not find local Chrome installation"**
+- Cause: Chrome/Chromium not installed or not in standard locations
+- Solution: Install Chrome from https://www.google.com/chrome/
+
+**"Chrome failed to start with remote debugging on port 9222"**
+- Cause: Port 9222 already in use or Chrome can't bind to port
+- Solution: Close other Chrome instances or change CDP port
+
+**"Browser failed to become ready within timeout"**
+- Cause: Chrome launched but page context not ready
+- Solution: Check Chrome version compatibility, restart system
+
+**"Error performing action: element not found"**
+- Cause: Natural language description didn't match any page element
+- Solution: Use more specific description or use observe to find elements
+
+**"Error extracting data: schema validation failed"**
+- Cause: Extracted data type doesn't match schema
+- Solution: Verify schema types match actual page data
+
+**"Error taking screenshot: directory not writable"**
+- Cause: Insufficient permissions for screenshots directory
+- Solution: Check file permissions on `./agent/browser_screenshots/`
+
+---
+
+## Performance Considerations
+
+### Caching
+
+Stagehand caches page analysis to improve performance on repeated actions. Cache is maintained for:
+- Element selectors
+- Page structure analysis
+- Vision model results
+
+### Timeouts
+
+Default timeouts:
+- Navigation: 30 seconds
+- Action execution: 30 seconds
+- Extraction: 60 seconds
+- CDP connection: 15 seconds (50 retries × 300ms)
+
+### Resource Usage
+
+Browser automation consumes:
+- Memory: ~200-500MB for Chrome process
+- CPU: Variable based on page complexity
+- Disk: ~50-200MB for Chrome profile
+- Network: Depends on pages visited
+
+---
+
+## Security Considerations
+
+### Credential Handling
+
+- Browser uses persistent profile (`.chrome-profile/`)
+- Saved passwords and cookies persist between sessions
+- Consider using isolated profiles for sensitive operations
+
+### Download Safety
+
+- Downloads automatically saved to `./agent/downloads/`
+- No file type restrictions enforced
+- Verify downloaded file integrity before use
+
+### Network Access
+
+- Browser has full network access
+- Respects system proxy settings
+- Can access localhost and internal networks
+
+---
+
+## Debugging Tips
+
+### Enable Verbose Logging
+
+Edit `src/cli.ts` and change verbose level in Stagehand configuration:
+
+```typescript
+// Change verbose: 0 to verbose: 1 or 2
+verbose: 2,  // Maximum verbosity
+```
+
+### View Chrome Console
+
+Connect to Chrome DevTools manually:
+1. Open Chrome
+2. Navigate to `chrome://inspect`
+3. Click "inspect" under Remote Target
+
+### Check CDP Connection
+
+Test CDP endpoint:
+```bash
+curl http://localhost:9222/json/version
+```
+
+### Monitor Browser Process
+
+Check Chrome process:
+```bash
+ps aux | grep chrome
+```
+
+### View Screenshots
+
+Screenshots provide visual debugging:
+```bash
+ls -lh ./agent/browser_screenshots/
+open ./agent/browser_screenshots/screenshot-*.png
+```
+
+### Test CLI Commands
+
+Test individual commands:
+```bash
+browser navigate https://example.com
+browser screenshot
+browser close
+```
+
+---
+
+## Version Information
+
+- **Stagehand**: Uses `@browserbasehq/stagehand` package v2.5.2+
+- **Model**: Claude Haiku 4.5 (claude-haiku-4-5-20251001) for browser actions
+- **CLI Tool**: TypeScript CLI in `src/cli.ts`
+- **Agent SDK**: `@anthropic-ai/claude-agent-sdk` for conversation framework
+- **Browser**: Local Chrome/Chromium installation
+
+For updates and changelog, see the main project repository.
--- a/skills/browser-automation/SKILL.md
+++ b/skills/browser-automation/SKILL.md
@@ -0,0 +1,246 @@
+---
+name: Browser Automation
+description: Automate web browser interactions using natural language via CLI commands. Use when the user asks to browse websites, navigate web pages, extract data from websites, take screenshots, fill forms, click buttons, or interact with web applications. Triggers include "browse", "navigate to", "go to website", "extract data from webpage", "screenshot", "web scraping", "fill out form", "click on", "search for on the web". When taking actions be as specific as possible.
+allowed-tools: Bash
+---
+
+# Browser Automation
+
+Automate browser interactions using Stagehand CLI with Claude. This skill provides natural language control over a Chrome browser through command-line tools for navigation, interaction, data extraction, and screenshots.
+
+## Overview
+
+This skill uses a CLI-based approach where Claude Code calls browser automation commands via bash. The browser stays open between commands for faster sequential operations and preserves browser state (cookies, sessions, etc.).
+
+## Setup Verification
+
+**IMPORTANT: Before using any browser commands, you MUST check setup.json in this directory.**
+
+### First-Time Setup Check
+
+1. **Read `setup.json`** (located in `.claude/skills/browser-automation/setup.json`)
+2. **Check `setupComplete` field**:
+   - If `true`: All prerequisites are met, proceed with browser commands
+   - If `false`: Setup required - follow the steps below
+
+### If Setup is Required (`setupComplete: false`)
+
+Run these commands in the plugin directory:
+
+```bash
+# 1. Install dependencies and build (REQUIRED)
+# This automatically builds TypeScript
+npm install
+# or: pnpm install
+# or: bun install
+
+# 2. Link the browser command globally (REQUIRED)
+npm link
+
+# 3. Configure API key (REQUIRED)
+# Option 1 (RECOMMENDED): Export in your terminal
+export ANTHROPIC_API_KEY="your-api-key-here"
+
+# Option 2: Or use .env file
+cp .env.example .env
+# Then edit .env and add: ANTHROPIC_API_KEY="your-api-key-here"
+
+# 4. Verify Chrome is installed
+# Chrome should be at standard location for your OS
+
+# 5. Test the installation
+browser navigate https://example.com
+
+# 6. If test succeeds, update setup.json
+# Set all "installed"/"configured" fields to true
+# Set "setupComplete" to true
+```
+
+### Prerequisites Summary
+
+- ✅ Google Chrome installed on your system
+- ✅ Node.js dependencies installed and TypeScript built (`npm install` runs build automatically)
+- ✅ Browser command globally available (`npm link` creates the global symlink)
+- ✅ Anthropic API key configured (exported as `ANTHROPIC_API_KEY` environment variable or in `.env` file)
+
+**DO NOT attempt to use browser commands if `setupComplete: false` in setup.json. Guide the user through setup first.**
+
+## Available Commands
+
+### Navigate to URLs
+```bash
+browser navigate <url>
+```
+
+**When to use**: Opening any website, loading a specific URL, going to a web page.
+
+**Example usage**:
+- `browser navigate https://example.com`
+- `browser navigate https://news.ycombinator.com`
+
+**Output**: JSON with success status, message, and screenshot path
+
+### Interact with Pages
+```bash
+browser act "<action>"
+```
+
+**When to use**: Clicking buttons, filling forms, scrolling, selecting options, typing text.
+
+**Example usage**:
+- `browser act "click the Sign In button"`
+- `browser act "fill in the email field with test@example.com"`
+- `browser act "scroll down to the footer"`
+- `browser act "type 'laptop' in the search box and press enter"`
+
+**Important**: Be as specific as possible - details make a world of difference. When filling fields, you don't need to combine 'click and type'; the tool will perform a fill similar to Playwright's fill function.
+
+**Output**: JSON with success status, message, and screenshot path
+
+### Extract Data
+```bash
+browser extract "<instruction>" ['{"field": "type"}']
+```
+
+**When to use**: Scraping data, getting specific information, collecting structured content.
+
+**Schema format** (optional): JSON object where keys are field names and values are types:
+- `"string"` for text
+- `"number"` for numeric values
+- `"boolean"` for true/false values
+
+**Note**: The schema parameter is optional. If omitted or if schema validation fails, extraction will proceed without type validation.
+
+**Example usage**:
+- `browser extract "get the product title and price" '{"title": "string", "price": "number"}'`
+- `browser extract "get all article headlines" '{"headlines": "string"}'`
+- `browser extract "get the page title"` (no schema)
+
+**Output**: JSON with success status, extracted data, and screenshot path
+
+### Discover Elements
+```bash
+browser observe "<query>"
+```
+
+**When to use**: Understanding page structure, finding what's clickable, discovering form fields.
+
+**Example usage**:
+- `browser observe "find all clickable buttons"`
+- `browser observe "find all form fields"`
+- `browser observe "find all navigation links"`
+
+**Output**: JSON with success status, discovered elements, and screenshot path
+
+### Take Screenshots
+```bash
+browser screenshot
+```
+
+**When to use**: Visual verification, documenting page state, debugging, creating records.
+
+**Notes**:
+- Screenshots are saved to the plugin directory's `agent/browser_screenshots/` folder
+- Images larger than 2000x2000 pixels are automatically resized
+- Filename includes timestamp for uniqueness
+
+**Output**: JSON with success status and screenshot path
+
+### Clean Up
+```bash
+browser close
+```
+
+**When to use**: After completing all browser interactions, to free up resources.
+
+**Output**: JSON with success status and message
+
+## Browser Behavior
+
+**Persistent Browser**: The browser stays open between commands for faster sequential operations and to preserve browser state (cookies, sessions, etc.).
+
+**Reuse Existing**: If Chrome is already running on port 9222, it will reuse that instance.
+
+**Off-Screen Launch**: Chrome opens off-screen (position -9999,-9999) to avoid disrupting workflow.
+
+**Safe Cleanup**: The browser only closes when you explicitly call the `close` command.
+
+## Best Practices
+
+1. **Always navigate first**: Before interacting with a page, navigate to the URL
+2. **📸 Always view screenshots**: After each command (navigate, act, extract, observe), use the Read tool to view the screenshot and verify the command worked correctly
+3. **Use natural language**: Describe actions as you would instruct a human
+4. **Extract with clear schemas**: Define field names and types explicitly in JSON
+5. **Handle errors gracefully**: Check the `success` field in JSON output; if an action fails, view the screenshot and try using `observe` to understand the page better
+6. **Close when done**: Always clean up browser resources after completing tasks
+7. **Be specific**: Use precise selectors in natural language ("the blue Submit button" vs "the button")
+8. **Chain commands**: Run multiple commands sequentially without reopening the browser
+
+## Common Patterns
+
+### Simple browsing task
+```bash
+browser navigate https://example.com
+browser act "click the login button"
+browser screenshot
+browser close
+```
+
+### Data extraction task
+```bash
+browser navigate https://example.com/products
+browser act "wait for page to load"
+browser extract "get all products" '{"name": "string", "price": "number"}'
+# Or without schema:
+# browser extract "get the page content"
+browser close
+```
+
+### Multi-step interaction
+```bash
+browser navigate https://example.com/login
+browser act "fill in email with user@example.com"
+browser act "fill in password with mypassword"
+browser act "click the submit button"
+browser screenshot
+browser close
+```
+
+### Debugging workflow
+```bash
+browser navigate https://example.com
+browser screenshot
+browser observe "find all buttons"
+browser act "click the specific button"
+browser screenshot
+browser close
+```
+
+## Troubleshooting
+
+**Page not loading**: Wait a few seconds after navigation before acting. You can also wait explicitly: `browser act "wait for the page to fully load"`
+
+**Element not found**: Use `observe` to discover what elements are actually available on the page
+
+**Action fails**: Be more specific in natural language description. Instead of "click the button", try "click the blue Submit button in the form"
+
+**Screenshots missing**: Check the plugin directory's `agent/browser_screenshots/` folder for saved files
+
+**Chrome not found**: Install Google Chrome. If Chrome is missing, the CLI shows an error with installation instructions
+
+**Port 9222 in use**: Another Chrome debugging session is running. Close it or wait for timeout
+
+For detailed examples, see [EXAMPLES.md](EXAMPLES.md).
+For API reference and technical details, see [REFERENCE.md](REFERENCE.md).
+
+## Dependencies
+
+To use this skill, install these dependencies only if they aren't already present:
+
+```bash
+npm install
+# or
+pnpm install
+# or
+bun install
+```
--- a/skills/browser-automation/setup.json
+++ b/skills/browser-automation/setup.json
@@ -0,0 +1,35 @@
+{
+  "setupComplete": false,
+  "prerequisites": {
+    "chrome": {
+      "required": true,
+      "installed": false,
+      "description": "Google Chrome browser"
+    },
+    "dependencies": {
+      "required": true,
+      "installed": false,
+      "description": "Node.js dependencies (npm install completed)"
+    },
+    "apiKey": {
+      "required": true,
+      "configured": false,
+      "description": "ANTHROPIC_API_KEY exported (i.e $ANTHROPIC_API_KEY) or in .env file"
+    },
+    "browserCommand": {
+      "required": true,
+      "installed": false,
+      "description": "Browser CLI command globally linked (npm link)"
+    }
+  },
+  "setupInstructions": [
+    "1. Run: npm install (this will automatically build TypeScript)",
+    "2. Run: npm link (this creates the global 'browser' command)",
+    "3. (RECOMMENDED) Export ANTHROPIC_API_KEY: export ANTHROPIC_API_KEY='your-api-key-here' (check if already exported)",
+    "   OR alternatively create .env file: cp .env.example .env and edit it to add your API key",
+    "4. Ensure Google Chrome is installed on your system",
+    "5. Test installation: browser navigate https://example.com",
+    "6. Update this setup.json file: set all 'installed'/'configured' to true and 'setupComplete' to true"
+  ],
+  "verifySetup": "Run 'browser navigate https://example.com' from any directory to verify installation"
+}