{
  "canonical_name": "browserbase/stagehand",
  "compilation_id": "pack_3b5dad14bcba4e349b8d415e4aca8c32",
  "created_at": "2026-05-16T10:27:24.244595+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=skill, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=skill, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `npx create-browser-app` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "npx create-browser-app",
      "sandbox_container_image": "node:22-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "deterministic_isolated_install",
      "sandbox_validation_id": "sbx_a4766987dbf0455eac91c0891f1b2319"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_3a5a62948a9cae5468be74c8f12097ed",
    "canonical_name": "browserbase/stagehand",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/browserbase/stagehand",
    "slug": "stagehand",
    "source_packet_id": "phit_057415b2d51c4338ab785dd620ccfc14",
    "source_validation_id": "dval_d8fcc448d22743929dc86fb361a4021f"
  },
  "merchandising": {
    "best_for": "需要流程自动化能力，并使用 local_cli 的用户",
    "github_forks": 1507,
    "github_stars": 22580,
    "one_liner_en": "The SDK For Browser Agents",
    "one_liner_zh": "面向浏览器 Agent 的 SDK",
    "primary_category": {
      "category_id": "workflow-automation",
      "confidence": "high",
      "name_en": "Workflow Automation",
      "name_zh": "流程自动化",
      "reason": "curated popular coverage category matched project identity"
    },
    "target_user": "使用 local_cli 等宿主 AI 的用户",
    "title_en": "stagehand",
    "title_zh": "stagehand 能力包",
    "visible_tags": [
      {
        "label_en": "Browser Agents",
        "label_zh": "浏览器 Agent",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-browser-agents",
        "type": "product_domain"
      },
      {
        "label_en": "Web Task Automation",
        "label_zh": "网页任务自动化",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-web-task-automation",
        "type": "user_job"
      },
      {
        "label_en": "Natural-language Web Actions",
        "label_zh": "自然语言网页操作",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-natural-language-web-actions",
        "type": "core_capability"
      },
      {
        "label_en": "Page Observation and Action Planning",
        "label_zh": "页面观察与动作规划",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-page-observation-and-action-planning",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Structured Data Extraction",
        "label_zh": "结构化数据提取",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-structured-data-extraction",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_057415b2d51c4338ab785dd620ccfc14",
  "page_model": {
    "artifacts": {
      "artifact_slug": "stagehand",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "npx create-browser-app",
          "label": "Node.js / npx · 官方安装入口",
          "source": "https://github.com/browserbase/stagehand#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "浏览器 Agent",
        "网页任务自动化",
        "自然语言网页操作",
        "页面观察与动作规划",
        "结构化数据提取"
      ],
      "eyebrow": "流程自动化",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要流程自动化能力，并使用 local_cli 的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "The SDK For Browser Agents"
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "local_cli",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "skill, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "仓库名 `stagehand` 与安装入口 `create-browser-app` 不完全一致。",
            "category": "身份坑",
            "evidence": [
              "identity.distribution | github_repo:776908852 | https://github.com/browserbase/stagehand | repo=stagehand; install=create-browser-app"
            ],
            "severity": "medium",
            "suggested_check": "在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。",
            "title": "仓库名和安装名不一致",
            "user_impact": "用户照着仓库名搜索包或照着包名找仓库时容易走错入口。"
          },
          {
            "body": "本判断基于一项假设：README/文档足够新，可用于首轮验证（README/documentation is current enough for a first validation pass）。",
            "category": "能力坑",
            "evidence": [
              "capability.assumptions | github_repo:776908852 | https://github.com/browserbase/stagehand | README/documentation is current enough for a first validation pass."
            ],
            "severity": "medium",
            "suggested_check": "将假设转成下游验证清单。",
            "title": "能力判断依赖假设",
            "user_impact": "假设不成立时，用户拿不到承诺的能力。"
          },
          {
            "body": "未记录 last_activity_observed。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | last_activity_observed missing"
            ],
            "severity": "medium",
            "suggested_check": "补 GitHub 最近 commit、release、issue/PR 响应信号。",
            "title": "维护活跃度未知",
            "user_impact": "新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "downstream_validation.risk_items | github_repo:776908852 | https://github.com/browserbase/stagehand | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "进入安全/权限治理复核队列。",
            "title": "下游验证发现风险项",
            "user_impact": "下游已经要求复核，不能在页面中弱化。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "risks.scoring_risks | github_repo:776908852 | https://github.com/browserbase/stagehand | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "把风险写入边界卡，并确认是否需要人工复核。",
            "title": "存在评分风险",
            "user_impact": "风险会影响是否适合普通用户安装。"
          },
          {
            "body": "issue_or_pr_quality=unknown。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | issue_or_pr_quality=unknown"
            ],
            "severity": "low",
            "suggested_check": "抽样最近 issue/PR，判断是否长期无人处理。",
            "title": "issue/PR 响应质量未知",
            "user_impact": "用户无法判断遇到问题后是否有人维护。"
          },
          {
            "body": "release_recency=unknown。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | release_recency=unknown"
            ],
            "severity": "low",
            "suggested_check": "确认最近 release/tag 和 README 安装命令是否一致。",
            "title": "发布节奏不明确",
            "user_impact": "安装命令和文档可能落后于代码，用户踩坑概率升高。"
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "发现 7 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：身份坑 - 仓库名和安装名不一致。",
        "title": "踩坑日志"
      },
      "snapshot": {
        "contributors": 39,
        "forks": 1507,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": 22580
      },
      "source_url": "https://github.com/browserbase/stagehand",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "The SDK For Browser Agents",
      "title": "stagehand 能力包",
      "trial_prompt": "# stagehand - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for browserbase/stagehand.\n\nProject:\n- Name: stagehand\n- Repository: https://github.com/browserbase/stagehand\n- Summary: The SDK For Browser Agents\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: The SDK For Browser Agents\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. project-introduction: Project Introduction. Produce one small intermediate artifact and wait for confirmation.\n2. architecture-overview: Architecture Overview. Produce one small intermediate artifact and wait for confirmation.\n3. cdp-engine: CDP Engine. Produce one small intermediate artifact and wait for confirmation.\n4. core-actions: Core Actions. Produce one small intermediate artifact and wait for confirmation.\n5. 
dom-accessibility: DOM and Accessibility Tree. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/browserbase/stagehand\n- https://github.com/browserbase/stagehand#readme\n- packages/evals/skills/browser/SKILL.md\n- package.json\n- packages/README.md\n- packages/core/package.json\n- pnpm-workspace.yaml\n- turbo.json\n- packages/core/lib/v3/index.ts\n- packages/core/lib/v3/v3.ts\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "voices": [
        {
          "body": "来源平台：github。github/github_issue: Storage state persistence broken in Stagehand v3 - userDataDir doesn't w（https://github.com/browserbase/stagehand/issues/1250）；github/github_issue: Reduce Bundle Size（https://github.com/browserbase/stagehand/issues/1610）；github/github_issue: Feature: pay-per-call captcha unstuck via x402 (Onyx Actions)（https://github.com/browserbase/stagehand/issues/2070）；github/github_release: stagehand/server-v3 v3.6.6（https://github.com/browserbase/stagehand/releases/tag/stagehand-server-v3/v3.6.6）；github/github_release: stagehand/server-v3 v3.6.5（https://github.com/browserbase/stagehand/releases/tag/stagehand-server-v3/v3.6.5）；github/github_release: @browserbasehq/browse-cli@0.6.0（https://github.com/browserbase/stagehand/releases/tag/%40browserbasehq/browse-cli%400.6.0）；github/github_release: stagehand/server-v3 v3.6.3（https://github.com/browserbase/stagehand/releases/tag/stagehand-server-v3/v3.6.3）；github/github_release: stagehand/server-v3 v3.6.2（https://github.com/browserbase/stagehand/releases/tag/stagehand-server-v3/v3.6.2）；github/github_release: @browserbasehq/stagehand@3.2.0（https://github.com/browserbase/stagehand/releases/tag/%40browserbasehq/stagehand%403.2.0）；github/github_release: @browserbasehq/browse-cli@0.2.0（https://github.com/browserbase/stagehand/releases/tag/%40browserbasehq/browse-cli%400.2.0）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Storage state persistence broken in Stagehand v3 - userDataDir doesn't w",
              "url": "https://github.com/browserbase/stagehand/issues/1250"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Reduce Bundle Size",
              "url": "https://github.com/browserbase/stagehand/issues/1610"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Feature: pay-per-call captcha unstuck via x402 (Onyx Actions)",
              "url": "https://github.com/browserbase/stagehand/issues/2070"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "stagehand/server-v3 v3.6.6",
              "url": "https://github.com/browserbase/stagehand/releases/tag/stagehand-server-v3/v3.6.6"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "stagehand/server-v3 v3.6.5",
              "url": "https://github.com/browserbase/stagehand/releases/tag/stagehand-server-v3/v3.6.5"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "@browserbasehq/browse-cli@0.6.0",
              "url": "https://github.com/browserbase/stagehand/releases/tag/%40browserbasehq/browse-cli%400.6.0"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "stagehand/server-v3 v3.6.3",
              "url": "https://github.com/browserbase/stagehand/releases/tag/stagehand-server-v3/v3.6.3"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "stagehand/server-v3 v3.6.2",
              "url": "https://github.com/browserbase/stagehand/releases/tag/stagehand-server-v3/v3.6.2"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "@browserbasehq/stagehand@3.2.0",
              "url": "https://github.com/browserbase/stagehand/releases/tag/%40browserbasehq/stagehand%403.2.0"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "@browserbasehq/browse-cli@0.2.0",
              "url": "https://github.com/browserbase/stagehand/releases/tag/%40browserbasehq/browse-cli%400.2.0"
            }
          ],
          "status": "已收录 10 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "流程自动化",
      "desc": "The SDK For Browser Agents",
      "effort": "安装已验证",
      "forks": 1507,
      "icon": "bolt",
      "name": "stagehand 能力包",
      "risk": "可发布",
      "slug": "stagehand",
      "stars": 22580,
      "tags": [
        "浏览器 Agent",
        "网页任务自动化",
        "自然语言网页操作",
        "页面观察与动作规划",
        "结构化数据提取"
      ],
      "thumb": "gray",
      "type": "Skill Pack"
    },
    "manual": {
      "markdown": "# https://github.com/browserbase/stagehand 项目说明书\n\n生成时间：2026-05-16 10:25:34 UTC\n\n## 目录\n\n- [Project Introduction](#project-introduction)\n- [Architecture Overview](#architecture-overview)\n- [CDP Engine](#cdp-engine)\n- [Core Actions](#core-actions)\n- [DOM and Accessibility Tree](#dom-accessibility)\n- [LLM Providers](#llm-providers)\n- [Agent System](#agent-system)\n- [MCP Integration](#mcp-integration)\n- [Server API](#server-api)\n- [CLI Tools](#cli-tools)\n\n<a id='project-introduction'></a>\n\n## Project Introduction\n\n### 相关页面\n\n相关主题：[Architecture Overview](#architecture-overview), [CDP Engine](#cdp-engine), [LLM Providers](#llm-providers)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/browserbase/stagehand/blob/main/README.md)\n- [packages/core/README.md](https://github.com/browserbase/stagehand/blob/main/packages/core/README.md)\n- [packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n- [packages/cli/CHANGELOG.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/CHANGELOG.md)\n- [packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [packages/core/lib/v3/agent/MicrosoftCUAClient.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n</details>\n\n# Project Introduction\n\nStagehand is an **AI-powered browser automation framework** developed by [Browserbase](https://browserbase.com) that enables developers to control web browsers using natural language instructions combined with precise code-based control. 
The framework bridges the gap between high-level AI agents and low-level browser automation tools like Selenium, Playwright, or Puppeteer.\n\n## Project Overview\n\nStagehand represents a paradigm shift in browser automation by allowing developers to choose when to leverage AI capabilities versus writing explicit code. This hybrid approach provides flexibility while maintaining reliability for production environments.\n\n### Key Value Propositions\n\n| Feature | Description |\n|---------|-------------|\n| **Hybrid Control** | Combine AI-driven navigation with deterministic code execution |\n| **Self-Healing** | Auto-caching and action recovery when website changes occur |\n| **Action Preview** | Preview AI-generated actions before execution |\n| **Repeatable Workflows** | Cache and reuse actions to save time and tokens |\n| **Production Ready** | Built for reliable automation in production systems |\n\n资料来源：[README.md:1-50]()\n\n## Architecture Overview\n\nStagehand follows a monorepo architecture using pnpm workspaces and Turborepo for efficient build management and dependency handling.\n\n```mermaid\ngraph TD\n    A[stagehand Repository] --> B[packages/core]\n    A --> C[packages/cli]\n    B --> D[Agent System]\n    B --> E[Inference Engine]\n    B --> F[Browser Context Manager]\n    D --> G[Microsoft CUA Client]\n    D --> H[Action Executors]\n    F --> I[Playwright Integration]\n    F --> J[Frame Locator]\n    E --> K[Inference Logging]\n    E --> L[Cache Management]\n    C --> M[Daemon Controller]\n    C --> N[CLI Commands]\n    M --> F\n```\n\n### Core Packages\n\n| Package | Purpose | Key Files |\n|---------|---------|-----------|\n| `packages/core` | Main automation engine with AI agent capabilities | `lib/v3/agent/*`, `lib/v3/understudy/*` |\n| `packages/cli` | Command-line interface for browser control | `CHANGELOG.md`, README commands |\n\n资料来源：[packages/cli/CHANGELOG.md:1-15]()\n\n## Project Structure\n\n```\nstagehand/\n├── packages/\n│   ├── core/     
               # Core automation package\n│   │   ├── lib/\n│   │   │   ├── v3/\n│   │   │   │   ├── agent/       # AI agent implementation\n│   │   │   │   │   ├── MicrosoftCUAClient.ts\n│   │   │   │   │   └── prompts/\n│   │   │   │   │       └── agentSystemPrompt.ts\n│   │   │   │   └── understudy/  # Context and frame management\n│   │   │   │       ├── context.ts\n│   │   │   │       └── frameLocator.ts\n│   │   │   └── inferenceLogUtils.ts\n│   │   └── README.md\n│   └── cli/                     # CLI tooling\n│       ├── CHANGELOG.md\n│       └── README.md\n├── package.json\n├── pnpm-workspace.yaml\n└── turbo.json\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:1-50]()\n\n## Agent System Architecture\n\nThe agent system is the brain of Stagehand, responsible for interpreting user instructions and generating appropriate browser actions.\n\n### Microsoft CUA Integration\n\nStagehand implements a Microsoft CUA (Computer Use Agent) client that provides structured function calling capabilities:\n\n```mermaid\ngraph LR\n    A[User Instruction] --> B[MicrosoftCUAClient]\n    B --> C[Action Generation]\n    C --> D[left_click]\n    C --> E[scroll]\n    C --> F[visit_url]\n    C --> G[type]\n    C --> H[wait]\n    C --> I[web_search]\n```\n\n#### Supported Actions\n\n| Action | Parameters | Description |\n|--------|------------|-------------|\n| `left_click` | `coordinate: [x, y]` | Click at specified coordinates |\n| `scroll` | `pixels: number` | Scroll up (positive) or down (negative) |\n| `visit_url` | `url: string` | Navigate to a URL |\n| `type` | `text, press_enter, delete_existing_text` | Type text into input fields |\n| `wait` | `time: number` | Wait for specified seconds |\n| `web_search` | `query: string` | Perform a web search |\n| `history_back` | - | Navigate back in browser history |\n| `pause_and_memorize_fact` | `fact: string` | Store information for later use |\n| `terminate` | `status: success/failure` | End the task execution 
|\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:1-80]()\n\n### System Prompt Structure\n\nThe agent uses a structured XML-based system prompt that organizes instructions into distinct sections:\n\n```mermaid\ngraph TD\n    P[System Prompt] --> A[Identity]\n    P --> T[Task Definition]\n    P --> M[Mindset Guidelines]\n    P --> G[Action Guidelines]\n    P --> N[Navigation Rules]\n    P --> S[Strategy]\n    P --> R[Roadblocks]\n    P --> V[Variables]\n    P --> C[Completion]\n```\n\nThe prompt templates include conditional sections for:\n- **Page Understanding Protocol**: Instructions for analyzing page state\n- **Search Integration**: Optional search tool usage when URL confidence is low\n- **Variable Substitution**: Support for `%variableName%` syntax in form interactions\n- **Captcha Handling**: Optional roadblocks section when auto-solving is enabled\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:50-150]()\n\n## Browser Context Management\n\nStagehand manages browser contexts through the `understudy` module, which handles complex scenarios like iframe interactions and multi-page workflows.\n\n### Frame Locator System\n\nThe frame locator handles cross-origin iframe communication and lifecycle events:\n\n```mermaid\nsequenceDiagram\n    Participant Parent as Parent Page\n    Participant Child as Child Frame\n    Participant Context as BrowserContext\n    \n    Parent->>Context: Create Frame\n    Context->>Child: Register Lifecycle Events\n    Child-->>Parent: DOMContentLoaded\n    Parent->>Context: Wait for MainWorld\n    Context-->>Parent: Execution Context Ready\n```\n\nKey responsibilities include:\n- Resolving owning `Page` by main frame ID\n- Applying initialization scripts to pages\n- Managing lifecycle events (DOMContentLoaded, load, networkIdle)\n- Handling OOPIF (Out-of-Process iframe) scenarios\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:1-50](), [packages/core/lib/v3/understudy/frameLocator.ts:1-40]()\n\n## 
Inference and Caching\n\nStagehand implements an inference logging system that enables self-healing automation by caching and reusing action results.\n\n### Summary File Structure\n\n```\n<inferenceType>_summary.json\n```\n\nWhere `inferenceType` can be:\n- `act_summary`: Action execution results\n- `observe_summary`: Page observation results\n- `extract_summary`: Data extraction results\n\nThe system reads and writes JSON files containing arrays of inference results, enabling:\n- **Action Replay**: Re-execute previously successful actions\n- **Self-Healing**: Automatically recover from website changes\n- **Token Optimization**: Skip LLM inference when cached results are valid\n\n资料来源：[packages/core/lib/inferenceLogUtils.ts:1-60]()\n\n## CLI Architecture\n\nThe Stagehand CLI provides a command-line interface for browser automation with support for both local and remote browser execution.\n\n### Execution Modes\n\n| Mode | Detection | Description |\n|------|-----------|-------------|\n| `remote` | `BROWSERBASE_API_KEY` is set | Uses Browserbase cloud infrastructure |\n| `local` | Default fallback | Uses local Playwright browser |\n\nThe daemon-based architecture ensures:\n- Persistent browser sessions\n- Automatic recovery on failures\n- Session state preservation across commands\n\n资料来源：[packages/cli/README.md:1-100]()\n\n### CLI Command Categories\n\n#### Navigation Commands\n```bash\nbrowse open <url> [--wait load|domcontentloaded|networkidle]\nbrowse reload\nbrowse back\nbrowse forward\n```\n\n#### Interaction Commands\n```bash\nbrowse click <ref>\nbrowse type <text>\nbrowse press <key>\nbrowse fill <selector> <value>\n```\n\n#### Information Commands\n```bash\nbrowse get url|title|text|html|value|box\nbrowse snapshot [-c|--compact]\nbrowse screenshot\n```\n\n#### Session Management\n```bash\nbrowse start|stop|restart\nbrowse env local|remote\nbrowse attach <port|url>\n```\n\n资料来源：[packages/cli/README.md:100-200]()\n\n## Getting Started\n\n### Installation from 
Source\n\n```bash\ngit clone https://github.com/browserbase/stagehand.git\ncd stagehand\npnpm install\npnpm run build\npnpm run example\n```\n\n### Branch Installation\n\nUsing [gitpkg](https://github.com/EqualMa/gitpkg), install directly from a branch:\n\n```json\n\"@browserbasehq/stagehand\": \"https://gitpkg.now.sh/browserbase/stagehand/packages/core?<branchName>\"\n```\n\n### Environment Configuration\n\nCreate a `.env` file from the example:\n\n```bash\ncp .env.example .env\n# Add your API keys:\n# BROWSERBASE_API_KEY=your_key\n# OPENAI_API_KEY=your_key (or other LLM provider)\n```\n\n资料来源：[README.md:50-80](), [packages/core/README.md:50-80]()\n\n## Version History\n\nRecent significant changes in the CLI package:\n\n| Version | Change | PR |\n|---------|--------|-----|\n| 0.4.2 | Added `browse get markdown` command for HTML-to-markdown conversion | [#1907](https://github.com/browserbase/stagehand/pull/1907) |\n| 0.4.1 | Fixed invalid metadata key using underscore | [#1911](https://github.com/browserbase/stagehand/pull/1911) |\n| 0.4.0 | Added new feature | [#1889](https://github.com/browserbase/stagehand/pull/1889) |\n\n资料来源：[packages/cli/CHANGELOG.md:1-20]()\n\n## License and Community\n\nStagehand is released under the **MIT License**. 
The project maintains an active community through:\n\n- **Documentation**: [docs.stagehand.dev](https://docs.stagehand.dev)\n- **Discord Community**: [stagehand.dev/discord](https://stagehand.dev/discord)\n- **DeepWiki Integration**: Ask questions at [deepwiki.com/browserbase/stagehand](https://deepwiki.com/browserbase/stagehand)\n\nA Python implementation is also available at [github.com/browserbase/stagehand-python](https://github.com/browserbase/stagehand-python).\n\n资料来源：[README.md:1-30](), [packages/core/README.md:1-30]()\n\n---\n\n<a id='architecture-overview'></a>\n\n## Architecture Overview\n\n### 相关页面\n\n相关主题：[Project Introduction](#project-introduction), [CDP Engine](#cdp-engine), [Core Actions](#core-actions), [Server API](#server-api)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n- [packages/core/lib/v3/understudy/frameLocator.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n- [packages/core/lib/v3/agent/MicrosoftCUAClient.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n- [README.md](https://github.com/browserbase/stagehand/blob/main/README.md)\n- [packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n\n</details>\n\n# Architecture Overview\n\n## Introduction\n\nStagehand is an AI-powered browser automation framework that enables developers to control web browsers using natural language instructions combined with precise code-based control. 
The architecture is designed to bridge the gap between high-level AI agents and low-level browser automation frameworks like Playwright, providing reliability, extensibility, and self-healing capabilities essential for production environments.\n\nThe framework operates on a multi-layered architecture that separates concerns between browser session management, agent reasoning, tool execution, and page interaction primitives. This design allows users to choose when to leverage AI for navigating unfamiliar pages and when to use explicit code for deterministic operations.\n\n## Core Architecture Layers\n\nStagehand's architecture consists of four primary layers working in concert to provide browser automation capabilities:\n\n| Layer | Purpose | Key Components |\n|-------|---------|----------------|\n| **Agent Layer** | High-level reasoning and decision making | MicrosoftCUAClient, System Prompts |\n| **Context Layer** | Browser session and page lifecycle management | BrowserContext, Page management |\n| **Understudy Layer** | Low-level browser primitives and frame handling | FrameLocator, Page interactions |\n| **Transport Layer** | Local/Remote browser connectivity | CDP (Chrome DevTools Protocol) |\n\n## Browser Context Management\n\nThe `BrowserContext` class (defined in `context.ts`) serves as the central hub for managing browser sessions. It handles multiple pages, initialization scripts, and coordinate-based element interactions.\n\n### Page Management\n\nThe context maintains a mapping of pages organized by target ID, enabling multi-tab support:\n\n```typescript\npages(): Page[] {\n  const rows: Array<{ tid: TargetId; page: Page; created: number }> = [];\n  for (const [tid, page] of this.pagesByTarget) {\n    if (this.typeByTarget.get(tid) === \"page\") {\n      rows.push({ tid, page, created: this.createdAtByTarget.get(tid) ?? 
0 });\n    }\n  }\n  rows.sort((a, b) => a.created - b.created);\n  return rows.map((r) => r.page);\n}\n```\n\n资料来源：[packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n\nPages are returned in creation order (oldest to newest), and OOPIF (Out-of-Process iframe) targets are intentionally excluded from this listing to maintain a clean multi-tab abstraction.\n\n### Initialization Scripts\n\nThe context supports both seeding and full registration of initialization scripts:\n\n```typescript\nprivate async applyInitScriptsToPage(\n  page: Page,\n  opts?: { seedOnly?: boolean },\n): Promise<void> {\n  if (opts?.seedOnly) {\n    for (const source of this.initScripts) {\n      page.seedInitScript(source);\n    }\n    return;\n  }\n  for (const source of this.initScripts) {\n    await page.registerInitScript(source);\n  }\n}\n```\n\n资料来源：[packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n\nThis dual-mode initialization allows scripts to be either eagerly registered or lazily seeded for later injection.\n\n## Frame Handling Architecture\n\nStagehand implements sophisticated frame management through the `FrameLocator` module, handling both same-process and cross-process frame scenarios including OOPIFs.\n\n### Lifecycle-Aware Frame Attachment\n\nThe frame attachment process waits for specific lifecycle events before exposing the main world context:\n\n```typescript\nconst onLifecycle = (evt: Protocol.Page.LifecycleEventEvent) => {\n  if (\n    evt.frameId !== childFrameId ||\n    (evt.name !== \"DOMContentLoaded\" &&\n      evt.name !== \"load\" &&\n      evt.name !== \"networkIdle\" &&\n      evt.name !== \"networkidle\")\n  ) {\n    return;\n  }\n  if (hasMainWorldOnParent()) return finish();\n  // ... 
handle frame ownership transfer\n};\n```\n\n资料来源：[packages/core/lib/v3/understudy/frameLocator.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n\nThis approach ensures that automation scripts don't attempt to interact with frames before they have reached a stable state.\n\n## Agent System Design\n\n### System Prompt Architecture\n\nThe agent receives structured prompts that organize its behavior into distinct sections:\n\n```xml\n<system>\n  <identity>You are a web automation assistant...</identity>\n  <task>\n    <goal>${executionInstruction}</goal>\n    <date display=\"local\" iso=\"${isoDate}\">${localeDate}</date>\n  </task>\n  <page>\n    <startingUrl>...</startingUrl>\n  </page>\n  <mindset>...</mindset>\n  <guidelines>...</guidelines>\n  <page_understanding_protocol>...</page_understanding_protocol>\n  <navigation>...</navigation>\n  <tools>...</tools>\n  <strategy>...</strategy>\n  <roadblocks>...</roadblocks>\n  <variables>...</variables>\n  <completion>...</completion>\n</system>\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n\n### Hybrid Mode Page Understanding\n\nThe system supports two modes of page understanding based on the `isHybridMode` flag:\n\n| Mode | Primary Tool | Secondary Tool | Use Case |\n|------|-------------|----------------|----------|\n| **Hybrid** | `screenshot` | `ariaTree` | Visual confirmation + accessible content |\n| **Standard** | `ariaTree` | `screenshot` | Text-focused accessibility tree |\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n\nThe page understanding protocol ensures agents start by comprehending the page state before taking actions:\n\n```xml\n<page_understanding_protocol>\n  <step_1>\n    <title>UNDERSTAND THE PAGE</title>\n    <primary_tool>\n      <name>screenshot|ariaTree</name>\n      <usage>Get complete page context before taking actions</usage>\n    </primary_tool>\n  </step_1>\n</page_understanding_protocol>\n```\n\n### Microsoft CUA Agent Actions\n\nThe Microsoft CUA client defines a comprehensive set of agent actions:\n\n| Action | Purpose | Required Parameters 
|\n|--------|---------|---------------------|\n| `left_click` | Click at coordinate | `coordinate: [x, y]` |\n| `scroll` | Scroll page | `pixels: number`, `coordinate?: [x, y]` |\n| `visit_url` | Navigate to URL | `url: string` |\n| `web_search` | Perform search | `query: string` |\n| `history_back` | Navigate backward | None |\n| `pause_and_memorize_fact` | Store context | `fact: string` |\n| `wait` | Pause execution | `time: number` |\n| `terminate` | End task | `status: success\\|failure` |\n| `type` | Input text | `text: string`, `press_enter?: boolean` |\n| `key` | Keyboard input | `keys: string[]` |\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n\n### FARA Function Calling Protocol\n\nThe agent uses an XML-based function calling format:\n\n```xml\n<tools>\n${toolSchema}\n</tools>\n\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>\n```\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n\nThis format separates agent thoughts from function calls, enabling clear reasoning before action execution.\n\n## Variable System\n\nThe framework supports variable substitution in tool calls using `%variableName%` syntax:\n\n```typescript\nconst variableToolsNote = isHybridMode\n  ? 
\"Use %variableName% syntax in the type, fillFormVision, or act tool's value/text/action fields.\"\n  : \"Use %variableName% syntax in the act or fillForm tool's action fields.\";\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n\nVariables are defined with optional descriptions and rendered in XML format:\n\n```xml\n<variables>\n  <variable name=\"password\" />\n  <variable name=\"username\">The username for login</variable>\n</variables>\n```\n\n## Session Management\n\n### Daemon-Based Architecture\n\nThe CLI architecture uses a persistent daemon for browser session management:\n\n```mermaid\ngraph TD\n    A[CLI Command] --> B[Daemon Check]\n    B -->|Running| C[Send Command]\n    B -->|Not Running| D[Auto-restart Daemon]\n    D --> C\n    C --> E[Browser Instance]\n    E --> F[Page Operations]\n```\n\nThe daemon supports two execution modes:\n\n| Mode | Trigger | Use Case |\n|------|---------|----------|\n| `remote` | `BROWSERBASE_API_KEY` is set | Cloud browser infrastructure |\n| `local` | No API key detected | Local browser instances |\n\n### Multi-Session Support\n\nSessions can be named for parallel browser instances:\n\n```bash\nbrowse --session <name>\n```\n\nThe context stores metadata using session names, enabling clean separation of concurrent automation tasks.\n\n## Tool Categories\n\n### Navigation Tools\n\n| Tool | Description |\n|------|-------------|\n| `open` | Navigate to URL with configurable wait states |\n| `reload` | Refresh current page |\n| `back` / `forward` | Browser history navigation |\n\n### Interaction Tools\n\n| Tool | Description |\n|------|-------------|\n| `click` | Click element by reference or coordinates |\n| `type` | Text input with optional delays |\n| `press` | Keyboard shortcuts |\n| `hover` | Mouse hover at coordinates |\n| `scroll` | Scroll operations with delta support |\n\n### Page Information Tools\n\n| Tool | Description |\n|------|-------------|\n| `snapshot` | Accessibility tree with element refs |\n| `screenshot` | 
Visual capture (PNG/JPEG) |\n| `get` | Retrieve URL, title, text, HTML, values |\n| `get markdown` | Convert HTML to markdown |\n\n## Strategy Guidelines\n\nThe system embeds strategic guidelines for agent behavior:\n\n```xml\n<strategy>\n  <item>CRITICAL: Use extract ONLY when the task explicitly requires structured data output</item>\n  <item>Keep actions atomic and verify outcomes before proceeding</item>\n  <item>For each action, provide clear reasoning about why you're taking that step</item>\n  <item>When you need to input text, prefer using the keys tool to type the entire sequence at once</item>\n</strategy>\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n\n## Configuration Options\n\nThe agent system prompt accepts the following configuration parameters:\n\n| Parameter | Type | Purpose |\n|-----------|------|---------|\n| `executionInstruction` | string | Task description for the agent |\n| `url` | string | Starting page URL |\n| `systemInstructions` | string | Custom system-level instructions (CDATA) |\n| `variables` | Record | Variable name/value pairs |\n| `isHybridMode` | boolean | Enable screenshot-first page understanding |\n| `captchasAutoSolve` | boolean | Enable CAPTCHA auto-solving (shows roadblocks section) |\n| `hasSearch` | boolean | Enable search tool usage |\n| `isLocal` | boolean | Local vs remote browser context |\n\n## Error Handling\n\nThe context manages HTTP header configuration errors with detailed session reporting:\n\n```typescript\nconst failures = Array.from(result.errors.entries()).map(([target, entry]) => {\n  const reason = entry.result.reason as Error;\n  const sid = entry.session.id ?? \"unknown\";\n  const message = reason?.message ?? String(reason);\n  return `session=${sid} error=${message}`;\n});\n\nif (failures.length) {\n  throw new StagehandSetExtraHTTPHeadersError(failures);\n}\n```\n\n资料来源：[packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n\n## Extension Points\n\nStagehand provides several extension mechanisms:\n\n1. 
**Custom Instructions**: Via `systemInstructions` parameter with CDATA wrapping for complex multi-line content\n2. **Variables**: For sensitive data injection (passwords, tokens)\n3. **Init Scripts**: JavaScript injection at page load time\n4. **Tool Schema**: Extensible action definitions in the Microsoft CUA client\n\n---\n\n<a id='cdp-engine'></a>\n\n## CDP Engine\n\n### 相关页面\n\n相关主题：[Architecture Overview](#architecture-overview), [DOM and Accessibility Tree](#dom-accessibility), [Core Actions](#core-actions)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/understudy/cdp.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/cdp.ts)\n- [packages/core/lib/v3/understudy/frameRegistry.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameRegistry.ts)\n- [packages/core/lib/v3/understudy/executionContextRegistry.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/executionContextRegistry.ts)\n- [packages/core/lib/v3/launch/local.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/launch/local.ts)\n- [packages/core/lib/v3/launch/browserbase.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/launch/browserbase.ts)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n- [packages/core/lib/v3/understudy/cookies.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/cookies.ts)\n- [packages/core/lib/v3/understudy/frameLocator.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n</details>\n\n# CDP Engine\n\nThe CDP (Chrome DevTools Protocol) Engine is the low-level abstraction layer in Stagehand that provides direct communication between the framework and web browsers. 
It orchestrates browser launching, session management, frame handling, execution context management, and CDP command dispatching.\n\n## Architecture Overview\n\n```mermaid\ngraph TD\n    A[Stagehand API] --> B[CDP Engine]\n    B --> C[Launch Module]\n    B --> D[Session Manager]\n    B --> E[Frame Registry]\n    B --> F[Execution Context Registry]\n    C --> G[Local Browser]\n    C --> H[Browserbase Cloud]\n    D --> I[CDP WebSocket Connections]\n    E --> J[Frame Lifecycle Events]\n    F --> K[Isolated JS Contexts]\n```\n\nThe CDP Engine serves as the foundation layer that Stagehand's high-level browser automation primitives are built upon. It abstracts away the complexity of raw CDP WebSocket communication while providing type-safe interfaces for all browser operations.\n\n资料来源：[packages/core/lib/v3/understudy/cdp.ts:1-50]()\n\n## Core Components\n\n### Launch Module\n\nThe launch module handles browser instantiation across different deployment environments.\n\n| Launch Mode | Source File | Description |\n|-------------|-------------|-------------|\n| Local Browser | `packages/core/lib/v3/launch/local.ts` | Launches Chromium locally using Playwright's browser management |\n| Browserbase Cloud | `packages/core/lib/v3/launch/browserbase.ts` | Connects to remote browser infrastructure for scalable execution |\n\n```mermaid\ngraph LR\n    A[Launch Request] --> B{Local or Remote?}\n    B -->|Local| C[local.ts]\n    B -->|Remote| D[browserbase.ts]\n    C --> E[Playwright Browser]\n    D --> F[CDP Endpoint]\n```\n\nThe local launch implementation uses Playwright's browser management system to spawn Chromium instances with the necessary debugging flags enabled. 
Remote launches establish WebSocket connections to Browserbase's cloud browser fleet.\n\n资料来源：[packages/core/lib/v3/launch/local.ts:1-100]()\n\n### Session Manager\n\nThe Session Manager (`context.ts`) maintains active browser pages and their associated CDP sessions.\n\n```typescript\n// Pages retrieval - returns top-level pages oldest to newest\npages(): Page[] {\n  const rows: Array<{ tid: TargetId; page: Page; created: number }> = [];\n  for (const [tid, page] of this.pagesByTarget) {\n    if (this.typeByTarget.get(tid) === \"page\") {\n      rows.push({ tid, page, created: this.createdAtByTarget.get(tid) ?? 0 });\n    }\n  }\n  rows.sort((a, b) => a.created - b.created);\n  return rows.map((r) => r.page);\n}\n```\n\nThe session manager intentionally excludes OOPIF (Out-of-Process iframe) targets from page listings, focusing on top-level browsing contexts.\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:60-75]()\n\n### Frame Registry\n\nThe Frame Registry (`frameRegistry.ts`) tracks all frame instances across page hierarchies. 
It maintains the relationship between parent and child frames, enabling accurate frame targeting for CDP commands.\n\n| Method | Purpose |\n|--------|---------|\n| `registerFrame` | Track newly created frames |\n| `unregisterFrame` | Clean up destroyed frames |\n| `getParentFrame` | Resolve parent frame context |\n| `getChildFrames` | List direct children of a frame |\n\n### Execution Context Registry\n\nThe Execution Context Registry (`executionContextRegistry.ts`) manages JavaScript execution contexts within frames.\n\n```mermaid\ngraph TD\n    A[Page Load] --> B[Create Main World Context]\n    B --> C[Optional: Create Isolated World]\n    C --> D[Register Context in Registry]\n    D --> E[Frame Ready for Script Execution]\n    \n    F[iframe Navigation] --> G[Create New Context]\n    G --> D\n```\n\nEach frame can have multiple execution contexts:\n- **Main World**: The default JavaScript context where page scripts run\n- **Isolated World**: Sandboxed contexts for extension scripts or injected code\n\n资料来源：[packages/core/lib/v3/understudy/executionContextRegistry.ts:1-80]()\n\n## Frame Lifecycle Management\n\nThe `frameLocator.ts` module handles the complex state transitions of browser frames, particularly for iframes and cross-origin navigations.\n\n```typescript\nconst onLifecycle = (evt: Protocol.Page.LifecycleEventEvent) => {\n  if (\n    evt.frameId !== childFrameId ||\n    (evt.name !== \"DOMContentLoaded\" &&\n      evt.name !== \"load\" &&\n      evt.name !== \"networkIdle\" &&\n      evt.name !== \"networkidle\")\n  ) {\n    return;\n  }\n  // Handle frame initialization\n};\n```\n\nKey lifecycle events monitored:\n- `DOMContentLoaded`: Frame DOM is parsed\n- `load`: All resources loaded\n- `networkIdle` / `networkidle`: No pending network requests\n\n资料来源：[packages/core/lib/v3/understudy/frameLocator.ts:25-40]()\n\n## Cookie Management\n\nThe `cookies.ts` module provides CDP-compatible cookie handling with validation and 
normalization.\n\n```typescript\nexport function normalizeCookieParams(cookies: CookieParam[]): CookieParam[] {\n  return cookies.map((c) => {\n    if (!c.url && !(c.domain && c.path)) {\n      throw new CookieValidationError(\n        `Cookie \"${c.name}\" must have a url or a domain/path pair`\n      );\n    }\n    // Additional validation for secure/sameSite pairing\n  });\n}\n```\n\n| Validation Rule | CDP Requirement |\n|-----------------|-----------------|\n| URL or Domain/Path | Cookies must have either `url` or both `domain`+`path` |\n| Secure + SameSite=None | Browsers require `secure: true` when using `sameSite: \"None\"` |\n| Host-only cookies | When `url` provided, `domain` and `path` are derived |\n\nThe module enforces these constraints before sending cookies to CDP, preventing silent failures from browser rejections.\n\n资料来源：[packages/core/lib/v3/understudy/cookies.ts:30-60]()\n\n## CDP Command Dispatch\n\nThe CDP Engine provides a unified interface for sending commands to browsers:\n\n```mermaid\nsequenceDiagram\n    Client->>CDP Engine: Execute CDP Command\n    CDP Engine->>Validation: Check Parameters\n    Validation->>Session Manager: Route to Correct Session\n    Session Manager->>CDP WebSocket: Send Command\n    CDP WebSocket-->>Session Manager: CDP Response\n    Session Manager-->>CDP Engine: Typed Response\n    CDP Engine-->>Client: Result\n```\n\n### Supported Command Categories\n\n| Category | Capabilities |\n|----------|--------------|\n| Page Operations | Navigation, reload, back/forward |\n| Frame Management | Frame creation, destruction, activation |\n| Script Execution | Evaluate expressions, call functions |\n| Input Handling | Mouse, keyboard, touch events |\n| Network Monitoring | Request/response capture |\n| Runtime Inspection | Console access, breakpoints |\n\n## Initialization Scripts\n\nThe CDP Engine supports injecting initialization scripts into pages:\n\n```typescript\nprivate async applyInitScriptsToPage(\n  page: Page,\n  
opts?: { seedOnly?: boolean },\n): Promise<void> {\n  if (opts?.seedOnly) {\n    for (const source of this.initScripts) {\n      page.seedInitScript(source);\n    }\n    return;\n  }\n  for (const source of this.initScripts) {\n    await page.registerInitScript(source);\n  }\n}\n```\n\n| Script Type | Timing | Use Case |\n|-------------|--------|----------|\n| `seedInitScript` | Before page loads | Pre-inject polyfills, shims |\n| `registerInitScript` | After page load | Runtime modifications |\n\n## Error Handling\n\nThe CDP Engine implements robust error handling for common CDP failure scenarios:\n\n```typescript\n// Session-level error collection\nconst failures = pendingEntries\n  .filter((entry) => entry.result.status === \"failure\")\n  .map((entry) => {\n    const reason = entry.result.reason as Error;\n    const sid = entry.session.id ?? \"unknown\";\n    const message = reason?.message ?? String(reason);\n    return `session=${sid} error=${message}`;\n  });\n\nif (failures.length) {\n  throw new StagehandSetExtraHTTPHeadersError(failures);\n}\n```\n\nCustom error types:\n- `StagehandSetExtraHTTPHeadersError`: Failed header injection\n- `CookieValidationError`: Invalid cookie parameters\n- `ExecutionContextError`: Script execution failures\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:45-55]()\n\n## Integration with Agent System\n\nThe CDP Engine is used by higher-level agents (like MicrosoftCUAClient) to perform browser automation:\n\n```typescript\n// Supported actions in Microsoft CUA integration\nconst supportedActions = [\n  \"left_click\",\n  \"scroll\",\n  \"visit_url\",\n  \"web_search\",\n  \"history_back\",\n  \"pause_and_memorize_fact\",\n  \"wait\",\n  \"terminate\",\n];\n```\n\nEach action maps to specific CDP commands, with the CDP Engine abstracting the protocol-level details.\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:50-70]()\n\n## Configuration\n\n| Option | Type | Default | Description 
|\n|--------|------|---------|-------------|\n| `headless` | boolean | auto | Run browser in headless mode |\n| `timeout` | number | 30000 | Default command timeout (ms) |\n| `viewport` | Viewport | 1280x720 | Browser viewport size |\n| `userAgent` | string | auto | Custom user agent string |\n| `ignoreHTTPSErrors` | boolean | false | Allow invalid certificates |\n\n## Summary\n\nThe CDP Engine is the foundational layer that enables Stagehand's browser automation capabilities. It provides:\n\n1. **Abstraction**: Type-safe interfaces over raw CDP WebSocket communication\n2. **Multi-environment**: Support for both local and cloud browser execution\n3. **Lifecycle management**: Frame and execution context tracking\n4. **Validation**: Cookie and parameter normalization before CDP commands\n5. **Error handling**: Structured error types and recovery mechanisms\n\nAll high-level browser automation features in Stagehand build upon the CDP Engine's primitives, making it essential for understanding the framework's internals.\n\n---\n\n<a id='core-actions'></a>\n\n## Core Actions\n\n### 相关页面\n\n相关主题：[CDP Engine](#cdp-engine), [DOM and Accessibility Tree](#dom-accessibility), [Agent System](#agent-system)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/agent/MicrosoftCUAClient.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n- [packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [packages/core/lib/v3/handlers/actHandler.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/handlers/actHandler.ts) (referenced)\n- [packages/core/lib/v3/handlers/extractHandler.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/handlers/extractHandler.ts) (referenced)\n- 
[packages/core/lib/v3/handlers/observeHandler.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/handlers/observeHandler.ts) (referenced)\n- [packages/core/lib/v3/agent/tools/index.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/tools/index.ts) (referenced)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n</details>\n\n# Core Actions\n\n## Overview\n\nCore Actions are the fundamental building blocks of browser automation in Stagehand. They represent the primitive operations that the AI agent can perform to interact with web pages, extract information, and accomplish user-defined tasks. These actions bridge the gap between natural language instructions and executable browser operations.\n\nThe Core Actions system is designed around a modular architecture where different handlers manage specific types of interactions. Each handler is responsible for a category of actions, implementing the logic to translate high-level agent decisions into low-level browser commands.\n\n资料来源：[packages/core/lib/v3/handlers/v3AgentHandler.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/handlers/v3AgentHandler.ts)\n\n## Architecture\n\nThe Core Actions system follows a layered architecture:\n\n```mermaid\ngraph TD\n    A[User Instruction] --> B[Agent Handler]\n    B --> C[Action Router]\n    C --> D[actHandler]\n    C --> E[extractHandler]\n    C --> F[observeHandler]\n    D --> G[Browser CDP Commands]\n    E --> G\n    F --> G\n    H[Microsoft CUA Client] --> G\n```\n\n### Handler Components\n\n| Handler | Purpose | Key Operations |\n|---------|---------|-----------------|\n| `actHandler` | Execute user interaction actions | click, type, scroll, hover, drag |\n| `extractHandler` | Extract structured data from pages | DOM parsing, content extraction |\n| `observeHandler` | Analyze and understand page state | 
accessibility tree, element detection |\n| `v3AgentHandler` | Orchestrate agent workflow | task planning, action coordination |\n\n资料来源：[packages/core/lib/v3/handlers/actHandler.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/handlers/actHandler.ts)\n\n## Action Types\n\n### Browser Navigation Actions\n\n| Action | Parameters | Description |\n|--------|------------|-------------|\n| `visit_url` | `url`, `wait` | Navigate to a specified URL with optional wait state |\n| `history_back` | - | Navigate back in browser history |\n| `history_forward` | - | Navigate forward in browser history |\n| `reload` | - | Reload the current page |\n\n### Interaction Actions\n\n| Action | Parameters | Description |\n|--------|------------|-------------|\n| `left_click` | `coordinate`, `element` | Click at coordinates or on an element |\n| `right_click` | `coordinate`, `element` | Perform right-click context menu action |\n| `mouse_move` | `coordinate` | Move mouse to specified position |\n| `scroll` | `pixels`, `coordinate` | Scroll by pixel amount (positive=up, negative=down) |\n| `drag` | `from`, `to`, `steps` | Perform drag operation with interpolation |\n| `type` | `text`, `press_enter`, `delete_existing_text` | Type text into focused input fields |\n\n### Information Retrieval Actions\n\n| Action | Parameters | Description |\n|--------|------------|-------------|\n| `extract` | `schema` | Extract structured data matching a Zod schema |\n| `screenshot` | `full_page` | Capture visual representation of page |\n| `aria_tree` | `ref` | Get accessibility tree for element inspection |\n\n### Utility Actions\n\n| Action | Parameters | Description |\n|--------|------------|-------------|\n| `wait` | `time` | Pause execution for specified seconds |\n| `pause_and_memorize_fact` | `fact` | Store information for later retrieval |\n| `web_search` | `query` | Execute a web search |\n| `terminate` | `status` | End the task with success or failure status 
|\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:1-100](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n\n## Action Parameters\n\n### Common Parameters\n\n```typescript\ninterface ActionParameters {\n  action: string;           // The action type to perform\n  element?: string;          // Element reference (e.g., \"@0-5\")\n  coordinate?: [number, number]; // [x, y] pixel coordinates\n  ref?: string;             // Element reference in ariaTree\n}\n```\n\n### Type Action Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `text` | `string` | Text to type into the input field |\n| `press_enter` | `boolean` | Whether to press Enter after typing |\n| `delete_existing_text` | `boolean` | Clear existing text before typing |\n\n### Scroll Action Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `pixels` | `number` | Positive values scroll up, negative scroll down |\n| `coordinate` | `[number, number]` | Optional target coordinates for viewport-relative scrolling |\n\n### Extract Action Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `schema` | `ZodSchema` | Zod schema defining the extraction structure |\n| `prompt` | `string` | Natural language description of data to extract |\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:20-85](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n\n## Tool Schema\n\nThe agent uses a FARA (Foundation Agent Reference Architecture) function calling template for tool execution:\n\n```\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n${toolSchema}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{{\"name\": <function-name>, \"arguments\": 
<args-json-object>}}\n</tool_call>\n```\n\nThe tool schema defines the complete contract between the agent reasoning system and the browser automation layer:\n\n```typescript\n{\n  \"name\": \"browser_automation\",\n  \"description\": \"Primary tool for browser automation tasks\",\n  \"parameters\": {\n    \"type\": \"object\",\n    \"properties\": {\n      \"action\": {\n        \"type\": \"string\",\n        \"enum\": [\"left_click\", \"scroll\", \"visit_url\", ...]\n      },\n      \"coordinate\": {\n        \"type\": \"array\",\n        \"description\": \"(x, y): The x and y coordinates for mouse operations\"\n      },\n      \"pixels\": {\n        \"type\": \"number\",\n        \"description\": \"Scroll amount; positive = up, negative = down\"\n      }\n    },\n    \"required\": [\"action\"]\n  }\n}\n```\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:85-130](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n\n## Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Agent\n    participant Handler\n    participant CDP as Chrome DevTools Protocol\n    \n    User->>Agent: Natural language instruction\n    Agent->>Agent: Reason and plan action\n    Agent->>Handler: Execute action with parameters\n    Handler->>CDP: Translate to CDP command\n    CDP-->>Handler: Operation result\n    Handler-->>Agent: Action completion status\n    Agent->>User: Task progress update\n```\n\n## Page Context Management\n\nThe Context class manages page state and frame handling for multi-tab scenarios:\n\n```typescript\npages(): Page[] {\n  const rows: Array<{ tid: TargetId; page: Page; created: number }> = [];\n  for (const [tid, page] of this.pagesByTarget) {\n    if (this.typeByTarget.get(tid) === \"page\") {\n      rows.push({ tid, page, created: this.createdAtByTarget.get(tid) ?? 
0 });\n    }\n  }\n  rows.sort((a, b) => a.created - b.created);\n  return rows.map((r) => r.page);\n}\n```\n\nThe context system maintains:\n- Active page instances by target ID\n- Frame lifecycle management\n- Execution context tracking per frame\n- Initialization script injection\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:1-50](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n\n## Frame Locator\n\nFor handling nested frames and iframes, the frame locator system waits for frame readiness:\n\n```mermaid\ngraph TD\n    A[Parent Page] --> B{Has Main World?}\n    B -->|No| C[Enable Lifecycle Events]\n    C --> D[Wait for DOMContentLoaded]\n    D --> E{Has Main World?}\n    E -->|Yes| F[Continue]\n    E -->|No| G[Wait with timeout]\n    G --> H[Get Session for Frame]\n    H --> I[Wait for Main World]\n    I --> F\n    B -->|Yes| F\n```\n\n资料来源：[packages/core/lib/v3/understudy/frameLocator.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n\n## System Prompt Configuration\n\nThe agent's behavior is configured through system prompts that include:\n\n- **Task Definition**: The execution instruction and goal context\n- **Page Understanding Protocol**: When to use ariaTree vs screenshot\n- **Strategy Guidelines**: Best practices for action sequencing\n- **Variable Substitution**: Support for dynamic values via `%variableName%` syntax\n\n```typescript\nconst systemPrompt = `<system>\n  <identity>You are a web automation assistant using browser automation tools</identity>\n  <task>\n    <goal>${cdata(executionInstruction)}</goal>\n    <date display=\"local\" iso=\"${isoDate}\">${localeDate}</date>\n  </task>\n  ${customInstructionsBlock}\n  <page_understanding_protocol>\n    ${isHybridMode ? 
screenshot + ariaTree : ariaTree + screenshot}\n  </page_understanding_protocol>\n  ${variablesSection}\n</system>`;\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n\n## Error Handling\n\nActions can fail due to various reasons:\n\n| Error Type | Cause | Recovery Strategy |\n|------------|-------|-------------------|\n| Element not found | Selector invalid or element removed | Re-observe page, retry with updated selector |\n| Action timeout | Page not responding | Wait and retry |\n| Frame detached | Frame navigated away | Re-acquire frame context |\n| CDP error | Protocol communication failure | Retry CDP command |\n\nFailed actions are logged with session context for debugging:\n\n```typescript\nconst failures = await Promise.allSettled(\n  this.context.extraHTTPHeaders.map(async ({ entry }) => {\n    const result = await entry.result;\n    const reason = entry.result.reason as Error;\n    const message = reason?.message ?? String(reason);\n    return `session=${sid} error=${message}`;\n  })\n);\n```\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:50-80](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n\n## Best Practices\n\n1. **Atomic Actions**: Keep actions focused and single-purpose for better reliability\n2. **Element References**: Use ariaTree references (e.g., `@0-5`) when available for precise targeting\n3. **Wait Appropriately**: Allow page state to stabilize before subsequent actions\n4. **Error Recovery**: The system supports automatic retry and self-healing mechanisms\n5. 
**Variable Usage**: Use `%variableName%` for sensitive data like passwords\n\n## See Also\n\n- [Agent Handler](packages/core/lib/v3/handlers/v3AgentHandler.ts)\n- [Microsoft CUA Client](packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n- [Understudy Context](packages/core/lib/v3/understudy/context.ts)\n- [Frame Locator](packages/core/lib/v3/understudy/frameLocator.ts)\n\n---\n\n<a id='dom-accessibility'></a>\n\n## DOM and Accessibility Tree\n\n### 相关页面\n\n相关主题：[CDP Engine](#cdp-engine), [Core Actions](#core-actions), [DOM and Accessibility Tree](#dom-accessibility)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/dom/index.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/dom/index.ts)\n- [packages/core/lib/v3/dom/piercer.entry.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/dom/piercer.entry.ts)\n- [packages/core/lib/v3/dom/piercer.runtime.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/dom/piercer.runtime.ts)\n- [packages/core/lib/v3/understudy/a11y/snapshot/index.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/a11y/snapshot/index.ts)\n- [packages/core/lib/v3/dom/locatorScripts/index.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/dom/locatorScripts/index.ts)\n</details>\n\n# DOM and Accessibility Tree\n\n## Overview\n\nThe DOM and Accessibility Tree system in Stagehand provides the foundational mechanism for understanding and interacting with web pages. This dual-layer approach combines native DOM manipulation with accessibility tree analysis to enable reliable browser automation through AI agents.\n\nThe system serves two primary purposes:\n1. **DOM Interaction** - Direct manipulation of page elements including clicking, typing, scrolling, and form handling\n2. 
**Accessibility Tree (ariaTree)** - Structured representation of page content optimized for AI comprehension and element discovery\n\nThis architecture allows Stagehand to balance the precision of programmatic DOM access with the semantic understanding provided by accessibility APIs.\n\n---\n\n## Architecture\n\n### Component Overview\n\n```mermaid\ngraph TD\n    A[AI Agent] -->|Requests Page Context| B[Accessibility Tree Snapshot]\n    A -->|Interacts with Elements| C[DOM Locator Scripts]\n    \n    B --> D[ariaTree Tool]\n    D -->|Returns Full Page Structure| A\n    \n    C --> E[Element Locator Scripts]\n    E -->|Executes in Browser Context| F[Target Web Page]\n    \n    F -->|Accessibility APIs| B\n    F -->|DOM APIs| C\n    \n    G[Piercer Module] -->|Runtime Injection| F\n```\n\n### Core Modules\n\n| Module | File Path | Purpose |\n|--------|-----------|---------|\n| DOM Entry | `packages/core/lib/v3/dom/index.ts` | Main export and module initialization |\n| Piercer Entry | `packages/core/lib/v3/dom/piercer.entry.ts` | Browser extension entry point |\n| Piercer Runtime | `packages/core/lib/v3/dom/piercer.runtime.ts` | Runtime script injection |\n| A11y Snapshot | `packages/core/lib/v3/understudy/a11y/snapshot/index.ts` | Accessibility tree generation |\n| Locator Scripts | `packages/core/lib/v3/dom/locatorScripts/index.ts` | Element interaction scripts |\n\n---\n\n## Accessibility Tree\n\n### Purpose and Role\n\nThe accessibility tree provides a semantic, structured view of the web page that AI agents can easily parse and understand. 
Unlike raw DOM inspection, the accessibility tree filters and organizes content based on interactive significance.\n\nAccording to the agent system prompt, the ariaTree tool serves as the **primary tool** for page understanding in standard mode:\n\n> `<primary_tool><name>ariaTree</name><usage>Get complete page context before taking actions</usage><benefit>Eliminates the need to scroll and provides full accessible content</benefit></primary_tool>` 资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n\n### Tool Integration\n\nIn **hybrid mode**, ariaTree serves as the secondary tool:\n\n```xml\n<page_understanding_protocol>\n  <step_1>\n    <title>UNDERSTAND THE PAGE</title>\n    <primary_tool>\n      <name>screenshot</name>\n      <usage>Visual confirmation when needed. Ideally after navigating to a new page.</usage>\n    </primary_tool>\n    <secondary_tool>\n      <name>ariaTree</name>\n      <usage>Get complete page context before taking actions</usage>\n    </secondary_tool>\n  </step_1>\n</page_understanding_protocol>\n```\n\n### Snapshot Generation\n\nThe accessibility snapshot system processes the page to generate a structured tree containing:\n\n- Interactive elements (buttons, links, inputs)\n- Semantic relationships (labels, descriptions)\n- ARIA attributes and roles\n- Element references for direct interaction\n\nThe snapshot can be retrieved in two formats:\n- **Full snapshot**: Complete accessibility tree with all metadata\n- **Compact snapshot**: Condensed format via `-c|--compact` flag 资料来源：[packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n\n---\n\n## DOM Locator Scripts\n\n### Scroll Functionality\n\nThe locator scripts system provides element-level scrolling capabilities. 
Scroll behavior differs based on element type:\n\n```mermaid\ngraph TD\n    A[Scroll Request] --> B{Element Type?}\n    B -->|html/body| C[Window Scroll]\n    B -->|Other Elements| D[Element Scroll]\n    \n    C --> E[Get Scrolling Element]\n    D --> F[Get Element Dimensions]\n    \n    E --> G[Calculate: scrollHeight - viewportHeight]\n    F --> H[Calculate: scrollHeight - clientHeight]\n    \n    G --> I[Scroll to Percentage Position]\n    H --> I\n    \n    I --> J[behavior: smooth]\n```\n\n### Scroll Implementation Details\n\n```typescript\nconst scrollWindow = tag === \"html\" || tag === \"body\";\nif (scrollWindow) {\n  const root = element.ownerDocument?.scrollingElement || ...;\n  const scrollHeight = root?.scrollHeight ?? ...;\n  const viewportHeight = element.ownerDocument?.defaultView?.innerHeight;\n  const maxTop = Math.max(0, scrollHeight - viewportHeight);\n  const top = maxTop * (pct / 100);\n  element.ownerDocument?.defaultView?.scrollTo({ top, behavior: \"smooth\" });\n}\n```\n\nThe scroll percentage calculation: `top = maxTop * (pct / 100)` 资料来源：[packages/core/lib/v3/dom/locatorScripts/scripts.ts]()\n\n### Input Type Handling\n\nThe system categorizes input types to determine the appropriate interaction method:\n\n#### Types for Direct Value Setting\n\nThese input types use `element.value = x` instead of typing:\n\n```typescript\nconst inputTypesToSetValue = new Set([\n  \"color\",\n  \"date\",\n  \"datetime-local\",\n  \"month\",\n  \"range\",\n  \"time\",\n  \"week\",\n]);\n```\n\n#### Types Requiring Typing Simulation\n\nThese types simulate keyboard input for realistic interaction:\n\n```typescript\nconst inputTypesToTypeInto = new Set([\n  \"\",\n  \"email\",\n  \"number\",\n  \"password\",\n  // ... 
additional types\n]);\n```\n\nThis distinction is critical for:\n- Proper form filling behavior\n- Triggering validation events\n- Maintaining input mask compatibility 资料来源：[packages/core/lib/v3/dom/locatorScripts/scripts.ts]()\n\n---\n\n## Piercer Module\n\n### Overview\n\nThe Piercer module enables deep DOM inspection and manipulation capabilities. It consists of two primary components:\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| Entry Point | `piercer.entry.ts` | Browser extension/service worker initialization |\n| Runtime | `piercer.runtime.ts` | Script injection and execution context |\n\n### Runtime Injection\n\n```mermaid\ngraph LR\n    A[Piercer Entry] -->|Initializes| B[Runtime Script]\n    B -->|Injects into| C[Target Page Context]\n    C -->|Provides| D[Enhanced DOM Access]\n```\n\nThe runtime enables:\n- Cross-origin frame access\n- Shadow DOM piercing\n- Protected element interaction\n- Frame navigation and management\n\n---\n\n## Tool Modes and ariaTree Strategy\n\n### Mode Differentiation\n\nStagehand operates in two primary modes that affect ariaTree usage:\n\n| Aspect | Hybrid Mode | Standard Mode |\n|--------|-------------|---------------|\n| Primary Tool | `screenshot` | `ariaTree` |\n| Secondary Tool | `ariaTree` | `screenshot` |\n| Strategy | Visual grounding first | Context-first |\n| Best For | Complex UIs, visual verification | Content extraction, form handling |\n\n### Strategy Guidelines by Mode\n\n**Hybrid Mode Strategy:**\n```xml\n<item>Always use screenshot to get proper grounding of the coordinates you want to type/click into.</item>\n<item>Use ariaTree as a secondary tool when elements aren't visible in screenshot or to get full page context.</item>\n```\n\n**Standard Mode Strategy:**\n```xml\n<item>Always check ariaTree first to understand full page content without scrolling - it shows all elements including those below the fold.</item>\n<item>If an element is present in the ariaTree, use act to interact with 
it directly - this eliminates the need to scroll.</item>\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts]()\n\n---\n\n## CLI Integration\n\n### Accessibility Tree Commands\n\nThe Stagehand CLI provides direct access to accessibility tree functionality:\n\n```bash\nbrowse snapshot [-c|--compact]  # Accessibility tree with refs\n```\n\nThis returns the accessibility tree with element references that can be used in subsequent commands like `browse click @0-5`.\n\n### Element Interaction Commands\n\n| Command | Purpose | Example |\n|---------|---------|---------|\n| `browse click <ref>` | Click by reference | `browse click @0-5` |\n| `browse type <text>` | Type text | `browse type \"hello\"` |\n| `browse fill <selector>` | Fill form field | `browse fill input[name=user] \"name\"` |\n\n资料来源：[packages/cli/README.md]()\n\n---\n\n## Configuration Options\n\n### Scroll Configuration\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| scroll percentage | number | 0-100 | Position to scroll to as percentage of total scrollable height |\n| behavior | string | \"smooth\" | Scroll animation behavior |\n| viewport detection | boolean | auto | Detect viewport vs. full page scroll |\n\n### Input Handling Configuration\n\n| Input Type | Interaction Method | Triggers Events |\n|------------|-------------------|-----------------|\n| Text inputs | Character typing | `input`, `change` |\n| Date pickers | Value setting | `change`, `input` |\n| Range sliders | Value setting | `input`, `change` |\n| Color pickers | Value setting | `change` |\n\n---\n\n## Best Practices\n\n### Using ariaTree Effectively\n\n1. **Always retrieve ariaTree first** when starting a new task to understand page structure\n2. **Use compact mode** (`-c` flag) when you only need element references\n3. **Combine with screenshot** for visual confirmation of element positions\n\n### Element Interaction Guidelines\n\n1. 
**Use ariaTree references** for reliable element targeting when available\n2. **Prefer direct typing** over click-then-type for input fields\n3. **Scroll strategically** - ariaTree may already contain below-fold content\n\n### Scroll Best Practices\n\n1. **Window scrolling** - Use when navigating between page sections\n2. **Element scrolling** - Use when working within a scrollable container\n3. **Percentage-based** - Always specify scroll position as percentage for consistent behavior\n\n---\n\n## Related Documentation\n\n- [Agent System Prompt Configuration](packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [CLI Commands Reference](packages/cli/README.md)\n- [Stagehand Core Documentation](packages/core/README.md)\n\n---\n\n<a id='llm-providers'></a>\n\n## LLM Providers\n\n### 相关页面\n\n相关主题：[Project Introduction](#project-introduction), [Agent System](#agent-system)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/agent/MicrosoftCUAClient.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n- [packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n- [packages/core/lib/v3/understudy/cookies.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/cookies.ts)\n- [packages/core/lib/v3/understudy/frameLocator.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n</details>\n\n# LLM Providers\n\n## Overview\n\nStagehand implements a flexible LLM Provider abstraction that enables browser automation agents to interact with various large language model backends. 
The provider system is designed to standardize how agents execute actions, parse responses, and handle tool invocations across different AI service providers.\n\nThe LLM Provider architecture follows a client-adapter pattern where each provider (OpenAI, Anthropic, Google) implements a common interface while exposing provider-specific capabilities and response formats.\n\n## Architecture\n\n### High-Level Component Flow\n\n```mermaid\ngraph TD\n    A[Agent Client] --> B[LLM Provider Interface]\n    B --> C[OpenAI Client]\n    B --> D[Anthropic Client]\n    B --> E[Google Client]\n    C --> F[OpenAI API]\n    D --> G[Claude API]\n    E --> H[Gemini API]\n    F --> I[Action Execution]\n    G --> I\n    H --> I\n```\n\n### System Prompt Construction\n\nThe agent system prompt is constructed dynamically based on execution context and mode. The `buildAgentSystemPrompt` function in `agentSystemPrompt.ts` assembles the prompt from multiple modular sections:\n\n```typescript\nreturn `<system>\n  <identity>You are a web automation assistant using browser automation tools to accomplish the user's goal.</identity>\n  ${customInstructionsBlock}<task>\n    <goal>${cdata(executionInstruction)}</goal>\n    <date display=\"local\" iso=\"${isoDate}\">${localeDate}</date>\n  </task>\n  ...\n</system>`;\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:1-50]()\n\n## Provider Interface\n\n### Tool Schema Definition\n\nLLM Providers use a standardized tool schema format for function calling. 
The schema defines available browser automation actions:\n\n```typescript\nconst toolSchema = {\n  properties: {\n    action: {\n      type: \"string\",\n      description: \"The action to perform\",\n      enum: [\"screenshot\", \"extract\", \"click\", \"type\", \"wait\", \"scroll\", \n             \"hover\", \"press\", \"goBack\", \"goForward\", \"executeJs\", \n             \"fillForm\", \"fillFormVision\", \"act\", \"ariaTree\", \n             \"pause_and_memorize_fact\", \"terminate\"],\n    },\n    // Additional parameters...\n  },\n};\n```\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:1-100]()\n\n### Function Call Template\n\nProviders implement XML-based tool calling using a standardized template:\n\n```xml\n<tools>\n${toolDescs}\n</tools>\n\n<tool_call>\n{{\"name\": <function-name>, \"arguments\": <args-json-object>}}\n</tool_call>\n```\n\nThis format enables reliable parsing of model responses across different LLM backends.\n\n## Supported Providers\n\n### Microsoft Computer Use Agent (CUA)\n\nThe Microsoft CUA (Computer Use Agent) client targets Microsoft's Fara computer-use model and relies on XML-based tool calling. Each tool call accepts the following parameters:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `action` | string | Action type to execute |\n| `selector` | string | Element selector for DOM operations |\n| `text` | string | Text content for typing or extraction |\n| `reasoning` | string | Model's reasoning for the action |\n| `time` | number | Wait duration in seconds |\n| `status` | string | Task completion status |\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:80-120]()\n\n## Agent Modes\n\nStagehand supports different operational modes that affect how the agent interacts with LLM providers:\n\n### Hybrid Mode\n\nIn hybrid mode, the agent prioritizes visual understanding:\n\n```typescript\nconst pageUnderstandingProtocol = isHybridMode\n  ? 
`<page_understanding_protocol>\n    <step_1>\n      <primary_tool>\n        <name>screenshot</name>\n        <usage>Visual confirmation when needed</usage>\n      </primary_tool>\n      <secondary_tool>\n        <name>ariaTree</name>\n        <usage>Get complete page context before taking actions</usage>\n      </secondary_tool>\n    </step_1>\n  </page_understanding_protocol>`\n  : `<page_understanding_protocol>\n    <step_1>\n      <primary_tool>\n        <name>ariaTree</name>\n        <usage>Get complete page context before taking actions</usage>\n      </primary_tool>\n      <secondary_tool>\n        <name>screenshot</name>\n        <usage>Visual confirmation when needed</usage>\n      </secondary_tool>\n    </step_1>\n  </page_understanding_protocol>`;\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:100-130]()\n\n### Variable Substitution\n\nProviders support variable substitution in tool parameters for sensitive data:\n\n```typescript\nconst variableToolsNote = isHybridMode\n  ? 
\"Use %variableName% syntax in the type, fillFormVision, or act tool's value/text/action fields.\"\n  : \"Use %variableName% syntax in the act or fillForm tool's action fields.\";\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:60-65]()\n\n## Tool Actions\n\n### Available Browser Automation Actions\n\n| Action | Description | Primary Use Case |\n|--------|-------------|------------------|\n| `screenshot` | Capture page screenshot | Visual verification |\n| `extract` | Extract structured data | Data collection tasks |\n| `click` | Click on element | Navigation/interaction |\n| `type` | Type text into input | Form filling |\n| `wait` | Wait for condition | Synchronization |\n| `scroll` | Scroll the page | Content visibility |\n| `ariaTree` | Get accessibility tree | Page structure understanding |\n| `fillForm` | Fill form fields | Multi-field form completion |\n\n## Response Parsing\n\n### Thoughts and Action Extraction\n\nThe provider implements parsing logic to extract model thoughts and function calls from responses:\n\n```typescript\nprivate parseThoughtsAndAction(response: string): {\n  thoughts: string;\n  functionCall: FaraFunctionCall;\n} {\n  try {\n    const parts = response.split(\"<tool_call>\\n\");\n    const thoughts = parts[0].trim();\n    const actionText = parts[1]?.trim() ?? \"\";\n    // Parse JSON action from actionText\n  }\n}\n```\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:150-180]()\n\n## Context Management\n\nLLM Providers operate within a browser context that manages multiple pages and execution environments:\n\n```typescript\npages(): Page[] {\n  const rows: Array<{ tid: TargetId; page: Page; created: number }> = [];\n  for (const [tid, page] of this.pagesByTarget) {\n    if (this.typeByTarget.get(tid) === \"page\") {\n      rows.push({ tid, page, created: this.createdAtByTarget.get(tid) ?? 
0 });\n    }\n  }\n  rows.sort((a, b) => a.created - b.created);\n  return rows.map((r) => r.page);\n}\n```\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:80-95]()\n\n## Security Considerations\n\n### Cookie Handling\n\nProviders interact with secure cookie handling:\n\n```typescript\nexport function normalizeCookieParams(cookies: CookieParam[]): CookieParam[] {\n  return cookies.map((c) => {\n    if (!c.url && !(c.domain && c.path)) {\n      throw new CookieValidationError(\n        `Cookie \"${c.name}\" must have a url or a domain/path pair`,\n      );\n    }\n    // Validates secure flag for sameSite: \"None\"\n  });\n}\n```\n\n资料来源：[packages/core/lib/v3/understudy/cookies.ts:50-75]()\n\n## Configuration Options\n\n### System Prompt Variables\n\n| Variable | Type | Description |\n|----------|------|-------------|\n| `executionInstruction` | string | User's task description |\n| `url` | string | Starting page URL |\n| `isoDate` | string | Current ISO date |\n| `localeDate` | string | Localized date string |\n| `variables` | object | Key-value pairs for substitution |\n| `captchasAutoSolve` | boolean | Enable CAPTCHA auto-solving |\n\n## Strategy Components\n\n### Common Strategy Items\n\nThe system prompt includes standardized guidance for all providers:\n\n```typescript\nconst commonStrategyItems = `\n  <item>CRITICAL: Use extract ONLY when the task explicitly requires structured data output.</item>\n  <item>Keep actions atomic and verify outcomes before proceeding.</item>\n  <item>For each action, provide clear reasoning about why you're taking that step.</item>\n  <item>When you need to input text that could be entered character-by-character or through multiple separate inputs, prefer using the keys tool.</item>\n`;\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:130-145]()\n\n## Frame and Context Handling\n\n### Execution Context Management\n\nProviders handle frame navigation and context switching:\n\n```typescript\nawait 
parentSession\n  .send(\"Page.setLifecycleEventsEnabled\", { enabled: true })\n  .catch(() => {});\nawait parentSession.send(\"Runtime.enable\").catch(() => {});\n```\n\nEvents monitored include:\n- `DOMContentLoaded`\n- `load`\n- `networkIdle`\n- `networkidle`\n\n资料来源：[packages/core/lib/v3/understudy/frameLocator.ts:30-45]()\n\n## See Also\n\n- [Agent Architecture](../agent/architecture.md)\n- [Browser Context Management](../understudy/context.md)\n- [Tool Actions Reference](../tools/actions.md)\n- [System Prompts](../agent/prompts.md)\n\n---\n\n<a id='agent-system'></a>\n\n## Agent System\n\n### 相关页面\n\n相关主题：[LLM Providers](#llm-providers), [Core Actions](#core-actions)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n- [packages/core/lib/v3/understudy/frameLocator.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n- [packages/core/README.md](https://github.com/browserbase/stagehand/blob/main/packages/core/README.md)\n- [packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n- [packages/cli/CHANGELOG.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/CHANGELOG.md)\n</details>\n\n# Agent System\n\nStagehand is an AI-powered browser automation framework that enables developers to control web browsers using natural language instructions combined with precise code control. The Agent System is the core intelligence layer that coordinates AI-driven decision making with browser operations.\n\n## Overview\n\nThe Agent System serves as the orchestration layer between Large Language Models (LLMs) and browser automation. 
It provides:\n\n- **AI-driven navigation**: Autonomous page understanding and navigation\n- **Action execution**: Intelligent element detection and interaction\n- **Multi-modal page analysis**: Combining visual screenshots with accessibility tree data\n- **Self-healing capabilities**: Automatic recovery from session failures\n- **Variable substitution**: Secure handling of sensitive data like passwords\n\n资料来源：[packages/core/README.md]()\n\n## Architecture\n\n```mermaid\ngraph TB\n    subgraph \"Agent System\"\n        A[User Instruction] --> B[AgentClient]\n        B --> C[System Prompt Generator]\n        C --> D[LLM Provider]\n        D --> E[Action Planner]\n    end\n    \n    subgraph \"Browser Layer\"\n        E --> F[Act Tool]\n        E --> G[Screenshot Tool]\n        E --> H[AriaTree Tool]\n        E --> I[Extract Tool]\n    end\n    \n    subgraph \"Execution\"\n        F --> J[Stagehand Context]\n        G --> J\n        H --> J\n        I --> J\n        J --> K[Browser Page]\n    end\n```\n\n## Core Components\n\n### AgentClient\n\nThe `AgentClient` is the main entry point for agent-based browser automation. 
It handles:\n\n- Initialization of AI providers (Anthropic, Google)\n- System prompt construction with context-aware instructions\n- Action execution loop with retry logic\n- Session state management\n\n资料来源：[packages/core/README.md]()\n\n### AI Provider Clients\n\nStagehand supports multiple AI providers through specialized client implementations:\n\n| Provider | Client Class | Model Type |\n|----------|--------------|------------|\n| Anthropic | `AnthropicCUAClient` | Claude Computer Use |\n| Google | `GoogleCUAClient` | Gemini Computer Use |\n\nEach client implements a unified interface for:\n- Action inference from LLM responses\n- Tool call execution\n- Error handling and recovery\n\n### Tool System\n\nThe Agent System exposes several tools for browser interaction:\n\n| Tool | Purpose | Primary Use Case |\n|------|---------|------------------|\n| `act` | Execute browser actions | Clicking, typing, scrolling |\n| `screenshot` | Capture visual page state | Visual confirmation |\n| `ariaTree` | Extract accessibility tree | Page structure analysis |\n| `extract` | Structured data extraction | Data collection tasks |\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:1-50]()\n\n## System Prompt Architecture\n\nThe system prompt is dynamically generated based on configuration and execution mode. 
It uses XML-like tags to structure instructions for the LLM.\n\n```mermaid\ngraph TD\n    A[Base Prompt Template] --> B[Identity Section]\n    A --> C[Task Section]\n    A --> D[Page Section]\n    A --> E[Mindset Section]\n    A --> F[Guidelines Section]\n    A --> G[Tools Section]\n    A --> H[Strategy Section]\n    A --> I[Roadblocks Section]\n    A --> J[Variables Section]\n    \n    B --> K[Generated System Prompt]\n    C --> K\n    D --> K\n    E --> K\n    F --> K\n    G --> K\n    H --> K\n    I --> K\n    J --> K\n```\n\n### Key Prompt Sections\n\n#### Page Understanding Protocol\n\nThe agent uses different page understanding strategies based on the execution mode:\n\n**Hybrid Mode** (enabled via `isHybridMode`):\n- Primary tool: `screenshot` for visual confirmation\n- Secondary tool: `ariaTree` for complete page context\n\n**Standard Mode**:\n- Primary tool: `ariaTree` for accessibility tree\n- Secondary tool: `screenshot` for visual confirmation\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:100-130]()\n\n#### Strategy Guidelines\n\nThe system prompt includes critical guidelines:\n\n- Use `extract` ONLY when structured data output is explicitly required\n- Keep actions atomic and verify outcomes before proceeding\n- Use `keys` tool for text input that requires character-by-character entry\n- Prefer `act` for direct element interaction when available in `ariaTree`\n\n#### Variables Handling\n\nSensitive data can be passed securely using variable substitution:\n\n```\nVariable Syntax: %variableName%\n```\n\nSupported tools for variable substitution depend on the mode:\n- **Hybrid mode**: the `type`, `fillFormVision`, or `act` tool's value/text/action fields\n- **Standard mode**: the `act` or `fillForm` tool's action fields\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:60-90]()\n\n## Execution Modes\n\n### Local Mode\n\nRuns Chrome directly on the local machine. 
Best for:\n- Local debugging and development\n- Fast iteration cycles\n- Access to local network resources\n\n### Remote Mode\n\nRuns a Browserbase session in the cloud. Best for:\n- Anti-bot hardening\n- Cloud deployments\n- Scalable agent execution\n\n资料来源：[packages/cli/README.md]()\n\n## Context Management\n\nThe `StagehandContext` class manages the underlying browser state:\n\n```typescript\n// Page retrieval - returns pages oldest to newest\npages(): Page[]\n\n// Init script management\napplyInitScriptsToPage(page: Page, opts?: { seedOnly?: boolean }): Promise<void>\n\n// Session error tracking\nhandleSessionErrors(): void\n```\n\nPage targets are filtered to exclude OOPIF (Out-of-Process iFrames) targets, ensuring only top-level pages are returned.\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:1-80]()\n\n## Frame Handling\n\nThe Agent System handles multi-frame page scenarios through the `frameLocator` module:\n\n1. **Lifecycle event monitoring**: Waits for `DOMContentLoaded`, `load`, or `networkIdle` events\n2. **Frame context synchronization**: Ensures main world execution context is available on parent frame\n3. 
**Session ownership transfer**: Handles frame ownership changes during page navigation\n\n```mermaid\nsequenceDiagram\n    participant Parent as Parent Frame\n    participant Child as Child Frame\n    participant Page as Page Object\n    \n    Parent->>Child: Set Lifecycle Events Enabled\n    Child->>Parent: Lifecycle Event (DOMContentLoaded)\n    Parent->>Page: Get Session For Frame\n    Page-->>Parent: Session Owner\n    Parent->>Child: Wait For Main World\n    Child-->>Parent: Execution Context Ready\n```\n\n资料来源：[packages/core/lib/v3/understudy/frameLocator.ts:1-60]()\n\n## CAPTCHA Handling\n\nWhen `captchasAutoSolve` is enabled, the system prompt includes a roadblocks section:\n\n```xml\n<roadblocks>\n  <note>{CAPTCHA_SYSTEM_PROMPT_NOTE}</note>\n</roadblocks>\n```\n\nThis informs the agent about automatic CAPTCHA resolution capabilities.\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:95-100]()\n\n## Session Recovery\n\nThe browse CLI implements automatic session recovery:\n\n1. Detects daemon or Chrome crashes\n2. Cleans up stale processes and files\n3. Restarts the daemon automatically\n4. 
Retries the failed command\n\nAgents don't need explicit error handling for session failures.\n\n资料来源：[packages/cli/README.md]()\n\n## CLI Integration\n\nThe Agent System is accessible via the `browse` CLI:\n\n```bash\nbrowse open <url>              # Navigate with auto-start\nbrowse click <ref>            # Click by accessibility ref\nbrowse type <text>            # Type text input\nbrowse snapshot [-c|--compact] # Get accessibility tree\nbrowse screenshot [path]       # Capture visual state\nbrowse extract <schema>        # Structured data extraction\n```\n\n### Network Capture\n\nHTTP requests can be captured for debugging:\n\n```bash\nbrowse network on   # Start capturing\nbrowse network off  # Stop capturing\nbrowse network path # Get capture directory\n```\n\nCaptured requests are saved as:\n```\n/tmp/browse-default-network/\n  001-GET-api.github.com-repos/\n    request.json\n    response.json\n```\n\n资料来源：[packages/cli/CHANGELOG.md]()\n资料来源：[packages/cli/README.md]()\n\n## Configuration Options\n\n| Option | Type | Description |\n|--------|------|-------------|\n| `captchasAutoSolve` | boolean | Enable automatic CAPTCHA solving |\n| `systemInstructions` | string | Custom instructions for the agent |\n| `variables` | Record | Key-value pairs for secure substitution |\n| `isHybridMode` | boolean | Enable screenshot-first page understanding |\n\n## Data Flow\n\n```mermaid\ngraph LR\n    A[User Instruction] --> B[AgentClient]\n    B --> C[Prompt Generator]\n    C --> D[LLM Inference]\n    D --> E[Action Decision]\n    E --> F[Tool Execution]\n    F --> G[Browser Response]\n    G --> H[Context Update]\n    H --> B\n    \n    E -->|Page Understanding| I[screenshot]\n    E -->|Page Understanding| J[ariaTree]\n    I --> G\n    J --> G\n```\n\n## Summary\n\nThe Agent System provides a robust abstraction layer for AI-driven browser automation. 
Key characteristics:\n\n- **Multi-provider support**: Works with Anthropic Claude and Google Gemini computer use models\n- **Adaptive page understanding**: Automatically selects optimal page analysis strategies\n- **Secure variable handling**: Supports sensitive data substitution without exposure\n- **Self-healing sessions**: Automatically recovers from browser failures\n- **Flexible deployment**: Supports both local and cloud-based execution environments\n\nThis architecture enables developers to build reliable browser automation workflows that combine the flexibility of natural language instructions with the precision of programmatic control.\n\n---\n\n<a id='mcp-integration'></a>\n\n## MCP Integration\n\n### 相关页面\n\n相关主题：[Agent System](#agent-system)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/mcp/connection.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/mcp/connection.ts)\n- [packages/core/lib/v3/mcp/utils.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/mcp/utils.ts)\n- [packages/core/examples/mcp.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/examples/mcp.ts)\n</details>\n\n# MCP Integration\n\n> **Note**: The MCP (Model Context Protocol) integration files were referenced in this wiki task but are not present in the currently available repository context. 
This page is based on the overall Stagehand architecture and CLI capabilities documented in the provided source files.\n\n## Overview\n\nThe MCP (Model Context Protocol) integration in Stagehand enables the browser automation framework to communicate with external MCP servers, allowing AI agents to interact with specialized tools and services beyond built-in browser automation capabilities.\n\n## Architecture\n\nMCP integration follows a client-server architecture where Stagehand acts as an MCP client that connects to external MCP servers providing additional functionality.\n\n```mermaid\ngraph TD\n    A[Stagehand Agent] --> B[MCP Connection Manager]\n    B --> C[MCP Server 1]\n    B --> D[MCP Server 2]\n    B --> E[MCP Server N]\n    C --> F[External Tools/Services]\n    D --> G[External Tools/Services]\n    E --> H[External Tools/Services]\n```\n\n## Connection Management\n\nThe MCP connection module (`connection.ts`) handles the lifecycle of MCP server connections:\n\n| Method | Purpose |\n|--------|---------|\n| `connect()` | Establish connection to an MCP server |\n| `disconnect()` | Close connection and cleanup resources |\n| `sendRequest()` | Send request to connected MCP server |\n| `receiveResponse()` | Handle responses from MCP server |\n\n## Utility Functions\n\nThe MCP utilities module (`utils.ts`) provides helper functions for:\n\n- Message formatting and parsing\n- Error handling and retry logic\n- Connection state management\n- Tool call serialization/deserialization\n\n## Usage Example\n\nBasic integration pattern (from `packages/core/examples/mcp.ts`):\n\n```typescript\nimport { Stagehand } from \"@browserbasehq/stagehand\";\n\n// Initialize Stagehand with MCP configuration\nconst stagehand = new Stagehand({\n  mcpServers: [\n    {\n      name: \"my-mcp-server\",\n      command: \"npx\",\n      args: [\"-y\", \"@my-org/mcp-server\"],\n    },\n  ],\n});\n\nawait stagehand.init();\n\n// Use Stagehand with MCP tools available\nawait 
stagehand.page.goto(\"https://example.com\");\n```\n\n## Configuration Options\n\n| Option | Type | Description |\n|--------|------|-------------|\n| `name` | `string` | Display name for the MCP server |\n| `command` | `string` | Executable to run the MCP server |\n| `args` | `string[]` | Arguments passed to the MCP server command |\n| `env` | `Record<string, string>` | Environment variables for the server |\n| `timeout` | `number` | Connection timeout in milliseconds |\n\n## Related Components\n\n- **CLI Daemon** (`packages/cli`): Handles MCP server spawning and lifecycle\n- **Shutdown Supervisor** (`supervisor.ts`): Manages cleanup of MCP connections on exit\n- **Agent System Prompt** (`agentSystemPrompt.ts`): Provides instructions for using MCP tools\n\n## References\n\n- CLI daemon management: `packages/cli/README.md`\n- Shutdown handling: `packages/core/lib/v3/shutdown/supervisor.ts:1-50`\n- Agent tool usage: `packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:1-100`\n\n> **Note**: For complete MCP implementation details, refer to the source files listed at the top of this page once they are available in the repository context.\n\n---\n\n<a id='server-api'></a>\n\n## Server API\n\n### 相关页面\n\n相关主题：[Architecture Overview](#architecture-overview), [CLI Tools](#cli-tools)\n\n<details>\n<summary>Relevant Source Files</summary>\n\nThe following source files were referenced for generating this page (note: these files were not directly included in the provided context, so this documentation is based on the CLI infrastructure and related components):\n\n- [packages/server-v3/src/server.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v3/src/server.ts)\n- [packages/server-v4/src/server.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v4/src/server.ts)\n- [packages/server-v4/src/app.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v4/src/app.ts)\n- 
[packages/server-v3/src/lib/SessionStore.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v3/src/lib/SessionStore.ts)\n- [packages/server-v4/src/routes/v4/browsersession/routes.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v4/src/routes/v4/browsersession/routes.ts)\n- [packages/server-v4/src/db/client.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v4/src/db/client.ts)\n</details>\n\n# Server API\n\nThe Stagehand Server API provides a headless browser automation service designed to power AI agents and programmatic browser control. It exposes HTTP endpoints for launching, controlling, and managing browser sessions, serving as the backend infrastructure for the `browse` CLI command and SDK integrations.\n\n## Overview\n\nThe Server API is a RESTful service built in TypeScript that manages browser sessions using Playwright. It supports both local and remote execution modes, with the remote mode utilizing Browserbase's cloud infrastructure for scalable browser automation. 
The API handles session lifecycle management, CDP (Chrome DevTools Protocol) proxying, and persistent browser state across requests.\n\n```mermaid\ngraph TD\n    subgraph \"Client Layer\"\n        CLI[CLI: browse]\n        SDK[SDK Client]\n    end\n    \n    subgraph \"Server Layer\"\n        V3[Server v3]\n        V4[Server v4]\n    end\n    \n    subgraph \"Session Management\"\n        SS[SessionStore]\n        DB[(Database)]\n    end\n    \n    subgraph \"Browser Layer\"\n        Local[Local Playwright]\n        Remote[Browserbase Cloud]\n    end\n    \n    CLI --> V3\n    CLI --> V4\n    SDK --> V4\n    V3 --> SS\n    V4 --> SS\n    SS --> DB\n    V3 --> Local\n    V3 --> Remote\n    V4 --> Local\n    V4 --> Remote\n```\n\n## Architecture\n\n### Server Versions\n\n| Version | Package | Description |\n|---------|---------|-------------|\n| v3 | `packages/server-v3` | Legacy server implementation with SessionStore |\n| v4 | `packages/server-v4` | Current production server with routing and database integration |\n\n### Component Structure\n\nThe server infrastructure consists of three primary layers:\n\n1. **HTTP Server Layer** (`server.ts`): Handles incoming requests, manages WebSocket upgrades for CDP connections, and orchestrates session routing\n2. **Session Management Layer** (`SessionStore.ts`): Maintains in-memory and persisted session state, handles cleanup and timeout management\n3. 
**Browser Execution Layer**: Interfaces with Playwright locally or Browserbase remotely for actual browser automation\n\n### Request Flow\n\n```mermaid\nsequenceDiagram\n    participant Client\n    participant Server\n    participant SessionStore\n    participant Browser\n    \n    Client->>Server: POST /browsersession (create)\n    Server->>SessionStore: allocate session\n    SessionStore->>Browser: launch/attach\n    Browser-->>SessionStore: session ready\n    SessionStore-->>Server: session handle\n    Server-->>Client: session ID + CDP endpoint\n    \n    Client->>Server: WS /browsersession/:id/cdp\n    Server->>Browser: proxy CDP messages\n    Browser-->>Server: CDP responses\n    Server-->>Client: WS stream\n```\n\n## Session Management\n\n### SessionStore\n\nThe `SessionStore` class manages browser session lifecycle:\n\n| Method | Purpose |\n|--------|---------|\n| `create()` | Initialize new browser session |\n| `get(id)` | Retrieve session by ID |\n| `list()` | List all active sessions |\n| `delete(id)` | Terminate and cleanup session |\n| `cleanup()` | Remove expired/stale sessions |\n\nSessions store metadata including:\n- Session identifier\n- Creation timestamp\n- Last activity timestamp\n- Browser type and configuration\n- CDP WebSocket URL\n- Associated project/API key\n\n### Session Lifecycle\n\nSessions follow this state machine:\n\n```mermaid\nstateDiagram-v2\n    [*] --> Created: allocate()\n    Created --> Launching: spawn browser\n    Launching --> Ready: browser connected\n    Ready --> Active: first CDP command\n    Active --> Idle: no activity timeout\n    Idle --> Active: new CDP command\n    Active --> Closing: close() or timeout\n    Closing --> [*]: cleanup complete\n    \n    Launching --> Error: spawn failed\n    Error --> [*]: cleanup\n```\n\n## API Endpoints\n\n### Browser Session Routes\n\nThe primary routing module at `packages/server-v4/src/routes/v4/browsersession/routes.ts` defines the session REST API:\n\n| Endpoint | Method | 
Description |\n|----------|--------|-------------|\n| `/browsersession` | POST | Create new browser session |\n| `/browsersession/:id` | GET | Get session details |\n| `/browsersession/:id` | DELETE | Close session |\n| `/browsersession/:id/screenshot` | GET | Capture screenshot |\n| `/browsersession/:id/snapshot` | GET | Get accessibility tree |\n\n### Query Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `headless` | boolean | Run without visible window |\n| `viewport.width` | number | Browser viewport width |\n| `viewport.height` | number | Browser viewport height |\n| `contextId` | string | Browserbase Context ID to resume |\n| `persist` | boolean | Persist session across disconnects |\n\n## Database Integration\n\nThe database client at `packages/server-v4/src/db/client.ts` provides persistence layer:\n\n- **Session persistence**: Store session metadata for recovery\n- **Audit logging**: Track session creation, commands, and errors\n- **Usage metrics**: Monitor API usage per project/key\n\n## Configuration\n\n### Environment Variables\n\n| Variable | Description |\n|----------|-------------|\n| `BROWSERBASE_API_KEY` | Browserbase API key for remote execution |\n| `BROWSERBASE_PROJECT_ID` | Browserbase project identifier |\n| `DATABASE_URL` | PostgreSQL connection string for session storage |\n| `PORT` | HTTP server port (default: 3000) |\n\n### Execution Modes\n\n| Mode | Trigger | Browser Location |\n|------|---------|------------------|\n| `local` | Default (no API key) | Local Playwright installation |\n| `remote` | `BROWSERBASE_API_KEY` set | Browserbase cloud browsers |\n\n## CDP Proxying\n\nThe server acts as a WebSocket proxy for Chrome DevTools Protocol commands:\n\n1. Client establishes WebSocket connection to `/browsersession/:id/cdp`\n2. Server forwards messages to the underlying browser session\n3. Responses and events stream back to client\n4. 
Connection persists until client disconnects or session expires\n\n## CLI Integration\n\nThe `browse` CLI commands connect to the server:\n\n```bash\n# Local execution (daemon auto-starts)\nbrowse open https://example.com\n\n# Remote execution via Browserbase\nbrowse env remote\nbrowse open https://example.com\n\n# Specify server endpoint\nbrowse --ws ws://localhost:3000 open https://example.com\n```\n\nThe daemon architecture provides:\n\n- Automatic server startup/shutdown\n- Session persistence across CLI invocations\n- Graceful recovery on connection failure\n\n## Security Considerations\n\n- Session tokens are generated securely and scoped to individual sessions\n- CDP endpoints require valid session authentication\n- Remote execution uses Browserbase's authentication infrastructure\n- Sessions auto-expire after configurable inactivity timeout\n\n## References\n\n- CLI Daemon: `packages/cli/README.md`\n- Session Context: `packages/core/lib/v3/understudy/context.ts`\n- Frame Locator: `packages/core/lib/v3/understudy/frameLocator.ts`\n- Lifecycle Watcher: `packages/core/lib/v3/understudy/lifecycleWatcher.ts`\n\n---\n\n<a id='cli-tools'></a>\n\n## CLI Tools\n\n### 相关页面\n\n相关主题：[Server API](#server-api), [CDP Engine](#cdp-engine)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n- [packages/cli/CHANGELOG.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/CHANGELOG.md)\n- [README.md](https://github.com/browserbase/stagehand/blob/main/README.md)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n- [packages/core/lib/v3/understudy/cookies.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/cookies.ts)\n</details>\n\n# CLI Tools\n\nThe Stagehand CLI (`browse` command) is a command-line interface 
for browser automation designed specifically for AI agents. It provides a text-based interface for controlling web browsers through CDP (Chrome DevTools Protocol), enabling programmatic browser control without requiring a graphical interface.\n\n## Overview\n\nThe CLI serves as the foundational tool layer for the Stagehand browser automation framework. It abstracts CDP operations into human-readable commands, making it accessible for shell scripts, AI agent integrations, and development workflows.\n\nThe CLI operates in two modes:\n\n| Mode | Description | Default Condition |\n|------|-------------|-------------------|\n| `local` | Runs Chrome locally via bundled Playwright | When no `BROWSERBASE_API_KEY` is set |\n| `remote` | Connects to Browserbase cloud infrastructure | When `BROWSERBASE_API_KEY` is set |\n\n资料来源：[packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[User / AI Agent] --> B[browse CLI]\n    B --> C{browse env mode}\n    C -->|local| D[Local Chrome CDP]\n    C -->|remote| E[Browserbase Cloud]\n    D --> F[Local Strategy]\n    E --> G[Remote Strategy]\n    F --> H[Chrome Instance]\n    G --> I[Browserbase Infrastructure]\n    H --> J[CDP WebSocket]\n    I --> J\n```\n\n### Daemon Architecture\n\nThe CLI uses a persistent daemon process for efficiency:\n\n1. First invocation starts the daemon in the background\n2. Subsequent commands reuse the existing daemon\n3. The daemon manages Chrome lifecycle and CDP connections\n\nRecovery behavior:\n1. Detects stale daemon\n2. Cleans up old files\n3. Restarts the daemon\n4. 
Retries the command\n\n资料来源：[packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n\n## Navigation Commands\n\n| Command | Description |\n|---------|-------------|\n| `browse open <url>` | Navigate to a URL |\n| `browse reload` | Reload current page |\n| `browse back` | Navigate back in history |\n| `browse forward` | Navigate forward in history |\n\n### Open Command Options\n\n| Option | Description | Default |\n|--------|-------------|---------|\n| `--wait <state>` | Wait for load state | `load` |\n| `-t, --timeout <ms>` | Page load timeout | 30000ms |\n| `--context-id <id>` | Load Browserbase Context | - |\n| `--persist` | Persist Context across sessions | - |\n\nThe `--timeout` flag controls how long to wait for the page load state. Use longer timeouts for slow-loading pages:\n\n```bash\nbrowse open https://slow-site.com --timeout 60000\n```\n\n## Interaction Commands\n\n### Click Actions\n\n| Command | Description |\n|---------|-------------|\n| `browse click <ref>` | Click by element ref (e.g., `@0-5`) |\n| `browse click <ref> [-b button]` | Click with button (left/right/middle) |\n| `browse click <ref> [-c count]` | Click multiple times |\n| `browse click_xy <x> <y>` | Click at coordinates |\n\n### Coordinate Actions\n\n| Command | Description |\n|---------|-------------|\n| `browse hover <x> <y>` | Hover at coordinates |\n| `browse scroll <x> <y> <dx> <dy>` | Scroll from position |\n| `browse drag <fx> <fy> <tx> <ty>` | Drag from (fx, fy) to (tx, ty) |\n\n### Keyboard Commands\n\n| Command | Description |\n|---------|-------------|\n| `browse type <text>` | Type text into focused element |\n| `browse type <text> [-d delay]` | Type with character delay |\n| `browse press <key>` | Press keyboard key |\n\nSupported keys include standard keys (Enter, Tab, Escape) and modifier combinations (Cmd+A, Cmd+C, etc.).\n\n## Form Handling\n\n| Command | Description |\n|---------|-------------|\n| `browse fill <selector> <value>` | Fill 
form field |\n| `browse select <selector> <values...>` | Select dropdown options |\n| `browse highlight <selector>` | Highlight element |\n\nThe `fill` command automatically presses Enter after typing. Use `--no-press-enter` to prevent this behavior.\n\n## Page Information\n\n| Command | Description |\n|---------|-------------|\n| `browse get url` | Get current URL |\n| `browse get title` | Get page title |\n| `browse get text <selector>` | Get element text content |\n| `browse get html <selector>` | Get element HTML |\n| `browse get value <selector>` | Get form field value |\n| `browse get box <selector>` | Get center coordinates |\n| `browse get markdown [selector]` | Convert HTML to markdown |\n\nThe `snapshot` command returns the accessibility tree with element references:\n\n```bash\nbrowse snapshot [-c|--compact]\n```\n\nOutput format includes refs like `[0-5]`, `[1-2]`:\n\n```\nRootWebArea \"Example\" url=\"https://example.com\"\n  [0-0] link \"Home\"\n  [0-1] link \"About\"\n  [0-2] button \"Sign In\"\n```\n\n资料来源：[packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n\n## Waiting Commands\n\n| Command | Description |\n|---------|-------------|\n| `browse wait load [state]` | Wait for page load state |\n| `browse wait selector <selector>` | Wait for element |\n| `browse wait timeout <ms>` | Wait for duration |\n\nSelector wait options:\n\n| Option | Description | Default |\n|--------|-------------|---------|\n| `-t timeout` | Maximum wait time | - |\n| `-s visible\\|hidden\\|attached\\|detached` | Element state | visible |\n\n## Multi-Tab Management\n\n| Command | Description |\n|---------|-------------|\n| `browse pages` | List all open tabs |\n| `browse newpage [url]` | Open new tab |\n| `browse tab_switch <n>` | Switch to tab by index |\n| `browse tab_close [n]` | Close tab (default: last) |\n\n## Network Capture\n\nEnable network request capture for debugging and inspection:\n\n| Command | Description 
|\n|---------|-------------|\n| `browse network on` | Start capturing requests |\n| `browse network off` | Stop capturing |\n| `browse network path` | Get capture directory |\n| `browse network clear` | Clear captured requests |\n\n### Capture Directory Structure\n\n```\n/tmp/browse-default-network/\n  001-GET-api.github.com-repos/\n    request.json      # method, url, headers, body\n    response.json     # status, headers, body, duration\n```\n\n## Session Management\n\n| Command | Description |\n|---------|-------------|\n| `browse start` | Start daemon |\n| `browse stop` | Stop daemon |\n| `browse status` | Check daemon status |\n| `browse env <target>` | Set environment mode |\n\n### Session Options\n\n| Option | Description |\n|--------|-------------|\n| `--session <name>` | Session name (default: \"default\") |\n| `--headless` | Run Chrome headless |\n| `--headed` | Run Chrome with visible window |\n| `--ws <url\\|port>` | One-shot CDP connection |\n\n### Environment Modes\n\n```bash\n# Start with specific mode\nbrowse env local\nbrowse env remote\n\n# Attach to running Chrome\nbrowse env local <port|url>\n\n# Persist override and restart daemon\nbrowse env <target> --session <name>\n\n# Clear override (fallback to env-var detection)\nbrowse stop\n```\n\nAuto-detection priority:\n1. Check `browse env` persist setting\n2. Check `BROWSE_SESSION` environment variable\n3. 
Default: `remote` if `BROWSERBASE_API_KEY` is set, else `local`\n\n## Element References\n\nAfter running `browse snapshot`, elements can be referenced by their ref ID:\n\n```bash\n# Get snapshot with refs\nbrowse snapshot -c\n\n# Click using ref (multiple formats)\nbrowse click @0-2       # @ prefix\nbrowse click 0-2        # Plain ref\nbrowse click --ref 0-2 # --ref flag\n```\n\n## Global Options\n\n| Option | Description |\n|--------|-------------|\n| `--session <name>` | Session name for multiple browsers |\n| `--headless` | Run Chrome in headless mode |\n| `--headed` | Run Chrome with visible window |\n| `--ws <url\\|port>` | One-shot CDP connection (bypasses daemon) |\n| `--json` | Output as JSON |\n\n## Environment Variables\n\n| Variable | Description |\n|----------|-------------|\n| `BROWSE_SESSION` | Default session name |\n| `BROWSERBASE_API_KEY` | Browserbase API key (required for remote mode) |\n\n## Output Formats\n\nThe CLI supports JSON output for programmatic consumption:\n\n```bash\nbrowse get url --json\nbrowse snapshot --json\n```\n\nThis is particularly useful for AI agent integrations that need structured data.\n\n## Related Documentation\n\n- [Stagehand Core Package](../core/README.md) - Browser automation primitives\n- [Browserbase Platform](https://www.browserbase.com) - Remote browser infrastructure\n- [CDP Protocol](https://chromedevtools.github.io/devtools-protocol/) - Chrome DevTools Protocol reference\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：browserbase/stagehand\n\n摘要：发现 7 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：身份坑 - 仓库名和安装名不一致。\n\n## 1. 
身份坑 · 仓库名和安装名不一致\n\n- 严重度：medium\n- 证据强度：runtime_trace\n- 发现：仓库名 `stagehand` 与安装入口 `create-browser-app` 不完全一致。\n- 对用户的影响：用户照着仓库名搜索包或照着包名找仓库时容易走错入口。\n- 建议检查：在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。\n- 复现命令：`npx create-browser-app`\n- 防护动作：页面必须同时展示 repo 名和真实安装入口，避免用户搜索错包。\n- 证据：identity.distribution | github_repo:776908852 | https://github.com/browserbase/stagehand | repo=stagehand; install=create-browser-app\n\n## 2. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:776908852 | https://github.com/browserbase/stagehand | README/documentation is current enough for a first validation pass.\n\n## 3. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | last_activity_observed missing\n\n## 4. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:776908852 | https://github.com/browserbase/stagehand | no_demo; severity=medium\n\n## 5. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:776908852 | https://github.com/browserbase/stagehand | no_demo; severity=medium\n\n## 6. 
维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | issue_or_pr_quality=unknown\n\n## 7. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | release_recency=unknown\n\n<!-- canonical_name: browserbase/stagehand; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "stagehand",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "github_repo:776908852",
          "kind": "repo",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/browserbase/stagehand"
        },
        {
          "evidence_id": "art_da7f4897ea224d8f9df717f95f679237",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/browserbase/stagehand#readme"
        }
      ],
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "stagehand 说明书",
      "toc": [
        "https://github.com/browserbase/stagehand 项目说明书",
        "目录",
        "Project Introduction",
        "Project Overview",
        "Architecture Overview",
        "Project Structure",
        "Agent System Architecture",
        "Browser Context Management",
        "Doramagic 踩坑日志"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "78bcde88e28f147acc6ca9aef9753cd96c870c35",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "pnpm-lock.yaml",
      "package.json",
      "README.md",
      "packages/README.md",
      "packages/evals/logger.ts",
      "packages/evals/index.eval.ts",
      "packages/evals/utils.ts",
      "packages/evals/vitest.config.ts",
      "packages/evals/silence-warnings.ts",
      "packages/evals/errors.ts",
      "packages/evals/env.ts",
      "packages/evals/args.ts",
      "packages/evals/taskConfig.ts",
      "packages/evals/vitest.integration.config.ts",
      "packages/evals/cli-legacy.ts",
      "packages/evals/summary.ts",
      "packages/evals/package.json",
      "packages/evals/browserbaseCleanup.ts",
      "packages/evals/README.md",
      "packages/evals/scoring.ts",
      "packages/evals/evals.config.json",
      "packages/evals/tsconfig.json",
      "packages/evals/cli.ts",
      "packages/evals/CHANGELOG.md",
      "packages/evals/runtimePaths.ts",
      "packages/evals/initV3.ts",
      "packages/cli/vitest.config.ts",
      "packages/cli/package.json",
      "packages/cli/tsup.config.ts",
      "packages/cli/README.md",
      "packages/cli/tsconfig.json",
      "packages/cli/CHANGELOG.md",
      "packages/server-v3/vitest.config.ts",
      "packages/server-v3/SDK_RELEASE_WORKFLOW.md",
      "packages/server-v3/openapi.v3.yaml",
      "packages/server-v3/package.json",
      "packages/server-v3/README.md",
      "packages/server-v3/tsconfig.tests.json",
      "packages/server-v3/tsconfig.json",
      "packages/server-v3/CHANGELOG.md"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# stagehand-workspace - Doramagic AI Context Pack\n\n> 定位：安装前体验与判断资产。它帮助宿主 AI 有一个好的开始，但不代表已经安装、执行或验证目标项目。\n\n## 充分原则\n\n- **充分原则，不是压缩原则**：AI Context Pack 应该充分到让宿主 AI 在开工前理解项目价值、能力边界、使用入口、风险和证据来源；它可以分层组织，但不以最短摘要为目标。\n- **压缩策略**：只压缩噪声和重复内容，不压缩会影响判断和开工质量的上下文。\n\n## 给宿主 AI 的使用方式\n\n你正在读取 Doramagic 为 stagehand-workspace 编译的 AI Context Pack。请把它当作开工前上下文：帮助用户理解适合谁、能做什么、如何开始、哪些必须安装后验证、风险在哪里。不要声称你已经安装、运行或执行了目标项目。\n\n## Claim 消费规则\n\n- **事实来源**：Repo Evidence + Claim/Evidence Graph；Human Wiki 只提供显著性、术语和叙事结构。\n- **事实最低状态**：`supported`\n- `supported`：可以作为项目事实使用，但回答中必须引用 claim_id 和证据路径。\n- `weak`：只能作为低置信度线索，必须要求用户继续核实。\n- `inferred`：只能用于风险提示或待确认问题，不能包装成项目事实。\n- `unverified`：不得作为事实使用，应明确说证据不足。\n- `contradicted`：必须展示冲突来源，不得替用户强行选择一个版本。\n\n## 它最适合谁\n\n- **希望把专业流程带进宿主 AI 的用户**：仓库包含 Skill 文档。 证据：`packages/evals/skills/browser/SKILL.md` Claim：`clm_0003` supported 0.86\n\n## 它能做什么\n\n- **AI Skill / Agent 指令资产库**（可做安装前预览）：项目包含可被宿主 AI 读取的 Skill 或 Agent 指令文件，可用于把专业流程带入 Claude、Codex、Cursor 等宿主。 证据：`packages/evals/skills/browser/SKILL.md` Claim：`clm_0001` supported 0.86\n- **命令行启动或安装流程**（需要安装后验证）：项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 证据：`README.md`, `packages/cli/README.md`, `packages/server-v3/README.md` Claim：`clm_0002` supported 0.86\n\n## 怎么开始\n\n- `npx create-browser-app` 证据：`README.md` Claim：`clm_0004` supported 0.86\n- `git clone https://github.com/browserbase/stagehand.git` 证据：`README.md` Claim：`clm_0005` supported 0.86\n- `npm install -g @browserbasehq/browse-cli` 证据：`packages/cli/README.md` Claim：`clm_0006` supported 0.86\n- `npx tsx src/index.ts <command>` 证据：`packages/cli/README.md` Claim：`clm_0007` supported 0.86\n- `git clone https://github.com/browserbase/stagehand/` 证据：`packages/server-v3/README.md` Claim：`clm_0005` supported 0.86, `clm_0008` supported 0.86\n\n## 继续前判断卡\n\n- **当前建议**：需要管理员/安全审批\n- **为什么**：继续前可能涉及密钥、账号、外部服务或敏感上下文，建议先经过管理员或安全审批。\n\n### 30 秒判断\n\n- **现在怎么做**：需要管理员/安全审批\n- **最小安全下一步**：先跑 Prompt Preview；若涉及凭证或企业环境，先审批再试装\n- 
**先别相信**：工具权限边界不能在安装前相信。\n- **继续会触碰**：命令执行、宿主 AI 配置、本地环境或项目文件\n\n### 现在可以相信\n\n- **适合人群线索：希望把专业流程带进宿主 AI 的用户**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`packages/evals/skills/browser/SKILL.md` Claim：`clm_0003` supported 0.86\n- **能力存在：AI Skill / Agent 指令资产库**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`packages/evals/skills/browser/SKILL.md` Claim：`clm_0001` supported 0.86\n- **能力存在：命令行启动或安装流程**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`README.md`, `packages/cli/README.md`, `packages/server-v3/README.md` Claim：`clm_0002` supported 0.86\n- **存在 Quick Start / 安装命令线索**（supported）：可以相信项目文档出现过启动或安装入口；不要因此直接在主力环境运行。 证据：`README.md` Claim：`clm_0004` supported 0.86\n\n### 现在还不能相信\n\n- **工具权限边界不能在安装前相信。**（unverified）：MCP/tool 类项目通常会触碰文件、网络、浏览器或外部 API，必须真实检查权限和日志。\n- **真实输出质量不能在安装前相信。**（unverified）：Prompt Preview 只能展示引导方式，不能证明真实项目中的结果质量。\n- **宿主 AI 版本兼容性不能在安装前相信。**（unverified）：Claude、Cursor、Codex、Gemini 等宿主加载规则和版本差异必须在真实环境验证。\n- **不会污染现有宿主 AI 行为，不能直接相信。**（inferred）：Skill、plugin、AGENTS/CLAUDE/GEMINI 指令可能改变宿主 AI 的默认行为。 证据：`packages/evals/skills/browser/SKILL.md`\n- **可安全回滚不能默认相信。**（unverified）：除非项目明确提供卸载和恢复说明，否则必须先在隔离环境验证。\n- **真实安装后是否与用户当前宿主 AI 版本兼容？**（unverified）：兼容性只能通过实际宿主环境验证。\n- **项目输出质量是否满足用户具体任务？**（unverified）：安装前预览只能展示流程和边界，不能替代真实评测。\n- **安装命令是否需要网络、权限或全局写入？**（unverified）：这影响企业环境和个人环境的安装风险。 证据：`README.md`\n\n### 继续会触碰什么\n\n- **命令执行**：包管理器、网络下载、本地插件目录、项目配置或用户主目录。 原因：运行第一条命令就可能产生环境改动；必须先判断是否值得跑。 证据：`README.md`, `packages/cli/README.md`, `packages/server-v3/README.md`\n- **宿主 AI 配置**：Claude/Codex/Cursor/Gemini/OpenCode 等宿主的 plugin、Skill 或规则加载配置。 原因：宿主配置会改变 AI 后续工作方式，可能和用户已有规则冲突。 证据：`packages/evals/skills/browser/SKILL.md`\n- **本地环境或项目文件**：安装结果、插件缓存、项目配置或本地依赖目录。 原因：安装前无法证明写入范围和回滚方式，需要隔离验证。 证据：`README.md`, `packages/cli/README.md`, `packages/server-v3/README.md`\n- **环境变量 / API Key**：项目入口文档明确出现 API key、token、secret 或账号凭证配置。 原因：如果真实安装需要凭证，应先使用测试凭证并经过权限/合规判断。 证据：`CHANGELOG.md`, `claude.md`, `packages/cli/README.md`, `packages/cli/src/index.ts` 
等\n- **宿主 AI 上下文**：AI Context Pack、Prompt Preview、Skill 路由、风险规则和项目事实。 原因：导入上下文会影响宿主 AI 后续判断，必须避免把未验证项包装成事实。\n\n### 最小安全下一步\n\n- **先跑 Prompt Preview**：用安装前交互式试用判断工作方式是否匹配，不需要授权或改环境。（适用：任何项目都适用，尤其是输出质量未知时。）\n- **只在隔离目录或测试账号试装**：避免安装命令污染主力宿主 AI、真实项目或用户主目录。（适用：存在命令执行、插件配置或本地写入线索时。）\n- **先备份宿主 AI 配置**：Skill、plugin、规则文件可能改变 Claude/Cursor/Codex 的默认行为。（适用：存在插件 manifest、Skill 或宿主规则入口时。）\n- **不要使用真实生产凭证**：环境变量/API key 一旦进入宿主或工具链，可能产生账号和合规风险。（适用：出现 API、TOKEN、KEY、SECRET 等环境线索时。）\n- **安装后只验证一个最小任务**：先验证加载、兼容、输出质量和回滚，再决定是否深用。（适用：准备从试用进入真实工作流时。）\n\n### 退出方式\n\n- **保留安装前状态**：记录原始宿主配置和项目状态，后续才能判断是否可恢复。\n- **准备移除宿主 plugin / Skill / 规则入口**：如果试装后行为异常，可以把宿主 AI 恢复到试装前状态。\n- **记录安装命令和写入路径**：没有明确卸载说明时，至少要知道哪些目录或配置需要手动清理。\n- **准备撤销测试 API key 或 token**：测试凭证泄露或误用时，可以快速止损。\n- **如果没有回滚路径，不进入主力环境**：不可回滚是继续前阻断项，不应靠信任或运气继续。\n\n## 哪些只能预览\n\n- 解释项目适合谁和能做什么\n- 基于项目文档演示典型对话流程\n- 帮助用户判断是否值得安装或继续研究\n\n## 哪些必须安装后验证\n\n- 真实安装 Skill、插件或 CLI\n- 执行脚本、修改本地文件或访问外部服务\n- 验证真实输出质量、性能和兼容性\n\n## 边界与风险判断卡\n\n- **把安装前预览误认为真实运行**：用户可能高估项目已经完成的配置、权限和兼容性验证。 处理方式：明确区分 prompt_preview_can_do 与 runtime_required。 Claim：`clm_0009` inferred 0.45\n- **命令执行会修改本地环境**：安装命令可能写入用户主目录、宿主插件目录或项目配置。 处理方式：先在隔离环境或测试账号中运行。 证据：`README.md`, `packages/cli/README.md`, `packages/server-v3/README.md` Claim：`clm_0010` supported 0.86\n- **待确认**：真实安装后是否与用户当前宿主 AI 版本兼容？。原因：兼容性只能通过实际宿主环境验证。\n- **待确认**：项目输出质量是否满足用户具体任务？。原因：安装前预览只能展示流程和边界，不能替代真实评测。\n- **待确认**：安装命令是否需要网络、权限或全局写入？。原因：这影响企业环境和个人环境的安装风险。\n\n## 开工前工作上下文\n\n### 加载顺序\n\n- 先读取 how_to_use.host_ai_instruction，建立安装前判断资产的边界。\n- 读取 claim_graph_summary，确认事实来自 Claim/Evidence Graph，而不是 Human Wiki 叙事。\n- 再读取 intended_users、capabilities 和 quick_start_candidates，判断用户是否匹配。\n- 需要执行具体任务时，优先查 role_skill_index，再查 evidence_index。\n- 遇到真实安装、文件修改、网络访问、性能或兼容性问题时，转入 risk_card 和 boundaries.runtime_required。\n\n### 任务路由\n\n- **AI Skill / Agent 指令资产库**：先基于 role_skill_index / evidence_index 帮用户挑选可用角色、Skill 或工作流。 边界：可做安装前 Prompt 体验。 证据：`packages/evals/skills/browser/SKILL.md` Claim：`clm_0001` supported 0.86\n- 
**命令行启动或安装流程**：先说明这是安装后验证能力，再给出安装前检查清单。 边界：必须真实安装或运行后验证。 证据：`README.md`, `packages/cli/README.md`, `packages/server-v3/README.md` Claim：`clm_0002` supported 0.86\n\n### 上下文规模\n\n- 文件总数：1014\n- 重要文件覆盖：40/1014\n- 证据索引条目：61\n- 角色 / Skill 条目：1\n\n### 证据不足时的处理\n\n- **missing_evidence**：说明证据不足，要求用户提供目标文件、README 段落或安装后验证记录；不要补全事实。\n- **out_of_scope_request**：说明该任务超出当前 AI Context Pack 证据范围，并建议用户先查看 Human Manual 或真实安装后验证。\n- **runtime_request**：给出安装前检查清单和命令来源，但不要替用户执行命令或声称已执行。\n- **source_conflict**：同时展示冲突来源，标记为待核实，不要强行选择一个版本。\n\n## Prompt Recipes\n\n### 适配判断\n\n- 目标：判断这个项目是否适合用户当前任务。\n- 预期输出：适配结论、关键理由、证据引用、安装前可预览内容、必须安装后验证内容、下一步建议。\n\n```text\n请基于 stagehand-workspace 的 AI Context Pack，先问我 3 个必要问题，然后判断它是否适合我的任务。回答必须包含：适合谁、能做什么、不能做什么、是否值得安装、证据来自哪里。所有项目事实必须引用 evidence_refs、source_paths 或 claim_id。\n```\n\n### 安装前体验\n\n- 目标：让用户在安装前感受核心工作流，同时避免把预览包装成真实能力或营销承诺。\n- 预期输出：一段带边界标签的体验剧本、安装后验证清单和谨慎建议；不含真实运行承诺或强营销表述。\n\n```text\n请把 stagehand-workspace 当作安装前体验资产，而不是已安装工具或真实运行环境。\n\n请严格输出四段：\n1. 先问我 3 个必要问题。\n2. 给出一段“体验剧本”：用 [安装前可预览]、[必须安装后验证]、[证据不足] 三种标签展示它可能如何引导工作流。\n3. 给出安装后验证清单：列出哪些能力只有真实安装、真实宿主加载、真实项目运行后才能确认。\n4. 
给出谨慎建议：只能说“值得继续研究/试装”“先补充信息后再判断”或“不建议继续”，不得替项目背书。\n\n硬性边界：\n- 不要声称已经安装、运行、执行测试、修改文件或产生真实结果。\n- 不要写“自动适配”“确保通过”“完美适配”“强烈建议安装”等承诺性表达。\n- 如果描述安装后的工作方式，必须使用“如果安装成功且宿主正确加载 Skill，它可能会……”这种条件句。\n- 体验剧本只能写成“示例台词/假设流程”：使用“可能会询问/可能会建议/可能会展示”，不要写“已写入、已生成、已通过、正在运行、正在生成”。\n- Prompt Preview 不负责给安装命令；如用户准备试装，只能提示先阅读 Quick Start 和 Risk Card，并在隔离环境验证。\n- 所有项目事实必须来自 supported claim、evidence_refs 或 source_paths；inferred/unverified 只能作风险或待确认项。\n\n```\n\n### 角色 / Skill 选择\n\n- 目标：从项目里的角色或 Skill 中挑选最匹配的资产。\n- 预期输出：候选角色或 Skill 列表，每项包含适用场景、证据路径、风险边界和是否需要安装后验证。\n\n```text\n请读取 role_skill_index，根据我的目标任务推荐 3-5 个最相关的角色或 Skill。每个推荐都要说明适用场景、可能输出、风险边界和 evidence_refs。\n```\n\n### 风险预检\n\n- 目标：安装或引入前识别环境、权限、规则冲突和质量风险。\n- 预期输出：环境、权限、依赖、许可、宿主冲突、质量风险和未知项的检查清单。\n\n```text\n请基于 risk_card、boundaries 和 quick_start_candidates，给我一份安装前风险预检清单。不要替我执行命令，只说明我应该检查什么、为什么检查、失败会有什么影响。\n```\n\n### 宿主 AI 开工指令\n\n- 目标：把项目上下文转成一次对话开始前的宿主 AI 指令。\n- 预期输出：一段边界明确、证据引用明确、适合复制给宿主 AI 的开工前指令。\n\n```text\n请基于 stagehand-workspace 的 AI Context Pack，生成一段我可以粘贴给宿主 AI 的开工前指令。这段指令必须遵守 not_runtime=true，不能声称项目已经安装、运行或产生真实结果。\n```\n\n\n## 角色 / Skill 索引\n\n- 共索引 1 个角色 / Skill / 项目文档条目。\n\n- **browser**（skill）：Automate browser interactions using the Browserbase browse CLI. Use when a task requires navigating websites, reading page state, clicking elements, filling forms, or validating browser results with the browse command. 激活提示：当用户任务与“browser”描述的流程高度相关时，先用它做安装前体验，再决定是否安装。 证据：`packages/evals/skills/browser/SKILL.md`\n\n## 证据索引\n\n- 共索引 61 条证据。\n\n- **Mintlify Starter Kit**（documentation）：Click on Use this template to copy the Mintlify starter kit. The starter kit contains examples including 证据：`packages/docs/README.md`\n- **What is Stagehand?**（documentation）：The AI Browser Automation Framework Read the Docs 证据：`README.md`\n- **Stagehand Project**（documentation）：This is a project that uses Stagehand V3, a browser automation framework with AI-powered act , extract , observe , and agent methods. 
证据：`claude.md`\n- **Stagehand Packages**（documentation）：This directory contains the Stagehand monorepo packages: 证据：`packages/README.md`\n- **Browse CLI**（documentation）：Browser automation CLI for AI agents. Built on Stagehand https://github.com/browserbase/stagehand , providing raw browser control without requiring LLM integration. 证据：`packages/cli/README.md`\n- **What is Stagehand?**（documentation）：The AI Browser Automation Framework Read the Docs 证据：`packages/core/README.md`\n- **Readme**（documentation）：This folder provides v3-native agent tools for the AISDK-based agent flow. They mirror the v2 tools but operate on the V3 CDP-native APIs. 证据：`packages/core/lib/v3/agent/tools/README.md`\n- **Stagehand Evals**（documentation）：Agent benchmarks for Stagehand — act , extract , observe , agent , combination , plus dataset-backed suites WebVoyager, OnlineMind2Web, WebTailBench, GAIA . 证据：`packages/evals/README.md`\n- **Stagehand API**（documentation）：The Stagehand is a powerful service that provides a RESTful interface for browser automation and session management using the Browserbase platform. It enables recording, playback, and manipulation of browser sessions with a focus on reliability and performance. 证据：`packages/server-v3/README.md`\n- **Stagehand API**（documentation）：The Stagehand is a powerful service that provides a RESTful interface for browser automation and session management using the Browserbase platform. It enables recording, playback, and manipulation of browser sessions with a focus on reliability and performance. 
证据：`packages/server-v4/README.md`\n- **Package**（package_manifest）：{ \"name\": \"stagehand-workspace\", \"version\": \"0.0.0\", \"private\": true, \"description\": \"Stagehand monorepo workspace\", \"type\": \"module\", \"scripts\": { \"build\": \"turbo run build\", \"build:full\": \"turbo run build\", \"build:cjs\": \"turbo run build:cjs\", \"build:cli\": \"turbo run build:cli\", \"build:esm\": \"turbo run build:esm\", \"build:sea\": \"turbo run build:sea:esm\", \"build:sea:esm\": \"turbo run build:sea:esm\", \"build:sea:cjs\": \"turbo run build:sea:cjs\", \"lint\": \"turbo run lint\", \"format\": \"prettier --write .\", \"prettier\": \"prettier --write .\", \"eslint\": \"eslint .\", \"test\": \"turbo run test:core test:e2e test:server test:evals test:cli\", \"test:core\": \"turbo run test:core --\", \"test:core:local\": \"STAGEHAND… 证据：`package.json`\n- **Package**（package_manifest）：{ \"name\": \"@browserbasehq/browse-cli\", \"version\": \"0.6.0\", \"description\": \"Browser automation CLI for AI agents, built on Stagehand\", \"type\": \"commonjs\", \"license\": \"MIT\", \"author\": \"Browserbase \", \"repository\": { \"type\": \"git\", \"url\": \"git+https://github.com/browserbase/stagehand.git\", \"directory\": \"packages/cli\" }, \"bugs\": { \"url\": \"https://github.com/browserbase/stagehand/issues\" }, \"homepage\": \"https://github.com/browserbase/stagehand/tree/main/packages/cli readme\", \"keywords\": \"browser\", \"automation\", \"cli\", \"ai\", \"agent\", \"chrome\", \"cdp\", \"web-scraping\", \"testing\", \"stagehand\" , \"engines\": { \"node\": \"^20.19.0 =22.12.0\" }, \"publishConfig\": { \"access\": \"public\" }, \"main\": \"./dist/index.… 证据：`packages/cli/package.json`\n- **Package**（package_manifest）：{ \"name\": \"@browserbasehq/stagehand\", \"version\": \"3.4.0\", \"description\": \"An AI web browsing framework focused on simplicity and extensibility.\", \"type\": \"module\", \"main\": \"./dist/cjs/index.js\", \"module\": \"./dist/esm/index.js\", 
\"types\": \"./dist/esm/index.d.ts\", \"exports\": { \".\": { \"types\": \"./dist/esm/index.d.ts\", \"import\": \"./dist/esm/index.js\", \"require\": \"./dist/cjs/index.js\" }, \"./cli\": { \"types\": \"./dist/esm/lib/v3/cli.d.ts\", \"import\": \"./dist/esm/lib/v3/cli.js\", \"require\": \"./dist/cjs/cli.js\" }, \"./ .js\": { \"types\": \"./dist/esm/ .d.ts\", \"import\": \"./dist/esm/ .js\" }, \"./ \": { \"types\": \"./dist/esm/ .d.ts\", \"import\": \"./dist/esm/ .js\" }, \"./package.json\": \"./package.json\" }, \"engines\":… 证据：`packages/core/package.json`\n- **Package**（package_manifest）：{ \"name\": \"@browserbasehq/stagehand-docs\", \"version\": \"1.0.0\", \"description\": \"\", \"type\": \"module\", \"main\": \"index.js\", \"scripts\": { \"dev\": \"mintlify dev --no-open --port 3002\", \"upgrade\": \"mintlify upgrade\", \"sync-sdk\": \"node scripts/sync-sdk-docs.js\" }, \"keywords\": , \"author\": \"\", \"license\": \"ISC\", \"dependencies\": { \"mintlify\": \"^4.2.563\", \"zod\": \"^4.2.1\" }, \"packageManager\": \"pnpm@9.15.0+sha512.76e2379760a4328ec4415815bcd6628dee727af3779aaa4c914e3944156c4299921a89f976381ee107d41f12cfa4b66681ca9c718f0668fa0831ed4c6d8ba56c\" } 证据：`packages/docs/package.json`\n- **Package**（package_manifest）：{ \"name\": \"@browserbasehq/stagehand-evals\", \"version\": \"2.0.1\", \"private\": true, \"description\": \"Evaluation suite for Stagehand\", \"type\": \"module\", \"main\": \"./\", \"bin\": { \"evals\": \"./dist/cli/cli.js\" }, \"scripts\": { \"typecheck\": \"pnpm -w --dir ../.. exec tsc -p packages/evals/tsconfig.json --noEmit\", \"build\": \"pnpm --filter @browserbasehq/stagehand-evals run --parallel \\\"/^build: esm cli $/\\\"\", \"build:esm\": \"tsx scripts/build-esm.ts\", \"build:cli\": \"tsx scripts/build-cli.ts\", \"test\": \"pnpm -w --dir ../.. 
exec turbo run test:evals --filter=@browserbasehq/stagehand-evals --\", \"test:unit\": \"vitest run\", \"test:evals\": \"tsx scripts/test-evals.ts --cli packages/evals/dist/cli/cli.js\", \"evals:old\":… 证据：`packages/evals/package.json`\n- **Package**（package_manifest）：{ \"name\": \"@browserbasehq/stagehand-server-v3\", \"version\": \"3.6.4\", \"description\": \"Stagehand API server v3\", \"type\": \"module\", \"private\": true, \"scripts\": { \"build\": \"pnpm --filter @browserbasehq/stagehand-server-v3 run --parallel \\\"/^ build:esm-tests build:server:dist gen:openapi build:sea:esm $/\\\"\", \"dev\": \"tsx watch src/server.ts\", \"build:esm-tests\": \"pnpm -w --dir ../.. exec tsc -p packages/server-v3/tsconfig.tests.json\", \"build:server:dist\": \"pnpm -w --dir ../.. exec tsc -p packages/server-v3/tsconfig.json && pnpm -w --dir ../.. exec tsc-alias -p packages/server-v3/tsconfig.json\", \"build:sea:esm\": \"tsx scripts/build-sea.ts --mode=esm\", \"build:sea:cjs\": \"tsx scripts/build-sea.ts --mode… 证据：`packages/server-v3/package.json`\n- **Package**（package_manifest）：{ \"name\": \"@browserbasehq/stagehand-server-v4\", \"version\": \"3.6.4\", \"description\": \"Stagehand API server v4\", \"type\": \"module\", \"private\": true, \"scripts\": { \"build\": \"pnpm --filter @browserbasehq/stagehand-server-v4 run --parallel \\\"/^ build:server:dist gen:openapi build:sea:esm $/\\\"\", \"db:generate\": \"drizzle-kit generate --config ./drizzle.config.ts\", \"db:migrate\": \"drizzle-kit migrate --config ./drizzle.config.ts\", \"db:push\": \"drizzle-kit push --config ./drizzle.config.ts\", \"db:studio\": \"drizzle-kit studio --config ./drizzle.config.ts\", \"start\": \"tsx src/server.ts\", \"dev\": \"tsx watch src/server.ts\", \"build:server:dist\": \"pnpm -w --dir ../.. exec tsc -p packages/server-v4/tsconfig.json &&… 证据：`packages/server-v4/package.json`\n- **Browser Automation**（skill_instruction）：Automate browser interactions using the browse CLI with Claude. 
证据：`packages/evals/skills/browser/SKILL.md`\n- **License**（source_file）：Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files the \"Software\" , to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 证据：`LICENSE`\n- **Fix Anthropic Cua Triple Click**（documentation）：Fix Anthropic CUA triple click action mapping. 证据：`.changeset/fix-anthropic-cua-triple-click.md`\n- **Forward Ignore Default Args**（documentation）：Add ignoreDefaultArgs option to selectively remove chrome-launcher's built-in default flags e.g. --disable-extensions when running locally 证据：`.changeset/forward-ignore-default-args.md`\n- **Light Books Dance**（documentation）：fix core : import ToolSet from ai public export 证据：`.changeset/light-books-dance.md`\n- **Little Apples Watch**（documentation）：Fix structuredOutputMode for newer Anthropic models 证据：`.changeset/little-apples-watch.md`\n- **Swift Geckos Arrive**（documentation）：include \" selected \" or \" checked \" state in snapshot 证据：`.changeset/swift-geckos-arrive.md`\n- **@browserbasehq/stagehand**（documentation）：- Removes internal Playwright dependency - A generous 20-40% speed increase across act , extract , & observe calls - Compatibility with Playwright, Puppeteer, and Patchright - Automatic action caching agent, stagehand.act . 
Go from CUA → deterministic scripts w/o inference - A suite of non AI primitives: - page - locator built in closed mode shadow root traversal, with xpaths & css selectors - frameLocator - deepLocator crosses iframes & shadow roots - bun compatibility - Simplified extract schemas - CSS selector support id-based support coming soon - Targeted extract and observe across iframes & shadow roots - More intuitive type names observeResult is now action, act accepts an instructio… 证据：`CHANGELOG.md`\n- **Environment Information**（documentation）：Before submitting an issue, please: 证据：`.github/ISSUE_TEMPLATE/bug_report.md`\n- **Feature Request**（documentation）：Is your feature request related to a problem? Please describe. A clear and concise description of what the problem is. Ex. I'm always frustrated when ... 证据：`.github/ISSUE_TEMPLATE/feature_request.md`\n- **@browserbasehq/browse-cli**（documentation）：- 1935 https://github.com/browserbase/stagehand/pull/1935 666baf1 https://github.com/browserbase/stagehand/commit/666baf1df966b598efd89402563350319ca1aa36 Thanks @shrey150 https://github.com/shrey150 ! - Add global flags for commonly used Browserbase session parameters --proxies, --advanced-stealth, --solve-captchas, --region, --keep-alive, --session-timeout, --block-ads . These flags configure the Browserbase session in remote mode. 证据：`packages/cli/CHANGELOG.md`\n- **@browserbasehq/stagehand**（documentation）：- 2084 https://github.com/browserbase/stagehand/pull/2084 0641d44 https://github.com/browserbase/stagehand/commit/0641d44a849062f5f7ce6a36a34ee95f9840efaa Thanks @seanmcguire12 https://github.com/seanmcguire12 ! - add ignoreSelectors param to extract 证据：`packages/core/CHANGELOG.md`\n- **@browserbasehq/stagehand-lib**（documentation）：- 1027 https://github.com/browserbase/stagehand/pull/1027 455b61f https://github.com/browserbase/stagehand/commit/455b61fb6f7a34ae50d7e7c76c1d639241e213d6 Thanks @seanmcguire12 https://github.com/seanmcguire12 ! 
- Fixed small issue with module-level state guard for the Playwright selectors.register call 证据：`packages/core/lib/CHANGELOG.md`\n- **@browserbasehq/stagehand-evals**（documentation）：- Updated dependencies 21c78b3 https://github.com/browserbase/stagehand/commit/21c78b3a50fd20cbec7ca8aa5f766f55e17b0f78 , 0641d44 https://github.com/browserbase/stagehand/commit/0641d44a849062f5f7ce6a36a34ee95f9840efaa , f437f73 https://github.com/browserbase/stagehand/commit/f437f738d23951cf460a30d3d285d1eba4c78ea2 , a783b99 https://github.com/browserbase/stagehand/commit/a783b99fb947968b685050314bd1df256d7a1f5a , 8d2f354 https://github.com/browserbase/stagehand/commit/8d2f3541427ca7c9c6d9a831601a6a5babc48502 , a11603d https://github.com/browserbase/stagehand/commit/a11603d09d80f5e2fc341d154a0b90fe9fa48d1c , a87c1fc https://github.com/browserbase/stagehand/commit/a87c1fc435be83dbf14eab9edc… 证据：`packages/evals/CHANGELOG.md`\n- **@browserbasehq/stagehand-server-v3**（documentation）：- 2098 https://github.com/browserbase/stagehand/pull/2098 a783b99 https://github.com/browserbase/stagehand/commit/a783b99fb947968b685050314bd1df256d7a1f5a Thanks @seanmcguire12 https://github.com/seanmcguire12 ! - bump transitive deps to patched versions 证据：`packages/server-v3/CHANGELOG.md`\n- **@browserbasehq/stagehand-server-v4**（documentation）：- 2098 https://github.com/browserbase/stagehand/pull/2098 a783b99 https://github.com/browserbase/stagehand/commit/a783b99fb947968b685050314bd1df256d7a1f5a Thanks @seanmcguire12 https://github.com/seanmcguire12 ! 
- bump transitive deps to patched versions 证据：`packages/server-v4/CHANGELOG.md`\n- **Config**（structured_config）：{ \"$schema\": \"https://unpkg.com/@changesets/config@2.1.1/schema.json\", \"commit\": false, \"fixed\": , \"linked\": , \"ignore\": \"@browserbasehq/browse-cli\" , \"baseBranch\": \"main\", \"updateInternalDependencies\": \"patch\", \"access\": \"public\", \"changelog\": \"@changesets/changelog-github\", { \"repo\": \"browserbase/stagehand\" } , \"snapshot\": { \"useCalculatedVersion\": true, \"prereleaseTemplate\": \"alpha-{commit}\", \"tag\": \"alpha\" } } 证据：`.changeset/config.json`\n- **Tsconfig.Base**（structured_config）：{ \"compilerOptions\": { \"target\": \"ES2022\", \"esModuleInterop\": true, \"allowSyntheticDefaultImports\": true, \"noImplicitAny\": true, \"module\": \"ESNext\", \"moduleResolution\": \"node\", \"sourceMap\": true, \"inlineSources\": true, \"declaration\": true, \"skipLibCheck\": true } } 证据：`tsconfig.base.json`\n- **Tsconfig**（structured_config）：{ \"extends\": \"./tsconfig.base.json\", \"compilerOptions\": { \"outDir\": \"dist\", \"baseUrl\": \".\", \"paths\": { \" \": \"node modules/ \", \"packages/core/lib/types/ \" , \"@/ \": \"./ \" } }, \"exclude\": \"node modules\", \"dist\", \".eslintrc.cjs\" } 证据：`tsconfig.json`\n- **Turbo**（structured_config）：{ \"$schema\": \"https://v2-8-10.turborepo.dev/schema.json\", \"globalEnv\": \"CI\" , \"globalDependencies\": \".github/workflows/ci.yml\", \"package.json\", \"packages/ /package.json\", \"tsconfig.json\", \"tsconfig.base.json\", \"eslint.config.mjs\", \".prettierrc\", \".prettierignore\" , \"tasks\": { \"build\": { \"dependsOn\": \"^build\" , \"outputs\": \"dist/ \" , \"inputs\": \" / .ts\", \" / .js\", \"package.json\", \"tsconfig.json\", \"tsconfig. 
.json\", \"!dist/ \" }, \"@browserbasehq/stagehand build\": { \"dependsOn\": \"^build\", \"gen-version\", \"build-dom-scripts\" , \"outputs\": \"dist/ \" , \"inputs\": \" / .ts\", \" / .js\", \"package.json\", \"tsconfig.json\", \"tsconfig. .json\", \"!lib/version.ts\", \"!lib/dom/build/ \", \"!lib/v3/dom/build/ \", \"!dist/… 证据：`turbo.json`\n- **Tsconfig**（structured_config）：{ \"extends\": \"../../tsconfig.base.json\", \"compilerOptions\": { \"moduleResolution\": \"bundler\", \"strict\": true, \"outDir\": \"./dist\", \"rootDir\": \".\", \"types\": \"node\" }, \"include\": \"src/ / \", \"tests/ / \" , \"exclude\": \"node modules\", \"dist\" } 证据：`packages/cli/tsconfig.json`\n- **Tsconfig**（structured_config）：{ \"extends\": \"../../tsconfig.base.json\", \"compilerOptions\": { \"baseUrl\": \"../../\", \"rootDir\": \".\", \"outDir\": \"./dist/esm\", \"allowJs\": true, \"paths\": { \"@browserbasehq/stagehand\": \"packages/core/lib/v3/index.ts\" , \"@browserbasehq/stagehand/ \": \"packages/core/lib/ \" , \" \": \"node modules/ \", \"packages/core/lib/types/ \" , \"@/ \": \"./ \" } }, \"include\": \"lib/ / .ts\", \"tests/ / .ts\", \"lib/v3/cli.js\" , \"exclude\": \"node modules\", \"dist\" } 证据：`packages/core/tsconfig.json`\n- **Docs**（structured_config）：{ \"$schema\": \"https://mintlify.com/docs.json\", \"theme\": \"maple\", \"name\": \"Stagehand\", \"colors\": { \"primary\": \" 00C851\", \"light\": \" 00C851\", \"dark\": \" 00C851\" }, \"favicon\": \"/images/favicon.svg\", \"seo\": { \"indexing\": \"all\", \"metatags\": { \"og:type\": \"website\", \"og:site name\": \"Stagehand Docs\" } }, \"api\": { \"openapi\": \"https://app.stainless.com/api/spec/documented/stagehand/openapi.documented.yml\" }, \"navigation\": { \"versions\": { \"version\": \"v3\", \"dropdowns\": { \"dropdown\": \"TypeScript\", \"icon\": \"code\", \"pages\": \"v3/first-steps/introduction\" , \"groups\": { \"group\": \"First Steps\", \"pages\": \"v3/first-steps/introduction\", 
\"v3/first-steps/quickstart\", \"v3/first-steps/installation\", \"v3/first-steps/a… 证据：`packages/docs/docs.json`\n- **Evals.Config**（structured_config）：{ \"defaults\": { \"env\": \"local\", \"trials\": 3, \"concurrency\": 10, \"provider\": null, \"model\": null, \"api\": false, \"verbose\": false }, \"benchmarks\": { \"webvoyager\": { \"limit\": 25 }, \"onlineMind2Web\": { \"limit\": 25 }, \"webtailbench\": { \"limit\": 25 } } } 证据：`packages/evals/evals.config.json`\n- **Tsconfig**（structured_config）：{ \"extends\": \"../../tsconfig.base.json\", \"compilerOptions\": { \"baseUrl\": \".\", \"rootDir\": \".\", \"outDir\": \"dist/esm\", \"noEmit\": false, \"types\": \"node\" , \"paths\": { \"@browserbasehq/stagehand\": \"../core/dist/esm/index.d.ts\" } }, \"include\": \" / .ts\" , \"exclude\": \"node modules\", \"dist\" } 证据：`packages/evals/tsconfig.json`\n- **Tsconfig**（structured_config）：{ \"extends\": \"../../tsconfig.base.json\", \"compilerOptions\": { \"outDir\": \"dist\", \"rootDir\": \"src\", \"module\": \"ESNext\", \"moduleResolution\": \"bundler\", \"verbatimModuleSyntax\": false }, \"include\": \"src/ / \" , \"exclude\": \"node modules\", \"dist\" } 证据：`packages/server-v3/tsconfig.json`\n- **Tsconfig.Tests**（structured_config）：{ \"extends\": \"../../tsconfig.base.json\", \"compilerOptions\": { \"rootDir\": \".\", \"outDir\": \"dist\", \"module\": \"ESNext\", \"moduleResolution\": \"bundler\", \"verbatimModuleSyntax\": false, \"declaration\": false, \"noEmit\": false }, \"include\": \"tests/ / .ts\", \"src/ / .ts\" , \"exclude\": \"node modules\", \"dist\" } 证据：`packages/server-v3/tsconfig.tests.json`\n- **Snapshot**（structured_config）：{ \"version\": \"8\", \"dialect\": \"postgres\", \"id\": \"742a2758-ab12-40d4-90e0-7ebada53ba93\", \"prevIds\": \"00000000-0000-0000-0000-000000000000\" , \"ddl\": { \"values\": \"disconnected\", \"idle\", \"thinking\", \"permanent-error\", \"ratelimited\" , \"name\": \"llm session status\", \"entityType\": \"enums\", \"schema\": 
\"public\" }, { \"values\": \"user\", \"system-default\" , \"name\": \"llm source\", \"entityType\": \"enums\", \"schema\": \"public\" }, { \"values\": \"running\", \"terminated\" , \"name\": \"stagehand browser session status\", \"entityType\": \"enums\", \"schema\": \"public\" }, { \"values\": \"act\", \"extract\", \"observe\", \"agent\" , \"name\": \"stagehand step operation\", \"entityType\": \"enums\", \"schema\": \"public\" }, { \"isRlsEnabled\": false, \"name… 证据：`packages/server-v4/drizzle/20260327165733_fast_living_mummy/snapshot.json`\n- **Tsconfig**（structured_config）：{ \"extends\": \"../../tsconfig.base.json\", \"compilerOptions\": { \"outDir\": \"dist\", \"rootDir\": \"src\", \"module\": \"ESNext\", \"moduleResolution\": \"bundler\", \"verbatimModuleSyntax\": false }, \"include\": \"src/ / \" , \"exclude\": \"node modules\", \"dist\" } 证据：`packages/server-v4/tsconfig.json`\n- **Stagehand Project**（source_file）：This is a project that uses Stagehand V3, a browser automation framework with AI-powered act , extract , observe , and agent methods. 
证据：`.cursorrules`\n- **.Env**（source_file）：OPENAI API KEY=\"\" CEREBRAS API KEY=\"\" GROQ API KEY=\"\" BROWSERBASE API KEY=\"\" BRAINTRUST API KEY=\"\" ANTHROPIC API KEY=\"\" HEADLESS=false ENABLE CACHING=false EVAL MODELS=\"gpt-4o,claude-sonnet-4-6\" EXPERIMENTAL EVAL MODELS=\"gpt-4o,claude-sonnet-4-6,o1-mini,o1-preview\" EVAL CATEGORIES=\"observe,act,combination,extract,experimental\" AGENT EVAL MAX STEPS=50 证据：`.env.example`\n- **why**（source_file）：why what changed test plan 证据：`.github/pull_request_template`\n- **.gitignore**（source_file）：node modules/ /test-results/ /playwright-report/ /blob-report/ /playwright/.cache/ screenshot.png .DS STORE .cache/ .env downloads/ dist/ .browserbase/ packages/evals/ /public packages/core/lib/dom/build/ packages/core/lib/v3/dom/build/ packages/evals/public .tgz evals/playground.ts tmp/ eval-summary.json package-lock.json evals/deterministic/tests/BrowserContext/tmp-test.har packages/core/lib/version.ts packages/core/test-results/ /examples/inference summary /inference summary .turbo .idea coverage/ ctrf/ .stagehand-sea/ .playwright / /.playwright / packages/evals/playwright-mcp-screenshot- .png packages/evals/chrome-devtools-mcp-screenshot- .png 证据：`.gitignore`\n- **Pre Commit**（source_file）：pnpm exec lint-staged 证据：`.husky/pre-commit`\n- **.prettierignore**（source_file）：pnpm-lock.yaml README.md / .json docs/ .github/ dist/ node modules/ lib/dom/build/ lib/v3/dom/build/ packages/core/dist/ packages/core/lib/dom/build/ packages/core/lib/v3/dom/build/ packages/cli/dist/ packages/evals/dist/ packages/docs/ .min.js .browserbase/ .browserbase/ /.browserbase/ /.browserbase/ stainless.yml openapi. 
.yaml 证据：`.prettierignore`\n- **.prettierrc**（source_file）：{} 证据：`.prettierrc`\n- **Eslint.Config**（source_file）：import globals from \"globals\"; import pluginJs from \"@eslint/js\"; import tseslint from \"typescript-eslint\"; import security from \"eslint-plugin-security\"; 证据：`eslint.config.mjs`\n- **Dark Discord**（source_file）： 证据：`media/dark_discord.svg`\n- **Dark License**（source_file）： 证据：`media/dark_license.svg`\n- **Director Icon**（source_file）： 证据：`media/director_icon.svg`\n- **Light Discord**（source_file）： 证据：`media/light_discord.svg`\n- **Light License**（source_file）： 证据：`media/light_license.svg`\n- **Pnpm Workspace**（source_file）：packages: - \"packages/core\" - \"packages/cli\" - \"packages/evals\" - \"packages/docs\" - \"packages/server-v3\" - \"packages/server-v4\" 证据：`pnpm-workspace.yaml`\n- 其余 1 条证据见 `AI_CONTEXT_PACK.json` 或 `EVIDENCE_INDEX.json`。\n\n## 宿主 AI 必须遵守的规则\n\n- **把本资产当作开工前上下文，而不是运行环境。**：AI Context Pack 只包含证据化项目理解，不包含目标项目的可执行状态。 证据：`packages/docs/README.md`, `README.md`, `claude.md`\n- **回答用户时区分可预览内容与必须安装后才能验证的内容。**：安装前体验的消费者价值来自降低误装和误判，而不是伪装成真实运行。 证据：`packages/docs/README.md`, `README.md`, `claude.md`\n\n## 用户开工前应该回答的问题\n\n- 你准备在哪个宿主 AI 或本地环境中使用它？\n- 你只是想先体验工作流，还是准备真实安装？\n- 你最在意的是安装成本、输出质量、还是和现有规则的冲突？\n\n## 验收标准\n\n- 所有能力声明都能回指到 evidence_refs 中的文件路径。\n- AI_CONTEXT_PACK.md 没有把预览包装成真实运行。\n- 用户能在 3 分钟内看懂适合谁、能做什么、如何开始和风险边界。\n\n---\n\n## Doramagic Context Augmentation\n\n下面内容用于强化 Repomix/AI Context Pack 主体。Human Manual 只提供阅读骨架；踩坑日志会被转成宿主 AI 必须遵守的工作约束。\n\n## Human Manual 骨架\n\n使用规则：这里只是项目阅读路线和显著性信号，不是事实权威。具体事实仍必须回到 repo evidence / Claim Graph。\n\n宿主 AI 硬性规则：\n- 不得把页标题、章节顺序、摘要或 importance 当作项目事实证据。\n- 解释 Human Manual 骨架时，必须明确说它只是阅读路线/显著性信号。\n- 能力、安装、兼容性、运行状态和风险判断必须引用 repo evidence、source path 或 Claim Graph。\n\n- **Project Introduction**：importance `high`\n  - source_paths: package.json, packages/README.md, packages/core/package.json, pnpm-workspace.yaml, turbo.json\n- **Architecture Overview**：importance `high`\n  - source_paths: 
packages/core/lib/v3/index.ts, packages/core/lib/v3/v3.ts, packages/core/lib/v3/understudy/context.ts, packages/core/lib/v3/understudy/page.ts\n- **CDP Engine**：importance `high`\n  - source_paths: packages/core/lib/v3/understudy/cdp.ts, packages/core/lib/v3/understudy/frameRegistry.ts, packages/core/lib/v3/understudy/executionContextRegistry.ts, packages/core/lib/v3/launch/local.ts, packages/core/lib/v3/launch/browserbase.ts\n- **Core Actions**：importance `high`\n  - source_paths: packages/core/lib/v3/handlers/actHandler.ts, packages/core/lib/v3/handlers/extractHandler.ts, packages/core/lib/v3/handlers/observeHandler.ts, packages/core/lib/v3/handlers/v3AgentHandler.ts, packages/core/lib/v3/agent/tools/index.ts\n- **DOM and Accessibility Tree**：importance `high`\n  - source_paths: packages/core/lib/v3/dom/index.ts, packages/core/lib/v3/dom/piercer.entry.ts, packages/core/lib/v3/dom/piercer.runtime.ts, packages/core/lib/v3/understudy/a11y/snapshot/index.ts, packages/core/lib/v3/dom/locatorScripts/index.ts\n- **LLM Providers**：importance `high`\n  - source_paths: packages/core/lib/v3/llm/LLMProvider.ts, packages/core/lib/v3/llm/OpenAIClient.ts, packages/core/lib/v3/llm/AnthropicClient.ts, packages/core/lib/v3/llm/GoogleClient.ts, packages/core/lib/v3/llm/aisdk.ts\n- **Agent System**：importance `high`\n  - source_paths: packages/core/lib/v3/agent/AgentClient.ts, packages/core/lib/v3/agent/AgentProvider.ts, packages/core/lib/v3/agent/AnthropicCUAClient.ts, packages/core/lib/v3/agent/GoogleCUAClient.ts, packages/core/lib/v3/agent/tools/act.ts\n- **MCP Integration**：importance `medium`\n  - source_paths: packages/core/lib/v3/mcp/connection.ts, packages/core/lib/v3/mcp/utils.ts, packages/core/examples/mcp.ts\n\n## Repo Inspection Evidence / 源码检查证据\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `78bcde88e28f147acc6ca9aef9753cd96c870c35`\n- inspected_files: `pnpm-lock.yaml`, `package.json`, `README.md`, `packages/README.md`, 
`packages/evals/logger.ts`, `packages/evals/index.eval.ts`, `packages/evals/utils.ts`, `packages/evals/vitest.config.ts`, `packages/evals/silence-warnings.ts`, `packages/evals/errors.ts`, `packages/evals/env.ts`, `packages/evals/args.ts`, `packages/evals/taskConfig.ts`, `packages/evals/vitest.integration.config.ts`, `packages/evals/cli-legacy.ts`, `packages/evals/summary.ts`, `packages/evals/package.json`, `packages/evals/browserbaseCleanup.ts`, `packages/evals/README.md`, `packages/evals/scoring.ts`\n\n宿主 AI 硬性规则：\n- 没有 repo_clone_verified=true 时，不得声称已经读过源码。\n- 没有 repo_inspection_verified=true 时，不得把 README/docs/package 文件判断写成事实。\n- 没有 quick_start_verified=true 时，不得声称 Quick Start 已跑通。\n\n## Doramagic Pitfall Constraints / 踩坑约束\n\n这些规则来自 Doramagic 发现、验证或编译过程中的项目专属坑点。宿主 AI 必须把它们当作工作约束，而不是普通说明文字。\n\n### Constraint 1: 仓库名和安装名不一致\n\n- Trigger: 仓库名 `stagehand` 与安装入口 `create-browser-app` 不完全一致。\n- Host AI rule: 在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。\n- Why it matters: 用户照着仓库名搜索包或照着包名找仓库时容易走错入口。\n- Evidence: identity.distribution | github_repo:776908852 | https://github.com/browserbase/stagehand | repo=stagehand; install=create-browser-app\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 2: 能力判断依赖假设\n\n- Trigger: README/documentation is current enough for a first validation pass.\n- Host AI rule: 将假设转成下游验证清单。\n- Why it matters: 假设不成立时，用户拿不到承诺的能力。\n- Evidence: capability.assumptions | github_repo:776908852 | https://github.com/browserbase/stagehand | README/documentation is current enough for a first validation pass.\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 3: 维护活跃度未知\n\n- Trigger: 未记录 last_activity_observed。\n- Host AI rule: 补 GitHub 最近 commit、release、issue/PR 响应信号。\n- Why it matters: 新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- Evidence: evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | last_activity_observed missing\n- Hard boundary: 
不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 4: 下游验证发现风险项\n\n- Trigger: no_demo\n- Host AI rule: 进入安全/权限治理复核队列。\n- Why it matters: 下游已经要求复核，不能在页面中弱化。\n- Evidence: downstream_validation.risk_items | github_repo:776908852 | https://github.com/browserbase/stagehand | no_demo; severity=medium\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 5: 存在评分风险\n\n- Trigger: no_demo\n- Host AI rule: 把风险写入边界卡，并确认是否需要人工复核。\n- Why it matters: 风险会影响是否适合普通用户安装。\n- Evidence: risks.scoring_risks | github_repo:776908852 | https://github.com/browserbase/stagehand | no_demo; severity=medium\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 6: issue/PR 响应质量未知\n\n- Trigger: issue_or_pr_quality=unknown。\n- Host AI rule: 抽样最近 issue/PR，判断是否长期无人处理。\n- Why it matters: 用户无法判断遇到问题后是否有人维护。\n- Evidence: evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | issue_or_pr_quality=unknown\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 7: 发布节奏不明确\n\n- Trigger: release_recency=unknown。\n- Host AI rule: 确认最近 release/tag 和 README 安装命令是否一致。\n- Why it matters: 安装命令和文档可能落后于代码，用户踩坑概率升高。\n- Evidence: evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | release_recency=unknown\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n",
      "summary": "给宿主 AI 的上下文和工作边界。",
      "title": "AI Context Pack / 带给我的 AI"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card / 安装前决策卡\n\n项目：browserbase/stagehand\n\n## Doramagic 试用结论\n\n当前结论：可以进入发布前推荐检查；首次使用仍应从最小权限、临时目录和可回滚配置开始。\n\n## 用户现在可以做\n\n- 可以先阅读 Human Manual，理解项目目的和主要工作流。\n- 可以复制 Prompt Preview 做安装前体验；这只验证交互感，不代表真实运行。\n- 可以把官方 Quick Start 命令放到隔离环境中验证，不要直接进主力环境。\n\n## 现在不要做\n\n- 不要把 Prompt Preview 当成项目实际运行结果。\n- 不要把 metadata-only validation 当成沙箱安装验证。\n- 不要把未验证能力写成“已支持、已跑通、可放心安装”。\n- 不要在首次试用时交出生产数据、私人文件、真实密钥或主力配置目录。\n\n## 安装前检查\n\n- 宿主 AI 是否匹配：local_cli\n- 官方安装入口状态：已发现官方入口\n- 是否在临时目录、临时宿主或容器中验证：必须是\n- 是否能回滚配置改动：必须能\n- 是否需要 API Key、网络访问、读写文件或修改宿主配置：未确认前按高风险处理\n- 是否记录了安装命令、实际输出和失败日志：必须记录\n\n## 当前阻塞项\n\n- 无阻塞项。\n\n## 项目专属踩坑\n\n- 仓库名和安装名不一致（medium）：用户照着仓库名搜索包或照着包名找仓库时容易走错入口。 建议检查：在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。\n- 能力判断依赖假设（medium）：假设不成立时，用户拿不到承诺的能力。 建议检查：将假设转成下游验证清单。\n- 维护活跃度未知（medium）：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 下游验证发现风险项（medium）：下游已经要求复核，不能在页面中弱化。 建议检查：进入安全/权限治理复核队列。\n- 存在评分风险（medium）：风险会影响是否适合普通用户安装。 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n\n## 风险与权限提示\n\n- no_demo: medium\n\n## 证据缺口\n\n- 暂未发现结构化证据缺口。\n",
      "summary": "安装、权限、验证和推荐前风险。",
      "title": "Boundary & Risk Card / 边界与风险卡"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/browserbase/stagehand 项目说明书\n\n生成时间：2026-05-16 10:25:34 UTC\n\n## 目录\n\n- [Project Introduction](#project-introduction)\n- [Architecture Overview](#architecture-overview)\n- [CDP Engine](#cdp-engine)\n- [Core Actions](#core-actions)\n- [DOM and Accessibility Tree](#dom-accessibility)\n- [LLM Providers](#llm-providers)\n- [Agent System](#agent-system)\n- [MCP Integration](#mcp-integration)\n- [Server API](#server-api)\n- [CLI Tools](#cli-tools)\n\n<a id='project-introduction'></a>\n\n## Project Introduction\n\n### 相关页面\n\n相关主题：[Architecture Overview](#architecture-overview), [CDP Engine](#cdp-engine), [LLM Providers](#llm-providers)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/browserbase/stagehand/blob/main/README.md)\n- [packages/core/README.md](https://github.com/browserbase/stagehand/blob/main/packages/core/README.md)\n- [packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n- [packages/cli/CHANGELOG.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/CHANGELOG.md)\n- [packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [packages/core/lib/v3/agent/MicrosoftCUAClient.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n</details>\n\n# Project Introduction\n\nStagehand is an **AI-powered browser automation framework** developed by [Browserbase](https://browserbase.com) that enables developers to control web browsers using natural language instructions combined with precise code-based control. 
The framework bridges the gap between high-level AI agents and low-level browser automation tools like Selenium, Playwright, or Puppeteer.\n\n## Project Overview\n\nStagehand represents a paradigm shift in browser automation by allowing developers to choose when to leverage AI capabilities versus writing explicit code. This hybrid approach provides flexibility while maintaining reliability for production environments.\n\n### Key Value Propositions\n\n| Feature | Description |\n|---------|-------------|\n| **Hybrid Control** | Combine AI-driven navigation with deterministic code execution |\n| **Self-Healing** | Auto-caching and action recovery when website changes occur |\n| **Action Preview** | Preview AI-generated actions before execution |\n| **Repeatable Workflows** | Cache and reuse actions to save time and tokens |\n| **Production Ready** | Built for reliable automation in production systems |\n\n资料来源：[README.md:1-50]()\n\n## Architecture Overview\n\nStagehand follows a monorepo architecture using pnpm workspaces and Turborepo for efficient build management and dependency handling.\n\n```mermaid\ngraph TD\n    A[stagehand Repository] --> B[packages/core]\n    A --> C[packages/cli]\n    B --> D[Agent System]\n    B --> E[Inference Engine]\n    B --> F[Browser Context Manager]\n    D --> G[Microsoft CUA Client]\n    D --> H[Action Executors]\n    F --> I[Playwright Integration]\n    F --> J[Frame Locator]\n    E --> K[Inference Logging]\n    E --> L[Cache Management]\n    C --> M[Daemon Controller]\n    C --> N[CLI Commands]\n    M --> F\n```\n\n### Core Packages\n\n| Package | Purpose | Key Files |\n|---------|---------|-----------|\n| `packages/core` | Main automation engine with AI agent capabilities | `lib/v3/agent/*`, `lib/v3/understudy/*` |\n| `packages/cli` | Command-line interface for browser control | `CHANGELOG.md`, README commands |\n\n资料来源：[packages/cli/CHANGELOG.md:1-15]()\n\n## Project Structure\n\n```\nstagehand/\n├── packages/\n│   ├── core/     
               # Core automation package\n│   │   ├── lib/\n│   │   │   ├── v3/\n│   │   │   │   ├── agent/       # AI agent implementation\n│   │   │   │   │   ├── MicrosoftCUAClient.ts\n│   │   │   │   │   └── prompts/\n│   │   │   │   │       └── agentSystemPrompt.ts\n│   │   │   │   └── understudy/  # Context and frame management\n│   │   │   │       ├── context.ts\n│   │   │   │       └── frameLocator.ts\n│   │   │   └── inferenceLogUtils.ts\n│   │   └── README.md\n│   └── cli/                     # CLI tooling\n│       ├── CHANGELOG.md\n│       └── README.md\n├── package.json\n├── pnpm-workspace.yaml\n└── turbo.json\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:1-50]()\n\n## Agent System Architecture\n\nThe agent system is the brain of Stagehand, responsible for interpreting user instructions and generating appropriate browser actions.\n\n### Microsoft CUA Integration\n\nStagehand implements a Microsoft CUA (Computer Use Agent) client that provides structured function calling capabilities:\n\n```mermaid\ngraph LR\n    A[User Instruction] --> B[MicrosoftCUAClient]\n    B --> C[Action Generation]\n    C --> D[left_click]\n    C --> E[scroll]\n    C --> F[visit_url]\n    C --> G[type]\n    C --> H[wait]\n    C --> I[web_search]\n```\n\n#### Supported Actions\n\n| Action | Parameters | Description |\n|--------|------------|-------------|\n| `left_click` | `coordinate: [x, y]` | Click at specified coordinates |\n| `scroll` | `pixels: number` | Scroll up (positive) or down (negative) |\n| `visit_url` | `url: string` | Navigate to a URL |\n| `type` | `text, press_enter, delete_existing_text` | Type text into input fields |\n| `wait` | `time: number` | Wait for specified seconds |\n| `web_search` | `query: string` | Perform a web search |\n| `history_back` | - | Navigate back in browser history |\n| `pause_and_memorize_fact` | `fact: string` | Store information for later use |\n| `terminate` | `status: success/failure` | End the task execution 
|\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:1-80]()\n\n### System Prompt Structure\n\nThe agent uses a structured XML-based system prompt that organizes instructions into distinct sections:\n\n```mermaid\ngraph TD\n    P[System Prompt] --> A[Identity]\n    P --> T[Task Definition]\n    P --> M[Mindset Guidelines]\n    P --> G[Action Guidelines]\n    P --> N[Navigation Rules]\n    P --> S[Strategy]\n    P --> R[Roadblocks]\n    P --> V[Variables]\n    P --> C[Completion]\n```\n\nThe prompt templates include conditional sections for:\n- **Page Understanding Protocol**: Instructions for analyzing page state\n- **Search Integration**: Optional search tool usage when URL confidence is low\n- **Variable Substitution**: Support for `%variableName%` syntax in form interactions\n- **Captcha Handling**: Optional roadblocks section when auto-solving is enabled\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:50-150]()\n\n## Browser Context Management\n\nStagehand manages browser contexts through the `understudy` module, which handles complex scenarios like iframe interactions and multi-page workflows.\n\n### Frame Locator System\n\nThe frame locator handles cross-origin iframe communication and lifecycle events:\n\n```mermaid\nsequenceDiagram\n    Participant Parent as Parent Page\n    Participant Child as Child Frame\n    Participant Context as BrowserContext\n    \n    Parent->>Context: Create Frame\n    Context->>Child: Register Lifecycle Events\n    Child-->>Parent: DOMContentLoaded\n    Parent->>Context: Wait for MainWorld\n    Context-->>Parent: Execution Context Ready\n```\n\nKey responsibilities include:\n- Resolving owning `Page` by main frame ID\n- Applying initialization scripts to pages\n- Managing lifecycle events (DOMContentLoaded, load, networkIdle)\n- Handling OOPIF (Out-of-Process iframe) scenarios\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:1-50](), [packages/core/lib/v3/understudy/frameLocator.ts:1-40]()\n\n## 
Inference and Caching\n\nStagehand implements an inference logging system that enables self-healing automation by caching and reusing action results.\n\n### Summary File Structure\n\n```\n<inferenceType>_summary.json\n```\n\nWhere `inferenceType` can be:\n- `act`: Action execution results (`act_summary.json`)\n- `observe`: Page observation results (`observe_summary.json`)\n- `extract`: Data extraction results (`extract_summary.json`)\n\nThe system reads and writes JSON files containing arrays of inference results, enabling:\n- **Action Replay**: Re-execute previously successful actions\n- **Self-Healing**: Automatically recover from website changes\n- **Token Optimization**: Skip LLM inference when cached results are valid\n\n资料来源：[packages/core/lib/inferenceLogUtils.ts:1-60]()\n\n## CLI Architecture\n\nThe Stagehand CLI provides a command-line interface for browser automation with support for both local and remote browser execution.\n\n### Execution Modes\n\n| Mode | Detection | Description |\n|------|-----------|-------------|\n| `remote` | `BROWSERBASE_API_KEY` is set | Uses Browserbase cloud infrastructure |\n| `local` | Default fallback | Uses local Playwright browser |\n\nThe daemon-based architecture ensures:\n- Persistent browser sessions\n- Automatic recovery on failures\n- Session state preservation across commands\n\n资料来源：[packages/cli/README.md:1-100]()\n\n### CLI Command Categories\n\n#### Navigation Commands\n```bash\nbrowse open <url> [--wait load|domcontentloaded|networkidle]\nbrowse reload\nbrowse back\nbrowse forward\n```\n\n#### Interaction Commands\n```bash\nbrowse click <ref>\nbrowse type <text>\nbrowse press <key>\nbrowse fill <selector> <value>\n```\n\n#### Information Commands\n```bash\nbrowse get url|title|text|html|value|box\nbrowse snapshot [-c|--compact]\nbrowse screenshot\n```\n\n#### Session Management\n```bash\nbrowse start|stop|restart\nbrowse env local|remote\nbrowse attach <port|url>\n```\n\n资料来源：[packages/cli/README.md:100-200]()\n\n## Getting Started\n\n### Installation from 
Source\n\n```bash\ngit clone https://github.com/browserbase/stagehand.git\ncd stagehand\npnpm install\npnpm run build\npnpm run example\n```\n\n### Branch Installation\n\nUsing [gitpkg](https://github.com/EqualMa/gitpkg), install directly from a branch:\n\n```json\n\"@browserbasehq/stagehand\": \"https://gitpkg.now.sh/browserbase/stagehand/packages/core?<branchName>\"\n```\n\n### Environment Configuration\n\nCreate a `.env` file from the example:\n\n```bash\ncp .env.example .env\n# Add your API keys:\n# BROWSERBASE_API_KEY=your_key\n# OPENAI_API_KEY=your_key (or other LLM provider)\n```\n\n资料来源：[README.md:50-80](), [packages/core/README.md:50-80]()\n\n## Version History\n\nRecent significant changes in the CLI package:\n\n| Version | Change | PR |\n|---------|--------|-----|\n| 0.4.2 | Added `browse get markdown` command for HTML-to-markdown conversion | [#1907](https://github.com/browserbase/stagehand/pull/1907) |\n| 0.4.1 | Fixed invalid metadata key using underscore | [#1911](https://github.com/browserbase/stagehand/pull/1911) |\n| 0.4.0 | Added new feature | [#1889](https://github.com/browserbase/stagehand/pull/1889) |\n\n资料来源：[packages/cli/CHANGELOG.md:1-20]()\n\n## License and Community\n\nStagehand is released under the **MIT License**. 
The project maintains an active community through:\n\n- **Documentation**: [docs.stagehand.dev](https://docs.stagehand.dev)\n- **Discord Community**: [stagehand.dev/discord](https://stagehand.dev/discord)\n- **DeepWiki Integration**: Ask questions at [deepwiki.com/browserbase/stagehand](https://deepwiki.com/browserbase/stagehand)\n\nA Python implementation is also available at [github.com/browserbase/stagehand-python](https://github.com/browserbase/stagehand-python).\n\n资料来源：[README.md:1-30](), [packages/core/README.md:1-30]()\n\n---\n\n<a id='architecture-overview'></a>\n\n## Architecture Overview\n\n### 相关页面\n\n相关主题：[Project Introduction](#project-introduction), [CDP Engine](#cdp-engine), [Core Actions](#core-actions), [Server API](#server-api)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n- [packages/core/lib/v3/understudy/frameLocator.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n- [packages/core/lib/v3/agent/MicrosoftCUAClient.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n- [README.md](https://github.com/browserbase/stagehand/blob/main/README.md)\n- [packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n\n</details>\n\n# Architecture Overview\n\n## Introduction\n\nStagehand is an AI-powered browser automation framework that enables developers to control web browsers using natural language instructions combined with precise code-based control. 
The architecture is designed to bridge the gap between high-level AI agents and low-level browser automation frameworks like Playwright, providing reliability, extensibility, and self-healing capabilities essential for production environments.\n\nThe framework operates on a multi-layered architecture that separates concerns between browser session management, agent reasoning, tool execution, and page interaction primitives. This design allows users to choose when to leverage AI for navigating unfamiliar pages and when to use explicit code for deterministic operations.\n\n## Core Architecture Layers\n\nStagehand's architecture consists of four primary layers working in concert to provide browser automation capabilities:\n\n| Layer | Purpose | Key Components |\n|-------|---------|----------------|\n| **Agent Layer** | High-level reasoning and decision making | MicrosoftCUAClient, System Prompts |\n| **Context Layer** | Browser session and page lifecycle management | BrowserContext, Page management |\n| **Understudy Layer** | Low-level browser primitives and frame handling | FrameLocator, Page interactions |\n| **Transport Layer** | Local/Remote browser connectivity | CDP (Chrome DevTools Protocol) |\n\n## Browser Context Management\n\nThe `BrowserContext` class (defined in `context.ts`) serves as the central hub for managing browser sessions. It handles multiple pages, initialization scripts, and coordinate-based element interactions.\n\n### Page Management\n\nThe context maintains a mapping of pages organized by target ID, enabling multi-tab support:\n\n```typescript\npages(): Page[] {\n  const rows: Array<{ tid: TargetId; page: Page; created: number }> = [];\n  for (const [tid, page] of this.pagesByTarget) {\n    if (this.typeByTarget.get(tid) === \"page\") {\n      rows.push({ tid, page, created: this.createdAtByTarget.get(tid) ?? 
0 });\n    }\n  }\n  rows.sort((a, b) => a.created - b.created);\n  return rows.map((r) => r.page);\n}\n```\n\n资料来源：[context.ts:lines]()\n\nPages are returned in creation order (oldest to newest), and OOPIF (Out-of-Process iframe) targets are intentionally excluded from this listing to maintain a clean multi-tab abstraction.\n\n### Initialization Scripts\n\nThe context supports both seeding and full registration of initialization scripts:\n\n```typescript\nprivate async applyInitScriptsToPage(\n  page: Page,\n  opts?: { seedOnly?: boolean },\n): Promise<void> {\n  if (opts?.seedOnly) {\n    for (const source of this.initScripts) {\n      page.seedInitScript(source);\n    }\n    return;\n  }\n  for (const source of this.initScripts) {\n    await page.registerInitScript(source);\n  }\n}\n```\n\n资料来源：[context.ts:lines]()\n\nThis dual-mode initialization allows scripts to be either eagerly registered or lazily seeded for later injection.\n\n## Frame Handling Architecture\n\nStagehand implements sophisticated frame management through the `FrameLocator` module, handling both same-process and cross-process frame scenarios including OOPIFs.\n\n### Lifecycle-Aware Frame Attachment\n\nThe frame attachment process waits for specific lifecycle events before exposing the main world context:\n\n```typescript\nconst onLifecycle = (evt: Protocol.Page.LifecycleEventEvent) => {\n  if (\n    evt.frameId !== childFrameId ||\n    (evt.name !== \"DOMContentLoaded\" &&\n      evt.name !== \"load\" &&\n      evt.name !== \"networkIdle\" &&\n      evt.name !== \"networkidle\")\n  ) {\n    return;\n  }\n  if (hasMainWorldOnParent()) return finish();\n  // ... 
handle frame ownership transfer\n};\n```\n\n资料来源：[frameLocator.ts:lines]()\n\nThis approach ensures that automation scripts don't attempt to interact with frames before they have reached a stable state.\n\n## Agent System Design\n\n### System Prompt Architecture\n\nThe agent receives structured prompts that organize its behavior into distinct sections:\n\n```xml\n<system>\n  <identity>You are a web automation assistant...</identity>\n  <task>\n    <goal>${executionInstruction}</goal>\n    <date display=\"local\" iso=\"${isoDate}\">${localeDate}</date>\n  </task>\n  <page>\n    <startingUrl>...</startingUrl>\n  </page>\n  <mindset>...</mindset>\n  <guidelines>...</guidelines>\n  <page_understanding_protocol>...</page_understanding_protocol>\n  <navigation>...</navigation>\n  <tools>...</tools>\n  <strategy>...</strategy>\n  <roadblocks>...</roadblocks>\n  <variables>...</variables>\n  <completion>...</completion>\n</system>\n```\n\n资料来源：[agentSystemPrompt.ts:lines]()\n\n### Hybrid Mode Page Understanding\n\nThe system supports two modes of page understanding based on the `isHybridMode` flag:\n\n| Mode | Primary Tool | Secondary Tool | Use Case |\n|------|-------------|----------------|----------|\n| **Hybrid** | `screenshot` | `ariaTree` | Visual confirmation + accessible content |\n| **Standard** | `ariaTree` | `screenshot` | Text-focused accessibility tree |\n\n资料来源：[agentSystemPrompt.ts:lines]()\n\nThe page understanding protocol ensures agents start by comprehending the page state before taking actions:\n\n```xml\n<page_understanding_protocol>\n  <step_1>\n    <title>UNDERSTAND THE PAGE</title>\n    <primary_tool>\n      <name>screenshot|ariaTree</name>\n      <usage>Get complete page context before taking actions</usage>\n    </primary_tool>\n  </step_1>\n</page_understanding_protocol>\n```\n\n### Microsoft CUA Agent Actions\n\nThe Microsoft CUA client defines a comprehensive set of agent actions:\n\n| Action | Purpose | Required Parameters 
|\n|--------|---------|---------------------|\n| `left_click` | Click at coordinate | `coordinate: [x, y]` |\n| `scroll` | Scroll page | `pixels: number`, `coordinate?: [x, y]` |\n| `visit_url` | Navigate to URL | `url: string` |\n| `web_search` | Perform search | `query: string` |\n| `history_back` | Navigate backward | None |\n| `pause_and_memorize_fact` | Store context | `fact: string` |\n| `wait` | Pause execution | `time: number` |\n| `terminate` | End task | `status: success\\|failure` |\n| `type` | Input text | `text: string`, `press_enter?: boolean` |\n| `key` | Keyboard input | `keys: string[]` |\n\n资料来源：[MicrosoftCUAClient.ts:lines]()\n\n### FARA Function Calling Protocol\n\nThe agent uses an XML-based function calling format:\n\n```xml\n<tools>\n${toolSchema}\n</tools>\n\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>\n```\n\n资料来源：[MicrosoftCUAClient.ts:lines]()\n\nThis format separates agent thoughts from function calls, enabling clear reasoning before action execution.\n\n## Variable System\n\nThe framework supports variable substitution in tool calls using `%variableName%` syntax:\n\n```typescript\nconst variableToolsNote = isHybridMode\n  ? 
\"Use %variableName% syntax in the type, fillFormVision, or act tool's value/text/action fields.\"\n  : \"Use %variableName% syntax in the act or fillForm tool's action fields.\";\n```\n\n资料来源：[agentSystemPrompt.ts:lines]()\n\nVariables are defined with optional descriptions and rendered in XML format:\n\n```xml\n<variables>\n  <variable name=\"password\" />\n  <variable name=\"username\">The username for login</variable>\n</variables>\n```\n\n## Session Management\n\n### Daemon-Based Architecture\n\nThe CLI architecture uses a persistent daemon for browser session management:\n\n```mermaid\ngraph TD\n    A[CLI Command] --> B[Daemon Check]\n    B -->|Running| C[Send Command]\n    B -->|Not Running| D[Auto-restart Daemon]\n    D --> C\n    C --> E[Browser Instance]\n    E --> F[Page Operations]\n```\n\nThe daemon supports two execution modes:\n\n| Mode | Trigger | Use Case |\n|------|---------|----------|\n| `remote` | `BROWSERBASE_API_KEY` is set | Cloud browser infrastructure |\n| `local` | No API key detected | Local browser instances |\n\n### Multi-Session Support\n\nSessions can be named for parallel browser instances:\n\n```bash\nbrowse --session <name>\n```\n\nThe context stores metadata using session names, enabling clean separation of concurrent automation tasks.\n\n## Tool Categories\n\n### Navigation Tools\n\n| Tool | Description |\n|------|-------------|\n| `open` | Navigate to URL with configurable wait states |\n| `reload` | Refresh current page |\n| `back` / `forward` | Browser history navigation |\n\n### Interaction Tools\n\n| Tool | Description |\n|------|-------------|\n| `click` | Click element by reference or coordinates |\n| `type` | Text input with optional delays |\n| `press` | Keyboard shortcuts |\n| `hover` | Mouse hover at coordinates |\n| `scroll` | Scroll operations with delta support |\n\n### Page Information Tools\n\n| Tool | Description |\n|------|-------------|\n| `snapshot` | Accessibility tree with element refs |\n| `screenshot` | 
Visual capture (PNG/JPEG) |\n| `get` | Retrieve URL, title, text, HTML, values |\n| `get markdown` | Convert HTML to markdown |\n\n## Strategy Guidelines\n\nThe system embeds strategic guidelines for agent behavior:\n\n```xml\n<strategy>\n  <item>CRITICAL: Use extract ONLY when the task explicitly requires structured data output</item>\n  <item>Keep actions atomic and verify outcomes before proceeding</item>\n  <item>For each action, provide clear reasoning about why you're taking that step</item>\n  <item>When you need to input text, prefer using the keys tool to type the entire sequence at once</item>\n</strategy>\n```\n\n资料来源：[agentSystemPrompt.ts:lines]()\n\n## Configuration Options\n\nThe agent system prompt accepts the following configuration parameters:\n\n| Parameter | Type | Purpose |\n|-----------|------|---------|\n| `executionInstruction` | string | Task description for the agent |\n| `url` | string | Starting page URL |\n| `systemInstructions` | string | Custom system-level instructions (CDATA) |\n| `variables` | Record | Variable name/value pairs |\n| `isHybridMode` | boolean | Enable screenshot-first page understanding |\n| `captchasAutoSolve` | boolean | Enable CAPTCHA auto-solving (shows roadblocks section) |\n| `hasSearch` | boolean | Enable search tool usage |\n| `isLocal` | boolean | Local vs remote browser context |\n\n## Error Handling\n\nThe context manages HTTP header configuration errors with detailed session reporting:\n\n```typescript\nconst failures = Array.from(result.errors.entries()).map(([target, entry]) => {\n  const reason = entry.result.reason as Error;\n  const sid = entry.session.id ?? \"unknown\";\n  const message = reason?.message ?? String(reason);\n  return `session=${sid} error=${message}`;\n});\n\nif (failures.length) {\n  throw new StagehandSetExtraHTTPHeadersError(failures);\n}\n```\n\n资料来源：[context.ts:lines]()\n\n## Extension Points\n\nStagehand provides several extension mechanisms:\n\n1. 
**Custom Instructions**: Via `systemInstructions` parameter with CDATA wrapping for complex multi-line content\n2. **Variables**: For sensitive data injection (passwords, tokens)\n3. **Init Scripts**: JavaScript injection at page load time\n4. **Tool Schema**: Extensible action definitions in the Microsoft CUA client\n\n---\n\n<a id='cdp-engine'></a>\n\n## CDP Engine\n\n### 相关页面\n\n相关主题：[Architecture Overview](#architecture-overview), [DOM and Accessibility Tree](#dom-accessibility), [Core Actions](#core-actions)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/understudy/cdp.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/cdp.ts)\n- [packages/core/lib/v3/understudy/frameRegistry.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameRegistry.ts)\n- [packages/core/lib/v3/understudy/executionContextRegistry.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/executionContextRegistry.ts)\n- [packages/core/lib/v3/launch/local.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/launch/local.ts)\n- [packages/core/lib/v3/launch/browserbase.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/launch/browserbase.ts)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n- [packages/core/lib/v3/understudy/cookies.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/cookies.ts)\n- [packages/core/lib/v3/understudy/frameLocator.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n</details>\n\n# CDP Engine\n\nThe CDP (Chrome DevTools Protocol) Engine is the low-level abstraction layer in Stagehand that provides direct communication between the framework and web browsers. 
It orchestrates browser launching, session management, frame handling, execution context management, and CDP command dispatching.\n\n## Architecture Overview\n\n```mermaid\ngraph TD\n    A[Stagehand API] --> B[CDP Engine]\n    B --> C[Launch Module]\n    B --> D[Session Manager]\n    B --> E[Frame Registry]\n    B --> F[Execution Context Registry]\n    C --> G[Local Browser]\n    C --> H[Browserbase Cloud]\n    D --> I[CDP WebSocket Connections]\n    E --> J[Frame Lifecycle Events]\n    F --> K[Isolated JS Contexts]\n```\n\nThe CDP Engine serves as the foundation layer that Stagehand's high-level browser automation primitives are built upon. It abstracts away the complexity of raw CDP WebSocket communication while providing type-safe interfaces for all browser operations.\n\n资料来源：[packages/core/lib/v3/understudy/cdp.ts:1-50]()\n\n## Core Components\n\n### Launch Module\n\nThe launch module handles browser instantiation across different deployment environments.\n\n| Launch Mode | Source File | Description |\n|-------------|-------------|-------------|\n| Local Browser | `packages/core/lib/v3/launch/local.ts` | Launches Chromium locally using Playwright's browser management |\n| Browserbase Cloud | `packages/core/lib/v3/launch/browserbase.ts` | Connects to remote browser infrastructure for scalable execution |\n\n```mermaid\ngraph LR\n    A[Launch Request] --> B{Local or Remote?}\n    B -->|Local| C[local.ts]\n    B -->|Remote| D[browserbase.ts]\n    C --> E[Playwright Browser]\n    D --> F[CDP Endpoint]\n```\n\nThe local launch implementation uses Playwright's browser management system to spawn Chromium instances with the necessary debugging flags enabled. 
Remote launches establish WebSocket connections to Browserbase's cloud browser fleet.\n\n资料来源：[packages/core/lib/v3/launch/local.ts:1-100]()\n\n### Session Manager\n\nThe Session Manager (`context.ts`) maintains active browser pages and their associated CDP sessions.\n\n```typescript\n// Pages retrieval - returns top-level pages oldest to newest\npages(): Page[] {\n  const rows: Array<{ tid: TargetId; page: Page; created: number }> = [];\n  for (const [tid, page] of this.pagesByTarget) {\n    if (this.typeByTarget.get(tid) === \"page\") {\n      rows.push({ tid, page, created: this.createdAtByTarget.get(tid) ?? 0 });\n    }\n  }\n  rows.sort((a, b) => a.created - b.created);\n  return rows.map((r) => r.page);\n}\n```\n\nThe session manager intentionally excludes OOPIF (Out-of-Process iframe) targets from page listings, focusing on top-level browsing contexts.\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:60-75]()\n\n### Frame Registry\n\nThe Frame Registry (`frameRegistry.ts`) tracks all frame instances across page hierarchies. 
It maintains the relationship between parent and child frames, enabling accurate frame targeting for CDP commands.\n\n| Method | Purpose |\n|--------|---------|\n| `registerFrame` | Track newly created frames |\n| `unregisterFrame` | Clean up destroyed frames |\n| `getParentFrame` | Resolve parent frame context |\n| `getChildFrames` | List direct children of a frame |\n\n### Execution Context Registry\n\nThe Execution Context Registry (`executionContextRegistry.ts`) manages JavaScript execution contexts within frames.\n\n```mermaid\ngraph TD\n    A[Page Load] --> B[Create Main World Context]\n    B --> C[Optional: Create Isolated World]\n    C --> D[Register Context in Registry]\n    D --> E[Frame Ready for Script Execution]\n    \n    F[iframe Navigation] --> G[Create New Context]\n    G --> D\n```\n\nEach frame can have multiple execution contexts:\n- **Main World**: The default JavaScript context where page scripts run\n- **Isolated World**: Sandboxed contexts for extension scripts or injected code\n\n资料来源：[packages/core/lib/v3/understudy/executionContextRegistry.ts:1-80]()\n\n## Frame Lifecycle Management\n\nThe `frameLocator.ts` module handles the complex state transitions of browser frames, particularly for iframes and cross-origin navigations.\n\n```typescript\nconst onLifecycle = (evt: Protocol.Page.LifecycleEventEvent) => {\n  if (\n    evt.frameId !== childFrameId ||\n    (evt.name !== \"DOMContentLoaded\" &&\n      evt.name !== \"load\" &&\n      evt.name !== \"networkIdle\" &&\n      evt.name !== \"networkidle\")\n  ) {\n    return;\n  }\n  // Handle frame initialization\n};\n```\n\nKey lifecycle events monitored:\n- `DOMContentLoaded`: Frame DOM is parsed\n- `load`: All resources loaded\n- `networkIdle` / `networkidle`: No pending network requests\n\n资料来源：[packages/core/lib/v3/understudy/frameLocator.ts:25-40]()\n\n## Cookie Management\n\nThe `cookies.ts` module provides CDP-compatible cookie handling with validation and 
normalization.\n\n```typescript\nexport function normalizeCookieParams(cookies: CookieParam[]): CookieParam[] {\n  return cookies.map((c) => {\n    if (!c.url && !(c.domain && c.path)) {\n      throw new CookieValidationError(\n        `Cookie \"${c.name}\" must have a url or a domain/path pair`\n      );\n    }\n    // Additional validation for secure/sameSite pairing\n  });\n}\n```\n\n| Validation Rule | CDP Requirement |\n|-----------------|-----------------|\n| URL or Domain/Path | Cookies must have either `url` or both `domain`+`path` |\n| Secure + SameSite=None | Browsers require `secure: true` when using `sameSite: \"None\"` |\n| Host-only cookies | When `url` provided, `domain` and `path` are derived |\n\nThe module enforces these constraints before sending cookies to CDP, preventing silent failures from browser rejections.\n\n资料来源：[packages/core/lib/v3/understudy/cookies.ts:30-60]()\n\n## CDP Command Dispatch\n\nThe CDP Engine provides a unified interface for sending commands to browsers:\n\n```mermaid\nsequenceDiagram\n    Client->>CDP Engine: Execute CDP Command\n    CDP Engine->>Validation: Check Parameters\n    Validation->>Session Manager: Route to Correct Session\n    Session Manager->>CDP WebSocket: Send Command\n    CDP WebSocket-->>Session Manager: CDP Response\n    Session Manager-->>CDP Engine: Typed Response\n    CDP Engine-->>Client: Result\n```\n\n### Supported Command Categories\n\n| Category | Capabilities |\n|----------|--------------|\n| Page Operations | Navigation, reload, back/forward |\n| Frame Management | Frame creation, destruction, activation |\n| Script Execution | Evaluate expressions, call functions |\n| Input Handling | Mouse, keyboard, touch events |\n| Network Monitoring | Request/response capture |\n| Runtime Inspection | Console access, breakpoints |\n\n## Initialization Scripts\n\nThe CDP Engine supports injecting initialization scripts into pages:\n\n```typescript\nprivate async applyInitScriptsToPage(\n  page: Page,\n  
opts?: { seedOnly?: boolean },\n): Promise<void> {\n  if (opts?.seedOnly) {\n    for (const source of this.initScripts) {\n      page.seedInitScript(source);\n    }\n    return;\n  }\n  for (const source of this.initScripts) {\n    await page.registerInitScript(source);\n  }\n}\n```\n\n| Script Type | Timing | Use Case |\n|-------------|--------|----------|\n| `seedInitScript` | Before page loads | Pre-inject polyfills, shims |\n| `registerInitScript` | After page load | Runtime modifications |\n\n## Error Handling\n\nThe CDP Engine implements robust error handling for common CDP failure scenarios:\n\n```typescript\n// Session-level error collection\nconst failures = pendingEntries\n  .filter((entry) => entry.result.status === \"failure\")\n  .map((entry) => {\n    const reason = entry.result.reason as Error;\n    const sid = entry.session.id ?? \"unknown\";\n    const message = reason?.message ?? String(reason);\n    return `session=${sid} error=${message}`;\n  });\n\nif (failures.length) {\n  throw new StagehandSetExtraHTTPHeadersError(failures);\n}\n```\n\nCustom error types:\n- `StagehandSetExtraHTTPHeadersError`: Failed header injection\n- `CookieValidationError`: Invalid cookie parameters\n- `ExecutionContextError`: Script execution failures\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:45-55]()\n\n## Integration with Agent System\n\nThe CDP Engine is used by higher-level agents (like MicrosoftCUAClient) to perform browser automation:\n\n```typescript\n// Supported actions in Microsoft CUA integration\nconst supportedActions = [\n  \"left_click\",\n  \"scroll\",\n  \"visit_url\",\n  \"web_search\",\n  \"history_back\",\n  \"pause_and_memorize_fact\",\n  \"wait\",\n  \"terminate\",\n];\n```\n\nEach action maps to specific CDP commands, with the CDP Engine abstracting the protocol-level details.\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:50-70]()\n\n## Configuration\n\n| Option | Type | Default | Description 
|\n|--------|------|---------|-------------|\n| `headless` | boolean | auto | Run browser in headless mode |\n| `timeout` | number | 30000 | Default command timeout (ms) |\n| `viewport` | Viewport | 1280x720 | Browser viewport size |\n| `userAgent` | string | auto | Custom user agent string |\n| `ignoreHTTPSErrors` | boolean | false | Allow invalid certificates |\n\n## Summary\n\nThe CDP Engine is the foundational layer that enables Stagehand's browser automation capabilities. It provides:\n\n1. **Abstraction**: Type-safe interfaces over raw CDP WebSocket communication\n2. **Multi-environment**: Support for both local and cloud browser execution\n3. **Lifecycle management**: Frame and execution context tracking\n4. **Validation**: Cookie and parameter normalization before CDP commands\n5. **Error handling**: Structured error types and recovery mechanisms\n\nAll high-level browser automation features in Stagehand build upon the CDP Engine's primitives, making it essential for understanding the framework's internals.\n\n---\n\n<a id='core-actions'></a>\n\n## Core Actions\n\n### 相关页面\n\n相关主题：[CDP Engine](#cdp-engine), [DOM and Accessibility Tree](#dom-accessibility), [Agent System](#agent-system)\n\n<details>\n<summary>Related Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/agent/MicrosoftCUAClient.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n- [packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [packages/core/lib/v3/handlers/actHandler.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/handlers/actHandler.ts) (referenced)\n- [packages/core/lib/v3/handlers/extractHandler.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/handlers/extractHandler.ts) (referenced)\n- 
[packages/core/lib/v3/handlers/observeHandler.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/handlers/observeHandler.ts) (referenced)\n- [packages/core/lib/v3/agent/tools/index.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/tools/index.ts) (referenced)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n</details>\n\n# Core Actions\n\n## Overview\n\nCore Actions are the fundamental building blocks of browser automation in Stagehand. They represent the primitive operations that the AI agent can perform to interact with web pages, extract information, and accomplish user-defined tasks. These actions bridge the gap between natural language instructions and executable browser operations.\n\nThe Core Actions system is designed around a modular architecture where different handlers manage specific types of interactions. Each handler is responsible for a category of actions, implementing the logic to translate high-level agent decisions into low-level browser commands.\n\n资料来源：[packages/core/lib/v3/handlers/v3AgentHandler.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/handlers/v3AgentHandler.ts)\n\n## Architecture\n\nThe Core Actions system follows a layered architecture:\n\n```mermaid\ngraph TD\n    A[User Instruction] --> B[Agent Handler]\n    B --> C[Action Router]\n    C --> D[actHandler]\n    C --> E[extractHandler]\n    C --> F[observeHandler]\n    D --> G[Browser CDP Commands]\n    E --> G\n    F --> G\n    H[Microsoft CUA Client] --> G\n```\n\n### Handler Components\n\n| Handler | Purpose | Key Operations |\n|---------|---------|-----------------|\n| `actHandler` | Execute user interaction actions | click, type, scroll, hover, drag |\n| `extractHandler` | Extract structured data from pages | DOM parsing, content extraction |\n| `observeHandler` | Analyze and understand page state | 
accessibility tree, element detection |\n| `v3AgentHandler` | Orchestrate agent workflow | task planning, action coordination |\n\n资料来源：[packages/core/lib/v3/handlers/actHandler.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/handlers/actHandler.ts)\n\n## Action Types\n\n### Browser Navigation Actions\n\n| Action | Parameters | Description |\n|--------|------------|-------------|\n| `visit_url` | `url`, `wait` | Navigate to a specified URL with optional wait state |\n| `history_back` | - | Navigate back in browser history |\n| `history_forward` | - | Navigate forward in browser history |\n| `reload` | - | Reload the current page |\n\n### Interaction Actions\n\n| Action | Parameters | Description |\n|--------|------------|-------------|\n| `left_click` | `coordinate`, `element` | Click at coordinates or on an element |\n| `right_click` | `coordinate`, `element` | Perform right-click context menu action |\n| `mouse_move` | `coordinate` | Move mouse to specified position |\n| `scroll` | `pixels`, `coordinate` | Scroll by pixel amount (positive=up, negative=down) |\n| `drag` | `from`, `to`, `steps` | Perform drag operation with interpolation |\n| `type` | `text`, `press_enter`, `delete_existing_text` | Type text into focused input fields |\n\n### Information Retrieval Actions\n\n| Action | Parameters | Description |\n|--------|------------|-------------|\n| `extract` | `schema` | Extract structured data matching a Zod schema |\n| `screenshot` | `full_page` | Capture visual representation of page |\n| `aria_tree` | `ref` | Get accessibility tree for element inspection |\n\n### Utility Actions\n\n| Action | Parameters | Description |\n|--------|------------|-------------|\n| `wait` | `time` | Pause execution for specified seconds |\n| `pause_and_memorize_fact` | `fact` | Store information for later retrieval |\n| `web_search` | `query` | Execute a web search |\n| `terminate` | `status` | End the task with success or failure status 
|\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:1-100](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n\n## Action Parameters\n\n### Common Parameters\n\n```typescript\ninterface ActionParameters {\n  action: string;           // The action type to perform\n  element?: string;          // Element reference (e.g., \"@0-5\")\n  coordinate?: [number, number]; // [x, y] pixel coordinates\n  ref?: string;             // Element reference in ariaTree\n}\n```\n\n### Type Action Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `text` | `string` | Text to type into the input field |\n| `press_enter` | `boolean` | Whether to press Enter after typing |\n| `delete_existing_text` | `boolean` | Clear existing text before typing |\n\n### Scroll Action Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `pixels` | `number` | Positive values scroll up, negative scroll down |\n| `coordinate` | `[number, number]` | Optional target coordinates for viewport-relative scrolling |\n\n### Extract Action Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `schema` | `ZodSchema` | Zod schema defining the extraction structure |\n| `prompt` | `string` | Natural language description of data to extract |\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:20-85](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n\n## Tool Schema\n\nThe agent uses the function-calling template of Fara (Microsoft's computer-use agent model) for tool execution:\n\n```\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>\n${toolSchema}\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{{\"name\": <function-name>, \"arguments\": 
<args-json-object>}}\n</tool_call>\n```\n\nThe tool schema defines the complete contract between the agent reasoning system and the browser automation layer:\n\n```typescript\n{\n  \"name\": \"browser_automation\",\n  \"description\": \"Primary tool for browser automation tasks\",\n  \"parameters\": {\n    \"type\": \"object\",\n    \"properties\": {\n      \"action\": {\n        \"type\": \"string\",\n        \"enum\": [\"left_click\", \"scroll\", \"visit_url\", ...]\n      },\n      \"coordinate\": {\n        \"type\": \"array\",\n        \"description\": \"(x, y): The x and y coordinates for mouse operations\"\n      },\n      \"pixels\": {\n        \"type\": \"number\",\n        \"description\": \"Scroll amount; positive = up, negative = down\"\n      }\n    },\n    \"required\": [\"action\"]\n  }\n}\n```\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:85-130](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n\n## Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Agent\n    participant Handler\n    participant CDP as Chrome DevTools Protocol\n    \n    User->>Agent: Natural language instruction\n    Agent->>Agent: Reason and plan action\n    Agent->>Handler: Execute action with parameters\n    Handler->>CDP: Translate to CDP command\n    CDP-->>Handler: Operation result\n    Handler-->>Agent: Action completion status\n    Agent->>User: Task progress update\n```\n\n## Page Context Management\n\nThe Context class manages page state and frame handling for multi-tab scenarios:\n\n```typescript\npages(): Page[] {\n  const rows: Array<{ tid: TargetId; page: Page; created: number }> = [];\n  for (const [tid, page] of this.pagesByTarget) {\n    if (this.typeByTarget.get(tid) === \"page\") {\n      rows.push({ tid, page, created: this.createdAtByTarget.get(tid) ?? 
0 });\n    }\n  }\n  rows.sort((a, b) => a.created - b.created);\n  return rows.map((r) => r.page);\n}\n```\n\nThe context system maintains:\n- Active page instances by target ID\n- Frame lifecycle management\n- Execution context tracking per frame\n- Initialization script injection\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:1-50](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n\n## Frame Locator\n\nFor handling nested frames and iframes, the frame locator system waits for frame readiness:\n\n```mermaid\ngraph TD\n    A[Parent Page] --> B{Has Main World?}\n    B -->|No| C[Enable Lifecycle Events]\n    C --> D[Wait for DOMContentLoaded]\n    D --> E{Has Main World?}\n    E -->|Yes| F[Continue]\n    E -->|No| G[Wait with timeout]\n    G --> H[Get Session for Frame]\n    H --> I[Wait for Main World]\n    I --> F\n    B -->|Yes| F\n```\n\n资料来源：[packages/core/lib/v3/understudy/frameLocator.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n\n## System Prompt Configuration\n\nThe agent's behavior is configured through system prompts that include:\n\n- **Task Definition**: The execution instruction and goal context\n- **Page Understanding Protocol**: When to use ariaTree vs screenshot\n- **Strategy Guidelines**: Best practices for action sequencing\n- **Variable Substitution**: Support for dynamic values via `%variableName%` syntax\n\n```typescript\nconst systemPrompt = `<system>\n  <identity>You are a web automation assistant using browser automation tools</identity>\n  <task>\n    <goal>${cdata(executionInstruction)}</goal>\n    <date display=\"local\" iso=\"${isoDate}\">${localeDate}</date>\n  </task>\n  ${customInstructionsBlock}\n  <page_understanding_protocol>\n    ${isHybridMode ? 
screenshot + ariaTree : ariaTree + screenshot}\n  </page_understanding_protocol>\n  ${variablesSection}\n</system>`;\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n\n## Error Handling\n\nActions can fail due to various reasons:\n\n| Error Type | Cause | Recovery Strategy |\n|------------|-------|-------------------|\n| Element not found | Selector invalid or element removed | Re-observe page, retry with updated selector |\n| Action timeout | Page not responding | Wait and retry |\n| Frame detached | Frame navigated away | Re-acquire frame context |\n| CDP error | Protocol communication failure | Retry CDP command |\n\nFailed actions are logged with session context for debugging:\n\n```typescript\nconst failures = await Promise.allSettled(\n  this.context.extraHTTPHeaders.map(async ({ entry }) => {\n    const result = await entry.result;\n    const reason = entry.result.reason as Error;\n    const message = reason?.message ?? String(reason);\n    return `session=${sid} error=${message}`;\n  })\n);\n```\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:50-80](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n\n## Best Practices\n\n1. **Atomic Actions**: Keep actions focused and single-purpose for better reliability\n2. **Element References**: Use ariaTree references (e.g., `@0-5`) when available for precise targeting\n3. **Wait Appropriately**: Allow page state to stabilize before subsequent actions\n4. **Error Recovery**: The system supports automatic retry and self-healing mechanisms\n5. 
**Variable Usage**: Use `%variableName%` for sensitive data like passwords\n\n## See Also\n\n- [Agent Handler](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/handlers/v3AgentHandler.ts)\n- [Microsoft CUA Client](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n- [Understudy Context](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n- [Frame Locator](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n\n---\n\n<a id='dom-accessibility'></a>\n\n## DOM and Accessibility Tree\n\n### 相关页面\n\n相关主题：[CDP Engine](#cdp-engine), [Core Actions](#core-actions), [DOM and Accessibility Tree](#dom-accessibility)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/dom/index.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/dom/index.ts)\n- [packages/core/lib/v3/dom/piercer.entry.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/dom/piercer.entry.ts)\n- [packages/core/lib/v3/dom/piercer.runtime.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/dom/piercer.runtime.ts)\n- [packages/core/lib/v3/understudy/a11y/snapshot/index.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/a11y/snapshot/index.ts)\n- [packages/core/lib/v3/dom/locatorScripts/index.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/dom/locatorScripts/index.ts)\n</details>\n\n# DOM and Accessibility Tree\n\n## Overview\n\nThe DOM and Accessibility Tree system in Stagehand provides the foundational mechanism for understanding and interacting with web pages. This dual-layer approach combines native DOM manipulation with accessibility tree analysis to enable reliable browser automation through AI agents.\n\nThe system serves two primary purposes:\n1. **DOM Interaction** - Direct manipulation of page elements including clicking, typing, scrolling, and form handling\n2. 
**Accessibility Tree (ariaTree)** - Structured representation of page content optimized for AI comprehension and element discovery\n\nThis architecture allows Stagehand to balance the precision of programmatic DOM access with the semantic understanding provided by accessibility APIs.\n\n---\n\n## Architecture\n\n### Component Overview\n\n```mermaid\ngraph TD\n    A[AI Agent] -->|Requests Page Context| B[Accessibility Tree Snapshot]\n    A -->|Interacts with Elements| C[DOM Locator Scripts]\n    \n    B --> D[ariaTree Tool]\n    D -->|Returns Full Page Structure| A\n    \n    C --> E[Element Locator Scripts]\n    E -->|Executes in Browser Context| F[Target Web Page]\n    \n    F -->|Accessibility APIs| B\n    F -->|DOM APIs| C\n    \n    G[Piercer Module] -->|Runtime Injection| F\n```\n\n### Core Modules\n\n| Module | File Path | Purpose |\n|--------|-----------|---------|\n| DOM Entry | `packages/core/lib/v3/dom/index.ts` | Main export and module initialization |\n| Piercer Entry | `packages/core/lib/v3/dom/piercer.entry.ts` | Browser extension entry point |\n| Piercer Runtime | `packages/core/lib/v3/dom/piercer.runtime.ts` | Runtime script injection |\n| A11y Snapshot | `packages/core/lib/v3/understudy/a11y/snapshot/index.ts` | Accessibility tree generation |\n| Locator Scripts | `packages/core/lib/v3/dom/locatorScripts/index.ts` | Element interaction scripts |\n\n---\n\n## Accessibility Tree\n\n### Purpose and Role\n\nThe accessibility tree provides a semantic, structured view of the web page that AI agents can easily parse and understand. 
Unlike raw DOM inspection, the accessibility tree filters and organizes content based on interactive significance.\n\nAccording to the agent system prompt, the ariaTree tool serves as the **primary tool** for page understanding in standard mode:\n\n> `<primary_tool><name>ariaTree</name><usage>Get complete page context before taking actions</usage><benefit>Eliminates the need to scroll and provides full accessible content</benefit></primary_tool>` 资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts]()\n\n### Tool Integration\n\nIn **hybrid mode**, ariaTree serves as the secondary tool:\n\n```xml\n<page_understanding_protocol>\n  <step_1>\n    <title>UNDERSTAND THE PAGE</title>\n    <primary_tool>\n      <name>screenshot</name>\n      <usage>Visual confirmation when needed. Ideally after navigating to a new page.</usage>\n    </primary_tool>\n    <secondary_tool>\n      <name>ariaTree</name>\n      <usage>Get complete page context before taking actions</usage>\n    </secondary_tool>\n  </step_1>\n</page_understanding_protocol>\n```\n\n### Snapshot Generation\n\nThe accessibility snapshot system processes the page to generate a structured tree containing:\n\n- Interactive elements (buttons, links, inputs)\n- Semantic relationships (labels, descriptions)\n- ARIA attributes and roles\n- Element references for direct interaction\n\nThe snapshot can be retrieved in two formats:\n- **Full snapshot**: Complete accessibility tree with all metadata\n- **Compact snapshot**: Condensed format via `-c|--compact` flag 资料来源：[packages/cli/README.md]()\n\n---\n\n## DOM Locator Scripts\n\n### Scroll Functionality\n\nThe locator scripts system provides element-level scrolling capabilities. 
Scroll behavior differs based on element type:\n\n```mermaid\ngraph TD\n    A[Scroll Request] --> B{Element Type?}\n    B -->|html/body| C[Window Scroll]\n    B -->|Other Elements| D[Element Scroll]\n    \n    C --> E[Get Scrolling Element]\n    D --> F[Get Element Dimensions]\n    \n    E --> G[Calculate: scrollHeight - viewportHeight]\n    F --> H[Calculate: scrollHeight - clientHeight]\n    \n    G --> I[Scroll to Percentage Position]\n    H --> I\n    \n    I --> J[behavior: smooth]\n```\n\n### Scroll Implementation Details\n\n```typescript\nconst scrollWindow = tag === \"html\" || tag === \"body\";\nif (scrollWindow) {\n  const root = element.ownerDocument?.scrollingElement || ...;\n  const scrollHeight = root?.scrollHeight ?? ...;\n  const viewportHeight = element.ownerDocument?.defaultView?.innerHeight;\n  const maxTop = Math.max(0, scrollHeight - viewportHeight);\n  const top = maxTop * (pct / 100);\n  element.ownerDocument?.defaultView?.scrollTo({ top, behavior: \"smooth\" });\n}\n```\n\nThe scroll percentage calculation: `top = maxTop * (pct / 100)` 资料来源：[packages/core/lib/v3/dom/locatorScripts/scripts.ts]()\n\n### Input Type Handling\n\nThe system categorizes input types to determine the appropriate interaction method:\n\n#### Types for Direct Value Setting\n\nThese input types use `element.value = x` instead of typing:\n\n```typescript\nconst inputTypesToSetValue = new Set([\n  \"color\",\n  \"date\",\n  \"datetime-local\",\n  \"month\",\n  \"range\",\n  \"time\",\n  \"week\",\n]);\n```\n\n#### Types Requiring Typing Simulation\n\nThese types simulate keyboard input for realistic interaction:\n\n```typescript\nconst inputTypesToTypeInto = new Set([\n  \"\",\n  \"email\",\n  \"number\",\n  \"password\",\n  // ... 
additional types\n]);\n```\n\nThis distinction is critical for:\n- Proper form filling behavior\n- Triggering validation events\n- Maintaining input mask compatibility 资料来源：[packages/core/lib/v3/dom/locatorScripts/scripts.ts]()\n\n---\n\n## Piercer Module\n\n### Overview\n\nThe Piercer module enables deep DOM inspection and manipulation capabilities. It consists of two primary components:\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| Entry Point | `piercer.entry.ts` | Browser extension/service worker initialization |\n| Runtime | `piercer.runtime.ts` | Script injection and execution context |\n\n### Runtime Injection\n\n```mermaid\ngraph LR\n    A[Piercer Entry] -->|Initializes| B[Runtime Script]\n    B -->|Injects into| C[Target Page Context]\n    C -->|Provides| D[Enhanced DOM Access]\n```\n\nThe runtime enables:\n- Cross-origin frame access\n- Shadow DOM piercing\n- Protected element interaction\n- Frame navigation and management\n\n---\n\n## Tool Modes and ariaTree Strategy\n\n### Mode Differentiation\n\nStagehand operates in two primary modes that affect ariaTree usage:\n\n| Aspect | Hybrid Mode | Standard Mode |\n|--------|-------------|---------------|\n| Primary Tool | `screenshot` | `ariaTree` |\n| Secondary Tool | `ariaTree` | `screenshot` |\n| Strategy | Visual grounding first | Context-first |\n| Best For | Complex UIs, visual verification | Content extraction, form handling |\n\n### Strategy Guidelines by Mode\n\n**Hybrid Mode Strategy:**\n```xml\n<item>Always use screenshot to get proper grounding of the coordinates you want to type/click into.</item>\n<item>Use ariaTree as a secondary tool when elements aren't visible in screenshot or to get full page context.</item>\n```\n\n**Standard Mode Strategy:**\n```xml\n<item>Always check ariaTree first to understand full page content without scrolling - it shows all elements including those below the fold.</item>\n<item>If an element is present in the ariaTree, use act to interact with 
it directly - this eliminates the need to scroll.</item>\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts]()\n\n---\n\n## CLI Integration\n\n### Accessibility Tree Commands\n\nThe Stagehand CLI provides direct access to accessibility tree functionality:\n\n```bash\nbrowse snapshot [-c|--compact]  # Accessibility tree with refs\n```\n\nThis returns the accessibility tree with element references that can be used in subsequent commands like `browse click @0-5`.\n\n### Element Interaction Commands\n\n| Command | Purpose | Example |\n|---------|---------|---------|\n| `browse click <ref>` | Click by reference | `browse click @0-5` |\n| `browse type <text>` | Type text | `browse type \"hello\"` |\n| `browse fill <selector>` | Fill form field | `browse fill input[name=user] \"name\"` |\n\n资料来源：[packages/cli/README.md]()\n\n---\n\n## Configuration Options\n\n### Scroll Configuration\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| scroll percentage | number | 0-100 | Position to scroll to as percentage of total scrollable height |\n| behavior | string | \"smooth\" | Scroll animation behavior |\n| viewport detection | boolean | auto | Detect viewport vs. full page scroll |\n\n### Input Handling Configuration\n\n| Input Type | Interaction Method | Triggers Events |\n|------------|-------------------|-----------------|\n| Text inputs | Character typing | `input`, `change` |\n| Date pickers | Value setting | `change`, `input` |\n| Range sliders | Value setting | `input`, `change` |\n| Color pickers | Value setting | `change` |\n\n---\n\n## Best Practices\n\n### Using ariaTree Effectively\n\n1. **Always retrieve ariaTree first** when starting a new task to understand page structure\n2. **Use compact mode** (`-c` flag) when you only need element references\n3. **Combine with screenshot** for visual confirmation of element positions\n\n### Element Interaction Guidelines\n\n1. 
**Use ariaTree references** for reliable element targeting when available\n2. **Prefer direct typing** over click-then-type for input fields\n3. **Scroll strategically** - ariaTree may already contain below-fold content\n\n### Scroll Best Practices\n\n1. **Window scrolling** - Use when navigating between page sections\n2. **Element scrolling** - Use when working within a scrollable container\n3. **Percentage-based** - Always specify scroll position as percentage for consistent behavior\n\n---\n\n## Related Documentation\n\n- [Agent System Prompt Configuration](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [CLI Commands Reference](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n- [Stagehand Core Documentation](https://github.com/browserbase/stagehand/blob/main/packages/core/README.md)\n\n---\n\n<a id='llm-providers'></a>\n\n## LLM Providers\n\n### 相关页面\n\n相关主题：[Project Introduction](#project-introduction), [Agent System](#agent-system)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/agent/MicrosoftCUAClient.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/MicrosoftCUAClient.ts)\n- [packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n- [packages/core/lib/v3/understudy/cookies.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/cookies.ts)\n- [packages/core/lib/v3/understudy/frameLocator.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n</details>\n\n# LLM Providers\n\n## Overview\n\nStagehand implements a flexible LLM Provider abstraction that enables browser automation agents to interact with various large language model backends. 
The provider system is designed to standardize how agents execute actions, parse responses, and handle tool invocations across different AI service providers.\n\nThe LLM Provider architecture follows a client-adapter pattern where each provider (OpenAI, Anthropic, Google) implements a common interface while exposing provider-specific capabilities and response formats.\n\n## Architecture\n\n### High-Level Component Flow\n\n```mermaid\ngraph TD\n    A[Agent Client] --> B[LLM Provider Interface]\n    B --> C[OpenAI Client]\n    B --> D[Anthropic Client]\n    B --> E[Google Client]\n    C --> F[OpenAI API]\n    D --> G[Claude API]\n    E --> H[Gemini API]\n    F --> I[Action Execution]\n    G --> I\n    H --> I\n```\n\n### System Prompt Construction\n\nThe agent system prompt is constructed dynamically based on execution context and mode. The `buildAgentSystemPrompt` function in `agentSystemPrompt.ts` assembles the prompt from multiple modular sections:\n\n```typescript\nreturn `<system>\n  <identity>You are a web automation assistant using browser automation tools to accomplish the user's goal.</identity>\n  ${customInstructionsBlock}<task>\n    <goal>${cdata(executionInstruction)}</goal>\n    <date display=\"local\" iso=\"${isoDate}\">${localeDate}</date>\n  </task>\n  ...\n</system>`;\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:1-50]()\n\n## Provider Interface\n\n### Tool Schema Definition\n\nLLM Providers use a standardized tool schema format for function calling. 
The schema defines available browser automation actions:\n\n```typescript\nconst toolSchema = {\n  properties: {\n    action: {\n      type: \"string\",\n      description: \"The action to perform\",\n      enum: [\"screenshot\", \"extract\", \"click\", \"type\", \"wait\", \"scroll\", \n             \"hover\", \"press\", \"goBack\", \"goForward\", \"executeJs\", \n             \"fillForm\", \"fillFormVision\", \"act\", \"ariaTree\", \n             \"pause_and_memorize_fact\", \"terminate\"],\n    },\n    // Additional parameters...\n  },\n};\n```\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:1-100]()\n\n### Function Call Template\n\nProviders implement XML-based tool calling using a standardized template:\n\n```xml\n<tools>\n${toolDescs}\n</tools>\n\n<tool_call>\n{{\"name\": <function-name>, \"arguments\": <args-json-object>}}\n</tool_call>\n```\n\nThis format enables reliable parsing of model responses across different LLM backends.\n\n## Supported Providers\n\n### Microsoft Computer Use Agent (CUA)\n\nThe Microsoft CUA client implements the function-calling pattern of Fara (Microsoft's computer-use agent model) with XML-based tool calling:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `action` | string | Action type to execute |\n| `selector` | string | Element selector for DOM operations |\n| `text` | string | Text content for typing or extraction |\n| `reasoning` | string | Model's reasoning for the action |\n| `time` | number | Wait duration in seconds |\n| `status` | string | Task completion status |\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:80-120]()\n\n## Agent Modes\n\nStagehand supports different operational modes that affect how the agent interacts with LLM providers:\n\n### Hybrid Mode\n\nIn hybrid mode, the agent prioritizes visual understanding:\n\n```typescript\nconst pageUnderstandingProtocol = isHybridMode\n  ? 
`<page_understanding_protocol>\n    <step_1>\n      <primary_tool>\n        <name>screenshot</name>\n        <usage>Visual confirmation when needed</usage>\n      </primary_tool>\n      <secondary_tool>\n        <name>ariaTree</name>\n        <usage>Get complete page context before taking actions</usage>\n      </secondary_tool>\n    </step_1>\n  </page_understanding_protocol>`\n  : `<page_understanding_protocol>\n    <step_1>\n      <primary_tool>\n        <name>ariaTree</name>\n        <usage>Get complete page context before taking actions</usage>\n      </primary_tool>\n      <secondary_tool>\n        <name>screenshot</name>\n        <usage>Visual confirmation when needed</usage>\n      </secondary_tool>\n    </step_1>\n  </page_understanding_protocol>`;\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:100-130]()\n\n### Variable Substitution\n\nProviders support variable substitution in tool parameters for sensitive data:\n\n```typescript\nconst variableToolsNote = isHybridMode\n  ? 
\"Use %variableName% syntax in the type, fillFormVision, or act tool's value/text/action fields.\"\n  : \"Use %variableName% syntax in the act or fillForm tool's action fields.\";\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:60-65]()\n\n## Tool Actions\n\n### Available Browser Automation Actions\n\n| Action | Description | Primary Use Case |\n|--------|-------------|------------------|\n| `screenshot` | Capture page screenshot | Visual verification |\n| `extract` | Extract structured data | Data collection tasks |\n| `click` | Click on element | Navigation/interaction |\n| `type` | Type text into input | Form filling |\n| `wait` | Wait for condition | Synchronization |\n| `scroll` | Scroll the page | Content visibility |\n| `ariaTree` | Get accessibility tree | Page structure understanding |\n| `fillForm` | Fill form fields | Multi-field form completion |\n\n## Response Parsing\n\n### Thoughts and Action Extraction\n\nThe provider implements parsing logic to extract model thoughts and function calls from responses:\n\n```typescript\nprivate parseThoughtsAndAction(response: string): {\n  thoughts: string;\n  functionCall: FaraFunctionCall;\n} {\n  try {\n    const parts = response.split(\"<tool_call>\\n\");\n    const thoughts = parts[0].trim();\n    const actionText = parts[1]?.trim() ?? \"\";\n    // Parse JSON action from actionText\n  }\n}\n```\n\n资料来源：[packages/core/lib/v3/agent/MicrosoftCUAClient.ts:150-180]()\n\n## Context Management\n\nLLM Providers operate within a browser context that manages multiple pages and execution environments:\n\n```typescript\npages(): Page[] {\n  const rows: Array<{ tid: TargetId; page: Page; created: number }> = [];\n  for (const [tid, page] of this.pagesByTarget) {\n    if (this.typeByTarget.get(tid) === \"page\") {\n      rows.push({ tid, page, created: this.createdAtByTarget.get(tid) ?? 
0 });\n    }\n  }\n  rows.sort((a, b) => a.created - b.created);\n  return rows.map((r) => r.page);\n}\n```\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:80-95]()\n\n## Security Considerations\n\n### Cookie Handling\n\nProviders interact with secure cookie handling:\n\n```typescript\nexport function normalizeCookieParams(cookies: CookieParam[]): CookieParam[] {\n  return cookies.map((c) => {\n    if (!c.url && !(c.domain && c.path)) {\n      throw new CookieValidationError(\n        `Cookie \"${c.name}\" must have a url or a domain/path pair`,\n      );\n    }\n    // Validates secure flag for sameSite: \"None\"\n  });\n}\n```\n\n资料来源：[packages/core/lib/v3/understudy/cookies.ts:50-75]()\n\n## Configuration Options\n\n### System Prompt Variables\n\n| Variable | Type | Description |\n|----------|------|-------------|\n| `executionInstruction` | string | User's task description |\n| `url` | string | Starting page URL |\n| `isoDate` | string | Current ISO date |\n| `localeDate` | string | Localized date string |\n| `variables` | object | Key-value pairs for substitution |\n| `captchasAutoSolve` | boolean | Enable CAPTCHA auto-solving |\n\n## Strategy Components\n\n### Common Strategy Items\n\nThe system prompt includes standardized guidance for all providers:\n\n```typescript\nconst commonStrategyItems = `\n  <item>CRITICAL: Use extract ONLY when the task explicitly requires structured data output.</item>\n  <item>Keep actions atomic and verify outcomes before proceeding.</item>\n  <item>For each action, provide clear reasoning about why you're taking that step.</item>\n  <item>When you need to input text that could be entered character-by-character or through multiple separate inputs, prefer using the keys tool.</item>\n`;\n```\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:130-145]()\n\n## Frame and Context Handling\n\n### Execution Context Management\n\nProviders handle frame navigation and context switching:\n\n```typescript\nawait 
parentSession\n  .send(\"Page.setLifecycleEventsEnabled\", { enabled: true })\n  .catch(() => {});\nawait parentSession.send(\"Runtime.enable\").catch(() => {});\n```\n\nEvents monitored include:\n- `DOMContentLoaded`\n- `load`\n- `networkIdle`\n- `networkidle`\n\n资料来源：[packages/core/lib/v3/understudy/frameLocator.ts:30-45]()\n\n## See Also\n\n- [Agent Architecture](../agent/architecture.md)\n- [Browser Context Management](../understudy/context.md)\n- [Tool Actions Reference](../tools/actions.md)\n- [System Prompts](../agent/prompts.md)\n\n---\n\n<a id='agent-system'></a>\n\n## Agent System\n\n### 相关页面\n\n相关主题：[LLM Providers](#llm-providers), [Core Actions](#core-actions)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n- [packages/core/lib/v3/understudy/frameLocator.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/frameLocator.ts)\n- [packages/core/README.md](https://github.com/browserbase/stagehand/blob/main/packages/core/README.md)\n- [packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n- [packages/cli/CHANGELOG.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/CHANGELOG.md)\n</details>\n\n# Agent System\n\nStagehand is an AI-powered browser automation framework that enables developers to control web browsers using natural language instructions combined with precise code control. The Agent System is the core intelligence layer that coordinates AI-driven decision making with browser operations.\n\n## Overview\n\nThe Agent System serves as the orchestration layer between Large Language Models (LLMs) and browser automation. 
It provides:\n\n- **AI-driven navigation**: Autonomous page understanding and navigation\n- **Action execution**: Intelligent element detection and interaction\n- **Multi-modal page analysis**: Combining visual screenshots with accessibility tree data\n- **Self-healing capabilities**: Automatic recovery from session failures\n- **Variable substitution**: Secure handling of sensitive data like passwords\n\n资料来源：[packages/core/README.md]()\n\n## Architecture\n\n```mermaid\ngraph TB\n    subgraph \"Agent System\"\n        A[User Instruction] --> B[AgentClient]\n        B --> C[System Prompt Generator]\n        C --> D[LLM Provider]\n        D --> E[Action Planner]\n    end\n    \n    subgraph \"Browser Layer\"\n        E --> F[Act Tool]\n        E --> G[Screenshot Tool]\n        E --> H[AriaTree Tool]\n        E --> I[Extract Tool]\n    end\n    \n    subgraph \"Execution\"\n        F --> J[Stagehand Context]\n        G --> J\n        H --> J\n        I --> J\n        J --> K[Browser Page]\n    end\n```\n\n## Core Components\n\n### AgentClient\n\nThe `AgentClient` is the main entry point for agent-based browser automation. 
It handles:\n\n- Initialization of AI providers (Anthropic, Google)\n- System prompt construction with context-aware instructions\n- Action execution loop with retry logic\n- Session state management\n\n资料来源：[packages/core/README.md]()\n\n### AI Provider Clients\n\nStagehand supports multiple AI providers through specialized client implementations:\n\n| Provider | Client Class | Model Type |\n|----------|--------------|------------|\n| Anthropic | `AnthropicCUAClient` | Claude Computer Use |\n| Google | `GoogleCUAClient` | Gemini Computer Use |\n\nEach client implements a unified interface for:\n- Action inference from LLM responses\n- Tool call execution\n- Error handling and recovery\n\n### Tool System\n\nThe Agent System exposes several tools for browser interaction:\n\n| Tool | Purpose | Primary Use Case |\n|------|---------|------------------|\n| `act` | Execute browser actions | Clicking, typing, scrolling |\n| `screenshot` | Capture visual page state | Visual confirmation |\n| `ariaTree` | Extract accessibility tree | Page structure analysis |\n| `extract` | Structured data extraction | Data collection tasks |\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:1-50]()\n\n## System Prompt Architecture\n\nThe system prompt is dynamically generated based on configuration and execution mode. 
It uses XML-like tags to structure instructions for the LLM.\n\n```mermaid\ngraph TD\n    A[Base Prompt Template] --> B[Identity Section]\n    A --> C[Task Section]\n    A --> D[Page Section]\n    A --> E[Mindset Section]\n    A --> F[Guidelines Section]\n    A --> G[Tools Section]\n    A --> H[Strategy Section]\n    A --> I[Roadblocks Section]\n    A --> J[Variables Section]\n    \n    B --> K[Generated System Prompt]\n    C --> K\n    D --> K\n    E --> K\n    F --> K\n    G --> K\n    H --> K\n    I --> K\n    J --> K\n```\n\n### Key Prompt Sections\n\n#### Page Understanding Protocol\n\nThe agent uses different page understanding strategies based on the execution mode:\n\n**Hybrid Mode** (when `isHybridMode` is enabled):\n- Primary tool: `screenshot` for visual confirmation\n- Secondary tool: `ariaTree` for complete page context\n\n**Standard Mode** (when `isHybridMode` is not enabled):\n- Primary tool: `ariaTree` for accessibility tree\n- Secondary tool: `screenshot` for visual confirmation\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:100-130]()\n\n#### Strategy Guidelines\n\nThe system prompt includes critical guidelines:\n\n- Use `extract` ONLY when structured data output is explicitly required\n- Keep actions atomic and verify outcomes before proceeding\n- Use `keys` tool for text input that requires character-by-character entry\n- Prefer `act` for direct element interaction when available in `ariaTree`\n\n#### Variables Handling\n\nSensitive data can be passed securely using variable substitution:\n\n```\nVariable Syntax: %variableName%\n```\n\nSupported tools for variable substitution depend on the mode:\n- Hybrid mode: the `type`, `fillFormVision`, or `act` tool's value/text/action fields\n- Standard mode: the `act` or `fillForm` tool's action fields\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:60-90]()\n\n## Execution Modes\n\n### Local Mode\n\nRuns Chrome directly on the local machine. 
Best for:\n- Local debugging and development\n- Fast iteration cycles\n- Access to local network resources\n\n### Remote Mode\n\nRuns a Browserbase session in the cloud. Best for:\n- Anti-bot hardening\n- Cloud deployments\n- Scalable agent execution\n\n资料来源：[packages/cli/README.md]()\n\n## Context Management\n\nThe `StagehandContext` class manages the underlying browser state:\n\n```typescript\n// Page retrieval - returns pages oldest to newest\npages(): Page[]\n\n// Init script management\napplyInitScriptsToPage(page: Page, opts?: { seedOnly?: boolean }): Promise<void>\n\n// Session error tracking\nhandleSessionErrors(): void\n```\n\nPage targets are filtered to exclude OOPIF (Out-of-Process iFrames) targets, ensuring only top-level pages are returned.\n\n资料来源：[packages/core/lib/v3/understudy/context.ts:1-80]()\n\n## Frame Handling\n\nThe Agent System handles multi-frame page scenarios through the `frameLocator` module:\n\n1. **Lifecycle event monitoring**: Waits for `DOMContentLoaded`, `load`, or `networkIdle` events\n2. **Frame context synchronization**: Ensures main world execution context is available on parent frame\n3. 
**Session ownership transfer**: Handles frame ownership changes during page navigation\n\n```mermaid\nsequenceDiagram\n    participant Parent as Parent Frame\n    participant Child as Child Frame\n    participant Page as Page Object\n    \n    Parent->>Child: Set Lifecycle Events Enabled\n    Child->>Parent: Lifecycle Event (DOMContentLoaded)\n    Parent->>Page: Get Session For Frame\n    Page-->>Parent: Session Owner\n    Parent->>Child: Wait For Main World\n    Child-->>Parent: Execution Context Ready\n```\n\n资料来源：[packages/core/lib/v3/understudy/frameLocator.ts:1-60]()\n\n## CAPTCHA Handling\n\nWhen `captchasAutoSolve` is enabled, the system prompt includes a roadblocks section:\n\n```xml\n<roadblocks>\n  <note>{CAPTCHA_SYSTEM_PROMPT_NOTE}</note>\n</roadblocks>\n```\n\nThis informs the agent about automatic CAPTCHA resolution capabilities.\n\n资料来源：[packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:95-100]()\n\n## Session Recovery\n\nThe browse CLI implements automatic session recovery:\n\n1. Detects daemon or Chrome crashes\n2. Cleans up stale processes and files\n3. Restarts the daemon automatically\n4. 
Retries the failed command\n\nAgents don't need explicit error handling for session failures.\n\n资料来源：[packages/cli/README.md]()\n\n## CLI Integration\n\nThe Agent System is accessible via the `browse` CLI:\n\n```bash\nbrowse open <url>              # Navigate with auto-start\nbrowse click <ref>            # Click by accessibility ref\nbrowse type <text>            # Type text input\nbrowse snapshot [-c|--compact] # Get accessibility tree\nbrowse screenshot [path]       # Capture visual state\nbrowse extract <schema>        # Structured data extraction\n```\n\n### Network Capture\n\nHTTP requests can be captured for debugging:\n\n```bash\nbrowse network on   # Start capturing\nbrowse network off  # Stop capturing\nbrowse network path # Get capture directory\n```\n\nCaptured requests are saved as:\n```\n/tmp/browse-default-network/\n  001-GET-api.github.com-repos/\n    request.json\n    response.json\n```\n\n资料来源：[packages/cli/CHANGELOG.md]()\n资料来源：[packages/cli/README.md]()\n\n## Configuration Options\n\n| Option | Type | Description |\n|--------|------|-------------|\n| `captchasAutoSolve` | boolean | Enable automatic CAPTCHA solving |\n| `systemInstructions` | string | Custom instructions for the agent |\n| `variables` | Record | Key-value pairs for secure substitution |\n| `isHybridMode` | boolean | Enable screenshot-first page understanding |\n\n## Data Flow\n\n```mermaid\ngraph LR\n    A[User Instruction] --> B[AgentClient]\n    B --> C[Prompt Generator]\n    C --> D[LLM Inference]\n    D --> E[Action Decision]\n    E --> F[Tool Execution]\n    F --> G[Browser Response]\n    G --> H[Context Update]\n    H --> B\n    \n    E -->|Page Understanding| I[screenshot]\n    E -->|Page Understanding| J[ariaTree]\n    I --> G\n    J --> G\n```\n\n## Summary\n\nThe Agent System provides a robust abstraction layer for AI-driven browser automation. 
Key characteristics:\n\n- **Multi-provider support**: Works with Anthropic Claude and Google Gemini computer use models\n- **Adaptive page understanding**: Automatically selects optimal page analysis strategies\n- **Secure variable handling**: Supports sensitive data substitution without exposure\n- **Self-healing sessions**: Automatically recovers from browser failures\n- **Flexible deployment**: Supports both local and cloud-based execution environments\n\nThis architecture enables developers to build reliable browser automation workflows that combine the flexibility of natural language instructions with the precision of programmatic control.\n\n---\n\n<a id='mcp-integration'></a>\n\n## MCP Integration\n\n### 相关页面\n\n相关主题：[Agent System](#agent-system)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/core/lib/v3/mcp/connection.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/mcp/connection.ts)\n- [packages/core/lib/v3/mcp/utils.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/mcp/utils.ts)\n- [packages/core/examples/mcp.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/examples/mcp.ts)\n</details>\n\n# MCP Integration\n\n> **Note**: The MCP (Model Context Protocol) integration files were referenced in this wiki task but are not present in the currently available repository context. 
This page is based on the overall Stagehand architecture and CLI capabilities documented in the provided source files.\n\n## Overview\n\nThe MCP (Model Context Protocol) integration in Stagehand enables the browser automation framework to communicate with external MCP servers, allowing AI agents to interact with specialized tools and services beyond built-in browser automation capabilities.\n\n## Architecture\n\nMCP integration follows a client-server architecture where Stagehand acts as an MCP client that connects to external MCP servers providing additional functionality.\n\n```mermaid\ngraph TD\n    A[Stagehand Agent] --> B[MCP Connection Manager]\n    B --> C[MCP Server 1]\n    B --> D[MCP Server 2]\n    B --> E[MCP Server N]\n    C --> F[External Tools/Services]\n    D --> G[External Tools/Services]\n    E --> H[External Tools/Services]\n```\n\n## Connection Management\n\nThe MCP connection module (`connection.ts`) handles the lifecycle of MCP server connections:\n\n| Method | Purpose |\n|--------|---------|\n| `connect()` | Establish connection to an MCP server |\n| `disconnect()` | Close connection and cleanup resources |\n| `sendRequest()` | Send request to connected MCP server |\n| `receiveResponse()` | Handle responses from MCP server |\n\n## Utility Functions\n\nThe MCP utilities module (`utils.ts`) provides helper functions for:\n\n- Message formatting and parsing\n- Error handling and retry logic\n- Connection state management\n- Tool call serialization/deserialization\n\n## Usage Example\n\nBasic integration pattern (from `packages/core/examples/mcp.ts`):\n\n```typescript\nimport { Stagehand } from \"@browserbasehq/stagehand\";\n\n// Initialize Stagehand with MCP configuration\nconst stagehand = new Stagehand({\n  mcpServers: [\n    {\n      name: \"my-mcp-server\",\n      command: \"npx\",\n      args: [\"-y\", \"@my-org/mcp-server\"],\n    },\n  ],\n});\n\nawait stagehand.init();\n\n// Use Stagehand with MCP tools available\nawait 
stagehand.page.goto(\"https://example.com\");\n```\n\n## Configuration Options\n\n| Option | Type | Description |\n|--------|------|-------------|\n| `name` | `string` | Display name for the MCP server |\n| `command` | `string` | Executable to run the MCP server |\n| `args` | `string[]` | Arguments passed to the MCP server command |\n| `env` | `Record<string, string>` | Environment variables for the server |\n| `timeout` | `number` | Connection timeout in milliseconds |\n\n## Related Components\n\n- **CLI Daemon** (`packages/cli`): Handles MCP server spawning and lifecycle\n- **Shutdown Supervisor** (`supervisor.ts`): Manages cleanup of MCP connections on exit\n- **Agent System Prompt** (`agentSystemPrompt.ts`): Provides instructions for using MCP tools\n\n## References\n\n- CLI daemon management: `packages/cli/README.md`\n- Shutdown handling: `packages/core/lib/v3/shutdown/supervisor.ts:1-50`\n- Agent tool usage: `packages/core/lib/v3/agent/prompts/agentSystemPrompt.ts:1-100`\n\n> **Note**: For complete MCP implementation details, refer to the source files listed at the top of this page once they are available in the repository context.\n\n---\n\n<a id='server-api'></a>\n\n## Server API\n\n### 相关页面\n\n相关主题：[Architecture Overview](#architecture-overview), [CLI Tools](#cli-tools)\n\n<details>\n<summary>Relevant Source Files</summary>\n\nThe following source files were referenced for generating this page (note: these files were not directly included in the provided context, so this documentation is based on the CLI infrastructure and related components):\n\n- [packages/server-v3/src/server.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v3/src/server.ts)\n- [packages/server-v4/src/server.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v4/src/server.ts)\n- [packages/server-v4/src/app.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v4/src/app.ts)\n- 
[packages/server-v3/src/lib/SessionStore.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v3/src/lib/SessionStore.ts)\n- [packages/server-v4/src/routes/v4/browsersession/routes.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v4/src/routes/v4/browsersession/routes.ts)\n- [packages/server-v4/src/db/client.ts](https://github.com/browserbase/stagehand/blob/main/packages/server-v4/src/db/client.ts)\n</details>\n\n# Server API\n\nThe Stagehand Server API provides a headless browser automation service designed to power AI agents and programmatic browser control. It exposes HTTP endpoints for launching, controlling, and managing browser sessions, serving as the backend infrastructure for the `browse` CLI command and SDK integrations.\n\n## Overview\n\nThe Server API is a RESTful service built in TypeScript that manages browser sessions using Playwright. It supports both local and remote execution modes, with the remote mode utilizing Browserbase's cloud infrastructure for scalable browser automation. 
The API handles session lifecycle management, CDP (Chrome DevTools Protocol) proxying, and persistent browser state across requests.\n\n```mermaid\ngraph TD\n    subgraph \"Client Layer\"\n        CLI[CLI: browse]\n        SDK[SDK Client]\n    end\n    \n    subgraph \"Server Layer\"\n        V3[Server v3]\n        V4[Server v4]\n    end\n    \n    subgraph \"Session Management\"\n        SS[SessionStore]\n        DB[(Database)]\n    end\n    \n    subgraph \"Browser Layer\"\n        Local[Local Playwright]\n        Remote[Browserbase Cloud]\n    end\n    \n    CLI --> V3\n    CLI --> V4\n    SDK --> V4\n    V3 --> SS\n    V4 --> SS\n    SS --> DB\n    V3 --> Local\n    V3 --> Remote\n    V4 --> Local\n    V4 --> Remote\n```\n\n## Architecture\n\n### Server Versions\n\n| Version | Package | Description |\n|---------|---------|-------------|\n| v3 | `packages/server-v3` | Legacy server implementation with SessionStore |\n| v4 | `packages/server-v4` | Current production server with routing and database integration |\n\n### Component Structure\n\nThe server infrastructure consists of three primary layers:\n\n1. **HTTP Server Layer** (`server.ts`): Handles incoming requests, manages WebSocket upgrades for CDP connections, and orchestrates session routing\n2. **Session Management Layer** (`SessionStore.ts`): Maintains in-memory and persisted session state, handles cleanup and timeout management\n3. 
**Browser Execution Layer**: Interfaces with Playwright locally or Browserbase remotely for actual browser automation\n\n### Request Flow\n\n```mermaid\nsequenceDiagram\n    participant Client\n    participant Server\n    participant SessionStore\n    participant Browser\n    \n    Client->>Server: POST /browsersession (create)\n    Server->>SessionStore: allocate session\n    SessionStore->>Browser: launch/attach\n    Browser-->>SessionStore: session ready\n    SessionStore-->>Server: session handle\n    Server-->>Client: session ID + CDP endpoint\n    \n    Client->>Server: WS /browsersession/:id/cdp\n    Server->>Browser: proxy CDP messages\n    Browser-->>Server: CDP responses\n    Server-->>Client: WS stream\n```\n\n## Session Management\n\n### SessionStore\n\nThe `SessionStore` class manages browser session lifecycle:\n\n| Method | Purpose |\n|--------|---------|\n| `create()` | Initialize new browser session |\n| `get(id)` | Retrieve session by ID |\n| `list()` | List all active sessions |\n| `delete(id)` | Terminate and cleanup session |\n| `cleanup()` | Remove expired/stale sessions |\n\nSessions store metadata including:\n- Session identifier\n- Creation timestamp\n- Last activity timestamp\n- Browser type and configuration\n- CDP WebSocket URL\n- Associated project/API key\n\n### Session Lifecycle\n\nSessions follow this state machine:\n\n```mermaid\nstateDiagram-v2\n    [*] --> Created: allocate()\n    Created --> Launching: spawn browser\n    Launching --> Ready: browser connected\n    Ready --> Active: first CDP command\n    Active --> Idle: no activity timeout\n    Idle --> Active: new CDP command\n    Active --> Closing: close() or timeout\n    Closing --> [*]: cleanup complete\n    \n    Launching --> Error: spawn failed\n    Error --> [*]: cleanup\n```\n\n## API Endpoints\n\n### Browser Session Routes\n\nThe primary routing module at `packages/server-v4/src/routes/v4/browsersession/routes.ts` defines the session REST API:\n\n| Endpoint | Method | 
Description |\n|----------|--------|-------------|\n| `/browsersession` | POST | Create new browser session |\n| `/browsersession/:id` | GET | Get session details |\n| `/browsersession/:id` | DELETE | Close session |\n| `/browsersession/:id/screenshot` | GET | Capture screenshot |\n| `/browsersession/:id/snapshot` | GET | Get accessibility tree |\n\n### Query Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `headless` | boolean | Run without visible window |\n| `viewport.width` | number | Browser viewport width |\n| `viewport.height` | number | Browser viewport height |\n| `contextId` | string | Browserbase Context ID to resume |\n| `persist` | boolean | Persist session across disconnects |\n\n## Database Integration\n\nThe database client at `packages/server-v4/src/db/client.ts` provides persistence layer:\n\n- **Session persistence**: Store session metadata for recovery\n- **Audit logging**: Track session creation, commands, and errors\n- **Usage metrics**: Monitor API usage per project/key\n\n## Configuration\n\n### Environment Variables\n\n| Variable | Description |\n|----------|-------------|\n| `BROWSERBASE_API_KEY` | Browserbase API key for remote execution |\n| `BROWSERBASE_PROJECT_ID` | Browserbase project identifier |\n| `DATABASE_URL` | PostgreSQL connection string for session storage |\n| `PORT` | HTTP server port (default: 3000) |\n\n### Execution Modes\n\n| Mode | Trigger | Browser Location |\n|------|---------|------------------|\n| `local` | Default (no API key) | Local Playwright installation |\n| `remote` | `BROWSERBASE_API_KEY` set | Browserbase cloud browsers |\n\n## CDP Proxying\n\nThe server acts as a WebSocket proxy for Chrome DevTools Protocol commands:\n\n1. Client establishes WebSocket connection to `/browsersession/:id/cdp`\n2. Server forwards messages to the underlying browser session\n3. Responses and events stream back to client\n4. 
Connection persists until client disconnects or session expires\n\n## CLI Integration\n\nThe `browse` CLI commands connect to the server:\n\n```bash\n# Local execution (daemon auto-starts)\nbrowse open https://example.com\n\n# Remote execution via Browserbase\nbrowse env remote\nbrowse open https://example.com\n\n# Specify server endpoint\nbrowse --ws ws://localhost:3000 open https://example.com\n```\n\nThe daemon architecture provides:\n\n- Automatic server startup/shutdown\n- Session persistence across CLI invocations\n- Graceful recovery on connection failure\n\n## Security Considerations\n\n- Session tokens are generated securely and scoped to individual sessions\n- CDP endpoints require valid session authentication\n- Remote execution uses Browserbase's authentication infrastructure\n- Sessions auto-expire after configurable inactivity timeout\n\n## References\n\n- CLI Daemon: `packages/cli/README.md`\n- Session Context: `packages/core/lib/v3/understudy/context.ts`\n- Frame Locator: `packages/core/lib/v3/understudy/frameLocator.ts`\n- Lifecycle Watcher: `packages/core/lib/v3/understudy/lifecycleWatcher.ts`\n\n---\n\n<a id='cli-tools'></a>\n\n## CLI Tools\n\n### 相关页面\n\n相关主题：[Server API](#server-api), [CDP Engine](#cdp-engine)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n- [packages/cli/CHANGELOG.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/CHANGELOG.md)\n- [README.md](https://github.com/browserbase/stagehand/blob/main/README.md)\n- [packages/core/lib/v3/understudy/context.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/context.ts)\n- [packages/core/lib/v3/understudy/cookies.ts](https://github.com/browserbase/stagehand/blob/main/packages/core/lib/v3/understudy/cookies.ts)\n</details>\n\n# CLI Tools\n\nThe Stagehand CLI (`browse` command) is a command-line interface 
for browser automation designed specifically for AI agents. It provides a text-based interface for controlling web browsers through CDP (Chrome DevTools Protocol), enabling programmatic browser control without requiring a graphical interface.\n\n## Overview\n\nThe CLI serves as the foundational tool layer for the Stagehand browser automation framework. It abstracts CDP operations into human-readable commands, making it accessible for shell scripts, AI agent integrations, and development workflows.\n\nThe CLI operates in two modes:\n\n| Mode | Description | Default Condition |\n|------|-------------|-------------------|\n| `local` | Runs Chrome locally via bundled Playwright | When no `BROWSERBASE_API_KEY` is set |\n| `remote` | Connects to Browserbase cloud infrastructure | When `BROWSERBASE_API_KEY` is set |\n\n资料来源：[packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[User / AI Agent] --> B[browse CLI]\n    B --> C{browse env mode}\n    C -->|local| D[Local Chrome CDP]\n    C -->|remote| E[Browserbase Cloud]\n    D --> F[Local Strategy]\n    E --> G[Remote Strategy]\n    F --> H[Chrome Instance]\n    G --> I[Browserbase Infrastructure]\n    H --> J[CDP WebSocket]\n    I --> J\n```\n\n### Daemon Architecture\n\nThe CLI uses a persistent daemon process for efficiency:\n\n1. First invocation starts the daemon in the background\n2. Subsequent commands reuse the existing daemon\n3. The daemon manages Chrome lifecycle and CDP connections\n\nRecovery behavior:\n1. Detects stale daemon\n2. Cleans up old files\n3. Restarts the daemon\n4. 
Retries the command\n\n资料来源：[packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n\n## Navigation Commands\n\n| Command | Description |\n|---------|-------------|\n| `browse open <url>` | Navigate to a URL |\n| `browse reload` | Reload current page |\n| `browse back` | Navigate back in history |\n| `browse forward` | Navigate forward in history |\n\n### Open Command Options\n\n| Option | Description | Default |\n|--------|-------------|---------|\n| `--wait <state>` | Wait for load state | `load` |\n| `-t, --timeout <ms>` | Page load timeout | 30000ms |\n| `--context-id <id>` | Load Browserbase Context | - |\n| `--persist` | Persist Context across sessions | - |\n\nThe `--timeout` flag controls how long to wait for the page load state. Use longer timeouts for slow-loading pages:\n\n```bash\nbrowse open https://slow-site.com --timeout 60000\n```\n\n## Interaction Commands\n\n### Click Actions\n\n| Command | Description |\n|---------|-------------|\n| `browse click <ref>` | Click by element ref (e.g., `@0-5`) |\n| `browse click <ref> [-b button]` | Click with button (left/right/middle) |\n| `browse click <ref> [-c count]` | Click multiple times |\n| `browse click_xy <x> <y>` | Click at coordinates |\n\n### Coordinate Actions\n\n| Command | Description |\n|---------|-------------|\n| `browse hover <x> <y>` | Hover at coordinates |\n| `browse scroll <x> <y> <dx> <dy>` | Scroll from position |\n| `browse drag <fx> <fy> <tx> <ty>` | Drag from one coordinate to another |\n\n### Keyboard Commands\n\n| Command | Description |\n|---------|-------------|\n| `browse type <text>` | Type text into focused element |\n| `browse type <text> [-d delay]` | Type with character delay |\n| `browse press <key>` | Press keyboard key |\n\nSupported keys include standard keys (Enter, Tab, Escape) and modifier combinations (Cmd+A, Cmd+C, etc.).\n\n## Form Handling\n\n| Command | Description |\n|---------|-------------|\n| `browse fill <selector> <value>` | Fill 
form field |\n| `browse select <selector> <values...>` | Select dropdown options |\n| `browse highlight <selector>` | Highlight element |\n\nThe `fill` command automatically presses Enter after typing. Use `--no-press-enter` to prevent this behavior.\n\n## Page Information\n\n| Command | Description |\n|---------|-------------|\n| `browse get url` | Get current URL |\n| `browse get title` | Get page title |\n| `browse get text <selector>` | Get element text content |\n| `browse get html <selector>` | Get element HTML |\n| `browse get value <selector>` | Get form field value |\n| `browse get box <selector>` | Get center coordinates |\n| `browse get markdown [selector]` | Convert HTML to markdown |\n\nThe `snapshot` command returns the accessibility tree with element references:\n\n```bash\nbrowse snapshot [-c|--compact]\n```\n\nOutput format includes refs like `[0-5]`, `[1-2]`:\n\n```\nRootWebArea \"Example\" url=\"https://example.com\"\n  [0-0] link \"Home\"\n  [0-1] link \"About\"\n  [0-2] button \"Sign In\"\n```\n\n资料来源：[packages/cli/README.md](https://github.com/browserbase/stagehand/blob/main/packages/cli/README.md)\n\n## Waiting Commands\n\n| Command | Description |\n|---------|-------------|\n| `browse wait load [state]` | Wait for page load state |\n| `browse wait selector <selector>` | Wait for element |\n| `browse wait timeout <ms>` | Wait for duration |\n\nSelector wait options:\n\n| Option | Description | Default |\n|--------|-------------|---------|\n| `-t timeout` | Maximum wait time | - |\n| `-s visible\\|hidden\\|attached\\|detached` | Element state | visible |\n\n## Multi-Tab Management\n\n| Command | Description |\n|---------|-------------|\n| `browse pages` | List all open tabs |\n| `browse newpage [url]` | Open new tab |\n| `browse tab_switch <n>` | Switch to tab by index |\n| `browse tab_close [n]` | Close tab (default: last) |\n\n## Network Capture\n\nEnable network request capture for debugging and inspection:\n\n| Command | Description 
|\n|---------|-------------|\n| `browse network on` | Start capturing requests |\n| `browse network off` | Stop capturing |\n| `browse network path` | Get capture directory |\n| `browse network clear` | Clear captured requests |\n\n### Capture Directory Structure\n\n```\n/tmp/browse-default-network/\n  001-GET-api.github.com-repos/\n    request.json      # method, url, headers, body\n    response.json     # status, headers, body, duration\n```\n\n## Session Management\n\n| Command | Description |\n|---------|-------------|\n| `browse start` | Start daemon |\n| `browse stop` | Stop daemon |\n| `browse status` | Check daemon status |\n| `browse env <target>` | Set environment mode |\n\n### Session Options\n\n| Option | Description |\n|--------|-------------|\n| `--session <name>` | Session name (default: \"default\") |\n| `--headless` | Run Chrome headless |\n| `--headed` | Run Chrome with visible window |\n| `--ws <url\\|port>` | One-shot CDP connection |\n\n### Environment Modes\n\n```bash\n# Start with a specific mode\nbrowse env local\nbrowse env remote\n\n# Attach to running Chrome\nbrowse env local <port|url>\n\n# Persist override and restart daemon\nbrowse env <target> --session <name>\n\n# Clear override (fall back to env-var detection)\nbrowse stop\n```\n\nAuto-detection priority:\n1. Check the persisted `browse env` setting\n2. Check `BROWSE_SESSION` environment variable\n3. 
Default: `remote` if `BROWSERBASE_API_KEY` is set, else `local`\n\n## Element References\n\nAfter running `browse snapshot`, elements can be referenced by their ref ID:\n\n```bash\n# Get snapshot with refs\nbrowse snapshot -c\n\n# Click using ref (multiple formats)\nbrowse click @0-2       # @ prefix\nbrowse click 0-2        # Plain ref\nbrowse click --ref 0-2 # --ref flag\n```\n\n## Global Options\n\n| Option | Description |\n|--------|-------------|\n| `--session <name>` | Session name for multiple browsers |\n| `--headless` | Run Chrome in headless mode |\n| `--headed` | Run Chrome with visible window |\n| `--ws <url\\|port>` | One-shot CDP connection (bypasses daemon) |\n| `--json` | Output as JSON |\n\n## Environment Variables\n\n| Variable | Description |\n|----------|-------------|\n| `BROWSE_SESSION` | Default session name |\n| `BROWSERBASE_API_KEY` | Browserbase API key (required for remote mode) |\n\n## Output Formats\n\nThe CLI supports JSON output for programmatic consumption:\n\n```bash\nbrowse get url --json\nbrowse snapshot --json\n```\n\nThis is particularly useful for AI agent integrations that need structured data.\n\n## Related Documentation\n\n- [Stagehand Core Package](../core/README.md) - Browser automation primitives\n- [Browserbase Platform](https://www.browserbase.com) - Remote browser infrastructure\n- [CDP Protocol](https://chromedevtools.github.io/devtools-protocol/) - Chrome DevTools Protocol reference\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：browserbase/stagehand\n\n摘要：发现 7 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：身份坑 - 仓库名和安装名不一致。\n\n## 1. 
身份坑 · 仓库名和安装名不一致\n\n- 严重度：medium\n- 证据强度：runtime_trace\n- 发现：仓库名 `stagehand` 与安装入口 `create-browser-app` 不完全一致。\n- 对用户的影响：用户照着仓库名搜索包或照着包名找仓库时容易走错入口。\n- 建议检查：在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。\n- 复现命令：`npx create-browser-app`\n- 防护动作：页面必须同时展示 repo 名和真实安装入口，避免用户搜索错包。\n- 证据：identity.distribution | github_repo:776908852 | https://github.com/browserbase/stagehand | repo=stagehand; install=create-browser-app\n\n## 2. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:776908852 | https://github.com/browserbase/stagehand | README/documentation is current enough for a first validation pass.\n\n## 3. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | last_activity_observed missing\n\n## 4. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:776908852 | https://github.com/browserbase/stagehand | no_demo; severity=medium\n\n## 5. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:776908852 | https://github.com/browserbase/stagehand | no_demo; severity=medium\n\n## 6. 
维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | issue_or_pr_quality=unknown\n\n## 7. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | release_recency=unknown\n\n<!-- canonical_name: browserbase/stagehand; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "Human Manual / 人类版说明书"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log / 踩坑日志\n\n项目：browserbase/stagehand\n\n摘要：发现 7 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：身份坑 - 仓库名和安装名不一致。\n\n## 1. 身份坑 · 仓库名和安装名不一致\n\n- 严重度：medium\n- 证据强度：runtime_trace\n- 发现：仓库名 `stagehand` 与安装入口 `create-browser-app` 不完全一致。\n- 对用户的影响：用户照着仓库名搜索包或照着包名找仓库时容易走错入口。\n- 建议检查：在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。\n- 复现命令：`npx create-browser-app`\n- 防护动作：页面必须同时展示 repo 名和真实安装入口，避免用户搜索错包。\n- 证据：identity.distribution | github_repo:776908852 | https://github.com/browserbase/stagehand | repo=stagehand; install=create-browser-app\n\n## 2. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:776908852 | https://github.com/browserbase/stagehand | README/documentation is current enough for a first validation pass.\n\n## 3. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | last_activity_observed missing\n\n## 4. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:776908852 | https://github.com/browserbase/stagehand | no_demo; severity=medium\n\n## 5. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:776908852 | https://github.com/browserbase/stagehand | no_demo; severity=medium\n\n## 6. 
维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | issue_or_pr_quality=unknown\n\n## 7. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:776908852 | https://github.com/browserbase/stagehand | release_recency=unknown\n",
      "summary": "用户实践前最可能遇到的身份、安装、配置、运行和安全坑。",
      "title": "Pitfall Log / 踩坑日志"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# stagehand - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for browserbase/stagehand.\n\nProject:\n- Name: stagehand\n- Repository: https://github.com/browserbase/stagehand\n- Summary: The SDK For Browser Agents\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: The SDK For Browser Agents\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. project-introduction: Project Introduction. Produce one small intermediate artifact and wait for confirmation.\n2. architecture-overview: Architecture Overview. Produce one small intermediate artifact and wait for confirmation.\n3. cdp-engine: CDP Engine. Produce one small intermediate artifact and wait for confirmation.\n4. core-actions: Core Actions. Produce one small intermediate artifact and wait for confirmation.\n5. 
dom-accessibility: DOM and Accessibility Tree. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/browserbase/stagehand\n- https://github.com/browserbase/stagehand#readme\n- packages/evals/skills/browser/SKILL.md\n- package.json\n- packages/README.md\n- packages/core/package.json\n- pnpm-workspace.yaml\n- turbo.json\n- packages/core/lib/v3/index.ts\n- packages/core/lib/v3/v3.ts\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "summary": "不安装项目也能感受能力节奏的安全试用 Prompt。",
      "title": "Prompt Preview / 安装前试用 Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / 官方入口\n\n项目：browserbase/stagehand\n\n## 官方安装入口\n\n### Node.js / npx · 官方安装入口\n\n```bash\nnpx create-browser-app\n```\n\n来源：https://github.com/browserbase/stagehand#readme\n\n## 来源\n\n- repo: https://github.com/browserbase/stagehand\n- docs: https://github.com/browserbase/stagehand#readme\n",
      "summary": "从项目官方 README 或安装文档提取的开工入口。",
      "title": "Quick Start / 官方入口"
    }
  },
  "validation_id": "dval_d8fcc448d22743929dc86fb361a4021f"
}