{
  "canonical_name": "unclecode/crawl4ai",
  "compilation_id": "pack_479b6254134042e68127212e3d9dd1a2",
  "created_at": "2026-05-15T09:28:59.331946+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=skill, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=skill, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `pip install -U crawl4ai` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "pip install -U crawl4ai",
      "sandbox_container_image": "python:3.12-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "deterministic_isolated_install",
      "sandbox_validation_id": "sbx_04b45173085e497e9d1c68bfc718cdc8"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_f459513f9c710fd6f978d264bb77da29",
    "canonical_name": "unclecode/crawl4ai",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/unclecode/crawl4ai",
    "slug": "crawl4ai",
    "source_packet_id": "phit_c25a2b2243834ef9837d77f952b3d1bd",
    "source_validation_id": "dval_854bda8c953f459280cd4085d3ec4f00"
  },
  "merchandising": {
    "best_for": "需要流程自动化能力，并使用 local_cli的用户",
    "github_forks": 6702,
    "github_stars": 65505,
    "one_liner_en": "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.",
    "one_liner_zh": "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.",
    "primary_category": {
      "category_id": "workflow-automation",
      "confidence": "high",
      "name_en": "Workflow Automation",
      "name_zh": "流程自动化",
      "reason": "strong category phrase match from project identity and outcome"
    },
    "target_user": "使用 local_cli 等宿主 AI 的用户",
    "title_en": "crawl4ai",
    "title_zh": "crawl4ai 能力包",
    "visible_tags": [
      {
        "label_en": "Security & Permissions",
        "label_zh": "安全审查与权限治理",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-security-permissions",
        "type": "product_domain"
      },
      {
        "label_en": "Web Task Automation",
        "label_zh": "网页任务自动化",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-web-task-automation",
        "type": "user_job"
      },
      {
        "label_en": "Structured Data Extraction",
        "label_zh": "结构化数据提取",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-structured-data-extraction",
        "type": "core_capability"
      },
      {
        "label_en": "Node-based Workflow",
        "label_zh": "节点式流程编排",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-node-based-workflow",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Open Source Tool",
        "label_zh": "开源工具",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-open-source-tool",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_c25a2b2243834ef9837d77f952b3d1bd",
  "page_model": {
    "artifacts": {
      "artifact_slug": "crawl4ai",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "pip install -U crawl4ai",
          "label": "Python / pip · 官方安装入口",
          "source": "https://github.com/unclecode/crawl4ai#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "安全审查与权限治理",
        "网页任务自动化",
        "结构化数据提取",
        "节点式流程编排",
        "开源工具"
      ],
      "eyebrow": "流程自动化",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要流程自动化能力，并使用 local_cli的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper."
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "local_cli",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "skill, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: arun() and arun_many() type hinting needs fixing",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_d3b6cfd3700147f690e0e65875f15424 | https://github.com/unclecode/crawl4ai/issues/1898 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug]: arun() and arun_many() type hinting needs fixing",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked as failed), no error messages or failure reason is shown",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_ad61b108bf894cc286ca7966e8c86758 | https://github.com/unclecode/crawl4ai/issues/1949 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked as failed), no error messages or failure reason…",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and CLI provide",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_1ee99f5d72f143f4b064732cc19e0c85 | https://github.com/unclecode/crawl4ai/issues/1963 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and CLI provide",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: `remove_empty_elements_fast()` drops trailing text when removing empty elements with non-empty .tail",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_d7fa967632a948008efbc182d1f2c96b | https://github.com/unclecode/crawl4ai/issues/1938 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug]: `remove_empty_elements_fast()` drops trailing text when removing empty elements with non-empty .tail",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-3x token overhead for CJK content",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_2e9fbf659fbb40aba437886a87f8e2d7 | https://github.com/unclecode/crawl4ai/issues/1962 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-3x token overhead for CJK content",
            "user_impact": "可能影响授权、密钥配置或安全边界。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug] AsyncLogger writes to stdout, breaking MCP stdio transport",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_af29278fd7294d4a8f0f6f37ab987b5c | https://github.com/unclecode/crawl4ai/issues/1968 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug] AsyncLogger writes to stdout, breaking MCP stdio transport",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: The install with pip on just about any system rarely works. It requires an env or it only partial installs",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_97d44cedb21a4908a7743fde11209954 | https://github.com/unclecode/crawl4ai/issues/1950 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：[Bug]: The install with pip on just about any system rarely works. It requires an env or it only partial installs",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports symbols that don't exist in playwright-stealth 2.x",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_ae45861377894b99a57d6bbdc06af313 | https://github.com/unclecode/crawl4ai/issues/1959 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports symbols that don't exist in playwright-stealth 2.x",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.1:Update",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_a6ae9133fff54443b712725f51769fa1 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.1 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.7.1:Update",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.2: CI/CD & Dependency Optimization Update",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_14954e0431ca426ebeaa4bb31778d4af | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.2 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.7.2: CI/CD & Dependency Optimization Update",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: Markdown export loses heading hierarchy and table structure",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_c3eac8ab81e34bf3b6cc050f7f8e9826 | https://github.com/unclecode/crawl4ai/issues/1964 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug]: Markdown export loses heading hierarchy and table structure",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "README/documentation is current enough for a first validation pass.",
            "category": "能力坑",
            "evidence": [
              "capability.assumptions | github_repo:798201435 | https://github.com/unclecode/crawl4ai | README/documentation is current enough for a first validation pass."
            ],
            "severity": "medium",
            "suggested_check": "将假设转成下游验证清单。",
            "title": "能力判断依赖假设",
            "user_impact": "假设不成立时，用户拿不到承诺的能力。"
          },
          {
            "body": "未记录 last_activity_observed。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | github_repo:798201435 | https://github.com/unclecode/crawl4ai | last_activity_observed missing"
            ],
            "severity": "medium",
            "suggested_check": "补 GitHub 最近 commit、release、issue/PR 响应信号。",
            "title": "维护活跃度未知",
            "user_impact": "新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "downstream_validation.risk_items | github_repo:798201435 | https://github.com/unclecode/crawl4ai | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "进入安全/权限治理复核队列。",
            "title": "下游验证发现风险项",
            "user_impact": "下游已经要求复核，不能在页面中弱化。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "risks.scoring_risks | github_repo:798201435 | https://github.com/unclecode/crawl4ai | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "把风险写入边界卡，并确认是否需要人工复核。",
            "title": "存在评分风险",
            "user_impact": "风险会影响是否适合普通用户安装。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Release v0.7.3",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_e2b75670cbcc4814a86423818b9f6f48 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.3 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Release v0.7.3",
            "user_impact": "可能阻塞安装或首次运行。"
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "发现 21 个潜在踩坑项，其中 5 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: arun() and arun_many() type hinting needs fixing。",
        "title": "踩坑日志"
      },
      "snapshot": {
        "contributors": 76,
        "forks": 6702,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": 65505
      },
      "source_url": "https://github.com/unclecode/crawl4ai",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.",
      "title": "crawl4ai 能力包",
      "trial_prompt": "# crawl4ai - Prompt Preview\n\n> 复制下面这段 Prompt 到你常用的 AI，先试一次，不需要安装。\n> 它的目标是让你直接体验这个项目的服务方式，而不是阅读项目介绍。\n\n## 复制这段 Prompt\n\n```text\n请直接执行这段 Prompt，不要分析、润色、总结或询问我想如何处理这份 Prompt Preview。\n\n你现在扮演 crawl4ai 的“安装前体验版”。\n这不是项目介绍、不是评价报告、不是 README 总结。你的任务是让我用最小成本体验它的核心服务。\n\n我的试用任务：我想用它完成一个真实的流程自动化任务。\n我常用的宿主 AI：Local CLI\n\n【体验目标】\n围绕我的真实任务，现场演示这个项目如何把输入转成 示例引导, 判断线索。重点是让我感受到工作方式，而不是给我项目背景。\n\n【业务流约束】\n- 你必须像一个正在提供服务的项目能力包，而不是像一个讲解员。\n- 每一轮只推进一个步骤；提出问题后必须停下来等我回答。\n- 每一步都必须让我感受到一个具体服务动作：澄清、整理、规划、检查、判断或收尾。\n- 每一步都要说明：当前目标、你需要我提供什么、我回答后你会产出什么。\n- 不要安装、不要运行命令、不要写代码、不要声称测试通过、不要声称已经修改文件。\n- 需要真实安装或宿主加载后才能验证的内容，必须明确说“这一步需要安装后验证”。\n- 如果我说“用示例继续”，你可以用虚构示例推进，但仍然不能声称真实执行。\n\n【可体验服务能力】\n- 安装前能力预览: 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper. 输入：用户任务, 当前 AI 对话上下文；输出：示例引导, 判断线索。\n\n【必须安装后才可验证的能力】\n- 命令行启动或安装流程: 项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 输入：终端环境, 包管理器, 项目依赖；输出：安装结果, 列表/更新/运行结果。\n\n【核心服务流】\n请严格按这个顺序带我体验。不要一次性输出完整流程：\n1. introduction：Introduction to Crawl4AI。围绕“Introduction to Crawl4AI”模拟一次用户任务，不展示安装或运行结果。\n2. quickstart：Quick Start Guide。围绕“Quick Start Guide”模拟一次用户任务，不展示安装或运行结果。\n3. architecture：System Architecture。围绕“System Architecture”模拟一次用户任务，不展示安装或运行结果。\n4. browser_management：Browser Management。围绕“Browser Management”模拟一次用户任务，不展示安装或运行结果。\n5. async_crawler：Async Web Crawler。围绕“Async Web Crawler”模拟一次用户任务，不展示安装或运行结果。\n\n【核心能力体验剧本】\n每一步都必须按“输入 -> 服务动作 -> 中间产物”执行。不要只说流程名：\n1. introduction\n输入：用户提供的“Introduction to Crawl4AI”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n2. quickstart\n输入：用户提供的“Quick Start Guide”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n3. architecture\n输入：用户提供的“System Architecture”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n4. browser_management\n输入：用户提供的“Browser Management”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n5. 
async_crawler\n输入：用户提供的“Async Web Crawler”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n【项目服务规则】\n这些规则决定你如何服务用户。不要解释规则本身，而要在每一步执行时遵守：\n- 先确认用户任务、输入材料和成功标准，再模拟项目能力。\n- 每一步都必须形成可检查的小产物，并等待用户确认后再继续。\n- 凡是需要安装、调用工具或访问外部服务的能力，都必须标记为安装后验证。\n\n【每一步的服务约束】\n- Step 1 / introduction：Step 1 必须围绕“Introduction to Crawl4AI”形成一个小中间产物，并等待用户确认。\n- Step 2 / quickstart：Step 2 必须围绕“Quick Start Guide”形成一个小中间产物，并等待用户确认。\n- Step 3 / architecture：Step 3 必须围绕“System Architecture”形成一个小中间产物，并等待用户确认。\n- Step 4 / browser_management：Step 4 必须围绕“Browser Management”形成一个小中间产物，并等待用户确认。\n- Step 5 / async_crawler：Step 5 必须围绕“Async Web Crawler”形成一个小中间产物，并等待用户确认。\n\n【边界与风险】\n- 不要声称已经安装、运行、调用 API、读写本地文件或完成真实任务。\n- 安装前预览只能展示工作方式，不能证明兼容性、性能或输出质量。\n- 涉及安装、插件加载、工具调用或外部服务的能力必须安装后验证。\n\n【可追溯依据】\n这些路径只用于你内部校验或在我追问“依据是什么”时简要引用。不要在首次回复主动展开：\n- https://github.com/unclecode/crawl4ai\n- https://github.com/unclecode/crawl4ai#readme\n- README.md\n- MISSION.md\n- pyproject.toml\n- docs/examples/quickstart.py\n- docs/examples/hello_world.py\n- crawl4ai/__init__.py\n- deploy/docker/ARCHITECTURE.md\n- crawl4ai/async_webcrawler.py\n- crawl4ai/browser_manager.py\n- crawl4ai/async_dispatcher.py\n\n【首次问题规则】\n- 首次三问必须先确认用户目标、成功标准和边界，不要提前进入工具、安装或实现细节。\n- 如果后续需要技术条件、文件路径或运行环境，必须等用户确认目标后再追问。\n\n首次回复必须只输出下面 4 个部分：\n1. 体验开始：用 1 句话说明你将带我体验 crawl4ai 的核心服务。\n2. 当前步骤：明确进入 Step 1，并说明这一步要解决什么。\n3. 你会如何服务我：说明你会先改变我完成任务的哪个动作。\n4. 只问我 3 个问题，然后停下等待回答。\n\n首次回复禁止输出：后续完整流程、证据清单、安装命令、项目评价、营销文案、已经安装或运行的说法。\n\nStep 1 / introduction 的二轮协议：\n- 我回答首次三问后，你仍然停留在 Step 1 / introduction，不要进入 Step 2。\n- 第二次回复必须产出 6 个部分：澄清后的任务定义、成功标准、边界条件、\n  2-3 个可选方案、每个方案的权衡、推荐方案。\n- 第二次回复最后必须问我是否确认推荐方案；只有我明确确认后，才能进入下一步。\n- 第二次回复禁止输出 git worktree、代码计划、测试文件、命令或真实执行结果。\n\n后续对话规则：\n- 我回答后，你先完成当前步骤的中间产物并等待确认；只有我确认后，才能进入下一步。\n- 每一步都要生成一个小的中间产物，例如澄清后的目标、计划草案、测试意图、验证清单或继续/停止判断。\n- 所有演示都写成“我会建议/我会引导/这一步会形成”，不要写成已经真实执行。\n- 不要声称已经测试通过、文件已修改、命令已运行或结果已产生。\n- 如果某个能力必须安装后验证，请直接说“这一步需要安装后验证”。\n- 如果证据不足，请明确说“证据不足”，不要补事实。\n```\n",
      "voices": [
        {
          "body": "来源平台：github。github/github_issue: [Bug] AsyncLogger writes to stdout, breaking MCP stdio transport（https://github.com/unclecode/crawl4ai/issues/1968）；github/github_issue: [Bug]: Markdown text extraction drops text when element contains empty e（https://github.com/unclecode/crawl4ai/issues/1966）；github/github_issue: [Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-（https://github.com/unclecode/crawl4ai/issues/1962）；github/github_issue: [Bug]: MCP scrape tools lack wait_until / SPA support that REST API and （https://github.com/unclecode/crawl4ai/issues/1963）；github/github_issue: [Bug]: Markdown export loses heading hierarchy and table structure（https://github.com/unclecode/crawl4ai/issues/1964）；github/github_issue: [Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports sy（https://github.com/unclecode/crawl4ai/issues/1959）；github/github_issue: [Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked （https://github.com/unclecode/crawl4ai/issues/1949）；github/github_issue: [Bug]: arun() and arun_many() type hinting needs fixing（https://github.com/unclecode/crawl4ai/issues/1898）；github/github_issue: [Bug]: The install with pip on just about any system rarely works. It re（https://github.com/unclecode/crawl4ai/issues/1950）；github/github_issue: [Bug]: `remove_empty_elements_fast()` drops trailing text when removing （https://github.com/unclecode/crawl4ai/issues/1938）；github/github_release: Release v0.7.7（https://github.com/unclecode/crawl4ai/releases/tag/v0.7.7）；github/github_release: Release v0.7.5（https://github.com/unclecode/crawl4ai/releases/tag/v0.7.5）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug] AsyncLogger writes to stdout, breaking MCP stdio transport",
              "url": "https://github.com/unclecode/crawl4ai/issues/1968"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: Markdown text extraction drops text when element contains empty e",
              "url": "https://github.com/unclecode/crawl4ai/issues/1966"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-",
              "url": "https://github.com/unclecode/crawl4ai/issues/1962"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and ",
              "url": "https://github.com/unclecode/crawl4ai/issues/1963"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: Markdown export loses heading hierarchy and table structure",
              "url": "https://github.com/unclecode/crawl4ai/issues/1964"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports sy",
              "url": "https://github.com/unclecode/crawl4ai/issues/1959"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked ",
              "url": "https://github.com/unclecode/crawl4ai/issues/1949"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: arun() and arun_many() type hinting needs fixing",
              "url": "https://github.com/unclecode/crawl4ai/issues/1898"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: The install with pip on just about any system rarely works. It re",
              "url": "https://github.com/unclecode/crawl4ai/issues/1950"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: `remove_empty_elements_fast()` drops trailing text when removing ",
              "url": "https://github.com/unclecode/crawl4ai/issues/1938"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "Release v0.7.7",
              "url": "https://github.com/unclecode/crawl4ai/releases/tag/v0.7.7"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "Release v0.7.5",
              "url": "https://github.com/unclecode/crawl4ai/releases/tag/v0.7.5"
            }
          ],
          "status": "已收录 12 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "流程自动化",
      "desc": "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.",
      "effort": "安装已验证",
      "forks": 6702,
      "icon": "bolt",
      "name": "crawl4ai 能力包",
      "risk": "可发布",
      "slug": "crawl4ai",
      "stars": 65505,
      "tags": [
        "安全审查与权限治理",
        "网页任务自动化",
        "结构化数据提取",
        "节点式流程编排",
        "开源工具"
      ],
      "thumb": "gray",
      "type": "Skill Pack"
    },
    "manual": {
      "markdown": "# https://github.com/unclecode/crawl4ai 项目说明书\n\n生成时间：2026-05-15 08:23:04 UTC\n\n## 目录\n\n- [Introduction to Crawl4AI](#introduction)\n- [Installation Guide](#installation)\n- [Quick Start Guide](#quickstart)\n- [System Architecture](#architecture)\n- [Browser Management](#browser_management)\n- [Async Web Crawler](#async_crawler)\n- [Markdown Generation](#markdown_generation)\n- [Extraction Strategies](#extraction_strategies)\n- [Deep Crawling Strategies](#deep_crawling)\n- [Anti-Bot Detection and Proxy Management](#anti_bot_detection)\n\n<a id='introduction'></a>\n\n## Introduction to Crawl4AI\n\n### 相关页面\n\n相关主题：[Installation Guide](#installation), [Quick Start Guide](#quickstart)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/unclecode/crawl4ai/blob/main/README.md)\n- [MISSION.md](https://github.com/unclecode/crawl4ai/blob/main/MISSION.md)\n- [pyproject.toml](https://github.com/unclecode/crawl4ai/blob/main/pyproject.toml)\n- [docs/README.md](https://github.com/unclecode/crawl4ai/blob/main/docs/README.md)\n- [CONTRIBUTING.md](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md)\n- [sbom/README.md](https://github.com/unclecode/crawl4ai/blob/main/sbom/README.md)\n</details>\n\n# Introduction to Crawl4AI\n\n## Overview\n\nCrawl4AI is an open-source AI-powered web crawling framework designed to extract structured data from web pages and deliver clean, LLM-ready output. It serves as a modern alternative to traditional web scraping tools by combining intelligent crawling capabilities with AI-driven content extraction and formatting.\n\nThe project emphasizes ease of use, providing both programmatic APIs and command-line interfaces for rapid integration into data pipelines, research workflows, and AI applications.\n\n## Purpose and Scope\n\nCrawl4AI addresses the fundamental challenge of extracting meaningful data from unstructured web content. 
While conventional web crawlers focus on fetching page content, Crawl4AI goes further by:\n\n- **Semantic Understanding**: Analyzing page content to identify and extract relevant information based on context rather than rigid selectors\n- **Structured Output**: Delivering data in formats optimized for large language model consumption, including Markdown, JSON, and structured extractions\n- **Performance Optimization**: Enabling high-throughput crawling with configurable browser automation and connection pooling\n- **Flexibility**: Supporting various output strategies including simple crawling, chunked extraction, and memory-aware processing\n\nThe scope encompasses web crawling, content extraction, link navigation, media handling, and output formatting—providing an end-to-end solution from URL input to structured data output.\n\n## Core Architecture\n\nCrawl4AI follows a modular architecture composed of distinct processing stages:\n\n```mermaid\ngraph TD\n    A[URL Input] --> B[Crawl Strategy]\n    B --> C[Browser Automation]\n    C --> D[Content Extraction]\n    D --> E[AI Processing]\n    E --> F[Output Formatter]\n    F --> G[Structured Output]\n    \n    C -->|JS Rendering| H[JavaScript Executor]\n    D -->|Media| I[Media Handler]\n    E -->|Memory| J[Memory Manager]\n```\n\n### Processing Pipeline\n\n| Stage | Component | Description |\n|-------|-----------|-------------|\n| Input | URL Parser | Validates and normalizes target URLs |\n| Crawl | Strategy Engine | Selects crawling approach based on configuration |\n| Render | Browser Pool | Manages headless browser instances |\n| Extract | AI Extractor | Uses ML models to identify relevant content |\n| Format | Output Serializer | Converts to target format (JSON/Markdown/HTML) |\n\n## Key Features\n\n### 1. AI-Powered Extraction\n\nCrawl4AI leverages machine learning models to understand page content semantically. 
Rather than relying solely on CSS selectors or XPath expressions, the extractor can identify:\n\n- Main article content and metadata\n- Structured data elements (tables, lists, forms)\n- Semantic sections and their relationships\n- Relevant versus boilerplate content\n\n### 2. Multiple Output Strategies\n\nThe framework supports various extraction strategies optimized for different use cases:\n\n| Strategy | Use Case | Output |\n|----------|----------|--------|\n| `default` | General purpose | Clean Markdown with metadata |\n| `cosine` | Semantic clustering | Grouped content chunks |\n| `no-cache` | Fresh data | Bypass internal caching |\n| `passive` | Low resource | Minimal processing |\n| `brainless` | Simple fetch | Raw HTML without AI processing |\n\n### 3. Browser Automation\n\nIntegrated headless browser support enables:\n\n- JavaScript rendering for single-page applications\n- Cookie and session management\n- Custom headers and authentication\n- Screenshot capture\n- PDF generation\n\n### 4. 
Media Handling\n\nCrawl4AI processes various media types during extraction:\n\n- **Images**: Download, compress, and embed with alt-text preservation\n- **Videos**: Extract metadata and embed URLs\n- **Audio**: Handle media references for podcasts and audio content\n- **Documents**: Process embedded PDFs and downloadable files\n\n## Installation and Setup\n\n### Prerequisites\n\n- Python 3.9 or higher\n- Chrome/Chromium browser (for browser automation features)\n- pip or poetry package manager\n\n### Basic Installation\n\n```bash\npip install crawl4ai\n```\n\n### Development Installation\n\n```bash\ngit clone https://github.com/unclecode/crawl4ai.git\ncd crawl4ai\npip install -e \".[dev]\"\n```\n\n### Verify Installation\n\n```python\nfrom crawl4ai import AsyncWebCrawler\n\nasync with AsyncWebCrawler() as crawler:\n    result = await crawler.arun(url=\"https://example.com\")\n    print(result.markdown)\n```\n\n## Basic Usage Patterns\n\n### Simple Crawl\n\n```python\nfrom crawl4ai import AsyncWebCrawler\n\nasync with AsyncWebCrawler() as crawler:\n    result = await crawler.arun(url=\"https://example.com\")\n    print(f\"Title: {result.metadata.get('title')}\")\n    print(f\"Content: {result.markdown}\")\n```\n\n### Configured Extraction\n\n```python\nfrom crawl4ai import CrawlerRunConfig, AsyncWebCrawler\n\nconfig = CrawlerRunConfig(\n    mode=\"aggressive\",\n    word_count_threshold=10,\n    remove_hidden_text=True,\n    process_iframes=True,\n    scroll_delay=1.0\n)\n\nasync with AsyncWebCrawler() as crawler:\n    result = await crawler.arun(\n        url=\"https://example.com/article\",\n        config=config\n    )\n    print(result.markdown)\n```\n\n### Batch Crawling\n\n```python\nfrom crawl4ai import AsyncWebCrawler\n\nurls = [\n    \"https://example.com/page1\",\n    \"https://example.com/page2\",\n    \"https://example.com/page3\"\n]\n\nasync with AsyncWebCrawler() as crawler:\n    results = await crawler.arun_many(urls=urls)\n    for result in 
results:\n        print(f\"URL: {result.url}\")\n        print(f\"Status: {result.status_code}\")\n```\n\n## Configuration Reference\n\n### CrawlerRunConfig Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `mode` | str | `\"default\"` | Extraction strategy |\n| `headless` | bool | `True` | Run browser in headless mode |\n| `verbose` | bool | `False` | Enable verbose logging |\n| `text_threshold` | int | | Minimum text length filter |\n| `word_count_threshold` | int | | Words per chunk threshold |\n| `skip_download_images` | bool | `False` | Skip image downloads |\n| `page_timeout` | int | `30000` | Page load timeout (ms) |\n| `scroll_delay` | float | `0` | Delay between scrolls |\n\n### Browser Configuration\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `browser_type` | str | Chromium/Firefox/WebKit |\n| `headless` | bool | Headless mode toggle |\n| `proxy` | dict | Proxy configuration |\n| `user_agent` | str | Custom user agent string |\n\n## Project Structure\n\n```\ncrawl4ai/\n├── src/crawl4ai/          # Main package source\n│   ├── core/              # Core crawling engine\n│   ├── extractors/        # Content extraction strategies\n│   ├── formatters/        # Output formatters\n│   └── utils/             # Utility functions\n├── examples/              # Usage examples\n├── tests/                 # Test suite\n├── docs/                  # Documentation\n├── scripts/               # Build and utility scripts\n└── sbom/                  # Software Bill of Materials\n```\n\n## Dependencies and SBOM\n\nCrawl4AI maintains a comprehensive Software Bill of Materials (SBOM) documenting all direct and transitive dependencies. 
This SBOM is generated using CycloneDX format and regenerated on a best-effort basis through automated scripts.\n\n### Regenerating SBOM\n\n```bash\n./scripts/gen-sbom.sh\n```\n\nThe SBOM provides visibility into the project's dependency tree, supporting security audits and license compliance verification.\n\n## Extensibility\n\n### Custom Extractors\n\nExtend the extraction framework by implementing the base extractor interface:\n\n```python\nfrom crawl4ai.extractors import BaseExtractor\n\nclass CustomExtractor(BaseExtractor):\n    async def extract(self, html: str, url: str) -> dict:\n        # Custom extraction logic\n        return {\"content\": html, \"custom_field\": \"value\"}\n```\n\n### Output Formatters\n\nCreate custom output formats by implementing the formatter interface:\n\n```python\nfrom crawl4ai.formatters import BaseFormatter\n\nclass CustomFormatter(BaseFormatter):\n    def format(self, result: CrawlResult) -> str:\n        # Custom formatting logic\n        return custom_string\n```\n\n## Contributing\n\nThe project welcomes contributions from the community. Developers interested in contributing should:\n\n1. Fork the repository\n2. Create a feature branch\n3. Follow the established coding standards\n4. Add tests for new functionality\n5. Submit a pull request with clear documentation\n\n## Resources and Documentation\n\n| Resource | Location |\n|----------|----------|\n| Source Code | [GitHub Repository](https://github.com/unclecode/crawl4ai) |\n| Documentation | `/docs` directory |\n| Examples | `/examples` directory |\n| SBOM | `/sbom` directory |\n| Issue Tracker | GitHub Issues |\n\n## License\n\nCrawl4AI is released under open-source licensing terms. 
Refer to the LICENSE file in the repository for specific terms and conditions.\n\n---\n\n<a id='installation'></a>\n\n## Installation Guide\n\n### 相关页面\n\n相关主题：[Quick Start Guide](#quickstart)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [setup.py](https://github.com/unclecode/crawl4ai/blob/main/setup.py)\n- [requirements.txt](https://github.com/unclecode/crawl4ai/blob/main/requirements.txt)\n- [Dockerfile](https://github.com/unclecode/crawl4ai/blob/main/Dockerfile)\n- [crawl4ai/install.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/install.py)\n</details>\n\n# Installation Guide\n\n## Overview\n\nThis guide covers all supported methods for installing crawl4ai, a powerful web crawling and data extraction framework. The installation process handles automatic setup of core dependencies including Playwright for browser automation, supporting multiple installation scenarios from simple pip installations to Docker containerized deployments.\n\n## System Requirements\n\n### Hardware Requirements\n\n| Component | Minimum | Recommended |\n|-----------|---------|-------------|\n| RAM | 4 GB | 8 GB |\n| Disk Space | 2 GB | 5 GB |\n| CPU | 2 cores | 4+ cores |\n\n### Software Prerequisites\n\n| Requirement | Version | Notes |\n|-------------|---------|-------|\n| Python | >= 3.9 | Tested up to 3.12 |\n| pip | Latest | For pip installations |\n| Docker | 20.10+ | For Docker installations |\n| Chrome/Chromium | Latest | Auto-installed by Playwright |\n\n## Installation Methods\n\n### Method 1: pip Installation (Recommended)\n\nThe simplest and most common installation method uses pip package manager.\n\n```bash\npip install crawl4ai\n```\n\nAfter pip installation, you must run the post-installation setup to configure browser dependencies:\n\n```bash\npython -m crawl4ai install\n```\n\nThis command installs Playwright browsers and configures the necessary system dependencies. 
资料来源：[crawl4ai/install.py:1-50]()\n\n### Method 2: Docker Installation\n\nDocker provides an isolated environment with all dependencies pre-configured.\n\n#### Pulling the Official Image\n\n```bash\ndocker pull unclecode/crawl4ai\n```\n\n#### Running with Docker\n\n```bash\ndocker run -d \\\n  --name crawl4ai \\\n  -p 8000:8000 \\\n  unclecode/crawl4ai\n```\n\n#### Building Custom Docker Image\n\nYou can build your own image using the provided Dockerfile:\n\n```dockerfile\nFROM python:3.11-slim\nWORKDIR /app\nCOPY requirements.txt .\nRUN pip install --no-cache-dir -r requirements.txt\nCOPY . .\nCMD [\"python\", \"-m\", \"crawl4ai\"]\n```\n\n资料来源：[Dockerfile](https://github.com/unclecode/crawl4ai/blob/main/Dockerfile)\n\n### Method 3: Installation from Source\n\nFor development or customization purposes, install from the source repository:\n\n```bash\ngit clone https://github.com/unclecode/crawl4ai.git\ncd crawl4ai\npip install -e .\n```\n\n## Dependencies Management\n\n### Core Dependencies\n\nThe project defines dependencies in `setup.py` for package distribution and `requirements.txt` for development.\n\n**setup.py dependencies:**\n\n| Package | Purpose |\n|---------|---------|\n| playwright | Browser automation |\n| asyncio | Async operations |\n| aiohttp | HTTP client |\n| beautifulsoup4 | HTML parsing |\n| lxml | XML/HTML processing |\n\n资料来源：[setup.py](https://github.com/unclecode/crawl4ai/blob/main/setup.py)\n\n### Runtime Dependency Installation\n\nThe `crawl4ai/install.py` module handles automatic installation of runtime dependencies:\n\n```python\n# Key installation steps in install.py\nimport subprocess\nimport sys\n\ndef install_browsers():\n    subprocess.check_call([\n        sys.executable, \"-m\", \"playwright\", \"install\", \"chromium\"\n    ])\n```\n\n资料来源：[crawl4ai/install.py:20-30]()\n\n### Installing Optional Dependencies\n\n| Extra | Command | Description |\n|-------|---------|-------------|\n| All extras | `pip install crawl4ai[all]` | 
Install all optional packages |\n| Dev tools | `pip install crawl4ai[dev]` | Development dependencies |\n| LLM support | `pip install crawl4ai[llm]` | Language model integration |\n\n## Installation Workflow\n\n```mermaid\ngraph TD\n    A[Start Installation] --> B{Installation Method}\n    B -->|pip| C[Run pip install]\n    B -->|Docker| D[Pull/Build Image]\n    B -->|Source| E[Clone Repository]\n    C --> F[Run post-install]\n    F --> G[Install Playwright Browsers]\n    D --> H[Run Container]\n    E --> I[Install in Editable Mode]\n    I --> F\n    G --> J[Verify Installation]\n    H --> J\n    J --> K{Success?}\n    K -->|Yes| L[Installation Complete]\n    K -->|No| M[Troubleshoot]\n    M --> G\n```\n\n## Post-Installation Verification\n\nVerify that crawl4ai is installed correctly by checking the installation status:\n\n```bash\npython -m crawl4ai --version\n```\n\nOr test the installation programmatically:\n\n```python\nimport crawl4ai\nprint(crawl4ai.__version__)\n```\n\n### Browser Verification\n\nEnsure Playwright browsers are properly installed:\n\n```bash\npython -m playwright install-deps chromium\npython -m playwright install chromium\n```\n\n## Environment Configuration\n\n### Environment Variables\n\n| Variable | Default | Description |\n|----------|---------|-------------|\n| `CRAWL4AI_BROWSER_HEADLESS` | `true` | Run browser in headless mode |\n| `CRAWL4AI_MAX_CONCURRENT` | `5` | Maximum concurrent crawls |\n| `CRAWL4AI_CACHE_DIR` | `~/.crawl4ai/cache` | Cache directory path |\n\n### Configuration File\n\nCreate `~/.crawl4ai/config.json` for persistent configuration:\n\n```json\n{\n  \"browser\": {\n    \"headless\": true,\n    \"viewport\": {\"width\": 1920, \"height\": 1080}\n  },\n  \"cache\": {\n    \"enabled\": true,\n    \"ttl\": 3600\n  }\n}\n```\n\n## Common Installation Issues\n\n### Issue: Playwright Installation Fails\n\n**Solution:** Install system dependencies manually:\n\n```bash\n# Ubuntu/Debian\napt-get install -y libnss3 libnspr4 
libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2\n\n# Then retry browser installation\npython -m playwright install chromium\n```\n\n### Issue: Permission Denied\n\n**Solution:** Use virtual environment or `--user` flag:\n\n```bash\npython -m venv venv\nsource venv/bin/activate  # Linux/Mac\npip install crawl4ai\n```\n\n### Issue: Import Errors After Installation\n\n**Solution:** Verify Python path and reinstall:\n\n```bash\npip uninstall crawl4ai\npip install crawl4ai --force-reinstall\n```\n\n## Docker-Specific Configuration\n\n### Volume Mounts\n\nMount local directories for persistent data:\n\n```bash\ndocker run -v /path/to/data:/data unclecode/crawl4ai\n```\n\n### Network Configuration\n\nFor web crawling behind proxies:\n\n```bash\ndocker run -e HTTP_PROXY=http://proxy:8080 \\\n           -e HTTPS_PROXY=https://proxy:8080 \\\n           unclecode/crawl4ai\n```\n\n## Upgrading crawl4ai\n\n### pip Upgrade\n\n```bash\npip install crawl4ai --upgrade\n```\n\n### Docker Upgrade\n\n```bash\ndocker pull unclecode/crawl4ai\ndocker stop old_container\ndocker rm old_container\ndocker run unclecode/crawl4ai\n```\n\n### Source Upgrade\n\n```bash\ngit pull origin main\npip install -e . --force-reinstall\n```\n\n## Next Steps\n\nAfter successful installation, proceed to:\n\n1. **Quick Start** - Run your first crawl operation\n2. **Configuration Guide** - Customize crawl behavior\n3. **API Reference** - Explore available methods and options\n4. 
**Examples** - Review usage patterns and best practices\n\n---\n\n<a id='quickstart'></a>\n\n## Quick Start Guide\n\n### 相关页面\n\n相关主题：[Async Web Crawler](#async_crawler), [Markdown Generation](#markdown_generation)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [docs/examples/quickstart.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py)\n- [docs/examples/hello_world.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world.py)\n- [crawl4ai/__init__.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/__init__.py)\n- [sbom/README.md](https://github.com/unclecode/crawl4ai/blob/main/sbom/README.md)\n</details>\n\n# Quick Start Guide\n\n## Overview\n\nThe **Quick Start Guide** provides developers with a rapid introduction to using crawl4ai for web crawling and data extraction tasks. It serves as the entry point for users who want to begin extracting structured data from websites within minutes of installation.\n\n### Purpose and Scope\n\nThe Quick Start Guide is designed to:\n\n- Demonstrate the simplest possible usage pattern for crawling web pages\n- Show how to extract and structure content from HTML pages\n- Provide copy-paste-ready code examples for immediate experimentation\n- Bridge the gap between installation and production usage\n\n## Installation\n\n### Prerequisites\n\n| Requirement | Description |\n|-------------|-------------|\n| Python | Version 3.10 or higher |\n| pip | Latest version recommended |\n| Browser | Chrome/Chromium (for JavaScript rendering) |\n\n### Installation Command\n\n```bash\npip install crawl4ai\n```\n\n## Basic Usage Pattern\n\nThe fundamental workflow in crawl4ai follows a simple three-step pattern:\n\n```mermaid\ngraph TD\n    A[Create AsyncWebCrawler Instance] --> B[Configure Parameters]\n    B --> C[Call arun Method with URL]\n    C --> D[Process Result Object]\n    D --> E[Extract Content/Markdown/HTML]\n```\n\n### Hello World Example\n\nThe simplest 
possible usage demonstrates core functionality:\n\n```python\nimport asyncio\nfrom crawl4ai import AsyncWebCrawler\n\nasync def main():\n    async with AsyncWebCrawler() as crawler:\n        result = await crawler.arun(url=\"https://example.com\")\n        \n        if result.success:\n            print(f\"Content: {result.markdown}\")\n            print(f\"Links found: {len(result.links)}\")\n        else:\n            print(f\"Crawl failed: {result.error_message}\")\n\nasyncio.run(main())\n```\n\n## Core Components\n\n### AsyncWebCrawler\n\nThe primary entry point for all crawling operations:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `verbose` | bool | Enable detailed logging output |\n| `headless` | bool | Run browser in headless mode |\n| `browser_type` | str | Specify browser engine |\n\n资料来源：[crawl4ai/__init__.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/__init__.py)\n\n### CrawlResult Object\n\nThe return value from `crawler.arun()` contains extracted data:\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `success` | bool | Whether crawl completed successfully |\n| `markdown` | str | Extracted content as markdown |\n| `html` | str | Raw HTML content |\n| `links` | dict | Dictionary of internal/external links |\n| `media` | dict | Images, videos, and other media |\n| `error_message` | str | Error details if `success` is False |\n\n## Common Usage Patterns\n\n### Pattern 1: Simple Content Extraction\n\n```python\nimport asyncio\nfrom crawl4ai import AsyncWebCrawler\n\nasync def extract_content():\n    async with AsyncWebCrawler(verbose=True) as crawler:\n        result = await crawler.arun(\n            url=\"https://example.com\",\n            word_count_threshold=10\n        )\n        \n        if result.success:\n            return result.markdown\n        return None\n\ncontent = asyncio.run(extract_content())\n```\n\n### Pattern 2: Batch Crawling\n\n```python\nimport 
asyncio\nfrom crawl4ai import AsyncWebCrawler\n\nasync def crawl_multiple(urls):\n    async with AsyncWebCrawler() as crawler:\n        tasks = [crawler.arun(url=url) for url in urls]\n        results = await asyncio.gather(*tasks)\n        return [r for r in results if r.success]\n\nurls = [\"https://example.com\", \"https://example.org\"]\nsuccessful_results = asyncio.run(crawl_multiple(urls))\n```\n\n## Configuration Options\n\n### Browser Configuration\n\n```python\nfrom crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig\n\nbrowser_config = BrowserConfig(\n    headless=True,\n    verbose=False\n)\n\nrun_config = CrawlerRunConfig(\n    word_count_threshold=10,\n    page_timeout=30000\n)\n\nasync with AsyncWebCrawler(config=browser_config) as crawler:\n    result = await crawler.arun(\n        url=\"https://example.com\",\n        config=run_config\n    )\n```\n\n## Error Handling\n\nAlways check the `success` property before accessing extracted content:\n\n```python\nresult = await crawler.arun(url=\"https://example.com\")\n\nif result.success:\n    process_data(result.markdown)\nelse:\n    log_error(f\"Crawl failed: {result.error_message}\")\n    handle_failure()\n```\n\n## Next Steps\n\nAfter completing the Quick Start Guide, users should explore:\n\n- Advanced extraction strategies with CSS selectors and XPath\n- JavaScript-heavy page crawling\n- Rate limiting and polite crawling practices\n- Integration with AI/LLM pipelines for content analysis\n\n---\n\n## Summary\n\nThe Quick Start Guide provides the essential foundation for using crawl4ai effectively. 
By following the patterns shown, developers can immediately begin extracting structured web content with minimal configuration overhead.\n\n---\n\n<a id='architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[Browser Management](#browser_management), [Async Web Crawler](#async_crawler)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [deploy/docker/ARCHITECTURE.md](https://github.com/unclecode/crawl4ai/blob/main/deploy/docker/ARCHITECTURE.md)\n- [crawl4ai/async_webcrawler.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_webcrawler.py)\n- [crawl4ai/browser_manager.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_manager.py)\n- [crawl4ai/async_dispatcher.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_dispatcher.py)\n- [crawl4ai/models.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/models.py)\n</details>\n\n# System Architecture\n\n## Overview\n\nCrawl4AI is a high-performance web crawling framework designed for AI applications. It enables efficient extraction of web content along with metadata, supporting both single-page crawling and large-scale asynchronous crawling operations. 
The architecture emphasizes separation of concerns between browser management, crawling logic, and result processing.\n\n## Core Components\n\nThe system is built around three primary modules that work in coordination:\n\n| Component | File | Responsibility |\n|-----------|------|----------------|\n| AsyncWebCrawler | `crawl4ai/async_webcrawler.py` | Main entry point for crawling operations |\n| BrowserManager | `crawl4ai/browser_manager.py` | Handles browser lifecycle and page interactions |\n| AsyncDispatcher | `crawl4ai/async_dispatcher.py` | Manages concurrent crawling tasks |\n\n## Component Architecture\n\n```mermaid\ngraph TD\n    A[User / API Client] --> B[AsyncWebCrawler]\n    B --> C[BrowserManager]\n    B --> D[AsyncDispatcher]\n    C --> E[Browser Instance]\n    D --> F[Task Queue]\n    F --> E\n    E --> G[Content Extraction]\n    G --> H[Result Models]\n    H --> B\n```\n\n## AsyncWebCrawler\n\nThe `AsyncWebCrawler` class serves as the primary interface for initiating crawl operations. 
It accepts configuration parameters and coordinates the crawling workflow.\n\n### Key Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| config | CrawlerRunConfig | Configuration for the crawl session |\n| browser_manager | BrowserManager | Shared browser manager instance |\n| dispatcher | AsyncDispatcher | Task dispatcher for async operations |\n\n资料来源：[crawl4ai/async_webcrawler.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_webcrawler.py)\n\n### Workflow\n\n```mermaid\ngraph LR\n    A[Initialize Crawler] --> B[Configure Browser]\n    B --> C[Create BrowserContext]\n    C --> D[Navigate to URL]\n    D --> E[Extract Content]\n    E --> F[Return CrawlResult]\n```\n\n## BrowserManager\n\nThe `BrowserManager` handles the lifecycle of browser instances, managing Chrome/Chromium processes and providing isolated contexts for crawling sessions.\n\n资料来源：[crawl4ai/browser_manager.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_manager.py)\n\n### Browser Lifecycle\n\n```mermaid\ngraph TD\n    A[Launch Browser] --> B[Create Context]\n    B --> C[Create Page]\n    C --> D[Execute Crawl]\n    D --> E[Close Context]\n    E --> F[Repeat or Shutdown]\n    F --> A\n```\n\n## AsyncDispatcher\n\nThe `AsyncDispatcher` enables concurrent crawling operations, managing task queues and coordinating multiple browser contexts for parallel extraction.\n\n资料来源：[crawl4ai/async_dispatcher.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_dispatcher.py)\n\n### Parallel Execution Model\n\n```mermaid\ngraph TD\n    A[URL List] --> B[Dispatcher Queue]\n    B --> C[Worker 1]\n    B --> D[Worker 2]\n    B --> E[Worker N]\n    C --> F[Results Aggregator]\n    D --> F\n    E --> F\n    F --> G[Combined Output]\n```\n\n## Data Models\n\nResults from crawling operations are structured using Pydantic models defined in 
`models.py`.\n\n资料来源：[crawl4ai/models.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/models.py)\n\n| Model | Purpose |\n|-------|---------|\n| CrawlResult | Container for extracted content and metadata |\n| CrawlerRunConfig | Configuration parameters for crawl sessions |\n\n## Docker Deployment Architecture\n\nThe project includes Docker deployment specifications that containerize the crawling infrastructure.\n\n```mermaid\ngraph TD\n    A[Docker Compose] --> B[Crawl4AI Container]\n    A --> C[Redis Cache]\n    A --> D[Chrome Browser]\n    B --> C\n    B --> D\n```\n\n资料来源：[deploy/docker/ARCHITECTURE.md](https://github.com/unclecode/crawl4ai/blob/main/deploy/docker/ARCHITECTURE.md)\n\n## Technology Stack\n\n| Layer | Technology |\n|-------|------------|\n| Runtime | Python 3.10+ |\n| Browser Engine | Chrome/Chromium via Playwright |\n| Async Framework | asyncio |\n| Data Validation | Pydantic |\n| Containerization | Docker |\n\n## Configuration\n\nThe system supports extensive configuration options through `CrawlerRunConfig`, including:\n\n- JavaScript execution toggles\n- Memory management settings\n- Request throttling parameters\n- Content extraction strategies\n\n## Dependency Management\n\nThe project maintains a Software Bill of Materials (SBOM) for tracking dependencies and ensuring reproducible builds.\n\n资料来源：[sbom/README.md](https://github.com/unclecode/crawl4ai/blob/main/sbom/README.md)\n\nTo regenerate the SBOM:\n\n```bash\n./scripts/gen-sbom.sh\n```\n\n---\n\n<a id='browser_management'></a>\n\n## Browser Management\n\n### 相关页面\n\n相关主题：[Anti-Bot Detection and Proxy Management](#anti_bot_detection)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/browser_manager.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_manager.py)\n- [crawl4ai/browser_adapter.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_adapter.py)\n- 
[crawl4ai/browser_profiler.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_profiler.py)\n- [crawl4ai/js_snippet](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/js_snippet)\n- [docs/codebase/browser.md](https://github.com/unclecode/crawl4ai/blob/main/docs/codebase/browser.md)\n</details>\n\n# Browser Management\n\n## Overview\n\nBrowser Management in crawl4ai provides a comprehensive abstraction layer for controlling and orchestrating browser instances used during web crawling and scraping operations. The system abstracts the complexity of browser automation, allowing users to focus on data extraction rather than browser lifecycle management.\n\n## Architecture Overview\n\nThe browser management system follows a modular architecture with distinct components that handle specific responsibilities:\n\n```mermaid\ngraph TD\n    A[BrowserManager] --> B[BrowserAdapter]\n    A --> C[BrowserProfiler]\n    B --> D[Playwright/Chromium]\n    C --> E[JS Snippets]\n    F[User Request] --> A\n    A --> G[Crawled Result]\n```\n\n## Core Components\n\n### BrowserManager\n\nThe central orchestrator responsible for:\n\n- Browser instance lifecycle (creation, configuration, teardown)\n- Session management and isolation\n- Resource allocation and cleanup\n- Coordination between adapters and profilers\n\n**Key Responsibilities:**\n\n| Responsibility | Description |\n|----------------|-------------|\n| Instance Creation | Creates and initializes browser contexts |\n| Configuration | Applies user-defined browser settings |\n| Lifecycle Control | Manages startup and shutdown sequences |\n| Pool Management | Handles browser pool for concurrent operations |\n\n资料来源：[crawl4ai/browser_manager.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_manager.py)\n\n### BrowserAdapter\n\nThe adapter pattern implementation that provides a consistent interface for interacting with different browser engines (Playwright, Chromium, Firefox, 
WebKit).\n\n**Adapter Features:**\n\n| Feature | Description |\n|---------|-------------|\n| Engine Abstraction | Unified API across browser backends |\n| Command Translation | Converts high-level commands to browser-specific instructions |\n| Response Normalization | Standardizes browser responses |\n\n资料来源：[crawl4ai/browser_adapter.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_adapter.py)\n\n### BrowserProfiler\n\nHandles JavaScript injection and performance profiling during browser operations.\n\n**Profiler Capabilities:**\n\n| Capability | Purpose |\n|------------|---------|\n| JS Injection | Execute custom JavaScript in page context |\n| Performance Tracking | Monitor page load and execution metrics |\n| Resource Profiling | Track network requests and responses |\n\n资料来源：[crawl4ai/browser_profiler.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_profiler.py)\n\n## JavaScript Integration\n\nThe `js_snippet` module provides pre-built JavaScript utilities for browser automation tasks:\n\n```mermaid\ngraph LR\n    A[Browser Context] --> B[js_snippet Module]\n    B --> C[DOM Manipulation]\n    B --> D[Data Extraction]\n    B --> E[Event Handling]\n```\n\n**Common JS Snippet Categories:**\n\n- DOM traversal and manipulation\n- Content extraction\n- Scroll management\n- Wait conditions\n- Network request interception\n\n资料来源：[crawl4ai/js_snippet](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/js_snippet)\n\n## Configuration Options\n\n### Browser Launch Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| headless | bool | true | Run browser in headless mode |\n| args | list | [] | Additional browser arguments |\n| timeout | int | 30000 | Navigation timeout in milliseconds |\n| viewport | dict | {\"width\": 1920, \"height\": 1080} | Browser viewport dimensions |\n| user_agent | str | None | Custom user agent string |\n| proxy | dict | None | Proxy configuration 
|\n\n### Context Options\n\n| Option | Type | Description |\n|--------|------|-------------|\n| java_script_enabled | bool | Enable/disable JavaScript in the browser context |\n| ignore_https_errors | bool | Ignore SSL certificate errors |\n\n资料来源：[docs/codebase/browser.md](https://github.com/unclecode/crawl4ai/blob/main/docs/codebase/browser.md)\n\n## Browser Lifecycle\n\n```mermaid\nstateDiagram-v2\n    [*] --> Initializing: Create BrowserManager\n    Initializing --> Launching: Launch Browser\n    Launching --> Ready: Browser Context Created\n    Ready --> Navigating: Load URL\n    Navigating --> Ready: Page Loaded\n    Ready --> Executing: Run JS/Commands\n    Executing --> Ready: Commands Complete\n    Ready --> Closing: Shutdown Request\n    Closing --> [*]: Resources Freed\n```\n\n## Usage Patterns\n\n### Basic Browser Usage\n\n```python\nfrom crawl4ai import BrowserManager\n\n# Initialize browser manager\nbrowser_mgr = BrowserManager(\n    headless=True,\n    viewport={\"width\": 1920, \"height\": 1080}\n)\n\n# Create browser context\ncontext = browser_mgr.new_context()\n\n# Use context for crawling\nresult = await context.goto(\"https://example.com\")\n```\n\n### Advanced Configuration\n\n```python\nbrowser_mgr = BrowserManager(\n    headless=False,\n    args=[\n        \"--disable-blink-features=AutomationControlled\",\n        \"--disable-dev-shm-usage\"\n    ],\n    timeout=60000,\n    user_agent=\"Custom User Agent\"\n)\n```\n\n## Session Management\n\nThe system supports multiple concurrent sessions through isolated browser contexts:\n\n```mermaid\ngraph TD\n    A[BrowserManager] --> B1[Session 1 Context]\n    A --> B2[Session 2 Context]\n    A --> B3[Session N Context]\n    B1 --> C1[Page 1]\n    B2 --> C2[Page 2]\n    B3 --> C3[Page N]\n```\n\n## Error Handling\n\nThe browser management system implements comprehensive error handling:\n\n| Error Type | Handling Strategy 
|\n|------------|-------------------|\n| Navigation Timeout | Retry with exponential backoff |\n| Browser Crash | Automatic restart and context recreation |\n| Resource Exhaustion | Automatic cleanup of stale contexts |\n| Network Errors | Graceful degradation with cached content |\n\n## Performance Considerations\n\n### Optimization Strategies\n\n1. **Context Reuse**: Reuse browser contexts for multiple pages when possible\n2. **Lazy Loading**: Only load resources when explicitly requested\n3. **Resource Limits**: Configure memory and CPU limits per context\n4. **Connection Pooling**: Maintain warm browser instances for rapid access\n\n### Memory Management\n\n| Strategy | Description |\n|----------|-------------|\n| Context Isolation | Each session runs in isolated context |\n| Automatic Cleanup | Temporary files and caches cleared automatically |\n| Resource Limits | Configurable memory caps per browser instance |\n\n## Related Documentation\n\n- [Browser Adapter API Reference](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_adapter.py)\n- [JavaScript Snippets Library](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/js_snippet)\n- [Codebase Overview](https://github.com/unclecode/crawl4ai/blob/main/docs/codebase/browser.md)\n\n---\n\n<a id='async_crawler'></a>\n\n## Async Web Crawler\n\n### 相关页面\n\n相关主题：[Markdown Generation](#markdown_generation), [Extraction Strategies](#extraction_strategies)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/async_webcrawler.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_webcrawler.py)\n- [crawl4ai/async_configs.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_configs.py)\n- [crawl4ai/cache_context.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/cache_context.py)\n- [crawl4ai/types.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/types.py)\n- 
[docs/md_v2/api/async-webcrawler.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/api/async-webcrawler.md)\n</details>\n\n# Async Web Crawler\n\n## Overview\n\nThe Async Web Crawler is the core component of crawl4ai, providing an asynchronous, high-performance web crawling engine built on Python's `asyncio` framework. It enables concurrent crawling of multiple URLs with built-in caching, configurable extraction strategies, and comprehensive result handling.\n\nThe primary purpose of this module is to fetch web pages, extract meaningful content, and return structured results that include HTML, markdown, media assets, metadata, and optional AI-generated summaries. The async design allows for efficient I/O-bound operations, making it suitable for large-scale web scraping projects.\n\n资料来源：[crawl4ai/async_webcrawler.py:1-50]()\n\n## Architecture\n\n### System Components\n\nThe async web crawler system consists of several interconnected components that work together to provide a seamless crawling experience.\n\n```mermaid\ngraph TD\n    A[AsyncWebCrawler] --> B[Browser Manager]\n    A --> C[Cache Layer]\n    A --> D[Extraction Strategy]\n    A --> E[Result Processor]\n    \n    B --> F[Playwright/Chromium]\n    C --> G[File System Cache]\n    C --> H[Memory Cache]\n    \n    D --> I[LLM-based Extraction]\n    D --> J[CSS/XPath Extraction]\n    \n    E --> K[CrawlResult]\n    E --> L[Raw HTML]\n    E --> M[Markdown]\n```\n\n资料来源：[crawl4ai/async_webcrawler.py:1-30]()\n\n### Core Classes\n\n| Class | File | Purpose |\n|-------|------|---------|\n| `AsyncWebCrawler` | async_webcrawler.py | Main crawler entry point with `arun()` and `arun_many()` methods |\n| `BrowserConfig` | async_configs.py | Configuration for headless browser behavior |\n| `CrawlCache` | cache_context.py | Manages caching strategies for crawled content |\n| `CrawlResult` | types.py | Data model for returning crawl results |\n\n资料来源：[crawl4ai/async_configs.py:1-30]()\n\n## AsyncWebCrawler 
Class\n\n### Initialization\n\nThe `AsyncWebCrawler` class can be initialized with optional configuration parameters:\n\n```python\nclass AsyncWebCrawler:\n    def __init__(\n        self,\n        config: BrowserConfig | None = None,\n        verbose: bool = False\n    ) -> None:\n```\n\n**Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `config` | `BrowserConfig` | `None` | Browser configuration object |\n| `verbose` | `bool` | `False` | Enable verbose logging output |\n\n资料来源：[crawl4ai/async_webcrawler.py:50-70]()\n\n### Core Methods\n\n#### `arun()`\n\nSingle URL crawling with comprehensive result extraction:\n\n```python\nasync def arun(\n    self,\n    url: str,\n    config: BrowserConfig | None = None,\n    **kwargs\n) -> CrawlResult\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `url` | `str` | Target URL to crawl |\n| `config` | `BrowserConfig` | Override browser configuration |\n\n资料来源：[crawl4ai/async_webcrawler.py:100-150]()\n\n#### `arun_many()`\n\nBatch crawling for multiple URLs concurrently:\n\n```python\nasync def arun_many(\n    self,\n    urls: list[str],\n    config: BrowserConfig | None = None,\n    **kwargs\n) -> list[CrawlResult]\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `urls` | `list[str]` | List of target URLs |\n| `config` | `BrowserConfig` | Shared configuration for all URLs |\n\n资料来源：[crawl4ai/async_webcrawler.py:200-250]()\n\n### Context Manager Support\n\nThe `AsyncWebCrawler` implements the async context manager protocol for proper resource cleanup:\n\n```python\nasync def __aenter__(self) -> \"AsyncWebCrawler\":\n    await self.start()\n    return self\n\nasync def __aexit__(\n    self,\n    exc_type, exc_val, exc_tb\n) -> None:\n    await self.close()\n```\n\n资料来源：[crawl4ai/async_webcrawler.py:80-100]()\n\n## CrawlResult Data Model\n\nThe `CrawlResult` class 
encapsulates all information retrieved from a crawled page:\n\n```mermaid\nclassDiagram\n    class CrawlResult {\n        +str url\n        +str html\n        +str markdown\n        +list~MediaItem~ media\n        +list~Link~ links\n        +dict metadata\n        +bool success\n        +str|None error\n        +dict~str, Any~ extracted_content\n        +int status_code\n        +datetime created_at\n    }\n```\n\n资料来源：[crawl4ai/types.py:1-80]()\n\n### Properties\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `url` | `str` | Original request URL |\n| `html` | `str` | Raw HTML content |\n| `markdown` | `str` | Converted markdown content |\n| `media` | `list[MediaItem]` | Extracted images, videos, audio |\n| `links` | `list[Link]` | Internal and external links |\n| `metadata` | `dict` | Page metadata (title, description) |\n| `success` | `bool` | Whether the crawl succeeded |\n| `error` | `str \\| None` | Error message if failed |\n| `status_code` | `int` | HTTP response status code |\n| `created_at` | `datetime` | Timestamp of crawl operation |\n\n资料来源：[crawl4ai/types.py:50-100]()\n\n## Configuration\n\n### BrowserConfig\n\nThe `BrowserConfig` class provides fine-grained control over browser behavior:\n\n```python\n@dataclass\nclass BrowserConfig:\n    headless: bool = True\n    browser_type: str = \"chromium\"\n    viewport_size: dict = {\"width\": 1920, \"height\": 1080}\n    user_agent: str | None = None\n    verbose: bool = False\n```\n\n资料来源：[crawl4ai/async_configs.py:30-80]()\n\n### Configuration Options\n\n| Option | Type | Default | Description |\n|--------|------|---------|-------------|\n| `headless` | `bool` | `True` | Run browser in headless mode |\n| `browser_type` | `str` | `\"chromium\"` | Browser engine (chromium, firefox, webkit) |\n| `viewport_size.width` | `int` | `1920` | Viewport width in pixels |\n| `viewport_size.height` | `int` | `1080` | Viewport height in pixels |\n| `user_agent` | `str \\| None` | `None` 
| Custom user agent string |\n| `verbose` | `bool` | `False` | Enable debug output |\n\n资料来源：[crawl4ai/async_configs.py:40-90]()\n\n### Advanced Configuration\n\nAdditional crawling parameters can be passed via kwargs:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `word_count_threshold` | `int` | Minimum word count for content extraction |\n| `extraction_strategy` | `ExtractionStrategy` | Strategy for content extraction |\n| `cache_mode` | `CacheMode` | Caching behavior (enabled/disabled/bypass) |\n| `js_enabled` | `bool` | Enable JavaScript execution |\n| `wait_for` | `str` | CSS selector to wait for before returning |\n| `delay_before_return_html` | `float` | Delay in seconds before capturing HTML |\n\n资料来源：[docs/md_v2/api/async-webcrawler.md:1-60]()\n\n## Caching System\n\n### Cache Modes\n\nThe crawl4ai framework implements a multi-layered caching strategy:\n\n```mermaid\ngraph LR\n    A[Request] --> B{Memory Cache}\n    B -->|Hit| C[Return Cached]\n    B -->|Miss| D{File System Cache}\n    D -->|Hit| E[Return Cached]\n    D -->|Miss| F[Fetch Remote]\n    F --> G[Store in Both Layers]\n```\n\n资料来源：[crawl4ai/cache_context.py:1-50]()\n\n### CacheMode Enum\n\n| Mode | Description |\n|------|-------------|\n| `ENABLED` | Use cache if available, otherwise fetch and cache |\n| `DISABLED` | Always fetch fresh content, bypass cache |\n| `BYPASS` | Fetch and update cache but don't read from it |\n| `READ_ONLY` | Only read from cache, never fetch |\n\n资料来源：[crawl4ai/cache_context.py:30-60]()\n\n### Cache Context Manager\n\n```python\nasync with cache_context(cache_mode=CacheMode.ENABLED):\n    result = await crawler.arun(url=\"https://example.com\")\n```\n\n资料来源：[crawl4ai/cache_context.py:60-90]()\n\n## Usage Examples\n\n### Basic Single URL Crawl\n\n```python\nimport asyncio\nfrom crawl4ai import AsyncWebCrawler, BrowserConfig\n\nasync def main():\n    async with AsyncWebCrawler(verbose=True) as crawler:\n        result = await crawler.arun(\n            
url=\"https://example.com\",\n            config=BrowserConfig(headless=True)\n        )\n        \n        print(f\"Success: {result.success}\")\n        print(f\"Markdown content: {result.markdown[:500]}\")\n\nasyncio.run(main())\n```\n\n资料来源：[crawl4ai/async_webcrawler.py:150-200]()\n\n### Batch Crawling\n\n```python\nimport asyncio\nfrom crawl4ai import AsyncWebCrawler\n\nasync def main():\n    urls = [\n        \"https://example.com/page1\",\n        \"https://example.com/page2\",\n        \"https://example.com/page3\"\n    ]\n    \n    async with AsyncWebCrawler() as crawler:\n        results = await crawler.arun_many(urls=urls)\n        \n        for result in results:\n            print(f\"URL: {result.url}, Status: {result.status_code}\")\n\nasyncio.run(main())\n```\n\n资料来源：[docs/md_v2/api/async-webcrawler.md:60-100]()\n\n### With Custom Extraction Strategy\n\n```python\nimport asyncio\nfrom crawl4ai import AsyncWebCrawler, BrowserConfig\nfrom crawl4ai.extraction_strategy import LLMExtractionStrategy\n\nasync def main():\n    config = BrowserConfig(\n        headless=True,\n        verbose=True\n    )\n    \n    strategy = LLMExtractionStrategy(\n        provider=\"openai/gpt-4\",\n        api_token=\"your-token\"\n    )\n    \n    async with AsyncWebCrawler(config=config) as crawler:\n        result = await crawler.arun(\n            url=\"https://news-site.com/article\",\n            extraction_strategy=strategy\n        )\n        \n        print(result.extracted_content)\n\nasyncio.run(main())\n```\n\n资料来源：[crawl4ai/async_configs.py:90-130]()\n\n## Error Handling\n\nThe crawler returns comprehensive error information through the `CrawlResult` object:\n\n```python\nresult = await crawler.arun(url=\"https://invalid-url.xyz\")\n\nif not result.success:\n    print(f\"Error: {result.error}\")\n    print(f\"Status Code: {result.status_code}\")\n```\n\n| Error Scenario | `success` Value | `error` Field |\n|----------------|-----------------|---------------|\n| 
Network timeout | `False` | Connection timeout message |\n| Invalid URL | `False` | URL validation error |\n| JavaScript error | `False` | Browser console error |\n| HTTP 404/500 | `False` | HTTP status message |\n| Success | `True` | `None` |\n\n资料来源：[crawl4ai/types.py:80-120]()\n\n## Performance Considerations\n\n### Async Benefits\n\nThe async architecture provides several performance advantages:\n\n1. **Concurrent Requests**: Multiple URLs can be crawled simultaneously\n2. **Non-blocking I/O**: Browser operations don't block other tasks\n3. **Resource Efficiency**: Single event loop manages all crawling tasks\n\n### Best Practices\n\n| Practice | Benefit |\n|----------|---------|\n| Use `arun_many()` for batch operations | Reduces connection overhead |\n| Enable caching for repeated URLs | Avoids redundant network requests |\n| Set appropriate `word_count_threshold` | Reduces unnecessary processing |\n| Use `headless=True` in production | Reduces memory usage |\n\n资料来源：[crawl4ai/async_webcrawler.py:250-300]()\n\n## Related Components\n\n| Component | File | Relationship |\n|-----------|------|--------------|\n| `ExtractionStrategy` | extraction_strategy.py | Defines how content is extracted |\n| `MediaItem` | types.py | Represents extracted media |\n| `Link` | types.py | Represents extracted links |\n| `CacheBackend` | cache_backend.py | Abstract cache implementation |\n\n资料来源：[crawl4ai/types.py:1-30]()\n\n## Summary\n\nThe Async Web Crawler is the foundational building block of crawl4ai, providing:\n\n- **Asynchronous operation** for high-performance concurrent crawling\n- **Flexible configuration** via `BrowserConfig` dataclass\n- **Comprehensive result types** through `CrawlResult` model\n- **Multi-layered caching** with configurable modes\n- **Extensible extraction** via pluggable strategies\n- **Production-ready error handling** with detailed error reporting\n\nThis architecture enables developers to build scalable web scraping solutions while maintaining 
clean, readable code patterns familiar to Python async developers.\n\n---\n\n<a id='markdown_generation'></a>\n\n## Markdown Generation\n\n### 相关页面\n\n相关主题：[Extraction Strategies](#extraction_strategies), [Async Web Crawler](#async_crawler)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/markdown_generation_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/markdown_generation_strategy.py)\n- [crawl4ai/content_filter_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/content_filter_strategy.py)\n- [crawl4ai/html2text](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/html2text)\n- [docs/md_v2/core/markdown-generation.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/core/markdown-generation.md)\n- [crawl4ai/extraction_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n</details>\n\n# Markdown Generation\n\nMarkdown Generation is a core feature in crawl4ai that transforms raw HTML content into clean, readable Markdown format. This system provides flexible strategies for content extraction, filtering, and conversion with extensive customization options.\n\n## Overview\n\nThe Markdown Generation system converts web page HTML into structured Markdown text suitable for LLM consumption, RAG systems, or documentation purposes. 
It offers multiple generation strategies, content filtering capabilities, and fine-grained control over extraction behavior.\n\n资料来源：[docs/md_v2/core/markdown-generation.md:1-15]()\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[HTML Input] --> B[Content Filter Strategy]\n    B --> C[HTML Processing]\n    C --> D[Markdown Generation Strategy]\n    D --> E[Markdown Output]\n    \n    F[Configuration] --> B\n    F --> D\n    \n    G[BestProvider] --> B\n    G --> D\n```\n\n## Core Components\n\n### MarkdownGenerationStrategy\n\nThe primary abstraction for generating Markdown from HTML content.\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `provider` | `str` | `\"best\"` | Content extraction provider |\n| `configs` | `dict` | `{}` | Provider-specific configurations |\n| `strict` | `bool` | `False` | Raise errors on failure |\n| `override_system_prompt` | `str` | `None` | Custom system prompt |\n| `override_user_prompt` | `str` | `None` | Custom user prompt |\n\n资料来源：[crawl4ai/markdown_generation_strategy.py:1-50]()\n\n### ContentFilterStrategy\n\nAbstract base class for filtering and selecting content before Markdown conversion.\n\n| Filter | Description |\n|--------|-------------|\n| `PruningContentFilter` | Removes low-value content nodes |\n| `BM25ContentFilter` | Uses BM25 ranking for content selection |\n| `OrgAnnContentFilter` | Organic annotation-based filtering |\n\n资料来源：[crawl4ai/content_filter_strategy.py:1-100]()\n\n## Generation Providers\n\n### Available Providers\n\n| Provider | Description |\n|----------|-------------|\n| `best` | Automatically selects optimal provider |\n| `playwright` | Uses Playwright for JavaScript rendering |\n| `curl` | Lightweight extraction via curl |\n| `trafilatura` | Trafilatura library extraction |\n| `lxml` | LXML-based HTML parsing |\n| `readability` | Mozilla Readability algorithm |\n\n资料来源：[crawl4ai/markdown_generation_strategy.py:50-150]()\n\n### Best Provider 
Selection\n\nThe `BestProvider` class intelligently selects the most appropriate extraction method based on content characteristics.\n\n```python\nclass BestProvider:\n    def get_strategy(self, html: str) -> MarkdownGenerationStrategy:\n        # Analyzes HTML and selects optimal provider\n        pass\n```\n\n资料来源：[crawl4ai/markdown_generation_strategy.py:150-200]()\n\n## Workflow\n\n```mermaid\ngraph LR\n    A[Fetch HTML] --> B{Content Filter Enabled?}\n    B -->|Yes| C[Apply Filter Strategy]\n    B -->|No| D[Skip Filtering]\n    C --> E[Generate Markdown]\n    D --> E\n    E --> F{Post-Processing?}\n    F -->|Yes| G[Apply Custom Rules]\n    F -->|No| H[Return Result]\n    G --> H\n```\n\n## Configuration Options\n\n### Generator Config\n\n```python\n{\n    \"word_threshold\": 50,          # Minimum words per chunk\n    \"language\": \"en\",              # Content language\n    \"skip_internal_links\": True,   # Ignore internal links\n    \"content_type\": \"markdown\"     # Output format\n}\n```\n\n### BM25 Filter Config\n\n```python\n{\n    \"query\": \"relevant keywords\",\n    \"top_n\": 5,                    # Number of chunks\n    \"use_stem\": True               # Apply stemming\n}\n```\n\n资料来源：[crawl4ai/content_filter_strategy.py:100-180]()\n\n## HTML to Markdown Conversion\n\n### html2text Module\n\nThe `html2text` submodule handles low-level HTML to Markdown conversion.\n\n| Method | Purpose |\n|--------|---------|\n| `handle_anchor` | Convert `<a>` tags to `[text](url)` |\n| `handle_image` | Convert `<img>` to `![alt](src)` |\n| `handle_heading` | Convert `<h1>`-`<h6>` to `#` - `######` |\n| `handle_table` | Convert `<table>` to Markdown tables |\n| `handle_code` | Preserve `<code>` and `<pre>` formatting |\n\n资料来源：[crawl4ai/html2text:core.py:1-100]()\n\n### Conversion Features\n\n- **Link Preservation**: External links converted to Markdown format with titles\n- **Image Extraction**: Images extracted with alt text and sources\n- **Table Conversion**: 
HTML tables converted to GFM tables\n- **Code Block Handling**: Syntax-aware code block extraction\n- **List Recognition**: Ordered and unordered lists properly formatted\n\n## Usage Examples\n\n### Basic Usage\n\n```python\nfrom crawl4ai import AsyncWebCrawler\n\nasync with AsyncWebCrawler() as crawler:\n    result = await crawler.arun(\n        url=\"https://example.com\",\n        markdown_generator={\n            \"provider\": \"best\",\n            \"configs\": {\"word_threshold\": 100}\n        }\n    )\n    print(result.markdown)\n```\n\n### With Content Filter\n\n```python\nfrom crawl4ai import AsyncWebCrawler\nfrom crawl4ai.content_filter_strategy import BM25ContentFilter\n\nfilter_strategy = BM25ContentFilter(\n    query=\"getting started installation\",\n    top_n=10\n)\n\nasync with AsyncWebCrawler() as crawler:\n    result = await crawler.arun(\n        url=\"https://docs.example.com\",\n        markdown_generator={\n            \"provider\": \"playwright\",\n            \"filter_strategy\": filter_strategy\n        }\n    )\n```\n\n## Advanced Configuration\n\n### Custom Prompts\n\nOverride system and user prompts for specialized extraction:\n\n```python\nmarkdown_generator = {\n    \"override_system_prompt\": \"Extract only technical documentation...\",\n    \"override_user_prompt\": \"Focus on API endpoints and code examples...\"\n}\n```\n\n### Strict Mode\n\nEnable strict mode to raise exceptions on extraction failures:\n\n```python\nmarkdown_generator = {\n    \"strict\": True,\n    \"provider\": \"playwright\"\n}\n```\n\n## Performance Considerations\n\n| Aspect | Recommendation |\n|--------|----------------|\n| Large Pages | Use `BM25ContentFilter` to reduce content |\n| JavaScript-heavy Sites | Use `playwright` provider |\n| Simple Pages | Use `lxml` or `trafilatura` for speed |\n| Batch Processing | Set appropriate `word_threshold` |\n\n## Error Handling\n\nThe system provides graceful degradation:\n\n1. 
**Provider Fallback**: Falls back to alternative provider on failure\n2. **Strict Mode**: Raises exceptions when enabled\n3. **Partial Results**: Returns available content on partial failures\n\n资料来源：[crawl4ai/extraction_strategy.py:50-120]()\n\n## Related Components\n\n- **Chunking**: Content can be further processed with text chunking strategies\n- **Extraction**: Works alongside extraction strategies for structured data\n- **Cache**: Generated Markdown can be cached for repeated access\n\n---\n\n<a id='extraction_strategies'></a>\n\n## Extraction Strategies\n\n### 相关页面\n\n相关主题：[Markdown Generation](#markdown_generation), [Deep Crawling Strategies](#deep_crawling)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/extraction_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n- [crawl4ai/chunking_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/chunking_strategy.py)\n- [crawl4ai/content_scraping_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/content_scraping_strategy.py)\n- [crawl4ai/table_extraction.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/table_extraction.py)\n- [docs/md_v2/extraction/llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/llm-strategies.md)\n- [docs/md_v2/extraction/no-llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/no-llm-strategies.md)\n</details>\n\n# Extraction Strategies\n\nExtraction Strategies in crawl4ai define how content is parsed, structured, and extracted from crawled web pages. They form the core abstraction layer that determines whether unstructured HTML becomes meaningful, machine-readable data.\n\n## Overview\n\nExtraction Strategies handle the transformation pipeline from raw HTML to structured output. 
The system supports two primary categories:\n\n| Category | Use Case | Performance |\n|----------|----------|-------------|\n| LLM-based | Complex, semantic extraction | Slower, higher accuracy |\n| No-LLM | Fast, pattern-based extraction | Faster, rule-dependent |\n\n资料来源：[crawl4ai/extraction_strategy.py:1-50](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[HTML Content] --> B[Content Scraping Strategy]\n    B --> C[Chunking Strategy]\n    C --> D{Extraction Strategy}\n    D -->|LLM-based| E[LLM Strategy]\n    D -->|No-LLM| F[No-LLM Strategy]\n    E --> G[Structured JSON/Markdown]\n    F --> G\n    G --> H[Table Extraction Optional]\n    H --> I[Final Output]\n```\n\nThe extraction pipeline flows through scraping → chunking → extraction, with table extraction as an optional final step.\n\n资料来源：[crawl4ai/extraction_strategy.py:50-100](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n## LLM-Based Strategies\n\nLLM strategies leverage large language models for semantic understanding and intelligent content extraction.\n\n### Supported Providers\n\n| Provider | Model Support | Configuration |\n|----------|--------------|---------------|\n| OpenAI | GPT-4, GPT-3.5 | `OPENAI_API_KEY` |\n| Anthropic | Claude 3, Claude 2 | `ANTHROPIC_API_KEY` |\n| Azure OpenAI | Custom deployments | `AZURE_API_KEY`, `AZURE_API_BASE` |\n| Ollama | Local models | `OLLAMA_BASE_URL` |\n\n### Configuration Parameters\n\n```python\nclass LLMExtractionStrategy:\n    def __init__(\n        self,\n        provider: str = \"openai\",\n        model: str = \"gpt-4\",\n        api_token: Optional[str] = None,\n        system_prompt: Optional[str] = None,\n        user_prompt: Optional[str] = None,\n        extraction_type: str = \"block\",\n        input_format: str = \"html\",\n        instruction: Optional[str] = None\n    
)\n```\n\n资料来源：[docs/md_v2/extraction/llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/llm-strategies.md)\n\n### Extraction Types\n\n| Type | Description | Best For |\n|------|-------------|----------|\n| `block` | Block-level extraction | Paragraphs, sections |\n| `schema` | Schema-based extraction | Structured data, forms |\n| `custom` | Custom instructions | Specific extraction needs |\n\n资料来源：[crawl4ai/extraction_strategy.py:100-150](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n## No-LLM Strategies\n\nNo-LLM strategies provide fast, deterministic extraction without external API dependencies.\n\n### Available Strategies\n\n| Strategy | Purpose |\n|----------|---------|\n| `NoExtractionStrategy` | Pass-through, no extraction |\n| `JsonCssExtractionStrategy` | CSS selector-based JSON extraction |\n| `RegexExtractionStrategy` | Regex pattern matching |\n| `XPathExtractionStrategy` | XPath-based extraction |\n\n资料来源：[docs/md_v2/extraction/no-llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/no-llm-strategies.md)\n\n### JsonCssExtractionStrategy\n\n```python\nfrom crawl4ai import JsonCssExtractionStrategy\n\nstrategy = JsonCssExtractionStrategy(\n    schema={\n        \"name\": \"ProductList\",\n        \"baseSelector\": \"div.product\",\n        \"fields\": [\n            {\"name\": \"title\", \"selector\": \"h2.title\", \"type\": \"text\"},\n            {\"name\": \"price\", \"selector\": \"span.price\", \"type\": \"text\"},\n            {\"name\": \"image\", \"selector\": \"img\", \"attribute\": \"src\"}\n        ]\n    }\n)\n```\n\n资料来源：[crawl4ai/extraction_strategy.py:150-200](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n## Chunking Strategies\n\nChunking strategies split content into manageable pieces before extraction.\n\n### Default Chunking Behavior\n\n```mermaid\ngraph LR\n    A[Large Content] --> 
B[Character Split]\n    B --> C[Overlap Application]\n    C --> D[Token Count Check]\n    D -->|Under limit| E[Chunk Ready]\n    D -->|Over limit| F[Recursive Split]\n    F --> E\n```\n\n### Configuration Options\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `chunk_token_size` | int | 1000 | Target tokens per chunk |\n| `overlap` | int | 100 | Overlapping tokens between chunks |\n| `max_chunk_size` | int | 3000 | Hard maximum chunk size |\n| `splitting_regex` | str | `\\n\\n+` | Regex for splitting points |\n\n资料来源：[crawl4ai/chunking_strategy.py:1-80](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/chunking_strategy.py)\n\n## Content Scraping Strategy\n\nThe content scraping strategy determines initial content extraction from HTML.\n\n```mermaid\ngraph TD\n    A[Raw HTML] --> B{Scraping Strategy}\n    B -->|BeautifulSoup| C[Parse DOM]\n    B -->|Playwright| D[Dynamic Render]\n    B -->|Raw| E[Minimal Processing]\n    C --> F[Content Cleaned]\n    D --> F\n    E --> F\n```\n\n### Strategy Selection\n\n| Strategy | JavaScript | Speed | Use Case |\n|----------|------------|-------|----------|\n| `BeautifulSoup` | No | Fast | Static pages |\n| `Playwright` | Yes | Medium | SPAs, dynamic content |\n| `RawContent` | No | Fastest | Pre-processed HTML |\n\n资料来源：[crawl4ai/content_scraping_strategy.py:1-60](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/content_scraping_strategy.py)\n\n## Table Extraction\n\nTable extraction handles tabular data structures within web pages.\n\n```python\nclass TableExtractionStrategy:\n    def __init__(\n        self,\n        table_styles: Optional[List[str]] = None,\n        ignore_tables: Optional[List[str]] = None,\n        merge_multiple_headers: bool = False\n    )\n```\n\n### Extraction Configuration\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `table_styles` | List[str] | CSS classes to include as tables |\n| `ignore_tables` | 
List[str] | CSS classes to exclude |\n| `merge_multiple_headers` | bool | Merge multi-row headers |\n| `extract_header` | bool | Include header row (default: True) |\n\n资料来源：[crawl4ai/table_extraction.py:1-100](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/table_extraction.py)\n\n## Complete Pipeline Example\n\n```python\nfrom crawl4ai import (\n    AsyncWebCrawler,\n    LLMExtractionStrategy,\n    JsonCssExtractionStrategy,\n    RegexExtractionStrategy,\n    TableExtractionStrategy\n)\n\nasync with AsyncWebCrawler() as crawler:\n    # LLM-based extraction\n    llm_result = await crawler.arun(\n        url=\"https://example.com/article\",\n        extraction_strategy=LLMExtractionStrategy(\n            provider=\"openai\",\n            model=\"gpt-4\",\n            instruction=\"Extract article title, author, and key points\"\n        )\n    )\n    \n    # CSS-based extraction\n    css_result = await crawler.arun(\n        url=\"https://example.com/products\",\n        extraction_strategy=JsonCssExtractionStrategy(schema=product_schema)\n    )\n```\n\n资料来源：[crawl4ai/extraction_strategy.py:200-250](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n## Strategy Selection Guide\n\n```mermaid\ngraph TD\n    A[Start] --> B{Need semantic understanding?}\n    B -->|Yes| C{External API acceptable?}\n    B -->|No| D[No-LLM Strategy]\n    C -->|Yes| E{Local deployment needed?}\n    C -->|No| F[Ollama/Local LLM]\n    E -->|Yes| F\n    E -->|No| G[OpenAI/Anthropic]\n    D --> H{Data has tabular structure?}\n    H -->|Yes| I[Add TableExtractionStrategy]\n    H -->|No| J[Complete]\n    G --> J\n    F --> J\n    I --> J\n```\n\n### Decision Matrix\n\n| Requirement | Recommended Strategy |\n|-------------|---------------------|\n| Simple CSS extraction | `JsonCssExtractionStrategy` |\n| Complex semantic parsing | `LLMExtractionStrategy` |\n| High-volume, low-latency | No-LLM strategies |\n| Schema-agnostic | LLM-based strategies |\n| 
Tabular data focus | `TableExtractionStrategy` |\n\n资料来源：[docs/md_v2/extraction/llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/llm-strategies.md), [docs/md_v2/extraction/no-llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/no-llm-strategies.md)\n\n## Environment Variables\n\n| Variable | Required For | Description |\n|----------|--------------|-------------|\n| `OPENAI_API_KEY` | OpenAI LLM | API key for GPT models |\n| `ANTHROPIC_API_KEY` | Anthropic LLM | API key for Claude models |\n| `AZURE_API_KEY` | Azure OpenAI | Azure OpenAI API key |\n| `OLLAMA_BASE_URL` | Local LLM | Base URL for Ollama server |\n\n资料来源：[crawl4ai/extraction_strategy.py:250-300](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n---\n\n<a id='deep_crawling'></a>\n\n## Deep Crawling Strategies\n\n### 相关页面\n\n相关主题：[Extraction Strategies](#extraction_strategies), [Async Web Crawler](#async_crawler)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/deep_crawling/bfs_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/bfs_strategy.py)\n- [crawl4ai/deep_crawling/dfs_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/dfs_strategy.py)\n- [crawl4ai/deep_crawling/bff_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/bff_strategy.py)\n- [crawl4ai/deep_crawling/base_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/base_strategy.py)\n- [crawl4ai/deep_crawling/filters.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/filters.py)\n- [crawl4ai/deep_crawling/scorers.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/scorers.py)\n- [docs/md_v2/core/deep-crawling.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/core/deep-crawling.md)\n</details>\n\n# Deep Crawling 
Strategies\n\n## Overview\n\nDeep Crawling Strategies in crawl4ai provide systematic approaches to traverse and extract content from websites beyond a single page. These strategies enable controlled, scalable web crawling by managing URL discovery, prioritization, filtering, and scoring mechanisms. The deep crawling module supports multiple traversal algorithms (BFS, DFS, BFF) with extensible filtering and scoring systems.\n\n资料来源：[base_strategy.py:1-50](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/base_strategy.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Seed URLs] --> B[DeepCrawlingStrategy]\n    B --> C[URL Filters]\n    B --> D[URL Scorers]\n    B --> E[Traversal Algorithm]\n    C --> F[Valid URLs]\n    D --> G[Prioritized URLs]\n    E --> H[Crawl Queue]\n    G --> H\n    H --> I[Crawl4AI Extractor]\n    I --> J[Extracted Content]\n    J --> K[Links Extracted]\n    K --> B\n```\n\nThe architecture follows a producer-consumer pattern where the strategy continuously discovers URLs from crawled pages and feeds them back into the crawl queue based on prioritization rules.\n\n资料来源：[base_strategy.py:50-100](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/base_strategy.py)\n\n## Core Components\n\n### Base Strategy\n\nAll crawling strategies inherit from `DeepCrawlingStrategy`, which provides the foundational interface and shared functionality:\n\n| Property/Method | Type | Description |\n|-----------------|------|-------------|\n| `url_scorer` | `URLScorer` | Scores URLs for prioritization |\n| `url_filter` | `URLFilter` | Filters URLs for validity |\n| `keywords` | `Set[str]` | Keywords for relevance matching |\n| `keywords_or` | `Set[str]` | Alternative keyword matching |\n| `max_depth` | `int` | Maximum crawl depth |\n| `included_domains` | `Set[str]` | Allowed domains |\n| `excluded_domains` | `Set[str]` | Blocked domains |\n| `crawl_enabled` | `bool` | Enable/disable crawling |\n| `check_keywords` | 
`Callable` | Custom keyword validation |\n\n资料来源：[base_strategy.py:100-150](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/base_strategy.py)\n\n## Traversal Algorithms\n\n### BFS (Breadth-First Search) Strategy\n\nThe BFS strategy explores pages level by level, ensuring comprehensive coverage before going deeper:\n\n```mermaid\ngraph LR\n    A[Level 0: Seed] --> B[Level 1: depth=1]\n    B --> C[Level 2: depth=2]\n    C --> D[Level 3: depth=3]\n    style A fill:#90EE90\n    style B fill:#87CEEB\n    style C fill:#DDA0DD\n    style D fill:#F0E68C\n```\n\n**Characteristics:**\n- Systematic exploration of shallow depths first\n- Ideal for site maps and directory-style sites\n- Higher memory usage due to large frontier sets\n- Better for finding all accessible pages at shallower depths\n\n资料来源：[bfs_strategy.py:1-80](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/bfs_strategy.py)\n\n**Configuration Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `max_depth` | `int` | `3` | Maximum crawl depth |\n| `max_pages` | `int` | `50` | Maximum pages to crawl |\n| `priority` | `int` | `0` | Base priority score |\n| `include_external` | `bool` | `False` | Allow external domain crawling |\n\n### DFS (Depth-First Search) Strategy\n\nThe DFS strategy explores as deep as possible before backtracking:\n\n```mermaid\ngraph TD\n    A[Start] --> B[Depth 1]\n    B --> C[Depth 2]\n    C --> D[Depth 3]\n    D --> E[Backtrack]\n    E --> F[Next Branch]\n    F --> G[Continue Deep]\n    style A fill:#90EE90\n    style D fill:#FF6B6B\n```\n\n**Characteristics:**\n- Deep exploration of specific paths first\n- Lower memory footprint\n- Suitable for following navigation chains\n- Risk of getting stuck in deep site sections\n\n资料来源：[dfs_strategy.py:1-80](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/dfs_strategy.py)\n\n**Configuration Parameters:**\n\n| Parameter | 
Type | Default | Description |\n|-----------|------|---------|-------------|\n| `max_depth` | `int` | `10` | Maximum crawl depth |\n| `max_pages` | `int` | `100` | Maximum pages to crawl |\n| `priority` | `int` | `0` | Base priority score |\n\n### BFF (Best-First with Filters) Strategy\n\nThe BFF strategy combines filtering with score-based prioritization, crawling the most relevant pages first:\n\n```mermaid\ngraph TD\n    A[URL Discovered] --> B{URL Filter}\n    B -->|Pass| C{Score URL}\n    B -->|Fail| X[Skip]\n    C --> D[Priority Queue]\n    D --> E[Crawl Next Best]\n    E --> F[Extract Links]\n    F --> A\n```\n\n**Characteristics:**\n- Relevance-based crawling using keyword matching\n- Configurable scoring functions\n- Filters out irrelevant content early\n- Most efficient for targeted data extraction\n\n资料来源：[bff_strategy.py:1-100](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/bff_strategy.py)\n\n## Filtering System\n\nThe filtering system determines which URLs are eligible for crawling:\n\n### FilterChain\n\nMultiple filters can be chained together for comprehensive URL validation:\n\n```python\nfrom crawl4ai.deep_crawling.filters import FilterChain, SameDomainFilter, ExtensionFilter\n\nfilter_chain = FilterChain([\n    SameDomainFilter(allowed_domains=[\"example.com\"]),\n    ExtensionFilter(excluded_extensions=[\".pdf\", \".zip\"]),\n])\n```\n\n### Available Filters\n\n| Filter | Purpose | Key Parameters |\n|--------|---------|----------------|\n| `SameDomainFilter` | Restrict to same domain | `allowed_domains`, `strict` |\n| `ExtensionFilter` | Block by file extension | `excluded_extensions` |\n| ` robots.txt Filter` | Respect robots directives | `user_agent` |\n| `RegexFilter` | Custom pattern matching | `patterns`, `exclude` |\n\n资料来源：[filters.py:1-120](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/filters.py)\n\n## Scoring System\n\nURL scoring determines crawl priority within the queue:\n\n### 
ScorerChain\n\nScorers can be combined in a chain for multi-factor evaluation:\n\n```python\nfrom crawl4ai.deep_crawling.scorers import ScorerChain, KeywordRelevanceScorer, DepthScorer\n\nscorer = ScorerChain([\n    KeywordRelevanceScorer(keywords=[\"api\", \"docs\"]),\n    DepthScorer(max_depth=5, decay=0.5),\n])\n```\n\n### Available Scorers\n\n| Scorer | Function | Parameters |\n|--------|----------|------------|\n| `KeywordRelevanceScorer` | Match keywords in URL/text | `keywords`, `weight` |\n| `DepthScorer` | Penalize deep pages | `max_depth`, `decay` |\n| `FreshnessScorer` | Prefer recent content | `date_field`, `decay` |\n| `CustomScorer` | User-defined scoring | `scoring_fn` |\n\n资料来源：[scorers.py:1-150](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/scorers.py)\n\n## Usage Examples\n\n### Basic BFS Crawling\n\n```python\nfrom crawl4ai import AsyncWebCrawler\nfrom crawl4ai.deep_crawling import BFSDistanceStrategy\n\nasync with AsyncWebCrawler() as crawler:\n    result = await crawler.arun(\n        url=\"https://example.com\",\n        strategy=BFSDistanceStrategy(\n            max_depth=3,\n            max_pages=50\n        )\n    )\n```\n\n### Targeted Crawling with BFF\n\n```python\nfrom crawl4ai.deep_crawling import BFFStrategy\nfrom crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer\nfrom crawl4ai.deep_crawling.filters import SameDomainFilter\n\nstrategy = BFFStrategy(\n    max_depth=5,\n    max_pages=100,\n    keywords={\"documentation\", \"api\", \"guide\"},\n    url_filter=SameDomainFilter(allowed_domains=[\"example.com\"]),\n    url_scorer=KeywordRelevanceScorer(keywords={\"documentation\"}),\n)\n```\n\n资料来源：[docs/md_v2/core/deep-crawling.md:1-100](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/core/deep-crawling.md)\n\n## Workflow States\n\n```mermaid\nstateDiagram-v2\n    [*] --> Initializing: Start crawl\n    Initializing --> Crawling: Load seed URLs\n    Crawling --> Processing: Fetch page\n    
Processing --> Filtering: Extract links\n    Filtering --> Scoring: Validate URLs\n    Scoring --> Queuing: Rank by priority\n    Queuing --> Crawling: Next URL\n    Crawling --> [*]: Queue empty or max reached\n```\n\n## Strategy Selection Guide\n\n| Use Case | Recommended Strategy | Reason |\n|----------|---------------------|--------|\n| Site mapping | BFS | Comprehensive shallow coverage |\n| Documentation sites | BFS | Find all pages systematically |\n| Article/navigation chains | DFS | Follow deep links naturally |\n| Targeted data extraction | BFF | Prioritize relevant pages |\n| API documentation | BFF | Filter by keyword relevance |\n| Limited resources | DFS | Lower memory footprint |\n\n## Configuration Reference\n\n### Common Parameters\n\n```python\n@dataclass\nclass DeepCrawlConfig:\n    # Scope\n    included_domains: Set[str] = None\n    excluded_domains: Set[str] = None\n    \n    # Limits\n    max_depth: int = 3\n    max_pages: int = 100\n    max_total_pages: int = 1000\n    \n    # Filtering\n    allow_external: bool = False\n    check_keywords: bool = True\n    \n    # Scoring\n    scoring: ScorerChain = None\n    filter: FilterChain = None\n```\n\n### Keyword Matching\n\n```python\n# AND matching (all keywords must match)\nkeywords = {\"documentation\", \"api\"}\nkeywords_or = False\n\n# OR matching (any keyword matches)\nkeywords = {\"guide\", \"tutorial\", \"docs\"}\nkeywords_or = True\n```\n\n资料来源：[base_strategy.py:150-200](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/base_strategy.py)\n\n## Advanced Features\n\n### Custom Filters\n\n```python\nfrom crawl4ai.deep_crawling.filters import URLFilter\n\nclass CustomFilter(URLFilter):\n    def should_crawl(self, url: str) -> bool:\n        # Custom logic\n        return \"product\" in url and not url.endswith(\".jpg\")\n```\n\n### Custom Scorers\n\n```python\nfrom crawl4ai.deep_crawling.scorers import URLScorer\n\nclass PriorityScorer(URLScorer):\n    def score(self, url: 
str, context: dict) -> float:\n        base_score = 1.0\n        if \"important\" in url:\n            base_score *= 2.0\n        return base_score\n```\n\n资料来源：[filters.py:150-200](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/filters.py), [scorers.py:150-200](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/scorers.py)\n\n## Best Practices\n\n1. **Set reasonable limits**: Always configure `max_pages` and `max_depth` to prevent runaway crawling\n2. **Use BFF for targeted extraction**: When you know what content you need, BFF reduces noise\n3. **Filter early, score late**: Apply filters before scoring to reduce unnecessary processing\n4. **Respect robots.txt**: Configure filters to respect site crawling directives\n5. **Monitor memory usage**: BFS uses more memory; switch to DFS for resource-constrained environments\n6. **Combine keyword strategies**: Use both `keywords` (AND) and `keywords_or` (OR) for flexible matching\n\n## See Also\n\n- [AsyncWebCrawler API Reference](../api/async-web-crawler.md)\n- [Content Extraction](../guides/content-extraction.md)\n- [Browser Configuration](../guides/browser-config.md)\n\n---\n\n<a id='anti_bot_detection'></a>\n\n## Anti-Bot Detection and Proxy Management\n\n### 相关页面\n\n相关主题：[Browser Management](#browser_management)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/antibot_detector.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/antibot_detector.py)\n- [crawl4ai/proxy_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/proxy_strategy.py)\n- [crawl4ai/async_configs.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_configs.py)\n- [docs/md_v2/advanced/anti-bot-and-fallback.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/advanced/anti-bot-and-fallback.md)\n- [docs/md_v2/advanced/proxy-security.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/advanced/proxy-security.md)\n- 
[sbom/README.md](https://github.com/unclecode/crawl4ai/blob/main/sbom/README.md)\n</details>\n\n# Anti-Bot Detection and Proxy Management\n\n## Overview\n\nCrawl4AI provides sophisticated capabilities for evading anti-bot detection and managing proxies to ensure reliable web crawling operations. These features work together to detect and circumvent bot protection mechanisms while maintaining request anonymity through proxy rotation.\n\n## Architecture Overview\n\n```mermaid\ngraph TD\n    A[Client Request] --> B[AntiBotDetector]\n    B --> C{Bot Detection?}\n    C -->|Yes| D[Apply Evasion Strategy]\n    C -->|No| E[Direct Request]\n    D --> F[ProxySelector]\n    F --> G[Rotating Proxies]\n    G --> H[Target Website]\n    H --> I{Response Valid?}\n    I -->|No| J[Fallback Mechanism]\n    J --> B\n    I -->|Yes| K[Return Content]\n```\n\n## Anti-Bot Detection System\n\n### Purpose and Scope\n\nThe anti-bot detection module (`antibot_detector.py`) analyzes responses from target websites to determine whether bot protection mechanisms have been triggered. 
When detection occurs, the system can automatically apply evasion strategies or fall back to alternative methods.\n\n资料来源：[crawl4ai/antibot_detector.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/antibot_detector.py)\n\n### Detection Strategies\n\n| Strategy | Description | Use Case |\n|----------|-------------|----------|\n| Header Analysis | Examines HTTP headers for bot detection signals | Standard bot checks |\n| Content Analysis | Scans response content for CAPTCHAs or blocking messages | Challenge pages |\n| Status Code Monitoring | Tracks HTTP status codes indicating blocks | 403, 429 responses |\n| JavaScript Challenge Detection | Identifies JS-based bot challenges | Cloudflare, PerimeterX |\n\n### Key Components\n\nThe anti-bot detector integrates with the async configuration system to provide seamless fallback handling:\n\n```python\n# Pseudocode representation based on async_configs.py integration\nclass AntiBotConfig:\n    enabled: bool = True\n    detection_threshold: float = 0.7\n    auto_fallback: bool = True\n    max_retries: int = 3\n```\n\n资料来源：[crawl4ai/async_configs.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_configs.py)\n\n## Proxy Management System\n\n### Purpose and Scope\n\nThe proxy strategy module (`proxy_strategy.py`) manages proxy rotation, selection, and health checking to maintain request anonymity and distribute load across multiple IP addresses.\n\n资料来源：[crawl4ai/proxy_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/proxy_strategy.py)\n\n### Proxy Rotation Strategies\n\n| Strategy | Description | Best For |\n|----------|-------------|----------|\n| Round Robin | Sequential proxy selection | Even distribution |\n| Random | Random proxy selection | Avoiding pattern detection |\n| Weighted | Prioritize faster/reliable proxies | Performance optimization |\n| Geographic | Match proxy location to target | Region-specific content |\n\n### Configuration Parameters\n\n```python\nclass 
ProxyConfig:\n    proxies: List[str] = []          # List of proxy URLs\n    rotation_strategy: str = \"round_robin\"\n    health_check_interval: int = 300 # seconds\n    timeout: int = 30                # proxy timeout in seconds\n    retry_on_failure: bool = True\n```\n\n### Proxy Health Monitoring\n\nThe system continuously monitors proxy health through periodic health checks, removing failed proxies from the active pool and re-evaluating them after a cooldown period.\n\n## Integration with Async Configuration\n\n### Fallback Mechanisms\n\nWhen anti-bot detection triggers, the system can automatically switch to fallback modes:\n\n1. **Proxy Fallback**: Rotate to a different proxy server\n2. **Strategy Fallback**: Switch to alternative crawling strategies\n3. **User-Agent Fallback**: Use different browser fingerprints\n\n资料来源：[docs/md_v2/advanced/anti-bot-and-fallback.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/advanced/anti-bot-and-fallback.md)\n\n### Configuration Example\n\n```yaml\nanti_bot:\n  enabled: true\n  detection_sensitivity: \"medium\"\n  auto_fallback: true\n  \nproxy:\n  enabled: true\n  strategy: \"weighted\"\n  proxies:\n    - \"http://proxy1.example.com:8080\"\n    - \"http://proxy2.example.com:8080\"\n  health_check:\n    enabled: true\n    interval: 300\n```\n\n## Security Considerations\n\n### Proxy Security\n\nWhen configuring proxies, consider the following security aspects:\n\n- **Proxy Protocol**: Use HTTPS proxies to encrypt traffic\n- **Authentication**: Implement proxy authentication where supported\n- **Provider Reputation**: Use trusted proxy providers\n- **IP Rotation**: Avoid predictable IP patterns\n\n资料来源：[docs/md_v2/advanced/proxy-security.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/advanced/proxy-security.md)\n\n### Best Practices\n\n| Practice | Description |\n|----------|-------------|\n| Rate Limiting | Respect target site limits to avoid IP bans |\n| Request Delays | Implement delays 
between requests |\n| Header Randomization | Vary User-Agent and other headers |\n| Cookie Management | Handle cookies appropriately per session |\n| SSL Verification | Validate SSL certificates for security |\n\n## Workflow Diagram\n\n```mermaid\nsequenceDiagram\n    participant Client\n    participant AntiBot as AntiBot Detector\n    participant ProxyMgr as Proxy Manager\n    participant Target as Target Website\n    \n    Client->>AntiBot: Send Request\n    AntiBot->>ProxyMgr: Request Proxy\n    ProxyMgr->>AntiBot: Return Proxy\n    AntiBot->>Target: Forward Request via Proxy\n    \n    alt Bot Detected\n        Target-->>AntiBot: Bot Challenge Response\n        AntiBot->>ProxyMgr: Request Different Proxy\n        ProxyMgr-->>AntiBot: New Proxy\n        AntiBot->>Target: Retry with New Proxy\n    else Success\n        Target-->>Client: Return Content\n    end\n```\n\n## Error Handling\n\n### Common Error Scenarios\n\n| Error | Cause | Resolution |\n|-------|-------|------------|\n| `403 Forbidden` | IP blocked | Rotate proxy |\n| `429 Too Many Requests` | Rate limited | Backoff and retry |\n| `CAPTCHA Required` | Bot detected | Switch strategy |\n| `Proxy Timeout` | Proxy unavailable | Health check and replace |\n\n### Retry Logic\n\nThe system implements exponential backoff for retries:\n\n```python\nretry_config = {\n    \"max_attempts\": 3,\n    \"base_delay\": 1.0,      # seconds\n    \"max_delay\": 60.0,       # seconds\n    \"exponential_base\": 2\n}\n```\n\n## Summary\n\nCrawl4ai's anti-bot detection and proxy management system provides a robust framework for evading bot detection while maintaining reliable web crawling operations. 
The integration between `AntiBotDetector` and `ProxyStrategy` enables automatic fallback mechanisms that significantly improve crawling success rates against protected websites.\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：unclecode/crawl4ai\n\n摘要：发现 21 个潜在踩坑项，其中 5 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: arun() and arun_many() type hinting needs fixing。\n\n## 1. 安装坑 · 来源证据：[Bug]: arun() and arun_many() type hinting needs fixing\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: arun() and arun_many() type hinting needs fixing\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d3b6cfd3700147f690e0e65875f15424 | https://github.com/unclecode/crawl4ai/issues/1898 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 配置坑 · 来源证据：[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked as failed), no error messages or failure reason…\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked as failed), no error messages or failure reason is shown\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ad61b108bf894cc286ca7966e8c86758 | https://github.com/unclecode/crawl4ai/issues/1949 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 配置坑 · 来源证据：[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and CLI provide\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and CLI provide\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_1ee99f5d72f143f4b064732cc19e0c85 | https://github.com/unclecode/crawl4ai/issues/1963 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 4. 
配置坑 · 来源证据：[Bug]: `remove_empty_elements_fast()` drops trailing text when removing empty elements with non-empty .tail\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: `remove_empty_elements_fast()` drops trailing text when removing empty elements with non-empty .tail\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d7fa967632a948008efbc182d1f2c96b | https://github.com/unclecode/crawl4ai/issues/1938 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 安全/权限坑 · 来源证据：[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-3x token overhead for CJK content\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-3x token overhead for CJK content\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_2e9fbf659fbb40aba437886a87f8e2d7 | https://github.com/unclecode/crawl4ai/issues/1962 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：[Bug] AsyncLogger writes to stdout, breaking MCP stdio transport\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug] AsyncLogger writes to stdout, breaking MCP stdio transport\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_af29278fd7294d4a8f0f6f37ab987b5c | https://github.com/unclecode/crawl4ai/issues/1968 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 7. 安装坑 · 来源证据：[Bug]: The install with pip on just about any system rarely works. It requires an env or it only partial installs\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: The install with pip on just about any system rarely works. 
It requires an env or it only partial installs\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_97d44cedb21a4908a7743fde11209954 | https://github.com/unclecode/crawl4ai/issues/1950 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 安装坑 · 来源证据：[Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports symbols that don't exist in playwright-stealth 2.x\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports symbols that don't exist in playwright-stealth 2.x\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ae45861377894b99a57d6bbdc06af313 | https://github.com/unclecode/crawl4ai/issues/1959 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 9. 安装坑 · 来源证据：v0.7.1:Update\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.1:Update\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_a6ae9133fff54443b712725f51769fa1 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.1 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：v0.7.2: CI/CD & Dependency Optimization Update\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.2: CI/CD & Dependency Optimization Update\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_14954e0431ca426ebeaa4bb31778d4af | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.2 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 11. 
配置坑 · 来源证据：[Bug]: Markdown export loses heading hierarchy and table structure\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: Markdown export loses heading hierarchy and table structure\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_c3eac8ab81e34bf3b6cc050f7f8e9826 | https://github.com/unclecode/crawl4ai/issues/1964 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 12. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:798201435 | https://github.com/unclecode/crawl4ai | README/documentation is current enough for a first validation pass.\n\n## 13. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:798201435 | https://github.com/unclecode/crawl4ai | last_activity_observed missing\n\n## 14. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:798201435 | https://github.com/unclecode/crawl4ai | no_demo; severity=medium\n\n## 15. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:798201435 | https://github.com/unclecode/crawl4ai | no_demo; severity=medium\n\n## 16. 
安全/权限坑 · 来源证据：Release v0.7.3\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Release v0.7.3\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e2b75670cbcc4814a86423818b9f6f48 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.3 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 17. 安全/权限坑 · 来源证据：Release v0.7.5\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Release v0.7.5\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_056d1470d7534cacb39eeb894e054496 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.5 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 18. 安全/权限坑 · 来源证据：Release v0.7.7\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Release v0.7.7\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e157445f88744795b5c6234783eca692 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.7 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 19. 安全/权限坑 · 来源证据：[Bug]: Markdown text extraction drops text when element contains empty elements\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug]: Markdown text extraction drops text when element contains empty elements\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_dffa926853d147ebb487a03fdfd1818e | https://github.com/unclecode/crawl4ai/issues/1966 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 20. 
维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:798201435 | https://github.com/unclecode/crawl4ai | issue_or_pr_quality=unknown\n\n## 21. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:798201435 | https://github.com/unclecode/crawl4ai | release_recency=unknown\n\n<!-- canonical_name: unclecode/crawl4ai; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "crawl4ai",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "github_repo:798201435",
          "kind": "repo",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/unclecode/crawl4ai"
        },
        {
          "evidence_id": "art_ad7837973e204acb98fc90e9f4b59193",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/unclecode/crawl4ai#readme"
        }
      ],
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "crawl4ai 说明书",
      "toc": [
        "https://github.com/unclecode/crawl4ai 项目说明书",
        "目录",
        "Introduction to Crawl4AI",
        "Overview",
        "Purpose and Scope",
        "Core Architecture",
        "Key Features",
        "Installation and Setup",
        "Doramagic 踩坑日志"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "1debe5f5fcc118ced10826a1040a81f9b77e9255",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "pyproject.toml",
      "Dockerfile",
      "README.md",
      "docker-compose.yml",
      "uv.lock",
      "requirements.txt",
      "docs/RELEASE_NOTES_v0.8.0.md",
      "docs/releases_review/v0_7_0_features_demo.py",
      "docs/releases_review/v0.3.74.overview.py",
      "docs/releases_review/demo_v0.7.7.py",
      "docs/releases_review/v0.7.5_docker_hooks_demo.py",
      "docs/releases_review/v0_4_3b2_features_demo.py",
      "docs/releases_review/demo_v0.8.5.py",
      "docs/releases_review/demo_v0.7.6.py",
      "docs/releases_review/demo_v0.7.0.py",
      "docs/releases_review/demo_v0.8.0.py",
      "docs/releases_review/demo_v0.7.8.py",
      "docs/releases_review/v0_4_24_walkthrough.py",
      "docs/releases_review/crawl4ai_v0_7_0_showcase.py",
      "docs/releases_review/demo_v0.7.5.py",
      "docs/migration/v0.8.0-upgrade-guide.md",
      "docs/blog/release-v0.7.7.md",
      "docs/blog/release-v0.7.8.md",
      "docs/blog/release-v0.7.5.md",
      "docs/blog/release-v0.7.1.md",
      "docs/blog/release-v0.8.0.md",
      "docs/blog/release-v0.7.3.md",
      "docs/blog/release-v0.8.5.md",
      "docs/blog/release-v0.7.4.md",
      "docs/blog/release-v0.7.0.md",
      "docs/blog/release-v0.7.6.md",
      "docs/tutorials/coming_soon.md",
      "docs/md_v2/index.md",
      "docs/md_v2/complete-sdk-reference.md",
      "docs/md_v2/CONTRIBUTING.md",
      "docs/md_v2/stats.md",
      "docs/examples/dispatcher_example.py",
      "docs/examples/link_head_extraction_example.py",
      "docs/examples/docker_webhook_example.py",
      "docs/examples/README_BUILTIN_BROWSER.md"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# crawl4ai - Doramagic AI Context Pack\n\n> 定位：安装前体验与判断资产。它帮助宿主 AI 有一个好的开始，但不代表已经安装、执行或验证目标项目。\n\n## 充分原则\n\n- **充分原则，不是压缩原则**：AI Context Pack 应该充分到让宿主 AI 在开工前理解项目价值、能力边界、使用入口、风险和证据来源；它可以分层组织，但不以最短摘要为目标。\n- **压缩策略**：只压缩噪声和重复内容，不压缩会影响判断和开工质量的上下文。\n\n## 给宿主 AI 的使用方式\n\n你正在读取 Doramagic 为 crawl4ai 编译的 AI Context Pack。请把它当作开工前上下文：帮助用户理解适合谁、能做什么、如何开始、哪些必须安装后验证、风险在哪里。不要声称你已经安装、运行或执行了目标项目。\n\n## Claim 消费规则\n\n- **事实来源**：Repo Evidence + Claim/Evidence Graph；Human Wiki 只提供显著性、术语和叙事结构。\n- **事实最低状态**：`supported`\n- `supported`：可以作为项目事实使用，但回答中必须引用 claim_id 和证据路径。\n- `weak`：只能作为低置信度线索，必须要求用户继续核实。\n- `inferred`：只能用于风险提示或待确认问题，不能包装成项目事实。\n- `unverified`：不得作为事实使用，应明确说证据不足。\n- `contradicted`：必须展示冲突来源，不得替用户强行选择一个版本。\n\n## 它最适合谁\n\n- **正在使用 Claude/Codex/Cursor/Gemini 等宿主 AI 的开发者**：README 或插件配置提到多个宿主 AI。 证据：`README.md` Claim：`clm_0002` supported 0.86\n\n## 它能做什么\n\n- **命令行启动或安装流程**（需要安装后验证）：项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n## 怎么开始\n\n- `pip install -U crawl4ai` 证据：`README.md` Claim：`clm_0003` supported 0.86\n- `pip install crawl4ai --pre` 证据：`README.md` Claim：`clm_0004` supported 0.86\n- `pip install crawl4ai` 证据：`README.md` Claim：`clm_0004` supported 0.86, `clm_0005` supported 0.86, `clm_0006` supported 0.86, `clm_0014` supported 0.86\n- `pip install crawl4ai[sync]` 证据：`README.md` Claim：`clm_0006` supported 0.86\n- `git clone https://github.com/unclecode/crawl4ai.git` 证据：`README.md` Claim：`clm_0007` supported 0.86\n- `pip install -e .                    
# Basic installation in editable mode` 证据：`README.md` Claim：`clm_0008` supported 0.86\n- `pip install -e \".[torch]\"           # With PyTorch features` 证据：`README.md` Claim：`clm_0009` supported 0.86\n- `pip install -e \".[transformer]\"     # With Transformer features` 证据：`README.md` Claim：`clm_0010` supported 0.86\n- `pip install -e \".[cosine]\"          # With cosine similarity features` 证据：`README.md` Claim：`clm_0011` supported 0.86\n- `pip install -e \".[sync]\"            # With synchronous crawling (Selenium)` 证据：`README.md` Claim：`clm_0012` supported 0.86\n\n## 继续前判断卡\n\n- **当前建议**：先做角色匹配试用\n- **为什么**：这个项目更像角色库，核心风险是选错角色或把角色文案当执行能力；先用 Prompt Preview 试角色匹配，再决定是否沙盒导入。\n\n### 30 秒判断\n\n- **现在怎么做**：先做角色匹配试用\n- **最小安全下一步**：先用 Prompt Preview 试角色匹配；满意后再隔离导入\n- **先别相信**：角色质量和任务匹配不能直接相信。\n- **继续会触碰**：角色选择偏差、命令执行、本地环境或项目文件\n\n### 现在可以相信\n\n- **适合人群线索：正在使用 Claude/Codex/Cursor/Gemini 等宿主 AI 的开发者**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- **能力存在：命令行启动或安装流程**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n- **存在 Quick Start / 安装命令线索**（supported）：可以相信项目文档出现过启动或安装入口；不要因此直接在主力环境运行。 证据：`README.md` Claim：`clm_0003` supported 0.86\n\n### 现在还不能相信\n\n- **角色质量和任务匹配不能直接相信。**（unverified）：角色库证明有很多角色，不证明每个角色都适合你的具体任务，也不证明角色能产生高质量结果。\n- **不能把角色文案当成真实执行能力。**（unverified）：安装前只能判断角色描述和任务画像是否匹配，不能证明它能在宿主 AI 里完成任务。\n- **真实输出质量不能在安装前相信。**（unverified）：Prompt Preview 只能展示引导方式，不能证明真实项目中的结果质量。\n- **宿主 AI 版本兼容性不能在安装前相信。**（unverified）：Claude、Cursor、Codex、Gemini 等宿主加载规则和版本差异必须在真实环境验证。\n- **不会污染现有宿主 AI 行为，不能直接相信。**（inferred）：Skill、plugin、AGENTS/CLAUDE/GEMINI 指令可能改变宿主 AI 的默认行为。\n- **可安全回滚不能默认相信。**（unverified）：除非项目明确提供卸载和恢复说明，否则必须先在隔离环境验证。\n- **真实安装后是否与用户当前宿主 AI 版本兼容？**（unverified）：兼容性只能通过实际宿主环境验证。\n- **项目输出质量是否满足用户具体任务？**（unverified）：安装前预览只能展示流程和边界，不能替代真实评测。\n\n### 继续会触碰什么\n\n- **角色选择偏差**：用户对任务应该由哪个专家角色处理的判断。 原因：选错角色会让 AI 从错误专业视角回答，浪费时间或误导决策。\n- **命令执行**：包管理器、网络下载、本地插件目录、项目配置或用户主目录。 
原因：运行第一条命令就可能产生环境改动；必须先判断是否值得跑。 证据：`README.md`\n- **本地环境或项目文件**：安装结果、插件缓存、项目配置或本地依赖目录。 原因：安装前无法证明写入范围和回滚方式，需要隔离验证。 证据：`README.md`\n- **宿主 AI 上下文**：AI Context Pack、Prompt Preview、Skill 路由、风险规则和项目事实。 原因：导入上下文会影响宿主 AI 后续判断，必须避免把未验证项包装成事实。\n\n### 最小安全下一步\n\n- **先跑 Prompt Preview**：先用交互式试用验证任务画像和角色匹配，不要先导入整套角色库。（适用：任何项目都适用，尤其是输出质量未知时。）\n- **只在隔离目录或测试账号试装**：避免安装命令污染主力宿主 AI、真实项目或用户主目录。（适用：存在命令执行、插件配置或本地写入线索时。）\n- **安装后只验证一个最小任务**：先验证加载、兼容、输出质量和回滚，再决定是否深用。（适用：准备从试用进入真实工作流时。）\n\n### 退出方式\n\n- **保留安装前状态**：记录原始宿主配置和项目状态，后续才能判断是否可恢复。\n- **保留原始角色选择记录**：如果输出偏题，可以回到任务画像阶段重新选择角色，而不是继续沿着错误角色推进。\n- **记录安装命令和写入路径**：没有明确卸载说明时，至少要知道哪些目录或配置需要手动清理。\n- **如果没有回滚路径，不进入主力环境**：不可回滚是继续前阻断项，不应靠信任或运气继续。\n\n## 哪些只能预览\n\n- 解释项目适合谁和能做什么\n- 基于项目文档演示典型对话流程\n- 帮助用户判断是否值得安装或继续研究\n\n## 哪些必须安装后验证\n\n- 真实安装 Skill、插件或 CLI\n- 执行脚本、修改本地文件或访问外部服务\n- 验证真实输出质量、性能和兼容性\n\n## 边界与风险判断卡\n\n- **把安装前预览误认为真实运行**：用户可能高估项目已经完成的配置、权限和兼容性验证。 处理方式：明确区分 prompt_preview_can_do 与 runtime_required。 Claim：`clm_0015` inferred 0.45\n- **命令执行会修改本地环境**：安装命令可能写入用户主目录、宿主插件目录或项目配置。 处理方式：先在隔离环境或测试账号中运行。 证据：`README.md` Claim：`clm_0016` supported 0.86\n- **待确认**：真实安装后是否与用户当前宿主 AI 版本兼容？原因：兼容性只能通过实际宿主环境验证。\n- **待确认**：项目输出质量是否满足用户具体任务？原因：安装前预览只能展示流程和边界，不能替代真实评测。\n- **待确认**：安装命令是否需要网络、权限或全局写入？原因：这影响企业环境和个人环境的安装风险。\n\n## 开工前工作上下文\n\n### 加载顺序\n\n- 先读取 how_to_use.host_ai_instruction，建立安装前判断资产的边界。\n- 读取 claim_graph_summary，确认事实来自 Claim/Evidence Graph，而不是 Human Wiki 叙事。\n- 再读取 intended_users、capabilities 和 quick_start_candidates，判断用户是否匹配。\n- 需要执行具体任务时，优先查 role_skill_index，再查 evidence_index。\n- 遇到真实安装、文件修改、网络访问、性能或兼容性问题时，转入 risk_card 和 boundaries.runtime_required。\n\n### 任务路由\n\n- **命令行启动或安装流程**：先说明这是安装后验证能力，再给出安装前检查清单。 边界：必须真实安装或运行后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n### 上下文规模\n\n- 文件总数：783\n- 重要文件覆盖：40/783\n- 证据索引条目：80\n- 角色 / Skill 条目：79\n\n### 证据不足时的处理\n\n- **missing_evidence**：说明证据不足，要求用户提供目标文件、README 段落或安装后验证记录；不要补全事实。\n- **out_of_scope_request**：说明该任务超出当前 AI Context Pack 证据范围，并建议用户先查看 Human Manual 或真实安装后验证。\n- 
**runtime_request**：给出安装前检查清单和命令来源，但不要替用户执行命令或声称已执行。\n- **source_conflict**：同时展示冲突来源，标记为待核实，不要强行选择一个版本。\n\n## Prompt Recipes\n\n### 适配判断\n\n- 目标：判断这个项目是否适合用户当前任务。\n- 预期输出：适配结论、关键理由、证据引用、安装前可预览内容、必须安装后验证内容、下一步建议。\n\n```text\n请基于 crawl4ai 的 AI Context Pack，先问我 3 个必要问题，然后判断它是否适合我的任务。回答必须包含：适合谁、能做什么、不能做什么、是否值得安装、证据来自哪里。所有项目事实必须引用 evidence_refs、source_paths 或 claim_id。\n```\n\n### 安装前体验\n\n- 目标：让用户在安装前感受核心工作流，同时避免把预览包装成真实能力或营销承诺。\n- 预期输出：一段带边界标签的体验剧本、安装后验证清单和谨慎建议；不含真实运行承诺或强营销表述。\n\n```text\n请把 crawl4ai 当作安装前体验资产，而不是已安装工具或真实运行环境。\n\n请严格输出四段：\n1. 先问我 3 个必要问题。\n2. 给出一段“体验剧本”：用 [安装前可预览]、[必须安装后验证]、[证据不足] 三种标签展示它可能如何引导工作流。\n3. 给出安装后验证清单：列出哪些能力只有真实安装、真实宿主加载、真实项目运行后才能确认。\n4. 给出谨慎建议：只能说“值得继续研究/试装”“先补充信息后再判断”或“不建议继续”，不得替项目背书。\n\n硬性边界：\n- 不要声称已经安装、运行、执行测试、修改文件或产生真实结果。\n- 不要写“自动适配”“确保通过”“完美适配”“强烈建议安装”等承诺性表达。\n- 如果描述安装后的工作方式，必须使用“如果安装成功且宿主正确加载 Skill，它可能会……”这种条件句。\n- 体验剧本只能写成“示例台词/假设流程”：使用“可能会询问/可能会建议/可能会展示”，不要写“已写入、已生成、已通过、正在运行、正在生成”。\n- Prompt Preview 不负责给安装命令；如用户准备试装，只能提示先阅读 Quick Start 和 Risk Card，并在隔离环境验证。\n- 所有项目事实必须来自 supported claim、evidence_refs 或 source_paths；inferred/unverified 只能作风险或待确认项。\n\n```\n\n### 角色 / Skill 选择\n\n- 目标：从项目里的角色或 Skill 中挑选最匹配的资产。\n- 预期输出：候选角色或 Skill 列表，每项包含适用场景、证据路径、风险边界和是否需要安装后验证。\n\n```text\n请读取 role_skill_index，根据我的目标任务推荐 3-5 个最相关的角色或 Skill。每个推荐都要说明适用场景、可能输出、风险边界和 evidence_refs。\n```\n\n### 风险预检\n\n- 目标：安装或引入前识别环境、权限、规则冲突和质量风险。\n- 预期输出：环境、权限、依赖、许可、宿主冲突、质量风险和未知项的检查清单。\n\n```text\n请基于 risk_card、boundaries 和 quick_start_candidates，给我一份安装前风险预检清单。不要替我执行命令，只说明我应该检查什么、为什么检查、失败会有什么影响。\n```\n\n### 宿主 AI 开工指令\n\n- 目标：把项目上下文转成一次对话开始前的宿主 AI 指令。\n- 预期输出：一段边界明确、证据引用明确、适合复制给宿主 AI 的开工前指令。\n\n```text\n请基于 crawl4ai 的 AI Context Pack，生成一段我可以粘贴给宿主 AI 的开工前指令。这段指令必须遵守 not_runtime=true，不能声称项目已经安装、运行或产生真实结果。\n```\n\n\n## 角色 / Skill 索引\n\n- 共索引 79 个角色 / Skill / 项目文档条目。\n\n- **GitHub Actions Workflows Documentation**（project_doc）：GitHub Actions Workflows Documentation 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`.github/workflows/docs/README.md`\n- **Crawl4AI Prospect‑Wizard – 
step‑by‑step guide**（project_doc）：Crawl4AI Prospect‑Wizard – step‑by‑step guide 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/apps/linkdin/README.md`\n- **Adaptive Crawling Examples**（project_doc）：This directory contains examples demonstrating various aspects of Crawl4AI's Adaptive Crawling feature. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples/adaptive_crawling/README.md`\n- **Amazon R2D2 Product Search Example**（project_doc）：A real-world demonstration of Crawl4AI's multi-step crawling with LLM-generated automation scripts. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples/c4a_script/amazon_example/README.md`\n- **C4A-Script Interactive Tutorial**（project_doc）：A comprehensive web-based tutorial for learning and experimenting with C4A-Script - Crawl4AI's visual web automation language. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples/c4a_script/tutorial/README.md`\n- **Web Scraper API with Custom Model Support**（project_doc）：Web Scraper API with Custom Model Support 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples/website-to-api/README.md`\n- **C4A-Script Interactive Tutorial**（project_doc）：A comprehensive web-based tutorial for learning and experimenting with C4A-Script - Crawl4AI's visual web automation language. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/apps/c4a-script/README.md`\n- **Crawl4AI Chrome Extension**（project_doc）：Visual extraction tools for Crawl4AI - Click to extract data and content from any webpage! 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/apps/crawl4ai-assistant/README.md`\n- **Crawl4AI Marketplace**（project_doc）：A terminal-themed marketplace for tools, integrations, and resources related to Crawl4AI. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/marketplace/README.md`\n- **Contributing to Crawl4AI**（project_doc）：Welcome to the Crawl4AI project! As an open-source library for web crawling and AI integration, we value contributions from the community. This guide explains our branching strategy, how to contribute effectively, and the overall release process. 
Our goal is to maintain a stable, collaborative environment where bug fixes, features, and improvements can be integrated smoothly while allowing for experimental developme… 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/CONTRIBUTING.md`\n- **🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.**（project_doc）：🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`README.md`\n- **Software Bill of Materials SBOM**（project_doc）：This directory contains the CycloneDX SBOM for the project. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`sbom/README.md`\n- **Crawl4AI Docker Guide 🐳**（project_doc）：Table of Contents - Prerequisites prerequisites - Installation installation - Option 1: Using Pre-built Docker Hub Images Recommended option-1-using-pre-built-docker-hub-images-recommended - Option 2: Using Docker Compose option-2-using-docker-compose - Option 3: Manual Local Build & Run option-3-manual-local-build--run - Dockerfile Parameters dockerfile-parameters - Using the API using-the-api - Playground Interfac… 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`deploy/docker/README.md`\n- **Crawl4AI Stress Testing and Benchmarking**（project_doc）：Crawl4AI Stress Testing and Benchmarking 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`tests/memory/README.md`\n- **Contributing to Crawl4AI**（project_doc）：Welcome to the Crawl4AI project! As an open-source library for web crawling and AI integration, we value contributions from the community. This guide explains our branching strategy, how to contribute effectively, and the overall release process. 
Our goal is to maintain a stable, collaborative environment where bug fixes, features, and improvements can be integrated smoothly while allowing for experimental developme… 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`CONTRIBUTING.md`\n- **Crawl4AI v0.8.0 Release Notes**（project_doc）：Release Date : January 2026 Previous Version : v0.7.6 Status : Release Candidate 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/RELEASE_NOTES_v0.8.0.md`\n- **Workflow Architecture Documentation**（project_doc）：Workflow Architecture Documentation 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`.github/workflows/docs/ARCHITECTURE.md`\n- **Workflow Quick Reference**（project_doc）：Standard Release bash 1. Update version vim crawl4ai/ version .py Set to \"1.2.3\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`.github/workflows/docs/WORKFLOW_REFERENCE.md`\n- **🚀 Crawl4AI v0.7.0: The Adaptive Intelligence Update**（project_doc）：🚀 Crawl4AI v0.7.0: The Adaptive Intelligence Update 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/blog/release-v0.7.0.md`\n- **🛠️ Crawl4AI v0.7.1: Minor Cleanup Update**（project_doc）：🛠️ Crawl4AI v0.7.1: Minor Cleanup Update 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/blog/release-v0.7.1.md`\n- **🚀 Crawl4AI v0.7.3: The Multi-Config Intelligence Update**（project_doc）：🚀 Crawl4AI v0.7.3: The Multi-Config Intelligence Update 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/blog/release-v0.7.3.md`\n- **🚀 Crawl4AI v0.7.4: The Intelligent Table Extraction & Performance Update**（project_doc）：🚀 Crawl4AI v0.7.4: The Intelligent Table Extraction & Performance Update 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/blog/release-v0.7.4.md`\n- **🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update**（project_doc）：🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/blog/release-v0.7.5.md`\n- **Crawl4AI v0.7.6 Release Notes**（project_doc）：I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! 
This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/blog/release-v0.7.6.md`\n- **🚀 Crawl4AI v0.7.7: The Self-Hosting & Monitoring Update**（project_doc）：🚀 Crawl4AI v0.7.7: The Self-Hosting & Monitoring Update 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/blog/release-v0.7.7.md`\n- **Crawl4AI v0.7.8: Stability & Bug Fix Release**（project_doc）：Crawl4AI v0.7.8: Stability & Bug Fix Release 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/blog/release-v0.7.8.md`\n- **Crawl4AI v0.8.0 Release Notes**（project_doc）：Release Date : January 2026 Previous Version : v0.7.6 Status : Release Candidate 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/blog/release-v0.8.0.md`\n- **Crawl4AI v0.8.5: Anti-Bot, Shadow DOM & 60+ Bug Fixes**（project_doc）：Crawl4AI v0.8.5: Anti-Bot, Shadow DOM & 60+ Bug Fixes 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/blog/release-v0.8.5.md`\n- **browser manager.py**（project_doc）：Function What it does --- --- ManagedBrowser.build browser flags Returns baseline Chromium CLI flags, disables GPU and sandbox, plugs locale, timezone, stealth tweaks, and any extras from BrowserConfig . ManagedBrowser. init Stores config and logger, creates temp dir, preps internal state. ManagedBrowser.start Spawns or connects to the Chromium process, returns its CDP endpoint plus the subprocess.Popen handle. Mana… 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/codebase/browser.md`\n- **cli.py command surface**（project_doc）：Command Inputs / flags What it does --- --- --- profiles none Opens the interactive profile manager, lets you list, create, delete saved browser profiles that live in ~/.crawl4ai/profiles . browser status – Prints whether the always-on builtin browser is running, shows its CDP URL, PID, start time. browser stop – Kills the builtin browser and deletes its status file. 
browser view --url, -u URL optional Pops a visibl… 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/codebase/cli.md`\n- **🐳 Using Docker Legacy**（project_doc）：Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub recommended or build from the repository. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/deprecated/docker-deployment.md`\n- **Builtin Browser in Crawl4AI**（project_doc）：This document explains the builtin browser feature in Crawl4AI and how to use it effectively. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples/README_BUILTIN_BROWSER.md`\n- **Welcome to Crawl4AI! 🚀🤖**（project_doc）：Hi there, Developer! 👋 Here is an example of a research pipeline, where you can share a URL in your conversation with any LLM, and then the context of crawled pages will be used as the context. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples/chainlit.md`\n- **Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI**（project_doc）：Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples/full_page_screenshot_and_pdf_export.md`\n- **Using storage state to Pre-Load Cookies and LocalStorage**（project_doc）：Using storage state to Pre-Load Cookies and LocalStorage 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples/storage_state_tutorial.md`\n- **Tutorial: Clicking Buttons to Load More Content with Crawl4AI**（project_doc）：Tutorial: Clicking Buttons to Load More Content with Crawl4AI 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples/tutorial_dynamic_clicks.md`\n- **🔬 Building an AI Research Assistant with Crawl4AI: Smart URL Discovery**（project_doc）：🔬 Building an AI Research Assistant with Crawl4AI: Smart URL Discovery 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples/url_seeder/tutorial_url_seeder.md`\n- **Advanced Adaptive Strategies**（project_doc）：While the default adaptive crawling configuration works well for most use cases, understanding the underlying strategies and scoring 
mechanisms allows you to fine-tune the crawler for specific domains and requirements. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/adaptive-strategies.md`\n- **Overview of Some Important Advanced Features**（project_doc）：Overview of Some Important Advanced Features Proxy, PDF, Screenshot, SSL, Headers, & Storage State 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/advanced-features.md`\n- **Anti-Bot Detection & Fallback**（project_doc）：When crawling sites protected by anti-bot systems Akamai, Cloudflare, PerimeterX, DataDome, Imperva, etc. , requests often get blocked with CAPTCHAs, 403 responses, or empty pages. Crawl4AI provides a layered retry and fallback system that automatically detects blocking and escalates through multiple strategies until content is retrieved. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/anti-bot-and-fallback.md`\n- **Crawl Dispatcher**（project_doc）：We’re excited to announce a Crawl Dispatcher module that can handle thousands of crawling tasks simultaneously. By efficiently managing system resources memory, CPU, network , this dispatcher ensures high-performance data extraction at scale. It also provides real-time monitoring of each crawler’s status, memory usage, and overall progress. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/crawl-dispatcher.md`\n- **Download Handling in Crawl4AI**（project_doc）：This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/file-downloading.md`\n- **Hooks & Auth in AsyncWebCrawler**（project_doc）：Crawl4AI’s hooks let you customize the crawler at specific points in the pipeline: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/hooks-auth.md`\n- **Preserve Your Identity with Crawl4AI**（project_doc）：Preserve Your Identity with Crawl4AI 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/identity-based-crawling.md`\n- **Handling Lazy-Loaded Images**（project_doc）：Many websites now load images lazily as you scroll. If you need to ensure they appear in your final crawl and in result.media , consider: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/lazy-loading.md`\n- **Advanced Multi-URL Crawling with Dispatchers**（project_doc）：Advanced Multi-URL Crawling with Dispatchers 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/multi-url-crawling.md`\n- **Network Requests & Console Message Capturing**（project_doc）：Network Requests & Console Message Capturing 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/network-console-capture.md`\n- **PDF Processing Strategies**（project_doc）：Crawl4AI provides specialized strategies for handling and extracting content from PDF files. These strategies allow you to seamlessly integrate PDF processing into your crawling workflows, whether the PDFs are hosted online or stored locally. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/pdf-parsing.md`\n- **Session Management**（project_doc）：Session management in Crawl4AI is a powerful feature that allows you to maintain state across multiple requests, making it particularly suitable for handling complex multi-step crawling tasks. 
It enables you to reuse the same browser tab or page object across sequential actions and crawls, which is beneficial for: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/session-management.md`\n- **SSLCertificate Reference**（project_doc）：The SSLCertificate class encapsulates an SSL certificate’s data and allows exporting it in various formats PEM, DER, JSON, or text . It’s used within Crawl4AI whenever you set fetch ssl certificate=True in your CrawlerRunConfig . 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/ssl-certificate.md`\n- **Undetected Browser Mode**（project_doc）：Crawl4AI offers two powerful anti-bot features to help you access websites with bot detection: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/undetected-browser.md`\n- **Virtual Scroll**（project_doc）：Modern websites increasingly use virtual scrolling also called windowed rendering or viewport rendering to handle large datasets efficiently. This technique only renders visible items in the DOM, replacing content as users scroll. Popular examples include Twitter's timeline, Instagram's feed, and many data tables. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/advanced/virtual-scroll.md`\n- **AdaptiveCrawler**（project_doc）：The AdaptiveCrawler class implements intelligent web crawling that automatically determines when sufficient information has been gathered to answer a query. It uses a three-layer scoring system to evaluate coverage, consistency, and saturation. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/api/adaptive-crawler.md`\n- **arun Parameter Guide New Approach**（project_doc）：In Crawl4AI’s latest configuration model, nearly all parameters that once went directly to arun are now part of CrawlerRunConfig . When calling arun , you provide: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/api/arun.md`\n- **arun many ... Reference**（project_doc）：Note : This function is very similar to arun ./arun.md but focused on concurrent or batch crawling. 
If you’re unfamiliar with arun usage, please read that doc first, then review this for differences. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/api/arun_many.md`\n- **AsyncWebCrawler**（project_doc）：The AsyncWebCrawler is the core class for asynchronous web crawling in Crawl4AI. You typically create it once , optionally customize it with a BrowserConfig e.g., headless, user agent , then run multiple arun calls with different CrawlerRunConfig objects. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/api/async-webcrawler.md`\n- **C4A-Script API Reference**（project_doc）：Complete reference for all C4A-Script commands, syntax, and advanced features. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/api/c4a-script-reference.md`\n- **CrawlResult Reference**（project_doc）：The CrawlResult class encapsulates everything returned after a single crawl operation. It provides the raw or processed content , details on links and media, plus optional metadata like screenshots, PDFs, or extracted JSON . 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/api/crawl-result.md`\n- **digest**（project_doc）：The digest method is the primary interface for adaptive web crawling. It intelligently crawls websites starting from a given URL, guided by a query, and automatically determines when sufficient information has been gathered. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/api/digest.md`\n- **1. BrowserConfig – Controlling the Browser**（project_doc）：1. BrowserConfig – Controlling the Browser 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/api/parameters.md`\n- **Extraction & Chunking Strategies API**（project_doc）：Extraction & Chunking Strategies API 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/api/strategies.md`\n- **🚀 Crawl4AI Interactive Apps**（project_doc）：Welcome to the Crawl4AI Apps Hub - your gateway to interactive tools and demos that make web scraping more intuitive and powerful. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/apps/index.md`\n- **Build**（project_doc）：O Prompt for AI Coding Assistant: Create an Interactive LLM Context Builder Page 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/apps/llmtxt/build.md`\n- **Supercharging Your AI Assistant: My Journey to Better LLM Contexts for crawl4ai**（project_doc）：Supercharging Your AI Assistant: My Journey to Better LLM Contexts for crawl4ai 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/apps/llmtxt/why.md`\n- **Installation 💻**（project_doc）：Crawl4AI offers flexible installation options to suit various use cases. You can install it as a Python package, use it with Docker, or run it as a local server. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/basic/installation.md`\n- **Adaptive Crawling: Building Dynamic Knowledge That Grows on Demand**（project_doc）：Adaptive Crawling: Building Dynamic Knowledge That Grows on Demand 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/articles/adaptive-crawling-revolution.md`\n- **Introducing Event Streams and Interactive Hooks in Crawl4AI**（project_doc）：Introducing Event Streams and Interactive Hooks in Crawl4AI 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/articles/dockerize_hooks.md`\n- **The LLM Context Protocol: Why Your AI Assistant Needs Memory, Reasoning, and Examples**（project_doc）：The LLM Context Protocol: Why Your AI Assistant Needs Memory, Reasoning, and Examples 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/articles/llm-context-revolution.md`\n- **Solving the Virtual Scroll Puzzle: How Crawl4AI Captures What Others Miss**（project_doc）：Solving the Virtual Scroll Puzzle: How Crawl4AI Captures What Others Miss 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/articles/virtual-scroll-revolution.md`\n- **Crawl4AI Blog**（project_doc）：Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical insights, and updates about the project. 
Whether you're looking for the latest improvements or want to dive deep into web crawling techniques, this is the place. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/index.md`\n- **Release Summary for Version 0.4.0 December 1, 2024**（project_doc）：Release Summary for Version 0.4.0 December 1, 2024 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/releases/0.4.0.md`\n- **Release Summary for Version 0.4.1 December 8, 2024 : Major Efficiency Boosts with New Features!**（project_doc）：Release Summary for Version 0.4.1 December 8, 2024 : Major Efficiency Boosts with New Features! 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/releases/0.4.1.md`\n- **🚀 Crawl4AI 0.4.2 Update: Smarter Crawling Just Got Easier Dec 12, 2024**（project_doc）：🚀 Crawl4AI 0.4.2 Update: Smarter Crawling Just Got Easier Dec 12, 2024 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/releases/0.4.2.md`\n- **Crawl4AI v0.5.0 Release Notes**（project_doc）：Release Theme: Power, Flexibility, and Scalability 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/releases/0.5.0.md`\n- **Crawl4AI v0.6.0 Release Notes**（project_doc）：We're excited to announce the release of Crawl4AI v0.6.0 , our biggest and most feature-rich update yet. This version introduces major architectural upgrades, brand-new capabilities for geo-aware crawling, high-efficiency scraping, and real-time streaming support for scalable deployments. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/releases/0.6.0.md`\n- **🚀 Crawl4AI v0.7.0: The Adaptive Intelligence Update**（project_doc）：🚀 Crawl4AI v0.7.0: The Adaptive Intelligence Update 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/releases/0.7.0.md`\n- **🛠️ Crawl4AI v0.7.1: Minor Cleanup Update**（project_doc）：🛠️ Crawl4AI v0.7.1: Minor Cleanup Update 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/releases/0.7.1.md`\n- **🚀 Crawl4AI v0.7.2: CI/CD & Dependency Optimization Update**（project_doc）：🚀 Crawl4AI v0.7.2: CI/CD & Dependency Optimization Update 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/releases/0.7.2.md`\n- **🚀 Crawl4AI v0.7.3: The Multi-Config Intelligence Update**（project_doc）：🚀 Crawl4AI v0.7.3: The Multi-Config Intelligence Update 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/md_v2/blog/releases/0.7.3.md`\n\n## 证据索引\n\n- 共索引 80 条证据。\n\n- **GitHub Actions Workflows Documentation**（documentation）：GitHub Actions Workflows Documentation 证据：`.github/workflows/docs/README.md`\n- **Crawl4AI Prospect‑Wizard – step‑by‑step guide**（documentation）：Crawl4AI Prospect‑Wizard – step‑by‑step guide 证据：`docs/apps/linkdin/README.md`\n- **Adaptive Crawling Examples**（documentation）：This directory contains examples demonstrating various aspects of Crawl4AI's Adaptive Crawling feature. 证据：`docs/examples/adaptive_crawling/README.md`\n- **Amazon R2D2 Product Search Example**（documentation）：A real-world demonstration of Crawl4AI's multi-step crawling with LLM-generated automation scripts. 证据：`docs/examples/c4a_script/amazon_example/README.md`\n- **C4A-Script Interactive Tutorial**（documentation）：A comprehensive web-based tutorial for learning and experimenting with C4A-Script - Crawl4AI's visual web automation language. 
证据：`docs/examples/c4a_script/tutorial/README.md`\n- **Web Scraper API with Custom Model Support**（documentation）：Web Scraper API with Custom Model Support 证据：`docs/examples/website-to-api/README.md`\n- **C4A-Script Interactive Tutorial**（documentation）：A comprehensive web-based tutorial for learning and experimenting with C4A-Script - Crawl4AI's visual web automation language. 证据：`docs/md_v2/apps/c4a-script/README.md`\n- **Crawl4AI Chrome Extension**（documentation）：Visual extraction tools for Crawl4AI - Click to extract data and content from any webpage! 证据：`docs/md_v2/apps/crawl4ai-assistant/README.md`\n- **Crawl4AI Marketplace**（documentation）：A terminal-themed marketplace for tools, integrations, and resources related to Crawl4AI. 证据：`docs/md_v2/marketplace/README.md`\n- **Contributing to Crawl4AI**（documentation）：Welcome to the Crawl4AI project! As an open-source library for web crawling and AI integration, we value contributions from the community. This guide explains our branching strategy, how to contribute effectively, and the overall release process. Our goal is to maintain a stable, collaborative environment where bug fixes, features, and improvements can be integrated smoothly while allowing for experimental development. 证据：`docs/md_v2/CONTRIBUTING.md`\n- **🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper.**（documentation）：🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper. 证据：`README.md`\n- **Software Bill of Materials SBOM**（documentation）：This directory contains the CycloneDX SBOM for the project. 
证据：`sbom/README.md`\n- **Crawl4AI Docker Guide 🐳**（documentation）：Table of Contents - Prerequisites prerequisites - Installation installation - Option 1: Using Pre-built Docker Hub Images Recommended option-1-using-pre-built-docker-hub-images-recommended - Option 2: Using Docker Compose option-2-using-docker-compose - Option 3: Manual Local Build & Run option-3-manual-local-build--run - Dockerfile Parameters dockerfile-parameters - Using the API using-the-api - Playground Interface playground-interface - Python SDK python-sdk - Understanding Request Schema understanding-request-schema - REST API Examples rest-api-examples - Asynchronous Jobs with Webhooks asynchronous-jobs-with-webhooks - Additional API Endpoints additional-api-endpoints - HTML Extraction… 证据：`deploy/docker/README.md`\n- **Crawl4AI Stress Testing and Benchmarking**（documentation）：Crawl4AI Stress Testing and Benchmarking 证据：`tests/memory/README.md`\n- **Contributing to Crawl4AI**（documentation）：Welcome to the Crawl4AI project! As an open-source library for web crawling and AI integration, we value contributions from the community. This guide explains our branching strategy, how to contribute effectively, and the overall release process. Our goal is to maintain a stable, collaborative environment where bug fixes, features, and improvements can be integrated smoothly while allowing for experimental development. 证据：`CONTRIBUTING.md`\n- **License**（source_file）：Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ 证据：`LICENSE`\n- **Crawl4AI v0.8.0 Release Notes**（documentation）：Release Date : January 2026 Previous Version : v0.7.6 Status : Release Candidate 证据：`docs/RELEASE_NOTES_v0.8.0.md`\n- **Workflow Architecture Documentation**（documentation）：Workflow Architecture Documentation 证据：`.github/workflows/docs/ARCHITECTURE.md`\n- **Workflow Quick Reference**（documentation）：Standard Release bash 1. 
Update version vim crawl4ai/ version .py Set to \"1.2.3\" 证据：`.github/workflows/docs/WORKFLOW_REFERENCE.md`\n- **🚀 Crawl4AI v0.7.0: The Adaptive Intelligence Update**（documentation）：🚀 Crawl4AI v0.7.0: The Adaptive Intelligence Update 证据：`docs/blog/release-v0.7.0.md`\n- **🛠️ Crawl4AI v0.7.1: Minor Cleanup Update**（documentation）：🛠️ Crawl4AI v0.7.1: Minor Cleanup Update 证据：`docs/blog/release-v0.7.1.md`\n- **🚀 Crawl4AI v0.7.3: The Multi-Config Intelligence Update**（documentation）：🚀 Crawl4AI v0.7.3: The Multi-Config Intelligence Update 证据：`docs/blog/release-v0.7.3.md`\n- **🚀 Crawl4AI v0.7.4: The Intelligent Table Extraction & Performance Update**（documentation）：🚀 Crawl4AI v0.7.4: The Intelligent Table Extraction & Performance Update 证据：`docs/blog/release-v0.7.4.md`\n- **🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update**（documentation）：🚀 Crawl4AI v0.7.5: The Docker Hooks & Security Update 证据：`docs/blog/release-v0.7.5.md`\n- **Crawl4AI v0.7.6 Release Notes**（documentation）：I'm excited to announce Crawl4AI v0.7.6, featuring a complete webhook infrastructure for the Docker job queue API! This release eliminates polling and brings real-time notifications to both crawling and LLM extraction workflows. 
证据：`docs/blog/release-v0.7.6.md`\n- **🚀 Crawl4AI v0.7.7: The Self-Hosting & Monitoring Update**（documentation）：🚀 Crawl4AI v0.7.7: The Self-Hosting & Monitoring Update 证据：`docs/blog/release-v0.7.7.md`\n- **Crawl4AI v0.7.8: Stability & Bug Fix Release**（documentation）：Crawl4AI v0.7.8: Stability & Bug Fix Release 证据：`docs/blog/release-v0.7.8.md`\n- **Crawl4AI v0.8.0 Release Notes**（documentation）：Release Date : January 2026 Previous Version : v0.7.6 Status : Release Candidate 证据：`docs/blog/release-v0.8.0.md`\n- **Crawl4AI v0.8.5: Anti-Bot, Shadow DOM & 60+ Bug Fixes**（documentation）：Crawl4AI v0.8.5: Anti-Bot, Shadow DOM & 60+ Bug Fixes 证据：`docs/blog/release-v0.8.5.md`\n- **browser manager.py**（documentation）：Function What it does --- --- ManagedBrowser.build browser flags Returns baseline Chromium CLI flags, disables GPU and sandbox, plugs locale, timezone, stealth tweaks, and any extras from BrowserConfig . ManagedBrowser. init Stores config and logger, creates temp dir, preps internal state. ManagedBrowser.start Spawns or connects to the Chromium process, returns its CDP endpoint plus the subprocess.Popen handle. ManagedBrowser. initial startup check Pings the CDP endpoint once to be sure the browser is alive, raises if not. ManagedBrowser. monitor browser process Async-loops on the subprocess, logs exits or crashes, restarts if policy allows. ManagedBrowser. get browser path WIP Old helper t… 证据：`docs/codebase/browser.md`\n- **cli.py command surface**（documentation）：Command Inputs / flags What it does --- --- --- profiles none Opens the interactive profile manager, lets you list, create, delete saved browser profiles that live in ~/.crawl4ai/profiles . browser status – Prints whether the always-on builtin browser is running, shows its CDP URL, PID, start time. browser stop – Kills the builtin browser and deletes its status file. browser view --url, -u URL optional Pops a visible window of the builtin browser, navigates to URL or about:blank . 
config list – Dumps every global setting, showing current value, default, and description. config get key Prints the value of a single setting, falls back to default if unset. config set key value Persists a new v… 证据：`docs/codebase/cli.md`\n- **🐳 Using Docker Legacy**（documentation）：Crawl4AI is available as Docker images for easy deployment. You can either pull directly from Docker Hub recommended or build from the repository. 证据：`docs/deprecated/docker-deployment.md`\n- **Builtin Browser in Crawl4AI**（documentation）：This document explains the builtin browser feature in Crawl4AI and how to use it effectively. 证据：`docs/examples/README_BUILTIN_BROWSER.md`\n- **Welcome to Crawl4AI! 🚀🤖**（documentation）：Hi there, Developer! 👋 Here is an example of a research pipeline, where you can share a URL in your conversation with any LLM, and then the context of crawled pages will be used as the context. 证据：`docs/examples/chainlit.md`\n- **Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI**（documentation）：Capturing Full-Page Screenshots and PDFs from Massive Webpages with Crawl4AI 证据：`docs/examples/full_page_screenshot_and_pdf_export.md`\n- **Using storage state to Pre-Load Cookies and LocalStorage**（documentation）：Using storage state to Pre-Load Cookies and LocalStorage 证据：`docs/examples/storage_state_tutorial.md`\n- **Tutorial: Clicking Buttons to Load More Content with Crawl4AI**（documentation）：Tutorial: Clicking Buttons to Load More Content with Crawl4AI 证据：`docs/examples/tutorial_dynamic_clicks.md`\n- **🔬 Building an AI Research Assistant with Crawl4AI: Smart URL Discovery**（documentation）：🔬 Building an AI Research Assistant with Crawl4AI: Smart URL Discovery 证据：`docs/examples/url_seeder/tutorial_url_seeder.md`\n- **Advanced Adaptive Strategies**（documentation）：While the default adaptive crawling configuration works well for most use cases, understanding the underlying strategies and scoring mechanisms allows you to fine-tune the crawler for 
specific domains and requirements. 证据：`docs/md_v2/advanced/adaptive-strategies.md`\n- **Overview of Some Important Advanced Features**（documentation）：Overview of Some Important Advanced Features Proxy, PDF, Screenshot, SSL, Headers, & Storage State 证据：`docs/md_v2/advanced/advanced-features.md`\n- **Anti-Bot Detection & Fallback**（documentation）：When crawling sites protected by anti-bot systems Akamai, Cloudflare, PerimeterX, DataDome, Imperva, etc. , requests often get blocked with CAPTCHAs, 403 responses, or empty pages. Crawl4AI provides a layered retry and fallback system that automatically detects blocking and escalates through multiple strategies until content is retrieved. 证据：`docs/md_v2/advanced/anti-bot-and-fallback.md`\n- **Crawl Dispatcher**（documentation）：We’re excited to announce a Crawl Dispatcher module that can handle thousands of crawling tasks simultaneously. By efficiently managing system resources memory, CPU, network , this dispatcher ensures high-performance data extraction at scale. It also provides real-time monitoring of each crawler’s status, memory usage, and overall progress. 证据：`docs/md_v2/advanced/crawl-dispatcher.md`\n- **Download Handling in Crawl4AI**（documentation）：This guide explains how to use Crawl4AI to handle file downloads during crawling. You'll learn how to trigger downloads, specify download locations, and access downloaded files. 证据：`docs/md_v2/advanced/file-downloading.md`\n- **Hooks & Auth in AsyncWebCrawler**（documentation）：Crawl4AI’s hooks let you customize the crawler at specific points in the pipeline: 证据：`docs/md_v2/advanced/hooks-auth.md`\n- **Preserve Your Identity with Crawl4AI**（documentation）：Preserve Your Identity with Crawl4AI 证据：`docs/md_v2/advanced/identity-based-crawling.md`\n- **Handling Lazy-Loaded Images**（documentation）：Many websites now load images lazily as you scroll. 
If you need to ensure they appear in your final crawl and in result.media , consider: 证据：`docs/md_v2/advanced/lazy-loading.md`\n- **Advanced Multi-URL Crawling with Dispatchers**（documentation）：Advanced Multi-URL Crawling with Dispatchers 证据：`docs/md_v2/advanced/multi-url-crawling.md`\n- **Network Requests & Console Message Capturing**（documentation）：Network Requests & Console Message Capturing 证据：`docs/md_v2/advanced/network-console-capture.md`\n- **PDF Processing Strategies**（documentation）：Crawl4AI provides specialized strategies for handling and extracting content from PDF files. These strategies allow you to seamlessly integrate PDF processing into your crawling workflows, whether the PDFs are hosted online or stored locally. 证据：`docs/md_v2/advanced/pdf-parsing.md`\n- **Session Management**（documentation）：Session management in Crawl4AI is a powerful feature that allows you to maintain state across multiple requests, making it particularly suitable for handling complex multi-step crawling tasks. It enables you to reuse the same browser tab or page object across sequential actions and crawls, which is beneficial for: 证据：`docs/md_v2/advanced/session-management.md`\n- **SSLCertificate Reference**（documentation）：The SSLCertificate class encapsulates an SSL certificate’s data and allows exporting it in various formats PEM, DER, JSON, or text . It’s used within Crawl4AI whenever you set fetch ssl certificate=True in your CrawlerRunConfig . 证据：`docs/md_v2/advanced/ssl-certificate.md`\n- **Undetected Browser Mode**（documentation）：Crawl4AI offers two powerful anti-bot features to help you access websites with bot detection: 证据：`docs/md_v2/advanced/undetected-browser.md`\n- **Virtual Scroll**（documentation）：Modern websites increasingly use virtual scrolling also called windowed rendering or viewport rendering to handle large datasets efficiently. This technique only renders visible items in the DOM, replacing content as users scroll. 
Popular examples include Twitter's timeline, Instagram's feed, and many data tables. 证据：`docs/md_v2/advanced/virtual-scroll.md`\n- **AdaptiveCrawler**（documentation）：The AdaptiveCrawler class implements intelligent web crawling that automatically determines when sufficient information has been gathered to answer a query. It uses a three-layer scoring system to evaluate coverage, consistency, and saturation. 证据：`docs/md_v2/api/adaptive-crawler.md`\n- **arun Parameter Guide New Approach**（documentation）：In Crawl4AI’s latest configuration model, nearly all parameters that once went directly to arun are now part of CrawlerRunConfig . When calling arun , you provide: 证据：`docs/md_v2/api/arun.md`\n- **arun many ... Reference**（documentation）：Note : This function is very similar to arun ./arun.md but focused on concurrent or batch crawling. If you’re unfamiliar with arun usage, please read that doc first, then review this for differences. 证据：`docs/md_v2/api/arun_many.md`\n- **AsyncWebCrawler**（documentation）：The AsyncWebCrawler is the core class for asynchronous web crawling in Crawl4AI. You typically create it once , optionally customize it with a BrowserConfig e.g., headless, user agent , then run multiple arun calls with different CrawlerRunConfig objects. 证据：`docs/md_v2/api/async-webcrawler.md`\n- **C4A-Script API Reference**（documentation）：Complete reference for all C4A-Script commands, syntax, and advanced features. 证据：`docs/md_v2/api/c4a-script-reference.md`\n- **CrawlResult Reference**（documentation）：The CrawlResult class encapsulates everything returned after a single crawl operation. It provides the raw or processed content , details on links and media, plus optional metadata like screenshots, PDFs, or extracted JSON . 证据：`docs/md_v2/api/crawl-result.md`\n- **digest**（documentation）：The digest method is the primary interface for adaptive web crawling. 
It intelligently crawls websites starting from a given URL, guided by a query, and automatically determines when sufficient information has been gathered. 证据：`docs/md_v2/api/digest.md`\n- 其余 20 条证据见 `AI_CONTEXT_PACK.json` 或 `EVIDENCE_INDEX.json`。\n\n## 宿主 AI 必须遵守的规则\n\n- **把本资产当作开工前上下文，而不是运行环境。**：AI Context Pack 只包含证据化项目理解，不包含目标项目的可执行状态。 证据：`.github/workflows/docs/README.md`, `docs/apps/linkdin/README.md`, `docs/examples/adaptive_crawling/README.md`\n- **回答用户时区分可预览内容与必须安装后才能验证的内容。**：安装前体验的消费者价值来自降低误装和误判，而不是伪装成真实运行。 证据：`.github/workflows/docs/README.md`, `docs/apps/linkdin/README.md`, `docs/examples/adaptive_crawling/README.md`\n\n## 用户开工前应该回答的问题\n\n- 你准备在哪个宿主 AI 或本地环境中使用它？\n- 你只是想先体验工作流，还是准备真实安装？\n- 你最在意的是安装成本、输出质量、还是和现有规则的冲突？\n\n## 验收标准\n\n- 所有能力声明都能回指到 evidence_refs 中的文件路径。\n- AI_CONTEXT_PACK.md 没有把预览包装成真实运行。\n- 用户能在 3 分钟内看懂适合谁、能做什么、如何开始和风险边界。\n\n---\n\n## Doramagic Context Augmentation\n\n下面内容用于强化 Repomix/AI Context Pack 主体。Human Manual 只提供阅读骨架；踩坑日志会被转成宿主 AI 必须遵守的工作约束。\n\n## Human Manual 骨架\n\n使用规则：这里只是项目阅读路线和显著性信号，不是事实权威。具体事实仍必须回到 repo evidence / Claim Graph。\n\n宿主 AI 硬性规则：\n- 不得把页标题、章节顺序、摘要或 importance 当作项目事实证据。\n- 解释 Human Manual 骨架时，必须明确说它只是阅读路线/显著性信号。\n- 能力、安装、兼容性、运行状态和风险判断必须引用 repo evidence、source path 或 Claim Graph。\n\n- **Introduction to Crawl4AI**：importance `high`\n  - source_paths: README.md, MISSION.md, pyproject.toml\n- **Installation Guide**：importance `high`\n  - source_paths: setup.py, requirements.txt, Dockerfile, crawl4ai/install.py\n- **Quick Start Guide**：importance `high`\n  - source_paths: docs/examples/quickstart.py, docs/examples/hello_world.py, crawl4ai/__init__.py\n- **System Architecture**：importance `high`\n  - source_paths: deploy/docker/ARCHITECTURE.md, crawl4ai/async_webcrawler.py, crawl4ai/browser_manager.py, crawl4ai/async_dispatcher.py, crawl4ai/models.py\n- **Browser Management**：importance `high`\n  - source_paths: crawl4ai/browser_manager.py, crawl4ai/browser_adapter.py, crawl4ai/browser_profiler.py, crawl4ai/js_snippet, 
docs/codebase/browser.md\n- **Async Web Crawler**：importance `high`\n  - source_paths: crawl4ai/async_webcrawler.py, crawl4ai/async_configs.py, crawl4ai/cache_context.py, crawl4ai/types.py, docs/md_v2/api/async-webcrawler.md\n- **Markdown Generation**：importance `high`\n  - source_paths: crawl4ai/markdown_generation_strategy.py, crawl4ai/content_filter_strategy.py, crawl4ai/html2text, docs/md_v2/core/markdown-generation.md\n- **Extraction Strategies**：importance `high`\n  - source_paths: crawl4ai/extraction_strategy.py, crawl4ai/chunking_strategy.py, crawl4ai/content_scraping_strategy.py, crawl4ai/table_extraction.py, docs/md_v2/extraction/llm-strategies.md\n\n## Repo Inspection Evidence / 源码检查证据\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `1debe5f5fcc118ced10826a1040a81f9b77e9255`\n- inspected_files: `pyproject.toml`, `Dockerfile`, `README.md`, `docker-compose.yml`, `uv.lock`, `requirements.txt`, `docs/RELEASE_NOTES_v0.8.0.md`, `docs/releases_review/v0_7_0_features_demo.py`, `docs/releases_review/v0.3.74.overview.py`, `docs/releases_review/demo_v0.7.7.py`, `docs/releases_review/v0.7.5_docker_hooks_demo.py`, `docs/releases_review/v0_4_3b2_features_demo.py`, `docs/releases_review/demo_v0.8.5.py`, `docs/releases_review/demo_v0.7.6.py`, `docs/releases_review/demo_v0.7.0.py`, `docs/releases_review/demo_v0.8.0.py`, `docs/releases_review/demo_v0.7.8.py`, `docs/releases_review/v0_4_24_walkthrough.py`, `docs/releases_review/crawl4ai_v0_7_0_showcase.py`, `docs/releases_review/demo_v0.7.5.py`\n\n宿主 AI 硬性规则：\n- 没有 repo_clone_verified=true 时，不得声称已经读过源码。\n- 没有 repo_inspection_verified=true 时，不得把 README/docs/package 文件判断写成事实。\n- 没有 quick_start_verified=true 时，不得声称 Quick Start 已跑通。\n\n## Doramagic Pitfall Constraints / 踩坑约束\n\n这些规则来自 Doramagic 发现、验证或编译过程中的项目专属坑点。宿主 AI 必须把它们当作工作约束，而不是普通说明文字。\n\n### Constraint 1: 来源证据：[Bug]: arun() and arun_many() type hinting needs fixing\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: arun() and 
arun_many() type hinting needs fixing\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_d3b6cfd3700147f690e0e65875f15424 | https://github.com/unclecode/crawl4ai/issues/1898 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 2: 来源证据：[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked as failed), no error messages or failure reason…\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked as failed), no error messages or failure reason is shown\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_ad61b108bf894cc286ca7966e8c86758 | https://github.com/unclecode/crawl4ai/issues/1949 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 3: 来源证据：[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and CLI provide\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and CLI provide\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_1ee99f5d72f143f4b064732cc19e0c85 | https://github.com/unclecode/crawl4ai/issues/1963 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 4: 来源证据：[Bug]: `remove_empty_elements_fast()` drops trailing text when removing empty elements with non-empty .tail\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: `remove_empty_elements_fast()` drops trailing text when removing empty elements with non-empty .tail\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | 
cevd_d7fa967632a948008efbc182d1f2c96b | https://github.com/unclecode/crawl4ai/issues/1938 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 5: 来源证据：[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-3x token overhead for CJK content\n\n- Trigger: GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-3x token overhead for CJK content\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能影响授权、密钥配置或安全边界。\n- Evidence: community_evidence:github | cevd_2e9fbf659fbb40aba437886a87f8e2d7 | https://github.com/unclecode/crawl4ai/issues/1962 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 6: 来源证据：[Bug] AsyncLogger writes to stdout, breaking MCP stdio transport\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug] AsyncLogger writes to stdout, breaking MCP stdio transport\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能影响升级、迁移或版本选择。\n- Evidence: community_evidence:github | cevd_af29278fd7294d4a8f0f6f37ab987b5c | https://github.com/unclecode/crawl4ai/issues/1968 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 7: 来源证据：[Bug]: The install with pip on just about any system rarely works. It requires an env or it only partial installs\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: The install with pip on just about any system rarely works. 
It requires an env or it only partial installs\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_97d44cedb21a4908a7743fde11209954 | https://github.com/unclecode/crawl4ai/issues/1950 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 8: 来源证据：[Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports symbols that don't exist in playwright-stealth 2.x\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports symbols that don't exist in playwright-stealth 2.x\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_ae45861377894b99a57d6bbdc06af313 | https://github.com/unclecode/crawl4ai/issues/1959 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 9: 来源证据：v0.7.1:Update\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.1:Update\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能影响升级、迁移或版本选择。\n- Evidence: community_evidence:github | cevd_a6ae9133fff54443b712725f51769fa1 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.1 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 10: 来源证据：v0.7.2: CI/CD & Dependency Optimization Update\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.2: CI/CD & Dependency Optimization Update\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能影响升级、迁移或版本选择。\n- Evidence: community_evidence:github | cevd_14954e0431ca426ebeaa4bb31778d4af | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.2 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n",
      "summary": "给宿主 AI 的上下文和工作边界。",
      "title": "AI Context Pack / 带给我的 AI"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card / 安装前决策卡\n\n项目：unclecode/crawl4ai\n\n## Doramagic 试用结论\n\n当前结论：可以进入发布前推荐检查；首次使用仍应从最小权限、临时目录和可回滚配置开始。\n\n## 用户现在可以做\n\n- 可以先阅读 Human Manual，理解项目目的和主要工作流。\n- 可以复制 Prompt Preview 做安装前体验；这只验证交互感，不代表真实运行。\n- 可以把官方 Quick Start 命令放到隔离环境中验证，不要直接进主力环境。\n\n## 现在不要做\n\n- 不要把 Prompt Preview 当成项目实际运行结果。\n- 不要把 metadata-only validation 当成沙箱安装验证。\n- 不要把未验证能力写成“已支持、已跑通、可放心安装”。\n- 不要在首次试用时交出生产数据、私人文件、真实密钥或主力配置目录。\n\n## 安装前检查\n\n- 宿主 AI 是否匹配：local_cli\n- 官方安装入口状态：已发现官方入口\n- 是否在临时目录、临时宿主或容器中验证：必须是\n- 是否能回滚配置改动：必须能\n- 是否需要 API Key、网络访问、读写文件或修改宿主配置：未确认前按高风险处理\n- 是否记录了安装命令、实际输出和失败日志：必须记录\n\n## 当前阻塞项\n\n- 无阻塞项。\n\n## 项目专属踩坑\n\n- 来源证据：[Bug]: arun() and arun_many() type hinting needs fixing（high）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked as failed), no error messages or failure reason…（high）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and CLI provide（high）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：[Bug]: `remove_empty_elements_fast()` drops trailing text when removing empty elements with non-empty .tail（high）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-3x token overhead for CJK content（high）：可能影响授权、密钥配置或安全边界。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n\n## 风险与权限提示\n\n- no_demo: medium\n\n## 证据缺口\n\n- 暂未发现结构化证据缺口。\n",
      "summary": "安装、权限、验证和推荐前风险。",
      "title": "Boundary & Risk Card / 边界与风险卡"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/unclecode/crawl4ai 项目说明书\n\n生成时间：2026-05-15 08:23:04 UTC\n\n## 目录\n\n- [Introduction to Crawl4AI](#introduction)\n- [Installation Guide](#installation)\n- [Quick Start Guide](#quickstart)\n- [System Architecture](#architecture)\n- [Browser Management](#browser_management)\n- [Async Web Crawler](#async_crawler)\n- [Markdown Generation](#markdown_generation)\n- [Extraction Strategies](#extraction_strategies)\n- [Deep Crawling Strategies](#deep_crawling)\n- [Anti-Bot Detection and Proxy Management](#anti_bot_detection)\n\n<a id='introduction'></a>\n\n## Introduction to Crawl4AI\n\n### 相关页面\n\n相关主题：[Installation Guide](#installation), [Quick Start Guide](#quickstart)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/unclecode/crawl4ai/blob/main/README.md)\n- [MISSION.md](https://github.com/unclecode/crawl4ai/blob/main/MISSION.md)\n- [pyproject.toml](https://github.com/unclecode/crawl4ai/blob/main/pyproject.toml)\n- [docs/README.md](https://github.com/unclecode/crawl4ai/blob/main/docs/README.md)\n- [CONTRIBUTING.md](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTING.md)\n- [sbom/README.md](https://github.com/unclecode/crawl4ai/blob/main/sbom/README.md)\n</details>\n\n# Introduction to Crawl4AI\n\n## Overview\n\nCrawl4AI is an open-source AI-powered web crawling framework designed to extract structured data from web pages and deliver clean, LLM-ready output. It serves as a modern alternative to traditional web scraping tools by combining intelligent crawling capabilities with AI-driven content extraction and formatting.\n\nThe project emphasizes ease of use, providing both programmatic APIs and command-line interfaces for rapid integration into data pipelines, research workflows, and AI applications.\n\n## Purpose and Scope\n\nCrawl4AI addresses the fundamental challenge of extracting meaningful data from unstructured web content. 
While conventional web crawlers focus on fetching page content, Crawl4AI goes further by:\n\n- **Semantic Understanding**: Analyzing page content to identify and extract relevant information based on context rather than rigid selectors\n- **Structured Output**: Delivering data in formats optimized for large language model consumption, including Markdown, JSON, and structured extractions\n- **Performance Optimization**: Enabling high-throughput crawling with configurable browser automation and connection pooling\n- **Flexibility**: Supporting various output strategies including simple crawling, chunked extraction, and memory-aware processing\n\nThe scope encompasses web crawling, content extraction, link navigation, media handling, and output formatting—providing an end-to-end solution from URL input to structured data output.\n\n## Core Architecture\n\nCrawl4AI follows a modular architecture composed of distinct processing stages:\n\n```mermaid\ngraph TD\n    A[URL Input] --> B[Crawl Strategy]\n    B --> C[Browser Automation]\n    C --> D[Content Extraction]\n    D --> E[AI Processing]\n    E --> F[Output Formatter]\n    F --> G[Structured Output]\n    \n    C -->|JS Rendering| H[JavaScript Executor]\n    D -->|Media| I[Media Handler]\n    E -->|Memory| J[Memory Manager]\n```\n\n### Processing Pipeline\n\n| Stage | Component | Description |\n|-------|-----------|-------------|\n| Input | URL Parser | Validates and normalizes target URLs |\n| Crawl | Strategy Engine | Selects crawling approach based on configuration |\n| Render | Browser Pool | Manages headless browser instances |\n| Extract | AI Extractor | Uses ML models to identify relevant content |\n| Format | Output Serializer | Converts to target format (JSON/Markdown/HTML) |\n\n## Key Features\n\n### 1. AI-Powered Extraction\n\nCrawl4AI leverages machine learning models to understand page content semantically. 
Rather than relying solely on CSS selectors or XPath expressions, the extractor can identify:\n\n- Main article content and metadata\n- Structured data elements (tables, lists, forms)\n- Semantic sections and their relationships\n- Relevant versus boilerplate content\n\n### 2. Multiple Output Strategies\n\nThe framework supports various extraction strategies optimized for different use cases:\n\n| Strategy | Use Case | Output |\n|----------|----------|--------|\n| `default` | General purpose | Clean Markdown with metadata |\n| `cosine` | Semantic clustering | Grouped content chunks |\n| `no-cache` | Fresh data | Bypass internal caching |\n| `passive` | Low resource | Minimal processing |\n| `brainless` | Simple fetch | Raw HTML without AI processing |\n\n### 3. Browser Automation\n\nIntegrated headless browser support enables:\n\n- JavaScript rendering for single-page applications\n- Cookie and session management\n- Custom headers and authentication\n- Screenshot capture\n- PDF generation\n\n### 4. 
Media Handling\n\nCrawl4AI processes various media types during extraction:\n\n- **Images**: Download, compress, and embed with alt-text preservation\n- **Videos**: Extract metadata and embed URLs\n- **Audio**: Handle media references for podcasts and audio content\n- **Documents**: Process embedded PDFs and downloadable files\n\n## Installation and Setup\n\n### Prerequisites\n\n- Python 3.9 or higher\n- Chrome/Chromium browser (for browser automation features)\n- pip or poetry package manager\n\n### Basic Installation\n\n```bash\npip install crawl4ai\n```\n\n### Development Installation\n\n```bash\ngit clone https://github.com/unclecode/crawl4ai.git\ncd crawl4ai\npip install -e \".[dev]\"\n```\n\n### Verify Installation\n\n```python\nimport asyncio\n\nfrom crawl4ai import AsyncWebCrawler\n\nasync def main():\n    async with AsyncWebCrawler() as crawler:\n        result = await crawler.arun(url=\"https://example.com\")\n        print(result.markdown)\n\nasyncio.run(main())\n```\n\n## Basic Usage Patterns\n\n### Simple Crawl\n\n```python\nimport asyncio\n\nfrom crawl4ai import AsyncWebCrawler\n\nasync def main():\n    async with AsyncWebCrawler() as crawler:\n        result = await crawler.arun(url=\"https://example.com\")\n        print(f\"Title: {result.metadata.get('title')}\")\n        print(f\"Content: {result.markdown}\")\n\nasyncio.run(main())\n```\n\n### Configured Extraction\n\n```python\nimport asyncio\n\nfrom crawl4ai import CrawlerRunConfig, AsyncWebCrawler\n\nconfig = CrawlerRunConfig(\n    mode=\"aggressive\",\n    word_count_threshold=10,\n    remove_hidden_text=True,\n    process_iframes=True,\n    scroll_delay=1.0\n)\n\nasync def main():\n    async with AsyncWebCrawler() as crawler:\n        result = await crawler.arun(\n            url=\"https://example.com/article\",\n            config=config\n        )\n        print(result.markdown)\n\nasyncio.run(main())\n```\n\n### Batch Crawling\n\n```python\nimport asyncio\n\nfrom crawl4ai import AsyncWebCrawler\n\nurls = [\n    \"https://example.com/page1\",\n    \"https://example.com/page2\",\n    \"https://example.com/page3\"\n]\n\nasync def main():\n    async with AsyncWebCrawler() as crawler:\n        results = await crawler.arun_many(urls=urls)\n        for result in 
results:\n            print(f\"URL: {result.url}\")\n            print(f\"Status: {result.status_code}\")\n\nasyncio.run(main())\n```\n\n## Configuration Reference\n\n### CrawlerRunConfig Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `mode` | str | `\"default\"` | Extraction strategy |\n| `headless` | bool | `True` | Run browser in headless mode |\n| `verbose` | bool | `False` | Enable verbose logging |\n| `text_threshold` | int | | Minimum text length filter |\n| `word_count_threshold` | int | | Words per chunk threshold |\n| `skip_download_images` | bool | `False` | Skip image downloads |\n| `page_timeout` | int | `30000` | Page load timeout (ms) |\n| `scroll_delay` | float | `0` | Delay between scrolls |\n\n### Browser Configuration\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `browser_type` | str | Chromium/Firefox/WebKit |\n| `headless` | bool | Headless mode toggle |\n| `proxy` | dict | Proxy configuration |\n| `user_agent` | str | Custom user agent string |\n\n## Project Structure\n\n```\ncrawl4ai/\n├── src/crawl4ai/          # Main package source\n│   ├── core/              # Core crawling engine\n│   ├── extractors/        # Content extraction strategies\n│   ├── formatters/        # Output formatters\n│   └── utils/             # Utility functions\n├── examples/              # Usage examples\n├── tests/                 # Test suite\n├── docs/                  # Documentation\n├── scripts/               # Build and utility scripts\n└── sbom/                  # Software Bill of Materials\n```\n\n## Dependencies and SBOM\n\nCrawl4AI maintains a comprehensive Software Bill of Materials (SBOM) documenting all direct and transitive dependencies. 
This SBOM is generated using CycloneDX format and regenerated on a best-effort basis through automated scripts.\n\n### Regenerating SBOM\n\n```bash\n./scripts/gen-sbom.sh\n```\n\nThe SBOM provides visibility into the project's dependency tree, supporting security audits and license compliance verification.\n\n## Extensibility\n\n### Custom Extractors\n\nExtend the extraction framework by implementing the base extractor interface:\n\n```python\nfrom crawl4ai.extractors import BaseExtractor\n\nclass CustomExtractor(BaseExtractor):\n    async def extract(self, html: str, url: str) -> dict:\n        # Custom extraction logic\n        return {\"content\": html, \"custom_field\": \"value\"}\n```\n\n### Output Formatters\n\nCreate custom output formats by implementing the formatter interface:\n\n```python\nfrom crawl4ai.formatters import BaseFormatter\n\nclass CustomFormatter(BaseFormatter):\n    def format(self, result: CrawlResult) -> str:\n        # Custom formatting logic\n        return custom_string\n```\n\n## Contributing\n\nThe project welcomes contributions from the community. Developers interested in contributing should:\n\n1. Fork the repository\n2. Create a feature branch\n3. Follow the established coding standards\n4. Add tests for new functionality\n5. Submit a pull request with clear documentation\n\n## Resources and Documentation\n\n| Resource | Location |\n|----------|----------|\n| Source Code | [GitHub Repository](https://github.com/unclecode/crawl4ai) |\n| Documentation | `/docs` directory |\n| Examples | `/examples` directory |\n| SBOM | `/sbom` directory |\n| Issue Tracker | GitHub Issues |\n\n## License\n\nCrawl4AI is released under open-source licensing terms. 
Refer to the LICENSE file in the repository for specific terms and conditions.\n\n---\n\n<a id='installation'></a>\n\n## Installation Guide\n\n### 相关页面\n\n相关主题：[Quick Start Guide](#quickstart)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [setup.py](https://github.com/unclecode/crawl4ai/blob/main/setup.py)\n- [requirements.txt](https://github.com/unclecode/crawl4ai/blob/main/requirements.txt)\n- [Dockerfile](https://github.com/unclecode/crawl4ai/blob/main/Dockerfile)\n- [crawl4ai/install.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/install.py)\n</details>\n\n# Installation Guide\n\n## Overview\n\nThis guide covers all supported methods for installing crawl4ai, a powerful web crawling and data extraction framework. The installation process handles automatic setup of core dependencies including Playwright for browser automation, supporting multiple installation scenarios from simple pip installations to Docker containerized deployments.\n\n## System Requirements\n\n### Hardware Requirements\n\n| Component | Minimum | Recommended |\n|-----------|---------|-------------|\n| RAM | 4 GB | 8 GB |\n| Disk Space | 2 GB | 5 GB |\n| CPU | 2 cores | 4+ cores |\n\n### Software Prerequisites\n\n| Requirement | Version | Notes |\n|-------------|---------|-------|\n| Python | >= 3.9 | Tested up to 3.12 |\n| pip | Latest | For pip installations |\n| Docker | 20.10+ | For Docker installations |\n| Chrome/Chromium | Latest | Auto-installed by Playwright |\n\n## Installation Methods\n\n### Method 1: pip Installation (Recommended)\n\nThe simplest and most common installation method uses pip package manager.\n\n```bash\npip install crawl4ai\n```\n\nAfter pip installation, you must run the post-installation setup to configure browser dependencies:\n\n```bash\npython -m crawl4ai install\n```\n\nThis command installs Playwright browsers and configures the necessary system dependencies. 
资料来源：[crawl4ai/install.py:1-50]()\n\n### Method 2: Docker Installation\n\nDocker provides an isolated environment with all dependencies pre-configured.\n\n#### Pulling the Official Image\n\n```bash\ndocker pull unclecode/crawl4ai\n```\n\n#### Running with Docker\n\n```bash\ndocker run -d \\\n  --name crawl4ai \\\n  -p 8000:8000 \\\n  unclecode/crawl4ai\n```\n\n#### Building Custom Docker Image\n\nYou can build your own image using the provided Dockerfile:\n\n```dockerfile\nFROM python:3.11-slim\nWORKDIR /app\nCOPY requirements.txt .\nRUN pip install --no-cache-dir -r requirements.txt\nCOPY . .\nCMD [\"python\", \"-m\", \"crawl4ai\"]\n```\n\n资料来源：[Dockerfile](https://github.com/unclecode/crawl4ai/blob/main/Dockerfile)\n\n### Method 3: Installation from Source\n\nFor development or customization purposes, install from the source repository:\n\n```bash\ngit clone https://github.com/unclecode/crawl4ai.git\ncd crawl4ai\npip install -e .\n```\n\n## Dependencies Management\n\n### Core Dependencies\n\nThe project defines dependencies in `setup.py` for package distribution and `requirements.txt` for development.\n\n**setup.py dependencies:**\n\n| Package | Purpose |\n|---------|---------|\n| playwright | Browser automation |\n| asyncio | Async operations |\n| aiohttp | HTTP client |\n| beautifulsoup4 | HTML parsing |\n| lxml | XML/HTML processing |\n\n资料来源：[setup.py](https://github.com/unclecode/crawl4ai/blob/main/setup.py)\n\n### Runtime Dependency Installation\n\nThe `crawl4ai/install.py` module handles automatic installation of runtime dependencies:\n\n```python\n# Key installation steps in install.py\nimport subprocess\nimport sys\n\ndef install_browsers():\n    subprocess.check_call([\n        sys.executable, \"-m\", \"playwright\", \"install\", \"chromium\"\n    ])\n```\n\n资料来源：[crawl4ai/install.py:20-30]()\n\n### Installing Optional Dependencies\n\n| Extra | Command | Description |\n|-------|---------|-------------|\n| All extras | `pip install crawl4ai[all]` | 
Install all optional packages |\n| Dev tools | `pip install crawl4ai[dev]` | Development dependencies |\n| LLM support | `pip install crawl4ai[llm]` | Language model integration |\n\n## Installation Workflow\n\n```mermaid\ngraph TD\n    A[Start Installation] --> B{Installation Method}\n    B -->|pip| C[Run pip install]\n    B -->|Docker| D[Pull/Build Image]\n    B -->|Source| E[Clone Repository]\n    C --> F[Run post-install]\n    F --> G[Install Playwright Browsers]\n    D --> H[Run Container]\n    E --> I[Install in Editable Mode]\n    I --> F\n    G --> J[Verify Installation]\n    H --> J\n    J --> K{Success?}\n    K -->|Yes| L[Installation Complete]\n    K -->|No| M[Troubleshoot]\n    M --> G\n```\n\n## Post-Installation Verification\n\nVerify that crawl4ai is installed correctly by checking the installation status:\n\n```bash\npython -m crawl4ai --version\n```\n\nOr test the installation programmatically:\n\n```python\nimport crawl4ai\nprint(crawl4ai.__version__)\n```\n\n### Browser Verification\n\nEnsure Playwright browsers are properly installed:\n\n```bash\npython -m playwright install-deps chromium\npython -m playwright install chromium\n```\n\n## Environment Configuration\n\n### Environment Variables\n\n| Variable | Default | Description |\n|----------|---------|-------------|\n| `CRAWL4AI_BROWSER_HEADLESS` | `true` | Run browser in headless mode |\n| `CRAWL4AI_MAX_CONCURRENT` | `5` | Maximum concurrent crawls |\n| `CRAWL4AI_CACHE_DIR` | `~/.crawl4ai/cache` | Cache directory path |\n\n### Configuration File\n\nCreate `~/.crawl4ai/config.json` for persistent configuration:\n\n```json\n{\n  \"browser\": {\n    \"headless\": true,\n    \"viewport\": {\"width\": 1920, \"height\": 1080}\n  },\n  \"cache\": {\n    \"enabled\": true,\n    \"ttl\": 3600\n  }\n}\n```\n\n## Common Installation Issues\n\n### Issue: Playwright Installation Fails\n\n**Solution:** Install system dependencies manually:\n\n```bash\n# Ubuntu/Debian\napt-get install -y libnss3 libnspr4 
libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2\n\n# Then retry browser installation\npython -m playwright install chromium\n```\n\n### Issue: Permission Denied\n\n**Solution:** Use virtual environment or `--user` flag:\n\n```bash\npython -m venv venv\nsource venv/bin/activate  # Linux/Mac\npip install crawl4ai\n```\n\n### Issue: Import Errors After Installation\n\n**Solution:** Verify Python path and reinstall:\n\n```bash\npip uninstall crawl4ai\npip install crawl4ai --force-reinstall\n```\n\n## Docker-Specific Configuration\n\n### Volume Mounts\n\nMount local directories for persistent data:\n\n```bash\ndocker run -v /path/to/data:/data unclecode/crawl4ai\n```\n\n### Network Configuration\n\nFor web crawling behind proxies:\n\n```bash\ndocker run -e HTTP_PROXY=http://proxy:8080 \\\n           -e HTTPS_PROXY=https://proxy:8080 \\\n           unclecode/crawl4ai\n```\n\n## Upgrading crawl4ai\n\n### pip Upgrade\n\n```bash\npip install crawl4ai --upgrade\n```\n\n### Docker Upgrade\n\n```bash\ndocker pull unclecode/crawl4ai\ndocker stop old_container\ndocker rm old_container\ndocker run unclecode/crawl4ai\n```\n\n### Source Upgrade\n\n```bash\ngit pull origin main\npip install -e . --force-reinstall\n```\n\n## Next Steps\n\nAfter successful installation, proceed to:\n\n1. **Quick Start** - Run your first crawl operation\n2. **Configuration Guide** - Customize crawl behavior\n3. **API Reference** - Explore available methods and options\n4. 
**Examples** - Review usage patterns and best practices\n\n---\n\n<a id='quickstart'></a>\n\n## Quick Start Guide\n\n### 相关页面\n\n相关主题：[Async Web Crawler](#async_crawler), [Markdown Generation](#markdown_generation)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [docs/examples/quickstart.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py)\n- [docs/examples/hello_world.py](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world.py)\n- [crawl4ai/__init__.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/__init__.py)\n- [sbom/README.md](https://github.com/unclecode/crawl4ai/blob/main/sbom/README.md)\n</details>\n\n# Quick Start Guide\n\n## Overview\n\nThe **Quick Start Guide** provides developers with a rapid introduction to using crawl4ai for web crawling and data extraction tasks. It serves as the entry point for users who want to begin extracting structured data from websites within minutes of installation.\n\n### Purpose and Scope\n\nThe Quick Start Guide is designed to:\n\n- Demonstrate the simplest possible usage pattern for crawling web pages\n- Show how to extract and structure content from HTML pages\n- Provide copy-paste-ready code examples for immediate experimentation\n- Bridge the gap between installation and production usage\n\n## Installation\n\n### Prerequisites\n\n| Requirement | Description |\n|-------------|-------------|\n| Python | Version 3.10 or higher |\n| pip | Latest version recommended |\n| Browser | Chrome/Chromium (for JavaScript rendering) |\n\n### Installation Command\n\n```bash\npip install crawl4ai\n```\n\n## Basic Usage Pattern\n\nThe fundamental workflow in crawl4ai follows a simple linear pattern:\n\n```mermaid\ngraph TD\n    A[Create AsyncWebCrawler Instance] --> B[Configure Parameters]\n    B --> C[Call arun Method with URL]\n    C --> D[Process Result Object]\n    D --> E[Extract Content/Markdown/HTML]\n```\n\n### Hello World Example\n\nThe simplest 
possible usage demonstrates core functionality:\n\n```python\nimport asyncio\nfrom crawl4ai import AsyncWebCrawler\n\nasync def main():\n    async with AsyncWebCrawler() as crawler:\n        result = await crawler.arun(url=\"https://example.com\")\n        \n        if result.success:\n            print(f\"Content: {result.markdown}\")\n            print(f\"Links found: {len(result.links)}\")\n        else:\n            print(f\"Crawl failed: {result.error_message}\")\n\nasyncio.run(main())\n```\n\n## Core Components\n\n### AsyncWebCrawler\n\nThe primary entry point for all crawling operations:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `verbose` | bool | Enable detailed logging output |\n| `headless` | bool | Run browser in headless mode |\n| `browser_type` | str | Specify browser engine |\n\n资料来源：[crawl4ai/__init__.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/__init__.py)\n\n### CrawlResult Object\n\nThe return value from `crawler.arun()` contains extracted data:\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `success` | bool | Whether crawl completed successfully |\n| `markdown` | str | Extracted content as markdown |\n| `html` | str | Raw HTML content |\n| `links` | dict | Dictionary of internal/external links |\n| `media` | dict | Images, videos, and other media |\n| `error_message` | str | Error details if `success` is False |\n\n## Common Usage Patterns\n\n### Pattern 1: Simple Content Extraction\n\n```python\nimport asyncio\nfrom crawl4ai import AsyncWebCrawler\n\nasync def extract_content():\n    async with AsyncWebCrawler(verbose=True) as crawler:\n        result = await crawler.arun(\n            url=\"https://example.com\",\n            word_count_threshold=10\n        )\n        \n        if result.success:\n            return result.markdown\n        return None\n\ncontent = asyncio.run(extract_content())\n```\n\n### Pattern 2: Batch Crawling\n\n```python\nimport 
asyncio\nfrom crawl4ai import AsyncWebCrawler\n\nasync def crawl_multiple(urls):\n    async with AsyncWebCrawler() as crawler:\n        tasks = [crawler.arun(url=url) for url in urls]\n        results = await asyncio.gather(*tasks)\n        return [r for r in results if r.success]\n\nurls = [\"https://example.com\", \"https://example.org\"]\nsuccessful_results = asyncio.run(crawl_multiple(urls))\n```\n\n## Configuration Options\n\n### Browser Configuration\n\n```python\nfrom crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig\n\nbrowser_config = BrowserConfig(\n    headless=True,\n    verbose=False\n)\n\nrun_config = CrawlerRunConfig(\n    word_count_threshold=10,\n    page_timeout=30000\n)\n\nasync with AsyncWebCrawler(config=browser_config) as crawler:\n    result = await crawler.arun(\n        url=\"https://example.com\",\n        config=run_config\n    )\n```\n\n## Error Handling\n\nAlways check the `success` property before accessing extracted content:\n\n```python\nresult = await crawler.arun(url=\"https://example.com\")\n\nif result.success:\n    process_data(result.markdown)\nelse:\n    log_error(f\"Crawl failed: {result.error_message}\")\n    handle_failure()\n```\n\n## Next Steps\n\nAfter completing the Quick Start Guide, users should explore:\n\n- Advanced extraction strategies with CSS selectors and XPath\n- JavaScript-heavy page crawling\n- Rate limiting and polite crawling practices\n- Integration with AI/LLM pipelines for content analysis\n\n---\n\n## Summary\n\nThe Quick Start Guide provides the essential foundation for using crawl4ai effectively. 
By following the patterns shown, developers can immediately begin extracting structured web content with minimal configuration overhead.\n\n---\n\n<a id='architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[Browser Management](#browser_management), [Async Web Crawler](#async_crawler)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [deploy/docker/ARCHITECTURE.md](https://github.com/unclecode/crawl4ai/blob/main/deploy/docker/ARCHITECTURE.md)\n- [crawl4ai/async_webcrawler.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_webcrawler.py)\n- [crawl4ai/browser_manager.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_manager.py)\n- [crawl4ai/async_dispatcher.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_dispatcher.py)\n- [crawl4ai/models.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/models.py)\n</details>\n\n# System Architecture\n\n## Overview\n\nCrawl4AI is a high-performance web crawling framework designed for AI applications. It enables efficient extraction of web content along with metadata, supporting both single-page crawling and large-scale asynchronous crawling operations. 
The architecture emphasizes separation of concerns between browser management, crawling logic, and result processing.\n\n## Core Components\n\nThe system is built around three primary modules that work in coordination:\n\n| Component | File | Responsibility |\n|-----------|------|----------------|\n| AsyncWebCrawler | `crawl4ai/async_webcrawler.py` | Main entry point for crawling operations |\n| BrowserManager | `crawl4ai/browser_manager.py` | Handles browser lifecycle and page interactions |\n| AsyncDispatcher | `crawl4ai/async_dispatcher.py` | Manages concurrent crawling tasks |\n\n## Component Architecture\n\n```mermaid\ngraph TD\n    A[User / API Client] --> B[AsyncWebCrawler]\n    B --> C[BrowserManager]\n    B --> D[AsyncDispatcher]\n    C --> E[Browser Instance]\n    D --> F[Task Queue]\n    F --> E\n    E --> G[Content Extraction]\n    G --> H[Result Models]\n    H --> B\n```\n\n## AsyncWebCrawler\n\nThe `AsyncWebCrawler` class serves as the primary interface for initiating crawl operations. 
It accepts configuration parameters and coordinates the crawling workflow.\n\n### Key Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| config | CrawlerRunConfig | Configuration for the crawl session |\n| browser_manager | BrowserManager | Shared browser manager instance |\n| dispatcher | AsyncDispatcher | Task dispatcher for async operations |\n\n资料来源：[crawl4ai/async_webcrawler.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_webcrawler.py)\n\n### Workflow\n\n```mermaid\ngraph LR\n    A[Initialize Crawler] --> B[Configure Browser]\n    B --> C[Create BrowserContext]\n    C --> D[Navigate to URL]\n    D --> E[Extract Content]\n    E --> F[Return CrawlResult]\n```\n\n## BrowserManager\n\nThe `BrowserManager` handles the lifecycle of browser instances, managing Chrome/Chromium processes and providing isolated contexts for crawling sessions.\n\n资料来源：[crawl4ai/browser_manager.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_manager.py)\n\n### Browser Lifecycle\n\n```mermaid\ngraph TD\n    A[Launch Browser] --> B[Create Context]\n    B --> C[Create Page]\n    C --> D[Execute Crawl]\n    D --> E[Close Context]\n    E --> F[Repeat or Shutdown]\n    F --> A\n```\n\n## AsyncDispatcher\n\nThe `AsyncDispatcher` enables concurrent crawling operations, managing task queues and coordinating multiple browser contexts for parallel extraction.\n\n资料来源：[crawl4ai/async_dispatcher.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_dispatcher.py)\n\n### Parallel Execution Model\n\n```mermaid\ngraph TD\n    A[URL List] --> B[Dispatcher Queue]\n    B --> C[Worker 1]\n    B --> D[Worker 2]\n    B --> E[Worker N]\n    C --> F[Results Aggregator]\n    D --> F\n    E --> F\n    F --> G[Combined Output]\n```\n\n## Data Models\n\nResults from crawling operations are structured using Pydantic models defined in 
`models.py`.\n\n资料来源：[crawl4ai/models.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/models.py)\n\n| Model | Purpose |\n|-------|---------|\n| CrawlResult | Container for extracted content and metadata |\n| CrawlerRunConfig | Configuration parameters for crawl sessions |\n\n## Docker Deployment Architecture\n\nThe project includes Docker deployment specifications that containerize the crawling infrastructure.\n\n```mermaid\ngraph TD\n    A[Docker Compose] --> B[Crawl4AI Container]\n    A --> C[Redis Cache]\n    A --> D[Chrome Browser]\n    B --> C\n    B --> D\n```\n\n资料来源：[deploy/docker/ARCHITECTURE.md](https://github.com/unclecode/crawl4ai/blob/main/deploy/docker/ARCHITECTURE.md)\n\n## Technology Stack\n\n| Layer | Technology |\n|-------|------------|\n| Runtime | Python 3.10+ |\n| Browser Engine | Chrome/Chromium via Playwright |\n| Async Framework | asyncio |\n| Data Validation | Pydantic |\n| Containerization | Docker |\n\n## Configuration\n\nThe system supports extensive configuration options through `CrawlerRunConfig`, including:\n\n- JavaScript execution toggles\n- Memory management settings\n- Request throttling parameters\n- Content extraction strategies\n\n## Dependency Management\n\nThe project maintains a Software Bill of Materials (SBOM) for tracking dependencies and ensuring reproducible builds.\n\n资料来源：[sbom/README.md](https://github.com/unclecode/crawl4ai/blob/main/sbom/README.md)\n\nTo regenerate the SBOM:\n\n```bash\n./scripts/gen-sbom.sh\n```\n\n---\n\n<a id='browser_management'></a>\n\n## Browser Management\n\n### 相关页面\n\n相关主题：[Anti-Bot Detection and Proxy Management](#anti_bot_detection)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/browser_manager.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_manager.py)\n- [crawl4ai/browser_adapter.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_adapter.py)\n- 
[crawl4ai/browser_profiler.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_profiler.py)\n- [crawl4ai/js_snippet](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/js_snippet)\n- [docs/codebase/browser.md](https://github.com/unclecode/crawl4ai/blob/main/docs/codebase/browser.md)\n</details>\n\n# Browser Management\n\n## Overview\n\nBrowser Management in crawl4ai provides a comprehensive abstraction layer for controlling and orchestrating browser instances used during web crawling and scraping operations. The system abstracts the complexity of browser automation, allowing users to focus on data extraction rather than browser lifecycle management.\n\n## Architecture Overview\n\nThe browser management system follows a modular architecture with distinct components that handle specific responsibilities:\n\n```mermaid\ngraph TD\n    A[BrowserManager] --> B[BrowserAdapter]\n    A --> C[BrowserProfiler]\n    B --> D[Playwright/Chromium]\n    C --> E[JS Snippets]\n    F[User Request] --> A\n    A --> G[Crawled Result]\n```\n\n## Core Components\n\n### BrowserManager\n\nThe central orchestrator responsible for:\n\n- Browser instance lifecycle (creation, configuration, teardown)\n- Session management and isolation\n- Resource allocation and cleanup\n- Coordination between adapters and profilers\n\n**Key Responsibilities:**\n\n| Responsibility | Description |\n|----------------|-------------|\n| Instance Creation | Creates and initializes browser contexts |\n| Configuration | Applies user-defined browser settings |\n| Lifecycle Control | Manages startup and shutdown sequences |\n| Pool Management | Handles browser pool for concurrent operations |\n\n资料来源：[crawl4ai/browser_manager.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_manager.py)\n\n### BrowserAdapter\n\nThe adapter pattern implementation that provides a consistent interface for interacting with different browser engines (Playwright, Chromium, Firefox, 
WebKit).\n\n**Adapter Features:**\n\n| Feature | Description |\n|---------|-------------|\n| Engine Abstraction | Unified API across browser backends |\n| Command Translation | Converts high-level commands to browser-specific instructions |\n| Response Normalization | Standardizes browser responses |\n\n资料来源：[crawl4ai/browser_adapter.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_adapter.py)\n\n### BrowserProfiler\n\nHandles JavaScript injection and performance profiling during browser operations.\n\n**Profiler Capabilities:**\n\n| Capability | Purpose |\n|------------|---------|\n| JS Injection | Execute custom JavaScript in page context |\n| Performance Tracking | Monitor page load and execution metrics |\n| Resource Profiling | Track network requests and responses |\n\n资料来源：[crawl4ai/browser_profiler.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_profiler.py)\n\n## JavaScript Integration\n\nThe `js_snippet` module provides pre-built JavaScript utilities for browser automation tasks:\n\n```mermaid\ngraph LR\n    A[Browser Context] --> B[js_snippet Module]\n    B --> C[DOM Manipulation]\n    B --> D[Data Extraction]\n    B --> E[Event Handling]\n```\n\n**Common JS Snippet Categories:**\n\n- DOM traversal and manipulation\n- Content extraction\n- Scroll management\n- Wait conditions\n- Network request interception\n\n资料来源：[crawl4ai/js_snippet](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/js_snippet)\n\n## Configuration Options\n\n### Browser Launch Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| headless | bool | true | Run browser in headless mode |\n| args | list | [] | Additional browser arguments |\n| timeout | int | 30000 | Navigation timeout in milliseconds |\n| viewport | dict | {\"width\": 1920, \"height\": 1080} | Browser viewport dimensions |\n| user_agent | str | None | Custom user agent string |\n| proxy | dict | None | Proxy configuration 
|\n\n### Context Options\n\n| Option | Type | Description |\n|--------|------|-------------|\n| java_script_enabled | bool | Enable/disable JavaScript in the browser context |\n| ignore_https_errors | bool | Ignore SSL certificate errors |\n\n资料来源：[docs/codebase/browser.md](https://github.com/unclecode/crawl4ai/blob/main/docs/codebase/browser.md)\n\n## Browser Lifecycle\n\n```mermaid\nstateDiagram-v2\n    [*] --> Initializing: Create BrowserManager\n    Initializing --> Launching: Launch Browser\n    Launching --> Ready: Browser Context Created\n    Ready --> Navigating: Load URL\n    Navigating --> Ready: Page Loaded\n    Ready --> Executing: Run JS/Commands\n    Executing --> Ready: Commands Complete\n    Ready --> Closing: Shutdown Request\n    Closing --> [*]: Resources Freed\n```\n\n## Usage Patterns\n\n### Basic Browser Usage\n\n```python\nfrom crawl4ai import BrowserManager\n\n# Initialize browser manager\nbrowser_mgr = BrowserManager(\n    headless=True,\n    viewport={\"width\": 1920, \"height\": 1080}\n)\n\n# Create browser context\ncontext = browser_mgr.new_context()\n\n# Use context for crawling\nresult = await context.goto(\"https://example.com\")\n```\n\n### Advanced Configuration\n\n```python\nbrowser_mgr = BrowserManager(\n    headless=False,\n    args=[\n        \"--disable-blink-features=AutomationControlled\",\n        \"--disable-dev-shm-usage\"\n    ],\n    timeout=60000,\n    user_agent=\"Custom User Agent\"\n)\n```\n\n## Session Management\n\nThe system supports multiple concurrent sessions through isolated browser contexts:\n\n```mermaid\ngraph TD\n    A[BrowserManager] --> B1[Session 1 Context]\n    A --> B2[Session 2 Context]\n    A --> B3[Session N Context]\n    B1 --> C1[Page 1]\n    B2 --> C2[Page 2]\n    B3 --> C3[Page N]\n```\n\n## Error Handling\n\nThe browser management system implements comprehensive error handling:\n\n| Error Type | Handling Strategy 
|\n|------------|-------------------|\n| Navigation Timeout | Retry with exponential backoff |\n| Browser Crash | Automatic restart and context recreation |\n| Resource Exhaustion | Automatic cleanup of stale contexts |\n| Network Errors | Graceful degradation with cached content |\n\n## Performance Considerations\n\n### Optimization Strategies\n\n1. **Context Reuse**: Reuse browser contexts for multiple pages when possible\n2. **Lazy Loading**: Only load resources when explicitly requested\n3. **Resource Limits**: Configure memory and CPU limits per context\n4. **Connection Pooling**: Maintain warm browser instances for rapid access\n\n### Memory Management\n\n| Strategy | Description |\n|----------|-------------|\n| Context Isolation | Each session runs in isolated context |\n| Automatic Cleanup | Temporary files and caches cleared automatically |\n| Resource Limits | Configurable memory caps per browser instance |\n\n## Related Documentation\n\n- [Browser Adapter API Reference](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/browser_adapter.py)\n- [JavaScript Snippets Library](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/js_snippet)\n- [Codebase Overview](https://github.com/unclecode/crawl4ai/blob/main/docs/codebase/browser.md)\n\n---\n\n<a id='async_crawler'></a>\n\n## Async Web Crawler\n\n### 相关页面\n\n相关主题：[Markdown Generation](#markdown_generation), [Extraction Strategies](#extraction_strategies)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/async_webcrawler.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_webcrawler.py)\n- [crawl4ai/async_configs.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_configs.py)\n- [crawl4ai/cache_context.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/cache_context.py)\n- [crawl4ai/types.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/types.py)\n- 
[docs/md_v2/api/async-webcrawler.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/api/async-webcrawler.md)\n</details>\n\n# Async Web Crawler\n\n## Overview\n\nThe Async Web Crawler is the core component of crawl4ai, providing an asynchronous, high-performance web crawling engine built on Python's `asyncio` framework. It enables concurrent crawling of multiple URLs with built-in caching, configurable extraction strategies, and comprehensive result handling.\n\nThe primary purpose of this module is to fetch web pages, extract meaningful content, and return structured results that include HTML, markdown, media assets, metadata, and optional AI-generated summaries. The async design allows for efficient I/O-bound operations, making it suitable for large-scale web scraping projects.\n\n资料来源：[crawl4ai/async_webcrawler.py:1-50]()\n\n## Architecture\n\n### System Components\n\nThe async web crawler system consists of several interconnected components that work together to provide a seamless crawling experience.\n\n```mermaid\ngraph TD\n    A[AsyncWebCrawler] --> B[Browser Manager]\n    A --> C[Cache Layer]\n    A --> D[Extraction Strategy]\n    A --> E[Result Processor]\n    \n    B --> F[Playwright/Chromium]\n    C --> G[File System Cache]\n    C --> H[Memory Cache]\n    \n    D --> I[LLM-based Extraction]\n    D --> J[CSS/XPath Extraction]\n    \n    E --> K[CrawlResult]\n    E --> L[Raw HTML]\n    E --> M[Markdown]\n```\n\n资料来源：[crawl4ai/async_webcrawler.py:1-30]()\n\n### Core Classes\n\n| Class | File | Purpose |\n|-------|------|---------|\n| `AsyncWebCrawler` | async_webcrawler.py | Main crawler entry point with `arun()` and `arun_many()` methods |\n| `BrowserConfig` | async_configs.py | Configuration for headless browser behavior |\n| `CrawlCache` | cache_context.py | Manages caching strategies for crawled content |\n| `CrawlResult` | types.py | Data model for returning crawl results |\n\n资料来源：[crawl4ai/async_configs.py:1-30]()\n\n## AsyncWebCrawler 
Class\n\n### Initialization\n\nThe `AsyncWebCrawler` class can be initialized with optional configuration parameters:\n\n```python\nclass AsyncWebCrawler:\n    def __init__(\n        self,\n        config: BrowserConfig | None = None,\n        verbose: bool = False\n    ) -> None:\n```\n\n**Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `config` | `BrowserConfig` | `None` | Browser configuration object |\n| `verbose` | `bool` | `False` | Enable verbose logging output |\n\n资料来源：[crawl4ai/async_webcrawler.py:50-70]()\n\n### Core Methods\n\n#### `arun()`\n\nSingle URL crawling with comprehensive result extraction:\n\n```python\nasync def arun(\n    self,\n    url: str,\n    config: BrowserConfig | None = None,\n    **kwargs\n) -> CrawlResult\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `url` | `str` | Target URL to crawl |\n| `config` | `BrowserConfig` | Override browser configuration |\n\n资料来源：[crawl4ai/async_webcrawler.py:100-150]()\n\n#### `arun_many()`\n\nBatch crawling for multiple URLs concurrently:\n\n```python\nasync def arun_many(\n    self,\n    urls: list[str],\n    config: BrowserConfig | None = None,\n    **kwargs\n) -> list[CrawlResult]\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `urls` | `list[str]` | List of target URLs |\n| `config` | `BrowserConfig` | Shared configuration for all URLs |\n\n资料来源：[crawl4ai/async_webcrawler.py:200-250]()\n\n### Context Manager Support\n\nThe `AsyncWebCrawler` implements the async context manager protocol for proper resource cleanup:\n\n```python\nasync def __aenter__(self) -> \"AsyncWebCrawler\":\n    await self.start()\n    return self\n\nasync def __aexit__(\n    self,\n    exc_type, exc_val, exc_tb\n) -> None:\n    await self.close()\n```\n\n资料来源：[crawl4ai/async_webcrawler.py:80-100]()\n\n## CrawlResult Data Model\n\nThe `CrawlResult` class 
encapsulates all information retrieved from a crawled page:\n\n```mermaid\nclassDiagram\n    class CrawlResult {\n        +str url\n        +str html\n        +str markdown\n        +list~MediaItem~ media\n        +list~Link~ links\n        +dict metadata\n        +bool success\n        +str|None error\n        +dict~str, Any~ extracted_content\n        +int status_code\n        +datetime created_at\n    }\n```\n\n资料来源：[crawl4ai/types.py:1-80]()\n\n### Properties\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `url` | `str` | Original request URL |\n| `html` | `str` | Raw HTML content |\n| `markdown` | `str` | Converted markdown content |\n| `media` | `list[MediaItem]` | Extracted images, videos, audio |\n| `links` | `list[Link]` | Internal and external links |\n| `metadata` | `dict` | Page metadata (title, description) |\n| `success` | `bool` | Whether the crawl completed successfully |\n| `error` | `str \\| None` | Error message if failed |\n| `status_code` | `int` | HTTP response status code |\n| `created_at` | `datetime` | Timestamp of crawl operation |\n\n资料来源：[crawl4ai/types.py:50-100]()\n\n## Configuration\n\n### BrowserConfig\n\nThe `BrowserConfig` class provides fine-grained control over browser behavior:\n\n```python\n@dataclass\nclass BrowserConfig:\n    headless: bool = True\n    browser_type: str = \"chromium\"\n    viewport_size: dict = {\"width\": 1920, \"height\": 1080}\n    user_agent: str | None = None\n    verbose: bool = False\n```\n\n资料来源：[crawl4ai/async_configs.py:30-80]()\n\n### Configuration Options\n\n| Option | Type | Default | Description |\n|--------|------|---------|-------------|\n| `headless` | `bool` | `True` | Run browser in headless mode |\n| `browser_type` | `str` | `\"chromium\"` | Browser engine (chromium, firefox, webkit) |\n| `viewport_size.width` | `int` | `1920` | Viewport width in pixels |\n| `viewport_size.height` | `int` | `1080` | Viewport height in pixels |\n| `user_agent` | `str \\| None` | `None` 
| Custom user agent string |\n| `verbose` | `bool` | `False` | Enable debug output |\n\n资料来源：[crawl4ai/async_configs.py:40-90]()\n\n### Advanced Configuration\n\nAdditional crawling parameters can be passed via kwargs:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `word_count_threshold` | `int` | Minimum word count for content extraction |\n| `extraction_strategy` | `ExtractionStrategy` | Strategy for content extraction |\n| `cache_mode` | `CacheMode` | Caching behavior (enabled/disabled/bypass/read-only) |\n| `js_enabled` | `bool` | Enable JavaScript execution |\n| `wait_for` | `str` | CSS selector to wait for before returning |\n| `delay_before_return_html` | `float` | Delay in seconds before capturing HTML |\n\n资料来源：[docs/md_v2/api/async-webcrawler.md:1-60]()\n\n## Caching System\n\n### Cache Modes\n\nThe crawl4ai framework implements a multi-layered caching strategy:\n\n```mermaid\ngraph LR\n    A[Request] --> B{Memory Cache}\n    B -->|Hit| C[Return Cached]\n    B -->|Miss| D{File System Cache}\n    D -->|Hit| E[Return Cached]\n    D -->|Miss| F[Fetch Remote]\n    F --> G[Store in Both Layers]\n```\n\n资料来源：[crawl4ai/cache_context.py:1-50]()\n\n### CacheMode Enum\n\n| Mode | Description |\n|------|-------------|\n| `ENABLED` | Use cache if available, otherwise fetch and cache |\n| `DISABLED` | Always fetch fresh content, bypass cache |\n| `BYPASS` | Fetch and update cache but don't read from it |\n| `READ_ONLY` | Only read from cache, never fetch |\n\n资料来源：[crawl4ai/cache_context.py:30-60]()\n\n### Cache Context Manager\n\n```python\nasync with cache_context(cache_mode=CacheMode.ENABLED):\n    result = await crawler.arun(url=\"https://example.com\")\n```\n\n资料来源：[crawl4ai/cache_context.py:60-90]()\n\n## Usage Examples\n\n### Basic Single URL Crawl\n\n```python\nimport asyncio\nfrom crawl4ai import AsyncWebCrawler, BrowserConfig\n\nasync def main():\n    async with AsyncWebCrawler(verbose=True) as crawler:\n        result = await crawler.arun(\n            
url=\"https://example.com\",\n            config=BrowserConfig(headless=True)\n        )\n        \n        print(f\"Success: {result.success}\")\n        print(f\"Markdown content: {result.markdown[:500]}\")\n\nasyncio.run(main())\n```\n\n资料来源：[crawl4ai/async_webcrawler.py:150-200]()\n\n### Batch Crawling\n\n```python\nimport asyncio\nfrom crawl4ai import AsyncWebCrawler\n\nasync def main():\n    urls = [\n        \"https://example.com/page1\",\n        \"https://example.com/page2\",\n        \"https://example.com/page3\"\n    ]\n    \n    async with AsyncWebCrawler() as crawler:\n        results = await crawler.arun_many(urls=urls)\n        \n        for result in results:\n            print(f\"URL: {result.url}, Status: {result.status_code}\")\n\nasyncio.run(main())\n```\n\n资料来源：[docs/md_v2/api/async-webcrawler.md:60-100]()\n\n### With Custom Extraction Strategy\n\n```python\nimport asyncio\nfrom crawl4ai import AsyncWebCrawler, BrowserConfig\nfrom crawl4ai.extraction_strategy import LLMExtractionStrategy\n\nasync def main():\n    config = BrowserConfig(\n        headless=True,\n        verbose=True\n    )\n    \n    strategy = LLMExtractionStrategy(\n        provider=\"openai/gpt-4\",\n        api_token=\"your-token\"\n    )\n    \n    async with AsyncWebCrawler(config=config) as crawler:\n        result = await crawler.arun(\n            url=\"https://news-site.com/article\",\n            extraction_strategy=strategy\n        )\n        \n        print(result.extracted_content)\n\nasyncio.run(main())\n```\n\n资料来源：[crawl4ai/async_configs.py:90-130]()\n\n## Error Handling\n\nThe crawler returns comprehensive error information through the `CrawlResult` object:\n\n```python\nresult = await crawler.arun(url=\"https://invalid-url.xyz\")\n\nif not result.success:\n    print(f\"Error: {result.error}\")\n    print(f\"Status Code: {result.status_code}\")\n```\n\n| Error Scenario | `success` Value | `error` Field |\n|----------------|-----------------|---------------|\n| 
Network timeout | `False` | Connection timeout message |\n| Invalid URL | `False` | URL validation error |\n| JavaScript error | `False` | Browser console error |\n| HTTP 404/500 | `False` | HTTP status message |\n| Success | `True` | `None` |\n\n资料来源：[crawl4ai/types.py:80-120]()\n\n## Performance Considerations\n\n### Async Benefits\n\nThe async architecture provides several performance advantages:\n\n1. **Concurrent Requests**: Multiple URLs can be crawled simultaneously\n2. **Non-blocking I/O**: Browser operations don't block other tasks\n3. **Resource Efficiency**: Single event loop manages all crawling tasks\n\n### Best Practices\n\n| Practice | Benefit |\n|----------|---------|\n| Use `arun_many()` for batch operations | Reduces connection overhead |\n| Enable caching for repeated URLs | Avoids redundant network requests |\n| Set appropriate `word_count_threshold` | Reduces unnecessary processing |\n| Use `headless=True` in production | Reduces memory usage |\n\n资料来源：[crawl4ai/async_webcrawler.py:250-300]()\n\n## Related Components\n\n| Component | File | Relationship |\n|-----------|------|--------------|\n| `ExtractionStrategy` | extraction_strategy.py | Defines how content is extracted |\n| `MediaItem` | types.py | Represents extracted media |\n| `Link` | types.py | Represents extracted links |\n| `CacheBackend` | cache_backend.py | Abstract cache implementation |\n\n资料来源：[crawl4ai/types.py:1-30]()\n\n## Summary\n\nThe Async Web Crawler is the foundational building block of crawl4ai, providing:\n\n- **Asynchronous operation** for high-performance concurrent crawling\n- **Flexible configuration** via `BrowserConfig` dataclass\n- **Comprehensive result types** through `CrawlResult` model\n- **Multi-layered caching** with configurable modes\n- **Extensible extraction** via pluggable strategies\n- **Production-ready error handling** with detailed error reporting\n\nThis architecture enables developers to build scalable web scraping solutions while maintaining 
clean, readable code patterns familiar to Python async developers.\n\n---\n\n<a id='markdown_generation'></a>\n\n## Markdown Generation\n\n### 相关页面\n\n相关主题：[Extraction Strategies](#extraction_strategies), [Async Web Crawler](#async_crawler)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/markdown_generation_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/markdown_generation_strategy.py)\n- [crawl4ai/content_filter_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/content_filter_strategy.py)\n- [crawl4ai/html2text](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/html2text)\n- [docs/md_v2/core/markdown-generation.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/core/markdown-generation.md)\n- [crawl4ai/extraction_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n</details>\n\n# Markdown Generation\n\nMarkdown Generation is a core feature in crawl4ai that transforms raw HTML content into clean, readable Markdown format. This system provides flexible strategies for content extraction, filtering, and conversion with extensive customization options.\n\n## Overview\n\nThe Markdown Generation system converts web page HTML into structured Markdown text suitable for LLM consumption, RAG systems, or documentation purposes. 
It offers multiple generation strategies, content filtering capabilities, and fine-grained control over extraction behavior.\n\n资料来源：[docs/md_v2/core/markdown-generation.md:1-15]()\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[HTML Input] --> B[Content Filter Strategy]\n    B --> C[HTML Processing]\n    C --> D[Markdown Generation Strategy]\n    D --> E[Markdown Output]\n    \n    F[Configuration] --> B\n    F --> D\n    \n    G[BestProvider] --> B\n    G --> D\n```\n\n## Core Components\n\n### MarkdownGenerationStrategy\n\nThe primary abstraction for generating Markdown from HTML content.\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `provider` | `str` | `\"best\"` | Content extraction provider |\n| `configs` | `dict` | `{}` | Provider-specific configurations |\n| `strict` | `bool` | `False` | Raise errors on failure |\n| `override_system_prompt` | `str` | `None` | Custom system prompt |\n| `override_user_prompt` | `str` | `None` | Custom user prompt |\n\n资料来源：[crawl4ai/markdown_generation_strategy.py:1-50]()\n\n### ContentFilterStrategy\n\nAbstract base class for filtering and selecting content before Markdown conversion.\n\n| Filter | Description |\n|--------|-------------|\n| `PruningContentFilter` | Removes low-value content nodes |\n| `BM25ContentFilter` | Uses BM25 ranking for content selection |\n| `OrgAnnContentFilter` | Organic annotation-based filtering |\n\n资料来源：[crawl4ai/content_filter_strategy.py:1-100]()\n\n## Generation Providers\n\n### Available Providers\n\n| Provider | Description |\n|----------|-------------|\n| `best` | Automatically selects optimal provider |\n| `playwright` | Uses Playwright for JavaScript rendering |\n| `curl` | Lightweight extraction via curl |\n| `trafilatura` | Trafilatura library extraction |\n| `lxml` | LXML-based HTML parsing |\n| `readability` | Mozilla Readability algorithm |\n\n资料来源：[crawl4ai/markdown_generation_strategy.py:50-150]()\n\n### Best Provider 
Selection\n\nThe `BestProvider` class intelligently selects the most appropriate extraction method based on content characteristics.\n\n```python\nclass BestProvider:\n    def get_strategy(self, html: str) -> MarkdownGenerationStrategy:\n        # Analyzes HTML and selects optimal provider\n        pass\n```\n\n资料来源：[crawl4ai/markdown_generation_strategy.py:150-200]()\n\n## Workflow\n\n```mermaid\ngraph LR\n    A[Fetch HTML] --> B{Content Filter Enabled?}\n    B -->|Yes| C[Apply Filter Strategy]\n    B -->|No| D[Skip Filtering]\n    C --> E[Generate Markdown]\n    D --> E\n    E --> F{Post-Processing?}\n    F -->|Yes| G[Apply Custom Rules]\n    F -->|No| H[Return Result]\n    G --> H\n```\n\n## Configuration Options\n\n### Generator Config\n\n```python\n{\n    \"word_threshold\": 50,          # Minimum words per chunk\n    \"language\": \"en\",              # Content language\n    \"skip_internal_links\": True,   # Ignore internal links\n    \"content_type\": \"markdown\"     # Output format\n}\n```\n\n### BM25 Filter Config\n\n```python\n{\n    \"query\": \"relevant keywords\",\n    \"top_n\": 5,                    # Number of chunks\n    \"use_stem\": True               # Apply stemming\n}\n```\n\n资料来源：[crawl4ai/content_filter_strategy.py:100-180]()\n\n## HTML to Markdown Conversion\n\n### html2text Module\n\nThe `html2text` submodule handles low-level HTML to Markdown conversion.\n\n| Method | Purpose |\n|--------|---------|\n| `handle_anchor` | Convert `<a>` tags to `[text](url)` |\n| `handle_image` | Convert `<img>` to `![alt](src)` |\n| `handle_heading` | Convert `<h1>`-`<h6>` to `#` - `######` |\n| `handle_table` | Convert `<table>` to Markdown tables |\n| `handle_code` | Preserve `<code>` and `<pre>` formatting |\n\n资料来源：[crawl4ai/html2text:core.py:1-100]()\n\n### Conversion Features\n\n- **Link Preservation**: External links converted to Markdown format with titles\n- **Image Extraction**: Images extracted with alt text and sources\n- **Table Conversion**: 
HTML tables converted to GFM tables\n- **Code Block Handling**: Syntax-aware code block extraction\n- **List Recognition**: Ordered and unordered lists properly formatted\n\n## Usage Examples\n\n### Basic Usage\n\n```python\nfrom crawl4ai import AsyncWebCrawler\n\nasync with AsyncWebCrawler() as crawler:\n    result = await crawler.arun(\n        url=\"https://example.com\",\n        markdown_generator={\n            \"provider\": \"best\",\n            \"configs\": {\"word_threshold\": 100}\n        }\n    )\n    print(result.markdown)\n```\n\n### With Content Filter\n\n```python\nfrom crawl4ai import AsyncWebCrawler\nfrom crawl4ai.content_filter_strategy import BM25ContentFilter\n\nfilter_strategy = BM25ContentFilter(\n    query=\"getting started installation\",\n    top_n=10\n)\n\nasync with AsyncWebCrawler() as crawler:\n    result = await crawler.arun(\n        url=\"https://docs.example.com\",\n        markdown_generator={\n            \"provider\": \"playwright\",\n            \"filter_strategy\": filter_strategy\n        }\n    )\n```\n\n## Advanced Configuration\n\n### Custom Prompts\n\nOverride system and user prompts for specialized extraction:\n\n```python\nmarkdown_generator = {\n    \"override_system_prompt\": \"Extract only technical documentation...\",\n    \"override_user_prompt\": \"Focus on API endpoints and code examples...\"\n}\n```\n\n### Strict Mode\n\nEnable strict mode to raise exceptions on extraction failures:\n\n```python\nmarkdown_generator = {\n    \"strict\": True,\n    \"provider\": \"playwright\"\n}\n```\n\n## Performance Considerations\n\n| Aspect | Recommendation |\n|--------|----------------|\n| Large Pages | Use `BM25ContentFilter` to reduce content |\n| JavaScript-heavy Sites | Use `playwright` provider |\n| Simple Pages | Use `lxml` or `trafilatura` for speed |\n| Batch Processing | Set appropriate `word_threshold` |\n\n## Error Handling\n\nThe system provides graceful degradation:\n\n1. 
**Provider Fallback**: Falls back to alternative provider on failure\n2. **Strict Mode**: Raises exceptions when enabled\n3. **Partial Results**: Returns available content on partial failures\n\n资料来源：[crawl4ai/extraction_strategy.py:50-120]()\n\n## Related Components\n\n- **Chunking**: Content can be further processed with text chunking strategies\n- **Extraction**: Works alongside extraction strategies for structured data\n- **Cache**: Generated Markdown can be cached for repeated access\n\n---\n\n<a id='extraction_strategies'></a>\n\n## Extraction Strategies\n\n### 相关页面\n\n相关主题：[Markdown Generation](#markdown_generation), [Deep Crawling Strategies](#deep_crawling)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/extraction_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n- [crawl4ai/chunking_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/chunking_strategy.py)\n- [crawl4ai/content_scraping_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/content_scraping_strategy.py)\n- [crawl4ai/table_extraction.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/table_extraction.py)\n- [docs/md_v2/extraction/llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/llm-strategies.md)\n- [docs/md_v2/extraction/no-llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/no-llm-strategies.md)\n</details>\n\n# Extraction Strategies\n\nExtraction Strategies in crawl4ai define how content is parsed, structured, and extracted from crawled web pages. They form the core abstraction layer that determines whether unstructured HTML becomes meaningful, machine-readable data.\n\n## Overview\n\nExtraction Strategies handle the transformation pipeline from raw HTML to structured output. 
The system supports two primary categories:\n\n| Category | Use Case | Performance |\n|----------|----------|-------------|\n| LLM-based | Complex, semantic extraction | Slower, higher accuracy |\n| No-LLM | Fast, pattern-based extraction | Faster, rule-dependent |\n\n资料来源：[crawl4ai/extraction_strategy.py:1-50](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[HTML Content] --> B[Content Scraping Strategy]\n    B --> C[Chunking Strategy]\n    C --> D{Extraction Strategy}\n    D -->|LLM-based| E[LLM Strategy]\n    D -->|No-LLM| F[No-LLM Strategy]\n    E --> G[Structured JSON/Markdown]\n    F --> G\n    G --> H[Table Extraction Optional]\n    H --> I[Final Output]\n```\n\nThe extraction pipeline flows through scraping → chunking → extraction, with table extraction as an optional final step.\n\n资料来源：[crawl4ai/extraction_strategy.py:50-100](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n## LLM-Based Strategies\n\nLLM strategies leverage large language models for semantic understanding and intelligent content extraction.\n\n### Supported Providers\n\n| Provider | Model Support | Configuration |\n|----------|--------------|---------------|\n| OpenAI | GPT-4, GPT-3.5 | `OPENAI_API_KEY` |\n| Anthropic | Claude 3, Claude 2 | `ANTHROPIC_API_KEY` |\n| Azure OpenAI | Custom deployments | `AZURE_API_KEY`, `AZURE_API_BASE` |\n| Ollama | Local models | `OLLAMA_BASE_URL` |\n\n### Configuration Parameters\n\n```python\nclass LLMExtractionStrategy:\n    def __init__(\n        self,\n        provider: str = \"openai\",\n        model: str = \"gpt-4\",\n        api_token: Optional[str] = None,\n        system_prompt: Optional[str] = None,\n        user_prompt: Optional[str] = None,\n        extraction_type: str = \"block\",\n        input_format: str = \"html\",\n        instruction: Optional[str] = None\n    
)\n```\n\n资料来源：[docs/md_v2/extraction/llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/llm-strategies.md)\n\n### Extraction Types\n\n| Type | Description | Best For |\n|------|-------------|----------|\n| `block` | Block-level extraction | Paragraphs, sections |\n| `schema` | Schema-based extraction | Structured data, forms |\n| `custom` | Custom instructions | Specific extraction needs |\n\n资料来源：[crawl4ai/extraction_strategy.py:100-150](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n## No-LLM Strategies\n\nNo-LLM strategies provide fast, deterministic extraction without external API dependencies.\n\n### Available Strategies\n\n| Strategy | Purpose |\n|----------|---------|\n| `NoExtractionStrategy` | Pass-through, no extraction |\n| `JsonCssExtractionStrategy` | CSS selector-based JSON extraction |\n| `RegexExtractionStrategy` | Regex pattern matching |\n| `XPathExtractionStrategy` | XPath-based extraction |\n\n资料来源：[docs/md_v2/extraction/no-llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/no-llm-strategies.md)\n\n### JsonCssExtractionStrategy\n\n```python\nfrom crawl4ai import JsonCssExtractionStrategy\n\nstrategy = JsonCssExtractionStrategy(\n    schema={\n        \"name\": \"ProductList\",\n        \"baseSelector\": \"div.product\",\n        \"fields\": [\n            {\"name\": \"title\", \"selector\": \"h2.title\", \"type\": \"text\"},\n            {\"name\": \"price\", \"selector\": \"span.price\", \"type\": \"text\"},\n            {\"name\": \"image\", \"selector\": \"img\", \"attribute\": \"src\"}\n        ]\n    }\n)\n```\n\n资料来源：[crawl4ai/extraction_strategy.py:150-200](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n## Chunking Strategies\n\nChunking strategies split content into manageable pieces before extraction.\n\n### Default Chunking Behavior\n\n```mermaid\ngraph LR\n    A[Large Content] --> 
B[Character Split]\n    B --> C[Overlap Application]\n    C --> D[Token Count Check]\n    D -->|Under limit| E[Chunk Ready]\n    D -->|Over limit| F[Recursive Split]\n    F --> E\n```\n\n### Configuration Options\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `chunk_token_size` | int | 1000 | Target tokens per chunk |\n| `overlap` | int | 100 | Overlapping tokens between chunks |\n| `max_chunk_size` | int | 3000 | Hard maximum chunk size |\n| `splitting_regex` | str | `\\n\\n+` | Regex for splitting points |\n\n资料来源：[crawl4ai/chunking_strategy.py:1-80](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/chunking_strategy.py)\n\n## Content Scraping Strategy\n\nThe content scraping strategy determines initial content extraction from HTML.\n\n```mermaid\ngraph TD\n    A[Raw HTML] --> B{Scraping Strategy}\n    B -->|BeautifulSoup| C[Parse DOM]\n    B -->|Playwright| D[Dynamic Render]\n    B -->|Raw| E[Minimal Processing]\n    C --> F[Content Cleaned]\n    D --> F\n    E --> F\n```\n\n### Strategy Selection\n\n| Strategy | JavaScript | Speed | Use Case |\n|----------|------------|-------|----------|\n| `BeautifulSoup` | No | Fast | Static pages |\n| `Playwright` | Yes | Medium | SPAs, dynamic content |\n| `RawContent` | No | Fastest | Pre-processed HTML |\n\n资料来源：[crawl4ai/content_scraping_strategy.py:1-60](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/content_scraping_strategy.py)\n\n## Table Extraction\n\nTable extraction handles tabular data structures within web pages.\n\n```python\nclass TableExtractionStrategy:\n    def __init__(\n        self,\n        table_styles: Optional[List[str]] = None,\n        ignore_tables: Optional[List[str]] = None,\n        merge_multiple_headers: bool = False\n    )\n```\n\n### Extraction Configuration\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `table_styles` | List[str] | CSS classes to include as tables |\n| `ignore_tables` | 
List[str] | CSS classes to exclude |\n| `merge_multiple_headers` | bool | Merge multi-row headers |\n| `extract_header` | bool | Include header row (default: True) |\n\n资料来源：[crawl4ai/table_extraction.py:1-100](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/table_extraction.py)\n\n## Complete Pipeline Example\n\n```python\nfrom crawl4ai import (\n    AsyncWebCrawler,\n    LLMExtractionStrategy,\n    JsonCssExtractionStrategy,\n    RegexExtractionStrategy,\n    TableExtractionStrategy\n)\n\nasync with AsyncWebCrawler() as crawler:\n    # LLM-based extraction\n    llm_result = await crawler.arun(\n        url=\"https://example.com/article\",\n        extraction_strategy=LLMExtractionStrategy(\n            provider=\"openai\",\n            model=\"gpt-4\",\n            instruction=\"Extract article title, author, and key points\"\n        )\n    )\n    \n    # CSS-based extraction\n    css_result = await crawler.arun(\n        url=\"https://example.com/products\",\n        extraction_strategy=JsonCssExtractionStrategy(schema=product_schema)\n    )\n```\n\n资料来源：[crawl4ai/extraction_strategy.py:200-250](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n## Strategy Selection Guide\n\n```mermaid\ngraph TD\n    A[Start] --> B{Need semantic understanding?}\n    B -->|Yes| C{External API acceptable?}\n    B -->|No| D[No-LLM Strategy]\n    C -->|Yes| E{Local deployment needed?}\n    C -->|No| F[Ollama/Local LLM]\n    E -->|Yes| F\n    E -->|No| G[OpenAI/Anthropic]\n    D --> H{Data has tabular structure?}\n    H -->|Yes| I[Add TableExtractionStrategy]\n    H -->|No| J[Complete]\n    G --> J\n    F --> J\n    I --> J\n```\n\n### Decision Matrix\n\n| Requirement | Recommended Strategy |\n|-------------|---------------------|\n| Simple CSS extraction | `JsonCssExtractionStrategy` |\n| Complex semantic parsing | `LLMExtractionStrategy` |\n| High-volume, low-latency | No-LLM strategies |\n| Schema-agnostic | LLM-based strategies |\n| 
Tabular data focus | `TableExtractionStrategy` |\n\n资料来源：[docs/md_v2/extraction/llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/llm-strategies.md), [docs/md_v2/extraction/no-llm-strategies.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/extraction/no-llm-strategies.md)\n\n## Environment Variables\n\n| Variable | Required For | Description |\n|----------|--------------|-------------|\n| `OPENAI_API_KEY` | OpenAI LLM | API key for GPT models |\n| `ANTHROPIC_API_KEY` | Anthropic LLM | API key for Claude models |\n| `AZURE_API_KEY` | Azure OpenAI | Azure OpenAI API key |\n| `OLLAMA_BASE_URL` | Local LLM | Base URL for Ollama server |\n\n资料来源：[crawl4ai/extraction_strategy.py:250-300](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/extraction_strategy.py)\n\n---\n\n<a id='deep_crawling'></a>\n\n## Deep Crawling Strategies\n\n### 相关页面\n\n相关主题：[Extraction Strategies](#extraction_strategies), [Async Web Crawler](#async_crawler)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/deep_crawling/bfs_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/bfs_strategy.py)\n- [crawl4ai/deep_crawling/dfs_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/dfs_strategy.py)\n- [crawl4ai/deep_crawling/bff_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/bff_strategy.py)\n- [crawl4ai/deep_crawling/base_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/base_strategy.py)\n- [crawl4ai/deep_crawling/filters.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/filters.py)\n- [crawl4ai/deep_crawling/scorers.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/scorers.py)\n- [docs/md_v2/core/deep-crawling.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/core/deep-crawling.md)\n</details>\n\n# Deep Crawling 
Strategies\n\n## Overview\n\nDeep Crawling Strategies in crawl4ai provide systematic approaches to traverse and extract content from websites beyond a single page. These strategies enable controlled, scalable web crawling by managing URL discovery, prioritization, filtering, and scoring mechanisms. The deep crawling module supports multiple traversal algorithms (BFS, DFS, BFF) with extensible filtering and scoring systems.\n\n资料来源：[base_strategy.py:1-50](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/base_strategy.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Seed URLs] --> B[DeepCrawlingStrategy]\n    B --> C[URL Filters]\n    B --> D[URL Scorers]\n    B --> E[Traversal Algorithm]\n    C --> F[Valid URLs]\n    D --> G[Prioritized URLs]\n    E --> H[Crawl Queue]\n    G --> H\n    H --> I[Crawl4AI Extractor]\n    I --> J[Extracted Content]\n    J --> K[Links Extracted]\n    K --> B\n```\n\nThe architecture follows a producer-consumer pattern where the strategy continuously discovers URLs from crawled pages and feeds them back into the crawl queue based on prioritization rules.\n\n资料来源：[base_strategy.py:50-100](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/base_strategy.py)\n\n## Core Components\n\n### Base Strategy\n\nAll crawling strategies inherit from `DeepCrawlingStrategy`, which provides the foundational interface and shared functionality:\n\n| Property/Method | Type | Description |\n|-----------------|------|-------------|\n| `url_scorer` | `URLScorer` | Scores URLs for prioritization |\n| `url_filter` | `URLFilter` | Filters URLs for validity |\n| `keywords` | `Set[str]` | Keywords for relevance matching |\n| `keywords_or` | `Set[str]` | Alternative keyword matching |\n| `max_depth` | `int` | Maximum crawl depth |\n| `included_domains` | `Set[str]` | Allowed domains |\n| `excluded_domains` | `Set[str]` | Blocked domains |\n| `crawl_enabled` | `bool` | Enable/disable crawling |\n| `check_keywords` | 
`Callable` | Custom keyword validation |\n\n资料来源：[base_strategy.py:100-150](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/base_strategy.py)\n\n## Traversal Algorithms\n\n### BFS (Breadth-First Search) Strategy\n\nThe BFS strategy explores pages level by level, ensuring comprehensive coverage before going deeper:\n\n```mermaid\ngraph LR\n    A[Level 0: Seed] --> B[Level 1: depth=1]\n    B --> C[Level 2: depth=2]\n    C --> D[Level 3: depth=3]\n    style A fill:#90EE90\n    style B fill:#87CEEB\n    style C fill:#DDA0DD\n    style D fill:#F0E68C\n```\n\n**Characteristics:**\n- Systematic exploration of shallow depths first\n- Ideal for site maps and directory-style sites\n- Higher memory usage due to large frontier sets\n- Better for finding all accessible pages at shallower depths\n\n资料来源：[bfs_strategy.py:1-80](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/bfs_strategy.py)\n\n**Configuration Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `max_depth` | `int` | `3` | Maximum crawl depth |\n| `max_pages` | `int` | `50` | Maximum pages to crawl |\n| `priority` | `int` | `0` | Base priority score |\n| `include_external` | `bool` | `False` | Allow external domain crawling |\n\n### DFS (Depth-First Search) Strategy\n\nThe DFS strategy explores as deep as possible before backtracking:\n\n```mermaid\ngraph TD\n    A[Start] --> B[Depth 1]\n    B --> C[Depth 2]\n    C --> D[Depth 3]\n    D --> E[Backtrack]\n    E --> F[Next Branch]\n    F --> G[Continue Deep]\n    style A fill:#90EE90\n    style D fill:#FF6B6B\n```\n\n**Characteristics:**\n- Deep exploration of specific paths first\n- Lower memory footprint\n- Suitable for following navigation chains\n- Risk of getting stuck in deep site sections\n\n资料来源：[dfs_strategy.py:1-80](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/dfs_strategy.py)\n\n**Configuration Parameters:**\n\n| Parameter | 
Type | Default | Description |\n|-----------|------|---------|-------------|\n| `max_depth` | `int` | `10` | Maximum crawl depth |\n| `max_pages` | `int` | `100` | Maximum pages to crawl |\n| `priority` | `int` | `0` | Base priority score |\n\n### BFF (Best-First with Filters) Strategy\n\nThe BFF strategy combines filtering with score-based prioritization, crawling the most relevant pages first:\n\n```mermaid\ngraph TD\n    A[URL Discovered] --> B{URL Filter}\n    B -->|Pass| C{Score URL}\n    B -->|Fail| X[Skip]\n    C --> D[Priority Queue]\n    D --> E[Crawl Next Best]\n    E --> F[Extract Links]\n    F --> A\n```\n\n**Characteristics:**\n- Relevance-based crawling using keyword matching\n- Configurable scoring functions\n- Filters out irrelevant content early\n- Most efficient for targeted data extraction\n\n资料来源：[bff_strategy.py:1-100](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/bff_strategy.py)\n\n## Filtering System\n\nThe filtering system determines which URLs are eligible for crawling:\n\n### FilterChain\n\nMultiple filters can be chained together for comprehensive URL validation:\n\n```python\nfrom crawl4ai.deep_crawling.filters import FilterChain, SameDomainFilter, ExtensionFilter\n\nfilter_chain = FilterChain([\n    SameDomainFilter(allowed_domains=[\"example.com\"]),\n    ExtensionFilter(excluded_extensions=[\".pdf\", \".zip\"]),\n])\n```\n\n### Available Filters\n\n| Filter | Purpose | Key Parameters |\n|--------|---------|----------------|\n| `SameDomainFilter` | Restrict to same domain | `allowed_domains`, `strict` |\n| `ExtensionFilter` | Block by file extension | `excluded_extensions` |\n| `robots.txt Filter` | Respect robots directives | `user_agent` |\n| `RegexFilter` | Custom pattern matching | `patterns`, `exclude` |\n\n资料来源：[filters.py:1-120](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/filters.py)\n\n## Scoring System\n\nURL scoring determines crawl priority within the queue:\n\n### 
ScorerChain\n\nScorers can be combined in a chain for multi-factor evaluation:\n\n```python\nfrom crawl4ai.deep_crawling.scorers import ScorerChain, KeywordRelevanceScorer, DepthScorer\n\nscorer = ScorerChain([\n    KeywordRelevanceScorer(keywords=[\"api\", \"docs\"]),\n    DepthScorer(max_depth=5, decay=0.5),\n])\n```\n\n### Available Scorers\n\n| Scorer | Function | Parameters |\n|--------|----------|------------|\n| `KeywordRelevanceScorer` | Match keywords in URL/text | `keywords`, `weight` |\n| `DepthScorer` | Penalize deep pages | `max_depth`, `decay` |\n| `FreshnessScorer` | Prefer recent content | `date_field`, `decay` |\n| `CustomScorer` | User-defined scoring | `scoring_fn` |\n\n资料来源：[scorers.py:1-150](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/scorers.py)\n\n## Usage Examples\n\n### Basic BFS Crawling\n\n```python\nfrom crawl4ai import AsyncWebCrawler\nfrom crawl4ai.deep_crawling import BFSDistanceStrategy\n\nasync with AsyncWebCrawler() as crawler:\n    result = await crawler.arun(\n        url=\"https://example.com\",\n        strategy=BFSDistanceStrategy(\n            max_depth=3,\n            max_pages=50\n        )\n    )\n```\n\n### Targeted Crawling with BFF\n\n```python\nfrom crawl4ai.deep_crawling import BFFStrategy\nfrom crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer\nfrom crawl4ai.deep_crawling.filters import SameDomainFilter\n\nstrategy = BFFStrategy(\n    max_depth=5,\n    max_pages=100,\n    keywords={\"documentation\", \"api\", \"guide\"},\n    url_filter=SameDomainFilter(allowed_domains=[\"example.com\"]),\n    url_scorer=KeywordRelevanceScorer(keywords={\"documentation\"}),\n)\n```\n\n资料来源：[docs/md_v2/core/deep-crawling.md:1-100](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/core/deep-crawling.md)\n\n## Workflow States\n\n```mermaid\nstateDiagram-v2\n    [*] --> Initializing: Start crawl\n    Initializing --> Crawling: Load seed URLs\n    Crawling --> Processing: Fetch page\n    
Processing --> Filtering: Extract links\n    Filtering --> Scoring: Validate URLs\n    Scoring --> Queuing: Rank by priority\n    Queuing --> Crawling: Next URL\n    Crawling --> [*]: Queue empty or max reached\n```\n\n## Strategy Selection Guide\n\n| Use Case | Recommended Strategy | Reason |\n|----------|---------------------|--------|\n| Site mapping | BFS | Comprehensive shallow coverage |\n| Documentation sites | BFS | Find all pages systematically |\n| Article/navigation chains | DFS | Follow deep links naturally |\n| Targeted data extraction | BFF | Prioritize relevant pages |\n| API documentation | BFF | Filter by keyword relevance |\n| Limited resources | DFS | Lower memory footprint |\n\n## Configuration Reference\n\n### Common Parameters\n\n```python\n@dataclass\nclass DeepCrawlConfig:\n    # Scope\n    included_domains: Set[str] = None\n    excluded_domains: Set[str] = None\n    \n    # Limits\n    max_depth: int = 3\n    max_pages: int = 100\n    max_total_pages: int = 1000\n    \n    # Filtering\n    allow_external: bool = False\n    check_keywords: bool = True\n    \n    # Scoring\n    scoring: ScorerChain = None\n    filter: FilterChain = None\n```\n\n### Keyword Matching\n\n```python\n# AND matching (all keywords must match)\nkeywords = {\"documentation\", \"api\"}\nkeywords_or = False\n\n# OR matching (any keyword matches)\nkeywords = {\"guide\", \"tutorial\", \"docs\"}\nkeywords_or = True\n```\n\n资料来源：[base_strategy.py:150-200](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/base_strategy.py)\n\n## Advanced Features\n\n### Custom Filters\n\n```python\nfrom crawl4ai.deep_crawling.filters import URLFilter\n\nclass CustomFilter(URLFilter):\n    def should_crawl(self, url: str) -> bool:\n        # Custom logic\n        return \"product\" in url and not url.endswith(\".jpg\")\n```\n\n### Custom Scorers\n\n```python\nfrom crawl4ai.deep_crawling.scorers import URLScorer\n\nclass PriorityScorer(URLScorer):\n    def score(self, url: 
str, context: dict) -> float:\n        base_score = 1.0\n        if \"important\" in url:\n            base_score *= 2.0\n        return base_score\n```\n\n资料来源：[filters.py:150-200](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/filters.py), [scorers.py:150-200](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/deep_crawling/scorers.py)\n\n## Best Practices\n\n1. **Set reasonable limits**: Always configure `max_pages` and `max_depth` to prevent runaway crawling\n2. **Use BFF for targeted extraction**: When you know what content you need, BFF reduces noise\n3. **Filter early, score late**: Apply filters before scoring to reduce unnecessary processing\n4. **Respect robots.txt**: Configure filters to respect site crawling directives\n5. **Monitor memory usage**: BFS uses more memory; switch to DFS for resource-constrained environments\n6. **Combine keyword strategies**: Use both `keywords` (AND) and `keywords_or` (OR) for flexible matching\n\n## See Also\n\n- [AsyncWebCrawler API Reference](../api/async-web-crawler.md)\n- [Content Extraction](../guides/content-extraction.md)\n- [Browser Configuration](../guides/browser-config.md)\n\n---\n\n<a id='anti_bot_detection'></a>\n\n## Anti-Bot Detection and Proxy Management\n\n### 相关页面\n\n相关主题：[Browser Management](#browser_management)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [crawl4ai/antibot_detector.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/antibot_detector.py)\n- [crawl4ai/proxy_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/proxy_strategy.py)\n- [crawl4ai/async_configs.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_configs.py)\n- [docs/md_v2/advanced/anti-bot-and-fallback.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/advanced/anti-bot-and-fallback.md)\n- [docs/md_v2/advanced/proxy-security.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/advanced/proxy-security.md)\n- 
[sbom/README.md](https://github.com/unclecode/crawl4ai/blob/main/sbom/README.md)\n</details>\n\n# Anti-Bot Detection and Proxy Management\n\n## Overview\n\nCrawl4ai provides sophisticated anti-bot detection evasion and proxy management capabilities to ensure reliable web crawling operations. These features work together to detect and circumvent bot protection mechanisms while maintaining request anonymity through proxy rotation.\n\n## Architecture Overview\n\n```mermaid\ngraph TD\n    A[Client Request] --> B[AntiBotDetector]\n    B --> C{Bot Detection?}\n    C -->|Yes| D[Apply Evasion Strategy]\n    C -->|No| E[Direct Request]\n    D --> F[ProxySelector]\n    F --> G[Rotating Proxies]\n    G --> H[Target Website]\n    H --> I{Response Valid?}\n    I -->|No| J[Fallback Mechanism]\n    J --> B\n    I -->|Yes| K[Return Content]\n```\n\n## Anti-Bot Detection System\n\n### Purpose and Scope\n\nThe anti-bot detection module (`antibot_detector.py`) analyzes responses from target websites to determine if bot protection mechanisms have been triggered. 
When detection occurs, the system can automatically apply evasion strategies or fall back to alternative methods.\n\n资料来源：[crawl4ai/antibot_detector.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/antibot_detector.py)\n\n### Detection Strategies\n\n| Strategy | Description | Use Case |\n|----------|-------------|----------|\n| Header Analysis | Examines HTTP headers for bot detection signals | Standard bot checks |\n| Content Analysis | Scans response content for CAPTCHAs or blocking messages | Challenge pages |\n| Status Code Monitoring | Tracks HTTP status codes indicating blocks | 403, 429 responses |\n| JavaScript Challenge Detection | Identifies JS-based bot challenges | Cloudflare, PerimeterX |\n\n### Key Components\n\nThe anti-bot detector integrates with the async configuration system to provide seamless fallback handling:\n\n```python\n# Pseudocode representation based on async_configs.py integration\nclass AntiBotConfig:\n    enabled: bool = True\n    detection_threshold: float = 0.7\n    auto_fallback: bool = True\n    max_retries: int = 3\n```\n\n资料来源：[crawl4ai/async_configs.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/async_configs.py)\n\n## Proxy Management System\n\n### Purpose and Scope\n\nThe proxy strategy module (`proxy_strategy.py`) manages proxy rotation, selection, and health checking to maintain request anonymity and distribute load across multiple IP addresses.\n\n资料来源：[crawl4ai/proxy_strategy.py](https://github.com/unclecode/crawl4ai/blob/main/crawl4ai/proxy_strategy.py)\n\n### Proxy Rotation Strategies\n\n| Strategy | Description | Best For |\n|----------|-------------|----------|\n| Round Robin | Sequential proxy selection | Even distribution |\n| Random | Random proxy selection | Avoiding pattern detection |\n| Weighted | Prioritize faster/reliable proxies | Performance optimization |\n| Geographic | Match proxy location to target | Region-specific content |\n\n### Configuration Parameters\n\n```python\nclass 
ProxyConfig:\n    proxies: List[str] = []          # List of proxy URLs\n    rotation_strategy: str = \"round_robin\"\n    health_check_interval: int = 300 # seconds\n    timeout: int = 30                # proxy timeout in seconds\n    retry_on_failure: bool = True\n```\n\n### Proxy Health Monitoring\n\nThe system continuously monitors proxy health through periodic health checks, removing failed proxies from the active pool and re-evaluating them after a cooldown period.\n\n## Integration with Async Configuration\n\n### Fallback Mechanisms\n\nWhen anti-bot detection triggers, the system can automatically switch to fallback modes:\n\n1. **Proxy Fallback**: Rotate to a different proxy server\n2. **Strategy Fallback**: Switch to alternative crawling strategies\n3. **User-Agent Fallback**: Use different browser fingerprints\n\n资料来源：[docs/md_v2/advanced/anti-bot-and-fallback.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/advanced/anti-bot-and-fallback.md)\n\n### Configuration Example\n\n```yaml\nanti_bot:\n  enabled: true\n  detection_sensitivity: \"medium\"\n  auto_fallback: true\n  \nproxy:\n  enabled: true\n  strategy: \"weighted\"\n  proxies:\n    - \"http://proxy1.example.com:8080\"\n    - \"http://proxy2.example.com:8080\"\n  health_check:\n    enabled: true\n    interval: 300\n```\n\n## Security Considerations\n\n### Proxy Security\n\nWhen configuring proxies, consider the following security aspects:\n\n- **Proxy Protocol**: Use HTTPS proxies to encrypt traffic\n- **Authentication**: Implement proxy authentication where supported\n- **Provider Reputation**: Use trusted proxy providers\n- **IP Rotation**: Avoid predictable IP patterns\n\n资料来源：[docs/md_v2/advanced/proxy-security.md](https://github.com/unclecode/crawl4ai/blob/main/docs/md_v2/advanced/proxy-security.md)\n\n### Best Practices\n\n| Practice | Description |\n|----------|-------------|\n| Rate Limiting | Respect target site limits to avoid IP bans |\n| Request Delays | Implement delays 
between requests |\n| Header Randomization | Vary User-Agent and other headers |\n| Cookie Management | Handle cookies appropriately per session |\n| SSL Verification | Validate SSL certificates for security |\n\n## Workflow Diagram\n\n```mermaid\nsequenceDiagram\n    participant Client\n    participant AntiBot as AntiBot Detector\n    participant ProxyMgr as Proxy Manager\n    participant Target as Target Website\n    \n    Client->>AntiBot: Send Request\n    AntiBot->>ProxyMgr: Request Proxy\n    ProxyMgr->>AntiBot: Return Proxy\n    AntiBot->>Target: Forward Request via Proxy\n    \n    alt Bot Detected\n        Target-->>AntiBot: Bot Challenge Response\n        AntiBot->>ProxyMgr: Request Different Proxy\n        ProxyMgr-->>AntiBot: New Proxy\n        AntiBot->>Target: Retry with New Proxy\n    else Success\n        Target-->>Client: Return Content\n    end\n```\n\n## Error Handling\n\n### Common Error Scenarios\n\n| Error | Cause | Resolution |\n|-------|-------|------------|\n| `403 Forbidden` | IP blocked | Rotate proxy |\n| `429 Too Many Requests` | Rate limited | Backoff and retry |\n| `CAPTCHA Required` | Bot detected | Switch strategy |\n| `Proxy Timeout` | Proxy unavailable | Health check and replace |\n\n### Retry Logic\n\nThe system implements exponential backoff for retries:\n\n```python\nretry_config = {\n    \"max_attempts\": 3,\n    \"base_delay\": 1.0,      # seconds\n    \"max_delay\": 60.0,       # seconds\n    \"exponential_base\": 2\n}\n```\n\n## Summary\n\nCrawl4ai's anti-bot detection and proxy management system provides a robust framework for evading bot detection while maintaining reliable web crawling operations. 
The integration between `AntiBotDetector` and `ProxyStrategy` enables automatic fallback mechanisms that significantly improve crawling success rates against protected websites.\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：unclecode/crawl4ai\n\n摘要：发现 21 个潜在踩坑项，其中 5 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: arun() and arun_many() type hinting needs fixing。\n\n## 1. 安装坑 · 来源证据：[Bug]: arun() and arun_many() type hinting needs fixing\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: arun() and arun_many() type hinting needs fixing\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d3b6cfd3700147f690e0e65875f15424 | https://github.com/unclecode/crawl4ai/issues/1898 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 配置坑 · 来源证据：[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked as failed), no error messages or failure reason…\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked as failed), no error messages or failure reason is shown\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ad61b108bf894cc286ca7966e8c86758 | https://github.com/unclecode/crawl4ai/issues/1949 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 配置坑 · 来源证据：[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and CLI provide\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and CLI provide\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_1ee99f5d72f143f4b064732cc19e0c85 | https://github.com/unclecode/crawl4ai/issues/1963 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 4. 
配置坑 · 来源证据：[Bug]: `remove_empty_elements_fast()` drops trailing text when removing empty elements with non-empty .tail\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: `remove_empty_elements_fast()` drops trailing text when removing empty elements with non-empty .tail\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d7fa967632a948008efbc182d1f2c96b | https://github.com/unclecode/crawl4ai/issues/1938 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 安全/权限坑 · 来源证据：[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-3x token overhead for CJK content\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-3x token overhead for CJK content\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_2e9fbf659fbb40aba437886a87f8e2d7 | https://github.com/unclecode/crawl4ai/issues/1962 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：[Bug] AsyncLogger writes to stdout, breaking MCP stdio transport\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug] AsyncLogger writes to stdout, breaking MCP stdio transport\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_af29278fd7294d4a8f0f6f37ab987b5c | https://github.com/unclecode/crawl4ai/issues/1968 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 7. 安装坑 · 来源证据：[Bug]: The install with pip on just about any system rarely works. It requires an env or it only partial installs\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: The install with pip on just about any system rarely works. 
It requires an env or it only partial installs\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_97d44cedb21a4908a7743fde11209954 | https://github.com/unclecode/crawl4ai/issues/1950 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 安装坑 · 来源证据：[Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports symbols that don't exist in playwright-stealth 2.x\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports symbols that don't exist in playwright-stealth 2.x\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ae45861377894b99a57d6bbdc06af313 | https://github.com/unclecode/crawl4ai/issues/1959 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 9. 安装坑 · 来源证据：v0.7.1:Update\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.1:Update\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_a6ae9133fff54443b712725f51769fa1 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.1 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：v0.7.2: CI/CD & Dependency Optimization Update\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.2: CI/CD & Dependency Optimization Update\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_14954e0431ca426ebeaa4bb31778d4af | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.2 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 11. 
配置坑 · 来源证据：[Bug]: Markdown export loses heading hierarchy and table structure\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: Markdown export loses heading hierarchy and table structure\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_c3eac8ab81e34bf3b6cc050f7f8e9826 | https://github.com/unclecode/crawl4ai/issues/1964 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 12. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:798201435 | https://github.com/unclecode/crawl4ai | README/documentation is current enough for a first validation pass.\n\n## 13. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:798201435 | https://github.com/unclecode/crawl4ai | last_activity_observed missing\n\n## 14. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:798201435 | https://github.com/unclecode/crawl4ai | no_demo; severity=medium\n\n## 15. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:798201435 | https://github.com/unclecode/crawl4ai | no_demo; severity=medium\n\n## 16. 
安全/权限坑 · 来源证据：Release v0.7.3\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Release v0.7.3\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e2b75670cbcc4814a86423818b9f6f48 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.3 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 17. 安全/权限坑 · 来源证据：Release v0.7.5\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Release v0.7.5\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_056d1470d7534cacb39eeb894e054496 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.5 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 18. 安全/权限坑 · 来源证据：Release v0.7.7\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Release v0.7.7\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e157445f88744795b5c6234783eca692 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.7 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 19. 安全/权限坑 · 来源证据：[Bug]: Markdown text extraction drops text when element contains empty elements\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug]: Markdown text extraction drops text when element contains empty elements\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_dffa926853d147ebb487a03fdfd1818e | https://github.com/unclecode/crawl4ai/issues/1966 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 20. 
维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:798201435 | https://github.com/unclecode/crawl4ai | issue_or_pr_quality=unknown\n\n## 21. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:798201435 | https://github.com/unclecode/crawl4ai | release_recency=unknown\n\n<!-- canonical_name: unclecode/crawl4ai; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "Human Manual / 人类版说明书"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log / 踩坑日志\n\n项目：unclecode/crawl4ai\n\n摘要：发现 21 个潜在踩坑项，其中 5 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: arun() and arun_many() type hinting needs fixing。\n\n## 1. 安装坑 · 来源证据：[Bug]: arun() and arun_many() type hinting needs fixing\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: arun() and arun_many() type hinting needs fixing\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d3b6cfd3700147f690e0e65875f15424 | https://github.com/unclecode/crawl4ai/issues/1898 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 配置坑 · 来源证据：[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked as failed), no error messages or failure reason…\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: After successful FETCH, and failed SCRAPE (COMPLETE being marked as failed), no error messages or failure reason is shown\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ad61b108bf894cc286ca7966e8c86758 | https://github.com/unclecode/crawl4ai/issues/1949 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 配置坑 · 来源证据：[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and CLI provide\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: MCP scrape tools lack wait_until / SPA support that REST API and CLI provide\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_1ee99f5d72f143f4b064732cc19e0c85 | https://github.com/unclecode/crawl4ai/issues/1963 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 4. 
配置坑 · 来源证据：[Bug]: `remove_empty_elements_fast()` drops trailing text when removing empty elements with non-empty .tail\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: `remove_empty_elements_fast()` drops trailing text when removing empty elements with non-empty .tail\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d7fa967632a948008efbc182d1f2c96b | https://github.com/unclecode/crawl4ai/issues/1938 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 安全/权限坑 · 来源证据：[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-3x token overhead for CJK content\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug] MCP Server json.dumps() escapes non-ASCII characters, causing 2.5-3x token overhead for CJK content\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_2e9fbf659fbb40aba437886a87f8e2d7 | https://github.com/unclecode/crawl4ai/issues/1962 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：[Bug] AsyncLogger writes to stdout, breaking MCP stdio transport\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug] AsyncLogger writes to stdout, breaking MCP stdio transport\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_af29278fd7294d4a8f0f6f37ab987b5c | https://github.com/unclecode/crawl4ai/issues/1968 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 7. 安装坑 · 来源证据：[Bug]: The install with pip on just about any system rarely works. It requires an env or it only partial installs\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: The install with pip on just about any system rarely works. 
It requires an env or it only partial installs\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_97d44cedb21a4908a7743fde11209954 | https://github.com/unclecode/crawl4ai/issues/1950 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 安装坑 · 来源证据：[Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports symbols that don't exist in playwright-stealth 2.x\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: enable_stealth=True is a silent no-op — StealthAdapter imports symbols that don't exist in playwright-stealth 2.x\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ae45861377894b99a57d6bbdc06af313 | https://github.com/unclecode/crawl4ai/issues/1959 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 9. 安装坑 · 来源证据：v0.7.1:Update\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.1:Update\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_a6ae9133fff54443b712725f51769fa1 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.1 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：v0.7.2: CI/CD & Dependency Optimization Update\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.2: CI/CD & Dependency Optimization Update\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_14954e0431ca426ebeaa4bb31778d4af | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.2 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 11. 
配置坑 · 来源证据：[Bug]: Markdown export loses heading hierarchy and table structure\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：[Bug]: Markdown export loses heading hierarchy and table structure\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_c3eac8ab81e34bf3b6cc050f7f8e9826 | https://github.com/unclecode/crawl4ai/issues/1964 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 12. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:798201435 | https://github.com/unclecode/crawl4ai | README/documentation is current enough for a first validation pass.\n\n## 13. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:798201435 | https://github.com/unclecode/crawl4ai | last_activity_observed missing\n\n## 14. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:798201435 | https://github.com/unclecode/crawl4ai | no_demo; severity=medium\n\n## 15. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:798201435 | https://github.com/unclecode/crawl4ai | no_demo; severity=medium\n\n## 16. 
安全/权限坑 · 来源证据：Release v0.7.3\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Release v0.7.3\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e2b75670cbcc4814a86423818b9f6f48 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.3 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 17. 安全/权限坑 · 来源证据：Release v0.7.5\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Release v0.7.5\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_056d1470d7534cacb39eeb894e054496 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.5 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 18. 安全/权限坑 · 来源证据：Release v0.7.7\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Release v0.7.7\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e157445f88744795b5c6234783eca692 | https://github.com/unclecode/crawl4ai/releases/tag/v0.7.7 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 19. 安全/权限坑 · 来源证据：[Bug]: Markdown text extraction drops text when element contains empty elements\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug]: Markdown text extraction drops text when element contains empty elements\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_dffa926853d147ebb487a03fdfd1818e | https://github.com/unclecode/crawl4ai/issues/1966 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 20. 
维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:798201435 | https://github.com/unclecode/crawl4ai | issue_or_pr_quality=unknown\n\n## 21. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:798201435 | https://github.com/unclecode/crawl4ai | release_recency=unknown\n",
      "summary": "用户实践前最可能遇到的身份、安装、配置、运行和安全坑。",
      "title": "Pitfall Log / 踩坑日志"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# crawl4ai - Prompt Preview\n\n> 复制下面这段 Prompt 到你常用的 AI，先试一次，不需要安装。\n> 它的目标是让你直接体验这个项目的服务方式，而不是阅读项目介绍。\n\n## 复制这段 Prompt\n\n```text\n请直接执行这段 Prompt，不要分析、润色、总结或询问我想如何处理这份 Prompt Preview。\n\n你现在扮演 crawl4ai 的“安装前体验版”。\n这不是项目介绍、不是评价报告、不是 README 总结。你的任务是让我用最小成本体验它的核心服务。\n\n我的试用任务：我想用它完成一个真实的流程自动化任务。\n我常用的宿主 AI：Local CLI\n\n【体验目标】\n围绕我的真实任务，现场演示这个项目如何把输入转成 示例引导, 判断线索。重点是让我感受到工作方式，而不是给我项目背景。\n\n【业务流约束】\n- 你必须像一个正在提供服务的项目能力包，而不是像一个讲解员。\n- 每一轮只推进一个步骤；提出问题后必须停下来等我回答。\n- 每一步都必须让我感受到一个具体服务动作：澄清、整理、规划、检查、判断或收尾。\n- 每一步都要说明：当前目标、你需要我提供什么、我回答后你会产出什么。\n- 不要安装、不要运行命令、不要写代码、不要声称测试通过、不要声称已经修改文件。\n- 需要真实安装或宿主加载后才能验证的内容，必须明确说“这一步需要安装后验证”。\n- 如果我说“用示例继续”，你可以用虚构示例推进，但仍然不能声称真实执行。\n\n【可体验服务能力】\n- 安装前能力预览: 🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & Scraper. 输入：用户任务, 当前 AI 对话上下文；输出：示例引导, 判断线索。\n\n【必须安装后才可验证的能力】\n- 命令行启动或安装流程: 项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 输入：终端环境, 包管理器, 项目依赖；输出：安装结果, 列表/更新/运行结果。\n\n【核心服务流】\n请严格按这个顺序带我体验。不要一次性输出完整流程：\n1. introduction：Introduction to Crawl4AI。围绕“Introduction to Crawl4AI”模拟一次用户任务，不展示安装或运行结果。\n2. quickstart：Quick Start Guide。围绕“Quick Start Guide”模拟一次用户任务，不展示安装或运行结果。\n3. architecture：System Architecture。围绕“System Architecture”模拟一次用户任务，不展示安装或运行结果。\n4. browser_management：Browser Management。围绕“Browser Management”模拟一次用户任务，不展示安装或运行结果。\n5. async_crawler：Async Web Crawler。围绕“Async Web Crawler”模拟一次用户任务，不展示安装或运行结果。\n\n【核心能力体验剧本】\n每一步都必须按“输入 -> 服务动作 -> 中间产物”执行。不要只说流程名：\n1. introduction\n输入：用户提供的“Introduction to Crawl4AI”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n2. quickstart\n输入：用户提供的“Quick Start Guide”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n3. architecture\n输入：用户提供的“System Architecture”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n4. browser_management\n输入：用户提供的“Browser Management”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n5. 
async_crawler\n输入：用户提供的“Async Web Crawler”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n【项目服务规则】\n这些规则决定你如何服务用户。不要解释规则本身，而要在每一步执行时遵守：\n- 先确认用户任务、输入材料和成功标准，再模拟项目能力。\n- 每一步都必须形成可检查的小产物，并等待用户确认后再继续。\n- 凡是需要安装、调用工具或访问外部服务的能力，都必须标记为安装后验证。\n\n【每一步的服务约束】\n- Step 1 / introduction：Step 1 必须围绕“Introduction to Crawl4AI”形成一个小中间产物，并等待用户确认。\n- Step 2 / quickstart：Step 2 必须围绕“Quick Start Guide”形成一个小中间产物，并等待用户确认。\n- Step 3 / architecture：Step 3 必须围绕“System Architecture”形成一个小中间产物，并等待用户确认。\n- Step 4 / browser_management：Step 4 必须围绕“Browser Management”形成一个小中间产物，并等待用户确认。\n- Step 5 / async_crawler：Step 5 必须围绕“Async Web Crawler”形成一个小中间产物，并等待用户确认。\n\n【边界与风险】\n- 不要声称已经安装、运行、调用 API、读写本地文件或完成真实任务。\n- 安装前预览只能展示工作方式，不能证明兼容性、性能或输出质量。\n- 涉及安装、插件加载、工具调用或外部服务的能力必须安装后验证。\n\n【可追溯依据】\n这些路径只用于你内部校验或在我追问“依据是什么”时简要引用。不要在首次回复主动展开：\n- https://github.com/unclecode/crawl4ai\n- https://github.com/unclecode/crawl4ai#readme\n- README.md\n- MISSION.md\n- pyproject.toml\n- docs/examples/quickstart.py\n- docs/examples/hello_world.py\n- crawl4ai/__init__.py\n- deploy/docker/ARCHITECTURE.md\n- crawl4ai/async_webcrawler.py\n- crawl4ai/browser_manager.py\n- crawl4ai/async_dispatcher.py\n\n【首次问题规则】\n- 首次三问必须先确认用户目标、成功标准和边界，不要提前进入工具、安装或实现细节。\n- 如果后续需要技术条件、文件路径或运行环境，必须等用户确认目标后再追问。\n\n首次回复必须只输出下面 4 个部分：\n1. 体验开始：用 1 句话说明你将带我体验 crawl4ai 的核心服务。\n2. 当前步骤：明确进入 Step 1，并说明这一步要解决什么。\n3. 你会如何服务我：说明你会先改变我完成任务的哪个动作。\n4. 只问我 3 个问题，然后停下等待回答。\n\n首次回复禁止输出：后续完整流程、证据清单、安装命令、项目评价、营销文案、已经安装或运行的说法。\n\nStep 1 / introduction 的二轮协议：\n- 我回答首次三问后，你仍然停留在 Step 1 / introduction，不要进入 Step 2。\n- 第二次回复必须产出 6 个部分：澄清后的任务定义、成功标准、边界条件、\n  2-3 个可选方案、每个方案的权衡、推荐方案。\n- 第二次回复最后必须问我是否确认推荐方案；只有我明确确认后，才能进入下一步。\n- 第二次回复禁止输出 git worktree、代码计划、测试文件、命令或真实执行结果。\n\n后续对话规则：\n- 我回答后，你先完成当前步骤的中间产物并等待确认；只有我确认后，才能进入下一步。\n- 每一步都要生成一个小的中间产物，例如澄清后的目标、计划草案、测试意图、验证清单或继续/停止判断。\n- 所有演示都写成“我会建议/我会引导/这一步会形成”，不要写成已经真实执行。\n- 不要声称已经测试通过、文件已修改、命令已运行或结果已产生。\n- 如果某个能力必须安装后验证，请直接说“这一步需要安装后验证”。\n- 如果证据不足，请明确说“证据不足”，不要补事实。\n```\n",
      "summary": "不安装项目也能感受能力节奏的安全试用 Prompt。",
      "title": "Prompt Preview / 安装前试用 Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / 官方入口\n\n项目：unclecode/crawl4ai\n\n## 官方安装入口\n\n### Python / pip · 官方安装入口\n\n```bash\npip install -U crawl4ai\n```\n\n来源：https://github.com/unclecode/crawl4ai#readme\n\n## 来源\n\n- repo: https://github.com/unclecode/crawl4ai\n- docs: https://github.com/unclecode/crawl4ai#readme\n",
      "summary": "从项目官方 README 或安装文档提取的开工入口。",
      "title": "Quick Start / 官方入口"
    }
  },
  "validation_id": "dval_854bda8c953f459280cd4085d3ec4f00"
}
