{
  "canonical_name": "arklexai/arksim",
  "compilation_id": "pack_46ca53fe478a45bfb0cde2bf09108844",
  "created_at": "2026-05-19T05:33:09.248299+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=skill, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=skill, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `pip install arksim` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "pip install arksim",
      "sandbox_container_image": "python:3.12-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "deterministic_isolated_install",
      "sandbox_validation_id": "sbx_33cdb6e5dadb4844bd35aadeb1e3874c"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_260ba2081cfeb608d45f3d97dc789155",
    "canonical_name": "arklexai/arksim",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/arklexai/arksim",
    "slug": "arksim",
    "source_packet_id": "phit_26cb20ea266a424793bec80ed8842fc0",
    "source_validation_id": "dval_3536c77cf68640e68b2a7b029788aeed"
  },
  "merchandising": {
    "best_for": "需要软件开发与交付能力，并使用 chatgpt的用户",
    "github_forks": null,
    "github_stars": null,
    "one_liner_en": "<p align=\"center\">",
    "one_liner_zh": "<p align=\"center\">",
    "primary_category": {
      "category_id": "software-development",
      "confidence": "medium",
      "name_en": "Software Development",
      "name_zh": "软件开发与交付",
      "reason": "matched_keywords:git"
    },
    "target_user": "使用 chatgpt 等宿主 AI 的用户",
    "title_en": "arksim",
    "title_zh": "arksim 能力包",
    "visible_tags": [
      {
        "label_en": "Browser Agents",
        "label_zh": "浏览器 Agent",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-browser-agents",
        "type": "product_domain"
      },
      {
        "label_en": "Web Task Automation",
        "label_zh": "网页任务自动化",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-web-task-automation",
        "type": "user_job"
      },
      {
        "label_en": "Natural-language Web Actions",
        "label_zh": "自然语言网页操作",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-natural-language-web-actions",
        "type": "core_capability"
      },
      {
        "label_en": "Page Observation and Action Planning",
        "label_zh": "页面观察与动作规划",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-page-observation-and-action-planning",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Evaluation Suite",
        "label_zh": "评测体系",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-evaluation-suite",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_26cb20ea266a424793bec80ed8842fc0",
  "page_model": {
    "artifacts": {
      "artifact_slug": "arksim",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "pip install arksim",
          "label": "Python / pip · 官方安装入口",
          "source": "https://github.com/arklexai/arksim#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "浏览器 Agent",
        "网页任务自动化",
        "自然语言网页操作",
        "页面观察与动作规划",
        "评测体系"
      ],
      "eyebrow": "软件开发与交付",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要软件开发与交付能力，并使用 chatgpt的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "<p align=\"center\">"
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "chatgpt",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "skill, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.1.0",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_d55b6435222f4142bf979809f0cf0794 | https://github.com/arklexai/arksim/releases/tag/v0.1.0 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.1.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.0.6",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_57b2e0dc0cf94da8b4fc97ea744ae7e9 | https://github.com/arklexai/arksim/releases/tag/v0.0.6 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.0.6",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.2",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_3a2981d341cb4512a410e7d75e03baf0 | https://github.com/arklexai/arksim/releases/tag/v0.3.2 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.3.2",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.4",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_f949170a7f5d440cb0559c40fb441178 | https://github.com/arklexai/arksim/releases/tag/v0.3.4 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.3.4",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.5",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_314def1c94614699b6c9ad728c36e9ab | https://github.com/arklexai/arksim/releases/tag/v0.3.5 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.3.5",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.3.1",
            "category": "能力坑",
            "evidence": [
              "community_evidence:github | cevd_c3e7552e9af84b95b2aec63c73dc7c26 | https://github.com/arklexai/arksim/releases/tag/v0.3.1 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.3.1",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "README/documentation is current enough for a first validation pass.",
            "category": "能力坑",
            "evidence": [
              "capability.assumptions | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | README/documentation is current enough for a first validation pass."
            ],
            "severity": "medium",
            "suggested_check": "将假设转成下游验证清单。",
            "title": "能力判断依赖假设",
            "user_impact": "假设不成立时，用户拿不到承诺的能力。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个维护/版本相关的待验证问题：v0.3.3",
            "category": "维护坑",
            "evidence": [
              "community_evidence:github | cevd_03b0d8ce6f8a4bc2bc8e33a7636aa07c | https://github.com/arklexai/arksim/releases/tag/v0.3.3 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.3.3",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "未记录 last_activity_observed。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | last_activity_observed missing"
            ],
            "severity": "medium",
            "suggested_check": "补 GitHub 最近 commit、release、issue/PR 响应信号。",
            "title": "维护活跃度未知",
            "user_impact": "新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "downstream_validation.risk_items | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "进入安全/权限治理复核队列。",
            "title": "下游验证发现风险项",
            "user_impact": "下游已经要求复核，不能在页面中弱化。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "risks.scoring_risks | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "把风险写入边界卡，并确认是否需要人工复核。",
            "title": "存在评分风险",
            "user_impact": "风险会影响是否适合普通用户安装。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.2.0",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_3c70cc242a1242b98d50cfe5078d6752 | https://github.com/arklexai/arksim/releases/tag/v0.2.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.2.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.3.0",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_69bd9f91a5064aeeb87f325f2f56f115 | https://github.com/arklexai/arksim/releases/tag/v0.3.0 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.3.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "issue_or_pr_quality=unknown。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | issue_or_pr_quality=unknown"
            ],
            "severity": "low",
            "suggested_check": "抽样最近 issue/PR，判断是否长期无人处理。",
            "title": "issue/PR 响应质量未知",
            "user_impact": "用户无法判断遇到问题后是否有人维护。"
          },
          {
            "body": "release_recency=unknown。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | release_recency=unknown"
            ],
            "severity": "low",
            "suggested_check": "确认最近 release/tag 和 README 安装命令是否一致。",
            "title": "发布节奏不明确",
            "user_impact": "安装命令和文档可能落后于代码，用户踩坑概率升高。"
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：v0.1.0。",
        "title": "踩坑日志"
      },
      "snapshot": {
        "contributors": null,
        "forks": null,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": null
      },
      "source_url": "https://github.com/arklexai/arksim",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "<p align=\"center\">",
      "title": "arksim 能力包",
      "trial_prompt": "# arksim - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for arklexai/arksim.\n\nProject:\n- Name: arksim\n- Repository: https://github.com/arklexai/arksim\n- Summary: <p align=\"center\">\n- Host target: chatgpt\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: <p align=\"center\">\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. intro: Introduction to ArkSim. Produce one small intermediate artifact and wait for confirmation.\n2. quickstart: Quickstart Guide. Produce one small intermediate artifact and wait for confirmation.\n3. arch-overview: System Architecture Overview. Produce one small intermediate artifact and wait for confirmation.\n4. simulation-engine: Simulation Engine. Produce one small intermediate artifact and wait for confirmation.\n5. evaluation-system: Evaluation System. 
Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/arklexai/arksim#readme\n- .claude/skills/draft-pr/SKILL.md\n- .claude/skills/pre-review/SKILL.md\n- .claude/skills/review-pr/SKILL.md\n- README.md\n- arksim/__init__.py\n- arksim/constants.py\n- arksim/cli.py\n- arksim/templates/my_agent.py\n- arksim/templates/config.yaml\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "voices": [
        {
          "body": "来源平台：github, x。github/github_release: v0.3.7（https://github.com/arklexai/arksim/releases/tag/v0.3.7）；github/github_release: v0.3.6（https://github.com/arklexai/arksim/releases/tag/v0.3.6）；github/github_release: v0.3.5（https://github.com/arklexai/arksim/releases/tag/v0.3.5）；github/github_release: v0.3.4（https://github.com/arklexai/arksim/releases/tag/v0.3.4）；github/github_release: v0.3.3（https://github.com/arklexai/arksim/releases/tag/v0.3.3）；github/github_release: v0.3.2（https://github.com/arklexai/arksim/releases/tag/v0.3.2）；github/github_release: v0.3.1（https://github.com/arklexai/arksim/releases/tag/v0.3.1）；github/github_release: v0.3.0（https://github.com/arklexai/arksim/releases/tag/v0.3.0）；github/github_release: v0.2.0（https://github.com/arklexai/arksim/releases/tag/v0.2.0）；github/github_release: v0.1.0（https://github.com/arklexai/arksim/releases/tag/v0.1.0）；github/github_release: v0.0.6（https://github.com/arklexai/arksim/releases/tag/v0.0.6）；x: Zhou Yu (@Zhou_Yu_AI) / Highlights / X（https://x.com/Zhou_Yu_AI/highlights）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.3.7",
              "url": "https://github.com/arklexai/arksim/releases/tag/v0.3.7"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.3.6",
              "url": "https://github.com/arklexai/arksim/releases/tag/v0.3.6"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.3.5",
              "url": "https://github.com/arklexai/arksim/releases/tag/v0.3.5"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.3.4",
              "url": "https://github.com/arklexai/arksim/releases/tag/v0.3.4"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.3.3",
              "url": "https://github.com/arklexai/arksim/releases/tag/v0.3.3"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.3.2",
              "url": "https://github.com/arklexai/arksim/releases/tag/v0.3.2"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.3.1",
              "url": "https://github.com/arklexai/arksim/releases/tag/v0.3.1"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.3.0",
              "url": "https://github.com/arklexai/arksim/releases/tag/v0.3.0"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.2.0",
              "url": "https://github.com/arklexai/arksim/releases/tag/v0.2.0"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.1.0",
              "url": "https://github.com/arklexai/arksim/releases/tag/v0.1.0"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.0.6",
              "url": "https://github.com/arklexai/arksim/releases/tag/v0.0.6"
            },
            {
              "kind": "searxng_indexed",
              "source": "x",
              "title": "Zhou Yu (@Zhou_Yu_AI) / Highlights / X",
              "url": "https://x.com/Zhou_Yu_AI/highlights"
            }
          ],
          "status": "已收录 12 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "软件开发与交付",
      "desc": "<p align=\"center\">",
      "effort": "安装已验证",
      "forks": null,
      "icon": "code",
      "name": "arksim 能力包",
      "risk": "可发布",
      "slug": "arksim",
      "stars": null,
      "tags": [
        "浏览器 Agent",
        "网页任务自动化",
        "自然语言网页操作",
        "页面观察与动作规划",
        "评测体系"
      ],
      "thumb": "gray",
      "type": "Skill Pack"
    },
    "manual": {
      "markdown": "# https://github.com/arklexai/arksim 项目说明书\n\n生成时间：2026-05-15 12:11:39 UTC\n\n## 目录\n\n- [Introduction to ArkSim](#intro)\n- [Quickstart Guide](#quickstart)\n- [System Architecture Overview](#arch-overview)\n- [Simulation Engine](#simulation-engine)\n- [Evaluation System](#evaluation-system)\n- [Agent Types and Integration](#agent-types)\n- [LLM Provider Integration](#llm-providers)\n- [Tool Call Capture](#tool-call-capture)\n- [Scenario Management](#scenario-management)\n- [Configuration System](#configuration)\n\n<a id='intro'></a>\n\n## Introduction to ArkSim\n\n### 相关页面\n\n相关主题：[Quickstart Guide](#quickstart), [System Architecture Overview](#arch-overview)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n- [CONTRIBUTING.md](https://github.com/arklexai/arksim/blob/main/CONTRIBUTING.md)\n- [CLAUDE.md](https://github.com/arklexai/arksim/blob/main/CLAUDE.md)\n- [arksim/simulation_engine/tool_types.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n- [examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n- [examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n- [examples/integrations/langgraph/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/langgraph/README.md)\n- [examples/integrations/autogen/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/autogen/README.md)\n- [examples/ci/README.md](https://github.com/arklexai/arksim/blob/main/examples/ci/README.md)\n</details>\n\n# Introduction to ArkSim\n\nArkSim is a multi-turn agent evaluation and simulation platform designed to test, measure, and improve AI agent quality across conversational scenarios. 
It enables developers to define test scenarios, simulate user interactions with their agents, and generate comprehensive evaluation reports with quantitative and qualitative metrics.\n\n## Overview\n\nArkSim addresses the challenge of systematically evaluating conversational AI agents by providing:\n\n- **Scenario-based simulation**: Define test cases with user profiles, knowledge bases, and expected behaviors\n- **Automated evaluation**: Score agent responses across multiple dimensions including goal completion, helpfulness, and coherence\n- **Framework integration**: Connect with various agent frameworks including LangGraph, LangChain, AutoGen, Claude Agent SDK, Rasa, Dify, and OpenClaw\n- **Custom metrics**: Extend evaluation capabilities with user-defined quantitative and qualitative metrics\n- **Visual reporting**: Generate HTML reports with conversation transcripts, failure analysis, and per-metric scores\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Architecture Overview\n\nArkSim follows a pipeline architecture consisting of three primary stages:\n\n```mermaid\ngraph TD\n    A[Configuration<br/>config.yaml] --> B[Simulation Engine]\n    C[Scenarios<br/>scenarios.json] --> B\n    D[Agent<br/>custom_agent.py] --> B\n    B --> E[Conversation Data]\n    E --> F[Evaluator]\n    F --> G[HTML Report<br/>final_report.html]\n    H[Custom Metrics<br/>custom_metrics.py] --> F\n```\n\n### Core Components\n\n| Component | Description |\n|-----------|-------------|\n| **Simulation Engine** | Orchestrates multi-turn conversations between simulated users and the agent |\n| **Evaluator** | Scores agent performance using built-in and custom metrics |\n| **CLI Interface** | Command-line interface for running simulations (`arksim simulate-evaluate`) |\n| **Report Generator** | Produces HTML reports with visualizations and conversation transcripts 
|\n\n资料来源：[examples/customer-service/README.md:1-25](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Agent Integration Methods\n\nArkSim supports multiple agent integration patterns to accommodate different agent architectures.\n\n### Python Class Integration (BaseAgent)\n\nThe default integration method uses a Python class inheriting from `BaseAgent`. This pattern provides maximum flexibility for connecting any agent implementation.\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse\n\nclass MyAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Replace with your agent logic\n        return \"agent response\"\n```\n\nThe agent can return either:\n- A plain string response\n- An `AgentResponse` object containing both content and tool calls\n\n资料来源：[README.md:40-55](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### AgentResponse Data Model\n\nThe `AgentResponse` model encapsulates structured agent outputs:\n\n```python\nclass AgentResponse(BaseModel):\n    \"\"\"Structured return from agent execution, carrying both text and tool calls.\"\"\"\n    model_config = ConfigDict(extra=\"ignore\")\n    \n    content: str\n    tool_calls: list[ToolCall] = Field(default_factory=list)\n```\n\n资料来源：[arksim/simulation_engine/tool_types.py:32-42](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n\n### ToolCall Model\n\nTool calls are captured using the `ToolCall` model:\n\n```python\nclass ToolCall(BaseModel):\n    \"\"\"A single tool/function call observed during a turn.\"\"\"\n    model_config = ConfigDict(extra=\"ignore\")\n    \n    id: str\n    name: str\n    arguments: dict[str, Any] = Field(default_factory=dict)\n    result: str | None = None\n    error: str | None = 
None\n    source: ToolCallSource | None = None\n```\n\nThe `extra=\"ignore\"` configuration ensures forward compatibility with future versions.\n\n资料来源：[arksim/simulation_engine/tool_types.py:18-30](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n\n### Chat Completions Endpoint Integration\n\nFor agents exposing a standard OpenAI-compatible API:\n\n```yaml\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n```\n\n资料来源：[README.md:57-62](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### A2A Protocol Integration\n\nFor agents implementing the Agent-to-Agent (A2A) protocol:\n\n```yaml\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\n资料来源：[README.md:64-70](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### Automatic Tool Call Capture (Tracing)\n\nArkSim supports automatic tool call capture through a tracing processor, eliminating the need for agents to explicitly return tool calls:\n\n```mermaid\ngraph LR\n    A[Simulator sets<br/>routing context] --> B[agent.execute()]\n    B --> C[SDK fires<br/>TracingProcessor.on_span_end]\n    C --> D[arksim captures<br/>tool calls]\n    D --> E[Evaluator scores]\n```\n\n```bash\npip install -r requirements-traced.txt\narksim simulate-evaluate config_traced.yaml\n```\n\n资料来源：[examples/customer-service/README.md:35-55](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Configuration System\n\nArkSim uses YAML configuration files to define simulation and evaluation parameters.\n\n### Configuration File Structure\n\n```yaml\n# Simulation settings\nsimulation:\n  max_turns: 10\n  timeout: 300\n\n# Agent configuration\nagent_config:\n  agent_type: chat_completions  # or 'a2a', 'custom'\n  agent_name: my-agent\n  api_config:\n    endpoint: 
http://localhost:8000/v1/chat/completions\n\n# Trace receiver (optional)\ntrace_receiver:\n  enabled: true\n  wait_timeout: 5\n\n# Custom metrics\ncustom_metrics_file_paths:\n  - ./custom_metrics.py\n\nmetrics_to_run:\n  - goal_completion\n  - verification_compliance\n```\n\n资料来源：[examples/ci/README.md:1-25](https://github.com/arklexai/arksim/blob/main/examples/ci/README.md)\n\n### Scenario Definition\n\nScenarios are defined in `scenarios.json` with the following structure:\n\n| Field | Description |\n|-------|-------------|\n| `scenario_id` | Unique identifier for the scenario |\n| `user_profile` | Description of the simulated user |\n| `knowledge` | Context and information available to the user |\n| `goals` | Objectives the user aims to achieve |\n| `expected_behavior` | Expected agent behavior patterns |\n\n资料来源：[examples/integrations/dify/README.md:1-20](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md)\n\n## Evaluation Metrics\n\nArkSim provides built-in metrics and supports custom metric definitions.\n\n### Built-in Metrics\n\n| Metric | Description | Weight |\n|--------|-------------|--------|\n| Goal Completion | Whether the agent achieved the user's objectives | 60% |\n| Turn Success Ratio | Ratio of successful turns to total turns | 40% |\n| Final Score | Weighted average of other metrics | - |\n\n资料来源：[arksim/utils/html_report/report_template.html:80-95](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n### Custom Metrics\n\nCustom metrics can be implemented as either quantitative (scored numerically) or qualitative (evaluated via LLM).\n\n#### Quantitative Metric Pattern\n\n```python\nfrom arksim.evaluator import (\n    QuantitativeMetric,\n    QuantResult,\n    ScoreInput,\n    format_chat_history,\n)\n\nclass VerificationComplianceMetric(Queresult):\n    identity_verification: float  # 0.0-1.0\n    action_gating: float  # 0.0-1.0\n    reason: 
str\n\nVERIFICATION_COMPLIANCE_SYSTEM_PROMPT = \"\"\"\\\nYou are an impartial evaluator for a customer service agent.\nScore how well the agent followed identity verification protocols...\"\"\"\n\ndef verification_compliance_metric(\n    input_data: ScoreInput,\n) -> QuantResult:\n    # Implementation\n    return QuantResult(score=0.85, reason=\"Verification completed\")\n```\n\n资料来源：[examples/customer-service/custom_metrics.py:15-40](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\n#### Qualitative Metric Pattern\n\n```python\nfrom arksim.evaluator import (\n    QualitativeMetric,\n    QualResult,\n    ScoreInput,\n    format_chat_history,\n)\n\ndef helpfulness_metric(input_data: ScoreInput) -> QualResult:\n    system_prompt = \"\"\"Evaluate the helpfulness of the agent response...\"\"\"\n    return QualResult(\n        assessment=\"The agent provided comprehensive assistance\",\n        score=0.9,\n        reason=\"Clear explanation with actionable steps\"\n    )\n```\n\n### Metric Configuration\n\nTo use custom metrics:\n\n1. Implement the metric function in a Python file\n2. Reference the file path in `custom_metrics_file_paths` in config.yaml\n3. 
Add the metric name to `metrics_to_run`\n\n资料来源：[examples/customer-service/custom_metrics.py:1-15](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\n## CLI Usage\n\nArkSim provides a command-line interface for running simulations and evaluations.\n\n### Primary Commands\n\n| Command | Description |\n|---------|-------------|\n| `arksim simulate-evaluate config.yaml` | Run simulation and evaluation in one step |\n| `arksim simulate config_simulate.yaml` | Run simulation only |\n| `arksim evaluate config_evaluate.yaml` | Evaluate existing simulation results |\n\n资料来源：[examples/customer-service/README.md:15-22](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n### Workflow Examples\n\n#### Combined Simulation and Evaluation\n\n```bash\narksim simulate-evaluate config.yaml\n```\n\n#### Separate Simulation and Evaluation\n\n```bash\n# Step 1: Simulate\narksim simulate config_simulate.yaml\n\n# Step 2: Evaluate\narksim evaluate config_evaluate.yaml\n```\n\nResults are written to `./results/simulation/simulation.json`. 
The evaluation report is printed to stdout with per-scenario metric scores and failure analysis.\n\n资料来源：[examples/integrations/dify/README.md:15-25](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md)\n\n## Framework Integrations\n\nArkSim provides integration examples for popular agent frameworks.\n\n### Integration Matrix\n\n| Framework | Package Required | Documentation |\n|-----------|-----------------|---------------|\n| LangGraph | `langgraph langchain-openai` | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/langgraph/README.md) |\n| LangChain | `langgraph langchain-openai` | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/langchain/README.md) |\n| AutoGen | `autogen-agentchat autogen-ext[openai]` | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/autogen/README.md) |\n| Claude Agent SDK | `claude-agent-sdk` | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/claude-agent-sdk/README.md) |\n| Rasa | `rasa-pro` | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/rasa/README.md) |\n| Dify | (custom agent) | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md) |\n| OpenClaw | (custom agent) | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/openclaw/README.md) |\n\n资料来源：[examples/integrations/langgraph/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/langgraph/README.md), [examples/integrations/autogen/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/autogen/README.md)\n\n### General Integration Pattern\n\nRegardless of the framework, the integration follows this pattern:\n\n1. Create a `custom_agent.py` file with a `BaseAgent` subclass\n2. Implement the `execute()` method to call the framework's agent\n3. Return either a string or `AgentResponse` with tool calls\n4. 
Configure `config.yaml` to use the custom agent\n5. Create `scenarios.json` with test cases\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse\n\nclass FrameworkAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-session-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Call your framework's agent here\n        result = await your_agent.run(user_query)\n        return str(result)\n```\n\n## Report Generation\n\nArkSim generates comprehensive HTML evaluation reports containing:\n\n### Report Sections\n\n| Section | Content |\n|---------|---------|\n| **Summary** | Overall scores, pass/fail status |\n| **Per-Scenario Scores** | Individual metric scores for each scenario |\n| **Failure Categories** | Grouped failure patterns with counts |\n| **Conversation Viewer** | Full transcript of each simulated conversation |\n| **Score Explanations** | Rationale for each score with references to conversation turns |\n\n资料来源：[README.md:25-30](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### Report Output\n\nThe report tells you where your agent is strong and where it breaks. You get per-metric scores, categorized failures, and full conversation transcripts so you can read the exact turns where things went wrong.\n\n资料来源：[README.md:30-35](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Development Guidelines\n\n### Project Setup\n\n```bash\n# Fork and clone\ngit clone https://github.com/<your-username>/arksim.git\ncd arksim\n\n# Install in editable mode with dev dependencies\npip install -e \".[dev]\"\n\n# Create a branch\ngit checkout -b my-feature\n```\n\n### Code Quality\n\nArkSim uses Ruff for linting and formatting:\n\n```bash\nruff check .    # Lint\nruff format .   
# Format\n```\n\nPre-commit hooks run both automatically on commit.\n\n### Code Style\n\n- Follow PEP 8 conventions\n- Code lines: maximum 120 characters\n- Comments and docstrings: maximum 80 characters\n- Type hints encouraged for function signatures\n- Use absolute imports over relative imports\n\n### Commit Message Format\n\n```\n<component>: <verb> <description>\n```\n\nExamples:\n- `evaluator: add custom metric support`\n- `simulator: fix profile generation for empty attributes`\n- `cli: support verbose flag for streaming output`\n\nKeep the subject line under 72 characters, use lowercase, imperative mood.\n\n### Branch Naming\n\n```\n<type>/<short-description>\n```\n\nExamples: `feat/retry-logic`, `fix/empty-list-handling`, `docs/update-quickstart`.\n\n资料来源：[CONTRIBUTING.md:1-50](https://github.com/arklexai/arksim/blob/main/CONTRIBUTING.md)\n\n## CI/CD Integration\n\nArkSim can be integrated into GitHub Actions workflows for automated testing.\n\n### Workflow Options\n\n| Workflow | Use Case |\n|----------|----------|\n| `arksim.yml` | HTTP server endpoints requiring startup/shutdown |\n| `arksim-pytest.yml` | Python-based pytest integration |\n\n### Setup Requirements\n\n1. Update `TODO` sections in `arksim.yml` (startup command and health-check URL)\n2. Create `tests/arksim/config.yaml` pointing to your server endpoint\n3. Create `tests/arksim/scenarios.json` with test cases\n4. Add custom metrics to `tests/arksim/custom_metrics/` if needed\n5. Add `OPENAI_API_KEY` (and optionally `AGENT_API_KEY`) to GitHub secrets\n\n资料来源：[examples/ci/README.md:1-15](https://github.com/arklexai/arksim/blob/main/examples/ci/README.md)\n\n## Summary\n\nArkSim provides a comprehensive framework for evaluating multi-turn conversational AI agents through:\n\n1. **Flexible integration**: Connect agents via Python classes, REST APIs, or framework-specific connectors\n2. **Rich scenario definitions**: Test agents with diverse user profiles and interaction goals\n3. 
**Extensible evaluation**: Implement custom metrics for domain-specific assessment\n4. **Actionable reporting**: Generate HTML reports that pinpoint exactly where and why agent behavior falls short\n\nBy standardizing agent evaluation, ArkSim enables data-driven improvements to conversational AI systems across different frameworks and architectures.\n\n---\n\n<a id='quickstart'></a>\n\n## Quickstart Guide\n\n### 相关页面\n\n相关主题：[Introduction to ArkSim](#intro), [Agent Types and Integration](#agent-types)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n- [arksim/simulation_engine/agent/base.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/agent/base.py)\n- [arksim/simulation_engine/tool_types.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n- [examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n- [examples/integrations/langchain/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/langchain/README.md)\n- [README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n</details>\n\n# Quickstart Guide\n\nThis guide walks you through setting up and running your first agent simulation with ArkSim. By the end, you'll understand how to scaffold a project, connect your agent, define test scenarios, and execute simulation and evaluation workflows.\n\n## Prerequisites\n\nBefore starting, ensure you have:\n\n- Python 3.10 or higher\n- pip or pipx for package installation\n- An API key for your LLM provider (OpenAI, Anthropic, etc.) 
if your agent requires external API calls\n\n## Installation\n\nInstall ArkSim using pip:\n\n```bash\npip install arksim\n```\n\nFor development with test dependencies:\n\n```bash\npip install -e \".[dev]\"\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Project Initialization\n\nThe `arksim init` command scaffolds a starter project with all necessary configuration files. Run it from your desired working directory:\n\n```bash\narksim init\n```\n\n### Agent Connection Types\n\nArkSim supports three agent connection types via the `--agent-type` flag:\n\n| Type | Description | Use Case |\n|------|-------------|----------|\n| `custom` | Python class implementing `BaseAgent` | Full control, no external server needed |\n| `chat_completions` | HTTP endpoint compatible with OpenAI format | Existing REST APIs |\n| `a2a` | Agent-to-Agent protocol | Multi-agent systems |\n\n资料来源：[arksim/cli.py:120-136](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n### Command Options\n\n```bash\narksim init --agent-type custom    # Default: Python agent class\narksim init --agent-type chat_completions  # HTTP endpoint\narksim init --agent-type a2a       # A2A protocol\narksim init --agent-type custom --force  # Overwrite existing files\n```\n\nScaffolding generates these files:\n\n| File | Purpose |\n|------|---------|\n| `my_agent.py` | Agent implementation stub |\n| `config.yaml` | Simulator configuration |\n| `scenarios.json` | Test scenario definitions |\n| `.env` | Environment variables template |\n\n## Project Structure\n\n```\nyour-project/\n├── my_agent.py        # Your agent implementation\n├── config.yaml         # Simulation & evaluation settings\n├── scenarios.json     # Test scenarios\n└── results/            # Output directory (created on run)\n    ├── simulation/\n    │   └── simulation.json\n    └── evaluation/\n        └── evaluation.json\n```\n\n## Connecting Your Agent\n\n### Python Class (Recommended)\n\nReplace the 
generated `my_agent.py` with your agent logic. Your class must inherit from `BaseAgent`:\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse\n\nclass MyAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Your agent logic here\n        return \"agent response\"\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\nThe `execute` method supports two return types:\n\n| Return Type | Description |\n|-------------|-------------|\n| `str` | Plain text response |\n| `AgentResponse` | Structured response with text and tool calls |\n\n```python\nfrom arksim.simulation_engine.tool_types import AgentResponse, ToolCall\n\nasync def execute(self, user_query: str, **kwargs: object) -> AgentResponse:\n    tool_calls = [\n        ToolCall(\n            id=\"call_123\",\n            name=\"search_database\",\n            arguments={\"query\": user_query}\n        )\n    ]\n    return AgentResponse(\n        content=\"Found results for your query\",\n        tool_calls=tool_calls\n    )\n```\n\n资料来源：[arksim/simulation_engine/tool_types.py:45-72](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n\n### Chat Completions Endpoint\n\nFor HTTP-based agents, configure `config.yaml`:\n\n```yaml\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n```\n\n### A2A Protocol\n\nFor Agent-to-Agent protocol support:\n\n```yaml\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\nA2A agents can surface tool calls for evaluation via the protocol extension.\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Configuring 
Simulation\n\nEdit `config.yaml` to control simulation behavior:\n\n```yaml\nsimulator:\n  max_turns: 20\n  timeouts:\n    agent_response: 30\n    tool_execution: 10\n  model: gpt-4o-mini\n```\n\n### Core Configuration Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `max_turns` | int | 20 | Maximum conversation turns per scenario |\n| `agent_response` | int | 30 | Timeout for agent response (seconds) |\n| `tool_execution` | int | 10 | Timeout for tool execution (seconds) |\n| `model` | string | gpt-4o-mini | Simulator model for user simulation |\n\n## Defining Test Scenarios\n\nEdit `scenarios.json` to define your test cases:\n\n```json\n[\n  {\n    \"id\": \"scenario-001\",\n    \"name\": \"Customer Inquiry\",\n    \"category\": \"support\",\n    \"user_profile\": {\n      \"name\": \"Alice Johnson\",\n      \"age\": 34,\n      \"account_type\": \"premium\"\n    },\n    \"knowledge\": [\n      \"Customer has a premium account\",\n      \"Customer is inquiring about billing\"\n    ],\n    \"conversation\": [\n      {\n        \"turn\": 1,\n        \"user\": \"I noticed a charge on my account that I don't recognize\",\n        \"goal\": \"Customer wants to understand and dispute the unfamiliar charge\"\n      }\n    ]\n  }\n]\n```\n\n### Scenario Schema Fields\n\n| Field | Required | Description |\n|-------|----------|-------------|\n| `id` | Yes | Unique scenario identifier |\n| `name` | Yes | Human-readable name |\n| `category` | No | Grouping category for filtering |\n| `user_profile` | No | Simulated user attributes |\n| `knowledge` | No | Facts the simulated user knows |\n| `conversation` | Yes | Multi-turn conversation structure |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Running Simulation and Evaluation\n\n### Combined Workflow\n\nRun both simulation and evaluation in one command:\n\n```bash\narksim 
simulate-evaluate config.yaml\n```\n\n### Separate Steps\n\nFor more control, run simulation and evaluation separately:\n\n```bash\n# Step 1: Simulate\narksim simulate config_simulate.yaml\n\n# Step 2: Evaluate\narksim evaluate config_evaluate.yaml\n```\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Command Reference\n\n| Command | Description |\n|---------|-------------|\n| `arksim init` | Scaffold new project |\n| `arksim simulate <config>` | Run simulation only |\n| `arksim evaluate <config>` | Run evaluation only |\n| `arksim simulate-evaluate <config>` | Run both steps |\n| `arksim ui` | Launch web UI (port 8080) |\n| `arksim examples` | Download example projects |\n| `arksim prompts` | List available prompts |\n\n资料来源：[arksim/cli.py:89-152](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n## Web UI\n\nLaunch the web-based control plane:\n\n```bash\narksim ui --port 8080\n```\n\nThe UI provides:\n- Scenario management\n- Real-time simulation progress\n- Log viewing\n- Dark/light mode toggle\n\n## Integration Examples\n\nArkSim supports integrations with popular agent frameworks:\n\n| Framework | Command |\n|-----------|---------|\n| LangChain/LangGraph | `pip install langgraph langchain-openai` |\n| Claude Agent SDK | `pip install claude-agent-sdk` |\n| Microsoft AutoGen | `pip install autogen-agentchat autogen-ext[openai]` |\n| Dify | HTTP-based integration |\n\nExample workflow for LangGraph:\n\n```bash\ncd examples/integrations/langgraph\npip install langgraph langchain-openai\nexport OPENAI_API_KEY=\"<your-key>\"\narksim simulate-evaluate config.yaml\n```\n\n资料来源：[examples/integrations/langchain/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/langchain/README.md)\n\n## Next Steps\n\n- Review the [Evaluation Metrics](#evaluation-system) documentation for customizing scoring\n- Explore the [Custom Agent](#agent-types) guide for advanced 
integration patterns\n- Download example projects with `arksim examples`\n\n---\n\n<a id='arch-overview'></a>\n\n## System Architecture Overview\n\n### 相关页面\n\n相关主题：[Simulation Engine](#simulation-engine), [Evaluation System](#evaluation-system), [LLM Provider Integration](#llm-providers)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/simulation_engine/tool_types.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n- [arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n- [README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n- [examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n- [examples/e-commerce/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/e-commerce/custom_metrics.py)\n- [arksim/utils/html_report/report_template.html](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n</details>\n\n# System Architecture Overview\n\nArkSim is a multi-turn agent evaluation framework that simulates user conversations with agents, captures tool calls, and scores agent performance against customizable metrics. 
This document provides a comprehensive overview of the system's architecture, core components, and data flows.\n\n## Core Components\n\nArkSim's architecture is organized into three primary subsystems that work together to simulate, capture, and evaluate agent behavior.\n\n| Component | Purpose |\n|-----------|---------|\n| Simulation Engine | Orchestrates multi-turn conversations between user simulators and agents |\n| Evaluator | Scores agent responses against qualitative and quantitative metrics |\n| LLM Layer | Powers user simulation and metric evaluation via configurable model providers |\n\n## High-Level Architecture\n\n```mermaid\ngraph TD\n    subgraph Simulation[\"Simulation Engine\"]\n        CLI[CLI Interface]\n        Config[Config Loader]\n        Simulator[Simulator Core]\n        AgentConnector[Agent Connector]\n        UserSimulator[User Simulator]\n    end\n\n    subgraph Evaluation[\"Evaluator\"]\n        Metrics[Custom Metrics]\n        Scoring[Scoring Engine]\n        Report[HTML Report Generator]\n    end\n\n    subgraph LLM[\"LLM Layer\"]\n        ChatLLM[Chat LLM]\n        EvalLLM[Evaluation LLM]\n    end\n\n    subgraph External[\"External Systems\"]\n        CustomAgent[Custom Agent]\n        HTTPEndpoint[HTTP Endpoint]\n        A2AAgent[A2A Agent]\n    end\n\n    CLI --> Config\n    Config --> Simulator\n    Simulator --> UserSimulator\n    Simulator --> AgentConnector\n    AgentConnector --> CustomAgent\n    AgentConnector --> HTTPEndpoint\n    AgentConnector --> A2AAgent\n    UserSimulator --> ChatLLM\n    Metrics --> EvalLLM\n    EvalLLM --> Scoring\n    Scoring --> Report\n\n    style CLI fill:#e1f5fe\n    style Simulator fill:#fff3e0\n    style Report fill:#e8f5e9\n```\n\n## CLI Interface\n\nThe command-line interface provides multiple entry points for running simulations and evaluations. 
The CLI is implemented in `arksim/cli.py` and supports the following commands:\n\n| Command | Description |\n|---------|-------------|\n| `arksim simulate-evaluate config.yaml` | Run simulation and evaluation in a single pipeline |\n| `arksim simulate config.yaml` | Run simulation only |\n| `arksim evaluate config.yaml` | Run evaluation on existing simulation results |\n| `arksim init` | Scaffold starter files for agent testing |\n| `arksim ui` | Launch web UI control plane |\n| `arksim examples` | Download example projects from GitHub |\n\n资料来源：[arksim/cli.py:1-100](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n### Simulation Modes\n\nArkSim supports three distinct agent integration patterns:\n\n```mermaid\ngraph LR\n    subgraph AgentTypes[\"Agent Types\"]\n        Custom[Custom Agent<br/>Python class extending BaseAgent]\n        HTTP[Chat Completions<br/>HTTP endpoint /v1/chat/completions]\n        A2A[A2A Protocol<br/>Agent-to-Agent standard]\n    end\n```\n\n1. **Custom Agent (Python class)**: Extend `BaseAgent` and implement the `execute()` method\n2. **Chat Completions**: Configure an HTTP endpoint for OpenAI-compatible chat completions API\n3. **A2A Protocol**: Connect via the Agent-to-Agent protocol standard\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Simulation Engine\n\nThe simulation engine orchestrates multi-turn conversations between a user simulator and the agent under test. Each turn consists of:\n\n1. User simulator generates a response based on conversation history and scenario knowledge\n2. Agent executes the user query and returns a response (optionally with tool calls)\n3. 
Simulator captures the interaction for evaluation\n\n### Tool Call Capture\n\nArkSim captures tool/function calls in two ways:\n\n| Capture Method | Mechanism | Configuration |\n|----------------|-----------|---------------|\n| AgentResponse | Agent returns structured `AgentResponse` with `tool_calls` list | Default behavior |\n| TracingProcessor | SDK's `TracingProcessor.on_span_end` captures calls automatically | `trace_receiver.enabled: true` |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n#### Tool Call Data Model\n\nTool calls are represented by the `ToolCall` class:\n\n```python\nclass ToolCall(BaseModel):\n    id: str\n    name: str\n    arguments: dict[str, Any] = Field(default_factory=dict)\n    result: str | None = None\n    error: str | None = None\n    source: ToolCallSource | None = None\n```\n\nThe `AgentResponse` wraps both content and tool calls:\n\n```python\nclass AgentResponse(BaseModel):\n    content: str\n    tool_calls: list[ToolCall] = Field(default_factory=list)\n```\n\n资料来源：[arksim/simulation_engine/tool_types.py:1-50](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n\n### Traced Agent Flow\n\n```mermaid\nsequenceDiagram\n    participant Simulator\n    participant Agent\n    participant Tracing as ArksimTracingProcessor\n    participant Evaluator\n\n    Simulator->>Agent: execute(user_query)\n    Agent->>Tracing: on_span_end(span)\n    Tracing->>Simulator: captured_tool_calls\n    Simulator->>Agent: RunResult\n    Agent->>Simulator: AgentResponse\n    Simulator->>Evaluator: Tool calls + Conversation\n    Evaluator->>Simulator: Metric Scores\n```\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Evaluator\n\nThe evaluator scores agent performance using both quantitative and qualitative metrics. 
The evaluation framework supports custom metrics defined via Python files.\n\n### Metric Types\n\n| Type | Description | Scoring Method |\n|------|-------------|----------------|\n| QuantitativeMetric | Numerical scores (0.0-1.0 scale) | Structured JSON schema validation |\n| QualitativeMetric | Free-form evaluation with reasoning | LLM-generated analysis |\n\n资料来源：[examples/customer-service/custom_metrics.py:1-60](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\n### Custom Metrics Structure\n\nCustom metrics require:\n\n1. A Pydantic `BaseModel` defining the output schema\n2. A system prompt describing evaluation criteria\n3. Implementation of `QuantitativeMetric` or `QualitativeMetric`\n\n```python\nclass ConversionSchema(BaseModel):\n    intent_strength: float\n    conversion_outcome: float\n    evidence: list[str]\n    reason: str\n\nCONVERSION_SYSTEM_PROMPT = \"\"\"\\\nYou are an impartial evaluator for an e-commerce shopping agent.\nYour job is to score (1) the shopper's purchase intent and (2) whether the agent achieved a conversion outcome...\n\"\"\"\n```\n\n资料来源：[examples/e-commerce/custom_metrics.py:1-40](https://github.com/arklexai/arksim/blob/main/examples/e-commerce/custom_metrics.py)\n\n### Score Calculation\n\nThe evaluator computes a **Final Score** as a weighted average:\n\n| Component | Weight | Description |\n|-----------|--------|-------------|\n| Turn Success Ratio | 40% | Ratio of successful turns to total turns |\n| Goal Completion Score | 60% | LLM-assessed goal achievement score |\n\n| Status | Condition |\n|--------|-----------|\n| Done | Final score = 1.0 |\n| Partial Failure | 0.0 < Final score < 1.0 |\n| Complete Failure | Final score = 0.0 |\n\n资料来源：[arksim/utils/html_report/report_template.html:1-50](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n## Data Flow\n\n```mermaid\ngraph LR\n    subgraph Input[\"Input\"]\n        Config[config.yaml]\n  
      Scenarios[scenarios.json]\n        Metrics[custom_metrics.py]\n    end\n\n    subgraph Process[\"Processing\"]\n        Sim[Simulation]\n        Eval[Evaluation]\n        Score[Scoring]\n    end\n\n    subgraph Output[\"Output\"]\n        Results[simulation.json]\n        Report[evaluation.html]\n    end\n\n    Config --> Sim\n    Scenarios --> Sim\n    Sim --> Eval\n    Metrics --> Eval\n    Eval --> Score\n    Score --> Results\n    Score --> Report\n```\n\n## Agent Connector Types\n\nArkSim supports multiple agent integration patterns via the configuration system:\n\n```yaml\n# Option 1: Custom Python Agent\nagent_config:\n  agent_type: custom\n  agent_name: my-agent\n\n# Option 2: Chat Completions HTTP Endpoint\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n\n# Option 3: A2A Protocol\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## User Simulator\n\nThe user simulator generates realistic multi-turn conversations based on:\n\n- **Scenario definitions**: Predefined conversation flows and expected behaviors\n- **User profiles**: Demographic and behavioral attributes for persona simulation\n- **Knowledge bases**: Domain-specific information the simulated user possesses\n- **Conversation history**: Full context of the multi-turn interaction\n\nThe simulator uses LLM-based generation to produce contextually appropriate user responses that evolve through the conversation based on agent actions and conversation state.\n\n## HTML Report Generation\n\nAfter evaluation completes, ArkSim generates an interactive HTML report containing:\n\n| Section | Content |\n|---------|---------|\n| Summary Statistics | Overall scores, pass/fail rates |\n| Per-Scenario Metrics | Individual metric scores with reasoning |\n| Failure 
Categories | Grouped analysis of failure types |\n| Conversation Transcripts | Full turn-by-turn dialogue viewer |\n\n资料来源：[arksim/utils/html_report/report_template.html:1-100](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n## Integration Examples\n\nArkSim provides example integrations for popular agent frameworks:\n\n| Framework | Example Location | Connection Method |\n|-----------|-----------------|-------------------|\n| LangGraph | `examples/integrations/langgraph/` | Custom agent connector |\n| LangChain | `examples/integrations/langchain/` | Custom agent connector |\n| Claude Agent SDK | `examples/integrations/claude-agent-sdk/` | Custom agent connector |\n| AutoGen | `examples/integrations/autogen/` | Custom agent connector |\n| Pydantic AI | `examples/integrations/pydantic-ai/` | Custom agent connector |\n| Dify | `examples/integrations/dify/` | HTTP client |\n\n资料来源：[examples/integrations/langgraph/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/langgraph/README.md), [examples/integrations/claude-agent-sdk/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/claude-agent-sdk/README.md), [examples/integrations/autogen/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/autogen/README.md)\n\n## Configuration Schema\n\nThe main configuration file (`config.yaml`) controls all aspects of simulation and evaluation:\n\n| Section | Key Options |\n|---------|-------------|\n| `agent_config` | `agent_type`, `agent_name`, `api_config.endpoint` |\n| `scenario_file` | Path to scenarios.json |\n| `metrics_to_run` | List of metric names to execute |\n| `custom_metrics_file_paths` | Paths to custom metric Python files |\n| `trace_receiver.enabled` | Enable TracingProcessor capture (default: false) |\n| `trace_receiver.wait_timeout` | Timeout for trace capture (seconds) 
|\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Web UI\n\nArkSim includes a web-based control plane for managing simulations:\n\n```mermaid\ngraph TD\n    subgraph UI[\"Web UI\"]\n        Build[Build Scenarios]\n        Load[Load Existing]\n        Run[Run Simulation]\n        View[View Results]\n    end\n\n    subgraph Features[\"UI Features\"]\n        AutoGen[Auto-generate Scenarios PRO]\n        Browse[File Browser]\n        Refresh[Refresh Results]\n    end\n```\n\nFeatures include:\n- Scenario building and loading\n- Auto-generate scenarios (PRO feature)\n- File browser integration\n- Results viewing and refresh\n\n资料来源：[arksim/ui/frontend/index.html:1-80](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html)\n\n## Summary\n\nArkSim provides a comprehensive multi-turn agent evaluation framework with:\n\n- **Flexible agent integration**: Support for custom Python agents, HTTP endpoints, and A2A protocol\n- **Tool call capture**: Two mechanisms for capturing agent tool executions\n- **Customizable metrics**: Both quantitative and qualitative evaluation approaches\n- **Interactive reporting**: HTML-based results with conversation viewer\n- **CLI and UI**: Command-line and web-based interfaces for running evaluations\n- **Framework integrations**: Pre-built examples for LangGraph, LangChain, Claude SDK, AutoGen, Pydantic AI, and Dify\n\n---\n\n<a id='simulation-engine'></a>\n\n## Simulation Engine\n\n### 相关页面\n\n相关主题：[System Architecture Overview](#arch-overview), [Evaluation System](#evaluation-system), [Scenario Management](#scenario-management)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/simulation_engine/simulator.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/simulator.py)\n- 
[arksim/simulation_engine/entities.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/entities.py)\n- [arksim/simulation_engine/core/multi_knowledge_handling.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/core/multi_knowledge_handling.py)\n- [arksim/simulation_engine/agent/base.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/agent/base.py)\n- [arksim/simulation_engine/tool_types.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n- [arksim/simulation_engine/utils/prompts.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/utils/prompts.py)\n</details>\n\n# Simulation Engine\n\nThe Simulation Engine is the core component of ArkSim responsible for executing multi-turn conversations between simulated users and agent systems under test. It orchestrates the entire simulation lifecycle, from scenario loading to agent execution, while capturing tool calls, responses, and conversation state for downstream evaluation.\n\n## Architecture Overview\n\nThe Simulation Engine follows a layered architecture that separates concerns between scenario management, agent execution, tool call capture, and knowledge handling.\n\n```mermaid\ngraph TD\n    A[Scenarios JSON] --> B[Simulator]\n    C[Config YAML] --> B\n    B --> D[Agent Executor]\n    D --> E[BaseAgent]\n    E --> F[Custom Agent / Chat Completions / A2A]\n    F --> G[Tool Calls Capture]\n    G --> H[Simulation Results]\n    D --> I[Multi-Knowledge Handler]\n    I --> J[Knowledge Sources]\n    \n    style B fill:#e1f5fe\n    style D fill:#fff3e0\n    style E fill:#e8f5e9\n```\n\n### Core Components\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| Simulator | `simulator.py` | Orchestrates the simulation lifecycle |\n| Entities | `entities.py` | Data models for scenarios, profiles, and turns |\n| Agent Base | `agent/base.py` | Abstract base class for all agent implementations 
|\n| Tool Types | `tool_types.py` | Data models for tool calls and agent responses |\n| Multi-Knowledge Handler | `core/multi_knowledge_handling.py` | Manages multiple knowledge sources for user profiles |\n| Prompt Utilities | `utils/prompts.py` | Generates prompts for simulated users |\n\n## Data Models\n\n### ToolCall\n\nThe `ToolCall` class represents a single tool or function call observed during a conversation turn.\n\n```python\nclass ToolCall(BaseModel):\n    id: str\n    name: str\n    arguments: dict[str, Any] = Field(default_factory=dict)\n    result: str | None = None\n    error: str | None = None\n    source: ToolCallSource | None = None\n```\n\nKey characteristics:\n- Declares `extra=\"ignore\"` for forward compatibility with future versions\n- Supports capturing arguments, results, and error states\n- Tracks the source of tool calls (e.g., from response parsing or tracing)\n\n资料来源：[arksim/simulation_engine/tool_types.py:26-37]()\n\n### AgentResponse\n\nThe `AgentResponse` class provides a structured return from agent execution.\n\n```python\nclass AgentResponse(BaseModel):\n    content: str\n    tool_calls: list[ToolCall] = Field(default_factory=list)\n```\n\n资料来源：[arksim/simulation_engine/tool_types.py:45-52]()\n\n## BaseAgent Interface\n\nAll agent implementations must inherit from `BaseAgent` and implement the required async methods.\n\n```python\nclass MyAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Replace with your agent logic\n        return \"agent response\"\n```\n\n资料来源：[README.md:45-55]()\n\n### Required Methods\n\n| Method | Return Type | Description |\n|--------|-------------|-------------|\n| `get_chat_id()` | `str` | Returns a unique identifier for the chat session |\n| `execute()` | `str \\| AgentResponse` | Executes the agent with a user query and returns content with optional tool calls 
|\n\n资料来源：[README.md:45-55](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Simulation Workflow\n\n```mermaid\ngraph LR\n    A[Load Config] --> B[Load Scenarios]\n    B --> C[Initialize Agent]\n    C --> D[For Each Scenario]\n    D --> E[Generate User Profile]\n    E --> F[Execute Turn]\n    F --> G[Capture Tool Calls]\n    G --> H[Record Response]\n    H --> I{More Turns?}\n    I -->|Yes| F\n    I -->|No| J[Save Results]\n    J --> K{More Scenarios?}\n    K -->|Yes| D\n    K -->|No| L[Complete]\n```\n\n### Step 0: Build Scenarios\n\nBefore running a simulation, users must create or load test scenarios. The UI provides options to:\n\n- **Auto-generate Scenarios** (Pro feature) - Automatically generate realistic test scenarios from the agent's knowledge base\n- **Load Existing** - Load scenario files from a specified path\n\n资料来源：[arksim/ui/frontend/index.html:1-50](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html)\n\n### Step 1: Simulation Execution\n\nThe simulator executes each scenario by:\n\n1. Loading the scenario configuration and user profiles\n2. Generating multi-turn conversations based on the scenario goals\n3. Capturing all tool calls and responses\n4. 
Producing a `simulation.json` output file\n\n资料来源：[examples/integrations/dify/README.md:18-20]()\n\n## Agent Types\n\nArkSim supports multiple agent integration patterns:\n\n| Agent Type | Configuration | Use Case |\n|------------|---------------|----------|\n| Custom Python Class | `agent_type: custom` | Full control via subclassing `BaseAgent` |\n| Chat Completions | `agent_type: chat_completions` | HTTP endpoint compatible with OpenAI format |\n| A2A Protocol | `agent_type: a2a` | Agent-to-Agent protocol endpoints |\n\n资料来源：[README.md:57-68]()\n\n### Chat Completions Configuration\n\n```yaml\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n```\n\n### A2A Protocol Configuration\n\n```yaml\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\n资料来源：[README.md:57-68]()\n\n## Tool Call Capture\n\nArkSim provides two mechanisms for capturing tool calls:\n\n### Response-Based Capture (Default)\n\nThe agent returns an `AgentResponse` containing explicit tool calls:\n\n```python\nasync def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n    run_result = self.agent.run(user_query)\n    return AgentResponse(\n        content=run_result.text,\n        tool_calls=extract_tool_calls(run_result)\n    )\n```\n\n### Tracing-Based Capture (Automatic)\n\nFor agents using the ArkSim tracing processor, tool calls are captured automatically without modifying the agent response:\n\n```python\n# At module load\nfrom arksim.simulation_engine.tracing import ArksimTracingProcessor\nagent.register(ArksimTracingProcessor())\n\n# Agent returns plain str\nasync def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n    return self.agent.run(user_query)  # Returns plain string\n```\n\n资料来源：[examples/customer-service/README.md:1-35]()\n\n## Configuration\n\n### CLI Usage\n\n```bash\n# 
Combined simulation and evaluation\narksim simulate-evaluate config.yaml\n\n# Separate steps\narksim simulate config_simulate.yaml\narksim evaluate config_evaluate.yaml\n```\n\n资料来源：[examples/customer-service/README.md:8-14]()\n\n### Trace Receiver Configuration\n\n```yaml\ntrace_receiver:\n  enabled: true\n  wait_timeout: 5\n```\n\nWhen `trace_receiver.enabled` is false, ArkSim only captures tool calls from `AgentResponse`.\n\n资料来源：[examples/customer-service/README.md:30-35]()\n\n## Integration Examples\n\nArkSim provides pre-built integrations for popular agent frameworks:\n\n| Framework | Package | Example Path |\n|-----------|---------|--------------|\n| LangGraph | `langgraph`, `langchain-openai` | `examples/integrations/langgraph/` |\n| AutoGen | `autogen-agentchat`, `autogen-ext[openai]` | `examples/integrations/autogen/` |\n| Claude Agent SDK | `claude-agent-sdk` | `examples/integrations/claude-agent-sdk/` |\n| CrewAI | `crewai` | `examples/integrations/crewai/` |\n| Pydantic AI | `pydantic-ai` | `examples/integrations/pydantic-ai/` |\n| LangChain | `langgraph`, `langchain-openai` | `examples/integrations/langchain/` |\n\n资料来源：[examples/integrations/langgraph/README.md](), [examples/integrations/autogen/README.md](), [examples/integrations/claude-agent-sdk/README.md](), [examples/integrations/crewai/README.md](), [examples/integrations/pydantic-ai/README.md](), [examples/integrations/langchain/README.md]()\n\n## Output Format\n\nSimulation results are written to `./results/simulation/simulation.json` containing:\n\n- Complete conversation transcripts\n- Captured tool calls with arguments and results\n- Turn-by-turn timing information\n- User profile data\n- Scenario metadata\n\n资料来源：[examples/integrations/dify/README.md:18-20]()\n\n## Custom Metrics Support\n\nThe simulation engine supports custom quantitative and qualitative metrics through the evaluator. 
Custom metric files can be added to `custom_metrics/` directories and referenced in the configuration.\n\n资料来源：[examples/customer-service/custom_metrics.py:1-20]()\n\n## Forward Compatibility\n\nBoth `ToolCall` and `AgentResponse` declare `extra=\"ignore\"` in their Pydantic configuration. This ensures that snapshots from future ArkSim versions containing new fields can be loaded by older versions without raising `ValidationError`.\n\n```python\nmodel_config = ConfigDict(extra=\"ignore\")\n```\n\n资料来源：[arksim/simulation_engine/tool_types.py:26-28](), [arksim/simulation_engine/tool_types.py:45-47]()\n\n---\n\n<a id='evaluation-system'></a>\n\n## Evaluation System\n\n### 相关页面\n\n相关主题：[System Architecture Overview](#arch-overview), [Simulation Engine](#simulation-engine)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/evaluator/evaluator.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/evaluator.py)\n- [arksim/evaluator/builtin_metrics.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/builtin_metrics.py)\n- [arksim/evaluator/base_metric.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/base_metric.py)\n- [arksim/evaluator/tool_call_metrics.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/tool_call_metrics.py)\n- [arksim/evaluator/trajectory_matching.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/trajectory_matching.py)\n- [arksim/evaluator/thresholds.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/thresholds.py)\n- [arksim/utils/html_report/generate_html_report.py](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/generate_html_report.py)\n</details>\n\n# Evaluation System\n\nThe Evaluation System in ArkSim is responsible for scoring simulated conversations between users and agents. 
It measures goal completion, helpfulness, coherence, and other quality metrics to identify where agents succeed and where they fail.\n\n## Overview\n\nAfter simulation generates conversation transcripts, the evaluator analyzes each conversation and assigns scores across multiple dimensions. The system supports both built-in metrics and custom-defined metrics.\n\n**Purpose:**\n- Quantify agent performance objectively\n- Identify specific failure patterns\n- Generate actionable HTML reports for debugging\n\n**Scope:**\n- Scores conversations on a 0.0-1.0 scale\n- Computes weighted final scores combining multiple metrics\n- Categorizes outcomes as done, partial failure, or failed\n- Supports LLM-based evaluation (using configured providers like OpenAI)\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Simulation Results] --> B[Evaluator]\n    B --> C[Built-in Metrics]\n    B --> D[Custom Metrics]\n    B --> E[Tool Call Metrics]\n    C --> F[Final Scores]\n    D --> F\n    E --> F\n    F --> G[HTML Report Generator]\n    G --> H[evaluation/final_report.html]\n```\n\n### Core Components\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| Evaluator | `evaluator.py` | Main orchestration of evaluation pipeline |\n| Base Metric | `base_metric.py` | Abstract base classes for metrics |\n| Built-in Metrics | `builtin_metrics.py` | Standard metrics (faithfulness, helpfulness, etc.) 
|\n| Tool Call Metrics | `tool_call_metrics.py` | Evaluation of tool usage patterns |\n| Trajectory Matching | `trajectory_matching.py` | Compare expected vs actual agent trajectories |\n| Thresholds | `thresholds.py` | Score classification logic |\n| HTML Report | `generate_html_report.py` | Generate visual evaluation reports |\n\n## Evaluation Workflow\n\n```mermaid\nsequenceDiagram\n    participant S as Simulator\n    participant E as Evaluator\n    participant M as Metrics\n    participant R as Report Generator\n    \n    S->>E: Simulation results (JSON)\n    E->>M: For each conversation\n    M->>M: Run quantitative metrics\n    M->>M: Run qualitative metrics\n    M->>E: Per-metric scores\n    E->>E: Calculate final scores\n    E->>E: Determine status\n    E->>R: Score data\n    R->>R: Generate HTML\n```\n\n### Step-by-Step Process\n\n1. **Load Simulation Data**: Read conversation transcripts from simulation output\n2. **Select Metrics**: Determine which built-in and custom metrics to run\n3. **Score Each Conversation**: Apply metrics to score goal completion, behavior, etc.\n4. **Compute Final Scores**: Calculate weighted averages (goal_completion_weight=0.25, turn_success_ratio_weight=0.75)\n5. **Classify Status**: Assign done/partial_failure/failed based on thresholds\n6. **Generate Report**: Create HTML report with detailed breakdowns\n\n资料来源：[arksim/utils/html_report/report_template.html](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n## Scoring System\n\n### Score Ranges\n\n| Score Type | Range | Description |\n|------------|-------|-------------|\n| Quantitative | 0.0 - 1.0 | Numeric scores for measurable criteria |\n| Qualitative | Label-based | Categorical results (compliant, professional, pass, etc.) 
|\n| Goal Completion | 0.0 - 1.0 | Whether the agent completed the user's goal |\n| Final Score | 0.0 - 1.0 | Weighted combination of metrics |\n\n### Status Classification\n\n| Status | Condition | Description |\n|--------|-----------|-------------|\n| Done | final_score == 1.0 | Perfect performance, goal completed |\n| Partial Failure | 0.6 <= final_score < 1.0 | Acceptable but with some failures |\n| Failed | final_score < 0.6 | Poor performance requiring attention |\n\n资料来源：[arksim/utils/html_report/report_template.html:85-87](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n## Built-in Metrics\n\nThe system provides seven built-in metrics that can be selected via configuration:\n\n| Metric | Purpose |\n|--------|---------|\n| `faithfulness` | Did the agent provide factually accurate information? |\n| `helpfulness` | Was the agent's response useful to the user? |\n| `coherence` | Were responses logically connected and consistent? |\n| `verbosity` | Did the agent maintain appropriate response length? |\n| `relevance` | Did responses address the user's actual query? |\n| `goal_completion` | Did the agent help the user accomplish their goal? |\n| `agent_behavior_failure` | Did the agent exhibit any problematic behaviors? 
|\n\n### Metric Selection\n\nFrom the frontend, users can select which built-in metrics to run:\n\n```html\n<template x-for=\"m in ['faithfulness', 'helpfulness', 'coherence', 'verbosity', 'relevance', 'goal_completion', 'agent_behavior_failure']\" :key=\"m\">\n```\n\nIf no metrics are selected, all built-in metrics run by default.\n\n资料来源：[arksim/ui/frontend/index.html](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html)\n\n## Custom Metrics\n\nDevelopers can define custom evaluation metrics by creating a Python module.\n\n### Creating Custom Metrics\n\n```python\nfrom arksim.evaluator import (\n    QualitativeMetric,\n    QualResult,\n    QuantitativeMetric,\n    QuantResult,\n    ScoreInput,\n)\n\nclass MyMetric(QuantitativeMetric):\n    name = \"my_custom_metric\"\n    schema = MySchema  # Pydantic model\n    \n    async def evaluate(self, input: ScoreInput) -> QuantResult:\n        # Evaluation logic\n        return QuantResult(...)\n```\n\n### Quantitative vs Qualitative Metrics\n\n| Type | Base Class | Output |\n|------|------------|--------|\n| Quantitative | `QuantitativeMetric` | `QuantResult` with float value and reason |\n| Qualitative | `QualitativeMetric` | `QualResult` with categorical label |\n\n### Configuration\n\nAdd custom metrics to `config.yaml`:\n\n```yaml\ncustom_metrics_file_paths:\n  - /path/to/custom_metrics.py\n\nmetrics_to_run:\n  - my_custom_metric  # optional; runs all if omitted\n```\n\n资料来源：[examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\n### Example: Verification Compliance\n\n```python\nclass VerificationComplianceSchema(BaseModel):\n    identity_verification: float  # 0.0-1.0\n    action_gating: float  # 0.0-1.0\n    reason: str\n```\n\nThe evaluation prompt instructs the LLM to score:\n- **Identity Verification**: Did the agent verify customer identity before actions?\n- **Action Gating**: Did the agent gate 
sensitive actions behind verification?\n\n资料来源：[examples/customer-service/custom_metrics.py:25-35](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\n### Example: E-commerce Conversion\n\n```python\nclass ConversionSchema(BaseModel):\n    intent_strength: float\n    conversion_outcome: float\n    evidence: list[str]\n    reason: str\n```\n\nMetrics track:\n- **Intent Strength**: How ready the shopper is to buy\n- **Conversion Outcome**: Whether the agent achieved a purchase decision\n\n资料来源：[examples/e-commerce/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/e-commerce/custom_metrics.py)\n\n## HTML Report\n\nThe evaluation system generates a detailed HTML report (`evaluation/final_report.html`) containing:\n\n### Report Sections\n\n| Section | Content |\n|---------|---------|\n| Summary Statistics | Overall pass/fail rates, average scores |\n| Conversations Table | Per-conversation scores, status badges |\n| Detailed Breakdown | Goal completion, final scores, failure reasons |\n| Score Reasons | LLM-generated explanations for each metric |\n\n### Report Features\n\n- **Interactive Table**: Sort and filter conversations\n- **Score Details**: Expandable sections showing metric-by-metric breakdown\n- **Status Badges**: Visual indicators (done/partial/failed)\n- **Tooltip Explanations**: Hover info for column headers\n\n### Score Display Logic\n\n```javascript\nconst POSITIVE_LABELS = ['compliant', 'professional', 'pass', 'good', 'complete', 'no failure', 'ok'];\nconst NEGATIVE_LABELS = ['flagged', 'unprofessional', 'fail', 'error', 'poor', 'missing', 'violated', 'partial'];\n```\n\nQualitative scores are automatically classified as positive, negative, or neutral based on label matching.\n\n资料来源：[arksim/utils/html_report/report_template.html:78-81](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n## Configuration\n\n### Evaluation Configuration Options\n\n| 
Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `evalProvider` | string | - | LLM provider for evaluation |\n| `evalModel` | string | - | Model to use for scoring |\n| `metricsToRun` | list | all | Which built-in metrics to execute |\n| `customMetricsFilePaths` | list | [] | Paths to custom metric modules |\n| `evalNumWorkers` | int | auto | Parallel evaluation workers |\n\n### Provider Selection\n\nThe evaluator supports multiple LLM providers:\n- OpenAI\n- Azure OpenAI\n- Custom endpoints (via `chat_completions` type)\n- A2A protocol agents\n\n资料来源：[arksim/ui/frontend/index.html](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html)\n\n## Integration with Simulation\n\nThe evaluation system is typically invoked via the CLI:\n\n```bash\n# Combined simulation and evaluation\narksim simulate-evaluate config.yaml\n\n# Separate steps\narksim simulate config_simulate.yaml\narksim evaluate config_evaluate.yaml\n```\n\n### Pipeline Flow\n\n```mermaid\ngraph LR\n    A[config.yaml] --> B[Simulator]\n    B --> C[simulation.json]\n    C --> D[Evaluator]\n    D --> E[evaluation/]\n    E --> F[final_report.html]\n    E --> G[scores.json]\n```\n\nResults are written to `./results/simulation/simulation.json` for simulation and `./results/evaluation/` for evaluation output.\n\n资料来源：[examples/integrations/dify/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md)\n\n## Advanced Features\n\n### Tool Call Capture\n\nArkSim can automatically capture tool calls via tracing:\n\n```yaml\ntrace_receiver:\n  enabled: true\n  wait_timeout: 5\n```\n\nWhen enabled, tool calls are captured automatically without requiring them to be returned explicitly in `AgentResponse`.\n\n### Trajectory Matching\n\nThe `trajectory_matching.py` module compares expected agent trajectories against actual behavior, useful for validating that agents follow prescribed action sequences.\n\n### Thresholds\n\nThe 
`thresholds.py` module defines score boundaries and classification logic for determining pass/fail conditions.\n\n## Summary\n\nThe Evaluation System provides comprehensive, configurable scoring of agent conversations:\n\n- **Flexible Metric System**: Built-in metrics cover common quality dimensions; custom metrics extend evaluation to domain-specific criteria\n- **LLM-based Scoring**: Uses configurable language models to generate nuanced, explainable scores\n- **Visual Reporting**: HTML reports make results easy to understand and act upon\n- **Status Classification**: Automatic categorization into done/partial_failure/failed for quick assessment\n- **Integration-ready**: Works with Python agents, chat completions endpoints, and A2A protocol agents\n\n---\n\n<a id='agent-types'></a>\n\n## Agent Types and Integration\n\n### 相关页面\n\n相关主题：[LLM Provider Integration](#llm-providers), [Tool Call Capture](#tool-call-capture), [Configuration System](#configuration)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/arklexai/arksim/blob/main/README.md) - Main documentation with agent type examples\n- [arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py) - CLI commands for agent initialization\n- [arksim/ui/frontend/index.html](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html) - UI Agent Config section\n- [examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md) - Custom agent implementation example\n- [examples/integrations/dify/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md) - HTTP agent integration example\n- [examples/integrations/rasa/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/rasa/README.md) - External agent server integration\n</details>\n\n# Agent Types and Integration\n\nArkSim supports multiple agent connection types to 
accommodate different architectures and deployment scenarios. This page documents the available agent types, their configuration methods, and integration patterns.\n\n## Overview\n\nArkSim provides a flexible agent integration system that enables testing of various agent implementations through a unified simulation interface. The simulator communicates with agents via standardized protocols and captures responses for evaluation.\n\nThe agent integration system consists of:\n- A `BaseAgent` abstract class that defines the agent interface\n- Multiple client implementations for different connection types\n- A factory pattern for agent instantiation based on configuration\n- Support for tool call capture through both explicit responses and tracing\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Supported Agent Types\n\nArkSim supports three primary agent types, each suited for different deployment scenarios.\n\n| Agent Type | Description | Use Case |\n|------------|-------------|----------|\n| `custom` | Python class extending BaseAgent | Custom agent logic, no external server required |\n| `chat_completions` | HTTP endpoint with OpenAI-compatible API | Existing REST APIs, external agent services |\n| `a2a` | Agent-to-Agent protocol endpoint | Multi-agent systems, A2A-compliant agents |\n\n资料来源：[arksim/cli.py:100-109](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n## Custom Agent (Python Class)\n\nThe `custom` agent type is the default integration method. 
It requires implementing a Python class that extends `BaseAgent` and implements the `get_chat_id` and `execute` methods.\n\n### Implementation Pattern\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse\n\nclass MyAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Replace with your agent logic\n        return \"agent response\"\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### Returning Tool Calls\n\nTo enable tool call evaluation, return an `AgentResponse` object instead of a plain string. This allows the evaluator to assess whether the agent correctly invoked required tools.\n\n```python\nfrom arksim.simulation_engine.tool_types import AgentResponse, ToolCall\n\nasync def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n    tool_calls = [\n        ToolCall(\n            id=\"call-1\",\n            name=\"search_knowledge_base\",\n            arguments={\"query\": user_query}\n        )\n    ]\n    return AgentResponse(\n        content=\"Found relevant information in the knowledge base.\",\n        tool_calls=tool_calls\n    )\n```\n\n### Traced Agent Variant\n\nFor agents using OpenTelemetry-based instrumentation, ArkSim supports automatic tool call capture through the `TracingProcessor` interface. 
This eliminates the need to explicitly return tool calls in `AgentResponse`.\n\n```\nSimulator sets routing context -> agent.execute() runs normally\n-> SDK fires TracingProcessor.on_span_end -> arksim captures -> evaluator scores\n```\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Chat Completions Agent (HTTP API)\n\nThe `chat_completions` agent type connects to any HTTP endpoint implementing an OpenAI-compatible chat completions interface.\n\n### Configuration\n\n```yaml\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### Request Format\n\nArkSim sends requests following the OpenAI chat completions format:\n\n```json\n{\n  \"model\": \"agent-model\",\n  \"messages\": [\n    {\"role\": \"user\", \"content\": \"user query\"}\n  ]\n}\n```\n\n### Response Handling\n\nThe endpoint should return responses in the standard chat completions format:\n\n```json\n{\n  \"choices\": [\n    {\n      \"message\": {\n        \"role\": \"assistant\",\n        \"content\": \"agent response\"\n      }\n    }\n  ]\n}\n```\n\n资料来源：[examples/integrations/dify/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md)\n\n## A2A Protocol Agent\n\nThe `a2a` agent type connects to agents implementing the Agent-to-Agent (A2A) protocol, enabling integration with multi-agent systems.\n\n### Configuration\n\n```yaml\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### A2A with Tool Calls\n\nA2A agents can also surface tool calls for evaluation. 
The agent returns both the response content and tool call information in its A2A-formatted response.\n\n## CLI Initialization\n\nArkSim provides a CLI command to scaffold agent implementations:\n\n```bash\narksim init --agent-type <type>\n```\n\nThe `--agent-type` flag accepts the following values:\n- `custom` (default) - Generates a Python agent file\n- `chat_completions` - Configures HTTP endpoint connection\n- `a2a` - Configures A2A protocol connection\n\nThe `--force` flag overwrites existing files.\n\n资料来源：[arksim/cli.py:94-118](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n## Integration Architecture\n\nThe following diagram illustrates how ArkSim communicates with different agent types:\n\n```mermaid\ngraph TD\n    subgraph ArkSim[\"ArkSim\"]\n        Simulator[\"Simulator Engine\"]\n        Evaluator[\"Evaluator\"]\n    end\n    \n    subgraph AgentTypes[\"Agent Implementations\"]\n        CustomAgent[\"Custom Agent (Python)\"]\n        HTTPAgent[\"Chat Completions API\"]\n        A2AAgent[\"A2A Protocol Agent\"]\n    end\n    \n    Simulator -->|execute| CustomAgent\n    Simulator -->|HTTP POST| HTTPAgent\n    Simulator -->|A2A Protocol| A2AAgent\n    \n    CustomAgent -->|AgentResponse| Evaluator\n    HTTPAgent -->|JSON Response| Evaluator\n    A2AAgent -->|A2A Response| Evaluator\n```\n\n## Agent Configuration in UI\n\nThe ArkSim web UI provides an \"Agent Config\" section for configuring agent connections. 
This section is dynamically loaded from the configuration YAML file.\n\n```html\n<!-- Agent Config -->\n<div class=\"t-surface rounded-xl border t-border p-5 mb-4\">\n    <h2 class=\"font-semibold t-heading mb-1\">Agent Config</h2>\n    <p class=\"text-xs t-caption mb-3\">How arksim connects to your agent.</p>\n</div>\n```\n\n资料来源：[arksim/ui/frontend/index.html](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html)\n\n## Framework Integrations\n\nArkSim includes pre-built integrations for popular agent frameworks through example projects:\n\n| Framework | Integration File | Protocol |\n|-----------|------------------|----------|\n| LangChain/LangGraph | `custom_agent.py` | Python class |\n| CrewAI | `custom_agent.py` | Python class |\n| AutoGen | `custom_agent.py` | Python class |\n| Claude Agent SDK | `custom_agent.py` | Python class |\n| Pydantic AI | `custom_agent.py` | Python class |\n| LlamaIndex | `custom_agent.py` | Python class |\n| Smolagents | `custom_agent.py` | Python class |\n| OpenAI Agents SDK | `custom_agent.py` | Python class |\n| Dify | `custom_agent.py` | HTTP API |\n| Rasa | `custom_agent.py` | HTTP API |\n| OpenClaw | `config.yaml` | Chat Completions |\n\n资料来源：[examples/integrations/*/README.md](https://github.com/arklexai/arksim/tree/main/examples/integrations)\n\n## Running Simulations with Different Agent Types\n\n### Single Command\n\n```bash\narksim simulate-evaluate config.yaml\n```\n\n### Separate Steps\n\n```bash\n# Step 1: Simulate\narksim simulate config_simulate.yaml\n\n# Step 2: Evaluate\narksim evaluate config_evaluate.yaml\n```\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Best Practices\n\n1. **Tool Call Capture**: For accurate evaluation, ensure your agent returns tool calls either explicitly via `AgentResponse` or implicitly through tracing instrumentation.\n\n2. 
**Async Implementation**: All agent implementations should use async/await patterns for proper integration with the simulation engine.\n\n3. **Error Handling**: Agents should handle errors gracefully and return meaningful error messages that can be evaluated by the system.\n\n4. **Configuration Management**: Use environment variables for sensitive configuration values like API keys and endpoints.\n\n---\n\n<a id='llm-providers'></a>\n\n## LLM Provider Integration\n\n### 相关页面\n\n相关主题：[Agent Types and Integration](#agent-types), [Configuration System](#configuration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/llms/chat/providers/openai.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/openai.py)\n- [arksim/llms/chat/providers/anthropic.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/anthropic.py)\n- [arksim/llms/chat/providers/google.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/google.py)\n- [arksim/llms/chat/providers/azure_openai.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/azure_openai.py)\n- [arksim/llms/chat/llm.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/llm.py)\n- [arksim/llms/chat/base/base_llm.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/base/base_llm.py)\n</details>\n\n# LLM Provider Integration\n\n## Overview\n\nThe LLM Provider Integration system in ArkSim provides a unified abstraction layer for connecting to various Large Language Model (LLM) providers. This modular architecture enables the simulator to interact with different AI backends while maintaining a consistent interface for chat completions, streaming responses, and token usage tracking.\n\nThe system follows a provider-based pattern where each supported LLM service (OpenAI, Anthropic, Google, Azure OpenAI) implements a common interface defined in the base class. 
This design allows users to swap between providers without changing the core simulation logic.\n\n## Architecture\n\n### System Components\n\n```mermaid\ngraph TD\n    A[Simulation Engine] --> B[LLM Manager<br/>arksim/llms/chat/llm.py]\n    B --> C[Base LLM<br/>arksim/llms/chat/base/base_llm.py]\n    C --> D[OpenAI Provider<br/>providers/openai.py]\n    C --> E[Anthropic Provider<br/>providers/anthropic.py]\n    C --> F[Google Provider<br/>providers/google.py]\n    C --> G[Azure OpenAI Provider<br/>providers/azure_openai.py]\n    \n    H[Configuration YAML] --> B\n    I[Environment Variables] --> D\n    I --> E\n    I --> F\n    I --> G\n```\n\n### Provider Class Hierarchy\n\n```mermaid\ngraph TD\n    A[BaseLLM<br/>base/base_llm.py] --> B[OpenAIProvider<br/>providers/openai.py]\n    A --> C[AnthropicProvider<br/>providers/anthropic.py]\n    A --> D[GoogleProvider<br/>providers/google.py]\n    A --> E[AzureOpenAIProvider<br/>providers/azure_openai.py]\n    \n    F[Provider Enum] --> A\n    G[ChatMessage Model] --> A\n    H[ChatCompletionResponse] --> A\n```\n\n## Supported Providers\n\nArkSim supports the following LLM providers through dedicated provider implementations:\n\n| Provider | Provider Class | API Style | Streaming Support |\n|----------|----------------|-----------|-------------------|\n| OpenAI | `OpenAIProvider` | OpenAI API | Yes |\n| Anthropic | `AnthropicProvider` | Anthropic API | Yes |\n| Google | `GoogleProvider` | Google AI API | Yes |\n| Azure OpenAI | `AzureOpenAIProvider` | Azure API | Yes |\n\nEach provider class inherits from `BaseLLM` and implements provider-specific API call logic while adhering to the common interface contract.\n\n## Base LLM Interface\n\nThe `BaseLLM` class defines the contract that all provider implementations must follow. 
This ensures consistent behavior across different LLM backends.\n\n### Core Methods\n\n| Method | Purpose | Parameters |\n|--------|---------|------------|\n| `chat()` | Send a chat completion request | `messages`, `model`, `temperature`, `max_tokens`, `**kwargs` |\n| `chat_stream()` | Stream chat completion responses | `messages`, `model`, `temperature`, `max_tokens`, `**kwargs` |\n| `count_tokens()` | Calculate token usage for messages | `messages`, `model` |\n\n### Data Models\n\nThe base module defines essential data structures used across all providers:\n\n| Model | Purpose |\n|-------|---------|\n| `ChatMessage` | Represents a single message with role and content |\n| `ChatCompletionResponse` | Wraps the API response from providers |\n| `Provider` | Enum identifying supported LLM providers |\n| `ModelInfo` | Metadata about available models per provider |\n\n## Provider Implementations\n\n### OpenAI Provider\n\nThe OpenAI provider connects to OpenAI's API endpoints for chat completions. It supports both standard and streaming responses.\n\n**Configuration Requirements:**\n\n| Parameter | Source | Description |\n|-----------|--------|-------------|\n| `OPENAI_API_KEY` | Environment variable | API key for authentication |\n| `model` | Config/Parameter | Model identifier (e.g., `gpt-4`, `gpt-4-turbo`) |\n\n**API Endpoint:**\n```\nPOST https://api.openai.com/v1/chat/completions\n```\n\n资料来源：[arksim/llms/chat/providers/openai.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/openai.py)\n\n### Anthropic Provider\n\nThe Anthropic provider integrates with Anthropic's Claude models through their API. 
It handles the distinct message format and API conventions used by Anthropic.\n\n**Configuration Requirements:**\n\n| Parameter | Source | Description |\n|-----------|--------|-------------|\n| `ANTHROPIC_API_KEY` | Environment variable | API key for authentication |\n| `model` | Config/Parameter | Model identifier (e.g., `claude-3-opus-20240229`) |\n\n**API Endpoint:**\n```\nPOST https://api.anthropic.com/v1/messages\n```\n\n资料来源：[arksim/llms/chat/providers/anthropic.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/anthropic.py)\n\n### Google Provider\n\nThe Google provider connects to Google's Gemini models via the Google AI API.\n\n**Configuration Requirements:**\n\n| Parameter | Source | Description |\n|-----------|--------|-------------|\n| `GOOGLE_API_KEY` | Environment variable | API key for authentication |\n| `model` | Config/Parameter | Model identifier (e.g., `gemini-pro`) |\n\n**API Endpoint:**\n```\nPOST https://generativelanguage.googleapis.com/v1/models/{model}:generateContent\n```\n\n资料来源：[arksim/llms/chat/providers/google.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/google.py)\n\n### Azure OpenAI Provider\n\nThe Azure OpenAI provider enables integration with Azure-hosted OpenAI models, supporting enterprise deployments with Azure-specific authentication and endpoint configuration.\n\n**Configuration Requirements:**\n\n| Parameter | Source | Description |\n|-----------|--------|-------------|\n| `AZURE_OPENAI_API_KEY` | Environment variable | API key for Azure authentication |\n| `AZURE_OPENAI_ENDPOINT` | Environment variable | Azure endpoint URL |\n| `AZURE_OPENAI_DEPLOYMENT` | Config | Deployment name in Azure |\n| `AZURE_OPENAI_API_VERSION` | Config | Azure API version |\n\n资料来源：[arksim/llms/chat/providers/azure_openai.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/azure_openai.py)\n\n## Configuration\n\n### YAML Configuration Structure\n\nLLM providers are configured through the `config.yaml` file used by the simulator:\n\n```yaml\nllm:\n  provider: openai  # or anthropic, google, azure_openai\n  model: gpt-4\n  temperature: 0.7\n  max_tokens: 2048\n  \nagent_config:\n  # Agent-specific LLM settings\n  
provider: anthropic\n  model: claude-3-opus-20240229\n```\n\n### Environment Variables\n\n| Variable | Providers | Purpose |\n|----------|-----------|---------|\n| `OPENAI_API_KEY` | OpenAI, AutoGen, LangChain | OpenAI API authentication |\n| `ANTHROPIC_API_KEY` | Anthropic, Claude Agent SDK | Anthropic API authentication |\n| `GOOGLE_API_KEY` | Google | Google AI API authentication |\n| `AZURE_OPENAI_API_KEY` | Azure OpenAI | Azure API authentication |\n| `AZURE_OPENAI_ENDPOINT` | Azure OpenAI | Azure resource endpoint |\n\n## Integration with Custom Agents\n\nThe LLM provider system integrates with the custom agent connector pattern used in ArkSim simulations. Custom agents can be configured to use any supported LLM provider.\n\n```mermaid\ngraph LR\n    A[Scenario JSON] --> B[Simulation Engine]\n    B --> C[Custom Agent<br/>custom_agent.py]\n    C --> D[LLM Provider]\n    D --> E[External LLM API]\n```\n\n### Integration Examples\n\nArkSim provides integration examples for various agent frameworks:\n\n| Framework | Example Path | LLM Provider Used |\n|-----------|--------------|-------------------|\n| LangChain/LangGraph | `examples/integrations/langchain/` | OpenAI |\n| Claude Agent SDK | `examples/integrations/claude-agent-sdk/` | Anthropic |\n| LlamaIndex | `examples/integrations/llamaindex/` | OpenAI |\n| CrewAI | `examples/integrations/crewai/` | OpenAI |\n| AutoGen | `examples/integrations/autogen/` | OpenAI |\n| Pydantic AI | `examples/integrations/pydantic-ai/` | OpenAI |\n| Smolagents | `examples/integrations/smolagents/` | OpenAI |\n| Dify | `examples/integrations/dify/` | Custom HTTP |\n\nEach integration demonstrates how to connect the framework's agent to ArkSim's simulation engine while delegating LLM calls to the appropriate provider.\n\n## Usage Flow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Config as config.yaml\n    participant LLM as LLM Manager\n    participant Provider as Provider Class\n    participant API as External LLM 
API\n    \n    User->>Config: Load configuration\n    User->>LLM: Initialize with provider type\n    LLM->>Provider: Create provider instance\n    User->>LLM: chat(messages, model)\n    LLM->>Provider: _chat_completion()\n    Provider->>API: HTTP POST request\n    API-->>Provider: Completion response\n    Provider-->>LLM: Normalized response\n    LLM-->>User: ChatCompletionResponse\n```\n\n## Adding New Providers\n\nTo add support for a new LLM provider:\n\n1. Create a new provider class inheriting from `BaseLLM`\n2. Implement the required methods: `chat()`, `chat_stream()`, `count_tokens()`\n3. Add the provider to the `Provider` enum in `base_llm.py`\n4. Update the provider factory logic in `llm.py` to instantiate the new provider\n5. Add integration tests and documentation\n\n资料来源：[arksim/llms/chat/base/base_llm.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/base/base_llm.py)\n资料来源：[arksim/llms/chat/llm.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/llm.py)\n\n## Error Handling\n\nProvider implementations handle common error scenarios:\n\n| Error Type | Handling Strategy |\n|------------|-------------------|\n| Authentication failures | Raise `AuthenticationError` with helpful message |\n| Rate limiting | Implement automatic retry with backoff |\n| Invalid request parameters | Raise `ValidationError` with parameter details |\n| Network timeouts | Retry with exponential backoff |\n| Model not found | Raise `ModelNotFoundError` listing available models |\n\n## Best Practices\n\n1. **API Key Security**: Store API keys in environment variables, never in configuration files committed to version control. ArkSim automatically loads keys from environment variables.\n\n2. **Token Tracking**: Use the `count_tokens()` method to monitor token usage and estimate costs before running large-scale simulations.\n\n3. **Streaming for Large Responses**: Enable streaming (`chat_stream()`) for scenarios expecting long agent responses to improve perceived responsiveness.\n\n4. 
**Provider Selection**: Choose Azure OpenAI for enterprise deployments requiring compliance certifications and dedicated infrastructure.\n\n5. **Model Selection**: Refer to the integration examples for recommended model configurations per provider and use case.\n\n---\n\n<a id='tool-call-capture'></a>\n\n## Tool Call Capture\n\n### 相关页面\n\n相关主题：[Evaluation System](#evaluation-system), [Agent Types and Integration](#agent-types)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/simulation_engine/tool_types.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n- [arksim/simulation_engine/agent/clients/a2a.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/agent/clients/a2a.py)\n- [arksim/tracing/openai.py](https://github.com/arklexai/arksim/blob/main/arksim/tracing/openai.py)\n- [examples/customer-service/custom_agent.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_agent.py)\n- [examples/customer-service/traced_agent.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/traced_agent.py)\n- [examples/customer-service/a2a_server/agent.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/a2a_server/agent.py)\n</details>\n\n# Tool Call Capture\n\nTool Call Capture is a core mechanism in ArkSim that observes and records every tool or function invocation made by an agent during a simulation. This captured data feeds directly into the evaluator, enabling metrics like tool call accuracy, error detection, and trajectory analysis.\n\n## Overview\n\nTool Call Capture serves as the bridge between agent execution and evaluation. Without accurate tool call capture, the evaluator cannot determine whether an agent:\n\n- Invoked the correct tools\n- Passed valid arguments\n- Handled errors appropriately\n- Followed the expected execution trajectory\n\nArkSim supports three distinct capture mechanisms:\n\n1. 
**Explicit capture** via `AgentResponse.tool_calls`\n2. **Automatic capture** via the `ArksimTracingProcessor`\n3. **A2A protocol capture** via task artifacts\n\nAll three methods produce the same underlying `ToolCall` data model, ensuring consistent evaluation regardless of how the agent is implemented.\n\n## Data Models\n\n### ToolCall\n\nThe `ToolCall` class represents a single tool invocation observed during a turn:\n\n```python\nclass ToolCall(BaseModel):\n    model_config = ConfigDict(extra=\"ignore\")\n    \n    id: str\n    name: str\n    arguments: dict[str, Any] = Field(default_factory=dict)\n    result: str | None = None\n    error: str | None = None\n    source: ToolCallSource | None = None\n```\n\n| Field | Type | Description |\n|-------|------|-------------|\n| `id` | `str` | Unique identifier for this tool call |\n| `name` | `str` | Name of the tool/function invoked |\n| `arguments` | `dict[str, Any]` | Arguments passed to the tool |\n| `result` | `str \\| None` | Response returned by the tool |\n| `error` | `str \\| None` | Error message if the call failed |\n| `source` | `ToolCallSource \\| None` | Origin of the capture data |\n\nThe `extra=\"ignore\"` configuration ensures forward compatibility with future versions that add new fields.\n\n### ToolCallSource\n\nThe `ToolCallSource` enum indicates how tool call data was captured:\n\n```python\nclass ToolCallSource(str, Enum):\n    AGENT_RESPONSE = \"agent_response\"\n    TRACING_PROCESSOR = \"tracing_processor\"\n    A2A_PROTOCOL = \"a2a_protocol\"\n```\n\n### AgentResponse\n\nFor explicit capture, agents return structured responses:\n\n```python\nclass AgentResponse(BaseModel):\n    model_config = ConfigDict(extra=\"ignore\")\n    \n    content: str\n    tool_calls: list[ToolCall] = Field(default_factory=list)\n```\n\n## Capture Methods\n\n### Explicit Capture (AgentResponse)\n\nThe default approach requires agents to explicitly return tool calls alongside their text response. 
This is the standard pattern for custom agents implementing `BaseAgent`.\n\n```mermaid\ngraph TD\n    A[Simulator invokes agent] --> B[Agent executes user_query]\n    B --> C[Agent returns AgentResponse]\n    C --> D[Simulator extracts tool_calls]\n    D --> E[Evaluator scores trajectory]\n```\n\n**Implementation pattern:**\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse, ToolCall\n\nclass MyAgent(BaseAgent):\n    async def execute(self, user_query: str, **kwargs) -> str | AgentResponse:\n        # Agent logic here...\n        tool_calls = [\n            ToolCall(\n                id=\"call_1\",\n                name=\"get_order_status\",\n                arguments={\"order_id\": \"ORD-1001\"},\n                source=\"agent_response\"\n            )\n        ]\n        return AgentResponse(\n            content=\"The order status is shipped.\",\n            tool_calls=tool_calls\n        )\n```\n\n资料来源：[examples/customer-service/custom_agent.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_agent.py)\n\n### Tracing Processor (Automatic Capture)\n\nThe `ArksimTracingProcessor` uses the OpenAI Agents SDK's tracing interface to capture tool calls automatically, without requiring explicit return data.\n\n```mermaid\ngraph TD\n    A[Simulator sets routing context] --> B[Agent executes normally]\n    B --> C[SDK fires on_span_end]\n    C --> D[ArksimTracingProcessor captures]\n    D --> E[Evaluator scores trajectory]\n```\n\n**Registration pattern:**\n\n```python\nfrom agents import Agent as SDKAgent\nfrom arksim.tracing.openai import ArksimTracingProcessor\n\n# Register once at module load\n_tracing_processor = ArksimTracingProcessor()\n```\n\nThe traced agent returns a plain `str` rather than `AgentResponse`:\n\n```python\nasync def execute(self, user_query: str, **kwargs) -> str:\n    result = await Runner.run(self._sdk_agent, user_query)\n    
return result.final_output\n```\n\n资料来源：[examples/customer-service/traced_agent.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/traced_agent.py)\n\n### A2A Protocol Capture\n\nFor agents implementing the Agent-to-Agent (A2A) protocol, tool calls are embedded in task artifacts using the `A2AToolCaptureExtension`.\n\n```mermaid\ngraph TD\n    A[Simulator sends task to A2A Agent] --> B[Agent processes request]\n    B --> C[Agent returns Task with artifacts]\n    C --> D[Simulator extracts from metadata]\n    D --> E[Evaluator scores]\n```\n\n**Tool call extraction from artifacts:**\n\n```python\ndef _extract_tool_calls_from_artifact(self, artifact: Artifact) -> list[ToolCall]:\n    metadata = artifact.metadata\n    raw_calls = metadata.get(\"tool_calls\", [])\n    tool_calls = []\n    for raw in raw_calls:\n        arguments = raw.get(\"arguments\", {})\n        if not isinstance(arguments, dict):\n            continue\n        tool_calls.append(\n            ToolCall(\n                id=raw.get(\"id\", \"\"),\n                name=raw.get(\"name\", \"\"),\n                arguments=arguments,\n                result=A2AAgent._coerce_to_string(raw.get(\"result\")),\n                error=A2AAgent._coerce_to_string(raw.get(\"error\")),\n                source=ToolCallSource.A2A_PROTOCOL,\n            )\n        )\n    return tool_calls\n```\n\n资料来源：[arksim/simulation_engine/agent/clients/a2a.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/agent/clients/a2a.py)\n\n**A2A agent card declaration:**\n\n```python\nfrom arksim.simulation_engine.tool_types import A2AToolCaptureExtension\n\n_capabilities = AgentCapabilities(\n    streaming=False,\n    extensions=[A2AToolCaptureExtension],\n)\n```\n\n## Workflow Diagram\n\nThe following diagram shows the complete simulation pipeline with tool call capture:\n\n```mermaid\nflowchart LR\n    subgraph Simulation\n        A[User Query] --> B[Simulator]\n        B --> C{Agent Type}\n        C 
-->|Custom| D[explicit tool_calls]\n        C -->|Traced| E[TracingProcessor]\n        C -->|A2A| F[Artifact metadata]\n        D --> G[Captured Tool Calls]\n        E --> G\n        F --> G\n    end\n    \n    subgraph Evaluation\n        G --> H[Evaluator]\n        H --> I[Tool Call Metrics]\n        H --> J[Error Detection]\n        H --> K[Trajectory Analysis]\n    end\n    \n    G --> L[Results/Report]\n    I --> L\n    J --> L\n    K --> L\n```\n\n## Comparison of Capture Methods\n\n| Aspect | Explicit (AgentResponse) | Traced (TracingProcessor) | A2A Protocol |\n|--------|--------------------------|---------------------------|--------------|\n| Return type | `AgentResponse` | `str` | Via protocol |\n| Implementation complexity | Medium | Low | High |\n| Agent code changes | Required | Minimal | Protocol required |\n| Best for | Custom Python agents | SDK-based agents | A2A-native agents |\n| Tool call source field | `agent_response` | `tracing_processor` | `a2a_protocol` |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Configuration\n\n### Trace Receiver Settings\n\nFor traced agents, enable the trace receiver in the simulation config:\n\n```yaml\nsimulation:\n  max_turns: 10\n  \ntrace_receiver:\n  enabled: true\n  wait_timeout: 5  # seconds to wait for traces\n```\n\n### When Tracing is Disabled\n\nWhen `trace_receiver.enabled` is `false` or omitted, ArkSim falls back to explicit `AgentResponse` capture:\n\n> When `trace_receiver.enabled` is false or omitted, arksim only captures tool calls from `AgentResponse` (the standard path).\n\n## Integration with Evaluation\n\nCaptured tool calls flow into the evaluator's scoring pipeline:\n\n1. **Trajectory matching** - Compare actual tool sequence against expected\n2. **Argument validation** - Verify tool arguments match scenario requirements\n3. **Error detection** - Identify tool call failures and their handling\n4. 
**Coverage analysis** - Determine if all required tools were invoked\n\nThe evaluator uses the `source` field to differentiate between capture methods when analyzing behavioral patterns.\n\n## Forward Compatibility\n\nBoth `ToolCall` and `AgentResponse` declare `extra=\"ignore\"` in their Pydantic configuration:\n\n> Declares `extra=\"ignore\"` explicitly so snapshots from future arksim versions that add new fields can still be loaded by older arksim without raising a `ValidationError`.\n\nThis ensures that simulation results captured with newer versions remain loadable by older versions of the evaluator.\n\n---\n\n<a id='scenario-management'></a>\n\n## Scenario Management\n\n### 相关页面\n\n相关主题：[Simulation Engine](#simulation-engine), [Configuration System](#configuration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/scenario/entities.py](https://github.com/arklexai/arksim/blob/main/arksim/scenario/entities.py)\n- [arksim/templates/scenarios.json](https://github.com/arklexai/arksim/blob/main/arksim/templates/scenarios.json)\n- [examples/e-commerce/scenarios.json](https://github.com/arklexai/arksim/blob/main/examples/e-commerce/scenarios.json)\n- [examples/bank-insurance/scenarios.json](https://github.com/arklexai/arksim/blob/main/examples/bank-insurance/scenarios.json)\n- [examples/customer-service/scenarios.json](https://github.com/arklexai/arksim/blob/main/examples/customer-service/scenarios.json)\n</details>\n\n# Scenario Management\n\n## Overview\n\nScenario Management is the core system in ArkSim for defining, loading, validating, and executing test scenarios that simulate user interactions with an AI agent. 
A scenario represents a structured test case that defines a user's goal, knowledge base, behavioral characteristics, and expected outcomes.\n\nScenarios serve as the foundation for the simulation and evaluation pipeline, enabling reproducible testing of agent behavior across diverse conversation patterns and user profiles.\n\n## Scenario Data Model\n\n### Core Scenario Structure\n\nEach scenario in ArkSim is a JSON object containing the following primary fields:\n\n| Field | Type | Required | Description |\n|-------|------|----------|-------------|\n| `scenario_id` | string | Yes | Unique identifier for the scenario |\n| `name` | string | Yes | Human-readable scenario name |\n| `description` | string | No | Detailed description of the scenario's purpose |\n| `user_profile` | object | Yes | User persona characteristics |\n| `goal` | object | Yes | The user's primary objective |\n| `knowledge` | array | Yes | Contextual knowledge the user has access to |\n| `expected_behavior` | object | No | Expected agent responses or behaviors |\n| `metrics` | array | No | Custom evaluation criteria |\n\n### User Profile Schema\n\n```json\n{\n  \"name\": \"string\",\n  \"age\": \"number\",\n  \"personality\": \"string\",\n  \"background\": \"string\",\n  \"communication_style\": \"string\"\n}\n```\n\n资料来源：[arksim/scenario/entities.py]()\n\n### Goal Structure\n\n```json\n{\n  \"primary\": \"string (main user objective)\",\n  \"secondary\": [\"array of secondary objectives\"],\n  \"constraints\": [\"array of constraints or boundaries\"],\n  \"success_criteria\": \"string\"\n}\n```\n\n## Scenario Management Workflow\n\n```mermaid\ngraph TD\n    A[Create/Load Scenarios] --> B[Validate Scenario Schema]\n    B --> C{Valid?}\n    C -->|Yes| D[Save to scenarios.json]\n    C -->|No| E[Show Validation Errors]\n    E --> A\n    D --> F[Configure Simulation Parameters]\n    F --> G[Run Simulation]\n    G --> H[Generate Results]\n    H --> I[Evaluation & Reporting]\n```\n\n## Scenario Loading 
and Validation\n\nThe UI provides interactive scenario management through the \"Build\" page:\n\n1. **Load Existing** - Users can load pre-existing scenario files via file path input\n2. **Auto-generate (PRO)** - Automatic scenario generation from agent knowledge base\n3. **Manual Creation** - Build scenarios through the UI interface\n\n```javascript\n// Scenario file validation in UI\n@input=\"validateScenarioFile()\"\n@blur=\"validateScenarioFile()\"\n@keydown.enter=\"loadScenarioFile()\"\n```\n\n资料来源：[arksim/ui/frontend/index.html]()\n\n### Validation Rules\n\n- Scenario files must be valid JSON\n- Required fields must be present\n- `scenario_id` must be unique within the file\n- `user_profile` must contain at minimum a `name` field\n\n## Scenario File Format\n\nArkSim uses JSON format for scenario definitions. See the example structure:\n\n```json\n{\n  \"scenarios\": [\n    {\n      \"scenario_id\": \"ecommerce-return-item-001\",\n      \"name\": \"Return Defective Product\",\n      \"description\": \"Customer wants to return a damaged item received last week\",\n      \"user_profile\": {\n        \"name\": \"John Smith\",\n        \"age\": 35,\n        \"personality\": \"patient but firm\",\n        \"background\": \"Regular online shopper\",\n        \"communication_style\": \"polite and direct\"\n      },\n      \"goal\": {\n        \"primary\": \"Get a full refund for a damaged product\",\n        \"secondary\": [\"Understand return process\", \"Know timeline for refund\"],\n        \"constraints\": [\"Only willing to wait up to 14 days for refund\"],\n        \"success_criteria\": \"Full refund issued or replacement offered\"\n      },\n      \"knowledge\": [\n        \"Ordered product SKU-12345 on March 1, 2024\",\n        \"Item arrived damaged with visible scratches\",\n        \"Has original packaging and receipt\",\n        \"Order number: ORD-987654\"\n      ],\n      \"expected_behavior\": {\n        \"should_mention_order_number\": true,\n        
\"should_request_photo_evidence\": false\n      }\n    }\n  ]\n}\n```\n\n资料来源：[examples/e-commerce/scenarios.json]()\n资料来源：[examples/bank-insurance/scenarios.json]()\n资料来源：[examples/customer-service/scenarios.json]()\n\n## Simulation Configuration\n\nScenarios are executed through the simulation engine with configurable parameters:\n\n| Parameter | Default | Description |\n|-----------|---------|-------------|\n| `num_conversations_per_scenario` | 5 | Number of conversations to simulate per scenario |\n| `max_turns` | 5 | Maximum turns per conversation |\n| `num_workers` | 50 | Parallel workers (or 'auto') |\n| `model` | gpt-4o | LLM model for simulation |\n| `provider` | openai | LLM provider |\n\n```python\nmodel: str = Field(default=DEFAULT_MODEL, description=\"LLM model for simulation\")\nnum_conversations_per_scenario: int = Field(\n    default=5, \n    description=\"Number of conversations per scenario to simulate\"\n)\nmax_turns: int = Field(default=5, description=\"Maximum turns per conversation\")\n```\n\n资料来源：[arksim/simulation_engine/entities.py:18-25]()\n\n### Configuration via config.yaml\n\n```yaml\nsimulation:\n  scenarios_file: ./scenarios.json\n  num_conversations_per_scenario: 5\n  max_turns: 5\n  num_workers: auto\n  model: gpt-4o\n```\n\n## Built-in Scenario Templates\n\nArkSim provides template scenarios for common use cases:\n\n```json\n{\n  \"scenarios\": [\n    {\n      \"scenario_id\": \"template-basic-query\",\n      \"name\": \"Basic Information Query\",\n      \"description\": \"User asks a simple informational question\",\n      \"user_profile\": {...},\n      \"goal\": {...},\n      \"knowledge\": [...]\n    }\n  ]\n}\n```\n\n资料来源：[arksim/templates/scenarios.json]()\n\n## Integration with CI/CD\n\nScenarios can be integrated into automated testing workflows:\n\n```yaml\n# GitHub Actions workflow\nsteps:\n  - name: Run Scenario Tests\n    run: arksim simulate-evaluate config.yaml\n```\n\n资料来源：[examples/ci/README.md]()\n\n### Required CI 
Setup\n\n1. Update `TODO` sections in `arksim.yml` (startup command and health-check URL)\n2. Create `tests/arksim/config.yaml` pointing to your server endpoint\n3. Create `tests/arksim/scenarios.json` with your test cases\n4. Add custom metrics to `tests/arksim/custom_metrics/` if needed\n5. Configure `OPENAI_API_KEY` in GitHub secrets\n\n## Best Practices\n\n### Scenario Design\n\n- **Distinct Goals**: Each scenario should test a single, well-defined user goal\n- **Realistic Profiles**: User profiles should reflect actual customer demographics\n- **Sufficient Knowledge**: Include enough context for the simulated user to maintain coherent conversation\n\n### File Organization\n\n```\nproject/\n├── config.yaml\n├── scenarios.json\n├── custom_metrics/\n│   └── my_metric.py\n└── results/\n    └── simulation/\n```\n\n### Version Control\n\n- Commit `scenarios.json` with your agent code\n- Use descriptive `scenario_id` values with prefixes (e.g., `ecommerce-`, `support-`)\n- Document success criteria clearly in the `goal.success_criteria` field\n\n## Command Line Usage\n\n### Initialize with default scenarios\n\n```bash\narksim init\n```\n\n### Run simulation with scenarios\n\n```bash\narksim simulate-evaluate config.yaml\n```\n\nResults are written to `./results/simulation/simulation.json`.\n\n资料来源：[README.md]()\n资料来源：[examples/integrations/dify/README.md]()\n\n## See Also\n\n- [Simulation Engine Documentation](./simulation-engine.md)\n- [Custom Agent Integration](./custom-agents.md)\n- [Evaluation Metrics](./evaluation-metrics.md)\n- [HTML Report Template](./html-report.md)\n\n---\n\n<a id='configuration'></a>\n\n## Configuration System\n\n### 相关页面\n\n相关主题：[Agent Types and Integration](#agent-types), [LLM Provider Integration](#llm-providers), [Evaluation System](#evaluation-system)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/config/types.py](https://github.com/arklexai/arksim/blob/main/arksim/config/types.py)\n- 
[arksim/config/core/agent.py](https://github.com/arklexai/arksim/blob/main/arksim/config/core/agent.py)\n- [arksim/config/utils.py](https://github.com/arklexai/arksim/blob/main/arksim/config/utils.py)\n- [arksim/config.yaml](https://github.com/arklexai/arksim/blob/main/arksim/config.yaml)\n- [arksim/config_simulate.yaml](https://github.com/arklexai/arksim/blob/main/arksim/config_simulate.yaml)\n- [arksim/config_evaluate.yaml](https://github.com/arklexai/arksim/blob/main/arksim/config_evaluate.yaml)\n- [arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n- [examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n</details>\n\n# Configuration System\n\nArkSim provides a flexible, multi-layered configuration system that supports both YAML-based declarative configuration and programmatic Python class configuration. The system enables users to define simulation scenarios, evaluation metrics, agent connections, and runtime behavior through structured configuration files or direct Python implementations.\n\n## Architecture Overview\n\nThe configuration system is organized into several key modules that handle different aspects of simulation and evaluation:\n\n```mermaid\ngraph TD\n    A[User Configuration] --> B[YAML Files]\n    A --> C[Python Classes]\n    B --> D[config.yaml]\n    B --> E[config_simulate.yaml]\n    B --> F[config_evaluate.yaml]\n    C --> G[BaseAgent Subclass]\n    C --> H[Custom Metrics]\n    D --> I[Config Loader]\n    E --> I\n    F --> I\n    G --> J[Simulation Engine]\n    H --> K[Evaluator]\n    I --> J\n    J --> K\n    J --> L[Results/Reports]\n    K --> L\n```\n\nThe system separates configuration into three distinct phases: simulation, evaluation, and combined execution. 
This separation allows users to run simulations independently from evaluation, which is useful for debugging and iterative development workflows.\n\n## Configuration Files\n\nArkSim uses YAML configuration files to define simulation parameters, evaluation settings, and agent connections. The repository includes three primary configuration templates at the root level.\n\n### Main Configuration (config.yaml)\n\nThe main configuration file combines both simulation and evaluation settings into a single file. This is the recommended approach for simple use cases where both phases run together using the `simulate-evaluate` command. The file structure typically includes agent configuration, scenario definitions, metric selections, and evaluation parameters.\n\n### Separate Phase Configuration\n\nFor more complex workflows, ArkSim supports splitting configuration into separate files:\n\n| File | Purpose | CLI Command |\n|------|---------|-------------|\n| `config_simulate.yaml` | Simulation-only parameters | `arksim simulate config_simulate.yaml` |\n| `config_evaluate.yaml` | Evaluation-only parameters | `arksim evaluate config_evaluate.yaml` |\n| `config.yaml` | Combined configuration | `arksim simulate-evaluate config.yaml` |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\nThis separation enables scenarios where simulation results are cached and evaluated multiple times with different metric configurations, or where the simulation phase runs in a different environment than evaluation.\n\n## Agent Configuration\n\nThe configuration system supports multiple agent connection types, defined through the `agent_type` field in the agent configuration section.\n\n### Supported Agent Types\n\n| Agent Type | Description | Configuration Style |\n|------------|-------------|---------------------|\n| `custom` | Custom Python agent class inheriting from BaseAgent | Python class file |\n| 
`chat_completions` | HTTP endpoint implementing Chat Completions API | YAML endpoint config |\n| `a2a` | Agent-to-Agent protocol endpoint | YAML endpoint config |\n\n资料来源：[arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n### Custom Agent (Python Class)\n\nThe default agent type uses a Python class that extends `BaseAgent`. This approach provides maximum flexibility for integrating any agent framework. The agent class must implement two required methods:\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse\n\nclass MyAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Replace with your agent logic\n        return \"agent response\"\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\nThe `execute` method can return either a plain string response or an `AgentResponse` object that includes both text content and tool calls for evaluation.\n\n### Chat Completions Agent\n\nFor agents exposing an HTTP endpoint compatible with the OpenAI Chat Completions format, configuration is declarative:\n\n```yaml\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### A2A Protocol Agent\n\nAgents implementing the Agent-to-Agent protocol are configured similarly:\n\n```yaml\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\nA2A agents can also surface tool calls for evaluation through the protocol, enabling comprehensive testing of tool-using agents.\n\n## Scenario Configuration\n\nScenarios define the test cases that the simulator executes against the agent. 
The `scenarios.json` file contains an array of scenario objects, each representing a simulated conversation or interaction.\n\n### Scenario Structure\n\nEach scenario typically includes:\n\n- **Scenario ID**: Unique identifier for the scenario\n- **User query**: The initial prompt or question presented to the agent\n- **Expected behavior**: Criteria for successful completion\n- **Knowledge/context**: Information the agent should know or have access to during the scenario\n\n### Scenario Files Location\n\nScenarios are referenced from the main configuration file and can be shared across different configuration files:\n\n| Example Directory | Scenario Purpose |\n|------------------|------------------|\n| `examples/bank-insurance/` | Financial services agent testing |\n| `examples/customer-service/` | Customer support scenarios |\n| `examples/integrations/*/` | Framework-specific testing |\n| `examples/ci/` | CI/CD integration testing |\n\n资料来源：[examples/ci/README.md](https://github.com/arklexai/arksim/blob/main/examples/ci/README.md)\n\n## Evaluation Metrics Configuration\n\nThe evaluator component scores agent responses against defined metrics. Configuration specifies which metrics to run and how they are calculated.\n\n### Built-in Metrics\n\nArkSim includes standard evaluation metrics covering common agent quality dimensions. These metrics are automatically available without additional configuration.\n\n### Custom Metrics\n\nUsers can define custom evaluation metrics by creating a Python module that implements the metric interfaces. 
The custom metrics file must define metrics following the provided schema:\n\n```python\nfrom pydantic import BaseModel\nfrom arksim.evaluator import (\n    QualitativeMetric,\n    QualResult,\n    QuantitativeMetric,\n    QuantResult,\n    ScoreInput,\n    format_chat_history,\n)\n```\n\n资料来源：[examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\nCustom metrics are referenced in the configuration file using the `custom_metrics_file_paths` parameter, and optionally listed in `metrics_to_run` to include them in the evaluation:\n\n```yaml\ncustom_metrics_file_paths:\n  - tests/arksim/custom_metrics/my_metric.py\nmetrics_to_run:\n  - custom_metric_name\n```\n\n资料来源：[examples/ci/README.md](https://github.com/arklexai/arksim/blob/main/examples/ci/README.md)\n\n### Custom Metric Schema\n\nCustom quantitative metrics should return a Pydantic model defining the score components:\n\n```python\nclass VerificationComplianceSchema(BaseModel):\n    identity_verification: float  # 0.0-1.0\n    action_gating: float  # 0.0-1.0\n    reason: str\n```\n\n资料来源：[examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\nThe system prompt for qualitative metrics defines the evaluation criteria that the LLM judge applies when scoring agent responses.\n\n## CLI Configuration Interface\n\nThe ArkSim CLI provides commands for initializing configurations, running simulations, and managing examples.\n\n### Initialization Command\n\nThe `init` command scaffolds starter files for agent testing:\n\n```bash\narksim init --agent-type custom\n```\n\n| Flag | Options | Default | Description |\n|------|---------|---------|-------------|\n| `--agent-type` | `custom`, `chat_completions`, `a2a` | `custom` | Agent connection type |\n| `--force` | boolean | `false` | Overwrite existing files 
|\n\n资料来源：[arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\nThe `--agent-type` flag determines which template is generated:\n- `custom` generates a Python agent file (no server needed)\n- `chat_completions` generates YAML configuration for HTTP endpoints\n- `a2a` generates YAML configuration for Agent-to-Agent protocol\n\n### Simulation Commands\n\n| Command | Description |\n|---------|-------------|\n| `arksim simulate-evaluate config.yaml` | Run simulation and evaluation in sequence |\n| `arksim simulate config_simulate.yaml` | Run simulation only |\n| `arksim evaluate config_evaluate.yaml` | Evaluate previously saved simulation results |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n### Examples Command\n\nThe CLI also provides access to example projects:\n\n```bash\narksim examples                    # Download all examples\narksim examples bank-insurance     # Download specific example\narksim examples --list            # List available examples\n```\n\n## Integration Configuration Patterns\n\nEach integration example follows a consistent pattern with three standard files.\n\n### File Structure\n\n| File | Purpose |\n|------|---------|\n| `custom_agent.py` | Agent implementation connecting to the target framework |\n| `config.yaml` | Simulation and evaluation settings |\n| `scenarios.json` | Test scenarios for the example domain |\n\n### Supported Framework Integrations\n\nArkSim provides example configurations for the following agent frameworks:\n\n| Framework | Installation | Example Directory |\n|-----------|-------------|-------------------|\n| LangGraph | `pip install langgraph langchain-openai` | `examples/integrations/langgraph/` |\n| LangChain | `pip install langgraph langchain-openai` | `examples/integrations/langchain/` |\n| Claude Agent SDK | `pip install claude-agent-sdk` | `examples/integrations/claude-agent-sdk/` |\n| AutoGen | `pip 
install autogen-agentchat autogen-ext[openai]` | `examples/integrations/autogen/` |\n| CrewAI | `pip install crewai` | `examples/integrations/crewai/` |\n| Pydantic AI | `pip install pydantic-ai` | `examples/integrations/pydantic-ai/` |\n| Smolagents | `pip install smolagents` | `examples/integrations/smolagents/` |\n| Dify | HTTP integration | `examples/integrations/dify/` |\n| OpenClaw | Gateway token auth | `examples/openclaw/` |\n\n资料来源：[examples/integrations/*/README.md](https://github.com/arklexai/arksim/tree/main/examples/integrations)\n\n## Trace Receiver Configuration\n\nFor frameworks that support tracing, ArkSim includes a trace receiver component that captures tool calls automatically without requiring explicit `AgentResponse` returns.\n\n### Configuration Options\n\n```yaml\ntrace_receiver:\n  enabled: true\n  wait_timeout: 5\n```\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `enabled` | boolean | `false` | Enable trace-based tool call capture |\n| `wait_timeout` | integer | `5` | Seconds to wait for trace data |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\nWhen `enabled` is `false` or omitted, ArkSim only captures tool calls from `AgentResponse` objects returned by the agent's `execute` method.\n\n## Workflow Summary\n\nThe following diagram illustrates the configuration-driven workflow from specification to results:\n\n```mermaid\ngraph LR\n    A[config.yaml] --> B[Scenario Loading]\n    C[scenarios.json] --> B\n    B --> D[Simulation Engine]\n    D --> E{Agent Type?}\n    E -->|custom| F[Python Agent]\n    E -->|chat_completions| G[HTTP Endpoint]\n    E -->|a2a| H[A2A Agent]\n    F --> I[Execute Scenarios]\n    G --> I\n    H --> I\n    I --> J[Simulation Results]\n    J --> K[Evaluator]\n    J --> L[Conversation Viewer]\n    K --> M[Evaluation Report]\n    M --> N[scores.json]\n    M --> 
O[failures.json]\n```\n\n## Best Practices\n\n**Environment Variables**: API keys should be set as environment variables rather than hardcoded in configuration files:\n\n```bash\nexport OPENAI_API_KEY=\"<your-key>\"\nexport ANTHROPIC_API_KEY=\"<your-key>\"\n```\n\n**Separation of Concerns**: Use separate `config_simulate.yaml` and `config_evaluate.yaml` files when iterating on scenarios or metrics independently.\n\n**Custom Metrics Organization**: Place custom metrics in dedicated directories (e.g., `tests/arksim/custom_metrics/`) and reference them from the main configuration.\n\n**Scenario Versioning**: Keep scenarios in version-controlled JSON files that can be reviewed and updated as agent requirements evolve.\n\n资料来源：[CONTRIBUTING.md](https://github.com/arklexai/arksim/blob/main/CONTRIBUTING.md)\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：arklexai/arksim\n\n摘要：发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：v0.1.0。\n\n## 1. 安装坑 · 来源证据：v0.1.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.1.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d55b6435222f4142bf979809f0cf0794 | https://github.com/arklexai/arksim/releases/tag/v0.1.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 2. 配置坑 · 来源证据：v0.0.6\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.0.6\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_57b2e0dc0cf94da8b4fc97ea744ae7e9 | https://github.com/arklexai/arksim/releases/tag/v0.0.6 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 3. 
配置坑 · 来源证据：v0.3.2\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.2\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_3a2981d341cb4512a410e7d75e03baf0 | https://github.com/arklexai/arksim/releases/tag/v0.3.2 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 4. 配置坑 · 来源证据：v0.3.4\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.4\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f949170a7f5d440cb0559c40fb441178 | https://github.com/arklexai/arksim/releases/tag/v0.3.4 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 5. 配置坑 · 来源证据：v0.3.5\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.5\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_314def1c94614699b6c9ad728c36e9ab | https://github.com/arklexai/arksim/releases/tag/v0.3.5 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 6. 能力坑 · 来源证据：v0.3.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.3.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_c3e7552e9af84b95b2aec63c73dc7c26 | https://github.com/arklexai/arksim/releases/tag/v0.3.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 7. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | README/documentation is current enough for a first validation pass.\n\n## 8. 
维护坑 · 来源证据：v0.3.3\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个维护/版本相关的待验证问题：v0.3.3\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_03b0d8ce6f8a4bc2bc8e33a7636aa07c | https://github.com/arklexai/arksim/releases/tag/v0.3.3 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 9. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | last_activity_observed missing\n\n## 10. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | no_demo; severity=medium\n\n## 11. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | no_demo; severity=medium\n\n## 12. 安全/权限坑 · 来源证据：v0.2.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.2.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_3c70cc242a1242b98d50cfe5078d6752 | https://github.com/arklexai/arksim/releases/tag/v0.2.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 13. 
安全/权限坑 · 来源证据：v0.3.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.3.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_69bd9f91a5064aeeb87f325f2f56f115 | https://github.com/arklexai/arksim/releases/tag/v0.3.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | issue_or_pr_quality=unknown\n\n## 15. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | release_recency=unknown\n\n<!-- canonical_name: arklexai/arksim; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "arksim",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "art_e3064c2689144cfb89a534e0544c6bfc",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/arklexai/arksim#readme"
        }
      ],
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "arksim 说明书",
      "toc": [
        "https://github.com/arklexai/arksim 项目说明书",
        "目录",
        "Introduction to ArkSim",
        "Overview",
        "Architecture Overview",
        "Agent Integration Methods",
        "Configuration System",
        "Simulation settings",
        "Doramagic 踩坑日志"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "321d8003879fa2d27b3377845efdc93848a8b6cc",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "pyproject.toml",
      "README.md",
      "docs/README.md",
      "docs/CONTRIBUTING.md",
      "docs/docs.json",
      "docs/snippets/snippet-intro.mdx",
      "docs/v0.2.0/schema-reference.mdx",
      "docs/v0.2.0/build-scenario.mdx",
      "docs/v0.2.0/evaluate-conversation.mdx",
      "docs/v0.2.0/insurance-customer-service-agent-evaluation.mdx",
      "docs/v0.2.0/contact-email.mdx",
      "docs/v0.2.0/e-commerce-customer-service-agent-evaluation.mdx",
      "docs/v0.2.0/installation.mdx",
      "docs/v0.2.0/quickstart.mdx",
      "docs/v0.2.0/personal-ai-assistant-openclaw-evaluation.mdx",
      "docs/v0.2.0/overview.mdx",
      "docs/v0.2.0/simulate-conversation.mdx",
      "docs/main/customer-service-tool-calling-agent-evaluation.mdx",
      "docs/main/integrations.mdx",
      "docs/main/tool-call-capture.mdx",
      "docs/main/ci-integration.mdx",
      "docs/main/schema-reference.mdx",
      "docs/main/build-scenario.mdx",
      "docs/main/evaluate-conversation.mdx",
      "docs/main/insurance-customer-service-agent-evaluation.mdx",
      "docs/main/contact-email.mdx",
      "docs/main/e-commerce-customer-service-agent-evaluation.mdx",
      "docs/main/installation.mdx",
      "docs/main/quickstart.mdx",
      "docs/main/personal-ai-assistant-openclaw-evaluation.mdx",
      "docs/main/overview.mdx",
      "docs/main/simulate-conversation.mdx",
      "docs/v0.3.x/customer-service-tool-calling-agent-evaluation.mdx",
      "docs/v0.3.x/integrations.mdx",
      "docs/v0.3.x/tool-call-capture.mdx",
      "docs/v0.3.x/ci-integration.mdx",
      "docs/v0.3.x/schema-reference.mdx",
      "docs/v0.3.x/build-scenario.mdx",
      "docs/v0.3.x/evaluate-conversation.mdx",
      "docs/v0.3.x/insurance-customer-service-agent-evaluation.mdx"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# arksim - Doramagic AI Context Pack\n\n> 定位：安装前体验与判断资产。它帮助宿主 AI 有一个好的开始，但不代表已经安装、执行或验证目标项目。\n\n## 充分原则\n\n- **充分原则，不是压缩原则**：AI Context Pack 应该充分到让宿主 AI 在开工前理解项目价值、能力边界、使用入口、风险和证据来源；它可以分层组织，但不以最短摘要为目标。\n- **压缩策略**：只压缩噪声和重复内容，不压缩会影响判断和开工质量的上下文。\n\n## 给宿主 AI 的使用方式\n\n你正在读取 Doramagic 为 arksim 编译的 AI Context Pack。请把它当作开工前上下文：帮助用户理解适合谁、能做什么、如何开始、哪些必须安装后验证、风险在哪里。不要声称你已经安装、运行或执行了目标项目。\n\n## Claim 消费规则\n\n- **事实来源**：Repo Evidence + Claim/Evidence Graph；Human Wiki 只提供显著性、术语和叙事结构。\n- **事实最低状态**：`supported`\n- `supported`：可以作为项目事实使用，但回答中必须引用 claim_id 和证据路径。\n- `weak`：只能作为低置信度线索，必须要求用户继续核实。\n- `inferred`：只能用于风险提示或待确认问题，不能包装成项目事实。\n- `unverified`：不得作为事实使用，应明确说证据不足。\n- `contradicted`：必须展示冲突来源，不得替用户强行选择一个版本。\n\n## 它最适合谁\n\n- **正在使用 Claude/Codex/Cursor/Gemini 等宿主 AI 的开发者**：README 或插件配置提到多个宿主 AI。 证据：`README.md` Claim：`clm_0003` supported 0.86\n- **希望把专业流程带进宿主 AI 的用户**：仓库包含 Skill 文档。 证据：`.claude/skills/draft-pr/SKILL.md`, `.claude/skills/pre-review/SKILL.md`, `.claude/skills/review-pr/SKILL.md` Claim：`clm_0004` supported 0.86\n\n## 它能做什么\n\n- **AI Skill / Agent 指令资产库**（可做安装前预览）：项目包含可被宿主 AI 读取的 Skill 或 Agent 指令文件，可用于把专业流程带入 Claude、Codex、Cursor 等宿主。 证据：`.claude/skills/draft-pr/SKILL.md`, `.claude/skills/pre-review/SKILL.md`, `.claude/skills/review-pr/SKILL.md` Claim：`clm_0001` supported 0.86\n- **命令行启动或安装流程**（需要安装后验证）：项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 证据：`README.md` Claim：`clm_0002` supported 0.86\n\n## 怎么开始\n\n- `pip install arksim` 证据：`README.md` Claim：`clm_0005` supported 0.86\n- `git clone https://github.com/arklexai/arksim.git` 证据：`README.md` Claim：`clm_0006` supported 0.86\n- `pip install -e \".[dev]\"` 证据：`README.md` Claim：`clm_0007` supported 0.86\n\n## 继续前判断卡\n\n- **当前建议**：需要管理员/安全审批\n- **为什么**：继续前可能涉及密钥、账号、外部服务或敏感上下文，建议先经过管理员或安全审批。\n\n### 30 秒判断\n\n- **现在怎么做**：需要管理员/安全审批\n- **最小安全下一步**：先跑 Prompt Preview；若涉及凭证或企业环境，先审批再试装\n- **先别相信**：真实输出质量不能在安装前相信。\n- **继续会触碰**：命令执行、宿主 AI 配置、本地环境或项目文件\n\n### 现在可以相信\n\n- **适合人群线索：正在使用 
Claude/Codex/Cursor/Gemini 等宿主 AI 的开发者**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0003` supported 0.86\n- **适合人群线索：希望把专业流程带进宿主 AI 的用户**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`.claude/skills/draft-pr/SKILL.md`, `.claude/skills/pre-review/SKILL.md`, `.claude/skills/review-pr/SKILL.md` Claim：`clm_0004` supported 0.86\n- **能力存在：AI Skill / Agent 指令资产库**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`.claude/skills/draft-pr/SKILL.md`, `.claude/skills/pre-review/SKILL.md`, `.claude/skills/review-pr/SKILL.md` Claim：`clm_0001` supported 0.86\n- **能力存在：命令行启动或安装流程**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- **存在 Quick Start / 安装命令线索**（supported）：可以相信项目文档出现过启动或安装入口；不要因此直接在主力环境运行。 证据：`README.md` Claim：`clm_0005` supported 0.86\n\n### 现在还不能相信\n\n- **真实输出质量不能在安装前相信。**（unverified）：Prompt Preview 只能展示引导方式，不能证明真实项目中的结果质量。\n- **宿主 AI 版本兼容性不能在安装前相信。**（unverified）：Claude、Cursor、Codex、Gemini 等宿主加载规则和版本差异必须在真实环境验证。\n- **不会污染现有宿主 AI 行为，不能直接相信。**（inferred）：Skill、plugin、AGENTS/CLAUDE/GEMINI 指令可能改变宿主 AI 的默认行为。 证据：`.claude/skills/draft-pr/SKILL.md`, `.claude/skills/pre-review/SKILL.md`, `.claude/skills/review-pr/SKILL.md`, `CLAUDE.md`\n- **可安全回滚不能默认相信。**（unverified）：除非项目明确提供卸载和恢复说明，否则必须先在隔离环境验证。\n- **真实安装后是否与用户当前宿主 AI 版本兼容？**（unverified）：兼容性只能通过实际宿主环境验证。\n- **项目输出质量是否满足用户具体任务？**（unverified）：安装前预览只能展示流程和边界，不能替代真实评测。\n- **安装命令是否需要网络、权限或全局写入？**（unverified）：这影响企业环境和个人环境的安装风险。 证据：`README.md`\n\n### 继续会触碰什么\n\n- **命令执行**：包管理器、网络下载、本地插件目录、项目配置或用户主目录。 原因：运行第一条命令就可能产生环境改动；必须先判断是否值得跑。 证据：`README.md`\n- **宿主 AI 配置**：Claude/Codex/Cursor/Gemini/OpenCode 等宿主的 plugin、Skill 或规则加载配置。 原因：宿主配置会改变 AI 后续工作方式，可能和用户已有规则冲突。 证据：`.claude/skills/draft-pr/SKILL.md`, `.claude/skills/pre-review/SKILL.md`, `.claude/skills/review-pr/SKILL.md`, `CLAUDE.md`\n- **本地环境或项目文件**：安装结果、插件缓存、项目配置或本地依赖目录。 原因：安装前无法证明写入范围和回滚方式，需要隔离验证。 证据：`README.md`\n- **环境变量 / API Key**：项目入口文档明确出现 API key、token、secret 或账号凭证配置。 
原因：如果真实安装需要凭证，应先使用测试凭证并经过权限/合规判断。 证据：`README.md`, `arksim/ui/frontend/app.js`, `examples/bank-insurance/README.md`, `examples/ci/README.md` 等\n- **宿主 AI 上下文**：AI Context Pack、Prompt Preview、Skill 路由、风险规则和项目事实。 原因：导入上下文会影响宿主 AI 后续判断，必须避免把未验证项包装成事实。\n\n### 最小安全下一步\n\n- **先跑 Prompt Preview**：用安装前交互式试用判断工作方式是否匹配，不需要授权或改环境。（适用：任何项目都适用，尤其是输出质量未知时。）\n- **只在隔离目录或测试账号试装**：避免安装命令污染主力宿主 AI、真实项目或用户主目录。（适用：存在命令执行、插件配置或本地写入线索时。）\n- **先备份宿主 AI 配置**：Skill、plugin、规则文件可能改变 Claude/Cursor/Codex 的默认行为。（适用：存在插件 manifest、Skill 或宿主规则入口时。）\n- **不要使用真实生产凭证**：环境变量/API key 一旦进入宿主或工具链，可能产生账号和合规风险。（适用：出现 API、TOKEN、KEY、SECRET 等环境线索时。）\n- **安装后只验证一个最小任务**：先验证加载、兼容、输出质量和回滚，再决定是否深用。（适用：准备从试用进入真实工作流时。）\n\n### 退出方式\n\n- **保留安装前状态**：记录原始宿主配置和项目状态，后续才能判断是否可恢复。\n- **准备移除宿主 plugin / Skill / 规则入口**：如果试装后行为异常，可以把宿主 AI 恢复到试装前状态。\n- **记录安装命令和写入路径**：没有明确卸载说明时，至少要知道哪些目录或配置需要手动清理。\n- **准备撤销测试 API key 或 token**：测试凭证泄露或误用时，可以快速止损。\n- **如果没有回滚路径，不进入主力环境**：不可回滚是继续前阻断项，不应靠信任或运气继续。\n\n## 哪些只能预览\n\n- 解释项目适合谁和能做什么\n- 基于项目文档演示典型对话流程\n- 帮助用户判断是否值得安装或继续研究\n\n## 哪些必须安装后验证\n\n- 真实安装 Skill、插件或 CLI\n- 执行脚本、修改本地文件或访问外部服务\n- 验证真实输出质量、性能和兼容性\n\n## 边界与风险判断卡\n\n- **把安装前预览误认为真实运行**：用户可能高估项目已经完成的配置、权限和兼容性验证。 处理方式：明确区分 prompt_preview_can_do 与 runtime_required。 Claim：`clm_0008` inferred 0.45\n- **命令执行会修改本地环境**：安装命令可能写入用户主目录、宿主插件目录或项目配置。 处理方式：先在隔离环境或测试账号中运行。 证据：`README.md` Claim：`clm_0009` supported 0.86\n- **待确认**：真实安装后是否与用户当前宿主 AI 版本兼容？。原因：兼容性只能通过实际宿主环境验证。\n- **待确认**：项目输出质量是否满足用户具体任务？。原因：安装前预览只能展示流程和边界，不能替代真实评测。\n- **待确认**：安装命令是否需要网络、权限或全局写入？。原因：这影响企业环境和个人环境的安装风险。\n\n## 开工前工作上下文\n\n### 加载顺序\n\n- 先读取 how_to_use.host_ai_instruction，建立安装前判断资产的边界。\n- 读取 claim_graph_summary，确认事实来自 Claim/Evidence Graph，而不是 Human Wiki 叙事。\n- 再读取 intended_users、capabilities 和 quick_start_candidates，判断用户是否匹配。\n- 需要执行具体任务时，优先查 role_skill_index，再查 evidence_index。\n- 遇到真实安装、文件修改、网络访问、性能或兼容性问题时，转入 risk_card 和 boundaries.runtime_required。\n\n### 任务路由\n\n- **AI Skill / Agent 指令资产库**：先基于 role_skill_index / evidence_index 帮用户挑选可用角色、Skill 或工作流。 边界：可做安装前 Prompt 体验。 
证据：`.claude/skills/draft-pr/SKILL.md`, `.claude/skills/pre-review/SKILL.md`, `.claude/skills/review-pr/SKILL.md` Claim：`clm_0001` supported 0.86\n- **命令行启动或安装流程**：先说明这是安装后验证能力，再给出安装前检查清单。 边界：必须真实安装或运行后验证。 证据：`README.md` Claim：`clm_0002` supported 0.86\n\n### 上下文规模\n\n- 文件总数：477\n- 重要文件覆盖：40/477\n- 证据索引条目：80\n- 角色 / Skill 条目：3\n\n### 证据不足时的处理\n\n- **missing_evidence**：说明证据不足，要求用户提供目标文件、README 段落或安装后验证记录；不要补全事实。\n- **out_of_scope_request**：说明该任务超出当前 AI Context Pack 证据范围，并建议用户先查看 Human Manual 或真实安装后验证。\n- **runtime_request**：给出安装前检查清单和命令来源，但不要替用户执行命令或声称已执行。\n- **source_conflict**：同时展示冲突来源，标记为待核实，不要强行选择一个版本。\n\n## Prompt Recipes\n\n### 适配判断\n\n- 目标：判断这个项目是否适合用户当前任务。\n- 预期输出：适配结论、关键理由、证据引用、安装前可预览内容、必须安装后验证内容、下一步建议。\n\n```text\n请基于 arksim 的 AI Context Pack，先问我 3 个必要问题，然后判断它是否适合我的任务。回答必须包含：适合谁、能做什么、不能做什么、是否值得安装、证据来自哪里。所有项目事实必须引用 evidence_refs、source_paths 或 claim_id。\n```\n\n### 安装前体验\n\n- 目标：让用户在安装前感受核心工作流，同时避免把预览包装成真实能力或营销承诺。\n- 预期输出：一段带边界标签的体验剧本、安装后验证清单和谨慎建议；不含真实运行承诺或强营销表述。\n\n```text\n请把 arksim 当作安装前体验资产，而不是已安装工具或真实运行环境。\n\n请严格输出四段：\n1. 先问我 3 个必要问题。\n2. 给出一段“体验剧本”：用 [安装前可预览]、[必须安装后验证]、[证据不足] 三种标签展示它可能如何引导工作流。\n3. 给出安装后验证清单：列出哪些能力只有真实安装、真实宿主加载、真实项目运行后才能确认。\n4. 
给出谨慎建议：只能说“值得继续研究/试装”“先补充信息后再判断”或“不建议继续”，不得替项目背书。\n\n硬性边界：\n- 不要声称已经安装、运行、执行测试、修改文件或产生真实结果。\n- 不要写“自动适配”“确保通过”“完美适配”“强烈建议安装”等承诺性表达。\n- 如果描述安装后的工作方式，必须使用“如果安装成功且宿主正确加载 Skill，它可能会……”这种条件句。\n- 体验剧本只能写成“示例台词/假设流程”：使用“可能会询问/可能会建议/可能会展示”，不要写“已写入、已生成、已通过、正在运行、正在生成”。\n- Prompt Preview 不负责给安装命令；如用户准备试装，只能提示先阅读 Quick Start 和 Risk Card，并在隔离环境验证。\n- 所有项目事实必须来自 supported claim、evidence_refs 或 source_paths；inferred/unverified 只能作风险或待确认项。\n\n```\n\n### 角色 / Skill 选择\n\n- 目标：从项目里的角色或 Skill 中挑选最匹配的资产。\n- 预期输出：候选角色或 Skill 列表，每项包含适用场景、证据路径、风险边界和是否需要安装后验证。\n\n```text\n请读取 role_skill_index，根据我的目标任务推荐 3-5 个最相关的角色或 Skill。每个推荐都要说明适用场景、可能输出、风险边界和 evidence_refs。\n```\n\n### 风险预检\n\n- 目标：安装或引入前识别环境、权限、规则冲突和质量风险。\n- 预期输出：环境、权限、依赖、许可、宿主冲突、质量风险和未知项的检查清单。\n\n```text\n请基于 risk_card、boundaries 和 quick_start_candidates，给我一份安装前风险预检清单。不要替我执行命令，只说明我应该检查什么、为什么检查、失败会有什么影响。\n```\n\n### 宿主 AI 开工指令\n\n- 目标：把项目上下文转成一次对话开始前的宿主 AI 指令。\n- 预期输出：一段边界明确、证据引用明确、适合复制给宿主 AI 的开工前指令。\n\n```text\n请基于 arksim 的 AI Context Pack，生成一段我可以粘贴给宿主 AI 的开工前指令。这段指令必须遵守 not_runtime=true，不能声称项目已经安装、运行或产生真实结果。\n```\n\n\n## 角色 / Skill 索引\n\n- 共索引 3 个角色 / Skill / 项目文档条目。\n\n- **draft-pr**（skill）：Generate a PR title and description from your changes 激活提示：当用户任务与“draft-pr”描述的流程高度相关时，先用它做安装前体验，再决定是否安装。 证据：`.claude/skills/draft-pr/SKILL.md`\n- **pre-review**（skill）：Check your branch before opening a PR lint, test, title validation 激活提示：当用户任务与“pre-review”描述的流程高度相关时，先用它做安装前体验，再决定是否安装。 证据：`.claude/skills/pre-review/SKILL.md`\n- **review-pr**（skill）：Review a pull request against arksim project standards 激活提示：当用户任务与“review-pr”描述的流程高度相关时，先用它做安装前体验，再决定是否安装。 证据：`.claude/skills/review-pr/SKILL.md`\n\n## 证据索引\n\n- 共索引 80 条证据。\n\n- **ArkSim Docs**（documentation）：Documentation for ArkSim, built with Mintlify https://mintlify.com . 证据：`docs/README.md`\n- **Contributing**（documentation）：- Local development: Follow the Developer Install section in the README to set up a Python 3.10–3.13 environment and install ArkSim in editable mode. 
- Documentation: For simulator configuration and usage, see the ArkSim docs https://doc.arklex.ai/ . 证据：`docs/CONTRIBUTING.md`\n- **arksim**（documentation）：Agent simulation and evaluation framework. Python 3.10+, Apache-2.0. 证据：`CLAUDE.md`\n- **What is ArkSim?**（documentation）：⛵️ ArkSim Simulate multi-turn conversations with your AI agent. Find failures before production. Documentation · Examples · Report a Bug 证据：`README.md`\n- **Agent Evaluation Example - Insurance Company**（documentation）：Agent Evaluation Example - Insurance Company 证据：`examples/bank-insurance/README.md`\n- **CI Integration**（documentation）：Run ArkSim as a quality gate in your CI pipeline. Choose the approach that matches how your agent is built. 证据：`examples/ci/README.md`\n- **Agent Evaluation Example - Customer Service**（documentation）：Agent Evaluation Example - Customer Service 证据：`examples/customer-service/README.md`\n- **Customer Service - A2A Server**（documentation）：A2A variant of the customer-service example. Same SQLite-backed tools and scenarios as the Python connector custom agent.py and traced traced agent.py variants, but connected via the A2A protocol. 证据：`examples/customer-service/a2a_server/README.md`\n- **Agent Evaluation Example - E-commerce**（documentation）：Agent Evaluation Example - E-commerce 证据：`examples/e-commerce/README.md`\n- **AutoGen Integration**（documentation）：This example demonstrates how to connect a Microsoft AutoGen https://github.com/microsoft/autogen agent to ArkSim using the custom agent connector. 证据：`examples/integrations/autogen/README.md`\n- **Claude Agent SDK Integration**（documentation）：This example demonstrates how to connect a Claude Agent SDK https://github.com/anthropics/claude-agent-sdk agent to ArkSim using the custom agent connector. 
证据：`examples/integrations/claude-agent-sdk/README.md`\n- **CrewAI Integration**（documentation）：This example demonstrates how to connect a CrewAI https://github.com/crewAIInc/crewAI agent to ArkSim using the custom agent connector. 证据：`examples/integrations/crewai/README.md`\n- **Dify Integration**（documentation）：This example connects a Dify https://dify.ai Chatbot app to ArkSim using the custom agent connector. It uses Dify's Chat API directly via httpx with no SDK dependency. 证据：`examples/integrations/dify/README.md`\n- **Google ADK Integration**（documentation）：This example demonstrates how to connect a Google ADK https://github.com/google/adk-python agent to ArkSim using the custom agent connector. 证据：`examples/integrations/google-adk/README.md`\n- **LangChain/LangGraph Integration**（documentation）：This example demonstrates how to connect a LangChain https://github.com/langchain-ai/langchain / LangGraph https://github.com/langchain-ai/langgraph agent to ArkSim using the custom agent connector. 证据：`examples/integrations/langchain/README.md`\n- **LangGraph Integration**（documentation）：This example demonstrates how to connect a LangGraph https://github.com/langchain-ai/langgraph agent to ArkSim using the custom agent connector. 证据：`examples/integrations/langgraph/README.md`\n- **LlamaIndex Integration**（documentation）：This example demonstrates how to connect a LlamaIndex https://github.com/run-llama/llama_index agent to ArkSim using the custom agent connector. 证据：`examples/integrations/llamaindex/README.md`\n- **Mastra Integration**（documentation）：This example demonstrates how to connect a Mastra https://github.com/mastra-ai/mastra agent to ArkSim via an HTTP server exposing an OpenAI-compatible chat completions endpoint. 
证据：`examples/integrations/mastra/README.md`\n- **OpenAI Agents SDK Integration**（documentation）：This example demonstrates how to connect an OpenAI Agents SDK https://github.com/openai/openai-agents-python agent to ArkSim using the custom agent connector. 证据：`examples/integrations/openai-agents-sdk/README.md`\n- **Pydantic AI Integration**（documentation）：This example demonstrates how to connect a Pydantic AI https://github.com/pydantic/pydantic-ai agent to ArkSim using the custom agent connector. 证据：`examples/integrations/pydantic-ai/README.md`\n- **Rasa Pro Integration**（documentation）：This example shows how to simulate and evaluate a Rasa Pro https://rasa.com/ assistant built with CALM https://rasa.com/docs/pro/ , Rasa's LLM-powered dialogue engine. CALM replaces traditional NLU pipelines with flow-based conversation management, where an LLM routes user messages to the right flow based on natural language descriptions. 证据：`examples/integrations/rasa/README.md`\n- **Smolagents Integration**（documentation）：This example demonstrates how to connect a Smolagents https://github.com/huggingface/smolagents Hugging Face agent to ArkSim using the custom agent connector. 证据：`examples/integrations/smolagents/README.md`\n- **Vercel AI SDK Integration**（documentation）：This example demonstrates how to connect a Vercel AI SDK https://github.com/vercel/ai agent to ArkSim via an HTTP server exposing an OpenAI-compatible chat completions endpoint. 证据：`examples/integrations/vercel-ai-sdk/README.md`\n- **Agent Evaluation Example - Personal AI Assistant**（documentation）：Agent Evaluation Example - Personal AI Assistant 证据：`examples/openclaw/README.md`\n- **Contributing to ArkSim**（documentation）：Thank you for your interest in contributing to ArkSim! This guide will help you get started. 
证据：`CONTRIBUTING.md`\n- **Package**（package_manifest）：{ \"name\": \"arksim-mastra-example\", \"private\": true, \"type\": \"module\", \"scripts\": { \"start\": \"npx tsx agent server.ts\" }, \"dependencies\": { \"@mastra/core\": \"^1.0\", \"@ai-sdk/openai\": \"^1.0\", \"hono\": \"^4.0\", \"@hono/node-server\": \"^1.0\", \"tsx\": \"^4.0\" } } 证据：`examples/integrations/mastra/package.json`\n- **Package**（package_manifest）：{ \"name\": \"arksim-vercel-ai-sdk-example\", \"private\": true, \"type\": \"module\", \"scripts\": { \"start\": \"npx tsx agent server.ts\" }, \"dependencies\": { \"@ai-sdk/openai\": \"^3.0.41\", \"@hono/node-server\": \"^1.0\", \"ai\": \"^6.0.116\", \"hono\": \"^4.0\", \"tsx\": \"^4.0\" } } 证据：`examples/integrations/vercel-ai-sdk/package.json`\n- **Draft PR**（skill_instruction）：Analyze commits and diff to produce a ready-to-use PR title and description. 证据：`.claude/skills/draft-pr/SKILL.md`\n- **Pre-review**（skill_instruction）：Run local checks on the current branch before opening a pull request. 证据：`.claude/skills/pre-review/SKILL.md`\n- **Review PR**（skill_instruction）：Review an open pull request for functional correctness and project conventions. Focus on logic, architecture, and behavior. Formatting issues are caught by CI ruff, pre-commit and do not need manual review. 
证据：`.claude/skills/review-pr/SKILL.md`\n- **License**（source_file）：Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ 证据：`LICENSE`\n- **License**（source_file）：Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files the \"Software\" , to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 证据：`docs/LICENSE`\n- **Summary**（documentation）：Type When to use Changelog section ----------- ------------------------------------ ----------------- feat New user-facing feature Added fix Bug fix Fixed docs Documentation only Documentation refactor Code restructuring, no behavior change Changed chore Maintenance, deps, CI config Changed ci CI/CD pipeline changes Changed build Build system or dependency changes Changed perf Performance improvement Changed test Adding or updating tests Changed style Formatting, whitespace Changed revert Reverting a previous commit Reverted 证据：`.github/pull_request_template.md`\n- **Changelog**（documentation）：All notable changes to ArkSim will be documented in this file. 证据：`CHANGELOG.md`\n- **Code of Conduct**（documentation）：We as members, contributors, and leaders pledge to make participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. 
证据：`CODE_OF_CONDUCT.md`\n- **Security Policy**（documentation）：Version Supported ------- ------------------ 0.2.x :white_check_mark: 0.1.x :white_check_mark: < 0.1 :x: 证据：`SECURITY.md`\n- **Python style rules**（documentation）：- Ruff format + ruff check enforced by hooks (auto-runs on every edit) - Line length: 88 characters (ruff format default). E501 disabled. - Ruff rules: E, F, FA, I, UP, ANN, B, SIM, C4, TID. Ignored: E501, UP045. - Every .py file starts with SPDX-License-Identifier: Apache-2.0 - Every module has `from __future__ import annotations` after the license header - Absolute imports preferred. Single-level relative imports (`from .module import ...`) are allowed. Multi-level relative imports (`from ..module import ...`) are not. - Type annotations on all function signatures - Both `str | None` and `Optional[str]` accepted (UP045 ignored); prefer `str | None` in new code 证据：`.claude/rules/python-style.md`\n- **What is Car Insurance and How Does it Work?**（documentation）：What is Car Insurance and How Does it Work? 证据：`examples/bank-insurance/agent_server/data/01_what_is_car_insurance.md`\n- **What is Home Insurance?**（documentation）：Owning a home may be your biggest investment, so how can you protect it? Home insurance is a form of property insurance that you can purchase to protect your property and your belongings. 证据：`examples/bank-insurance/agent_server/data/02_what_is_home_insurance.md`\n- **How Car Insurance Deductibles Work**（documentation）：When you choose an insurance policy, you agree to a deductible amount and a premium—these are both costs that you agree to pay. Deductibles are the dollar figure that you're responsible for paying upon settlement of an eventual claim—the deductible is effectively deducted from your total claim settlement (hence the name). 
证据：`examples/bank-insurance/agent_server/data/03_car_insurance_deductibles.md`\n- **How Home Insurance Deductibles Work**（documentation）：How Home Insurance Deductibles Work 证据：`examples/bank-insurance/agent_server/data/04_home_insurance_deductibles.md`\n- **Insurance Terminology: A Complete Glossary**（documentation）：Insurance Terminology: A Complete Glossary 证据：`examples/bank-insurance/agent_server/data/05_insurance_terminology.md`\n- **What to Do After a Car Accident**（documentation）：A car accident can be an intense experience to go through—and sometimes it's hard to remember what to do next. Follow these safety tips and steps to feel confident you'll be back on the road quickly. 证据：`examples/bank-insurance/agent_server/data/06_what_to_do_after_car_accident.md`\n- **How to File an Insurance Claim: Complete Guide**（documentation）：How to File an Insurance Claim: Complete Guide 证据：`examples/bank-insurance/agent_server/data/07_filing_insurance_claims.md`\n- **How to Prevent Water Damage in Your Home**（documentation）：How to Prevent Water Damage in Your Home 证据：`examples/bank-insurance/agent_server/data/08_preventing_water_damage.md`\n- **Condo Insurance: Complete Guide and FAQ**（documentation）：Condo Insurance: Complete Guide and FAQ 证据：`examples/bank-insurance/agent_server/data/09_condo_insurance_guide.md`\n- **Tenant Insurance: What Renters Need to Know**（documentation）：Tenant Insurance: What Renters Need to Know 证据：`examples/bank-insurance/agent_server/data/10_tenant_insurance.md`\n- **Car Insurance for New Drivers**（documentation）：Understanding Insurance for Beginners 证据：`examples/bank-insurance/agent_server/data/11_car_insurance_for_new_drivers.md`\n- **How Car Insurance Premiums Are Calculated**（documentation）：How Car Insurance Premiums Are Calculated 证据：`examples/bank-insurance/agent_server/data/12_how_car_insurance_premiums_calculated.md`\n- **How Home Insurance Premiums Are Calculated**（documentation）：How Home Insurance Premiums Are Calculated 
证据：`examples/bank-insurance/agent_server/data/13_how_home_insurance_premiums_calculated.md`\n- **Benefits of Bundling Insurance Policies**（documentation）：Benefits of Bundling Insurance Policies 证据：`examples/bank-insurance/agent_server/data/14_bundling_insurance_policies.md`\n- **Understanding Liability Insurance**（documentation）：Liability insurance protects you financially if you're held responsible for causing injury to someone else or damage to their property. It covers legal costs, settlements, and judgments against you. 证据：`examples/bank-insurance/agent_server/data/15_liability_insurance_explained.md`\n- **Comprehensive vs. Collision Coverage Explained**（documentation）：Comprehensive vs. Collision Coverage Explained 证据：`examples/bank-insurance/agent_server/data/16_comprehensive_vs_collision_coverage.md`\n- **Understanding Accident Forgiveness Coverage**（documentation）：Understanding Accident Forgiveness Coverage 证据：`examples/bank-insurance/agent_server/data/17_accident_forgiveness.md`\n- **Winter Driving Safety Tips**（documentation）：Winter Tires: - Winter tires significantly improve traction in snow and ice - They remain flexible in cold temperatures - Many provinces require or recommend them - Some insurers offer winter tire discounts 证据：`examples/bank-insurance/agent_server/data/18_winter_driving_safety.md`\n- **Protecting Your Home from Fire**（documentation）：House fires are one of the most devastating events a homeowner can face. Prevention is the best protection. Here's how to reduce your risk and protect your family. 
证据：`examples/bank-insurance/agent_server/data/19_protecting_home_from_fire.md`\n- **Motorcycle and Recreational Vehicle Insurance**（documentation）：Motorcycle and Recreational Vehicle Insurance 证据：`examples/bank-insurance/agent_server/data/20_motorcycle_rv_insurance.md`\n- **Identity Theft Protection and Insurance**（documentation）：Identity Theft Protection and Insurance 证据：`examples/bank-insurance/agent_server/data/21_identity_theft_protection.md`\n- **Small Business Insurance Guide**（documentation）：Why Small Businesses Need Insurance 证据：`examples/bank-insurance/agent_server/data/22_small_business_insurance.md`\n- **Cottage and Vacation Property Insurance**（documentation）：Cottage and Vacation Property Insurance 证据：`examples/bank-insurance/agent_server/data/23_cottage_vacation_property_insurance.md`\n- 其余 20 条证据见 `AI_CONTEXT_PACK.json` 或 `EVIDENCE_INDEX.json`。\n\n## 宿主 AI 必须遵守的规则\n\n- **把本资产当作开工前上下文，而不是运行环境。**：AI Context Pack 只包含证据化项目理解，不包含目标项目的可执行状态。 证据：`docs/README.md`, `docs/CONTRIBUTING.md`, `CLAUDE.md`\n- **回答用户时区分可预览内容与必须安装后才能验证的内容。**：安装前体验的消费者价值来自降低误装和误判，而不是伪装成真实运行。 证据：`docs/README.md`, `docs/CONTRIBUTING.md`, `CLAUDE.md`\n\n## 用户开工前应该回答的问题\n\n- 你准备在哪个宿主 AI 或本地环境中使用它？\n- 你只是想先体验工作流，还是准备真实安装？\n- 你最在意的是安装成本、输出质量、还是和现有规则的冲突？\n\n## 验收标准\n\n- 所有能力声明都能回指到 evidence_refs 中的文件路径。\n- AI_CONTEXT_PACK.md 没有把预览包装成真实运行。\n- 用户能在 3 分钟内看懂适合谁、能做什么、如何开始和风险边界。\n\n---\n\n## Doramagic Context Augmentation\n\n下面内容用于强化 Repomix/AI Context Pack 主体。Human Manual 只提供阅读骨架；踩坑日志会被转成宿主 AI 必须遵守的工作约束。\n\n## Human Manual 骨架\n\n使用规则：这里只是项目阅读路线和显著性信号，不是事实权威。具体事实仍必须回到 repo evidence / Claim Graph。\n\n宿主 AI 硬性规则：\n- 不得把页标题、章节顺序、摘要或 importance 当作项目事实证据。\n- 解释 Human Manual 骨架时，必须明确说它只是阅读路线/显著性信号。\n- 能力、安装、兼容性、运行状态和风险判断必须引用 repo evidence、source path 或 Claim Graph。\n\n- **Introduction to ArkSim**：importance `high`\n  - source_paths: README.md, arksim/__init__.py, arksim/constants.py\n- **Quickstart Guide**：importance `high`\n  - source_paths: arksim/cli.py, arksim/templates/my_agent.py, 
arksim/templates/config.yaml, arksim/templates/scenarios.json\n- **System Architecture Overview**：importance `high`\n  - source_paths: arksim/simulation_engine/simulator.py, arksim/evaluator/evaluator.py, arksim/llms/chat/llm.py, arksim/config/core/agent.py\n- **Simulation Engine**：importance `high`\n  - source_paths: arksim/simulation_engine/simulator.py, arksim/simulation_engine/entities.py, arksim/simulation_engine/core/multi_knowledge_handling.py, arksim/simulation_engine/agent/base.py, arksim/simulation_engine/utils/prompts.py\n- **Evaluation System**：importance `high`\n  - source_paths: arksim/evaluator/evaluator.py, arksim/evaluator/builtin_metrics.py, arksim/evaluator/base_metric.py, arksim/evaluator/tool_call_metrics.py, arksim/evaluator/trajectory_matching.py\n- **Agent Types and Integration**：importance `high`\n  - source_paths: arksim/simulation_engine/agent/base.py, arksim/simulation_engine/agent/clients/chat_completions.py, arksim/simulation_engine/agent/clients/a2a.py, arksim/simulation_engine/agent/clients/custom.py, arksim/simulation_engine/agent/factory.py\n- **LLM Provider Integration**：importance `medium`\n  - source_paths: arksim/llms/chat/providers/openai.py, arksim/llms/chat/providers/anthropic.py, arksim/llms/chat/providers/google.py, arksim/llms/chat/providers/azure_openai.py, arksim/llms/chat/llm.py\n- **Tool Call Capture**：importance `medium`\n  - source_paths: arksim/evaluator/tool_call_metrics.py, arksim/evaluator/error_detection.py, arksim/evaluator/error_scenarios.py, arksim/simulation_engine/tool_types.py\n\n## Repo Inspection Evidence / 源码检查证据\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `321d8003879fa2d27b3377845efdc93848a8b6cc`\n- inspected_files: `pyproject.toml`, `README.md`, `docs/README.md`, `docs/CONTRIBUTING.md`, `docs/docs.json`, `docs/snippets/snippet-intro.mdx`, `docs/v0.2.0/schema-reference.mdx`, `docs/v0.2.0/build-scenario.mdx`, `docs/v0.2.0/evaluate-conversation.mdx`, 
`docs/v0.2.0/insurance-customer-service-agent-evaluation.mdx`, `docs/v0.2.0/contact-email.mdx`, `docs/v0.2.0/e-commerce-customer-service-agent-evaluation.mdx`, `docs/v0.2.0/installation.mdx`, `docs/v0.2.0/quickstart.mdx`, `docs/v0.2.0/personal-ai-assistant-openclaw-evaluation.mdx`, `docs/v0.2.0/overview.mdx`, `docs/v0.2.0/simulate-conversation.mdx`, `docs/main/customer-service-tool-calling-agent-evaluation.mdx`, `docs/main/integrations.mdx`, `docs/main/tool-call-capture.mdx`\n\n宿主 AI 硬性规则：\n- 没有 repo_clone_verified=true 时，不得声称已经读过源码。\n- 没有 repo_inspection_verified=true 时，不得把 README/docs/package 文件判断写成事实。\n- 没有 quick_start_verified=true 时，不得声称 Quick Start 已跑通。\n\n## Doramagic Pitfall Constraints / 踩坑约束\n\n这些规则来自 Doramagic 发现、验证或编译过程中的项目专属坑点。宿主 AI 必须把它们当作工作约束，而不是普通说明文字。\n\n### Constraint 1: 来源证据：v0.1.0\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.1.0\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能影响升级、迁移或版本选择。\n- Evidence: community_evidence:github | cevd_d55b6435222f4142bf979809f0cf0794 | https://github.com/arklexai/arksim/releases/tag/v0.1.0 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 2: 来源证据：v0.0.6\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.0.6\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_57b2e0dc0cf94da8b4fc97ea744ae7e9 | https://github.com/arklexai/arksim/releases/tag/v0.0.6 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 3: 来源证据：v0.3.2\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.2\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_3a2981d341cb4512a410e7d75e03baf0 | https://github.com/arklexai/arksim/releases/tag/v0.3.2 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 
4: 来源证据：v0.3.4\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.4\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能阻塞安装或首次运行。\n- Evidence: community_evidence:github | cevd_f949170a7f5d440cb0559c40fb441178 | https://github.com/arklexai/arksim/releases/tag/v0.3.4 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 5: 来源证据：v0.3.5\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.5\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_314def1c94614699b6c9ad728c36e9ab | https://github.com/arklexai/arksim/releases/tag/v0.3.5 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 6: 来源证据：v0.3.1\n\n- Trigger: GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.3.1\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_c3e7552e9af84b95b2aec63c73dc7c26 | https://github.com/arklexai/arksim/releases/tag/v0.3.1 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 7: 能力判断依赖假设\n\n- Trigger: README/documentation is current enough for a first validation pass.\n- Host AI rule: 将假设转成下游验证清单。\n- Why it matters: 假设不成立时，用户拿不到承诺的能力。\n- Evidence: capability.assumptions | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | README/documentation is current enough for a first validation pass.\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 8: 来源证据：v0.3.3\n\n- Trigger: GitHub 社区证据显示该项目存在一个维护/版本相关的待验证问题：v0.3.3\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_03b0d8ce6f8a4bc2bc8e33a7636aa07c | https://github.com/arklexai/arksim/releases/tag/v0.3.3 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 
不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 9: 维护活跃度未知\n\n- Trigger: 未记录 last_activity_observed。\n- Host AI rule: 补 GitHub 最近 commit、release、issue/PR 响应信号。\n- Why it matters: 新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- Evidence: evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | last_activity_observed missing\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 10: 下游验证发现风险项\n\n- Trigger: no_demo\n- Host AI rule: 进入安全/权限治理复核队列。\n- Why it matters: 下游已经要求复核，不能在页面中弱化。\n- Evidence: downstream_validation.risk_items | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | no_demo; severity=medium\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n",
      "summary": "给宿主 AI 的上下文和工作边界。",
      "title": "AI Context Pack / 带给我的 AI"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card / 安装前决策卡\n\n项目：arklexai/arksim\n\n## Doramagic 试用结论\n\n当前结论：可以进入发布前推荐检查；首次使用仍应从最小权限、临时目录和可回滚配置开始。\n\n## 用户现在可以做\n\n- 可以先阅读 Human Manual，理解项目目的和主要工作流。\n- 可以复制 Prompt Preview 做安装前体验；这只验证交互感，不代表真实运行。\n- 可以把官方 Quick Start 命令放到隔离环境中验证，不要直接进主力环境。\n\n## 现在不要做\n\n- 不要把 Prompt Preview 当成项目实际运行结果。\n- 不要把 metadata-only validation 当成沙箱安装验证。\n- 不要把未验证能力写成“已支持、已跑通、可放心安装”。\n- 不要在首次试用时交出生产数据、私人文件、真实密钥或主力配置目录。\n\n## 安装前检查\n\n- 宿主 AI 是否匹配：chatgpt\n- 官方安装入口状态：已发现官方入口\n- 是否在临时目录、临时宿主或容器中验证：必须是\n- 是否能回滚配置改动：必须能\n- 是否需要 API Key、网络访问、读写文件或修改宿主配置：未确认前按高风险处理\n- 是否记录了安装命令、实际输出和失败日志：必须记录\n\n## 当前阻塞项\n\n- 无阻塞项。\n\n## 项目专属踩坑\n\n- 来源证据：v0.1.0（medium）：可能影响升级、迁移或版本选择。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：v0.0.6（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：v0.3.2（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：v0.3.4（medium）：可能阻塞安装或首次运行。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：v0.3.5（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n\n## 风险与权限提示\n\n- no_demo: medium\n\n## 证据缺口\n\n- 暂未发现结构化证据缺口。\n",
      "summary": "安装、权限、验证和推荐前风险。",
      "title": "Boundary & Risk Card / 边界与风险卡"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/arklexai/arksim 项目说明书\n\n生成时间：2026-05-15 12:11:39 UTC\n\n## 目录\n\n- [Introduction to ArkSim](#intro)\n- [Quickstart Guide](#quickstart)\n- [System Architecture Overview](#arch-overview)\n- [Simulation Engine](#simulation-engine)\n- [Evaluation System](#evaluation-system)\n- [Agent Types and Integration](#agent-types)\n- [LLM Provider Integration](#llm-providers)\n- [Tool Call Capture](#tool-call-capture)\n- [Scenario Management](#scenario-management)\n- [Configuration System](#configuration)\n\n<a id='intro'></a>\n\n## Introduction to ArkSim\n\n### 相关页面\n\n相关主题：[Quickstart Guide](#quickstart), [System Architecture Overview](#arch-overview)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n- [CONTRIBUTING.md](https://github.com/arklexai/arksim/blob/main/CONTRIBUTING.md)\n- [CLAUDE.md](https://github.com/arklexai/arksim/blob/main/CLAUDE.md)\n- [arksim/simulation_engine/tool_types.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n- [examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n- [examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n- [examples/integrations/langgraph/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/langgraph/README.md)\n- [examples/integrations/autogen/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/autogen/README.md)\n- [examples/ci/README.md](https://github.com/arklexai/arksim/blob/main/examples/ci/README.md)\n</details>\n\n# Introduction to ArkSim\n\nArkSim is a multi-turn agent evaluation and simulation platform designed to test, measure, and improve AI agent quality across conversational scenarios. 
It enables developers to define test scenarios, simulate user interactions with their agents, and generate comprehensive evaluation reports with quantitative and qualitative metrics.\n\n## Overview\n\nArkSim addresses the challenge of systematically evaluating conversational AI agents by providing:\n\n- **Scenario-based simulation**: Define test cases with user profiles, knowledge bases, and expected behaviors\n- **Automated evaluation**: Score agent responses across multiple dimensions including goal completion, helpfulness, and coherence\n- **Framework integration**: Connect with various agent frameworks including LangGraph, LangChain, AutoGen, Claude Agent SDK, Rasa, Dify, and OpenClaw\n- **Custom metrics**: Extend evaluation capabilities with user-defined quantitative and qualitative metrics\n- **Visual reporting**: Generate HTML reports with conversation transcripts, failure analysis, and per-metric scores\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Architecture Overview\n\nArkSim follows a pipeline architecture consisting of three primary stages:\n\n```mermaid\ngraph TD\n    A[Configuration<br/>config.yaml] --> B[Simulation Engine]\n    C[Scenarios<br/>scenarios.json] --> B\n    D[Agent<br/>custom_agent.py] --> B\n    B --> E[Conversation Data]\n    E --> F[Evaluator]\n    F --> G[HTML Report<br/>final_report.html]\n    H[Custom Metrics<br/>custom_metrics.py] --> F\n```\n\n### Core Components\n\n| Component | Description |\n|-----------|-------------|\n| **Simulation Engine** | Orchestrates multi-turn conversations between simulated users and the agent |\n| **Evaluator** | Scores agent performance using built-in and custom metrics |\n| **CLI Interface** | Command-line interface for running simulations (`arksim simulate-evaluate`) |\n| **Report Generator** | Produces HTML reports with visualizations and conversation transcripts 
|\n\n资料来源：[examples/customer-service/README.md:1-25](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Agent Integration Methods\n\nArkSim supports multiple agent integration patterns to accommodate different agent architectures.\n\n### Python Class Integration (BaseAgent)\n\nThe default integration method uses a Python class inheriting from `BaseAgent`. This pattern provides maximum flexibility for connecting any agent implementation.\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse\n\nclass MyAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Replace with your agent logic\n        return \"agent response\"\n```\n\nThe agent can return either:\n- A plain string response\n- An `AgentResponse` object containing both content and tool calls\n\n资料来源：[README.md:40-55](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### AgentResponse Data Model\n\nThe `AgentResponse` model encapsulates structured agent outputs:\n\n```python\nclass AgentResponse(BaseModel):\n    \"\"\"Structured return from agent execution, carrying both text and tool calls.\"\"\"\n    model_config = ConfigDict(extra=\"ignore\")\n    \n    content: str\n    tool_calls: list[ToolCall] = Field(default_factory=list)\n```\n\n资料来源：[arksim/simulation_engine/tool_types.py:32-42](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n\n### ToolCall Model\n\nTool calls are captured using the `ToolCall` model:\n\n```python\nclass ToolCall(BaseModel):\n    \"\"\"A single tool/function call observed during a turn.\"\"\"\n    model_config = ConfigDict(extra=\"ignore\")\n    \n    id: str\n    name: str\n    arguments: dict[str, Any] = Field(default_factory=dict)\n    result: str | None = None\n    error: str | None = 
None\n    source: ToolCallSource | None = None\n```\n\nThe `extra=\"ignore\"` configuration ensures forward compatibility with future versions.\n\n资料来源：[arksim/simulation_engine/tool_types.py:18-30](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n\n### Chat Completions Endpoint Integration\n\nFor agents exposing a standard OpenAI-compatible API:\n\n```yaml\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n```\n\n资料来源：[README.md:57-62](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### A2A Protocol Integration\n\nFor agents implementing the Agent-to-Agent (A2A) protocol:\n\n```yaml\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\n资料来源：[README.md:64-70](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### Automatic Tool Call Capture (Tracing)\n\nArkSim supports automatic tool call capture through a tracing processor, eliminating the need for agents to explicitly return tool calls:\n\n```mermaid\ngraph LR\n    A[Simulator sets<br/>routing context] --> B[\"agent.execute()\"]\n    B --> C[SDK fires<br/>TracingProcessor.on_span_end]\n    C --> D[arksim captures<br/>tool calls]\n    D --> E[Evaluator scores]\n```\n\n```bash\npip install -r requirements-traced.txt\narksim simulate-evaluate config_traced.yaml\n```\n\n资料来源：[examples/customer-service/README.md:35-55](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Configuration System\n\nArkSim uses YAML configuration files to define simulation and evaluation parameters.\n\n### Configuration File Structure\n\n```yaml\n# Simulation settings\nsimulation:\n  max_turns: 10\n  timeout: 300\n\n# Agent configuration\nagent_config:\n  agent_type: chat_completions  # or 'a2a', 'custom'\n  agent_name: my-agent\n  api_config:\n    endpoint: 
http://localhost:8000/v1/chat/completions\n\n# Trace receiver (optional)\ntrace_receiver:\n  enabled: true\n  wait_timeout: 5\n\n# Custom metrics\ncustom_metrics_file_paths:\n  - ./custom_metrics.py\n\nmetrics_to_run:\n  - goal_completion\n  - verification_compliance\n```\n\n资料来源：[examples/ci/README.md:1-25](https://github.com/arklexai/arksim/blob/main/examples/ci/README.md)\n\n### Scenario Definition\n\nScenarios are defined in `scenarios.json` with the following structure:\n\n| Field | Description |\n|-------|-------------|\n| `scenario_id` | Unique identifier for the scenario |\n| `user_profile` | Description of the simulated user |\n| `knowledge` | Context and information available to the user |\n| `goals` | Objectives the user aims to achieve |\n| `expected_behavior` | Expected agent behavior patterns |\n\n资料来源：[examples/integrations/dify/README.md:1-20](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md)\n\n## Evaluation Metrics\n\nArkSim provides built-in metrics and supports custom metric definitions.\n\n### Built-in Metrics\n\n| Metric | Description | Weight |\n|--------|-------------|--------|\n| Goal Completion | Whether the agent achieved the user's objectives | 60% |\n| Turn Success Ratio | Ratio of successful turns to total turns | 40% |\n| Final Score | Weighted average of other metrics | - |\n\n资料来源：[arksim/utils/html_report/report_template.html:80-95](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n### Custom Metrics\n\nCustom metrics can be implemented as either quantitative (scored numerically) or qualitative (evaluated via LLM).\n\n#### Quantitative Metric Pattern\n\n```python\nfrom arksim.evaluator import (\n    QuantitativeMetric,\n    QuantResult,\n    ScoreInput,\n    format_chat_history,\n)\n\nclass VerificationComplianceMetric(Queresult):\n    identity_verification: float  # 0.0-1.0\n    action_gating: float  # 0.0-1.0\n    reason: 
str\n\nVERIFICATION_COMPLIANCE_SYSTEM_PROMPT = \"\"\"\\\nYou are an impartial evaluator for a customer service agent.\nScore how well the agent followed identity verification protocols...\"\"\"\n\ndef verification_compliance_metric(\n    input_data: ScoreInput,\n) -> QuantResult:\n    # Implementation\n    return QuantResult(score=0.85, reason=\"Verification completed\")\n```\n\n资料来源：[examples/customer-service/custom_metrics.py:15-40](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\n#### Qualitative Metric Pattern\n\n```python\nfrom arksim.evaluator import (\n    QualitativeMetric,\n    QualResult,\n    ScoreInput,\n    format_chat_history,\n)\n\ndef helpfulness_metric(input_data: ScoreInput) -> QualResult:\n    system_prompt = \"\"\"Evaluate the helpfulness of the agent response...\"\"\"\n    return QualResult(\n        assessment=\"The agent provided comprehensive assistance\",\n        score=0.9,\n        reason=\"Clear explanation with actionable steps\"\n    )\n```\n\n### Metric Configuration\n\nTo use custom metrics:\n\n1. Implement the metric function in a Python file\n2. Reference the file path in `custom_metrics_file_paths` in config.yaml\n3. 
Add the metric name to `metrics_to_run`\n\n资料来源：[examples/customer-service/custom_metrics.py:1-15](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\n## CLI Usage\n\nArkSim provides a command-line interface for running simulations and evaluations.\n\n### Primary Commands\n\n| Command | Description |\n|---------|-------------|\n| `arksim simulate-evaluate config.yaml` | Run simulation and evaluation in one step |\n| `arksim simulate config_simulate.yaml` | Run simulation only |\n| `arksim evaluate config_evaluate.yaml` | Evaluate existing simulation results |\n\n资料来源：[examples/customer-service/README.md:15-22](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n### Workflow Examples\n\n#### Combined Simulation and Evaluation\n\n```bash\narksim simulate-evaluate config.yaml\n```\n\n#### Separate Simulation and Evaluation\n\n```bash\n# Step 1: Simulate\narksim simulate config_simulate.yaml\n\n# Step 2: Evaluate\narksim evaluate config_evaluate.yaml\n```\n\nResults are written to `./results/simulation/simulation.json`. 
The evaluation report is printed to stdout with per-scenario metric scores and failure analysis.\n\n资料来源：[examples/integrations/dify/README.md:15-25](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md)\n\n## Framework Integrations\n\nArkSim provides integration examples for popular agent frameworks.\n\n### Integration Matrix\n\n| Framework | Package Required | Documentation |\n|-----------|-----------------|---------------|\n| LangGraph | `langgraph langchain-openai` | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/langgraph/README.md) |\n| LangChain | `langgraph langchain-openai` | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/langchain/README.md) |\n| AutoGen | `autogen-agentchat autogen-ext[openai]` | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/autogen/README.md) |\n| Claude Agent SDK | `claude-agent-sdk` | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/claude-agent-sdk/README.md) |\n| Rasa | `rasa-pro` | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/rasa/README.md) |\n| Dify | (custom agent) | [Link](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md) |\n| OpenClaw | (custom agent) | [Link](https://github.com/arklexai/arksim/blob/main/examples/openclaw/README.md) |\n\n资料来源：[examples/integrations/langgraph/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/langgraph/README.md), [examples/integrations/autogen/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/autogen/README.md)\n\n### General Integration Pattern\n\nRegardless of the framework, the integration follows this pattern:\n\n1. Create a `custom_agent.py` file with a `BaseAgent` subclass\n2. Implement the `execute()` method to call the framework's agent\n3. Return either a string or `AgentResponse` with tool calls\n4. 
Configure `config.yaml` to use the custom agent\n5. Create `scenarios.json` with test cases\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse\n\nclass FrameworkAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-session-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Call your framework's agent here\n        result = await your_agent.run(user_query)\n        return str(result)\n```\n\n## Report Generation\n\nArkSim generates comprehensive HTML evaluation reports containing:\n\n### Report Sections\n\n| Section | Content |\n|---------|---------|\n| **Summary** | Overall scores, pass/fail status |\n| **Per-Scenario Scores** | Individual metric scores for each scenario |\n| **Failure Categories** | Grouped failure patterns with counts |\n| **Conversation Viewer** | Full transcript of each simulated conversation |\n| **Score Explanations** | Rationale for each score with references to conversation turns |\n\n资料来源：[README.md:25-30](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### Report Output\n\nThe report tells you where your agent is strong and where it breaks. You get per-metric scores, categorized failures, and full conversation transcripts so you can read the exact turns where things went wrong.\n\n资料来源：[README.md:30-35](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Development Guidelines\n\n### Project Setup\n\n```bash\n# Fork and clone\ngit clone https://github.com/<your-username>/arksim.git\ncd arksim\n\n# Install in editable mode with dev dependencies\npip install -e \".[dev]\"\n\n# Create a branch\ngit checkout -b my-feature\n```\n\n### Code Quality\n\nArkSim uses Ruff for linting and formatting:\n\n```bash\nruff check .    # Lint\nruff format .   
# Format\n```\n\nPre-commit hooks run both automatically on commit.\n\n### Code Style\n\n- Follow PEP 8 conventions\n- Code lines: maximum 120 characters\n- Comments and docstrings: maximum 80 characters\n- Type hints encouraged for function signatures\n- Use absolute imports over relative imports\n\n### Commit Message Format\n\n```\n<component>: <verb> <description>\n```\n\nExamples:\n- `evaluator: add custom metric support`\n- `simulator: fix profile generation for empty attributes`\n- `cli: support verbose flag for streaming output`\n\nKeep the subject line under 72 characters, use lowercase, imperative mood.\n\n### Branch Naming\n\n```\n<type>/<short-description>\n```\n\nExamples: `feat/retry-logic`, `fix/empty-list-handling`, `docs/update-quickstart`.\n\n资料来源：[CONTRIBUTING.md:1-50](https://github.com/arklexai/arksim/blob/main/CONTRIBUTING.md)\n\n## CI/CD Integration\n\nArkSim can be integrated into GitHub Actions workflows for automated testing.\n\n### Workflow Options\n\n| Workflow | Use Case |\n|----------|----------|\n| `arksim.yml` | HTTP server endpoints requiring startup/shutdown |\n| `arksim-pytest.yml` | Python-based pytest integration |\n\n### Setup Requirements\n\n1. Update `TODO` sections in `arksim.yml` (startup command and health-check URL)\n2. Create `tests/arksim/config.yaml` pointing to your server endpoint\n3. Create `tests/arksim/scenarios.json` with test cases\n4. Add custom metrics to `tests/arksim/custom_metrics/` if needed\n5. Add `OPENAI_API_KEY` (and optionally `AGENT_API_KEY`) to GitHub secrets\n\n资料来源：[examples/ci/README.md:1-15](https://github.com/arklexai/arksim/blob/main/examples/ci/README.md)\n\n## Summary\n\nArkSim provides a comprehensive framework for evaluating multi-turn conversational AI agents through:\n\n1. **Flexible integration**: Connect agents via Python classes, REST APIs, or framework-specific connectors\n2. **Rich scenario definitions**: Test agents with diverse user profiles and interaction goals\n3. 
**Extensible evaluation**: Implement custom metrics for domain-specific assessment\n4. **Actionable reporting**: Generate HTML reports that pinpoint exactly where and why agent behavior falls short\n\nBy standardizing agent evaluation, ArkSim enables data-driven improvements to conversational AI systems across different frameworks and architectures.\n\n---\n\n<a id='quickstart'></a>\n\n## Quickstart Guide\n\n### 相关页面\n\n相关主题：[Introduction to ArkSim](#intro), [Agent Types and Integration](#agent-types)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n- [arksim/simulation_engine/agent/base.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/agent/base.py)\n- [arksim/simulation_engine/tool_types.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n- [examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n- [examples/integrations/langchain/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/langchain/README.md)\n- [README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n</details>\n\n# Quickstart Guide\n\nThis guide walks you through setting up and running your first agent simulation with ArkSim. By the end, you'll understand how to scaffold a project, connect your agent, define test scenarios, and execute simulation and evaluation workflows.\n\n## Prerequisites\n\nBefore starting, ensure you have:\n\n- Python 3.10 or higher\n- pip or pipx for package installation\n- An API key for your LLM provider (OpenAI, Anthropic, etc.) 
if your agent requires external API calls\n\n## Installation\n\nInstall ArkSim using pip:\n\n```bash\npip install arksim\n```\n\nFor development with test dependencies:\n\n```bash\npip install -e \".[dev]\"\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Project Initialization\n\nThe `arksim init` command scaffolds a starter project with all necessary configuration files. Run it from your desired working directory:\n\n```bash\narksim init\n```\n\n### Agent Connection Types\n\nArkSim supports three agent connection types via the `--agent-type` flag:\n\n| Type | Description | Use Case |\n|------|-------------|----------|\n| `custom` | Python class implementing `BaseAgent` | Full control, no external server needed |\n| `chat_completions` | HTTP endpoint compatible with OpenAI format | Existing REST APIs |\n| `a2a` | Agent-to-Agent protocol | Multi-agent systems |\n\n资料来源：[arksim/cli.py:120-136](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n### Command Options\n\n```bash\narksim init --agent-type custom    # Default: Python agent class\narksim init --agent-type chat_completions  # HTTP endpoint\narksim init --agent-type a2a       # A2A protocol\narksim init --agent-type custom --force  # Overwrite existing files\n```\n\nScaffolding generates these files:\n\n| File | Purpose |\n|------|---------|\n| `my_agent.py` | Agent implementation stub |\n| `config.yaml` | Simulator configuration |\n| `scenarios.json` | Test scenario definitions |\n| `.env` | Environment variables template |\n\n## Project Structure\n\n```\nyour-project/\n├── my_agent.py        # Your agent implementation\n├── config.yaml         # Simulation & evaluation settings\n├── scenarios.json     # Test scenarios\n└── results/            # Output directory (created on run)\n    ├── simulation/\n    │   └── simulation.json\n    └── evaluation/\n        └── evaluation.json\n```\n\n## Connecting Your Agent\n\n### Python Class (Recommended)\n\nReplace the 
generated `my_agent.py` with your agent logic. Your class must inherit from `BaseAgent`:\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse\n\nclass MyAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Your agent logic here\n        return \"agent response\"\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\nThe `execute` method supports two return types:\n\n| Return Type | Description |\n|-------------|-------------|\n| `str` | Plain text response |\n| `AgentResponse` | Structured response with text and tool calls |\n\n```python\nfrom arksim.simulation_engine.tool_types import AgentResponse, ToolCall\n\nasync def execute(self, user_query: str, **kwargs: object) -> AgentResponse:\n    tool_calls = [\n        ToolCall(\n            id=\"call_123\",\n            name=\"search_database\",\n            arguments={\"query\": user_query}\n        )\n    ]\n    return AgentResponse(\n        content=\"Found results for your query\",\n        tool_calls=tool_calls\n    )\n```\n\n资料来源：[arksim/simulation_engine/tool_types.py:45-72](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n\n### Chat Completions Endpoint\n\nFor HTTP-based agents, configure `config.yaml`:\n\n```yaml\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n```\n\n### A2A Protocol\n\nFor Agent-to-Agent protocol support:\n\n```yaml\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\nA2A agents can surface tool calls for evaluation via the protocol extension.\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Configuring 
Simulation\n\nEdit `config.yaml` to control simulation behavior:\n\n```yaml\nsimulator:\n  max_turns: 20\n  timeouts:\n    agent_response: 30\n    tool_execution: 10\n  model: gpt-4o-mini\n```\n\n### Core Configuration Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `max_turns` | int | 20 | Maximum conversation turns per scenario |\n| `agent_response` | int | 30 | Timeout for agent response (seconds) |\n| `tool_execution` | int | 10 | Timeout for tool execution (seconds) |\n| `model` | string | gpt-4o-mini | Simulator model for user simulation |\n\n## Defining Test Scenarios\n\nEdit `scenarios.json` to define your test cases:\n\n```json\n[\n  {\n    \"id\": \"scenario-001\",\n    \"name\": \"Customer Inquiry\",\n    \"category\": \"support\",\n    \"user_profile\": {\n      \"name\": \"Alice Johnson\",\n      \"age\": 34,\n      \"account_type\": \"premium\"\n    },\n    \"knowledge\": [\n      \"Customer has a premium account\",\n      \"Customer is inquiring about billing\"\n    ],\n    \"conversation\": [\n      {\n        \"turn\": 1,\n        \"user\": \"I noticed a charge on my account that I don't recognize\",\n        \"goal\": \"Customer wants to understand and dispute the unfamiliar charge\"\n      }\n    ]\n  }\n]\n```\n\n### Scenario Schema Fields\n\n| Field | Required | Description |\n|-------|----------|-------------|\n| `id` | Yes | Unique scenario identifier |\n| `name` | Yes | Human-readable name |\n| `category` | No | Grouping category for filtering |\n| `user_profile` | No | Simulated user attributes |\n| `knowledge` | No | Facts the simulated user knows |\n| `conversation` | Yes | Multi-turn conversation structure |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Running Simulation and Evaluation\n\n### Combined Workflow\n\nRun both simulation and evaluation in one command:\n\n```bash\narksim 
simulate-evaluate config.yaml\n```\n\n### Separate Steps\n\nFor more control, run simulation and evaluation separately:\n\n```bash\n# Step 1: Simulate\narksim simulate config_simulate.yaml\n\n# Step 2: Evaluate\narksim evaluate config_evaluate.yaml\n```\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Command Reference\n\n| Command | Description |\n|---------|-------------|\n| `arksim init` | Scaffold new project |\n| `arksim simulate <config>` | Run simulation only |\n| `arksim evaluate <config>` | Run evaluation only |\n| `arksim simulate-evaluate <config>` | Run both steps |\n| `arksim ui` | Launch web UI (port 8080) |\n| `arksim examples` | Download example projects |\n| `arksim prompts` | List available prompts |\n\n资料来源：[arksim/cli.py:89-152](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n## Web UI\n\nLaunch the web-based control plane:\n\n```bash\narksim ui --port 8080\n```\n\nThe UI provides:\n- Scenario management\n- Real-time simulation progress\n- Log viewing\n- Dark/light mode toggle\n\n## Integration Examples\n\nArkSim supports integrations with popular agent frameworks:\n\n| Framework | Command |\n|-----------|---------|\n| LangChain/LangGraph | `pip install langgraph langchain-openai` |\n| Claude Agent SDK | `pip install claude-agent-sdk` |\n| Microsoft AutoGen | `pip install autogen-agentchat autogen-ext[openai]` |\n| Dify | HTTP-based integration |\n\nExample workflow for LangGraph:\n\n```bash\ncd examples/integrations/langgraph\npip install langgraph langchain-openai\nexport OPENAI_API_KEY=\"<your-key>\"\narksim simulate-evaluate config.yaml\n```\n\n资料来源：[examples/integrations/langchain/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/langchain/README.md)\n\n## Next Steps\n\n- Review the [Evaluation Metrics](#evaluation-system) documentation for customizing scoring\n- Explore the [Custom Agent](#agent-types) guide for advanced 
integration patterns\n- Download example projects with `arksim examples`\n\n---\n\n<a id='arch-overview'></a>\n\n## System Architecture Overview\n\n### 相关页面\n\n相关主题：[Simulation Engine](#simulation-engine), [Evaluation System](#evaluation-system), [LLM Provider Integration](#llm-providers)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/simulation_engine/tool_types.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n- [arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n- [README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n- [examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n- [examples/e-commerce/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/e-commerce/custom_metrics.py)\n- [arksim/utils/html_report/report_template.html](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n</details>\n\n# System Architecture Overview\n\nArkSim is a multi-turn agent evaluation framework that simulates user conversations with agents, captures tool calls, and scores agent performance against customizable metrics. 
This document provides a comprehensive overview of the system's architecture, core components, and data flows.\n\n## Core Components\n\nArkSim's architecture is organized into three primary subsystems that work together to simulate, capture, and evaluate agent behavior.\n\n| Component | Purpose |\n|-----------|---------|\n| Simulation Engine | Orchestrates multi-turn conversations between user simulators and agents |\n| Evaluator | Scores agent responses against qualitative and quantitative metrics |\n| LLM Layer | Powers user simulation and metric evaluation via configurable model providers |\n\n## High-Level Architecture\n\n```mermaid\ngraph TD\n    subgraph Simulation[\"Simulation Engine\"]\n        CLI[CLI Interface]\n        Config[Config Loader]\n        Simulator[Simulator Core]\n        AgentConnector[Agent Connector]\n        UserSimulator[User Simulator]\n    end\n\n    subgraph Evaluation[\"Evaluator\"]\n        Metrics[Custom Metrics]\n        Scoring[Scoring Engine]\n        Report[HTML Report Generator]\n    end\n\n    subgraph LLM[\"LLM Layer\"]\n        ChatLLM[Chat LLM]\n        EvalLLM[Evaluation LLM]\n    end\n\n    subgraph External[\"External Systems\"]\n        CustomAgent[Custom Agent]\n        HTTPEndpoint[HTTP Endpoint]\n        A2AAgent[A2A Agent]\n    end\n\n    CLI --> Config\n    Config --> Simulator\n    Simulator --> UserSimulator\n    Simulator --> AgentConnector\n    AgentConnector --> CustomAgent\n    AgentConnector --> HTTPEndpoint\n    AgentConnector --> A2AAgent\n    UserSimulator --> ChatLLM\n    Metrics --> EvalLLM\n    EvalLLM --> Scoring\n    Scoring --> Report\n\n    style CLI fill:#e1f5fe\n    style Simulator fill:#fff3e0\n    style Report fill:#e8f5e9\n```\n\n## CLI Interface\n\nThe command-line interface provides multiple entry points for running simulations and evaluations. 
The CLI is implemented in `arksim/cli.py` and supports the following commands:\n\n| Command | Description |\n|---------|-------------|\n| `arksim simulate-evaluate config.yaml` | Run simulation and evaluation in a single pipeline |\n| `arksim simulate config.yaml` | Run simulation only |\n| `arksim evaluate config.yaml` | Run evaluation on existing simulation results |\n| `arksim init` | Scaffold starter files for agent testing |\n| `arksim ui` | Launch web UI control plane |\n| `arksim examples` | Download example projects from GitHub |\n\n资料来源：[arksim/cli.py:1-100](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n### Simulation Modes\n\nArkSim supports three distinct agent integration patterns:\n\n```mermaid\ngraph LR\n    subgraph AgentTypes[\"Agent Types\"]\n        Custom[Custom Agent<br/>Python class extending BaseAgent]\n        HTTP[Chat Completions<br/>HTTP endpoint /v1/chat/completions]\n        A2A[A2A Protocol<br/>Agent-to-Agent standard]\n    end\n```\n\n1. **Custom Agent (Python class)**: Extend `BaseAgent` and implement the `execute()` method\n2. **Chat Completions**: Configure an HTTP endpoint for OpenAI-compatible chat completions API\n3. **A2A Protocol**: Connect via the Agent-to-Agent protocol standard\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Simulation Engine\n\nThe simulation engine orchestrates multi-turn conversations between a user simulator and the agent under test. Each turn consists of:\n\n1. User simulator generates a response based on conversation history and scenario knowledge\n2. Agent executes the user query and returns a response (optionally with tool calls)\n3. 
Simulator captures the interaction for evaluation\n\n### Tool Call Capture\n\nArkSim captures tool/function calls in two ways:\n\n| Capture Method | Mechanism | Configuration |\n|----------------|-----------|---------------|\n| AgentResponse | Agent returns structured `AgentResponse` with `tool_calls` list | Default behavior |\n| TracingProcessor | SDK's `TracingProcessor.on_span_end` captures calls automatically | `trace_receiver.enabled: true` |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n#### Tool Call Data Model\n\nTool calls are represented by the `ToolCall` class:\n\n```python\nclass ToolCall(BaseModel):\n    id: str\n    name: str\n    arguments: dict[str, Any] = Field(default_factory=dict)\n    result: str | None = None\n    error: str | None = None\n    source: ToolCallSource | None = None\n```\n\nThe `AgentResponse` wraps both content and tool calls:\n\n```python\nclass AgentResponse(BaseModel):\n    content: str\n    tool_calls: list[ToolCall] = Field(default_factory=list)\n```\n\n资料来源：[arksim/simulation_engine/tool_types.py:1-50](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n\n### Traced Agent Flow\n\n```mermaid\nsequenceDiagram\n    participant Simulator\n    participant Agent\n    participant Tracing as ArksimTracingProcessor\n    participant Evaluator\n\n    Simulator->>Agent: execute(user_query)\n    Agent->>Tracing: on_span_end(span)\n    Tracing->>Simulator: captured_tool_calls\n    Simulator->>Agent: RunResult\n    Agent->>Simulator: AgentResponse\n    Simulator->>Evaluator: Tool calls + Conversation\n    Evaluator->>Simulator: Metric Scores\n```\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Evaluator\n\nThe evaluator scores agent performance using both quantitative and qualitative metrics. 
The evaluation framework supports custom metrics defined via Python files.\n\n### Metric Types\n\n| Type | Description | Scoring Method |\n|------|-------------|----------------|\n| QuantitativeMetric | Numerical scores (0.0-1.0 scale) | Structured JSON schema validation |\n| QualitativeMetric | Free-form evaluation with reasoning | LLM-generated analysis |\n\n资料来源：[examples/customer-service/custom_metrics.py:1-60](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\n### Custom Metrics Structure\n\nCustom metrics require:\n\n1. A Pydantic `BaseModel` defining the output schema\n2. A system prompt describing evaluation criteria\n3. Implementation of `QuantitativeMetric` or `QualitativeMetric`\n\n```python\nclass ConversionSchema(BaseModel):\n    intent_strength: float\n    conversion_outcome: float\n    evidence: list[str]\n    reason: str\n\nCONVERSION_SYSTEM_PROMPT = \"\"\"\\\nYou are an impartial evaluator for an e-commerce shopping agent.\nYour job is to score (1) the shopper's purchase intent and (2) whether the agent achieved a conversion outcome...\n\"\"\"\n```\n\n资料来源：[examples/e-commerce/custom_metrics.py:1-40](https://github.com/arklexai/arksim/blob/main/examples/e-commerce/custom_metrics.py)\n\n### Score Calculation\n\nThe evaluator computes a **Final Score** as a weighted average:\n\n| Component | Weight | Description |\n|-----------|--------|-------------|\n| Turn Success Ratio | 40% | Ratio of successful turns to total turns |\n| Goal Completion Score | 60% | LLM-assessed goal achievement score |\n\n| Status | Condition |\n|--------|-----------|\n| Done | Final score = 1.0 |\n| Partial Failure | 0.0 < Final score < 1.0 |\n| Complete Failure | Final score = 0.0 |\n\n资料来源：[arksim/utils/html_report/report_template.html:1-50](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n## Data Flow\n\n```mermaid\ngraph LR\n    subgraph Input[\"Input\"]\n        Config[config.yaml]\n  
      Scenarios[scenarios.json]\n        Metrics[custom_metrics.py]\n    end\n\n    subgraph Process[\"Processing\"]\n        Sim[Simulation]\n        Eval[Evaluation]\n        Score[Scoring]\n    end\n\n    subgraph Output[\"Output\"]\n        Results[simulation.json]\n        Report[evaluation.html]\n    end\n\n    Config --> Sim\n    Scenarios --> Sim\n    Sim --> Eval\n    Metrics --> Eval\n    Eval --> Score\n    Score --> Results\n    Score --> Report\n```\n\n## Agent Connector Types\n\nArkSim supports multiple agent integration patterns via the configuration system:\n\n```yaml\n# Option 1: Custom Python Agent\nagent_config:\n  agent_type: custom\n  agent_name: my-agent\n\n# Option 2: Chat Completions HTTP Endpoint\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n\n# Option 3: A2A Protocol\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## User Simulator\n\nThe user simulator generates realistic multi-turn conversations based on:\n\n- **Scenario definitions**: Predefined conversation flows and expected behaviors\n- **User profiles**: Demographic and behavioral attributes for persona simulation\n- **Knowledge bases**: Domain-specific information the simulated user possesses\n- **Conversation history**: Full context of the multi-turn interaction\n\nThe simulator uses LLM-based generation to produce contextually appropriate user responses that evolve through the conversation based on agent actions and conversation state.\n\n## HTML Report Generation\n\nAfter evaluation completes, ArkSim generates an interactive HTML report containing:\n\n| Section | Content |\n|---------|---------|\n| Summary Statistics | Overall scores, pass/fail rates |\n| Per-Scenario Metrics | Individual metric scores with reasoning |\n| Failure 
Categories | Grouped analysis of failure types |\n| Conversation Transcripts | Full turn-by-turn dialogue viewer |\n\n资料来源：[arksim/utils/html_report/report_template.html:1-100](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n## Integration Examples\n\nArkSim provides example integrations for popular agent frameworks:\n\n| Framework | Example Location | Connection Method |\n|-----------|-----------------|-------------------|\n| LangGraph | `examples/integrations/langgraph/` | Custom agent connector |\n| LangChain | `examples/integrations/langchain/` | Custom agent connector |\n| Claude Agent SDK | `examples/integrations/claude-agent-sdk/` | Custom agent connector |\n| AutoGen | `examples/integrations/autogen/` | Custom agent connector |\n| Pydantic AI | `examples/integrations/pydantic-ai/` | Custom agent connector |\n| Dify | `examples/integrations/dify/` | HTTP client |\n\n资料来源：[examples/integrations/langgraph/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/langgraph/README.md), [examples/integrations/claude-agent-sdk/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/claude-agent-sdk/README.md), [examples/integrations/autogen/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/autogen/README.md)\n\n## Configuration Schema\n\nThe main configuration file (`config.yaml`) controls all aspects of simulation and evaluation:\n\n| Section | Key Options |\n|---------|-------------|\n| `agent_config` | `agent_type`, `agent_name`, `api_config.endpoint` |\n| `scenario_file` | Path to scenarios.json |\n| `metrics_to_run` | List of metric names to execute |\n| `custom_metrics_file_paths` | Paths to custom metric Python files |\n| `trace_receiver.enabled` | Enable TracingProcessor capture (default: false) |\n| `trace_receiver.wait_timeout` | Timeout for trace capture (seconds) 
|\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Web UI\n\nArkSim includes a web-based control plane for managing simulations:\n\n```mermaid\ngraph TD\n    subgraph UI[\"Web UI\"]\n        Build[Build Scenarios]\n        Load[Load Existing]\n        Run[Run Simulation]\n        View[View Results]\n    end\n\n    subgraph Features[\"UI Features\"]\n        AutoGen[Auto-generate Scenarios PRO]\n        Browse[File Browser]\n        Refresh[Refresh Results]\n    end\n```\n\nFeatures include:\n- Scenario building and loading\n- Auto-generate scenarios (PRO feature)\n- File browser integration\n- Results viewing and refresh\n\n资料来源：[arksim/ui/frontend/index.html:1-80](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html)\n\n## Summary\n\nArkSim provides a comprehensive multi-turn agent evaluation framework with:\n\n- **Flexible agent integration**: Support for custom Python agents, HTTP endpoints, and A2A protocol\n- **Tool call capture**: Two mechanisms for capturing agent tool executions\n- **Customizable metrics**: Both quantitative and qualitative evaluation approaches\n- **Interactive reporting**: HTML-based results with conversation viewer\n- **CLI and UI**: Command-line and web-based interfaces for running evaluations\n- **Framework integrations**: Pre-built examples for LangGraph, LangChain, Claude SDK, AutoGen, Pydantic AI, and Dify\n\n---\n\n<a id='simulation-engine'></a>\n\n## Simulation Engine\n\n### 相关页面\n\n相关主题：[System Architecture Overview](#arch-overview), [Evaluation System](#evaluation-system), [Scenario Management](#scenario-management)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/simulation_engine/simulator.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/simulator.py)\n- 
[arksim/simulation_engine/entities.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/entities.py)\n- [arksim/simulation_engine/core/multi_knowledge_handling.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/core/multi_knowledge_handling.py)\n- [arksim/simulation_engine/agent/base.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/agent/base.py)\n- [arksim/simulation_engine/tool_types.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n- [arksim/simulation_engine/utils/prompts.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/utils/prompts.py)\n</details>\n\n# Simulation Engine\n\nThe Simulation Engine is the core component of ArkSim responsible for executing multi-turn conversations between simulated users and agent systems under test. It orchestrates the entire simulation lifecycle, from scenario loading to agent execution, while capturing tool calls, responses, and conversation state for downstream evaluation.\n\n## Architecture Overview\n\nThe Simulation Engine follows a layered architecture that separates concerns between scenario management, agent execution, tool call capture, and knowledge handling.\n\n```mermaid\ngraph TD\n    A[Scenarios JSON] --> B[Simulator]\n    C[Config YAML] --> B\n    B --> D[Agent Executor]\n    D --> E[BaseAgent]\n    E --> F[Custom Agent / Chat Completions / A2A]\n    F --> G[Tool Calls Capture]\n    G --> H[Simulation Results]\n    D --> I[Multi-Knowledge Handler]\n    I --> J[Knowledge Sources]\n    \n    style B fill:#e1f5fe\n    style D fill:#fff3e0\n    style E fill:#e8f5e9\n```\n\n### Core Components\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| Simulator | `simulator.py` | Orchestrates the simulation lifecycle |\n| Entities | `entities.py` | Data models for scenarios, profiles, and turns |\n| Agent Base | `agent/base.py` | Abstract base class for all agent implementations 
|\n| Tool Types | `tool_types.py` | Data models for tool calls and agent responses |\n| Multi-Knowledge Handler | `core/multi_knowledge_handling.py` | Manages multiple knowledge sources for user profiles |\n| Prompt Utilities | `utils/prompts.py` | Generates prompts for simulated users |\n\n## Data Models\n\n### ToolCall\n\nThe `ToolCall` class represents a single tool or function call observed during a conversation turn.\n\n```python\nclass ToolCall(BaseModel):\n    id: str\n    name: str\n    arguments: dict[str, Any] = Field(default_factory=dict)\n    result: str | None = None\n    error: str | None = None\n    source: ToolCallSource | None = None\n```\n\nKey characteristics:\n- Declares `extra=\"ignore\"` for forward compatibility with future versions\n- Supports capturing arguments, results, and error states\n- Tracks the source of tool calls (e.g., from response parsing or tracing)\n\n资料来源：[arksim/simulation_engine/tool_types.py:26-37](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n\n### AgentResponse\n\nThe `AgentResponse` class provides a structured return from agent execution.\n\n```python\nclass AgentResponse(BaseModel):\n    content: str\n    tool_calls: list[ToolCall] = Field(default_factory=list)\n```\n\n资料来源：[arksim/simulation_engine/tool_types.py:45-52](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n\n## BaseAgent Interface\n\nAll agent implementations must inherit from `BaseAgent` and implement the required async methods.\n\n```python\nclass MyAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Replace with your agent logic\n        return \"agent response\"\n```\n\n资料来源：[README.md:45-55](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### Required Methods\n\n| Method | Return Type | Description |\n|--------|-------------|-------------|\n| `get_chat_id()` | `str` | Returns a unique identifier for the chat session |\n| `execute()` | `str \\| AgentResponse` | Executes the agent with a user query and returns content with optional tool calls 
|\n\n资料来源：[README.md:45-55](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Simulation Workflow\n\n```mermaid\ngraph LR\n    A[Load Config] --> B[Load Scenarios]\n    B --> C[Initialize Agent]\n    C --> D[For Each Scenario]\n    D --> E[Generate User Profile]\n    E --> F[Execute Turn]\n    F --> G[Capture Tool Calls]\n    G --> H[Record Response]\n    H --> I{More Turns?}\n    I -->|Yes| F\n    I -->|No| J[Save Results]\n    J --> K{More Scenarios?}\n    K -->|Yes| D\n    K -->|No| L[Complete]\n```\n\n### Step 0: Build Scenarios\n\nBefore running a simulation, users must create or load test scenarios. The UI provides options to:\n\n- **Auto-generate Scenarios** (Pro feature) - Automatically generate realistic test scenarios from the agent's knowledge base\n- **Load Existing** - Load scenario files from a specified path\n\n资料来源：[arksim/ui/frontend/index.html:1-50](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html)\n\n### Step 1: Simulation Execution\n\nThe simulator executes each scenario by:\n\n1. Loading the scenario configuration and user profiles\n2. Generating multi-turn conversations based on the scenario goals\n3. Capturing all tool calls and responses\n4. 
Producing a `simulation.json` output file\n\n资料来源：[examples/integrations/dify/README.md:18-20](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md)\n\n## Agent Types\n\nArkSim supports multiple agent integration patterns:\n\n| Agent Type | Configuration | Use Case |\n|------------|---------------|----------|\n| Custom Python Class | `agent_type: custom` | Full control via subclassing `BaseAgent` |\n| Chat Completions | `agent_type: chat_completions` | HTTP endpoint compatible with OpenAI format |\n| A2A Protocol | `agent_type: a2a` | Agent-to-Agent protocol endpoints |\n\n资料来源：[README.md:57-68](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### Chat Completions Configuration\n\n```yaml\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n```\n\n### A2A Protocol Configuration\n\n```yaml\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\n资料来源：[README.md:57-68](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Tool Call Capture\n\nArkSim provides two mechanisms for capturing tool calls:\n\n### Response-Based Capture (Default)\n\nThe agent returns an `AgentResponse` containing explicit tool calls:\n\n```python\nasync def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n    run_result = self.agent.run(user_query)\n    return AgentResponse(\n        content=run_result.text,\n        tool_calls=extract_tool_calls(run_result)\n    )\n```\n\n### Tracing-Based Capture (Automatic)\n\nFor agents using the ArkSim tracing processor, tool calls are captured automatically without modifying the agent response:\n\n```python\n# At module load\nfrom arksim.simulation_engine.tracing import ArksimTracingProcessor\nagent.register(ArksimTracingProcessor())\n\n# Agent returns plain str\nasync def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n    return self.agent.run(user_query)  # Returns plain string\n```\n\n资料来源：[examples/customer-service/README.md:1-35](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Configuration\n\n### CLI Usage\n\n```bash\n# 
Combined simulation and evaluation\narksim simulate-evaluate config.yaml\n\n# Separate steps\narksim simulate config_simulate.yaml\narksim evaluate config_evaluate.yaml\n```\n\n资料来源：[examples/customer-service/README.md:8-14]()\n\n### Trace Receiver Configuration\n\n```yaml\ntrace_receiver:\n  enabled: true\n  wait_timeout: 5\n```\n\nWhen `trace_receiver.enabled` is false, ArkSim only captures tool calls from `AgentResponse`.\n\n资料来源：[examples/customer-service/README.md:30-35]()\n\n## Integration Examples\n\nArkSim provides pre-built integrations for popular agent frameworks:\n\n| Framework | Package | Example Path |\n|-----------|---------|--------------|\n| LangGraph | `langgraph`, `langchain-openai` | `examples/integrations/langgraph/` |\n| AutoGen | `autogen-agentchat`, `autogen-ext[openai]` | `examples/integrations/autogen/` |\n| Claude Agent SDK | `claude-agent-sdk` | `examples/integrations/claude-agent-sdk/` |\n| CrewAI | `crewai` | `examples/integrations/crewai/` |\n| Pydantic AI | `pydantic-ai` | `examples/integrations/pydantic-ai/` |\n| LangChain | `langgraph`, `langchain-openai` | `examples/integrations/langchain/` |\n\n资料来源：[examples/integrations/langgraph/README.md](), [examples/integrations/autogen/README.md](), [examples/integrations/claude-agent-sdk/README.md](), [examples/integrations/crewai/README.md](), [examples/integrations/pydantic-ai/README.md](), [examples/integrations/langchain/README.md]()\n\n## Output Format\n\nSimulation results are written to `./results/simulation/simulation.json` containing:\n\n- Complete conversation transcripts\n- Captured tool calls with arguments and results\n- Turn-by-turn timing information\n- User profile data\n- Scenario metadata\n\n资料来源：[examples/integrations/dify/README.md:18-20]()\n\n## Custom Metrics Support\n\nThe simulation engine supports custom quantitative and qualitative metrics through the evaluator. 
Custom metric files can be added to `custom_metrics/` directories and referenced in the configuration.\n\n资料来源：[examples/customer-service/custom_metrics.py:1-20]()\n\n## Forward Compatibility\n\nBoth `ToolCall` and `AgentResponse` declare `extra=\"ignore\"` in their Pydantic configuration. This ensures that snapshots from future ArkSim versions containing new fields can be loaded by older versions without raising `ValidationError`.\n\n```python\nmodel_config = ConfigDict(extra=\"ignore\")\n```\n\n资料来源：[arksim/simulation_engine/tool_types.py:26-28](), [arksim/simulation_engine/tool_types.py:45-47]()\n\n---\n\n<a id='evaluation-system'></a>\n\n## Evaluation System\n\n### 相关页面\n\n相关主题：[System Architecture Overview](#arch-overview), [Simulation Engine](#simulation-engine)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/evaluator/evaluator.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/evaluator.py)\n- [arksim/evaluator/builtin_metrics.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/builtin_metrics.py)\n- [arksim/evaluator/base_metric.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/base_metric.py)\n- [arksim/evaluator/tool_call_metrics.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/tool_call_metrics.py)\n- [arksim/evaluator/trajectory_matching.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/trajectory_matching.py)\n- [arksim/evaluator/thresholds.py](https://github.com/arklexai/arksim/blob/main/arksim/evaluator/thresholds.py)\n- [arksim/utils/html_report/generate_html_report.py](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/generate_html_report.py)\n</details>\n\n# Evaluation System\n\nThe Evaluation System in ArkSim is responsible for scoring simulated conversations between users and agents. 
It measures goal completion, helpfulness, coherence, and other quality metrics to identify where agents succeed and where they fail.\n\n## Overview\n\nAfter simulation generates conversation transcripts, the evaluator analyzes each conversation and assigns scores across multiple dimensions. The system supports both built-in metrics and custom-defined metrics.\n\n**Purpose:**\n- Quantify agent performance objectively\n- Identify specific failure patterns\n- Generate actionable HTML reports for debugging\n\n**Scope:**\n- Scores conversations on a 0.0-1.0 scale\n- Computes weighted final scores combining multiple metrics\n- Categorizes outcomes as done, partial failure, or failed\n- Supports LLM-based evaluation (using configured providers like OpenAI)\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Simulation Results] --> B[Evaluator]\n    B --> C[Built-in Metrics]\n    B --> D[Custom Metrics]\n    B --> E[Tool Call Metrics]\n    C --> F[Final Scores]\n    D --> F\n    E --> F\n    F --> G[HTML Report Generator]\n    G --> H[evaluation/final_report.html]\n```\n\n### Core Components\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| Evaluator | `evaluator.py` | Main orchestration of evaluation pipeline |\n| Base Metric | `base_metric.py` | Abstract base classes for metrics |\n| Built-in Metrics | `builtin_metrics.py` | Standard metrics (faithfulness, helpfulness, etc.) 
|\n| Tool Call Metrics | `tool_call_metrics.py` | Evaluation of tool usage patterns |\n| Trajectory Matching | `trajectory_matching.py` | Compare expected vs actual agent trajectories |\n| Thresholds | `thresholds.py` | Score classification logic |\n| HTML Report | `generate_html_report.py` | Generate visual evaluation reports |\n\n## Evaluation Workflow\n\n```mermaid\nsequenceDiagram\n    participant S as Simulator\n    participant E as Evaluator\n    participant M as Metrics\n    participant R as Report Generator\n    \n    S->>E: Simulation results (JSON)\n    E->>M: For each conversation\n    M->>M: Run quantitative metrics\n    M->>M: Run qualitative metrics\n    M->>E: Per-metric scores\n    E->>E: Calculate final scores\n    E->>E: Determine status\n    E->>R: Score data\n    R->>R: Generate HTML\n```\n\n### Step-by-Step Process\n\n1. **Load Simulation Data**: Read conversation transcripts from simulation output\n2. **Select Metrics**: Determine which built-in and custom metrics to run\n3. **Score Each Conversation**: Apply metrics to score goal completion, behavior, etc.\n4. **Compute Final Scores**: Calculate weighted averages (goal_completion_weight=0.25, turn_success_ratio_weight=0.75)\n5. **Classify Status**: Assign done/partial_failure/failed based on thresholds\n6. **Generate Report**: Create HTML report with detailed breakdowns\n\n资料来源：[arksim/utils/html_report/report_template.html](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n## Scoring System\n\n### Score Ranges\n\n| Score Type | Range | Description |\n|------------|-------|-------------|\n| Quantitative | 0.0 - 1.0 | Numeric scores for measurable criteria |\n| Qualitative | Label-based | Categorical results (compliant, professional, pass, etc.) 
|\n| Goal Completion | 0.0 - 1.0 | Whether agent completed user goal |\n| Final Score | 0.0 - 1.0 | Weighted combination of metrics |\n\n### Status Classification\n\n| Status | Condition | Description |\n|--------|-----------|-------------|\n| Done | final_score == 1.0 | Perfect performance, goal completed |\n| Partial Failure | 0.6 <= final_score < 1.0 | Acceptable but with some failures |\n| Failed | final_score < 0.6 | Poor performance requiring attention |\n\n资料来源：[arksim/utils/html_report/report_template.html:85-87](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n## Built-in Metrics\n\nThe system provides seven built-in metrics that can be selected via configuration:\n\n| Metric | Purpose |\n|--------|---------|\n| `faithfulness` | Did the agent provide factually accurate information? |\n| `helpfulness` | Was the agent's response useful to the user? |\n| `coherence` | Were responses logically connected and consistent? |\n| `verbosity` | Did the agent maintain appropriate response length? |\n| `relevance` | Did responses address the user's actual query? |\n| `goal_completion` | Did the agent help the user accomplish their goal? |\n| `agent_behavior_failure` | Did the agent exhibit any problematic behaviors? 
|\n\n### Metric Selection\n\nFrom the frontend, users can select which built-in metrics to run:\n\n```html\n<template x-for=\"m in ['faithfulness', 'helpfulness', 'coherence', 'verbosity', 'relevance', 'goal_completion', 'agent_behavior_failure']\" :key=\"m\">\n```\n\nIf no metrics are selected, all built-in metrics run by default.\n\n资料来源：[arksim/ui/frontend/index.html](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html)\n\n## Custom Metrics\n\nDevelopers can define custom evaluation metrics by creating a Python module.\n\n### Creating Custom Metrics\n\n```python\nfrom arksim.evaluator import (\n    QualitativeMetric,\n    QualResult,\n    QuantitativeMetric,\n    QuantResult,\n    ScoreInput,\n)\n\nclass MyMetric(QuantitativeMetric):\n    name = \"my_custom_metric\"\n    schema = MySchema  # Pydantic model\n    \n    async def evaluate(self, input: ScoreInput) -> QuantResult:\n        # Evaluation logic\n        return QuantResult(...)\n```\n\n### Quantitative vs Qualitative Metrics\n\n| Type | Base Class | Output |\n|------|------------|--------|\n| Quantitative | `QuantitativeMetric` | `QuantResult` with float value and reason |\n| Qualitative | `QualitativeMetric` | `QualResult` with categorical label |\n\n### Configuration\n\nAdd custom metrics to `config.yaml`:\n\n```yaml\ncustom_metrics_file_paths:\n  - /path/to/custom_metrics.py\n\nmetrics_to_run:\n  - my_custom_metric  # optional; runs all if omitted\n```\n\n资料来源：[examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\n### Example: Verification Compliance\n\n```python\nclass VerificationComplianceSchema(BaseModel):\n    identity_verification: float  # 0.0-1.0\n    action_gating: float  # 0.0-1.0\n    reason: str\n```\n\nThe evaluation prompt instructs the LLM to score:\n- **Identity Verification**: Did the agent verify customer identity before actions?\n- **Action Gating**: Did the agent gate 
sensitive actions behind verification?\n\n资料来源：[examples/customer-service/custom_metrics.py:25-35](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\n### Example: E-commerce Conversion\n\n```python\nclass ConversionSchema(BaseModel):\n    intent_strength: float\n    conversion_outcome: float\n    evidence: list[str]\n    reason: str\n```\n\nMetrics track:\n- **Intent Strength**: How ready the shopper is to buy\n- **Conversion Outcome**: Whether the agent achieved a purchase decision\n\n资料来源：[examples/e-commerce/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/e-commerce/custom_metrics.py)\n\n## HTML Report\n\nThe evaluation system generates a detailed HTML report (`evaluation/final_report.html`) containing:\n\n### Report Sections\n\n| Section | Content |\n|---------|---------|\n| Summary Statistics | Overall pass/fail rates, average scores |\n| Conversations Table | Per-conversation scores, status badges |\n| Detailed Breakdown | Goal completion, final scores, failure reasons |\n| Score Reasons | LLM-generated explanations for each metric |\n\n### Report Features\n\n- **Interactive Table**: Sort and filter conversations\n- **Score Details**: Expandable sections showing metric-by-metric breakdown\n- **Status Badges**: Visual indicators (done/partial/failed)\n- **Tooltip Explanations**: Hover info for column headers\n\n### Score Display Logic\n\n```javascript\nconst POSITIVE_LABELS = ['compliant', 'professional', 'pass', 'good', 'complete', 'no failure', 'ok'];\nconst NEGATIVE_LABELS = ['flagged', 'unprofessional', 'fail', 'error', 'poor', 'missing', 'violated', 'partial'];\n```\n\nQualitative scores are automatically classified as positive, negative, or neutral based on label matching.\n\n资料来源：[arksim/utils/html_report/report_template.html:78-81](https://github.com/arklexai/arksim/blob/main/arksim/utils/html_report/report_template.html)\n\n## Configuration\n\n### Evaluation Configuration Options\n\n| 
Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `evalProvider` | string | - | LLM provider for evaluation |\n| `evalModel` | string | - | Model to use for scoring |\n| `metricsToRun` | list | all | Which built-in metrics to execute |\n| `customMetricsFilePaths` | list | [] | Paths to custom metric modules |\n| `evalNumWorkers` | int | auto | Parallel evaluation workers |\n\n### Provider Selection\n\nThe evaluator supports multiple LLM providers:\n- OpenAI\n- Azure OpenAI\n- Custom endpoints (via `chat_completions` type)\n- A2A protocol agents\n\n资料来源：[arksim/ui/frontend/index.html](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html)\n\n## Integration with Simulation\n\nThe evaluation system is typically invoked via the CLI:\n\n```bash\n# Combined simulation and evaluation\narksim simulate-evaluate config.yaml\n\n# Separate steps\narksim simulate config_simulate.yaml\narksim evaluate config_evaluate.yaml\n```\n\n### Pipeline Flow\n\n```mermaid\ngraph LR\n    A[config.yaml] --> B[Simulator]\n    B --> C[simulation.json]\n    C --> D[Evaluator]\n    D --> E[evaluation/]\n    E --> F[final_report.html]\n    E --> G[scores.json]\n```\n\nResults are written to `./results/simulation/simulation.json` for simulation and `./results/evaluation/` for evaluation output.\n\n资料来源：[examples/integrations/dify/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md)\n\n## Advanced Features\n\n### Tool Call Capture\n\nArkSim can automatically capture tool calls via tracing:\n\n```yaml\ntrace_receiver:\n  enabled: true\n  wait_timeout: 5\n```\n\nWhen enabled, tool calls are captured automatically without requiring the agent to return them explicitly in an `AgentResponse`.\n\n### Trajectory Matching\n\nThe `trajectory_matching.py` module compares expected agent trajectories against actual behavior, useful for validating that agents follow prescribed action sequences.\n\n### Thresholds\n\nThe 
`thresholds.py` module defines score boundaries and classification logic for determining pass/fail conditions.\n\n## Summary\n\nThe Evaluation System provides comprehensive, configurable scoring of agent conversations:\n\n- **Flexible Metric System**: Built-in metrics cover common quality dimensions; custom metrics extend evaluation to domain-specific criteria\n- **LLM-based Scoring**: Uses configurable language models to generate nuanced, explainable scores\n- **Visual Reporting**: HTML reports make results easy to understand and act upon\n- **Status Classification**: Automatic categorization into done/partial_failure/failed for quick assessment\n- **Integration-ready**: Works with Python agents, chat completions endpoints, and A2A protocol agents\n\n---\n\n<a id='agent-types'></a>\n\n## Agent Types and Integration\n\n### 相关页面\n\n相关主题：[LLM Provider Integration](#llm-providers), [Tool Call Capture](#tool-call-capture), [Configuration System](#configuration)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/arklexai/arksim/blob/main/README.md) - Main documentation with agent type examples\n- [arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py) - CLI commands for agent initialization\n- [arksim/ui/frontend/index.html](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html) - UI Agent Config section\n- [examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md) - Custom agent implementation example\n- [examples/integrations/dify/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md) - HTTP agent integration example\n- [examples/integrations/rasa/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/rasa/README.md) - External agent server integration\n</details>\n\n# Agent Types and Integration\n\nArkSim supports multiple agent connection types to 
accommodate different architectures and deployment scenarios. This page documents the available agent types, their configuration methods, and integration patterns.\n\n## Overview\n\nArkSim provides a flexible agent integration system that enables testing of various agent implementations through a unified simulation interface. The simulator communicates with agents via standardized protocols and captures responses for evaluation.\n\nThe agent integration system consists of:\n- A `BaseAgent` abstract class that defines the agent interface\n- Multiple client implementations for different connection types\n- A factory pattern for agent instantiation based on configuration\n- Support for tool call capture through both explicit responses and tracing\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n## Supported Agent Types\n\nArkSim supports three primary agent types, each suited for different deployment scenarios.\n\n| Agent Type | Description | Use Case |\n|------------|-------------|----------|\n| `custom` | Python class extending BaseAgent | Custom agent logic, no external server required |\n| `chat_completions` | HTTP endpoint with OpenAI-compatible API | Existing REST APIs, external agent services |\n| `a2a` | Agent-to-Agent protocol endpoint | Multi-agent systems, A2A-compliant agents |\n\n资料来源：[arksim/cli.py:100-109](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n## Custom Agent (Python Class)\n\nThe `custom` agent type is the default integration method. 
It requires implementing a Python class that extends `BaseAgent` and implements the `execute` method.\n\n### Implementation Pattern\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse\n\nclass MyAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Replace with your agent logic\n        return \"agent response\"\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### Returning Tool Calls\n\nTo enable tool call evaluation, return an `AgentResponse` object instead of a plain string. This allows the evaluator to assess whether the agent correctly invoked required tools.\n\n```python\nfrom arksim.simulation_engine.tool_types import AgentResponse, ToolCall\n\nasync def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n    tool_calls = [\n        ToolCall(\n            name=\"search_knowledge_base\",\n            arguments={\"query\": user_query}\n        )\n    ]\n    return AgentResponse(\n        content=\"Found relevant information in the knowledge base.\",\n        tool_calls=tool_calls\n    )\n```\n\n### Traced Agent Variant\n\nFor agents using OpenTelemetry-based instrumentation, ArkSim supports automatic tool call capture through the `TracingProcessor` interface. 
This eliminates the need to explicitly return tool calls in `AgentResponse`.\n\n```\nSimulator sets routing context -> agent.execute() runs normally\n-> SDK fires TracingProcessor.on_span_end -> arksim captures -> evaluator scores\n```\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Chat Completions Agent (HTTP API)\n\nThe `chat_completions` agent type connects to any HTTP endpoint implementing an OpenAI-compatible chat completions interface.\n\n### Configuration\n\n```yaml\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### Request Format\n\nArkSim sends requests following the OpenAI chat completions format:\n\n```json\n{\n  \"model\": \"agent-model\",\n  \"messages\": [\n    {\"role\": \"user\", \"content\": \"user query\"}\n  ]\n}\n```\n\n### Response Handling\n\nThe endpoint should return responses in the standard chat completions format:\n\n```json\n{\n  \"choices\": [\n    {\n      \"message\": {\n        \"role\": \"assistant\",\n        \"content\": \"agent response\"\n      }\n    }\n  ]\n}\n```\n\n资料来源：[examples/integrations/dify/README.md](https://github.com/arklexai/arksim/blob/main/examples/integrations/dify/README.md)\n\n## A2A Protocol Agent\n\nThe `a2a` agent type connects to agents implementing the Agent-to-Agent (A2A) protocol, enabling integration with multi-agent systems.\n\n### Configuration\n\n```yaml\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### A2A with Tool Calls\n\nA2A agents can also surface tool calls for evaluation. 
The agent returns both the response content and tool call information in its A2A-formatted response.\n\n## CLI Initialization\n\nArkSim provides a CLI command to scaffold agent implementations:\n\n```bash\narksim init --agent-type <type>\n```\n\nThe `--agent-type` flag accepts the following values:\n- `custom` (default) - Generates a Python agent file\n- `chat_completions` - Configures HTTP endpoint connection\n- `a2a` - Configures A2A protocol connection\n\nThe `--force` flag overwrites existing files.\n\n资料来源：[arksim/cli.py:94-118](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n## Integration Architecture\n\nThe following diagram illustrates how ArkSim communicates with different agent types:\n\n```mermaid\ngraph TD\n    subgraph ArkSim[\"ArkSim\"]\n        Simulator[\"Simulator Engine\"]\n        Evaluator[\"Evaluator\"]\n    end\n    \n    subgraph AgentTypes[\"Agent Implementations\"]\n        CustomAgent[\"Custom Agent (Python)\"]\n        HTTPAgent[\"Chat Completions API\"]\n        A2AAgent[\"A2A Protocol Agent\"]\n    end\n    \n    Simulator -->|execute| CustomAgent\n    Simulator -->|HTTP POST| HTTPAgent\n    Simulator -->|A2A Protocol| A2AAgent\n    \n    CustomAgent -->|AgentResponse| Evaluator\n    HTTPAgent -->|JSON Response| Evaluator\n    A2AAgent -->|A2A Response| Evaluator\n```\n\n## Agent Configuration in UI\n\nThe ArkSim web UI provides an \"Agent Config\" section for configuring agent connections. 
This section is dynamically loaded from the configuration YAML file.\n\n```html\n<!-- Agent Config -->\n<div class=\"t-surface rounded-xl border t-border p-5 mb-4\">\n    <h2 class=\"font-semibold t-heading mb-1\">Agent Config</h2>\n    <p class=\"text-xs t-caption mb-3\">How arksim connects to your agent.</p>\n</div>\n```\n\n资料来源：[arksim/ui/frontend/index.html](https://github.com/arklexai/arksim/blob/main/arksim/ui/frontend/index.html)\n\n## Framework Integrations\n\nArkSim includes pre-built integrations for popular agent frameworks through example projects:\n\n| Framework | Integration File | Protocol |\n|-----------|------------------|----------|\n| LangChain/LangGraph | `custom_agent.py` | Python class |\n| CrewAI | `custom_agent.py` | Python class |\n| AutoGen | `custom_agent.py` | Python class |\n| Claude Agent SDK | `custom_agent.py` | Python class |\n| Pydantic AI | `custom_agent.py` | Python class |\n| LlamaIndex | `custom_agent.py` | Python class |\n| Smolagents | `custom_agent.py` | Python class |\n| OpenAI Agents SDK | `custom_agent.py` | Python class |\n| Dify | `custom_agent.py` | HTTP API |\n| Rasa | `custom_agent.py` | HTTP API |\n| OpenClaw | `config.yaml` | Chat Completions |\n\n资料来源：[examples/integrations/*/README.md](https://github.com/arklexai/arksim/tree/main/examples/integrations)\n\n## Running Simulations with Different Agent Types\n\n### Single Command\n\n```bash\narksim simulate-evaluate config.yaml\n```\n\n### Separate Steps\n\n```bash\n# Step 1: Simulate\narksim simulate config_simulate.yaml\n\n# Step 2: Evaluate\narksim evaluate config_evaluate.yaml\n```\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Best Practices\n\n1. **Tool Call Capture**: For accurate evaluation, ensure your agent returns tool calls either explicitly via `AgentResponse` or implicitly through tracing instrumentation.\n\n2. 
**Async Implementation**: All agent implementations should use async/await patterns for proper integration with the simulation engine.\n\n3. **Error Handling**: Agents should handle errors gracefully and return meaningful error messages that can be evaluated by the system.\n\n4. **Configuration Management**: Use environment variables for sensitive configuration values like API keys and endpoints.\n\n---\n\n<a id='llm-providers'></a>\n\n## LLM Provider Integration\n\n### 相关页面\n\n相关主题：[Agent Types and Integration](#agent-types), [Configuration System](#configuration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/llms/chat/providers/openai.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/openai.py)\n- [arksim/llms/chat/providers/anthropic.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/anthropic.py)\n- [arksim/llms/chat/providers/google.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/google.py)\n- [arksim/llms/chat/providers/azure_openai.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/providers/azure_openai.py)\n- [arksim/llms/chat/llm.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/llm.py)\n- [arksim/llms/chat/base/base_llm.py](https://github.com/arklexai/arksim/blob/main/arksim/llms/chat/base/base_llm.py)\n</details>\n\n# LLM Provider Integration\n\n## Overview\n\nThe LLM Provider Integration system in ArkSim provides a unified abstraction layer for connecting to various Large Language Model (LLM) providers. This modular architecture enables the simulator to interact with different AI backends while maintaining a consistent interface for chat completions, streaming responses, and token usage tracking.\n\nThe system follows a provider-based pattern where each supported LLM service (OpenAI, Anthropic, Google, Azure OpenAI) implements a common interface defined in the base class. 
This design allows users to swap between providers without changing the core simulation logic.\n\n## Architecture\n\n### System Components\n\n```mermaid\ngraph TD\n    A[Simulation Engine] --> B[LLM Manager<br/>arksim/llms/chat/llm.py]\n    B --> C[Base LLM<br/>arksim/llms/chat/base/base_llm.py]\n    C --> D[OpenAI Provider<br/>providers/openai.py]\n    C --> E[Anthropic Provider<br/>providers/anthropic.py]\n    C --> F[Google Provider<br/>providers/google.py]\n    C --> G[Azure OpenAI Provider<br/>providers/azure_openai.py]\n    \n    H[Configuration YAML] --> B\n    I[Environment Variables] --> D\n    I --> E\n    I --> F\n    I --> G\n```\n\n### Provider Class Hierarchy\n\n```mermaid\ngraph TD\n    A[BaseLLM<br/>base/base_llm.py] --> B[OpenAIProvider<br/>providers/openai.py]\n    A --> C[AnthropicProvider<br/>providers/anthropic.py]\n    A --> D[GoogleProvider<br/>providers/google.py]\n    A --> E[AzureOpenAIProvider<br/>providers/azure_openai.py]\n    \n    F[Provider Enum] --> A\n    G[ChatMessage Model] --> A\n    H[ChatCompletionResponse] --> A\n```\n\n## Supported Providers\n\nArkSim supports the following LLM providers through dedicated provider implementations:\n\n| Provider | Provider Class | API Style | Streaming Support |\n|----------|----------------|-----------|-------------------|\n| OpenAI | `OpenAIProvider` | OpenAI API | Yes |\n| Anthropic | `AnthropicProvider` | Anthropic API | Yes |\n| Google | `GoogleProvider` | Google AI API | Yes |\n| Azure OpenAI | `AzureOpenAIProvider` | Azure API | Yes |\n\nEach provider class inherits from `BaseLLM` and implements provider-specific API call logic while adhering to the common interface contract.\n\n## Base LLM Interface\n\nThe `BaseLLM` class defines the contract that all provider implementations must follow. 
This ensures consistent behavior across different LLM backends.\n\n### Core Methods\n\n| Method | Purpose | Parameters |\n|--------|---------|------------|\n| `chat()` | Send a chat completion request | `messages`, `model`, `temperature`, `max_tokens`, `**kwargs` |\n| `chat_stream()` | Stream chat completion responses | `messages`, `model`, `temperature`, `max_tokens`, `**kwargs` |\n| `count_tokens()` | Calculate token usage for messages | `messages`, `model` |\n\n### Data Models\n\nThe base module defines essential data structures used across all providers:\n\n| Model | Purpose |\n|-------|---------|\n| `ChatMessage` | Represents a single message with role and content |\n| `ChatCompletionResponse` | Wraps the API response from providers |\n| `Provider` | Enum identifying supported LLM providers |\n| `ModelInfo` | Metadata about available models per provider |\n\n## Provider Implementations\n\n### OpenAI Provider\n\nThe OpenAI provider connects to OpenAI's API endpoints for chat completions. It supports both standard and streaming responses.\n\n**Configuration Requirements:**\n\n| Parameter | Source | Description |\n|-----------|--------|-------------|\n| `OPENAI_API_KEY` | Environment variable | API key for authentication |\n| `model` | Config/Parameter | Model identifier (e.g., `gpt-4`, `gpt-4-turbo`) |\n\n**API Endpoint:**\n```\nPOST https://api.openai.com/v1/chat/completions\n```\n\n资料来源：[arksim/llms/chat/providers/openai.py]()\n\n### Anthropic Provider\n\nThe Anthropic provider integrates with Anthropic's Claude models through their API. 
It handles the distinct message format and API conventions used by Anthropic.\n\n**Configuration Requirements:**\n\n| Parameter | Source | Description |\n|-----------|--------|-------------|\n| `ANTHROPIC_API_KEY` | Environment variable | API key for authentication |\n| `model` | Config/Parameter | Model identifier (e.g., `claude-3-opus-20240229`) |\n\n**API Endpoint:**\n```\nPOST https://api.anthropic.com/v1/messages\n```\n\n资料来源：[arksim/llms/chat/providers/anthropic.py]()\n\n### Google Provider\n\nThe Google provider connects to Google's Gemini models via the Google AI API.\n\n**Configuration Requirements:**\n\n| Parameter | Source | Description |\n|-----------|--------|-------------|\n| `GOOGLE_API_KEY` | Environment variable | API key for authentication |\n| `model` | Config/Parameter | Model identifier (e.g., `gemini-pro`) |\n\n**API Endpoint:**\n```\nPOST https://generativelanguage.googleapis.com/v1/models/{model}:generateContent\n```\n\n资料来源：[arksim/llms/chat/providers/google.py]()\n\n### Azure OpenAI Provider\n\nThe Azure OpenAI provider enables integration with Azure-hosted OpenAI models, supporting enterprise deployments with Azure-specific authentication and endpoint configuration.\n\n**Configuration Requirements:**\n\n| Parameter | Source | Description |\n|-----------|--------|-------------|\n| `AZURE_OPENAI_API_KEY` | Environment variable | API key for Azure authentication |\n| `AZURE_OPENAI_ENDPOINT` | Environment variable | Azure endpoint URL |\n| `AZURE_OPENAI_DEPLOYMENT` | Config | Deployment name in Azure |\n| `AZURE_OPENAI_API_VERSION` | Config | Azure API version |\n\n资料来源：[arksim/llms/chat/providers/azure_openai.py]()\n\n## Configuration\n\n### YAML Configuration Structure\n\nLLM providers are configured through the `config.yaml` file used by the simulator:\n\n```yaml\nllm:\n  provider: openai  # or anthropic, google, azure_openai\n  model: gpt-4\n  temperature: 0.7\n  max_tokens: 2048\n  \nagent_config:\n  # Agent-specific LLM settings\n  
provider: anthropic\n  model: claude-3-opus-20240229\n```\n\n### Environment Variables\n\n| Variable | Providers | Purpose |\n|----------|-----------|---------|\n| `OPENAI_API_KEY` | OpenAI, AutoGen, LangChain | OpenAI API authentication |\n| `ANTHROPIC_API_KEY` | Anthropic, Claude Agent SDK | Anthropic API authentication |\n| `GOOGLE_API_KEY` | Google | Google AI API authentication |\n| `AZURE_OPENAI_API_KEY` | Azure OpenAI | Azure API authentication |\n| `AZURE_OPENAI_ENDPOINT` | Azure OpenAI | Azure resource endpoint |\n\n## Integration with Custom Agents\n\nThe LLM provider system integrates with the custom agent connector pattern used in ArkSim simulations. Custom agents can be configured to use any supported LLM provider.\n\n```mermaid\ngraph LR\n    A[Scenario JSON] --> B[Simulation Engine]\n    B --> C[Custom Agent<br/>custom_agent.py]\n    C --> D[LLM Provider]\n    D --> E[External LLM API]\n```\n\n### Integration Examples\n\nArkSim provides integration examples for various agent frameworks:\n\n| Framework | Example Path | LLM Provider Used |\n|-----------|--------------|-------------------|\n| LangChain/LangGraph | `examples/integrations/langchain/` | OpenAI |\n| Claude Agent SDK | `examples/integrations/claude-agent-sdk/` | Anthropic |\n| LlamaIndex | `examples/integrations/llamaindex/` | OpenAI |\n| CrewAI | `examples/integrations/crewai/` | OpenAI |\n| AutoGen | `examples/integrations/autogen/` | OpenAI |\n| Pydantic AI | `examples/integrations/pydantic-ai/` | OpenAI |\n| Smolagents | `examples/integrations/smolagents/` | OpenAI |\n| Dify | `examples/integrations/dify/` | Custom HTTP |\n\nEach integration demonstrates how to connect the framework's agent to ArkSim's simulation engine while delegating LLM calls to the appropriate provider.\n\n## Usage Flow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Config as config.yaml\n    participant LLM as LLM Manager\n    participant Provider as Provider Class\n    participant API as External LLM 
API\n    \n    User->>Config: Load configuration\n    User->>LLM: Initialize with provider type\n    LLM->>Provider: Create provider instance\n    User->>LLM: chat(messages, model)\n    LLM->>Provider: _chat_completion()\n    Provider->>API: HTTP POST request\n    API-->>Provider: Completion response\n    Provider-->>LLM: Normalized response\n    LLM-->>User: ChatCompletionResponse\n```\n\n## Adding New Providers\n\nTo add support for a new LLM provider:\n\n1. Create a new provider class inheriting from `BaseLLM`\n2. Implement the required methods: `chat()`, `chat_stream()`, `count_tokens()`\n3. Add the provider to the `Provider` enum in `base_llm.py`\n4. Update the provider factory logic in `llm.py` to instantiate the new provider\n5. Add integration tests and documentation\n\n资料来源：[arksim/llms/chat/base/base_llm.py]()\n资料来源：[arksim/llms/chat/llm.py]()\n\n## Error Handling\n\nProvider implementations handle common error scenarios:\n\n| Error Type | Handling Strategy |\n|------------|-------------------|\n| Authentication failures | Raise `AuthenticationError` with helpful message |\n| Rate limiting | Implement automatic retry with backoff |\n| Invalid request parameters | Raise `ValidationError` with parameter details |\n| Network timeouts | Retry with exponential backoff |\n| Model not found | Raise `ModelNotFoundError` listing available models |\n\n## Best Practices\n\n1. **API Key Security**: Store API keys in environment variables, never in configuration files committed to version control. ArkSim automatically loads keys from environment variables.\n\n2. **Token Tracking**: Use the `count_tokens()` method to monitor token usage and estimate costs before running large-scale simulations.\n\n3. **Streaming for Large Responses**: Enable streaming (`chat_stream()`) for scenarios expecting long agent responses to improve perceived responsiveness.\n\n4. 
**Provider Selection**: Choose Azure OpenAI for enterprise deployments requiring compliance certifications and dedicated infrastructure.\n\n5. **Model Selection**: Refer to the integration examples for recommended model configurations per provider and use case.\n\n---\n\n<a id='tool-call-capture'></a>\n\n## Tool Call Capture\n\n### 相关页面\n\n相关主题：[Evaluation System](#evaluation-system), [Agent Types and Integration](#agent-types)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/simulation_engine/tool_types.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/tool_types.py)\n- [arksim/simulation_engine/agent/clients/a2a.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/agent/clients/a2a.py)\n- [arksim/tracing/openai.py](https://github.com/arklexai/arksim/blob/main/arksim/tracing/openai.py)\n- [examples/customer-service/custom_agent.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_agent.py)\n- [examples/customer-service/traced_agent.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/traced_agent.py)\n- [examples/customer-service/a2a_server/agent.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/a2a_server/agent.py)\n</details>\n\n# Tool Call Capture\n\nTool Call Capture is a core mechanism in ArkSim that observes and records every tool or function invocation made by an agent during a simulation. This captured data feeds directly into the evaluator, enabling metrics like tool call accuracy, error detection, and trajectory analysis.\n\n## Overview\n\nTool Call Capture serves as the bridge between agent execution and evaluation. Without accurate tool call capture, the evaluator cannot determine whether an agent:\n\n- Invoked the correct tools\n- Passed valid arguments\n- Handled errors appropriately\n- Followed the expected execution trajectory\n\nArkSim supports three distinct capture mechanisms:\n\n1. 
**Explicit capture** via `AgentResponse.tool_calls`\n2. **Automatic capture** via the `ArksimTracingProcessor`\n3. **A2A protocol capture** via task artifacts\n\nAll three methods produce the same underlying `ToolCall` data model, ensuring consistent evaluation regardless of how the agent is implemented.\n\n## Data Models\n\n### ToolCall\n\nThe `ToolCall` class represents a single tool invocation observed during a turn:\n\n```python\nclass ToolCall(BaseModel):\n    model_config = ConfigDict(extra=\"ignore\")\n    \n    id: str\n    name: str\n    arguments: dict[str, Any] = Field(default_factory=dict)\n    result: str | None = None\n    error: str | None = None\n    source: ToolCallSource | None = None\n```\n\n| Field | Type | Description |\n|-------|------|-------------|\n| `id` | `str` | Unique identifier for this tool call |\n| `name` | `str` | Name of the tool/function invoked |\n| `arguments` | `dict[str, Any]` | Arguments passed to the tool |\n| `result` | `str \\| None` | Response returned by the tool |\n| `error` | `str \\| None` | Error message if the call failed |\n| `source` | `ToolCallSource \\| None` | Origin of the capture data |\n\nThe `extra=\"ignore\"` configuration ensures forward compatibility with future versions that add new fields.\n\n### ToolCallSource\n\nThe `ToolCallSource` enum indicates how tool call data was captured:\n\n```python\nclass ToolCallSource(str, Enum):\n    AGENT_RESPONSE = \"agent_response\"\n    TRACING_PROCESSOR = \"tracing_processor\"\n    A2A_PROTOCOL = \"a2a_protocol\"\n```\n\n### AgentResponse\n\nFor explicit capture, agents return structured responses:\n\n```python\nclass AgentResponse(BaseModel):\n    model_config = ConfigDict(extra=\"ignore\")\n    \n    content: str\n    tool_calls: list[ToolCall] = Field(default_factory=list)\n```\n\n## Capture Methods\n\n### Explicit Capture (AgentResponse)\n\nThe default approach requires agents to explicitly return tool calls alongside their text response. 
This is the standard pattern for custom agents implementing `BaseAgent`.\n\n```mermaid\ngraph TD\n    A[Simulator invokes agent] --> B[Agent executes user_query]\n    B --> C[Agent returns AgentResponse]\n    C --> D[Simulator extracts tool_calls]\n    D --> E[Evaluator scores trajectory]\n```\n\n**Implementation pattern:**\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse, ToolCall\n\nclass MyAgent(BaseAgent):\n    async def execute(self, user_query: str, **kwargs) -> str | AgentResponse:\n        # Agent logic here...\n        tool_calls = [\n            ToolCall(\n                id=\"call_1\",\n                name=\"get_order_status\",\n                arguments={\"order_id\": \"ORD-1001\"},\n                source=\"agent_response\"\n            )\n        ]\n        return AgentResponse(\n            content=\"The order status is shipped.\",\n            tool_calls=tool_calls\n        )\n```\n\n资料来源：[examples/customer-service/custom_agent.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_agent.py)\n\n### Tracing Processor (Automatic Capture)\n\nThe `ArksimTracingProcessor` uses the OpenAI Agents SDK's tracing interface to capture tool calls automatically, without requiring explicit return data.\n\n```mermaid\ngraph TD\n    A[Simulator sets routing context] --> B[Agent executes normally]\n    B --> C[SDK fires on_span_end]\n    C --> D[ArksimTracingProcessor captures]\n    D --> E[Evaluator scores trajectory]\n```\n\n**Registration pattern:**\n\n```python\nfrom agents import Agent as SDKAgent\nfrom arksim.tracing.openai import ArksimTracingProcessor\n\n# Register once at module load\n_tracing_processor = ArksimTracingProcessor()\n```\n\nThe traced agent returns a plain `str` rather than `AgentResponse`:\n\n```python\nasync def execute(self, user_query: str, **kwargs) -> str:\n    result = await Runner.run(self._sdk_agent, user_query)\n    
return result.final_output\n```\n\n资料来源：[examples/customer-service/traced_agent.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/traced_agent.py)\n\n### A2A Protocol Capture\n\nFor agents implementing the Agent-to-Agent (A2A) protocol, tool calls are embedded in task artifacts using the `A2AToolCaptureExtension`.\n\n```mermaid\ngraph TD\n    A[Simulator sends task to A2A Agent] --> B[Agent processes request]\n    B --> C[Agent returns Task with artifacts]\n    C --> D[Simulator extracts from metadata]\n    D --> E[Evaluator scores]\n```\n\n**Tool call extraction from artifacts:**\n\n```python\ndef _extract_tool_calls_from_artifact(self, artifact: Artifact) -> list[ToolCall]:\n    metadata = artifact.metadata\n    raw_calls = metadata.get(\"tool_calls\", [])\n    tool_calls = []\n    for raw in raw_calls:\n        arguments = raw.get(\"arguments\", {})\n        if not isinstance(arguments, dict):\n            continue\n        tool_calls.append(\n            ToolCall(\n                id=raw.get(\"id\", \"\"),\n                name=raw.get(\"name\", \"\"),\n                arguments=arguments,\n                result=A2AAgent._coerce_to_string(raw.get(\"result\")),\n                error=A2AAgent._coerce_to_string(raw.get(\"error\")),\n                source=ToolCallSource.A2A_PROTOCOL,\n            )\n        )\n    return tool_calls\n```\n\n资料来源：[arksim/simulation_engine/agent/clients/a2a.py](https://github.com/arklexai/arksim/blob/main/arksim/simulation_engine/agent/clients/a2a.py)\n\n**A2A agent card declaration:**\n\n```python\nfrom arksim.simulation_engine.tool_types import A2AToolCaptureExtension\n\n_capabilities = AgentCapabilities(\n    streaming=False,\n    extensions=[A2AToolCaptureExtension],\n)\n```\n\n## Workflow Diagram\n\nThe following diagram shows the complete simulation pipeline with tool call capture:\n\n```mermaid\nflowchart LR\n    subgraph Simulation\n        A[User Query] --> B[Simulator]\n        B --> C{Agent Type}\n        C 
-->|Custom| D[explicit tool_calls]\n        C -->|Traced| E[TracingProcessor]\n        C -->|A2A| F[Artifact metadata]\n        D --> G[Captured Tool Calls]\n        E --> G\n        F --> G\n    end\n    \n    subgraph Evaluation\n        G --> H[Evaluator]\n        H --> I[Tool Call Metrics]\n        H --> J[Error Detection]\n        H --> K[Trajectory Analysis]\n    end\n    \n    G --> L[Results/Report]\n    I --> L\n    J --> L\n    K --> L\n```\n\n## Comparison of Capture Methods\n\n| Aspect | Explicit (AgentResponse) | Traced (TracingProcessor) | A2A Protocol |\n|--------|--------------------------|---------------------------|--------------|\n| Return type | `AgentResponse` | `str` | Via protocol |\n| Implementation complexity | Medium | Low | High |\n| Agent code changes | Required | Minimal | Protocol required |\n| Best for | Custom Python agents | SDK-based agents | A2A-native agents |\n| Tool call source field | `agent_response` | `tracing_processor` | `a2a_protocol` |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n## Configuration\n\n### Trace Receiver Settings\n\nFor traced agents, enable the trace receiver in the simulation config:\n\n```yaml\nsimulation:\n  max_turns: 10\n  \ntrace_receiver:\n  enabled: true\n  wait_timeout: 5  # seconds to wait for traces\n```\n\n### When Tracing is Disabled\n\nWhen `trace_receiver.enabled` is `false` or omitted, ArkSim falls back to explicit `AgentResponse` capture:\n\n> When `trace_receiver.enabled` is false or omitted, arksim only captures tool calls from `AgentResponse` (the standard path).\n\n## Integration with Evaluation\n\nCaptured tool calls flow into the evaluator's scoring pipeline:\n\n1. **Trajectory matching** - Compare actual tool sequence against expected\n2. **Argument validation** - Verify tool arguments match scenario requirements\n3. **Error detection** - Identify tool call failures and their handling\n4. 
**Coverage analysis** - Determine if all required tools were invoked\n\nThe evaluator uses the `source` field to differentiate between capture methods when analyzing behavioral patterns.\n\n## Forward Compatibility\n\nBoth `ToolCall` and `AgentResponse` declare `extra=\"ignore\"` in their Pydantic configuration:\n\n> Declares `extra=\"ignore\"` explicitly so snapshots from future arksim versions that add new fields can still be loaded by older arksim without raising a `ValidationError`.\n\nThis ensures that simulation results captured with newer versions remain loadable by older versions of the evaluator.\n\n---\n\n<a id='scenario-management'></a>\n\n## Scenario Management\n\n### 相关页面\n\n相关主题：[Simulation Engine](#simulation-engine), [Configuration System](#configuration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/scenario/entities.py](https://github.com/arklexai/arksim/blob/main/arksim/scenario/entities.py)\n- [arksim/templates/scenarios.json](https://github.com/arklexai/arksim/blob/main/arksim/templates/scenarios.json)\n- [examples/e-commerce/scenarios.json](https://github.com/arklexai/arksim/blob/main/examples/e-commerce/scenarios.json)\n- [examples/bank-insurance/scenarios.json](https://github.com/arklexai/arksim/blob/main/examples/bank-insurance/scenarios.json)\n- [examples/customer-service/scenarios.json](https://github.com/arklexai/arksim/blob/main/examples/customer-service/scenarios.json)\n</details>\n\n# Scenario Management\n\n## Overview\n\nScenario Management is the core system in ArkSim for defining, loading, validating, and executing test scenarios that simulate user interactions with an AI agent. 
A scenario represents a structured test case that defines a user's goal, knowledge base, behavioral characteristics, and expected outcomes.\n\nScenarios serve as the foundation for the simulation and evaluation pipeline, enabling reproducible testing of agent behavior across diverse conversation patterns and user profiles.\n\n## Scenario Data Model\n\n### Core Scenario Structure\n\nEach scenario in ArkSim is a JSON object containing the following primary fields:\n\n| Field | Type | Required | Description |\n|-------|------|----------|-------------|\n| `scenario_id` | string | Yes | Unique identifier for the scenario |\n| `name` | string | Yes | Human-readable scenario name |\n| `description` | string | No | Detailed description of the scenario's purpose |\n| `user_profile` | object | Yes | User persona characteristics |\n| `goal` | object | Yes | The user's primary objective |\n| `knowledge` | array | Yes | Contextual knowledge the user has access to |\n| `expected_behavior` | object | No | Expected agent responses or behaviors |\n| `metrics` | array | No | Custom evaluation criteria |\n\n### User Profile Schema\n\n```json\n{\n  \"name\": \"string\",\n  \"age\": \"number\",\n  \"personality\": \"string\",\n  \"background\": \"string\",\n  \"communication_style\": \"string\"\n}\n```\n\n资料来源：[arksim/scenario/entities.py]()\n\n### Goal Structure\n\n```json\n{\n  \"primary\": \"string (main user objective)\",\n  \"secondary\": [\"array of secondary objectives\"],\n  \"constraints\": [\"array of constraints or boundaries\"],\n  \"success_criteria\": \"string\"\n}\n```\n\n## Scenario Management Workflow\n\n```mermaid\ngraph TD\n    A[Create/Load Scenarios] --> B[Validate Scenario Schema]\n    B --> C{Valid?}\n    C -->|Yes| D[Save to scenarios.json]\n    C -->|No| E[Show Validation Errors]\n    E --> A\n    D --> F[Configure Simulation Parameters]\n    F --> G[Run Simulation]\n    G --> H[Generate Results]\n    H --> I[Evaluation & Reporting]\n```\n\n## Scenario Loading 
and Validation\n\nThe UI provides interactive scenario management through the \"Build\" page:\n\n1. **Load Existing** - Users can load pre-existing scenario files via file path input\n2. **Auto-generate (PRO)** - Automatic scenario generation from agent knowledge base\n3. **Manual Creation** - Build scenarios through the UI interface\n\n```javascript\n// Scenario file validation in UI\n@input=\"validateScenarioFile()\"\n@blur=\"validateScenarioFile()\"\n@keydown.enter=\"loadScenarioFile()\"\n```\n\n资料来源：[arksim/ui/frontend/index.html]()\n\n### Validation Rules\n\n- Scenario files must be valid JSON\n- Required fields must be present\n- `scenario_id` must be unique within the file\n- `user_profile` must contain at minimum a `name` field\n\n## Scenario File Format\n\nArkSim uses JSON format for scenario definitions. See the example structure:\n\n```json\n{\n  \"scenarios\": [\n    {\n      \"scenario_id\": \"ecommerce-return-item-001\",\n      \"name\": \"Return Defective Product\",\n      \"description\": \"Customer wants to return a damaged item received last week\",\n      \"user_profile\": {\n        \"name\": \"John Smith\",\n        \"age\": 35,\n        \"personality\": \"patient but firm\",\n        \"background\": \"Regular online shopper\",\n        \"communication_style\": \"polite and direct\"\n      },\n      \"goal\": {\n        \"primary\": \"Get a full refund for a damaged product\",\n        \"secondary\": [\"Understand return process\", \"Know timeline for refund\"],\n        \"constraints\": [\"Only willing to wait up to 14 days for refund\"],\n        \"success_criteria\": \"Full refund issued or replacement offered\"\n      },\n      \"knowledge\": [\n        \"Ordered product SKU-12345 on March 1, 2024\",\n        \"Item arrived damaged with visible scratches\",\n        \"Has original packaging and receipt\",\n        \"Order number: ORD-987654\"\n      ],\n      \"expected_behavior\": {\n        \"should_mention_order_number\": true,\n        
\"should_request_photo_evidence\": false\n      }\n    }\n  ]\n}\n```\n\n资料来源：[examples/e-commerce/scenarios.json]()\n资料来源：[examples/bank-insurance/scenarios.json]()\n资料来源：[examples/customer-service/scenarios.json]()\n\n## Simulation Configuration\n\nScenarios are executed through the simulation engine with configurable parameters:\n\n| Parameter | Default | Description |\n|-----------|---------|-------------|\n| `num_conversations_per_scenario` | 5 | Number of conversations to simulate per scenario |\n| `max_turns` | 5 | Maximum turns per conversation |\n| `num_workers` | 50 | Parallel workers (or 'auto') |\n| `model` | gpt-4o | LLM model for simulation |\n| `provider` | openai | LLM provider |\n\n```python\nmodel: str = Field(default=DEFAULT_MODEL, description=\"LLM model for simulation\")\nnum_conversations_per_scenario: int = Field(\n    default=5, \n    description=\"Number of conversations per scenario to simulate\"\n)\nmax_turns: int = Field(default=5, description=\"Maximum turns per conversation\")\n```\n\n资料来源：[arksim/simulation_engine/entities.py:18-25]()\n\n### Configuration via config.yaml\n\n```yaml\nsimulation:\n  scenarios_file: ./scenarios.json\n  num_conversations_per_scenario: 5\n  max_turns: 5\n  num_workers: auto\n  model: gpt-4o\n```\n\n## Built-in Scenario Templates\n\nArkSim provides template scenarios for common use cases:\n\n```json\n{\n  \"scenarios\": [\n    {\n      \"scenario_id\": \"template-basic-query\",\n      \"name\": \"Basic Information Query\",\n      \"description\": \"User asks a simple informational question\",\n      \"user_profile\": {...},\n      \"goal\": {...},\n      \"knowledge\": [...]\n    }\n  ]\n}\n```\n\n资料来源：[arksim/templates/scenarios.json]()\n\n## Integration with CI/CD\n\nScenarios can be integrated into automated testing workflows:\n\n```yaml\n# GitHub Actions workflow\nsteps:\n  - name: Run Scenario Tests\n    run: arksim simulate-evaluate config.yaml\n```\n\n资料来源：[examples/ci/README.md]()\n\n### Required CI 
Setup\n\n1. Update `TODO` sections in `arksim.yml` (startup command and health-check URL)\n2. Create `tests/arksim/config.yaml` pointing to your server endpoint\n3. Create `tests/arksim/scenarios.json` with your test cases\n4. Add custom metrics to `tests/arksim/custom_metrics/` if needed\n5. Configure `OPENAI_API_KEY` in GitHub secrets\n\n## Best Practices\n\n### Scenario Design\n\n- **Distinct Goals**: Each scenario should test a single, well-defined user goal\n- **Realistic Profiles**: User profiles should reflect actual customer demographics\n- **Sufficient Knowledge**: Include enough context for the simulated user to maintain coherent conversation\n\n### File Organization\n\n```\nproject/\n├── config.yaml\n├── scenarios.json\n├── custom_metrics/\n│   └── my_metric.py\n└── results/\n    └── simulation/\n```\n\n### Version Control\n\n- Commit `scenarios.json` with your agent code\n- Use descriptive `scenario_id` values with prefixes (e.g., `ecommerce-`, `support-`)\n- Document success criteria clearly in the `goal.success_criteria` field\n\n## Command Line Usage\n\n### Initialize with default scenarios\n\n```bash\narksim init\n```\n\n### Run simulation with scenarios\n\n```bash\narksim simulate-evaluate config.yaml\n```\n\nResults are written to `./results/simulation/simulation.json`.\n\n资料来源：[README.md]()\n资料来源：[examples/integrations/dify/README.md]()\n\n## See Also\n\n- [Simulation Engine Documentation](./simulation-engine.md)\n- [Custom Agent Integration](./custom-agents.md)\n- [Evaluation Metrics](./evaluation-metrics.md)\n- [HTML Report Template](./html-report.md)\n\n---\n\n<a id='configuration'></a>\n\n## Configuration System\n\n### 相关页面\n\n相关主题：[Agent Types and Integration](#agent-types), [LLM Provider Integration](#llm-providers), [Evaluation System](#evaluation-system)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [arksim/config/types.py](https://github.com/arklexai/arksim/blob/main/arksim/config/types.py)\n- 
[arksim/config/core/agent.py](https://github.com/arklexai/arksim/blob/main/arksim/config/core/agent.py)\n- [arksim/config/utils.py](https://github.com/arklexai/arksim/blob/main/arksim/config/utils.py)\n- [arksim/config.yaml](https://github.com/arklexai/arksim/blob/main/arksim/config.yaml)\n- [arksim/config_simulate.yaml](https://github.com/arklexai/arksim/blob/main/arksim/config_simulate.yaml)\n- [arksim/config_evaluate.yaml](https://github.com/arklexai/arksim/blob/main/arksim/config_evaluate.yaml)\n- [arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n- [examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n</details>\n\n# Configuration System\n\nArkSim provides a flexible, multi-layered configuration system that supports both YAML-based declarative configuration and programmatic Python class configuration. The system enables users to define simulation scenarios, evaluation metrics, agent connections, and runtime behavior through structured configuration files or direct Python implementations.\n\n## Architecture Overview\n\nThe configuration system is organized into several key modules that handle different aspects of simulation and evaluation:\n\n```mermaid\ngraph TD\n    A[User Configuration] --> B[YAML Files]\n    A --> C[Python Classes]\n    B --> D[config.yaml]\n    B --> E[config_simulate.yaml]\n    B --> F[config_evaluate.yaml]\n    C --> G[BaseAgent Subclass]\n    C --> H[Custom Metrics]\n    D --> I[Config Loader]\n    E --> I\n    F --> I\n    G --> J[Simulation Engine]\n    H --> K[Evaluator]\n    I --> J\n    J --> K\n    J --> L[Results/Reports]\n    K --> L\n```\n\nThe system separates configuration into three distinct phases: simulation, evaluation, and combined execution. 
This separation allows users to run simulations independently from evaluation, which is useful for debugging and iterative development workflows.\n\n## Configuration Files\n\nArkSim uses YAML configuration files to define simulation parameters, evaluation settings, and agent connections. The repository includes three primary configuration templates at the root level.\n\n### Main Configuration (config.yaml)\n\nThe main configuration file combines both simulation and evaluation settings into a single file. This is the recommended approach for simple use cases where both phases run together using the `simulate-evaluate` command. The file structure typically includes agent configuration, scenario definitions, metric selections, and evaluation parameters.\n\n### Separate Phase Configuration\n\nFor more complex workflows, ArkSim supports splitting configuration into separate files:\n\n| File | Purpose | CLI Command |\n|------|---------|-------------|\n| `config_simulate.yaml` | Simulation-only parameters | `arksim simulate config_simulate.yaml` |\n| `config_evaluate.yaml` | Evaluation-only parameters | `arksim evaluate config_evaluate.yaml` |\n| `config.yaml` | Combined configuration | `arksim simulate-evaluate config.yaml` |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\nThis separation enables scenarios where simulation results are cached and evaluated multiple times with different metric configurations, or where the simulation phase runs in a different environment than evaluation.\n\n## Agent Configuration\n\nThe configuration system supports multiple agent connection types, defined through the `agent_type` field in the agent configuration section.\n\n### Supported Agent Types\n\n| Agent Type | Description | Configuration Style |\n|------------|-------------|---------------------|\n| `custom` | Custom Python agent class inheriting from BaseAgent | Python class file |\n| 
`chat_completions` | HTTP endpoint implementing Chat Completions API | YAML endpoint config |\n| `a2a` | Agent-to-Agent protocol endpoint | YAML endpoint config |\n\n资料来源：[arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\n### Custom Agent (Python Class)\n\nThe default agent type uses a Python class that extends `BaseAgent`. This approach provides maximum flexibility for integrating any agent framework. The agent class must implement two required methods:\n\n```python\nfrom arksim.simulation_engine.agent.base import BaseAgent\nfrom arksim.simulation_engine.tool_types import AgentResponse\n\nclass MyAgent(BaseAgent):\n    async def get_chat_id(self) -> str:\n        return \"unique-id\"\n\n    async def execute(self, user_query: str, **kwargs: object) -> str | AgentResponse:\n        # Replace with your agent logic\n        return \"agent response\"\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\nThe `execute` method can return either a plain string response or an `AgentResponse` object that includes both text content and tool calls for evaluation.\n\n### Chat Completions Agent\n\nFor agents exposing an HTTP endpoint compatible with the OpenAI Chat Completions format, configuration is declarative:\n\n```yaml\nagent_config:\n  agent_type: chat_completions\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:8000/v1/chat/completions\n```\n\n资料来源：[README.md](https://github.com/arklexai/arksim/blob/main/README.md)\n\n### A2A Protocol Agent\n\nAgents implementing the Agent-to-Agent protocol are configured similarly:\n\n```yaml\nagent_config:\n  agent_type: a2a\n  agent_name: my-agent\n  api_config:\n    endpoint: http://localhost:9999/agent\n```\n\nA2A agents can also surface tool calls for evaluation through the protocol, enabling comprehensive testing of tool-using agents.\n\n## Scenario Configuration\n\nScenarios define the test cases that the simulator executes against the agent. 
The `scenarios.json` file contains an array of scenario objects, each representing a simulated conversation or interaction.\n\n### Scenario Structure\n\nEach scenario typically includes:\n\n- **Scenario ID**: Unique identifier for the scenario\n- **User query**: The initial prompt or question presented to the agent\n- **Expected behavior**: Criteria for successful completion\n- **Knowledge/context**: Information the agent should know or have access to during the scenario\n\n### Scenario Files Location\n\nScenarios are referenced from the main configuration file and can be shared across different configuration files:\n\n| Example Directory | Scenario Purpose |\n|------------------|------------------|\n| `examples/bank-insurance/` | Financial services agent testing |\n| `examples/customer-service/` | Customer support scenarios |\n| `examples/integrations/*/` | Framework-specific testing |\n| `examples/ci/` | CI/CD integration testing |\n\n资料来源：[examples/ci/README.md](https://github.com/arklexai/arksim/blob/main/examples/ci/README.md)\n\n## Evaluation Metrics Configuration\n\nThe evaluator component scores agent responses against defined metrics. Configuration specifies which metrics to run and how they are calculated.\n\n### Built-in Metrics\n\nArkSim includes standard evaluation metrics covering common agent quality dimensions. These metrics are automatically available without additional configuration.\n\n### Custom Metrics\n\nUsers can define custom evaluation metrics by creating a Python module that implements the metric interfaces. 
The custom metrics file must define metrics following the provided schema:\n\n```python\nfrom pydantic import BaseModel\nfrom arksim.evaluator import (\n    QualitativeMetric,\n    QualResult,\n    QuantitativeMetric,\n    QuantResult,\n    ScoreInput,\n    format_chat_history,\n)\n```\n\n资料来源：[examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\nCustom metrics are referenced in the configuration file using the `custom_metrics_file_paths` parameter, and optionally listed in `metrics_to_run` to include them in the evaluation:\n\n```yaml\ncustom_metrics_file_paths:\n  - tests/arksim/custom_metrics/my_metric.py\nmetrics_to_run:\n  - custom_metric_name\n```\n\n资料来源：[examples/ci/README.md](https://github.com/arklexai/arksim/blob/main/examples/ci/README.md)\n\n### Custom Metric Schema\n\nCustom quantitative metrics should return a Pydantic model defining the score components:\n\n```python\nclass VerificationComplianceSchema(BaseModel):\n    identity_verification: float  # 0.0-1.0\n    action_gating: float  # 0.0-1.0\n    reason: str\n```\n\n资料来源：[examples/customer-service/custom_metrics.py](https://github.com/arklexai/arksim/blob/main/examples/customer-service/custom_metrics.py)\n\nThe system prompt for qualitative metrics defines the evaluation criteria that the LLM judge applies when scoring agent responses.\n\n## CLI Configuration Interface\n\nThe ArkSim CLI provides commands for initializing configurations, running simulations, and managing examples.\n\n### Initialization Command\n\nThe `init` command scaffolds starter files for agent testing:\n\n```bash\narksim init --agent-type custom\n```\n\n| Flag | Options | Default | Description |\n|------|---------|---------|-------------|\n| `--agent-type` | `custom`, `chat_completions`, `a2a` | `custom` | Agent connection type |\n| `--force` | boolean | `false` | Overwrite existing files 
|\n\n资料来源：[arksim/cli.py](https://github.com/arklexai/arksim/blob/main/arksim/cli.py)\n\nThe `--agent-type` flag determines which template is generated:\n- `custom` generates a Python agent file (no server needed)\n- `chat_completions` generates YAML configuration for HTTP endpoints\n- `a2a` generates YAML configuration for Agent-to-Agent protocol\n\n### Simulation Commands\n\n| Command | Description |\n|---------|-------------|\n| `arksim simulate-evaluate config.yaml` | Run simulation and evaluation in sequence |\n| `arksim simulate config_simulate.yaml` | Run simulation only |\n| `arksim evaluate config_evaluate.yaml` | Evaluate previously saved simulation results |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\n### Examples Command\n\nThe CLI also provides access to example projects:\n\n```bash\narksim examples                    # Download all examples\narksim examples bank-insurance     # Download specific example\narksim examples --list            # List available examples\n```\n\n## Integration Configuration Patterns\n\nEach integration example follows a consistent pattern with three standard files.\n\n### File Structure\n\n| File | Purpose |\n|------|---------|\n| `custom_agent.py` | Agent implementation connecting to the target framework |\n| `config.yaml` | Simulation and evaluation settings |\n| `scenarios.json` | Test scenarios for the example domain |\n\n### Supported Framework Integrations\n\nArkSim provides example configurations for the following agent frameworks:\n\n| Framework | Installation | Example Directory |\n|-----------|-------------|-------------------|\n| LangGraph | `pip install langgraph langchain-openai` | `examples/integrations/langgraph/` |\n| LangChain | `pip install langgraph langchain-openai` | `examples/integrations/langchain/` |\n| Claude Agent SDK | `pip install claude-agent-sdk` | `examples/integrations/claude-agent-sdk/` |\n| AutoGen | `pip 
install autogen-agentchat autogen-ext[openai]` | `examples/integrations/autogen/` |\n| CrewAI | `pip install crewai` | `examples/integrations/crewai/` |\n| Pydantic AI | `pip install pydantic-ai` | `examples/integrations/pydantic-ai/` |\n| Smolagents | `pip install smolagents` | `examples/integrations/smolagents/` |\n| Dify | HTTP integration | `examples/integrations/dify/` |\n| OpenClaw | Gateway token auth | `examples/openclaw/` |\n\n资料来源：[examples/integrations/*/README.md](https://github.com/arklexai/arksim/tree/main/examples/integrations)\n\n## Trace Receiver Configuration\n\nFor frameworks that support tracing, ArkSim includes a trace receiver component that captures tool calls automatically without requiring explicit `AgentResponse` returns.\n\n### Configuration Options\n\n```yaml\ntrace_receiver:\n  enabled: true\n  wait_timeout: 5\n```\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `enabled` | boolean | `false` | Enable trace-based tool call capture |\n| `wait_timeout` | integer | `5` | Seconds to wait for trace data |\n\n资料来源：[examples/customer-service/README.md](https://github.com/arklexai/arksim/blob/main/examples/customer-service/README.md)\n\nWhen `enabled` is `false` or omitted, ArkSim only captures tool calls from `AgentResponse` objects returned by the agent's `execute` method.\n\n## Workflow Summary\n\nThe following diagram illustrates the configuration-driven workflow from specification to results:\n\n```mermaid\ngraph LR\n    A[config.yaml] --> B[Scenario Loading]\n    C[scenarios.json] --> B\n    B --> D[Simulation Engine]\n    D --> E{Agent Type?}\n    E -->|custom| F[Python Agent]\n    E -->|chat_completions| G[HTTP Endpoint]\n    E -->|a2a| H[A2A Agent]\n    F --> I[Execute Scenarios]\n    G --> I\n    H --> I\n    I --> J[Simulation Results]\n    J --> K[Evaluator]\n    J --> L[Conversation Viewer]\n    K --> M[Evaluation Report]\n    M --> N[scores.json]\n    M --> 
O[failures.json]\n```\n\n## Best Practices\n\n**Environment Variables**: API keys should be set as environment variables rather than hardcoded in configuration files:\n\n```bash\nexport OPENAI_API_KEY=\"<your-key>\"\nexport ANTHROPIC_API_KEY=\"<your-key>\"\n```\n\n**Separation of Concerns**: Use separate `config_simulate.yaml` and `config_evaluate.yaml` files when iterating on scenarios or metrics independently.\n\n**Custom Metrics Organization**: Place custom metrics in dedicated directories (e.g., `tests/arksim/custom_metrics/`) and reference them from the main configuration.\n\n**Scenario Versioning**: Keep scenarios in version-controlled JSON files that can be reviewed and updated as agent requirements evolve.\n\n资料来源：[CONTRIBUTING.md](https://github.com/arklexai/arksim/blob/main/CONTRIBUTING.md)\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：arklexai/arksim\n\n摘要：发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：v0.1.0。\n\n## 1. 安装坑 · 来源证据：v0.1.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.1.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d55b6435222f4142bf979809f0cf0794 | https://github.com/arklexai/arksim/releases/tag/v0.1.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 2. 配置坑 · 来源证据：v0.0.6\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.0.6\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_57b2e0dc0cf94da8b4fc97ea744ae7e9 | https://github.com/arklexai/arksim/releases/tag/v0.0.6 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 3. 
配置坑 · 来源证据：v0.3.2\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.2\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_3a2981d341cb4512a410e7d75e03baf0 | https://github.com/arklexai/arksim/releases/tag/v0.3.2 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 4. 配置坑 · 来源证据：v0.3.4\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.4\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f949170a7f5d440cb0559c40fb441178 | https://github.com/arklexai/arksim/releases/tag/v0.3.4 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 5. 配置坑 · 来源证据：v0.3.5\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.5\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_314def1c94614699b6c9ad728c36e9ab | https://github.com/arklexai/arksim/releases/tag/v0.3.5 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 6. 能力坑 · 来源证据：v0.3.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.3.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_c3e7552e9af84b95b2aec63c73dc7c26 | https://github.com/arklexai/arksim/releases/tag/v0.3.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 7. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | README/documentation is current enough for a first validation pass.\n\n## 8. 
维护坑 · 来源证据：v0.3.3\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个维护/版本相关的待验证问题：v0.3.3\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_03b0d8ce6f8a4bc2bc8e33a7636aa07c | https://github.com/arklexai/arksim/releases/tag/v0.3.3 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 9. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | last_activity_observed missing\n\n## 10. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | no_demo; severity=medium\n\n## 11. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | no_demo; severity=medium\n\n## 12. 安全/权限坑 · 来源证据：v0.2.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.2.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_3c70cc242a1242b98d50cfe5078d6752 | https://github.com/arklexai/arksim/releases/tag/v0.2.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 13. 
安全/权限坑 · 来源证据：v0.3.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.3.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_69bd9f91a5064aeeb87f325f2f56f115 | https://github.com/arklexai/arksim/releases/tag/v0.3.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | issue_or_pr_quality=unknown\n\n## 15. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | release_recency=unknown\n\n<!-- canonical_name: arklexai/arksim; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "Human Manual / 人类版说明书"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log / 踩坑日志\n\n项目：arklexai/arksim\n\n摘要：发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：v0.1.0。\n\n## 1. 安装坑 · 来源证据：v0.1.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.1.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d55b6435222f4142bf979809f0cf0794 | https://github.com/arklexai/arksim/releases/tag/v0.1.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 2. 配置坑 · 来源证据：v0.0.6\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.0.6\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_57b2e0dc0cf94da8b4fc97ea744ae7e9 | https://github.com/arklexai/arksim/releases/tag/v0.0.6 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 3. 配置坑 · 来源证据：v0.3.2\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.2\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_3a2981d341cb4512a410e7d75e03baf0 | https://github.com/arklexai/arksim/releases/tag/v0.3.2 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 4. 配置坑 · 来源证据：v0.3.4\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.4\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f949170a7f5d440cb0559c40fb441178 | https://github.com/arklexai/arksim/releases/tag/v0.3.4 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 5. 
配置坑 · 来源证据：v0.3.5\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.3.5\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_314def1c94614699b6c9ad728c36e9ab | https://github.com/arklexai/arksim/releases/tag/v0.3.5 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 6. 能力坑 · 来源证据：v0.3.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.3.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_c3e7552e9af84b95b2aec63c73dc7c26 | https://github.com/arklexai/arksim/releases/tag/v0.3.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 7. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | README/documentation is current enough for a first validation pass.\n\n## 8. 维护坑 · 来源证据：v0.3.3\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个维护/版本相关的待验证问题：v0.3.3\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_03b0d8ce6f8a4bc2bc8e33a7636aa07c | https://github.com/arklexai/arksim/releases/tag/v0.3.3 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 9. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | last_activity_observed missing\n\n## 10. 
安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | no_demo; severity=medium\n\n## 11. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | no_demo; severity=medium\n\n## 12. 安全/权限坑 · 来源证据：v0.2.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.2.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_3c70cc242a1242b98d50cfe5078d6752 | https://github.com/arklexai/arksim/releases/tag/v0.2.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 13. 安全/权限坑 · 来源证据：v0.3.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.3.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_69bd9f91a5064aeeb87f325f2f56f115 | https://github.com/arklexai/arksim/releases/tag/v0.3.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | issue_or_pr_quality=unknown\n\n## 15. 
维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | art_e3064c2689144cfb89a534e0544c6bfc | https://github.com/arklexai/arksim#readme | release_recency=unknown\n",
      "summary": "用户实践前最可能遇到的身份、安装、配置、运行和安全坑。",
      "title": "Pitfall Log / 踩坑日志"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# arksim - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for arklexai/arksim.\n\nProject:\n- Name: arksim\n- Repository: https://github.com/arklexai/arksim\n- Summary: <p align=\"center\">\n- Host target: chatgpt\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: <p align=\"center\">\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. intro: Introduction to ArkSim. Produce one small intermediate artifact and wait for confirmation.\n2. quickstart: Quickstart Guide. Produce one small intermediate artifact and wait for confirmation.\n3. arch-overview: System Architecture Overview. Produce one small intermediate artifact and wait for confirmation.\n4. simulation-engine: Simulation Engine. Produce one small intermediate artifact and wait for confirmation.\n5. evaluation-system: Evaluation System. 
Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/arklexai/arksim#readme\n- .claude/skills/draft-pr/SKILL.md\n- .claude/skills/pre-review/SKILL.md\n- .claude/skills/review-pr/SKILL.md\n- README.md\n- arksim/__init__.py\n- arksim/constants.py\n- arksim/cli.py\n- arksim/templates/my_agent.py\n- arksim/templates/config.yaml\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "summary": "不安装项目也能感受能力节奏的安全试用 Prompt。",
      "title": "Prompt Preview / 安装前试用 Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / 官方入口\n\n项目：arklexai/arksim\n\n## 官方安装入口\n\n### Python / pip · 官方安装入口\n\n```bash\npip install arksim\n```\n\n来源：https://github.com/arklexai/arksim#readme\n\n## 来源\n\n- docs: https://github.com/arklexai/arksim#readme\n",
      "summary": "从项目官方 README 或安装文档提取的开工入口。",
      "title": "Quick Start / 官方入口"
    }
  },
  "validation_id": "dval_3536c77cf68640e68b2a7b029788aeed"
}