{
  "canonical_name": "antoinezambelli/forge",
  "compilation_id": "pack_9a35ce0fc8554ad284980112bdc2cb2a",
  "created_at": "2026-05-19T20:57:19.772162+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=mcp_config, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=mcp_config, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `pip install forge-guardrails` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "pip install forge-guardrails",
      "sandbox_container_image": "python:3.12-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "llm_execute_isolated_install",
      "sandbox_validation_id": "sbx_d7f86e82b70843fd84ac111fe8f1523a"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_bfaa280ff5fdd17d67aef16011b25542",
    "canonical_name": "antoinezambelli/forge",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/antoinezambelli/forge",
    "slug": "forge",
    "source_packet_id": "phit_a0567a8456844c358ea5c25a82960af7",
    "source_validation_id": "dval_7a58244d8bf74e91982ffb1c5a2b2cc8"
  },
  "merchandising": {
    "best_for": "需要软件开发与交付能力，并使用 chatgpt 的用户",
    "github_forks": null,
    "github_stars": null,
    "one_liner_en": "Framework-agnostic LLM orchestration library providing structured tool-calling workflows, guardrail enforcement, and context management for building reliable AI agents.",
    "one_liner_zh": "Forge 是一个与框架无关的 LLM 编排库，提供结构化工具调用工作流、护栏校验与上下文管理，用于构建可靠的 AI Agent。",
    "primary_category": {
      "category_id": "software-development",
      "confidence": "medium",
      "name_en": "Software Development",
      "name_zh": "软件开发与交付",
      "reason": "matched_keywords:git"
    },
    "target_user": "使用 chatgpt 等宿主 AI 的用户",
    "title_en": "forge",
    "title_zh": "forge 能力包",
    "visible_tags": [
      {
        "label_en": "Knowledge Retrieval",
        "label_zh": "知识检索",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-knowledge-retrieval",
        "type": "product_domain"
      },
      {
        "label_en": "Knowledge Base Q&A",
        "label_zh": "知识库问答",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-knowledge-base-q-a",
        "type": "user_job"
      },
      {
        "label_en": "Multi-agent Collaboration",
        "label_zh": "多 Agent 协作",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-multi-agent-collaboration",
        "type": "core_capability"
      },
      {
        "label_en": "Multi-role Workflow",
        "label_zh": "多角色协作流程",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-multi-role-workflow",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Evaluation Suite",
        "label_zh": "评测体系",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-evaluation-suite",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_a0567a8456844c358ea5c25a82960af7",
  "page_model": {
    "artifacts": {
      "artifact_slug": "forge",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "pip install forge-guardrails",
          "label": "Python / pip · 官方安装入口",
          "source": "https://github.com/antoinezambelli/forge#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "知识检索",
        "知识库问答",
        "多 Agent 协作",
        "多角色协作流程",
        "评测体系"
      ],
      "eyebrow": "软件开发与交付",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要软件开发与交付能力，并使用 chatgpt 的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "Forge 是一个与框架无关的 LLM 编排库，提供结构化工具调用工作流、护栏校验与上下文管理，用于构建可靠的 AI Agent。"
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "chatgpt",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "mcp_config, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_148dff87195e42549d0ffb88b99e9cbf | https://github.com/antoinezambelli/forge/issues/58 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Investigate: integration paths with Hermes Agent",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_e3cbd2d1c9a84a1887887bf24b036865 | https://github.com/antoinezambelli/forge/issues/51 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Investigate: integration paths with Hermes Agent",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Per-model recommended sampling defaults (map keyed by HF model cards)",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_057ca2af912e4a608259ffb2a3654d4f | https://github.com/antoinezambelli/forge/issues/59 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Per-model recommended sampling defaults (map keyed by HF model cards)",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Rescue-parse ChatGPT-style XML tool calls",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_471c674c8d73451da75d6b8c9349aabf | https://github.com/antoinezambelli/forge/issues/55 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Rescue-parse ChatGPT-style XML tool calls",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：Proxy external mode hardcodes native FC — no prompt-injection fallback",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_f3a85ec8447a4838b3bc4c846cd9e7a0 | https://github.com/antoinezambelli/forge/issues/53 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Proxy external mode hardcodes native FC — no prompt-injection fallback",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "README/documentation is current enough for a first validation pass.",
            "category": "能力坑",
            "evidence": [
              "capability.assumptions | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | README/documentation is current enough for a first validation pass."
            ],
            "severity": "medium",
            "suggested_check": "将假设转成下游验证清单。",
            "title": "能力判断依赖假设",
            "user_impact": "假设不成立时，用户拿不到承诺的能力。"
          },
          {
            "body": "未记录 last_activity_observed。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | last_activity_observed missing"
            ],
            "severity": "medium",
            "suggested_check": "补 GitHub 最近 commit、release、issue/PR 响应信号。",
            "title": "维护活跃度未知",
            "user_impact": "新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "downstream_validation.risk_items | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "进入安全/权限治理复核队列。",
            "title": "下游验证发现风险项",
            "user_impact": "下游已经要求复核，不能在页面中弱化。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "risks.scoring_risks | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "把风险写入边界卡，并确认是否需要人工复核。",
            "title": "存在评分风险",
            "user_impact": "风险会影响是否适合普通用户安装。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_4ad226a6d1fa4a5f89fa7702bec11188 | https://github.com/antoinezambelli/forge/issues/61 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget",
            "user_impact": "可能影响授权、密钥配置或安全边界。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Sub-agent support: dynamic slot splitting",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_5b35873cf63c4647bca8a0611d441189 | https://github.com/antoinezambelli/forge/issues/28 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Sub-agent support: dynamic slot splitting",
            "user_impact": "可能影响授权、密钥配置或安全边界。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Sub-agent support: slot pool",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_070d9a3d20d24123b62d7d76ee16078a | https://github.com/antoinezambelli/forge/issues/29 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Sub-agent support: slot pool",
            "user_impact": "可能影响授权、密钥配置或安全边界。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：llama.cpp reasoning budget sampler causes silent hangs after April 10 builds",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_673be4a583984219bab90cbadff631fe | https://github.com/antoinezambelli/forge/issues/54 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：llama.cpp reasoning budget sampler causes silent hangs after April 10 builds",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "issue_or_pr_quality=unknown。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | issue_or_pr_quality=unknown"
            ],
            "severity": "low",
            "suggested_check": "抽样最近 issue/PR，判断是否长期无人处理。",
            "title": "issue/PR 响应质量未知",
            "user_impact": "用户无法判断遇到问题后是否有人维护。"
          },
          {
            "body": "release_recency=unknown。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | release_recency=unknown"
            ],
            "severity": "low",
            "suggested_check": "确认最近 release/tag 和 README 安装命令是否一致。",
            "title": "发布节奏不明确",
            "user_impact": "安装命令和文档可能落后于代码，用户踩坑概率升高。"
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body。",
        "title": "踩坑日志"
      },
      "snapshot": {
        "contributors": null,
        "forks": null,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": null
      },
      "source_url": "https://github.com/antoinezambelli/forge",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "Forge 是一个与框架无关的 LLM 编排库，提供结构化工具调用工作流、护栏校验与上下文管理，用于构建可靠的 AI Agent。",
      "title": "forge 能力包",
      "trial_prompt": "# forge - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for antoinezambelli/forge.\n\nProject:\n- Name: forge\n- Repository: https://github.com/antoinezambelli/forge\n- Summary: [![PyPI](https://img.shields.io/pypi/v/forge-guardrails.svg)](https://pypi.org/project/forge-guardrails/)\n- Host target: chatgpt\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: [![PyPI](https://img.shields.io/pypi/v/forge-guardrails.svg)](https://pypi.org/project/forge-guardrails/)\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: [![PyPI](https://img.shields.io/pypi/v/forge-guardrails.svg)](https://pypi.org/project/forge-guardrails/)\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. page-introduction: Introduction to Forge. Produce one small intermediate artifact and wait for confirmation.\n2. page-installation: Installation Guide. Produce one small intermediate artifact and wait for confirmation.\n3. page-quickstart: Quick Start Guide. 
Produce one small intermediate artifact and wait for confirmation.\n4. page-architecture: System Architecture. Produce one small intermediate artifact and wait for confirmation.\n5. page-module-structure: Module Structure and API. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://news.ycombinator.com/item?id=48192383\n- https://github.com/antoinezambelli/forge#readme\n- README.md\n- src/forge/__init__.py\n- pyproject.toml\n- src/forge/core/workflow.py\n- src/forge/core/runner.py\n- docs/ARCHITECTURE.md\n- src/forge/context/manager.py\n- src/forge/core/messages.py\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "voices": [
        {
          "body": "来源平台：github。github/github_issue: Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget（https://github.com/antoinezambelli/forge/issues/61）；github/github_issue: Per-model recommended sampling defaults (map keyed by HF model cards)（https://github.com/antoinezambelli/forge/issues/59）；github/github_issue: Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body（https://github.com/antoinezambelli/forge/issues/58）；github/github_issue: llama.cpp reasoning budget sampler causes silent hangs after April 10 builds（https://github.com/antoinezambelli/forge/issues/54）；github/github_issue: Rescue-parse ChatGPT-style XML tool calls（https://github.com/antoinezambelli/forge/issues/55）；github/github_issue: Proxy external mode hardcodes native FC — no prompt-injection fallback（https://github.com/antoinezambelli/forge/issues/53）；github/github_issue: Investigate: integration paths with Hermes Agent（https://github.com/antoinezambelli/forge/issues/51）；github/github_issue: Sub-agent support: slot pool（https://github.com/antoinezambelli/forge/issues/29）；github/github_issue: Sub-agent support: dynamic slot splitting（https://github.com/antoinezambelli/forge/issues/28）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget",
              "url": "https://github.com/antoinezambelli/forge/issues/61"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Per-model recommended sampling defaults (map keyed by HF model cards)",
              "url": "https://github.com/antoinezambelli/forge/issues/59"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body",
              "url": "https://github.com/antoinezambelli/forge/issues/58"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "llama.cpp reasoning budget sampler causes silent hangs after April 10 builds",
              "url": "https://github.com/antoinezambelli/forge/issues/54"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Rescue-parse ChatGPT-style XML tool calls",
              "url": "https://github.com/antoinezambelli/forge/issues/55"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Proxy external mode hardcodes native FC — no prompt-injection fallback",
              "url": "https://github.com/antoinezambelli/forge/issues/53"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Investigate: integration paths with Hermes Agent",
              "url": "https://github.com/antoinezambelli/forge/issues/51"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Sub-agent support: slot pool",
              "url": "https://github.com/antoinezambelli/forge/issues/29"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Sub-agent support: dynamic slot splitting",
              "url": "https://github.com/antoinezambelli/forge/issues/28"
            }
          ],
          "status": "已收录 9 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "软件开发与交付",
      "desc": "Forge 是一个与框架无关的 LLM 编排库，提供结构化工具调用工作流、护栏校验与上下文管理，用于构建可靠的 AI Agent。",
      "effort": "安装已验证",
      "forks": null,
      "icon": "code",
      "name": "forge 能力包",
      "risk": "可发布",
      "slug": "forge",
      "stars": null,
      "tags": [
        "知识检索",
        "知识库问答",
        "多 Agent 协作",
        "多角色协作流程",
        "评测体系"
      ],
      "thumb": "gray",
      "type": "MCP 配置"
    },
    "manual": {
      "markdown": "# https://github.com/antoinezambelli/forge 项目说明书\n\n生成时间：2026-05-19 20:57:03 UTC\n\n## 目录\n\n- [Introduction to Forge](#page-introduction)\n- [Installation Guide](#page-installation)\n- [Quick Start Guide](#page-quickstart)\n- [System Architecture](#page-architecture)\n- [Module Structure and API](#page-module-structure)\n- [Architecture Decision Records](#page-adr-index)\n- [WorkflowRunner and Agentic Loop](#page-workflowrunner)\n- [Guardrails Middleware for External Loops](#page-guardrails-middleware)\n- [Proxy Server Setup](#page-proxy-server)\n- [Backend Clients](#page-backend-clients)\n- [Backend Setup Guide](#page-backend-setup)\n- [Model Selection Guide](#page-model-guide)\n\n<a id='page-introduction'></a>\n\n## Introduction to Forge\n\n### 相关页面\n\n相关主题：[System Architecture](#page-architecture), [Quick Start Guide](#page-quickstart)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [src/forge/errors.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/errors.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n</details>\n\n# Introduction to Forge\n\nForge is a framework-agnostic LLM orchestration library that provides structured tool-calling workflows, guardrail enforcement, and context management for building reliable AI agents. 
It supports multiple LLM backends (Ollama, Llamafile, llama.cpp) and exposes both high-level runners and granular APIs for embedding into foreign orchestration loops.\n\n## Overview\n\nForge addresses the core challenges of LLM-based agent development:\n\n| Challenge | Forge Solution |\n|-----------|----------------|\n| Unreliable tool parsing | Rescue parsing with retry mechanisms |\n| Missing required steps | StepEnforcer validates call sequences |\n| Context overflow | Tiered context compaction strategies |\n| Model-specific sampling | Per-model verified sampling defaults |\n| Multi-backend support | Unified client abstraction layer |\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Core Architecture\n\nForge follows a layered architecture with clear separation of concerns:\n\n```mermaid\ngraph TD\n    subgraph \"User Layer\"\n        A[User Workflow] --> B[WorkflowRunner]\n    end\n    \n    subgraph \"Core Layer\"\n        B --> C[ContextManager]\n        B --> D[LLMClient]\n        B --> E[Guardrails]\n    end\n    \n    subgraph \"Client Layer\"\n        D --> F[OllamaClient]\n        D --> G[LlamafileClient]\n        D --> H[AnthropicClient]\n    end\n    \n    subgraph \"Backend Layer\"\n        F --> I[Ollama Backend]\n        G --> J[Llamafile Backend]\n        H --> K[Anthropic API]\n    end\n```\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n### Project Structure\n\n```\nsrc/forge/           # Library source\n  clients/           # LLM backend adapters (one per backend)\n  core/              # Workflow, runner, messages, steps\n  context/           # Context management and compaction\n  prompts/           # Prompt templates and nudges\n  guardrails/        # Response validation and step enforcement\n  proxy/             # OpenAI-compatible proxy server\n```\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Quick 
Start\n\nThe fundamental building blocks of a Forge workflow:\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import (\n    Workflow, ToolDef, ToolSpec,\n    WorkflowRunner, OllamaClient,\n    ContextManager, TieredCompact,\n)\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. Use the available tools to answer the user.\",\n)\n\nasync def main():\n    client = OllamaClient(model=\"ministral-3:8b-instruct-2512-q4_K_M\", recommended_sampling=True)\n    ctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Core Components\n\n### Workflow\n\nThe `Workflow` class defines the structure and constraints of an agent task:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `name` | `str` | Workflow identifier |\n| `description` | `str` | Human-readable description |\n| `tools` | `dict[str, ToolDef]` | Tool definitions keyed by name |\n| `required_steps` | `list[str]` | Tools that must precede terminal tool |\n| `terminal_tool` | `str` | Tool(s) that can end the workflow 
|\n\n资料来源：[src/forge/core/workflow.py:1-50](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n\n### ToolDef\n\nBinds a tool schema to its implementation:\n\n```python\n@dataclass\nclass ToolDef:\n    spec: ToolSpec\n    callable: Callable[..., Any]\n    prerequisites: list[str | dict[str, str]] = field(default_factory=list)\n```\n\nThe `prerequisites` field supports:\n- **String entries**: Name-only requirements (\"read_file\" — any prior call satisfies)\n- **Dict entries**: Arg-matched requirements (`{\"tool\": \"read_file\", \"match_arg\": \"path\"}`)\n\n资料来源：[src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n\n### LLM Clients\n\nForge provides backend-specific clients:\n\n| Client | Backend | Features |\n|--------|---------|----------|\n| `OllamaClient` | Ollama API | recommended_sampling, async streaming |\n| `LlamafileClient` | Llamafile binary | Context length detection, reasoning extraction |\n| `AnthropicClient` | Anthropic API | Native Claude support |\n\nAll clients support `send()` and `send_stream()` methods with sampling parameter overrides:\n\n```python\n# Instance-level sampling\nclient = OllamaClient(model=\"qwen3:8b\", recommended_sampling=True)\n\n# Per-call override (merged without mutation)\nawait client.send(messages, sampling={\"temperature\": 0.5})\n```\n\n资料来源：[src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n\n### Per-Model Sampling Defaults\n\nThe `sampling_defaults` module provides verified per-model sampling parameters sourced from HuggingFace model cards:\n\n```python\ndef get_sampling_defaults(model: str) -> dict[str, float | int]:\n    \"\"\"Pure lookup - returns copy of map value or {} for unknown models.\"\"\"\n    return dict(MODEL_SAMPLING_DEFAULTS.get(model, {}))\n\ndef apply_sampling_defaults(model: str, *, strict: bool) -> dict[str, float | int]:\n    
\"\"\"Policy layer with four-quadrant behavior.\"\"\"\n```\n\n| strict | model in map | behavior |\n|--------|--------------|----------|\n| True | yes | return dict |\n| True | no | raise `UnsupportedModelError` |\n| False | yes | one-shot INFO log; return `{}` |\n| False | no | return `{}` (silent) |\n\n资料来源：[src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n\n## Guardrails System\n\nForge's guardrails provide multi-layered response validation:\n\n```mermaid\ngraph LR\n    A[LLM Response] --> B[ResponseValidator]\n    B --> C{Valid ToolCall?}\n    C -->|No| D[Rescue Parsing]\n    D --> E{Rescued?}\n    E -->|Yes| F[Return ToolCall]\n    E -->|No| G[Retry Nudge]\n    C -->|Yes| H[StepEnforcer]\n    H --> I{Valid Sequence?}\n    I -->|Yes| J[ErrorTracker]\n    I -->|No| K[Step Blocked Nudge]\n    J --> L{Error Limit?}\n    L -->|Yes| M[Fatal Error]\n    L -->|No| N[Execute]\n```\n\n资料来源：[src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n\n### Guardrails API\n\n```python\nclass Guardrails:\n    def __init__(\n        self,\n        tool_names: list[str],\n        terminal_tool: str | frozenset[str],\n        required_steps: list[str] | None = None,\n        max_retries: int = 3,\n        max_tool_errors: int = 2,\n        rescue_enabled: bool = True,\n        max_premature_attempts: int = 3,\n        retry_nudge: Callable[[str], str] | None = None,\n    ) -> None:\n```\n\n| Parameter | Default | Description |\n|-----------|---------|-------------|\n| `max_retries` | 3 | Consecutive bad responses before fatal |\n| `max_tool_errors` | 2 | Tool execution failures before exhaustion |\n| `max_premature_attempts` | 3 | Premature terminal attempts before fatal |\n| `rescue_enabled` | True | Parse tool calls from plain text |\n| `retry_nudge` | None | Custom nudge for bare text responses 
|\n\n资料来源：[src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n\n### CheckResult Actions\n\n```python\n@dataclass\nclass CheckResult:\n    action: Literal[\"execute\", \"retry\", \"step_blocked\", \"fatal\"]\n    tool_calls: list[ToolCall] | None = None\n    nudge: Nudge | None = None  # Set when action is \"retry\" or \"step_blocked\"\n    reason: str | None = None  # Only when action == \"fatal\"\n```\n\n资料来源：[src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n\n## Context Management\n\nThe `ContextManager` handles token budget enforcement and message compaction:\n\n| Strategy | Purpose |\n|----------|---------|\n| `TieredCompact` | Keeps recent N messages, compacts older ones |\n| Custom strategies | Pluggable compaction algorithms |\n\n```python\nctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n```\n\n资料来源：[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Server Manager\n\nFor local GGUF model execution, `ServerManager` handles backend lifecycle:\n\n```python\nserver, ctx = await forge.serve(\n    backend=\"llamaserver\",\n    gguf_path=\"/path/to/model.gguf\",\n    mode=BudgetMode.FORGE_FAST,\n    client=client,\n)\nrunner = WorkflowRunner(client=client, context_manager=ctx)\n```\n\n| Parameter | Description |\n|-----------|-------------|\n| `backend` | \"ollama\", \"llamaserver\", or \"llamafile\" |\n| `gguf_path` | Path to GGUF file (not for Ollama) |\n| `mode` | Budget mode (FORGE_FAST, FORGE_BALANCED, FORGE_DEEP) |\n| `n_slots` | Concurrent slots for multi-agent |\n| `kv_unified` | Single shared KV cache across slots |\n\n资料来源：[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Error Handling\n\nForge defines a structured exception hierarchy:\n\n```python\nclass ForgeError(Exception):  
# Base exception\n    pass\n\nclass UnsupportedModelError(ForgeError):\n    \"\"\"raised when recommended_sampling=True for unknown model.\"\"\"\n    \nclass ToolCallError(ForgeError):\n    \"\"\"LLM failed to produce valid tool call after retries.\"\"\"\n    \nclass ToolExecutionError(ForgeError):\n    \"\"\"Tool callable raised during execution.\"\"\"\n```\n\n资料来源：[src/forge/errors.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/errors.py)\n\n## Foreign Loop Integration\n\nForge can be embedded into existing orchestration systems using the `Guardrails` middleware API:\n\n```python\nfrom forge.guardrails import Guardrails\n\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\ndef handle_response(response):\n    result = guardrails.check(response)\n    \n    if result.action == \"fatal\":\n        return f\"FATAL: {result.reason}\"\n    \n    if result.action in (\"retry\", \"step_blocked\"):\n        return f\"{result.action}: {result.nudge.content[:80]}...\"\n    \n    # Execute tools, then record\n    executed = [tc.tool for tc in result.tool_calls]\n    done = guardrails.record(executed)\n    return f\"executed {executed}\" + (\" -- DONE\" if done else \"\")\n```\n\n资料来源：[examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n### Granular Component Access\n\nFor fine-grained control, access components directly:\n\n```python\nfrom forge.guardrails import ErrorTracker, ResponseValidator, StepEnforcer\n\nvalidator = ResponseValidator(tool_names=[\"search\", \"lookup\", \"answer\"], rescue_enabled=True)\nenforcer = StepEnforcer(required_steps=[\"search\", \"lookup\"], terminal_tool=\"answer\")\nerrors = ErrorTracker(max_retries=3, max_tool_errors=2)\n```\n\n资料来源：[examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n## Code Style and 
Requirements\n\n| Requirement | Details |\n|-------------|---------|\n| Python | 3.12+ |\n| Async | `asyncio` throughout — all client methods and runner are async |\n| Type Safety | Pydantic for tool parameter schemas |\n| Type Syntax | Modern unions with `\\|` |\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Running Tests\n\n```bash\n# Full suite (865 tests)\npython -m pytest tests/unit/ -v --tb=short\n\n# With coverage\npython -m pytest tests/unit/ --cov=forge --cov-report=term-missing\n\n# Skip integration tests (require live backend)\npython -m pytest tests/ -m \"not integration\"\n```\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n---\n\n<a id='page-installation'></a>\n\n## Installation Guide\n\n### 相关页面\n\n相关主题：[Backend Setup Guide](#page-backend-setup), [Quick Start Guide](#page-quickstart)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n- [CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n</details>\n\n# Installation Guide\n\nThis guide covers all aspects of setting up the **forge** framework, from basic installation to advanced configuration for different LLM backends.\n\n## Overview\n\nForge is a Python library for building LLM-powered workflows with automatic tool calling, context management, and multi-backend support. The installation process involves:\n\n1. Python environment setup (Python 3.12+)\n2. 
Package installation via pip\n3. LLM backend selection and configuration\n4. Optional development environment for contributors\n\n**资料来源：[CONTRIBUTING.md:1-15]()**\n\n## Prerequisites\n\n### System Requirements\n\n| Component | Requirement |\n|-----------|-------------|\n| Python | 3.12+ |\n| OS | Linux, macOS, Windows |\n| LLM Backend | Ollama, llama-server, or llamafile |\n| VRAM | Varies by model (8GB minimum for small models) |\n\n### Backend Options\n\nForge supports three LLM backends:\n\n| Backend | Description | Use Case |\n|---------|-------------|----------|\n| **Ollama** | Local model management with simple API | Quick setup, model management |\n| **llama-server** | llama.cpp server binary | Production, GGUF files |\n| **llamafile** | Single-file executable models | Distribution, portability |\n\nEach backend requires different installation steps:\n\n- **Ollama**: Install Ollama, then download models with `ollama pull`\n- **llama-server**: Download llama.cpp server binary\n- **llamafile**: Download pre-built executables or convert GGUF files\n\n**资料来源：[src/forge/server.py:1-50]()**\n\n## Installation Methods\n\n### Standard Installation\n\nFor end users wanting to use forge as a library (the PyPI distribution is named `forge-guardrails`; the import name is `forge`):\n\n```bash\npip install forge-guardrails\n```\n\nThis installs the core package with all necessary dependencies.\n\n### Development Installation\n\nFor contributors and those wanting to modify the codebase:\n\n```bash\ngit clone https://github.com/antoinezambelli/forge.git\ncd forge\npython -m venv .venv\nsource .venv/bin/activate  # Linux/macOS\n# or: .venv\\Scripts\\activate  # Windows\npip install -e \".[dev]\"\n```\n\n**资料来源：[CONTRIBUTING.md:7-14]()**\n\nThe `.[dev]` extras include:\n\n- Testing dependencies (`pytest`)\n- Development tools\n- Documentation build tools\n\n### Package Configuration\n\nThe project uses `pyproject.toml` for dependency management:\n\n```toml\n[project]\nname = \"forge-guardrails\"\nversion = \"0.5.0\"\nrequires-python = \">=3.12\"\n\n[project.optional-dependencies]\ndev = [\n    
\"pytest>=7.0\",\n    \"pytest-asyncio\",\n    \"pytest-cov\",\n]\n```\n\n## Environment Setup\n\n### Virtual Environment\n\nUsing a virtual environment is **strongly recommended** to avoid dependency conflicts:\n\n```bash\npython -m venv forge-env\nsource forge-env/bin/activate\n```\n\n### Backend Installation\n\n#### Ollama Setup\n\n1. Install Ollama from [ollama.ai](https://ollama.ai)\n2. Pull a model:\n\n```bash\nollama pull ministral-3:8b-instruct-2512-q4_K_M\n```\n\n3. Verify the installation:\n\n```bash\nollama list\n```\n\n#### llama-server Setup\n\n1. Download the llama.cpp server binary for your platform\n2. Place the binary in your models directory or system PATH\n3. Verify with:\n\n```bash\n./llama-server --help\n```\n\n#### llamafile Setup\n\n1. Download a pre-built llamafile (e.g., from TheBloke)\n2. Make it executable:\n\n```bash\nchmod +x model-name.Q4_K_M.llamafile\n```\n\n**资料来源：[src/forge/server.py:180-220]()**\n\n## Configuration Flow\n\n```mermaid\ngraph TD\n    A[Install forge] --> B{Use Case}\n    B -->|Library| C[pip install forge-guardrails]\n    B -->|Development| D[\"pip install -e .[dev]\"]\n    C --> E[Choose Backend]\n    D --> E\n    E -->|Ollama| F[Install Ollama + Pull Model]\n    E -->|llama-server| G[Download llama.cpp binary]\n    E -->|llamafile| H[Download llamafile]\n    F --> I[Verify with test script]\n    G --> I\n    H --> I\n```\n\n## Quick Start Verification\n\nAfter installation, verify your setup with this minimal example:\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import (\n    Workflow, ToolDef, ToolSpec,\n    WorkflowRunner, OllamaClient,\n    ContextManager, TieredCompact,\n)\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n          
  spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. Use the available tools to answer the user.\",\n)\n\nasync def main():\n    client = OllamaClient(model=\"ministral-3:8b-instruct-2512-q4_K_M\", recommended_sampling=True)\n    ctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n**资料来源：[README.md:1-60]()**\n\n## Advanced Backend Configuration\n\n### KV Cache Quantization\n\nForge supports KV cache quantization to reduce VRAM usage:\n\n| Setting | VRAM Savings | Quality Impact |\n|---------|--------------|----------------|\n| `q8_0` | ~50% vs F16 | Minimal |\n| `q4_0` | ~75% vs F16 | Low |\n\n```python\n# In setup_backend call\nfrom forge.server import setup_backend, BudgetMode\n\nserver, ctx = await setup_backend(\n    backend=\"llamaserver\",\n    gguf_path=\"/path/to/model.gguf\",\n    budget_mode=BudgetMode.FORGE_FAST,\n    cache_type_k=\"q4_0\",  # Key cache quantization\n    cache_type_v=\"q4_0\",   # Value cache quantization\n)\n```\n\n**资料来源：[src/forge/server.py:20-45]()**\n\n### Multi-Slot Configuration\n\nFor multi-agent architectures:\n\n```python\nserver, ctx = await setup_backend(\n    backend=\"llamaserver\",\n    gguf_path=\"/path/to/model.gguf\",\n    n_slots=4,           # Number of concurrent slots\n    kv_unified=True,     # Shared KV cache pool\n)\n```\n\nWhen `kv_unified=True`, all slots share a single KV cache pool, allowing each slot to use the full context window.\n\n**资料来源：[src/forge/server.py:40-55]()**\n\n## Recommended Sampling Parameters\n\nForge provides 
recommended sampling defaults for specific models:\n\n```python\nclient = OllamaClient(\n    model=\"qwen3:8b-q4_K_M\",\n    recommended_sampling=True  # Enable recommended defaults\n)\n```\n\nThe `recommended_sampling=True` parameter enables tuned temperature, top_p, top_k, and other sampling parameters sourced from HuggingFace model cards.\n\n**资料来源：[src/forge/clients/sampling_defaults.py:1-80]()**\n\n## Testing Your Installation\n\n### Unit Tests\n\nRun the deterministic unit test suite (no backend required):\n\n```bash\npython -m pytest tests/unit/ -v --tb=short\n```\n\n**资料来源：[CONTRIBUTING.md:18-25]()**\n\n### Integration Tests\n\nIntegration tests require a running backend. If none is available, skip them (and optionally collect unit-test coverage):\n\n```bash\n# Skip integration tests\npython -m pytest tests/ -m \"not integration\"\n\n# Run with coverage\npython -m pytest tests/unit/ --cov=forge --cov-report=term-missing\n```\n\n## Troubleshooting\n\n### Common Issues\n\n| Issue | Solution |\n|-------|----------|\n| `ModuleNotFoundError: No module named 'forge'` | Run `pip install forge-guardrails` or check that the virtual environment is activated |\n| Backend connection refused | Verify the backend is running on the expected port |\n| Model not found (Ollama) | Run `ollama pull <model-name>` |\n| VRAM out of memory | Enable KV cache quantization or use a smaller model |\n\n### Backend Health Check\n\nVerify backend connectivity:\n\n```python\nimport httpx\nimport asyncio\n\nasync def check_backend():\n    try:\n        async with httpx.AsyncClient(timeout=5.0) as client:\n            resp = await client.get(\"http://localhost:8080/props\")\n            if resp.status_code == 200:\n                print(\"Backend is ready\")\n    except httpx.ConnectError:\n        print(\"Backend not reachable\")\n\nasyncio.run(check_backend())\n```\n\n**资料来源：[src/forge/server.py:250-280]()**\n\n## Next Steps\n\nAfter installation, refer to:\n\n- [User Guide](docs/USER_GUIDE.md) — Complete workflow creation and execution\n- [Backend Setup Guide](docs/BACKEND_SETUP.md) — Detailed backend configuration\n- [Model Guide](docs/MODEL_GUIDE.md) — 
Recommended models by hardware tier\n- [Architecture Decision Records](docs/decisions/) — Design rationale documentation\n\n---\n\n<a id='page-quickstart'></a>\n\n## Quick Start Guide\n\n### 相关页面\n\n相关主题：[WorkflowRunner and Agentic Loop](#page-workflowrunner), [System Architecture](#page-architecture)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/core/runner.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/runner.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n</details>\n\n# Quick Start Guide\n\nThis guide provides a practical introduction to **Forge**, a reliability layer for self-hosted LLM tool-calling. Forge elevates an 8B local model to top-tier performance on multi-step agentic workflows through guardrails (rescue parsing, retry nudges, step enforcement) and context management (VRAM-aware budgets, tiered compaction).\n\n## Purpose and Scope\n\nThe Quick Start Guide covers:\n\n- **Installation and environment setup** for local LLM backends\n- **Core concepts** including Workflows, ToolDefs, and the WorkflowRunner\n- **Basic usage patterns** for single-step and multi-step agentic workflows\n- **Integration options** for foreign orchestration loops\n- **Backend management** with auto-start capabilities\n\nForge targets developers building agentic applications that require structured tool-calling with local models. 
It works with llama.cpp-based backends (llama-server, llamafile) and Ollama.\n\n资料来源：[README.md:1-20](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Installation\n\n### Prerequisites\n\n| Requirement | Version | Notes |\n|-------------|---------|-------|\n| Python | 3.12+ | Modern syntax required (type unions with `\\|`) |\n| pip | Latest | For package installation |\n| LLM Backend | llama.cpp / Ollama | For inference |\n\n### Setup Commands\n\n```bash\ngit clone https://github.com/antoinezambelli/forge.git\ncd forge\npython -m venv .venv\nsource .venv/bin/activate  # Linux/macOS\n# or: .venv\\Scripts\\activate  # Windows\npip install -e \".[dev]\"\n```\n\n资料来源：[CONTRIBUTING.md:1-15](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Core Concepts\n\n### Architecture Overview\n\n```mermaid\ngraph TD\n    A[User Input] --> B[WorkflowRunner]\n    B --> C[LLM Client]\n    C --> D[Tool Call Response]\n    D --> E[Guardrails Check]\n    E -->|execute| F[Tool Execution]\n    F --> G[Context Manager]\n    G -->|compact| B\n    E -->|retry| C\n    E -->|fatal| H[Error Handling]\n```\n\n### Workflow\n\nThe `Workflow` is the central definition for an agentic task. 
It binds together:\n\n| Component | Type | Purpose |\n|-----------|------|---------|\n| `name` | `str` | Workflow identifier |\n| `description` | `str` | Human-readable description |\n| `tools` | `dict[str, ToolDef]` | Tool name → definition mapping |\n| `required_steps` | `list[str]` | Tools that must execute before terminal |\n| `terminal_tool` | `str` | Tool that ends the workflow |\n| `system_prompt_template` | `str` | System prompt for the LLM |\n\n资料来源：[src/forge/core/workflow.py:1-50](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n\n### ToolDef and ToolSpec\n\n**ToolSpec** defines the schema exposed to the LLM:\n\n```python\nclass ToolSpec(BaseModel):\n    name: str\n    description: str\n    parameters: type[BaseModel]  # Pydantic model\n```\n\n**ToolDef** binds the schema to its Python implementation:\n\n```python\n@dataclass\nclass ToolDef:\n    spec: ToolSpec\n    callable: Callable[..., Any]\n    prerequisites: list[str | dict[str, str]] = field(default_factory=list)\n```\n\nThe `prerequisites` field enables conditional dependencies:\n- `str`: \"if you call this tool, you must have called tool X first\"\n- `dict`: `{\"tool\": \"read_file\", \"match_arg\": \"path\"}` — arg-matched prerequisites\n\n资料来源：[src/forge/core/workflow.py:60-90](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n\n### WorkflowRunner\n\nThe `WorkflowRunner` manages the full lifecycle:\n\n- System prompt injection\n- Tool execution and result handling\n- Context compaction\n- Guardrail enforcement\n- Multi-turn conversation state\n\n```python\nclass WorkflowRunner:\n    def __init__(self, client, context_manager):\n        ...\n    \n    async def run(self, workflow, user_message):\n        ...\n```\n\n资料来源：[src/forge/core/runner.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/runner.py)\n\n### ContextManager and Budget Modes\n\nForge provides VRAM-aware context management through budget modes:\n\n| 
Mode | Behavior |\n|------|----------|\n| `BudgetMode.MANUAL` | User-specified token budget |\n| `BudgetMode.FORGE_FAST` | VRAM-optimized fast inference budget |\n| `BudgetMode.FORGE_DEEP` | Extended context for complex reasoning |\n\nThe `ContextManager` resolves budgets at runtime based on the backend:\n\n```python\nasync def resolve_budget(self, mode: BudgetMode, manual_tokens: int | None = None) -> int:\n    if mode == BudgetMode.MANUAL:\n        if self._backend == \"ollama\":\n            return manual_tokens\n        return await self.get_server_context()\n```\n\n资料来源：[src/forge/server.py:80-120](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Quick Start Example\n\n### Basic Single-Tool Workflow\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import (\n    Workflow, ToolDef, ToolSpec,\n    WorkflowRunner, OllamaClient,\n    ContextManager, TieredCompact,\n)\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. 
Use the available tools to answer the user.\",\n)\n\nasync def main():\n    client = OllamaClient(model=\"ministral-3:8b-instruct-2512-q4_K_M\", recommended_sampling=True)\n    ctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n资料来源：[README.md:20-60](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n### Key Components Explained\n\n| Component | Import | Purpose |\n|-----------|--------|---------|\n| `OllamaClient` | `forge` | LLM backend adapter for Ollama |\n| `TieredCompact` | `forge` | Context compaction strategy |\n| `ContextManager` | `forge` | Token budget management |\n| `WorkflowRunner` | `forge` | Orchestrates the agent loop |\n\n## Multi-Step Workflows\n\nFor workflows requiring sequential tool execution:\n\n```python\n# Define multi-step workflow with prerequisites\nworkflow = Workflow(\n    name=\"research_assistant\",\n    description=\"Research and answer questions\",\n    tools={\n        \"search\": ToolDef(spec=search_spec, callable=do_search),\n        \"lookup\": ToolDef(spec=lookup_spec, callable=do_lookup),\n        \"answer\": ToolDef(spec=answer_spec, callable=final_answer),\n    },\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n```\n\nThe `required_steps` list enforces that `search` and `lookup` must execute before `answer`. 
Attempting to call the terminal tool before the required steps have run is blocked: the guardrails return a `step_blocked` action with a nudge instead of executing the tool.\n\n资料来源：[examples/foreign_loop.py:1-50](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n## Guardrails API\n\nFor foreign orchestration loops (non-WorkflowRunner usage), Forge provides standalone guardrails:\n\n### Simple API\n\n```python\nfrom forge.guardrails import Guardrails\n\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\ndef handle_response(response):\n    result = guardrails.check(response)\n    \n    if result.action == \"fatal\":\n        return f\"FATAL: {result.reason}\"\n    \n    if result.action in (\"retry\", \"step_blocked\"):\n        return f\"{result.action}: {result.nudge.content[:80]}...\"\n    \n    # Execute tools\n    executed = [tc.tool for tc in result.tool_calls]\n    done = guardrails.record(executed)\n    return f\"executed {executed}\" + (\" -- DONE\" if done else \"\")\n```\n\n### Granular API\n\nDirect access to individual components:\n\n```python\nfrom forge.guardrails import ErrorTracker, ResponseValidator, StepEnforcer\n\nvalidator = ResponseValidator(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    rescue_enabled=True,\n)\nenforcer = StepEnforcer(\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\nerrors = ErrorTracker(max_retries=3, max_tool_errors=2)\n```\n\n| Component | Purpose |\n|-----------|---------|\n| `ResponseValidator` | Parses tool calls from LLM responses, rescue mode |\n| `StepEnforcer` | Enforces required step sequence |\n| `ErrorTracker` | Tracks retry attempts and tool errors |\n\n资料来源：[examples/foreign_loop.py:80-150](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n## Respond Tool Pattern\n\nFor conversational turns where the model should respond directly:\n\n```python\nfrom forge.tools import RESPOND_TOOL_NAME, 
respond_spec\n\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\", RESPOND_TOOL_NAME],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\ndef handle_response_with_respond(response):\n    result = guardrails.check(response)\n    \n    # Check for respond() call\n    for tc in result.tool_calls:\n        if tc.tool == RESPOND_TOOL_NAME:\n            message = tc.args.get(\"message\", \"\")\n            return f\"MODEL SAYS: {message}\"\n    \n    # Normal tool execution\n    ...\n```\n\n资料来源：[examples/foreign_loop.py:160-200](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n## Backend Auto-Management\n\nForge can auto-start backends for multi-agent architectures:\n\n```python\nfrom forge.server import run_with_server\nfrom forge.clients import LlamafileClient, BudgetMode\n\nasync with run_with_server(\n    backend=\"llamafile\",\n    gguf_path=\"/path/to/model.gguf\",\n    budget_mode=BudgetMode.FORGE_FAST,\n) as (server, ctx):\n    client = LlamafileClient(model=\"my-model\")\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    # Run workflows...\n```\n\n### Backend Options\n\n| Backend | Model Source | GGUF Support |\n|---------|-------------|--------------|\n| `ollama` | Model name (e.g., `ministral-3:8b`) | No |\n| `llamaserver` | GGUF file path | Yes |\n| `llamafile` | GGUF file path | Yes |\n\n资料来源：[src/forge/server.py:1-80](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Recommended Sampling Parameters\n\nForge provides curated sampling defaults for supported models:\n\n```python\nfrom forge.clients import OllamaClient, get_sampling_defaults\n\n# Opt-in to recommended sampling\nclient = OllamaClient(\n    model=\"ministral-3:8b-q4_K_M\",\n    recommended_sampling=True  # Raises error for unknown models\n)\n```\n\n| Parameter | Source | Verification |\n|-----------|--------|--------------|\n| `temperature` | HF model cards | 
Per-model verification |\n| `top_p` | HF model cards | Per-model verification |\n| `top_k` | HF model cards | Per-model verification |\n| `min_p` | HF model cards | Per-model verification |\n| `repeat_penalty` | HF model cards | Per-model verification |\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-50](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n\n## Testing\n\n### Unit Tests (No Backend Required)\n\n```bash\n# Full suite (865 tests)\npython -m pytest tests/unit/ -v --tb=short\n\n# With coverage\npython -m pytest tests/unit/ --cov=forge --cov-report=term-missing\n\n# Single file\npython -m pytest tests/unit/test_runner.py -v\n```\n\n### Integration Tests (Requires Backend)\n\n```bash\n# Skip integration tests\npython -m pytest tests/ -m \"not integration\"\n```\n\n资料来源：[CONTRIBUTING.md:15-30](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Next Steps\n\n| Topic | Description |\n|-------|-------------|\n| [User Guide](docs/USER_GUIDE.md) | Multi-step workflows, long-running sessions |\n| [Model Guide](docs/MODEL_GUIDE.md) | Model-specific configurations |\n| [Architecture Decisions](docs/decisions/) | Design rationale and ADRs |\n| [Eval Suite](tests/eval/) | Performance evaluation methodology |\n\n---\n\n<a id='page-architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[WorkflowRunner and Agentic Loop](#page-workflowrunner)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- 
[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n</details>\n\n# System Architecture\n\n## Overview\n\nForge is an LLM agent framework that orchestrates multi-step tool-calling workflows with built-in guardrails, context management, and automatic backend server management. The architecture follows a clean separation of concerns: clients abstract LLM backends, workflows define agent behavior, guardrails enforce execution policies, and the context manager handles token budgeting. 资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\nThe framework is designed for determinism in unit tests and supports three backend types: Ollama, llama-server, and llamafile. 资料来源：[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Project Layout\n\n```\nsrc/forge/           # Library source\n  clients/           # LLM backend adapters (one per backend)\n  core/              # Workflow, runner, messages, steps\n  context/           # Context management and compaction\n  prompts/           # Prompt templates and nudges\ntests/\n  unit/              # Deterministic tests\n  eval/              # Eval harness (requires live backends)\n    scenarios/       # Eval scenario definitions\n    dashboard/       # React-based HTML dashboard (separate npm build)\ndocs/                # User-facing documentation\n  decisions/         # Architecture Decision Records (ADRs)\n  results/           # Eval results and raw data tables\n```\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Core Architecture Components\n\n### Architecture Diagram\n\n```mermaid\ngraph TD\n    User[\"User / Application\"]\n    Runner[\"WorkflowRunner\"]\n    Workflow[\"Workflow\"]\n    Guardrails[\"Guardrails\"]\n    
ContextMgr[\"ContextManager\"]\n    Client[\"LLM Client\"]\n    ServerMgr[\"ServerManager\"]\n    \n    User --> Runner\n    Runner --> Workflow\n    Runner --> Guardrails\n    Runner --> ContextMgr\n    Runner --> Client\n    Client --> ServerMgr\n    \n    subgraph \"forge Library\"\n        Runner\n        Workflow\n        Guardrails\n        ContextMgr\n        Client\n    end\n    \n    subgraph \"Backend\"\n        ServerMgr\n    end\n```\n\n### LLM Clients\n\nThe client layer abstracts different LLM backends behind a common async interface. Each client handles backend-specific protocol differences.\n\n| Client | Backend | Protocol |\n|--------|---------|----------|\n| `OllamaClient` | Ollama | OpenAI-compatible REST |\n| `LlamafileClient` | Llamafile | OpenAI-compatible REST |\n| `AnthropicClient` | Anthropic API | Anthropic native |\n| `OpenAIClient` | OpenAI API | OpenAI native |\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n#### Sampling Defaults\n\nEach client can optionally apply recommended sampling parameters sourced from HuggingFace model cards. The policy layer provides four-quadrant behavior:\n\n| `strict` | Model in map | Behavior |\n|----------|--------------|----------|\n| `True` | Yes | Return dict |\n| `True` | No | Raise `UnsupportedModelError` |\n| `False` | Yes | One-shot INFO log; return `{}` |\n| `False` | No | Return `{}` (silent) |\n\n资料来源：[src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n\nClients no longer ship hardcoded temperature defaults. With `recommended_sampling=False` (default), forge sends nothing and the backend's default applies. 
资料来源：[CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md)\n\n### Workflow System\n\nThe `Workflow` class defines an agent's behavior declaratively:\n\n```python\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(...),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. Use the available tools.\",\n)\n```\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n#### Tool Definition\n\n`ToolDef` binds a tool schema to its implementation:\n\n```python\n@dataclass\nclass ToolDef:\n    \"\"\"Binds a tool schema to its implementation.\"\"\"\n    spec: ToolSpec\n    callable: Callable[..., Any]\n    prerequisites: list[str | dict[str, str]] = field(default_factory=list)\n```\n\nPrerequisites express conditional dependencies:\n- `str`: Name-only (`\"read_file\"` — any prior call satisfies it)\n- `dict`: Arg-matched (`{\"tool\": \"read_file\", \"match_arg\": \"path\"}`)\n\n资料来源：[src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n\n### Guardrails System\n\nGuardrails enforce execution policies through three coordinated components:\n\n```mermaid\ngraph LR\n    Response[\"LLM Response\"] --> Validator[\"ResponseValidator\"]\n    Validator --> Enforcer[\"StepEnforcer\"]\n    Enforcer --> Tracker[\"ErrorTracker\"]\n    \n    Validator --> \"rescue parsing\"\n    Enforcer --> \"required steps\"\n    Tracker --> \"retry limits\"\n```\n\n#### Guardrails Configuration\n\n| Parameter | Purpose | Default |\n|-----------|---------|---------|\n| `tool_names` | List of available tools | Required |\n| `terminal_tool` | Final allowed tool | Required |\n| `required_steps` | Ordered prerequisite chain | `None` |\n| `max_retries` | Total retry 
attempts | 3 |\n| `max_tool_errors` | Consecutive tool failures | 2 |\n| `rescue_enabled` | Enable XML rescue parsing | `True` |\n| `max_premature_attempts` | Premature terminal attempts before fatal | 3 |\n\n资料来源：[src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n\n#### Check Result Actions\n\nThe `check()` method returns a `CheckResult` with these actions:\n\n| Action | Meaning |\n|--------|---------|\n| `proceed` | Response passes all guardrails |\n| `retry` | Invalid response, apply nudge and retry |\n| `step_blocked` | Missing required step |\n| `fatal` | Max retries exceeded |\n\n### Context Manager\n\nThe `ContextManager` handles token budgeting and context compaction to prevent context overflow during long conversations. 资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n#### Budget Resolution\n\nBudget is resolved based on the mode:\n\n| Mode | Resolution Strategy |\n|------|---------------------|\n| `MANUAL` | Use `manual_tokens` parameter or query server |\n| `FORGE_FAST` | Server-reported context / 4 |\n| `FORGE_BALANCED` | Server-reported context / 2 |\n| `FORGE_DEEP` | Server-reported context * 3 / 4 |\n\nFor Ollama backends, the context length is obtained from `ollama show`. For llama-server/llamafile, a `/props` query retrieves the actual `n_ctx`. 
资料来源：[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n### Server Manager\n\n`ServerManager` handles lifecycle management of backend servers (llama-server and llamafile only; Ollama is managed externally).\n\n```python\nserver = ServerManager(backend=\"llamaserver\", port=8080)\ncontext, ctx_mgr = await server.start_with_budget(\n    model=\"qwen3:8b-q4_K_M\",\n    budget_mode=BudgetMode.FORGE_FAST,\n    client=client,\n)\n```\n\n#### Server Configuration Parameters\n\n| Parameter | Description | Backend |\n|-----------|-------------|---------|\n| `model` | Model identity for server | All |\n| `gguf_path` | Path to GGUF file | llamaserver/llamafile |\n| `mode` | Operation mode | All |\n| `extra_flags` | Additional CLI flags | llamaserver/llamafile |\n| `ctx_override` | Override context length (`-c value`) | llamaserver/llamafile |\n| `cache_type_k` | KV cache quantization type for keys | llamaserver |\n| `cache_type_v` | KV cache quantization type for values | llamaserver |\n| `n_slots` | Concurrent slots count | llamaserver |\n| `kv_unified` | Single unified KV cache | llamaserver |\n\nThe server reuses an existing process if the same configuration is requested, avoiding unnecessary restarts. 
资料来源：[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Runner\n    participant Workflow\n    participant Guardrails\n    participant Context\n    participant Client\n    \n    User->>Runner: run(workflow, input)\n    Runner->>Context: begin_session()\n    Runner->>Workflow: Get system prompt\n    Runner->>Client: send(messages)\n    \n    loop Until terminal or max iterations\n        Client-->>Runner: LLMResponse\n        Runner->>Guardrails: check(response)\n        Guardrails-->>Runner: CheckResult\n        \n        alt proceed\n            Runner->>Runner: Execute tools\n            Runner->>Context: append(messages)\n            Runner->>Client: send(messages)\n        else retry\n            Runner->>Runner: Apply nudge, retry\n        else fatal\n            Runner->>User: Return error\n        end\n    end\n    \n    Runner-->>User: Final result\n```\n\n## Workflow Runner Integration\n\nThe `WorkflowRunner` orchestrates all components:\n\n```python\nasync def main():\n    client = OllamaClient(\n        model=\"ministral-3:8b-instruct-2512-q4_K_M\",\n        recommended_sampling=True\n    )\n    ctx = ContextManager(\n        strategy=TieredCompact(keep_recent=2),\n        budget_tokens=8192\n    )\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n```\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Design Principles\n\n### Async-First\n\nAll client methods and the runner are async, enabling efficient I/O handling across multiple concurrent requests. 资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n### Type Safety\n\nPydantic is used for tool parameter schemas and response validation, ensuring runtime type safety for tool arguments. 
资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n### Modern Python\n\nThe codebase targets Python 3.12+ and uses modern syntax including:\n- Type unions with `|` (e.g., `str | None`)\n- `dataclass` decorators\n- `field(default_factory=list)` patterns\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Guardrails Configuration Examples\n\n```python\n# Basic guardrails\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\n# With respond tool for middleware\nrespond_guardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\", RESPOND_TOOL_NAME],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n```\n\n资料来源：[examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n## Related Documentation\n\n- [User Guide](docs/USER_GUIDE.md) — Multi-step workflows and backend auto-management\n- [Model Guide](docs/MODEL_GUIDE.md) — Model recommendations by tier\n- [Architecture Decisions](docs/decisions/) — Design rationale for significant changes\n\n---\n\n<a id='page-module-structure'></a>\n\n## Module Structure and API\n\n### 相关页面\n\n相关主题：[System Architecture](#page-architecture), [WorkflowRunner and Agentic Loop](#page-workflowrunner)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/__init__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/__init__.py)\n- [src/forge/core/messages.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/messages.py)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/errors.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/errors.py)\n- 
[src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [src/forge/clients/llamafile.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/llamafile.py)\n</details>\n\n# Module Structure and API\n\n## Overview\n\nThe forge repository implements a modular LLM orchestration framework designed to handle multi-step tool-calling workflows with built-in guardrails, context management, and support for multiple LLM backends. The architecture follows a clean separation of concerns with distinct modules for client handling, workflow orchestration, guardrails enforcement, and server management.\n\n## Core Architecture\n\nThe forge library is organized into a layered architecture that separates backend communication, workflow definition, execution enforcement, and context management.\n\n```mermaid\ngraph TD\n    A[User Code] --> B[WorkflowRunner]\n    B --> C[LLM Clients]\n    C --> D[llama.cpp / Ollama / Llamafile]\n    B --> E[Guardrails]\n    B --> F[ContextManager]\n    E --> G[ResponseValidator]\n    E --> H[StepEnforcer]\n    E --> I[ErrorTracker]\n```\n\n## Module Hierarchy\n\n| Module | Purpose | Key Classes |\n|--------|---------|-------------|\n| `forge.core` | Core workflow orchestration | `Workflow`, `WorkflowRunner`, `ToolSpec`, `ToolDef`, `ToolCall` |\n| `forge.clients` | LLM backend adapters | `OllamaClient`, `LlamafileClient`, `LlamaServerClient` |\n| `forge.guardrails` | Response validation and enforcement | `Guardrails`, `ResponseValidator`, `StepEnforcer`, `ErrorTracker` |\n| `forge.context` | Token budget and context management | `ContextManager`, `TieredCompact` |\n| `forge.server` | Backend server lifecycle management | `ServerManager`, 
`BudgetMode` |\n\n资料来源：[src/forge/__init__.py]()\n\n## Tool Definition API\n\n### ToolSpec\n\nThe `ToolSpec` class defines the interface for tools that the LLM can invoke. It wraps a Pydantic model representing the tool's parameters.\n\n```python\nclass ToolSpec(BaseModel):\n    name: str\n    description: str\n    parameters: type[BaseModel]\n```\n\n**Construction from OpenAI Schema:**\n\nTools can be defined from an OpenAI-style JSON Schema:\n\n```python\ntool_spec = ToolSpec.from_openai_schema(\n    name=\"get_weather\",\n    description=\"Get current weather for a city\",\n    schema={\n        \"type\": \"object\",\n        \"properties\": {\n            \"city\": {\"type\": \"string\", \"description\": \"City name\"}\n        },\n        \"required\": [\"city\"]\n    }\n)\n```\n\n资料来源：[src/forge/core/workflow.py:1-50]()\n\n### ToolDef\n\nThe `ToolDef` dataclass binds a tool schema to its implementation callable, along with prerequisites:\n\n```python\n@dataclass\nclass ToolDef:\n    spec: ToolSpec\n    callable: Callable[..., Any]\n    prerequisites: list[str | dict[str, str]] = field(default_factory=list)\n```\n\n**Prerequisites Syntax:**\n\nPrerequisites express conditional dependencies between tool calls:\n\n| Type | Example | Behavior |\n|------|---------|----------|\n| String (name-only) | `\"read_file\"` | Any prior call to `read_file` satisfies it |\n| Dict (arg-matched) | `{\"tool\": \"read_file\", \"match_arg\": \"path\"}` | Prior call with same `path` value required |\n\n```python\ntool_def = ToolDef(\n    spec=tool_spec,\n    callable=get_weather_function,\n    prerequisites=[{\"tool\": \"search\", \"match_arg\": \"query\"}]\n)\n```\n\n资料来源：[src/forge/core/workflow.py:52-72]()\n\n## Workflow Definition API\n\n### Workflow Class\n\nThe `Workflow` class is the central configuration object for a multi-step LLM task:\n\n```python\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city\",\n    tools={\n        
\"get_weather\": ToolDef(spec=..., callable=get_weather)\n    },\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n    system_prompt_template=\"You are a helpful assistant.\"\n)\n```\n\n**Key Parameters:**\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `name` | `str` | Yes | Workflow identifier |\n| `description` | `str` | Yes | Human-readable description for the LLM |\n| `tools` | `dict[str, ToolDef]` | Yes | Map of tool name to ToolDef |\n| `required_steps` | `list[str]` | No | Tools that must be called before terminal_tool |\n| `terminal_tool` | `str` | Yes | Tool(s) that can end the workflow |\n| `system_prompt_template` | `str` | No | System prompt injected into context |\n\n资料来源：[src/forge/core/workflow.py]()\n\n## LLM Client API\n\n### Client Architecture\n\nForge provides backend-agnostic client adapters that implement a common interface:\n\n```mermaid\ngraph LR\n    A[WorkflowRunner] --> B[Client Interface]\n    B --> C[OllamaClient]\n    B --> D[LlamafileClient]\n    B --> E[LlamaServerClient]\n```\n\n### OllamaClient\n\n```python\nclient = OllamaClient(\n    model=\"ministral-3:8b-instruct-2512-q4_K_M\",\n    recommended_sampling=True\n)\n```\n\n### Sampling Defaults\n\nPer-model recommended sampling parameters are managed through `sampling_defaults.py`:\n\n```python\ndef apply_sampling_defaults(\n    model: str,\n    *,\n    strict: bool,\n) -> dict[str, float | int]:\n    \"\"\"Apply the recommended-sampling policy for model.\"\"\"\n```\n\n**Sampling Policy Quadrant:**\n\n| `strict` | Model in Map | Behavior |\n|----------|-------------|----------|\n| `True` | Yes | Return dict copy |\n| `True` | No | Raise `UnsupportedModelError` |\n| `False` | Yes | One-shot INFO log; return `{}` |\n| `False` | No | Return `{}` (silent) |\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-80]()\n\n### ToolCall Response Model\n\nThe `ToolCall` class represents a validated tool invocation 
returned by an LLM client:\n\n```python\nclass ToolCall(BaseModel):\n    tool: str\n```\n\nAdditional fields may be populated by client implementations (e.g., `args`, `reasoning`).\n\n资料来源：[src/forge/core/workflow.py:74-77]()\n\n## Guardrails API\n\nThe guardrails system provides middleware for orchestrating LLM responses with built-in validation, step enforcement, and error handling.\n\n### Guardrails Class\n\nThe main entry point for the guardrails system:\n\n```python\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n    max_retries=3,\n    max_tool_errors=2,\n    rescue_enabled=True,\n    max_premature_attempts=3\n)\n```\n\n**Constructor Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `tool_names` | `list[str]` | Required | Valid tool names for this workflow |\n| `required_steps` | `list[str]` | `None` | Tools that must be called before terminal_tool |\n| `terminal_tool` | `str \\| frozenset` | Required | Tool(s) that can end the workflow |\n| `max_retries` | `int` | `3` | Consecutive bad responses before fatal |\n| `max_tool_errors` | `int` | `2` | Consecutive tool failures before exhaustion |\n| `rescue_enabled` | `bool` | `True` | Attempt to parse tool calls from plain text |\n| `max_premature_attempts` | `int` | `3` | Premature terminal attempts before fatal |\n| `retry_nudge` | `Callable[[str], str]` | `None` | Custom nudge for bare text responses |\n\n资料来源：[src/forge/guardrails/guardrails.py:1-80]()\n\n### CheckResult\n\nThe return type of `Guardrails.check()`:\n\n```python\nclass CheckResult:\n    action: Literal[\"execute\", \"retry\", \"step_blocked\", \"fatal\"]\n    tool_calls: list[ToolCall] | None\n    nudge: Nudge | None\n    reason: str | None\n```\n\n**Action Meanings:**\n\n| Action | Description |\n|--------|-------------|\n| `execute` | Safe to proceed; `tool_calls` contains 
valid calls |\n| `retry` | Invalid response; inject `nudge` and retry |\n| `step_blocked` | Attempted terminal tool before required steps |\n| `fatal` | Max retries exhausted; `reason` contains explanation |\n\n### Two-Method Guardrails API\n\n```python\n# After each LLM response\nresult = guardrails.check(response)\n\nif result.action == \"fatal\":\n    return f\"FATAL: {result.reason}\"\n\nif result.action in (\"retry\", \"step_blocked\"):\n    return f\"{result.action}: {result.nudge.content}\"\n\n# result.action == \"execute\"\n# Run tools yourself, then record results\ntool_calls = result.tool_calls\nexecuted = [tc.tool for tc in tool_calls]\ndone = guardrails.record(executed)\n```\n\n资料来源：[src/forge/guardrails/guardrails.py:82-130]()\n\n### Granular API\n\nFor advanced use cases, individual guardrail components can be used directly:\n\n```python\nfrom forge.guardrails import ResponseValidator, StepEnforcer, ErrorTracker\n\nvalidator = ResponseValidator(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    rescue_enabled=True,\n)\nenforcer = StepEnforcer(\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\nerrors = ErrorTracker(max_retries=3, max_tool_errors=2)\n```\n\n## Server Management API\n\n### ServerManager\n\nThe `ServerManager` class handles lifecycle management for llama.cpp-based backends:\n\n```python\nserver = ServerManager(backend=\"llamaserver\", port=8080)\n```\n\n**Backend Options:**\n\n| Backend | Description |\n|---------|-------------|\n| `\"ollama\"` | Ollama server (model name, no GGUF path) |\n| `\"llamaserver\"` | llama.cpp server via `llama-server` |\n| `\"llamafile\"` | Mozilla llamafile binary |\n\n### Budget Resolution\n\nThe `resolve_budget()` method determines context length based on mode:\n\n```python\nasync def resolve_budget(\n    self,\n    mode: BudgetMode,\n    manual_tokens: int | None = None,\n) -> int:\n```\n\n| Mode | Behavior |\n|------|----------|\n| `MANUAL` | Use `manual_tokens` 
directly |\n| `FORGE_FAST` / `FORGE_DEEP` | Query server `/props` for context |\n\n资料来源：[src/forge/server.py:1-100]()\n\n## Context Management\n\n### ContextManager\n\nToken budget management for long-running conversations:\n\n```python\nctx = ContextManager(\n    strategy=TieredCompact(keep_recent=2),\n    budget_tokens=8192\n)\n```\n\n### BudgetMode Enum\n\n```python\nclass BudgetMode(Enum):\n    MANUAL = \"manual\"\n    FORGE_FAST = \"forge_fast\"\n    FORGE_DEEP = \"forge_deep\"\n```\n\n资料来源：[src/forge/server.py]()\n\n## Error Types\n\nForge defines custom exceptions for specific error conditions:\n\n```python\nclass UnsupportedModelError(Exception):\n    \"\"\"Raised when strict sampling defaults are requested for unknown models.\"\"\"\n    pass\n```\n\nAdditional error types in `errors.py`:\n\n| Error | Use Case |\n|-------|----------|\n| `BudgetResolutionError` | Server unreachable or missing n_ctx |\n| `BackendError` | Backend communication failures |\n\n资料来源：[src/forge/errors.py]()\n\n## Quick Start Example\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import (\n    Workflow, ToolDef, ToolSpec,\n    WorkflowRunner, OllamaClient,\n    ContextManager, TieredCompact,\n)\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. 
Use the available tools to answer the user.\",\n)\n\nasync def main():\n    client = OllamaClient(model=\"ministral-3:8b-instruct-2512-q4_K_M\", recommended_sampling=True)\n    ctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Summary\n\nThe forge module structure provides:\n\n1. **Tool Definition** — `ToolSpec` and `ToolDef` for declaring LLM-callable functions with prerequisites\n2. **Workflow Orchestration** — `Workflow` and `WorkflowRunner` for managing multi-step tasks\n3. **Client Abstraction** — Backend-agnostic clients with sampling defaults\n4. **Guardrails Middleware** — Built-in validation, step enforcement, and error handling\n5. **Server Management** — Lifecycle control for llama.cpp backends\n6. **Context Management** — Token budget and compaction strategies\n\n---\n\n<a id='page-adr-index'></a>\n\n## Architecture Decision Records\n\n### 相关页面\n\n相关主题：[System Architecture](#page-architecture)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- 
[src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n</details>\n\n# Architecture Decision Records\n\n## Overview\n\nArchitecture Decision Records (ADRs) serve as the authoritative documentation for significant design choices within the forge project. They capture the *why* behind implementation decisions, enabling current and future contributors to understand the reasoning without reconstructing the original context.\n\nThe forge project stores ADRs in `docs/decisions/`, using a numbered naming convention (e.g., `001-ablation-framework.md`, `011-guardrail-middleware.md`, `013-text-response-intent.md`). This numbering scheme allows for easy chronological tracking and establishes precedent relationships between decisions.\n\n## Purpose and Scope\n\nADRs in forge address several critical aspects:\n\n| Category | Description | Example Documents |\n|----------|-------------|-------------------|\n| Framework Design | Ablation study methodology and tooling | `001-ablation-framework.md` |\n| Middleware Patterns | Guardrail implementation and composition | `011-guardrail-middleware.md` |\n| Response Handling | Intent classification for text responses | `013-text-response-intent.md` |\n| Backend Integration | LLM client configuration and sampling defaults | `sampling_defaults.py` |\n| Server Management | Context resolution and budget modes | `server.py` |\n\nEach ADR documents not only the chosen approach but also considered alternatives and the tradeoffs that influenced the final decision. 
This creates a historical record that prevents repeated debates over settled questions while enabling informed reconsideration when circumstances change.\n\n## ADR Contribution Workflow\n\nAccording to the contribution guidelines, the process for introducing a new architecture decision follows a structured review pattern:\n\n```mermaid\ngraph TD\n    A[Identify Design Decision] --> B[Review Existing ADRs]\n    B --> C{Decision Already Documented?}\n    C -->|Yes| D[Reference Existing ADR]\n    C -->|No| E[Draft New ADR]\n    E --> F[Propose ADR Format]\n    F --> G[Review Against Project Standards]\n    G --> H[Merge and Publish]\n    \n    style A fill:#e1f5fe\n    style H fill:#c8e6c9\n```\n\nThe contribution workflow integrates with the broader project development cycle:\n\n1. **Proposal Phase**: Before implementing significant changes, contributors should draft an ADR following the established format\n2. **Review Phase**: The ADR undergoes peer review alongside code review\n3. **Adoption Phase**: Once approved, the ADR becomes the reference for implementation decisions\n4. **Maintenance Phase**: ADRs may be updated if subsequent decisions supersede them\n\n资料来源：[CONTRIBUTING.md:1-50]()\n\n## Core Architecture Components\n\n### Workflow Engine\n\nThe `Workflow` and `WorkflowRunner` classes form the central orchestration layer. 
A workflow defines the available tools, required execution steps, and terminal conditions.\n\n```mermaid\ngraph TD\n    subgraph Workflow Definition\n        W[Workflow] --> TD[Tool Definitions]\n        W --> RS[Required Steps]\n        W --> TT[Terminal Tool]\n        W --> SP[System Prompt Template]\n    end\n    \n    subgraph Execution Layer\n        WR[WorkflowRunner] --> CM[Context Manager]\n        WR --> GR[Guardrails]\n        WR --> CL[LLM Client]\n    end\n    \n    subgraph Tool Layer\n        TC[ToolCall] --> T[Tool Execution]\n        T --> TR[Tool Response]\n    end\n    \n    WR --> TC\n    TC -->|Result| CM\n    CM -->|Context| WR\n    \n    style W fill:#fff3e0\n    style WR fill:#e3f2fd\n```\n\nThe `ToolDef` dataclass binds tool schemas to implementations, while `ToolSpec` defines the JSON Schema for parameter validation. Tool calls are represented as `ToolCall` objects containing the tool name and arguments.\n\n资料来源：[src/forge/core/workflow.py:1-100]()\n\n### Guardrail Middleware\n\nThe guardrail system provides a composable validation layer that intercepts LLM responses before tool execution:\n\n```mermaid\ngraph LR\n    LLM[LLM Response] --> GR[Guardrails.check]\n    GR --> RV[ResponseValidator]\n    GR --> SE[StepEnforcer]\n    GR --> ET[ErrorTracker]\n    \n    RV -->|Valid| TC[ToolCalls]\n    RV -->|Invalid| NR[Retry Nudge]\n    SE -->|Correct Order| TC\n    SE -->|Wrong Order| SB[Step Blocked]\n    ET -->|OK| TC\n    ET -->|Max Errors| FT[Fatal]\n    \n    style GR fill:#fce4ec\n    style TC fill:#c8e6c9\n```\n\nThe `Guardrails` class orchestrates three sub-components:\n\n| Component | Responsibility | Key Parameters |\n|-----------|---------------|----------------|\n| `ResponseValidator` | Parses tool calls, enables rescue parsing | `rescue_enabled`, `retry_nudge_fn` |\n| `StepEnforcer` | Ensures required steps precede terminal tool | `required_steps`, `max_premature_attempts` |\n| `ErrorTracker` | Tracks consecutive errors and retries | 
`max_retries`, `max_tool_errors` |\n\n资料来源：[src/forge/guardrails/guardrails.py:1-100]()\n\n### Server Management and Budget Resolution\n\nThe `ServerManager` handles lifecycle management for llama.cpp-based backends, while the `ContextManager` implements token budget strategies:\n\n```mermaid\ngraph TD\n    SM[ServerManager] --> BM[BudgetMode]\n    BM -->|FORGE_FAST| FT[Fast Budget]\n    BM -->|FORGE_BALANCED| BT[Balanced Budget]\n    BM -->|FORGE_DEEP| DT[Deep Budget]\n    BM -->|MANUAL| MT[Manual Tokens]\n    \n    CM[ContextManager] --> TC[TieredCompact]\n    CM --> SC[SimpleCompact]\n    \n    SM -->|Context Query| Props[\"/props endpoint\"]\n    Props -->|n_ctx| CM\n```\n\nBudget resolution follows platform-specific paths:\n\n- **Ollama**: Uses `manual_tokens` parameter for `MANUAL` mode\n- **Llamafile/Llama Server**: Queries `/props` endpoint for server-configured context length\n\n资料来源：[src/forge/server.py:1-100]()\n\n## Sampling Configuration System\n\nThe sampling defaults system separates lookup from policy, enabling fine-grained control over model parameters:\n\n```mermaid\ngraph TD\n    subgraph Lookup Layer\n        GM[get_model_defaults] --> MAP[MODEL_SAMPLING_DEFAULTS]\n    end\n    \n    subgraph Policy Layer\n        AS[apply_sampling_defaults] --> |strict=True| KR[Known + Known]\n        AS --> |strict=False| KU[Known + Unknown]\n        KR -->|In Map| ReturnDict[Return Dict]\n        KU -->|In Map| InfoLog[INFO Log Once]\n    end\n    \n    subgraph Client Integration\n        OC[OllamaClient] --> AS\n        LC[LlamafileClient] --> AS\n        AC[AnthropicClient] --> AS\n    end\n```\n\nThe two-function design (`get_sampling_defaults` for pure lookup, `apply_sampling_defaults` for policy) ensures that:\n\n- Unknown models don't cause errors when `strict=False`\n- Known models log a one-time INFO message when not opted in\n- Explicit opt-in via `recommended_sampling=True` enables strict 
behavior\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-100]()\n\n## Proxy Server Architecture\n\nThe `ProxyServer` provides a forwarding layer with additional control features:\n\n```mermaid\ngraph TD\n    subgraph Proxy Layer\n        PS[ProxyServer] --> SF[Serialize Flag]\n        PS --> RT[Retry Logic]\n        PS --> RC[Rescue Parser]\n    end\n    \n    subgraph Backend Routing\n        PS --> Ollama[Ollama Backend]\n        PS --> Llama[Llama Backend]\n        PS --> LLF[Llamafile Backend]\n    end\n    \n    subgraph Configuration\n        SF --> |serialize=True| Serial[Serialize Requests]\n        SF --> |serialize=False| Parallel[Parallel Requests]\n        RT --> |max_retries=N| RetryN[N Attempts]\n    end\n```\n\nKey proxy options include:\n\n| Flag | Default | Purpose |\n|------|---------|---------|\n| `--host` | `127.0.0.1` | Proxy listen address |\n| `--port` | `8081` | Proxy listen port |\n| `--serialize` | `None` | Request serialization control |\n| `--max-retries` | `3` | Retries per request |\n| `--no-rescue` | `False` | Disable rescue parsing |\n\n资料来源：[src/forge/proxy/__main__.py:1-80]()\n\n## ADR Format and Standards\n\nEach ADR in the forge repository follows a consistent structure:\n\n1. **Title**: Descriptive name with ADR number\n2. **Status**: Proposed, Accepted, Deprecated, or Superseded\n3. **Context**: Background and problem statement\n4. **Decision**: The chosen approach with rationale\n5. **Consequences**: Benefits, drawbacks, and tradeoffs\n6. **Related Decisions**: Links to dependent or related ADRs\n\nThis format ensures that future maintainers can quickly assess whether an ADR is current and understand the full context of each decision.\n\n## Versioning and Evolution\n\nThe CHANGELOG maintains a parallel record of implementation milestones, cross-referenced with ADRs. 
Major architectural changes increment the minor version number, while bug fixes increment the patch version (semantic versioning).\n\nChanges that require ADR updates include:\n\n- New LLM backend support\n- Guardrail algorithm modifications\n- Context management strategy changes\n- Tool execution model alterations\n- Breaking API changes\n\n资料来源：[CHANGELOG.md:1-100]()\n\n## Best Practices for ADR Readers\n\nWhen reviewing ADRs to understand forge's architecture:\n\n1. **Start with the index**: The `docs/decisions/` directory lists all ADRs chronologically\n2. **Check status**: Deprecated ADRs indicate historical context, not current practice\n3. **Cross-reference implementations**: Source files in `src/forge/` implement ADR decisions\n4. **Review CHANGELOG**: Implementation dates and version numbers provide temporal context\n5. **Examine tests**: Unit tests in `tests/unit/` validate ADR-enforced behaviors\n\n---\n\n<a id='page-workflowrunner'></a>\n\n## WorkflowRunner and Agentic Loop\n\n### 相关页面\n\n相关主题：[System Architecture](#page-architecture)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/core/runner.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/runner.py)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/core/steps.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/steps.py)\n- [src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/guardrails/response_validator.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/response_validator.py)\n- [src/forge/guardrails/step_enforcer.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/step_enforcer.py)\n- [src/forge/guardrails/error_tracker.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/error_tracker.py)\n- 
[src/forge/prompts/nudges.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/prompts/nudges.py)\n</details>\n\n# WorkflowRunner and Agentic Loop\n\n## Overview\n\nThe `WorkflowRunner` is the central execution engine in the Forge framework, implementing an **agentic loop** that orchestrates multi-step tool-calling workflows with an LLM backend. It manages the complete lifecycle of a workflow: from initializing messages, through iterative LLM inference and tool execution, to context management and termination.\n\n**Core responsibilities:**\n\n1. Building initial message lists (system prompt + user input)\n2. Coordinating LLM inference with streaming or batch responses\n3. Validating and executing tool calls returned by the LLM\n4. Managing context budget through the `ContextManager`\n5. Enforcing required step sequences via the `StepEnforcer`\n6. Handling retries for malformed responses\n7. Terminating on terminal tool execution or max iterations\n\n资料来源：[src/forge/core/runner.py:1-50]()\n\n---\n\n## Architecture\n\n### Component Relationships\n\n```mermaid\ngraph TD\n    subgraph \"Forge Agentic System\"\n        WR[WorkflowRunner] --> CM[ContextManager]\n        WR --> IV[Inference Validator]\n        WR --> SE[StepEnforcer]\n        WR --> ET[ErrorTracker]\n        CM --> CC[Compaction Chain]\n        IV --> RV[ResponseValidator]\n    end\n    \n    WR --> Client[LLMClient]\n    Client --> Ollama[OllamaClient]\n    Client --> LlamaServer[LlamafileClient]\n    Client --> Proxy[ProxyClient]\n    \n    WF[Workflow] --> WR\n    WF --> TD[ToolDefs]\n    TD --> Impl[Callable Implementations]\n```\n\n资料来源：[src/forge/core/runner.py:1-50]()\n\n### Agentic Loop Flow\n\nThe `WorkflowRunner.run()` method implements a 7-phase agentic loop:\n\n```mermaid\ngraph TD\n    A[Start: User Query] --> B[\"1. Build Messages<br/>system_prompt + user_input\"]\n    B --> C[\"2. 
Send to LLM<br/>Streaming or Batch\"]\n    C --> D{Response Type?}\n    D -->|TextResponse<br/>Malformed/Refusal| E[\"3. Retry with Nudge\"]\n    E --> C\n    D -->|ToolCall| F[\"4. Validate Tool Calls<br/>Schema + Prerequisites\"]\n    F --> G{\"5. Execute Tools<br/>Batch-aware\"}\n    G --> H{Success?}\n    H -->|Error| I[\"Track Error<br/>Retry if < max_tool_errors\"]\n    I --> C\n    H -->|Success| J[\"6. Compact Context<br/>ContextManager\"]\n    J --> K{\"7. Check Terminal<br/>terminal_tool called?\"}\n    K -->|No| L[\"Increment iteration<br/>Check < max_iterations\"]\n    L -->|Yes| C\n    K -->|Yes| M[Done]\n    L -->|No| N[MaxIterationsError]\n    \n    style E fill:#ffcccc\n    style N fill:#ffcccc\n    style M fill:#ccffcc\n```\n\n资料来源：[src/forge/core/runner.py:50-200]()\n\n---\n\n## Core Data Models\n\n### Workflow\n\nThe `Workflow` dataclass defines the complete task specification:\n\n```python\n@dataclass\nclass Workflow:\n    name: str\n    description: str\n    tools: dict[str, ToolDef]\n    required_steps: list[str] = field(default_factory=list)\n    terminal_tool: str | frozenset[str] = \"\"\n    system_prompt_template: str = \"\"\n    max_retries: int = 3\n    cancel_event: asyncio.Event | None = None\n```\n\n资料来源：[src/forge/core/workflow.py:1-100]()\n\n| Field | Type | Description |\n|-------|------|-------------|\n| `name` | `str` | Workflow identifier |\n| `description` | `str` | Human-readable task description |\n| `tools` | `dict[str, ToolDef]` | Tool name → ToolDef mapping |\n| `required_steps` | `list[str]` | Tool names that must execute before terminal tool |\n| `terminal_tool` | `str \\| frozenset[str]` | Tool(s) that end the workflow |\n| `system_prompt_template` | `str` | System prompt for the LLM |\n| `max_retries` | `int` | Consecutive bad responses before fatal (default: 3) |\n| `cancel_event` | `asyncio.Event \\| None` | Optional cancellation signal |\n\n### ToolDef\n\n`ToolDef` binds a tool's schema to its 
implementation:\n\n```python\n@dataclass\nclass ToolDef:\n    spec: ToolSpec\n    callable: Callable[..., Any]\n    prerequisites: list[str | dict[str, str]] = field(default_factory=list)\n```\n\n资料来源：[src/forge/core/workflow.py:100-130]()\n\n**Prerequisites** express conditional dependencies:\n\n| Type | Example | Behavior |\n|------|---------|----------|\n| `str` | `\"read_file\"` | Any prior call to `read_file` satisfies it |\n| `dict` | `{\"tool\": \"read_file\", \"match_arg\": \"path\"}` | Prior call with matching `path` value required |\n\n### ToolSpec\n\nDefines the tool's interface for LLM communication using Pydantic:\n\n```python\n@dataclass\nclass ToolSpec:\n    name: str\n    description: str\n    parameters: type[BaseModel]\n```\n\n资料来源：[src/forge/core/workflow.py:40-60]()\n\n### ToolCall\n\nValidated tool invocation returned by the LLM:\n\n```python\nclass ToolCall(BaseModel):\n    tool: str\n    args: dict[str, Any] = Field(default_factory=dict)\n```\n\n资料来源：[src/forge/core/workflow.py:160-170]()\n\n---\n\n## StepTracker\n\nThe `StepTracker` maintains workflow state outside the message history, tracking which required steps have been completed and what tools have been executed with what arguments.\n\n```python\n@dataclass\nclass StepTracker:\n    required_steps: list[str]\n    completed_steps: dict[str, None] = field(default_factory=dict)\n    executed_tools: dict[str, list[dict[str, Any]]] = field(default_factory=dict)\n    \n    def record(self, tool_name: str, args: dict[str, Any] | None = None) -> None\n    def is_satisfied(self) -> bool\n    def pending(self) -> list[str]\n```\n\n资料来源：[src/forge/core/steps.py:20-50]()\n\n**Key behaviors:**\n\n- `completed_steps` uses `dict` (insertion order preserved) to track step order\n- `executed_tools` stores argument history for arg-matched prerequisites\n- Compaction cannot invalidate step completion (state lives outside message history)\n\n### PrerequisiteCheck\n\n```python\n@dataclass\nclass 
PrerequisiteCheck:\n    satisfied: bool\n    missing: list[str]\n```\n\n资料来源：[src/forge/core/steps.py:8-15]()\n\n---\n\n## Guardrails System\n\nThe guardrails system provides composable middleware for reliability without requiring `WorkflowRunner`:\n\n```mermaid\ngraph LR\n    subgraph \"Guardrails Bundle\"\n        G[Guardrails]\n        G --> RV[ResponseValidator]\n        G --> SE[StepEnforcer]\n        G --> ET[ErrorTracker]\n    end\n    \n    Input[LLM Response] --> G\n    G --> Output[CheckResult]\n```\n\n资料来源：[src/forge/guardrails/__init__.py:1-30]()\n\n### Guardrails (Bundled API)\n\n```python\nclass Guardrails:\n    def __init__(\n        self,\n        tool_names: list[str],\n        terminal_tool: str | frozenset[str],\n        required_steps: list[str] | None = None,\n        max_retries: int = 3,\n        max_tool_errors: int = 2,\n        rescue_enabled: bool = True,\n        max_premature_attempts: int = 3,\n        retry_nudge: Callable[[str], str] | None = None,\n    ) -> None\n    \n    def check(self, response: LLMResponse) -> CheckResult\n    def record(self, executed_tools: list[str]) -> bool\n```\n\n资料来源：[src/forge/guardrails/guardrails.py:60-100]()\n\n### CheckResult\n\n```python\n@dataclass\nclass CheckResult:\n    action: Literal[\"execute\", \"retry\", \"step_blocked\", \"fatal\"]\n    tool_calls: list[ToolCall] | None = None\n    nudge: Nudge | None = None\n    reason: str | None = None\n```\n\n资料来源：[src/forge/guardrails/guardrails.py:20-35]()\n\n| Action | Meaning |\n|--------|---------|\n| `execute` | Response is valid, proceed with tool calls |\n| `retry` | Bare text or malformed, retry with nudge |\n| `step_blocked` | Tried to call terminal tool prematurely |\n| `fatal` | Exhausted retries or too many errors |\n\n### ResponseValidator\n\nValidates LLM responses and extracts tool calls:\n\n- **Normal extraction**: Parses structured `tool_calls` from response\n- **Rescue parsing**: Handles plain text with regex-based tool extraction 
for models like Qwen Coder (XML format: `<function=name>...`)\n- **Retry nudging**: Injects guidance when model produces bare text\n\n资料来源：[src/forge/guardrails/response_validator.py:1-50]()\n\n### StepEnforcer\n\nEnforces required step sequences:\n\n```python\nclass StepEnforcer:\n    def check(\n        self,\n        tool_calls: list[ToolCall],\n        step_tracker: StepTracker,\n    ) -> StepCheck\n```\n\n资料来源：[src/forge/guardrails/step_enforcer.py:1-50]()\n\n**Nudge escalation tiers:**\n\n| Tier | Tone | Example |\n|------|------|---------|\n| 1 | Polite | \"You cannot call X yet. You must first complete: Y, Z.\" |\n| 2 | Direct | \"You must call one of these tools now: Y, Z. Pick one.\" |\n| 3 | Aggressive | \"STOP. You MUST call one of: Y, Z. Do NOT call X.\" |\n\n资料来源：[src/forge/prompts/nudges.py:1-40]()\n\n### ErrorTracker\n\nTracks consecutive errors for retry/exhaustion logic:\n\n```python\nclass ErrorTracker:\n    def record_retry(self) -> int\n    def record_tool_error(self) -> int\n    def reset(self) -> None\n    @property\n    def should_fatal(self) -> bool\n```\n\n资料来源：[src/forge/guardrails/error_tracker.py:1-40]()\n\n---\n\n## WorkflowRunner Configuration\n\n### Constructor Parameters\n\n```python\nclass WorkflowRunner:\n    def __init__(\n        self,\n        client: LLMClient,\n        context_manager: ContextManager,\n        max_iterations: int = 10,\n        max_retries_per_step: int = 3,\n        max_tool_errors: int = 2,\n        max_premature_attempts: int = 3,\n        retry_nudge: Callable[[str], str] | None = None,\n    ) -> None:\n```\n\n资料来源：[src/forge/core/runner.py:50-80]()\n\n| Parameter | Default | Description |\n|-----------|---------|-------------|\n| `client` | required | `LLMClient` instance (OllamaClient, LlamafileClient, ProxyClient) |\n| `context_manager` | required | `ContextManager` instance for context compaction |\n| `max_iterations` | `10` | Maximum LLM turns before raising `MaxIterationsError` |\n| 
`max_retries_per_step` | `3` | Consecutive bare-text responses before fatal |\n| `max_tool_errors` | `2` | Consecutive tool execution failures before exhaustion |\n| `max_premature_attempts` | `3` | Premature terminal tool attempts before fatal |\n| `retry_nudge` | `None` | Custom nudge function for bare text responses |\n\n### Run Method\n\n```python\nasync def run(\n    self,\n    workflow: Workflow,\n    user_input: str,\n    cancel_event: asyncio.Event | None = None,\n) -> list[Message]:\n```\n\n资料来源：[src/forge/core/runner.py:80-120]()\n\n**Returns:** List of all `Message` objects in the conversation history\n\n**Raises:**\n\n| Exception | Condition |\n|-----------|-----------|\n| `MaxIterationsError` | Exceeded `max_iterations` without reaching terminal tool |\n| `StepEnforcementError` | `max_premature_attempts` exceeded |\n| `PrerequisiteError` | Tool called without satisfying prerequisites |\n| `ToolCallError` | Tool name not found in workflow |\n| `ToolExecutionError` | Tool callable raised an exception |\n| `WorkflowCancelledError` | `cancel_event` was set |\n\n---\n\n## WorkflowRunner vs Guardrails\n\nForge provides two integration patterns:\n\n| Aspect | WorkflowRunner | Guardrails |\n|--------|---------------|------------|\n| **Abstraction** | Full agentic loop | Lightweight middleware |\n| **Context management** | Built-in via ContextManager | Handled externally |\n| **Best for** | Standard workflows | Custom orchestration loops |\n| **API surface** | `run()` | `check()` + `record()` |\n\n资料来源：[src/forge/guardrails/__init__.py:1-20]()\n\n### Guardrails Usage Pattern\n\n```python\nfrom forge.guardrails import Guardrails, CheckResult\n\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\n# After each LLM response\nresult = guardrails.check(llm_response)\n\nif result.action == \"execute\":\n    for tc in result.tool_calls:\n        
execute_tool(tc.tool, tc.args)\n    guardrails.record([tc.tool for tc in result.tool_calls])\nelif result.action == \"retry\":\n    # Re-prompt with nudge\n    pass\nelif result.action == \"fatal\":\n    # Stop\n    pass\n```\n\n---\n\n## Context Management\n\nThe `ContextManager` handles conversation context compaction:\n\n```mermaid\ngraph TD\n    A[New Tool Result] --> B{Token Budget?}\n    B -->|OK| C[Continue]\n    B -->|Exceeded| D[\"Compact Messages<br/>TieredCompact\"]\n    D --> C\n```\n\nThe runner automatically calls context compaction after each tool execution cycle. Compaction decisions use **real token counting** from backend-reported token usage.\n\n资料来源：[src/forge/core/runner.py:150-200]()\n\n### TieredCompact Strategy\n\nKeeps recent messages intact while compacting older history:\n\n- `keep_recent`: Number of recent message pairs to preserve\n- Older messages are summarized or removed\n\n---\n\n## Error Handling\n\n### Error Flow\n\n```mermaid\ngraph TD\n    A[Tool Execution] --> B{Success?}\n    B -->|Yes| C[Record Success]\n    B -->|No| D[Increment Error Count]\n    D --> E{max_tool_errors?}\n    E -->|Exceeded| F[Raise ToolExecutionError]\n    E -->|Not exceeded| G[Retry Tool Call]\n    G --> A\n    F --> H[Abort Workflow]\n```\n\n### Recovery Mechanisms\n\n| Error Type | Recovery Strategy |\n|------------|-------------------|\n| Malformed response | Retry with nudge (up to `max_retries_per_step`) |\n| Tool execution failure | Retry tool call (up to `max_tool_errors`) |\n| Premature terminal | Escalating nudge (3 tiers), then fatal |\n| Prerequisite violation | Prerequisite nudge, model re-plans |\n\n---\n\n## Cancellation Support\n\nWorkflows can be cancelled via `asyncio.Event`:\n\n```python\ncancel_event = asyncio.Event()\n\nrunner = WorkflowRunner(client=client, context_manager=ctx)\nasyncio.create_task(runner.run(workflow, user_input, cancel_event))\n\n# Later, from another task:\ncancel_event.set()  # Raises 
WorkflowCancelledError\n```\n\n资料来源：[src/forge/core/workflow.py:80-90]()\n\n---\n\n## Usage Examples\n\n### Basic Workflow\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import Workflow, ToolDef, ToolSpec, WorkflowRunner, OllamaClient\nfrom forge.context import ContextManager, TieredCompact\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant.\",\n)\n\nasync def main():\n    client = OllamaClient(model=\"ministral-3:8b-instruct-q4_K_M\", recommended_sampling=True)\n    ctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    messages = await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n资料来源：[README.md:1-60]()\n\n### Multi-Step Workflow with Required Steps\n\n```python\nworkflow = Workflow(\n    name=\"research\",\n    description=\"Research a topic with multiple steps\",\n    tools={\n        \"search\": ToolDef(spec=search_spec, callable=search_impl),\n        \"lookup\": ToolDef(\n            spec=lookup_spec,\n            callable=lookup_impl,\n            prerequisites=[\"search\"],  # Must search before lookup\n        ),\n        \"answer\": ToolDef(\n            spec=answer_spec,\n            callable=answer_impl,\n            prerequisites=[\"search\", \"lookup\"],  # All steps required\n        ),\n    },\n    
required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n    system_prompt_template=\"You are a research assistant.\",\n)\n```\n\n---\n\n## See Also\n\n- [User Guide](docs/USER_GUIDE.md) — Full workflow configuration and best practices\n- [Eval Guide](docs/EVAL_GUIDE.md) — Running evaluation scenarios\n- [ADR-011: Guardrail Middleware](docs/decisions/011-guardrail-middleware.md) — Design rationale for the guardrails system\n- [Backend Setup](docs/BACKEND_SETUP.md) — LLM backend configuration (Ollama, llamafile, llama.cpp)\n\n---\n\n<a id='page-guardrails-middleware'></a>\n\n## Guardrails Middleware for External Loops\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/guardrails/__init__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/__init__.py)\n- [examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n- [src/forge/prompts/nudges.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/prompts/nudges.py)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n</details>\n\n# Guardrails Middleware for External Loops\n\nForge's **Guardrails Middleware** provides a composable reliability layer that can be integrated into any external orchestration loop. 
Rather than requiring adoption of the full `WorkflowRunner`, external projects can embed forge's retry nudges, rescue parsing, step enforcement, and error tracking directly within their own agent execution frameworks.\n\n---\n\n## Overview\n\nThe guardrails system addresses a fundamental challenge in LLM tool-calling: models frequently produce malformed, premature, or incomplete tool invocations that require intervention rather than silent failure.\n\n| Concern | Component | Purpose |\n|---------|-----------|---------|\n| Malformed responses | `ResponseValidator` | Parse tool calls from text, retry on bad format |\n| Premature termination | `StepEnforcer` | Enforce required step sequence before terminal tool |\n| Consecutive failures | `ErrorTracker` | Count retries/tool errors, signal exhaustion |\n| User-facing messages | `Nudge` | Structured retry guidance injected into prompts |\n\n资料来源：[src/forge/guardrails/__init__.py:1-29]()\n\n---\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[\"LLM Response\"] --> B[\"Guardrails.check()\"]\n    B --> C{ResponseValidator}\n    C -->|Valid tool call| D{StepEnforcer}\n    C -->|Bare text / malformed| E[\"Generate retry_nudge\"]\n    D -->|Step violated| F[\"Generate premature_terminal_nudge\"]\n    D -->|All checks pass| G{ErrorTracker}\n    G -->|Under threshold| H[\"action: execute\"]\n    G -->|Over threshold| I[\"action: fatal\"]\n    \n    J[\"Tool Execution\"] --> K[\"Guardrails.record()\"]\n    K --> G\n```\n\nThe `Guardrails` class bundles three components into a single two-method API:\n\n```python\nclass Guardrails:\n    def check(self, response: LLMResponse) -> CheckResult: ...\n    def record(self, executed: list[str]) -> bool: ...\n```\n\n资料来源：[src/forge/guardrails/guardrails.py:86-109]()\n\n---\n\n## CheckResult Data Model\n\nEvery call to `check()` returns a `CheckResult` that encodes the appropriate action for the orchestration loop:\n\n| Field | Type | Description |\n|-------|------|-------------|\n| 
`action` | `Literal[\"execute\", \"retry\", \"step_blocked\", \"fatal\"]` | Directive for the caller |\n| `tool_calls` | `list[ToolCall] \\| None` | Parsed tool invocations (when action is `execute`) |\n| `nudge` | `Nudge \\| None` | Structured message to inject (when action is `retry` or `step_blocked`) |\n| `reason` | `str \\| None` | Human-readable explanation (only when action is `fatal`) |\n\n资料来源：[src/forge/guardrails/guardrails.py:66-85]()\n\n### Action Semantics\n\n| Action | Meaning | Caller Behavior |\n|--------|---------|-----------------|\n| `execute` | Response is valid; proceed to tool execution | Extract `tool_calls`, execute them, call `record()` |\n| `retry` | Response invalid but recoverable; inject nudge | Append `nudge` to messages, re-prompt model |\n| `step_blocked` | Terminal tool called before required steps | Append step-enforcement nudge, re-prompt |\n| `fatal` | Exhausted retry/error budget | Log `reason`, abort workflow |\n\n---\n\n## Guardrails Constructor Parameters\n\n```python\nGuardrails(\n    tool_names: list[str],\n    terminal_tool: str | frozenset[str],\n    required_steps: list[str] | None = None,\n    max_retries: int = 3,\n    max_tool_errors: int = 2,\n    rescue_enabled: bool = True,\n    max_premature_attempts: int = 3,\n    retry_nudge: Callable[[str], str] | None = None,\n)\n```\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `tool_names` | `list[str]` | — | Valid tool names for this workflow |\n| `terminal_tool` | `str \\| frozenset[str]` | — | Tool(s) that can end the workflow |\n| `required_steps` | `list[str] \\| None` | `None` | Tools that must be called before `terminal_tool` |\n| `max_retries` | `int` | `3` | Consecutive bad responses before `fatal` |\n| `max_tool_errors` | `int` | `2` | Consecutive tool execution failures before exhaustion |\n| `rescue_enabled` | `bool` | `True` | Attempt to parse tool calls from plain text |\n| `max_premature_attempts` | `int` | 
`3` | Premature terminal attempts before `fatal` |\n| `retry_nudge` | `Callable[[str], str] \\| None` | `None` | Custom nudge generator for bare text responses |\n\n资料来源：[src/forge/guardrails/guardrails.py:47-69]()\n\n---\n\n## Integration Patterns\n\n### Bundled API (Recommended)\n\nFor most integrations, use the two-method `Guardrails` interface:\n\n```python\nfrom forge.guardrails import Guardrails, CheckResult\n\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\ndef handle_response(response) -> str:\n    result = guardrails.check(response)\n    \n    if result.action == \"fatal\":\n        return f\"ABORT: {result.reason}\"\n    \n    if result.action in (\"retry\", \"step_blocked\"):\n        return f\"RETRY with nudge: {result.nudge.content}\"\n    \n    # Execute tools\n    executed = [tc.tool for tc in result.tool_calls]\n    done = guardrails.record(executed)\n    return f\"executed {executed}\" + (\" -- DONE\" if done else \"\")\n```\n\n资料来源：[examples/foreign_loop.py:60-78]()\n\n### Granular API (Advanced)\n\nFor fine-grained control, instantiate components directly:\n\n```python\nfrom forge.guardrails import ResponseValidator, StepEnforcer, ErrorTracker\n\nvalidator = ResponseValidator(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    rescue_enabled=True,\n)\nenforcer = StepEnforcer(\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tools=frozenset([\"answer\"]),\n)\nerrors = ErrorTracker(max_retries=3, max_tool_errors=2)\n\ndef handle_response_granular(response):\n    # Reset state for each new response\n    errors._consecutive_retries = 0\n    enforcer._premature_attempts = 0\n    \n    validation = validator.validate(response)\n    if not validation.valid:\n        return \"RETRY: \" + validation.nudge.content\n    \n    step_check = enforcer.check(response.tool_calls)\n    if step_check.blocked:\n        return 
\"BLOCKED: \" + step_check.nudge.content\n    \n    # Execute, then record\n    errors.record(tool_name=\"search\", error=None)\n    return \"proceed to execution\"\n```\n\n资料来源：[examples/foreign_loop.py:32-59]()\n\n---\n\n## Nudge Generation\n\nThe `nudges.py` module provides structured retry messages with three escalation tiers:\n\n```mermaid\ngraph LR\n    A[\"tier=1\"] --> B[\"Polite:<br/>'You cannot call X yet...'\"]\n    A --> C[\"tier=2\"]\n    C --> D[\"Direct:<br/>'You must call one of...'\"]\n    C --> E[\"tier=3\"]\n    E --> F[\"Aggressive:<br/>'STOP. You MUST call...'\"]\n```\n\n### Premature Terminal Nudge\n\nGenerated when the model attempts to call the terminal tool before completing required steps:\n\n```python\ndef premature_terminal_nudge(\n    terminal_tool: str,\n    pending_steps: list[str],\n    tier: int = 1,\n) -> str:\n    tier = max(1, min(3, tier))  # Clamped to 1-3\n```\n\n资料来源：[src/forge/prompts/nudges.py:1-26]()\n\n| Tier | Tone | Example Output |\n|------|------|----------------|\n| 1 | Polite | \"You cannot call `answer` yet. You must first complete these required steps: search, lookup. Call one of them now.\" |\n| 2 | Direct | \"You must call one of these tools now: search, lookup. Pick one.\" |\n| 3 | Aggressive | \"STOP. You MUST call one of: search, lookup. Do NOT call `answer`. 
Your next response MUST be a tool call to one of: search, lookup.\" |\n\n### Prerequisite Nudge\n\nGenerated when a tool is called without its prerequisites:\n\n```python\ndef prerequisite_nudge(tool_name: str, missing_prereqs: list[str]) -> str:\n```\n\n资料来源：[src/forge/prompts/nudges.py:28-46]()\n\n---\n\n## ToolCall Model\n\nThe `tool_calls` extracted from a valid response are represented as:\n\n```python\nclass ToolCall(BaseModel):\n    tool: str  # Tool name\n    # Additional fields inherited from LLMClient extraction\n```\n\nFor OpenAI-style tool definitions, the `ToolSpec` in `workflow.py` handles parameter schema conversion:\n\n```python\nclass ToolSpec:\n    name: str\n    description: str\n    parameters: type[BaseModel]  # Pydantic model\n    \n    @classmethod\n    def from_openai_schema(cls, name: str, description: str, schema: dict) -> ToolSpec:\n        properties = schema.get(\"properties\", {})\n        required = set(schema.get(\"required\", []))\n```\n\n资料来源：[src/forge/core/workflow.py:1-40]()\n\n---\n\n## Respond Tool Integration\n\nFor conversational use cases where the model may produce a final text response instead of a tool call, forge provides a special `respond` tool:\n\n```python\nfrom forge.tools import RESPOND_TOOL_NAME, respond_spec\n\nrespond_guardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\", RESPOND_TOOL_NAME],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\ndef handle_response_with_respond(response):\n    result = respond_guardrails.check(response)\n    \n    if result.action == \"fatal\":\n        return f\"FATAL: {result.reason}\"\n    \n    if result.action in (\"retry\", \"step_blocked\"):\n        return f\"{result.action}: {result.nudge.content[:80]}...\"\n    \n    tool_calls = result.tool_calls\n    \n    # Check if the model called respond\n    for tc in tool_calls:\n        if tc.tool == RESPOND_TOOL_NAME:\n            message = tc.args.get(\"message\", \"\")\n   
         return f\"MODEL SAYS: {message}\"\n    \n    executed = [tc.tool for tc in tool_calls]\n    done = respond_guardrails.record(executed)\n    return f\"executed {executed}\" + (\" -- DONE\" if done else \"\")\n```\n\n资料来源：[examples/foreign_loop.py:83-114]()\n\n---\n\n## Public API Reference\n\n```python\nfrom forge.guardrails import (\n    # Bundled API\n    CheckResult,\n    Guardrails,\n    \n    # Granular components\n    ErrorTracker,\n    Nudge,\n    ResponseValidator,\n    StepCheck,\n    StepEnforcer,\n    ValidationResult,\n)\n```\n\n资料来源：[src/forge/guardrails/__init__.py:7-27]()\n\n---\n\n## Design Rationale\n\nThe guardrails middleware was designed to solve the foreign-loop integration problem described in ADR-011. The core principle is **orthogonal concerns**: each component handles one aspect of reliability, and the `Guardrails` bundler coordinates them without hiding their individual behavior.\n\nKey design decisions:\n\n| Decision | Rationale |\n|----------|-----------|\n| Two-method API | Simple contract; orchestrator controls the loop |\n| `record()` returns `bool` | `True` signals terminal tool reached; orchestrator decides when to stop |\n| `Nudge` dataclass | Structured message type allows metadata (e.g., tier) beyond raw string |\n| `rescue_enabled` | Plain-text rescue parsing is enabled by default (`True`) but can be switched off, since some backends emit structured output natively |\n| `frozenset[str]` for terminal_tool | Multiple terminal tools supported without API change |\n\n资料来源：[src/forge/guardrails/__init__.py:1-10]()\n\n---\n\n<a id='page-proxy-server'></a>\n\n## Proxy Server Setup\n\n### 相关页面\n\n相关主题：[Backend Clients](#page-backend-clients)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n- [src/forge/proxy/proxy.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n- 
[src/forge/proxy/__init__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__init__.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [scripts/smoke_test_proxy.py](https://github.com/antoinezambelli/forge/blob/main/scripts/smoke_test_proxy.py)\n</details>\n\n# Proxy Server Setup\n\n## Overview\n\nThe forge proxy server is an OpenAI-compatible HTTP proxy that transparently applies forge's guardrail stack to any backend that speaks the Chat Completions API. It acts as a drop-in replacement for local model servers, enabling existing OpenAI-compatible clients to benefit from forge's reliability layer without code changes.\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Architecture\n\nThe proxy operates in two distinct modes, determined at startup:\n\n```mermaid\ngraph TD\n    subgraph \"Managed Mode\"\n        P[\"ProxyServer<br/>:8081\"] --> SM[\"ServerManager\"]\n        SM --> BE[\"llama-server<br/>llamafile<br/>ollama<br/>:8080\"]\n    end\n    \n    subgraph \"External Mode\"\n        P2[\"ProxyServer<br/>:8081\"] --> BU[\"User-managed<br/>Backend<br/>:8080\"]\n    end\n    \n    C[\"OpenAI Client\"] --> P\n    C2[\"OpenAI Client\"] --> P2\n```\n\n### Managed Mode\n\nIn managed mode, forge starts and controls the backend process lifecycle. The `ServerManager` class handles:\n\n- Backend binary discovery and execution\n- Model loading and initialization\n- Health verification via `/props` endpoint polling\n- Graceful shutdown and restart\n\n资料来源：[src/forge/proxy/proxy.py:1-40](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n### External Mode\n\nIn external mode, the proxy connects to a user-managed backend. 
This is useful when:\n\n- The backend runs on a different machine or container\n- Custom backend configurations are required\n- The backend is managed by an external orchestration system\n\n资料来源：[src/forge/proxy/proxy.py:35-45](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n## Supported Backends\n\n| Backend | Description | Requirements |\n|---------|-------------|--------------|\n| `llamaserver` | Llama.cpp's HTTP server | Local GGUF model file |\n| `llamafile` | Mozilla's single-file model executable | Single-file executable |\n| `ollama` | Ollama local inference server | Ollama runtime + model pulled |\n\n资料来源：[src/forge/proxy/__main__.py:25-29](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n\n## CLI Usage\n\n### Basic Invocation\n\n```bash\n# External mode — you manage the backend\npython -m forge.proxy --backend-url http://localhost:8080 --port 8081\n\n# Managed mode — forge starts llama-server and proxy together\npython -m forge.proxy --backend llamaserver --gguf path/to/model.gguf --port 8081\n\n# Managed mode with ollama\npython -m forge.proxy --backend ollama --model llama3.2 --port 8081\n```\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n### Command-Line Arguments\n\n| Argument | Type | Default | Description |\n|----------|------|---------|-------------|\n| `--backend-url` | string | - | External backend URL (mutually exclusive with `--backend`) |\n| `--backend` | choice | - | Backend type: `llamaserver`, `llamafile`, `ollama` |\n| `--model` | string | - | Model name (required for ollama) |\n| `--gguf` | string | - | Path to GGUF file (llamaserver/llamafile) |\n| `--backend-port` | int | 8080 | Backend port for managed mode |\n| `--budget-mode` | choice | backend | Context budget: `backend`, `manual`, `forge-full`, `forge-fast` |\n| `--budget-tokens` | int | - | Manual token budget override |\n| `--extra-flags` | list | - | Additional backend CLI flags 
|\n| `--host` | string | 127.0.0.1 | Proxy listen host |\n| `--port` | int | 8081 | Proxy listen port |\n| `--serialize` | flag | - | Force request serialization |\n| `--no-serialize` | flag | - | Disable request serialization |\n| `--max-retries` | int | 3 | Max retries per request |\n| `--no-rescue` | flag | - | Disable rescue parsing |\n| `-v, --verbose` | flag | - | Enable debug logging |\n\n资料来源：[src/forge/proxy/__main__.py:13-53](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n\n## Programmatic API\n\n### ProxyServer Class\n\nThe `ProxyServer` class provides a programmatic interface for embedding the proxy in Python applications.\n\n```python\nfrom forge.proxy import ProxyServer\n\n# External mode\nproxy = ProxyServer(backend_url=\"http://localhost:8080\")\nproxy.start()\nprint(f\"Proxy running at {proxy.url}\")  # http://127.0.0.1:8081\n# ... use proxy ...\nproxy.stop()\n```\n\n```python\n# Managed mode\nproxy = ProxyServer(\n    backend=\"llamaserver\",\n    gguf=\"model.gguf\",\n    budget_mode=\"forge-fast\",\n    port=8081\n)\nproxy.start()\nproxy.stop()  # Stops both backend and proxy\n```\n\n资料来源：[src/forge/proxy/proxy.py:50-75](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n### Constructor Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `backend_url` | `str \\| None` | None | External backend URL |\n| `backend` | `str \\| None` | None | Backend type: `llamaserver`, `llamafile`, `ollama` |\n| `model` | `str \\| None` | None | Model name for ollama |\n| `gguf` | `str \\| Path \\| None` | None | Path to GGUF file |\n| `backend_port` | int | 8080 | Backend port |\n| `budget_mode` | BudgetMode | BudgetMode.BACKEND | Context budget strategy |\n| `budget_tokens` | int | - | Manual token budget |\n| `extra_flags` | `list[str] \\| None` | None | Additional CLI flags |\n| `host` | str | 127.0.0.1 | Listen host |\n| `port` | int | 8081 | 
Listen port |\n| `serialize` | `bool \\| None` | None | Request serialization control |\n| `max_retries` | int | 3 | Max retries per request |\n| `rescue_enabled` | bool | True | Enable rescue parsing |\n\n资料来源：[src/forge/proxy/proxy.py:56-100](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n### Lifecycle Methods\n\n| Method | Description |\n|--------|-------------|\n| `start()` | Start the proxy (blocks until ready, max 120s timeout) |\n| `stop()` | Stop the proxy and managed backend (30s shutdown timeout) |\n| `url` | Property returning the proxy's base URL |\n\n资料来源：[src/forge/proxy/proxy.py:102-125](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n## Respond Tool Injection\n\n### Purpose\n\nSmall local models (~8B parameters) cannot reliably choose between text output and tool calls. The proxy automatically injects a synthetic `respond` tool when tools are present in the request, forcing the model into tool-calling mode.\n\n### Behavior\n\n1. When the request contains `tools`, forge injects a `respond(message=\"...\")` tool into the tools list\n2. The model calls `respond(message=\"...\")` instead of producing bare text\n3. The `respond` call is stripped from the outbound response\n4. 
The client receives a normal text response with `finish_reason: \"stop\"`\n\nThis keeps the model in tool-calling mode where forge's full guardrail stack applies.\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n```mermaid\nsequenceDiagram\n    participant C as OpenAI Client\n    participant P as ProxyServer\n    participant B as Backend\n    \n    C->>P: POST /v1/chat/completions<br/>(with tools)\n    P->>P: Inject respond tool\n    P->>B: Forward request<br/>(tools + respond)\n    B->>P: respond(message=\"answer\")\n    P->>P: Strip respond call\n    P->>C: Normal text response<br/>(finish_reason: \"stop\")\n```\n\n## Context Budget Modes\n\nThe proxy supports different strategies for managing context window usage:\n\n| Mode | Description |\n|------|-------------|\n| `backend` | Let the backend manage context (default) |\n| `manual` | Use `--budget-tokens` for fixed budget |\n| `forge-full` | Full tiered compaction strategy |\n| `forge-fast` | Fast tiered compaction (reduced) |\n\n资料来源：[src/forge/proxy/__main__.py:35-38](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n\n### Tiered Compaction\n\nThe `forge-full` and `forge-fast` modes utilize `TieredCompact`, a three-phase compaction strategy:\n\n1. **Truncate** — Remove oldest messages\n2. **Drop results** — Remove tool result content\n3. **Sliding window** — Maintain recent context\n\n资料来源：[src/forge/proxy/proxy.py:22](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n## Request Serialization\n\nBy default, the proxy handles concurrent requests independently. 
The serialization flags control this behavior:\n\n| Flag | Behavior |\n|------|----------|\n| (none) | Proxy decides based on backend capabilities |\n| `--serialize` | Force sequential request processing |\n| `--no-serialize` | Allow concurrent processing |\n\n资料来源：[src/forge/proxy/__main__.py:31-34](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n\n## Sampling Parameters Pass-Through\n\nThe proxy forwards OpenAI-compatible sampling fields directly to the backend without modification:\n\n- `temperature`\n- `top_p`\n- `top_k`\n- `min_p`\n- `repeat_penalty`\n- `presence_penalty`\n- `seed`\n\n资料来源：[CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md)\n\nTo use model-card-recommended sampling in proxy mode:\n\n```python\nfrom forge.clients import get_sampling_defaults\n\n# Look up recommended sampling parameters\nsampling = get_sampling_defaults(\"ministral-3-8b-instruct\")\n# Include in request body\nresponse = client.post(\"/v1/chat/completions\", json={\n    \"model\": \"ministral-3-8b-instruct\",\n    \"messages\": [...],\n    **sampling\n})\n```\n\n## Signal Handling\n\nThe proxy gracefully handles shutdown signals:\n\n- `SIGINT` (Ctrl+C) — Immediate shutdown\n- `SIGTERM` — Graceful shutdown\n\nThe main thread uses a timed sleep loop (`time.sleep(0.1)`) to allow Python to deliver signals between iterations, ensuring proper shutdown on Windows.\n\n资料来源：[src/forge/proxy/__main__.py:95-105](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n\n## Testing with Smoke Test Script\n\nThe repository includes a smoke test at `scripts/smoke_test_proxy.py` that:\n\n1. Starts a mock backend on port 18080\n2. Launches the proxy in external mode on port 18081\n3. Verifies health endpoint\n4. Sends a test chat completion request\n5. 
Validates the response structure\n\n```bash\npython scripts/smoke_test_proxy.py\n```\n\n资料来源：[scripts/smoke_test_proxy.py](https://github.com/antoinezambelli/forge/blob/main/scripts/smoke_test_proxy.py)\n\n## Health Endpoint\n\nThe proxy exposes a `/health` endpoint for monitoring:\n\n```bash\ncurl http://127.0.0.1:8081/health\n```\n\n资料来源：[scripts/smoke_test_proxy.py:70](https://github.com/antoinezambelli/forge/blob/main/scripts/smoke_test_proxy.py)\n\n## Configuration Example: Complete Setup\n\n```bash\n# Start llama-server with custom flags, proxy it\npython -m forge.proxy \\\n    --backend llamaserver \\\n    --gguf ./models/ministral-3-8b-instruct-q8_0.gguf \\\n    --model ministral-3-8b-instruct \\\n    --budget-mode forge-full \\\n    --backend-port 8080 \\\n    --port 8081 \\\n    --host 0.0.0.0 \\\n    --extra-flags --reasoning-format auto \\\n    --verbose\n```\n\nThen configure your client:\n\n```python\nimport httpx\n\nclient = httpx.AsyncClient(\n    base_url=\"http://localhost:8081/v1\",\n    timeout=120.0\n)\n\nresponse = await client.post(\"/chat/completions\", json={\n    \"model\": \"ministral-3-8b-instruct\",\n    \"messages\": [\n        {\"role\": \"user\", \"content\": \"What's the weather in Paris?\"}\n    ],\n    \"tools\": [\n        {\n            \"type\": \"function\",\n            \"function\": {\n                \"name\": \"get_weather\",\n                \"parameters\": {\n                    \"type\": \"object\",\n                    \"properties\": {\n                        \"city\": {\"type\": \"string\"}\n                    },\n                    \"required\": [\"city\"]\n                }\n            }\n        }\n    ]\n})\n```\n\n---\n\n<a id='page-backend-clients'></a>\n\n## Backend Clients\n\n### 相关页面\n\n相关主题：[Backend Setup Guide](#page-backend-setup), [Model Selection Guide](#page-model-guide)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- 
[src/forge/clients/base.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/base.py)\n- [src/forge/clients/ollama.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/ollama.py)\n- [src/forge/clients/llamafile.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/llamafile.py)\n- [src/forge/clients/anthropic.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/anthropic.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n</details>\n\n# Backend Clients\n\n## Overview\n\nThe Backend Clients subsystem provides a unified abstraction layer over various LLM backends, enabling forge to interact with different inference engines through a consistent interface. This modular design allows users to switch between Ollama, llamafile, and Anthropic backends without modifying workflow code.\n\nEach client handles backend-specific communication protocols, response parsing, streaming, and tool call extraction while exposing a common async API for send operations and context resolution.\n\n资料来源：[src/forge/clients/base.py:1-50]()\n\n## Architecture\n\n### Client Hierarchy\n\n```mermaid\ngraph TD\n    A[BaseClient] --> B[OllamaClient]\n    A --> C[LlamafileClient]\n    A --> D[AnthropicClient]\n    \n    E[sampling_defaults.py] --> B\n    E --> C\n    E --> D\n    \n    F[WorkflowRunner] --> B\n    F --> C\n    F --> D\n```\n\nAll clients inherit from `BaseClient`, which defines the core async interface including `send()`, `send_stream()`, and `get_context_length()` methods. 
Backend-specific implementations override these methods to handle vendor-specific APIs and response formats.\n\n资料来源：[src/forge/clients/base.py:1-100]()\n\n### Common Interface\n\nAll clients implement the following async methods:\n\n| Method | Purpose |\n|--------|---------|\n| `send(messages, tools, **kwargs)` | Send a request and receive a complete response |\n| `send_stream(messages, tools, **kwargs)` | Stream responses as an async generator |\n| `get_context_length()` | Query the backend for maximum context window |\n| `stop()` | Stop any ongoing generation |\n\n资料来源：[src/forge/clients/base.py:50-150]()\n\n## OllamaClient\n\n### Purpose\n\nThe `OllamaClient` connects to a local Ollama server instance, supporting both standard models and GGUF-formatted models served through Ollama's model management system.\n\n资料来源：[src/forge/clients/ollama.py:1-100]()\n\n### Configuration Options\n\n```python\nOllamaClient(\n    model: str,                          # Model name (e.g., \"qwen3:8b-q4_K_M\")\n    base_url: str = \"http://localhost:11434\",\n    recommended_sampling: bool = False, # Use verified per-model sampling params\n    **kwargs                            # Passed to httpx client\n)\n```\n\n### Key Features\n\n- **Recommended Sampling**: When `recommended_sampling=True`, the client retrieves verified sampling parameters from `forge.clients.sampling_defaults` for known models. 
If a model is not in the map and `strict=True`, an `UnsupportedModelError` is raised.\n\n- **Streaming Support**: Full streaming support with token-level async generation through `send_stream()`.\n\n- **Tool Call Extraction**: Parses Ollama's JSON tool call format and converts to forge's internal `ToolCall` format.\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-80]()\n\n## LlamafileClient\n\n### Purpose\n\nThe `LlamafileClient` communicates with llamafile or llama-server instances, providing support for GGUF models served directly without Ollama's model management layer.\n\n资料来源：[src/forge/clients/llamafile.py:1-100]()\n\n### Context Resolution\n\nUnlike Ollama, llamafile and llama-server require querying the `/props` endpoint to determine the configured context length:\n\n```python\nasync def get_context_length(self) -> int | None:\n    \"\"\"Query the Llamafile /props endpoint for configured context length.\"\"\"\n    base = self.base_url.rstrip(\"/\")\n    if base.endswith(\"/v1\"):\n        base = base[:-3]\n\n    resp = await self._http.get(f\"{base}/props\")\n    data = resp.json()\n    n_ctx = data.get(\"default_generation_settings\", {}).get(\"n_ctx\")\n    return int(n_ctx) if n_ctx is not None else None\n```\n\n资料来源：[src/forge/clients/llamafile.py:180-200]()\n\n### Tool Call Modes\n\nThe client supports multiple tool call parsing strategies:\n\n| Mode | Description |\n|------|-------------|\n| `native` | Uses backend's native tool call format |\n| `function` | Parses `<function=name>...</function>` style tags |\n| `prompt` | Extracts tool calls from prompted responses |\n\n资料来源：[src/forge/clients/llamafile.py:100-180]()\n\n## AnthropicClient\n\n### Purpose\n\nThe `AnthropicClient` integrates with Anthropic's Claude API, enabling forge workflows to leverage Claude Opus, Sonnet, and Haiku models.\n\n资料来源：[src/forge/clients/anthropic.py:1-100]()\n\n### Key Differences\n\n- No hardcoded temperature defaults — relies on Anthropic API's own defaults\n- 
Supports Anthropic-specific headers and request formatting\n- Compatible with tools via Anthropic's tool use API\n\n## Sampling Defaults System\n\n### Overview\n\nThe `sampling_defaults` module provides verified per-model sampling parameters sourced from HuggingFace model cards. This ensures optimal generation quality for supported models without requiring users to manually tune hyperparameters.\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-50]()\n\n### Supported Parameters\n\n| Parameter | Description | Typical Range |\n|-----------|-------------|---------------|\n| `temperature` | Sampling temperature | 0.0 - 1.0 |\n| `top_p` | Nucleus sampling threshold | 0.0 - 1.0 |\n| `top_k` | Top-k sampling | 1 - 100 |\n| `min_p` | Minimum probability threshold | 0.0 - 1.0 |\n| `repeat_penalty` | Repetition penalty | 0.0 - 2.0 |\n| `presence_penalty` | Presence penalty (OpenAI compat) | -2.0 - 2.0 |\n\n### Policy Behavior\n\nThe `apply_sampling_defaults()` function implements a four-quadrant policy:\n\n```python\ndef apply_sampling_defaults(model: str, *, strict: bool) -> dict[str, float | int]:\n    \"\"\"Apply the recommended-sampling policy for model.\"\"\"\n    in_map = model in MODEL_SAMPLING_DEFAULTS\n    if strict:\n        if not in_map:\n            raise UnsupportedModelError(model)\n        return dict(MODEL_SAMPLING_DEFAULTS[model])\n    \n    # strict=False: one-shot INFO log if known, else silent\n    if in_map and model not in _INFO_LOGGED:\n        log.info(\"Recommended sampling params exist for %r...\", model)\n        _INFO_LOGGED.add(model)\n    return {}\n```\n\n| `strict` | Model in Map | Behavior |\n|----------|--------------|----------|\n| `True` | Yes | Return dict copy |\n| `True` | No | Raise `UnsupportedModelError` |\n| `False` | Yes | One-shot INFO log; return `{}` |\n| `False` | No | Return `{}` (silent) |\n\n资料来源：[src/forge/clients/sampling_defaults.py:60-120]()\n\n### Verified Models\n\nThe following model families are currently supported 
with verified sampling parameters:\n\n- Qwen3 / Qwen3.5 / Qwen3.6\n- Qwen3-Coder\n- Gemma 4\n- Mistral Small 3.2\n- Devstral Small 2\n- Ministral 3 Instruct + Reasoning\n- Mistral Nemo\n- Granite 4.0\n\nEach entry includes an inline HuggingFace card URL comment for verification.\n\n资料来源：[src/forge/clients/sampling_defaults.py:50-80]()\n\n## Tool Call Processing\n\n### Extraction Flow\n\n```mermaid\ngraph TD\n    A[LLM Response] --> B{Response Type}\n    B -->|tool_calls| C[extract_tool_call]\n    B -->|text| D[TextResponse]\n    \n    C --> E{Tool Call Format}\n    E -->|OpenAI style| F[Parse name + arguments]\n    E -->|function tags| G[Parse XML-style tags]\n    E -->|dict style| H[Parse dict with name field]\n    \n    F --> I[ToolCall object]\n    G --> I\n    H --> I\n```\n\n### Supported Formats\n\n| Backend | Format | Example |\n|---------|--------|---------|\n| Ollama | OpenAI-style function calls | `{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}` |\n| Llamafile | Function tags or native | `<function=name><parameter=city>Paris</parameter></function>` |\n| Anthropic | Claude tool_use blocks | `{name: \"get_weather\", input: {city: \"Paris\"}}` |\n\n资料来源：[src/forge/core/workflow.py:1-50]()\n\n## Proxy Mode Integration\n\n### Request Passthrough\n\nWhen running in proxy mode, the client plumbs OpenAI-compatible body fields through to backends without modification:\n\n```python\n# Proxy plumbs these fields through per request:\n- temperature\n- top_p\n- top_k\n- min_p\n- repeat_penalty\n- presence_penalty\n- seed\n```\n\nFor per-model recommended sampling in proxy mode, the calling client must look up `forge.clients.get_sampling_defaults(model)` and include the values in the request body.\n\n资料来源：[src/forge/proxy/__main__.py:1-50]()\n\n## Usage Examples\n\n### Basic Workflow with OllamaClient\n\n```python\nfrom forge import OllamaClient, WorkflowRunner, ContextManager, TieredCompact\n\nasync def main():\n    client = OllamaClient(\n        
model=\"ministral-3:8b-instruct-2512-q4_K_M\",\n        recommended_sampling=True  # Use verified sampling params\n    )\n    ctx = ContextManager(\n        strategy=TieredCompact(keep_recent=2),\n        budget_tokens=8192\n    )\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    # ... run workflows\n\nasyncio.run(main())\n```\n\n### Per-Call Sampling Override\n\n```python\nresponse = await client.send(\n    messages,\n    tools,\n    sampling={\n        \"temperature\": 0.7,\n        \"top_p\": 0.9\n    }\n)\n```\n\nThe caller's explicit non-None fields merge with client-level defaults without mutating the original configuration.\n\n## Error Handling\n\n| Error | Cause | Resolution |\n|-------|-------|------------|\n| `UnsupportedModelError` | Model not in sampling defaults map | Add model to `sampling_defaults.py` or pass `recommended_sampling=False` |\n| `httpx.HTTPError` | Backend unreachable | Verify backend is running on correct port |\n| `BudgetResolutionError` | Cannot determine context length | Check backend `/props` endpoint returns `n_ctx` |\n\n资料来源：[src/forge/server.py:1-50]()\n\n## Backend Server Management\n\n### ServerManager Integration\n\nForge can auto-manage backend servers through `ServerManager`, which handles starting, stopping, and context resolution:\n\n```python\nfrom forge.server import create_server_and_context\n\nserver, ctx = await create_server_and_context(\n    backend=\"ollama\",\n    model=\"qwen3:8b-q4_K_M\",\n    budget_mode=BudgetMode.FORGE_FAST,\n    client=client,\n)\n```\n\n### Supported Backends\n\n| Backend | Model Specification | Port | Features |\n|---------|---------------------|------|----------|\n| `ollama` | Model name string | 11434 | Auto model management |\n| `llamaserver` | GGUF file path | 8080 | Direct GGUF serving |\n| `llamafile` | GGUF file path | 8080 | Single-file server |\n\n资料来源：[src/forge/server.py:50-150]()\n\n---\n\n<a id='page-backend-setup'></a>\n\n## Backend Setup Guide\n\n### 
相关页面\n\n相关主题：[Backend Clients](#page-backend-clients), [Model Selection Guide](#page-model-guide)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md)\n</details>\n\n# Backend Setup Guide\n\n## Overview\n\nThe Backend Setup Guide covers how to configure, initialize, and manage LLM backend servers within forge. Forge supports multiple backend types—**Ollama**, **llama-server**, and **llamafile**—each with distinct initialization patterns, context management strategies, and operational characteristics. Understanding these backends is essential for running forge's Workflow and WorkflowRunner components effectively.\n\nForge abstracts backend management through two primary classes: `ServerManager` (for direct server lifecycle control) and `setup_backend()` (a high-level async factory that combines server startup with `ContextManager` creation). 
资料来源：[src/forge/server.py:1-50]()\n\n## Supported Backend Types\n\nForge supports three backend implementations, each targeting different deployment scenarios:\n\n| Backend | Identity | Configuration | Use Case |\n|---------|----------|---------------|----------|\n| `ollama` | Model name (e.g., `qwen3:8b`) | Uses Ollama's native model management | Quick local development, model switching |\n| `llamaserver` | GGUF file path | Requires explicit GGUF path and context size | Production GGUF inference with fine control |\n| `llamafile` | GGUF file path | Auto-discovers llamafile runtime | Single-file distribution, portable setups |\n\nThe backend type is determined at `ServerManager` instantiation and cannot be changed afterward. 资料来源：[src/forge/server.py:140-145]()\n\n### Ollama Backend\n\nThe Ollama backend leverages Ollama's built-in model management system. Models are pulled and managed through the Ollama CLI rather than requiring manual GGUF file handling.\n\n```python\nserver = ServerManager(backend=\"ollama\", port=8080)\nawait server.start(model=\"qwen3:8b-q4_K_M\", gguf_path=\"\", mode=\"native\")\n```\n\n**Key constraints:**\n- Does not accept `gguf_path` (use the `model` parameter instead)\n- Requires `model` to be specified\n- VRAM cleanup between model switches is handled via `ollama stop` 资料来源：[src/forge/server.py:170-180]()\n\n### Llama-server and Llamafile Backends\n\nBoth `llamaserver` and `llamafile` backends operate on GGUF files directly. The server identity is derived from the GGUF path, enabling cache-equality checks for server reuse. 
资料来源：[src/forge/server.py:185-200]()\n\n```python\n# Llama-server\nserver = ServerManager(backend=\"llamaserver\", port=8080)\nawait server.start(model=\"identity\", gguf_path=\"/models/qwen3-8b-q4_K_M.gguf\", mode=\"native\")\n\n# Llamafile (auto-discovers runtime)\nserver = ServerManager(backend=\"llamafile\", port=8080)\nawait server.start(model=\"identity\", gguf_path=\"/models/llamafile-binary\", mode=\"native\")\n```\n\n## ServerManager Architecture\n\nThe `ServerManager` class encapsulates all backend server lifecycle operations, providing a unified interface across different backend types.\n\n### State Management\n\n```mermaid\ngraph TD\n    A[ServerManager.__init__] --> B[_proc: Popen | None]\n    A --> C[_current_model: str | None]\n    A --> D[_current_mode: str | None]\n    A --> E[_current_ctx: int | None]\n    A --> F[_current_flags: tuple]\n    A --> G[_current_cache_type_k/v: str | None]\n    A --> H[_current_n_slots: int | None]\n    A --> I[_current_kv_unified: bool]\n    \n    J[ServerManager.start] --> K{Cache Hit?}\n    K -->|Yes| L[Return - reuse existing]\n    K -->|No| M[await stop]\n    M --> N[Build command flags]\n    N --> O[Spawn subprocess]\n```\n\n### Cache Equality Check\n\nBefore starting a new server instance, `ServerManager` checks if an existing server matches the requested configuration. This prevents unnecessary VRAM allocation and model reloading. 
资料来源：[src/forge/server.py:50-65]()\n\n```python\nflags = tuple(extra_flags) if extra_flags else ()\nif (\n    self._current_model == model\n    and self._current_mode == mode\n    and self._current_ctx == ctx_override\n    and self._current_flags == flags\n    and self._current_cache_type_k == cache_type_k\n    and self._current_cache_type_v == cache_type_v\n    and self._current_n_slots == n_slots\n    and self._current_kv_unified == kv_unified\n):\n    return  # Reuse existing server\n```\n\n### Server Initialization Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `backend` | `str` | Required | Backend type: `\"ollama\"` \\| `\"llamaserver\"` \\| `\"llamafile\"` |\n| `port` | `int` | `8080` | Server listen port (llama-server / llamafile only) |\n| `models_dir` | `str \\| Path` | `None` | Directory containing GGUF files |\n\n## Startup Parameters\n\nThe `start()` method accepts numerous parameters for fine-grained control over server behavior:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `model` | `str` | Model identity (Ollama: model name; others: GGUF path as string) |\n| `gguf_path` | `str \\| Path` | Path to GGUF file for llamaserver/llamafile |\n| `mode` | `str` | `\"native\"` or `\"prompt\"` reasoning mode |\n| `extra_flags` | `list[str]` | Additional CLI flags passed to the server |\n| `ctx_override` | `int \\| None` | Override context window size (`-c <value>`) |\n| `cache_type_k` | `str` | KV cache quantization type for keys (e.g., `\"q8_0\"`, `\"q4_0\"`) |\n| `cache_type_v` | `str` | KV cache quantization type for values |\n| `n_slots` | `int` | Concurrent slot count for multi-agent architectures |\n| `kv_unified` | `bool` | Use unified KV cache across all slots |\n\n资料来源：[src/forge/server.py:95-115]()\n\n## Budget Modes\n\nBudget modes control how forge resolves the context window budget for the `ContextManager`. 
The `resolve_budget()` method maps `BudgetMode` enum values to actual token counts. 资料来源：[src/forge/server.py:220-250]()\n\n```mermaid\ngraph TD\n    A[resolve_budget mode] --> B{MANUAL?}\n    B -->|Yes, Ollama| C[Return manual_tokens]\n    B -->|Yes, others| D[await get_server_context]\n    B -->|No| E{Ollama?}\n    E -->|Yes| F[await _ollama.full]\n    E -->|No| G{Mode == FORGE_FAST?}\n    G -->|Yes| H[await get_server_context ÷ 4]\n    G -->|No| I[await get_server_context]\n```\n\n### Budget Resolution Table\n\n| BudgetMode | Ollama Backend | Llama-server/Llamafile Backend |\n|------------|----------------|--------------------------------|\n| `MANUAL` | Returns `manual_tokens` parameter | Queries `/props` for `n_ctx` |\n| `BACKEND` | Ollama's reported context length | Queries `/props` for `n_ctx` |\n| `FORGE_FAST` | `n_ctx / 4` | `n_ctx / 4` |\n\n## High-Level Setup with `setup_backend()`\n\nFor most use cases, prefer `setup_backend()` which combines server startup with `ContextManager` creation. 资料来源：[src/forge/server.py:280-330]()\n\n```python\nfrom forge.server import setup_backend, BudgetMode\n\nasync def example():\n    client, ctx = await setup_backend(\n        backend=\"llamaserver\",\n        gguf_path=\"/models/qwen3-8b-q4_K_M.gguf\",\n        budget_mode=BudgetMode.FORGE_FAST,\n        client=None,  # Will create default client\n    )\n    # ... 
run workflows ...\n    await client.close()\n```\n\n### `setup_backend()` Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `backend` | `str` | Required | Backend type |\n| `model` | `str \\| None` | `None` | Ollama model name |\n| `gguf_path` | `str \\| Path \\| None` | `None` | GGUF file path |\n| `budget_mode` | `BudgetMode` | `BudgetMode.BACKEND` | Context budget strategy |\n| `manual_tokens` | `int \\| None` | `None` | Required for `MANUAL` mode on Ollama |\n| `client` | `Any \\| None` | `None` | Existing client or `None` to create default |\n| `mode` | `str` | `\"native\"` | Reasoning mode |\n| `port` | `int` | `8080` | Server port |\n| `extra_flags` | `list[str] \\| None` | `None` | Additional backend flags |\n| `on_compact` | `Callable \\| None` | `None` | Callback for compaction events |\n| `compact_threshold` | `float` | `0.75` | Compaction trigger threshold |\n| `phase_thresholds` | `tuple` | `(0.5, 0.7, 0.9)` | Tiered compaction thresholds |\n\n## Server Readiness Detection\n\nForge uses `/props` polling rather than `/health` for readiness confirmation. This eliminates the gap between health-ok and props-available states. 资料来源：[src/forge/server.py:260-278]()\n\n```python\nasync def wait_for_ready(self, timeout: float = 60.0) -> None:\n    url = f\"http://localhost:{self._port}/props\"\n    while time.monotonic() < deadline:\n        try:\n            resp = await client.get(url)\n            if resp.status_code == 200:\n                data = resp.json()\n                if \"default_generation_settings\" in data:\n                    return\n        except (httpx.ConnectError, httpx.ReadError, httpx.TimeoutException):\n            pass\n        await asyncio.sleep(2)\n```\n\nThe readiness check looks for `default_generation_settings` in the response—a strong indicator that the model is fully loaded and serving. 
资料来源：[src/forge/server.py:260-278]()\n\n## Proxy Server Configuration\n\nForge includes a proxy server (`forge.proxy`) that plumbs OpenAI-compatible sampling parameters through to backends. The proxy does not consult the sampling defaults map; it passes through whatever parameters the inbound request carries. 资料来源：[src/forge/proxy/__main__.py:1-60]()\n\n### Proxy CLI Options\n\n| Flag | Type | Default | Description |\n|------|------|---------|-------------|\n| `--backend-url` | `str` | Required | Target backend URL |\n| `--backend` | `str` | Required | Backend type |\n| `--model` | `str` | Required | Model identifier |\n| `--gguf` | `str` | `\"\"` | GGUF path (for non-Ollama) |\n| `--budget-mode` | `str` | `\"backend\"` | Budget resolution mode |\n| `--budget-tokens` | `int` | `None` | Manual token budget |\n| `--host` | `str` | `127.0.0.1` | Proxy listen host |\n| `--port` | `int` | `8081` | Proxy listen port |\n| `--serialize` | `flag` | `None` | Force request serialization |\n| `--max-retries` | `int` | `3` | Max retries per request |\n| `--verbose` | `flag` | `False` | Enable debug logging |\n\n### Proxy Sampling Passthrough\n\nThe proxy supports these OpenAI-compatible body fields:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `temperature` | `float` | Sampling temperature |\n| `top_p` | `float` | Nucleus sampling threshold |\n| `top_k` | `int` | Top-k sampling |\n| `min_p` | `float` | Minimum probability threshold |\n| `repeat_penalty` | `float` | Repetition penalty |\n| `presence_penalty` | `float` | Presence penalty |\n| `seed` | `int` | Deterministic sampling seed |\n\nFor per-model recommended sampling in proxy mode, callers should look up `forge.clients.get_sampling_defaults(model)` and include the values in the request body. 资料来源：[src/forge/clients/sampling_defaults.py:1-50]()\n\n## Per-Model Sampling Defaults\n\nForge ships verified per-model sampling recommendations for supported models. 
These must be explicitly opted into via `recommended_sampling=True`. 资料来源：[src/forge/clients/sampling_defaults.py:50-80]()\n\n### Supported Models\n\nThe sampling defaults map includes recommendations for:\n\n- **Qwen3 / 3.5 / 3.6** series\n- **Qwen3-Coder**\n- **Gemma 4**\n- **Mistral Small 3.2**\n- **Devstral Small 2**\n- **Ministral 3 Instruct + Reasoning**\n- **Mistral Nemo**\n- **Granite 4.0** (`h-micro`, `h-tiny`)\n\nEach entry includes an inline HuggingFace model card URL for verification. 资料来源：[CHANGELOG.md:0.6.0]()\n\n### Sampling Policy\n\n| `strict` | Model in Map | Behavior |\n|----------|--------------|----------|\n| `True` | Yes | Return dict copy |\n| `True` | No | Raise `UnsupportedModelError` |\n| `False` | Yes | One-shot INFO log; return `{}` |\n| `False` | No | Return `{}` (silent) |\n\n## Context Length Resolution\n\nFor non-Ollama backends, forge queries the server's `/props` endpoint to determine the configured context length. This value feeds into budget resolution. 资料来源：[src/forge/server.py:200-220]()\n\n```python\nasync def get_server_context(self) -> int:\n    \"\"\"Query /props for actual n_ctx.\n    \n    For Ollama: ``ollama stop`` for clean VRAM unloads between model switches.\n    \"\"\"\n    props = await self.query_props()\n    ctx = props.get(\"default_generation_settings\", {}).get(\"n_ctx\")\n    if ctx is None:\n        raise BudgetResolutionError()\n    return ctx\n```\n\n## Best Practices\n\n### Server Reuse\n\nAlways check if a server with the desired configuration is already running before starting a new one. The `ServerManager` performs this check internally based on:\n\n1. Model identity (name or GGUF path)\n2. Mode (`native` or `prompt`)\n3. Context override\n4. CLI flags\n5. KV cache quantization settings\n6. Slot configuration\n\n### VRAM Management\n\nFor Ollama backends, use `ollama stop` to cleanly unload models and free VRAM before switching to a different model. 
The llama-server/llamafile backends handle this through server restart. 资料来源：[src/forge/server.py:200]()\n\n### Graceful Shutdown\n\nAlways call `server.stop()` when finished to properly terminate the backend process:\n\n```python\nserver = ServerManager(backend=\"llamaserver\", port=8080)\ntry:\n    await server.start(...)\n    # ... work ...\nfinally:\n    await server.stop()\n```\n\n### Multi-Agent Configurations\n\nFor multi-agent architectures requiring concurrent slots, configure `n_slots` and optionally `kv_unified=True` for shared KV cache across slots:\n\n```python\nawait server.start(\n    model=\"...\",\n    gguf_path=\"...\",\n    n_slots=4,\n    kv_unified=True,  # Each slot can use full context\n)\n```\n\n## Quick Start Example\n\n```python\nimport asyncio\nfrom forge import OllamaClient, WorkflowRunner\nfrom forge.server import setup_backend, BudgetMode\nfrom forge.context import ContextManager, TieredCompact\n\nasync def main():\n    # Setup backend with forge-managed context\n    client, ctx = await setup_backend(\n        backend=\"ollama\",\n        model=\"ministral-3:8b-instruct-2512-q4_K_M\",\n        budget_mode=BudgetMode.FORGE_FAST,\n        recommended_sampling=True,\n    )\n    \n    try:\n        runner = WorkflowRunner(client=client, context_manager=ctx)\n        # ... 
run workflows ...\n    finally:\n        await client.close()\n\nasyncio.run(main())\n```\n\nFor GGUF-based setups:\n\n```python\nfrom forge.server import setup_backend, BudgetMode\n\nclient, ctx = await setup_backend(\n    backend=\"llamaserver\",\n    gguf_path=\"/path/to/model-q4_K_M.gguf\",\n    budget_mode=BudgetMode.FORGE_FAST,\n    extra_flags=[\"--reasoning-format\", \"auto\"],\n)\n```\n\n## Related Documentation\n\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md) - Project setup and testing\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md) - Quick start and Workflow overview\n- [CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md) - Version history and breaking changes\n\n---\n\n<a id='page-model-guide'></a>\n\n## Model Selection Guide\n\n### 相关页面\n\n相关主题：[Backend Setup Guide](#page-backend-setup), [Backend Clients](#page-backend-clients)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [src/forge/errors.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/errors.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/clients/llamafile.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/llamafile.py)\n- [src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md)\n</details>\n\n# Model Selection Guide\n\n## Overview\n\nThe Model Selection Guide covers how to choose, configure, and deploy language models within forge. 
Forge provides a unified workflow engine that abstracts backend differences (Ollama, Llamafile, LlamaServer) while offering per-model recommended sampling parameters sourced directly from HuggingFace model cards.\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-20]()\n\n## Supported Backends\n\nForge supports three LLM backend types, each with distinct configuration requirements.\n\n| Backend | Configuration Method | Model Specification | Notes |\n|---------|---------------------|---------------------|-------|\n| Ollama | `model` parameter | Model name from `ollama list` | No GGUF path needed |\n| Llamafile | `gguf_path` parameter | Path to .llamafile binary | Self-contained executables |\n| LlamaServer | `gguf_path` parameter | Path to GGUF model file | Requires llama.cpp server binary |\n\n资料来源：[src/forge/server.py:1-50]()\n\n### Backend Selection Logic\n\n```mermaid\ngraph TD\n    A[Choose Backend] --> B{Backend Type?}\n    B -->|Ollama| C[Use model name]\n    B -->|Llamafile| D[Use gguf_path]\n    B -->|LlamaServer| D\n    C --> E[Connect via localhost:11434]\n    D --> F[Start server process]\n    F --> G[Connect via port 8080]\n```\n\nFor Ollama, the `model` parameter directly references the model name from `ollama list`. 
For Llamafile and LlamaServer, you must provide the `gguf_path` pointing to the model file, and Forge will manage the server process lifecycle.\n\n资料来源：[src/forge/server.py:80-120]()\n\n## Supported Models\n\n### Model Families\n\nForge has been tested and evaluated with the following model families across different quantization levels.\n\n| Model Family | Variants | Recommended Quantization | Notes |\n|-------------|----------|-------------------------|-------|\n| Qwen3 | 8B, 3.5, 3.6 | Q4_K_M, Q8_0 | Includes Qwen3-Coder |\n| Gemma | 4 (all sizes) | Q4_K_M | Use `--reasoning-budget 0` workaround |\n| Mistral | Small 3.2, Nemo, 7B | Q4_K_M, Q8_0 | Ministral variants available |\n| Devstral | Small 2 | Q4_K_M | Code-focused model |\n| Granite | 4.0 (h-micro, h-tiny) | Q4_K_M, Q8_0 | OpenAI-style tool calls |\n| Llama | 3.1 8B | Q4_K_M, Q8_0 | 8B Reasoning variants |\n| Ministral | 3 Instruct, 8B Instruct, Reasoning | Q4_K_M, Q8_0 | Reasoning requires budget fix |\n\n资料来源：[CHANGELOG.md:1-50]()\n\n### Model Naming Conventions\n\nModel names vary by backend. When using Ollama, use the exact model tag as shown in `ollama list`. For GGUF-based backends, the model name is derived from the filename stem of the GGUF file.\n\n```python\n# Ollama example\nclient = OllamaClient(model=\"ministral-3:8b-instruct-2512-q4_K_M\")\n\n# Llamafile/LlamaServer example - model derived from path\nclient = LlamafileClient(gguf_path=\"/models/mistral-7b-q4_K_M.gguf\")\n```\n\n资料来源：[README.md:1-30]()\n\n## Recommended Sampling Configuration\n\n### The Sampling Defaults Map\n\nForge ships `forge.clients.sampling_defaults` containing a verified per-model sampling recommendations map. 
Each entry includes parameters such as `temperature`, `top_p`, `top_k`, `min_p`, `repeat_penalty`, and `presence_penalty` sourced directly from HuggingFace model cards.\n\n资料来源：[src/forge/clients/sampling_defaults.py:20-40]()\n\n### Enabling Recommended Sampling\n\nTo use per-model recommended sampling, pass `recommended_sampling=True` when initializing the client.\n\n```python\nfrom forge import OllamaClient\n\n# Opt-in to recommended sampling\nclient = OllamaClient(\n    model=\"qwen3:8b-q4_K_M\",\n    recommended_sampling=True\n)\n```\n\nIf the model is not in the map and `recommended_sampling=True` is set, Forge raises `UnsupportedModelError` rather than silently falling back to backend defaults.\n\n资料来源：[src/forge/errors.py:1-25]()\n\n### Sampling Policy Behavior\n\nThe following table describes the four-quadrant behavior when applying sampling defaults.\n\n| `strict` | Model in Map | Behavior |\n|----------|-------------|----------|\n| `True` | Yes | Return recommended dict |\n| `True` | No | Raise `UnsupportedModelError` |\n| `False` | Yes | One-shot INFO log; return `{}` |\n| `False` | No | Return `{}` (silent) |\n\n资料来源：[src/forge/clients/sampling_defaults.py:60-85]()\n\n### Per-Call Sampling Overrides\n\nThe `send()` and `send_stream()` methods accept a `sampling: dict | None` kwarg that merges field-by-field with the client's instance-level sampling without mutating it. The caller's explicit non-None fields take precedence.\n\n```python\n# Merge with instance defaults\nresponse = await client.send(\n    messages,\n    sampling={\"temperature\": 0.7, \"top_p\": 0.9}\n)\n```\n\n资料来源：[CHANGELOG.md:50-70]()\n\n## Proxy Mode Configuration\n\nWhen running forge in proxy mode, sampling parameters are plumbed through from the incoming request body. 
OpenAI-compatible fields supported include `temperature`, `top_p`, `top_k`, `min_p`, `repeat_penalty`, `presence_penalty`, and `seed`.\n\n### Proxy Server Startup\n\n```bash\npython -m forge.proxy \\\n    --backend-url http://localhost:11434 \\\n    --backend ollama \\\n    --model qwen3:8b-q4_K_M \\\n    --port 8081\n```\n\nFor per-model recommended sampling in proxy mode, the calling client should look up `forge.clients.get_sampling_defaults(model)` and include the values in the request body.\n\n资料来源：[src/forge/proxy/__main__.py:1-40]()\n\n## Context and Budget Management\n\n### Server Context Resolution\n\nForge automatically queries the backend's `/props` endpoint to determine the maximum context length. For Ollama, use `ollama stop` to cleanly unload VRAM between model switches.\n\n```python\nfrom forge import ContextManager, TieredCompact\n\nctx = ContextManager(\n    strategy=TieredCompact(keep_recent=2),\n    budget_tokens=8192\n)\n```\n\n### Budget Modes\n\n| Mode | Description | Token Source |\n|------|-------------|--------------|\n| `MANUAL` | User-specified token budget | `manual_tokens` parameter |\n| `FORGE_FAST` | Fast iteration mode | Server-reported context |\n| `FORGE_BALANCED` | Balanced speed/quality | Server-reported context |\n| `FORGE_THOROUGH` | Maximum quality | Server-reported context |\n\n资料来源：[src/forge/server.py:150-200]()\n\n### Known Issues with Reasoning Models\n\nModels using extended reasoning (Gemma 4, Qwen 3.5, Ministral Reasoning) may hang with unbounded reasoning budgets on builds after April 10, 2026. 
The workaround is to set `--reasoning-budget 0` when starting the backend.\n\n资料来源：[CHANGELOG.md:70-90]()\n\n## Client Configuration Reference\n\n### OllamaClient\n\n```python\nOllamaClient(\n    model: str,                          # Model name from `ollama list`\n    base_url: str = \"http://localhost:11434/v1\",\n    api_key: str | None = None,\n    timeout: float = 120.0,\n    recommended_sampling: bool = False,  # Opt-in to per-model defaults\n    **kwargs\n)\n```\n\n### LlamafileClient\n\n```python\nLlamafileClient(\n    gguf_path: str | Path,               # Path to .llamafile binary\n    model: str | None = None,            # Optional model name\n    base_url: str = \"http://localhost:8080/v1\",\n    recommended_sampling: bool = False,\n    **kwargs\n)\n```\n\n资料来源：[src/forge/clients/llamafile.py:1-50]()\n\n## Complete Workflow Example\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import (\n    Workflow, ToolDef, ToolSpec,\n    WorkflowRunner, OllamaClient,\n    ContextManager, TieredCompact,\n)\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. 
Use the available tools to answer the user.\",\n)\n\nasync def main():\n    client = OllamaClient(\n        model=\"ministral-3:8b-instruct-2512-q4_K_M\",\n        recommended_sampling=True\n    )\n    ctx = ContextManager(\n        strategy=TieredCompact(keep_recent=2),\n        budget_tokens=8192\n    )\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n资料来源：[README.md:30-80]()\n\n## Error Handling\n\n### UnsupportedModelError\n\nRaised when `recommended_sampling=True` is specified but the model is not in the sampling defaults map.\n\n```python\nfrom forge.errors import UnsupportedModelError\n\ntry:\n    client = OllamaClient(\n        model=\"unknown-model:latest\",\n        recommended_sampling=True\n    )\nexcept UnsupportedModelError as e:\n    print(f\"Model not supported: {e.model}\")\n    # Solution: Either add entry to MODEL_SAMPLING_DEFAULTS\n    # or drop recommended_sampling=True\n```\n\n### Tool Call Errors\n\nTool-related errors include `ToolCallError` (LLM failed to produce valid tool call), `ToolExecutionError` (tool callable raised an exception), and `ToolResolutionError` (valid arguments but data didn't resolve).\n\n资料来源：[src/forge/errors.py:25-60]()\n\n## Best Practices\n\n### Selecting Quantization Levels\n\n| Use Case | Recommended Quantization |\n|----------|-------------------------|\n| Development/Testing | Q4_K_M (balanced quality/size) |\n| Production (quality priority) | Q8_0 (near-float quality) |\n| Resource-constrained | Q4_0 (smaller, lower quality) |\n\n### Guardrail Integration\n\nGuardrails in forge are defined in `src/forge/core/runner.py` and nudge templates in `src/forge/prompts/nudges.py`. 
Each guardrail can be independently toggled via ablation presets for evaluation.\n\n资料来源：[CONTRIBUTING.md:1-30]()\n\n### Server Management\n\nWhen running multiple evaluations, reuse `ServerManager` instances when the model and configuration match to avoid unnecessary server restarts.\n\n```python\n# ServerManager caches configuration to avoid redundant restarts\nif (\n    self._current_model == model\n    and self._current_mode == mode\n    and self._current_ctx == ctx_override\n):\n    # Reuse existing server\n    return\n```\n\n资料来源：[src/forge/server.py:60-75]()\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：antoinezambelli/forge\n\n摘要：发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body。\n\n## 1. 安装坑 · 来源证据：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_148dff87195e42549d0ffb88b99e9cbf | https://github.com/antoinezambelli/forge/issues/58 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 2. 安装坑 · 来源证据：Investigate: integration paths with Hermes Agent\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Investigate: integration paths with Hermes Agent\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e3cbd2d1c9a84a1887887bf24b036865 | https://github.com/antoinezambelli/forge/issues/51 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 3. 
安装坑 · 来源证据：Per-model recommended sampling defaults (map keyed by HF model cards)\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Per-model recommended sampling defaults (map keyed by HF model cards)\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_057ca2af912e4a608259ffb2a3654d4f | https://github.com/antoinezambelli/forge/issues/59 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 4. 安装坑 · 来源证据：Rescue-parse ChatGPT-style XML tool calls\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Rescue-parse ChatGPT-style XML tool calls\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_471c674c8d73451da75d6b8c9349aabf | https://github.com/antoinezambelli/forge/issues/55 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 5. 配置坑 · 来源证据：Proxy external mode hardcodes native FC — no prompt-injection fallback\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：Proxy external mode hardcodes native FC — no prompt-injection fallback\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f3a85ec8447a4838b3bc4c846cd9e7a0 | https://github.com/antoinezambelli/forge/issues/53 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 6. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | README/documentation is current enough for a first validation pass.\n\n## 7. 
维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | last_activity_observed missing\n\n## 8. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | no_demo; severity=medium\n\n## 9. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | no_demo; severity=medium\n\n## 10. 安全/权限坑 · 来源证据：Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_4ad226a6d1fa4a5f89fa7702bec11188 | https://github.com/antoinezambelli/forge/issues/61 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 安全/权限坑 · 来源证据：Sub-agent support: dynamic slot splitting\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Sub-agent support: dynamic slot splitting\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_5b35873cf63c4647bca8a0611d441189 | https://github.com/antoinezambelli/forge/issues/28 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 12. 
安全/权限坑 · 来源证据：Sub-agent support: slot pool\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Sub-agent support: slot pool\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_070d9a3d20d24123b62d7d76ee16078a | https://github.com/antoinezambelli/forge/issues/29 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 13. 安全/权限坑 · 来源证据：llama.cpp reasoning budget sampler causes silent hangs after April 10 builds\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：llama.cpp reasoning budget sampler causes silent hangs after April 10 builds\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_673be4a583984219bab90cbadff631fe | https://github.com/antoinezambelli/forge/issues/54 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 14. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | issue_or_pr_quality=unknown\n\n## 15. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | release_recency=unknown\n\n<!-- canonical_name: antoinezambelli/forge; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "forge",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "hn_item:48192383",
          "kind": "hn",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://news.ycombinator.com/item?id=48192383"
        },
        {
          "evidence_id": "art_9416e509908347e09477e1870e1e192a",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/antoinezambelli/forge#readme"
        }
      ],
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "forge 说明书",
      "toc": [
        "https://github.com/antoinezambelli/forge 项目说明书",
        "目录",
        "Introduction to Forge",
        "Overview",
        "Core Architecture",
        "Quick Start",
        "Core Components",
        "Instance-level sampling",
        "Doramagic 踩坑日志"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "f1b87b05b863c7d12927f3dbdbd716af2dc3ace1",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "pyproject.toml",
      "README.md",
      "docs/USER_GUIDE.md",
      "docs/ARCHITECTURE.md",
      "docs/EVAL_GUIDE.md",
      "docs/MODEL_GUIDE.md",
      "docs/BACKEND_SETUP.md",
      "docs/WORKFLOW.md",
      "docs/results/index.md",
      "docs/decisions/006-tool-prerequisites.md",
      "docs/decisions/002-anthropic-baseline.md",
      "docs/decisions/MULTI_MODEL_ROUTING.md",
      "docs/decisions/009-bfcl-integration.md",
      "docs/decisions/010-tool-resolution-error.md",
      "docs/decisions/013-text-response-intent.md",
      "docs/decisions/003-thinking-label-ux.md",
      "docs/decisions/004-async-on-chunk.md",
      "docs/decisions/012-openai-proxy.md",
      "docs/decisions/001-ablation-framework.md",
      "docs/decisions/005-parallel-tool-calls.md",
      "docs/decisions/014-recommended-sampling-opt-in.md",
      "docs/decisions/007-report-views.md",
      "docs/decisions/011-guardrail-middleware.md",
      "docs/decisions/008-stateful-eval-scenarios.md",
      "docs/results/raw/reforged-vs-bare.md",
      "docs/results/raw/native-vs-prompt.md",
      "docs/results/raw/reforged/by-backend.md",
      "docs/results/raw/reforged/all.md",
      "docs/results/raw/reforged/by-family.md",
      "examples/foreign_loop.py",
      "src/forge/server.py",
      "src/forge/errors.py",
      "src/forge/__init__.py",
      "src/forge/context/hardware.py",
      "src/forge/context/manager.py",
      "src/forge/context/strategies.py",
      "src/forge/context/__init__.py",
      "src/forge/guardrails/guardrails.py",
      "src/forge/guardrails/step_enforcer.py",
      "src/forge/guardrails/nudge.py"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# forge - Doramagic AI Context Pack\n\n> 定位：安装前体验与判断资产。它帮助宿主 AI 有一个好的开始，但不代表已经安装、执行或验证目标项目。\n\n## 充分原则\n\n- **充分原则，不是压缩原则**：AI Context Pack 应该充分到让宿主 AI 在开工前理解项目价值、能力边界、使用入口、风险和证据来源；它可以分层组织，但不以最短摘要为目标。\n- **压缩策略**：只压缩噪声和重复内容，不压缩会影响判断和开工质量的上下文。\n\n## 给宿主 AI 的使用方式\n\n你正在读取 Doramagic 为 forge 编译的 AI Context Pack。请把它当作开工前上下文：帮助用户理解适合谁、能做什么、如何开始、哪些必须安装后验证、风险在哪里。不要声称你已经安装、运行或执行了目标项目。\n\n## Claim 消费规则\n\n- **事实来源**：Repo Evidence + Claim/Evidence Graph；Human Wiki 只提供显著性、术语和叙事结构。\n- **事实最低状态**：`supported`\n- `supported`：可以作为项目事实使用，但回答中必须引用 claim_id 和证据路径。\n- `weak`：只能作为低置信度线索，必须要求用户继续核实。\n- `inferred`：只能用于风险提示或待确认问题，不能包装成项目事实。\n- `unverified`：不得作为事实使用，应明确说证据不足。\n- `contradicted`：必须展示冲突来源，不得替用户强行选择一个版本。\n\n## 它最适合谁\n\n- **AI 研究者或研究型 Agent 构建者**：README 明确围绕研究、实验或论文工作流展开。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- **正在使用 Claude/Codex/Cursor/Gemini 等宿主 AI 的开发者**：README 或插件配置提到多个宿主 AI。 证据：`README.md` Claim：`clm_0003` supported 0.86\n\n## 它能做什么\n\n- **命令行启动或安装流程**（需要安装后验证）：项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n## 怎么开始\n\n- `pip install forge-guardrails                # core only` 证据：`README.md` Claim：`clm_0004` supported 0.86\n- `pip install \"forge-guardrails[anthropic]\"   # + Anthropic client` 证据：`README.md` Claim：`clm_0005` supported 0.86\n- `git clone https://github.com/antoinezambelli/forge.git` 证据：`README.md` Claim：`clm_0006` supported 0.86\n- `pip install -e \".[dev]\"` 证据：`README.md` Claim：`clm_0007` supported 0.86\n- `pip install -e \".[anthropic]\"` 证据：`README.md` Claim：`clm_0008` supported 0.86\n\n## 继续前判断卡\n\n- **当前建议**：需要管理员/安全审批\n- **为什么**：继续前可能涉及密钥、账号、外部服务或敏感上下文，建议先经过管理员或安全审批。\n\n### 30 秒判断\n\n- **现在怎么做**：需要管理员/安全审批\n- **最小安全下一步**：先跑 Prompt Preview；若涉及凭证或企业环境，先审批再试装\n- **先别相信**：角色质量和任务匹配不能直接相信。\n- **继续会触碰**：角色选择偏差、命令执行、本地环境或项目文件\n\n### 现在可以相信\n\n- **适合人群线索：AI 研究者或研究型 Agent 构建者**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- 
**适合人群线索：正在使用 Claude/Codex/Cursor/Gemini 等宿主 AI 的开发者**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0003` supported 0.86\n- **能力存在：命令行启动或安装流程**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n- **存在 Quick Start / 安装命令线索**（supported）：可以相信项目文档出现过启动或安装入口；不要因此直接在主力环境运行。 证据：`README.md` Claim：`clm_0004` supported 0.86\n\n### 现在还不能相信\n\n- **角色质量和任务匹配不能直接相信。**（unverified）：角色库证明有很多角色，不证明每个角色都适合你的具体任务，也不证明角色能产生高质量结果。\n- **不能把角色文案当成真实执行能力。**（unverified）：安装前只能判断角色描述和任务画像是否匹配，不能证明它能在宿主 AI 里完成任务。\n- **真实输出质量不能在安装前相信。**（unverified）：Prompt Preview 只能展示引导方式，不能证明真实项目中的结果质量。\n- **宿主 AI 版本兼容性不能在安装前相信。**（unverified）：Claude、Cursor、Codex、Gemini 等宿主加载规则和版本差异必须在真实环境验证。\n- **不会污染现有宿主 AI 行为，不能直接相信。**（inferred）：Skill、plugin、AGENTS/CLAUDE/GEMINI 指令可能改变宿主 AI 的默认行为。\n- **可安全回滚不能默认相信。**（unverified）：除非项目明确提供卸载和恢复说明，否则必须先在隔离环境验证。\n- **真实安装后是否与用户当前宿主 AI 版本兼容？**（unverified）：兼容性只能通过实际宿主环境验证。\n- **项目输出质量是否满足用户具体任务？**（unverified）：安装前预览只能展示流程和边界，不能替代真实评测。\n\n### 继续会触碰什么\n\n- **角色选择偏差**：用户对任务应该由哪个专家角色处理的判断。 原因：选错角色会让 AI 从错误专业视角回答，浪费时间或误导决策。\n- **命令执行**：包管理器、网络下载、本地插件目录、项目配置或用户主目录。 原因：运行第一条命令就可能产生环境改动；必须先判断是否值得跑。 证据：`README.md`\n- **本地环境或项目文件**：安装结果、插件缓存、项目配置或本地依赖目录。 原因：安装前无法证明写入范围和回滚方式，需要隔离验证。 证据：`README.md`\n- **环境变量 / API Key**：项目入口文档明确出现 API key、token、secret 或账号凭证配置。 原因：如果真实安装需要凭证，应先使用测试凭证并经过权限/合规判断。 证据：`README.md`, `docs/EVAL_GUIDE.md`\n- **宿主 AI 上下文**：AI Context Pack、Prompt Preview、Skill 路由、风险规则和项目事实。 原因：导入上下文会影响宿主 AI 后续判断，必须避免把未验证项包装成事实。\n\n### 最小安全下一步\n\n- **先跑 Prompt Preview**：先用交互式试用验证任务画像和角色匹配，不要先导入整套角色库。（适用：任何项目都适用，尤其是输出质量未知时。）\n- **只在隔离目录或测试账号试装**：避免安装命令污染主力宿主 AI、真实项目或用户主目录。（适用：存在命令执行、插件配置或本地写入线索时。）\n- **不要使用真实生产凭证**：环境变量/API key 一旦进入宿主或工具链，可能产生账号和合规风险。（适用：出现 API、TOKEN、KEY、SECRET 等环境线索时。）\n- **安装后只验证一个最小任务**：先验证加载、兼容、输出质量和回滚，再决定是否深用。（适用：准备从试用进入真实工作流时。）\n\n### 退出方式\n\n- **保留安装前状态**：记录原始宿主配置和项目状态，后续才能判断是否可恢复。\n- **保留原始角色选择记录**：如果输出偏题，可以回到任务画像阶段重新选择角色，而不是继续沿着错误角色推进。\n- **记录安装命令和写入路径**：没有明确卸载说明时，至少要知道哪些目录或配置需要手动清理。\n- 
**准备撤销测试 API key 或 token**：测试凭证泄露或误用时，可以快速止损。\n- **如果没有回滚路径，不进入主力环境**：不可回滚是继续前阻断项，不应靠信任或运气继续。\n\n## 哪些只能预览\n\n- 解释项目适合谁和能做什么\n- 基于项目文档演示典型对话流程\n- 帮助用户判断是否值得安装或继续研究\n\n## 哪些必须安装后验证\n\n- 真实安装 Skill、插件或 CLI\n- 执行脚本、修改本地文件或访问外部服务\n- 验证真实输出质量、性能和兼容性\n\n## 边界与风险判断卡\n\n- **把安装前预览误认为真实运行**：用户可能高估项目已经完成的配置、权限和兼容性验证。 处理方式：明确区分 prompt_preview_can_do 与 runtime_required。 Claim：`clm_0009` inferred 0.45\n- **命令执行会修改本地环境**：安装命令可能写入用户主目录、宿主插件目录或项目配置。 处理方式：先在隔离环境或测试账号中运行。 证据：`README.md` Claim：`clm_0010` supported 0.86\n- **待确认**：真实安装后是否与用户当前宿主 AI 版本兼容？原因：兼容性只能通过实际宿主环境验证。\n- **待确认**：项目输出质量是否满足用户具体任务？原因：安装前预览只能展示流程和边界，不能替代真实评测。\n- **待确认**：安装命令是否需要网络、权限或全局写入？原因：这影响企业环境和个人环境的安装风险。\n\n## 开工前工作上下文\n\n### 加载顺序\n\n- 先读取 how_to_use.host_ai_instruction，建立安装前判断资产的边界。\n- 读取 claim_graph_summary，确认事实来自 Claim/Evidence Graph，而不是 Human Wiki 叙事。\n- 再读取 intended_users、capabilities 和 quick_start_candidates，判断用户是否匹配。\n- 需要执行具体任务时，优先查 role_skill_index，再查 evidence_index。\n- 遇到真实安装、文件修改、网络访问、性能或兼容性问题时，转入 risk_card 和 boundaries.runtime_required。\n\n### 任务路由\n\n- **命令行启动或安装流程**：先说明这是安装后验证能力，再给出安装前检查清单。 边界：必须真实安装或运行后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n### 上下文规模\n\n- 文件总数：150\n- 重要文件覆盖：40/150\n- 证据索引条目：45\n- 角色 / Skill 条目：30\n\n### 证据不足时的处理\n\n- **missing_evidence**：说明证据不足，要求用户提供目标文件、README 段落或安装后验证记录；不要补全事实。\n- **out_of_scope_request**：说明该任务超出当前 AI Context Pack 证据范围，并建议用户先查看 Human Manual 或真实安装后验证。\n- **runtime_request**：给出安装前检查清单和命令来源，但不要替用户执行命令或声称已执行。\n- **source_conflict**：同时展示冲突来源，标记为待核实，不要强行选择一个版本。\n\n## Prompt Recipes\n\n### 适配判断\n\n- 目标：判断这个项目是否适合用户当前任务。\n- 预期输出：适配结论、关键理由、证据引用、安装前可预览内容、必须安装后验证内容、下一步建议。\n\n```text\n请基于 forge 的 AI Context Pack，先问我 3 个必要问题，然后判断它是否适合我的任务。回答必须包含：适合谁、能做什么、不能做什么、是否值得安装、证据来自哪里。所有项目事实必须引用 evidence_refs、source_paths 或 claim_id。\n```\n\n### 安装前体验\n\n- 目标：让用户在安装前感受核心工作流，同时避免把预览包装成真实能力或营销承诺。\n- 预期输出：一段带边界标签的体验剧本、安装后验证清单和谨慎建议；不含真实运行承诺或强营销表述。\n\n```text\n请把 forge 当作安装前体验资产，而不是已安装工具或真实运行环境。\n\n请严格输出四段：\n1. 先问我 3 个必要问题。\n2. 
给出一段“体验剧本”：用 [安装前可预览]、[必须安装后验证]、[证据不足] 三种标签展示它可能如何引导工作流。\n3. 给出安装后验证清单：列出哪些能力只有真实安装、真实宿主加载、真实项目运行后才能确认。\n4. 给出谨慎建议：只能说“值得继续研究/试装”“先补充信息后再判断”或“不建议继续”，不得替项目背书。\n\n硬性边界：\n- 不要声称已经安装、运行、执行测试、修改文件或产生真实结果。\n- 不要写“自动适配”“确保通过”“完美适配”“强烈建议安装”等承诺性表达。\n- 如果描述安装后的工作方式，必须使用“如果安装成功且宿主正确加载 Skill，它可能会……”这种条件句。\n- 体验剧本只能写成“示例台词/假设流程”：使用“可能会询问/可能会建议/可能会展示”，不要写“已写入、已生成、已通过、正在运行、正在生成”。\n- Prompt Preview 不负责给安装命令；如用户准备试装，只能提示先阅读 Quick Start 和 Risk Card，并在隔离环境验证。\n- 所有项目事实必须来自 supported claim、evidence_refs 或 source_paths；inferred/unverified 只能作风险或待确认项。\n\n```\n\n### 角色 / Skill 选择\n\n- 目标：从项目里的角色或 Skill 中挑选最匹配的资产。\n- 预期输出：候选角色或 Skill 列表，每项包含适用场景、证据路径、风险边界和是否需要安装后验证。\n\n```text\n请读取 role_skill_index，根据我的目标任务推荐 3-5 个最相关的角色或 Skill。每个推荐都要说明适用场景、可能输出、风险边界和 evidence_refs。\n```\n\n### 风险预检\n\n- 目标：安装或引入前识别环境、权限、规则冲突和质量风险。\n- 预期输出：环境、权限、依赖、许可、宿主冲突、质量风险和未知项的检查清单。\n\n```text\n请基于 risk_card、boundaries 和 quick_start_candidates，给我一份安装前风险预检清单。不要替我执行命令，只说明我应该检查什么、为什么检查、失败会有什么影响。\n```\n\n### 宿主 AI 开工指令\n\n- 目标：把项目上下文转成一次对话开始前的宿主 AI 指令。\n- 预期输出：一段边界明确、证据引用明确、适合复制给宿主 AI 的开工前指令。\n\n```text\n请基于 forge 的 AI Context Pack，生成一段我可以粘贴给宿主 AI 的开工前指令。这段指令必须遵守 not_runtime=true，不能声称项目已经安装、运行或产生真实结果。\n```\n\n\n## 角色 / Skill 索引\n\n- 共索引 30 个角色 / Skill / 项目文档条目。\n\n- **forge**（project_doc）：! PyPI https://img.shields.io/pypi/v/forge-guardrails.svg https://pypi.org/project/forge-guardrails/ ! Tests https://github.com/antoinezambelli/forge/actions/workflows/tests.yml/badge.svg https://github.com/antoinezambelli/forge/actions/workflows/tests.yml ! codecov https://codecov.io/gh/antoinezambelli/forge/branch/main/graph/badge.svg https://codecov.io/gh/antoinezambelli/forge ! Python 3.12+ https://img.shields.i… 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`README.md`\n- **Contributing to forge**（project_doc）：Thanks for your interest in contributing. This guide covers how to get set up, run tests, and where to look when adding new functionality. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`CONTRIBUTING.md`\n- **Architecture: Agentic Tool-Calling Library**（project_doc）：Architecture: Agentic Tool-Calling Library 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/ARCHITECTURE.md`\n- **Backend Setup Guide**（project_doc）：How to install and run each LLM backend for forge eval and development. All instructions assume Windows 11 with an NVIDIA GPU 16GB VRAM . 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/BACKEND_SETUP.md`\n- **Eval Guide**（project_doc）：Internal tooling for measuring how reliably a model + backend combo navigates multi-step tool-calling workflows. Not a test suite — run manually against a live backend. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/EVAL_GUIDE.md`\n- **Model Guide**（project_doc）：Which model and backend to use with forge, based on your hardware and goals. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/MODEL_GUIDE.md`\n- **User Guide**（project_doc）：Practical usage patterns for forge — from single-turn tool calling to multi-turn conversations. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/USER_GUIDE.md`\n- **Workflow**（project_doc）：Visual guide to the forge agentic tool-calling loop. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/WORKFLOW.md`\n- **ADR-001: Ablation Framework**（project_doc）：Status: Implemented az/ablation branch, Feb 2026 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/001-ablation-framework.md`\n- **ADR-002: Anthropic Baseline Client**（project_doc）：Status: Implemented az/ablation branch, Feb 2026 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/002-anthropic-baseline.md`\n- **ADR-003: thinking Label UX — Reasoning Capture Gating**（project_doc）：ADR-003: thinking Label UX — Reasoning Capture Gating 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/003-thinking-label-ux.md`\n- **ADR-004: Async on chunk Callback**（project_doc）：Status: Done implemented on az/async think branch 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/004-async-on-chunk.md`\n- **ADR-005: Parallel Tool Calling**（project_doc）：Status: Done branch az/parallel tools , commit cd2bd69 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/005-parallel-tool-calls.md`\n- **ADR-006: Tool Prerequisites**（project_doc）：Status: Accepted and implemented March 2026 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/006-tool-prerequisites.md`\n- **ADR-007: Report Views**（project_doc）：Status: Planned README roadmap item 5 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/007-report-views.md`\n- **ADR-008: Stateful Eval Scenarios**（project_doc）：All 11 eval scenarios use argument-blind tool callables. get info query=\"rome\" returns the Paris canned string. check supplier supplier=\"anything\" routes through a fuzzy match but fundamentally returns a static blob. The only exception is error recovery , which validates a 4-digit format — a type check, not stateful behavior. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/008-stateful-eval-scenarios.md`\n- **ADR-009: BFCL Integration**（project_doc）：Status: Implemented az/bfcl eval branch, Feb 2026 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/009-bfcl-integration.md`\n- **ADR-010: ToolResolutionError**（project_doc）：Status: Implemented az/tre branch, Mar 2026 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/010-tool-resolution-error.md`\n- **ADR-011: Guardrail Middleware — Composable Reliability Without Loop Ownership**（project_doc）：ADR-011: Guardrail Middleware — Composable Reliability Without Loop Ownership 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/011-guardrail-middleware.md`\n- **ADR-012: OpenAI-Compatible Proxy Server**（project_doc）：ADR-012: OpenAI-Compatible Proxy Server 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/012-openai-proxy.md`\n- **ADR-013: Text Response Intent -- When the Model Chooses Not to Call Tools**（project_doc）：ADR-013: Text Response Intent -- When the Model Chooses Not to Call Tools 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/013-text-response-intent.md`\n- **ADR-014: Recommended sampling — opt-in flag and proxy pass-through**（project_doc）：ADR-014: Recommended sampling — opt-in flag and proxy pass-through 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/014-recommended-sampling-opt-in.md`\n- **Multi-Model Routing — Concept Doc**（project_doc）：Allow forge to manage multiple model backends simultaneously and expose them as named clients to the consumer. Forge handles the pool lifecycle, health, budgets . The consumer handles orchestration which workflow uses which model, when to swap, event dispatch . 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/decisions/MULTI_MODEL_ROUTING.md`\n- **Forge Eval Reports**（project_doc）：For model and backend recommendations, see Model Guide ../MODEL GUIDE.md . 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/results/index.md`\n- **Forge Eval — Native vs Prompt llama-server**（project_doc）：Forge Eval — Native vs Prompt llama-server 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/results/raw/native-vs-prompt.md`\n- **Forge Eval — Reforged vs Bare**（project_doc）：claude-haiku-4-5-20251001 anthropic/native 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/results/raw/reforged-vs-bare.md`\n- **Forge Eval — Reforged Leaderboard**（project_doc）：Scr=score correct/total , Acc=accuracy correct/total, excl validate errors , Cmp=completeness completed/total , Eff=efficiency ideal/actual calls , Wst=avg wasted calls, Spd=avg time excl compaction rel=relevance detection, arg=argument fidelity, tsl=tool selection, b2s=basic 2step, s3s=sequential 3step, crt=conditional routing, srn=sequential reasoning, err=error recovery, dgr=data gap recovery, dge=data gap recove… 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/results/raw/reforged/all.md`\n- **Forge Eval — Reforged by Backend**（project_doc）：Scr=score correct/total , Acc=accuracy correct/total, excl validate errors , Cmp=completeness completed/total , Eff=efficiency ideal/actual calls , Wst=avg wasted calls, Spd=avg time excl compaction rel=relevance detection, arg=argument fidelity, tsl=tool selection, b2s=basic 2step, s3s=sequential 3step, crt=conditional routing, srn=sequential reasoning, err=error recovery, dgr=data gap recovery, dge=data gap recove… 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/results/raw/reforged/by-backend.md`\n- **Forge Eval — Reforged by Model Family**（project_doc）：Forge Eval — Reforged by Model Family 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/results/raw/reforged/by-family.md`\n- **Changelog**（project_doc）：All notable changes to forge are documented here. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`CHANGELOG.md`\n\n## 证据索引\n\n- 共索引 45 条证据。\n\n- **forge**（documentation）：! PyPI https://img.shields.io/pypi/v/forge-guardrails.svg https://pypi.org/project/forge-guardrails/ ! 
Tests https://github.com/antoinezambelli/forge/actions/workflows/tests.yml/badge.svg https://github.com/antoinezambelli/forge/actions/workflows/tests.yml ! codecov https://codecov.io/gh/antoinezambelli/forge/branch/main/graph/badge.svg https://codecov.io/gh/antoinezambelli/forge ! Python 3.12+ https://img.shields.io/badge/python-3.12%2B-blue.svg https://www.python.org/downloads/ ! License: MIT https://img.shields.io/badge/license-MIT-green.svg LICENSE 证据：`README.md`\n- **Contributing to forge**（documentation）：Thanks for your interest in contributing. This guide covers how to get set up, run tests, and where to look when adding new functionality. 证据：`CONTRIBUTING.md`\n- **Package**（package_manifest）：{ \"name\": \"dashboard\", \"private\": true, \"version\": \"0.0.0\", \"type\": \"module\", \"scripts\": { \"dev\": \"vite\", \"build\": \"tsc -b && vite build\", \"lint\": \"eslint .\", \"preview\": \"vite preview\" }, \"dependencies\": { \"react\": \"^19.2.0\", \"react-dom\": \"^19.2.0\" }, \"devDependencies\": { \"@eslint/js\": \"^9.39.1\", \"@tailwindcss/vite\": \"^4.2.1\", \"@types/node\": \"^24.10.1\", \"@types/react\": \"^19.2.7\", \"@types/react-dom\": \"^19.2.3\", \"@vitejs/plugin-react\": \"^5.1.1\", \"eslint\": \"^9.39.1\", \"eslint-plugin-react-hooks\": \"^7.0.1\", \"eslint-plugin-react-refresh\": \"^0.4.24\", \"globals\": \"^16.5.0\", \"tailwindcss\": \"^4.2.1\", \"typescript\": \"~5.9.3\", \"typescript-eslint\": \"^8.48.0\", \"vite\": \"^7.3.1\", \"vite-plugin-singlefile\": \"^… 证据：`tests/eval/dashboard/package.json`\n- **License**（source_file）：Copyright c 2025-2026 Antoine Zambelli 证据：`LICENSE`\n- **Architecture: Agentic Tool-Calling Library**（documentation）：Architecture: Agentic Tool-Calling Library 证据：`docs/ARCHITECTURE.md`\n- **Backend Setup Guide**（documentation）：How to install and run each LLM backend for forge eval and development. All instructions assume Windows 11 with an NVIDIA GPU 16GB VRAM . 
证据：`docs/BACKEND_SETUP.md`\n- **Eval Guide**（documentation）：Internal tooling for measuring how reliably a model + backend combo navigates multi-step tool-calling workflows. Not a test suite — run manually against a live backend. 证据：`docs/EVAL_GUIDE.md`\n- **Model Guide**（documentation）：Which model and backend to use with forge, based on your hardware and goals. 证据：`docs/MODEL_GUIDE.md`\n- **User Guide**（documentation）：Practical usage patterns for forge — from single-turn tool calling to multi-turn conversations. 证据：`docs/USER_GUIDE.md`\n- **Workflow**（documentation）：Visual guide to the forge agentic tool-calling loop. 证据：`docs/WORKFLOW.md`\n- **ADR-001: Ablation Framework**（documentation）：Status: Implemented az/ablation branch, Feb 2026 证据：`docs/decisions/001-ablation-framework.md`\n- **ADR-002: Anthropic Baseline Client**（documentation）：Status: Implemented az/ablation branch, Feb 2026 证据：`docs/decisions/002-anthropic-baseline.md`\n- **ADR-003: thinking Label UX — Reasoning Capture Gating**（documentation）：ADR-003: thinking Label UX — Reasoning Capture Gating 证据：`docs/decisions/003-thinking-label-ux.md`\n- **ADR-004: Async on chunk Callback**（documentation）：Status: Done implemented on az/async think branch 证据：`docs/decisions/004-async-on-chunk.md`\n- **ADR-005: Parallel Tool Calling**（documentation）：Status: Done branch az/parallel tools , commit cd2bd69 证据：`docs/decisions/005-parallel-tool-calls.md`\n- **ADR-006: Tool Prerequisites**（documentation）：Status: Accepted and implemented March 2026 证据：`docs/decisions/006-tool-prerequisites.md`\n- **ADR-007: Report Views**（documentation）：Status: Planned README roadmap item 5 证据：`docs/decisions/007-report-views.md`\n- **ADR-008: Stateful Eval Scenarios**（documentation）：All 11 eval scenarios use argument-blind tool callables. get info query=\"rome\" returns the Paris canned string. check supplier supplier=\"anything\" routes through a fuzzy match but fundamentally returns a static blob. 
The only exception is error recovery , which validates a 4-digit format — a type check, not stateful behavior. 证据：`docs/decisions/008-stateful-eval-scenarios.md`\n- **ADR-009: BFCL Integration**（documentation）：Status: Implemented az/bfcl eval branch, Feb 2026 证据：`docs/decisions/009-bfcl-integration.md`\n- **ADR-010: ToolResolutionError**（documentation）：Status: Implemented az/tre branch, Mar 2026 证据：`docs/decisions/010-tool-resolution-error.md`\n- **ADR-011: Guardrail Middleware — Composable Reliability Without Loop Ownership**（documentation）：ADR-011: Guardrail Middleware — Composable Reliability Without Loop Ownership 证据：`docs/decisions/011-guardrail-middleware.md`\n- **ADR-012: OpenAI-Compatible Proxy Server**（documentation）：ADR-012: OpenAI-Compatible Proxy Server 证据：`docs/decisions/012-openai-proxy.md`\n- **ADR-013: Text Response Intent -- When the Model Chooses Not to Call Tools**（documentation）：ADR-013: Text Response Intent -- When the Model Chooses Not to Call Tools 证据：`docs/decisions/013-text-response-intent.md`\n- **ADR-014: Recommended sampling — opt-in flag and proxy pass-through**（documentation）：ADR-014: Recommended sampling — opt-in flag and proxy pass-through 证据：`docs/decisions/014-recommended-sampling-opt-in.md`\n- **Multi-Model Routing — Concept Doc**（documentation）：Allow forge to manage multiple model backends simultaneously and expose them as named clients to the consumer. Forge handles the pool lifecycle, health, budgets . The consumer handles orchestration which workflow uses which model, when to swap, event dispatch . 证据：`docs/decisions/MULTI_MODEL_ROUTING.md`\n- **Forge Eval Reports**（documentation）：For model and backend recommendations, see Model Guide ../MODEL GUIDE.md . 
证据：`docs/results/index.md`\n- **Forge Eval — Native vs Prompt llama-server**（documentation）：Forge Eval — Native vs Prompt llama-server 证据：`docs/results/raw/native-vs-prompt.md`\n- **Forge Eval — Reforged vs Bare**（documentation）：claude-haiku-4-5-20251001 anthropic/native 证据：`docs/results/raw/reforged-vs-bare.md`\n- **Forge Eval — Reforged Leaderboard**（documentation）：Scr=score correct/total , Acc=accuracy correct/total, excl validate errors , Cmp=completeness completed/total , Eff=efficiency ideal/actual calls , Wst=avg wasted calls, Spd=avg time excl compaction rel=relevance detection, arg=argument fidelity, tsl=tool selection, b2s=basic 2step, s3s=sequential 3step, crt=conditional routing, srn=sequential reasoning, err=error recovery, dgr=data gap recovery, dge=data gap recovery extended, art=argument transformation, grs=grounded synthesis, iar=inconsistent api recovery, rel s=relevance detection stateful, arg s=argument fidelity stateful, tsl s=tool selection stateful, b2s s=basic 2step stateful, s3s s=sequential 3step stateful, crt s=conditional rou… 证据：`docs/results/raw/reforged/all.md`\n- **Forge Eval — Reforged by Backend**（documentation）：Scr=score correct/total , Acc=accuracy correct/total, excl validate errors , Cmp=completeness completed/total , Eff=efficiency ideal/actual calls , Wst=avg wasted calls, Spd=avg time excl compaction rel=relevance detection, arg=argument fidelity, tsl=tool selection, b2s=basic 2step, s3s=sequential 3step, crt=conditional routing, srn=sequential reasoning, err=error recovery, dgr=data gap recovery, dge=data gap recovery extended, art=argument transformation, grs=grounded synthesis, iar=inconsistent api recovery, rel s=relevance detection stateful, arg s=argument fidelity stateful, tsl s=tool selection stateful, b2s s=basic 2step stateful, s3s s=sequential 3step stateful, crt s=conditional rou… 证据：`docs/results/raw/reforged/by-backend.md`\n- **Forge Eval — Reforged by Model Family**（documentation）：Forge Eval — Reforged by 
Model Family 证据：`docs/results/raw/reforged/by-family.md`\n- **Changelog**（documentation）：All notable changes to forge are documented here. 证据：`CHANGELOG.md`\n- **Eval Rigs**（structured_config）：{ \"rig-00\": {\"gpu\": \"RTX 5070\", \"platform\": \"windows\"}, \"rig-01\": {\"gpu\": \"RTX 5070 Ti\", \"platform\": \"linux/ubuntu24.04\"}, \"rig-02\": {\"gpu\": \"2x RTX 5070 Ti\", \"platform\": \"linux/ubuntu24.04\"}, \"rig-03\": {\"gpu\": \"AMD Strix Halo APU 128GB unified \", \"platform\": \"linux/fedora43\"} } 证据：`eval_rigs.json`\n- **Tsconfig.App**（structured_config）：{ \"compilerOptions\": { \"tsBuildInfoFile\": \"./node modules/.tmp/tsconfig.app.tsbuildinfo\", \"target\": \"ES2022\", \"useDefineForClassFields\": true, \"lib\": \"ES2022\", \"DOM\", \"DOM.Iterable\" , \"module\": \"ESNext\", \"types\": \"vite/client\" , \"skipLibCheck\": true, 证据：`tests/eval/dashboard/tsconfig.app.json`\n- **Tsconfig**（structured_config）：{ \"files\": , \"references\": { \"path\": \"./tsconfig.app.json\" }, { \"path\": \"./tsconfig.node.json\" } } 证据：`tests/eval/dashboard/tsconfig.json`\n- **Tsconfig.Node**（structured_config）：{ \"compilerOptions\": { \"tsBuildInfoFile\": \"./node modules/.tmp/tsconfig.node.tsbuildinfo\", \"target\": \"ES2023\", \"lib\": \"ES2023\" , \"module\": \"ESNext\", \"types\": \"node\" , \"skipLibCheck\": true, 证据：`tests/eval/dashboard/tsconfig.node.json`\n- **Normalize line endings**（source_file）：Normalize line endings text=auto .py text eol=lf .md text eol=lf .toml text eol=lf .yml text eol=lf .yaml text eol=lf eval results.jsonl filter=lfs diff=lfs merge=lfs -text eval results rig .jsonl filter=lfs diff=lfs merge=lfs -text 证据：`.gitattributes`\n- **Python**（source_file）：Python pycache / .py cod .egg-info/ dist/ build/ .egg 证据：`.gitignore`\n- **Codecov**（source_file）：comment: false 证据：`codecov.yml`\n- **Eval Results**（source_file）：version https://git-lfs.github.com/spec/v1 oid sha256:b4393d257ba3e22c5bac7b4cf7ab9431f85bdd2aafe662f1ffb118a73203bc2c size 67078449 
证据：`eval_results.jsonl`\n- **=====================================================================**（source_file）：\"\"\"Using forge's guardrail middleware in your own agentic loop. 证据：`examples/foreign_loop.py`\n- **Integration-only: lifecycle orchestration requiring real backends/threads.**（source_file）：build-system requires = \"hatchling\" build-backend = \"hatchling.build\" 证据：`pyproject.toml`\n- **Pinned-in-time translation tables. These are the GGUF MAP and LLAMAFILE MAP**（source_file）：\"\"\"One-shot migration: rewrite llamaserver/llamafile rows to GGUF-stem identity. 证据：`scripts/migrate_eval_jsonl_gguf_identity.py`\n- **Rig-00 plan for the model-params re-run: all Qwen3 variants × all backends,**（source_file）：\"\"\"Unattended ablation study runner. 证据：`scripts/run_ablation.py`\n- **Read request**（source_file）：\"\"\"Smoke test for the proxy — starts proxy in external mode against a mock backend, sends one request, verifies the response. 证据：`scripts/smoke_test_proxy.py`\n\n## 宿主 AI 必须遵守的规则\n\n- **把本资产当作开工前上下文，而不是运行环境。**：AI Context Pack 只包含证据化项目理解，不包含目标项目的可执行状态。 证据：`README.md`, `CONTRIBUTING.md`, `tests/eval/dashboard/package.json`\n- **回答用户时区分可预览内容与必须安装后才能验证的内容。**：安装前体验的消费者价值来自降低误装和误判，而不是伪装成真实运行。 证据：`README.md`, `CONTRIBUTING.md`, `tests/eval/dashboard/package.json`\n\n## 用户开工前应该回答的问题\n\n- 你准备在哪个宿主 AI 或本地环境中使用它？\n- 你只是想先体验工作流，还是准备真实安装？\n- 你最在意的是安装成本、输出质量、还是和现有规则的冲突？\n\n## 验收标准\n\n- 所有能力声明都能回指到 evidence_refs 中的文件路径。\n- AI_CONTEXT_PACK.md 没有把预览包装成真实运行。\n- 用户能在 3 分钟内看懂适合谁、能做什么、如何开始和风险边界。\n\n---\n\n## Doramagic Context Augmentation\n\n下面内容用于强化 Repomix/AI Context Pack 主体。Human Manual 只提供阅读骨架；踩坑日志会被转成宿主 AI 必须遵守的工作约束。\n\n## Human Manual 骨架\n\n使用规则：这里只是项目阅读路线和显著性信号，不是事实权威。具体事实仍必须回到 repo evidence / Claim Graph。\n\n宿主 AI 硬性规则：\n- 不得把页标题、章节顺序、摘要或 importance 当作项目事实证据。\n- 解释 Human Manual 骨架时，必须明确说它只是阅读路线/显著性信号。\n- 能力、安装、兼容性、运行状态和风险判断必须引用 repo evidence、source path 或 Claim Graph。\n\n- **Introduction to Forge**：importance `high`\n  - source_paths: README.md, 
src/forge/__init__.py\n- **Installation Guide**：importance `high`\n  - source_paths: README.md, pyproject.toml\n- **Quick Start Guide**：importance `high`\n  - source_paths: README.md, src/forge/core/workflow.py, src/forge/core/runner.py\n- **System Architecture**：importance `high`\n  - source_paths: docs/ARCHITECTURE.md, src/forge/core/runner.py, src/forge/context/manager.py\n- **Module Structure and API**：importance `high`\n  - source_paths: src/forge/__init__.py, src/forge/core/messages.py, src/forge/core/workflow.py, src/forge/errors.py\n- **Architecture Decision Records**：importance `medium`\n  - source_paths: docs/decisions/001-ablation-framework.md, docs/decisions/011-guardrail-middleware.md, docs/decisions/013-text-response-intent.md\n- **WorkflowRunner and Agentic Loop**：importance `high`\n  - source_paths: src/forge/core/runner.py, src/forge/core/workflow.py, src/forge/core/inference.py, docs/WORKFLOW.md\n- **Guardrails Middleware for External Loops**：importance `high`\n  - source_paths: src/forge/guardrails/guardrails.py, src/forge/guardrails/__init__.py, examples/foreign_loop.py\n\n## Repo Inspection Evidence / 源码检查证据\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `f1b87b05b863c7d12927f3dbdbd716af2dc3ace1`\n- inspected_files: `pyproject.toml`, `README.md`, `docs/USER_GUIDE.md`, `docs/ARCHITECTURE.md`, `docs/EVAL_GUIDE.md`, `docs/MODEL_GUIDE.md`, `docs/BACKEND_SETUP.md`, `docs/WORKFLOW.md`, `docs/results/index.md`, `docs/decisions/006-tool-prerequisites.md`, `docs/decisions/002-anthropic-baseline.md`, `docs/decisions/MULTI_MODEL_ROUTING.md`, `docs/decisions/009-bfcl-integration.md`, `docs/decisions/010-tool-resolution-error.md`, `docs/decisions/013-text-response-intent.md`, `docs/decisions/003-thinking-label-ux.md`, `docs/decisions/004-async-on-chunk.md`, `docs/decisions/012-openai-proxy.md`, `docs/decisions/001-ablation-framework.md`, `docs/decisions/005-parallel-tool-calls.md`\n\n宿主 AI 硬性规则：\n- 没有 
repo_clone_verified=true 时，不得声称已经读过源码。\n- 没有 repo_inspection_verified=true 时，不得把 README/docs/package 文件判断写成事实。\n- 没有 quick_start_verified=true 时，不得声称 Quick Start 已跑通。\n\n## Doramagic Pitfall Constraints / 踩坑约束\n\n这些规则来自 Doramagic 发现、验证或编译过程中的项目专属坑点。宿主 AI 必须把它们当作工作约束，而不是普通说明文字。\n\n### Constraint 1: 来源证据：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_148dff87195e42549d0ffb88b99e9cbf | https://github.com/antoinezambelli/forge/issues/58 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 2: 来源证据：Investigate: integration paths with Hermes Agent\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Investigate: integration paths with Hermes Agent\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_e3cbd2d1c9a84a1887887bf24b036865 | https://github.com/antoinezambelli/forge/issues/51 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 3: 来源证据：Per-model recommended sampling defaults (map keyed by HF model cards)\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Per-model recommended sampling defaults (map keyed by HF model cards)\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能阻塞安装或首次运行。\n- Evidence: community_evidence:github | cevd_057ca2af912e4a608259ffb2a3654d4f | https://github.com/antoinezambelli/forge/issues/59 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 4: 来源证据：Rescue-parse ChatGPT-style XML tool calls\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Rescue-parse ChatGPT-style XML tool calls\n- Host AI 
rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_471c674c8d73451da75d6b8c9349aabf | https://github.com/antoinezambelli/forge/issues/55 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 5: 来源证据：Proxy external mode hardcodes native FC — no prompt-injection fallback\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：Proxy external mode hardcodes native FC — no prompt-injection fallback\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_f3a85ec8447a4838b3bc4c846cd9e7a0 | https://github.com/antoinezambelli/forge/issues/53 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 6: 能力判断依赖假设\n\n- Trigger: README/documentation is current enough for a first validation pass.\n- Host AI rule: 将假设转成下游验证清单。\n- Why it matters: 假设不成立时，用户拿不到承诺的能力。\n- Evidence: capability.assumptions | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | README/documentation is current enough for a first validation pass.\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 7: 维护活跃度未知\n\n- Trigger: 未记录 last_activity_observed。\n- Host AI rule: 补 GitHub 最近 commit、release、issue/PR 响应信号。\n- Why it matters: 新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- Evidence: evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | last_activity_observed missing\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 8: 下游验证发现风险项\n\n- Trigger: no_demo\n- Host AI rule: 进入安全/权限治理复核队列。\n- Why it matters: 下游已经要求复核，不能在页面中弱化。\n- Evidence: downstream_validation.risk_items | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | no_demo; severity=medium\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 9: 存在评分风险\n\n- Trigger: no_demo\n- Host AI 
rule: 把风险写入边界卡，并确认是否需要人工复核。\n- Why it matters: 风险会影响是否适合普通用户安装。\n- Evidence: risks.scoring_risks | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | no_demo; severity=medium\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 10: 来源证据：Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能导致 AMD 统一内存设备回退到 4K 的 Ollama 预算，影响首次运行效果。\n- Evidence: community_evidence:github | cevd_4ad226a6d1fa4a5f89fa7702bec11188 | https://github.com/antoinezambelli/forge/issues/61 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n",
      "summary": "给宿主 AI 的上下文和工作边界。",
      "title": "AI Context Pack / 带给我的 AI"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card / 安装前决策卡\n\n项目：antoinezambelli/forge\n\n## Doramagic 试用结论\n\n当前结论：可以进入发布前推荐检查；首次使用仍应从最小权限、临时目录和可回滚配置开始。\n\n## 用户现在可以做\n\n- 可以先阅读 Human Manual，理解项目目的和主要工作流。\n- 可以复制 Prompt Preview 做安装前体验；这只验证交互感，不代表真实运行。\n- 可以把官方 Quick Start 命令放到隔离环境中验证，不要直接进主力环境。\n\n## 现在不要做\n\n- 不要把 Prompt Preview 当成项目实际运行结果。\n- 不要把 metadata-only validation 当成沙箱安装验证。\n- 不要把未验证能力写成“已支持、已跑通、可放心安装”。\n- 不要在首次试用时交出生产数据、私人文件、真实密钥或主力配置目录。\n\n## 安装前检查\n\n- 宿主 AI 是否匹配：chatgpt\n- 官方安装入口状态：已发现官方入口\n- 是否在临时目录、临时宿主或容器中验证：必须是\n- 是否能回滚配置改动：必须能\n- 是否需要 API Key、网络访问、读写文件或修改宿主配置：未确认前按高风险处理\n- 是否记录了安装命令、实际输出和失败日志：必须记录\n\n## 当前阻塞项\n\n- 无阻塞项。\n\n## 项目专属踩坑\n\n- 来源证据：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：Investigate: integration paths with Hermes Agent（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：Per-model recommended sampling defaults (map keyed by HF model cards)（medium）：可能阻塞安装或首次运行。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：Rescue-parse ChatGPT-style XML tool calls（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：Proxy external mode hardcodes native FC — no prompt-injection fallback（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n\n## 风险与权限提示\n\n- no_demo: medium\n\n## 证据缺口\n\n- 暂未发现结构化证据缺口。\n",
      "summary": "安装、权限、验证和推荐前风险。",
      "title": "Boundary & Risk Card / 边界与风险卡"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/antoinezambelli/forge 项目说明书\n\n生成时间：2026-05-19 20:57:03 UTC\n\n## 目录\n\n- [Introduction to Forge](#page-introduction)\n- [Installation Guide](#page-installation)\n- [Quick Start Guide](#page-quickstart)\n- [System Architecture](#page-architecture)\n- [Module Structure and API](#page-module-structure)\n- [Architecture Decision Records](#page-adr-index)\n- [WorkflowRunner and Agentic Loop](#page-workflowrunner)\n- [Guardrails Middleware for External Loops](#page-guardrails-middleware)\n- [Proxy Server Setup](#page-proxy-server)\n- [Backend Clients](#page-backend-clients)\n- [Backend Setup Guide](#page-backend-setup)\n- [Model Selection Guide](#page-model-guide)\n\n<a id='page-introduction'></a>\n\n## Introduction to Forge\n\n### 相关页面\n\n相关主题：[System Architecture](#page-architecture), [Quick Start Guide](#page-quickstart)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [src/forge/errors.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/errors.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n</details>\n\n# Introduction to Forge\n\nForge is a framework-agnostic LLM orchestration library that provides structured tool-calling workflows, guardrail enforcement, and context management for building reliable AI agents. 
It supports multiple LLM backends (Ollama, Llamafile, llama.cpp, and the Anthropic API) and exposes both high-level runners and granular APIs for embedding into foreign orchestration loops.\n\n## Overview\n\nForge addresses the core challenges of LLM-based agent development:\n\n| Challenge | Forge Solution |\n|-----------|----------------|\n| Unreliable tool parsing | Rescue parsing with retry mechanisms |\n| Missing required steps | StepEnforcer validates call sequences |\n| Context overflow | Tiered context compaction strategies |\n| Model-specific sampling | Per-model verified sampling defaults |\n| Multi-backend support | Unified client abstraction layer |\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Core Architecture\n\nForge follows a layered architecture with clear separation of concerns:\n\n```mermaid\ngraph TD\n    subgraph \"User Layer\"\n        A[User Workflow] --> B[WorkflowRunner]\n    end\n    \n    subgraph \"Core Layer\"\n        B --> C[ContextManager]\n        B --> D[LLMClient]\n        B --> E[Guardrails]\n    end\n    \n    subgraph \"Client Layer\"\n        D --> F[OllamaClient]\n        D --> G[LlamafileClient]\n        D --> H[AnthropicClient]\n    end\n    \n    subgraph \"Backend Layer\"\n        F --> I[Ollama Backend]\n        G --> J[Llamafile Backend]\n        H --> K[Anthropic API]\n    end\n```\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n### Project Structure\n\n```\nsrc/forge/           # Library source\n  clients/           # LLM backend adapters (one per backend)\n  core/              # Workflow, runner, messages, steps\n  context/           # Context management and compaction\n  prompts/           # Prompt templates and nudges\n  guardrails/        # Response validation and step enforcement\n  proxy/             # OpenAI-compatible proxy server\n```\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Quick 
Start\n\nThe fundamental building blocks of a Forge workflow:\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import (\n    Workflow, ToolDef, ToolSpec,\n    WorkflowRunner, OllamaClient,\n    ContextManager, TieredCompact,\n)\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. Use the available tools to answer the user.\",\n)\n\nasync def main():\n    client = OllamaClient(model=\"ministral-3:8b-instruct-2512-q4_K_M\", recommended_sampling=True)\n    ctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Core Components\n\n### Workflow\n\nThe `Workflow` class defines the structure and constraints of an agent task:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `name` | `str` | Workflow identifier |\n| `description` | `str` | Human-readable description |\n| `tools` | `dict[str, ToolDef]` | Tool definitions keyed by name |\n| `required_steps` | `list[str]` | Tools that must precede terminal tool |\n| `terminal_tool` | `str` | Tool(s) that can end the workflow 
|\n\n资料来源：[src/forge/core/workflow.py:1-50](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n\n### ToolDef\n\nBinds a tool schema to its implementation:\n\n```python\n@dataclass\nclass ToolDef:\n    spec: ToolSpec\n    callable: Callable[..., Any]\n    prerequisites: list[str | dict[str, str]] = field(default_factory=list)\n```\n\nThe `prerequisites` field supports:\n- **String entries**: Name-only requirements (\"read_file\" — any prior call satisfies)\n- **Dict entries**: Arg-matched requirements (`{\"tool\": \"read_file\", \"match_arg\": \"path\"}`)\n\n资料来源：[src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n\n### LLM Clients\n\nForge provides backend-specific clients:\n\n| Client | Backend | Features |\n|--------|---------|----------|\n| `OllamaClient` | Ollama API | recommended_sampling, async streaming |\n| `LlamafileClient` | Llamafile binary | Context length detection, reasoning extraction |\n| `AnthropicClient` | Anthropic API | Native Claude support |\n\nAll clients support `send()` and `send_stream()` methods with sampling parameter overrides:\n\n```python\n# Instance-level sampling\nclient = OllamaClient(model=\"qwen3:8b\", recommended_sampling=True)\n\n# Per-call override (merged without mutation)\nawait client.send(messages, sampling={\"temperature\": 0.5})\n```\n\n资料来源：[src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n\n### Per-Model Sampling Defaults\n\nThe `sampling_defaults` module provides verified per-model sampling parameters sourced from HuggingFace model cards:\n\n```python\ndef get_sampling_defaults(model: str) -> dict[str, float | int]:\n    \"\"\"Pure lookup - returns copy of map value or {} for unknown models.\"\"\"\n    return dict(MODEL_SAMPLING_DEFAULTS.get(model, {}))\n\ndef apply_sampling_defaults(model: str, *, strict: bool) -> dict[str, float | int]:\n    
\"\"\"Policy layer with four-quadrant behavior.\"\"\"\n```\n\n| strict | model in map | behavior |\n|--------|--------------|----------|\n| True | yes | return dict |\n| True | no | raise `UnsupportedModelError` |\n| False | yes | one-shot INFO log; return `{}` |\n| False | no | return `{}` (silent) |\n\n资料来源：[src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n\n## Guardrails System\n\nForge's guardrails provide multi-layered response validation:\n\n```mermaid\ngraph LR\n    A[LLM Response] --> B[ResponseValidator]\n    B --> C{Valid ToolCall?}\n    C -->|No| D[Rescue Parsing]\n    D --> E{Rescued?}\n    E -->|Yes| F[Return ToolCall]\n    E -->|No| G[Retry Nudge]\n    C -->|Yes| H[StepEnforcer]\n    H --> I{Valid Sequence?}\n    I -->|Yes| J[ErrorTracker]\n    I -->|No| K[Step Blocked Nudge]\n    J --> L{Error Limit?}\n    L -->|Yes| M[Fatal Error]\n    L -->|No| N[Execute]\n```\n\n资料来源：[src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n\n### Guardrails API\n\n```python\nclass Guardrails:\n    def __init__(\n        self,\n        tool_names: list[str],\n        terminal_tool: str | frozenset[str],\n        required_steps: list[str] | None = None,\n        max_retries: int = 3,\n        max_tool_errors: int = 2,\n        rescue_enabled: bool = True,\n        max_premature_attempts: int = 3,\n        retry_nudge: Callable[[str], str] | None = None,\n    ) -> None:\n```\n\n| Parameter | Default | Description |\n|-----------|---------|-------------|\n| `max_retries` | 3 | Consecutive bad responses before fatal |\n| `max_tool_errors` | 2 | Tool execution failures before exhaustion |\n| `max_premature_attempts` | 3 | Premature terminal attempts before fatal |\n| `rescue_enabled` | True | Parse tool calls from plain text |\n| `retry_nudge` | None | Custom nudge for bare text responses 
|\n\n资料来源：[src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n\n### CheckResult Actions\n\n```python\n@dataclass\nclass CheckResult:\n    action: Literal[\"execute\", \"retry\", \"step_blocked\", \"fatal\"]\n    tool_calls: list[ToolCall] | None = None\n    nudge: Nudge | None = None  # Set when action is \"retry\" or \"step_blocked\"\n    reason: str | None = None  # Only when action == \"fatal\"\n```\n\n资料来源：[src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n\n## Context Management\n\nThe `ContextManager` handles token budget enforcement and message compaction:\n\n| Strategy | Purpose |\n|----------|---------|\n| `TieredCompact` | Keeps recent N messages, compacts older ones |\n| Custom strategies | Pluggable compaction algorithms |\n\n```python\nctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n```\n\n资料来源：[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Server Manager\n\nFor local GGUF model execution, `ServerManager` handles backend lifecycle:\n\n```python\nserver, ctx = await forge.serve(\n    backend=\"llamaserver\",\n    gguf_path=\"/path/to/model.gguf\",\n    mode=BudgetMode.FORGE_FAST,\n    client=client,\n)\nrunner = WorkflowRunner(client=client, context_manager=ctx)\n```\n\n| Parameter | Description |\n|-----------|-------------|\n| `backend` | \"ollama\", \"llamaserver\", or \"llamafile\" |\n| `gguf_path` | Path to GGUF file (not for Ollama) |\n| `mode` | Budget mode (FORGE_FAST, FORGE_BALANCED, FORGE_DEEP) |\n| `n_slots` | Concurrent slots for multi-agent |\n| `kv_unified` | Single shared KV cache across slots |\n\n资料来源：[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Error Handling\n\nForge defines a structured exception hierarchy:\n\n```python\nclass ForgeError(Exception):  
# Base exception\n    pass\n\nclass UnsupportedModelError(ForgeError):\n    \"\"\"raised when recommended_sampling=True for unknown model.\"\"\"\n    \nclass ToolCallError(ForgeError):\n    \"\"\"LLM failed to produce valid tool call after retries.\"\"\"\n    \nclass ToolExecutionError(ForgeError):\n    \"\"\"Tool callable raised during execution.\"\"\"\n```\n\n资料来源：[src/forge/errors.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/errors.py)\n\n## Foreign Loop Integration\n\nForge can be embedded into existing orchestration systems using the `Guardrails` middleware API:\n\n```python\nfrom forge.guardrails import Guardrails\n\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\ndef handle_response(response):\n    result = guardrails.check(response)\n    \n    if result.action == \"fatal\":\n        return f\"FATAL: {result.reason}\"\n    \n    if result.action in (\"retry\", \"step_blocked\"):\n        return f\"{result.action}: {result.nudge.content[:80]}...\"\n    \n    # Execute tools, then record\n    executed = [tc.tool for tc in result.tool_calls]\n    done = guardrails.record(executed)\n    return f\"executed {executed}\" + (\" -- DONE\" if done else \"\")\n```\n\n资料来源：[examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n### Granular Component Access\n\nFor fine-grained control, access components directly:\n\n```python\nfrom forge.guardrails import ErrorTracker, ResponseValidator, StepEnforcer\n\nvalidator = ResponseValidator(tool_names=[\"search\", \"lookup\", \"answer\"], rescue_enabled=True)\nenforcer = StepEnforcer(required_steps=[\"search\", \"lookup\"], terminal_tool=\"answer\")\nerrors = ErrorTracker(max_retries=3, max_tool_errors=2)\n```\n\n资料来源：[examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n## Code Style and 
Requirements\n\n| Requirement | Details |\n|-------------|---------|\n| Python | 3.12+ |\n| Async | `asyncio` throughout — all client methods and runner are async |\n| Type Safety | Pydantic for tool parameter schemas |\n| Type Syntax | Modern unions with `\\|` |\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Running Tests\n\n```bash\n# Full suite (865 tests)\npython -m pytest tests/unit/ -v --tb=short\n\n# With coverage\npython -m pytest tests/unit/ --cov=forge --cov-report=term-missing\n\n# Skip integration tests (require live backend)\npython -m pytest tests/ -m \"not integration\"\n```\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n---\n\n<a id='page-installation'></a>\n\n## Installation Guide\n\n### 相关页面\n\n相关主题：[Backend Setup Guide](#page-backend-setup), [Quick Start Guide](#page-quickstart)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n- [CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n</details>\n\n# Installation Guide\n\nThis guide covers all aspects of setting up the **forge** framework, from basic installation to advanced configuration for different LLM backends.\n\n## Overview\n\nForge is a Python library for building LLM-powered workflows with automatic tool calling, context management, and multi-backend support. The installation process involves:\n\n1. Python environment setup (Python 3.12+)\n2. 
Package installation via pip\n3. LLM backend selection and configuration\n4. Optional development environment for contributors\n\n**资料来源：[CONTRIBUTING.md:1-15]()**\n\n## Prerequisites\n\n### System Requirements\n\n| Component | Requirement |\n|-----------|-------------|\n| Python | 3.12+ |\n| OS | Linux, macOS, Windows |\n| LLM Backend | Ollama, llama-server, or llamafile |\n| VRAM | Varies by model (8GB minimum for small models) |\n\n### Backend Options\n\nForge supports three LLM backends:\n\n| Backend | Description | Use Case |\n|---------|-------------|----------|\n| **Ollama** | Local model management with simple API | Quick setup, model management |\n| **llama-server** | llama.cpp server binary | Production, GGUF files |\n| **llamafile** | Single-file executable models | Distribution, portability |\n\nEach backend requires different installation steps:\n\n- **Ollama**: Install Ollama, then fetch models with `ollama pull`\n- **llama-server**: Download llama.cpp server binary\n- **llamafile**: Download pre-built executables or convert GGUF files\n\n**资料来源：[src/forge/server.py:1-50]()**\n\n## Installation Methods\n\n### Standard Installation\n\nFor end users wanting to use forge as a library:\n\n```bash\npip install forge-guardrails\n```\n\nThis installs the core package with all necessary dependencies. The PyPI distribution is named `forge-guardrails`; the import name is `forge`.\n\n### Development Installation\n\nFor contributors and those wanting to modify the codebase:\n\n```bash\ngit clone https://github.com/antoinezambelli/forge.git\ncd forge\npython -m venv .venv\nsource .venv/bin/activate  # Linux/macOS\n# or: .venv\\Scripts\\activate  # Windows\npip install -e \".[dev]\"\n```\n\n**资料来源：[CONTRIBUTING.md:7-14]()**\n\nThe `.[dev]` extras include:\n\n- Testing dependencies (`pytest`)\n- Development tools\n- Documentation build tools\n\n### Package Configuration\n\nThe project uses `pyproject.toml` for dependency management:\n\n```toml\n[project]\nname = \"forge-guardrails\"\nversion = \"0.5.0\"\nrequires-python = \">=3.12\"\n\n[project.optional-dependencies]\ndev = [\n    
\"pytest>=7.0\",\n    \"pytest-asyncio\",\n    \"pytest-cov\",\n]\n```\n\n## Environment Setup\n\n### Virtual Environment\n\nUsing a virtual environment is **strongly recommended** to avoid dependency conflicts:\n\n```bash\npython -m venv forge-env\nsource forge-env/bin/activate\n```\n\n### Backend Installation\n\n#### Ollama Setup\n\n1. Install Ollama from [ollama.ai](https://ollama.ai)\n2. Pull a model:\n\n```bash\nollama pull ministral-3:8b-instruct-2512-q4_K_M\n```\n\n3. Verify the installation:\n\n```bash\nollama list\n```\n\n#### llama-server Setup\n\n1. Download the llama.cpp server binary for your platform\n2. Place the binary in your models directory or system PATH\n3. Verify with:\n\n```bash\n./llama-server --help\n```\n\n#### llamafile Setup\n\n1. Download a pre-built llamafile (e.g., from TheBloke)\n2. Make it executable:\n\n```bash\nchmod +x model-name.Q4_K_M.llamafile\n```\n\n**资料来源：[src/forge/server.py:180-220]()**\n\n## Configuration Flow\n\n```mermaid\ngraph TD\n    A[Install forge] --> B{Use Case}\n    B -->|Library| C[pip install forge-guardrails]\n    B -->|Development| D[\"pip install -e .[dev]\"]\n    C --> E[Choose Backend]\n    D --> E\n    E -->|Ollama| F[Install Ollama + Pull Model]\n    E -->|llama-server| G[Download llama.cpp binary]\n    E -->|llamafile| H[Download llamafile]\n    F --> I[Verify with test script]\n    G --> I\n    H --> I\n```\n\n## Quick Start Verification\n\nAfter installation, verify your setup with this minimal example:\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import (\n    Workflow, ToolDef, ToolSpec,\n    WorkflowRunner, OllamaClient,\n    ContextManager, TieredCompact,\n)\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n          
  spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. Use the available tools to answer the user.\",\n)\n\nasync def main():\n    client = OllamaClient(model=\"ministral-3:8b-instruct-2512-q4_K_M\", recommended_sampling=True)\n    ctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n**资料来源：[README.md:1-60]()**\n\n## Advanced Backend Configuration\n\n### KV Cache Quantization\n\nForge supports KV cache quantization to reduce VRAM usage:\n\n| Setting | VRAM Savings | Quality Impact |\n|---------|--------------|----------------|\n| `q8_0` | ~50% vs F16 | Minimal |\n| `q4_0` | ~75% vs F16 | Low |\n\n```python\n# In setup_backend call\nfrom forge.server import setup_backend, BudgetMode\n\nserver, ctx = await setup_backend(\n    backend=\"llamaserver\",\n    gguf_path=\"/path/to/model.gguf\",\n    budget_mode=BudgetMode.FORGE_FAST,\n    cache_type_k=\"q4_0\",  # Key cache quantization\n    cache_type_v=\"q4_0\",   # Value cache quantization\n)\n```\n\n**资料来源：[src/forge/server.py:20-45]()**\n\n### Multi-Slot Configuration\n\nFor multi-agent architectures:\n\n```python\nserver, ctx = await setup_backend(\n    backend=\"llamaserver\",\n    gguf_path=\"/path/to/model.gguf\",\n    n_slots=4,           # Number of concurrent slots\n    kv_unified=True,     # Shared KV cache pool\n)\n```\n\nWhen `kv_unified=True`, all slots share a single KV cache pool, allowing each slot to use the full context window.\n\n**资料来源：[src/forge/server.py:40-55]()**\n\n## Recommended Sampling Parameters\n\nForge provides 
recommended sampling defaults for specific models:\n\n```python\nclient = OllamaClient(\n    model=\"qwen3:8b-q4_K_M\",\n    recommended_sampling=True  # Enable recommended defaults\n)\n```\n\nThe `recommended_sampling=True` parameter enables tuned temperature, top_p, top_k, and other sampling parameters sourced from HuggingFace model cards.\n\n**资料来源：[src/forge/clients/sampling_defaults.py:1-80]()**\n\n## Testing Your Installation\n\n### Unit Tests\n\nRun the deterministic unit test suite (no backend required):\n\n```bash\npython -m pytest tests/unit/ -v --tb=short\n```\n\n**资料来源：[CONTRIBUTING.md:18-25]()**\n\n### Integration Tests\n\nIntegration tests require a running backend; when none is available, exclude them with the `integration` marker:\n\n```bash\n# Skip integration tests\npython -m pytest tests/ -m \"not integration\"\n\n# Run with coverage\npython -m pytest tests/unit/ --cov=forge --cov-report=term-missing\n```\n\n## Troubleshooting\n\n### Common Issues\n\n| Issue | Solution |\n|-------|----------|\n| `ModuleNotFoundError: forge` | Run `pip install forge-guardrails` or check virtual environment |\n| Backend connection refused | Verify backend is running on correct port |\n| Model not found (Ollama) | Run `ollama pull <model-name>` |\n| VRAM out of memory | Enable KV cache quantization or use smaller model |\n\n### Backend Health Check\n\nVerify backend connectivity:\n\n```python\nimport httpx\nimport asyncio\n\nasync def check_backend():\n    try:\n        async with httpx.AsyncClient(timeout=5.0) as client:\n            resp = await client.get(\"http://localhost:8080/props\")\n            if resp.status_code == 200:\n                print(\"Backend is ready\")\n    except httpx.ConnectError:\n        print(\"Backend not reachable\")\n```\n\n**资料来源：[src/forge/server.py:250-280]()**\n\n## Next Steps\n\nAfter installation, refer to:\n\n- [User Guide](docs/USER_GUIDE.md) — Complete workflow creation and execution\n- [Backend Setup Guide](docs/BACKEND_SETUP.md) — Detailed backend configuration\n- [Model Guide](docs/MODEL_GUIDE.md) — 
Recommended models by hardware tier\n- [Architecture Decision Records](docs/decisions/) — Design rationale documentation\n\n---\n\n<a id='page-quickstart'></a>\n\n## Quick Start Guide\n\n### 相关页面\n\n相关主题：[WorkflowRunner and Agentic Loop](#page-workflowrunner), [System Architecture](#page-architecture)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/core/runner.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/runner.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n</details>\n\n# Quick Start Guide\n\nThis guide provides a practical introduction to **Forge**, a reliability layer for self-hosted LLM tool-calling. Forge elevates an 8B local model to top-tier performance on multi-step agentic workflows through guardrails (rescue parsing, retry nudges, step enforcement) and context management (VRAM-aware budgets, tiered compaction).\n\n## Purpose and Scope\n\nThe Quick Start Guide covers:\n\n- **Installation and environment setup** for local LLM backends\n- **Core concepts** including Workflows, ToolDefs, and the WorkflowRunner\n- **Basic usage patterns** for single-step and multi-step agentic workflows\n- **Integration options** for foreign orchestration loops\n- **Backend management** with auto-start capabilities\n\nForge targets developers building agentic applications that require structured tool-calling with local models. 
It works with llama.cpp-based backends (llama-server, llamafile) and Ollama.\n\n资料来源：[README.md:1-20](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Installation\n\n### Prerequisites\n\n| Requirement | Version | Notes |\n|-------------|---------|-------|\n| Python | 3.12+ | Modern syntax required (type unions with `\\|`) |\n| pip | Latest | For package installation |\n| LLM Backend | llama.cpp / Ollama | For inference |\n\n### Setup Commands\n\n```bash\ngit clone https://github.com/antoinezambelli/forge.git\ncd forge\npython -m venv .venv\nsource .venv/bin/activate  # Linux/macOS\n# or: .venv\\Scripts\\activate  # Windows\npip install -e \".[dev]\"\n```\n\n资料来源：[CONTRIBUTING.md:1-15](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Core Concepts\n\n### Architecture Overview\n\n```mermaid\ngraph TD\n    A[User Input] --> B[WorkflowRunner]\n    B --> C[LLM Client]\n    C --> D[Tool Call Response]\n    D --> E[Guardrails Check]\n    E -->|execute| F[Tool Execution]\n    F --> G[Context Manager]\n    G -->|compact| B\n    E -->|retry| C\n    E -->|fatal| H[Error Handling]\n```\n\n### Workflow\n\nThe `Workflow` is the central definition for an agentic task. 
It binds together:\n\n| Component | Type | Purpose |\n|-----------|------|---------|\n| `name` | `str` | Workflow identifier |\n| `description` | `str` | Human-readable description |\n| `tools` | `dict[str, ToolDef]` | Tool name → definition mapping |\n| `required_steps` | `list[str]` | Tools that must execute before terminal |\n| `terminal_tool` | `str` | Tool that ends the workflow |\n| `system_prompt_template` | `str` | System prompt for the LLM |\n\n资料来源：[src/forge/core/workflow.py:1-50](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n\n### ToolDef and ToolSpec\n\n**ToolSpec** defines the schema exposed to the LLM:\n\n```python\nclass ToolSpec(BaseModel):\n    name: str\n    description: str\n    parameters: type[BaseModel]  # Pydantic model\n```\n\n**ToolDef** binds the schema to its Python implementation:\n\n```python\n@dataclass\nclass ToolDef:\n    spec: ToolSpec\n    callable: Callable[..., Any]\n    prerequisites: list[str | dict[str, str]] = field(default_factory=list)\n```\n\nThe `prerequisites` field enables conditional dependencies:\n- `str`: \"if you call this tool, you must have called tool X first\"\n- `dict`: `{\"tool\": \"read_file\", \"match_arg\": \"path\"}` — arg-matched prerequisites\n\n资料来源：[src/forge/core/workflow.py:60-90](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n\n### WorkflowRunner\n\nThe `WorkflowRunner` manages the full lifecycle:\n\n- System prompt injection\n- Tool execution and result handling\n- Context compaction\n- Guardrail enforcement\n- Multi-turn conversation state\n\n```python\nclass WorkflowRunner:\n    def __init__(self, client, context_manager):\n        ...\n    \n    async def run(self, workflow, user_message):\n        ...\n```\n\n资料来源：[src/forge/core/runner.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/runner.py)\n\n### ContextManager and Budget Modes\n\nForge provides VRAM-aware context management through budget modes:\n\n| 
Mode | Behavior |\n|------|----------|\n| `BudgetMode.MANUAL` | User-specified token budget |\n| `BudgetMode.FORGE_FAST` | VRAM-optimized fast inference budget |\n| `BudgetMode.FORGE_DEEP` | Extended context for complex reasoning |\n\nThe `ContextManager` resolves budgets at runtime based on the backend:\n\n```python\nasync def resolve_budget(self, mode: BudgetMode, manual_tokens: int | None = None) -> int:\n    if mode == BudgetMode.MANUAL:\n        if self._backend == \"ollama\":\n            return manual_tokens\n        return await self.get_server_context()\n```\n\n资料来源：[src/forge/server.py:80-120](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Quick Start Example\n\n### Basic Single-Tool Workflow\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import (\n    Workflow, ToolDef, ToolSpec,\n    WorkflowRunner, OllamaClient,\n    ContextManager, TieredCompact,\n)\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. 
Use the available tools to answer the user.\",\n)\n\nasync def main():\n    client = OllamaClient(model=\"ministral-3:8b-instruct-2512-q4_K_M\", recommended_sampling=True)\n    ctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n资料来源：[README.md:20-60](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n### Key Components Explained\n\n| Component | Import | Purpose |\n|-----------|--------|---------|\n| `OllamaClient` | `forge` | LLM backend adapter for Ollama |\n| `TieredCompact` | `forge` | Context compaction strategy |\n| `ContextManager` | `forge` | Token budget management |\n| `WorkflowRunner` | `forge` | Orchestrates the agent loop |\n\n## Multi-Step Workflows\n\nFor workflows requiring sequential tool execution:\n\n```python\n# Define multi-step workflow with prerequisites\nworkflow = Workflow(\n    name=\"research_assistant\",\n    description=\"Research and answer questions\",\n    tools={\n        \"search\": ToolDef(spec=search_spec, callable=do_search),\n        \"lookup\": ToolDef(spec=lookup_spec, callable=do_lookup),\n        \"answer\": ToolDef(spec=answer_spec, callable=final_answer),\n    },\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n```\n\nThe `required_steps` list enforces that `search` and `lookup` must execute before `answer`. 
Attempting to call the terminal tool before the required steps have run is blocked with a `step_blocked` nudge instead of being executed.\n\n资料来源：[examples/foreign_loop.py:1-50](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n## Guardrails API\n\nFor foreign orchestration loops (non-WorkflowRunner usage), Forge provides standalone guardrails:\n\n### Simple API\n\n```python\nfrom forge.guardrails import Guardrails\n\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\ndef handle_response(response):\n    result = guardrails.check(response)\n    \n    if result.action == \"fatal\":\n        return f\"FATAL: {result.reason}\"\n    \n    if result.action in (\"retry\", \"step_blocked\"):\n        return f\"{result.action}: {result.nudge.content[:80]}...\"\n    \n    # Execute tools\n    executed = [tc.tool for tc in result.tool_calls]\n    done = guardrails.record(executed)\n    return f\"executed {executed}\" + (\" -- DONE\" if done else \"\")\n```\n\n### Granular API\n\nDirect access to individual components:\n\n```python\nfrom forge.guardrails import ErrorTracker, ResponseValidator, StepEnforcer\n\nvalidator = ResponseValidator(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    rescue_enabled=True,\n)\nenforcer = StepEnforcer(\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\nerrors = ErrorTracker(max_retries=3, max_tool_errors=2)\n```\n\n| Component | Purpose |\n|-----------|---------|\n| `ResponseValidator` | Parses tool calls from LLM responses, with optional rescue parsing |\n| `StepEnforcer` | Enforces required step sequence |\n| `ErrorTracker` | Tracks retry attempts and tool errors |\n\n资料来源：[examples/foreign_loop.py:80-150](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n## Respond Tool Pattern\n\nFor conversational turns where the model should respond directly:\n\n```python\nfrom forge.tools import RESPOND_TOOL_NAME, 
respond_spec\n\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\", RESPOND_TOOL_NAME],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\ndef handle_response_with_respond(response):\n    result = guardrails.check(response)\n    \n    # Check for respond() call\n    for tc in result.tool_calls:\n        if tc.tool == RESPOND_TOOL_NAME:\n            message = tc.args.get(\"message\", \"\")\n            return f\"MODEL SAYS: {message}\"\n    \n    # Normal tool execution\n    ...\n```\n\n资料来源：[examples/foreign_loop.py:160-200](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n## Backend Auto-Management\n\nForge can auto-start backends for multi-agent architectures:\n\n```python\nfrom forge.server import run_with_server\nfrom forge.clients import LlamafileClient, BudgetMode\n\nasync with run_with_server(\n    backend=\"llamafile\",\n    gguf_path=\"/path/to/model.gguf\",\n    budget_mode=BudgetMode.FORGE_FAST,\n) as (server, ctx):\n    client = LlamafileClient(model=\"my-model\")\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    # Run workflows...\n```\n\n### Backend Options\n\n| Backend | Model Source | GGUF Support |\n|---------|-------------|--------------|\n| `ollama` | Model name (e.g., `ministral-3:8b`) | No |\n| `llamaserver` | GGUF file path | Yes |\n| `llamafile` | GGUF file path | Yes |\n\n资料来源：[src/forge/server.py:1-80](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Recommended Sampling Parameters\n\nForge provides curated sampling defaults for supported models:\n\n```python\nfrom forge.clients import OllamaClient, get_sampling_defaults\n\n# Opt-in to recommended sampling\nclient = OllamaClient(\n    model=\"ministral-3:8b-q4_K_M\",\n    recommended_sampling=True  # Raises error for unknown models\n)\n```\n\n| Parameter | Source | Verification |\n|-----------|--------|--------------|\n| `temperature` | HF model cards | 
Per-model verification |\n| `top_p` | HF model cards | Per-model verification |\n| `top_k` | HF model cards | Per-model verification |\n| `min_p` | HF model cards | Per-model verification |\n| `repeat_penalty` | HF model cards | Per-model verification |\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-50](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n\n## Testing\n\n### Unit Tests (No Backend Required)\n\n```bash\n# Full suite (865 tests)\npython -m pytest tests/unit/ -v --tb=short\n\n# With coverage\npython -m pytest tests/unit/ --cov=forge --cov-report=term-missing\n\n# Single file\npython -m pytest tests/unit/test_runner.py -v\n```\n\n### Integration Tests (Requires Backend)\n\n```bash\n# Skip integration tests\npython -m pytest tests/ -m \"not integration\"\n```\n\n资料来源：[CONTRIBUTING.md:15-30](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Next Steps\n\n| Topic | Description |\n|-------|-------------|\n| [User Guide](docs/USER_GUIDE.md) | Multi-step workflows, long-running sessions |\n| [Model Guide](docs/MODEL_GUIDE.md) | Model-specific configurations |\n| [Architecture Decisions](docs/decisions/) | Design rationale and ADRs |\n| [Eval Suite](tests/eval/) | Performance evaluation methodology |\n\n---\n\n<a id='page-architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[WorkflowRunner and Agentic Loop](#page-workflowrunner)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- 
[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n</details>\n\n# System Architecture\n\n## Overview\n\nForge is an LLM agent framework that orchestrates multi-step tool-calling workflows with built-in guardrails, context management, and automatic backend server management. The architecture follows a clean separation of concerns: clients abstract LLM backends, workflows define agent behavior, guardrails enforce execution policies, and the context manager handles token budgeting. 资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\nThe framework is designed for determinism in unit tests and supports three backend types: Ollama, llama-server, and llamafile. 资料来源：[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Project Layout\n\n```\nsrc/forge/           # Library source\n  clients/           # LLM backend adapters (one per backend)\n  core/              # Workflow, runner, messages, steps\n  context/           # Context management and compaction\n  prompts/           # Prompt templates and nudges\ntests/\n  unit/              # Deterministic tests\n  eval/              # Eval harness (requires live backends)\n    scenarios/       # Eval scenario definitions\n    dashboard/       # React-based HTML dashboard (separate npm build)\ndocs/                # User-facing documentation\n  decisions/         # Architecture Decision Records (ADRs)\n  results/           # Eval results and raw data tables\n```\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Core Architecture Components\n\n### Architecture Diagram\n\n```mermaid\ngraph TD\n    User[\"User / Application\"]\n    Runner[\"WorkflowRunner\"]\n    Workflow[\"Workflow\"]\n    Guardrails[\"Guardrails\"]\n    
ContextMgr[\"ContextManager\"]\n    Client[\"LLM Client\"]\n    ServerMgr[\"ServerManager\"]\n    \n    User --> Runner\n    Runner --> Workflow\n    Runner --> Guardrails\n    Runner --> ContextMgr\n    Runner --> Client\n    Client --> ServerMgr\n    \n    subgraph \"forge Library\"\n        Runner\n        Workflow\n        Guardrails\n        ContextMgr\n        Client\n    end\n    \n    subgraph \"Backend\"\n        ServerMgr\n    end\n```\n\n### LLM Clients\n\nThe client layer abstracts different LLM backends behind a common async interface. Each client handles backend-specific protocol differences.\n\n| Client | Backend | Protocol |\n|--------|---------|----------|\n| `OllamaClient` | Ollama | OpenAI-compatible REST |\n| `LlamafileClient` | Llamafile | OpenAI-compatible REST |\n| `AnthropicClient` | Anthropic API | Anthropic native |\n| `OpenAIClient` | OpenAI API | OpenAI native |\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n#### Sampling Defaults\n\nEach client can optionally apply recommended sampling parameters sourced from HuggingFace model cards. The policy layer provides four-quadrant behavior:\n\n| `strict` | Model in map | Behavior |\n|----------|--------------|----------|\n| `True` | Yes | Return dict |\n| `True` | No | Raise `UnsupportedModelError` |\n| `False` | Yes | One-shot INFO log; return `{}` |\n| `False` | No | Return `{}` (silent) |\n\n资料来源：[src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n\nClients no longer ship hardcoded temperature defaults. With `recommended_sampling=False` (default), forge sends nothing and the backend's default applies. 
资料来源：[CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md)\n\n### Workflow System\n\nThe `Workflow` class defines an agent's behavior declaratively:\n\n```python\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(...),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. Use the available tools.\",\n)\n```\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n#### Tool Definition\n\n`ToolDef` binds a tool schema to its implementation:\n\n```python\n@dataclass\nclass ToolDef:\n    \"\"\"Binds a tool schema to its implementation.\"\"\"\n    spec: ToolSpec\n    callable: Callable[..., Any]\n    prerequisites: list[str | dict[str, str]] = field(default_factory=list)\n```\n\nPrerequisites express conditional dependencies:\n- `str`: Name-only (`\"read_file\"` — any prior call satisfies it)\n- `dict`: Arg-matched (`{\"tool\": \"read_file\", \"match_arg\": \"path\"}`)\n\n资料来源：[src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n\n### Guardrails System\n\nGuardrails enforce execution policies through three coordinated components:\n\n```mermaid\ngraph LR\n    Response[\"LLM Response\"] --> Validator[\"ResponseValidator\"]\n    Validator --> Enforcer[\"StepEnforcer\"]\n    Enforcer --> Tracker[\"ErrorTracker\"]\n    \n    Validator --> Rescue[\"rescue parsing\"]\n    Enforcer --> Steps[\"required steps\"]\n    Tracker --> Limits[\"retry limits\"]\n```\n\n#### Guardrails Configuration\n\n| Parameter | Purpose | Default |\n|-----------|---------|---------|\n| `tool_names` | List of available tools | Required |\n| `terminal_tool` | Final allowed tool | Required |\n| `required_steps` | Ordered prerequisite chain | `None` |\n| `max_retries` | Total retry 
attempts | 3 |\n| `max_tool_errors` | Consecutive tool failures | 2 |\n| `rescue_enabled` | Enable XML rescue parsing | `True` |\n| `max_premature_attempts` | Premature terminal attempts before fatal | 3 |\n\n资料来源：[src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n\n#### Check Result Actions\n\nThe `check()` method returns a `CheckResult` with these actions:\n\n| Action | Meaning |\n|--------|---------|\n| `execute` | Response passes all guardrails |\n| `retry` | Invalid response, apply nudge and retry |\n| `step_blocked` | Missing required step |\n| `fatal` | Max retries exceeded |\n\n### Context Manager\n\nThe `ContextManager` handles token budgeting and context compaction to prevent context overflow during long conversations. 资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n#### Budget Resolution\n\nBudget is resolved based on the mode:\n\n| Mode | Resolution Strategy |\n|------|---------------------|\n| `MANUAL` | Use `manual_tokens` parameter or query server |\n| `FORGE_FAST` | Server-reported context / 4 |\n| `FORGE_BALANCED` | Server-reported context / 2 |\n| `FORGE_DEEP` | Server-reported context * 3 / 4 |\n\nFor Ollama backends, the context length is obtained from `ollama show`. For llama-server/llamafile, a `/props` query retrieves the actual `n_ctx`. 
资料来源：[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n### Server Manager\n\n`ServerManager` handles lifecycle management of backend servers (llama-server and llamafile only; Ollama is managed externally).\n\n```python\nserver = ServerManager(backend=\"llamaserver\", port=8080)\ncontext, ctx_mgr = await server.start_with_budget(\n    model=\"qwen3:8b-q4_K_M\",\n    budget_mode=BudgetMode.FORGE_FAST,\n    client=client,\n)\n```\n\n#### Server Configuration Parameters\n\n| Parameter | Description | Backend |\n|-----------|-------------|---------|\n| `model` | Model identity for server | All |\n| `gguf_path` | Path to GGUF file | llamaserver/llamafile |\n| `mode` | Operation mode | All |\n| `extra_flags` | Additional CLI flags | llamaserver/llamafile |\n| `ctx_override` | Override context length (`-c value`) | llamaserver/llamafile |\n| `cache_type_k` | KV cache quantization type for keys | llamaserver |\n| `cache_type_v` | KV cache quantization type for values | llamaserver |\n| `n_slots` | Concurrent slots count | llamaserver |\n| `kv_unified` | Single unified KV cache | llamaserver |\n\nThe server reuses an existing process if the same configuration is requested, avoiding unnecessary restarts. 
资料来源：[src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n\n## Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Runner\n    participant Workflow\n    participant Guardrails\n    participant Context\n    participant Client\n    \n    User->>Runner: run(workflow, input)\n    Runner->>Context: begin_session()\n    Runner->>Workflow: Get system prompt\n    Runner->>Client: send(messages)\n    \n    loop Until terminal or max iterations\n        Client-->>Runner: LLMResponse\n        Runner->>Guardrails: check(response)\n        Guardrails-->>Runner: CheckResult\n        \n        alt proceed\n            Runner->>Runner: Execute tools\n            Runner->>Context: append(messages)\n            Runner->>Client: send(messages)\n        else retry\n            Runner->>Runner: Apply nudge, retry\n        else fatal\n            Runner->>User: Return error\n        end\n    end\n    \n    Runner-->>User: Final result\n```\n\n## Workflow Runner Integration\n\nThe `WorkflowRunner` orchestrates all components:\n\n```python\nasync def main():\n    client = OllamaClient(\n        model=\"ministral-3:8b-instruct-2512-q4_K_M\",\n        recommended_sampling=True\n    )\n    ctx = ContextManager(\n        strategy=TieredCompact(keep_recent=2),\n        budget_tokens=8192\n    )\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n```\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Design Principles\n\n### Async-First\n\nAll client methods and the runner are async, enabling efficient I/O handling across multiple concurrent requests. 资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n### Type Safety\n\nPydantic is used for tool parameter schemas and response validation, ensuring runtime type safety for tool arguments. 
资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n### Modern Python\n\nThe codebase targets Python 3.12+ and uses modern syntax including:\n- Type unions with `|` (e.g., `str | None`)\n- `dataclass` decorators\n- `field(default_factory=list)` patterns\n\n资料来源：[CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n\n## Guardrails Configuration Examples\n\n```python\n# Basic guardrails\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\n# With respond tool for middleware\nrespond_guardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\", RESPOND_TOOL_NAME],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n```\n\n资料来源：[examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n\n## Related Documentation\n\n- [User Guide](docs/USER_GUIDE.md) — Multi-step workflows and backend auto-management\n- [Model Guide](docs/MODEL_GUIDE.md) — Model recommendations by tier\n- [Architecture Decisions](docs/decisions/) — Design rationale for significant changes\n\n---\n\n<a id='page-module-structure'></a>\n\n## Module Structure and API\n\n### 相关页面\n\n相关主题：[System Architecture](#page-architecture), [WorkflowRunner and Agentic Loop](#page-workflowrunner)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/__init__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/__init__.py)\n- [src/forge/core/messages.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/messages.py)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/errors.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/errors.py)\n- 
[src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [src/forge/clients/llamafile.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/llamafile.py)\n</details>\n\n# Module Structure and API\n\n## Overview\n\nThe forge repository implements a modular LLM orchestration framework designed to handle multi-step tool-calling workflows with built-in guardrails, context management, and support for multiple LLM backends. The architecture follows a clean separation of concerns with distinct modules for client handling, workflow orchestration, guardrails enforcement, and server management.\n\n## Core Architecture\n\nThe forge library is organized into a layered architecture that separates backend communication, workflow definition, execution enforcement, and context management.\n\n```mermaid\ngraph TD\n    A[User Code] --> B[WorkflowRunner]\n    B --> C[LLM Clients]\n    C --> D[llama.cpp / Ollama / Llamafile]\n    B --> E[Guardrails]\n    B --> F[ContextManager]\n    E --> G[ResponseValidator]\n    E --> H[StepEnforcer]\n    E --> I[ErrorTracker]\n```\n\n## Module Hierarchy\n\n| Module | Purpose | Key Classes |\n|--------|---------|-------------|\n| `forge.core` | Core workflow orchestration | `Workflow`, `WorkflowRunner`, `ToolSpec`, `ToolDef`, `ToolCall` |\n| `forge.clients` | LLM backend adapters | `OllamaClient`, `LlamafileClient`, `LlamaServerClient` |\n| `forge.guardrails` | Response validation and enforcement | `Guardrails`, `ResponseValidator`, `StepEnforcer`, `ErrorTracker` |\n| `forge.context` | Token budget and context management | `ContextManager`, `TieredCompact` |\n| `forge.server` | Backend server lifecycle management | `ServerManager`, 
`BudgetMode` |\n\n资料来源：[src/forge/__init__.py]()\n\n## Tool Definition API\n\n### ToolSpec\n\nThe `ToolSpec` class defines the interface for tools that the LLM can invoke. It wraps a Pydantic model representing the tool's parameters.\n\n```python\nclass ToolSpec(BaseModel):\n    name: str\n    description: str\n    parameters: type[BaseModel]\n```\n\n**Construction from OpenAI Schema:**\n\nTools can be defined from an OpenAI-style JSON Schema:\n\n```python\ntool_spec = ToolSpec.from_openai_schema(\n    name=\"get_weather\",\n    description=\"Get current weather for a city\",\n    schema={\n        \"type\": \"object\",\n        \"properties\": {\n            \"city\": {\"type\": \"string\", \"description\": \"City name\"}\n        },\n        \"required\": [\"city\"]\n    }\n)\n```\n\n资料来源：[src/forge/core/workflow.py:1-50]()\n\n### ToolDef\n\nThe `ToolDef` dataclass binds a tool schema to its implementation callable, along with prerequisites:\n\n```python\n@dataclass\nclass ToolDef:\n    spec: ToolSpec\n    callable: Callable[..., Any]\n    prerequisites: list[str | dict[str, str]] = field(default_factory=list)\n```\n\n**Prerequisites Syntax:**\n\nPrerequisites express conditional dependencies between tool calls:\n\n| Type | Example | Behavior |\n|------|---------|----------|\n| String (name-only) | `\"read_file\"` | Any prior call to `read_file` satisfies it |\n| Dict (arg-matched) | `{\"tool\": \"read_file\", \"match_arg\": \"path\"}` | Prior call with same `path` value required |\n\n```python\ntool_def = ToolDef(\n    spec=tool_spec,\n    callable=get_weather_function,\n    prerequisites=[{\"tool\": \"search\", \"match_arg\": \"query\"}]\n)\n```\n\n资料来源：[src/forge/core/workflow.py:52-72]()\n\n## Workflow Definition API\n\n### Workflow Class\n\nThe `Workflow` class is the central configuration object for a multi-step LLM task:\n\n```python\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city\",\n    tools={\n        
\"get_weather\": ToolDef(spec=..., callable=get_weather)\n    },\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n    system_prompt_template=\"You are a helpful assistant.\"\n)\n```\n\n**Key Parameters:**\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `name` | `str` | Yes | Workflow identifier |\n| `description` | `str` | Yes | Human-readable description for the LLM |\n| `tools` | `dict[str, ToolDef]` | Yes | Map of tool name to ToolDef |\n| `required_steps` | `list[str]` | No | Tools that must be called before terminal_tool |\n| `terminal_tool` | `str` | Yes | Tool(s) that can end the workflow |\n| `system_prompt_template` | `str` | No | System prompt injected into context |\n\n资料来源：[src/forge/core/workflow.py]()\n\n## LLM Client API\n\n### Client Architecture\n\nForge provides backend-agnostic client adapters that implement a common interface:\n\n```mermaid\ngraph LR\n    A[WorkflowRunner] --> B[Client Interface]\n    B --> C[OllamaClient]\n    B --> D[LlamafileClient]\n    B --> E[LlamaServerClient]\n```\n\n### OllamaClient\n\n```python\nclient = OllamaClient(\n    model=\"ministral-3:8b-instruct-2512-q4_K_M\",\n    recommended_sampling=True\n)\n```\n\n### Sampling Defaults\n\nPer-model recommended sampling parameters are managed through `sampling_defaults.py`:\n\n```python\ndef apply_sampling_defaults(\n    model: str,\n    *,\n    strict: bool,\n) -> dict[str, float | int]:\n    \"\"\"Apply the recommended-sampling policy for model.\"\"\"\n```\n\n**Sampling Policy Quadrant:**\n\n| `strict` | Model in Map | Behavior |\n|----------|-------------|----------|\n| `True` | Yes | Return dict copy |\n| `True` | No | Raise `UnsupportedModelError` |\n| `False` | Yes | One-shot INFO log; return `{}` |\n| `False` | No | Return `{}` (silent) |\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-80]()\n\n### ToolCall Response Model\n\nThe `ToolCall` class represents a validated tool invocation 
returned by an LLM client:\n\n```python\nclass ToolCall(BaseModel):\n    tool: str\n```\n\nAdditional fields may be populated by client implementations (e.g., `args`, `reasoning`).\n\n资料来源：[src/forge/core/workflow.py:74-77]()\n\n## Guardrails API\n\nThe guardrails system provides middleware for orchestrating LLM responses with built-in validation, step enforcement, and error handling.\n\n### Guardrails Class\n\nThe main entry point for the guardrails system:\n\n```python\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n    max_retries=3,\n    max_tool_errors=2,\n    rescue_enabled=True,\n    max_premature_attempts=3\n)\n```\n\n**Constructor Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `tool_names` | `list[str]` | Required | Valid tool names for this workflow |\n| `required_steps` | `list[str]` | `None` | Tools that must be called before terminal_tool |\n| `terminal_tool` | `str \\| frozenset` | Required | Tool(s) that can end the workflow |\n| `max_retries` | `int` | `3` | Consecutive bad responses before fatal |\n| `max_tool_errors` | `int` | `2` | Consecutive tool failures before exhaustion |\n| `rescue_enabled` | `bool` | `True` | Attempt to parse tool calls from plain text |\n| `max_premature_attempts` | `int` | `3` | Premature terminal attempts before fatal |\n| `retry_nudge` | `Callable[[str], str]` | `None` | Custom nudge for bare text responses |\n\n资料来源：[src/forge/guardrails/guardrails.py:1-80]()\n\n### CheckResult\n\nThe return type of `Guardrails.check()`:\n\n```python\nclass CheckResult:\n    action: Literal[\"execute\", \"retry\", \"step_blocked\", \"fatal\"]\n    tool_calls: list[ToolCall] | None\n    nudge: Nudge | None\n    reason: str | None\n```\n\n**Action Meanings:**\n\n| Action | Description |\n|--------|-------------|\n| `execute` | Safe to proceed; `tool_calls` contains 
valid calls |\n| `retry` | Invalid response; inject `nudge` and retry |\n| `step_blocked` | Attempted terminal tool before required steps |\n| `fatal` | Max retries exhausted; `reason` contains explanation |\n\n### Two-Method Guardrails API\n\n```python\n# After each LLM response\nresult = guardrails.check(response)\n\nif result.action == \"fatal\":\n    return f\"FATAL: {result.reason}\"\n\nif result.action in (\"retry\", \"step_blocked\"):\n    return f\"{result.action}: {result.nudge.content}\"\n\n# result.action == \"execute\"\n# Run tools yourself, then record results\ntool_calls = result.tool_calls\nexecuted = [tc.tool for tc in tool_calls]\ndone = guardrails.record(executed)\n```\n\n资料来源：[src/forge/guardrails/guardrails.py:82-130]()\n\n### Granular API\n\nFor advanced use cases, individual guardrail components can be used directly:\n\n```python\nfrom forge.guardrails import ResponseValidator, StepEnforcer, ErrorTracker\n\nvalidator = ResponseValidator(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    rescue_enabled=True,\n)\nenforcer = StepEnforcer(\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\nerrors = ErrorTracker(max_retries=3, max_tool_errors=2)\n```\n\n## Server Management API\n\n### ServerManager\n\nThe `ServerManager` class handles lifecycle management for llama.cpp-based backends:\n\n```python\nserver = ServerManager(backend=\"llamaserver\", port=8080)\n```\n\n**Backend Options:**\n\n| Backend | Description |\n|---------|-------------|\n| `\"ollama\"` | Ollama server (model name, no GGUF path) |\n| `\"llamaserver\"` | llama.cpp server via `llama-server` |\n| `\"llamafile\"` | Mozilla llamafile binary |\n\n### Budget Resolution\n\nThe `resolve_budget()` method determines context length based on mode:\n\n```python\nasync def resolve_budget(\n    self,\n    mode: BudgetMode,\n    manual_tokens: int | None = None,\n) -> int:\n```\n\n| Mode | Behavior |\n|------|----------|\n| `MANUAL` | Use `manual_tokens` 
directly |\n| `FORGE_FAST` / `FORGE_DEEP` | Query server `/props` for context |\n\n资料来源：[src/forge/server.py:1-100]()\n\n## Context Management\n\n### ContextManager\n\nToken budget management for long-running conversations:\n\n```python\nctx = ContextManager(\n    strategy=TieredCompact(keep_recent=2),\n    budget_tokens=8192\n)\n```\n\n### BudgetMode Enum\n\n```python\nclass BudgetMode(Enum):\n    MANUAL = \"manual\"\n    FORGE_FAST = \"forge_fast\"\n    FORGE_DEEP = \"forge_deep\"\n```\n\n资料来源：[src/forge/server.py]()\n\n## Error Types\n\nForge defines custom exceptions for specific error conditions:\n\n```python\nclass UnsupportedModelError(Exception):\n    \"\"\"Raised when strict sampling defaults are requested for unknown models.\"\"\"\n    pass\n```\n\nAdditional error types in `errors.py`:\n\n| Error | Use Case |\n|-------|----------|\n| `BudgetResolutionError` | Server unreachable or missing n_ctx |\n| `BackendError` | Backend communication failures |\n\n资料来源：[src/forge/errors.py]()\n\n## Quick Start Example\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import (\n    Workflow, ToolDef, ToolSpec,\n    WorkflowRunner, OllamaClient,\n    ContextManager, TieredCompact,\n)\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. 
Use the available tools to answer the user.\",\n)\n\nasync def main():\n    client = OllamaClient(model=\"ministral-3:8b-instruct-2512-q4_K_M\", recommended_sampling=True)\n    ctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Summary\n\nThe forge module structure provides:\n\n1. **Tool Definition** — `ToolSpec` and `ToolDef` for declaring LLM-callable functions with prerequisites\n2. **Workflow Orchestration** — `Workflow` and `WorkflowRunner` for managing multi-step tasks\n3. **Client Abstraction** — Backend-agnostic clients with sampling defaults\n4. **Guardrails Middleware** — Built-in validation, step enforcement, and error handling\n5. **Server Management** — Lifecycle control for llama.cpp backends\n6. **Context Management** — Token budget and compaction strategies\n\n---\n\n<a id='page-adr-index'></a>\n\n## Architecture Decision Records\n\n### 相关页面\n\n相关主题：[System Architecture](#page-architecture)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- 
[src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n</details>\n\n# Architecture Decision Records\n\n## Overview\n\nArchitecture Decision Records (ADRs) serve as the authoritative documentation for significant design choices within the forge project. They capture the *why* behind implementation decisions, enabling current and future contributors to understand the reasoning without reconstructing the original context.\n\nThe forge project stores ADRs in `docs/decisions/`, using a numbered naming convention (e.g., `001-ablation-framework.md`, `011-guardrail-middleware.md`, `013-text-response-intent.md`). This numbering scheme allows for easy chronological tracking and establishes precedent relationships between decisions.\n\n## Purpose and Scope\n\nADRs in forge address several critical aspects:\n\n| Category | Description | Example Documents |\n|----------|-------------|-------------------|\n| Framework Design | Ablation study methodology and tooling | `001-ablation-framework.md` |\n| Middleware Patterns | Guardrail implementation and composition | `011-guardrail-middleware.md` |\n| Response Handling | Intent classification for text responses | `013-text-response-intent.md` |\n| Backend Integration | LLM client configuration and sampling defaults | `sampling_defaults.py` |\n| Server Management | Context resolution and budget modes | `server.py` |\n\nEach ADR documents not only the chosen approach but also considered alternatives and the tradeoffs that influenced the final decision. 
This creates a historical record that prevents repeated debates over settled questions while enabling informed reconsideration when circumstances change.\n\n## ADR Contribution Workflow\n\nAccording to the contribution guidelines, the process for introducing a new architecture decision follows a structured review pattern:\n\n```mermaid\ngraph TD\n    A[Identify Design Decision] --> B[Review Existing ADRs]\n    B --> C{Decision Already Documented?}\n    C -->|Yes| D[Reference Existing ADR]\n    C -->|No| E[Draft New ADR]\n    E --> F[Propose ADR Format]\n    F --> G[Review Against Project Standards]\n    G --> H[Merge and Publish]\n    \n    style A fill:#e1f5fe\n    style H fill:#c8e6c9\n```\n\nThe contribution workflow integrates with the broader project development cycle:\n\n1. **Proposal Phase**: Before implementing significant changes, contributors should draft an ADR following the established format\n2. **Review Phase**: The ADR undergoes peer review alongside code review\n3. **Adoption Phase**: Once approved, the ADR becomes the reference for implementation decisions\n4. **Maintenance Phase**: ADRs may be updated if subsequent decisions supersede them\n\n资料来源：[CONTRIBUTING.md:1-50]()\n\n## Core Architecture Components\n\n### Workflow Engine\n\nThe `Workflow` and `WorkflowRunner` classes form the central orchestration layer. 
A workflow defines the available tools, required execution steps, and terminal conditions.\n\n```mermaid\ngraph TD\n    subgraph Workflow Definition\n        W[Workflow] --> TD[Tool Definitions]\n        W --> RS[Required Steps]\n        W --> TT[Terminal Tool]\n        W --> SP[System Prompt Template]\n    end\n    \n    subgraph Execution Layer\n        WR[WorkflowRunner] --> CM[Context Manager]\n        WR --> GR[Guardrails]\n        WR --> CL[LLM Client]\n    end\n    \n    subgraph Tool Layer\n        TC[ToolCall] --> T[Tool Execution]\n        T --> TR[Tool Response]\n    end\n    \n    WR --> TC\n    TC -->|Result| CM\n    CM -->|Context| WR\n    \n    style W fill:#fff3e0\n    style WR fill:#e3f2fd\n```\n\nThe `ToolDef` dataclass binds tool schemas to implementations, while `ToolSpec` defines the JSON Schema for parameter validation. Tool calls are represented as `ToolCall` objects containing the tool name and arguments.\n\n资料来源：[src/forge/core/workflow.py:1-100]()\n\n### Guardrail Middleware\n\nThe guardrail system provides a composable validation layer that intercepts LLM responses before tool execution:\n\n```mermaid\ngraph LR\n    LLM[LLM Response] --> GR[Guardrails.check]\n    GR --> RV[ResponseValidator]\n    GR --> SE[StepEnforcer]\n    GR --> ET[ErrorTracker]\n    \n    RV -->|Valid| TC[ToolCalls]\n    RV -->|Invalid| NR[Retry Nudge]\n    SE -->|Correct Order| TC\n    SE -->|Wrong Order| SB[Step Blocked]\n    ET -->|OK| TC\n    ET -->|Max Errors| FT[Fatal]\n    \n    style GR fill:#fce4ec\n    style TC fill:#c8e6c9\n```\n\nThe `Guardrails` class orchestrates three sub-components:\n\n| Component | Responsibility | Key Parameters |\n|-----------|---------------|----------------|\n| `ResponseValidator` | Parses tool calls, enables rescue parsing | `rescue_enabled`, `retry_nudge_fn` |\n| `StepEnforcer` | Ensures required steps precede terminal tool | `required_steps`, `max_premature_attempts` |\n| `ErrorTracker` | Tracks consecutive errors and retries | 
`max_retries`, `max_tool_errors` |\n\n资料来源：[src/forge/guardrails/guardrails.py:1-100]()\n\n### Server Management and Budget Resolution\n\nThe `ServerManager` handles lifecycle management for llama.cpp-based backends, while the `ContextManager` implements token budget strategies:\n\n```mermaid\ngraph TD\n    SM[ServerManager] --> BM[BudgetMode]\n    BM -->|FORGE_FAST| FT[Fast Budget]\n    BM -->|FORGE_BALANCED| BT[Balanced Budget]\n    BM -->|FORGE_DEEP| DT[Deep Budget]\n    BM -->|MANUAL| MT[Manual Tokens]\n    \n    CM[ContextManager] --> TC[TieredCompact]\n    CM --> SC[SimpleCompact]\n    \n    SM -->|Context Query| Props[\"/props endpoint\"]\n    Props -->|n_ctx| CM\n```\n\nBudget resolution follows platform-specific paths:\n\n- **Ollama**: Uses `manual_tokens` parameter for `MANUAL` mode\n- **Llamafile/Llama Server**: Queries `/props` endpoint for server-configured context length\n\n资料来源：[src/forge/server.py:1-100]()\n\n## Sampling Configuration System\n\nThe sampling defaults system separates lookup from policy, enabling fine-grained control over model parameters:\n\n```mermaid\ngraph TD\n    subgraph Lookup Layer\n        GM[get_sampling_defaults] --> MAP[MODEL_SAMPLING_DEFAULTS]\n    end\n    \n    subgraph Policy Layer\n        AS[apply_sampling_defaults] --> |strict=True| KR[Strict Policy]\n        AS --> |strict=False| KU[Non-strict Policy]\n        KR -->|In Map| ReturnDict[Return Dict]\n        KU -->|In Map| InfoLog[INFO Log Once]\n    end\n    \n    subgraph Client Integration\n        OC[OllamaClient] --> AS\n        LC[LlamafileClient] --> AS\n        AC[AnthropicClient] --> AS\n    end\n```\n\nThe two-function design (`get_sampling_defaults` for pure lookup, `apply_sampling_defaults` for policy) ensures that:\n\n- Unknown models don't cause errors when `strict=False`\n- Known models log a one-time INFO message when not opted in\n- Explicit opt-in via `recommended_sampling=True` enables strict 
behavior\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-100]()\n\n## Proxy Server Architecture\n\nThe `ProxyServer` provides a forwarding layer with additional control features:\n\n```mermaid\ngraph TD\n    subgraph Proxy Layer\n        PS[ProxyServer] --> SF[Serialize Flag]\n        PS --> RT[Retry Logic]\n        PS --> RC[Rescue Parser]\n    end\n    \n    subgraph Backend Routing\n        PS --> Ollama[Ollama Backend]\n        PS --> Llama[Llama Backend]\n        PS --> LLF[Llamafile Backend]\n    end\n    \n    subgraph Configuration\n        SF --> |serialize=True| Serial[Serialize Requests]\n        SF --> |serialize=False| Parallel[Parallel Requests]\n        RT --> |max_retries=N| RetryN[N Attempts]\n    end\n```\n\nKey proxy options include:\n\n| Flag | Default | Purpose |\n|------|---------|---------|\n| `--host` | `127.0.0.1` | Proxy listen address |\n| `--port` | `8081` | Proxy listen port |\n| `--serialize` | `None` | Request serialization control |\n| `--max-retries` | `3` | Retries per request |\n| `--no-rescue` | `False` | Disable rescue parsing |\n\n资料来源：[src/forge/proxy/__main__.py:1-80]()\n\n## ADR Format and Standards\n\nEach ADR in the forge repository follows a consistent structure:\n\n1. **Title**: Descriptive name with ADR number\n2. **Status**: Proposed, Accepted, Deprecated, or Superseded\n3. **Context**: Background and problem statement\n4. **Decision**: The chosen approach with rationale\n5. **Consequences**: Benefits, drawbacks, and tradeoffs\n6. **Related Decisions**: Links to dependent or related ADRs\n\nThis format ensures that future maintainers can quickly assess whether an ADR is current and understand the full context of each decision.\n\n## Versioning and Evolution\n\nThe CHANGELOG maintains a parallel record of implementation milestones, cross-referenced with ADRs. 
Major architectural changes increment the minor version number, while bug fixes increment the patch version (semantic versioning).\n\nChanges that require ADR updates include:\n\n- New LLM backend support\n- Guardrail algorithm modifications\n- Context management strategy changes\n- Tool execution model alterations\n- Breaking API changes\n\n资料来源：[CHANGELOG.md:1-100]()\n\n## Best Practices for ADR Readers\n\nWhen reviewing ADRs to understand forge's architecture:\n\n1. **Start with the index**: The `docs/decisions/` directory lists all ADRs chronologically\n2. **Check status**: Deprecated ADRs indicate historical context, not current practice\n3. **Cross-reference implementations**: Source files in `src/forge/` implement ADR decisions\n4. **Review CHANGELOG**: Implementation dates and version numbers provide temporal context\n5. **Examine tests**: Unit tests in `tests/unit/` validate ADR-enforced behaviors\n\n---\n\n<a id='page-workflowrunner'></a>\n\n## WorkflowRunner and Agentic Loop\n\n### 相关页面\n\n相关主题：[System Architecture](#page-architecture)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/core/runner.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/runner.py)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/core/steps.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/steps.py)\n- [src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/guardrails/response_validator.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/response_validator.py)\n- [src/forge/guardrails/step_enforcer.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/step_enforcer.py)\n- [src/forge/guardrails/error_tracker.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/error_tracker.py)\n- 
[src/forge/prompts/nudges.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/prompts/nudges.py)\n</details>\n\n# WorkflowRunner and Agentic Loop\n\n## Overview\n\nThe `WorkflowRunner` is the central execution engine in the Forge framework, implementing an **agentic loop** that orchestrates multi-step tool-calling workflows with an LLM backend. It manages the complete lifecycle of a workflow: from initializing messages, through iterative LLM inference and tool execution, to context management and termination.\n\n**Core responsibilities:**\n\n1. Building initial message lists (system prompt + user input)\n2. Coordinating LLM inference with streaming or batch responses\n3. Validating and executing tool calls returned by the LLM\n4. Managing context budget through the `ContextManager`\n5. Enforcing required step sequences via the `StepEnforcer`\n6. Handling retries for malformed responses\n7. Terminating on terminal tool execution or max iterations\n\n资料来源：[src/forge/core/runner.py:1-50]()\n\n---\n\n## Architecture\n\n### Component Relationships\n\n```mermaid\ngraph TD\n    subgraph \"Forge Agentic System\"\n        WR[WorkflowRunner] --> CM[ContextManager]\n        WR --> IV[Inference Validator]\n        WR --> SE[StepEnforcer]\n        WR --> ET[ErrorTracker]\n        CM --> CC[Compaction Chain]\n        IV --> RV[ResponseValidator]\n    end\n    \n    WR --> Client[LLMClient]\n    Client --> Ollama[OllamaClient]\n    Client --> LlamaServer[LlamafileClient]\n    Client --> Proxy[ProxyClient]\n    \n    WF[Workflow] --> WR\n    WF --> TD[ToolDefs]\n    TD --> Impl[Callable Implementations]\n```\n\n资料来源：[src/forge/core/runner.py:1-50]()\n\n### Agentic Loop Flow\n\nThe `WorkflowRunner.run()` method implements a 7-phase agentic loop:\n\n```mermaid\ngraph TD\n    A[Start: User Query] --> B[\"1. Build Messages<br/>system_prompt + user_input\"]\n    B --> C[\"2. 
Send to LLM<br/>Streaming or Batch\"]\n    C --> D{Response Type?}\n    D -->|TextResponse<br/>Malformed/Refusal| E[\"3. Retry with Nudge\"]\n    E --> C\n    D -->|ToolCall| F[\"4. Validate Tool Calls<br/>Schema + Prerequisites\"]\n    F --> G{\"5. Execute Tools<br/>Batch-aware\"}\n    G --> H{Success?}\n    H -->|Error| I[\"Track Error<br/>Retry if < max_tool_errors\"]\n    I --> C\n    H -->|Success| J[\"6. Compact Context<br/>ContextManager\"]\n    J --> K{\"7. Check Terminal<br/>terminal_tool called?\"}\n    K -->|No| L[\"Increment iteration<br/>Check < max_iterations\"]\n    L -->|Yes| C\n    K -->|Yes| M[Done]\n    L -->|No| N[MaxIterationsError]\n    \n    style E fill:#ffcccc\n    style N fill:#ffcccc\n    style M fill:#ccffcc\n```\n\n资料来源：[src/forge/core/runner.py:50-200]()\n\n---\n\n## Core Data Models\n\n### Workflow\n\nThe `Workflow` dataclass defines the complete task specification:\n\n```python\n@dataclass\nclass Workflow:\n    name: str\n    description: str\n    tools: dict[str, ToolDef]\n    required_steps: list[str] = field(default_factory=list)\n    terminal_tool: str | frozenset[str] = \"\"\n    system_prompt_template: str = \"\"\n    max_retries: int = 3\n    cancel_event: asyncio.Event | None = None\n```\n\n资料来源：[src/forge/core/workflow.py:1-100]()\n\n| Field | Type | Description |\n|-------|------|-------------|\n| `name` | `str` | Workflow identifier |\n| `description` | `str` | Human-readable task description |\n| `tools` | `dict[str, ToolDef]` | Tool name → ToolDef mapping |\n| `required_steps` | `list[str]` | Tool names that must execute before terminal tool |\n| `terminal_tool` | `str \\| frozenset[str]` | Tool(s) that end the workflow |\n| `system_prompt_template` | `str` | System prompt for the LLM |\n| `max_retries` | `int` | Consecutive bad responses before fatal (default: 3) |\n| `cancel_event` | `asyncio.Event \\| None` | Optional cancellation signal |\n\n### ToolDef\n\n`ToolDef` binds a tool's schema to its 
implementation:\n\n```python\n@dataclass\nclass ToolDef:\n    spec: ToolSpec\n    callable: Callable[..., Any]\n    prerequisites: list[str | dict[str, str]] = field(default_factory=list)\n```\n\n资料来源：[src/forge/core/workflow.py:100-130]()\n\n**Prerequisites** express conditional dependencies:\n\n| Type | Example | Behavior |\n|------|---------|----------|\n| `str` | `\"read_file\"` | Any prior call to `read_file` satisfies it |\n| `dict` | `{\"tool\": \"read_file\", \"match_arg\": \"path\"}` | Prior call with matching `path` value required |\n\n### ToolSpec\n\nDefines the tool's interface for LLM communication using Pydantic:\n\n```python\n@dataclass\nclass ToolSpec:\n    name: str\n    description: str\n    parameters: type[BaseModel]\n```\n\n资料来源：[src/forge/core/workflow.py:40-60]()\n\n### ToolCall\n\nValidated tool invocation returned by the LLM:\n\n```python\nclass ToolCall(BaseModel):\n    tool: str\n    args: dict[str, Any] = Field(default_factory=dict)\n```\n\n资料来源：[src/forge/core/workflow.py:160-170]()\n\n---\n\n## StepTracker\n\nThe `StepTracker` maintains workflow state outside the message history, tracking which required steps have been completed and what tools have been executed with what arguments.\n\n```python\n@dataclass\nclass StepTracker:\n    required_steps: list[str]\n    completed_steps: dict[str, None] = field(default_factory=dict)\n    executed_tools: dict[str, list[dict[str, Any]]] = field(default_factory=dict)\n    \n    def record(self, tool_name: str, args: dict[str, Any] | None = None) -> None\n    def is_satisfied(self) -> bool\n    def pending(self) -> list[str]\n```\n\n资料来源：[src/forge/core/steps.py:20-50]()\n\n**Key behaviors:**\n\n- `completed_steps` uses `dict` (insertion order preserved) to track step order\n- `executed_tools` stores argument history for arg-matched prerequisites\n- Compaction cannot invalidate step completion (state lives outside message history)\n\n### PrerequisiteCheck\n\n```python\n@dataclass\nclass 
PrerequisiteCheck:\n    satisfied: bool\n    missing: list[str]\n```\n\n资料来源：[src/forge/core/steps.py:8-15]()\n\n---\n\n## Guardrails System\n\nThe guardrails system provides composable middleware for reliability without requiring `WorkflowRunner`:\n\n```mermaid\ngraph LR\n    subgraph \"Guardrails Bundle\"\n        G[Guardrails]\n        G --> RV[ResponseValidator]\n        G --> SE[StepEnforcer]\n        G --> ET[ErrorTracker]\n    end\n    \n    Input[LLM Response] --> G\n    G --> Output[CheckResult]\n```\n\n资料来源：[src/forge/guardrails/__init__.py:1-30]()\n\n### Guardrails (Bundled API)\n\n```python\nclass Guardrails:\n    def __init__(\n        self,\n        tool_names: list[str],\n        terminal_tool: str | frozenset[str],\n        required_steps: list[str] | None = None,\n        max_retries: int = 3,\n        max_tool_errors: int = 2,\n        rescue_enabled: bool = True,\n        max_premature_attempts: int = 3,\n        retry_nudge: Callable[[str], str] | None = None,\n    ) -> None\n    \n    def check(self, response: LLMResponse) -> CheckResult\n    def record(self, executed_tools: list[str]) -> bool\n```\n\n资料来源：[src/forge/guardrails/guardrails.py:60-100]()\n\n### CheckResult\n\n```python\n@dataclass\nclass CheckResult:\n    action: Literal[\"execute\", \"retry\", \"step_blocked\", \"fatal\"]\n    tool_calls: list[ToolCall] | None = None\n    nudge: Nudge | None = None\n    reason: str | None = None\n```\n\n资料来源：[src/forge/guardrails/guardrails.py:20-35]()\n\n| Action | Meaning |\n|--------|---------|\n| `execute` | Response is valid, proceed with tool calls |\n| `retry` | Bare text or malformed, retry with nudge |\n| `step_blocked` | Tried to call terminal tool prematurely |\n| `fatal` | Exhausted retries or too many errors |\n\n### ResponseValidator\n\nValidates LLM responses and extracts tool calls:\n\n- **Normal extraction**: Parses structured `tool_calls` from response\n- **Rescue parsing**: Handles plain text with regex-based tool extraction 
for models like Qwen Coder (XML format: `<function=name>...`)\n- **Retry nudging**: Injects guidance when model produces bare text\n\n资料来源：[src/forge/guardrails/response_validator.py:1-50]()\n\n### StepEnforcer\n\nEnforces required step sequences:\n\n```python\nclass StepEnforcer:\n    def check(\n        self,\n        tool_calls: list[ToolCall],\n        step_tracker: StepTracker,\n    ) -> StepCheck\n```\n\n资料来源：[src/forge/guardrails/step_enforcer.py:1-50]()\n\n**Nudge escalation tiers:**\n\n| Tier | Tone | Example |\n|------|------|---------|\n| 1 | Polite | \"You cannot call X yet. You must first complete: Y, Z.\" |\n| 2 | Direct | \"You must call one of these tools now: Y, Z. Pick one.\" |\n| 3 | Aggressive | \"STOP. You MUST call one of: Y, Z. Do NOT call X.\" |\n\n资料来源：[src/forge/prompts/nudges.py:1-40]()\n\n### ErrorTracker\n\nTracks consecutive errors for retry/exhaustion logic:\n\n```python\nclass ErrorTracker:\n    def record_retry(self) -> int\n    def record_tool_error(self) -> int\n    def reset(self) -> None\n    @property\n    def should_fatal(self) -> bool\n```\n\n资料来源：[src/forge/guardrails/error_tracker.py:1-40]()\n\n---\n\n## WorkflowRunner Configuration\n\n### Constructor Parameters\n\n```python\nclass WorkflowRunner:\n    def __init__(\n        self,\n        client: LLMClient,\n        context_manager: ContextManager,\n        max_iterations: int = 10,\n        max_retries_per_step: int = 3,\n        max_tool_errors: int = 2,\n        max_premature_attempts: int = 3,\n        retry_nudge: Callable[[str], str] | None = None,\n    ) -> None:\n```\n\n资料来源：[src/forge/core/runner.py:50-80]()\n\n| Parameter | Default | Description |\n|-----------|---------|-------------|\n| `client` | required | `LLMClient` instance (OllamaClient, LlamafileClient, ProxyClient) |\n| `context_manager` | required | `ContextManager` instance for context compaction |\n| `max_iterations` | `10` | Maximum LLM turns before raising `MaxIterationsError` |\n| 
`max_retries_per_step` | `3` | Consecutive bare-text responses before fatal |\n| `max_tool_errors` | `2` | Consecutive tool execution failures before exhaustion |\n| `max_premature_attempts` | `3` | Premature terminal tool attempts before fatal |\n| `retry_nudge` | `None` | Custom nudge function for bare text responses |\n\n### Run Method\n\n```python\nasync def run(\n    self,\n    workflow: Workflow,\n    user_input: str,\n    cancel_event: asyncio.Event | None = None,\n) -> list[Message]:\n```\n\n资料来源：[src/forge/core/runner.py:80-120]()\n\n**Returns:** List of all `Message` objects in the conversation history\n\n**Raises:**\n\n| Exception | Condition |\n|-----------|-----------|\n| `MaxIterationsError` | Exceeded `max_iterations` without reaching terminal tool |\n| `StepEnforcementError` | `max_premature_attempts` exceeded |\n| `PrerequisiteError` | Tool called without satisfying prerequisites |\n| `ToolCallError` | Tool name not found in workflow |\n| `ToolExecutionError` | Tool callable raised an exception |\n| `WorkflowCancelledError` | `cancel_event` was set |\n\n---\n\n## WorkflowRunner vs Guardrails\n\nForge provides two integration patterns:\n\n| Aspect | WorkflowRunner | Guardrails |\n|--------|---------------|------------|\n| **Abstraction** | Full agentic loop | Lightweight middleware |\n| **Context management** | Built-in via ContextManager | Handled externally |\n| **Best for** | Standard workflows | Custom orchestration loops |\n| **API surface** | `run()` | `check()` + `record()` |\n\n资料来源：[src/forge/guardrails/__init__.py:1-20]()\n\n### Guardrails Usage Pattern\n\n```python\nfrom forge.guardrails import Guardrails, CheckResult\n\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\n# After each LLM response\nresult = guardrails.check(llm_response)\n\nif result.action == \"execute\":\n    for tc in result.tool_calls:\n        
execute_tool(tc.tool, tc.args)\n    guardrails.record([tc.tool for tc in result.tool_calls])\nelif result.action == \"retry\":\n    # Re-prompt with nudge\n    pass\nelif result.action == \"fatal\":\n    # Stop\n    pass\n```\n\n---\n\n## Context Management\n\nThe `ContextManager` handles conversation context compaction:\n\n```mermaid\ngraph TD\n    A[New Tool Result] --> B{Token Budget?}\n    B -->|OK| C[Continue]\n    B -->|Exceeded| D[\"Compact Messages<br/>TieredCompact\"]\n    D --> C\n```\n\nThe runner automatically calls context compaction after each tool execution cycle. Compaction decisions use **real token counting** from backend-reported token usage.\n\n资料来源：[src/forge/core/runner.py:150-200]()\n\n### TieredCompact Strategy\n\nKeeps recent messages intact while compacting older history:\n\n- `keep_recent`: Number of recent message pairs to preserve\n- Older messages are summarized or removed\n\n---\n\n## Error Handling\n\n### Error Flow\n\n```mermaid\ngraph TD\n    A[Tool Execution] --> B{Success?}\n    B -->|Yes| C[Record Success]\n    B -->|No| D[Increment Error Count]\n    D --> E{max_tool_errors?}\n    E -->|Exceeded| F[Raise ToolExecutionError]\n    E -->|Not exceeded| G[Retry Tool Call]\n    G --> A\n    F --> H[Abort Workflow]\n```\n\n### Recovery Mechanisms\n\n| Error Type | Recovery Strategy |\n|------------|-------------------|\n| Malformed response | Retry with nudge (up to `max_retries_per_step`) |\n| Tool execution failure | Retry tool call (up to `max_tool_errors`) |\n| Premature terminal | Escalating nudge (3 tiers), then fatal |\n| Prerequisite violation | Prerequisite nudge, model re-plans |\n\n---\n\n## Cancellation Support\n\nWorkflows can be cancelled via `asyncio.Event`:\n\n```python\ncancel_event = asyncio.Event()\n\nrunner = WorkflowRunner(client=client, context_manager=ctx)\nasyncio.create_task(runner.run(workflow, user_input, cancel_event))\n\n# Later, from another task:\ncancel_event.set()  # Raises 
WorkflowCancelledError\n```\n\n资料来源：[src/forge/core/workflow.py:80-90]()\n\n---\n\n## Usage Examples\n\n### Basic Workflow\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import Workflow, ToolDef, ToolSpec, WorkflowRunner, OllamaClient\nfrom forge.context import ContextManager, TieredCompact\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant.\",\n)\n\nasync def main():\n    client = OllamaClient(model=\"ministral-3:8b-instruct-q4_K_M\", recommended_sampling=True)\n    ctx = ContextManager(strategy=TieredCompact(keep_recent=2), budget_tokens=8192)\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    messages = await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n资料来源：[README.md:1-60]()\n\n### Multi-Step Workflow with Required Steps\n\n```python\nworkflow = Workflow(\n    name=\"research\",\n    description=\"Research a topic with multiple steps\",\n    tools={\n        \"search\": ToolDef(spec=search_spec, callable=search_impl),\n        \"lookup\": ToolDef(\n            spec=lookup_spec,\n            callable=lookup_impl,\n            prerequisites=[\"search\"],  # Must search before lookup\n        ),\n        \"answer\": ToolDef(\n            spec=answer_spec,\n            callable=answer_impl,\n            prerequisites=[\"search\", \"lookup\"],  # All steps required\n        ),\n    },\n    
required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n    system_prompt_template=\"You are a research assistant.\",\n)\n```\n\n---\n\n## See Also\n\n- [User Guide](docs/USER_GUIDE.md) — Full workflow configuration and best practices\n- [Eval Guide](docs/EVAL_GUIDE.md) — Running evaluation scenarios\n- [ADR-011: Guardrail Middleware](docs/decisions/011-guardrail-middleware.md) — Design rationale for the guardrails system\n- [Backend Setup](docs/BACKEND_SETUP.md) — LLM backend configuration (Ollama, llamafile, llama.cpp)\n\n---\n\n<a id='page-guardrails-middleware'></a>\n\n## Guardrails Middleware for External Loops\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/guardrails/guardrails.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/guardrails.py)\n- [src/forge/guardrails/__init__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/guardrails/__init__.py)\n- [examples/foreign_loop.py](https://github.com/antoinezambelli/forge/blob/main/examples/foreign_loop.py)\n- [src/forge/prompts/nudges.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/prompts/nudges.py)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n</details>\n\n# Guardrails Middleware for External Loops\n\nForge's **Guardrails Middleware** provides a composable reliability layer that can be integrated into any external orchestration loop. 
Rather than requiring adoption of the full `WorkflowRunner`, external projects can embed forge's retry nudges, rescue parsing, step enforcement, and error tracking directly within their own agent execution frameworks.\n\n---\n\n## Overview\n\nThe guardrails system addresses a fundamental challenge in LLM tool-calling: models frequently produce malformed, premature, or incomplete tool invocations that require intervention rather than silent failure.\n\n| Concern | Component | Purpose |\n|---------|-----------|---------|\n| Malformed responses | `ResponseValidator` | Parse tool calls from text, retry on bad format |\n| Premature termination | `StepEnforcer` | Enforce required step sequence before terminal tool |\n| Consecutive failures | `ErrorTracker` | Count retries/tool errors, signal exhaustion |\n| User-facing messages | `Nudge` | Structured retry guidance injected into prompts |\n\n资料来源：[src/forge/guardrails/__init__.py:1-29]()\n\n---\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[\"LLM Response\"] --> B[\"Guardrails.check()\"]\n    B --> C{ResponseValidator}\n    C -->|Valid tool call| D{StepEnforcer}\n    C -->|Bare text / malformed| E[\"Generate retry_nudge\"]\n    D -->|Step violated| F[\"Generate premature_terminal_nudge\"]\n    D -->|All checks pass| G{ErrorTracker}\n    G -->|Under threshold| H[\"action: execute\"]\n    G -->|Over threshold| I[\"action: fatal\"]\n    \n    J[\"Tool Execution\"] --> K[\"Guardrails.record()\"]\n    K --> G\n```\n\nThe `Guardrails` class bundles three components into a single two-method API:\n\n```python\nclass Guardrails:\n    def check(self, response: LLMResponse) -> CheckResult: ...\n    def record(self, executed: list[str]) -> bool: ...\n```\n\n资料来源：[src/forge/guardrails/guardrails.py:86-109]()\n\n---\n\n## CheckResult Data Model\n\nEvery call to `check()` returns a `CheckResult` that encodes the appropriate action for the orchestration loop:\n\n| Field | Type | Description |\n|-------|------|-------------|\n| 
`action` | `Literal[\"execute\", \"retry\", \"step_blocked\", \"fatal\"]` | Directive for the caller |\n| `tool_calls` | `list[ToolCall] \\| None` | Parsed tool invocations (when action is `execute`) |\n| `nudge` | `Nudge \\| None` | Structured message to inject (when action is `retry` or `step_blocked`) |\n| `reason` | `str \\| None` | Human-readable explanation (only when action is `fatal`) |\n\n资料来源：[src/forge/guardrails/guardrails.py:66-85]()\n\n### Action Semantics\n\n| Action | Meaning | Caller Behavior |\n|--------|---------|-----------------|\n| `execute` | Response is valid; proceed to tool execution | Extract `tool_calls`, execute them, call `record()` |\n| `retry` | Response invalid but recoverable; inject nudge | Append `nudge` to messages, re-prompt model |\n| `step_blocked` | Terminal tool called before required steps | Append step-enforcement nudge, re-prompt |\n| `fatal` | Exhausted retry/error budget | Log `reason`, abort workflow |\n\n---\n\n## Guardrails Constructor Parameters\n\n```python\nGuardrails(\n    tool_names: list[str],\n    terminal_tool: str | frozenset[str],\n    required_steps: list[str] | None = None,\n    max_retries: int = 3,\n    max_tool_errors: int = 2,\n    rescue_enabled: bool = True,\n    max_premature_attempts: int = 3,\n    retry_nudge: Callable[[str], str] | None = None,\n)\n```\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `tool_names` | `list[str]` | — | Valid tool names for this workflow |\n| `terminal_tool` | `str \\| frozenset[str]` | — | Tool(s) that can end the workflow |\n| `required_steps` | `list[str] \\| None` | `None` | Tools that must be called before `terminal_tool` |\n| `max_retries` | `int` | `3` | Consecutive bad responses before `fatal` |\n| `max_tool_errors` | `int` | `2` | Consecutive tool execution failures before exhaustion |\n| `rescue_enabled` | `bool` | `True` | Attempt to parse tool calls from plain text |\n| `max_premature_attempts` | `int` | 
`3` | Premature terminal attempts before `fatal` |\n| `retry_nudge` | `Callable[[str], str] \\| None` | `None` | Custom nudge generator for bare text responses |\n\n资料来源：[src/forge/guardrails/guardrails.py:47-69]()\n\n---\n\n## Integration Patterns\n\n### Bundled API (Recommended)\n\nFor most integrations, use the two-method `Guardrails` interface:\n\n```python\nfrom forge.guardrails import Guardrails, CheckResult\n\nguardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\ndef handle_response(response) -> str:\n    result = guardrails.check(response)\n    \n    if result.action == \"fatal\":\n        return f\"ABORT: {result.reason}\"\n    \n    if result.action in (\"retry\", \"step_blocked\"):\n        return f\"RETRY with nudge: {result.nudge.content}\"\n    \n    # Execute tools\n    executed = [tc.tool for tc in result.tool_calls]\n    done = guardrails.record(executed)\n    return f\"executed {executed}\" + (\" -- DONE\" if done else \"\")\n```\n\n资料来源：[examples/foreign_loop.py:60-78]()\n\n### Granular API (Advanced)\n\nFor fine-grained control, instantiate components directly:\n\n```python\nfrom forge.guardrails import ResponseValidator, StepEnforcer, ErrorTracker\n\nvalidator = ResponseValidator(\n    tool_names=[\"search\", \"lookup\", \"answer\"],\n    rescue_enabled=True,\n)\nenforcer = StepEnforcer(\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tools=frozenset([\"answer\"]),\n)\nerrors = ErrorTracker(max_retries=3, max_tool_errors=2)\n\ndef handle_response_granular(response):\n    # Reset state for each new response\n    errors._consecutive_retries = 0\n    enforcer._premature_attempts = 0\n    \n    validation = validator.validate(response)\n    if not validation.valid:\n        return \"RETRY: \" + validation.nudge.content\n    \n    step_check = enforcer.check(response.tool_calls)\n    if step_check.blocked:\n        return 
\"BLOCKED: \" + step_check.nudge.content\n    \n    # Execute, then record\n    errors.record(tool_name=\"search\", error=None)\n    return \"proceed to execution\"\n```\n\n资料来源：[examples/foreign_loop.py:32-59]()\n\n---\n\n## Nudge Generation\n\nThe `nudges.py` module provides structured retry messages with three escalation tiers:\n\n```mermaid\ngraph LR\n    A[\"tier=1\"] --> B[\"Polite:<br/>'You cannot call X yet...'\"]\n    A --> C[\"tier=2\"]\n    C --> D[\"Direct:<br/>'You must call one of...'\"]\n    C --> E[\"tier=3\"]\n    E --> F[\"Aggressive:<br/>'STOP. You MUST call...'\"]\n```\n\n### Premature Terminal Nudge\n\nGenerated when the model attempts to call the terminal tool before completing required steps:\n\n```python\ndef premature_terminal_nudge(\n    terminal_tool: str,\n    pending_steps: list[str],\n    tier: int = 1,\n) -> str:\n    tier = max(1, min(3, tier))  # Clamped to 1-3\n```\n\n资料来源：[src/forge/prompts/nudges.py:1-26]()\n\n| Tier | Tone | Example Output |\n|------|------|----------------|\n| 1 | Polite | \"You cannot call `answer` yet. You must first complete these required steps: search, lookup. Call one of them now.\" |\n| 2 | Direct | \"You must call one of these tools now: search, lookup. Pick one.\" |\n| 3 | Aggressive | \"STOP. You MUST call one of: search, lookup. Do NOT call `answer`. 
Your next response MUST be a tool call to one of: search, lookup.\" |\n\n### Prerequisite Nudge\n\nGenerated when a tool is called without its prerequisites:\n\n```python\ndef prerequisite_nudge(tool_name: str, missing_prereqs: list[str]) -> str:\n```\n\n资料来源：[src/forge/prompts/nudges.py:28-46]()\n\n---\n\n## ToolCall Model\n\nThe `tool_calls` extracted from a valid response are represented as:\n\n```python\nclass ToolCall(BaseModel):\n    tool: str  # Tool name\n    # Additional fields inherited from LLMClient extraction\n```\n\nFor OpenAI-style tool definitions, the `ToolSpec` in `workflow.py` handles parameter schema conversion:\n\n```python\nclass ToolSpec:\n    name: str\n    description: str\n    parameters: type[BaseModel]  # Pydantic model\n    \n    @classmethod\n    def from_openai_schema(cls, name: str, description: str, schema: dict) -> ToolSpec:\n        properties = schema.get(\"properties\", {})\n        required = set(schema.get(\"required\", []))\n```\n\n资料来源：[src/forge/core/workflow.py:1-40]()\n\n---\n\n## Respond Tool Integration\n\nFor conversational use cases where the model may produce a final text response instead of a tool call, forge provides a special `respond` tool:\n\n```python\nfrom forge.tools import RESPOND_TOOL_NAME, respond_spec\n\nrespond_guardrails = Guardrails(\n    tool_names=[\"search\", \"lookup\", \"answer\", RESPOND_TOOL_NAME],\n    required_steps=[\"search\", \"lookup\"],\n    terminal_tool=\"answer\",\n)\n\ndef handle_response_with_respond(response):\n    result = respond_guardrails.check(response)\n    \n    if result.action == \"fatal\":\n        return f\"FATAL: {result.reason}\"\n    \n    if result.action in (\"retry\", \"step_blocked\"):\n        return f\"{result.action}: {result.nudge.content[:80]}...\"\n    \n    tool_calls = result.tool_calls\n    \n    # Check if the model called respond\n    for tc in tool_calls:\n        if tc.tool == RESPOND_TOOL_NAME:\n            message = tc.args.get(\"message\", \"\")\n   
         return f\"MODEL SAYS: {message}\"\n    \n    executed = [tc.tool for tc in tool_calls]\n    done = respond_guardrails.record(executed)\n    return f\"executed {executed}\" + (\" -- DONE\" if done else \"\")\n```\n\n资料来源：[examples/foreign_loop.py:83-114]()\n\n---\n\n## Public API Reference\n\n```python\nfrom forge.guardrails import (\n    # Bundled API\n    CheckResult,\n    Guardrails,\n    \n    # Granular components\n    ErrorTracker,\n    Nudge,\n    ResponseValidator,\n    StepCheck,\n    StepEnforcer,\n    ValidationResult,\n)\n```\n\n资料来源：[src/forge/guardrails/__init__.py:7-27]()\n\n---\n\n## Design Rationale\n\nThe guardrails middleware was designed to solve the foreign-loop integration problem described in ADR-011. The core principle is **orthogonal concerns**: each component handles one aspect of reliability, and the `Guardrails` bundler coordinates them without hiding their individual behavior.\n\nKey design decisions:\n\n| Decision | Rationale |\n|----------|-----------|\n| Two-method API | Simple contract; orchestrator controls the loop |\n| `record()` returns `bool` | `True` signals terminal tool reached; orchestrator decides when to stop |\n| `Nudge` dataclass | Structured message type allows metadata (e.g., tier) beyond raw string |\n| `rescue_enabled` | Rescue parsing from plain text is enabled by default and can be switched off (opt-out), since some backends emit structured output natively |\n| `frozenset[str]` for terminal_tool | Multiple terminal tools supported without API change |\n\n资料来源：[src/forge/guardrails/__init__.py:1-10]()\n\n---\n\n<a id='page-proxy-server'></a>\n\n## Proxy Server Setup\n\n### 相关页面\n\n相关主题：[Backend Clients](#page-backend-clients)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n- [src/forge/proxy/proxy.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n- 
[src/forge/proxy/__init__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__init__.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [scripts/smoke_test_proxy.py](https://github.com/antoinezambelli/forge/blob/main/scripts/smoke_test_proxy.py)\n</details>\n\n# Proxy Server Setup\n\n## Overview\n\nThe forge proxy server is an OpenAI-compatible HTTP proxy that transparently applies forge's guardrail stack to any backend that speaks the Chat Completions API. It acts as a drop-in replacement for local model servers, enabling existing OpenAI-compatible clients to benefit from forge's reliability layer without code changes.\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n## Architecture\n\nThe proxy operates in two distinct modes, determined at startup:\n\n```mermaid\ngraph TD\n    subgraph \"Managed Mode\"\n        P[\"ProxyServer<br/>:8081\"] --> SM[\"ServerManager\"]\n        SM --> BE[\"llama-server<br/>llamafile<br/>ollama<br/>:8080\"]\n    end\n    \n    subgraph \"External Mode\"\n        P2[\"ProxyServer<br/>:8081\"] --> BU[\"User-managed<br/>Backend<br/>:8080\"]\n    end\n    \n    C[\"OpenAI Client\"] --> P\n    C2[\"OpenAI Client\"] --> P2\n```\n\n### Managed Mode\n\nIn managed mode, forge starts and controls the backend process lifecycle. The `ServerManager` class handles:\n\n- Backend binary discovery and execution\n- Model loading and initialization\n- Health verification via `/props` endpoint polling\n- Graceful shutdown and restart\n\n资料来源：[src/forge/proxy/proxy.py:1-40](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n### External Mode\n\nIn external mode, the proxy connects to a user-managed backend. 
This is useful when:\n\n- The backend runs on a different machine or container\n- Custom backend configurations are required\n- The backend is managed by an external orchestration system\n\n资料来源：[src/forge/proxy/proxy.py:35-45](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n## Supported Backends\n\n| Backend | Description | Requirements |\n|---------|-------------|--------------|\n| `llamaserver` | Llama.cpp's HTTP server | Local GGUF model file |\n| `llamafile` | Mozilla's single-file model executable | Single-file executable |\n| `ollama` | Ollama local inference server | Ollama runtime + model pulled |\n\n资料来源：[src/forge/proxy/__main__.py:25-29](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n\n## CLI Usage\n\n### Basic Invocation\n\n```bash\n# External mode — you manage the backend\npython -m forge.proxy --backend-url http://localhost:8080 --port 8081\n\n# Managed mode — forge starts llama-server and proxy together\npython -m forge.proxy --backend llamaserver --gguf path/to/model.gguf --port 8081\n\n# Managed mode with ollama\npython -m forge.proxy --backend ollama --model llama3.2 --port 8081\n```\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n### Command-Line Arguments\n\n| Argument | Type | Default | Description |\n|----------|------|---------|-------------|\n| `--backend-url` | string | - | External backend URL (mutually exclusive with `--backend`) |\n| `--backend` | choice | - | Backend type: `llamaserver`, `llamafile`, `ollama` |\n| `--model` | string | - | Model name (required for ollama) |\n| `--gguf` | string | - | Path to GGUF file (llamaserver/llamafile) |\n| `--backend-port` | int | 8080 | Backend port for managed mode |\n| `--budget-mode` | choice | backend | Context budget: `backend`, `manual`, `forge-full`, `forge-fast` |\n| `--budget-tokens` | int | - | Manual token budget override |\n| `--extra-flags` | list | - | Additional backend CLI flags 
|\n| `--host` | string | 127.0.0.1 | Proxy listen host |\n| `--port` | int | 8081 | Proxy listen port |\n| `--serialize` | flag | - | Force request serialization |\n| `--no-serialize` | flag | - | Disable request serialization |\n| `--max-retries` | int | 3 | Max retries per request |\n| `--no-rescue` | flag | - | Disable rescue parsing |\n| `-v, --verbose` | flag | - | Enable debug logging |\n\n资料来源：[src/forge/proxy/__main__.py:13-53](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n\n## Programmatic API\n\n### ProxyServer Class\n\nThe `ProxyServer` class provides a programmatic interface for embedding the proxy in Python applications.\n\n```python\nfrom forge.proxy import ProxyServer\n\n# External mode\nproxy = ProxyServer(backend_url=\"http://localhost:8080\")\nproxy.start()\nprint(f\"Proxy running at {proxy.url}\")  # http://127.0.0.1:8081\n# ... use proxy ...\nproxy.stop()\n```\n\n```python\n# Managed mode\nproxy = ProxyServer(\n    backend=\"llamaserver\",\n    gguf=\"model.gguf\",\n    budget_mode=\"forge-fast\",\n    port=8081\n)\nproxy.start()\nproxy.stop()  # Stops both backend and proxy\n```\n\n资料来源：[src/forge/proxy/proxy.py:50-75](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n### Constructor Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `backend_url` | `str \\| None` | None | External backend URL |\n| `backend` | `str \\| None` | None | Backend type: `llamaserver`, `llamafile`, `ollama` |\n| `model` | `str \\| None` | None | Model name for ollama |\n| `gguf` | `str \\| Path \\| None` | None | Path to GGUF file |\n| `backend_port` | int | 8080 | Backend port |\n| `budget_mode` | BudgetMode | BudgetMode.BACKEND | Context budget strategy |\n| `budget_tokens` | int | - | Manual token budget |\n| `extra_flags` | `list[str] \\| None` | None | Additional CLI flags |\n| `host` | str | 127.0.0.1 | Listen host |\n| `port` | int | 8081 | 
Listen port |\n| `serialize` | `bool \\| None` | None | Request serialization control |\n| `max_retries` | int | 3 | Max retries per request |\n| `rescue_enabled` | bool | True | Enable rescue parsing |\n\n资料来源：[src/forge/proxy/proxy.py:56-100](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n### Lifecycle Methods\n\n| Method | Description |\n|--------|-------------|\n| `start()` | Start the proxy (blocks until ready, max 120s timeout) |\n| `stop()` | Stop the proxy and managed backend (30s shutdown timeout) |\n| `url` | Property returning the proxy's base URL |\n\n资料来源：[src/forge/proxy/proxy.py:102-125](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n## Respond Tool Injection\n\n### Purpose\n\nSmall local models (~8B parameters) cannot reliably choose between text output and tool calls. The proxy automatically injects a synthetic `respond` tool when tools are present in the request, forcing the model into tool-calling mode.\n\n### Behavior\n\n1. When the request contains `tools`, forge injects a `respond(message=\"...\")` tool into the tools list\n2. The model calls `respond(message=\"...\")` instead of producing bare text\n3. The `respond` call is stripped from the outbound response\n4. 
The client receives a normal text response with `finish_reason: \"stop\"`\n\nThis keeps the model in tool-calling mode where forge's full guardrail stack applies.\n\n资料来源：[README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n\n```mermaid\nsequenceDiagram\n    participant C as OpenAI Client\n    participant P as ProxyServer\n    participant B as Backend\n    \n    C->>P: POST /v1/chat/completions<br/>(with tools)\n    P->>P: Inject respond tool\n    P->>B: Forward request<br/>(tools + respond)\n    B->>P: respond(message=\"answer\")\n    P->>P: Strip respond call\n    P->>C: Normal text response<br/>(finish_reason: \"stop\")\n```\n\n## Context Budget Modes\n\nThe proxy supports different strategies for managing context window usage:\n\n| Mode | Description |\n|------|-------------|\n| `backend` | Let the backend manage context (default) |\n| `manual` | Use `--budget-tokens` for fixed budget |\n| `forge-full` | Full tiered compaction strategy |\n| `forge-fast` | Fast tiered compaction (reduced) |\n\n资料来源：[src/forge/proxy/__main__.py:35-38](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n\n### Tiered Compaction\n\nThe `forge-full` and `forge-fast` modes utilize `TieredCompact`, a three-phase compaction strategy:\n\n1. **Truncate** — Remove oldest messages\n2. **Drop results** — Remove tool result content\n3. **Sliding window** — Maintain recent context\n\n资料来源：[src/forge/proxy/proxy.py:22](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/proxy.py)\n\n## Request Serialization\n\nBy default, the proxy decides whether to serialize requests based on backend capabilities. 
The serialization flags control this behavior:\n\n| Flag | Behavior |\n|------|----------|\n| (none) | Proxy decides based on backend capabilities |\n| `--serialize` | Force sequential request processing |\n| `--no-serialize` | Allow concurrent processing |\n\n资料来源：[src/forge/proxy/__main__.py:31-34](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n\n## Sampling Parameters Pass-Through\n\nThe proxy forwards OpenAI-compatible sampling fields directly to the backend without modification:\n\n- `temperature`\n- `top_p`\n- `top_k`\n- `min_p`\n- `repeat_penalty`\n- `presence_penalty`\n- `seed`\n\n资料来源：[CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md)\n\nTo use model-card-recommended sampling in proxy mode:\n\n```python\nfrom forge.clients import get_sampling_defaults\n\n# Look up recommended sampling parameters\nsampling = get_sampling_defaults(\"ministral-3-8b-instruct\")\n# Include in request body\nresponse = client.post(\"/v1/chat/completions\", json={\n    \"model\": \"ministral-3-8b-instruct\",\n    \"messages\": [...],\n    **sampling\n})\n```\n\n## Signal Handling\n\nThe proxy gracefully handles shutdown signals:\n\n- `SIGINT` (Ctrl+C) — Immediate shutdown\n- `SIGTERM` — Graceful shutdown\n\nThe main thread uses a timed sleep loop (`time.sleep(0.1)`) to allow Python to deliver signals between iterations, ensuring proper shutdown on Windows.\n\n资料来源：[src/forge/proxy/__main__.py:95-105](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n\n## Testing with Smoke Test Script\n\nThe repository includes a smoke test at `scripts/smoke_test_proxy.py` that:\n\n1. Starts a mock backend on port 18080\n2. Launches the proxy in external mode on port 18081\n3. Verifies health endpoint\n4. Sends a test chat completion request\n5. 
Validates the response structure\n\n```bash\npython scripts/smoke_test_proxy.py\n```\n\n资料来源：[scripts/smoke_test_proxy.py](https://github.com/antoinezambelli/forge/blob/main/scripts/smoke_test_proxy.py)\n\n## Health Endpoint\n\nThe proxy exposes a `/health` endpoint for monitoring:\n\n```bash\ncurl http://127.0.0.1:8081/health\n```\n\n资料来源：[scripts/smoke_test_proxy.py:70](https://github.com/antoinezambelli/forge/blob/main/scripts/smoke_test_proxy.py)\n\n## Configuration Example: Complete Setup\n\n```bash\n# Start llama-server with custom flags, proxy it\npython -m forge.proxy \\\n    --backend llamaserver \\\n    --gguf ./models/ministral-3-8b-instruct-q8_0.gguf \\\n    --model ministral-3-8b-instruct \\\n    --budget-mode forge-full \\\n    --backend-port 8080 \\\n    --port 8081 \\\n    --host 0.0.0.0 \\\n    --extra-flags --reasoning-format auto \\\n    --verbose\n```\n\nThen configure your client:\n\n```python\nimport httpx\n\nclient = httpx.AsyncClient(\n    base_url=\"http://localhost:8081/v1\",\n    timeout=120.0\n)\n\nresponse = await client.post(\"/chat/completions\", json={\n    \"model\": \"ministral-3-8b-instruct\",\n    \"messages\": [\n        {\"role\": \"user\", \"content\": \"What's the weather in Paris?\"}\n    ],\n    \"tools\": [\n        {\n            \"type\": \"function\",\n            \"function\": {\n                \"name\": \"get_weather\",\n                \"parameters\": {\n                    \"type\": \"object\",\n                    \"properties\": {\n                        \"city\": {\"type\": \"string\"}\n                    },\n                    \"required\": [\"city\"]\n                }\n            }\n        }\n    ]\n})\n```\n\n---\n\n<a id='page-backend-clients'></a>\n\n## Backend Clients\n\n### 相关页面\n\n相关主题：[Backend Setup Guide](#page-backend-setup), [Model Selection Guide](#page-model-guide)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- 
[src/forge/clients/base.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/base.py)\n- [src/forge/clients/ollama.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/ollama.py)\n- [src/forge/clients/llamafile.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/llamafile.py)\n- [src/forge/clients/anthropic.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/anthropic.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [src/forge/core/workflow.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/core/workflow.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n</details>\n\n# Backend Clients\n\n## Overview\n\nThe Backend Clients subsystem provides a unified abstraction layer over various LLM backends, enabling forge to interact with different inference engines through a consistent interface. This modular design allows users to switch between Ollama, llamafile, and Anthropic backends without modifying workflow code.\n\nEach client handles backend-specific communication protocols, response parsing, streaming, and tool call extraction while exposing a common async API for send operations and context resolution.\n\n资料来源：[src/forge/clients/base.py:1-50]()\n\n## Architecture\n\n### Client Hierarchy\n\n```mermaid\ngraph TD\n    A[BaseClient] --> B[OllamaClient]\n    A --> C[LlamafileClient]\n    A --> D[AnthropicClient]\n    \n    E[sampling_defaults.py] --> B\n    E --> C\n    E --> D\n    \n    F[WorkflowRunner] --> B\n    F --> C\n    F --> D\n```\n\nAll clients inherit from `BaseClient`, which defines the core async interface including `send()`, `send_stream()`, and `get_context_length()` methods. 
Backend-specific implementations override these methods to handle vendor-specific APIs and response formats.\n\n资料来源：[src/forge/clients/base.py:1-100]()\n\n### Common Interface\n\nAll clients implement the following async methods:\n\n| Method | Purpose |\n|--------|---------|\n| `send(messages, tools, **kwargs)` | Send a request and receive a complete response |\n| `send_stream(messages, tools, **kwargs)` | Stream responses as an async generator |\n| `get_context_length()` | Query the backend for maximum context window |\n| `stop()` | Stop any ongoing generation |\n\n资料来源：[src/forge/clients/base.py:50-150]()\n\n## OllamaClient\n\n### Purpose\n\nThe `OllamaClient` connects to a local Ollama server instance, supporting both standard models and GGUF-formatted models served through Ollama's model management system.\n\n资料来源：[src/forge/clients/ollama.py:1-100]()\n\n### Configuration Options\n\n```python\nOllamaClient(\n    model: str,                          # Model name (e.g., \"qwen3:8b-q4_K_M\")\n    base_url: str = \"http://localhost:11434\",\n    recommended_sampling: bool = False, # Use verified per-model sampling params\n    **kwargs                            # Passed to httpx client\n)\n```\n\n### Key Features\n\n- **Recommended Sampling**: When `recommended_sampling=True`, the client retrieves verified sampling parameters from `forge.clients.sampling_defaults` for known models. 
If a model is not in the map and `strict=True`, an `UnsupportedModelError` is raised.\n\n- **Streaming Support**: Full streaming support with token-level async generation through `send_stream()`.\n\n- **Tool Call Extraction**: Parses Ollama's JSON tool call format and converts to forge's internal `ToolCall` format.\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-80]()\n\n## LlamafileClient\n\n### Purpose\n\nThe `LlamafileClient` communicates with llamafile or llama-server instances, providing support for GGUF models served directly without Ollama's model management layer.\n\n资料来源：[src/forge/clients/llamafile.py:1-100]()\n\n### Context Resolution\n\nUnlike Ollama, llamafile and llama-server require querying the `/props` endpoint to determine the configured context length:\n\n```python\nasync def get_context_length(self) -> int | None:\n    \"\"\"Query the Llamafile /props endpoint for configured context length.\"\"\"\n    base = self.base_url.rstrip(\"/\")\n    if base.endswith(\"/v1\"):\n        base = base[:-3]\n\n    resp = await self._http.get(f\"{base}/props\")\n    data = resp.json()\n    n_ctx = data.get(\"default_generation_settings\", {}).get(\"n_ctx\")\n    return int(n_ctx) if n_ctx is not None else None\n```\n\n资料来源：[src/forge/clients/llamafile.py:180-200]()\n\n### Tool Call Modes\n\nThe client supports multiple tool call parsing strategies:\n\n| Mode | Description |\n|------|-------------|\n| `native` | Uses backend's native tool call format |\n| `function` | Parses `<function=name>...</function>` style tags |\n| `prompt` | Extracts tool calls from prompted responses |\n\n资料来源：[src/forge/clients/llamafile.py:100-180]()\n\n## AnthropicClient\n\n### Purpose\n\nThe `AnthropicClient` integrates with Anthropic's Claude API, enabling forge workflows to leverage Claude Opus, Sonnet, and Haiku models.\n\n资料来源：[src/forge/clients/anthropic.py:1-100]()\n\n### Key Differences\n\n- No hardcoded temperature defaults — relies on Anthropic API's own defaults\n- 
Supports Anthropic-specific headers and request formatting\n- Compatible with tools via Anthropic's tool use API\n\n## Sampling Defaults System\n\n### Overview\n\nThe `sampling_defaults` module provides verified per-model sampling parameters sourced from HuggingFace model cards. This ensures optimal generation quality for supported models without requiring users to manually tune hyperparameters.\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-50]()\n\n### Supported Parameters\n\n| Parameter | Description | Typical Range |\n|-----------|-------------|---------------|\n| `temperature` | Sampling temperature | 0.0 - 1.0 |\n| `top_p` | Nucleus sampling threshold | 0.0 - 1.0 |\n| `top_k` | Top-k sampling | 1 - 100 |\n| `min_p` | Minimum probability threshold | 0.0 - 1.0 |\n| `repeat_penalty` | Repetition penalty | 0.0 - 2.0 |\n| `presence_penalty` | Presence penalty (OpenAI compat) | -2.0 - 2.0 |\n\n### Policy Behavior\n\nThe `apply_sampling_defaults()` function implements a four-quadrant policy:\n\n```python\ndef apply_sampling_defaults(model: str, *, strict: bool) -> dict[str, float | int]:\n    \"\"\"Apply the recommended-sampling policy for model.\"\"\"\n    in_map = model in MODEL_SAMPLING_DEFAULTS\n    if strict:\n        if not in_map:\n            raise UnsupportedModelError(model)\n        return dict(MODEL_SAMPLING_DEFAULTS[model])\n    \n    # strict=False: one-shot INFO log if known, else silent\n    if in_map and model not in _INFO_LOGGED:\n        log.info(\"Recommended sampling params exist for %r...\", model)\n        _INFO_LOGGED.add(model)\n    return {}\n```\n\n| `strict` | Model in Map | Behavior |\n|----------|--------------|----------|\n| `True` | Yes | Return dict copy |\n| `True` | No | Raise `UnsupportedModelError` |\n| `False` | Yes | One-shot INFO log; return `{}` |\n| `False` | No | Return `{}` (silent) |\n\n资料来源：[src/forge/clients/sampling_defaults.py:60-120]()\n\n### Verified Models\n\nThe following model families are currently supported 
with verified sampling parameters:\n\n- Qwen3 / Qwen3.5 / Qwen3.6\n- Qwen3-Coder\n- Gemma 4\n- Mistral Small 3.2\n- Devstral Small 2\n- Ministral 3 Instruct + Reasoning\n- Mistral Nemo\n- Granite 4.0\n\nEach entry includes an inline HuggingFace card URL comment for verification.\n\n资料来源：[src/forge/clients/sampling_defaults.py:50-80]()\n\n## Tool Call Processing\n\n### Extraction Flow\n\n```mermaid\ngraph TD\n    A[LLM Response] --> B{Response Type}\n    B -->|tool_calls| C[extract_tool_call]\n    B -->|text| D[TextResponse]\n    \n    C --> E{Tool Call Format}\n    E -->|OpenAI style| F[Parse name + arguments]\n    E -->|function tags| G[Parse XML-style tags]\n    E -->|dict style| H[Parse dict with name field]\n    \n    F --> I[ToolCall object]\n    G --> I\n    H --> I\n```\n\n### Supported Formats\n\n| Backend | Format | Example |\n|---------|--------|---------|\n| Ollama | OpenAI-style function calls | `{\"name\": \"get_weather\", \"arguments\": {\"city\": \"Paris\"}}` |\n| Llamafile | Function tags or native | `<function=name><parameter=city>Paris</parameter></function>` |\n| Anthropic | Claude tool_use blocks | `{name: \"get_weather\", input: {city: \"Paris\"}}` |\n\n资料来源：[src/forge/core/workflow.py:1-50]()\n\n## Proxy Mode Integration\n\n### Request Passthrough\n\nWhen running in proxy mode, the client plumbs OpenAI-compatible body fields through to backends without modification:\n\n```text\n# Proxy plumbs these fields through per request:\n- temperature\n- top_p\n- top_k\n- min_p\n- repeat_penalty\n- presence_penalty\n- seed\n```\n\nFor per-model recommended sampling in proxy mode, the calling client must look up `forge.clients.get_sampling_defaults(model)` and include the values in the request body.\n\n资料来源：[src/forge/proxy/__main__.py:1-50]()\n\n## Usage Examples\n\n### Basic Workflow with OllamaClient\n\n```python\nimport asyncio\n\nfrom forge import OllamaClient, WorkflowRunner, ContextManager, TieredCompact\n\nasync def main():\n    client = OllamaClient(\n        
model=\"ministral-3:8b-instruct-2512-q4_K_M\",\n        recommended_sampling=True  # Use verified sampling params\n    )\n    ctx = ContextManager(\n        strategy=TieredCompact(keep_recent=2),\n        budget_tokens=8192\n    )\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    # ... run workflows\n\nasyncio.run(main())\n```\n\n### Per-Call Sampling Override\n\n```python\nresponse = await client.send(\n    messages,\n    tools,\n    sampling={\n        \"temperature\": 0.7,\n        \"top_p\": 0.9\n    }\n)\n```\n\nThe caller's explicit non-None fields merge with client-level defaults without mutating the original configuration.\n\n## Error Handling\n\n| Error | Cause | Resolution |\n|-------|-------|------------|\n| `UnsupportedModelError` | Model not in sampling defaults map | Add model to `sampling_defaults.py` or pass `recommended_sampling=False` |\n| `httpx.HTTPError` | Backend unreachable | Verify backend is running on correct port |\n| `BudgetResolutionError` | Cannot determine context length | Check backend `/props` endpoint returns `n_ctx` |\n\n资料来源：[src/forge/server.py:1-50]()\n\n## Backend Server Management\n\n### ServerManager Integration\n\nForge can auto-manage backend servers through `ServerManager`, which handles starting, stopping, and context resolution:\n\n```python\nfrom forge.server import create_server_and_context\n\nserver, ctx = await create_server_and_context(\n    backend=\"ollama\",\n    model=\"qwen3:8b-q4_K_M\",\n    budget_mode=BudgetMode.FORGE_FAST,\n    client=client,\n)\n```\n\n### Supported Backends\n\n| Backend | Model Specification | Port | Features |\n|---------|---------------------|------|----------|\n| `ollama` | Model name string | 11434 | Auto model management |\n| `llamaserver` | GGUF file path | 8080 | Direct GGUF serving |\n| `llamafile` | GGUF file path | 8080 | Single-file server |\n\n资料来源：[src/forge/server.py:50-150]()\n\n---\n\n<a id='page-backend-setup'></a>\n\n## Backend Setup Guide\n\n### 
相关页面\n\n相关主题：[Backend Clients](#page-backend-clients), [Model Selection Guide](#page-model-guide)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md)\n</details>\n\n# Backend Setup Guide\n\n## Overview\n\nThe Backend Setup Guide covers how to configure, initialize, and manage LLM backend servers within forge. Forge supports multiple backend types—**Ollama**, **llama-server**, and **llamafile**—each with distinct initialization patterns, context management strategies, and operational characteristics. Understanding these backends is essential for running forge's Workflow and WorkflowRunner components effectively.\n\nForge abstracts backend management through two primary classes: `ServerManager` (for direct server lifecycle control) and `setup_backend()` (a high-level async factory that combines server startup with `ContextManager` creation). 
资料来源：[src/forge/server.py:1-50]()\n\n## Supported Backend Types\n\nForge supports three backend implementations, each targeting different deployment scenarios:\n\n| Backend | Identity | Configuration | Use Case |\n|---------|----------|---------------|----------|\n| `ollama` | Model name (e.g., `qwen3:8b`) | Uses Ollama's native model management | Quick local development, model switching |\n| `llamaserver` | GGUF file path | Requires explicit GGUF path and context size | Production GGUF inference with fine control |\n| `llamafile` | GGUF file path | Auto-discovers llamafile runtime | Single-file distribution, portable setups |\n\nThe backend type is determined at `ServerManager` instantiation and cannot be changed afterward. 资料来源：[src/forge/server.py:140-145]()\n\n### Ollama Backend\n\nThe Ollama backend leverages Ollama's built-in model management system. Models are pulled and managed through the Ollama CLI rather than requiring manual GGUF file handling.\n\n```python\nserver = ServerManager(backend=\"ollama\", port=8080)\nawait server.start(model=\"qwen3:8b-q4_K_M\", gguf_path=\"\", mode=\"native\")\n```\n\n**Key constraints:**\n- Does not accept `gguf_path` (use the `model` parameter instead)\n- Requires `model` to be specified\n- VRAM cleanup between model switches is handled via `ollama stop` 资料来源：[src/forge/server.py:170-180]()\n\n### Llama-server and Llamafile Backends\n\nBoth `llamaserver` and `llamafile` backends operate on GGUF files directly. The server identity is derived from the GGUF path, enabling cache-equality checks for server reuse. 
资料来源：[src/forge/server.py:185-200]()\n\n```python\n# Llama-server\nserver = ServerManager(backend=\"llamaserver\", port=8080)\nawait server.start(model=\"identity\", gguf_path=\"/models/qwen3-8b-q4_K_M.gguf\", mode=\"native\")\n\n# Llamafile (auto-discovers runtime)\nserver = ServerManager(backend=\"llamafile\", port=8080)\nawait server.start(model=\"identity\", gguf_path=\"/models/llamafile-binary\", mode=\"native\")\n```\n\n## ServerManager Architecture\n\nThe `ServerManager` class encapsulates all backend server lifecycle operations, providing a unified interface across different backend types.\n\n### State Management\n\n```mermaid\ngraph TD\n    A[ServerManager.__init__] --> B[_proc: Popen | None]\n    A --> C[_current_model: str | None]\n    A --> D[_current_mode: str | None]\n    A --> E[_current_ctx: int | None]\n    A --> F[_current_flags: tuple]\n    A --> G[_current_cache_type_k/v: str | None]\n    A --> H[_current_n_slots: int | None]\n    A --> I[_current_kv_unified: bool]\n    \n    J[ServerManager.start] --> K{Cache Hit?}\n    K -->|Yes| L[Return - reuse existing]\n    K -->|No| M[await stop]\n    M --> N[Build command flags]\n    N --> O[Spawn subprocess]\n```\n\n### Cache Equality Check\n\nBefore starting a new server instance, `ServerManager` checks if an existing server matches the requested configuration. This prevents unnecessary VRAM allocation and model reloading. 
资料来源：[src/forge/server.py:50-65]()\n\n```python\nflags = tuple(extra_flags) if extra_flags else ()\nif (\n    self._current_model == model\n    and self._current_mode == mode\n    and self._current_ctx == ctx_override\n    and self._current_flags == flags\n    and self._current_cache_type_k == cache_type_k\n    and self._current_cache_type_v == cache_type_v\n    and self._current_n_slots == n_slots\n    and self._current_kv_unified == kv_unified\n):\n    return  # Reuse existing server\n```\n\n### Server Initialization Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `backend` | `str` | Required | Backend type: `\"ollama\"` \\| `\"llamaserver\"` \\| `\"llamafile\"` |\n| `port` | `int` | `8080` | Server listen port (llama-server / llamafile only) |\n| `models_dir` | `str \\| Path` | `None` | Directory containing GGUF files |\n\n## Startup Parameters\n\nThe `start()` method accepts numerous parameters for fine-grained control over server behavior:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `model` | `str` | Model identity (Ollama: model name; others: GGUF path as string) |\n| `gguf_path` | `str \\| Path` | Path to GGUF file for llamaserver/llamafile |\n| `mode` | `str` | `\"native\"` or `\"prompt\"` reasoning mode |\n| `extra_flags` | `list[str]` | Additional CLI flags passed to the server |\n| `ctx_override` | `int \\| None` | Override context window size (`-c <value>`) |\n| `cache_type_k` | `str` | KV cache quantization type for keys (e.g., `\"q8_0\"`, `\"q4_0\"`) |\n| `cache_type_v` | `str` | KV cache quantization type for values |\n| `n_slots` | `int` | Concurrent slot count for multi-agent architectures |\n| `kv_unified` | `bool` | Use unified KV cache across all slots |\n\n资料来源：[src/forge/server.py:95-115]()\n\n## Budget Modes\n\nBudget modes control how forge resolves the context window budget for the `ContextManager`. 
The `resolve_budget()` method maps `BudgetMode` enum values to actual token counts. 资料来源：[src/forge/server.py:220-250]()\n\n```mermaid\ngraph TD\n    A[resolve_budget mode] --> B{MANUAL?}\n    B -->|Yes, Ollama| C[Return manual_tokens]\n    B -->|Yes, others| D[await get_server_context]\n    B -->|No| E{Ollama?}\n    E -->|Yes| F[await _ollama.full]\n    E -->|No| G{Mode == FORGE_FAST?}\n    G -->|Yes| H[await get_server_context ÷ 4]\n    G -->|No| I[await get_server_context]\n```\n\n### Budget Resolution Table\n\n| BudgetMode | Ollama Backend | Llama-server/Llamafile Backend |\n|------------|----------------|--------------------------------|\n| `MANUAL` | Returns `manual_tokens` parameter | Queries `/props` for `n_ctx` |\n| `BACKEND` | Ollama's reported context length | Queries `/props` for `n_ctx` |\n| `FORGE_FAST` | `n_ctx / 4` | `n_ctx / 4` |\n\n## High-Level Setup with `setup_backend()`\n\nFor most use cases, prefer `setup_backend()` which combines server startup with `ContextManager` creation. 资料来源：[src/forge/server.py:280-330]()\n\n```python\nfrom forge.server import setup_backend, BudgetMode\n\nasync def example():\n    client, ctx = await setup_backend(\n        backend=\"llamaserver\",\n        gguf_path=\"/models/qwen3-8b-q4_K_M.gguf\",\n        budget_mode=BudgetMode.FORGE_FAST,\n        client=None,  # Will create default client\n    )\n    # ... 
run workflows ...\n    await client.close()\n```\n\n### `setup_backend()` Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `backend` | `str` | Required | Backend type |\n| `model` | `str \\| None` | `None` | Ollama model name |\n| `gguf_path` | `str \\| Path \\| None` | `None` | GGUF file path |\n| `budget_mode` | `BudgetMode` | `BudgetMode.BACKEND` | Context budget strategy |\n| `manual_tokens` | `int \\| None` | `None` | Required for `MANUAL` mode on Ollama |\n| `client` | `Any \\| None` | `None` | Existing client or `None` to create default |\n| `mode` | `str` | `\"native\"` | Reasoning mode |\n| `port` | `int` | `8080` | Server port |\n| `extra_flags` | `list[str] \\| None` | `None` | Additional backend flags |\n| `on_compact` | `Callable \\| None` | `None` | Callback for compaction events |\n| `compact_threshold` | `float` | `0.75` | Compaction trigger threshold |\n| `phase_thresholds` | `tuple` | `(0.5, 0.7, 0.9)` | Tiered compaction thresholds |\n\n## Server Readiness Detection\n\nForge uses `/props` polling rather than `/health` for readiness confirmation. This eliminates the gap between health-ok and props-available states. 资料来源：[src/forge/server.py:260-278]()\n\n```python\nasync def wait_for_ready(self, timeout: float = 60.0) -> None:\n    url = f\"http://localhost:{self._port}/props\"\n    while time.monotonic() < deadline:\n        try:\n            resp = await client.get(url)\n            if resp.status_code == 200:\n                data = resp.json()\n                if \"default_generation_settings\" in data:\n                    return\n        except (httpx.ConnectError, httpx.ReadError, httpx.TimeoutException):\n            pass\n        await asyncio.sleep(2)\n```\n\nThe readiness check looks for `default_generation_settings` in the response—a strong indicator that the model is fully loaded and serving. 
资料来源：[src/forge/server.py:260-278]()\n\n## Proxy Server Configuration\n\nForge includes a proxy server (`forge.proxy`) that plumbs OpenAI-compatible sampling parameters through to backends. The proxy does not consult the sampling defaults map; it passes through whatever parameters the inbound request carries. 资料来源：[src/forge/proxy/__main__.py:1-60]()\n\n### Proxy CLI Options\n\n| Flag | Type | Default | Description |\n|------|------|---------|-------------|\n| `--backend-url` | `str` | Required | Target backend URL |\n| `--backend` | `str` | Required | Backend type |\n| `--model` | `str` | Required | Model identifier |\n| `--gguf` | `str` | `\"\"` | GGUF path (for non-Ollama) |\n| `--budget-mode` | `str` | `\"backend\"` | Budget resolution mode |\n| `--budget-tokens` | `int` | `None` | Manual token budget |\n| `--host` | `str` | `127.0.0.1` | Proxy listen host |\n| `--port` | `int` | `8081` | Proxy listen port |\n| `--serialize` | `flag` | `None` | Force request serialization |\n| `--max-retries` | `int` | `3` | Max retries per request |\n| `--verbose` | `flag` | `False` | Enable debug logging |\n\n### Proxy Sampling Passthrough\n\nThe proxy supports these OpenAI-compatible body fields:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `temperature` | `float` | Sampling temperature |\n| `top_p` | `float` | Nucleus sampling threshold |\n| `top_k` | `int` | Top-k sampling |\n| `min_p` | `float` | Minimum probability threshold |\n| `repeat_penalty` | `float` | Repetition penalty |\n| `presence_penalty` | `float` | Presence penalty |\n| `seed` | `int` | Deterministic sampling seed |\n\nFor per-model recommended sampling in proxy mode, callers should look up `forge.clients.get_sampling_defaults(model)` and include the values in the request body. 资料来源：[src/forge/clients/sampling_defaults.py:1-50]()\n\n## Per-Model Sampling Defaults\n\nForge ships verified per-model sampling recommendations for supported models. 
These must be explicitly opted into via `recommended_sampling=True`. 资料来源：[src/forge/clients/sampling_defaults.py:50-80]()\n\n### Supported Models\n\nThe sampling defaults map includes recommendations for:\n\n- **Qwen3 / 3.5 / 3.6** series\n- **Qwen3-Coder**\n- **Gemma 4**\n- **Mistral Small 3.2**\n- **Devstral Small 2**\n- **Ministral 3 Instruct + Reasoning**\n- **Mistral Nemo**\n- **Granite 4.0** (`h-micro`, `h-tiny`)\n\nEach entry includes an inline HuggingFace model card URL for verification. 资料来源：[CHANGELOG.md:0.6.0]()\n\n### Sampling Policy\n\n| `strict` | Model in Map | Behavior |\n|----------|--------------|----------|\n| `True` | Yes | Return dict copy |\n| `True` | No | Raise `UnsupportedModelError` |\n| `False` | Yes | One-shot INFO log; return `{}` |\n| `False` | No | Return `{}` (silent) |\n\n## Context Length Resolution\n\nFor non-Ollama backends, forge queries the server's `/props` endpoint to determine the configured context length. This value feeds into budget resolution. 资料来源：[src/forge/server.py:200-220]()\n\n```python\nasync def get_server_context(self) -> int:\n    \"\"\"Query /props for actual n_ctx.\n    \n    For Ollama: ``ollama stop`` for clean VRAM unloads between model switches.\n    \"\"\"\n    props = await self.query_props()\n    ctx = props.get(\"default_generation_settings\", {}).get(\"n_ctx\")\n    if ctx is None:\n        raise BudgetResolutionError()\n    return ctx\n```\n\n## Best Practices\n\n### Server Reuse\n\nAlways check if a server with the desired configuration is already running before starting a new one. The `ServerManager` performs this check internally based on:\n\n1. Model identity (name or GGUF path)\n2. Mode (`native` or `prompt`)\n3. Context override\n4. CLI flags\n5. KV cache quantization settings\n6. Slot configuration\n\n### VRAM Management\n\nFor Ollama backends, use `ollama stop` to cleanly unload models and free VRAM before switching to a different model. 
The llama-server/llamafile backends handle this through server restart. 资料来源：[src/forge/server.py:200]()\n\n### Graceful Shutdown\n\nAlways call `server.stop()` when finished to properly terminate the backend process:\n\n```python\nserver = ServerManager(backend=\"llamaserver\", port=8080)\ntry:\n    await server.start(...)\n    # ... work ...\nfinally:\n    await server.stop()\n```\n\n### Multi-Agent Configurations\n\nFor multi-agent architectures requiring concurrent slots, configure `n_slots` and optionally `kv_unified=True` for shared KV cache across slots:\n\n```python\nawait server.start(\n    model=\"...\",\n    gguf_path=\"...\",\n    n_slots=4,\n    kv_unified=True,  # Each slot can use full context\n)\n```\n\n## Quick Start Example\n\n```python\nimport asyncio\nfrom forge import OllamaClient, WorkflowRunner\nfrom forge.server import setup_backend, BudgetMode\nfrom forge.context import ContextManager, TieredCompact\n\nasync def main():\n    # Setup backend with forge-managed context\n    client, ctx = await setup_backend(\n        backend=\"ollama\",\n        model=\"ministral-3:8b-instruct-2512-q4_K_M\",\n        budget_mode=BudgetMode.FORGE_FAST,\n        recommended_sampling=True,\n    )\n    \n    try:\n        runner = WorkflowRunner(client=client, context_manager=ctx)\n        # ... 
run workflows ...\n    finally:\n        await client.close()\n\nasyncio.run(main())\n```\n\nFor GGUF-based setups:\n\n```python\nfrom forge.server import setup_backend, BudgetMode\n\nclient, ctx = await setup_backend(\n    backend=\"llamaserver\",\n    gguf_path=\"/path/to/model-q4_K_M.gguf\",\n    budget_mode=BudgetMode.FORGE_FAST,\n    extra_flags=[\"--reasoning-format\", \"auto\"],\n)\n```\n\n## Related Documentation\n\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md) - Project setup and testing\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md) - Quick start and Workflow overview\n- [CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md) - Version history and breaking changes\n\n---\n\n<a id='page-model-guide'></a>\n\n## Model Selection Guide\n\n### 相关页面\n\n相关主题：[Backend Setup Guide](#page-backend-setup), [Backend Clients](#page-backend-clients)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/forge/clients/sampling_defaults.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/sampling_defaults.py)\n- [src/forge/errors.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/errors.py)\n- [src/forge/server.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/server.py)\n- [src/forge/clients/llamafile.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/clients/llamafile.py)\n- [src/forge/proxy/__main__.py](https://github.com/antoinezambelli/forge/blob/main/src/forge/proxy/__main__.py)\n- [CONTRIBUTING.md](https://github.com/antoinezambelli/forge/blob/main/CONTRIBUTING.md)\n- [README.md](https://github.com/antoinezambelli/forge/blob/main/README.md)\n- [CHANGELOG.md](https://github.com/antoinezambelli/forge/blob/main/CHANGELOG.md)\n</details>\n\n# Model Selection Guide\n\n## Overview\n\nThe Model Selection Guide covers how to choose, configure, and deploy language models within forge. 
Forge provides a unified workflow engine that abstracts backend differences (Ollama, Llamafile, LlamaServer) while offering per-model recommended sampling parameters sourced directly from HuggingFace model cards.\n\n资料来源：[src/forge/clients/sampling_defaults.py:1-20]()\n\n## Supported Backends\n\nForge supports three LLM backend types, each with distinct configuration requirements.\n\n| Backend | Configuration Method | Model Specification | Notes |\n|---------|---------------------|---------------------|-------|\n| Ollama | `model` parameter | Model name from `ollama list` | No GGUF path needed |\n| Llamafile | `gguf_path` parameter | Path to .llamafile binary | Self-contained executables |\n| LlamaServer | `gguf_path` parameter | Path to GGUF model file | Requires llama.cpp server binary |\n\n资料来源：[src/forge/server.py:1-50]()\n\n### Backend Selection Logic\n\n```mermaid\ngraph TD\n    A[Choose Backend] --> B{Backend Type?}\n    B -->|Ollama| C[Use model name]\n    B -->|Llamafile| D[Use gguf_path]\n    B -->|LlamaServer| D\n    C --> E[Connect via localhost:11434]\n    D --> F[Start server process]\n    F --> G[Connect via port 8080]\n```\n\nFor Ollama, the `model` parameter directly references the model name from `ollama list`. 
For Llamafile and LlamaServer, you must provide the `gguf_path` pointing to the model file, and Forge will manage the server process lifecycle.\n\n资料来源：[src/forge/server.py:80-120]()\n\n## Supported Models\n\n### Model Families\n\nForge has been tested and evaluated with the following model families across different quantization levels.\n\n| Model Family | Variants | Recommended Quantization | Notes |\n|-------------|----------|-------------------------|-------|\n| Qwen3 | 8B, 3.5, 3.6 | Q4_K_M, Q8_0 | Includes Qwen3-Coder |\n| Gemma | 4 (all sizes) | Q4_K_M | Use `--reasoning-budget 0` workaround |\n| Mistral | Small 3.2, Nemo, 7B | Q4_K_M, Q8_0 | Ministral variants available |\n| Devstral | Small 2 | Q4_K_M | Code-focused model |\n| Granite | 4.0 (h-micro, h-tiny) | Q4_K_M, Q8_0 | OpenAI-style tool calls |\n| Llama | 3.1 8B | Q4_K_M, Q8_0 | 8B Reasoning variants |\n| Ministral | 3 Instruct, 8B Instruct, Reasoning | Q4_K_M, Q8_0 | Reasoning requires budget fix |\n\n资料来源：[CHANGELOG.md:1-50]()\n\n### Model Naming Conventions\n\nModel names vary by backend. When using Ollama, use the exact model tag as shown in `ollama list`. For GGUF-based backends, the model name is derived from the filename stem of the GGUF file.\n\n```python\n# Ollama example\nclient = OllamaClient(model=\"ministral-3:8b-instruct-2512-q4_K_M\")\n\n# Llamafile/LlamaServer example - model derived from path\nclient = LlamafileClient(gguf_path=\"/models/mistral-7b-q4_K_M.gguf\")\n```\n\n资料来源：[README.md:1-30]()\n\n## Recommended Sampling Configuration\n\n### The Sampling Defaults Map\n\nForge ships `forge.clients.sampling_defaults` containing a verified per-model sampling recommendations map. 
Each entry includes parameters such as `temperature`, `top_p`, `top_k`, `min_p`, `repeat_penalty`, and `presence_penalty` sourced directly from HuggingFace model cards.\n\n资料来源：[src/forge/clients/sampling_defaults.py:20-40]()\n\n### Enabling Recommended Sampling\n\nTo use per-model recommended sampling, pass `recommended_sampling=True` when initializing the client.\n\n```python\nfrom forge import OllamaClient\n\n# Opt-in to recommended sampling\nclient = OllamaClient(\n    model=\"qwen3:8b-q4_K_M\",\n    recommended_sampling=True\n)\n```\n\nIf the model is not in the map and `recommended_sampling=True` is set, Forge raises `UnsupportedModelError` rather than silently falling back to backend defaults.\n\n资料来源：[src/forge/errors.py:1-25]()\n\n### Sampling Policy Behavior\n\nThe following table describes the four-quadrant behavior when applying sampling defaults.\n\n| `strict` | Model in Map | Behavior |\n|----------|-------------|----------|\n| `True` | Yes | Return recommended dict |\n| `True` | No | Raise `UnsupportedModelError` |\n| `False` | Yes | One-shot INFO log; return `{}` |\n| `False` | No | Return `{}` (silent) |\n\n资料来源：[src/forge/clients/sampling_defaults.py:60-85]()\n\n### Per-Call Sampling Overrides\n\nThe `send()` and `send_stream()` methods accept a `sampling: dict | None` kwarg that merges field-by-field with the client's instance-level sampling without mutating it. The caller's explicit non-None fields take precedence.\n\n```python\n# Merge with instance defaults\nresponse = await client.send(\n    messages,\n    sampling={\"temperature\": 0.7, \"top_p\": 0.9}\n)\n```\n\n资料来源：[CHANGELOG.md:50-70]()\n\n## Proxy Mode Configuration\n\nWhen running forge in proxy mode, sampling parameters are plumbed through from the incoming request body. 
OpenAI-compatible fields supported include `temperature`, `top_p`, `top_k`, `min_p`, `repeat_penalty`, `presence_penalty`, and `seed`.\n\n### Proxy Server Startup\n\n```bash\npython -m forge.proxy \\\n    --backend-url http://localhost:11434 \\\n    --backend ollama \\\n    --model qwen3:8b-q4_K_M \\\n    --port 8081\n```\n\nFor per-model recommended sampling in proxy mode, the calling client should look up `forge.clients.get_sampling_defaults(model)` and include the values in the request body.\n\n资料来源：[src/forge/proxy/__main__.py:1-40]()\n\n## Context and Budget Management\n\n### Server Context Resolution\n\nForge automatically queries the backend's `/props` endpoint to determine the maximum context length. For Ollama, use `ollama stop` to cleanly unload VRAM between model switches.\n\n```python\nfrom forge import ContextManager, TieredCompact\n\nctx = ContextManager(\n    strategy=TieredCompact(keep_recent=2),\n    budget_tokens=8192\n)\n```\n\n### Budget Modes\n\n| Mode | Description | Token Source |\n|------|-------------|--------------|\n| `MANUAL` | User-specified token budget | `manual_tokens` parameter |\n| `FORGE_FAST` | Fast iteration mode | Server-reported context |\n| `FORGE_BALANCED` | Balanced speed/quality | Server-reported context |\n| `FORGE_THOROUGH` | Maximum quality | Server-reported context |\n\n资料来源：[src/forge/server.py:150-200]()\n\n### Known Issues with Reasoning Models\n\nModels using extended reasoning (Gemma 4, Qwen 3.5, Ministral Reasoning) may hang with unbounded reasoning budgets on builds after April 10, 2026. 
The workaround is to set `--reasoning-budget 0` when starting the backend.\n\n资料来源：[CHANGELOG.md:70-90]()\n\n## Client Configuration Reference\n\n### OllamaClient\n\n```python\nOllamaClient(\n    model: str,                          # Model name from `ollama list`\n    base_url: str = \"http://localhost:11434/v1\",\n    api_key: str | None = None,\n    timeout: float = 120.0,\n    recommended_sampling: bool = False,  # Opt-in to per-model defaults\n    **kwargs\n)\n```\n\n### LlamafileClient\n\n```python\nLlamafileClient(\n    gguf_path: str | Path,               # Path to .llamafile binary\n    model: str | None = None,            # Optional model name\n    base_url: str = \"http://localhost:8080/v1\",\n    recommended_sampling: bool = False,\n    **kwargs\n)\n```\n\n资料来源：[src/forge/clients/llamafile.py:1-50]()\n\n## Complete Workflow Example\n\n```python\nimport asyncio\nfrom pydantic import BaseModel, Field\nfrom forge import (\n    Workflow, ToolDef, ToolSpec,\n    WorkflowRunner, OllamaClient,\n    ContextManager, TieredCompact,\n)\n\ndef get_weather(city: str) -> str:\n    return f\"72°F and sunny in {city}\"\n\nclass GetWeatherParams(BaseModel):\n    city: str = Field(description=\"City name\")\n\nworkflow = Workflow(\n    name=\"weather\",\n    description=\"Look up weather for a city.\",\n    tools={\n        \"get_weather\": ToolDef(\n            spec=ToolSpec(\n                name=\"get_weather\",\n                description=\"Get current weather\",\n                parameters=GetWeatherParams,\n            ),\n            callable=get_weather,\n        ),\n    },\n    required_steps=[],\n    terminal_tool=\"get_weather\",\n    system_prompt_template=\"You are a helpful assistant. 
Use the available tools to answer the user.\",\n)\n\nasync def main():\n    client = OllamaClient(\n        model=\"ministral-3:8b-instruct-2512-q4_K_M\",\n        recommended_sampling=True\n    )\n    ctx = ContextManager(\n        strategy=TieredCompact(keep_recent=2),\n        budget_tokens=8192\n    )\n    runner = WorkflowRunner(client=client, context_manager=ctx)\n    await runner.run(workflow, \"What's the weather in Paris?\")\n\nasyncio.run(main())\n```\n\n资料来源：[README.md:30-80]()\n\n## Error Handling\n\n### UnsupportedModelError\n\nRaised when `recommended_sampling=True` is specified but the model is not in the sampling defaults map.\n\n```python\nfrom forge.errors import UnsupportedModelError\n\ntry:\n    client = OllamaClient(\n        model=\"unknown-model:latest\",\n        recommended_sampling=True\n    )\nexcept UnsupportedModelError as e:\n    print(f\"Model not supported: {e.model}\")\n    # Solution: Either add entry to MODEL_SAMPLING_DEFAULTS\n    # or drop recommended_sampling=True\n```\n\n### Tool Call Errors\n\nTool-related errors include `ToolCallError` (LLM failed to produce valid tool call), `ToolExecutionError` (tool callable raised an exception), and `ToolResolutionError` (valid arguments but data didn't resolve).\n\n资料来源：[src/forge/errors.py:25-60]()\n\n## Best Practices\n\n### Selecting Quantization Levels\n\n| Use Case | Recommended Quantization |\n|----------|-------------------------|\n| Development/Testing | Q4_K_M (balanced quality/size) |\n| Production (quality priority) | Q8_0 (near-float quality) |\n| Resource-constrained | Q4_0 (smaller, lower quality) |\n\n### Guardrail Integration\n\nGuardrails in forge are defined in `src/forge/core/runner.py` and nudge templates in `src/forge/prompts/nudges.py`. 
Each guardrail can be independently toggled via ablation presets for evaluation.\n\n资料来源：[CONTRIBUTING.md:1-30]()\n\n### Server Management\n\nWhen running multiple evaluations, reuse `ServerManager` instances when the model and configuration match to avoid unnecessary server restarts.\n\n```python\n# ServerManager caches configuration to avoid redundant restarts\nif (\n    self._current_model == model\n    and self._current_mode == mode\n    and self._current_ctx == ctx_override\n):\n    # Reuse existing server\n    return\n```\n\n资料来源：[src/forge/server.py:60-75]()\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：antoinezambelli/forge\n\n摘要：发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body。\n\n## 1. 安装坑 · 来源证据：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_148dff87195e42549d0ffb88b99e9cbf | https://github.com/antoinezambelli/forge/issues/58 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 2. 安装坑 · 来源证据：Investigate: integration paths with Hermes Agent\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Investigate: integration paths with Hermes Agent\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e3cbd2d1c9a84a1887887bf24b036865 | https://github.com/antoinezambelli/forge/issues/51 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 3. 
安装坑 · 来源证据：Per-model recommended sampling defaults (map keyed by HF model cards)\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Per-model recommended sampling defaults (map keyed by HF model cards)\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_057ca2af912e4a608259ffb2a3654d4f | https://github.com/antoinezambelli/forge/issues/59 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 4. 安装坑 · 来源证据：Rescue-parse ChatGPT-style XML tool calls\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Rescue-parse ChatGPT-style XML tool calls\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_471c674c8d73451da75d6b8c9349aabf | https://github.com/antoinezambelli/forge/issues/55 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 5. 配置坑 · 来源证据：Proxy external mode hardcodes native FC — no prompt-injection fallback\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：Proxy external mode hardcodes native FC — no prompt-injection fallback\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f3a85ec8447a4838b3bc4c846cd9e7a0 | https://github.com/antoinezambelli/forge/issues/53 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 6. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | README/documentation is current enough for a first validation pass.\n\n## 7. 
维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | last_activity_observed missing\n\n## 8. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | no_demo; severity=medium\n\n## 9. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | no_demo; severity=medium\n\n## 10. 安全/权限坑 · 来源证据：Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_4ad226a6d1fa4a5f89fa7702bec11188 | https://github.com/antoinezambelli/forge/issues/61 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 安全/权限坑 · 来源证据：Sub-agent support: dynamic slot splitting\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Sub-agent support: dynamic slot splitting\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_5b35873cf63c4647bca8a0611d441189 | https://github.com/antoinezambelli/forge/issues/28 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 12. 
安全/权限坑 · 来源证据：Sub-agent support: slot pool\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Sub-agent support: slot pool\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_070d9a3d20d24123b62d7d76ee16078a | https://github.com/antoinezambelli/forge/issues/29 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 13. 安全/权限坑 · 来源证据：llama.cpp reasoning budget sampler causes silent hangs after April 10 builds\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：llama.cpp reasoning budget sampler causes silent hangs after April 10 builds\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_673be4a583984219bab90cbadff631fe | https://github.com/antoinezambelli/forge/issues/54 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 14. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | issue_or_pr_quality=unknown\n\n## 15. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | release_recency=unknown\n\n<!-- canonical_name: antoinezambelli/forge; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "Human Manual / 人类版说明书"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log / 踩坑日志\n\n项目：antoinezambelli/forge\n\n摘要：发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body。\n\n## 1. 安装坑 · 来源证据：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Client sampling params: thread top_p/top_k/min_p/repeat_penalty through request body\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_148dff87195e42549d0ffb88b99e9cbf | https://github.com/antoinezambelli/forge/issues/58 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 2. 安装坑 · 来源证据：Investigate: integration paths with Hermes Agent\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Investigate: integration paths with Hermes Agent\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e3cbd2d1c9a84a1887887bf24b036865 | https://github.com/antoinezambelli/forge/issues/51 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 3. 安装坑 · 来源证据：Per-model recommended sampling defaults (map keyed by HF model cards)\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Per-model recommended sampling defaults (map keyed by HF model cards)\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_057ca2af912e4a608259ffb2a3654d4f | https://github.com/antoinezambelli/forge/issues/59 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 4. 
安装坑 · 来源证据：Rescue-parse ChatGPT-style XML tool calls\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Rescue-parse ChatGPT-style XML tool calls\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_471c674c8d73451da75d6b8c9349aabf | https://github.com/antoinezambelli/forge/issues/55 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 5. 配置坑 · 来源证据：Proxy external mode hardcodes native FC — no prompt-injection fallback\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：Proxy external mode hardcodes native FC — no prompt-injection fallback\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f3a85ec8447a4838b3bc4c846cd9e7a0 | https://github.com/antoinezambelli/forge/issues/53 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 6. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | README/documentation is current enough for a first validation pass.\n\n## 7. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | last_activity_observed missing\n\n## 8. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | no_demo; severity=medium\n\n## 9. 
安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | no_demo; severity=medium\n\n## 10. 安全/权限坑 · 来源证据：Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Hardware detection: AMD unified-memory rigs fall through to 4K Ollama budget\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_4ad226a6d1fa4a5f89fa7702bec11188 | https://github.com/antoinezambelli/forge/issues/61 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 安全/权限坑 · 来源证据：Sub-agent support: dynamic slot splitting\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Sub-agent support: dynamic slot splitting\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_5b35873cf63c4647bca8a0611d441189 | https://github.com/antoinezambelli/forge/issues/28 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 12. 安全/权限坑 · 来源证据：Sub-agent support: slot pool\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Sub-agent support: slot pool\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_070d9a3d20d24123b62d7d76ee16078a | https://github.com/antoinezambelli/forge/issues/29 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 13. 
安全/权限坑 · 来源证据：llama.cpp reasoning budget sampler causes silent hangs after April 10 builds\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：llama.cpp reasoning budget sampler causes silent hangs after April 10 builds\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_673be4a583984219bab90cbadff631fe | https://github.com/antoinezambelli/forge/issues/54 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 14. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | issue_or_pr_quality=unknown\n\n## 15. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | hn_item:48192383 | https://news.ycombinator.com/item?id=48192383 | release_recency=unknown\n",
      "summary": "用户实践前最可能遇到的身份、安装、配置、运行和安全坑。",
      "title": "Pitfall Log / 踩坑日志"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# forge - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for antoinezambelli/forge.\n\nProject:\n- Name: forge\n- Repository: https://github.com/antoinezambelli/forge\n- Summary: [![PyPI](https://img.shields.io/pypi/v/forge-guardrails.svg)](https://pypi.org/project/forge-guardrails/)\n- Host target: chatgpt\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: [![PyPI](https://img.shields.io/pypi/v/forge-guardrails.svg)](https://pypi.org/project/forge-guardrails/)\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: [![PyPI](https://img.shields.io/pypi/v/forge-guardrails.svg)](https://pypi.org/project/forge-guardrails/)\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. page-introduction: Introduction to Forge. Produce one small intermediate artifact and wait for confirmation.\n2. page-installation: Installation Guide. Produce one small intermediate artifact and wait for confirmation.\n3. page-quickstart: Quick Start Guide. 
Produce one small intermediate artifact and wait for confirmation.\n4. page-architecture: System Architecture. Produce one small intermediate artifact and wait for confirmation.\n5. page-module-structure: Module Structure and API. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://news.ycombinator.com/item?id=48192383\n- https://github.com/antoinezambelli/forge#readme\n- README.md\n- src/forge/__init__.py\n- pyproject.toml\n- src/forge/core/workflow.py\n- src/forge/core/runner.py\n- docs/ARCHITECTURE.md\n- src/forge/context/manager.py\n- src/forge/core/messages.py\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "summary": "不安装项目也能感受能力节奏的安全试用 Prompt。",
      "title": "Prompt Preview / 安装前试用 Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / 官方入口\n\n项目：antoinezambelli/forge\n\n## 官方安装入口\n\n### Python / pip · 官方安装入口\n\n```bash\npip install forge-guardrails\n```\n\n来源：https://github.com/antoinezambelli/forge#readme\n\n## 来源\n\n- hn: https://news.ycombinator.com/item?id=48192383\n- docs: https://github.com/antoinezambelli/forge#readme\n",
      "summary": "从项目官方 README 或安装文档提取的开工入口。",
      "title": "Quick Start / 官方入口"
    }
  },
  "validation_id": "dval_7a58244d8bf74e91982ffb1c5a2b2cc8"
}
