{
  "canonical_name": "vllm-project/vllm",
  "compilation_id": "pack_84ef36c02be94a45a498be7946e8dcd9",
  "created_at": "2026-05-15T22:20:06.684959+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=mcp_config, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=mcp_config, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `pip install vllm` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "pip install vllm",
      "sandbox_container_image": "python:3.12-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "llm_execute_isolated_install",
      "sandbox_validation_id": "sbx_e702483cb2ae4b309b0bd4385a3bfe63"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_4f6d0a7dcce7c7487f28bb5316a36395",
    "canonical_name": "vllm-project/vllm",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/vllm-project/vllm",
    "slug": "vllm",
    "source_packet_id": "phit_e2df9faea8ea4118b6f56217c795e3bc",
    "source_validation_id": "dval_4b19ac5ad26a41138863fa55543fb1f5"
  },
  "merchandising": {
    "best_for": "需要个人工作台能力，并使用 local_cli的用户",
    "github_forks": 16624,
    "github_stars": 79560,
    "one_liner_en": "A high-throughput and memory-efficient inference and serving engine for LLMs",
    "one_liner_zh": "A high-throughput and memory-efficient inference and serving engine for LLMs",
    "primary_category": {
      "category_id": "personal-workspace",
      "confidence": "high",
      "name_en": "Personal Workspace",
      "name_zh": "个人工作台",
      "reason": "curated popular coverage category matched project identity"
    },
    "target_user": "使用 local_cli 等宿主 AI 的用户",
    "title_en": "vllm",
    "title_zh": "vllm 能力包",
    "visible_tags": [
      {
        "label_en": "Knowledge Retrieval",
        "label_zh": "知识检索",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-knowledge-retrieval",
        "type": "product_domain"
      },
      {
        "label_en": "Knowledge Base Q&A",
        "label_zh": "知识库问答",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-knowledge-base-q-a",
        "type": "user_job"
      },
      {
        "label_en": "Workflow Automation",
        "label_zh": "流程自动化",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-workflow-automation",
        "type": "core_capability"
      },
      {
        "label_en": "Checkpoint Resume",
        "label_zh": "断点恢复流程",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-checkpoint-resume",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Evaluation Suite",
        "label_zh": "评测体系",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-evaluation-suite",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_e2df9faea8ea4118b6f56217c795e3bc",
  "page_model": {
    "artifacts": {
      "artifact_slug": "vllm",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "pip install vllm",
          "label": "Python / pip · 官方安装入口",
          "source": "https://github.com/vllm-project/vllm#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "知识检索",
        "知识库问答",
        "流程自动化",
        "断点恢复流程",
        "评测体系"
      ],
      "eyebrow": "个人工作台",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要个人工作台能力，并使用 local_cli的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "A high-throughput and memory-efficient inference and serving engine for LLMs"
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "local_cli",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "mcp_config, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_1a71634c530044a68b9160080d55de0a | https://github.com/vllm-project/vllm/issues/42182 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: vLLM v1 with prefix caching: first request differs from subsequent identical requests at temperature=0",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_58327949a4524ed082bd189b53f713a1 | https://github.com/vllm-project/vllm/issues/40896 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug]: vLLM v1 with prefix caching: first request differs from subsequent identical requests at temperature=0",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Usage]: How to proactively clear CPU-resident memory left behind by unloaded LoRA adapters after calling `/v1/unload_lora_adapter`?",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_fb1461834fe34049bd05182574d3e5e5 | https://github.com/vllm-project/vllm/issues/42207 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Usage]: How to proactively clear CPU-resident memory left behind by unloaded LoRA adapters after calling `/v1/unload_l…",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.18.1",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_317a03f9de4e459f9be42064c7318b2c | https://github.com/vllm-project/vllm/releases/tag/v0.18.1 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.18.1",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：[Feature]: Qwen3.5-Moe LoRA Support (experts)",
            "category": "能力坑",
            "evidence": [
              "community_evidence:github | cevd_2d068d43c6654f3cab6b48bf98dad116 | https://github.com/vllm-project/vllm/issues/40005 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Feature]: Qwen3.5-Moe LoRA Support (experts)",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "README/documentation is current enough for a first validation pass.",
            "category": "能力坑",
            "evidence": [
              "capability.assumptions | github_repo:599547518 | https://github.com/vllm-project/vllm | README/documentation is current enough for a first validation pass."
            ],
            "severity": "medium",
            "suggested_check": "将假设转成下游验证清单。",
            "title": "能力判断依赖假设",
            "user_impact": "假设不成立时，用户拿不到承诺的能力。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个运行相关的待验证问题：v0.20.2",
            "category": "运行坑",
            "evidence": [
              "community_evidence:github | cevd_ecf37722dff6494c82b384225e34bcb0 | https://github.com/vllm-project/vllm/releases/tag/v0.20.2 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.20.2",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "未记录 last_activity_observed。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | github_repo:599547518 | https://github.com/vllm-project/vllm | last_activity_observed missing"
            ],
            "severity": "medium",
            "suggested_check": "补 GitHub 最近 commit、release、issue/PR 响应信号。",
            "title": "维护活跃度未知",
            "user_impact": "新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "downstream_validation.risk_items | github_repo:599547518 | https://github.com/vllm-project/vllm | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "进入安全/权限治理复核队列。",
            "title": "下游验证发现风险项",
            "user_impact": "下游已经要求复核，不能在页面中弱化。"
          },
          {
            "body": "No sandbox install has been executed yet; downstream must verify before user use.",
            "category": "安全/权限坑",
            "evidence": [
              "risks.safety_notes | github_repo:599547518 | https://github.com/vllm-project/vllm | No sandbox install has been executed yet; downstream must verify before user use."
            ],
            "severity": "medium",
            "suggested_check": "转成明确权限清单和安全审查提示。",
            "title": "存在安全注意事项",
            "user_impact": "用户安装前需要知道权限边界和敏感操作。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "risks.scoring_risks | github_repo:599547518 | https://github.com/vllm-project/vllm | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "把风险写入边界卡，并确认是否需要人工复核。",
            "title": "存在评分风险",
            "user_impact": "风险会影响是否适合普通用户安装。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug]: ngram speculative decoding changes greedy output on Qwen3-0.6B / A100",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_9ce279a037934e8085332120bfdaca86 | https://github.com/vllm-project/vllm/issues/41758 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug]: ngram speculative decoding changes greedy output on Qwen3-0.6B / A100",
            "user_impact": "可能影响授权、密钥配置或安全边界。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.16.0",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_57ff11ff995a4809a33a38ff7504a5ef | https://github.com/vllm-project/vllm/releases/tag/v0.16.0 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.16.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.17.0",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_4592ced4fde24aa9aa808caa40e25b84 | https://github.com/vllm-project/vllm/releases/tag/v0.17.0 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.17.0",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.18.0",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_7ca09904f8164e94a3a1bc489d32d1ff | https://github.com/vllm-project/vllm/releases/tag/v0.18.0 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.18.0",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.19.0",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_cdb0d7a7f491474da7b93583ec643c00 | https://github.com/vllm-project/vllm/releases/tag/v0.19.0 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.19.0",
            "user_impact": "可能影响授权、密钥配置或安全边界。"
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "发现 21 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling。",
        "title": "踩坑日志"
      },
      "snapshot": {
        "contributors": 2623,
        "forks": 16624,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": 79560
      },
      "source_url": "https://github.com/vllm-project/vllm",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "A high-throughput and memory-efficient inference and serving engine for LLMs",
      "title": "vllm 能力包",
      "trial_prompt": "# vllm - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for vllm-project/vllm.\n\nProject:\n- Name: vllm\n- Repository: https://github.com/vllm-project/vllm\n- Summary: A high-throughput and memory-efficient inference and serving engine for LLMs\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: A high-throughput and memory-efficient inference and serving engine for LLMs\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: A high-throughput and memory-efficient inference and serving engine for LLMs\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. page-1: vLLM Overview. Produce one small intermediate artifact and wait for confirmation.\n2. page-2: Getting Started. Produce one small intermediate artifact and wait for confirmation.\n3. page-3: Core Engine Architecture. Produce one small intermediate artifact and wait for confirmation.\n4. page-4: Model Executor and Worker Architecture. Produce one small intermediate artifact and wait for confirmation.\n5. page-5: Scheduling and Request Processing. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/vllm-project/vllm\n- https://github.com/vllm-project/vllm#readme\n- README.md\n- pyproject.toml\n- vllm/__init__.py\n- vllm/version.py\n- docs/getting_started/installation/README.md\n- docs/getting_started/quickstart.md\n- requirements/common.txt\n- requirements/cuda.txt\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "voices": [
        {
          "body": "来源平台：github。github/github_issue: [Bug]: vLLM v1 with prefix caching: first request differs from subsequen（https://github.com/vllm-project/vllm/issues/40896）；github/github_issue: [AMD][CI Failure][Tracker] Static dashboard tracker for current CI failu（https://github.com/vllm-project/vllm/issues/40554）；github/github_issue: [Usage]: How to proactively clear CPU-resident memory left behind by unl（https://github.com/vllm-project/vllm/issues/42207）；github/github_issue: [Feature]: Qwen3.5-Moe LoRA Support (experts)（https://github.com/vllm-project/vllm/issues/40005）；github/github_issue: [Bug]: ngram speculative decoding changes greedy output on Qwen3-0.6B / （https://github.com/vllm-project/vllm/issues/41758）；github/github_issue: [Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async sch（https://github.com/vllm-project/vllm/issues/42182）；github/github_release: v0.20.2（https://github.com/vllm-project/vllm/releases/tag/v0.20.2）；github/github_release: v0.20.1（https://github.com/vllm-project/vllm/releases/tag/v0.20.1）；github/github_release: v0.20.0（https://github.com/vllm-project/vllm/releases/tag/v0.20.0）；github/github_release: v0.19.1（https://github.com/vllm-project/vllm/releases/tag/v0.19.1）；github/github_release: v0.19.0（https://github.com/vllm-project/vllm/releases/tag/v0.19.0）；github/github_release: v0.18.1（https://github.com/vllm-project/vllm/releases/tag/v0.18.1）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: vLLM v1 with prefix caching: first request differs from subsequen",
              "url": "https://github.com/vllm-project/vllm/issues/40896"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[AMD][CI Failure][Tracker] Static dashboard tracker for current CI failu",
              "url": "https://github.com/vllm-project/vllm/issues/40554"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Usage]: How to proactively clear CPU-resident memory left behind by unl",
              "url": "https://github.com/vllm-project/vllm/issues/42207"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Feature]: Qwen3.5-Moe LoRA Support (experts)",
              "url": "https://github.com/vllm-project/vllm/issues/40005"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: ngram speculative decoding changes greedy output on Qwen3-0.6B / ",
              "url": "https://github.com/vllm-project/vllm/issues/41758"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async sch",
              "url": "https://github.com/vllm-project/vllm/issues/42182"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.20.2",
              "url": "https://github.com/vllm-project/vllm/releases/tag/v0.20.2"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.20.1",
              "url": "https://github.com/vllm-project/vllm/releases/tag/v0.20.1"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.20.0",
              "url": "https://github.com/vllm-project/vllm/releases/tag/v0.20.0"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.19.1",
              "url": "https://github.com/vllm-project/vllm/releases/tag/v0.19.1"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.19.0",
              "url": "https://github.com/vllm-project/vllm/releases/tag/v0.19.0"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.18.1",
              "url": "https://github.com/vllm-project/vllm/releases/tag/v0.18.1"
            }
          ],
          "status": "已收录 14 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "个人工作台",
      "desc": "A high-throughput and memory-efficient inference and serving engine for LLMs",
      "effort": "安装已验证",
      "forks": 16624,
      "icon": "notebook",
      "name": "vllm 能力包",
      "risk": "可发布",
      "slug": "vllm",
      "stars": 79560,
      "tags": [
        "知识检索",
        "知识库问答",
        "流程自动化",
        "断点恢复流程",
        "评测体系"
      ],
      "thumb": "blue",
      "type": "MCP 配置"
    },
    "manual": {
      "markdown": "# https://github.com/vllm-project/vllm 项目说明书\n\n生成时间：2026-05-15 22:05:46 UTC\n\n## 目录\n\n- [vLLM Overview](#page-1)\n- [Getting Started](#page-2)\n- [Core Engine Architecture](#page-3)\n- [Model Executor and Worker Architecture](#page-4)\n- [Scheduling and Request Processing](#page-5)\n- [PagedAttention and KV Cache Management](#page-6)\n- [Attention Backends and Kernels](#page-7)\n- [Quantization Support](#page-8)\n- [Distributed Inference and Parallelism](#page-9)\n- [Model Architecture Support](#page-10)\n\n<a id='page-1'></a>\n\n## vLLM Overview\n\n### 相关页面\n\n相关主题：[Getting Started](#page-2), [Core Engine Architecture](#page-3)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/vllm-project/vllm/blob/main/README.md)\n- [vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n- [vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n- [vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n- [examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n- [examples/disaggregated/example_connector/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/example_connector/README.md)\n- [examples/disaggregated/disaggregated_encoder/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/disaggregated_encoder/README.md)\n- [examples/features/structured_outputs/README.md](https://github.com/vllm-project/vllm/blob/main/examples/features/structured_outputs/README.md)\n- [examples/pooling/embed/openai_embedding_long_text/README.md](https://github.com/vllm-project/vllm/blob/main/examples/pooling/embed/openai_embedding_long_text/README.md)\n- [examples/disaggregated/kv_load_failure_recovery_offline/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/kv_load_failure_recovery_offline/README.md)\n- [examples/observability/opentelemetry/README.md](https://github.com/vllm-project/vllm/blob/main/examples/observability/opentelemetry/README.md)\n- [examples/observability/dashboards/README.md](https://github.com/vllm-project/vllm/blob/main/examples/observability/dashboards/README.md)\n- [tools/profiler/nsys_profile_tools/README.md](https://github.com/vllm-project/vllm/blob/main/tools/profiler/nsys_profile_tools/README.md)\n</details>\n\n# vLLM Overview\n\n## What is vLLM?\n\nvLLM is a fast and easy-to-use library for LLM (Large Language Model) inference and serving. It provides high-throughput, memory-efficient inference with an OpenAI-compatible API server, making it suitable for both research and production environments.\n\n资料来源：[README.md](https://github.com/vllm-project/vllm/blob/main/README.md)\n\n## Key Features\n\n### Offline Inference\n\nThe `LLM` class provides the primary Python interface for offline inference—interacting with a model without using a separate model inference server. This enables direct model interaction for batch processing, experimentation, and development workflows.\n\n资料来源：[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n\n### OpenAI-Compatible API Server\n\nvLLM serves LLM completions via HTTP through an OpenAI-compatible API. 
The server can be started with a simple command:\n\n```bash\nvllm serve Qwen/Qwen2.5-3B-Instruct\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n### Structured Outputs\n\nvLLM supports constrained decoding for structured outputs including JSON schema, regex patterns, and structural tags. This is essential for building reliable applications that require predictable output formats.\n\n资料来源：[examples/features/structured_outputs/README.md](https://github.com/vllm-project/vllm/blob/main/examples/features/structured_outputs/README.md)\n\n### Disaggregated Prefill and Decode\n\nvLLM supports a disaggregated prefill architecture where the prefill stage (prompt processing, which produces the KV cache) and the decode stage (token generation, which consumes the KV cache) can run on separate instances. This enables:\n\n- Independent scaling of prefill and decode workloads\n- Improved resource utilization\n- Better support for multi-turn conversations\n\n资料来源：[examples/disaggregated/example_connector/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/example_connector/README.md)\n\n### KV Cache Transfer\n\nFor disaggregated serving, vLLM supports KV cache transfer between prefill and decode workers. The architecture includes:\n\n| Component | Role | Description |\n|-----------|------|-------------|\n| `ECExampleConnector` | Cache Storage | Stores encoder cache on local disk |\n| EC Producer | Precompute | Pre-computes encoder cache |\n| EC Consumer | Retrieve | Retrieves cached KV data |\n\n资料来源：[examples/disaggregated/disaggregated_encoder/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/disaggregated_encoder/README.md)\n\n### KV Load Failure Recovery\n\nvLLM implements robust recovery mechanisms for KV load failures in both synchronous and asynchronous loading modes. The system:\n\n1. Identifies invalid KV blocks\n2. Reschedules affected requests\n3. 
Ensures consistent output through recovery logic\n\n资料来源：[examples/disaggregated/kv_load_failure_recovery_offline/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/kv_load_failure_recovery_offline/README.md)\n\n### Long Text Embedding with Chunked Processing\n\nvLLM supports embedding models with chunked processing for texts exceeding the model's maximum context length:\n\n```json\n{\n  \"pooling_type\": \"auto\",\n  \"use_activation\": true,\n  \"enable_chunked_processing\": true,\n  \"max_embed_len\": 3072000\n}\n```\n\nThis enables processing of extremely long documents (up to 3M+ tokens) including academic papers, legal documents, and code repositories.\n\n资料来源：[examples/pooling/embed/openai_embedding_long_text/README.md](https://github.com/vllm-project/vllm/blob/main/examples/pooling/embed/openai_embedding_long_text/README.md)\n\n## Architecture Overview\n\n### CLI Entry Point\n\nThe vLLM CLI provides a flexible command-line interface built with `FlexibleArgumentParser`:\n\n```mermaid\ngraph TD\n    A[vllm CLI] --> B[main.py Entry Point]\n    B --> C[Parse Arguments]\n    C --> D[Load CMD_MODULES]\n    D --> E[Create Subparsers]\n    E --> F[Execute Subcommand]\n    \n    F --> G[serve]\n    F --> H[launch]\n    F --> I[other modules]\n```\n\nThe CLI supports:\n- `-v, --version` flag for version information\n- Subcommand system with plugin architecture\n- Command validation before execution\n\n资料来源：[vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n\n### Serve Subcommand\n\nThe `serve` subcommand handles online inference server startup:\n\n```mermaid\ngraph TD\n    A[serve command] --> B{Model specified?}\n    B -->|Yes| C[Use CLI model]\n    B -->|No| D[Default: Qwen/Qwen3-0.6B]\n    \n    C --> E{GRPC enabled?}\n    D --> E\n    \n    E -->|Yes| F[Start gRPC Server]\n    E -->|No| G{Headless mode?}\n    \n    G -->|Yes| H[Set api_server_count=0]\n    G -->|No| I[Check LB mode]\n    \n    I --> J[Start FastAPI Server]\n    H --> J\n```\n\nThe serve command supports multiple deployment modes:\n- **Standard mode**: Full API server with GPU inference\n- **Headless mode**: No API servers, only engine processing\n- **gRPC mode**: Alternative RPC interface\n- **Load-balanced mode**: Data-parallel external/hybrid load balancing\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n### Launch Subcommand\n\nThe `launch` subcommand provides a modular component launch system:\n\n```mermaid\ngraph LR\n    A[launch command] --> B[LaunchSubcommand]\n    B --> C[launch_component subparser]\n    C --> D[LaunchSubcommandBase subclasses]\n    \n    D --> E[run_launch_fastapi]\n    D --> F[other components]\n    \n    E --> G[Socket binding]\n    E --> H[Build API Server]\n    E --> I[EngineArgs configuration]\n```\n\nThe launch system renders servers with preprocessing only—no inference or quantized kernels, and never allocates KV cache.\n\n资料来源：[vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n\n## Observability\n\n### OpenTelemetry Integration\n\nvLLM includes built-in OpenTelemetry support for distributed tracing:\n\n```bash\nopentelemetry-instrument vllm serve facebook/opt-125m\n```\n\nCore packages are bundled with vLLM:\n- `opentelemetry-sdk`\n- `opentelemetry-api`\n- `opentelemetry-exporter-otlp`\n- 
`opentelemetry-semantic-conventions-ai`\n\n资料来源：[examples/observability/opentelemetry/README.md](https://github.com/vllm-project/vllm/blob/main/examples/observability/opentelemetry/README.md)\n\n### Prometheus Metrics and Dashboards\n\nvLLM exports Prometheus-compatible metrics and supports integration with:\n\n| Platform | Dashboard Format | Import Method |\n|----------|------------------|---------------|\n| Grafana | JSON | UI or API |\n| Perses | YAML | CLI |\n\n资料来源：[examples/observability/dashboards/README.md](https://github.com/vllm-project/vllm/blob/main/examples/observability/dashboards/README.md)\n\n### Performance Profiling\n\nThe `nsys_profile_tools` enable GPU kernel-level profiling:\n\n```bash\nnsys profile -t cuda -o run1 -f true --trace-fork-before-exec=true \\\n    --cuda-graph-trace=node --delay <DELAY> --duration <DURATION> \\\n    vllm serve openai/gpt-oss-120b ...\n```\n\nThe `gputrc2graph.py` script generates kernel-level summaries and visualizations from `.nsys-rep` files.\n\n资料来源：[tools/profiler/nsys_profile_tools/README.md](https://github.com/vllm-project/vllm/blob/main/tools/profiler/nsys_profile_tools/README.md)\n\n## Supported Features Summary\n\n| Feature | Description | Configuration |\n|---------|-------------|---------------|\n| **Offline Inference** | Batch processing without server | `LLM` class |\n| **OpenAI API** | HTTP API compatibility | `vllm serve` |\n| **Structured Outputs** | JSON/regex/structural constraints | `--reasoning-parser` |\n| **Disaggregated Serving** | Split prefill/decode | `--ec-transfer-config` |\n| **KV Recovery** | Failure resilience | Custom connectors |\n| **Long Text Embedding** | Chunked processing | `--pooler-config` |\n| **Observability** | Tracing and metrics | OpenTelemetry |\n| **Quantization** | GGUF support | `repo_id:quant_type` |\n\n## Usage Modes\n\n### Offline Inference Mode\n\n```python\nfrom vllm import LLM\n\nllm = LLM(\"Qwen/Qwen2.5-3B-Instruct\")\noutput = llm.generate(\"Hello, world!\")\n```\n\n### API Server Mode\n\n```bash\nvllm serve Qwen/Qwen2.5-3B-Instruct --tensor-parallel-size 2\n```\n\n### Disaggregated Mode\n\n```bash\n# Prefill instance\nvllm serve --prefill-only --ec-transfer-config ' {...} '\n\n# Decode instance\nvllm serve --decode-only --ec-transfer-config ' {...} '\n```\n\n## Quick Reference\n\n| Command | Purpose |\n|---------|---------|\n| `vllm serve <model>` | Start API server |\n| `vllm launch <component>` | Launch specific component |\n| `opentelemetry-instrument vllm serve` | Enable tracing |\n\n## See Also\n\n- [Offline Inference Guide](examples/basic/offline_inference/README.md)\n- [Structured Outputs](examples/features/structured_outputs/README.md)\n- [Disaggregated Prefill](examples/disaggregated/example_connector/README.md)\n- [Observability Setup](examples/observability/opentelemetry/README.md)\n- [Dashboard Configuration](examples/observability/dashboards/README.md)\n\n---\n\n<a id='page-2'></a>\n\n## Getting Started\n\n### 相关页面\n\n相关主题：[vLLM Overview](#page-1)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/vllm-project/vllm/blob/main/README.md)\n- [vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n- [vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n- [vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n- 
[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n- [examples/features/structured_outputs/README.md](https://github.com/vllm-project/vllm/blob/main/examples/features/structured_outputs/README.md)\n- [examples/pooling/embed/openai_embedding_long_text/README.md](https://github.com/vllm-project/vllm/blob/main/examples/pooling/embed/openai_embedding_long_text/README.md)\n</details>\n\n# Getting Started\n\nvLLM is a fast and easy-to-use library for Large Language Model (LLM) inference and serving. It provides both an offline inference interface via the `LLM` class and an online serving layer with an OpenAI-compatible API server. 资料来源：[README.md:1-30]()\n\nThis guide covers the essential steps to get started with vLLM, from installation through basic inference and serving.\n\n## Installation\n\nvLLM can be installed via pip or built from source. For detailed installation instructions, refer to the [official documentation](https://docs.vllm.ai/en/latest/getting_started/installation.html).\n\n### Quick Installation\n\n```bash\npip install vllm\n```\n\n### Building from Source\n\nFor custom builds or development:\n\n```bash\ngit clone https://github.com/vllm-project/vllm.git\ncd vllm\npip install -e .\n```\n\n### GPU Requirements\n\nvLLM requires CUDA-compatible GPUs. The library supports various CUDA versions. Verify your environment has the necessary GPU drivers and CUDA toolkit installed. 资料来源：[README.md:1-15]()\n\n## Core Concepts\n\nBefore diving into usage, understand these fundamental concepts:\n\n| Concept | Description |\n|---------|-------------|\n| **LLM Class** | Primary Python interface for offline inference |\n| **Engine Args** | Configuration parameters for the inference engine |\n| **Sampling Params** | Controls generation behavior (temperature, max_tokens, etc.) |\n| **OpenAI API Server** | HTTP server providing OpenAI-compatible REST endpoints |\n\n### Architecture Overview\n\n```mermaid\ngraph TD\n    A[User Code] --> B[LLM Class / API Server]\n    B --> C[AsyncLLMEngine]\n    C --> D[Worker Pool]\n    D --> E[GPU Devices]\n    E --> F[PagedAttention KV Cache]\n    \n    G[HTTP Clients] --> H[OpenAI API Server]\n    H --> B\n```\n\n## Offline Inference\n\nOffline inference involves running model inference directly in Python without a separate server. This is ideal for batch processing, testing, or embedding vLLM into applications. 资料来源：[examples/basic/offline_inference/README.md:1-25]()\n\n### Basic Usage\n\n```python\nfrom vllm import LLM, SamplingParams\n\n# Initialize the model\nllm = LLM(\"Qwen/Qwen2.5-3B-Instruct\")\n\n# Define sampling parameters\nsampling_params = SamplingParams(\n    temperature=0.7,\n    top_p=0.95,\n    max_tokens=256\n)\n\n# Run inference\noutputs = llm.generate([\"Hello, how are you?\", \"What is vLLM?\"], sampling_params)\n\nfor output in outputs:\n    print(output.outputs[0].text)\n```\n\n### Supported Models\n\nvLLM supports a wide range of models including autoregressive transformers, mixture-of-experts models, and quantized models. For the complete list, see the [supported models documentation](https://docs.vllm.ai/en/latest/models/supported_models.html). 资料来源：[README.md:20-25]()\n\n## Serving with the CLI\n\nvLLM provides a command-line interface for serving models via an OpenAI-compatible API. 
资料来源：[vllm/entrypoints/cli/serve.py:1-30]()\n\n### Starting the Server\n\n```bash\nvllm serve Qwen/Qwen3-0.6B\n```\n\n### Command-Line Options\n\nThe serve command supports extensive configuration through CLI arguments:\n\n```bash\nvllm serve <model> [options]\n```\n\nUse `--help=all` to show all available flags, or `--help=<ConfigGroup>` to explore options by section (e.g., `--help=ModelConfig`, `--help=Frontend`). 资料来源：[vllm/entrypoints/cli/serve.py:5-20]()\n\n### Key Server Options\n\n| Option | Description | Default |\n|--------|-------------|---------|\n| `--model` | Model name or path | Required |\n| `--gpu-memory-utilization` | Fraction of GPU memory to use | 0.9 |\n| `--max-model-len` | Maximum sequence length | Model default |\n| `--tensor-parallel-size` | Number of GPUs for parallelism | 1 |\n| `--port` | Server port | 8000 |\n\n### Headless Mode\n\nFor distributed setups where API servers are managed externally:\n\n```bash\nvllm serve <model> --headless\n```\n\nIn headless mode, no API servers are started, and `--api-server-count` cannot be used. 资料来源：[vllm/entrypoints/cli/serve.py:30-45]()\n\n## API Usage\n\nOnce the server is running, you can interact with it using the OpenAI-compatible API.\n\n### Completions API\n\n```bash\ncurl http://localhost:8000/v1/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"Qwen/Qwen2.5-3B-Instruct\",\n    \"prompt\": \"The capital of France is\",\n    \"max_tokens\": 50\n  }'\n```\n\n### Chat API\n\n```bash\ncurl http://localhost:8000/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"Qwen/Qwen2.5-3B-Instruct\",\n    \"messages\": [\n      {\"role\": \"user\", \"content\": \"What is machine learning?\"}\n    ]\n  }'\n```\n\n## Offline Inference Examples\n\nThe repository includes practical examples demonstrating various vLLM capabilities. 资料来源：[examples/basic/offline_inference/README.md:25-60]()\n\n### Running Examples\n\n```bash\n# Basic example\npython examples/basic/offline_inference/basic.py\n\n# Chat example with sampling parameters\npython examples/basic/offline_inference/chat.py --max_tokens 100 --temperature 0.8\n\n# Generate example\npython examples/basic/offline_inference/generate.py --generation-config auto\n```\n\n### Generation Config\n\nThe `--generation-config` argument specifies where the generation config loads from:\n\n- `'auto'` - Load from model path\n- `<folder_path>` - Load from specified directory\n- Not provided - Use vLLM defaults\n\n```bash\npython examples/basic/offline_inference/generate.py --generation-config auto\n```\n\n> **Note:** If `max_new_tokens` is specified in generation config, it sets a server-wide limit on output tokens for all requests. 资料来源：[examples/basic/offline_inference/README.md:55-70]()\n\n## Advanced Features\n\n### Structured Outputs\n\nvLLM supports constrained decoding for structured outputs including JSON schemas, regex patterns, and grammar-based constraints. 
资料来源：[examples/features/structured_outputs/README.md:1-40]()\n\n```bash\nvllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \\\n    --reasoning-parser deepseek_r1\n```\n\n```bash\n# Run structured outputs example\nuv run structured_outputs_offline.py --constraint json_mode regex\n```\n\n### Long Text Embedding\n\nFor embedding models, vLLM supports chunked processing to handle texts exceeding the model's maximum context length:\n\n```bash\nMODEL_NAME=\"jinaai/jina-embeddings-v3\" \\\nMAX_EMBED_LEN=1048576 \\\n./service.sh\n```\n\nConfiguration example with chunked processing:\n\n```json\n{\n  \"pooling_type\": \"auto\",\n  \"use_activation\": true,\n  \"enable_chunked_processing\": true,\n  \"max_embed_len\": 3072000\n}\n``` 资料来源：[examples/pooling/embed/openai_embedding_long_text/README.md:1-60]()\n\n### GGUF Quantized Models\n\nvLLM supports GGUF-quantized models loaded directly from HuggingFace:\n\n```bash\n--model unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B\n```\n\n### CPU Offload\n\nFor systems with limited GPU memory, CPU offload allows loading larger models:\n\n```bash\n--cpu-offload-gb 10\n```\n\nThis creates a virtual 34GB GPU when you have a 24GB GPU, enabling 13B model loading with BF16 weights. 资料来源：[examples/basic/offline_inference/README.md:75-85]()\n\n## Configuration Workflow\n\n```mermaid\ngraph LR\n    A[Define Engine Args] --> B[Create Model Config]\n    B --> C[Initialize Engine]\n    C --> D[Process Requests]\n    D --> E[Return Outputs]\n    \n    F[CLI Arguments] --> A\n    G[Python API] --> A\n```\n\n### Programmatic Configuration\n\n```python\nfrom vllm import LLM, EngineArgs\n\nengine_args = EngineArgs(\n    model=\"Qwen/Qwen2.5-7B-Instruct\",\n    gpu_memory_utilization=0.85,\n    tensor_parallel_size=2,\n    max_model_len=4096\n)\n\nllm = LLM(**engine_args)\n```\n\n## Next Steps\n\n| Resource | Description |\n|----------|-------------|\n| [Documentation](https://docs.vllm.ai) | Comprehensive guides and API reference |\n| [Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | Complete list of supported architectures |\n| [Examples](https://github.com/vllm-project/vllm/tree/main/examples) | Usage examples for various features |\n| [Paper](https://arxiv.org/abs/2309.06180) | Technical details behind vLLM's design |\n\n## Troubleshooting\n\n### Common Issues\n\n1. **CUDA Out of Memory**: Reduce `gpu_memory_utilization` or use smaller batch sizes\n2. **Model Not Found**: Ensure HuggingFace credentials are configured for gated models\n3. 
**Import Errors**: Verify all dependencies are installed with `pip install vllm`\n\n### Getting Help\n\n- **Technical Questions**: Use [GitHub Issues](https://github.com/vllm-project/vllm/issues)\n- **Community Discussion**: [vLLM Forum](https://discuss.vllm.ai)\n- **Development Coordination**: [Developer Slack](https://slack.vllm.ai)\n\n资料来源：[README.md:60-75]()\n\n---\n\n<a id='page-3'></a>\n\n## Core Engine Architecture\n\n### 相关页面\n\n相关主题：[vLLM Overview](#page-1), [Model Executor and Worker Architecture](#page-4), [Scheduling and Request Processing](#page-5)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/engine/llm_engine.py](https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py)\n- [vllm/engine/async_llm_engine.py](https://github.com/vllm-project/vllm/blob/main/vllm/engine/async_llm_engine.py)\n- [vllm/v1/engine/core.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/core.py)\n- [vllm/v1/engine/async_llm.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/async_llm.py)\n- [vllm/v1/engine/llm_engine.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/llm_engine.py)\n- [vllm/entrypoints/llm.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py)\n- [vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n- [vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n- [vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n</details>\n\n# Core Engine Architecture\n\n## Overview\n\nThe vLLM Core Engine Architecture is the central orchestration layer responsible for managing LLM inference workflows, request scheduling, and model execution. vLLM supports two engine versions: the legacy V0 engine and the current V1 engine (introduced in v0.6.0), both designed to provide high-throughput LLM serving through efficient request batching and GPU memory management.\n\nThe engine architecture serves as the foundation for both offline inference via the `LLM` class and online serving via the OpenAI-compatible API server. 资料来源：[vllm/entrypoints/llm.py:1-50]()\n\n## Architecture Components\n\n### V0 Engine (Legacy)\n\nThe V0 engine is the original implementation found in `vllm/engine/`. 
It consists of:\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| `LLMEngine` | `vllm/engine/llm_engine.py` | Synchronous inference engine with blocking operations |\n| `AsyncLLMEngine` | `vllm/engine/async_llm_engine.py` | Async wrapper enabling concurrent request handling |\n\nThe V0 engine uses an event-loop-based async architecture where `AsyncLLMEngine` wraps `LLMEngine` to provide non-blocking request processing.\n\n### V1 Engine (Current)\n\nThe V1 engine (`vllm/v1/engine/`) is the current production-ready implementation featuring a modular design:\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| `Core` | `vllm/v1/engine/core.py` | Low-level engine core managing model execution |\n| `AsyncLLM` | `vllm/v1/engine/async_llm.py` | Main async interface for inference |\n| `LLMEngine` | `vllm/v1/engine/llm_engine.py` | High-level engine orchestrator |\n\nThe V1 engine architecture separates concerns into distinct layers: `AsyncLLM` provides the public async interface, `LLMEngine` handles request orchestration, and `Core` manages low-level GPU operations.\n\n## Entry Points\n\nvLLM provides multiple entry points for interacting with the engine:\n\n```mermaid\ngraph TD\n    A[vllm serve] --> B[ServeSubcommand]\n    A --> C[LaunchSubcommand]\n    B --> D[serve_grpc / API Server]\n    C --> E[run_launch_fastapi]\n    F[vllm run-batch] --> G[BatchRunner]\n    H[LLM Class] --> I[AsyncLLM Engine]\n```\n\n### CLI Entry Point\n\nThe CLI entry point in `vllm/entrypoints/cli/main.py` provides command-line access to vLLM functionality:\n\n```python\n# Simplified CLI structure\nparser = FlexibleArgumentParser(description=\"vLLM CLI\")\nsubparsers = parser.add_subparsers(required=False, dest=\"subparser\")\ncmds = {}\nfor cmd_module in CMD_MODULES:\n    new_cmds = cmd_module.cmd_init()\n    for cmd in new_cmds:\n        cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd)\n```\n\n资料来源：[vllm/entrypoints/cli/main.py:1-40]()\n\n### Serve Subcommand\n\nThe `serve` subcommand initializes the HTTP API server or gRPC service:\n\n```python\nclass ServeSubcommand(CLISubcommand):\n    name = \"serve\"\n    \n    @staticmethod\n    def cmd(args: argparse.Namespace) -> None:\n        if hasattr(args, \"model_tag\") and args.model_tag is not None:\n            args.model = args.model_tag\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py:1-30]()\n\n### Launch Subcommand\n\nThe `launch` subcommand provides component-level launching capabilities:\n\n```python\ndef cmd_init() -> list[CLISubcommand]:\n    return [LaunchSubcommand()]\n\nasync def run_launch_fastapi(args: argparse.Namespace) -> None:\n    listen_address, sock = setup_server(args)\n    engine_args = AsyncEngineArgs.from_cli_args(args)\n```\n\n资料来源：[vllm/entrypoints/cli/launch.py:1-60]()\n\n### Python API Entry Point\n\nThe `LLM` class in `vllm/entrypoints/llm.py` provides the primary Python interface for offline inference:\n\n```python\nclass LLM:\n    \"\"\"\n    An LLM for offline inference.\n    \"\"\"\n    \n    def __init__(self, model: str, ...):\n        ...\n```\n\n资料来源：[vllm/entrypoints/llm.py:1-100]()\n\n## Engine Initialization Flow\n\nThe following diagram illustrates the initialization flow from CLI to engine:\n\n```mermaid\nsequenceDiagram\n    participant CLI as vllm serve\n    participant Parser as ArgumentParser\n    participant EngineArgs as AsyncEngineArgs\n    participant Engine as AsyncLLM / Core\n    participant Model as ModelConfig\n    \n    CLI->>Parser: Parse CLI arguments\n    
Parser->>EngineArgs: from_cli_args()\n    EngineArgs->>Model: create_model_config()\n    Model-->>EngineArgs: Config validated\n    EngineArgs->>Engine: Initialize engine\n    Engine->>Engine: Load model weights\n```\n\n## Core Engine Components\n\n### AsyncLLM (V1)\n\nThe `AsyncLLM` class is the primary async interface for V1 engine:\n\n```python\nclass AsyncLLM:\n    \"\"\"\n    Async implementation of LLM engine.\n    \"\"\"\n    \n    async def add_request(self, request_id: str, prompt: str, ...):\n        ...\n    \n    async def step(self) -> List[RequestOutput]:\n        ...\n```\n\n资料来源：[vllm/v1/engine/async_llm.py:1-50]()\n\n### Core (V1)\n\nThe `Core` class manages low-level model execution:\n\n```python\nclass Core:\n    \"\"\"\n    Core engine for V1.\n    \"\"\"\n    \n    def __init__(self, engine_config: VllmConfig, ...):\n        ...\n    \n    def get_config(self) -> VllmConfig:\n        ...\n```\n\n资料来源：[vllm/v1/engine/core.py:1-80]()\n\n### LLMEngine (V1)\n\nThe V1 `LLMEngine` orchestrates request processing:\n\n```python\nclass LLMEngine:\n    \"\"\"\n    V1 LLM Engine implementation.\n    \"\"\"\n    \n    def __init__(self, vllm_config: VllmConfig, ...):\n        ...\n```\n\n资料来源：[vllm/v1/engine/llm_engine.py:1-50]()\n\n## Configuration System\n\n### AsyncEngineArgs\n\nConfiguration flows from CLI/API to engine via `AsyncEngineArgs`:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `model` | `str` | Model name or path |\n| `tensor_parallel_size` | `int` | Number of GPUs for tensor parallelism |\n| `gpu_memory_utilization` | `float` | Fraction of GPU memory to use |\n| `max_model_len` | `Optional[int]` | Maximum sequence length |\n| `dtype` | `str` | Model data type (float16, bfloat16, etc.) |\n| `quantization` | `Optional[str]` | Quantization method (awq, gptq, etc.) 
|\n\nThe engine validates configuration through `create_model_config()`:\n\n```python\ndef create_model_config(self) -> ModelConfig:\n    \"\"\"Create model config from engine args.\"\"\"\n    return ModelConfig(...)\n```\n\n资料来源：[vllm/entrypoints/cli/launch.py:40-50]()\n\n## Headless Mode\n\nThe V1 engine supports headless operation where no API servers are started:\n\n```python\nif args.headless:\n    if args.api_server_count is not None and args.api_server_count > 0:\n        raise ValueError(\n            f\"--api-server-count={args.api_server_count} cannot be \"\n            \"used with --headless (no API servers are started in \"\n            \"headless mode).\"\n        )\n    args.api_server_count = 0\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py:25-35]()\n\n## gRPC Support\n\nvLLM supports gRPC for disaggregated prefill scenarios:\n\n```python\nif getattr(args, \"grpc\", False):\n    from vllm.entrypoints.grpc_server import serve_grpc\n    uvloop.run(serve_grpc(args))\n    return\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py:18-22]()\n\n## Request Processing Pipeline\n\n```mermaid\ngraph LR\n    A[Request] --> B[Parser]\n    B --> C[AsyncLLM.add_request]\n    C --> D[Scheduler]\n    D --> E[Model Executor]\n    E --> F[Output]\n    D --> G[KV Cache]\n```\n\n## Data Parallel Modes\n\nThe engine supports multiple data parallel configurations:\n\n| Mode | Flag | Description |\n|------|------|-------------|\n| External LB | `--data-parallel-external-lb` | External load balancer coordinates workers |\n| Hybrid LB | `--data-parallel-hybrid-lb` | Hybrid approach with custom rank assignment |\n| Rank | `--data-parallel-rank` | Specify worker rank for distributed setup |\n\n```python\n# Detection logic\nis_external_lb = getattr(args, \"data_parallel_external_lb\", False)\nis_hybrid_lb = getattr(args, \"data_parallel_hybrid_lb\", False)\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py:35-40]()\n\n## Model Config Validation\n\nThe engine performs validation during model config creation:\n\n```python\nmodel_config = engine_args.create_model_config()\n\n# Clear quantization for render servers (preprocessing only)\nif render_mode:\n    model_config.quantization = None\n```\n\n资料来源：[vllm/entrypoints/cli/launch.py:45-55]()\n\n## Key Design Patterns\n\n### Async/Await Architecture\n\nThe V1 engine uses native async/await for concurrent request handling:\n\n```python\nasync def step(self) -> List[RequestOutput]:\n    \"\"\"Execute one iteration of the engine.\"\"\"\n    ...\n```\n\n### Command Pattern\n\nCLI commands follow the Command pattern with `CLISubcommand` base class:\n\n```python\nclass ServeSubcommand(CLISubcommand):\n    name = \"serve\"\n    \n    @staticmethod\n    def cmd(args: argparse.Namespace) -> None:\n        ...\n```\n\n### Subparser Registration\n\nCommands register themselves via `cmd_init()` factory functions:\n\n```python\ndef cmd_init() -> list[CLISubcommand]:\n    return [LaunchSubcommand(), ServeSubcommand()]\n```\n\n## Summary\n\nThe vLLM Core Engine Architecture provides a flexible, multi-layered design supporting both V0 (legacy) and V1 (current) engine implementations. Key characteristics include:\n\n1. **Dual Engine Support**: V0 for backward compatibility, V1 for production workloads\n2. **Multiple Entry Points**: CLI, Python API, HTTP server, gRPC\n3. **Async-First Design**: Native async/await for concurrent request processing\n4. **Modular Components**: Clear separation between CLI, engine core, and model execution\n5. 
**Flexible Configuration**: Comprehensive argument system via `AsyncEngineArgs`\n6. **Data Parallel Support**: Multiple modes for distributed serving scenarios\n\nThe architecture prioritizes performance through efficient GPU memory management, request batching, and pipelined execution while maintaining a clean, extensible design.\n\n---\n\n<a id='page-4'></a>\n\n## Model Executor and Worker Architecture\n\n### 相关页面\n\n相关主题：[Core Engine Architecture](#page-3), [Scheduling and Request Processing](#page-5), [Model Architecture Support](#page-10)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/v1/worker/gpu_model_runner.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu_model_runner.py)\n- [vllm/v1/worker/worker_base.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/worker_base.py)\n- [vllm/v1/worker/gpu_worker.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu_worker.py)\n- [vllm/model_executor/model_loader/__init__.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/model_loader/__init__.py)\n- [vllm/model_executor/models/__init__.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/__init__.py)\n</details>\n\n# Model Executor and Worker Architecture\n\n## Overview\n\nThe vLLM Model Executor and Worker Architecture forms the core execution layer responsible for model loading, inference, and batch-level processing. This architecture separates concerns between high-level request orchestration and low-level GPU-based model execution, enabling efficient parallel processing of LLM inference requests.\n\nThe architecture consists of two primary components:\n\n| Component | Responsibility |\n|-----------|----------------|\n| **Model Executor** | Manages model loading, weight initialization, and model-specific execution logic |\n| **Worker** | Handles GPU-side computation, memory management, and kernel execution |\n\n## Architecture Diagram\n\n```mermaid\ngraph TD\n    A[AsyncEngineArgs] --> B[Model Executor]\n    B --> C[Model Loader]\n    C --> D[Model Weights]\n    B --> E[GPU Worker]\n    E --> F[GPU Model Runner]\n    F --> G[CUDA Kernels]\n    F --> H[Attention Layers]\n    F --> I[MLP Layers]\n    \n    J[Request Batch] --> E\n    E --> K[Generated Tokens]\n    \n    style B fill:#e1f5fe\n    style E fill:#fff3e0\n    style F fill:#f3e5f5\n```\n\n## Model Executor Layer\n\n### Purpose and Scope\n\nThe Model Executor layer handles all aspects of model lifecycle management, including initialization, weight loading, and providing the interface for model execution. 
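\n\nAs a rough illustration of this split, the sketch below pairs a minimal executor-style wrapper with an assumed worker contract: the executor owns model lifecycle decisions, while the worker hides device-side details. The names (`SimpleExecutor`, `Worker`, `load_weights`, `execute_model`) are illustrative assumptions and do not map one-to-one onto vLLM's classes.\n\n```python\nfrom typing import Any, List, Protocol\n\n\nclass Worker(Protocol):\n    \"\"\"Assumed worker contract: device-side loading and execution.\"\"\"\n\n    def load_weights(self, model_path: str) -> None: ...\n\n    def execute_model(self, batch: List[Any]) -> List[Any]: ...\n\n\nclass SimpleExecutor:\n    \"\"\"Illustrative executor: decides *what* runs; the worker decides *how*.\"\"\"\n\n    def __init__(self, worker: Worker, model_path: str) -> None:\n        self.worker = worker\n        self.model_path = model_path\n\n    def initialize(self) -> None:\n        # Model lifecycle (weight format, sharding decisions) is decided here.\n        self.worker.load_weights(self.model_path)\n\n    def execute_batch(self, batch: List[Any]) -> List[Any]:\n        # Kernel launches and device memory management stay inside the worker.\n        return self.worker.execute_model(batch)\n```\n\n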
This layer operates independently of the worker layer, allowing for flexibility in model configuration.\n\n### Model Loading\n\nThe model loading subsystem supports multiple backends and quantization schemes:\n\n```python\nclass ModelLoader:\n    def load_model(self, model_config, parallel_config, device_config):\n        # Load model weights based on configuration\n        pass\n```\n\n**Supported Loading Modes:**\n\n| Mode | Description |\n|------|-------------|\n| Auto | Automatically detect optimal loading strategy |\n| Naive | Standard PyTorch loading |\n| Sharded | Load model shards across multiple devices |\n| Quantized | Load quantized weights (AWQ, GPTQ, GGUF) |\n\n资料来源：[vllm/model_executor/model_loader/__init__.py]()\n\n### Model Registry\n\nvLLM maintains a registry of supported model architectures:\n\n```python\n# Model architecture registration\n@register_model(\"LlamaForCausalLM\")\nclass LlamaForCausalLM(nn.Module):\n    ...\n```\n\nThe registry maps model names to their corresponding model classes, enabling automatic model instantiation based on HuggingFace model architecture detection.\n\n资料来源：[vllm/model_executor/models/__init__.py]()\n\n## Worker Architecture\n\n### Base Worker\n\nThe worker base class defines the interface for all worker implementations:\n\n```python\nclass WorkerBase:\n    def __init__(self, vllm_config):\n        self.vllm_config = vllm_config\n        self.model = None\n        self.device = None\n    \n    def execute_model(self, batch):\n        raise NotImplementedError\n```\n\n资料来源：[vllm/v1/worker/worker_base.py]()\n\n### GPU Worker\n\nThe GPU Worker is the primary worker implementation for GPU-based inference:\n\n```mermaid\ngraph LR\n    A[Input Batch] --> B[Model Input Preparation]\n    B --> C[Forward Pass]\n    C --> D[Output Extraction]\n    D --> E[Token Generation]\n```\n\n**Key Responsibilities:**\n\n1. Initialize CUDA context and memory pools\n2. Prepare model inputs with proper padding and masking\n3. Execute forward passes on GPU\n4. 
Manage KV cache memory allocation\n\n资料来源：[vllm/v1/worker/gpu_worker.py]()\n\n### GPU Model Runner\n\nThe GPU Model Runner handles the low-level model execution details:\n\n```python\nclass GPUModelRunner:\n    def __init__(self, config):\n        self.kv_cache = None\n        self.attn_metadata = None\n        self.block_manager = None\n    \n    def prepare_inputs(self, batch):\n        # Prepare input tensors with proper device placement\n        pass\n    \n    def execute_model(self, input_tokens, positions):\n        # Execute model forward pass\n        pass\n```\n\n**Core Components:**\n\n| Component | Function |\n|-----------|----------|\n| `kv_cache` | Stores key-value tensors for attention |\n| `attn_metadata` | Manages attention metadata for paged attention |\n| `block_manager` | Handles physical memory block allocation |\n\n资料来源：[vllm/v1/worker/gpu_model_runner.py]()\n\n## Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant API as API Server\n    participant Executor as Model Executor\n    participant Worker as GPU Worker\n    participant Runner as GPU Model Runner\n    participant Kernels as CUDA Kernels\n    \n    API->>Executor: Initialize model\n    Executor->>Worker: Load weights\n    Worker->>Runner: Initialize GPU state\n    Runner->>Kernels: Allocate memory\n    \n    API->>Executor: Execute batch\n    Executor->>Worker: Forward request\n    Worker->>Runner: Prepare inputs\n    Runner->>Kernels: Compute attention\n    Runner->>Kernels: Compute mlp\n    Kernels-->>Runner: Output logits\n    Runner-->>Worker: Return results\n    Worker-->>Executor: Output tokens\n```\n\n## Memory Management\n\n### KV Cache Architecture\n\nvLLM uses a block-based KV cache management system:\n\n1. **Logical Blocks**: Abstract representation of KV cache entries\n2. **Physical Blocks**: Actual GPU memory allocations\n3. 
**Block Mapping**: Links logical blocks to physical locations\n\n```python\nclass BlockManager:\n    def allocate(self, num_blocks):\n        # Allocate physical blocks\n        pass\n    \n    def get_physical_block(self, logical_id):\n        # Get physical location for logical block\n        pass\n```\n\n### Memory Allocation Strategy\n\n| Strategy | Use Case |\n|----------|----------|\n| Dynamic | Default, allocates on demand |\n| Static | Pre-allocates at initialization |\n| Hybrid | Mix of static and dynamic |\n\n## Configuration Parameters\n\nThe Model Executor and Worker architecture is configured through `AsyncEngineArgs`:\n\n| Parameter | Description | Default |\n|-----------|-------------|---------|\n| `model` | Model name or path | Required |\n| `tensor_parallel_size` | Number of GPUs for tensor parallelism | 1 |\n| `pipeline_parallel_size` | Number of pipeline stages | 1 |\n| `gpu_memory_utilization` | Fraction of GPU memory for KV cache | 0.9 |\n| `max_model_len` | Maximum sequence length | Auto |\n| `block_size` | KV cache block size | 16 |\n\n## Summary\n\nThe Model Executor and Worker Architecture in vLLM provides a modular, extensible system for LLM inference:\n\n- **Separation of Concerns**: Clear boundaries between model loading and execution\n- **GPU Optimization**: Efficient CUDA kernel integration and memory management\n- **Flexible Configuration**: Support for various parallelism and quantization strategies\n- **Extensibility**: Plugin-based model registration system\n\nThis architecture enables vLLM to achieve high throughput through batch processing while maintaining low latency through careful memory management and kernel optimization.\n\n---\n\n<a id='page-5'></a>\n\n## Scheduling and Request Processing\n\n### 相关页面\n\n相关主题：[Core Engine Architecture](#page-3), [Model Executor and Worker Architecture](#page-4), [Distributed Inference and Parallelism](#page-9)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/v1/core/sched/scheduler.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/sched/scheduler.py)\n- [vllm/v1/core/sched/request_queue.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/sched/request_queue.py)\n- [vllm/v1/core/sched/async_scheduler.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/sched/async_scheduler.py)\n- [vllm/v1/request.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/request.py)\n- [vllm/config/scheduler.py](https://github.com/vllm-project/vllm/blob/main/vllm/config/scheduler.py)\n</details>\n\n# Scheduling and Request Processing\n\n## Overview\n\nThe Scheduling and Request Processing system is a core component of vLLM's v1 engine architecture. It manages the lifecycle of inference requests from arrival to completion, coordinating resource allocation, batch scheduling, and execution ordering to maximize GPU utilization while maintaining quality of service guarantees.\n\nIn vLLM, the scheduler operates as an asynchronous event-driven system that continuously evaluates pending requests, determines optimal batching strategies, and dispatches work to the underlying execution engine. This design enables vLLM to handle high-throughput serving workloads with efficient memory management through mechanisms like paged attention and dynamic batch composition.\n\nThe scheduler works in conjunction with the request queue, which serves as the primary buffer for incoming inference requests. 
It makes real-time decisions about which requests to include in the next execution batch based on available GPU memory, request priorities, and fairness constraints.\n\n## Architecture Overview\n\n### Component Hierarchy\n\nThe scheduling system consists of several interconnected components that work together to manage request processing:\n\n```mermaid\ngraph TD\n    A[API Layer] --> B[Request Queue]\n    B --> C[Async Scheduler]\n    C --> D[Scheduler]\n    D --> E[Execution Engine]\n    E --> F[GPU]\n    \n    G[Config] --> C\n    G --> D\n```\n\n### Core Components\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| `Scheduler` | `vllm/v1/core/sched/scheduler.py` | Core scheduling logic and batch selection |\n| `AsyncScheduler` | `vllm/v1/core/sched/async_scheduler.py` | Async wrapper for scheduler operations |\n| `RequestQueue` | `vllm/v1/core/sched/request_queue.py` | Request buffering and ordering |\n| `Request` | `vllm/v1/request.py` | Request data model and state |\n| `SchedulerConfig` | `vllm/config/scheduler.py` | Scheduler configuration parameters |\n\n## Request Model\n\n### Request Data Structure\n\nThe `Request` class represents an individual inference request with all associated metadata and state. Each request maintains its own context including tokenized inputs, sampling parameters, and execution state.\n\nThe request model tracks the following key attributes:\n\n| Attribute | Type | Description |\n|-----------|------|-------------|\n| `request_id` | `str` | Unique identifier for the request |\n| `prompt` | `str` | Original input text or tokens |\n| `prompt_token_ids` | `List[int]` | Tokenized prompt |\n| `sampling_params` | `SamplingParams` | Sampling configuration |\n| `arrival_time` | `float` | Request arrival timestamp |\n| `state` | `RequestState` | Current execution state |\n\n### Request States\n\nRequests transition through a defined state machine during their lifecycle:\n\n```mermaid\nstateDiagram-v2\n    [*] --> WAITING: Request Arrival\n    WAITING --> SCHEDULED: Scheduler Selection\n    SCHEDULED --> RUNNING: Dispatch to GPU\n    RUNNING --> WAITING: Preemption/Continuation\n    RUNNING --> FINISHED: Completion\n    RUNNING --> WAITING: KV Cache Reuse\n    WAITING --> CANCELLED: Client Cancellation\n    SCHEDULED --> CANCELLED: Client Cancellation\n```\n\n1. **WAITING**: Request is queued and awaiting scheduling\n2. **SCHEDULED**: Request has been selected for the next batch\n3. **RUNNING**: Request is actively being processed on GPU\n4. **FINISHED**: Request has completed successfully\n5. **CANCELLED**: Request was cancelled before completion\n\n资料来源：[vllm/v1/request.py]()\n\n## Request Queue\n\nThe `RequestQueue` serves as the primary buffering mechanism for incoming requests. It provides thread-safe operations for enqueueing, dequeuing, and managing request priorities.\n\n### Queue Operations\n\n| Operation | Description |\n|-----------|-------------|\n| `enqueue()` | Add new request to queue |\n| `dequeue()` | Remove and return next request |\n| `peek()` | View next request without removal |\n| `cancel()` | Remove cancelled request |\n| `requeue()` | Return request to queue (preemption) |\n\n### Priority Handling\n\nThe request queue supports priority-based ordering where higher priority requests can bypass lower priority ones. 
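\n\nAs a rough illustration of priority-ordered dequeueing, the sketch below keeps requests in a binary heap keyed on negated priority and arrival order. It is a simplified stand-in rather than vLLM's actual `RequestQueue` implementation; only the `enqueue()`/`dequeue()` names mirror the operations table above.\n\n```python\nimport heapq\nimport itertools\nimport time\n\n\nclass PriorityRequestQueue:\n    \"\"\"Illustrative sketch only; vLLM's RequestQueue exposes a richer interface.\"\"\"\n\n    def __init__(self) -> None:\n        self._heap: list[tuple[int, float, int, str]] = []\n        self._counter = itertools.count()  # FIFO tie-breaker for equal keys\n\n    def enqueue(self, request_id: str, priority: int = 0) -> None:\n        # Higher priority dequeues first; ties are broken by arrival order.\n        heapq.heappush(\n            self._heap,\n            (-priority, time.monotonic(), next(self._counter), request_id),\n        )\n\n    def dequeue(self) -> str:\n        _, _, _, request_id = heapq.heappop(self._heap)\n        return request_id\n\n    def __len__(self) -> int:\n        return len(self._heap)\n```\n\n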
Priority is determined by a combination of factors:\n\n- Explicit priority values in `SamplingParams`\n- Arrival time (older requests may get priority for fairness)\n- Request type (prefill vs decode operations)\n\n资料来源：[vllm/v1/core/sched/request_queue.py]()\n\n## Scheduler\n\n### Core Scheduling Logic\n\nThe scheduler is responsible for selecting which requests to include in the next execution batch. It operates on a continuous scheduling loop that evaluates the current state of all pending requests and available resources.\n\n#### Scheduling Criteria\n\nThe scheduler makes decisions based on multiple factors:\n\n1. **Memory Availability**: Sufficient GPU memory must be available for the request's KV cache\n2. **Batch Size Limits**: Maximum batch size constraints\n3. **Prefill-Decompose Decisions**: Whether to split prefill into smaller chunks\n4. **Priority Weighting**: Relative importance of pending requests\n5. **Latency Targets**: QoS requirements for specific request categories\n\n#### Scheduling Loop\n\n```mermaid\ngraph LR\n    A[Evaluate Pending Requests] --> B{Sufficient Resources?}\n    B -->|Yes| C[Select Request]\n    C --> D[Add to Batch]\n    D --> E{Batch Full?}\n    E -->|No| A\n    E -->|Yes| F[Dispatch Batch]\n    B -->|No| G[Wait / Preempt]\n    F --> H[Update State]\n    H --> A\n```\n\n### Async Scheduler Interface\n\nThe `AsyncScheduler` provides an asynchronous interface to the core scheduler, enabling non-blocking scheduling operations. This is essential for maintaining high throughput in production serving scenarios where the scheduler must coexist with network I/O and other async operations.\n\nKey async operations include:\n\n- `schedule_async()`: Async wrapper for schedule iteration\n- `add_request()`: Async request enqueueing\n- `abort_request()`: Async request cancellation\n\n资料来源：[vllm/v1/core/sched/async_scheduler.py]()\n\n## Scheduler Configuration\n\nThe `SchedulerConfig` class defines all configurable parameters for the scheduler behavior. These settings control batching strategies, memory management, and QoS characteristics.\n\n### Configuration Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `max_num_seqs` | `int` | `256` | Maximum sequences per iteration |\n| `max_num_batched_tokens` | `int` | `8192` | Max tokens per batch |\n| `max_model_len` | `int` | `8192` | Maximum model context length |\n| `enable_chunked_prefill` | `bool` | `True` | Enable prefill chunking |\n| `num_prefill_groups` | `int` | `1` | Number of prefill groups |\n| `async_scheduling` | `bool` | `True` | Enable async scheduling |\n\n### Memory Management Settings\n\n| Parameter | Description |\n|-----------|-------------|\n| `gpu_memory_utilization` | Fraction of GPU memory for KV cache (0.0-1.0) |\n| `num_causal_layers` | Number of causal attention layers |\n| `head_dim` | Dimension of attention heads |\n\n资料来源：[vllm/config/scheduler.py]()\n\n## Batching Strategies\n\n### Continuous Batching\n\nvLLM employs continuous batching (also known as iteration-level scheduling) to maximize GPU utilization. 
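\n\nThe following sketch captures the idea in a few lines: requests join the running batch whenever there is room and leave it as soon as they finish. It is a deliberately simplified model that assumes a hypothetical `worker.execute_model()` returning per-request outputs with a `finished` flag; the real scheduler additionally tracks token budgets, KV cache blocks, and preemption.\n\n```python\ndef engine_step(waiting, running, worker, max_num_seqs=256):\n    \"\"\"One iteration of continuous batching: admit, execute, retire.\"\"\"\n    # Admit queued requests while the running batch has room.\n    while waiting and len(running) < max_num_seqs:\n        running.append(waiting.pop(0))\n\n    # A single forward pass covers the whole batch (prefills and decodes mixed).\n    outputs = worker.execute_model(running)\n\n    # Retire finished requests immediately; unfinished ones stay for the next step.\n    finished = [(req, out) for req, out in zip(running, outputs) if out.finished]\n    running[:] = [req for req, out in zip(running, outputs) if not out.finished]\n    return finished\n```\n\n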
Unlike static batching, where batch composition is fixed at the start, continuous batching allows requests to enter and exit the batch at each iteration.\n\n#### Advantages\n\n- **Higher Throughput**: GPU is never idle waiting for batch to complete\n- **Lower Latency**: Short requests don't wait for long ones\n- **Better Memory Utilization**: Dynamic allocation based on actual needs\n\n### Prefill Batching\n\nPrefill operations (processing new tokens) and decode operations (generating new tokens) have different characteristics. The scheduler can:\n\n1. **Combined Batching**: Mix prefill and decode in same batch\n2. **Separate Batching**: Process prefill and decode in distinct batches\n3. **Chunked Prefill**: Split large prefill requests into smaller chunks\n\n### Chunked Prefill\n\nWhen `enable_chunked_prefill` is enabled, large prefill requests are split into smaller chunks to:\n\n- Reduce memory pressure\n- Allow shorter requests to be scheduled faster\n- Improve fairness between requests of different lengths\n\n资料来源：[vllm/v1/core/sched/scheduler.py]()\n\n## Request Processing Flow\n\n### Full Request Lifecycle\n\n```mermaid\nsequenceDiagram\n    participant Client\n    participant API\n    participant Queue\n    participant Scheduler\n    participant Engine\n    participant GPU\n    \n    Client->>API: Submit Request\n    API->>API: Validate & Tokenize\n    API->>Queue: Enqueue Request\n    Scheduler->>Queue: Dequeue Requests\n    Scheduler->>Scheduler: Evaluate Batching\n    Scheduler->>Engine: Dispatch Batch\n    Engine->>GPU: Execute Forward Pass\n    GPU-->>Engine: Output Tensors\n    Engine-->>Scheduler: Update State\n    Scheduler->>Queue: Requeue if Needed\n    Scheduler->>API: Stream Tokens\n    API-->>Client: Response Stream\n```\n\n### Scheduling Iteration\n\nEach scheduling iteration follows these steps:\n\n1. **Request Evaluation**: Scan all pending requests for eligibility\n2. **Resource Calculation**: Determine available GPU memory\n3. **Batch Composition**: Select requests based on scheduling policy\n4. **Chunk Assignment**: Divide prefill requests if needed\n5. **Batch Dispatch**: Send batch to execution engine\n6. **State Update**: Update request states and metrics\n\n## Configuration Example\n\n```python\nfrom vllm.config import SchedulerConfig\n\nconfig = SchedulerConfig(\n    max_num_seqs=256,\n    max_num_batched_tokens=8192,\n    max_model_len=32768,\n    enable_chunked_prefill=True,\n    gpu_memory_utilization=0.9,\n)\n```\n\n## Performance Considerations\n\n### Memory Management\n\nThe scheduler must balance memory allocation between:\n\n- **KV Cache**: Storing attention key-value pairs\n- **Model Weights**: The LLM parameters (typically pre-loaded)\n- **Activation Memory**: Temporary tensors during computation\n\n### Latency vs Throughput\n\nThe scheduling configuration affects the latency-throughput tradeoff:\n\n| Setting | Effect |\n|---------|--------|\n| Smaller `max_num_seqs` | Lower latency, lower throughput |\n| Larger `max_num_batched_tokens` | Higher throughput, variable latency |\n| `enable_chunked_prefill=True` | Better latency fairness |\n\n### Preemption\n\nWhen GPU memory is insufficient for incoming requests, the scheduler may preempt existing requests to free memory. 
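\n\nA minimal sketch of this policy is shown below. The `block_pool`, `running`, and `waiting` objects and their attributes are assumptions for illustration; vLLM's actual preemption logic in the scheduler also weighs recomputation cost, priorities, and chunked-prefill state.\n\n```python\ndef admit_or_preempt(request, block_pool, running, waiting):\n    \"\"\"Free enough KV cache blocks for `request`, preempting running requests if needed.\"\"\"\n    needed = request.num_required_blocks\n    # Evict the most recently admitted request until enough blocks are free.\n    while block_pool.num_free_blocks < needed and running:\n        victim = running.pop()             # simple LIFO victim selection\n        block_pool.free(victim.block_ids)  # release the victim's KV cache blocks\n        victim.block_ids = []\n        waiting.insert(0, victim)          # requeue; its KV cache is recomputed later\n    if block_pool.num_free_blocks >= needed:\n        request.block_ids = block_pool.allocate(needed)\n        running.append(request)\n        return True\n    return False\n```\n\nLIFO victim selection is only one possible policy; it reflects the intuition that the most recently admitted request has the least computed state to discard.\n\n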
Preempted requests are returned to the queue and rescheduled later.\n\n## Integration with Execution Engine\n\nThe scheduler interfaces with the execution engine through a well-defined API:\n\n- **schedule()**: Main entry point for scheduling decisions\n- **add_request()**: Register new inference request\n- **abort_request()**: Cancel running request\n- **update_from_output()**: Process execution results\n\nThe execution engine receives scheduled batches and returns completed or paused requests, allowing the scheduler to update its internal state and make subsequent scheduling decisions.\n\n## Related Components\n\nFor a complete understanding of vLLM's request processing, also refer to:\n\n- **Engine**: Coordinates between scheduler and model execution\n- **Worker**: Executes model operations on GPU\n- **Cache**: KV cache management and allocation\n- **采样参数**: Sampling configuration affecting scheduling\n\n## Summary\n\nThe Scheduling and Request Processing system is fundamental to vLLM's ability to serve large language models efficiently. By employing continuous batching, intelligent memory management, and flexible scheduling policies, vLLM achieves high throughput while maintaining low latency for diverse workloads. The modular design with separate scheduler, queue, and configuration components allows for fine-tuned control over serving behavior.\n\n---\n\n<a id='page-6'></a>\n\n## PagedAttention and KV Cache Management\n\n### 相关页面\n\n相关主题：[Scheduling and Request Processing](#page-5), [Distributed Inference and Parallelism](#page-9), [Attention Backends and Kernels](#page-7)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [csrc/attention/paged_attention_v1.cu](https://github.com/vllm-project/vllm/blob/main/csrc/attention/paged_attention_v1.cu)\n- [csrc/attention/paged_attention_v2.cu](https://github.com/vllm-project/vllm/blob/main/csrc/attention/paged_attention_v2.cu)\n- [vllm/v1/core/kv_cache_manager.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/kv_cache_manager.py)\n- [vllm/v1/core/block_pool.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/block_pool.py)\n- [docs/design/paged_attention.md](https://github.com/vllm-project/vllm/blob/main/docs/design/paged_attention.md)\n</details>\n\n# PagedAttention and KV Cache Management\n\n## Overview\n\nPagedAttention is a novel attention mechanism that enables efficient virtual memory-based management of the Key-Value (KV) cache in large language model (LLM) inference. Inspired by the memory management technique in operating systems called paging, PagedAttention divides the KV cache into fixed-size \"pages\" that can be flexibly allocated and managed, eliminating the need for contiguous memory allocation.\n\nThe KV cache stores the key and value tensors from attention computation for each token position. During autoregressive decoding, this cache grows dynamically as new tokens are generated. 
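\n\nA back-of-the-envelope calculation makes this growth concrete. The helper below estimates the per-token KV cache footprint from the model shape; the Llama-2-7B-like numbers in the example are assumptions chosen purely for illustration:\n\n```python\ndef kv_cache_bytes_per_token(num_layers, num_kv_heads, head_dim, dtype_bytes=2):\n    # The factor of 2 accounts for storing both the key and the value tensor.\n    return 2 * num_layers * num_kv_heads * head_dim * dtype_bytes\n\n\n# Example: a Llama-2-7B-like shape (32 layers, 32 KV heads, head_dim 128, FP16).\nper_token = kv_cache_bytes_per_token(32, 32, 128, dtype_bytes=2)\nprint(per_token)                 # 524288 bytes, i.e. 0.5 MiB per token\nprint(per_token * 4096 / 2**30)  # ~2.0 GiB for a single 4096-token sequence\n```\n\nAt roughly half a megabyte per token for this shape, a handful of long sequences already occupies several gigabytes of GPU memory, which is why block-granular, fragmentation-free allocation matters.\n\n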
Traditional LLM serving systems allocate contiguous memory blocks for the KV cache, leading to significant memory waste due to internal and external fragmentation when handling variable-length sequences and multi-user workloads.\n\nvLLM's PagedAttention implementation provides:\n\n- **Memory efficiency**: Eliminates fragmentation by using non-contiguous page-based allocation\n- **Flexible batching**: Supports arbitrary sequence lengths and concurrent requests\n- **Dynamic memory management**: Allocates cache pages on-demand during generation\n- **GPU memory optimization**: Maximizes GPU memory utilization for higher throughput\n\n## Architecture\n\n### High-Level System Design\n\n```mermaid\ngraph TD\n    subgraph \"Application Layer\"\n        Req[Inference Request]\n        Sched[Scheduler]\n    end\n    \n    subgraph \"Memory Management Layer\"\n        KVM[KV Cache Manager]\n        BP[Block Pool]\n    end\n    \n    subgraph \"GPU Memory Layer\"\n        GPU[GPU Memory]\n        Pages[KV Cache Pages]\n    end\n    \n    Req --> Sched\n    Sched --> KVM\n    KVM --> BP\n    BP --> GPU\n    Pages -.>|Physical Memory| GPU\n```\n\n### PagedAttention Version Comparison\n\nvLLM implements two versions of PagedAttention with different performance characteristics:\n\n| Aspect | PagedAttention V1 | PagedAttention V2 |\n|--------|-------------------|-------------------|\n| **Kernel Type** | Fused attention kernels | Optimized fused kernels |\n| **Memory Access** | Standard page table lookup | Enhanced page table optimization |\n| **Performance** | Baseline optimized | ~2.2x speedup over V1 |\n| **Use Case** | General purpose | Production workloads |\n| **Implementation** | `paged_attention_v1.cu` | `paged_attention_v2.cu` |\n\n资料来源：[csrc/attention/paged_attention_v1.cu:1-100](), [csrc/attention/paged_attention_v2.cu:1-100](), [docs/design/paged_attention.md:1-50]()\n\n## KV Cache Manager\n\nThe KV Cache Manager (`kv_cache_manager.py`) is the core component responsible for tracking and managing the allocation of KV cache pages across all in-flight sequences.\n\n### Responsibilities\n\n| Responsibility | Description |\n|----------------|-------------|\n| **Block Allocation** | Allocates and deallocates cache blocks as sequences grow or complete |\n| **Reference Counting** | Tracks how many sequences reference each physical block |\n| **Page Table Management** | Maintains virtual-to-physical page mappings per sequence |\n| **Cache Eviction** | Handles cache eviction when memory pressure occurs |\n\n### Key Data Structures\n\n```python\n# Simplified representation of block metadata\nclass Block:\n    block_id: int          # Physical block identifier\n    page_indices: List[int]  # Virtual page indices mapping\n    ref_count: int         # Number of sequences referencing this block\n    is_computed: bool      # Whether this block has computed KV cache\n```\n\n### Block Allocation Workflow\n\n```mermaid\nsequenceDiagram\n    participant Scheduler\n    participant KVM as KV Cache Manager\n    participant BP as Block Pool\n    participant GPU as GPU Memory\n    \n    Scheduler->>KVM: allocate_blocks(sequence_id, num_tokens)\n    KVM->>BP: reserve_blocks(count)\n    BP->>GPU: allocate_contiguous_pages\n    GPU-->>BP: block_handles\n    BP-->>KVM: allocated_blocks\n    KVM-->>Scheduler: block_table\n```\n\n资料来源：[vllm/v1/core/kv_cache_manager.py:1-150](), [vllm/v1/core/block_pool.py:1-100]()\n\n## Block Pool\n\nThe Block Pool (`block_pool.py`) manages the physical memory allocation of KV cache pages on 
GPU memory.\n\n### Memory Organization\n\n| Parameter | Description | Typical Value |\n|-----------|-------------|---------------|\n| `block_size` | Number of tokens per cache block | 16 tokens |\n| `num_blocks` | Total number of available blocks | Dynamic based on GPU memory |\n| `num_layers` | Number of attention layers | Model-dependent (e.g., 32-80) |\n| `num_kv_heads` | Number of key/value attention heads | Model-dependent |\n| `head_dim` | Dimension of each attention head | 128 or 256 |\n\n### Block Pool Operations\n\n```mermaid\ngraph LR\n    subgraph \"Allocation States\"\n        Free[Free Blocks]\n        Used[Used Blocks]\n        Partial[Partially Filled]\n    end\n    \n    Allocate -->|Allocate Block| Free\n    Allocate -->|Release Block| Used\n    Used -->|Free| Free\n    Partial -->|Append Token| Used\n```\n\nThe block pool maintains three primary states:\n\n1. **Free Blocks**: Available for immediate allocation\n2. **Used Blocks**: Currently assigned to active sequences\n3. **Partially Filled Blocks**: Blocks with available slots for new tokens\n\n资料来源：[vllm/v1/core/block_pool.py:1-100]()\n\n## PagedAttention Kernel Implementation\n\n### CUDA Kernel Architecture\n\nBoth PagedAttention V1 and V2 are implemented as optimized CUDA kernels that handle attention computation with page-table-based memory access.\n\n#### V1 Kernel Flow\n\n```mermaid\ngraph TD\n    A[Load Query Tokens] --> B[Get Block Table Entry]\n    B --> C[Load K/V from Pages]\n    C --> D[Compute Attention Scores]\n    D --> E[Write Output to GEMM]\n```\n\n#### V2 Kernel Optimizations\n\nPagedAttention V2 introduces several optimizations:\n\n- **Reduced Page Table Lookups**: Consolidates multiple lookups into single operations\n- **Improved Memory Coalescing**: Better memory access patterns for KV cache\n- **Warp-Level Primitives**: Utilizes warp-level reductions for faster softmax computation\n- **Fused Operations**: Combines multiple operations into single kernel launches\n\n资料来源：[csrc/attention/paged_attention_v1.cu:1-200](), [csrc/attention/paged_attention_v2.cu:1-200]()\n\n### Kernel Parameters\n\n| Parameter | Description | Range |\n|-----------|-------------|-------|\n| `THREADS_PER_BLOCK` | CUDA threads per block | 128-256 |\n| `BLOCK_SIZE` | Tokens per block | 16 |\n| `CACHE_BLOCK_SIZE` | Bytes per cache block | Dynamic |\n| `num_heads` | Total query heads | Model-dependent |\n| `head_dim` | Attention head dimension | 128/256 |\n\n## Page Table Management\n\n### Virtual-to-Physical Mapping\n\nEach sequence maintains a page table that maps virtual token positions to physical cache blocks:\n\n```python\n# Virtual page table structure\npage_table = [\n    physical_block_id,  # virtual position 0-15\n    physical_block_id,  # virtual position 16-31\n    physical_block_id,  # virtual position 32-47\n    ...\n]\n```\n\n### Block Table Structure\n\n```mermaid\ngraph TD\n    subgraph \"Sequence 1\"\n        S1_VP1[Virtual Page 0] --> S1_PP1[Physical Block 5]\n        S1_VP2[Virtual Page 1] --> S1_PP2[Physical Block 12]\n        S1_VP3[Virtual Page 2] --> S1_PP3[Physical Block 3]\n    end\n    \n    subgraph \"Sequence 2\"\n        S2_VP1[Virtual Page 0] --> S2_PP1[Physical Block 5]\n        S2_VP2[Virtual Page 1] --> S2_PP2[Physical Block 7]\n    end\n```\n\nThe page table allows:\n- **Non-contiguous storage**: Physical blocks need not be contiguous\n- **Shared blocks**: Multiple sequences can share the same physical block (for prefixes)\n- **Dynamic growth**: New pages allocated as sequence 
extends\n\n资料来源：[vllm/v1/core/kv_cache_manager.py:100-200](), [docs/design/paged_attention.md:50-150]()\n\n## Memory Management Strategies\n\n### Allocation Policy\n\nvLLM uses a dynamic allocation policy that balances memory efficiency and allocation overhead:\n\n| Strategy | Description | Trade-off |\n|----------|-------------|------------|\n| **On-demand** | Allocate blocks as tokens are generated | Lower memory waste, higher overhead |\n| **Pre-allocation** | Reserve blocks when sequence starts | Lower overhead, potential waste |\n| **Hybrid** | Pre-allocate prefix, on-demand for new tokens | Balanced approach |\n\n### Reference Counting\n\nReference counting prevents premature block deallocation:\n\n1. **Initial allocation**: Reference count = 1\n2. **Fork/continue**: Reference count incremented\n3. **Completion**: Reference count decremented\n4. **Free block**: When count reaches 0\n\n### Memory Reclamation\n\nWhen GPU memory is exhausted:\n\n```mermaid\ngraph TD\n    A[Memory Pressure Detected] --> B{Sequence Can Evict?}\n    B -->|Yes| C[Evict Cold Blocks]\n    B -->|No| D[Wait or Reject Request]\n    C --> E[Update Page Tables]\n    E --> F[Allocate New Blocks]\n```\n\n资料来源：[vllm/v1/core/kv_cache_manager.py:200-300](), [vllm/v1/core/block_pool.py:100-200]()\n\n## Integration with Scheduler\n\n### Request Lifecycle\n\n```mermaid\nstateDiagram-v2\n    [*] --> Received: New Request\n    Received --> Scheduled: Add to Queue\n    Scheduled --> Allocating: Acquire Blocks\n    Allocating --> Running: Blocks Available\n    Running --> Running: Generate Token\n    Running --> Completed: EOS Token\n    Completed --> [*]: Free Blocks\n    Running --> Waiting: Block Unavailable\n    Waiting --> Running: Blocks Acquired\n```\n\n### Scheduling Integration Points\n\n| Stage | KV Cache Interaction |\n|-------|---------------------|\n| **Request Arrival** | Pre-allocate blocks for known prefix length |\n| **Token Generation** | Allocate new block when current fills up |\n| **Sequence Completion** | Release all associated blocks |\n| **Prefix Caching** | Share blocks with identical prefixes |\n\n## Configuration Options\n\n### Server Configuration\n\n```bash\nvllm serve <model> \\\n    --block-size 16 \\\n    --num-gpu-blocks-override 1000 \\\n    --gpu-memory-utilization 0.9 \\\n    --max-num-batched-tokens 8192\n```\n\n### Key Parameters\n\n| Parameter | Description | Default |\n|-----------|-------------|---------|\n| `block_size` | Number of tokens per KV cache block | 16 |\n| `gpu_memory_utilization` | Fraction of GPU memory for KV cache | 0.9 |\n| `num_gpu_blocks_override` | Override auto-computed block count | Auto |\n| `max_num_batched_tokens` | Maximum tokens in a single batch | Dynamic |\n\n### Memory Calculation\n\n```\ntotal_cache_memory = num_blocks × block_size × num_layers × 2 × num_kv_heads × head_dim × dtype_size\n```\n\nWhere:\n- `num_blocks` = GPU memory × utilization / per_block_memory\n- `2` accounts for both K and V caches\n- `dtype_size` = 2 bytes for FP16, 1 byte for INT8, etc.\n\n资料来源：[docs/design/paged_attention.md:150-250]()\n\n## Performance Characteristics\n\n### Throughput Improvements\n\nPagedAttention enables significant performance improvements compared to traditional contiguous allocation:\n\n| Metric | Traditional | PagedAttention V1 | PagedAttention V2 |\n|--------|-------------|-------------------|-------------------|\n| **Memory Waste** | 30-60% | <5% | <5% |\n| **Throughput** | Baseline | ~2.1x | ~2.2x |\n| **BS=1 Latency** | Baseline | ~1.9x | ~2.0x |\n| 
**BS=16+ Latency** | Baseline | ~2.0x | ~2.2x |\n\n### Scalability\n\nThe system scales efficiently with:\n\n- **Longer sequences**: O(seq_len) memory, no fragmentation\n- **More concurrent requests**: Block sharing for shared prefixes\n- **Larger models**: Better memory utilization per GPU\n\n资料来源：[csrc/attention/paged_attention_v2.cu:200-300](), [docs/design/paged_attention.md:250-350]()\n\n## Implementation Details\n\n### Block Metadata Tracking\n\n```python\n# Core block metadata structure\n@dataclass\nclass Block:\n    block_id: int\n    device: Device\n    block_size: int\n    num_tokens: int = 0\n    \n    # Physical location\n    physical_block_id: Optional[int] = None\n    \n    # Content tracking\n    content_hash: Optional[int] = None\n    computed: bool = False\n    \n    # Reference counting for shared blocks\n    ref_count: int = 0\n```\n\n### Attention Computation Flow\n\n```mermaid\ngraph TD\n    subgraph \"Pre-computation\"\n        Q[Query Tensors] --> QS[Query Split]\n        K[Key Tensors] --> KS[Key Split]\n        V[Value Tensors] --> VS[Value Split]\n    end\n    \n    subgraph \"Paged Attention\"\n        QS --> AL[Attention Layer]\n        KS --> AL\n        VS --> AL\n        AL --> PT[Page Table Lookup]\n    end\n    \n    subgraph \"Output\"\n        PT --> OUT[Attention Output]\n        OUT --> SOFTMAX[Softmax]\n        SOFTMAX --> GEMM[Final GEMM]\n    end\n```\n\n## Best Practices\n\n### Memory Configuration\n\n1. **Set appropriate GPU memory utilization** based on other memory needs (model weights, activations)\n2. **Adjust block size** for typical sequence lengths (larger blocks = less overhead for long sequences)\n3. **Monitor fragmentation** using vLLM metrics\n\n### Request Optimization\n\n1. **Use consistent prefixes** to enable block sharing across requests\n2. **Batch similar requests** to maximize cache hit rates\n3. 
**Set appropriate max sequence length** to avoid excessive block allocation\n\n### Debugging\n\n```bash\n# Enable verbose logging\nexport VLLM_LOGGING_LEVEL=DEBUG\n\n# Check engine and cache metrics (Prometheus format)\ncurl http://localhost:8000/metrics\n```\n\n## Related Components\n\n| Component | File | Role |\n|-----------|------|------|\n| Attention Backend | `vllm/attention/backends/` | Pluggable attention implementations |\n| Scheduler | `vllm/v1/core/sched/scheduler.py` | Coordinates cache allocation with scheduling |\n| Model Runner | `vllm/v1/worker/gpu_model_runner.py` | Executes attention with managed KV cache |\n| Worker | `vllm/v1/worker/gpu_worker.py` | GPU-side cache management |\n\n## References\n\n- vLLM Paper: [\"Efficient Memory Management for Large Language Model Serving with PagedAttention\"](https://arxiv.org/abs/2309.06180)\n- Original Implementation: [csrc/attention/paged_attention_v1.cu]()\n- Optimized Implementation: [csrc/attention/paged_attention_v2.cu]()\n- Design Documentation: [docs/design/paged_attention.md]()\n- KV Cache Manager: [vllm/v1/core/kv_cache_manager.py]()\n- Block Pool: [vllm/v1/core/block_pool.py]()\n\n---\n\n<a id='page-7'></a>\n\n## Attention Backends and Kernels\n\n### Related Pages\n\nRelated topics: [PagedAttention and KV Cache Management](#page-6), [Quantization Support](#page-8)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [vllm/v1/attention/backends/flash_attn.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/flash_attn.py)\n- [vllm/v1/attention/backends/flashinfer.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/flashinfer.py)\n- [vllm/v1/attention/backends/mla/flashmla.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/mla/flashmla.py)\n- [vllm/v1/attention/backends/registry.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/registry.py)\n- [docs/design/attention_backends.md](https://github.com/vllm-project/vllm/blob/main/docs/design/attention_backends.md)\n</details>\n\n# Attention Backends and Kernels\n\n## Overview\n\nThe Attention Backends and Kernels system in vLLM provides a pluggable, hardware-accelerated implementation of attention mechanisms for Large Language Model (LLM) inference. This abstraction layer enables vLLM to leverage different optimized attention implementations (FlashAttention, FlashInfer, FlashMLA) depending on hardware capabilities and model requirements, while maintaining a unified interface for the rest of the engine.\n\n## Architecture\n\n### High-Level Design\n\nvLLM implements a backend registry pattern that allows runtime selection of the optimal attention implementation. 
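\n\nThe core idea behind the registry can be sketched in a few lines of Python. This is a simplified stand-in rather than the actual API of `vllm/v1/attention/backends/registry.py`; the decorator, dictionary, and backend class stubs below are illustrative assumptions.\n\n```python\n# Minimal registry-pattern sketch; not vLLM's actual registry API.\n_ATTENTION_BACKENDS: dict[str, type] = {}\n\n\ndef register_backend(name: str):\n    def decorator(cls: type) -> type:\n        _ATTENTION_BACKENDS[name] = cls\n        return cls\n    return decorator\n\n\n@register_backend(\"FLASH_ATTN\")\nclass FlashAttentionBackend:\n    ...\n\n\n@register_backend(\"FLASHINFER\")\nclass FlashInferBackend:\n    ...\n\n\ndef get_backend(name: str) -> type:\n    try:\n        return _ATTENTION_BACKENDS[name]\n    except KeyError:\n        raise ValueError(f\"Unknown attention backend: {name}\") from None\n```\n\n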
The attention system is designed to support both the legacy v0 engine and the optimized v1 engine architecture.\n\n```mermaid\ngraph TD\n    A[Attention Interface] --> B[Attention Backends Registry]\n    B --> C[FlashAttention Backend]\n    B --> D[FlashInfer Backend]\n    B --> E[FlashMLA Backend]\n    C --> F[CUDA Kernels]\n    D --> G[Template-based Kernels]\n    E --> H[MLA-optimized Kernels]\n    \n    I[Model Request] --> J[Scheduler]\n    J --> K[Attention Layer]\n    K --> A\n```\n\n### Backend Selection Flow\n\nThe registry-based architecture enables automatic backend selection based on hardware and configuration:\n\n```mermaid\ngraph TD\n    A[Engine Initialization] --> B{Check user-specified backend}\n    B -->|Specified| C[Validate backend compatibility]\n    B -->|Auto| D{Detect GPU Architecture}\n    D -->|H100/H200| E[Select FlashAttention3]\n    D -->|A100/A10| F[Select FlashAttention2]\n    D -->|Other| G[Select FlashAttention2]\n    C -->|Valid| H[Initialize Backend]\n    C -->|Invalid| I[Raise Configuration Error]\n    E --> H\n    F --> H\n    G --> H\n```\n\n## Attention Backend Components\n\n### Backend Registry\n\nThe `registry.py` module provides the central factory for attention backend selection:\n\n| Component | Responsibility |\n|-----------|----------------|\n| `AttentionBackend` | Abstract base class defining the backend interface |\n| `get_available_backends()` | Returns list of compiled/available backends |\n| `get_backend(name)` | Retrieves a specific backend by name |\n| `AttentionImpl` | Concrete implementation wrapper per backend |\n\n### FlashAttention Backend\n\nLocated at `vllm/v1/attention/backends/flash_attn.py`, this backend provides:\n\n- FlashAttention 2/3 optimized CUDA kernels\n- Support for head dimensions 64, 80, 96, 128, 160, 192, 256\n- Paged attention integration\n- Cross-attention support for encoder-decoder models\n\n**Key Features:**\n\n| Feature | Description |\n|---------|-------------|\n| Fused kernels | Combines attention computation steps |\n| Dynamic persistent memory | Reuses KV cache blocks efficiently |\n| Strided bias support | Enables sliding window attention |\n\n### FlashInfer Backend\n\nLocated at `vllm/v1/attention/backends/flashinfer.py`, this backend offers:\n\n- Template-based kernel generation for flexibility\n- Improved performance for specific head dimensions\n- Better integration with vLLM's block manager\n- Support for custom attention patterns\n\n### FlashMLA Backend\n\nLocated at `vllm/v1/attention/backends/mla/flashmla.py`, this backend is optimized for:\n\n- **Multi-head Latent Attention (MLA)** architectures\n- Low-rank KV cache compression\n- Reduced memory bandwidth usage\n- Optimized for DeepSeek-style models\n\n## Attention Interface\n\n### Core Abstraction\n\nAll attention backends implement a common interface defined by the `Attention` class:\n\n```python\nclass Attention:\n    def __init__(\n        self,\n        num_heads: int,\n        head_size: int,\n        scale: float,\n        num_kv_heads: int,\n        alibi_slopes: Optional[List[float]],\n        cache_config: Optional[CacheConfig],\n        block_size: int,\n    )\n    \n    def forward(\n        self,\n        query: torch.Tensor,\n        key: torch.Tensor,\n        value: torch.Tensor,\n        kv_cache: Optional[torch.Tensor],\n        attn_metadata: AttentionMetadata,\n    ) -> torch.Tensor\n```\n\n### Attention Metadata\n\nThe `AttentionMetadata` structure carries runtime information required for attention computation:\n\n| Field | 
Type | Purpose |\n|-------|------|---------|\n| `seq_lens` | List[int] | Sequence lengths for each request |\n| `max_seq_len` | int | Maximum sequence length in batch |\n| `block_tables` | torch.Tensor | Paged attention block mappings |\n| `seq_start_idx` | int | Starting index in output buffer |\n\n## Paged Attention Integration\n\nvLLM's attention system integrates tightly with paged memory management:\n\n```mermaid\ngraph LR\n    A[Logical KV Cache] --> B[Physical KV Blocks]\n    C[Query Tensors] --> D[Block Lookup]\n    D --> E[Attention Computation]\n    F[Block Table] --> D\n    B --> E\n    E --> G[Output Tensors]\n```\n\n### Block Table Structure\n\n| Block Index | Physical Address | Valid Length |\n|-------------|------------------|--------------|\n| 0 | 15 | 16 |\n| 1 | 23 | 16 |\n| 2 | 7 | 16 |\n| 3 | -1 (pending) | 0 |\n\n## Backend Configuration\n\n### Runtime Selection\n\nBackends can be selected through multiple mechanisms:\n\n| Method | Priority | Example |\n|--------|----------|---------|\n| Environment variable | Lowest | `VLLM_ATTENTION_BACKEND=FLASHINFER` |\n| Model config | Medium | `\"attention_backend\": \"flash_attn\"` |\n| CLI argument | Highest | `--attention-backend flashinfer` |\n\n### Configuration Parameters\n\n| Parameter | Backend | Description |\n|-----------|---------|-------------|\n| `head_size` | All | Dimension of each attention head |\n| `num_kv_heads` | All | Number of key/value heads (for GQA) |\n| `scale` | All | Attention scaling factor |\n| `sliding_window` | FlashAttn | Sliding window size |\n| `page_size` | FlashInfer | KV cache block size |\n\n## Performance Characteristics\n\n### Memory Efficiency\n\n| Backend | KV Cache Format | Memory Overhead |\n|---------|-----------------|------------------|\n| FlashAttention | Standard | Baseline |\n| FlashInfer | Blocked | Similar to FlashAttn |\n| FlashMLA | Low-rank | 50-70% reduction |\n\n### Throughput Optimization\n\n- **Kernel Fusion**: Combines multiple operations to reduce memory bandwidth\n- **Persistent RNN**: Reuses computation across decode steps\n- **Async Execution**: Overlaps attention with other operations\n\n## Implementation Details\n\n### FlashAttention Kernel Flow\n\n```mermaid\nsequenceDiagram\n    participant Q as Query\n    participant K as Key\n    participant V as Value\n    participant Kernel as FlashAttn Kernel\n    participant Cache as KV Cache\n    \n    Q->>Kernel: Load Q tiles\n    K->>Kernel: Load K tiles\n    V->>Kernel: Load V tiles\n    Kernel->>Cache: Update (if write)\n    Kernel->>Kernel: Compute attention scores\n    Kernel->>Cache: Read (if read)\n    Kernel->>Kernel: softmax & scale\n    Kernel-->>Q: Output attention\n```\n\n### Backend Initialization Sequence\n\n```mermaid\ngraph TD\n    A[EngineArgs parse] --> B[Attention backend selection]\n    B --> C[Backend registry lookup]\n    C --> D[Load backend module]\n    D --> E[Initialize CUDA streams]\n    E --> F[Allocate persistent buffers]\n    F --> G[Register attention layers]\n    G --> H[Ready for inference]\n```\n\n## Extending Attention Backends\n\nTo implement a custom attention backend:\n\n1. **Create backend class**: Inherit from `AttentionBackend`\n2. **Implement required methods**: `get_name()`, `get_impl()`\n3. **Register backend**: Add to `ATTENTION_BACKENDS` registry\n4. 
**Implement kernels**: Write CUDA/C++ kernels or wrap existing libraries\n\n```python\nclass CustomAttentionBackend(AttentionBackend):\n    @staticmethod\n    def get_name() -> str:\n        return \"custom_attention\"\n    \n    @staticmethod\n    def get_impl() -> AttentionImpl:\n        return CustomAttentionImpl\n```\n\n## Troubleshooting\n\n### Common Issues\n\n| Issue | Cause | Solution |\n|-------|-------|----------|\n| CUDA out of memory | Large batch/sequence | Reduce `gpu_memory_utilization` |\n| Incorrect outputs | Wrong backend selected | Verify with `--attention-backend` flag |\n| Kernel launch failure | Unsupported head size | Use supported head dimensions |\n| Slow inference | Suboptimal backend | Benchmark available backends |\n\n### Debugging Tips\n\n- Set `VLLM_LOGGING_LEVEL=DEBUG` for attention kernel timing\n- Use `VLLM_ATTENTION_BACKEND=FLASH_ATTN` to force specific backend\n- Check `nvidia-smi` for kernel execution times\n\n## Summary\n\nThe Attention Backends and Kernels system provides vLLM's computational core for transformer attention. Through the registry pattern, it enables seamless switching between optimized implementations while maintaining a stable interface for the rest of the engine. The pluggable architecture supports both established backends (FlashAttention, FlashInfer) and specialized implementations (FlashMLA) optimized for specific model architectures.\n\n---\n\n<a id='page-8'></a>\n\n## Quantization Support\n\n### 相关页面\n\n相关主题：[Model Architecture Support](#page-10), [Attention Backends and Kernels](#page-7)\n\n<details>\n<summary>Related Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/model_executor/layers/quantization/fp8.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/fp8.py)\n- [vllm/model_executor/layers/quantization/base_config.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/base_config.py)\n- [vllm/model_executor/layers/quantization/gguf.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/gguf.py)\n- [docs/features/quantization/README.md](https://github.com/vllm-project/vllm/blob/main/docs/features/quantization/README.md)\n- [csrc/quantization](https://github.com/vllm-project/vllm/blob/main/csrc/quantization)\n</details>\n\n# Quantization Support\n\n## Overview\n\nvLLM provides comprehensive quantization support to reduce model memory footprint and accelerate inference through various quantization schemes. Quantization compresses model weights from higher precision (typically FP16/BF16) to lower precision formats (such as INT8, INT4, or FP8), enabling larger models to run on limited GPU resources while maintaining acceptable accuracy.\n\nThe quantization system in vLLM is designed with a modular, extensible architecture that supports multiple quantization methods through a common abstraction layer. 
This allows users to easily switch between different quantization schemes or implement custom quantization strategies.\n\n资料来源：[docs/features/quantization/README.md]()\n\n## Architecture\n\n### Quantization System Components\n\nThe quantization system consists of several interconnected components:\n\n```mermaid\ngraph TD\n    A[Model Loading] --> B[QuantizationConfig]\n    B --> C[QuantizationMethod]\n    C --> D[Quantized Linear Layers]\n    C --> E[Quantized Embedding Layers]\n    D --> F[CUDA/ROCm Kernels]\n    E --> F\n    G[Weight Loading] --> H[Pre-quantized Weights]\n    G --> I[On-the-fly Quantization]\n```\n\n### Quantization Base Architecture\n\nThe base quantization architecture defines a common interface that all quantization methods must implement:\n\n```mermaid\nclassDiagram\n    class QuantizationConfig {\n        +get_supported_methods() List[QuantizationMethods]\n        +get_override_quant_config() Optional[dict]\n        +get_quant_config() dict\n        +verify_quant_config() None\n    }\n    \n    class QuantizationMethods {\n        <<enumeration>>\n        FP8\n        GGUF\n        AWQ\n        GPTQ\n        QUANTITY\n    }\n    \n    class QuantizedLinear {\n        <<interface>>\n        +create_weights()\n        +apply_weights()\n    }\n    \n    QuantizationConfig --> QuantizationMethods\n    QuantizationMethods --> QuantizedLinear\n```\n\n资料来源：[vllm/model_executor/layers/quantization/base_config.py]()\n\n### Layer Structure\n\nEach quantization implementation defines its own quantized layer classes:\n\n| Layer Type | Purpose | Quantization Scope |\n|------------|---------|-------------------|\n| `QuantizedLinear` | Matrix multiplication with quantized weights | Weight-only or Activation+Weight |\n| `QuantizedEmbedding` | Lookup table with quantized weights | Weight-only |\n| `QuantizedMoE` | Mixture-of-Experts with quantized components | Per-expert quantization |\n\n资料来源：[vllm/model_executor/layers/quantization/fp8.py]()\n\n## Supported Quantization Methods\n\n### FP8 (8-bit Floating Point)\n\nFP8 quantization uses 8-bit floating point representation with two formats:\n\n| Format | Exponent Bits | Mantissa Bits | Use Case |\n|--------|--------------|---------------|----------|\n| E4M3 | 4 | 3 | Activations and weights |\n| E5M2 | 5 | 2 | Gradients and optimizer states |\n\nFP8 quantization is particularly well-supported in vLLM with optimized CUDA kernels for inference acceleration.\n\n资料来源：[vllm/model_executor/layers/quantization/fp8.py]()\n\n### GGUF (GPT-Generated Unified Format)\n\nGGUF is a quantized model format commonly used with llama.cpp. 
vLLM supports loading GGUF-quantized models directly:\n\n```python\n# Load GGUF quantized model\nllm = LLM(model=\"unsloth/Qwen3-0.6B-GGUF:Q4_K_M\")\n```\n\nGGUF supports multiple quantization levels:\n\n| Quantization Type | Bits per Parameter | Description |\n|-------------------|---------------------|-------------|\n| Q2_K | ~2.5 | 2-bit quantization with 4-bit key-values |\n| Q3_K | ~3.5 | 3-bit quantization with 4-bit key-values |\n| Q4_K | ~4.5 | 4-bit quantization with 8-bit key-values |\n| Q5_K | ~5.5 | 5-bit quantization with 8-bit key-values |\n| Q6_K | ~6.5 | 6-bit quantization |\n| Q8_0 | ~8.0 | 8-bit quantization (baseline) |\n\n资料来源：[vllm/model_executor/layers/quantization/gguf.py]()\n\n### AWQ (Activation-Aware Weight Quantization)\n\nAWQ identifies weights with significant activation contributions and preserves them at higher precision while quantizing others aggressively.\n\n### GPTQ (Generative Pre-trained Transformer Quantization)\n\nGPTQ performs post-training quantization with optional layer-wise precision adjustment and GPU optimization.\n\n## Quantization Configuration\n\n### Configuration Parameters\n\n| Parameter | Type | Description | Default |\n|-----------|------|-------------|---------|\n| `quantization` | str | Quantization method name | None |\n| `quantization_param_path` | str | Path to quantization parameters file | None |\n| `dtype` | str | Model precision (if not pre-quantized) | auto |\n| `kv_cache_dtype` | str | KV cache quantization format | auto |\n\n### Enabling Quantization\n\nQuantization is enabled through the `--quantization` CLI argument or `quantization` parameter in `AsyncEngineArgs`:\n\n```bash\nvllm serve model/path --quantization fp8\n```\n\n```python\nfrom vllm import LLM, SamplingParams\n\nllm = LLM(\n    model=\"meta-llama/Llama-2-7b-hf\",\n    quantization=\"fp8\",\n    gpu_memory_utilization=0.9\n)\n```\n\n### Mixed Quantization\n\nSome layers may remain in higher precision when quantization cannot be applied uniformly:\n\n| Layer Type | Quantization Behavior |\n|------------|----------------------|\n| Input/Output embeddings | Often kept in FP16/BF16 |\n| Output projection | May use different precision |\n| Attention softmax | Usually in FP32 for stability |\n\n## Implementation Details\n\n### Weight Loading Pipeline\n\n```mermaid\ngraph LR\n    A[Model Checkpoint] --> B{Pre-quantized?}\n    B -->|Yes| C[Load Quantized Weights]\n    B -->|No| D[Dynamic Quantization]\n    C --> E[Apply Quantization Config]\n    D --> E\n    E --> F[Initialize Quantized Layer]\n    F --> G[Verify Weight Shape]\n    G --> H[CUDA Kernel Ready]\n```\n\n### C++ Backend (cuBLAS/cutlass)\n\nHigh-performance quantization kernels are implemented in C++ and CUDA:\n\n| Component | Location | Purpose |\n|-----------|----------|---------|\n| FP8 GEMM | `csrc/quantization/fp8/` | FP8 matrix multiplication |\n| W8A8 GEMM | `csrc/quantization/fp8/` | INT8 weight, INT8 activation |\n| W4A16 GEMM | `csrc/quantization/` | INT4 weight, FP16 activation |\n| Dequantization | `csrc/quantization/` | Convert quantized to compute dtype |\n\n资料来源：[csrc/quantization]()\n\n### Quantized Linear Layer Implementation\n\nThe `QuantizedLinear` layer handles the core computation:\n\n```python\nclass QuantizedLinear(QuantizedLayer):\n    \"\"\"Base class for quantized linear layers.\"\"\"\n    \n    def __init__(\n        self,\n        input_size: int,\n        output_size: int,\n        quantization_config: QuantizationConfig,\n        bias: bool = False,\n    ):\n        
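# Concrete subclasses use these attributes in create_weights() to allocate\n        # their packed, quantized weight tensors.\n        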
self.input_size = input_size\n        self.output_size = output_size\n        self.quantization_config = quantization_config\n        \n    def create_weights(self):\n        \"\"\"Initialize quantized weight tensors.\"\"\"\n        raise NotImplementedError\n        \n    def forward(self, input_):\n        \"\"\"Forward pass with quantized computation.\"\"\"\n        raise NotImplementedError\n```\n\n资料来源：[vllm/model_executor/layers/quantization/base_config.py]()\n\n## Usage Examples\n\n### Loading Pre-quantized Models\n\n```python\nfrom vllm import LLM\n\n# FP8 quantized model\nllm_fp8 = LLM(\n    model=\"meta-llama/Llama-2-70b-hf\",\n    quantization=\"fp8\",\n    tensor_parallel_size=4\n)\n\n# GGUF quantized model\nllm_gguf = LLM(\n    model=\"TheBloke/Llama-2-70B-Chat-GGUF\",\n    quantization=\"gguf\",\n    tokenizer=\"meta-llama/Llama-2-70b-chat\"\n)\n```\n\n### Quantization with Different Precisions\n\n```python\n# Load with specific precision settings\nllm = LLM(\n    model=\"Qwen/Qwen2.5-72B-Instruct\",\n    quantization=\"fp8\",\n    dtype=\"half\",  # Compute precision\n    kv_cache_dtype=\"fp8_e4m3\"  # KV cache precision\n)\n```\n\n### CLI Usage\n\n```bash\n# Serve with FP8 quantization\nvllm serve meta-llama/Llama-2-70b-hf \\\n    --quantization fp8 \\\n    --tensor-parallel-size 4 \\\n    --gpu-memory-utilization 0.95\n\n# Serve GGUF model directly\nvllm serve TheBloke/Llama-2-7B-Chat-GGUF:Q4_K_M\n```\n\n## Performance Considerations\n\n### Memory Reduction\n\n| Quantization | Memory Reduction | Quality Impact |\n|--------------|-------------------|----------------|\n| FP8 (E4M3) | ~50% | Minimal |\n| INT8 | ~50% | Low |\n| INT4 | ~75% | Moderate |\n| INT2 | ~87.5% | High |\n\n### Throughput Impact\n\nQuantization improves throughput through:\n\n1. **Increased batch size**: More sequences fit in GPU memory\n2. **Higher memory bandwidth utilization**: Smaller weights load faster\n3. **Accelerated compute**: INT8/FP8 operations on tensor cores\n\n### Accuracy Considerations\n\n- **Post-training quantization (PTQ)**: May introduce accuracy degradation\n- **Activation-aware methods (AWQ)**: Better preserves model capabilities\n- **Calibration**: Some methods require calibration data for optimal accuracy\n\n## Extension Points\n\n### Custom Quantization Methods\n\nTo implement a custom quantization method, extend the base classes:\n\n```python\nfrom vllm.model_executor.layers.quantization import QuantizationConfig\n\nclass CustomQuantizationConfig(QuantizationConfig):\n    \"\"\"Custom quantization configuration.\"\"\"\n    \n    @staticmethod\n    def get_name() -> str:\n        return \"custom_quant\"\n    \n    @staticmethod\n    def get_supported_methods(cls) -> list[str]:\n        return [\"weight_only\"]\n    \n    def get_quant_config(self) -> dict:\n        return {\"method\": self.quant_method}\n```\n\n### Registering Custom Quantization\n\nCustom quantizations must be registered in the quantization registry to be discoverable at runtime.\n\n## Summary\n\nvLLM's quantization support provides a flexible, extensible system for serving large language models with reduced memory footprint. 
The architecture separates concerns through well-defined interfaces, allowing seamless integration of new quantization methods while maintaining high performance through optimized CUDA kernels.\n\nKey takeaways:\n- Multiple quantization formats supported (FP8, GGUF, AWQ, GPTQ)\n- Modular architecture enables easy extension\n- Optimized C++/CUDA kernels for inference acceleration\n- Simple API through CLI and Python interface\n- Memory reduction up to 75% with INT4 quantization\n\n---\n\n<a id='page-9'></a>\n\n## Distributed Inference and Parallelism\n\n### 相关页面\n\n相关主题：[Scheduling and Request Processing](#page-5), [PagedAttention and KV Cache Management](#page-6)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/distributed/parallel_state.py](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/parallel_state.py)\n- [vllm/distributed/device_communicators/cuda_communicator.py](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/device_communicators/cuda_communicator.py)\n- [vllm/distributed/kv_transfer/kv_connector/base.py](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/kv_transfer/kv_connector/base.py)\n- [vllm/config/parallel.py](https://github.com/vllm-project/vllm/blob/main/vllm/config/parallel.py)\n- [docs/serving/parallelism_scaling.md](https://github.com/vllm-project/vllm/blob/main/docs/serving/parallelism_scaling.md)\n- [vllm/distributed/kv_transfer/kv_transfer_engine.py](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/kv_transfer/kv_transfer_engine.py)\n- [vllm/entrypoints/llm.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py)\n</details>\n\n# Distributed Inference and Parallelism\n\nvLLM provides comprehensive support for distributed inference and parallelism, enabling efficient serving of large language models across multiple GPUs and nodes. This document covers the architecture, configuration, and implementation details of vLLM's distributed computing capabilities.\n\n## Overview\n\nvLLM's distributed inference system enables horizontal scaling of LLM workloads by distributing model computation across multiple devices. The system supports multiple parallelism strategies, including tensor parallelism, pipeline parallelism, and data parallelism, along with specialized features like disaggregated prefill/decode and KV cache transfer.\n\nThe core components of distributed inference in vLLM include:\n\n| Component | Purpose |\n|-----------|---------|\n| Parallel State Manager | Coordinates process groups for distributed communication |\n| Device Communicators | Handle low-level tensor communication (NCCL, CUDA) |\n| KV Transfer System | Enables disaggregated prefill/decode architectures |\n| Configuration System | Manages parallelism parameters and device placement |\n\n## Parallelism Strategies\n\nvLLM supports three primary parallelism strategies, each addressing different aspects of distributed computation.\n\n### Tensor Parallelism (TP)\n\nTensor parallelism splits individual weight matrices across multiple GPUs, allowing computation of large matrices that would not fit in a single device's memory. 
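\n\nAs an editorial illustration (not vLLM's own code), the sketch below shows the basic idea behind column-wise weight sharding: each rank keeps one slice of the weight matrix, computes the matching slice of the output, and the slices are then combined with a collective such as all-gather. The function name and shapes are hypothetical.\n\n```python\n# Editorial sketch of column-parallel sharding; assumes torch.distributed\n# has already been initialized (e.g. via torchrun) with world_size ranks.\n# In a real engine each rank would load only its shard from the checkpoint.\nimport torch\nimport torch.distributed as dist\n\n\ndef column_parallel_linear(x: torch.Tensor, full_weight: torch.Tensor) -> torch.Tensor:\n    \"\"\"Multiply by this rank's column shard, then gather all output slices.\"\"\"\n    rank, world_size = dist.get_rank(), dist.get_world_size()\n    # Shard the output dimension of the [out, in] weight across ranks.\n    shard = torch.chunk(full_weight, world_size, dim=0)[rank]  # [out / world, in]\n    partial = x @ shard.t()                                    # [batch, out / world]\n    slices = [torch.empty_like(partial) for _ in range(world_size)]\n    dist.all_gather(slices, partial)\n    return torch.cat(slices, dim=-1)                           # [batch, out]\n```\n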
\nTensor parallelism is particularly effective for dense layers like attention and feed-forward networks.\n\nTensor parallelism requires:\n- NVIDIA GPUs with NCCL support\n- High-bandwidth interconnects (NVLink preferred)\n- Enough aggregate GPU memory across the tensor-parallel group to hold the model, since each rank stores only its 1/TP shard of the partitioned weights\n\n### Pipeline Parallelism (PP)\n\nPipeline parallelism distributes layers (stages) of the model across different GPUs or nodes. This approach reduces memory requirements per device while maintaining high GPU utilization through micro-batch pipelining.\n\n### Data Parallelism (DP)\n\nData parallelism replicates the entire model across multiple GPUs, with each replica processing different batches of requests. This is the simplest form of parallelism and scales throughput linearly with the number of replicas.\n\n## Parallel State Management\n\nThe `ParallelState` class in `vllm/distributed/parallel_state.py` is the central coordinator for distributed execution.\n\n```mermaid\ngraph TD\n    A[ParallelState] --> B[Tensor Parallel Group]\n    A --> C[Pipeline Parallel Group]\n    A --> D[Data Parallel Group]\n    A --> E[World Communicator]\n    B --> F[Rank 0, 1, 2, 3]\n    C --> G[Stage 0, Stage 1]\n    D --> H[Replica 1, Replica 2]\n```\n\n### Process Group Initialization\n\nParallel state is initialized through `init_distributed_environment()`, which creates the necessary process groups for communication.\n\n```python\n# From vllm/distributed/parallel_state.py:45-78\ndef init_distributed_environment(\n    rank: int,\n    world_size: int,\n    local_rank: int,\n    init_method: str = \"env://\",\n    backend: str = \"nccl\"\n):\n    # Initialize distributed context\n    torch.distributed.init_process_group(\n        backend=backend,\n        init_method=init_method,\n        rank=rank,\n        world_size=world_size\n    )\n```\n\n### Rank and World Size Management\n\n| Parameter | Description |\n|-----------|-------------|\n| `rank` | Unique identifier for each process in the distributed group |\n| `world_size` | Total number of processes in the distributed group |\n| `local_rank` | Rank of the process within its local node |\n\n资料来源：[vllm/distributed/parallel_state.py:45-78](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/parallel_state.py)\n\n## Device Communication\n\n### CUDA Communicator\n\nThe `CUDACommunicator` class in `vllm/distributed/device_communicators/cuda_communicator.py` provides NCCL-based communication primitives optimized for CUDA tensors.\n\n```mermaid\ngraph LR\n    A[Tensor] -->|all_reduce| B[Aggregated Tensor]\n    A -->|broadcast| C[Same Tensor on All Ranks]\n    A -->|reduce_scatter| D[Partitioned Results]\n    E[Partial Tensors] -->|all_gather| F[Complete Tensor]\n```\n\n### Supported Communication Primitives\n\n| Primitive | Function | Use Case |\n|-----------|----------|----------|\n| `all_reduce` | Reduce tensors across all ranks | Summing partial activations across TP ranks |\n| `broadcast` | Send tensor from one rank to all | Distributing inputs or updated weights from a single rank |\n| `all_gather` | Collect tensors from all ranks | Output aggregation |\n| `reduce_scatter` | Reduce and partition across ranks | Sharded reduction of activations (e.g., sequence parallelism) |\n\n```python\n# From vllm/distributed/device_communicators/cuda_communicator.py:23-65\nclass CUDACommunicator:\n    def __init__(self, group: ProcessGroup):\n        self.group = group\n        self.world_size = group.size()\n        self.rank = group.rank()\n    \n    def all_reduce(self, tensor: torch.Tensor) -> torch.Tensor:\n        # In-place NCCL all-reduce across this process group\n        torch.distributed.all_reduce(tensor, group=self.group)\n        return tensor\n```\n\n资料来源：[vllm/distributed/device_communicators/cuda_communicator.py:23-65](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/device_communicators/cuda_communicator.py)\n
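\nAs a quick, hedged reference for the primitives above, the editorial example below uses plain `torch.distributed` (not vLLM's communicator classes) to show what each collective does when launched with two processes; the script name `collectives_demo.py` is hypothetical.\n\n```python\n# Editorial demo of the collectives listed above; launch with:\n#   torchrun --nproc_per_node=2 collectives_demo.py\nimport torch\nimport torch.distributed as dist\n\n\ndef main() -> None:\n    dist.init_process_group(backend=\"gloo\")  # use \"nccl\" for CUDA tensors\n    rank, world = dist.get_rank(), dist.get_world_size()\n\n    t = torch.tensor([float(rank + 1)])\n    dist.all_reduce(t)            # every rank now holds the sum (1 + 2 = 3)\n\n    buf = torch.tensor([42.0]) if rank == 0 else torch.zeros(1)\n    dist.broadcast(buf, src=0)    # rank 0's value is copied to every rank\n\n    parts = [torch.zeros(1) for _ in range(world)]\n    dist.all_gather(parts, torch.tensor([float(rank)]))  # parts == [0., 1.]\n\n    dist.destroy_process_group()\n\n\nif __name__ == \"__main__\":\n    main()\n```\n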
\n### Communication Patterns in Distributed Inference\n\n```mermaid\ngraph TD\n    subgraph \"Tensor Parallel Region\"\n        A[Attention AllReduce] --> B[FFN AllReduce]\n        B --> C[AllReduce Output]\n    end\n    \n    subgraph \"Pipeline Parallel Region\"\n        D[Send Hidden States] --> E[Receive Hidden States]\n        E --> F[Next Stage Forward Pass]\n    end\n    \n    subgraph \"Data Parallel Region\"\n        G[Synchronize KV Cache] --> H[Load Balance Requests]\n    end\n```\n\n## Disaggregated Prefill and Decode\n\nvLLM supports disaggregated prefill/decode architectures where prefill (initial prompt processing) and decode (token generation) stages run on separate GPU clusters. This enables independent scaling of prefill and decode resources.\n\n### KV Transfer Architecture\n\nThe KV transfer system enables sharing of KV cache between prefill and decode instances.\n\n```mermaid\ngraph LR\n    A[Prefill Instance] -->|KV Transfer| B[Shared Storage]\n    B -->|KV Load| C[Decode Instance]\n    \n    subgraph \"Prefill Process\"\n        A1[Tokenize Prompts] --> A2[Process Prefill]\n        A2 --> A3[Save KV Cache]\n    end\n    \n    subgraph \"Decode Process\"\n        C1[Load KV Cache] --> C2[Generate Tokens]\n        C2 --> C3[Streaming Output]\n    end\n```\n\n资料来源：[examples/disaggregated/example_connector/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/example_connector/README.md)\n\n### KV Connector Base\n\nThe `KVConnectorBase` class defines the interface for KV cache transfer implementations.\n\n```python\n# From vllm/distributed/kv_transfer/kv_connector/base.py:15-85\nclass KVConnectorBase(ABC):\n    def __init__(self, kv_transfer_config: KVTransferParams):\n        self.config = kv_transfer_config\n    \n    @abstractmethod\n    def load_kv_cache(\n        self,\n        requests: List[TransferJob],\n        scheduler: Any\n    ) -> None:\n        \"\"\"Load previously saved KV cache from external storage (decode side)\"\"\"\n        pass\n    \n    @abstractmethod\n    def save_kv_cache(\n        self,\n        blocks: List[PhysicalTokenBlock],\n        kv_pair: KVCache,\n        callback: Callable\n    ) -> TransferJob:\n        \"\"\"Save KV cache produced during prefill to external storage\"\"\"\n        pass\n```\n\n| Method | Purpose |\n|--------|---------|\n| `load_kv_cache` | Load KV cache during the decode phase |\n| `save_kv_cache` | Persist KV cache produced during the prefill phase |\n| `profile_num_available_blocks` | Determine available transfer capacity |\n\n资料来源：[vllm/distributed/kv_transfer/kv_connector/base.py:15-85](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/kv_transfer/kv_connector/base.py)\n\n### Example Connector Implementation\n\nThe `ExampleConnector` provides a reference implementation for KV transfer:\n\n```python\n# From examples/disaggregated/example_connector/\nclass ExampleConnector(KVConnectorBase):\n    def __init__(self, kv_transfer_config: KVTransferParams):\n        super().__init__(kv_transfer_config)\n        self.local_storage = \"./local_storage\"\n```\n\nThe connector workflow:\n1. **Prefill Phase**: Process prompts and save KV state to `local_storage` directory\n2. 
**Decode Phase**: Load KV state from storage and continue generation\n\n资料来源：[examples/disaggregated/example_connector/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/example_connector/README.md)\n\n## Configuration\n\n### Parallel Configuration Parameters\n\nThe `ParallelConfig` class in `vllm/config/parallel.py` manages all parallelism settings.\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `tensor_parallel_size` | int | 1 | Number of GPUs for tensor parallelism |\n| `pipeline_parallel_size` | int | 1 | Number of pipeline stages |\n| `data_parallel_size` | int | 1 | Number of data parallel replicas |\n| `data_parallel_size_per_region` | int | - | DP size for hybrid parallelism |\n| `data_parallel_master_port` | int | 29500 | Port for DP master communication |\n| `data_parallel_master_addr` | str | - | Address for DP master |\n| `numa_aware` | bool | False | Enable NUMA-aware GPU placement |\n\n```python\n# From vllm/config/parallel.py:10-45\n@dataclass\nclass ParallelConfig:\n    tensor_parallel_size: int = 1\n    pipeline_parallel_size: int = 1\n    data_parallel_size: int = 1\n    data_parallel_size_per_region: Optional[int] = None\n    data_parallel_master_port: int = 29500\n    data_parallel_master_addr: Optional[str] = None\n    data_parallel_standalone: bool = False\n    numa_aware: bool = False\n```\n\n资料来源：[vllm/config/parallel.py:10-45](https://github.com/vllm-project/vllm/blob/main/vllm/config/parallel.py)\n\n### Environment Variables\n\n| Variable | Description |\n|----------|-------------|\n| `CUDA_VISIBLE_DEVICES` | Comma-separated list of GPU IDs to use |\n| `VLLM_HOST_IP` | Host IP for distributed communication |\n| `VLLM_PORT` | Port for worker communication |\n\n### Launching Distributed Inference\n\n#### Multi-GPU Launch with `torchrun`\n\n```bash\ntorchrun --nproc_per_node=4 \\\n    --nnodes=1 \\\n    vllm/entrypoints/llm.py \\\n    --model meta-llama/Llama-2-70b-hf \\\n    --tensor-parallel-size 4\n```\n\n#### Multi-Node Launch\n\n```bash\ntorchrun --nproc_per_node=8 \\\n    --nnodes=2 \\\n    --node_rank=0 \\\n    --master_addr=10.0.0.1 \\\n    --master_port=29500 \\\n    vllm/entrypoints/llm.py \\\n    --model meta-llama/Llama-2-70b-hf \\\n    --tensor-parallel-size 4 \\\n    --pipeline-parallel-size 4\n```\n\n## Scaling Strategies\n\n### Scaling Guidelines\n\n| Model Size | GPU Memory | Recommended Configuration |\n|------------|------------|---------------------------|\n| 7B | 24GB | TP=1, DP=single node |\n| 13B | 48GB | TP=2, DP=single node |\n| 70B | 320GB+ | TP=4 or 8, PP=2+ |\n| 405B | 800GB+ | TP=8, PP=8, multi-node |\n\n资料来源：[docs/serving/parallelism_scaling.md](https://github.com/vllm-project/vllm/blob/main/docs/serving/parallelism_scaling.md)\n\n### Disaggregated Prefill Scaling\n\nFor disaggregated prefill/decode, consider:\n\n| Workload Pattern | Prefill Resources | Decode Resources |\n|-----------------|-------------------|-------------------|\n| Short prompts, many requests | Scale prefill | Scale decode |\n| Long prompts, few requests | Scale prefill with TP | Scale decode |\n| Mixed workload | Balance both | Balance both |\n\n### Scaling Best Practices\n\n1. **Start with tensor parallelism** for intra-node scaling\n2. **Add pipeline parallelism** for multi-node deployments\n3. **Use data parallelism** to increase throughput on same-stage workloads\n4. 
**Enable disaggregation** when prefill and decode have different resource needs\n\n资料来源：[docs/serving/parallelism_scaling.md](https://github.com/vllm-project/vllm/blob/main/docs/serving/parallelism_scaling.md)\n\n## Architecture Diagram\n\n```mermaid\ngraph TB\n    subgraph \"vLLM Distributed Architecture\"\n        subgraph \"Process 0\"\n            P0_M[Model Shard 0]\n            P0_S[Scheduler]\n            P0_C[Cache Engine]\n        end\n        \n        subgraph \"Process 1\"\n            P1_M[Model Shard 1]\n            P1_S[Scheduler]\n            P1_C[Cache Engine]\n        end\n        \n        subgraph \"Process N\"\n            PN_M[Model Shard N]\n            PN_S[Scheduler]\n            PN_C[Cache Engine]\n        end\n        \n        NCCL[NCCL AllReduce]\n        \n        P0_S <-->|NCCL| P1_S\n        P1_S <-->|NCCL| PN_S\n        P0_S <-->|NCCL| PN_S\n        \n        P0_M <-->|Forward Pass| P0_S\n        P1_M <-->|Forward Pass| P1_S\n        PN_M <-->|Forward Pass| PN_S\n    end\n    \n    subgraph \"External Services\"\n        KV[KV Transfer Service]\n        Redis[(Redis/Storage)]\n    end\n    \n    P0_C <-->|Save KV| Redis\n    PN_C <-->|Load KV| Redis\n    Redis <--> KV\n```\n\n## See Also\n\n- [Offline Inference Documentation](../features/structured_outputs.md)\n- [API Reference - LLM Class](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html)\n- [Sampling Parameters](https://docs.vllm.ai/en/latest/api/inference_params.html)\n- [Disaggregated Prefill Example](../disaggregated/example_connector/README.md)\n\n---\n\n<a id='page-10'></a>\n\n## Model Architecture Support\n\n### 相关页面\n\n相关主题：[Model Executor and Worker Architecture](#page-4), [Quantization Support](#page-8)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n- [vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n- [vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n- [examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n- [examples/features/structured_outputs/README.md](https://github.com/vllm-project/vllm/blob/main/examples/features/structured_outputs/README.md)\n- [README.md](https://github.com/vllm-project/vllm/blob/main/README.md)\n</details>\n\n# Model Architecture Support\n\n## Overview\n\nvLLM provides comprehensive support for various LLM architectures, enabling users to serve, fine-tune, and run inference with a wide range of transformer-based models. 
The system is designed to be architecture-agnostic while providing optimized implementations for popular model families.\n\n## Core Model Loading Architecture\n\n### Python API (Offline Inference)\n\nThe primary Python interface for running offline inference is the `LLM` class, which handles model loading and inference without requiring a separate inference server.\n\n```python\n# Basic usage example\nfrom vllm import LLM\n\nllm = LLM(model=\"Qwen/Qwen3-0.6B\")\noutput = llm.generate(\"Hello, world!\")\n```\n\n资料来源：[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n\n### CLI-based Model Serving\n\nThe vLLM CLI provides a `serve` subcommand for HTTP-based model serving:\n\n```bash\nvllm serve Qwen/Qwen3-0.6B\n```\n\nIf no model is specified, the CLI defaults to `Qwen/Qwen3-0.6B`.\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n## Supported Model Categories\n\n### Language Models\n\nvLLM supports a broad range of causal language models including:\n\n- **Decoder-only transformers**: Standard autoregressive models\n- **Mixture of Experts (MoE)**: Sparse architectures like Mixtral\n- **Multimodal models**: Models that process multiple input types\n\n### Quantization Support\n\nvLLM supports quantized models through GGUF format, enabling deployment of compressed models with reduced memory footprint.\n\nExample loading a quantized model directly from HuggingFace:\n\n```bash\n--model unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B\n```\n\n资料来源：[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n\n## Model Configuration\n\n### Generation Configuration\n\nThe `--generation-config` argument specifies where the generation config is loaded from:\n\n| Value | Source | Description |\n|-------|--------|-------------|\n| `auto` | Model path | Loads from model's configuration directory |\n| `<folder_path>` | Local folder | Loads from specified directory |\n| Not provided | vLLM defaults | Uses built-in default parameters |\n\n> If `max_new_tokens` is specified in generation config, it sets a server-wide limit on output tokens for all requests.\n\n资料来源：[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n\n### Engine Arguments\n\nModel configuration is controlled through `AsyncEngineArgs`, which processes CLI arguments and creates the model configuration:\n\n```python\nengine_args = AsyncEngineArgs.from_cli_args(args)\nmodel_config = engine_args.create_model_config()\n```\n\n资料来源：[vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n\n## Structured Outputs\n\nvLLM supports structured output generation for models that support it, including reasoning models:\n\n```bash\nvllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \\\n    --reasoning-parser deepseek_r1\n```\n\nThis enables compliance with output format constraints defined by the model.\n\n资料来源：[examples/features/structured_outputs/README.md](https://github.com/vllm-project/vllm/blob/main/examples/features/structured_outputs/README.md)\n\n## CPU Offload Support\n\nFor models that exceed available GPU memory, vLLM provides CPU offload capabilities:\n\n```bash\n--cpu-offload-gb 10\n```\n\nThis creates virtual GPU memory by offloading portions of the model to CPU RAM. 
For example, with a 24GB GPU and 10GB offload, you can effectively load a 13B model requiring ~26GB.\n\n> **Note**: This requires fast CPU-GPU interconnect for acceptable performance.\n\n资料来源：[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n\n## Model Registry Architecture\n\nThe vLLM CLI uses a modular command structure where model support is registered through subcommand modules:\n\n```python\nfor cmd_module in CMD_MODULES:\n    new_cmds = cmd_module.cmd_init()\n    for cmd in new_cmds:\n        cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd)\n```\n\nEach registered command includes validation logic to ensure model configurations are valid before execution.\n\n资料来源：[vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n\n## Serving Modes\n\n### Standard API Server\n\nThe default serving mode starts an HTTP server with OpenAI-compatible endpoints:\n\n```bash\nvllm serve <model_name>\n```\n\n### Headless Mode\n\nFor distributed deployments, headless mode skips API server initialization:\n\n```python\nif args.headless:\n    if args.api_server_count is not None and args.api_server_count > 0:\n        raise ValueError(\n            f\"--api-server-count={args.api_server_count} cannot be \"\n            \"used with --headless (no API servers are started in \"\n            \"headless mode).\"\n        )\n    args.api_server_count = 0\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n### gRPC Server Mode\n\nFor high-performance scenarios, vLLM supports gRPC-based serving:\n\n```python\nif getattr(args, \"grpc\", False):\n    from vllm.entrypoints.grpc_server import serve_grpc\n    uvloop.run(serve_grpc(args))\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n## Data Parallel Modes\n\nvLLM supports distributed model serving through multiple load balancing strategies:\n\n| Mode | Flag | Description |\n|------|------|-------------|\n| External LB | `--data-parallel-external-lb` or `--data-parallel-rank` | External load balancer manages request distribution |\n| Hybrid LB | `--data-parallel-hybrid-lb` or `--data-parallel-start-rank` | Hybrid approach with internal and external coordination |\n\nThe system auto-detects load balancing mode to set appropriate default values for `api_server_count`.\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n## Installation and Quickstart\n\nFor users getting started with model architecture support:\n\n1. Install vLLM following the [installation guide](https://docs.vllm.ai/en/latest/getting_started/installation.html)\n2. Review the [quickstart documentation](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)\n3. 
Check the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html)\n\n资料来源：[README.md](https://github.com/vllm-project/vllm/blob/main/README.md)\n\n## Architecture Flow Diagram\n\n```mermaid\ngraph TD\n    A[User Request] --> B{CLI or Python API?}\n    B -->|CLI| C[vllm serve command]\n    B -->|Python| D[LLM class instantiation]\n    C --> E[CLISubcommand processing]\n    D --> F[AsyncEngineArgs configuration]\n    E --> G[Model Registry Lookup]\n    F --> H[Model Config Creation]\n    G --> I[Load Model Architecture]\n    H --> I\n    I --> J{Quantization?}\n    J -->|GGUF| K[Load Quantized Weights]\n    J -->|BF16/FP8| L[Load Standard Weights]\n    K --> M[PagedAttention Engine]\n    L --> M\n    M --> N[Inference Execution]\n```\n\n## Key Implementation Files\n\n| Component | File Path | Purpose |\n|-----------|-----------|---------|\n| CLI Entry | `vllm/entrypoints/cli/main.py` | Main CLI dispatcher and argument parsing |\n| Serve Command | `vllm/entrypoints/cli/serve.py` | HTTP server startup and configuration |\n| Launch Layer | `vllm/entrypoints/cli/launch.py` | FastAPI-based serving layer |\n| Offline Inference | `examples/basic/offline_inference/` | Python API usage examples |\n| Structured Outputs | `examples/features/structured_outputs/` | Advanced output formatting |\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：vllm-project/vllm\n\n摘要：发现 21 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling。\n\n## 1. 安装坑 · 来源证据：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_1a71634c530044a68b9160080d55de0a | https://github.com/vllm-project/vllm/issues/42182 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 安装坑 · 来源证据：[Bug]: vLLM v1 with prefix caching: first request differs from subsequent identical requests at temperature=0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: vLLM v1 with prefix caching: first request differs from subsequent identical requests at temperature=0\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_58327949a4524ed082bd189b53f713a1 | https://github.com/vllm-project/vllm/issues/40896 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：[Usage]: How to proactively clear CPU-resident memory left behind by unloaded LoRA adapters after calling `/v1/unload_l…\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Usage]: How to proactively clear CPU-resident memory left behind by unloaded LoRA adapters after calling `/v1/unload_lora_adapter`?\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fb1461834fe34049bd05182574d3e5e5 | https://github.com/vllm-project/vllm/issues/42207 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 4. 
安装坑 · 来源证据：v0.18.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.18.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_317a03f9de4e459f9be42064c7318b2c | https://github.com/vllm-project/vllm/releases/tag/v0.18.1 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 能力坑 · 来源证据：[Feature]: Qwen3.5-Moe LoRA Support (experts)\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：[Feature]: Qwen3.5-Moe LoRA Support (experts)\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_2d068d43c6654f3cab6b48bf98dad116 | https://github.com/vllm-project/vllm/issues/40005 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 6. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:599547518 | https://github.com/vllm-project/vllm | README/documentation is current enough for a first validation pass.\n\n## 7. 运行坑 · 来源证据：v0.20.2\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：v0.20.2\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ecf37722dff6494c82b384225e34bcb0 | https://github.com/vllm-project/vllm/releases/tag/v0.20.2 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 8. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:599547518 | https://github.com/vllm-project/vllm | last_activity_observed missing\n\n## 9. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:599547518 | https://github.com/vllm-project/vllm | no_demo; severity=medium\n\n## 10. 安全/权限坑 · 存在安全注意事项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：No sandbox install has been executed yet; downstream must verify before user use.\n- 对用户的影响：用户安装前需要知道权限边界和敏感操作。\n- 建议检查：转成明确权限清单和安全审查提示。\n- 防护动作：安全注意事项必须面向用户前置展示。\n- 证据：risks.safety_notes | github_repo:599547518 | https://github.com/vllm-project/vllm | No sandbox install has been executed yet; downstream must verify before user use.\n\n## 11. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:599547518 | https://github.com/vllm-project/vllm | no_demo; severity=medium\n\n## 12. 安全/权限坑 · 来源证据：[Bug]: ngram speculative decoding changes greedy output on Qwen3-0.6B / A100\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug]: ngram speculative decoding changes greedy output on Qwen3-0.6B / A100\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_9ce279a037934e8085332120bfdaca86 | https://github.com/vllm-project/vllm/issues/41758 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 13. 
安全/权限坑 · 来源证据：v0.16.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.16.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_57ff11ff995a4809a33a38ff7504a5ef | https://github.com/vllm-project/vllm/releases/tag/v0.16.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 安全/权限坑 · 来源证据：v0.17.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.17.0\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_4592ced4fde24aa9aa808caa40e25b84 | https://github.com/vllm-project/vllm/releases/tag/v0.17.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 安全/权限坑 · 来源证据：v0.18.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.18.0\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_7ca09904f8164e94a3a1bc489d32d1ff | https://github.com/vllm-project/vllm/releases/tag/v0.18.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 16. 安全/权限坑 · 来源证据：v0.19.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.19.0\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_cdb0d7a7f491474da7b93583ec643c00 | https://github.com/vllm-project/vllm/releases/tag/v0.19.0 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 17. 安全/权限坑 · 来源证据：v0.19.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.19.1\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8c2ec43cf6f147d49f80f22bcd199e8f | https://github.com/vllm-project/vllm/releases/tag/v0.19.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 18. 安全/权限坑 · 来源证据：v0.20.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.20.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_c867dafed19f4b97978d637aea4c7308 | https://github.com/vllm-project/vllm/releases/tag/v0.20.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 19. 安全/权限坑 · 来源证据：v0.20.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.20.1\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_dc6a717c695b47838899da8c9791f907 | https://github.com/vllm-project/vllm/releases/tag/v0.20.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 20. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:599547518 | https://github.com/vllm-project/vllm | issue_or_pr_quality=unknown\n\n## 21. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:599547518 | https://github.com/vllm-project/vllm | release_recency=unknown\n\n<!-- canonical_name: vllm-project/vllm; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "vllm",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "github_repo:599547518",
          "kind": "repo",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/vllm-project/vllm"
        },
        {
          "evidence_id": "art_bebf82825fe1411683aeb7967778e694",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/vllm-project/vllm#readme"
        }
      ],
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "vllm 说明书",
      "toc": [
        "https://github.com/vllm-project/vllm 项目说明书",
        "目录",
        "vLLM Overview",
        "What is vLLM?",
        "Key Features",
        "Architecture Overview",
        "Observability",
        "Supported Features Summary",
        "Doramagic 踩坑日志"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "bd9dbe60601c986b50260f299fe279d057d7d89f",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "pyproject.toml",
      "README.md",
      "docs/.nav.yml",
      "docs/README.md",
      "docs/configuration/serve_args.md",
      "docs/configuration/optimization.md",
      "docs/configuration/conserving_memory.md",
      "docs/configuration/model_resolution.md",
      "docs/configuration/README.md",
      "docs/configuration/env_vars.md",
      "docs/configuration/engine_args.md",
      "docs/cli/run-batch.md",
      "docs/cli/chat.md",
      "docs/cli/complete.md",
      "docs/cli/.nav.yml",
      "docs/cli/json_tip.inc.md",
      "docs/cli/.meta.yml",
      "docs/cli/README.md",
      "docs/cli/serve.md",
      "docs/api/README.md",
      "docs/benchmarking/cli.md",
      "docs/benchmarking/sweeps.md",
      "docs/benchmarking/README.md",
      "docs/benchmarking/dashboard.md",
      "docs/serving/distributed_troubleshooting.md",
      "docs/serving/openai_compatible_server.md",
      "docs/serving/context_parallel_deployment.md",
      "docs/serving/expert_parallel_deployment.md",
      "docs/serving/data_parallel_deployment.md",
      "docs/serving/parallelism_scaling.md",
      "docs/serving/offline_inference.md",
      "docs/design/moe_kernel_features.md",
      "docs/design/optimization_levels.md",
      "docs/design/torch_compile.md",
      "docs/design/io_processor_plugins.md",
      "docs/design/attention_backends.md",
      "docs/design/torch_compile_multimodal.md",
      "docs/design/fused_moe_modular_kernel.md",
      "docs/design/metrics.md",
      "docs/design/vllm_ir.md"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# vllm - Doramagic AI Context Pack\n\n> 定位：安装前体验与判断资产。它帮助宿主 AI 有一个好的开始，但不代表已经安装、执行或验证目标项目。\n\n## 充分原则\n\n- **充分原则，不是压缩原则**：AI Context Pack 应该充分到让宿主 AI 在开工前理解项目价值、能力边界、使用入口、风险和证据来源；它可以分层组织，但不以最短摘要为目标。\n- **压缩策略**：只压缩噪声和重复内容，不压缩会影响判断和开工质量的上下文。\n\n## 给宿主 AI 的使用方式\n\n你正在读取 Doramagic 为 vllm 编译的 AI Context Pack。请把它当作开工前上下文：帮助用户理解适合谁、能做什么、如何开始、哪些必须安装后验证、风险在哪里。不要声称你已经安装、运行或执行了目标项目。\n\n## Claim 消费规则\n\n- **事实来源**：Repo Evidence + Claim/Evidence Graph；Human Wiki 只提供显著性、术语和叙事结构。\n- **事实最低状态**：`supported`\n- `supported`：可以作为项目事实使用，但回答中必须引用 claim_id 和证据路径。\n- `weak`：只能作为低置信度线索，必须要求用户继续核实。\n- `inferred`：只能用于风险提示或待确认问题，不能包装成项目事实。\n- `unverified`：不得作为事实使用，应明确说证据不足。\n- `contradicted`：必须展示冲突来源，不得替用户强行选择一个版本。\n\n## 它最适合谁\n\n- **AI 研究者或研究型 Agent 构建者**：README 明确围绕研究、实验或论文工作流展开。 证据：`README.md` Claim：`clm_0002` supported 0.86\n\n## 它能做什么\n\n- **命令行启动或安装流程**（需要安装后验证）：项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 证据：`AGENTS.md`, `docs/getting_started/quickstart.md` Claim：`clm_0001` supported 0.86\n\n## 怎么开始\n\n- `pip install --upgrade uv` 证据：`docs/getting_started/quickstart.md` Claim：`clm_0003` unverified 0.25\n- `curl http://localhost:8000/v1/models` 证据：`docs/getting_started/quickstart.md` Claim：`clm_0004` unverified 0.25\n- `curl http://localhost:8000/v1/completions \\` 证据：`docs/getting_started/quickstart.md` Claim：`clm_0005` unverified 0.25\n- `curl http://localhost:8000/v1/chat/completions \\` 证据：`docs/getting_started/quickstart.md` Claim：`clm_0006` unverified 0.25\n- `curl -LsSf https://astral.sh/uv/install.sh | sh` 证据：`AGENTS.md` Claim：`clm_0007` supported 0.86\n\n## 继续前判断卡\n\n- **当前建议**：先做角色匹配试用\n- **为什么**：这个项目更像角色库，核心风险是选错角色或把角色文案当执行能力；先用 Prompt Preview 试角色匹配，再决定是否沙盒导入。\n\n### 30 秒判断\n\n- **现在怎么做**：先做角色匹配试用\n- **最小安全下一步**：先用 Prompt Preview 试角色匹配；满意后再隔离导入\n- **先别相信**：角色质量和任务匹配不能直接相信。\n- **继续会触碰**：角色选择偏差、命令执行、宿主 AI 配置\n\n### 现在可以相信\n\n- **适合人群线索：AI 研究者或研究型 Agent 构建者**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- **能力存在：命令行启动或安装流程**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`AGENTS.md`, `docs/getting_started/quickstart.md` Claim：`clm_0001` supported 0.86\n- **存在 Quick Start / 安装命令线索**（supported）：可以相信项目文档出现过启动或安装入口；不要因此直接在主力环境运行。 证据：`AGENTS.md` Claim：`clm_0007` supported 0.86\n\n### 现在还不能相信\n\n- **角色质量和任务匹配不能直接相信。**（unverified）：角色库证明有很多角色，不证明每个角色都适合你的具体任务，也不证明角色能产生高质量结果。\n- **不能把角色文案当成真实执行能力。**（unverified）：安装前只能判断角色描述和任务画像是否匹配，不能证明它能在宿主 AI 里完成任务。\n- **真实输出质量不能在安装前相信。**（unverified）：Prompt Preview 只能展示引导方式，不能证明真实项目中的结果质量。\n- **宿主 AI 版本兼容性不能在安装前相信。**（unverified）：Claude、Cursor、Codex、Gemini 等宿主加载规则和版本差异必须在真实环境验证。\n- **不会污染现有宿主 AI 行为，不能直接相信。**（inferred）：Skill、plugin、AGENTS/CLAUDE/GEMINI 指令可能改变宿主 AI 的默认行为。 证据：`AGENTS.md`, `CLAUDE.md`\n- **可安全回滚不能默认相信。**（unverified）：除非项目明确提供卸载和恢复说明，否则必须先在隔离环境验证。\n- **真实安装后是否与用户当前宿主 AI 版本兼容？**（unverified）：兼容性只能通过实际宿主环境验证。\n- **项目输出质量是否满足用户具体任务？**（unverified）：安装前预览只能展示流程和边界，不能替代真实评测。\n\n### 继续会触碰什么\n\n- **角色选择偏差**：用户对任务应该由哪个专家角色处理的判断。 原因：选错角色会让 AI 从错误专业视角回答，浪费时间或误导决策。\n- **命令执行**：包管理器、网络下载、本地插件目录、项目配置或用户主目录。 原因：运行第一条命令就可能产生环境改动；必须先判断是否值得跑。 证据：`AGENTS.md`, `docs/getting_started/quickstart.md`\n- **宿主 AI 配置**：Claude/Codex/Cursor/Gemini/OpenCode 等宿主的 plugin、Skill 或规则加载配置。 原因：宿主配置会改变 AI 后续工作方式，可能和用户已有规则冲突。 证据：`AGENTS.md`, `CLAUDE.md`\n- **本地环境或项目文件**：安装结果、插件缓存、项目配置或本地依赖目录。 原因：安装前无法证明写入范围和回滚方式，需要隔离验证。 证据：`AGENTS.md`, `docs/getting_started/quickstart.md`\n- **宿主 AI 上下文**：AI Context Pack、Prompt Preview、Skill 路由、风险规则和项目事实。 原因：导入上下文会影响宿主 AI 后续判断，必须避免把未验证项包装成事实。\n\n### 最小安全下一步\n\n- **先跑 Prompt 
Preview**：先用交互式试用验证任务画像和角色匹配，不要先导入整套角色库。（适用：任何项目都适用，尤其是输出质量未知时。）\n- **只在隔离目录或测试账号试装**：避免安装命令污染主力宿主 AI、真实项目或用户主目录。（适用：存在命令执行、插件配置或本地写入线索时。）\n- **先备份宿主 AI 配置**：Skill、plugin、规则文件可能改变 Claude/Cursor/Codex 的默认行为。（适用：存在插件 manifest、Skill 或宿主规则入口时。）\n- **安装后只验证一个最小任务**：先验证加载、兼容、输出质量和回滚，再决定是否深用。（适用：准备从试用进入真实工作流时。）\n\n### 退出方式\n\n- **保留安装前状态**：记录原始宿主配置和项目状态，后续才能判断是否可恢复。\n- **准备移除宿主 plugin / Skill / 规则入口**：如果试装后行为异常，可以把宿主 AI 恢复到试装前状态。\n- **保留原始角色选择记录**：如果输出偏题，可以回到任务画像阶段重新选择角色，而不是继续沿着错误角色推进。\n- **记录安装命令和写入路径**：没有明确卸载说明时，至少要知道哪些目录或配置需要手动清理。\n- **如果没有回滚路径，不进入主力环境**：不可回滚是继续前阻断项，不应靠信任或运气继续。\n\n## 哪些只能预览\n\n- 解释项目适合谁和能做什么\n- 基于项目文档演示典型对话流程\n- 帮助用户判断是否值得安装或继续研究\n\n## 哪些必须安装后验证\n\n- 真实安装 Skill、插件或 CLI\n- 执行脚本、修改本地文件或访问外部服务\n- 验证真实输出质量、性能和兼容性\n\n## 边界与风险判断卡\n\n- **把安装前预览误认为真实运行**：用户可能高估项目已经完成的配置、权限和兼容性验证。 处理方式：明确区分 prompt_preview_can_do 与 runtime_required。 Claim：`clm_0008` inferred 0.45\n- **命令执行会修改本地环境**：安装命令可能写入用户主目录、宿主插件目录或项目配置。 处理方式：先在隔离环境或测试账号中运行。 证据：`AGENTS.md`, `docs/getting_started/quickstart.md` Claim：`clm_0009` supported 0.86\n- **待确认**：真实安装后是否与用户当前宿主 AI 版本兼容？。原因：兼容性只能通过实际宿主环境验证。\n- **待确认**：项目输出质量是否满足用户具体任务？。原因：安装前预览只能展示流程和边界，不能替代真实评测。\n- **待确认**：安装命令是否需要网络、权限或全局写入？。原因：这影响企业环境和个人环境的安装风险。\n\n## 开工前工作上下文\n\n### 加载顺序\n\n- 先读取 how_to_use.host_ai_instruction，建立安装前判断资产的边界。\n- 读取 claim_graph_summary，确认事实来自 Claim/Evidence Graph，而不是 Human Wiki 叙事。\n- 再读取 intended_users、capabilities 和 quick_start_candidates，判断用户是否匹配。\n- 需要执行具体任务时，优先查 role_skill_index，再查 evidence_index。\n- 遇到真实安装、文件修改、网络访问、性能或兼容性问题时，转入 risk_card 和 boundaries.runtime_required。\n\n### 任务路由\n\n- **命令行启动或安装流程**：先说明这是安装后验证能力，再给出安装前检查清单。 边界：必须真实安装或运行后验证。 证据：`AGENTS.md`, `docs/getting_started/quickstart.md` Claim：`clm_0001` supported 0.86\n\n### 上下文规模\n\n- 文件总数：4834\n- 重要文件覆盖：40/4834\n- 证据索引条目：80\n- 角色 / Skill 条目：79\n\n### 证据不足时的处理\n\n- **missing_evidence**：说明证据不足，要求用户提供目标文件、README 段落或安装后验证记录；不要补全事实。\n- **out_of_scope_request**：说明该任务超出当前 AI Context Pack 证据范围，并建议用户先查看 Human Manual 或真实安装后验证。\n- **runtime_request**：给出安装前检查清单和命令来源，但不要替用户执行命令或声称已执行。\n- **source_conflict**：同时展示冲突来源，标记为待核实，不要强行选择一个版本。\n\n## Prompt Recipes\n\n### 适配判断\n\n- 目标：判断这个项目是否适合用户当前任务。\n- 预期输出：适配结论、关键理由、证据引用、安装前可预览内容、必须安装后验证内容、下一步建议。\n\n```text\n请基于 vllm 的 AI Context Pack，先问我 3 个必要问题，然后判断它是否适合我的任务。回答必须包含：适合谁、能做什么、不能做什么、是否值得安装、证据来自哪里。所有项目事实必须引用 evidence_refs、source_paths 或 claim_id。\n```\n\n### 安装前体验\n\n- 目标：让用户在安装前感受核心工作流，同时避免把预览包装成真实能力或营销承诺。\n- 预期输出：一段带边界标签的体验剧本、安装后验证清单和谨慎建议；不含真实运行承诺或强营销表述。\n\n```text\n请把 vllm 当作安装前体验资产，而不是已安装工具或真实运行环境。\n\n请严格输出四段：\n1. 先问我 3 个必要问题。\n2. 给出一段“体验剧本”：用 [安装前可预览]、[必须安装后验证]、[证据不足] 三种标签展示它可能如何引导工作流。\n3. 给出安装后验证清单：列出哪些能力只有真实安装、真实宿主加载、真实项目运行后才能确认。\n4. 
给出谨慎建议：只能说“值得继续研究/试装”“先补充信息后再判断”或“不建议继续”，不得替项目背书。\n\n硬性边界：\n- 不要声称已经安装、运行、执行测试、修改文件或产生真实结果。\n- 不要写“自动适配”“确保通过”“完美适配”“强烈建议安装”等承诺性表达。\n- 如果描述安装后的工作方式，必须使用“如果安装成功且宿主正确加载 Skill，它可能会……”这种条件句。\n- 体验剧本只能写成“示例台词/假设流程”：使用“可能会询问/可能会建议/可能会展示”，不要写“已写入、已生成、已通过、正在运行、正在生成”。\n- Prompt Preview 不负责给安装命令；如用户准备试装，只能提示先阅读 Quick Start 和 Risk Card，并在隔离环境验证。\n- 所有项目事实必须来自 supported claim、evidence_refs 或 source_paths；inferred/unverified 只能作风险或待确认项。\n\n```\n\n### 角色 / Skill 选择\n\n- 目标：从项目里的角色或 Skill 中挑选最匹配的资产。\n- 预期输出：候选角色或 Skill 列表，每项包含适用场景、证据路径、风险边界和是否需要安装后验证。\n\n```text\n请读取 role_skill_index，根据我的目标任务推荐 3-5 个最相关的角色或 Skill。每个推荐都要说明适用场景、可能输出、风险边界和 evidence_refs。\n```\n\n### 风险预检\n\n- 目标：安装或引入前识别环境、权限、规则冲突和质量风险。\n- 预期输出：环境、权限、依赖、许可、宿主冲突、质量风险和未知项的检查清单。\n\n```text\n请基于 risk_card、boundaries 和 quick_start_candidates，给我一份安装前风险预检清单。不要替我执行命令，只说明我应该检查什么、为什么检查、失败会有什么影响。\n```\n\n### 宿主 AI 开工指令\n\n- 目标：把项目上下文转成一次对话开始前的宿主 AI 指令。\n- 预期输出：一段边界明确、证据引用明确、适合复制给宿主 AI 的开工前指令。\n\n```text\n请基于 vllm 的 AI Context Pack，生成一段我可以粘贴给宿主 AI 的开工前指令。这段指令必须遵守 not_runtime=true，不能声称项目已经安装、运行或产生真实结果。\n```\n\n\n## 角色 / Skill 索引\n\n- 共索引 79 个角色 / Skill / 项目文档条目。\n\n- **Welcome to vLLM**（project_doc）：! ./assets/logos/vllm-logo-text-light.png { align=\"center\" alt=\"vLLM Light\" class=\"logo-light\" width=\"60%\" } ! ./assets/logos/vllm-logo-text-dark.png { align=\"center\" alt=\"vLLM Dark\" class=\"logo-dark\" width=\"60%\" } 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/README.md`\n- **Summary**（project_doc）：API documentation for vLLM's configuration classes. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/api/README.md`\n- **Benchmark Suites**（project_doc）：vLLM provides comprehensive benchmarking tools for performance testing and evaluation: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/benchmarking/README.md`\n- **vLLM CLI Guide**（project_doc）：The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/README.md`\n- **Configuration Options**（project_doc）：This section lists the most common options for running vLLM. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/configuration/README.md`\n- **Contributing to vLLM**（project_doc）：Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/contributing/README.md`\n- **Summary**（project_doc）：!!! important Many decoder language models can now be automatically loaded using the Transformers modeling backend ../../models/supported models.md transformers without having to implement them in vLLM. See if vllm serve works first! 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/contributing/model/README.md`\n- **Examples**（project_doc）：vLLM's examples are organized into the following categories: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples/README.md`\n- **Features**（project_doc）：The tables below show mutually exclusive features and the support on some hardware. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/features/README.md`\n- **Quantization**（project_doc）：Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/features/quantization/README.md`\n- **Speculative Decoding**（project_doc）：This document shows how to use Speculative Decoding https://arxiv.org/pdf/2302.01318 with vLLM to reduce inter-token latency under medium-to-low QPS query per second , memory-bound workloads. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/features/speculative_decoding/README.md`\n- **Installation**（project_doc）：vLLM supports the following hardware platforms: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/getting_started/installation/README.md`\n- **Pooling Models**（project_doc）：!!! note We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/models/pooling_models/README.md`\n- **Weight Transfer**（project_doc）：vLLM provides a pluggable weight transfer system for synchronizing model weights from a training process to the inference engine during reinforcement learning RL workflows. This is essential for RLHF, GRPO, and other online RL methods where the policy model is iteratively updated during training and the updated weights must be reflected in the inference engine for rollout generation. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/training/weight_transfer/README.md`\n- **Using vLLM**（project_doc）：First, vLLM must be installed ../getting started/installation/README.md for your chosen device in either a Python or Docker environment. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/usage/README.md`\n- **Agent Instructions for vLLM**（project_doc）：These instructions apply to all AI-assisted contributions to vllm-project/vllm . Breaching these guidelines can result in automatic banning. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`AGENTS.md`\n- **Claude**（project_doc）： 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`CLAUDE.md`\n- **About**（project_doc）：Easy, fast, and cheap LLM serving for everyone 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`README.md`\n- **Benchmarks**（project_doc）：This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`benchmarks/README.md`\n- **vLLM benchmark suite**（project_doc）：This directory contains a benchmarking suite for developers to run locally and gain clarity on whether their PR improves/degrades vllm's performance. vLLM also maintains a continuous performance benchmark under perf.vllm.ai https://perf.vllm.ai/ , hosted under PyTorch CI HUD. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`.buildkite/performance-benchmarks/README.md`\n- **vLLM Attention Benchmarking Suite**（project_doc）：Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`benchmarks/attention_benchmarks/README.md`\n- **Automated vLLM Server Parameter Tuning**（project_doc）：Automated vLLM Server Parameter Tuning 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`benchmarks/auto_tune/README.md`\n- **DeepSeek DeepGEMM Kernels Benchmark**（project_doc）：DeepSeek DeepGEMM Kernels Benchmark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`benchmarks/kernels/deepgemm/README.md`\n- **Benchmark KV Cache Offloading with Multi-Turn Conversations**（project_doc）：Benchmark KV Cache Offloading with Multi-Turn Conversations 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`benchmarks/multi_turn/README.md`\n- **Machete Mixed Precision Cutlass-Based GEMM**（project_doc）：Machete Mixed Precision Cutlass-Based GEMM 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`csrc/quantization/machete/Readme.md`\n- **Offline Inference**（project_doc）：The LLM class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/basic/offline_inference/README.md`\n- **Helm Charts**（project_doc）：This directory contains a Helm chart for deploying the vllm application. The chart includes configurations for deployment, autoscaling, resource management, and more. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/deployment/chart-helm/README.md`\n- **Disaggregated Encoder**（project_doc）：These example scripts that demonstrate the disaggregated encoder EPD features of vLLM. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/disaggregated/disaggregated_encoder/README.md`\n- **Disaggregated Serving**（project_doc）：This example contains scripts that demonstrate the disaggregated serving features of vLLM. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/disaggregated/disaggregated_serving/README.md`\n- **Disaggregated Prefill V1**（project_doc）：This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/disaggregated/example_connector/README.md`\n- **KV Load Failure Recovery Test**（project_doc）：This example builds upon the example connector example in examples/disaggregated . 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/disaggregated/kv_load_failure_recovery_offline/README.md`\n- **LMCache Examples**（project_doc）：This folder demonstrates how to use LMCache for disaggregated prefilling, CPU offloading and KV cache sharing. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/disaggregated/lmcache/README.md`\n- **Custom Logits Processors**（project_doc）：This directory contains examples demonstrating how to use custom logits processors with vLLM's offline inference API. Logits processors allow you to modify the model's output distribution before sampling, enabling controlled generation behaviors like token masking, constrained decoding, and custom sampling strategies. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/features/logits_processor/README.md`\n- **Offline Inference with the OpenAI Batch file format**（project_doc）：Offline Inference with the OpenAI Batch file format 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/features/openai_batch/README.md`\n- **Structured Outputs**（project_doc）：This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server. It can run individual constraint type or all of them. It supports both streaming responses and concurrent non-streaming requests. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/features/structured_outputs/README.md`\n- **Qwen2.5-Omni Offline Inference Examples**（project_doc）：Qwen2.5-Omni Offline Inference Examples 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/generate/multimodal/qwen2_5_omni/README.md`\n- **Monitoring Dashboards**（project_doc）：This directory contains monitoring dashboard configurations for vLLM, providing comprehensive observability for your vLLM deployments. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/observability/dashboards/README.md`\n- **Grafana Dashboards for vLLM Monitoring**（project_doc）：Grafana Dashboards for vLLM Monitoring 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/observability/dashboards/grafana/README.md`\n- **Perses Dashboards for vLLM Monitoring**（project_doc）：Perses Dashboards for vLLM Monitoring 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/observability/dashboards/perses/README.md`\n- **Setup OpenTelemetry POC**（project_doc）：Note: The core OpenTelemetry packages opentelemetry-sdk , opentelemetry-api , opentelemetry-exporter-otlp , opentelemetry-semantic-conventions-ai are bundled with vLLM. Manual installation is not required. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/observability/opentelemetry/README.md`\n- **Prometheus and Grafana**（project_doc）：This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through Prometheus https://prometheus.io/ and Grafana https://grafana.com/ websites. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/observability/prometheus_grafana/README.md`\n- **Long Text Embedding with Chunked Processing**（project_doc）：Long Text Embedding with Chunked Processing 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/pooling/embed/openai_embedding_long_text/README.md`\n- **compile test folder structure**（project_doc）：- compile/test .py : various unit tests meant for testing particular code path/features. Future tests are most likely added here. New test files added here will be included in CI automatically - compile/fullgraph/ : full model tests, including all tests previously in compile/piecewise. These tests do not target particular features. New test files added here will be included in CI automatically - compile/distributed/… 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`tests/compile/README.md`\n- **GPQA Evaluation using GPT-OSS**（project_doc）：This directory contains GPQA evaluation tests using the GPT-OSS evaluation package and vLLM server. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`tests/evals/gpt_oss/README.md`\n- **GSM8K Accuracy Evaluation**（project_doc）：This directory contains a replacement for the lm-eval-harness GSM8K evaluation, using an isolated GSM8K script and vLLM server for better performance and control. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`tests/evals/gsm8k/README.md`\n- **MRCR Long-Context Accuracy Evaluation**（project_doc）：MRCR Long-Context Accuracy Evaluation 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`tests/evals/mrcr/README.md`\n- **EPD Correctness Test**（project_doc）：This test verifies that EPD Encoder-Prefill-Decode disaggregation produces identical outputs to a baseline single instance. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`tests/v1/ec_connector/integration/README.md`\n- **Expert parallel kernels**（project_doc）：Large-scale cluster-level expert parallel, as described in the DeepSeek-V3 Technical Report http://arxiv.org/abs/2412.19437 , is an efficient way to deploy sparse MoE models with many experts. 
However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`tools/ep_kernels/README.md`\n- **gputrc2graph.py**（project_doc）：This script processes NVIDIA Nsight Systems nsys GPU trace files .nsys-rep with -t cuda tracing enabled, and generates kernel-level summaries and visualizations of GPU and non-GPU time. It is useful for profiling and analyzing nsys profile output. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`tools/profiler/nsys_profile_tools/README.md`\n- **Distributed KV cache transfer**（project_doc）：This folder implements distributed KV cache transfer across vLLM instances. Currently the main use case is for disaggregated prefilling. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`vllm/distributed/kv_transfer/README.md`\n- **Quantization Kernel Config**（project_doc）：Use scripts under benchmarks/kernels/ to generate these config files. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`vllm/model_executor/layers/quantization/utils/configs/README.md`\n- **Experimental Model Runner V2**（project_doc）：This directory contains the new model runner which is under active development. Ping Woosuk Kwon https://github.com/WoosukKwon for any changes. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`vllm/v1/worker/gpu/README.md`\n- **Contributing to vLLM**（project_doc）：You may find information about contributing to vLLM on docs.vllm.ai https://docs.vllm.ai/en/latest/contributing . 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`CONTRIBUTING.md`\n- **Benchmark CLI**（project_doc）：This section guides you through running benchmark tests with the extensive datasets supported on vLLM. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/benchmarking/cli.md`\n- **Performance Dashboard**（project_doc）：The performance dashboard is used to confirm whether new changes improve/degrade performance under various workloads. It is updated by triggering benchmark runs on every commit with both the perf-benchmarks and ready labels, and when a PR is merged into vLLM. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/benchmarking/dashboard.md`\n- **Parameter Sweeps**（project_doc）：vllm bench sweep is a suite of commands designed to run benchmarks across multiple configurations and compare them by visualizing the results. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/benchmarking/sweeps.md`\n- **vllm bench latency**（project_doc）：--8<-- \"docs/generated/argparse/bench latency.inc.md\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/bench/latency.md`\n- **vllm bench mm-processor**（project_doc）：vllm bench mm-processor profiles the multimodal input processor pipeline of vision-language models. It measures per-stage latency from the HuggingFace processor through to the encoder forward pass, helping you identify preprocessing bottlenecks and understand how different image resolutions or item counts affect end-to-end request time. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/bench/mm_processor.md`\n- **vllm bench serve**（project_doc）：--8<-- \"docs/generated/argparse/bench serve.inc.md\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/bench/serve.md`\n- **vllm bench sweep plot**（project_doc）：--8<-- \"docs/generated/argparse/bench sweep plot.inc.md\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/bench/sweep/plot.md`\n- **vllm bench sweep plot pareto**（project_doc）：--8<-- \"docs/generated/argparse/bench sweep plot pareto.inc.md\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/bench/sweep/plot_pareto.md`\n- **vllm bench sweep serve**（project_doc）：--8<-- \"docs/generated/argparse/bench sweep serve.inc.md\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/bench/sweep/serve.md`\n- **vllm bench sweep serve workload**（project_doc）：--8<-- \"docs/generated/argparse/bench sweep serve workload.inc.md\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/bench/sweep/serve_workload.md`\n- **vllm bench throughput**（project_doc）：--8<-- \"docs/generated/argparse/bench throughput.inc.md\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/bench/throughput.md`\n- **vllm chat**（project_doc）：--8<-- \"docs/generated/argparse/chat.inc.md\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/chat.md`\n- **vllm complete**（project_doc）：--8<-- \"docs/generated/argparse/complete.inc.md\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/complete.md`\n- **Json Tip.Inc**（project_doc）：When passing JSON CLI arguments, the following sets of arguments are equivalent: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/json_tip.inc.md`\n- **vllm run-batch**（project_doc）：--8<-- \"docs/generated/argparse/run-batch.inc.md\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/run-batch.md`\n- **vllm serve**（project_doc）：--8<-- \"docs/generated/argparse/serve.inc.md\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cli/serve.md`\n- **Contact Us**（project_doc）： 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/community/contact_us.md`\n- **Meetups**（project_doc）：We host regular meetups around the world. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/community/meetups.md`\n- **Sponsors**（project_doc）：vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support! 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/community/sponsors.md`\n- **Conserving Memory**（project_doc）：Large models might cause your machine to run out of memory OOM . Here are some options that help alleviate this problem. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/configuration/conserving_memory.md`\n- **Engine Arguments**（project_doc）：Engine arguments control the behavior of the vLLM engine. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/configuration/engine_args.md`\n- **Environment Variables**（project_doc）：vLLM uses the following environment variables to configure the system: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/configuration/env_vars.md`\n- **Model Resolution**（project_doc）：vLLM loads HuggingFace-compatible models by inspecting the architectures field in config.json of the model repository and finding the corresponding implementation that is registered to vLLM. Nevertheless, our model resolution may fail for the following reasons: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/configuration/model_resolution.md`\n- **Optimization and Tuning**（project_doc）：This guide covers optimization strategies and performance tuning for vLLM V1. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/configuration/optimization.md`\n- **Server Arguments**（project_doc）：The vllm serve command is used to launch the OpenAI-compatible server. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/configuration/serve_args.md`\n- **CI Failures**（project_doc）：What should I do when a CI job fails on my PR, but I don't think my PR caused the failure? 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/contributing/ci/failures.md`\n\n## 证据索引\n\n- 共索引 80 条证据。\n\n- **Welcome to vLLM**（documentation）：! ./assets/logos/vllm-logo-text-light.png { align=\"center\" alt=\"vLLM Light\" class=\"logo-light\" width=\"60%\" } ! ./assets/logos/vllm-logo-text-dark.png { align=\"center\" alt=\"vLLM Dark\" class=\"logo-dark\" width=\"60%\" } 证据：`docs/README.md`\n- **Summary**（documentation）：API documentation for vLLM's configuration classes. 证据：`docs/api/README.md`\n- **Benchmark Suites**（documentation）：vLLM provides comprehensive benchmarking tools for performance testing and evaluation: 证据：`docs/benchmarking/README.md`\n- **vLLM CLI Guide**（documentation）：The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with: 证据：`docs/cli/README.md`\n- **Configuration Options**（documentation）：This section lists the most common options for running vLLM. 证据：`docs/configuration/README.md`\n- **Contributing to vLLM**（documentation）：Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project: 证据：`docs/contributing/README.md`\n- **Summary**（documentation）：!!! important Many decoder language models can now be automatically loaded using the Transformers modeling backend ../../models/supported models.md transformers without having to implement them in vLLM. See if vllm serve works first! 证据：`docs/contributing/model/README.md`\n- **Examples**（documentation）：vLLM's examples are organized into the following categories: 证据：`docs/examples/README.md`\n- **Features**（documentation）：The tables below show mutually exclusive features and the support on some hardware. 证据：`docs/features/README.md`\n- **Quantization**（documentation）：Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices. 证据：`docs/features/quantization/README.md`\n- **Speculative Decoding**（documentation）：This document shows how to use Speculative Decoding https://arxiv.org/pdf/2302.01318 with vLLM to reduce inter-token latency under medium-to-low QPS query per second , memory-bound workloads. 证据：`docs/features/speculative_decoding/README.md`\n- **Installation**（documentation）：vLLM supports the following hardware platforms: 证据：`docs/getting_started/installation/README.md`\n- **Pooling Models**（documentation）：!!! note We currently support pooling models primarily for convenience. This is not guaranteed to provide any performance improvements over using Hugging Face Transformers or Sentence Transformers directly. 证据：`docs/models/pooling_models/README.md`\n- **Weight Transfer**（documentation）：vLLM provides a pluggable weight transfer system for synchronizing model weights from a training process to the inference engine during reinforcement learning RL workflows. This is essential for RLHF, GRPO, and other online RL methods where the policy model is iteratively updated during training and the updated weights must be reflected in the inference engine for rollout generation. 
证据：`docs/training/weight_transfer/README.md`\n- **Using vLLM**（documentation）：First, vLLM must be installed ../getting started/installation/README.md for your chosen device in either a Python or Docker environment. 证据：`docs/usage/README.md`\n- **Agent Instructions for vLLM**（documentation）：These instructions apply to all AI-assisted contributions to vllm-project/vllm . Breaching these guidelines can result in automatic banning. 证据：`AGENTS.md`\n- **Claude**（documentation）：@AGENTS.md 证据：`CLAUDE.md`\n- **About**（documentation）：Easy, fast, and cheap LLM serving for everyone 证据：`README.md`\n- **Benchmarks**（documentation）：This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation. 证据：`benchmarks/README.md`\n- **vLLM benchmark suite**（documentation）：This directory contains a benchmarking suite for developers to run locally and gain clarity on whether their PR improves/degrades vllm's performance. vLLM also maintains a continuous performance benchmark under perf.vllm.ai https://perf.vllm.ai/ , hosted under PyTorch CI HUD. 证据：`.buildkite/performance-benchmarks/README.md`\n- **vLLM Attention Benchmarking Suite**（documentation）：Fast, flexible benchmarking for vLLM attention and MLA backends with an extended batch specification grammar. 证据：`benchmarks/attention_benchmarks/README.md`\n- **Automated vLLM Server Parameter Tuning**（documentation）：Automated vLLM Server Parameter Tuning 证据：`benchmarks/auto_tune/README.md`\n- **DeepSeek DeepGEMM Kernels Benchmark**（documentation）：DeepSeek DeepGEMM Kernels Benchmark 证据：`benchmarks/kernels/deepgemm/README.md`\n- **Benchmark KV Cache Offloading with Multi-Turn Conversations**（documentation）：Benchmark KV Cache Offloading with Multi-Turn Conversations 证据：`benchmarks/multi_turn/README.md`\n- **Machete Mixed Precision Cutlass-Based GEMM**（documentation）：Machete Mixed Precision Cutlass-Based GEMM 证据：`csrc/quantization/machete/Readme.md`\n- **Offline Inference**（documentation）：The LLM class provides the primary Python interface for doing offline inference, which is interacting with a model without using a separate model inference server. 证据：`examples/basic/offline_inference/README.md`\n- **Helm Charts**（documentation）：This directory contains a Helm chart for deploying the vllm application. The chart includes configurations for deployment, autoscaling, resource management, and more. 证据：`examples/deployment/chart-helm/README.md`\n- **Disaggregated Encoder**（documentation）：These example scripts that demonstrate the disaggregated encoder EPD features of vLLM. 证据：`examples/disaggregated/disaggregated_encoder/README.md`\n- **Disaggregated Serving**（documentation）：This example contains scripts that demonstrate the disaggregated serving features of vLLM. 证据：`examples/disaggregated/disaggregated_serving/README.md`\n- **Disaggregated Prefill V1**（documentation）：This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM. 证据：`examples/disaggregated/example_connector/README.md`\n- **KV Load Failure Recovery Test**（documentation）：This example builds upon the example connector example in examples/disaggregated . 证据：`examples/disaggregated/kv_load_failure_recovery_offline/README.md`\n- **LMCache Examples**（documentation）：This folder demonstrates how to use LMCache for disaggregated prefilling, CPU offloading and KV cache sharing. 
证据：`examples/disaggregated/lmcache/README.md`\n- **Custom Logits Processors**（documentation）：This directory contains examples demonstrating how to use custom logits processors with vLLM's offline inference API. Logits processors allow you to modify the model's output distribution before sampling, enabling controlled generation behaviors like token masking, constrained decoding, and custom sampling strategies. 证据：`examples/features/logits_processor/README.md`\n- **Offline Inference with the OpenAI Batch file format**（documentation）：Offline Inference with the OpenAI Batch file format 证据：`examples/features/openai_batch/README.md`\n- **Structured Outputs**（documentation）：This script demonstrates various structured output capabilities of vLLM's OpenAI-compatible server. It can run individual constraint type or all of them. It supports both streaming responses and concurrent non-streaming requests. 证据：`examples/features/structured_outputs/README.md`\n- **Qwen2.5-Omni Offline Inference Examples**（documentation）：Qwen2.5-Omni Offline Inference Examples 证据：`examples/generate/multimodal/qwen2_5_omni/README.md`\n- **Monitoring Dashboards**（documentation）：This directory contains monitoring dashboard configurations for vLLM, providing comprehensive observability for your vLLM deployments. 证据：`examples/observability/dashboards/README.md`\n- **Grafana Dashboards for vLLM Monitoring**（documentation）：Grafana Dashboards for vLLM Monitoring 证据：`examples/observability/dashboards/grafana/README.md`\n- **Perses Dashboards for vLLM Monitoring**（documentation）：Perses Dashboards for vLLM Monitoring 证据：`examples/observability/dashboards/perses/README.md`\n- **Setup OpenTelemetry POC**（documentation）：Note: The core OpenTelemetry packages opentelemetry-sdk , opentelemetry-api , opentelemetry-exporter-otlp , opentelemetry-semantic-conventions-ai are bundled with vLLM. Manual installation is not required. 证据：`examples/observability/opentelemetry/README.md`\n- **Prometheus and Grafana**（documentation）：This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through Prometheus https://prometheus.io/ and Grafana https://grafana.com/ websites. 证据：`examples/observability/prometheus_grafana/README.md`\n- **Long Text Embedding with Chunked Processing**（documentation）：Long Text Embedding with Chunked Processing 证据：`examples/pooling/embed/openai_embedding_long_text/README.md`\n- **compile test folder structure**（documentation）：- compile/test .py : various unit tests meant for testing particular code path/features. Future tests are most likely added here. New test files added here will be included in CI automatically - compile/fullgraph/ : full model tests, including all tests previously in compile/piecewise. These tests do not target particular features. New test files added here will be included in CI automatically - compile/distributed/ : tests that require multiple GPUs. New test files added here will NOT be included in CI automatically as these tests generally need to be manually configured to run in runners with particular number/type of GPUs. 证据：`tests/compile/README.md`\n- **GPQA Evaluation using GPT-OSS**（documentation）：This directory contains GPQA evaluation tests using the GPT-OSS evaluation package and vLLM server. 
证据：`tests/evals/gpt_oss/README.md`\n- **GSM8K Accuracy Evaluation**（documentation）：This directory contains a replacement for the lm-eval-harness GSM8K evaluation, using an isolated GSM8K script and vLLM server for better performance and control. 证据：`tests/evals/gsm8k/README.md`\n- **MRCR Long-Context Accuracy Evaluation**（documentation）：MRCR Long-Context Accuracy Evaluation 证据：`tests/evals/mrcr/README.md`\n- **EPD Correctness Test**（documentation）：This test verifies that EPD Encoder-Prefill-Decode disaggregation produces identical outputs to a baseline single instance. 证据：`tests/v1/ec_connector/integration/README.md`\n- **Expert parallel kernels**（documentation）：Large-scale cluster-level expert parallel, as described in the DeepSeek-V3 Technical Report http://arxiv.org/abs/2412.19437 , is an efficient way to deploy sparse MoE models with many experts. However, such deployment requires many components beyond a normal Python package, including system package support and system driver support. It is impossible to bundle all these components into a Python package. 证据：`tools/ep_kernels/README.md`\n- **gputrc2graph.py**（documentation）：This script processes NVIDIA Nsight Systems nsys GPU trace files .nsys-rep with -t cuda tracing enabled, and generates kernel-level summaries and visualizations of GPU and non-GPU time. It is useful for profiling and analyzing nsys profile output. 证据：`tools/profiler/nsys_profile_tools/README.md`\n- **Distributed KV cache transfer**（documentation）：This folder implements distributed KV cache transfer across vLLM instances. Currently the main use case is for disaggregated prefilling. 证据：`vllm/distributed/kv_transfer/README.md`\n- **Quantization Kernel Config**（documentation）：Use scripts under benchmarks/kernels/ to generate these config files. 证据：`vllm/model_executor/layers/quantization/utils/configs/README.md`\n- **Experimental Model Runner V2**（documentation）：This directory contains the new model runner which is under active development. Ping Woosuk Kwon https://github.com/WoosukKwon for any changes. 证据：`vllm/v1/worker/gpu/README.md`\n- **Contributing to vLLM**（documentation）：You may find information about contributing to vLLM on docs.vllm.ai https://docs.vllm.ai/en/latest/contributing . 证据：`CONTRIBUTING.md`\n- **License**（source_file）：Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ 证据：`LICENSE`\n- **Benchmark CLI**（documentation）：This section guides you through running benchmark tests with the extensive datasets supported on vLLM. 证据：`docs/benchmarking/cli.md`\n- **Performance Dashboard**（documentation）：The performance dashboard is used to confirm whether new changes improve/degrade performance under various workloads. It is updated by triggering benchmark runs on every commit with both the perf-benchmarks and ready labels, and when a PR is merged into vLLM. 证据：`docs/benchmarking/dashboard.md`\n- **Parameter Sweeps**（documentation）：vllm bench sweep is a suite of commands designed to run benchmarks across multiple configurations and compare them by visualizing the results. 证据：`docs/benchmarking/sweeps.md`\n- **vllm bench latency**（documentation）：--8<-- \"docs/generated/argparse/bench latency.inc.md\" 证据：`docs/cli/bench/latency.md`\n- **vllm bench mm-processor**（documentation）：vllm bench mm-processor profiles the multimodal input processor pipeline of vision-language models. 
It measures per-stage latency from the HuggingFace processor through to the encoder forward pass, helping you identify preprocessing bottlenecks and understand how different image resolutions or item counts affect end-to-end request time. 证据：`docs/cli/bench/mm_processor.md`\n- **vllm bench serve**（documentation）：--8<-- \"docs/generated/argparse/bench serve.inc.md\" 证据：`docs/cli/bench/serve.md`\n- 其余 20 条证据见 `AI_CONTEXT_PACK.json` 或 `EVIDENCE_INDEX.json`。\n\n## 宿主 AI 必须遵守的规则\n\n- **把本资产当作开工前上下文，而不是运行环境。**：AI Context Pack 只包含证据化项目理解，不包含目标项目的可执行状态。 证据：`docs/README.md`, `docs/api/README.md`, `docs/benchmarking/README.md`\n- **回答用户时区分可预览内容与必须安装后才能验证的内容。**：安装前体验的消费者价值来自降低误装和误判，而不是伪装成真实运行。 证据：`docs/README.md`, `docs/api/README.md`, `docs/benchmarking/README.md`\n\n## 用户开工前应该回答的问题\n\n- 你准备在哪个宿主 AI 或本地环境中使用它？\n- 你只是想先体验工作流，还是准备真实安装？\n- 你最在意的是安装成本、输出质量、还是和现有规则的冲突？\n\n## 验收标准\n\n- 所有能力声明都能回指到 evidence_refs 中的文件路径。\n- AI_CONTEXT_PACK.md 没有把预览包装成真实运行。\n- 用户能在 3 分钟内看懂适合谁、能做什么、如何开始和风险边界。\n\n---\n\n## Doramagic Context Augmentation\n\n下面内容用于强化 Repomix/AI Context Pack 主体。Human Manual 只提供阅读骨架；踩坑日志会被转成宿主 AI 必须遵守的工作约束。\n\n## Human Manual 骨架\n\n使用规则：这里只是项目阅读路线和显著性信号，不是事实权威。具体事实仍必须回到 repo evidence / Claim Graph。\n\n宿主 AI 硬性规则：\n- 不得把页标题、章节顺序、摘要或 importance 当作项目事实证据。\n- 解释 Human Manual 骨架时，必须明确说它只是阅读路线/显著性信号。\n- 能力、安装、兼容性、运行状态和风险判断必须引用 repo evidence、source path 或 Claim Graph。\n\n- **vLLM Overview**：importance `high`\n  - source_paths: README.md, pyproject.toml, vllm/__init__.py, vllm/version.py\n- **Getting Started**：importance `high`\n  - source_paths: docs/getting_started/installation/README.md, docs/getting_started/quickstart.md, requirements/common.txt, requirements/cuda.txt, setup.py\n- **Core Engine Architecture**：importance `high`\n  - source_paths: vllm/engine/llm_engine.py, vllm/engine/async_llm_engine.py, vllm/v1/engine/core.py, vllm/v1/engine/async_llm.py, vllm/v1/engine/llm_engine.py\n- **Model Executor and Worker Architecture**：importance `high`\n  - source_paths: vllm/v1/worker/gpu_model_runner.py, vllm/v1/worker/worker_base.py, vllm/v1/worker/gpu_worker.py, vllm/model_executor/model_loader/__init__.py, vllm/model_executor/models/__init__.py\n- **Scheduling and Request Processing**：importance `high`\n  - source_paths: vllm/v1/core/sched/scheduler.py, vllm/v1/core/sched/request_queue.py, vllm/v1/core/sched/async_scheduler.py, vllm/v1/request.py, vllm/config/scheduler.py\n- **PagedAttention and KV Cache Management**：importance `high`\n  - source_paths: csrc/attention/paged_attention_v1.cu, csrc/attention/paged_attention_v2.cu, vllm/v1/core/kv_cache_manager.py, vllm/v1/core/block_pool.py, docs/design/paged_attention.md\n- **Attention Backends and Kernels**：importance `medium`\n  - source_paths: vllm/v1/attention/backends/flash_attn.py, vllm/v1/attention/backends/flashinfer.py, vllm/v1/attention/backends/mla/flashmla.py, vllm/v1/attention/backends/registry.py, docs/design/attention_backends.md\n- **Quantization Support**：importance `medium`\n  - source_paths: vllm/model_executor/layers/quantization/fp8.py, vllm/model_executor/layers/quantization/base_config.py, vllm/model_executor/layers/quantization/gguf.py, docs/features/quantization/README.md, csrc/quantization\n\n## Repo Inspection Evidence / 源码检查证据\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `bd9dbe60601c986b50260f299fe279d057d7d89f`\n- inspected_files: `pyproject.toml`, `README.md`, `docs/.nav.yml`, `docs/README.md`, `docs/configuration/serve_args.md`, `docs/configuration/optimization.md`, 
`docs/configuration/conserving_memory.md`, `docs/configuration/model_resolution.md`, `docs/configuration/README.md`, `docs/configuration/env_vars.md`, `docs/configuration/engine_args.md`, `docs/cli/run-batch.md`, `docs/cli/chat.md`, `docs/cli/complete.md`, `docs/cli/.nav.yml`, `docs/cli/json_tip.inc.md`, `docs/cli/.meta.yml`, `docs/cli/README.md`, `docs/cli/serve.md`, `docs/api/README.md`\n\n宿主 AI 硬性规则：\n- 没有 repo_clone_verified=true 时，不得声称已经读过源码。\n- 没有 repo_inspection_verified=true 时，不得把 README/docs/package 文件判断写成事实。\n- 没有 quick_start_verified=true 时，不得声称 Quick Start 已跑通。\n\n## Doramagic Pitfall Constraints / 踩坑约束\n\n这些规则来自 Doramagic 发现、验证或编译过程中的项目专属坑点。宿主 AI 必须把它们当作工作约束，而不是普通说明文字。\n\n### Constraint 1: 来源证据：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_1a71634c530044a68b9160080d55de0a | https://github.com/vllm-project/vllm/issues/42182 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 2: 来源证据：[Bug]: vLLM v1 with prefix caching: first request differs from subsequent identical requests at temperature=0\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: vLLM v1 with prefix caching: first request differs from subsequent identical requests at temperature=0\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_58327949a4524ed082bd189b53f713a1 | https://github.com/vllm-project/vllm/issues/40896 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 3: 来源证据：[Usage]: How to proactively clear CPU-resident memory left behind by unloaded LoRA adapters after calling `/v1/unload_l…\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Usage]: How to proactively clear CPU-resident memory left behind by unloaded LoRA adapters after calling `/v1/unload_lora_adapter`?\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_fb1461834fe34049bd05182574d3e5e5 | https://github.com/vllm-project/vllm/issues/42207 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 4: 来源证据：v0.18.1\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.18.1\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_317a03f9de4e459f9be42064c7318b2c | https://github.com/vllm-project/vllm/releases/tag/v0.18.1 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 5: 来源证据：[Feature]: Qwen3.5-Moe LoRA Support (experts)\n\n- Trigger: GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：[Feature]: Qwen3.5-Moe LoRA Support (experts)\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_2d068d43c6654f3cab6b48bf98dad116 | https://github.com/vllm-project/vllm/issues/40005 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 6: 能力判断依赖假设\n\n- Trigger: README/documentation is current enough for a first validation pass.\n- Host AI rule: 将假设转成下游验证清单。\n- Why it matters: 假设不成立时，用户拿不到承诺的能力。\n- 
Evidence: capability.assumptions | github_repo:599547518 | https://github.com/vllm-project/vllm | README/documentation is current enough for a first validation pass.\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 7: 来源证据：v0.20.2\n\n- Trigger: GitHub 社区证据显示该项目存在一个运行相关的待验证问题：v0.20.2\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_ecf37722dff6494c82b384225e34bcb0 | https://github.com/vllm-project/vllm/releases/tag/v0.20.2 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 8: 维护活跃度未知\n\n- Trigger: 未记录 last_activity_observed。\n- Host AI rule: 补 GitHub 最近 commit、release、issue/PR 响应信号。\n- Why it matters: 新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- Evidence: evidence.maintainer_signals | github_repo:599547518 | https://github.com/vllm-project/vllm | last_activity_observed missing\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 9: 下游验证发现风险项\n\n- Trigger: no_demo\n- Host AI rule: 进入安全/权限治理复核队列。\n- Why it matters: 下游已经要求复核，不能在页面中弱化。\n- Evidence: downstream_validation.risk_items | github_repo:599547518 | https://github.com/vllm-project/vllm | no_demo; severity=medium\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 10: 存在安全注意事项\n\n- Trigger: No sandbox install has been executed yet; downstream must verify before user use.\n- Host AI rule: 转成明确权限清单和安全审查提示。\n- Why it matters: 用户安装前需要知道权限边界和敏感操作。\n- Evidence: risks.safety_notes | github_repo:599547518 | https://github.com/vllm-project/vllm | No sandbox install has been executed yet; downstream must verify before user use.\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n",
      "summary": "给宿主 AI 的上下文和工作边界。",
      "title": "AI Context Pack / 带给我的 AI"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card / 安装前决策卡\n\n项目：vllm-project/vllm\n\n## Doramagic 试用结论\n\n当前结论：可以进入发布前推荐检查；首次使用仍应从最小权限、临时目录和可回滚配置开始。\n\n## 用户现在可以做\n\n- 可以先阅读 Human Manual，理解项目目的和主要工作流。\n- 可以复制 Prompt Preview 做安装前体验；这只验证交互感，不代表真实运行。\n- 可以把官方 Quick Start 命令放到隔离环境中验证，不要直接进主力环境。\n\n## 现在不要做\n\n- 不要把 Prompt Preview 当成项目实际运行结果。\n- 不要把 metadata-only validation 当成沙箱安装验证。\n- 不要把未验证能力写成“已支持、已跑通、可放心安装”。\n- 不要在首次试用时交出生产数据、私人文件、真实密钥或主力配置目录。\n\n## 安装前检查\n\n- 宿主 AI 是否匹配：local_cli\n- 官方安装入口状态：已发现官方入口\n- 是否在临时目录、临时宿主或容器中验证：必须是\n- 是否能回滚配置改动：必须能\n- 是否需要 API Key、网络访问、读写文件或修改宿主配置：未确认前按高风险处理\n- 是否记录了安装命令、实际输出和失败日志：必须记录\n\n## 当前阻塞项\n\n- 无阻塞项。\n\n## 项目专属踩坑\n\n- 来源证据：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：[Bug]: vLLM v1 with prefix caching: first request differs from subsequent identical requests at temperature=0（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：[Usage]: How to proactively clear CPU-resident memory left behind by unloaded LoRA adapters after calling `/v1/unload_l…（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：v0.18.1（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：[Feature]: Qwen3.5-Moe LoRA Support (experts)（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n\n## 风险与权限提示\n\n- no_demo: medium\n\n## 证据缺口\n\n- 暂未发现结构化证据缺口。\n",
      "summary": "安装、权限、验证和推荐前风险。",
      "title": "Boundary & Risk Card / 边界与风险卡"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/vllm-project/vllm 项目说明书\n\n生成时间：2026-05-15 22:05:46 UTC\n\n## 目录\n\n- [vLLM Overview](#page-1)\n- [Getting Started](#page-2)\n- [Core Engine Architecture](#page-3)\n- [Model Executor and Worker Architecture](#page-4)\n- [Scheduling and Request Processing](#page-5)\n- [PagedAttention and KV Cache Management](#page-6)\n- [Attention Backends and Kernels](#page-7)\n- [Quantization Support](#page-8)\n- [Distributed Inference and Parallelism](#page-9)\n- [Model Architecture Support](#page-10)\n\n<a id='page-1'></a>\n\n## vLLM Overview\n\n### 相关页面\n\n相关主题：[Getting Started](#page-2), [Core Engine Architecture](#page-3)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/vllm-project/vllm/blob/main/README.md)\n- [vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n- [vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n- [vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n- [examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n- [examples/disaggregated/example_connector/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/example_connector/README.md)\n- [examples/disaggregated/disaggregated_encoder/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/disaggregated_encoder/README.md)\n- [examples/features/structured_outputs/README.md](https://github.com/vllm-project/vllm/blob/main/examples/features/structured_outputs/README.md)\n- [examples/pooling/embed/openai_embedding_long_text/README.md](https://github.com/vllm-project/vllm/blob/main/examples/pooling/embed/openai_embedding_long_text/README.md)\n- [examples/disaggregated/kv_load_failure_recovery_offline/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/kv_load_failure_recovery_offline/README.md)\n- [examples/observability/opentelemetry/README.md](https://github.com/vllm-project/vllm/blob/main/examples/observability/opentelemetry/README.md)\n- [examples/observability/dashboards/README.md](https://github.com/vllm-project/vllm/blob/main/examples/observability/dashboards/README.md)\n- [tools/profiler/nsys_profile_tools/README.md](https://github.com/vllm-project/vllm/blob/main/tools/profiler/nsys_profile_tools/README.md)\n</details>\n\n# vLLM Overview\n\n## What is vLLM?\n\nvLLM is a fast and easy-to-use library for LLM (Large Language Model) inference and serving. It provides high-throughput, memory-efficient inference with an OpenAI-compatible API server, making it suitable for both research and production environments.\n\n资料来源：[README.md](https://github.com/vllm-project/vllm/blob/main/README.md)\n\n## Key Features\n\n### Offline Inference\n\nThe `LLM` class provides the primary Python interface for offline inference—interacting with a model without using a separate model inference server. This enables direct model interaction for batch processing, experimentation, and development workflows.\n\n资料来源：[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n\n### OpenAI-Compatible API Server\n\nvLLM serves LLM completions via HTTP through an OpenAI-compatible API. 
The server can be started with a simple command:\n\n```bash\nvllm serve Qwen/Qwen2.5-3B-Instruct\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n### Structured Outputs\n\nvLLM supports constrained decoding for structured outputs including JSON schema, regex patterns, and structural tags. This is essential for building reliable applications that require predictable output formats.\n\n资料来源：[examples/features/structured_outputs/README.md](https://github.com/vllm-project/vllm/blob/main/examples/features/structured_outputs/README.md)\n\n### Disaggregated Prefill and Decode\n\nvLLM supports disaggregated prefill architecture where prefill (token generation) and decode (token consumption) stages can run on separate instances. This enables:\n\n- Independent scaling of prefill and decode workloads\n- Improved resource utilization\n- Better support for multi-turn conversations\n\n资料来源：[examples/disaggregated/example_connector/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/example_connector/README.md)\n\n### KV Cache Transfer\n\nFor disaggregated serving, vLLM supports KV cache transfer between prefill and decode workers. The architecture includes:\n\n| Component | Role | Description |\n|-----------|------|-------------|\n| `ECExampleConnector` | Cache Storage | Stores encoder cache on local disk |\n| EC Producer | Precompute | Pre-computes encoder cache |\n| EC Consumer | Retrieve | Retrieves cached KV data |\n\n资料来源：[examples/disaggregated/disaggregated_encoder/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/disaggregated_encoder/README.md)\n\n### KV Load Failure Recovery\n\nvLLM implements robust recovery mechanisms for KV load failures in both synchronous and asynchronous loading modes. The system:\n\n1. Identifies invalid KV blocks\n2. Reschedules affected requests\n3. 
Ensures consistent output through recovery logic\n\n资料来源：[examples/disaggregated/kv_load_failure_recovery_offline/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/kv_load_failure_recovery_offline/README.md)\n\n### Long Text Embedding with Chunked Processing\n\nvLLM supports embedding models with chunked processing for texts exceeding the model's maximum context length:\n\n```json\n{\n  \"pooling_type\": \"auto\",\n  \"use_activation\": true,\n  \"enable_chunked_processing\": true,\n  \"max_embed_len\": 3072000\n}\n```\n\nThis enables processing of extremely long documents (up to 3M+ tokens) including academic papers, legal documents, and code repositories.\n\n资料来源：[examples/pooling/embed/openai_embedding_long_text/README.md](https://github.com/vllm-project/vllm/blob/main/examples/pooling/embed/openai_embedding_long_text/README.md)\n\n## Architecture Overview\n\n### CLI Entry Point\n\nThe vLLM CLI provides a flexible command-line interface built with `FlexibleArgumentParser`:\n\n```mermaid\ngraph TD\n    A[vllm CLI] --> B[main.py Entry Point]\n    B --> C[Parse Arguments]\n    C --> D[Load CMD_MODULES]\n    D --> E[Create Subparsers]\n    E --> F[Execute Subcommand]\n    \n    F --> G[serve]\n    F --> H[launch]\n    F --> I[other modules]\n```\n\nThe CLI supports:\n- `-v, --version` flag for version information\n- Subcommand system with plugin architecture\n- Command validation before execution\n\n资料来源：[vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n\n### Serve Subcommand\n\nThe `serve` subcommand handles online inference server startup:\n\n```mermaid\ngraph TD\n    A[serve command] --> B{Model specified?}\n    B -->|Yes| C[Use CLI model]\n    B -->|No| D[Default: Qwen/Qwen3-0.6B]\n    \n    C --> E{GRPC enabled?}\n    D --> E\n    \n    E -->|Yes| F[Start gRPC Server]\n    E -->|No| G{Headless mode?}\n    \n    G -->|Yes| H[Set api_server_count=0]\n    G -->|No| I[Check LB mode]\n    \n    I --> J[Start FastAPI Server]\n    H --> J\n```\n\nThe serve command supports multiple deployment modes:\n- **Standard mode**: Full API server with GPU inference\n- **Headless mode**: No API servers, only engine processing\n- **gRPC mode**: Alternative RPC interface\n- **Load-balanced mode**: Data-parallel external/hybrid load balancing\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n### Launch Subcommand\n\nThe `launch` subcommand provides a modular component launch system:\n\n```mermaid\ngraph LR\n    A[launch command] --> B[LaunchSubcommand]\n    B --> C[launch_component subparser]\n    C --> D[LaunchSubcommandBase subclasses]\n    \n    D --> E[run_launch_fastapi]\n    D --> F[other components]\n    \n    E --> G[Socket binding]\n    E --> H[Build API Server]\n    E --> I[EngineArgs configuration]\n```\n\nThe launch system renders servers with preprocessing only—no inference or quantized kernels, and never allocates KV cache.\n\n资料来源：[vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n\n## Observability\n\n### OpenTelemetry Integration\n\nvLLM includes built-in OpenTelemetry support for distributed tracing:\n\n```bash\nopentelemetry-instrument vllm serve facebook/opt-125m\n```\n\nCore packages are bundled with vLLM:\n- `opentelemetry-sdk`\n- `opentelemetry-api`\n- `opentelemetry-exporter-otlp`\n- 
`opentelemetry-semantic-conventions-ai`\n\n资料来源：[examples/observability/opentelemetry/README.md](https://github.com/vllm-project/vllm/blob/main/examples/observability/opentelemetry/README.md)\n\n### Prometheus Metrics and Dashboards\n\nvLLM exports Prometheus-compatible metrics and supports integration with:\n\n| Platform | Dashboard Format | Import Method |\n|----------|------------------|---------------|\n| Grafana | JSON | UI or API |\n| Perses | YAML | CLI |\n\n资料来源：[examples/observability/dashboards/README.md](https://github.com/vllm-project/vllm/blob/main/examples/observability/dashboards/README.md)\n\n### Performance Profiling\n\nThe `nsys_profile_tools` enable GPU kernel-level profiling:\n\n```bash\nnsys profile -t cuda -o run1 -f true --trace-fork-before-exec=true \\\n    --cuda-graph-trace=node --delay <DELAY> --duration <DURATION> \\\n    vllm serve openai/gpt-oss-120b ...\n```\n\nThe `gputrc2graph.py` script generates kernel-level summaries and visualizations from `.nsys-rep` files.\n\n资料来源：[tools/profiler/nsys_profile_tools/README.md](https://github.com/vllm-project/vllm/blob/main/tools/profiler/nsys_profile_tools/README.md)\n\n## Supported Features Summary\n\n| Feature | Description | Configuration |\n|---------|-------------|---------------|\n| **Offline Inference** | Batch processing without server | `LLM` class |\n| **OpenAI API** | HTTP API compatibility | `vllm serve` |\n| **Structured Outputs** | JSON/regex/structural constraints | `--reasoning-parser` |\n| **Disaggregated Serving** | Split prefill/decode | `--ec-transfer-config` |\n| **KV Recovery** | Failure resilience | Custom connectors |\n| **Long Text Embedding** | Chunked processing | `--pooler-config` |\n| **Observability** | Tracing and metrics | OpenTelemetry |\n| **Quantization** | GGUF support | `repo_id:quant_type` |\n\n## Usage Modes\n\n### Offline Inference Mode\n\n```python\nfrom vllm import LLM\n\nllm = LLM(\"Qwen/Qwen2.5-3B-Instruct\")\noutput = llm.generate(\"Hello, world!\")\n```\n\n### API Server Mode\n\n```bash\nvllm serve Qwen/Qwen2.5-3B-Instruct --tensor-parallel-size 2\n```\n\n### Disaggregated Mode\n\n```bash\n# Prefill instance\nvllm serve --prefill-only --ec-transfer-config ' {...} '\n\n# Decode instance\nvllm serve --decode-only --ec-transfer-config ' {...} '\n```\n\n## Quick Reference\n\n| Command | Purpose |\n|---------|---------|\n| `vllm serve <model>` | Start API server |\n| `vllm launch <component>` | Launch specific component |\n| `opentelemetry-instrument vllm serve` | Enable tracing |\n\n## See Also\n\n- [Offline Inference Guide](examples/basic/offline_inference/README.md)\n- [Structured Outputs](examples/features/structured_outputs/README.md)\n- [Disaggregated Prefill](examples/disaggregated/example_connector/README.md)\n- [Observability Setup](examples/observability/opentelemetry/README.md)\n- [Dashboard Configuration](examples/observability/dashboards/README.md)\n\n---\n\n<a id='page-2'></a>\n\n## Getting Started\n\n### 相关页面\n\n相关主题：[vLLM Overview](#page-1)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/vllm-project/vllm/blob/main/README.md)\n- [vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n- [vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n- [vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n- 
[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n- [examples/features/structured_outputs/README.md](https://github.com/vllm-project/vllm/blob/main/examples/features/structured_outputs/README.md)\n- [examples/pooling/embed/openai_embedding_long_text/README.md](https://github.com/vllm-project/vllm/blob/main/examples/pooling/embed/openai_embedding_long_text/README.md)\n</details>\n\n# Getting Started\n\nvLLM is a fast and easy-to-use library for Large Language Model (LLM) inference and serving. It provides both an offline inference interface via the `LLM` class and an online serving layer with an OpenAI-compatible API server. 资料来源：[README.md:1-30]()\n\nThis guide covers the essential steps to get started with vLLM, from installation through basic inference and serving.\n\n## Installation\n\nvLLM can be installed via pip or built from source. For detailed installation instructions, refer to the [official documentation](https://docs.vllm.ai/en/latest/getting_started/installation.html).\n\n### Quick Installation\n\n```bash\npip install vllm\n```\n\n### Building from Source\n\nFor custom builds or development:\n\n```bash\ngit clone https://github.com/vllm-project/vllm.git\ncd vllm\npip install -e .\n```\n\n### GPU Requirements\n\nvLLM requires CUDA-compatible GPUs. The library supports various CUDA versions. Verify your environment has the necessary GPU drivers and CUDA toolkit installed. 资料来源：[README.md:1-15]()\n\n## Core Concepts\n\nBefore diving into usage, understand these fundamental concepts:\n\n| Concept | Description |\n|---------|-------------|\n| **LLM Class** | Primary Python interface for offline inference |\n| **Engine Args** | Configuration parameters for the inference engine |\n| **Sampling Params** | Controls generation behavior (temperature, max_tokens, etc.) |\n| **OpenAI API Server** | HTTP server providing OpenAI-compatible REST endpoints |\n\n### Architecture Overview\n\n```mermaid\ngraph TD\n    A[User Code] --> B[LLM Class / API Server]\n    B --> C[AsyncLLMEngine]\n    C --> D[Worker Pool]\n    D --> E[GPU Devices]\n    E --> F[PagedAttention KV Cache]\n    \n    G[HTTP Clients] --> H[OpenAI API Server]\n    H --> B\n```\n\n## Offline Inference\n\nOffline inference involves running model inference directly in Python without a separate server. This is ideal for batch processing, testing, or embedding vLLM into applications. 资料来源：[examples/basic/offline_inference/README.md:1-25]()\n\n### Basic Usage\n\n```python\nfrom vllm import LLM, SamplingParams\n\n# Initialize the model\nllm = LLM(\"Qwen/Qwen2.5-3B-Instruct\")\n\n# Define sampling parameters\nsampling_params = SamplingParams(\n    temperature=0.7,\n    top_p=0.95,\n    max_tokens=256\n)\n\n# Run inference\noutputs = llm.generate([\"Hello, how are you?\", \"What is vLLM?\"], sampling_params)\n\nfor output in outputs:\n    print(output.outputs[0].text)\n```\n\n### Supported Models\n\nvLLM supports a wide range of models including autoregressive transformers, mixture-of-experts models, and quantized models. For the complete list, see the [supported models documentation](https://docs.vllm.ai/en/latest/models/supported_models.html). 资料来源：[README.md:20-25]()\n\n## Serving with the CLI\n\nvLLM provides a command-line interface for serving models via an OpenAI-compatible API. 
资料来源：[vllm/entrypoints/cli/serve.py:1-30]()\n\n### Starting the Server\n\n```bash\nvllm serve Qwen/Qwen3-0.6B\n```\n\n### Command-Line Options\n\nThe serve command supports extensive configuration through CLI arguments:\n\n```bash\nvllm serve <model> [options]\n```\n\nUse `--help=all` to show all available flags, or `--help=<ConfigGroup>` to explore options by section (e.g., `--help=ModelConfig`, `--help=Frontend`). 资料来源：[vllm/entrypoints/cli/serve.py:5-20]()\n\n### Key Server Options\n\n| Option | Description | Default |\n|--------|-------------|---------|\n| `--model` | Model name or path | Required |\n| `--gpu-memory-utilization` | Fraction of GPU memory to use | 0.9 |\n| `--max-model-len` | Maximum sequence length | Model default |\n| `--tensor-parallel-size` | Number of GPUs for parallelism | 1 |\n| `--port` | Server port | 8000 |\n\n### Headless Mode\n\nFor distributed setups where API servers are managed externally:\n\n```bash\nvllm serve <model> --headless\n```\n\nIn headless mode, no API servers are started, and `--api-server-count` cannot be used. 资料来源：[vllm/entrypoints/cli/serve.py:30-45]()\n\n## API Usage\n\nOnce the server is running, you can interact with it using the OpenAI-compatible API.\n\n### Completions API\n\n```bash\ncurl http://localhost:8000/v1/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"Qwen/Qwen2.5-3B-Instruct\",\n    \"prompt\": \"The capital of France is\",\n    \"max_tokens\": 50\n  }'\n```\n\n### Chat API\n\n```bash\ncurl http://localhost:8000/v1/chat/completions \\\n  -H \"Content-Type: application/json\" \\\n  -d '{\n    \"model\": \"Qwen/Qwen2.5-3B-Instruct\",\n    \"messages\": [\n      {\"role\": \"user\", \"content\": \"What is machine learning?\"}\n    ]\n  }'\n```\n\n## Offline Inference Examples\n\nThe repository includes practical examples demonstrating various vLLM capabilities. 资料来源：[examples/basic/offline_inference/README.md:25-60]()\n\n### Running Examples\n\n```bash\n# Basic example\npython examples/basic/offline_inference/basic.py\n\n# Chat example with sampling parameters\npython examples/basic/offline_inference/chat.py --max_tokens 100 --temperature 0.8\n\n# Generate example\npython examples/basic/offline_inference/generate.py --generation-config auto\n```\n\n### Generation Config\n\nThe `--generation-config` argument specifies where the generation config loads from:\n\n- `'auto'` - Load from model path\n- `<folder_path>` - Load from specified directory\n- Not provided - Use vLLM defaults\n\n```bash\npython examples/basic/offline_inference/generate.py --generation-config auto\n```\n\n> **Note:** If `max_new_tokens` is specified in generation config, it sets a server-wide limit on output tokens for all requests. 资料来源：[examples/basic/offline_inference/README.md:55-70]()\n\n## Advanced Features\n\n### Structured Outputs\n\nvLLM supports constrained decoding for structured outputs including JSON schemas, regex patterns, and grammar-based constraints. 
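\n\nAs a client-side illustration of the JSON constraint, the sketch below requests JSON-mode output through the OpenAI-compatible endpoint. It is a hedged example rather than the project's own snippet: it assumes a locally running server on the default port, the model name is a placeholder, and the generic `response_format` JSON mode is used; the exact parameter names for schema-, regex-, and tag-based constraints should be checked against the structured outputs documentation for the installed vLLM version.\n\n```python\n# Hedged sketch: ask a running vLLM OpenAI-compatible server for JSON-mode output.\n# Assumptions: default port 8000, placeholder model name, JSON mode supported by the installed version.\nfrom openai import OpenAI\n\nclient = OpenAI(base_url=\"http://localhost:8000/v1\", api_key=\"EMPTY\")\nresponse = client.chat.completions.create(\n    model=\"Qwen/Qwen2.5-3B-Instruct\",\n    messages=[{\"role\": \"user\", \"content\": \"Return the capital of France as JSON with keys 'city' and 'country'.\"}],\n    response_format={\"type\": \"json_object\"},  # schema/regex constraints use their own, version-specific fields\n)\nprint(response.choices[0].message.content)\n```\n\n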
资料来源：[examples/features/structured_outputs/README.md:1-40]()\n\n```bash\nvllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \\\n    --reasoning-parser deepseek_r1\n```\n\n```bash\n# Run structured outputs example\nuv run structured_outputs_offline.py --constraint json_mode regex\n```\n\n### Long Text Embedding\n\nFor embedding models, vLLM supports chunked processing to handle texts exceeding the model's maximum context length:\n\n```bash\nMODEL_NAME=\"jinaai/jina-embeddings-v3\" \\\nMAX_EMBED_LEN=1048576 \\\n./service.sh\n```\n\nConfiguration example with chunked processing:\n\n```json\n{\n  \"pooling_type\": \"auto\",\n  \"use_activation\": true,\n  \"enable_chunked_processing\": true,\n  \"max_embed_len\": 3072000\n}\n``` 资料来源：[examples/pooling/embed/openai_embedding_long_text/README.md:1-60]()\n\n### GGUF Quantized Models\n\nvLLM supports GGUF-quantized models loaded directly from HuggingFace:\n\n```bash\n--model unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B\n```\n\n### CPU Offload\n\nFor systems with limited GPU memory, CPU offload allows loading larger models:\n\n```bash\n--cpu-offload-gb 10\n```\n\nThis creates a virtual 34GB GPU when you have a 24GB GPU, enabling 13B model loading with BF16 weights. 资料来源：[examples/basic/offline_inference/README.md:75-85]()\n\n## Configuration Workflow\n\n```mermaid\ngraph LR\n    A[Define Engine Args] --> B[Create Model Config]\n    B --> C[Initialize Engine]\n    C --> D[Process Requests]\n    D --> E[Return Outputs]\n    \n    F[CLI Arguments] --> A\n    G[Python API] --> A\n```\n\n### Programmatic Configuration\n\n```python\nfrom vllm import LLM, EngineArgs\n\nengine_args = EngineArgs(\n    model=\"Qwen/Qwen2.5-7B-Instruct\",\n    gpu_memory_utilization=0.85,\n    tensor_parallel_size=2,\n    max_model_len=4096\n)\n\nllm = LLM(**engine_args)\n```\n\n## Next Steps\n\n| Resource | Description |\n|----------|-------------|\n| [Documentation](https://docs.vllm.ai) | Comprehensive guides and API reference |\n| [Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | Complete list of supported architectures |\n| [Examples](https://github.com/vllm-project/vllm/tree/main/examples) | Usage examples for various features |\n| [Paper](https://arxiv.org/abs/2309.06180) | Technical details behind vLLM's design |\n\n## Troubleshooting\n\n### Common Issues\n\n1. **CUDA Out of Memory**: Reduce `gpu_memory_utilization` or use smaller batch sizes\n2. **Model Not Found**: Ensure HuggingFace credentials are configured for gated models\n3. 
**Import Errors**: Verify all dependencies are installed with `pip install vllm`\n\n### Getting Help\n\n- **Technical Questions**: Use [GitHub Issues](https://github.com/vllm-project/vllm/issues)\n- **Community Discussion**: [vLLM Forum](https://discuss.vllm.ai)\n- **Development Coordination**: [Developer Slack](https://slack.vllm.ai)\n\n资料来源：[README.md:60-75]()\n\n---\n\n<a id='page-3'></a>\n\n## Core Engine Architecture\n\n### 相关页面\n\n相关主题：[vLLM Overview](#page-1), [Model Executor and Worker Architecture](#page-4), [Scheduling and Request Processing](#page-5)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/engine/llm_engine.py](https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py)\n- [vllm/engine/async_llm_engine.py](https://github.com/vllm-project/vllm/blob/main/vllm/engine/async_llm_engine.py)\n- [vllm/v1/engine/core.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/core.py)\n- [vllm/v1/engine/async_llm.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/async_llm.py)\n- [vllm/v1/engine/llm_engine.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/engine/llm_engine.py)\n- [vllm/entrypoints/llm.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py)\n- [vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n- [vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n- [vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n</details>\n\n# Core Engine Architecture\n\n## Overview\n\nThe vLLM Core Engine Architecture is the central orchestration layer responsible for managing LLM inference workflows, request scheduling, and model execution. vLLM supports two engine versions: the legacy V0 engine and the current V1 engine (introduced in v0.6.0), both designed to provide high-throughput LLM serving through efficient request batching and GPU memory management.\n\nThe engine architecture serves as the foundation for both offline inference via the `LLM` class and online serving via the OpenAI-compatible API server. 资料来源：[vllm/entrypoints/llm.py:1-50]()\n\n## Architecture Components\n\n### V0 Engine (Legacy)\n\nThe V0 engine is the original implementation found in `vllm/engine/`. 
It consists of:\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| `LLMEngine` | `vllm/engine/llm_engine.py` | Synchronous inference engine with blocking operations |\n| `AsyncLLMEngine` | `vllm/engine/async_llm_engine.py` | Async wrapper enabling concurrent request handling |\n\nThe V0 engine uses an event-loop-based async architecture where `AsyncLLMEngine` wraps `LLMEngine` to provide non-blocking request processing.\n\n### V1 Engine (Current)\n\nThe V1 engine (`vllm/v1/engine/`) is the current production-ready implementation featuring a modular design:\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| `Core` | `vllm/v1/engine/core.py` | Low-level engine core managing model execution |\n| `AsyncLLM` | `vllm/v1/engine/async_llm.py` | Main async interface for inference |\n| `LLMEngine` | `vllm/v1/engine/llm_engine.py` | High-level engine orchestrator |\n\nThe V1 engine architecture separates concerns into distinct layers: `AsyncLLM` provides the public async interface, `LLMEngine` handles request orchestration, and `Core` manages low-level GPU operations.\n\n## Entry Points\n\nvLLM provides multiple entry points for interacting with the engine:\n\n```mermaid\ngraph TD\n    A[vllm serve] --> B[ServeSubcommand]\n    A --> C[LaunchSubcommand]\n    B --> D[serve_grpc / API Server]\n    C --> E[run_launch_fastapi]\n    F[vllm run-batch] --> G[BatchRunner]\n    H[LLM Class] --> I[AsyncLLM Engine]\n```\n\n### CLI Entry Point\n\nThe CLI entry point in `vllm/entrypoints/cli/main.py` provides command-line access to vLLM functionality:\n\n```python\n# Simplified CLI structure\nparser = FlexibleArgumentParser(description=\"vLLM CLI\")\nsubparsers = parser.add_subparsers(required=False, dest=\"subparser\")\ncmds = {}\nfor cmd_module in CMD_MODULES:\n    new_cmds = cmd_module.cmd_init()\n    for cmd in new_cmds:\n        cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd)\n```\n\n资料来源：[vllm/entrypoints/cli/main.py:1-40]()\n\n### Serve Subcommand\n\nThe `serve` subcommand initializes the HTTP API server or gRPC service:\n\n```python\nclass ServeSubcommand(CLISubcommand):\n    name = \"serve\"\n    \n    @staticmethod\n    def cmd(args: argparse.Namespace) -> None:\n        if hasattr(args, \"model_tag\") and args.model_tag is not None:\n            args.model = args.model_tag\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py:1-30]()\n\n### Launch Subcommand\n\nThe `launch` subcommand provides component-level launching capabilities:\n\n```python\ndef cmd_init() -> list[CLISubcommand]:\n    return [LaunchSubcommand()]\n\nasync def run_launch_fastapi(args: argparse.Namespace) -> None:\n    listen_address, sock = setup_server(args)\n    engine_args = AsyncEngineArgs.from_cli_args(args)\n```\n\n资料来源：[vllm/entrypoints/cli/launch.py:1-60]()\n\n### Python API Entry Point\n\nThe `LLM` class in `vllm/entrypoints/llm.py` provides the primary Python interface for offline inference:\n\n```python\nclass LLM:\n    \"\"\"\n    An LLM for offline inference.\n    \"\"\"\n    \n    def __init__(self, model: str, ...):\n        ...\n```\n\n资料来源：[vllm/entrypoints/llm.py:1-100]()\n\n## Engine Initialization Flow\n\nThe following diagram illustrates the initialization flow from CLI to engine:\n\n```mermaid\nsequenceDiagram\n    participant CLI as vllm serve\n    participant Parser as ArgumentParser\n    participant EngineArgs as AsyncEngineArgs\n    participant Engine as AsyncLLM / Core\n    participant Model as ModelConfig\n    \n    CLI->>Parser: Parse CLI arguments\n    
Parser->>EngineArgs: from_cli_args()\n    EngineArgs->>Model: create_model_config()\n    Model-->>EngineArgs: Config validated\n    EngineArgs->>Engine: Initialize engine\n    Engine->>Engine: Load model weights\n```\n\n## Core Engine Components\n\n### AsyncLLM (V1)\n\nThe `AsyncLLM` class is the primary async interface for V1 engine:\n\n```python\nclass AsyncLLM:\n    \"\"\"\n    Async implementation of LLM engine.\n    \"\"\"\n    \n    async def add_request(self, request_id: str, prompt: str, ...):\n        ...\n    \n    async def step(self) -> List[RequestOutput]:\n        ...\n```\n\n资料来源：[vllm/v1/engine/async_llm.py:1-50]()\n\n### Core (V1)\n\nThe `Core` class manages low-level model execution:\n\n```python\nclass Core:\n    \"\"\"\n    Core engine for V1.\n    \"\"\"\n    \n    def __init__(self, engine_config: VllmConfig, ...):\n        ...\n    \n    def get_config(self) -> VllmConfig:\n        ...\n```\n\n资料来源：[vllm/v1/engine/core.py:1-80]()\n\n### LLMEngine (V1)\n\nThe V1 `LLMEngine` orchestrates request processing:\n\n```python\nclass LLMEngine:\n    \"\"\"\n    V1 LLM Engine implementation.\n    \"\"\"\n    \n    def __init__(self, vllm_config: VllmConfig, ...):\n        ...\n```\n\n资料来源：[vllm/v1/engine/llm_engine.py:1-50]()\n\n## Configuration System\n\n### AsyncEngineArgs\n\nConfiguration flows from CLI/API to engine via `AsyncEngineArgs`:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `model` | `str` | Model name or path |\n| `tensor_parallel_size` | `int` | Number of GPUs for tensor parallelism |\n| `gpu_memory_utilization` | `float` | Fraction of GPU memory to use |\n| `max_model_len` | `Optional[int]` | Maximum sequence length |\n| `dtype` | `str` | Model data type (float16, bfloat16, etc.) |\n| `quantization` | `Optional[str]` | Quantization method (awq, gptq, etc.) 
|\n\nThe engine validates configuration through `create_model_config()`:\n\n```python\ndef create_model_config(self) -> ModelConfig:\n    \"\"\"Create model config from engine args.\"\"\"\n    return ModelConfig(...)\n```\n\n资料来源：[vllm/entrypoints/cli/launch.py:40-50]()\n\n## Headless Mode\n\nThe V1 engine supports headless operation where no API servers are started:\n\n```python\nif args.headless:\n    if args.api_server_count is not None and args.api_server_count > 0:\n        raise ValueError(\n            f\"--api-server-count={args.api_server_count} cannot be \"\n            \"used with --headless (no API servers are started in \"\n            \"headless mode).\"\n        )\n    args.api_server_count = 0\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py:25-35]()\n\n## gRPC Support\n\nvLLM supports gRPC for disaggregated prefill scenarios:\n\n```python\nif getattr(args, \"grpc\", False):\n    from vllm.entrypoints.grpc_server import serve_grpc\n    uvloop.run(serve_grpc(args))\n    return\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py:18-22]()\n\n## Request Processing Pipeline\n\n```mermaid\ngraph LR\n    A[Request] --> B[Parser]\n    B --> C[AsyncLLM.add_request]\n    C --> D[Scheduler]\n    D --> E[Model Executor]\n    E --> F[Output]\n    D --> G[KV Cache]\n```\n\n## Data Parallel Modes\n\nThe engine supports multiple data parallel configurations:\n\n| Mode | Flag | Description |\n|------|------|-------------|\n| External LB | `--data-parallel-external-lb` | External load balancer coordinates workers |\n| Hybrid LB | `--data-parallel-hybrid-lb` | Hybrid approach with custom rank assignment |\n| Rank | `--data-parallel-rank` | Specify worker rank for distributed setup |\n\n```python\n# Detection logic\nis_external_lb = getattr(args, \"data_parallel_external_lb\", False)\nis_hybrid_lb = getattr(args, \"data_parallel_hybrid_lb\", False)\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py:35-40]()\n\n## Model Config Validation\n\nThe engine performs validation during model config creation:\n\n```python\nmodel_config = engine_args.create_model_config()\n\n# Clear quantization for render servers (preprocessing only)\nif render_mode:\n    model_config.quantization = None\n```\n\n资料来源：[vllm/entrypoints/cli/launch.py:45-55]()\n\n## Key Design Patterns\n\n### Async/Await Architecture\n\nThe V1 engine uses native async/await for concurrent request handling:\n\n```python\nasync def step(self) -> List[RequestOutput]:\n    \"\"\"Execute one iteration of the engine.\"\"\"\n    ...\n```\n\n### Command Pattern\n\nCLI commands follow the Command pattern with `CLISubcommand` base class:\n\n```python\nclass ServeSubcommand(CLISubcommand):\n    name = \"serve\"\n    \n    @staticmethod\n    def cmd(args: argparse.Namespace) -> None:\n        ...\n```\n\n### Subparser Registration\n\nCommands register themselves via `cmd_init()` factory functions:\n\n```python\ndef cmd_init() -> list[CLISubcommand]:\n    return [LaunchSubcommand(), ServeSubcommand()]\n```\n\n## Summary\n\nThe vLLM Core Engine Architecture provides a flexible, multi-layered design supporting both V0 (legacy) and V1 (current) engine implementations. Key characteristics include:\n\n1. **Dual Engine Support**: V0 for backward compatibility, V1 for production workloads\n2. **Multiple Entry Points**: CLI, Python API, HTTP server, gRPC\n3. **Async-First Design**: Native async/await for concurrent request processing\n4. **Modular Components**: Clear separation between CLI, engine core, and model execution\n5. 
**Flexible Configuration**: Comprehensive argument system via `AsyncEngineArgs`\n6. **Data Parallel Support**: Multiple modes for distributed serving scenarios\n\nThe architecture prioritizes performance through efficient GPU memory management, request batching, and pipelined execution while maintaining a clean, extensible design.\n\n---\n\n<a id='page-4'></a>\n\n## Model Executor and Worker Architecture\n\n### 相关页面\n\n相关主题：[Core Engine Architecture](#page-3), [Scheduling and Request Processing](#page-5), [Model Architecture Support](#page-10)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/v1/worker/gpu_model_runner.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu_model_runner.py)\n- [vllm/v1/worker/worker_base.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/worker_base.py)\n- [vllm/v1/worker/gpu_worker.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/worker/gpu_worker.py)\n- [vllm/model_executor/model_loader/__init__.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/model_loader/__init__.py)\n- [vllm/model_executor/models/__init__.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/__init__.py)\n</details>\n\n# Model Executor and Worker Architecture\n\n## Overview\n\nThe vLLM Model Executor and Worker Architecture forms the core execution layer responsible for model loading, inference, and batch-level processing. This architecture separates concerns between high-level request orchestration and low-level GPU-based model execution, enabling efficient parallel processing of LLM inference requests.\n\nThe architecture consists of two primary components:\n\n| Component | Responsibility |\n|-----------|----------------|\n| **Model Executor** | Manages model loading, weight initialization, and model-specific execution logic |\n| **Worker** | Handles GPU-side computation, memory management, and kernel execution |\n\n## Architecture Diagram\n\n```mermaid\ngraph TD\n    A[AsyncEngineArgs] --> B[Model Executor]\n    B --> C[Model Loader]\n    C --> D[Model Weights]\n    B --> E[GPU Worker]\n    E --> F[GPU Model Runner]\n    F --> G[CUDA Kernels]\n    F --> H[Attention Layers]\n    F --> I[MLP Layers]\n    \n    J[Request Batch] --> E\n    E --> K[Generated Tokens]\n    \n    style B fill:#e1f5fe\n    style E fill:#fff3e0\n    style F fill:#f3e5f5\n```\n\n## Model Executor Layer\n\n### Purpose and Scope\n\nThe Model Executor layer handles all aspects of model lifecycle management, including initialization, weight loading, and providing the interface for model execution. 
This layer operates independently of the worker layer, allowing for flexibility in model configuration.\n\n### Model Loading\n\nThe model loading subsystem supports multiple backends and quantization schemes:\n\n```python\nclass ModelLoader:\n    def load_model(self, model_config, parallel_config, device_config):\n        # Load model weights based on configuration\n        pass\n```\n\n**Supported Loading Modes:**\n\n| Mode | Description |\n|------|-------------|\n| Auto | Automatically detect optimal loading strategy |\n| Naive | Standard PyTorch loading |\n| Sharded | Load model shards across multiple devices |\n| Quantized | Load quantized weights (AWQ, GPTQ, GGUF) |\n\n资料来源：[vllm/model_executor/model_loader/__init__.py]()\n\n### Model Registry\n\nvLLM maintains a registry of supported model architectures:\n\n```python\n# Model architecture registration\n@register_model(\"LlamaForCausalLM\")\nclass LlamaForCausalLM(nn.Module):\n    ...\n```\n\nThe registry maps model names to their corresponding model classes, enabling automatic model instantiation based on HuggingFace model architecture detection.\n\n资料来源：[vllm/model_executor/models/__init__.py]()\n\n## Worker Architecture\n\n### Base Worker\n\nThe worker base class defines the interface for all worker implementations:\n\n```python\nclass WorkerBase:\n    def __init__(self, vllm_config):\n        self.vllm_config = vllm_config\n        self.model = None\n        self.device = None\n    \n    def execute_model(self, batch):\n        raise NotImplementedError\n```\n\n资料来源：[vllm/v1/worker/worker_base.py]()\n\n### GPU Worker\n\nThe GPU Worker is the primary worker implementation for GPU-based inference:\n\n```mermaid\ngraph LR\n    A[Input Batch] --> B[Model Input Preparation]\n    B --> C[Forward Pass]\n    C --> D[Output Extraction]\n    D --> E[Token Generation]\n```\n\n**Key Responsibilities:**\n\n1. Initialize CUDA context and memory pools\n2. Prepare model inputs with proper padding and masking\n3. Execute forward passes on GPU\n4. 
Manage KV cache memory allocation\n\n资料来源：[vllm/v1/worker/gpu_worker.py]()\n\n### GPU Model Runner\n\nThe GPU Model Runner handles the low-level model execution details:\n\n```python\nclass GPUModelRunner:\n    def __init__(self, config):\n        self.kv_cache = None\n        self.attn_metadata = None\n        self.block_manager = None\n    \n    def prepare_inputs(self, batch):\n        # Prepare input tensors with proper device placement\n        pass\n    \n    def execute_model(self, input_tokens, positions):\n        # Execute model forward pass\n        pass\n```\n\n**Core Components:**\n\n| Component | Function |\n|-----------|----------|\n| `kv_cache` | Stores key-value tensors for attention |\n| `attn_metadata` | Manages attention metadata for paged attention |\n| `block_manager` | Handles physical memory block allocation |\n\n资料来源：[vllm/v1/worker/gpu_model_runner.py]()\n\n## Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant API as API Server\n    participant Executor as Model Executor\n    participant Worker as GPU Worker\n    participant Runner as GPU Model Runner\n    participant Kernels as CUDA Kernels\n    \n    API->>Executor: Initialize model\n    Executor->>Worker: Load weights\n    Worker->>Runner: Initialize GPU state\n    Runner->>Kernels: Allocate memory\n    \n    API->>Executor: Execute batch\n    Executor->>Worker: Forward request\n    Worker->>Runner: Prepare inputs\n    Runner->>Kernels: Compute attention\n    Runner->>Kernels: Compute mlp\n    Kernels-->>Runner: Output logits\n    Runner-->>Worker: Return results\n    Worker-->>Executor: Output tokens\n```\n\n## Memory Management\n\n### KV Cache Architecture\n\nvLLM uses a block-based KV cache management system:\n\n1. **Logical Blocks**: Abstract representation of KV cache entries\n2. **Physical Blocks**: Actual GPU memory allocations\n3. 
**Block Mapping**: Links logical blocks to physical locations\n\n```python\nclass BlockManager:\n    def allocate(self, num_blocks):\n        # Allocate physical blocks\n        pass\n    \n    def get_physical_block(self, logical_id):\n        # Get physical location for logical block\n        pass\n```\n\n### Memory Allocation Strategy\n\n| Strategy | Use Case |\n|----------|----------|\n| Dynamic | Default, allocates on demand |\n| Static | Pre-allocates at initialization |\n| Hybrid | Mix of static and dynamic |\n\n## Configuration Parameters\n\nThe Model Executor and Worker architecture is configured through `AsyncEngineArgs`:\n\n| Parameter | Description | Default |\n|-----------|-------------|---------|\n| `model` | Model name or path | Required |\n| `tensor_parallel_size` | Number of GPUs for tensor parallelism | 1 |\n| `pipeline_parallel_size` | Number of pipeline stages | 1 |\n| `gpu_memory_utilization` | Fraction of GPU memory for KV cache | 0.9 |\n| `max_model_len` | Maximum sequence length | Auto |\n| `block_size` | KV cache block size | 16 |\n\n## Summary\n\nThe Model Executor and Worker Architecture in vLLM provides a modular, extensible system for LLM inference:\n\n- **Separation of Concerns**: Clear boundaries between model loading and execution\n- **GPU Optimization**: Efficient CUDA kernel integration and memory management\n- **Flexible Configuration**: Support for various parallelism and quantization strategies\n- **Extensibility**: Plugin-based model registration system\n\nThis architecture enables vLLM to achieve high throughput through batch processing while maintaining low latency through careful memory management and kernel optimization.\n\n---\n\n<a id='page-5'></a>\n\n## Scheduling and Request Processing\n\n### 相关页面\n\n相关主题：[Core Engine Architecture](#page-3), [Model Executor and Worker Architecture](#page-4), [Distributed Inference and Parallelism](#page-9)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/v1/core/sched/scheduler.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/sched/scheduler.py)\n- [vllm/v1/core/sched/request_queue.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/sched/request_queue.py)\n- [vllm/v1/core/sched/async_scheduler.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/sched/async_scheduler.py)\n- [vllm/v1/request.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/request.py)\n- [vllm/config/scheduler.py](https://github.com/vllm-project/vllm/blob/main/vllm/config/scheduler.py)\n</details>\n\n# Scheduling and Request Processing\n\n## Overview\n\nThe Scheduling and Request Processing system is a core component of vLLM's v1 engine architecture. It manages the lifecycle of inference requests from arrival to completion, coordinating resource allocation, batch scheduling, and execution ordering to maximize GPU utilization while maintaining quality of service guarantees.\n\nIn vLLM, the scheduler operates as an asynchronous event-driven system that continuously evaluates pending requests, determines optimal batching strategies, and dispatches work to the underlying execution engine. This design enables vLLM to handle high-throughput serving workloads with efficient memory management through mechanisms like paged attention and dynamic batch composition.\n\nThe scheduler works in conjunction with the request queue, which serves as the primary buffer for incoming inference requests. 
It makes real-time decisions about which requests to include in the next execution batch based on available GPU memory, request priorities, and fairness constraints.\n\n## Architecture Overview\n\n### Component Hierarchy\n\nThe scheduling system consists of several interconnected components that work together to manage request processing:\n\n```mermaid\ngraph TD\n    A[API Layer] --> B[Request Queue]\n    B --> C[Async Scheduler]\n    C --> D[Scheduler]\n    D --> E[Execution Engine]\n    E --> F[GPU]\n    \n    G[Config] --> C\n    G --> D\n```\n\n### Core Components\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| `Scheduler` | `vllm/v1/core/sched/scheduler.py` | Core scheduling logic and batch selection |\n| `AsyncScheduler` | `vllm/v1/core/sched/async_scheduler.py` | Async wrapper for scheduler operations |\n| `RequestQueue` | `vllm/v1/core/sched/request_queue.py` | Request buffering and ordering |\n| `Request` | `vllm/v1/request.py` | Request data model and state |\n| `SchedulerConfig` | `vllm/config/scheduler.py` | Scheduler configuration parameters |\n\n## Request Model\n\n### Request Data Structure\n\nThe `Request` class represents an individual inference request with all associated metadata and state. Each request maintains its own context including tokenized inputs, sampling parameters, and execution state.\n\nThe request model tracks the following key attributes:\n\n| Attribute | Type | Description |\n|-----------|------|-------------|\n| `request_id` | `str` | Unique identifier for the request |\n| `prompt` | `str` | Original input text or tokens |\n| `prompt_token_ids` | `List[int]` | Tokenized prompt |\n| `sampling_params` | `SamplingParams` | Sampling configuration |\n| `arrival_time` | `float` | Request arrival timestamp |\n| `state` | `RequestState` | Current execution state |\n\n### Request States\n\nRequests transition through a defined state machine during their lifecycle:\n\n```mermaid\nstateDiagram-v2\n    [*] --> WAITING: Request Arrival\n    WAITING --> SCHEDULED: Scheduler Selection\n    SCHEDULED --> RUNNING: Dispatch to GPU\n    RUNNING --> WAITING: Preemption/Continuation\n    RUNNING --> FINISHED: Completion\n    RUNNING --> WAITING: KV Cache Reuse\n    WAITING --> CANCELLED: Client Cancellation\n    SCHEDULED --> CANCELLED: Client Cancellation\n```\n\n1. **WAITING**: Request is queued and awaiting scheduling\n2. **SCHEDULED**: Request has been selected for the next batch\n3. **RUNNING**: Request is actively being processed on GPU\n4. **FINISHED**: Request has completed successfully\n5. **CANCELLED**: Request was cancelled before completion\n\n资料来源：[vllm/v1/request.py]()\n\n## Request Queue\n\nThe `RequestQueue` serves as the primary buffering mechanism for incoming requests. It provides thread-safe operations for enqueueing, dequeuing, and managing request priorities.\n\n### Queue Operations\n\n| Operation | Description |\n|-----------|-------------|\n| `enqueue()` | Add new request to queue |\n| `dequeue()` | Remove and return next request |\n| `peek()` | View next request without removal |\n| `cancel()` | Remove cancelled request |\n| `requeue()` | Return request to queue (preemption) |\n\n### Priority Handling\n\nThe request queue supports priority-based ordering where higher priority requests can bypass lower priority ones. 
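To make the ordering concrete, the sketch below shows one way a priority-ordered queue could be organized in plain Python. The class and method names are illustrative only; they mirror the operations in the table above rather than vLLM's actual `RequestQueue` implementation.

```python
import heapq
import itertools
from typing import Any, Optional

class PriorityRequestQueue:
    """Illustrative sketch only; not the vLLM RequestQueue."""

    def __init__(self) -> None:
        # Entries sort by (priority, arrival_time, insertion_counter).
        self._heap: list[tuple[int, float, int, Any]] = []
        self._counter = itertools.count()  # FIFO tie-breaker for equal keys

    def enqueue(self, request: Any, priority: int, arrival_time: float) -> None:
        # A lower priority value is scheduled first; ties fall back to arrival order.
        heapq.heappush(self._heap, (priority, arrival_time, next(self._counter), request))

    def dequeue(self) -> Optional[Any]:
        return heapq.heappop(self._heap)[-1] if self._heap else None

    def peek(self) -> Optional[Any]:
        return self._heap[0][-1] if self._heap else None
```

Requeueing a preempted request would amount to calling `enqueue` again, which is roughly what the `requeue()` operation in the table describes.
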
Priority is determined by a combination of factors:\n\n- Explicit priority values in `SamplingParams`\n- Arrival time (older requests may get priority for fairness)\n- Request type (prefill vs decode operations)\n\n资料来源：[vllm/v1/core/sched/request_queue.py]()\n\n## Scheduler\n\n### Core Scheduling Logic\n\nThe scheduler is responsible for selecting which requests to include in the next execution batch. It operates on a continuous scheduling loop that evaluates the current state of all pending requests and available resources.\n\n#### Scheduling Criteria\n\nThe scheduler makes decisions based on multiple factors:\n\n1. **Memory Availability**: Sufficient GPU memory must be available for the request's KV cache\n2. **Batch Size Limits**: Maximum batch size constraints\n3. **Prefill-Decompose Decisions**: Whether to split prefill into smaller chunks\n4. **Priority Weighting**: Relative importance of pending requests\n5. **Latency Targets**: QoS requirements for specific request categories\n\n#### Scheduling Loop\n\n```mermaid\ngraph LR\n    A[Evaluate Pending Requests] --> B{Sufficient Resources?}\n    B -->|Yes| C[Select Request]\n    C --> D[Add to Batch]\n    D --> E{Batch Full?}\n    E -->|No| A\n    E -->|Yes| F[Dispatch Batch]\n    B -->|No| G[Wait / Preempt]\n    F --> H[Update State]\n    H --> A\n```\n\n### Async Scheduler Interface\n\nThe `AsyncScheduler` provides an asynchronous interface to the core scheduler, enabling non-blocking scheduling operations. This is essential for maintaining high throughput in production serving scenarios where the scheduler must coexist with network I/O and other async operations.\n\nKey async operations include:\n\n- `schedule_async()`: Async wrapper for schedule iteration\n- `add_request()`: Async request enqueueing\n- `abort_request()`: Async request cancellation\n\n资料来源：[vllm/v1/core/sched/async_scheduler.py]()\n\n## Scheduler Configuration\n\nThe `SchedulerConfig` class defines all configurable parameters for the scheduler behavior. These settings control batching strategies, memory management, and QoS characteristics.\n\n### Configuration Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `max_num_seqs` | `int` | `256` | Maximum sequences per iteration |\n| `max_num_batched_tokens` | `int` | `8192` | Max tokens per batch |\n| `max_model_len` | `int` | `8192` | Maximum model context length |\n| `enable_chunked_prefill` | `bool` | `True` | Enable prefill chunking |\n| `num_prefill_groups` | `int` | `1` | Number of prefill groups |\n| `async_scheduling` | `bool` | `True` | Enable async scheduling |\n\n### Memory Management Settings\n\n| Parameter | Description |\n|-----------|-------------|\n| `gpu_memory_utilization` | Fraction of GPU memory for KV cache (0.0-1.0) |\n| `num_causal_layers` | Number of causal attention layers |\n| `head_dim` | Dimension of attention heads |\n\n资料来源：[vllm/config/scheduler.py]()\n\n## Batching Strategies\n\n### Continuous Batching\n\nvLLM employs continuous batching (also known as iteration-level scheduling) to maximize GPU utilization. 
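Before contrasting this with static batching, the toy loop below (purely illustrative, not vLLM code) shows what "iteration-level" means in practice: batch membership is re-evaluated at every engine step rather than fixed once per batch.

```python
# Toy illustration of iteration-level scheduling; names and limits are made up.
waiting = ["req-A", "req-B", "req-C", "req-D"]
running: list[str] = []
MAX_RUNNING = 2  # stand-in for a max_num_seqs-style limit

for step in range(4):
    # Requests join the running batch whenever a slot is free...
    while waiting and len(running) < MAX_RUNNING:
        running.append(waiting.pop(0))
    print(f"step {step}: running={running}")
    # ...and a finished request leaves immediately, freeing its slot for the next one.
    if running:
        running.pop(0)
```
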
Unlike static batching, where batch composition is fixed at the start, continuous batching allows requests to enter and exit the batch at each iteration.\n\n#### Advantages\n\n- **Higher Throughput**: GPU is never idle waiting for batch to complete\n- **Lower Latency**: Short requests don't wait for long ones\n- **Better Memory Utilization**: Dynamic allocation based on actual needs\n\n### Prefill Batching\n\nPrefill operations (processing new tokens) and decode operations (generating new tokens) have different characteristics. The scheduler can:\n\n1. **Combined Batching**: Mix prefill and decode in same batch\n2. **Separate Batching**: Process prefill and decode in distinct batches\n3. **Chunked Prefill**: Split large prefill requests into smaller chunks\n\n### Chunked Prefill\n\nWhen `enable_chunked_prefill` is enabled, large prefill requests are split into smaller chunks to:\n\n- Reduce memory pressure\n- Allow shorter requests to be scheduled faster\n- Improve fairness between requests of different lengths\n\n资料来源：[vllm/v1/core/sched/scheduler.py]()\n\n## Request Processing Flow\n\n### Full Request Lifecycle\n\n```mermaid\nsequenceDiagram\n    participant Client\n    participant API\n    participant Queue\n    participant Scheduler\n    participant Engine\n    participant GPU\n    \n    Client->>API: Submit Request\n    API->>API: Validate & Tokenize\n    API->>Queue: Enqueue Request\n    Scheduler->>Queue: Dequeue Requests\n    Scheduler->>Scheduler: Evaluate Batching\n    Scheduler->>Engine: Dispatch Batch\n    Engine->>GPU: Execute Forward Pass\n    GPU-->>Engine: Output Tensors\n    Engine-->>Scheduler: Update State\n    Scheduler->>Queue: Requeue if Needed\n    Scheduler->>API: Stream Tokens\n    API-->>Client: Response Stream\n```\n\n### Scheduling Iteration\n\nEach scheduling iteration follows these steps:\n\n1. **Request Evaluation**: Scan all pending requests for eligibility\n2. **Resource Calculation**: Determine available GPU memory\n3. **Batch Composition**: Select requests based on scheduling policy\n4. **Chunk Assignment**: Divide prefill requests if needed\n5. **Batch Dispatch**: Send batch to execution engine\n6. **State Update**: Update request states and metrics\n\n## Configuration Example\n\n```python\nfrom vllm.config import SchedulerConfig\n\nconfig = SchedulerConfig(\n    max_num_seqs=256,\n    max_num_batched_tokens=8192,\n    max_model_len=32768,\n    enable_chunked_prefill=True,\n    gpu_memory_utilization=0.9,\n)\n```\n\n## Performance Considerations\n\n### Memory Management\n\nThe scheduler must balance memory allocation between:\n\n- **KV Cache**: Storing attention key-value pairs\n- **Model Weights**: The LLM parameters (typically pre-loaded)\n- **Activation Memory**: Temporary tensors during computation\n\n### Latency vs Throughput\n\nThe scheduling configuration affects the latency-throughput tradeoff:\n\n| Setting | Effect |\n|---------|--------|\n| Smaller `max_num_seqs` | Lower latency, lower throughput |\n| Larger `max_num_batched_tokens` | Higher throughput, variable latency |\n| `enable_chunked_prefill=True` | Better latency fairness |\n\n### Preemption\n\nWhen GPU memory is insufficient for incoming requests, the scheduler may preempt existing requests to free memory. 
Preempted requests are returned to the queue and rescheduled later.

## Integration with Execution Engine

The scheduler interfaces with the execution engine through a well-defined API:

- **schedule()**: Main entry point for scheduling decisions
- **add_request()**: Register new inference request
- **abort_request()**: Cancel running request
- **update_from_output()**: Process execution results

The execution engine receives scheduled batches and returns completed or paused requests, allowing the scheduler to update its internal state and make subsequent scheduling decisions.

## Related Components

For a complete understanding of vLLM's request processing, also refer to:

- **Engine**: Coordinates between scheduler and model execution
- **Worker**: Executes model operations on GPU
- **Cache**: KV cache management and allocation
- **Sampling Parameters**: Sampling configuration affecting scheduling

## Summary

The Scheduling and Request Processing system is fundamental to vLLM's ability to serve large language models efficiently. By employing continuous batching, intelligent memory management, and flexible scheduling policies, vLLM achieves high throughput while maintaining low latency for diverse workloads. The modular design with separate scheduler, queue, and configuration components allows for fine-tuned control over serving behavior.

---

<a id='page-6'></a>

## PagedAttention and KV Cache Management

### Related Pages

Related topics: [Scheduling and Request Processing](#page-5), [Distributed Inference and Parallelism](#page-9), [Attention Backends and Kernels](#page-7)

<details>
<summary>Related Source Files</summary>

The following source files were used to generate this page:

- [csrc/attention/paged_attention_v1.cu](https://github.com/vllm-project/vllm/blob/main/csrc/attention/paged_attention_v1.cu)
- [csrc/attention/paged_attention_v2.cu](https://github.com/vllm-project/vllm/blob/main/csrc/attention/paged_attention_v2.cu)
- [vllm/v1/core/kv_cache_manager.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/kv_cache_manager.py)
- [vllm/v1/core/block_pool.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/core/block_pool.py)
- [docs/design/paged_attention.md](https://github.com/vllm-project/vllm/blob/main/docs/design/paged_attention.md)
</details>

# PagedAttention and KV Cache Management

## Overview

PagedAttention is a novel attention mechanism that enables efficient virtual-memory-style management of the Key-Value (KV) cache in large language model (LLM) inference. Inspired by the paging technique used for memory management in operating systems, PagedAttention divides the KV cache into fixed-size "pages" that can be flexibly allocated and managed, eliminating the need for contiguous memory allocation.

The KV cache stores the key and value tensors from attention computation for each token position. During autoregressive decoding, this cache grows dynamically as new tokens are generated. 
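For a rough sense of scale, the snippet below applies the per-token memory formula given later on this page to a hypothetical model shape; the numbers are illustrative rather than taken from any specific configuration.

```python
# Hypothetical decoder shape; dtype_bytes = 2 for an FP16/BF16 KV cache.
num_layers, num_kv_heads, head_dim, dtype_bytes = 32, 8, 128, 2

# Both K and V are cached, hence the factor of 2.
bytes_per_token = 2 * num_layers * num_kv_heads * head_dim * dtype_bytes
print(bytes_per_token)                  # 131072 bytes, i.e. 128 KiB per token
print(4096 * bytes_per_token / 2**30)   # ~0.5 GiB for one 4096-token sequence
```

At that rate, a handful of long concurrent sequences already consume gigabytes of GPU memory, which is why wasted or fragmented cache space is so costly.
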
Traditional LLM serving systems allocate contiguous memory blocks for the KV cache, leading to significant memory waste due to internal and external fragmentation when handling variable-length sequences and multi-user workloads.\n\nvLLM's PagedAttention implementation provides:\n\n- **Memory efficiency**: Eliminates fragmentation by using non-contiguous page-based allocation\n- **Flexible batching**: Supports arbitrary sequence lengths and concurrent requests\n- **Dynamic memory management**: Allocates cache pages on-demand during generation\n- **GPU memory optimization**: Maximizes GPU memory utilization for higher throughput\n\n## Architecture\n\n### High-Level System Design\n\n```mermaid\ngraph TD\n    subgraph \"Application Layer\"\n        Req[Inference Request]\n        Sched[Scheduler]\n    end\n    \n    subgraph \"Memory Management Layer\"\n        KVM[KV Cache Manager]\n        BP[Block Pool]\n    end\n    \n    subgraph \"GPU Memory Layer\"\n        GPU[GPU Memory]\n        Pages[KV Cache Pages]\n    end\n    \n    Req --> Sched\n    Sched --> KVM\n    KVM --> BP\n    BP --> GPU\n    Pages -.>|Physical Memory| GPU\n```\n\n### PagedAttention Version Comparison\n\nvLLM implements two versions of PagedAttention with different performance characteristics:\n\n| Aspect | PagedAttention V1 | PagedAttention V2 |\n|--------|-------------------|-------------------|\n| **Kernel Type** | Fused attention kernels | Optimized fused kernels |\n| **Memory Access** | Standard page table lookup | Enhanced page table optimization |\n| **Performance** | Baseline optimized | ~2.2x speedup over V1 |\n| **Use Case** | General purpose | Production workloads |\n| **Implementation** | `paged_attention_v1.cu` | `paged_attention_v2.cu` |\n\n资料来源：[csrc/attention/paged_attention_v1.cu:1-100](), [csrc/attention/paged_attention_v2.cu:1-100](), [docs/design/paged_attention.md:1-50]()\n\n## KV Cache Manager\n\nThe KV Cache Manager (`kv_cache_manager.py`) is the core component responsible for tracking and managing the allocation of KV cache pages across all in-flight sequences.\n\n### Responsibilities\n\n| Responsibility | Description |\n|----------------|-------------|\n| **Block Allocation** | Allocates and deallocates cache blocks as sequences grow or complete |\n| **Reference Counting** | Tracks how many sequences reference each physical block |\n| **Page Table Management** | Maintains virtual-to-physical page mappings per sequence |\n| **Cache Eviction** | Handles cache eviction when memory pressure occurs |\n\n### Key Data Structures\n\n```python\n# Simplified representation of block metadata\nclass Block:\n    block_id: int          # Physical block identifier\n    page_indices: List[int]  # Virtual page indices mapping\n    ref_count: int         # Number of sequences referencing this block\n    is_computed: bool      # Whether this block has computed KV cache\n```\n\n### Block Allocation Workflow\n\n```mermaid\nsequenceDiagram\n    participant Scheduler\n    participant KVM as KV Cache Manager\n    participant BP as Block Pool\n    participant GPU as GPU Memory\n    \n    Scheduler->>KVM: allocate_blocks(sequence_id, num_tokens)\n    KVM->>BP: reserve_blocks(count)\n    BP->>GPU: allocate_contiguous_pages\n    GPU-->>BP: block_handles\n    BP-->>KVM: allocated_blocks\n    KVM-->>Scheduler: block_table\n```\n\n资料来源：[vllm/v1/core/kv_cache_manager.py:1-150](), [vllm/v1/core/block_pool.py:1-100]()\n\n## Block Pool\n\nThe Block Pool (`block_pool.py`) manages the physical memory allocation of KV cache pages on 
GPU memory.\n\n### Memory Organization\n\n| Parameter | Description | Typical Value |\n|-----------|-------------|---------------|\n| `block_size` | Number of tokens per cache block | 16 tokens |\n| `num_blocks` | Total number of available blocks | Dynamic based on GPU memory |\n| `num_layers` | Number of attention layers | Model-dependent (e.g., 32-80) |\n| `num_kv_heads` | Number of key/value attention heads | Model-dependent |\n| `head_dim` | Dimension of each attention head | 128 or 256 |\n\n### Block Pool Operations\n\n```mermaid\ngraph LR\n    subgraph \"Allocation States\"\n        Free[Free Blocks]\n        Used[Used Blocks]\n        Partial[Partially Filled]\n    end\n    \n    Allocate -->|Allocate Block| Free\n    Allocate -->|Release Block| Used\n    Used -->|Free| Free\n    Partial -->|Append Token| Used\n```\n\nThe block pool maintains three primary states:\n\n1. **Free Blocks**: Available for immediate allocation\n2. **Used Blocks**: Currently assigned to active sequences\n3. **Partially Filled Blocks**: Blocks with available slots for new tokens\n\n资料来源：[vllm/v1/core/block_pool.py:1-100]()\n\n## PagedAttention Kernel Implementation\n\n### CUDA Kernel Architecture\n\nBoth PagedAttention V1 and V2 are implemented as optimized CUDA kernels that handle attention computation with page-table-based memory access.\n\n#### V1 Kernel Flow\n\n```mermaid\ngraph TD\n    A[Load Query Tokens] --> B[Get Block Table Entry]\n    B --> C[Load K/V from Pages]\n    C --> D[Compute Attention Scores]\n    D --> E[Write Output to GEMM]\n```\n\n#### V2 Kernel Optimizations\n\nPagedAttention V2 introduces several optimizations:\n\n- **Reduced Page Table Lookups**: Consolidates multiple lookups into single operations\n- **Improved Memory Coalescing**: Better memory access patterns for KV cache\n- **Warp-Level Primitives**: Utilizes warp-level reductions for faster softmax computation\n- **Fused Operations**: Combines multiple operations into single kernel launches\n\n资料来源：[csrc/attention/paged_attention_v1.cu:1-200](), [csrc/attention/paged_attention_v2.cu:1-200]()\n\n### Kernel Parameters\n\n| Parameter | Description | Range |\n|-----------|-------------|-------|\n| `THREADS_PER_BLOCK` | CUDA threads per block | 128-256 |\n| `BLOCK_SIZE` | Tokens per block | 16 |\n| `CACHE_BLOCK_SIZE` | Bytes per cache block | Dynamic |\n| `num_heads` | Total query heads | Model-dependent |\n| `head_dim` | Attention head dimension | 128/256 |\n\n## Page Table Management\n\n### Virtual-to-Physical Mapping\n\nEach sequence maintains a page table that maps virtual token positions to physical cache blocks:\n\n```python\n# Virtual page table structure\npage_table = [\n    physical_block_id,  # virtual position 0-15\n    physical_block_id,  # virtual position 16-31\n    physical_block_id,  # virtual position 32-47\n    ...\n]\n```\n\n### Block Table Structure\n\n```mermaid\ngraph TD\n    subgraph \"Sequence 1\"\n        S1_VP1[Virtual Page 0] --> S1_PP1[Physical Block 5]\n        S1_VP2[Virtual Page 1] --> S1_PP2[Physical Block 12]\n        S1_VP3[Virtual Page 2] --> S1_PP3[Physical Block 3]\n    end\n    \n    subgraph \"Sequence 2\"\n        S2_VP1[Virtual Page 0] --> S2_PP1[Physical Block 5]\n        S2_VP2[Virtual Page 1] --> S2_PP2[Physical Block 7]\n    end\n```\n\nThe page table allows:\n- **Non-contiguous storage**: Physical blocks need not be contiguous\n- **Shared blocks**: Multiple sequences can share the same physical block (for prefixes)\n- **Dynamic growth**: New pages allocated as sequence 
extends\n\n资料来源：[vllm/v1/core/kv_cache_manager.py:100-200](), [docs/design/paged_attention.md:50-150]()\n\n## Memory Management Strategies\n\n### Allocation Policy\n\nvLLM uses a dynamic allocation policy that balances memory efficiency and allocation overhead:\n\n| Strategy | Description | Trade-off |\n|----------|-------------|------------|\n| **On-demand** | Allocate blocks as tokens are generated | Lower memory waste, higher overhead |\n| **Pre-allocation** | Reserve blocks when sequence starts | Lower overhead, potential waste |\n| **Hybrid** | Pre-allocate prefix, on-demand for new tokens | Balanced approach |\n\n### Reference Counting\n\nReference counting prevents premature block deallocation:\n\n1. **Initial allocation**: Reference count = 1\n2. **Fork/continue**: Reference count incremented\n3. **Completion**: Reference count decremented\n4. **Free block**: When count reaches 0\n\n### Memory Reclamation\n\nWhen GPU memory is exhausted:\n\n```mermaid\ngraph TD\n    A[Memory Pressure Detected] --> B{Sequence Can Evict?}\n    B -->|Yes| C[Evict Cold Blocks]\n    B -->|No| D[Wait or Reject Request]\n    C --> E[Update Page Tables]\n    E --> F[Allocate New Blocks]\n```\n\n资料来源：[vllm/v1/core/kv_cache_manager.py:200-300](), [vllm/v1/core/block_pool.py:100-200]()\n\n## Integration with Scheduler\n\n### Request Lifecycle\n\n```mermaid\nstateDiagram-v2\n    [*] --> Received: New Request\n    Received --> Scheduled: Add to Queue\n    Scheduled --> Allocating: Acquire Blocks\n    Allocating --> Running: Blocks Available\n    Running --> Running: Generate Token\n    Running --> Completed: EOS Token\n    Completed --> [*]: Free Blocks\n    Running --> Waiting: Block Unavailable\n    Waiting --> Running: Blocks Acquired\n```\n\n### Scheduling Integration Points\n\n| Stage | KV Cache Interaction |\n|-------|---------------------|\n| **Request Arrival** | Pre-allocate blocks for known prefix length |\n| **Token Generation** | Allocate new block when current fills up |\n| **Sequence Completion** | Release all associated blocks |\n| **Prefix Caching** | Share blocks with identical prefixes |\n\n## Configuration Options\n\n### Server Configuration\n\n```bash\nvllm serve <model> \\\n    --block-size 16 \\\n    --num-gpu-blocks-override 1000 \\\n    --gpu-memory-utilization 0.9 \\\n    --max-num-batched-tokens 8192\n```\n\n### Key Parameters\n\n| Parameter | Description | Default |\n|-----------|-------------|---------|\n| `block_size` | Number of tokens per KV cache block | 16 |\n| `gpu_memory_utilization` | Fraction of GPU memory for KV cache | 0.9 |\n| `num_gpu_blocks_override` | Override auto-computed block count | Auto |\n| `max_num_batched_tokens` | Maximum tokens in a single batch | Dynamic |\n\n### Memory Calculation\n\n```\ntotal_cache_memory = num_blocks × block_size × num_layers × 2 × num_kv_heads × head_dim × dtype_size\n```\n\nWhere:\n- `num_blocks` = GPU memory × utilization / per_block_memory\n- `2` accounts for both K and V caches\n- `dtype_size` = 2 bytes for FP16, 1 byte for INT8, etc.\n\n资料来源：[docs/design/paged_attention.md:150-250]()\n\n## Performance Characteristics\n\n### Throughput Improvements\n\nPagedAttention enables significant performance improvements compared to traditional contiguous allocation:\n\n| Metric | Traditional | PagedAttention V1 | PagedAttention V2 |\n|--------|-------------|-------------------|-------------------|\n| **Memory Waste** | 30-60% | <5% | <5% |\n| **Throughput** | Baseline | ~2.1x | ~2.2x |\n| **BS=1 Latency** | Baseline | ~1.9x | ~2.0x |\n| 
**BS=16+ Latency** | Baseline | ~2.0x | ~2.2x |\n\n### Scalability\n\nThe system scales efficiently with:\n\n- **Longer sequences**: O(seq_len) memory, no fragmentation\n- **More concurrent requests**: Block sharing for shared prefixes\n- **Larger models**: Better memory utilization per GPU\n\n资料来源：[csrc/attention/paged_attention_v2.cu:200-300](), [docs/design/paged_attention.md:250-350]()\n\n## Implementation Details\n\n### Block Metadata Tracking\n\n```python\n# Core block metadata structure\n@dataclass\nclass Block:\n    block_id: int\n    device: Device\n    block_size: int\n    num_tokens: int = 0\n    \n    # Physical location\n    physical_block_id: Optional[int] = None\n    \n    # Content tracking\n    content_hash: Optional[int] = None\n    computed: bool = False\n    \n    # Reference counting for shared blocks\n    ref_count: int = 0\n```\n\n### Attention Computation Flow\n\n```mermaid\ngraph TD\n    subgraph \"Pre-computation\"\n        Q[Query Tensors] --> QS[Query Split]\n        K[Key Tensors] --> KS[Key Split]\n        V[Value Tensors] --> VS[Value Split]\n    end\n    \n    subgraph \"Paged Attention\"\n        QS --> AL[Attention Layer]\n        KS --> AL\n        VS --> AL\n        AL --> PT[Page Table Lookup]\n    end\n    \n    subgraph \"Output\"\n        PT --> OUT[Attention Output]\n        OUT --> SOFTMAX[Softmax]\n        SOFTMAX --> GEMM[Final GEMM]\n    end\n```\n\n## Best Practices\n\n### Memory Configuration\n\n1. **Set appropriate GPU memory utilization** based on other memory needs (model weights, activations)\n2. **Adjust block size** for typical sequence lengths (larger blocks = less overhead for long sequences)\n3. **Monitor fragmentation** using vLLM metrics\n\n### Request Optimization\n\n1. **Use consistent prefixes** to enable block sharing across requests\n2. **Batch similar requests** to maximize cache hit rates\n3. 
**Set appropriate max sequence length** to avoid excessive block allocation

### Debugging

```bash
# Enable verbose logging
export VLLM_LOGGING_LEVEL=DEBUG

# Check server metrics, including KV cache usage (Prometheus format)
curl http://localhost:8000/metrics
```

## Related Components

| Component | File | Role |
|-----------|------|------|
| Attention Backend | `vllm/v1/attention/backends/` | Pluggable attention implementations |
| Scheduler | `vllm/v1/core/sched/scheduler.py` | Coordinates cache allocation with scheduling |
| Model Runner | `vllm/v1/worker/gpu_model_runner.py` | Executes attention with managed KV cache |
| Worker | `vllm/v1/worker/gpu_worker.py` | GPU-side cache management |

## References

- vLLM Paper: ["Efficient Memory Management for Large Language Model Serving with PagedAttention"](https://arxiv.org/abs/2309.06180)
- Original Implementation: [csrc/attention/paged_attention_v1.cu]()
- Optimized Implementation: [csrc/attention/paged_attention_v2.cu]()
- Design Documentation: [docs/design/paged_attention.md]()
- KV Cache Manager: [vllm/v1/core/kv_cache_manager.py]()
- Block Pool: [vllm/v1/core/block_pool.py]()

---

<a id='page-7'></a>

## Attention Backends and Kernels

### Related Pages

Related topics: [PagedAttention and KV Cache Management](#page-6), [Quantization Support](#page-8)

<details>
<summary>Related Source Files</summary>

The following source files were used to generate this page:

- [vllm/v1/attention/backends/flash_attn.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/flash_attn.py)
- [vllm/v1/attention/backends/flashinfer.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/flashinfer.py)
- [vllm/v1/attention/backends/mla/flashmla.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/mla/flashmla.py)
- [vllm/v1/attention/backends/registry.py](https://github.com/vllm-project/vllm/blob/main/vllm/v1/attention/backends/registry.py)
- [docs/design/attention_backends.md](https://github.com/vllm-project/vllm/blob/main/docs/design/attention_backends.md)
</details>

# Attention Backends and Kernels

## Overview

The Attention Backends and Kernels system in vLLM provides a pluggable, hardware-accelerated implementation of attention mechanisms for Large Language Model (LLM) inference. This abstraction layer enables vLLM to leverage different optimized attention implementations (FlashAttention, FlashInfer, FlashMLA) depending on hardware capabilities and model requirements, while maintaining a unified interface for the rest of the engine.

## Architecture

### High-Level Design

vLLM implements a backend registry pattern that allows runtime selection of the optimal attention implementation. 
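As a simplified illustration of that pattern, the sketch below maps backend names to backend classes and resolves one by name; the names are hypothetical and do not reproduce vLLM's actual registry API in `vllm/v1/attention/backends/registry.py`.

```python
from typing import Callable, Dict, Type

# Hypothetical registry: backend name -> backend class.
ATTENTION_BACKENDS: Dict[str, Type] = {}

def register_backend(name: str) -> Callable[[type], type]:
    def decorator(cls: type) -> type:
        ATTENTION_BACKENDS[name] = cls
        return cls
    return decorator

@register_backend("FLASH_ATTN")
class FlashAttentionBackendStub:
    """Placeholder standing in for a real backend implementation."""

def get_backend(name: str) -> type:
    try:
        return ATTENTION_BACKENDS[name]
    except KeyError:
        raise ValueError(f"Unknown attention backend: {name}") from None
```

The registration and lookup details in vLLM differ, but the idea is the same: resolve a backend name to an implementation class once, at engine initialization, and use that class for the rest of the run.
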
The attention system is designed to support both the legacy v0 engine and the optimized v1 engine architecture.\n\n```mermaid\ngraph TD\n    A[Attention Interface] --> B[Attention Backends Registry]\n    B --> C[FlashAttention Backend]\n    B --> D[FlashInfer Backend]\n    B --> E[FlashMLA Backend]\n    C --> F[CUDA Kernels]\n    D --> G[Template-based Kernels]\n    E --> H[MLA-optimized Kernels]\n    \n    I[Model Request] --> J[Scheduler]\n    J --> K[Attention Layer]\n    K --> A\n```\n\n### Backend Selection Flow\n\nThe registry-based architecture enables automatic backend selection based on hardware and configuration:\n\n```mermaid\ngraph TD\n    A[Engine Initialization] --> B{Check user-specified backend}\n    B -->|Specified| C[Validate backend compatibility]\n    B -->|Auto| D{Detect GPU Architecture}\n    D -->|H100/H200| E[Select FlashAttention3]\n    D -->|A100/A10| F[Select FlashAttention2]\n    D -->|Other| G[Select FlashAttention2]\n    C -->|Valid| H[Initialize Backend]\n    C -->|Invalid| I[Raise Configuration Error]\n    E --> H\n    F --> H\n    G --> H\n```\n\n## Attention Backend Components\n\n### Backend Registry\n\nThe `registry.py` module provides the central factory for attention backend selection:\n\n| Component | Responsibility |\n|-----------|----------------|\n| `AttentionBackend` | Abstract base class defining the backend interface |\n| `get_available_backends()` | Returns list of compiled/available backends |\n| `get_backend(name)` | Retrieves a specific backend by name |\n| `AttentionImpl` | Concrete implementation wrapper per backend |\n\n### FlashAttention Backend\n\nLocated at `vllm/v1/attention/backends/flash_attn.py`, this backend provides:\n\n- FlashAttention 2/3 optimized CUDA kernels\n- Support for head dimensions 64, 80, 96, 128, 160, 192, 256\n- Paged attention integration\n- Cross-attention support for encoder-decoder models\n\n**Key Features:**\n\n| Feature | Description |\n|---------|-------------|\n| Fused kernels | Combines attention computation steps |\n| Dynamic persistent memory | Reuses KV cache blocks efficiently |\n| Strided bias support | Enables sliding window attention |\n\n### FlashInfer Backend\n\nLocated at `vllm/v1/attention/backends/flashinfer.py`, this backend offers:\n\n- Template-based kernel generation for flexibility\n- Improved performance for specific head dimensions\n- Better integration with vLLM's block manager\n- Support for custom attention patterns\n\n### FlashMLA Backend\n\nLocated at `vllm/v1/attention/backends/mla/flashmla.py`, this backend is optimized for:\n\n- **Multi-head Latent Attention (MLA)** architectures\n- Low-rank KV cache compression\n- Reduced memory bandwidth usage\n- Optimized for DeepSeek-style models\n\n## Attention Interface\n\n### Core Abstraction\n\nAll attention backends implement a common interface defined by the `Attention` class:\n\n```python\nclass Attention:\n    def __init__(\n        self,\n        num_heads: int,\n        head_size: int,\n        scale: float,\n        num_kv_heads: int,\n        alibi_slopes: Optional[List[float]],\n        cache_config: Optional[CacheConfig],\n        block_size: int,\n    )\n    \n    def forward(\n        self,\n        query: torch.Tensor,\n        key: torch.Tensor,\n        value: torch.Tensor,\n        kv_cache: Optional[torch.Tensor],\n        attn_metadata: AttentionMetadata,\n    ) -> torch.Tensor\n```\n\n### Attention Metadata\n\nThe `AttentionMetadata` structure carries runtime information required for attention computation:\n\n| Field | 
Type | Purpose |\n|-------|------|---------|\n| `seq_lens` | List[int] | Sequence lengths for each request |\n| `max_seq_len` | int | Maximum sequence length in batch |\n| `block_tables` | torch.Tensor | Paged attention block mappings |\n| `seq_start_idx` | int | Starting index in output buffer |\n\n## Paged Attention Integration\n\nvLLM's attention system integrates tightly with paged memory management:\n\n```mermaid\ngraph LR\n    A[Logical KV Cache] --> B[Physical KV Blocks]\n    C[Query Tensors] --> D[Block Lookup]\n    D --> E[Attention Computation]\n    F[Block Table] --> D\n    B --> E\n    E --> G[Output Tensors]\n```\n\n### Block Table Structure\n\n| Block Index | Physical Address | Valid Length |\n|-------------|------------------|--------------|\n| 0 | 15 | 16 |\n| 1 | 23 | 16 |\n| 2 | 7 | 16 |\n| 3 | -1 (pending) | 0 |\n\n## Backend Configuration\n\n### Runtime Selection\n\nBackends can be selected through multiple mechanisms:\n\n| Method | Priority | Example |\n|--------|----------|---------|\n| Environment variable | Lowest | `VLLM_ATTENTION_BACKEND=FLASHINFER` |\n| Model config | Medium | `\"attention_backend\": \"flash_attn\"` |\n| CLI argument | Highest | `--attention-backend flashinfer` |\n\n### Configuration Parameters\n\n| Parameter | Backend | Description |\n|-----------|---------|-------------|\n| `head_size` | All | Dimension of each attention head |\n| `num_kv_heads` | All | Number of key/value heads (for GQA) |\n| `scale` | All | Attention scaling factor |\n| `sliding_window` | FlashAttn | Sliding window size |\n| `page_size` | FlashInfer | KV cache block size |\n\n## Performance Characteristics\n\n### Memory Efficiency\n\n| Backend | KV Cache Format | Memory Overhead |\n|---------|-----------------|------------------|\n| FlashAttention | Standard | Baseline |\n| FlashInfer | Blocked | Similar to FlashAttn |\n| FlashMLA | Low-rank | 50-70% reduction |\n\n### Throughput Optimization\n\n- **Kernel Fusion**: Combines multiple operations to reduce memory bandwidth\n- **Persistent RNN**: Reuses computation across decode steps\n- **Async Execution**: Overlaps attention with other operations\n\n## Implementation Details\n\n### FlashAttention Kernel Flow\n\n```mermaid\nsequenceDiagram\n    participant Q as Query\n    participant K as Key\n    participant V as Value\n    participant Kernel as FlashAttn Kernel\n    participant Cache as KV Cache\n    \n    Q->>Kernel: Load Q tiles\n    K->>Kernel: Load K tiles\n    V->>Kernel: Load V tiles\n    Kernel->>Cache: Update (if write)\n    Kernel->>Kernel: Compute attention scores\n    Kernel->>Cache: Read (if read)\n    Kernel->>Kernel: softmax & scale\n    Kernel-->>Q: Output attention\n```\n\n### Backend Initialization Sequence\n\n```mermaid\ngraph TD\n    A[EngineArgs parse] --> B[Attention backend selection]\n    B --> C[Backend registry lookup]\n    C --> D[Load backend module]\n    D --> E[Initialize CUDA streams]\n    E --> F[Allocate persistent buffers]\n    F --> G[Register attention layers]\n    G --> H[Ready for inference]\n```\n\n## Extending Attention Backends\n\nTo implement a custom attention backend:\n\n1. **Create backend class**: Inherit from `AttentionBackend`\n2. **Implement required methods**: `get_name()`, `get_impl()`\n3. **Register backend**: Add to `ATTENTION_BACKENDS` registry\n4. 
**Implement kernels**: Write CUDA/C++ kernels or wrap existing libraries\n\n```python\nclass CustomAttentionBackend(AttentionBackend):\n    @staticmethod\n    def get_name() -> str:\n        return \"custom_attention\"\n    \n    @staticmethod\n    def get_impl() -> AttentionImpl:\n        return CustomAttentionImpl\n```\n\n## Troubleshooting\n\n### Common Issues\n\n| Issue | Cause | Solution |\n|-------|-------|----------|\n| CUDA out of memory | Large batch/sequence | Reduce `gpu_memory_utilization` |\n| Incorrect outputs | Wrong backend selected | Verify with `--attention-backend` flag |\n| Kernel launch failure | Unsupported head size | Use supported head dimensions |\n| Slow inference | Suboptimal backend | Benchmark available backends |\n\n### Debugging Tips\n\n- Set `VLLM_LOGGING_LEVEL=DEBUG` for attention kernel timing\n- Use `VLLM_ATTENTION_BACKEND=FLASH_ATTN` to force specific backend\n- Check `nvidia-smi` for kernel execution times\n\n## Summary\n\nThe Attention Backends and Kernels system provides vLLM's computational core for transformer attention. Through the registry pattern, it enables seamless switching between optimized implementations while maintaining a stable interface for the rest of the engine. The pluggable architecture supports both established backends (FlashAttention, FlashInfer) and specialized implementations (FlashMLA) optimized for specific model architectures.\n\n---\n\n<a id='page-8'></a>\n\n## Quantization Support\n\n### 相关页面\n\n相关主题：[Model Architecture Support](#page-10), [Attention Backends and Kernels](#page-7)\n\n<details>\n<summary>Related Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/model_executor/layers/quantization/fp8.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/fp8.py)\n- [vllm/model_executor/layers/quantization/base_config.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/base_config.py)\n- [vllm/model_executor/layers/quantization/gguf.py](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/gguf.py)\n- [docs/features/quantization/README.md](https://github.com/vllm-project/vllm/blob/main/docs/features/quantization/README.md)\n- [csrc/quantization](https://github.com/vllm-project/vllm/blob/main/csrc/quantization)\n</details>\n\n# Quantization Support\n\n## Overview\n\nvLLM provides comprehensive quantization support to reduce model memory footprint and accelerate inference through various quantization schemes. Quantization compresses model weights from higher precision (typically FP16/BF16) to lower precision formats (such as INT8, INT4, or FP8), enabling larger models to run on limited GPU resources while maintaining acceptable accuracy.\n\nThe quantization system in vLLM is designed with a modular, extensible architecture that supports multiple quantization methods through a common abstraction layer. 
This allows users to easily switch between different quantization schemes or implement custom quantization strategies.\n\n资料来源：[docs/features/quantization/README.md]()\n\n## Architecture\n\n### Quantization System Components\n\nThe quantization system consists of several interconnected components:\n\n```mermaid\ngraph TD\n    A[Model Loading] --> B[QuantizationConfig]\n    B --> C[QuantizationMethod]\n    C --> D[Quantized Linear Layers]\n    C --> E[Quantized Embedding Layers]\n    D --> F[CUDA/ROCm Kernels]\n    E --> F\n    G[Weight Loading] --> H[Pre-quantized Weights]\n    G --> I[On-the-fly Quantization]\n```\n\n### Quantization Base Architecture\n\nThe base quantization architecture defines a common interface that all quantization methods must implement:\n\n```mermaid\nclassDiagram\n    class QuantizationConfig {\n        +get_supported_methods() List[QuantizationMethods]\n        +get_override_quant_config() Optional[dict]\n        +get_quant_config() dict\n        +verify_quant_config() None\n    }\n    \n    class QuantizationMethods {\n        <<enumeration>>\n        FP8\n        GGUF\n        AWQ\n        GPTQ\n        QUANTITY\n    }\n    \n    class QuantizedLinear {\n        <<interface>>\n        +create_weights()\n        +apply_weights()\n    }\n    \n    QuantizationConfig --> QuantizationMethods\n    QuantizationMethods --> QuantizedLinear\n```\n\n资料来源：[vllm/model_executor/layers/quantization/base_config.py]()\n\n### Layer Structure\n\nEach quantization implementation defines its own quantized layer classes:\n\n| Layer Type | Purpose | Quantization Scope |\n|------------|---------|-------------------|\n| `QuantizedLinear` | Matrix multiplication with quantized weights | Weight-only or Activation+Weight |\n| `QuantizedEmbedding` | Lookup table with quantized weights | Weight-only |\n| `QuantizedMoE` | Mixture-of-Experts with quantized components | Per-expert quantization |\n\n资料来源：[vllm/model_executor/layers/quantization/fp8.py]()\n\n## Supported Quantization Methods\n\n### FP8 (8-bit Floating Point)\n\nFP8 quantization uses 8-bit floating point representation with two formats:\n\n| Format | Exponent Bits | Mantissa Bits | Use Case |\n|--------|--------------|---------------|----------|\n| E4M3 | 4 | 3 | Activations and weights |\n| E5M2 | 5 | 2 | Gradients and optimizer states |\n\nFP8 quantization is particularly well-supported in vLLM with optimized CUDA kernels for inference acceleration.\n\n资料来源：[vllm/model_executor/layers/quantization/fp8.py]()\n\n### GGUF (GPT-Generated Unified Format)\n\nGGUF is a quantized model format commonly used with llama.cpp. 
vLLM supports loading GGUF-quantized models directly:\n\n```python\n# Load GGUF quantized model\nllm = LLM(model=\"unsloth/Qwen3-0.6B-GGUF:Q4_K_M\")\n```\n\nGGUF supports multiple quantization levels:\n\n| Quantization Type | Bits per Parameter | Description |\n|-------------------|---------------------|-------------|\n| Q2_K | ~2.5 | 2-bit quantization with 4-bit key-values |\n| Q3_K | ~3.5 | 3-bit quantization with 4-bit key-values |\n| Q4_K | ~4.5 | 4-bit quantization with 8-bit key-values |\n| Q5_K | ~5.5 | 5-bit quantization with 8-bit key-values |\n| Q6_K | ~6.5 | 6-bit quantization |\n| Q8_0 | ~8.0 | 8-bit quantization (baseline) |\n\n资料来源：[vllm/model_executor/layers/quantization/gguf.py]()\n\n### AWQ (Activation-Aware Weight Quantization)\n\nAWQ identifies weights with significant activation contributions and preserves them at higher precision while quantizing others aggressively.\n\n### GPTQ (Generative Pre-trained Transformer Quantization)\n\nGPTQ performs post-training quantization with optional layer-wise precision adjustment and GPU optimization.\n\n## Quantization Configuration\n\n### Configuration Parameters\n\n| Parameter | Type | Description | Default |\n|-----------|------|-------------|---------|\n| `quantization` | str | Quantization method name | None |\n| `quantization_param_path` | str | Path to quantization parameters file | None |\n| `dtype` | str | Model precision (if not pre-quantized) | auto |\n| `kv_cache_dtype` | str | KV cache quantization format | auto |\n\n### Enabling Quantization\n\nQuantization is enabled through the `--quantization` CLI argument or `quantization` parameter in `AsyncEngineArgs`:\n\n```bash\nvllm serve model/path --quantization fp8\n```\n\n```python\nfrom vllm import LLM, SamplingParams\n\nllm = LLM(\n    model=\"meta-llama/Llama-2-7b-hf\",\n    quantization=\"fp8\",\n    gpu_memory_utilization=0.9\n)\n```\n\n### Mixed Quantization\n\nSome layers may remain in higher precision when quantization cannot be applied uniformly:\n\n| Layer Type | Quantization Behavior |\n|------------|----------------------|\n| Input/Output embeddings | Often kept in FP16/BF16 |\n| Output projection | May use different precision |\n| Attention softmax | Usually in FP32 for stability |\n\n## Implementation Details\n\n### Weight Loading Pipeline\n\n```mermaid\ngraph LR\n    A[Model Checkpoint] --> B{Pre-quantized?}\n    B -->|Yes| C[Load Quantized Weights]\n    B -->|No| D[Dynamic Quantization]\n    C --> E[Apply Quantization Config]\n    D --> E\n    E --> F[Initialize Quantized Layer]\n    F --> G[Verify Weight Shape]\n    G --> H[CUDA Kernel Ready]\n```\n\n### C++ Backend (cuBLAS/cutlass)\n\nHigh-performance quantization kernels are implemented in C++ and CUDA:\n\n| Component | Location | Purpose |\n|-----------|----------|---------|\n| FP8 GEMM | `csrc/quantization/fp8/` | FP8 matrix multiplication |\n| W8A8 GEMM | `csrc/quantization/fp8/` | INT8 weight, INT8 activation |\n| W4A16 GEMM | `csrc/quantization/` | INT4 weight, FP16 activation |\n| Dequantization | `csrc/quantization/` | Convert quantized to compute dtype |\n\n资料来源：[csrc/quantization]()\n\n### Quantized Linear Layer Implementation\n\nThe `QuantizedLinear` layer handles the core computation:\n\n```python\nclass QuantizedLinear(QuantizedLayer):\n    \"\"\"Base class for quantized linear layers.\"\"\"\n    \n    def __init__(\n        self,\n        input_size: int,\n        output_size: int,\n        quantization_config: QuantizationConfig,\n        bias: bool = False,\n    ):\n        
self.input_size = input_size\n        self.output_size = output_size\n        self.quantization_config = quantization_config\n        \n    def create_weights(self):\n        \"\"\"Initialize quantized weight tensors.\"\"\"\n        raise NotImplementedError\n        \n    def forward(self, input_):\n        \"\"\"Forward pass with quantized computation.\"\"\"\n        raise NotImplementedError\n```\n\n资料来源：[vllm/model_executor/layers/quantization/base_config.py]()\n\n## Usage Examples\n\n### Loading Pre-quantized Models\n\n```python\nfrom vllm import LLM\n\n# FP8 quantized model\nllm_fp8 = LLM(\n    model=\"meta-llama/Llama-2-70b-hf\",\n    quantization=\"fp8\",\n    tensor_parallel_size=4\n)\n\n# GGUF quantized model\nllm_gguf = LLM(\n    model=\"TheBloke/Llama-2-70B-Chat-GGUF\",\n    quantization=\"gguf\",\n    tokenizer=\"meta-llama/Llama-2-70b-chat\"\n)\n```\n\n### Quantization with Different Precisions\n\n```python\n# Load with specific precision settings\nllm = LLM(\n    model=\"Qwen/Qwen2.5-72B-Instruct\",\n    quantization=\"fp8\",\n    dtype=\"half\",  # Compute precision\n    kv_cache_dtype=\"fp8_e4m3\"  # KV cache precision\n)\n```\n\n### CLI Usage\n\n```bash\n# Serve with FP8 quantization\nvllm serve meta-llama/Llama-2-70b-hf \\\n    --quantization fp8 \\\n    --tensor-parallel-size 4 \\\n    --gpu-memory-utilization 0.95\n\n# Serve GGUF model directly\nvllm serve TheBloke/Llama-2-7B-Chat-GGUF:Q4_K_M\n```\n\n## Performance Considerations\n\n### Memory Reduction\n\n| Quantization | Memory Reduction | Quality Impact |\n|--------------|-------------------|----------------|\n| FP8 (E4M3) | ~50% | Minimal |\n| INT8 | ~50% | Low |\n| INT4 | ~75% | Moderate |\n| INT2 | ~87.5% | High |\n\n### Throughput Impact\n\nQuantization improves throughput through:\n\n1. **Increased batch size**: More sequences fit in GPU memory\n2. **Higher memory bandwidth utilization**: Smaller weights load faster\n3. **Accelerated compute**: INT8/FP8 operations on tensor cores\n\n### Accuracy Considerations\n\n- **Post-training quantization (PTQ)**: May introduce accuracy degradation\n- **Activation-aware methods (AWQ)**: Better preserves model capabilities\n- **Calibration**: Some methods require calibration data for optimal accuracy\n\n## Extension Points\n\n### Custom Quantization Methods\n\nTo implement a custom quantization method, extend the base classes:\n\n```python\nfrom vllm.model_executor.layers.quantization import QuantizationConfig\n\nclass CustomQuantizationConfig(QuantizationConfig):\n    \"\"\"Custom quantization configuration.\"\"\"\n    \n    @staticmethod\n    def get_name() -> str:\n        return \"custom_quant\"\n    \n    @staticmethod\n    def get_supported_methods(cls) -> list[str]:\n        return [\"weight_only\"]\n    \n    def get_quant_config(self) -> dict:\n        return {\"method\": self.quant_method}\n```\n\n### Registering Custom Quantization\n\nCustom quantizations must be registered in the quantization registry to be discoverable at runtime.\n\n## Summary\n\nvLLM's quantization support provides a flexible, extensible system for serving large language models with reduced memory footprint. 
The architecture separates concerns through well-defined interfaces, allowing seamless integration of new quantization methods while maintaining high performance through optimized CUDA kernels.\n\nKey takeaways:\n- Multiple quantization formats supported (FP8, GGUF, AWQ, GPTQ)\n- Modular architecture enables easy extension\n- Optimized C++/CUDA kernels for inference acceleration\n- Simple API through CLI and Python interface\n- Memory reduction up to 75% with INT4 quantization\n\n---\n\n<a id='page-9'></a>\n\n## Distributed Inference and Parallelism\n\n### 相关页面\n\n相关主题：[Scheduling and Request Processing](#page-5), [PagedAttention and KV Cache Management](#page-6)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/distributed/parallel_state.py](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/parallel_state.py)\n- [vllm/distributed/device_communicators/cuda_communicator.py](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/device_communicators/cuda_communicator.py)\n- [vllm/distributed/kv_transfer/kv_connector/base.py](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/kv_transfer/kv_connector/base.py)\n- [vllm/config/parallel.py](https://github.com/vllm-project/vllm/blob/main/vllm/config/parallel.py)\n- [docs/serving/parallelism_scaling.md](https://github.com/vllm-project/vllm/blob/main/docs/serving/parallelism_scaling.md)\n- [vllm/distributed/kv_transfer/kv_transfer_engine.py](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/kv_transfer/kv_transfer_engine.py)\n- [vllm/entrypoints/llm.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py)\n</details>\n\n# Distributed Inference and Parallelism\n\nvLLM provides comprehensive support for distributed inference and parallelism, enabling efficient serving of large language models across multiple GPUs and nodes. This document covers the architecture, configuration, and implementation details of vLLM's distributed computing capabilities.\n\n## Overview\n\nvLLM's distributed inference system enables horizontal scaling of LLM workloads by distributing model computation across multiple devices. The system supports multiple parallelism strategies, including tensor parallelism, pipeline parallelism, and data parallelism, along with specialized features like disaggregated prefill/decode and KV cache transfer.\n\nThe core components of distributed inference in vLLM include:\n\n| Component | Purpose |\n|-----------|---------|\n| Parallel State Manager | Coordinates process groups for distributed communication |\n| Device Communicators | Handle low-level tensor communication (NCCL, CUDA) |\n| KV Transfer System | Enables disaggregated prefill/decode architectures |\n| Configuration System | Manages parallelism parameters and device placement |\n\n## Parallelism Strategies\n\nvLLM supports three primary parallelism strategies, each addressing different aspects of distributed computation.\n\n### Tensor Parallelism (TP)\n\nTensor parallelism splits individual weight matrices across multiple GPUs, allowing computation of large matrices that would not fit in a single device's memory. 
This is particularly effective for dense layers like attention and feed-forward networks.\n\nTensor parallelism requires:\n- NVIDIA GPUs with NCCL support\n- High-bandwidth interconnects (NVLink preferred)\n- Each tensor-parallel rank requires full model weights in terms of optimizer states\n\n### Pipeline Parallelism (PP)\n\nPipeline parallelism distributes layers (stages) of the model across different GPUs or nodes. This approach reduces memory requirements per device while maintaining high GPU utilization through micro-batch pipelining.\n\n### Data Parallelism (DP)\n\nData parallelism replicates the entire model across multiple GPUs, with each replica processing different batches of requests. This is the simplest form of parallelism and scales throughput linearly with the number of replicas.\n\n## Parallel State Management\n\nThe `ParallelState` class in `vllm/distributed/parallel_state.py` is the central coordinator for distributed execution.\n\n```mermaid\ngraph TD\n    A[ParallelState] --> B[Tensor Parallel Group]\n    A --> C[Pipeline Parallel Group]\n    A --> D[Data Parallel Group]\n    A --> E[World Communicator]\n    B --> F[Rank 0, 1, 2, 3]\n    C --> G[Stage 0, Stage 1]\n    D --> H[Replica 1, Replica 2]\n```\n\n### Process Group Initialization\n\nParallel state is initialized through `init_distributed_environment()`, which creates the necessary process groups for communication.\n\n```python\n# From vllm/distributed/parallel_state.py:45-78\ndef init_distributed_environment(\n    rank: int,\n    world_size: int,\n    local_rank: int,\n    init_method: str = \"env://\",\n    backend: str = \"nccl\"\n):\n    # Initialize distributed context\n    torch.distributed.init_process_group(\n        backend=backend,\n        init_method=init_method,\n        rank=rank,\n        world_size=world_size\n    )\n```\n\n### Rank and World Size Management\n\n| Parameter | Description |\n|-----------|-------------|\n| `rank` | Unique identifier for each process in the distributed group |\n| `world_size` | Total number of processes in the distributed group |\n| `local_rank` | Rank of the process within its local node |\n\n资料来源：[vllm/distributed/parallel_state.py:45-78](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/parallel_state.py)\n\n## Device Communication\n\n### CUDA Communicator\n\nThe `CUDACommunicator` class in `vllm/distributed/device_communicators/cuda_communicator.py` provides NCCL-based communication primitives optimized for CUDA tensors.\n\n```mermaid\ngraph LR\n    A[Tensor] -->|all_reduce| B[Aggregated Tensor]\n    A -->|broadcast| C[Same Tensor on All Ranks]\n    A -->|reduce_scatter| D[Partitioned Results]\n    E[Partial Tensors] -->|all_gather| F[Complete Tensor]\n```\n\n### Supported Communication Primitives\n\n| Primitive | Function | Use Case |\n|-----------|----------|----------|\n| `all_reduce` | Reduce tensors across all ranks | Gradient synchronization in TP |\n| `broadcast` | Send tensor from one rank to all | Weight updates |\n| `all_gather` | Collect tensors from all ranks | Output aggregation |\n| `reduce_scatter` | Reduce and partition across ranks | Gradient partitioning |\n\n```python\n# From vllm/distributed/device_communicators/cuda_communicator.py:23-65\nclass CUDACommunicator:\n    def __init__(self, group: ProcessGroup):\n        self.group = group\n        self.world_size = group.size()\n        self.rank = group.rank()\n    \n    def all_reduce(self, tensor: torch.Tensor) -> torch.Tensor:\n        # NCCL all-reduce implementation\n        
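# torch.distributed.all_reduce defaults to ReduceOp.SUM and modifies the\n        # tensor in place, so every rank in the group sees the same summed result.\n        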
torch.distributed.all_reduce(tensor, group=self.group)\n        return tensor\n```\n\n资料来源：[vllm/distributed/device_communicators/cuda_communicator.py:23-65](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/device_communicators/cuda_communicator.py)\n\n### Communication Patterns in Distributed Inference\n\n```mermaid\ngraph TD\n    subgraph \"Tensor Parallel Region\"\n        A[Attention AllReduce] --> B[FFN AllReduce]\n        B --> C[AllReduce Output]\n    end\n    \n    subgraph \"Pipeline Parallel Region\"\n        D[Send Hidden States] --> E[Receive Hidden States]\n        E --> F[Backward Pass]\n    end\n    \n    subgraph \"Data Parallel Region\"\n        G[Synchronize KV Cache] --> H[Load Balance Requests]\n    end\n```\n\n## Disaggregated Prefill and Decode\n\nvLLM supports disaggregated prefill/decode architectures where prefill (initial prompt processing) and decode (token generation) stages run on separate GPU clusters. This enables independent scaling of prefill and decode resources.\n\n### KV Transfer Architecture\n\nThe KV transfer system enables sharing of KV cache between prefill and decode instances.\n\n```mermaid\ngraph LR\n    A[Prefill Instance] -->|KV Transfer| B[Shared Storage]\n    B -->|KV Load| C[Decode Instance]\n    \n    subgraph \"Prefill Process\"\n        A1[Tokenize Prompts] --> A2[Process Prefill]\n        A2 --> A3[Save KV Cache]\n    end\n    \n    subgraph \"Decode Process\"\n        C1[Load KV Cache] --> C2[Generate Tokens]\n        C2 --> C3[Streaming Output]\n    end\n```\n\n资料来源：[examples/disaggregated/example_connector/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/example_connector/README.md)\n\n### KV Connector Base\n\nThe `KVConnectorBase` class defines the interface for KV cache transfer implementations.\n\n```python\n# From vllm/distributed/kv_transfer/kv_connector/base.py:15-85\nclass KVConnectorBase(ABC):\n    def __init__(self, kv_transfer_config: KVTransferParams):\n        self.config = kv_transfer_config\n    \n    @abstractmethod\n    def load_kv_cache(\n        self,\n        requests: List[TransferJob],\n        scheduler: Any\n    ) -> None:\n        \"\"\"Load KV cache from external source during prefill\"\"\"\n        pass\n    \n    @abstractmethod\n    def save_kv_cache(\n        self,\n        blocks: List[PhysicalTokenBlock],\n        kv_pair: KVCache,\n        callback: Callable\n    ) -> TransferJob:\n        \"\"\"Save KV cache to external storage during decode\"\"\"\n        pass\n```\n\n| Method | Purpose |\n|--------|---------|\n| `load_kv_cache` | Load KV cache during prefill phase |\n| `save_kv_cache` | Persist KV cache during decode phase |\n| `profile_num_available_blocks` | Determine available transfer capacity |\n\n资料来源：[vllm/distributed/kv_transfer/kv_connector/base.py:15-85](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/kv_transfer/kv_connector/base.py)\n\n### Example Connector Implementation\n\nThe `ExampleConnector` provides a reference implementation for KV transfer:\n\n```python\n# From examples/disaggregated/example_connector/\nclass ExampleConnector(KVConnectorBase):\n    def __init__(self, kv_transfer_config: KVTransferParams):\n        super().__init__(kv_transfer_config)\n        self.local_storage = \"./local_storage\"\n```\n\nThe connector workflow:\n1. **Prefill Phase**: Process prompts and save KV state to `local_storage` directory\n2. 
**Decode Phase**: Load KV state from storage and continue generation\n\n资料来源：[examples/disaggregated/example_connector/README.md](https://github.com/vllm-project/vllm/blob/main/examples/disaggregated/example_connector/README.md)\n\n## Configuration\n\n### Parallel Configuration Parameters\n\nThe `ParallelConfig` class in `vllm/config/parallel.py` manages all parallelism settings.\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `tensor_parallel_size` | int | 1 | Number of GPUs for tensor parallelism |\n| `pipeline_parallel_size` | int | 1 | Number of pipeline stages |\n| `data_parallel_size` | int | 1 | Number of data parallel replicas |\n| `data_parallel_size_per_region` | int | - | DP size for hybrid parallelism |\n| `data_parallel_master_port` | int | 29500 | Port for DP master communication |\n| `data_parallel_master_addr` | str | - | Address for DP master |\n| `numa_aware` | bool | False | Enable NUMA-aware GPU placement |\n\n```python\n# From vllm/config/parallel.py:10-45\n@dataclass\nclass ParallelConfig:\n    tensor_parallel_size: int = 1\n    pipeline_parallel_size: int = 1\n    data_parallel_size: int = 1\n    data_parallel_size_per_region: Optional[int] = None\n    data_parallel_master_port: int = 29500\n    data_parallel_master_addr: Optional[str] = None\n    data_parallel_standalone: bool = False\n    numa_aware: bool = False\n```\n\n资料来源：[vllm/config/parallel.py:10-45](https://github.com/vllm-project/vllm/blob/main/vllm/config/parallel.py)\n\n### Environment Variables\n\n| Variable | Description |\n|----------|-------------|\n| `CUDA_VISIBLE_DEVICES` | Comma-separated list of GPU IDs to use |\n| `VLLM_HOST_IP` | Host IP for distributed communication |\n| `VLLM_PORT` | Port for worker communication |\n\n### Launching Distributed Inference\n\n#### Multi-GPU Launch with `torchrun`\n\n```bash\ntorchrun --nproc_per_node=4 \\\n    --nnodes=1 \\\n    vllm/entrypoints/llm.py \\\n    --model meta-llama/Llama-2-70b-hf \\\n    --tensor-parallel-size 4\n```\n\n#### Multi-Node Launch\n\n```bash\ntorchrun --nproc_per_node=8 \\\n    --nnodes=2 \\\n    --node_rank=0 \\\n    --master_addr=10.0.0.1 \\\n    --master_port=29500 \\\n    vllm/entrypoints/llm.py \\\n    --model meta-llama/Llama-2-70b-hf \\\n    --tensor-parallel-size 4 \\\n    --pipeline-parallel-size 4\n```\n\n## Scaling Strategies\n\n### Scaling Guidelines\n\n| Model Size | GPU Memory | Recommended Configuration |\n|------------|------------|---------------------------|\n| 7B | 24GB | TP=1, DP=single node |\n| 13B | 48GB | TP=2, DP=single node |\n| 70B | 320GB+ | TP=4 or 8, PP=2+ |\n| 405B | 800GB+ | TP=8, PP=8, multi-node |\n\n资料来源：[docs/serving/parallelism_scaling.md](https://github.com/vllm-project/vllm/blob/main/docs/serving/parallelism_scaling.md)\n\n### Disaggregated Prefill Scaling\n\nFor disaggregated prefill/decode, consider:\n\n| Workload Pattern | Prefill Resources | Decode Resources |\n|-----------------|-------------------|-------------------|\n| Short prompts, many requests | Scale prefill | Scale decode |\n| Long prompts, few requests | Scale prefill with TP | Scale decode |\n| Mixed workload | Balance both | Balance both |\n\n### Scaling Best Practices\n\n1. **Start with tensor parallelism** for intra-node scaling\n2. **Add pipeline parallelism** for multi-node deployments\n3. **Use data parallelism** to increase throughput on same-stage workloads\n4. 
**Enable disaggregation** when prefill and decode have different resource needs\n\n资料来源：[docs/serving/parallelism_scaling.md](https://github.com/vllm-project/vllm/blob/main/docs/serving/parallelism_scaling.md)\n\n## Architecture Diagram\n\n```mermaid\ngraph TB\n    subgraph \"vLLM Distributed Architecture\"\n        subgraph \"Process 0\"\n            P0_M[Model Shard 0]\n            P0_S[Scheduler]\n            P0_C[Cache Engine]\n        end\n        \n        subgraph \"Process 1\"\n            P1_M[Model Shard 1]\n            P1_S[Scheduler]\n            P1_C[Cache Engine]\n        end\n        \n        subgraph \"Process N\"\n            PN_M[Model Shard N]\n            PN_S[Scheduler]\n            PN_C[Cache Engine]\n        end\n        \n        NCCL[NCCL AllReduce]\n        \n        P0_S <-->|NCCL| P1_S\n        P1_S <-->|NCCL| PN_S\n        P0_S <-->|NCCL| PN_S\n        \n        P0_M <-->|Forward Pass| P0_S\n        P1_M <-->|Forward Pass| P1_S\n        PN_M <-->|Forward Pass| PN_S\n    end\n    \n    subgraph \"External Services\"\n        KV[KV Transfer Service]\n        Redis[(Redis/Storage)]\n    end\n    \n    P0_C <-->|Save KV| Redis\n    PN_C <-->|Load KV| Redis\n    Redis <--> KV\n```\n\n## See Also\n\n- [Offline Inference Documentation](../features/structured_outputs.md)\n- [API Reference - LLM Class](https://docs.vllm.ai/en/latest/api/offline_inference/llm.html)\n- [Sampling Parameters](https://docs.vllm.ai/en/latest/api/inference_params.html)\n- [Disaggregated Prefill Example](../disaggregated/example_connector/README.md)\n\n---\n\n<a id='page-10'></a>\n\n## Model Architecture Support\n\n### 相关页面\n\n相关主题：[Model Executor and Worker Architecture](#page-4), [Quantization Support](#page-8)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n- [vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n- [vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n- [examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n- [examples/features/structured_outputs/README.md](https://github.com/vllm-project/vllm/blob/main/examples/features/structured_outputs/README.md)\n- [README.md](https://github.com/vllm-project/vllm/blob/main/README.md)\n</details>\n\n# Model Architecture Support\n\n## Overview\n\nvLLM provides comprehensive support for various LLM architectures, enabling users to serve, fine-tune, and run inference with a wide range of transformer-based models. 
The system is designed to be architecture-agnostic while providing optimized implementations for popular model families.\n\n## Core Model Loading Architecture\n\n### Python API (Offline Inference)\n\nThe primary Python interface for running offline inference is the `LLM` class, which handles model loading and inference without requiring a separate inference server.\n\n```python\n# Basic usage example\nfrom vllm import LLM\n\nllm = LLM(model=\"Qwen/Qwen3-0.6B\")\noutput = llm.generate(\"Hello, world!\")\n```\n\n资料来源：[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n\n### CLI-based Model Serving\n\nThe vLLM CLI provides a `serve` subcommand for HTTP-based model serving:\n\n```bash\nvllm serve Qwen/Qwen3-0.6B\n```\n\nIf no model is specified, the CLI defaults to `Qwen/Qwen3-0.6B`.\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n## Supported Model Categories\n\n### Language Models\n\nvLLM supports a broad range of causal language models including:\n\n- **Decoder-only transformers**: Standard autoregressive models\n- **Mixture of Experts (MoE)**: Sparse architectures like Mixtral\n- **Multimodal models**: Models that process multiple input types\n\n### Quantization Support\n\nvLLM supports quantized models through GGUF format, enabling deployment of compressed models with reduced memory footprint.\n\nExample loading a quantized model directly from HuggingFace:\n\n```bash\n--model unsloth/Qwen3-0.6B-GGUF:Q4_K_M --tokenizer Qwen/Qwen3-0.6B\n```\n\n资料来源：[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n\n## Model Configuration\n\n### Generation Configuration\n\nThe `--generation-config` argument specifies where the generation config is loaded from:\n\n| Value | Source | Description |\n|-------|--------|-------------|\n| `auto` | Model path | Loads from model's configuration directory |\n| `<folder_path>` | Local folder | Loads from specified directory |\n| Not provided | vLLM defaults | Uses built-in default parameters |\n\n> If `max_new_tokens` is specified in generation config, it sets a server-wide limit on output tokens for all requests.\n\n资料来源：[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n\n### Engine Arguments\n\nModel configuration is controlled through `AsyncEngineArgs`, which processes CLI arguments and creates the model configuration:\n\n```python\nengine_args = AsyncEngineArgs.from_cli_args(args)\nmodel_config = engine_args.create_model_config()\n```\n\n资料来源：[vllm/entrypoints/cli/launch.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/launch.py)\n\n## Structured Outputs\n\nvLLM supports structured output generation for models that support it, including reasoning models:\n\n```bash\nvllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B \\\n    --reasoning-parser deepseek_r1\n```\n\nThis enables compliance with output format constraints defined by the model.\n\n资料来源：[examples/features/structured_outputs/README.md](https://github.com/vllm-project/vllm/blob/main/examples/features/structured_outputs/README.md)\n\n## CPU Offload Support\n\nFor models that exceed available GPU memory, vLLM provides CPU offload capabilities:\n\n```bash\n--cpu-offload-gb 10\n```\n\nThis creates virtual GPU memory by offloading portions of the model to CPU RAM. 
For example, with a 24GB GPU and 10GB offload, you can effectively load a 13B model requiring ~26GB.\n\n> **Note**: This requires fast CPU-GPU interconnect for acceptable performance.\n\n资料来源：[examples/basic/offline_inference/README.md](https://github.com/vllm-project/vllm/blob/main/examples/basic/offline_inference/README.md)\n\n## Model Registry Architecture\n\nThe vLLM CLI uses a modular command structure where model support is registered through subcommand modules:\n\n```python\nfor cmd_module in CMD_MODULES:\n    new_cmds = cmd_module.cmd_init()\n    for cmd in new_cmds:\n        cmd.subparser_init(subparsers).set_defaults(dispatch_function=cmd.cmd)\n```\n\nEach registered command includes validation logic to ensure model configurations are valid before execution.\n\n资料来源：[vllm/entrypoints/cli/main.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/main.py)\n\n## Serving Modes\n\n### Standard API Server\n\nThe default serving mode starts an HTTP server with OpenAI-compatible endpoints:\n\n```bash\nvllm serve <model_name>\n```\n\n### Headless Mode\n\nFor distributed deployments, headless mode skips API server initialization:\n\n```python\nif args.headless:\n    if args.api_server_count is not None and args.api_server_count > 0:\n        raise ValueError(\n            f\"--api-server-count={args.api_server_count} cannot be \"\n            \"used with --headless (no API servers are started in \"\n            \"headless mode).\"\n        )\n    args.api_server_count = 0\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n### gRPC Server Mode\n\nFor high-performance scenarios, vLLM supports gRPC-based serving:\n\n```python\nif getattr(args, \"grpc\", False):\n    from vllm.entrypoints.grpc_server import serve_grpc\n    uvloop.run(serve_grpc(args))\n```\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n## Data Parallel Modes\n\nvLLM supports distributed model serving through multiple load balancing strategies:\n\n| Mode | Flag | Description |\n|------|------|-------------|\n| External LB | `--data-parallel-external-lb` or `--data-parallel-rank` | External load balancer manages request distribution |\n| Hybrid LB | `--data-parallel-hybrid-lb` or `--data-parallel-start-rank` | Hybrid approach with internal and external coordination |\n\nThe system auto-detects load balancing mode to set appropriate default values for `api_server_count`.\n\n资料来源：[vllm/entrypoints/cli/serve.py](https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/cli/serve.py)\n\n## Installation and Quickstart\n\nFor users getting started with model architecture support:\n\n1. Install vLLM following the [installation guide](https://docs.vllm.ai/en/latest/getting_started/installation.html)\n2. Review the [quickstart documentation](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)\n3. 
Check the [list of supported models](https://docs.vllm.ai/en/latest/models/supported_models.html)\n\n资料来源：[README.md](https://github.com/vllm-project/vllm/blob/main/README.md)\n\n## Architecture Flow Diagram\n\n```mermaid\ngraph TD\n    A[User Request] --> B{CLI or Python API?}\n    B -->|CLI| C[vllm serve command]\n    B -->|Python| D[LLM class instantiation]\n    C --> E[CLISubcommand processing]\n    D --> F[AsyncEngineArgs configuration]\n    E --> G[Model Registry Lookup]\n    F --> H[Model Config Creation]\n    G --> I[Load Model Architecture]\n    H --> I\n    I --> J{Quantization?}\n    J -->|GGUF| K[Load Quantized Weights]\n    J -->|BF16/FP8| L[Load Standard Weights]\n    K --> M[PagedAttention Engine]\n    L --> M\n    M --> N[Inference Execution]\n```\n\n## Key Implementation Files\n\n| Component | File Path | Purpose |\n|-----------|-----------|---------|\n| CLI Entry | `vllm/entrypoints/cli/main.py` | Main CLI dispatcher and argument parsing |\n| Serve Command | `vllm/entrypoints/cli/serve.py` | HTTP server startup and configuration |\n| Launch Layer | `vllm/entrypoints/cli/launch.py` | FastAPI-based serving layer |\n| Offline Inference | `examples/basic/offline_inference/` | Python API usage examples |\n| Structured Outputs | `examples/features/structured_outputs/` | Advanced output formatting |\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：vllm-project/vllm\n\n摘要：发现 21 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling。\n\n## 1. 安装坑 · 来源证据：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_1a71634c530044a68b9160080d55de0a | https://github.com/vllm-project/vllm/issues/42182 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 安装坑 · 来源证据：[Bug]: vLLM v1 with prefix caching: first request differs from subsequent identical requests at temperature=0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: vLLM v1 with prefix caching: first request differs from subsequent identical requests at temperature=0\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_58327949a4524ed082bd189b53f713a1 | https://github.com/vllm-project/vllm/issues/40896 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：[Usage]: How to proactively clear CPU-resident memory left behind by unloaded LoRA adapters after calling `/v1/unload_l…\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Usage]: How to proactively clear CPU-resident memory left behind by unloaded LoRA adapters after calling `/v1/unload_lora_adapter`?\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fb1461834fe34049bd05182574d3e5e5 | https://github.com/vllm-project/vllm/issues/42207 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 4. 
安装坑 · 来源证据：v0.18.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.18.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_317a03f9de4e459f9be42064c7318b2c | https://github.com/vllm-project/vllm/releases/tag/v0.18.1 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 能力坑 · 来源证据：[Feature]: Qwen3.5-Moe LoRA Support (experts)\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：[Feature]: Qwen3.5-Moe LoRA Support (experts)\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_2d068d43c6654f3cab6b48bf98dad116 | https://github.com/vllm-project/vllm/issues/40005 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 6. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:599547518 | https://github.com/vllm-project/vllm | README/documentation is current enough for a first validation pass.\n\n## 7. 运行坑 · 来源证据：v0.20.2\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：v0.20.2\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ecf37722dff6494c82b384225e34bcb0 | https://github.com/vllm-project/vllm/releases/tag/v0.20.2 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 8. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:599547518 | https://github.com/vllm-project/vllm | last_activity_observed missing\n\n## 9. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:599547518 | https://github.com/vllm-project/vllm | no_demo; severity=medium\n\n## 10. 安全/权限坑 · 存在安全注意事项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：No sandbox install has been executed yet; downstream must verify before user use.\n- 对用户的影响：用户安装前需要知道权限边界和敏感操作。\n- 建议检查：转成明确权限清单和安全审查提示。\n- 防护动作：安全注意事项必须面向用户前置展示。\n- 证据：risks.safety_notes | github_repo:599547518 | https://github.com/vllm-project/vllm | No sandbox install has been executed yet; downstream must verify before user use.\n\n## 11. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:599547518 | https://github.com/vllm-project/vllm | no_demo; severity=medium\n\n## 12. 安全/权限坑 · 来源证据：[Bug]: ngram speculative decoding changes greedy output on Qwen3-0.6B / A100\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug]: ngram speculative decoding changes greedy output on Qwen3-0.6B / A100\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_9ce279a037934e8085332120bfdaca86 | https://github.com/vllm-project/vllm/issues/41758 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 13. 
安全/权限坑 · 来源证据：v0.16.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.16.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_57ff11ff995a4809a33a38ff7504a5ef | https://github.com/vllm-project/vllm/releases/tag/v0.16.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 安全/权限坑 · 来源证据：v0.17.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.17.0\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_4592ced4fde24aa9aa808caa40e25b84 | https://github.com/vllm-project/vllm/releases/tag/v0.17.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 安全/权限坑 · 来源证据：v0.18.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.18.0\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_7ca09904f8164e94a3a1bc489d32d1ff | https://github.com/vllm-project/vllm/releases/tag/v0.18.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 16. 安全/权限坑 · 来源证据：v0.19.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.19.0\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_cdb0d7a7f491474da7b93583ec643c00 | https://github.com/vllm-project/vllm/releases/tag/v0.19.0 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 17. 安全/权限坑 · 来源证据：v0.19.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.19.1\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8c2ec43cf6f147d49f80f22bcd199e8f | https://github.com/vllm-project/vllm/releases/tag/v0.19.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 18. 安全/权限坑 · 来源证据：v0.20.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.20.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_c867dafed19f4b97978d637aea4c7308 | https://github.com/vllm-project/vllm/releases/tag/v0.20.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 19. 安全/权限坑 · 来源证据：v0.20.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.20.1\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_dc6a717c695b47838899da8c9791f907 | https://github.com/vllm-project/vllm/releases/tag/v0.20.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 20. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:599547518 | https://github.com/vllm-project/vllm | issue_or_pr_quality=unknown\n\n## 21. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:599547518 | https://github.com/vllm-project/vllm | release_recency=unknown\n\n<!-- canonical_name: vllm-project/vllm; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "Human Manual / 人类版说明书"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log / 踩坑日志\n\n项目：vllm-project/vllm\n\n摘要：发现 21 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling。\n\n## 1. 安装坑 · 来源证据：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: Qwen3.5-397B-NVFP4 Disagg accuracy gsm8k collapses with async scheduling\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_1a71634c530044a68b9160080d55de0a | https://github.com/vllm-project/vllm/issues/42182 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 安装坑 · 来源证据：[Bug]: vLLM v1 with prefix caching: first request differs from subsequent identical requests at temperature=0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: vLLM v1 with prefix caching: first request differs from subsequent identical requests at temperature=0\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_58327949a4524ed082bd189b53f713a1 | https://github.com/vllm-project/vllm/issues/40896 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：[Usage]: How to proactively clear CPU-resident memory left behind by unloaded LoRA adapters after calling `/v1/unload_l…\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Usage]: How to proactively clear CPU-resident memory left behind by unloaded LoRA adapters after calling `/v1/unload_lora_adapter`?\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fb1461834fe34049bd05182574d3e5e5 | https://github.com/vllm-project/vllm/issues/42207 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 4. 安装坑 · 来源证据：v0.18.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.18.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_317a03f9de4e459f9be42064c7318b2c | https://github.com/vllm-project/vllm/releases/tag/v0.18.1 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 能力坑 · 来源证据：[Feature]: Qwen3.5-Moe LoRA Support (experts)\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：[Feature]: Qwen3.5-Moe LoRA Support (experts)\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_2d068d43c6654f3cab6b48bf98dad116 | https://github.com/vllm-project/vllm/issues/40005 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 6. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:599547518 | https://github.com/vllm-project/vllm | README/documentation is current enough for a first validation pass.\n\n## 7. 运行坑 · 来源证据：v0.20.2\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：v0.20.2\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ecf37722dff6494c82b384225e34bcb0 | https://github.com/vllm-project/vllm/releases/tag/v0.20.2 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 8. 
维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:599547518 | https://github.com/vllm-project/vllm | last_activity_observed missing\n\n## 9. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:599547518 | https://github.com/vllm-project/vllm | no_demo; severity=medium\n\n## 10. 安全/权限坑 · 存在安全注意事项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：No sandbox install has been executed yet; downstream must verify before user use.\n- 对用户的影响：用户安装前需要知道权限边界和敏感操作。\n- 建议检查：转成明确权限清单和安全审查提示。\n- 防护动作：安全注意事项必须面向用户前置展示。\n- 证据：risks.safety_notes | github_repo:599547518 | https://github.com/vllm-project/vllm | No sandbox install has been executed yet; downstream must verify before user use.\n\n## 11. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:599547518 | https://github.com/vllm-project/vllm | no_demo; severity=medium\n\n## 12. 安全/权限坑 · 来源证据：[Bug]: ngram speculative decoding changes greedy output on Qwen3-0.6B / A100\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：[Bug]: ngram speculative decoding changes greedy output on Qwen3-0.6B / A100\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_9ce279a037934e8085332120bfdaca86 | https://github.com/vllm-project/vllm/issues/41758 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 13. 安全/权限坑 · 来源证据：v0.16.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.16.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_57ff11ff995a4809a33a38ff7504a5ef | https://github.com/vllm-project/vllm/releases/tag/v0.16.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 安全/权限坑 · 来源证据：v0.17.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.17.0\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_4592ced4fde24aa9aa808caa40e25b84 | https://github.com/vllm-project/vllm/releases/tag/v0.17.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 安全/权限坑 · 来源证据：v0.18.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.18.0\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_7ca09904f8164e94a3a1bc489d32d1ff | https://github.com/vllm-project/vllm/releases/tag/v0.18.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 16. 安全/权限坑 · 来源证据：v0.19.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.19.0\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_cdb0d7a7f491474da7b93583ec643c00 | https://github.com/vllm-project/vllm/releases/tag/v0.19.0 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 17. 
安全/权限坑 · 来源证据：v0.19.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.19.1\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8c2ec43cf6f147d49f80f22bcd199e8f | https://github.com/vllm-project/vllm/releases/tag/v0.19.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 18. 安全/权限坑 · 来源证据：v0.20.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.20.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_c867dafed19f4b97978d637aea4c7308 | https://github.com/vllm-project/vllm/releases/tag/v0.20.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 19. 安全/权限坑 · 来源证据：v0.20.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.20.1\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_dc6a717c695b47838899da8c9791f907 | https://github.com/vllm-project/vllm/releases/tag/v0.20.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 20. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:599547518 | https://github.com/vllm-project/vllm | issue_or_pr_quality=unknown\n\n## 21. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:599547518 | https://github.com/vllm-project/vllm | release_recency=unknown\n",
      "summary": "用户实践前最可能遇到的身份、安装、配置、运行和安全坑。",
      "title": "Pitfall Log / 踩坑日志"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# vllm - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for vllm-project/vllm.\n\nProject:\n- Name: vllm\n- Repository: https://github.com/vllm-project/vllm\n- Summary: A high-throughput and memory-efficient inference and serving engine for LLMs\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: A high-throughput and memory-efficient inference and serving engine for LLMs\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: A high-throughput and memory-efficient inference and serving engine for LLMs\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. page-1: vLLM Overview. Produce one small intermediate artifact and wait for confirmation.\n2. page-2: Getting Started. Produce one small intermediate artifact and wait for confirmation.\n3. page-3: Core Engine Architecture. Produce one small intermediate artifact and wait for confirmation.\n4. page-4: Model Executor and Worker Architecture. Produce one small intermediate artifact and wait for confirmation.\n5. page-5: Scheduling and Request Processing. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/vllm-project/vllm\n- https://github.com/vllm-project/vllm#readme\n- README.md\n- pyproject.toml\n- vllm/__init__.py\n- vllm/version.py\n- docs/getting_started/installation/README.md\n- docs/getting_started/quickstart.md\n- requirements/common.txt\n- requirements/cuda.txt\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "summary": "不安装项目也能感受能力节奏的安全试用 Prompt。",
      "title": "Prompt Preview / 安装前试用 Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / 官方入口\n\n项目：vllm-project/vllm\n\n## 官方安装入口\n\n### Python / pip · 官方安装入口\n\n```bash\npip install vllm\n```\n\n来源：https://github.com/vllm-project/vllm#readme\n\n## 来源\n\n- repo: https://github.com/vllm-project/vllm\n- docs: https://github.com/vllm-project/vllm#readme\n",
      "summary": "从项目官方 README 或安装文档提取的开工入口。",
      "title": "Quick Start / 官方入口"
    }
  },
  "validation_id": "dval_4b19ac5ad26a41138863fa55543fb1f5"
}
