{
  "canonical_name": "noahgolmant/pytorch-hessian-eigenthings",
  "compilation_id": "pack_9a2154fa4b7746dea95168caad47d6f9",
  "created_at": "2026-05-16T15:16:51.175213+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=skill, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=skill, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `pip install hessian-eigenthings` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "pip install hessian-eigenthings",
      "sandbox_container_image": "python:3.12-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "deterministic_isolated_install",
      "sandbox_validation_id": "sbx_c8dd4b41232f4b80bb1c4448f20b523e"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_60e0f4ffb5f798e55622480bf7cf5501",
    "canonical_name": "noahgolmant/pytorch-hessian-eigenthings",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/noahgolmant/pytorch-hessian-eigenthings",
    "slug": "pytorch-hessian-eigenthings",
    "source_packet_id": "phit_4b0f104cec4d4ab58d90ab1b7f6f7702",
    "source_validation_id": "dval_8a294b814b3b4037a843c14c510c47df"
  },
  "merchandising": {
    "best_for": "需要软件开发与交付能力，并使用 local_cli 的用户",
    "github_forks": null,
    "github_stars": null,
    "one_liner_en": "pytorch-hessian-eigenthings",
    "one_liner_zh": "pytorch-hessian-eigenthings",
    "primary_category": {
      "category_id": "software-development",
      "confidence": "medium",
      "name_en": "Software Development",
      "name_zh": "软件开发与交付",
      "reason": "matched_keywords:git, cli"
    },
    "target_user": "使用 local_cli 等宿主 AI 的用户",
    "title_en": "pytorch-hessian-eigenthings",
    "title_zh": "pytorch-hessian-eigenthings 能力包",
    "visible_tags": [
      {
        "label_en": "Knowledge Retrieval",
        "label_zh": "知识检索",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-knowledge-retrieval",
        "type": "product_domain"
      },
      {
        "label_en": "Knowledge Base Q&A",
        "label_zh": "知识库问答",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-knowledge-base-q-a",
        "type": "user_job"
      },
      {
        "label_en": "Workflow Automation",
        "label_zh": "流程自动化",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-workflow-automation",
        "type": "core_capability"
      },
      {
        "label_en": "Node-based Workflow",
        "label_zh": "节点式流程编排",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-node-based-workflow",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Evaluation Suite",
        "label_zh": "评测体系",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-evaluation-suite",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_4b0f104cec4d4ab58d90ab1b7f6f7702",
  "page_model": {
    "artifacts": {
      "artifact_slug": "pytorch-hessian-eigenthings",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "pip install hessian-eigenthings",
          "label": "Python / pip · 官方安装入口",
          "source": "https://github.com/noahgolmant/pytorch-hessian-eigenthings#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "知识检索",
        "知识库问答",
        "流程自动化",
        "节点式流程编排",
        "评测体系"
      ],
      "eyebrow": "软件开发与交付",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要软件开发与交付能力，并使用 local_cli 的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "pytorch-hessian-eigenthings"
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "local_cli",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "skill, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "仓库名 `pytorch-hessian-eigenthings` 与安装入口 `hessian-eigenthings` 不完全一致。",
            "category": "身份坑",
            "evidence": [
              "identity.distribution | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | repo=pytorch-hessian-eigenthings; install=hessian-eigenthings"
            ],
            "severity": "medium",
            "suggested_check": "在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。",
            "title": "仓库名和安装名不一致",
            "user_impact": "用户照着仓库名搜索包或照着包名找仓库时容易走错入口。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Python Error: the following arguments are required: experimentname",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_24f46464d79f4ae3830f046c077a2574 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/39 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Python Error: the following arguments are required: experimentname",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a2 — packaging fix",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_7540a696b30c46cdba07c12f33388567 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a2 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v1.0.0a2 — packaging fix",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a3 — fix lanczos OOM",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_e5e68e2f24e1436cb8f3c2f11cefe326 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a3 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v1.0.0a3 — fix lanczos OOM",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_914b7653aa8b4ef2844a2b4690fab2ad | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a4 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_509356ab9b68434992d7237219952ba6 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a5 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_f79a3a34cbab435cb3730b7ae17cf492 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/30 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：ValueError: PENet on the Kitti benchmark suite",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_850f8cf0010c4d269ab71d864610097a | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/41 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：ValueError: PENet on the Kitti benchmark suite",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "README/documentation is current enough for a first validation pass.",
            "category": "能力坑",
            "evidence": [
              "capability.assumptions | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | README/documentation is current enough for a first validation pass."
            ],
            "severity": "medium",
            "suggested_check": "将假设转成下游验证清单。",
            "title": "能力判断依赖假设",
            "user_impact": "假设不成立时，用户拿不到承诺的能力。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个运行相关的待验证问题：AttributeError: 'HVPOperator' object has no attribute 'zero_grad'",
            "category": "运行坑",
            "evidence": [
              "community_evidence:github | cevd_b515f5c06a5744b19b667bcbc8123348 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/38 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：AttributeError: 'HVPOperator' object has no attribute 'zero_grad'",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "未记录 last_activity_observed。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | last_activity_observed missing"
            ],
            "severity": "medium",
            "suggested_check": "补 GitHub 最近 commit、release、issue/PR 响应信号。",
            "title": "维护活跃度未知",
            "user_impact": "新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "downstream_validation.risk_items | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "进入安全/权限治理复核队列。",
            "title": "下游验证发现风险项",
            "user_impact": "下游已经要求复核，不能在页面中弱化。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "risks.scoring_risks | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "把风险写入边界卡，并确认是否需要人工复核。",
            "title": "存在评分风险",
            "user_impact": "风险会影响是否适合普通用户安装。"
          },
          {
            "body": "issue_or_pr_quality=unknown。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | issue_or_pr_quality=unknown"
            ],
            "severity": "low",
            "suggested_check": "抽样最近 issue/PR，判断是否长期无人处理。",
            "title": "issue/PR 响应质量未知",
            "user_impact": "用户无法判断遇到问题后是否有人维护。"
          },
          {
            "body": "release_recency=unknown。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | release_recency=unknown"
            ],
            "severity": "low",
            "suggested_check": "确认最近 release/tag 和 README 安装命令是否一致。",
            "title": "发布节奏不明确",
            "user_impact": "安装命令和文档可能落后于代码，用户踩坑概率升高。"
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：身份坑 - 仓库名和安装名不一致。",
        "title": "踩坑日志"
      },
      "snapshot": {
        "contributors": null,
        "forks": null,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": null
      },
      "source_url": "https://github.com/noahgolmant/pytorch-hessian-eigenthings",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "pytorch-hessian-eigenthings",
      "title": "pytorch-hessian-eigenthings 能力包",
      "trial_prompt": "# pytorch-hessian-eigenthings - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for noahgolmant/pytorch-hessian-eigenthings.\n\nProject:\n- Name: pytorch-hessian-eigenthings\n- Repository: https://github.com/noahgolmant/pytorch-hessian-eigenthings\n- Summary: pytorch-hessian-eigenthings\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: pytorch-hessian-eigenthings\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: pytorch-hessian-eigenthings\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. introduction: Introduction to hessian-eigenthings. Produce one small intermediate artifact and wait for confirmation.\n2. curvature-matrices: Curvature Matrices Explained. Produce one small intermediate artifact and wait for confirmation.\n3. hvp-approach: Why Hessian-Vector Products. Produce one small intermediate artifact and wait for confirmation.\n4. architecture: System Architecture. 
Produce one small intermediate artifact and wait for confirmation.\n5. curvature-operators: Curvature Operators. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://news.ycombinator.com/item?id=48132232\n- https://github.com/noahgolmant/pytorch-hessian-eigenthings#readme\n- README.md\n- hessian_eigenthings/__init__.py\n- docs/concepts/ggn-vs-fisher-vs-hessian.md\n- docs/concepts/what-is-the-hessian.md\n- docs/concepts/why-hvp-not-full-h.md\n- hessian_eigenthings/operators/hessian.py\n- hessian_eigenthings/operators/base.py\n- hessian_eigenthings/algorithms/__init__.py\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "voices": [
        {
          "body": "来源平台：github。github/github_issue: ValueError: PENet on the Kitti benchmark suite（https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/41）；github/github_issue: RuntimeError: One of the differentiated Tensors appears to not have been（https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/30）；github/github_issue: AttributeError: 'HVPOperator' object has no attribute 'zero_grad'（https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/38）；github/github_issue: Python Error: the following arguments are required: experimentname（https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/39）；github/github_release: v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests（https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a5）；github/github_release: v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo（https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a4）；github/github_release: v1.0.0a3 — fix lanczos OOM（https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a3）；github/github_release: v1.0.0a2 — packaging fix（https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a2）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_issue",
              "source": "github",
              "title": "ValueError: PENet on the Kitti benchmark suite",
              "url": "https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/41"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "RuntimeError: One of the differentiated Tensors appears to not have been",
              "url": "https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/30"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "AttributeError: 'HVPOperator' object has no attribute 'zero_grad'",
              "url": "https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/38"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Python Error: the following arguments are required: experimentname",
              "url": "https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/39"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests",
              "url": "https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a5"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo",
              "url": "https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a4"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v1.0.0a3 — fix lanczos OOM",
              "url": "https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a3"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v1.0.0a2 — packaging fix",
              "url": "https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a2"
            }
          ],
          "status": "已收录 8 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "软件开发与交付",
      "desc": "pytorch-hessian-eigenthings",
      "effort": "安装已验证",
      "forks": null,
      "icon": "code",
      "name": "pytorch-hessian-eigenthings 能力包",
      "risk": "可发布",
      "slug": "pytorch-hessian-eigenthings",
      "stars": null,
      "tags": [
        "知识检索",
        "知识库问答",
        "流程自动化",
        "节点式流程编排",
        "评测体系"
      ],
      "thumb": "gray",
      "type": "Skill Pack"
    },
    "manual": {
      "markdown": "# https://github.com/noahgolmant/pytorch-hessian-eigenthings 项目说明书\n\n生成时间：2026-05-16 15:14:47 UTC\n\n## 目录\n\n- [Introduction to hessian-eigenthings](#introduction)\n- [Installation Guide](#installation)\n- [Curvature Matrices Explained](#curvature-matrices)\n- [Why Hessian-Vector Products](#hvp-approach)\n- [System Architecture](#architecture)\n- [Curvature Operators](#curvature-operators)\n- [Eigendecomposition Algorithms](#algorithms)\n- [Loss Functions](#loss-functions)\n- [Parameter Utilities](#parameter-utilities)\n- [Distributed Computing with DDP](#distributed-computing)\n\n<a id='introduction'></a>\n\n## Introduction to hessian-eigenthings\n\n### 相关页面\n\n相关主题：[Curvature Matrices Explained](#curvature-matrices), [System Architecture](#architecture)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/README.md)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [hessian_eigenthings/algorithms/lanczos.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n- [hessian_eigenthings/algorithms/trace.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n- [hessian_eigenthings/loss_fns/huggingface.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n- [hessian_eigenthings/loss_fns/standard.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/standard.py)\n- 
[examples/huggingface_tiny_gpt2.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/huggingface_tiny_gpt2.py)\n- [mkdocs.yml](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/mkdocs.yml)\n</details>\n\n# Introduction to hessian-eigenthings\n\n## Overview\n\n`hessian-eigenthings` is a PyTorch library that provides efficient and scalable computation of eigendecomposition for the Hessian matrix and related curvature operators in neural networks. The library enables practitioners to compute top eigenvalues and eigenvectors via Lanczos or stochastic power iteration, trace estimates via Hutch++, and spectral density via Stochastic Lanczos Quadrature.\n\n资料来源：[README.md:1]()\n\nThe project targets researchers and engineers studying generalization properties of neural networks, where Hessian eigenvalues and eigenvectors have been implicated in understanding flat minima and model robustness.\n\n## Core Concepts\n\n### What is a Hessian?\n\nThe Hessian matrix is the second-order partial derivatives of a loss function with respect to model parameters. For a neural network with parameters θ and loss L, the Hessian H is defined as:\n\n```\nH[θ][i,j] = ∂²L / ∂θ[i]∂θ[j]\n```\n\nFor modern large-scale models, the Hessian is prohibitively expensive to compute explicitly—it has O(n²) entries where n is the number of parameters (e.g., billions for large language models).\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-50]()\n\n### Curvature Operators\n\nInstead of computing the full Hessian matrix, this library works with **curvature operators** that implement matrix-vector products (matvecs). 
Given a vector v, these operators efficiently compute:\n\n```\nH @ v → operator.matvec(v)\n```\n\nThis approach reduces memory from O(n²) to O(n), making analysis feasible for models with billions of parameters.\n\n### Supported Curvature Matrices\n\n| Operator | Description | Use Case |\n|----------|-------------|----------|\n| `HessianOperator` | Full Hessian of the loss | General curvature analysis |\n| `GGNOperator` | Generalized Gauss-Newton approximation | More stable than raw Hessian; equals Fisher for cross-entropy + softmax |\n| Custom Operators | User-defined curvature operators | Extend to other matrices |\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-30]()\n\n## Architecture\n\nThe library follows a clean separation of concerns with three main layers:\n\n```mermaid\ngraph TD\n    A[User Code] --> B[Algorithms]\n    A --> C[Loss Functions]\n    B --> D[Curvature Operators]\n    C --> D\n    D --> E[LinAlgBackend]\n    \n    B --> B1[Lanczos]\n    B --> B2[Stochastic Power Iteration]\n    B --> B3[Trace Estimation]\n    \n    D --> D1[HessianOperator]\n    D --> D2[GGNOperator]\n    D --> D3[Custom Operators]\n    \n    E --> E1[SingleDeviceBackend]\n    E --> E2[Distributed Backends]\n```\n\n### Component Layers\n\n1. **Algorithms Layer** (`hessian_eigenthings/algorithms/`): Eigenvalue/eigenvector computation methods that operate on any `CurvatureOperator`.\n\n2. **Operators Layer** (`hessian_eigenthings/operators/`): Implementations of various curvature matrices that provide the `matvec()` interface.\n\n3. **Loss Functions Layer** (`hessian_eigenthings/loss_fns/`): Pre-built loss functions with analytical Hessian-vector products for common use cases.\n\n4. 
**Backend Layer** (`hessian_eigenthings/backends/`): Abstraction for linear algebra operations supporting single-device and distributed execution.\n\n资料来源：[CONTRIBUTING.md:1-30]()\n\n## Algorithms\n\n### Lanczos Eigendecomposition\n\nThe Lanczos algorithm computes the top k eigenvalues and eigenvectors of a symmetric matrix using only matrix-vector products. It achieves this through k iterations of tridiagonal matrix construction.\n\n```mermaid\ngraph LR\n    A[Start Vector v₀] --> B[Iterate i = 1 to k]\n    B --> C[Compute βᵢvᵢ₊₁ = Avᵢ - αᵢvᵢ - βᵢ₋₁vᵢ₋₁]\n    C --> D[Compute αᵢ = vᵢᵀAvᵢ]\n    D --> E[Build Tridiagonal T]\n    E --> F{Eigenvalues of T ≈ eigenvalues of A?}\n    F -->|Yes| G[Eigenpairs Converged]\n```\n\nKey parameters for the Lanczos algorithm:\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `k` | int | required | Number of eigenpairs to compute |\n| `max_iter` | int | 100 | Maximum Lanczos iterations |\n| `tol` | float | 1e-6 | Convergence tolerance |\n| `seed` | int | None | Random seed for reproducibility |\n| `which` | str | \"LM\" | Which eigenvalues: \"LM\" (largest magnitude), \"LA\" (largest algebraic), \"SA\" (smallest algebraic) |\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:1-80]()\n\n### Trace Estimation\n\nThe trace of a matrix can be estimated using stochastic methods without forming the full matrix:\n\n**Hutchinson's Estimator:**\n```\ntrace(A) ≈ (1/m) Σᵢ vᵢᵀ A vᵢ\n```\nwhere vᵢ are random probe vectors.\n\n**Hutch++ Estimator:**\nAn improved estimator with lower variance:\n```\ntrace(A) ≈ (2/m) Σᵢ vᵢᵀ A vᵢ - (1/m) Σⱼ wⱼᵀ A wⱼ\n```\n\n| Method | `num_matvecs` | Variance | Use Case |\n|--------|---------------|----------|----------|\n| `hutchinson` | 100 | Higher | Quick estimates |\n| `hutch++` | 30 | Lower | Production estimates |\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:1-60]()\n\n## Operators\n\n### HessianOperator\n\nThe primary operator for computing Hessian 
eigendecomposition. It supports two HVP computation methods:\n\n| Method | Description | Memory | Precision |\n|--------|-------------|--------|-----------|\n| `\"autograd\"` (default) | Exact double-backward via `torch.autograd.grad` with `create_graph=True` | Higher | Numerically exact |\n| `\"finite_difference\"` | Central-difference approximation | Lower | O(ε²) bias |\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos\n\n# Basic usage\nop = HessianOperator(model=model, dataloader=dataloader, loss_fn=loss_fn)\neigenvalues, eigenvectors = lanczos(op, k=10)\n\n# With parameter filtering (subset of parameters)\nop = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"transformer.h.*.attn.*\"),\n)\n```\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-100]()\n\n### GGNOperator\n\nThe Generalized Gauss-Newton (GGN) operator provides a more numerically stable approximation to the Hessian. For cross-entropy + softmax classification, the GGN equals the Fisher information matrix.\n\n```python\nfrom hessian_eigenthings import GGNOperator\n\nop = GGNOperator(\n    model=model,\n    dataloader=dataloader,\n    forward_fn=model_forward,\n    loss_of_output_fn=loss_of_output_fn,\n)\n```\n\n**Two matvec implementations:**\n\n| Implementation | Description | Memory Footprint |\n|----------------|-------------|------------------|\n| `\"analytical\"` (default) | Finite-difference JVP + analytical loss-Hessian-vec product | Matches one training step |\n| `\"autograd\"` | Full `torch.func.jvp` + autograd double-backward | Scales badly with output size |\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-80]()\n\n## Loss Functions\n\nThe library provides optimized loss functions with closed-form Hessian-vector products for common use cases.\n\n### Standard Loss Functions\n\n```python\nfrom hessian_eigenthings.loss_fns.standard import cross_entropy_loss_of_output\n\nloss_fn = 
cross_entropy_loss_of_output()  # Returns loss_of_output_fn\n```\n\nThe closed-form cross-entropy HVP is:\n```\nH @ u = (p * u - p * <p, u>) / n\n```\nwhere `p = softmax(output)` and `n` is the number of valid positions.\n\n资料来源：[hessian_eigenthings/loss_fns/standard.py:1-60]()\n\n### HuggingFace Transformers Loss\n\nSpecialized support for HuggingFace models with fused CUDA kernels:\n\n```python\nfrom hessian_eigenthings.loss_fns.huggingface import hf_lm_loss\n\nloss_fn = hf_lm_loss()  # For language modeling\n```\n\n**Fused backend options:**\n\n| Backend | Device | Speedup | Memory |\n|---------|--------|---------|--------|\n| `\"triton\"` | CUDA | ~3.4x faster | 2x reduction |\n| `\"compile\"` | Any | ~2.6x faster | 2x reduction |\n| `\"eager\"` | Any | Baseline | Baseline |\n| `\"auto\"` | Auto-detect | Best available | Best available |\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-80]()\n\n## Usage Examples\n\n### Basic Hessian Eigendecomposition\n\n```python\nimport torch\nfrom torch import nn\nfrom hessian_eigenthings import HessianOperator, lanczos\n\n# Define model and data\nmodel = nn.Sequential(nn.Linear(100, 50), nn.ReLU(), nn.Linear(50, 10))\ndataloader = [(torch.randn(32, 100), torch.randint(0, 10, (32,)))]\n\n# Loss function\ndef loss_fn(model, batch):\n    x, y = batch\n    return nn.functional.cross_entropy(model(x), y)\n\n# Compute top 3 eigenvalues\nop = HessianOperator(model=model, dataloader=dataloader, loss_fn=loss_fn)\nresult = lanczos(op, k=3, max_iter=20, tol=1e-3, seed=0)\n\nfor i, val in enumerate(result.eigenvalues):\n    print(f\"λ_{i+1} = {val.item():.4e}\")\n```\n\n资料来源：[examples/huggingface_tiny_gpt2.py:1-50]()\n\n### Analyzing Attention Layers Only\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos\nfrom hessian_eigenthings.util import match_names\n\n# Filter to attention parameters only\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    
param_filter=match_names(\"blocks.*.attn.*\"),\n)\neigenvalues = lanczos(attn_op, k=5)\n```\n\n### Trace Estimation\n\n```python\nfrom hessian_eigenthings import HessianOperator, trace\n\nop = HessianOperator(model=model, dataloader=dataloader, loss_fn=loss_fn)\nresult = trace(op, num_matvecs=30, method=\"hutch++\", seed=0)\nprint(f\"Trace estimate: {result.estimate:.4e}\")\n```\n\n## Performance Considerations\n\n### Memory Management\n\nFor large models, consider these strategies:\n\n1. **Parameter Filtering**: Analyze only relevant subsets of parameters\n   ```python\n   op = HessianOperator(model, dataloader, loss_fn, param_filter=filter_func)\n   ```\n\n2. **Microbatching**: Process data in smaller chunks\n   ```python\n   op = HessianOperator(model, dataloader, loss_fn, microbatch_size=8)\n   ```\n\n3. **Finite Difference Method**: Use `\"finite_difference\"` for lower memory with FSDP/HSDP/TP\n   ```python\n   op = HessianOperator(model, dataloader, loss_fn, method=\"finite_difference\")\n   ```\n\n### Scalability\n\n| Model Size | Recommended Method | Notes |\n|------------|-------------------|-------|\n| < 1B params | `\"autograd\"` HVP | Numerically exact |\n| 1B - 7B params | `\"analytical\"` GGN | Good memory efficiency |\n| > 7B params | `\"finite_difference\"` HVP | Works with distributed training |\n\n### Computation Cost\n\nThe primary cost driver is the number of matrix-vector products (`matvecs`):\n\n- **Lanczos**: ~k × max_iter matvecs for k eigenpairs\n- **Trace (Hutch++)**: num_matvecs matvecs\n- **Spectral Density**: num_steps × num_random_start matvecs\n\n## API Reference\n\n### Core Functions\n\n| Function | Module | Description |\n|----------|--------|-------------|\n| `lanczos` | `hessian_eigenthings.algorithms` | Lanczos eigendecomposition |\n| `stochastic_power_iteration` | `hessian_eigenthings.algorithms` | Stochastic power iteration |\n| `trace` | `hessian_eigenthings.algorithms` | Trace estimation |\n| `spectral_density` | 
`hessian_eigenthings.algorithms` | Stochastic Lanczos Quadrature |\n\n### Operators\n\n| Class | Module | Description |\n|-------|--------|-------------|\n| `HessianOperator` | `hessian_eigenthings.operators` | Full Hessian operator |\n| `GGNOperator` | `hessian_eigenthings.operators` | Generalized Gauss-Newton operator |\n| `CurvatureOperator` | `hessian_eigenthings.operators` | Base class for custom operators |\n\n### Utility Functions\n\n| Function | Description |\n|----------|-------------|\n| `match_names(glob_pattern)` | Create parameter filter from glob pattern |\n| `SingleDeviceBackend` | Linear algebra backend for single-device execution |\n\n## Project Information\n\n### Acknowledgements\n\nThe original 2018 implementation was developed by Noah Golmant, Zhewei Yao, Amir Gholami, Michael Mahoney, and Joseph Gonzalez at UC Berkeley's RISELab.\n\nThe deflated power iteration is based on code from [HessianFlow](https://github.com/amirgholami/HessianFlow) (Z. Yao, A. Gholami, Q. Lei, K. Keutzer, M. Mahoney. *\"Hessian-based Analysis of Large Batch Training and Robustness to Adversaries\"*, NeurIPS 2018).\n\nAccelerated stochastic power iteration is from C. 
De Sa et al.\n\n### Citation\n\n```bibtex\n@misc{hessian-eigenthings,\n    author       = {Noah Golmant and Zhewei Yao and Amir Gholami and Michael Mahoney and Joseph Gonzalez},\n    title        = {pytorch-hessian-eigenthings: efficient PyTorch Hessian eigendecomposition},\n    month        = oct,\n    year         = 2018,\n    version      = {1.0},\n    url          = {https://github.com/noahgolmant/pytorch-hessian-eigenthings}\n}\n```\n\n### Installation\n\n```bash\n# From PyPI (stable release)\npip install hessian-eigenthings\n\n# Development setup\ngit clone https://github.com/noahgolmant/pytorch-hessian-eigenthings\ncd pytorch-hessian-eigenthings\nuv sync --group dev --group docs\n```\n\n### Documentation\n\nFull documentation is available at [noahgolmant.github.io/pytorch-hessian-eigenthings/](https://noahgolmant.github.io/pytorch-hessian-eigenthings/).\n\n资料来源：[README.md:1-100]()\n资料来源：[CONTRIBUTING.md:1-60]()\n资料来源：[mkdocs.yml:1-50]()\n\n---\n\n<a id='installation'></a>\n\n## Installation Guide\n\n### 相关页面\n\n相关主题：[Introduction to hessian-eigenthings](#introduction)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [CONTRIBUTING.md](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/CONTRIBUTING.md)\n- [README.md](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/README.md)\n- [pyproject.toml](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/pyproject.toml)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [examples/transformer_lens_attention_only.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/transformer_lens_attention_only.py)\n- 
[examples/huggingface_tiny_gpt2.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/huggingface_tiny_gpt2.py)\n</details>\n\n# Installation Guide\n\n## Overview\n\nThis guide covers all aspects of setting up the `hessian-eigenthings` library for computing Hessian eigendecomposition and related curvature matrix operations in PyTorch models.\n\nThe library provides efficient methods for computing top eigenvalues and eigenvectors via Lanczos or stochastic power iteration, trace estimates via Hutch++, and spectral density via Stochastic Lanczos Quadrature. 资料来源：[README.md:1-20]()\n\n## Installation Methods\n\n### PyPI Release (Recommended for Users)\n\nThe latest stable release is available on PyPI:\n\n```bash\npip install hessian-eigenthings\n```\n\nThis installs the core library without optional dependencies for transformer and curvlinops integrations. 资料来源：[README.md:1-10]()\n\n### Development Installation (For Contributors)\n\nFor development, clone the repository and install with all optional dependency groups:\n\n```bash\ngit clone https://github.com/noahgolmant/pytorch-hessian-eigenthings\ncd pytorch-hessian-eigenthings\nuv sync --group dev --group docs --extra transformers --extra transformer-lens --extra curvlinops\n```\n\n资料来源：[CONTRIBUTING.md:5-12]()\n\n## Optional Dependency Groups\n\nThe library uses optional dependency groups defined in `pyproject.toml` to enable specialized functionality:\n\n| Group | Purpose | Typical Use Case |\n|-------|---------|------------------|\n| `dev` | Testing, linting, type checking | Running CI checks locally |\n| `docs` | Building documentation | `mkdocs build --strict` |\n| `transformers` | HuggingFace Transformers integration | `GGNOperator` with HF models |\n| `transformer-lens` | TransformerLens integration | Attention-only Hessian analysis |\n| `curvlinops` | Cross-library validation tests | Testing against external oracle |\n\n资料来源：[CONTRIBUTING.md:8-10]()\n\n## Development Environment 
Setup\n\n### Prerequisites\n\n| Requirement | Version | Purpose |\n|-------------|---------|---------|\n| Python | ≥3.10 | Core runtime |\n| uv | Latest | Package manager |\n| PyTorch | ≥2.0 | Backend tensor operations |\n| CUDA (optional) | 11.8+ | GPU acceleration for large models |\n\n### Setup Workflow\n\n```mermaid\ngraph TD\n    A[Clone Repository] --> B[Install uv if needed]\n    B --> C[Run uv sync with groups]\n    C --> D[Verify Installation]\n    D --> E{Which workflow?}\n    E -->|Development| F[Run linting checks]\n    E -->|Testing| G[Run pytest]\n    E -->|Documentation| H[Build docs]\n    F --> I[Ready to contribute]\n    G --> I\n    H --> I\n```\n\n### Verification Commands\n\nAfter installation, verify the setup by running the full check suite:\n\n```bash\nuv run ruff check .\nuv run black --check .\nuv run mypy\nuv run pytest\nuv run mkdocs build --strict\n```\n\n资料来源：[CONTRIBUTING.md:14-23]()\n\n## CUDA/GPU Support\n\nThe library provides optimized CUDA kernels for specific operations:\n\n### Triton Kernels (CUDA Only)\n\nThe `hessian_eigenthings.loss_fns._fused_ce_hvp` module includes a hand-written Triton CUDA kernel for fused CE HVP computation. 
This kernel:\n\n- Eliminates zero `(N, V)` intermediates (output buffer only)\n- Provides ~3.4x speedup over eager mode\n- Reduces peak memory by 2x compared to eager\n- Falls back to `torch.compile` if Triton/CUDA is unavailable\n\n资料来源：[hessian_eigenthings/loss_fns/_fused_ce_hvp.py:50-80]()\n\n### Backend Selection\n\nFor HuggingFace language model loss functions, the `fused` parameter controls kernel selection:\n\n| Setting | Behavior |\n|---------|----------|\n| `\"auto\"` (default) | Picks fastest available: Triton on CUDA (~3.4x speedup), else `torch.compile` |\n| `\"eager\"` | Plain PyTorch implementation, useful for debugging |\n| `\"compile\"` | `torch.compile`-fused via Inductor, works on CPU/CUDA/MPS |\n| `\"triton\"` | Hand-written CUDA Triton kernel (CUDA only) |\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-50]()\n\n## Operator-Specific Dependencies\n\nDifferent curvature operators have different computational requirements:\n\n### HessianOperator\n\nTwo HVP methods are supported:\n\n| Method | Memory Profile | Precision | FSDP/TP Compatible |\n|--------|---------------|-----------|-------------------|\n| `\"autograd\"` (default) | Higher (requires `create_graph=True`) | Numerically exact | No (requires special handling) |\n| `\"finite_difference\"` | Matches one training step | O(ε²) truncation bias | Yes |\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-40]()\n\n### GGNOperator\n\nFor the Generalized Gauss-Newton operator, two matvec implementations are available:\n\n| Implementation | Memory | Use Case |\n|----------------|--------|----------|\n| `\"analytical\"` (default) | Matches one training step | LM-scale use, prevents OOM |\n| `\"autograd\"` | Scales with output size | Losses without analytical `.hvp` |\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-60]()\n\n## CI/CD Verification\n\nThe repository uses GitHub Actions for continuous integration. 
CI runs:\n\n- All linting and type checks\n- Full pytest test suite\n- Example scripts execution\n- Documentation codeblock tests\n\n资料来源：[CONTRIBUTING.md:23-26]()\n\n## Troubleshooting\n\n### Common Issues\n\n| Issue | Solution |\n|-------|----------|\n| Memory OOM with GGNOperator | Use `loss_hvp=\"analytical\"` (default in recent versions) |\n| FSDP/TP compatibility issues | Use `method=\"finite_difference\"` for HessianOperator |\n| Triton not available | Falls back to `torch.compile` automatically |\n| Type checking failures | Run `uv run mypy` locally before submitting PR |\n\n### Diagnostic Scripts\n\nThe repository includes diagnostic scripts for troubleshooting:\n\n- `scripts/repro_ggn_oom.py`: CPU-side memory regression test for GGNOperator OOM issues\n- `scripts/bench_fused_ce_hvp.py`: Microbenchmark for eager vs fused CE HVP performance\n\n资料来源：[scripts/repro_ggn_oom.py:1-40]()\n资料来源：[scripts/bench_fused_ce_hvp.py:1-50]()\n\n## Package Metadata\n\n| Property | Value |\n|----------|-------|\n| Package Name | `hessian-eigenthings` |\n| License | MIT |\n| Documentation | [noahgolmant.github.io/pytorch-hessian-eigenthings](https://noahgolmant.github.io/pytorch-hessian-eigenthings/) |\n| CI Status | [![CI](https://github.com/noahgolmant/pytorch-hessian-eigenthings/actions/workflows/ci.yml/badge.svg)](https://github.com/noahgolmant/pytorch-hessian-eigenthings/actions/workflows/ci.yml) |\n\n资料来源：[README.md:1-20]()\n\n---\n\n<a id='curvature-matrices'></a>\n\n## Curvature Matrices Explained\n\n### 相关页面\n\n相关主题：[Why Hessian-Vector Products](#hvp-approach), [Curvature Operators](#curvature-operators)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- 
[hessian_eigenthings/operators/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n- [hessian_eigenthings/operators/fisher.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/fisher.py)\n- [hessian_eigenthings/algorithms/lanczos.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n- [hessian_eigenthings/algorithms/trace.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n- [hessian_eigenthings/loss_fns/huggingface.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n- [hessian_eigenthings/loss_fns/_fused_ce_hvp.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/_fused_ce_hvp.py)\n</details>\n\n# Curvature Matrices Explained\n\n## Overview\n\nCurvature matrices characterize the second-order behavior of loss functions in neural networks, providing critical information about optimization landscapes, generalization properties, and model robustness. The `hessian-eigenthings` library provides efficient, matrix-free computation of eigendecompositions for three key curvature matrices: the **Hessian**, the **Generalized Gauss-Newton (GGN)**, and the **Empirical Fisher**.\n\nThese curvature operators serve as the foundation for analyzing flat minima, understanding generalization, and performing second-order optimization. 
The library implements matrix-vector products (matvecs) directly, avoiding explicit matrix construction which would be computationally infeasible for large neural networks with billions of parameters.\n\n## Curvature Matrices Architecture\n\n```mermaid\ngraph TD\n    A[Loss Function Lθ] --> B[Hessian H = ∇²L]\n    A --> C[Generalized Gauss-Newton G]\n    A --> D[Empirical Fisher F]\n    \n    B --> E[Matrix-Free MatVec]\n    C --> E\n    D --> E\n    \n    E --> F[Lanczos Eigendecomposition]\n    E --> G[Trace Estimation]\n    E --> H[Spectral Density]\n    \n    F --> I[Top-k Eigenpairs]\n    G --> J[Trace Estimate ± SE]\n    H --> K[Spectral Density Plot]\n```\n\n## The Hessian Matrix\n\n### Definition and Role\n\nThe Hessian matrix `H = ∇²L(θ)` is the second derivative of the loss with respect to parameters. It captures the exact local curvature of the loss landscape, making it the most precise but also most computationally expensive curvature matrix.\n\nThe Hessian is symmetric by construction and its eigenvalues reveal critical properties:\n\n- **Large positive eigenvalues** indicate sharp curvature, suggesting the model is in a narrow minimum\n- **Small eigenvalues** indicate flat regions associated with better generalization\n- **Negative eigenvalues** signal instability and potential divergence\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-30](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n### HessianOperator Implementation\n\nThe `HessianOperator` class provides two methods for computing Hessian-vector products (HVPs):\n\n```python\nclass HessianOperator(CurvatureOperator):\n    \"\"\"Hessian of `loss_fn(model, batch)` averaged over batches in dataloader.\"\"\"\n    \n    def __init__(\n        self,\n        model: nn.Module,\n        dataloader: Iterable[Any],\n        loss_fn: LossFn,\n        *,\n        param_filter: ParamFilter | None = None,\n        full_dataset: bool = 
True,\n        num_batches: int | None = None,\n        microbatch_size: int | None = None,\n        method: HvpMethod = \"autograd\",\n        fd_eps: float | None = None,\n        backend: LinAlgBackend[torch.Tensor] | None = None,\n    ) -> None:\n```\n\n**HVP Computation Methods:**\n\n| Method | Description | Memory | Precision |\n|--------|-------------|--------|-----------|\n| `\"autograd\"` | Exact double-backward via `torch.autograd.grad` with `create_graph=True` | Higher (scales with model size) | Numerically exact to rounding |\n| `\"finite_difference\"` | Central-difference `(∇L(θ+εv) − ∇L(θ−εv)) / 2ε` | Lower (two forward+backward passes) | O(ε²) truncation bias |\n\nThe finite-difference method uses dtype-specific epsilon values for optimal precision:\n\n| dtype | Epsilon |\n|-------|---------|\n| float64 | 6e-6 |\n| float32 | 5e-3 |\n| bfloat16 | 0.2 |\n| float16 | 5e-2 |\n\n资料来源：[hessian_eigenthings/operators/hessian.py:30-55](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n### When to Use the Hessian\n\nThe Hessian is ideal for:\n- Single-device analysis of models up to ~7B parameters\n- Scenarios requiring exact curvature information\n- Research on loss landscape topology\n- Verifying approximations against ground truth\n\n## Generalized Gauss-Newton (GGN)\n\n### Definition and Mathematical Foundation\n\nThe Generalized Gauss-Newton matrix `G` is a positive semi-definite (PSD) approximation to the Hessian. 
For a loss of the form `L = (1/n) Σ l(f(xᵢ;θ), yᵢ)`, the GGN is defined as:\n\n```\nG = Jᵀ · H_loss · J\n```\n\nWhere:\n- `J` is the Jacobian of model outputs with respect to parameters\n- `H_loss` is the Hessian of the loss with respect to model outputs\n\nThe GGN is always PSD because `G = Jᵀ · H_loss · J` and `H_loss` is PSD for convex losses.\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-40](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n### GGNOperator Implementation\n\nThe `GGNOperator` class provides two matvec implementations:\n\n```python\nclass GGNOperator(CurvatureOperator):\n    def __init__(\n        self,\n        model: nn.Module,\n        dataloader: Iterable[Any],\n        forward_fn: ForwardFn,\n        loss_of_output_fn: LossOfOutputFn,\n        *,\n        loss_hvp: Literal[\"analytical\", \"autograd\"] = \"analytical\",\n    ) -> None:\n```\n\n| Method | Description | Memory | Use Case |\n|--------|-------------|--------|----------|\n| `\"analytical\"` | Finite-difference JVP + analytical loss-Hessian-vector product | Matches one training step | LM-scale use, OOM-safe |\n| `\"autograd\"` | `torch.func.jvp` + autograd double-backward + `vjp` | Scales with output size | Exact for arbitrary losses |\n\nFor cross-entropy + softmax classification, `G` equals the Fisher information matrix, making the GGN and Fisher equivalent in this common case.\n\n资料来源：[hessian_eigenthings/operators/ggn.py:40-80](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n### Two-Function API Design\n\nThe GGNOperator uses a separation between `forward_fn` and `loss_of_output_fn`:\n\n```python\nForwardFn = Callable[[nn.Module, Any], torch.Tensor]\nLossOfOutputFn = Callable[[torch.Tensor, Any], torch.Tensor]\n```\n\nThis design enables computing `J·v`, `H_loss·(J·v)`, and `Jᵀ·(H_loss·J·v)` without coupling to loss internals.\n\n### Closed-Form 
Cross-Entropy HVP\n\nFor mean-reduced cross-entropy with softmax, the library provides an optimized analytical HVP:\n\n```\nH_loss @ u = (p * u - p * ⟨p, u⟩) / n\n```\n\nWhere `p = softmax(logits)` and `n` is the count of non-ignored positions.\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n### Fused CE HVP Implementations\n\nThe library provides three backend implementations for the cross-entropy HVP:\n\n| Backend | Description | Memory | Speedup |\n|---------|-------------|--------|---------|\n| `\"eager\"` | Plain PyTorch reference | Highest | 1x baseline |\n| `\"compile\"` | `torch.compile`-fused; Inductor fuses operations | Reduced | ~2.6x faster |\n| `\"triton\"` | Hand-written CUDA Triton kernel | Minimal (output buffer only) | ~3.4x faster |\n\n资料来源：[hessian_eigenthings/loss_fns/_fused_ce_hvp.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/_fused_ce_hvp.py)\n\n## Empirical Fisher\n\n### Definition\n\nThe Empirical Fisher matrix `F` is defined as:\n\n```\nF = (1/n) Σ ∇lᵢ · ∇lᵢᵀ\n```\n\nWhere the expectation over data is replaced by the empirical average over batches. 
It is always PSD and serves as an approximation to the Fisher Information Matrix.\n\n### EmpiricalFisherOperator\n\n```python\nclass EmpiricalFisherOperator(CurvatureOperator):\n    def __init__(\n        self,\n        model: nn.Module,\n        dataloader: Iterable[Any],\n        loss_fn: LossFn,\n        *,\n        per_sample: bool = False,\n    ) -> None:\n```\n\n资料来源：[hessian_eigenthings/operators/fisher.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/fisher.py)\n\n## Curvature Operator Interface\n\nAll curvature matrices implement the `CurvatureOperator` base class:\n\n```python\nclass CurvatureOperator(ABC):\n    @property\n    @abstractmethod\n    def size(self) -> int:\n        \"\"\"Number of parameters (matrix dimension).\"\"\"\n        ...\n    \n    @property\n    def dtype(self) -> torch.dtype:\n        ...\n    \n    @property\n    def device(self) -> torch.device:\n        ...\n    \n    @abstractmethod\n    def matvec(self, v: torch.Tensor) -> torch.Tensor:\n        \"\"\"Compute matrix-vector product A @ v.\"\"\"\n        ...\n```\n\n资料来源：[hessian_eigenthings/operators/base.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/base.py)\n\n## Parameter Filtering\n\nCurvature operators support computing curvature only over a subset of parameters using `ParamFilter`:\n\n```python\ndef match_names(*patterns: str) -> ParamFilter:\n    \"\"\"Match parameters by name patterns.\"\"\"\n    \ndef match_regex(pattern: str) -> ParamFilter:\n    \"\"\"Match parameters by regex pattern.\"\"\"\n```\n\nThis enables analysis of specific components:\n\n```python\n# Analyze only attention weights\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.attn.*\"),\n)\n```\n\n## Algorithmic Foundations\n\n### Lanczos Eigendecomposition\n\nThe Lanczos algorithm computes eigenvalues and 
eigenvectors of large sparse matrices using only matrix-vector products:\n\n```python\ndef lanczos(\n    operator: CurvatureOperator,\n    k: int = 10,\n    max_iter: int = 100,\n    tol: float = 1e-6,\n    which: str = \"LM\",\n) -> EigenResult:\n```\n\nThe algorithm:\n1. Builds a tridiagonal matrix `T` from matvec operations\n2. Computes eigenvalues of `T` as Ritz approximations\n3. Accumulates eigenvectors directly via rank-1 outer-product updates\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:1-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n\n### Trace Estimation\n\nTrace estimation uses stochastic probing to estimate `tr(A)` without computing the full matrix:\n\n| Method | Samples | Variance | Description |\n|--------|---------|----------|-------------|\n| Hutchinson | m | O(1/√m) | `(1/m) Σ vᵢᵀ A vᵢ` with Rademacher/Gaussian vectors |\n| Hutch++ | m | O(1/m) | Improved estimator with better constant factors |\n\n```python\ndef trace(\n    operator: CurvatureOperator,\n    *,\n    num_matvecs: int = 100,\n    method: Method = \"hutch++\",\n) -> TraceResult:\n```\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n\n## Operator Selection Guide\n\n```mermaid\ngraph LR\n    A[Need Curvature?] 
--> B{Exact Hessian?}\n    B -->|Yes, small model| C[HessianOperator<br/>method=autograd]\n    B -->|No| D{Need Fisher/GGN?}\n    D -->|Yes, cross-entropy| E[GGNOperator<br/>loss_hvp=analytical]\n    D -->|Yes, other loss| F[GGNOperator<br/>loss_hvp=autograd]\n    D -->|Empirical Fisher| G[EmpiricalFisherOperator]\n    \n    C --> H[Use lanczos for eigenvalues]\n    E --> H\n    F --> H\n    G --> H\n```\n\n| Scenario | Recommended Operator | Method |\n|----------|----------------------|--------|\n| Exact Hessian, single GPU | `HessianOperator` | `method=\"autograd\"` |\n| Large model, distributed | `HessianOperator` | `method=\"finite_difference\"` |\n| Language modeling, cross-entropy | `GGNOperator` | `loss_hvp=\"analytical\"` |\n| Custom loss, need exact | `GGNOperator` | `loss_hvp=\"autograd\"` |\n| Natural gradient optimization | `EmpiricalFisherOperator` | Default |\n\n## Module Exports\n\n```python\nfrom hessian_eigenthings.operators import (\n    CurvatureOperator,\n    HessianOperator,\n    GGNOperator,\n    EmpiricalFisherOperator,\n    DDPHessianOperator,  # For DistributedDataParallel\n    LambdaOperator,      # Custom curvature wrappers\n)\n\nfrom hessian_eigenthings.algorithms import (\n    lanczos,              # Top-k eigendecomposition\n    trace,                # Trace estimation\n    spectral_density,    # Density plot via SLQ\n    deflated_power_iteration,\n)\n```\n\n资料来源：[hessian_eigenthings/operators/__init__.py:1-25](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n\n## Summary\n\nThe `hessian-eigenthings` library provides a unified interface for computing curvature information in neural networks:\n\n- **Hessian**: Exact local curvature via autograd or finite-difference approximation\n- **GGN**: Positive semi-definite approximation ideal for large-scale analysis\n- **Empirical Fisher**: Sample-based Fisher approximation for natural gradient methods\n\nAll operators provide 
matrix-free matvec implementations, enabling eigendecomposition and trace estimation for models with billions of parameters without explicit matrix construction.\n\n---\n\n<a id='hvp-approach'></a>\n\n## Why Hessian-Vector Products\n\n### 相关页面\n\n相关主题：[Curvature Matrices Explained](#curvature-matrices), [Eigendecomposition Algorithms](#algorithms)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [hessian_eigenthings/loss_fns/_fused_ce_hvp.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/_fused_ce_hvp.py)\n- [hessian_eigenthings/loss_fns/huggingface.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n- [hessian_eigenthings/loss_fns/standard.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/standard.py)\n- [hessian_eigenthings/algorithms/lanczos.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n- [hessian_eigenthings/algorithms/trace.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n</details>\n\n# Why Hessian-Vector Products\n\nHessian-vector products (HVPs) are the computational foundation of this library. 
Understanding *why* we use HVPs instead of computing the full Hessian matrix is essential for appreciating the design and capabilities of `hessian-eigenthings`.\n\n## The Full Hessian Problem\n\nThe Hessian matrix $H$ of a neural network's loss function is a second-order partial derivative matrix with dimensions $[n \\times n]$, where $n$ is the number of parameters. For modern large-scale models:\n\n| Model | Parameters | Hessian Size | Memory (fp32) |\n|-------|-----------|--------------|---------------|\n| BERT-Base | 110M | 110M × 110M | ~48 TB |\n| GPT-2 | 1.5B | 1.5B × 1.5B | ~9 PB |\n| LLaMA-7B | 7B | 7B × 7B | ~392 PB |\n\nStoring the full Hessian is fundamentally infeasible. Even computing it via automatic differentiation requires $O(n^2)$ operations and memory that scales quadratically with model size. 资料来源：[README.md:1-30]()\n\n## What is a Hessian-Vector Product?\n\nA Hessian-vector product computes $Hv$ for a given vector $v$ without ever constructing $H$ explicitly. The operation takes $O(n)$ time and memory—linear in the number of parameters.\n\nFormally, given:\n- Loss function $\\mathcal{L}(\\theta)$\n- Parameter vector $\\theta \\in \\mathbb{R}^n$\n- Direction vector $v \\in \\mathbb{R}^n$\n\nThe HVP is:\n$$Hv = \\nabla_\\theta^2 \\mathcal{L} \\cdot v = \\frac{\\partial}{\\partial \\theta} \\left( \\nabla_\\theta \\mathcal{L} \\cdot v \\right)$$\n\nThis is implemented as a double-backward pass:\n1. Forward pass → compute loss\n2. Backward pass → compute gradient $\\nabla_\\theta \\mathcal{L}$\n3. Second backward pass → compute Jacobian-vector product $H \\cdot v$\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-50]()\n\n## Why HVPs Enable Scalable Curvature Analysis\n\nBy avoiding explicit Hessian construction, HVP-based algorithms can operate on models of any size. 
This library provides several key algorithms that all rely on HVP as their primitive operation:\n\n```mermaid\ngraph TD\n    A[Hessian-Vector Product] --> B[ Lanczos Eigendecomposition]\n    A --> C[ Hutchinson Trace Estimation]\n    A --> D[ Hutch++ Trace Estimation]\n    A --> E[ Stochastic Lanczos Quadrature]\n    \n    B --> F[Top-k Eigenvalues & Eigenvectors]\n    C --> G[Trace Estimation]\n    D --> G\n    E --> H[Spectral Density Plot]\n```\n\n### Eigendecomposition via Lanczos\n\nThe Lanczos algorithm iteratively builds an orthogonal basis that tridiagonalizes the operator. It requires only matrix-vector products, making it perfect for HVP-based curvature analysis:\n\n| Property | Full Eigendecomp | Lanczos + HVP |\n|----------|------------------|---------------|\n| Memory | $O(n^2)$ | $O(n \\cdot k)$ |\n| Time | $O(n^3)$ | $O(n \\cdot k^2)$ |\n| Storage | Entire matrix | $k$ Lanczos vectors |\n\nWhere $k$ is the number of desired eigenpairs (typically 1-20). 资料来源：[hessian_eigenthings/algorithms/lanczos.py:1-60]()\n\n### Trace Estimation via Hutchinson's Method\n\nThe trace of the Hessian can be estimated without constructing the full matrix:\n\n$$\\text{tr}(H) \\approx \\frac{1}{m} \\sum_{i=1}^{m} v_i^T H v_i$$\n\nwhere $v_i$ are random probe vectors (typically Rademacher or Gaussian). Each term $v_i^T H v_i$ is a single HVP plus a dot product. 
资料来源：[hessian_eigenthings/algorithms/trace.py:1-45]()\n\n## HVP Implementation Strategies\n\nThe library provides two distinct methods for computing HVPs, each with different trade-offs:\n\n### Method 1: Autograd (Default)\n\nUses `torch.autograd.grad` with `create_graph=True` for exact double-backward computation:\n\n```python\ndef __init__(\n    self,\n    model: nn.Module,\n    dataloader: Iterable[Any],\n    loss_fn: LossFn,\n    *,\n    method: HvpMethod = \"autograd\",  # Default\n    ...\n) -> None:\n```\n\n**Advantages:**\n- Numerically exact (to floating-point rounding)\n- Works with any differentiable loss function\n- Simple implementation\n\n**Disadvantages:**\n- Builds the full computation graph for second derivatives\n- Memory scales with model complexity and output size\n\n资料来源：[hessian_eigenthings/operators/hessian.py:20-45]()\n\n### Method 2: Finite Difference\n\nUses central-difference approximation:\n$$\\frac{\\nabla_\\theta \\mathcal{L}(\\theta + \\epsilon v) - \\nabla_\\theta \\mathcal{L}(\\theta - \\epsilon v)}{2\\epsilon}$$\n\n**Advantages:**\n- No second-backward graph → lower memory footprint\n- Compatible with distributed training (FSDP/HSDP/TP) without special handling\n\n**Disadvantages:**\n- $O(\\epsilon^2)$ truncation bias\n- Precision-dependent roundoff (~1e-5 fp32, ~1e-2 bf16)\n\n资料来源：[hessian_eigenthings/operators/hessian.py:30-40]()\n\n## Fused HVP for Cross-Entropy Losses\n\nFor language models with large vocabulary softmax heads, computing $H_{\\text{loss}} \\cdot u$ naively allocates multiple $(N, V)$ intermediate tensors (where $V$ is vocabulary size). 
This is addressed with fused implementations:\n\n| Backend | Speedup | Memory Reduction | Requirements |\n|---------|---------|-------------------|--------------|\n| `eager` | 1× (baseline) | 1× | Any |\n| `compile` | ~2.6× | ~2× | torch.compile |\n| `triton` | ~3.4× | ~2× | CUDA + Triton |\n\nThe fused kernel computes:\n$$H_{\\text{loss}} \\cdot u = \\frac{p \\odot u - p \\, \\langle p, u \\rangle}{n} \\odot \\text{mask}$$\n\nWhere $p = \\text{softmax}(\\text{logits})$, $\\langle p, u \\rangle$ is a scalar, and the implementation avoids materializing the full $(N, V)$ softmax output. 资料来源：[hessian_eigenthings/loss_fns/_fused_ce_hvp.py:1-50]()\n\n## Generalized Gauss-Newton (GGN) Approximation\n\nFor optimization-focused curvature analysis, the GGN matrix $G$ provides a positive semi-definite (PSD) approximation to the Hessian:\n\n$$G = J^T \\cdot H_{\\text{loss}} \\cdot J$$\n\nWhere $J$ is the Jacobian of the model outputs with respect to parameters. For cross-entropy + softmax classification, $G$ equals the Fisher information matrix. The GGN is always PSD by construction, making it suitable for optimization algorithms. 资料来源：[hessian_eigenthings/operators/ggn.py:1-40]()\n\n### GGN Matvec Implementation\n\nThe GGNOperator supports two matvec paths:\n\n1. **Analytical (default):** Finite-difference JVP + analytical loss-Hessian-vector product + single normal backward. Memory footprint matches one normal training step.\n\n2. **Autograd:** Original `torch.func.jvp` + autograd double-backward + `torch.func.vjp`. 
Numerically exact but scales badly with vocabulary size.\n\n资料来源：[hessian_eigenthings/operators/ggn.py:25-45]()\n\n## Practical Implications\n\nThe HVP approach enables:\n\n| Capability | HVP-Based | Full Hessian |\n|------------|-----------|--------------|\n| 7B parameter model | ✅ ~hours | ❌ impossible |\n| Top-10 eigenpairs | ✅ | ❌ |\n| Trace estimation | ✅ | ❌ |\n| Spectral density | ✅ | ❌ |\n| FSDP compatibility | ✅ (finite-diff) | ❌ |\n\nThe eigenvalues and eigenvectors of the Hessian have been implicated in generalization properties of neural networks. Researchers hypothesize that \"flat minima\" generalize better, that Hessians of large models are very low-rank, and that curvature analysis can guide optimization. 资料来源：[README.md:25-35]()\n\n## Summary\n\nHessian-vector products are the fundamental building block that makes large-scale curvature analysis possible:\n\n1. **Memory efficiency:** $O(n)$ vs $O(n^2)$ for the full Hessian\n2. **Computational efficiency:** $O(n)$ per matvec vs $O(n^2)$ for full computation\n3. **Scalability:** Works with models of any size via iterative algorithms\n4. 
**Flexibility:** Supports exact (autograd) or memory-efficient (finite-difference) computation\n\nThe `hessian-eigenthings` library provides production-ready implementations of HVP computation and HVP-based algorithms for practical curvature analysis in PyTorch.\n\n---\n\n<a id='architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[Curvature Operators](#curvature-operators), [Eigendecomposition Algorithms](#algorithms), [Loss Functions](#loss-functions)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/operators/base.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/base.py)\n- [hessian_eigenthings/algorithms/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/__init__.py)\n- [hessian_eigenthings/linalg/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/linalg/__init__.py)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [hessian_eigenthings/algorithms/lanczos.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n- [hessian_eigenthings/algorithms/trace.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n- [hessian_eigenthings/loss_fns/huggingface.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n</details>\n\n# System Architecture\n\nThe `pytorch-hessian-eigenthings` library provides an efficient and scalable framework for computing eigendecompositions of curvature matrices—including the Hessian, Generalized 
Gauss-Newton (GGN) matrix, and empirical Fisher—for arbitrary PyTorch models. The architecture is designed around three core abstractions: **Curvature Operators**, **Algorithms**, and **Linear Algebra Backends**.\n\n## High-Level Architecture Overview\n\nThe library implements a layered architecture that separates mathematical curvature computations from numerical algorithms:\n\n```mermaid\ngraph TD\n    subgraph \"User Layer\"\n        U[User Code]\n    end\n    \n    subgraph \"Algorithm Layer\"\n        LA[Lanczos]\n        TR[Trace Estimation]\n        SP[Stochastic Power Iteration]\n    end\n    \n    subgraph \"Operator Layer\"\n        HO[HessianOperator]\n        GGN[GGNOperator]\n        FO[FisherOperator]\n    end\n    \n    subgraph \"Backend Layer\"\n        B[LinAlgBackend]\n        SD[SingleDeviceBackend]\n    end\n    \n    subgraph \"PyTorch Core\"\n        PT[PyTorch Autograd]\n    end\n    \n    U -->|uses| LA\n    U -->|uses| TR\n    U -->|uses| HO\n    LA -->|operates on| HO\n    TR -->|operates on| HO\n    HO -->|implemented via| B\n    B -->|delegates to| PT\n```\n\n## Core Components\n\n### 1. Curvature Operators\n\nCurvature operators are the foundation of the library. 
They abstract away the details of how matrix-vector products (matvecs) with curvature matrices are computed, providing a unified interface for algorithms to work with.\n\n#### Base Interface\n\nAll operators inherit from `CurvatureOperator`, which defines the contract for curvature computations:\n\n| Property/Method | Type | Description |\n|-----------------|------|-------------|\n| `size` | `int` | Total number of parameters in the curvature matrix |\n| `dtype` | `torch.dtype` | Data type of the operator |\n| `device` | `torch.device` | Device where computations run |\n| `matvec(v)` | `Callable` | Computes `A @ v` for input vector `v` |\n\n#### Hessian Operator\n\nThe `HessianOperator` computes the Hessian of a loss function with respect to model parameters:\n\n```python\nHessianOperator(\n    model: nn.Module,\n    dataloader: Iterable[Any],\n    loss_fn: LossFn,\n    *,\n    param_filter: ParamFilter | None = None,\n    method: HvpMethod = \"autograd\"  # or \"finite_difference\"\n)\n```\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-50]()\n\nTwo HVP computation methods are supported:\n\n| Method | Description | Use Case |\n|--------|-------------|----------|\n| `\"autograd\"` (default) | Exact double-backward via `torch.autograd.grad` | Up to ~7B parameters |\n| `\"finite_difference\"` | Central-difference approximation | FSDP/HSDP/TP at scale |\n\nThe finite difference method uses the approximation:\n\n```\nH(v) ≈ (∇L(θ+εv) − ∇L(θ−εv)) / 2ε\n```\n\nThis avoids second-backward graph entirely, making it compatible with distributed training setups.\n\n#### GGN Operator\n\nThe `GGNOperator` implements the Generalized Gauss-Newton matrix, which is always positive semi-definite:\n\n```python\nGGNOperator(\n    model: nn.Module,\n    dataloader: Iterable[Any],\n    forward_fn: ForwardFn,\n    loss_of_output_fn: LossOfOutputFn,\n    *,\n    loss_hvp: Literal[\"analytical\", \"autograd\"] = 
\"analytical\"\n)\n```\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-80]()\n\nThe GGN decomposes as `G = J^T · H_loss · J` where:\n- `J` is the Jacobian of the model output with respect to parameters\n- `H_loss` is the Hessian of the loss with respect to the output\n\nFor cross-entropy + softmax classification, `G` equals the Fisher information matrix.\n\n### 2. Algorithms Layer\n\nAlgorithms operate on any `CurvatureOperator` via its `matvec` interface, enabling eigenvalue computation, trace estimation, and spectral density analysis.\n\n#### Lanczos Eigensolver\n\nThe Lanczos algorithm computes the top-k eigenvalues and eigenvectors of a symmetric matrix using only matrix-vector products:\n\n```python\nlanczos(\n    operator: CurvatureOperator,\n    k: int = 10,\n    max_iter: int = 100,\n    tol: float = 1e-3,\n    which: str = \"LA\"  # LA, SA, or LM\n) -> EigendecompositionResult\n```\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:1-50]()\n\n**Key features:**\n\n- **Ritz vector accumulation**: Directly accumulates Ritz vectors into final `(k, n)` layout via rank-1 outer-product updates, avoiding transient `(n, k)` transpose copies\n- **Convergence tracking**: Monitors residual norms `|β_k · s_{k}|` to determine convergence\n- **Eigenvalue selection**: Supports \"LA\" (largest algebraic), \"SA\" (smallest algebraic), and \"LM\" (largest magnitude)\n\n#### Trace Estimation\n\nThe library provides multiple trace estimation methods:\n\n| Method | Description | Samples Required |\n|--------|-------------|------------------|\n| `hutchinson` | Classical Hutchinson: `(1/m) Σ vᵢᵀ A vᵢ` | Higher variance |\n| `hutch++` | Improved estimator with lower variance | ~30 recommended |\n\n```python\ntrace(\n    operator: CurvatureOperator,\n    num_matvecs: int = 30,\n    method: str = \"hutch++\",\n    seed: int | None = None\n) -> TraceResult\n```\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:1-40]()\n\nThe Hutch++ estimator achieves lower variance by using both 
a low-rank approximation of the operator (whose trace is computed exactly) and Hutchinson estimation on the deflated remainder.\n\n### 3. Backend Layer\n\nThe `LinAlgBackend` abstract interface decouples linear algebra operations from specific device implementations:\n\n```mermaid\nclassDiagram\n    class LinAlgBackend~T~ {\n        <<abstract>>\n        +matmul(a, b) T\n        +dot(a, b) T\n        +norm(v) T\n        +fill(v, value) T\n        +copy(v) T\n    }\n    \n    class SingleDeviceBackend {\n        +matmul(a, b) Tensor\n        +dot(a, b) Tensor\n        +norm(v) Tensor\n    }\n    \n    LinAlgBackend <|-- SingleDeviceBackend\n```\n\nBackends provide:\n- Vector arithmetic operations (dot product, norm, fill, copy)\n- Device-specific optimizations\n- Memory allocation strategies\n\n## Data Flow\n\n### Eigendecomposition Workflow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Operator as CurvatureOperator\n    participant Backend as LinAlgBackend\n    participant Algo as Lanczos Algorithm\n    participant PyTorch as PyTorch Autograd\n    \n    User->>Operator: Instantiate with model, dataloader\n    User->>Algo: Call lanczos(operator, k)\n    Algo->>Operator: Request matvec(v)\n    Operator->>Backend: Allocate probe vector\n    Backend->>PyTorch: Create tensor\n    Operator->>PyTorch: Forward pass + backward\n    PyTorch-->>Operator: Return HVP result\n    Operator-->>Algo: Return Av\n    Algo->>Algo: Repeat for m iterations\n    Algo-->>User: Return eigenvalues, eigenvectors\n```\n\n### Loss Function Integration\n\nThe library supports two loss function patterns:\n\n```mermaid\ngraph LR\n    subgraph \"Single Function API\"\n        L1[loss_fn<br/>model, batch → scalar]\n    end\n    \n    subgraph \"Two Function API (for GGN)\"\n        F1[forward_fn<br/>model, batch → output]\n        L2[loss_of_output_fn<br/>output, batch → scalar]\n    end\n    \n    L1 --> HO[HessianOperator]\n    F1 --> GGN[GGNOperator]\n    L2 --> GGN\n```\n\n#### HuggingFace Integration\n\nFor language models, the library provides optimized loss 
functions:\n\n```python\nhf_lm_loss(fused=\"auto\")  # Auto-selects Triton or torch.compile\n```\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-30]()\n\nThe fused implementation:\n- Uses Triton kernels on CUDA (~3.4x speedup, 2x peak-memory reduction)\n- Falls back to `torch.compile` (~2.6x speedup, 2x peak-memory reduction)\n- Eliminates most `(N, V)` intermediates\n\n## Configuration Options\n\n### HessianOperator Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `model` | `nn.Module` | Required | PyTorch model |\n| `dataloader` | `Iterable` | Required | Data batches |\n| `loss_fn` | `LossFn` | Required | Loss computation function |\n| `param_filter` | `ParamFilter` | `None` | Filter parameters by name |\n| `method` | `HvpMethod` | `\"autograd\"` | HVP computation method |\n| `fd_eps` | `float` | `None` | Finite difference epsilon |\n\n### GGNOperator Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `loss_hvp` | `str` | `\"analytical\"` | `\"analytical\"` or `\"autograd\"` |\n| `full_dataset` | `bool` | `True` | Average over full dataset |\n| `num_batches` | `int` | `None` | Limit to first N batches |\n| `microbatch_size` | `int` | `None` | Process in smaller chunks |\n\n### Lanczos Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `k` | `int` | `10` | Number of eigenvalues to compute |\n| `max_iter` | `int` | `100` | Maximum Lanczos iterations |\n| `tol` | `float` | `1e-3` | Convergence tolerance |\n| `which` | `str` | `\"LA\"` | Which eigenvalues (\"LA\", \"SA\", \"LM\") |\n| `reorthogonalize` | `bool` | `False` | Full reorthogonalization |\n\n## Usage Patterns\n\n### Basic Hessian Eigenvalue Computation\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos\n\n# Create operator\nhessian_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    
loss_fn=lambda m, b: torch.nn.functional.cross_entropy(m(b[0]), b[1])\n)\n\n# Compute top eigenvalues\neig_result = lanczos(hessian_op, k=10, max_iter=100)\nprint(eig_result.eigenvalues)\n```\n\n### Parameter-Filtered Analysis\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos\n\n# Analyze only attention parameters\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.attn.*\")\n)\neig_attn = lanczos(attn_op, k=3)\n```\n\n### Trace Estimation\n\n```python\nfrom hessian_eigenthings import HessianOperator, trace\n\ntrace_result = trace(\n    hessian_op,\n    num_matvecs=30,\n    method=\"hutch++\",\n    seed=42\n)\nprint(f\"Trace estimate: {trace_result.estimate:.4e}\")\n```\n\n## Architecture Benefits\n\n| Benefit | Description |\n|---------|-------------|\n| **Separation of Concerns** | Operators define \"what\" to compute; algorithms define \"how\" |\n| **Flexibility** | Any operator can use any algorithm |\n| **Scalability** | Backends enable device-specific optimizations |\n| **Composability** | Easy to add new operators or algorithms |\n| **Memory Efficiency** | Matrix-free design avoids explicit matrix storage |\n\n## Extension Points\n\n### Adding Custom Curvature Operators\n\nNew operators should subclass `CurvatureOperator` and implement the `matvec` method:\n\n```python\nclass CustomCurvatureOperator(CurvatureOperator):\n    def __init__(self, model, dataloader):\n        super().__init__()\n        self.model = model\n        self.dataloader = dataloader\n        # Register parameters\n    \n    def _matvec(self, v: torch.Tensor) -> torch.Tensor:\n        # Implement A @ v\n        return custom_computation(v)\n```\n\n### Adding New Algorithms\n\nAlgorithms should accept any `CurvatureOperator` and use the backend exclusively:\n\n```python\ndef custom_algorithm(\n    operator: CurvatureOperator,\n    backend: LinAlgBackend | None = None\n) -> 
SomeResult:\n    backend = backend or SingleDeviceBackend()\n    # Use backend for all vector operations\n```\n\n## Summary\n\nThe system architecture of `pytorch-hessian-eigenthings` follows a clean, modular design that separates curvature matrix computation (operators), numerical algorithms (Lanczos, trace estimation), and linear algebra primitives (backends). This design enables efficient Hessian and GGN eigendecomposition for models ranging from small MLPs to large language models, with support for distributed training and optimized fused computations.\n\n---\n\n<a id='curvature-operators'></a>\n\n## Curvature Operators\n\n### 相关页面\n\n相关主题：[System Architecture](#architecture), [Distributed Computing with DDP](#distributed-computing)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/operators/base.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/base.py)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [hessian_eigenthings/operators/fisher.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/fisher.py)\n- [hessian_eigenthings/operators/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n</details>\n\n# Curvature Operators\n\n## Overview\n\nCurvature Operators in `hessian-eigenthings` provide a matrix-free abstraction for computing Hessian eigendecomposition and related curvature matrices for arbitrary PyTorch models. 
They implement the `CurvatureOperator` base class interface, enabling efficient computation of eigenvalues, eigenvectors, traces, and spectral densities without explicitly forming potentially massive matrices.\n\nThe core abstraction allows algorithms (Lanczos, power iteration, Hutch++) to operate on any curvature matrix through a unified `matvec(v)` interface that computes $Av$ for any vector $v$, enabling scalability to large models with billions of parameters.\n\n资料来源：[hessian_eigenthings/__init__.py:1-10](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/__init__.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    subgraph \"Curvature Operators\"\n        Base[CurvatureOperator<br/>Base Class]\n        Hessian[HessianOperator]\n        GGN[GGNOperator]\n        Fisher[EmpiricalFisherOperator]\n        Lambda[LambdaOperator]\n        DDP[DDPHessianOperator]\n    end\n    \n    subgraph \"Algorithms\"\n        Lanczos[Lanczos Eigendecomposition]\n        Power[Power Iteration]\n        Trace[Trace Estimation<br/>Hutch++/Hutchinson]\n        Spectral[Spectral Density<br/>Stochastic Lanczos Quadrature]\n    end\n    \n    Base --> Hessian\n    Base --> GGN\n    Base --> Fisher\n    Base --> Lambda\n    Base --> DDP\n    \n    Hessian --> Lanczos\n    GGN --> Lanczos\n    Fisher --> Lanczos\n    Lambda --> Lanczos\n    \n    Hessian --> Trace\n    GGN --> Trace\n    Fisher --> Trace\n    \n    Hessian --> Power\n    GGN --> Power\n    Fisher --> Power\n    \n    Hessian --> Spectral\n    GGN --> Spectral\n    Fisher --> Spectral\n```\n\n## Base Class: CurvatureOperator\n\nAll curvature operators inherit from `CurvatureOperator`, which defines the contract that subclasses must fulfill.\n\n### Core Interface\n\n| Method | Description |\n|--------|-------------|\n| `matvec(v)` | Compute $Av$ where $A$ is the curvature matrix |\n| `size` | Total number of parameters in the operator's scope |\n| `dtype`, `device` | Tensor dtype and 
device for vector operations |\n\n资料来源：[hessian_eigenthings/operators/base.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/base.py)\n\n### Parameter Filtering\n\nCurvature operators can be restricted to subsets of model parameters using `param_filter`, enabling analysis of specific components (e.g., attention layers only).\n\n```python\nfrom hessian_eigenthings import HessianOperator, match_names\n\n# Filter to attention parameters only\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.attn.*\")\n)\n```\n\n资料来源：[hessian_eigenthings/__init__.py:35-38](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/__init__.py)\n\n## HessianOperator\n\nComputes the Hessian $\\nabla_{\\theta}^2 \\mathcal{L}$ of the loss function with respect to model parameters.\n\n### Key Features\n\n- **Two HVP methods**: `autograd` (exact double-backward via `torch.autograd.grad`) and `finite_difference` (central difference for FSDP/TP compatibility)\n- **Batched computation**: Automatically averages over multiple batches from the dataloader\n- **Microbatch support**: For large models, process batches in smaller microbatches\n\n### Constructor Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `model` | `nn.Module` | Required | PyTorch model |\n| `dataloader` | `Iterable` | Required | Data batches |\n| `loss_fn` | `LossFn` | Required | Loss computation function |\n| `param_filter` | `ParamFilter \\| None` | `None` | Parameter name filter |\n| `full_dataset` | `bool` | `True` | Average over full dataset |\n| `num_batches` | `int \\| None` | `None` | Limit batches for stochastic estimate |\n| `microbatch_size` | `int \\| None` | `None` | Split batches into smaller microbatches |\n| `method` | `HvpMethod` | `\"autograd\"` | HVP computation method |\n| `fd_eps` | 
`float \\| None` | `None` | Finite difference epsilon |\n| `backend` | `LinAlgBackend \\| None` | `None` | Linear algebra backend |\n\n### HVP Method Comparison\n\n| Method | Accuracy | Memory | FSDP/TP Compatible | Speed |\n|--------|----------|--------|-------------------|-------|\n| `autograd` | Exact (to rounding) | High | No | Fast |\n| `finite_difference` | $O(\\epsilon^2)$ bias | Low | Yes | 2x passes |\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n### Finite Difference Epsilon Table\n\n| Dtype | Optimal $\\epsilon$ |\n|-------|-------------------|\n| `float64` | `6e-6` |\n| `float32` | `5e-3` |\n| `bfloat16` | `0.2` |\n| `float16` | `5e-2` |\n\n资料来源：[hessian_eigenthings/operators/hessian.py:34-40](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n## GGNOperator\n\nThe Generalized Gauss-Newton (GGN) matrix $G = J^T H_{loss} J$ provides a PSD approximation to the Hessian that is computationally cheaper while preserving the eigenvalues that matter for optimization.\n\n### Key Features\n\n- **Always PSD**: Unlike the exact Hessian, the GGN is positive semi-definite by construction\n- **Analytical HVP path**: For losses with known HVP (e.g., cross-entropy), uses analytical computation\n- **For cross-entropy + softmax**: $G$ equals the Fisher information matrix\n\n### Two Matvec Implementations\n\n| `loss_hvp` | Description | Memory | Use Case |\n|------------|-------------|--------|----------|\n| `\"analytical\"` (default) | FD JVP + analytical loss-Hessian-vec + one backward | Matches one training step | LM-scale, large vocab |\n| `\"autograd\"` | `torch.func.jvp` + double-backward + `torch.func.vjp` | Scales with output size | Exact, small vocab 
|\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n### Fused Cross-Entropy HVP\n\nFor language model training, the GGN operator includes a fused kernel for the CE HVP computation:\n\n```python\n# Auto-selects fastest backend: Triton > torch.compile > eager\nhf_lm_loss_of_output(..., fused=\"auto\")\n```\n\nThe fused implementation reduces peak memory by 2x compared to eager, with Triton providing ~3.4x speedup on CUDA.\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-30](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n## EmpiricalFisherOperator\n\nComputes the empirical Fisher information matrix $F = \\frac{1}{N} \\sum_{i=1}^N \\nabla_{\\theta} \\log p(y_i|x_i) \\nabla_{\\theta} \\log p(y_i|x_i)^T$.\n\nFor classification with cross-entropy loss, the empirical Fisher equals the GGN when using the model distribution's expectation.\n\n资料来源：[hessian_eigenthings/operators/fisher.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/fisher.py)\n\n## LambdaOperator\n\nCreates custom curvature operators from lambda functions for testing or custom curvature definitions.\n\n```python\nfrom hessian_eigenthings import LambdaOperator\n\n# Custom operator that always returns a scaled vector\ncustom_op = LambdaOperator(\n    size=1000,\n    matvec=lambda v: 2.0 * v  # Represents 2*I\n)\n```\n\n## DDPHessianOperator\n\nDistributed Data Parallel-aware Hessian operator that handles gradient synchronization across processes.\n\n资料来源：[hessian_eigenthings/operators/__init__.py:15-18](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n\n## Common Usage Patterns\n\n### Computing Top Eigenvalues\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos\n\noperator = 
HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn\n)\n\nresult = lanczos(operator, k=10, max_iter=100)\nprint(f\"Top eigenvalues: {result.eigenvalues}\")\n```\n\n### Estimating Trace\n\n```python\nfrom hessian_eigenthings import GGNOperator, trace\n\noperator = GGNOperator(\n    model=model,\n    dataloader=dataloader,\n    forward_fn=model_forward,\n    loss_of_output_fn=loss_fn\n)\n\nresult = trace(operator, num_matvecs=100, method=\"hutch++\")\nprint(f\"Trace estimate: {result.estimate:.4e} ± {result.stderr:.4e}\")\n```\n\n### Component-Specific Analysis\n\n```python\nfrom hessian_eigenthings import HessianOperator, match_regex\n\n# Analyze only attention weights in transformer\nattn_only = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_regex(r\"blocks\\.\\d+\\.attn\\.\")\n)\n\n# Analyze only MLP weights\nmlp_only = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_regex(r\"blocks\\.\\d+\\.mlp\\.\")\n)\n```\n\n## Linear Algebra Backends\n\nThe operators use pluggable `LinAlgBackend` for vector operations, enabling support for different hardware configurations and precision requirements.\n\n| Backend | Use Case |\n|---------|----------|\n| `SingleDeviceBackend` | Single GPU/CPU |\n| (Distributed backends) | Multi-GPU via FSDP/TP |\n\n## Module Exports\n\n```python\nfrom hessian_eigenthings.operators import (\n    CurvatureOperator,\n    DDPHessianOperator,\n    EmpiricalFisherOperator,\n    GGNOperator,\n    HessianOperator,\n    LambdaOperator,\n)\n```\n\n资料来源：[hessian_eigenthings/operators/__init__.py:1-20](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n\n---\n\n<a id='algorithms'></a>\n\n## Eigendecomposition Algorithms\n\n### 相关页面\n\n相关主题：[System Architecture](#architecture), [Why Hessian-Vector 
Products](#hvp-approach)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/algorithms/lanczos.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n- [hessian_eigenthings/algorithms/power_iteration.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/power_iteration.py)\n- [hessian_eigenthings/algorithms/trace.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n- [hessian_eigenthings/algorithms/spectral_density.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/spectral_density.py)\n- [hessian_eigenthings/algorithms/result.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/result.py)\n- [hessian_eigenthings/algorithms/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/__init__.py)\n</details>\n\n# Eigendecomposition Algorithms\n\nThe `hessian-eigenthings` library provides a suite of efficient iterative algorithms for computing eigendecompositions of curvature matrices (Hessian, Generalized Gauss-Newton, and Fisher) in PyTorch models. These algorithms enable analysis of neural network loss landscapes by extracting eigenvalues, eigenvectors, spectral densities, and trace estimates without explicitly constructing the full curvature matrix—a critical capability for modern large-scale models.\n\n## Overview\n\nComputing eigendecompositions of curvature matrices is fundamental to understanding generalization properties, flat minima, and training dynamics of neural networks. 
However, these curvature matrices are prohibitively large (n × n where n is the number of parameters), making explicit construction impossible for modern models.\n\nThe library implements Krylov subspace methods that only require matrix-vector products, enabling efficient computation of:\n\n| Capability | Algorithm | Use Case |\n|------------|-----------|----------|\n| Top-k eigenvalues/eigenvectors | Lanczos, Power Iteration | Finding most-curved directions |\n| Trace estimation | Hutchinson, Hutch++ | Computing average curvature |\n| Spectral density | Stochastic Lanczos Quadrature | Visualizing eigenvalue distribution |\n\n资料来源：[hessian_eigenthings/algorithms/__init__.py:1-29]()\n\n## Algorithm Architecture\n\nThe algorithms in this module follow a consistent design pattern: they accept any `CurvatureOperator` and use the `LinAlgBackend` exclusively for vector arithmetic, ensuring portability across single-device and distributed settings.\n\n```mermaid\ngraph TD\n    A[CurvatureOperator] --> B[Lanczos Algorithm]\n    A --> C[Power Iteration]\n    A --> D[Trace Estimation]\n    A --> E[Spectral Density]\n    \n    B --> F[EigenResult]\n    C --> F\n    D --> G[TraceResult]\n    E --> H[SpectralDensityResult]\n    \n    F --> I[eigenvalues: Tensor]\n    F --> J[eigenvectors: Tensor]\n    F --> K[residuals: Tensor]\n```\n\n资料来源：[hessian_eigenthings/algorithms/result.py]()\n\n## Lanczos Algorithm\n\nThe Lanczos algorithm is the primary method for computing eigenvalues and eigenvectors of symmetric matrices. 
It builds a Krylov subspace through repeated matrix-vector products, then solves the small tridiagonal eigenvalue problem.\n\n### Symmetric Lanczos Implementation\n\nThe in-house Lanczos implementation provides optional full reorthogonalization to address the loss-of-orthogonality issues classical Lanczos is known for (Paige 1976).\n\n```python\ndef lanczos_tridiagonal(\n    operator: CurvatureOperator,\n    v0: torch.Tensor,\n    max_iter: int,\n    *,\n    reorthogonalize: bool = True,\n    backend: LinAlgBackend[torch.Tensor] | None = None,\n) -> LanczosTridiag\n```\n\n**Key characteristics:**\n\n- **Default reorthogonalization**: Enabled for `max_iter <= 50` to suppress ghost eigenvalues\n- **Computational tradeoff**: For larger Krylov dimensions, reorthogonalization becomes O(m²n); users analyzing near-degenerate spectra should re-enable it\n- **Memory efficiency**: Accumulates Ritz vectors directly via rank-1 outer-product updates, avoiding transient (n, k) → (k, n) transpose copies\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:30-58]()\n\n### Lanczos Output Structure\n\n```python\n@dataclass(frozen=True)\nclass LanczosTridiag:\n    \"\"\"Output of one Lanczos run: tridiagonal coefficients + the basis used to build them.\"\"\"\n    alphas: torch.Tensor      # (m,) diagonal\n    betas: torch.Tensor       # (m-1,) off-diagonal\n    basis: list[torch.Tensor] # length m, each (n,)\n    last_beta: float          # ||r_m|| residual norm at termination\n    iterations: int          # m, the actual number of Lanczos steps completed\n```\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:30-43]()\n\n### Full Lanczos Eigensolver\n\nThe high-level `lanczos()` function computes top-k eigenvalues with configurable eigenpair selection:\n\n```python\ndef lanczos(\n    operator: CurvatureOperator,\n    k: int = 1,\n    max_iter: int = 100,\n    *,\n    which: Which = \"LM\",\n    tol: float = 1e-8,\n    seed: int | None = None,\n    reorthogonalize: bool | None = 
None,\n    backend: LinAlgBackend[torch.Tensor] | None = None,\n) -> EigenResult\n```\n\n**Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `operator` | `CurvatureOperator` | required | The curvature matrix operator |\n| `k` | `int` | 1 | Number of eigenpairs to compute |\n| `max_iter` | `int` | 100 | Maximum Lanczos iterations |\n| `which` | `Literal[\"LM\", \"LA\", \"SA\"]` | \"LM\" | Which eigenvalues: LM=largest magnitude, LA=largest algebraic, SA=smallest algebraic |\n| `tol` | `float` | 1e-8 | Convergence tolerance |\n| `seed` | `int \\| None` | None | Random seed for reproducibility |\n| `reorthogonalize` | `bool \\| None` | None | Override default reorthogonalization setting |\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:58-100]()\n\n### Eigenvalue Selection Logic\n\nThe algorithm selects eigenvalues based on the `which` parameter:\n\n```python\nif which == \"LM\":\n    order = torch.argsort(theta.abs(), descending=True)\nelif which == \"LA\":\n    order = torch.argsort(theta, descending=True)\nelif which == \"SA\":\n    order = torch.argsort(theta, descending=False)\n```\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:75-83]()\n\n## Power Iteration\n\nPower iteration is a simpler method for finding the dominant eigenvalue. 
The library implements deflated power iteration to compute multiple eigenpairs sequentially by projecting out previously found directions.\n\n### Single Power Iteration\n\n```python\ndef power_iteration_one(\n    operator: CurvatureOperator,\n    v0: torch.Tensor,\n    max_iter: int,\n    tol: float = 1e-6,\n    backend: LinAlgBackend | None = None,\n) -> tuple[torch.Tensor, torch.Tensor]\n```\n\n资料来源：[hessian_eigenthings/algorithms/power_iteration.py]()\n\n### Deflated Power Iteration\n\nDeflated power iteration extends the basic method to compute multiple eigenpairs:\n\n```python\ndef deflated_power_iteration(\n    operator: CurvatureOperator,\n    num_eigs: int,\n    max_iter: int,\n    tol: float = 1e-6,\n    seed: int | None = None,\n    backend: LinAlgBackend | None = None,\n) -> EigenResult\n```\n\nThe deflation process removes previously found eigenvectors from the subspace before searching for the next eigenpair, preventing convergence to already-computed directions.\n\n资料来源：[hessian_eigenthings/algorithms/power_iteration.py]()\n\n## Trace Estimation\n\nTrace estimation provides a way to compute the average eigenvalue (trace / dimension) without full eigendecomposition. This is computationally much cheaper and useful for understanding overall curvature magnitude.\n\n### Hutchinson's Estimator\n\nHutchinson's method estimates the trace using random probe vectors:\n\n```python\ndef hutchinson(\n    operator: CurvatureOperator,\n    *,\n    num_samples: int = 100,\n    distribution: Distribution = \"rademacher\",\n    seed: int | None = None,\n    backend: LinAlgBackend[torch.Tensor] | None = None,\n) -> TraceResult\n```\n\nThe estimator computes: `(1/m) Σ vᵢᵀ A vᵢ`\n\nWhere vᵢ are random vectors from the specified distribution. 
Rademacher distribution provides lower variance than Gaussian.\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:48-71]()\n\n### Hutch++ Estimator\n\nHutch++ is an improved estimator with better convergence properties:\n\n```python\ndef hutch_plus_plus(\n    operator: CurvatureOperator,\n    *,\n    num_matvecs: int = 30,\n    seed: int | None = None,\n    backend: LinAlgBackend[torch.Tensor] | None = None,\n) -> TraceResult\n```\n\nHutch++ uses a structured random sampling approach that achieves lower variance than standard Hutchinson with the same number of matrix-vector products.\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:22-47]()\n\n### Unified Trace Interface\n\n```python\ndef trace(\n    operator: CurvatureOperator,\n    num_matvecs: int = 30,\n    *,\n    method: Literal[\"hutchinson\", \"hutch++\"] = \"hutch++\",\n    seed: int | None = None,\n    backend: LinAlgBackend[torch.Tensor] | None = None,\n) -> TraceResult\n```\n\n**Validation:** The `num_matvecs` parameter is validated to be at least 1.\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:71-84]()\n\n### Trace Result Structure\n\n```python\n@dataclass\nclass TraceResult:\n    estimate: float      # The trace estimate\n    stderr: float       # Standard error of the estimate\n    num_matvecs: int     # Number of matrix-vector products used\n    operator_size: int   # Dimension of the operator\n```\n\n资料来源：[hessian_eigenthings/algorithms/result.py]()\n\n## Spectral Density\n\nSpectral density estimation computes the eigenvalue distribution (density function) across the spectrum, enabling visualization and analysis of the full eigenvalue structure.\n\n### Stochastic Lanczos Quadrature\n\nThe `spectral_density()` function implements Stochastic Lanczos Quadrature (SLQ) to compute the spectral density:\n\n```python\ndef spectral_density(\n    operator: CurvatureOperator,\n    num_runs: int = 16,\n    lanczos_steps: int = 50,\n    seed: int | None = None,\n    backend: LinAlgBackend[torch.Tensor] | 
None = None,\n) -> SpectralDensityResult\n```\n\n**Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `operator` | `CurvatureOperator` | required | The curvature matrix operator |\n| `num_runs` | `int` | 16 | Number of randomized runs for averaging |\n| `lanczos_steps` | `int` | 50 | Lanczos iterations per run |\n| `seed` | `int \\| None` | None | Random seed |\n\n资料来源：[hessian_eigenthings/algorithms/spectral_density.py]()\n\n### Spectral Density Result\n\n```python\n@dataclass\nclass SpectralDensityResult:\n    grid: torch.Tensor      # Eigenvalue grid points\n    density: torch.Tensor   # Density values at each grid point\n    eigenvalues: list[torch.Tensor]  # Eigenvalues from each run\n    eigenvectors: list[list[torch.Tensor]]  # Corresponding eigenvectors\n```\n\nThe spectral density integrates to 1: `∫ density(λ) dλ ≈ 1`, which can be verified using numerical integration.\n\n资料来源：[hessian_eigenthings/algorithms/result.py]()\n\n## Common Result Types\n\nAll algorithms return standardized result objects that encapsulate the computed quantities along with metadata about the computation.\n\n```python\n@dataclass\nclass EigenResult:\n    eigenvalues: torch.Tensor       # (k,) tensor of eigenvalues\n    eigenvectors: torch.Tensor      # (k, n) matrix of eigenvectors\n    residuals: torch.Tensor          # (k,) convergence residuals\n    iterations: int                 # Number of iterations run\n    converged: bool                 # Whether all eigenpairs converged\n```\n\n资料来源：[hessian_eigenthings/algorithms/result.py]()\n\n## Algorithm Selection Guide\n\n```mermaid\ngraph LR\n    A[Goal] --> B{Eigenpairs?}\n    B -->|Yes, top-k| C[How many?]\n    C -->|1-10| D[Lanczos]\n    C -->|Many| E{Orthogonality critical?}\n    E -->|Yes| D\n    E -->|No| F[Deflated Power Iteration]\n    \n    B -->|Trace only| G{Accuracy priority?}\n    G -->|High| H[Hutch++]\n    G -->|Standard| I[Hutchinson]\n    \n    B 
-->|Full distribution| J[Spectral Density]\n    \n    D --> K[EigenResult]\n    F --> K\n    H --> L[TraceResult]\n    I --> L\n    J --> M[SpectralDensityResult]\n```\n\n### Decision Criteria\n\n| Scenario | Recommended Algorithm | Notes |\n|----------|----------------------|-------|\n| Top eigenvalues for single/large batch | `lanczos()` | Best accuracy, moderate cost |\n| Quick dominant eigenvalue | `deflated_power_iteration()` | Lower memory, less accurate |\n| Trace with limited matvecs | `hutch_plus_plus()` | Better convergence than Hutchinson |\n| Trace estimation | `hutchinson()` | Simpler, more matvecs needed |\n| Eigenvalue histogram/distribution | `spectral_density()` | Visualize full spectrum |\n\n## Integration with Curvature Operators\n\nThe algorithms are designed to work with any `CurvatureOperator` implementation, including:\n\n- `HessianOperator`: Exact Hessian via autograd or finite differences\n- `GGNOperator`: Generalized Gauss-Newton matrix\n- `EmpiricalFisherOperator`: Empirical Fisher information matrix\n- `DDPHessianOperator`: Distributed Data Parallel Hessian\n\nThis abstraction allows the same algorithm code to work across different curvature definitions without modification.\n\n资料来源：[hessian_eigenthings/algorithms/__init__.py:1-29]()\n\n## Performance Considerations\n\n### Memory Efficiency in Lanczos\n\nThe Lanczos implementation optimizes memory for large-scale models by:\n\n1. Avoiding allocation of full (n, m) basis matrix\n2. Using rank-1 outer-product updates for eigenvector accumulation\n3. Computing Ritz vectors directly into final (k, n) layout\n\n### Reorthogonalization Tradeoffs\n\n| Setting | Memory | Computation | Accuracy |\n|---------|--------|-------------|----------|\n| `reorthogonalize=True` | O(mn) | O(m²n) | High orthogonality |\n| `reorthogonalize=False` | O(mn) basis list | O(mn) | May have ghost eigenvalues |\n\nFor `max_iter <= 50`, reorthogonalization is enabled by default. 
For larger Krylov dimensions, it defaults off to maintain acceptable performance.\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:23-28]()\n\n## Example Usage\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos, trace, spectral_density\n\n# Create Hessian operator\noperator = HessianOperator(model, dataloader, loss_fn)\n\n# Compute top-5 eigenvalues and eigenvectors\neig_result = lanczos(operator, k=5, max_iter=40, tol=1e-7, seed=0)\nprint(f\"Top eigenvalue: {eig_result.eigenvalues[0]}\")\n\n# Estimate trace with Hutch++\ntrace_result = trace(operator, num_matvecs=99, method=\"hutch++\", seed=0)\nprint(f\"Trace estimate: {trace_result.estimate}\")\n\n# Compute spectral density\ndensity_result = spectral_density(operator, num_runs=8, lanczos_steps=40, seed=0)\n# Visualize with: plt.plot(density_result.grid, density_result.density)\n```\n\n资料来源：[examples/supervised_mlp.py:1-50]()\n\n---\n\n<a id='loss-functions'></a>\n\n## Loss Functions\n\n### 相关页面\n\n相关主题：[Curvature Operators](#curvature-operators), [Parameter Utilities](#parameter-utilities)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/loss_fns/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/__init__.py)\n- [hessian_eigenthings/loss_fns/standard.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/standard.py)\n- [hessian_eigenthings/loss_fns/huggingface.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n- [hessian_eigenthings/loss_fns/transformer_lens.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/transformer_lens.py)\n- [hessian_eigenthings/loss_fns/_fused_ce_hvp.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/_fused_ce_hvp.py)\n- 
[hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n</details>\n\n# Loss Functions\n\nLoss functions in this repository serve as the bridge between model outputs and curvature operators (Hessian and Generalized Gauss-Newton matrices). They provide the necessary computations for Hessian-vector products and support multiple backend implementations optimized for different use cases.\n\n## Overview\n\nThe loss functions module (`hessian_eigenthings/loss_fns/`) provides two distinct function signatures depending on the target operator:\n\n| Function Type | Signature | Used By |\n|---------------|-----------|---------|\n| `loss_fn` | `(model: nn.Module, batch: Any) -> torch.Tensor` | `HessianOperator` |\n| `loss_of_output_fn` | `(output: torch.Tensor, batch: Any) -> torch.Tensor` | `GGNOperator` |\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n资料来源：[hessian_eigenthings/operators/ggn.py:1-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Loss Function Entry Points] --> B[Standard Losses]\n    A --> C[HuggingFace Losses]\n    A --> D[TransformerLens Losses]\n    \n    B --> B1[MSE Loss]\n    B --> B2[Cross-Entropy Loss with HVP]\n    \n    C --> C1[Autoregressive LM Loss]\n    C --> C2[Shifted CE with Analytical HVP]\n    C --> C3[Fused CE HVP Backends]\n    \n    D --> D1[TransformerLens HookedModel Loss]\n    \n    C3 --> C3a[Triton Kernel]\n    C3 --> C3b[torch.compile]\n    C3 --> C3c[Eager Reference]\n```\n\n## Standard Loss Functions\n\nThe `standard.py` module provides loss functions for 
common supervised learning scenarios with closed-form Hessian-vector products.\n\n资料来源：[hessian_eigenthings/loss_fns/standard.py:1-80](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/standard.py)\n\n### MSE Loss\n\nReturns a wrapper compatible with `GGNOperator` for mean-squared error loss:\n\n```python\ndef mse_loss_of_output() -> Callable[[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], torch.Tensor]:\n    \"\"\"Make a `loss_of_output_fn` for `GGNOperator` from a (output, target) criterion.\"\"\"\n```\n\n### Cross-Entropy Loss with Analytical HVP\n\nThe cross-entropy implementation includes a closed-form Hessian-vector product for efficient computation:\n\n```python\ndef _ce_hvp(\n    output: torch.Tensor, batch: tuple[torch.Tensor, torch.Tensor], u: torch.Tensor\n) -> torch.Tensor:\n    \"\"\"Closed-form H @ u for mean-reduced softmax + cross-entropy.\n    \n    `output` has shape `(N, C)` (logits). For each row,\n    `H_row = (diag(p) - p p^T) / N` where `p = softmax(output)`.\n    \"\"\"\n```\n\n**Mathematical Foundation:**\n\nFor mean-reduced softmax + cross-entropy, the Hessian takes the form:\n\n```\nH_row = (diag(p) - p·p^T) / N\n```\n\nwhere:\n- `p = softmax(output)` is the predicted probability distribution\n- `N` is the number of samples\n- `u` is the input vector\n\n资料来源：[hessian_eigenthings/loss_fns/standard.py:40-55](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/standard.py)\n\n## HuggingFace Transformers Integration\n\nThe `huggingface.py` module provides loss functions specifically designed for HuggingFace Transformers models. 
These handle the internal loss computation that occurs when `labels` are present in the batch.\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:60-90](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n### Autoregressive Language Model Loss\n\n```python\ndef hf_lm_loss() -> Callable[[nn.Module, dict[str, Any]], torch.Tensor]:\n    \"\"\"For autoregressive LMs: `loss_fn(model, batch)` calls `model(**batch).loss`.\"\"\"\n```\n\nThe batch must include `labels` so HuggingFace computes the loss internally. For causal language models, this is typically `labels=input_ids` with the standard internal shift.\n\n### Shifted Cross-Entropy with Analytical HVP\n\nFor large-scale language model analysis, a shifted cross-entropy variant provides both the loss function and its analytical Hessian-vector product:\n\n```python\ndef hf_lm_shifted_ce(fused: FusedCEHvpBackend = \"auto\") -> _LossOfOutputWithHvp:\n    \"\"\"Shifted CE loss with analytical H @ u for autoregressive LMs.\"\"\"\n```\n\n**Shift Mechanism:**\n- The loss shifts logits left (discards last position) and labels right (discards first position)\n- Matches how `cross_entropy(ignore_index=-100)` handles gradient computation\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n## Fused Cross-Entropy HVP Backends\n\nThe `_fused_ce_hvp.py` module implements optimized backends for computing the cross-entropy Hessian-vector product.\n\n资料来源：[hessian_eigenthings/loss_fns/_fused_ce_hvp.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/_fused_ce_hvp.py)\n\n### Backend Selection\n\n| Backend | Description | Performance | Availability |\n|---------|-------------|-------------|--------------|\n| `\"auto\"` | Auto-select fastest available | Optimal | Default |\n| `\"triton\"` | 
Hand-written CUDA Triton kernel | ~3.4x speedup, 2x memory reduction | CUDA + Triton |\n| `\"compile\"` | `torch.compile`-fused | ~2.6x speedup, 2x memory reduction | torch >= 2.0 |\n| `\"eager\"` | Plain PyTorch reference | Baseline | Always |\n\n```python\nFusedCEHvpBackend = Literal[\"auto\", \"eager\", \"compile\", \"triton\"]\n```\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:25-35](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n### Backend Resolution Logic\n\n```mermaid\ngraph LR\n    A[Backend: \"auto\"] --> B{Device: CUDA?}\n    B -->|Yes + Triton available| C[Triton Kernel]\n    B -->|No| D{torch.compile available?}\n    D -->|Yes| E[torch.compile Backend]\n    D -->|No| F[Eager Backend]\n```\n\nThe resolution checks:\n1. If `backend != \"auto\"`, use the specified backend\n2. If `\"auto\"` and CUDA + Triton available → Triton\n3. If `\"auto\"` and torch.compile available → compile\n4. Otherwise → eager\n\n**Important:** The Triton kernel asserts `logits.is_cuda`, so on CUDA-equipped hosts running CPU inputs, the system falls back to `compile`.\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:40-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n### Memory Optimization\n\nAt LM scale (e.g., B=64, T=256, V=50304, fp32):\n\n| Implementation | Memory Footprint | Intermediate Tensors |\n|----------------|------------------|----------------------|\n| Eager | ~19.6 GB | ~6 (N, V) tensors |\n| Compile | ~3.3 GB | ~1 (N, V) tensor |\n| Target | ~3.3 GB | Output buffer only |\n\nThe fused implementations eliminate intermediates by computing:\n\n```\nout_flat = (p * u - p * <p, u>) * mask / n_valid\n```\n\nwith shape `(N, V)` in a single kernel pass.\n\n资料来源：[scripts/bench_fused_ce_hvp.py:1-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/scripts/bench_fused_ce_hvp.py)\n\n## Loss Function 
Wrapper\n\nThe `_LossOfOutputWithHvp` class wraps a loss function with its analytical Hessian-vector product:\n\n```python\nclass _LossOfOutputWithHvp:\n    \"\"\"Loss-of-output callable that also carries an analytical `.hvp` method.\n    \n    Wraps a plain `(output, batch) -> loss` function and a `(output, batch, u)\n    -> H_loss @ u` function in a single callable. `GGNOperator` checks for the\n    presence of `.hvp` and uses it as the loss-Hessian-vector product, skipping\n    the autograd `create_graph=True` double-backward path entirely.\n    \"\"\"\n```\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:100-120](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n## GGN Operator Integration\n\nThe `GGNOperator` automatically detects and uses analytical HVPs when available:\n\n```text\n`GGNOperator` picks this up automatically and skips the autograd\ndouble-backward.\n\nTwo implementations of the matvec are available via `loss_hvp=`:\n\n* ``\"analytical\"`` (default): finite-difference JVP + analytical loss-Hessian-vec\n  product (read from `loss_of_output_fn.hvp`, which must be present) + a single\n  normal backward to apply `J^T`. Memory footprint matches one normal training\n  step. Required for LM-scale use.\n\n* ``\"autograd\"``: the original `torch.func.jvp` + autograd double-backward +\n  `torch.func.vjp` path. 
Numerically exact and supports any loss, but memory\n  scales badly with output size.\n```\n\n资料来源：[hessian_eigenthings/operators/ggn.py:10-30](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n## TransformerLens Integration\n\nFor TransformerLens `HookedModel` architectures, a dedicated loss function handles the hook-based forward pass:\n\n资料来源：[hessian_eigenthings/loss_fns/transformer_lens.py:1-40](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/transformer_lens.py)\n\n```python\ndef tlens_loss() -> Callable[[nn.Module, Any], torch.Tensor]:\n    \"\"\"Loss function for TransformerLens HookedModel.\"\"\"\n```\n\n## Workflow: Choosing a Loss Function\n\n```mermaid\ngraph TD\n    A[Start] --> B{Model Type?}\n    B -->|Standard MLP/CNN| C[Use HessianOperator]\n    B -->|HuggingFace Transformers| D[Use GGNOperator]\n    B -->|TransformerLens| E[Use HessianOperator]\n    \n    C --> F[standard.mse_loss_of_output]\n    C --> G[standard.cross_entropy_loss_of_output]\n    \n    D --> H{Scale?}\n    H -->|Small model| I[hf_lm_loss with GGNOperator]\n    H -->|Large model| J[hf_lm_shifted_ce with GGNOperator]\n    \n    E --> K[tlens_loss with HessianOperator]\n    \n    J --> L[Choose HVP Backend]\n    L --> M{Device?}\n    M -->|CUDA + Triton| N[Use Triton backend]\n    M -->|CPU/MPS| O[Use compile or eager]\n```\n\n## Complete API Reference\n\n### Standard Module\n\n| Function | Returns | HVP Available |\n|----------|---------|---------------|\n| `mse_loss_of_output()` | `loss_of_output_fn` | No |\n| `cross_entropy_loss_of_output()` | `loss_of_output_fn` | Yes |\n| `_ce_hvp()` | Analytical HVP | - |\n\n### HuggingFace Module\n\n| Function | Returns | HVP Available |\n|----------|---------|---------------|\n| `hf_lm_loss()` | `loss_fn` | Via GGNOperator |\n| `hf_lm_shifted_ce(fused)` | `_LossOfOutputWithHvp` | Yes |\n| `_LossOfOutputWithHvp` | Wrapper class | Via 
`.hvp` attribute |\n\n### Fused CE HVP Module\n\n| Function | Description |\n|----------|-------------|\n| `_ce_hvp_reference()` | Eager reference implementation |\n| `_get_compiled_impl()` | Returns `torch.compile` wrapped version |\n| `compiled_ce_hvp()` | Compiled backend entry point |\n| `triton_ce_hvp()` | Triton kernel entry point |\n\n## Usage Examples\n\n### Standard Classification\n\n```python\nfrom hessian_eigenthings.operators import HessianOperator\nfrom hessian_eigenthings.loss_fns.standard import cross_entropy_loss_of_output\n\noperator = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=lambda m, b: torch.nn.functional.cross_entropy(m(b[0]), b[1]),\n)\n```\n\n### HuggingFace Large Model (Memory-Optimized)\n\n```python\nfrom hessian_eigenthings.operators import GGNOperator\nfrom hessian_eigenthings.loss_fns.huggingface import hf_lm_shifted_ce\n\nloss_fn = hf_lm_shifted_ce(fused=\"auto\")  # Auto-selects best backend\n\noperator = GGNOperator(\n    model=model,\n    dataloader=dataloader,\n    forward_fn=lambda m, b: m(**b).logits,\n    loss_of_output_fn=loss_fn,\n    loss_hvp=\"analytical\",  # Default, uses .hvp attribute\n)\n```\n\n资料来源：[hessian_eigenthings/loss_fns/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/__init__.py)\n\n---\n\n<a id='parameter-utilities'></a>\n\n## Parameter Utilities\n\n### 相关页面\n\n相关主题：[Curvature Operators](#curvature-operators)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/param_utils.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/param_utils.py)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- 
[examples/transformer_lens_attention_only.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/transformer_lens_attention_only.py)\n- [examples/huggingface_tiny_gpt2.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/huggingface_tiny_gpt2.py)\n</details>\n\n# Parameter Utilities\n\nThe parameter utilities module (`param_utils.py`) provides essential infrastructure for managing, filtering, and manipulating PyTorch model parameters within the Hessian eigendecomposition pipeline. These utilities form the foundation that connects curvature operators to the underlying model parameters, enabling efficient computation of Hessian-vector products and eigendecomposition across arbitrary subsets of model parameters.\n\n## Overview\n\nWhen working with large neural networks, it is often necessary to compute curvature information for only a subset of parameters. The parameter utilities support this use case through a flexible filtering mechanism combined with utilities for parameter vectorization, reshaping, and batch management.\n\nThe core responsibilities of the parameter utilities include:\n\n1. **Parameter Extraction** - Gathering named parameters from PyTorch modules\n2. **Parameter Filtering** - Selecting subsets of parameters based on name patterns or custom predicates\n3. **Vectorization** - Flattening parameters into vectors and reshaping vectors back to parameter shapes\n4. 
**Size Tracking** - Maintaining offset mappings for efficient vector-to-parameter conversions\n\n资料来源：[hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n## Core Types and Interfaces\n\n### ParamFilter Type\n\nThe `ParamFilter` type alias defines the contract for parameter selection functions:\n\n```python\nParamFilter = Callable[[str, nn.Parameter], bool]\n```\n\nA `ParamFilter` is a callable that takes two arguments:\n- `name: str` - The fully-qualified parameter name within the model\n- `param: nn.Parameter` - The parameter tensor itself\n\nThe function returns `True` if the parameter should be included in the operation, `False` otherwise.\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n### Parameter Collection Utilities\n\nThe module provides functions for extracting and organizing model parameters:\n\n| Function | Purpose |\n|----------|---------|\n| `get_param_names(model)` | Returns list of parameter names as fully-qualified strings |\n| `get_param_list(model)` | Returns list of parameter tensors |\n| `get_param_sizes(model)` | Returns list of parameter tensor sizes |\n| `get_filtered_params(model, param_filter)` | Returns filtered parameter names and tensors |\n\nThese functions work together to build the data structures required by curvature operators.\n\n资料来源：[hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n## Parameter Vectorization\n\n### Flattening Parameters to Vectors\n\nThe utilities support bidirectional conversion between parameter dictionaries and flat vectors. 
This is essential for Lanczos-based eigendecomposition algorithms that operate on vector spaces.\n\n```python\ndef flatten_params(param_dict: dict[str, Tensor]) -> Tensor:\n    \"\"\"Flatten all parameters into a single 1D tensor.\"\"\"\n```\n\nThe flattening process concatenates all parameter tensors in a deterministic order, preserving the mapping between parameter names and vector offsets.\n\n### Reshaping Vectors Back to Parameters\n\n```python\ndef unflatten_params(\n    vec: Tensor,\n    param_names: list[str],\n    param_list: list[Tensor],\n    sizes: list[torch.Size]\n) -> dict[str, Tensor]:\n    \"\"\"Reshape a flat vector back to parameter dictionary.\"\"\"\n```\n\nThe unflattening operation uses offset tracking to slice the vector and reshape each slice to match the original parameter shape:\n\n```python\nfor name, param, size in zip(param_names, param_list, sizes, strict=True):\n    out[name] = vec[offset : offset + size].reshape_as(param)\n    offset += size\n```\n\n资料来源：[hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n## Parameter Filtering Patterns\n\n### Name-Based Filtering with `match_names`\n\nThe most common filtering pattern uses glob-style matching against parameter names. 
The `match_names` function creates a `ParamFilter` from a list of name patterns:\n\n```python\ndef match_names(*patterns: str) -> ParamFilter:\n    \"\"\"Create a filter matching parameter names against glob patterns.\"\"\"\n```\n\n**Example usage:**\n\n```python\nfrom hessian_eigenthings.operators import HessianOperator\nfrom hessian_eigenthings.param_utils import match_names\n\n# Filter attention parameters only\nattn_filter = match_names(\"blocks.*.attn.*\")\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=attn_filter\n)\n\n# Filter MLP parameters only\nmlp_filter = match_names(\"blocks.*.mlp.*\")\nmlp_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=mlp_filter\n)\n```\n\n资料来源：[examples/transformer_lens_attention_only.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/transformer_lens_attention_only.py)\n\n### Multiple Pattern Matching\n\nThe `match_names` function supports multiple patterns, useful for targeting disjoint parameter groups:\n\n```python\n# Match multiple parameter groups\nfilter_fn = match_names(\n    \"transformer.h.*.attn.*\",\n    \"transformer.h.*.mlp.*\"\n)\n```\n\n### HuggingFace-Specific Patterns\n\nWhen working with HuggingFace transformers, parameter names follow a predictable structure:\n\n```python\n# Attention parameters in GPT-2\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=hf_lm_loss(),\n    param_filter=match_names(\"transformer.h.*.attn.*\"),\n)\n```\n\n资料来源：[examples/huggingface_tiny_gpt2.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/huggingface_tiny_gpt2.py)\n\n## Integration with Curvature Operators\n\n### Operator Size and Parameter Tracking\n\nCurvature operators maintain internal state about the parameters they operate on:\n\n| Attribute | Type | Description 
|\n|-----------|------|-------------|\n| `_param_names` | `list[str]` | Names of parameters in the filtered set |\n| `_param_list` | `list[Tensor]` | Parameter tensors |\n| `_sizes` | `list[torch.Size]` | Original tensor shapes for reshaping |\n| `size` | `int` | Total number of parameters (sum of all parameter elements) |\n\nThe `size` property is computed as:\n\n```python\nself.size = sum(p.numel() for p in self._param_list)\n```\n\nThis total parameter count determines the dimensionality of the vector space in which eigendecomposition occurs.\n\n资料来源：[hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n### Data Flow Diagram\n\n```mermaid\ngraph TD\n    A[PyTorch Model] --> B[get_param_names]\n    A --> C[get_param_list]\n    A --> D[get_param_sizes]\n    B --> E[ParamFilter Application]\n    C --> E\n    D --> E\n    E --> F[Filtered Parameter Collections]\n    F --> G[Curvature Operator]\n    G --> H[matvec Operations]\n    H --> I[Eigendecomposition Results]\n    \n    J[Input Vector] --> K[unflatten_params]\n    F --> K\n    K --> L[Parameter Dict]\n    L --> H\n```\n\n## Practical Examples\n\n### Computing Attention-Only Hessian Eigendecomposition\n\n```python\nfrom hessian_eigenthings.algorithms import lanczos\nfrom hessian_eigenthings.operators import HessianOperator\nfrom hessian_eigenthings.param_utils import match_names\n\n# Create attention-only operator\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.attn.*\")\n)\n\nprint(f\"Attention-only Hessian size: {attn_op.size} parameters\")\n\n# Compute top-3 eigenvalues\neig_attn = lanczos(attn_op, k=3, max_iter=20, tol=1e-3, seed=0)\nfor i, val in enumerate(eig_attn.eigenvalues):\n    print(f\"  λ_{i + 1} = {val.item(): .4e}\")\n```\n\n### Comparing Block-Specific Curvature\n\n```python\n# Full model Hessian\nfull_op = 
HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn\n)\n\n# Attention block only\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.attn.*\")\n)\n\n# MLP block only\nmlp_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.mlp.*\")\n)\n\n# Compare eigenvalue spectra\nfull_eig = lanczos(full_op, k=10)\nattn_eig = lanczos(attn_op, k=10)\nmlp_eig = lanczos(mlp_op, k=10)\n```\n\n资料来源：[examples/transformer_lens_attention_only.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/transformer_lens_attention_only.py)\n\n## Advanced Filtering\n\n### Custom Filter Functions\n\nFor complex filtering logic beyond glob matching, implement a custom `ParamFilter`:\n\n```python\ndef custom_filter(name: str, param: nn.Parameter) -> bool:\n    # Include only parameters with > 1000 elements\n    if param.numel() < 1000:\n        return False\n    # Exclude certain modules\n    if \"embedding\" in name:\n        return False\n    # Include based on naming patterns\n    return \"layer\" in name or \"head\" in name\n\noperator = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=custom_filter\n)\n```\n\n### Filter Composition\n\nFilters can be combined using standard Python patterns:\n\n```python\n# Intersection of patterns\ncombined_filter = lambda name, param: (\n    match_names(\"blocks.*.*.*\")(name, param) and\n    param.dtype == torch.float32\n)\n\n# Negation (glob-style patterns, matching the match_names convention)\nexclude_ln = lambda name, param: not (\n    match_names(\"*layernorm*\", \"*ln*\")(name, param)\n)\n```\n\n## Performance Considerations\n\n### Parameter Access Patterns\n\nThe parameter utilities maintain strict ordering between name lists and tensor lists to enable efficient offset-based indexing. 
When iterating over parameters in performance-critical paths:\n\n1. Use the pre-computed `_sizes` list to avoid repeated `param.shape` calls\n2. Leverage the `strict=True` zip when all lists are guaranteed to be aligned\n3. Prefer in-place reshaping over copies when possible\n\n### Memory Implications\n\n| Operation | Memory Pattern |\n|-----------|----------------|\n| `flatten_params` | Allocates new tensor of size `sum(numel)` |\n| `unflatten_params` | Creates dict, views from original vector |\n| `matvec` | No parameter data copies; uses VJP/JVP chains |\n\nThe vectorization maintains a view relationship with original parameters where possible, minimizing memory overhead during iterative algorithms.\n\n## API Reference\n\n### `match_names`\n\n```python\ndef match_names(*patterns: str) -> ParamFilter:\n    \"\"\"Create a ParamFilter matching parameter names against glob patterns.\"\"\"\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `patterns` | `str` | Glob patterns to match against parameter names |\n\n**Returns:** A callable `ParamFilter` that returns `True` for parameters matching any of the provided patterns.\n\n**Supported Glob Patterns:**\n- `*` - Matches any sequence of characters within a path component\n- `**` - Matches any sequence of path components (if supported)\n- `?` - Matches a single character\n- `[abc]` - Matches any character in the set\n\n### Parameter Extraction Functions\n\n```python\ndef get_param_names(model: nn.Module) -> list[str]:\n    \"\"\"Extract fully-qualified parameter names from a model.\"\"\"\n\ndef get_param_list(model: nn.Module) -> list[Tensor]:\n    \"\"\"Extract parameter tensors from a model.\"\"\"\n\ndef get_filtered_params(\n    model: nn.Module,\n    param_filter: ParamFilter | None\n) -> tuple[list[str], list[Tensor]]:\n    \"\"\"Extract filtered parameter names and 
tensors.\"\"\"\n```\n\n资料来源：[hessian_eigenthings/param_utils.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/param_utils.py)\n\n---\n\n<a id='distributed-computing'></a>\n\n## Distributed Computing with DDP\n\n### 相关页面\n\n相关主题：[Curvature Operators](#curvature-operators)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/operators/distributed/ddp.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/distributed/ddp.py)\n- [hessian_eigenthings/operators/distributed/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/distributed/__init__.py)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/__init__.py)\n- [hessian_eigenthings/operators/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n</details>\n\n# Distributed Computing with DDP\n\nThe `hessian-eigenthings` library provides native support for distributed training scenarios through `DDPHessianOperator`, a specialized curvature operator that extends the base `HessianOperator` to work correctly with PyTorch's `DistributedDataParallel` (DDP) wrapper.\n\n## Overview\n\nIn distributed training environments, the Hessian eigenvalue computations must account for how DDP synchronizes gradients across multiple processes. 
The `DDPHessianOperator` handles this synchronization transparently, ensuring that the Hessian-vector products (HVPs) computed across different ranks are properly averaged.\n\n**Key characteristics:**\n\n- Subclass of `HessianOperator` with distributed awareness\n- Automatically averages HVPs across all data-parallel ranks\n- Compatible with standard DDP-wrapped models\n- Supports the same API as the base `HessianOperator`\n- Handles the autograd graph complexity introduced by DDP's all-reduce operations\n\n## Architecture\n\n### Class Hierarchy\n\n```\nCurvatureOperator (base interface)\n    └── HessianOperator (base implementation)\n            └── DDPHessianOperator (DDP-aware extension)\n```\n\n### Data Flow Diagram\n\n```mermaid\ngraph TD\n    A[Model wrapped with DDP] --> B[DDPHessianOperator]\n    B --> C[Per-rank HVP Computation]\n    C --> D[Autograd-aware All-Reduce]\n    D --> E[Synchronized HVP across Ranks]\n    \n    F[torch.autograd.grad calls] --> G[Regular .backward hooks]\n    G --> H[No explicit all-reduce]\n    \n    I[Expected HVP] --> J[Actual HVP without DDPHessianOperator]\n    \n    K[Expected HVP] --> L[Actual HVP with DDPHessianOperator]\n    \n    style D fill:#90EE90\n    style H fill:#FFB6C1\n    style L fill:#90EE90\n```\n\n### DDP Behavior Explanation\n\nThe core challenge addressed by `DDPHessianOperator` stems from how PyTorch's `DistributedDataParallel` handles gradient synchronization:\n\n1. **DDP's all-reduce mechanism**: DDP normally fires its all-reduce operation inside the autograd graph during `loss.backward()`, synchronizing gradients across all ranks\n2. **Standard HessianOperator limitation**: When using `torch.autograd.grad` directly (as the base `HessianOperator` does), the DDP hooks fire on `.grad` accumulation rather than on `autograd.grad`'s return value\n3. 
**Resulting discrepancy**: Without explicit handling, the computed HVP does not match the single-process HVP computed on the union of all per-rank batches\n\nThe `DDPHessianOperator` resolves this by adding an explicit autograd-aware all-reduce after each gradient computation call, ensuring the resulting HVP equals the single-process HVP.\n\n## API Reference\n\n### DDPHessianOperator\n\n```python\nclass DDPHessianOperator(HessianOperator):\n    \"\"\"HessianOperator that all-reduces the HVP across torch.distributed ranks.\"\"\"\n```\n\n#### Constructor Parameters\n\n| Parameter | Type | Required | Default | Description |\n|-----------|------|----------|---------|-------------|\n| `model` | `nn.Module` | Yes | - | Model (may be DDP-wrapped; params are read directly) |\n| `dataloader` | `Iterable[Any]` | Yes | - | Data loader providing batches to average over |\n| `loss_fn` | `LossFn` | Yes | - | Loss function `forward_fn(...) -> loss` |\n| `param_filter` | `ParamFilter \\| None` | No | `None` | Optional filter for subset of parameters |\n| `full_dataset` | `bool` | No | `True` | Whether to compute Hessian over full dataset |\n| `num_batches` | `int \\| None` | No | `None` | Number of batches to sample if not full dataset |\n| `microbatch_size` | `int \\| None` | No | `None` | Chunk batch into micro-batches for memory |\n| `microbatch_unsafe` | `bool` | No | `False` | Skip gradient accumulation safety checks |\n| `method` | `HvpMethod` | No | `\"autograd\"` | HVP computation method |\n| `fd_eps` | `float \\| None` | No | `None` | Finite difference epsilon |\n| `backend` | `LinAlgBackend[torch.Tensor] \\| None` | No | `None` | Linear algebra backend |\n\nInherits all parameters from `HessianOperator` base class.\n\n#### Inherited Methods\n\n| Method | Description |\n|--------|-------------|\n| `matvec(v)` | Compute H·v where H is the Hessian averaged over batches |\n| `size` | Total number of parameters in the filtered parameter set |\n| `dtype` | Data type of 
parameters |\n| `device` | Device of parameters |\n\n### Import Location\n\n```python\nfrom hessian_eigenthings.operators import DDPHessianOperator\n```\n\nOr via the distributed submodule:\n\n```python\nfrom hessian_eigenthings.operators.distributed import DDPHessianOperator\n```\n\n资料来源：[hessian_eigenthings/operators/distributed/__init__.py:1-3]()\n\n## Usage Patterns\n\n### Basic Usage with DDP-wrapped Model\n\n```python\nimport torch\nimport torch.distributed as dist\nfrom torch.nn.parallel import DistributedDataParallel as DDP\nfrom hessian_eigenthings.operators import DDPHessianOperator\nfrom hessian_eigenthings.algorithms import lanczos\n\n# Assume model, dataloader, and loss_fn are already set up\nddp_model = DDP(model)\n\n# Create the distributed Hessian operator\nhessian_op = DDPHessianOperator(\n    model=ddp_model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n)\n\n# Compute eigenvalues using the Lanczos algorithm (returns an EigenResult)\neig = lanczos(hessian_op, k=10, max_iter=50)\n```\n\n### With Parameter Filtering\n\n```python\nfrom hessian_eigenthings.param_utils import match_names\n\n# Focus on specific layer parameters\nhessian_op = DDPHessianOperator(\n    model=ddp_model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"layer.4.*\"),\n)\n```\n\n### Using with Different HVP Methods\n\n```python\n# Using finite difference method (more memory-efficient)\nhessian_op_fd = DDPHessianOperator(\n    model=ddp_model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    method=\"finite_difference\",\n    fd_eps=1e-5,\n)\n```\n\n## Key Design Decisions\n\n### Autograd-aware All-Reduce\n\nThe `DDPHessianOperator` adds an explicit all-reduce operation that integrates with PyTorch's autograd engine. 
This ensures:\n\n- The all-reduce operation is included in the autograd graph when needed\n- Gradient flows correctly through the distributed computation\n- The final HVP is properly synchronized across all ranks\n\n### Parameter Access\n\nThe operator reads parameters directly from the model, whether or not it is wrapped with DDP:\n\n```python\n# From the source:\n# \"The model passed in may already be wrapped with\n#  torch.nn.parallel.DistributedDataParallel; we read params from it directly.\"\n```\n\nThis design allows seamless usage with existing DDP-wrapped models without modification.\n\n### Batch Distribution\n\nEach rank should receive its own shard of the dataset:\n\n> \"Each rank should be receiving its own shard of the dataset (typical pattern: a `torch.utils.data.distributed.DistributedSampler`).\"\n\n资料来源：[hessian_eigenthings/operators/distributed/ddp.py:21-25]()\n\n## Comparison with Single-Process HessianOperator\n\n| Aspect | `HessianOperator` | `DDPHessianOperator` |\n|--------|-------------------|----------------------|\n| Use case | Single GPU / CPU | Multi-GPU distributed |\n| Gradient sync | Manual handling required | Automatic via all-reduce |\n| DDP compatibility | May produce incorrect HVPs | Correct by design |\n| API | Identical | Identical |\n| Performance overhead | None | Single all-reduce per HVP |\n\n## Relationship to Other Operators\n\nThe `hessian_eigenthings` package provides multiple curvature operators:\n\n| Operator | Description | Distributed Support |\n|----------|-------------|---------------------|\n| `HessianOperator` | Full Hessian computation | Not DDP-aware |\n| `DDPHessianOperator` | Full Hessian with DDP sync | DDP-aware |\n| `GGNOperator` | Generalized Gauss-Newton | Not DDP-aware (as of v1.0) |\n| `EmpiricalFisherOperator` | Empirical Fisher matrix | Not DDP-aware |\n\n资料来源：[hessian_eigenthings/__init__.py:15-24]()\n\n## Limitations and Considerations\n\n1. 
**Current scope**: Only `HessianOperator` has a DDP-aware counterpart; other operators like `GGNOperator` and `EmpiricalFisherOperator` do not yet have distributed variants\n2. **Gradient hooks**: The operator does not currently support all DDP gradient hook mechanisms\n3. **Multi-node training**: While the operator uses standard `torch.distributed` primitives, performance at very large scale (>8 nodes) has not been extensively benchmarked\n4. **Mixed precision**: When using fp16/bf16 training, ensure consistent dtype across all ranks\n\n## Error Handling\n\nThe operator relies on standard PyTorch distributed error handling:\n\n- If `torch.distributed` is not initialized, standard errors will be raised\n- Mismatched tensor shapes across ranks will result in collective operation errors\n- Device mismatch (e.g., some ranks on CUDA, some on CPU) is not supported\n\n## Testing and Validation\n\nThe DDP functionality should be tested in a true distributed environment. Basic validation includes:\n\n1. **Consistency check**: HVP computed via `DDPHessianOperator` should equal the single-process HVP when aggregating all batch shards\n2. **Numerical accuracy**: Eigenvalues computed with DDP should match single-GPU results within floating-point tolerance\n3. **Scaling**: Computation time should scale sub-linearly with number of GPUs for large models\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：noahgolmant/pytorch-hessian-eigenthings\n\n摘要：发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：身份坑 - 仓库名和安装名不一致。\n\n## 1. 身份坑 · 仓库名和安装名不一致\n\n- 严重度：medium\n- 证据强度：runtime_trace\n- 发现：仓库名 `pytorch-hessian-eigenthings` 与安装入口 `hessian-eigenthings` 不完全一致。\n- 对用户的影响：用户照着仓库名搜索包或照着包名找仓库时容易走错入口。\n- 建议检查：在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。\n- 复现命令：`pip install hessian-eigenthings`\n- 防护动作：页面必须同时展示 repo 名和真实安装入口，避免用户搜索错包。\n- 证据：identity.distribution | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | repo=pytorch-hessian-eigenthings; install=hessian-eigenthings\n\n## 2. 
安装坑 · 来源证据：Python Error: the following arguments are required: experimentname\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Python Error: the following arguments are required: experimentname\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_24f46464d79f4ae3830f046c077a2574 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/39 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：v1.0.0a2 — packaging fix\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a2 — packaging fix\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_7540a696b30c46cdba07c12f33388567 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a2 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 4. 安装坑 · 来源证据：v1.0.0a3 — fix lanczos OOM\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a3 — fix lanczos OOM\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e5e68e2f24e1436cb8f3c2f11cefe326 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a3 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 5. 安装坑 · 来源证据：v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_914b7653aa8b4ef2844a2b4690fab2ad | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a4 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 6. 
安装坑 · 来源证据：v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_509356ab9b68434992d7237219952ba6 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a5 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 7. 配置坑 · 来源证据：RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f79a3a34cbab435cb3730b7ae17cf492 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/30 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 配置坑 · 来源证据：ValueError: PENet on the Kitti benchmark suite\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：ValueError: PENet on the Kitti benchmark suite\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_850f8cf0010c4d269ab71d864610097a | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/41 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 9. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | README/documentation is current enough for a first validation pass.\n\n## 10. 
运行坑 · 来源证据：AttributeError: 'HVPOperator' object has no attribute 'zero_grad'\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：AttributeError: 'HVPOperator' object has no attribute 'zero_grad'\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b515f5c06a5744b19b667bcbc8123348 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/38 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 11. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | last_activity_observed missing\n\n## 12. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | no_demo; severity=medium\n\n## 13. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | no_demo; severity=medium\n\n## 14. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | issue_or_pr_quality=unknown\n\n## 15. 
维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | release_recency=unknown\n\n<!-- canonical_name: noahgolmant/pytorch-hessian-eigenthings; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "pytorch-hessian-eigenthings",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "hn_item:48132232",
          "kind": "hn",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://news.ycombinator.com/item?id=48132232"
        },
        {
          "evidence_id": "art_82176a1723a2491b89b1dc14bf5cdebd",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/noahgolmant/pytorch-hessian-eigenthings#readme"
        }
      ],
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "pytorch-hessian-eigenthings 说明书",
      "toc": [
        "https://github.com/noahgolmant/pytorch-hessian-eigenthings 项目说明书",
        "目录",
        "Introduction to hessian-eigenthings",
        "Overview",
        "Core Concepts",
        "Architecture",
        "Algorithms",
        "Operators",
        "Doramagic 踩坑日志"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "eb04d493f7da48aa5fb19b58f057bb34ad57789a",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "pyproject.toml",
      "README.md",
      "uv.lock",
      "docs/index.md",
      "docs/how-to/distributed-ddp.md",
      "docs/how-to/custom-curvature-operators.md",
      "docs/how-to/analyze-a-huggingface-model.md",
      "docs/how-to/per-layer-hessian.md",
      "docs/how-to/analyze-with-transformer-lens.md",
      "docs/how-to/custom-loss-functions.md",
      "docs/reference/loss_fns.md",
      "docs/reference/api.md",
      "docs/reference/param_utils.md",
      "docs/reference/operators.md",
      "docs/reference/algorithms.md",
      "docs/concepts/ggn-vs-fisher-vs-hessian.md",
      "docs/concepts/fused-ce-hvp.md",
      "docs/concepts/top-k-eigenvalues.md",
      "docs/concepts/why-hvp-not-full-h.md",
      "docs/concepts/trace-estimation.md",
      "docs/concepts/numerical-stability.md",
      "docs/concepts/what-is-the-hessian.md",
      "docs/concepts/spectral-density.md",
      "docs/getting-started/transformers-quickstart.md",
      "docs/getting-started/quickstart.md",
      "docs/getting-started/installation.md",
      "examples/transformer_lens_attention_only.py",
      "examples/README.md",
      "examples/huggingface_tiny_gpt2.py",
      "examples/supervised_mlp.py"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# pytorch-hessian-eigenthings - Doramagic AI Context Pack\n\n> 定位：安装前体验与判断资产。它帮助宿主 AI 有一个好的开始，但不代表已经安装、执行或验证目标项目。\n\n## 充分原则\n\n- **充分原则，不是压缩原则**：AI Context Pack 应该充分到让宿主 AI 在开工前理解项目价值、能力边界、使用入口、风险和证据来源；它可以分层组织，但不以最短摘要为目标。\n- **压缩策略**：只压缩噪声和重复内容，不压缩会影响判断和开工质量的上下文。\n\n## 给宿主 AI 的使用方式\n\n你正在读取 Doramagic 为 pytorch-hessian-eigenthings 编译的 AI Context Pack。请把它当作开工前上下文：帮助用户理解适合谁、能做什么、如何开始、哪些必须安装后验证、风险在哪里。不要声称你已经安装、运行或执行了目标项目。\n\n## Claim 消费规则\n\n- **事实来源**：Repo Evidence + Claim/Evidence Graph；Human Wiki 只提供显著性、术语和叙事结构。\n- **事实最低状态**：`supported`\n- `supported`：可以作为项目事实使用，但回答中必须引用 claim_id 和证据路径。\n- `weak`：只能作为低置信度线索，必须要求用户继续核实。\n- `inferred`：只能用于风险提示或待确认问题，不能包装成项目事实。\n- `unverified`：不得作为事实使用，应明确说证据不足。\n- `contradicted`：必须展示冲突来源，不得替用户强行选择一个版本。\n\n## 它最适合谁\n\n- **想在安装前理解开源项目价值和边界的用户**：当前证据主要来自项目文档。 证据：`README.md` Claim：`clm_0002` supported 0.86\n\n## 它能做什么\n\n- **命令行启动或安装流程**（需要安装后验证）：项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n## 怎么开始\n\n- `pip install hessian-eigenthings` 证据：`README.md` Claim：`clm_0003` supported 0.86, `clm_0004` supported 0.86\n- `pip install \"hessian-eigenthings[transformers,transformer-lens]\"` 证据：`README.md` Claim：`clm_0004` supported 0.86\n- `git clone https://github.com/noahgolmant/pytorch-hessian-eigenthings` 证据：`README.md` Claim：`clm_0005` supported 0.86\n\n## 继续前判断卡\n\n- **当前建议**：先做角色匹配试用\n- **为什么**：这个项目更像角色库，核心风险是选错角色或把角色文案当执行能力；先用 Prompt Preview 试角色匹配，再决定是否沙盒导入。\n\n### 30 秒判断\n\n- **现在怎么做**：先做角色匹配试用\n- **最小安全下一步**：先用 Prompt Preview 试角色匹配；满意后再隔离导入\n- **先别相信**：角色质量和任务匹配不能直接相信。\n- **继续会触碰**：角色选择偏差、命令执行、本地环境或项目文件\n\n### 现在可以相信\n\n- **适合人群线索：想在安装前理解开源项目价值和边界的用户**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- **能力存在：命令行启动或安装流程**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n- **存在 Quick Start / 安装命令线索**（supported）：可以相信项目文档出现过启动或安装入口；不要因此直接在主力环境运行。 证据：`README.md` Claim：`clm_0003` supported 
0.86, `clm_0004` supported 0.86\n\n### 现在还不能相信\n\n- **角色质量和任务匹配不能直接相信。**（unverified）：角色库证明有很多角色，不证明每个角色都适合你的具体任务，也不证明角色能产生高质量结果。\n- **不能把角色文案当成真实执行能力。**（unverified）：安装前只能判断角色描述和任务画像是否匹配，不能证明它能在宿主 AI 里完成任务。\n- **真实输出质量不能在安装前相信。**（unverified）：Prompt Preview 只能展示引导方式，不能证明真实项目中的结果质量。\n- **宿主 AI 版本兼容性不能在安装前相信。**（unverified）：Claude、Cursor、Codex、Gemini 等宿主加载规则和版本差异必须在真实环境验证。\n- **不会污染现有宿主 AI 行为，不能直接相信。**（inferred）：Skill、plugin、AGENTS/CLAUDE/GEMINI 指令可能改变宿主 AI 的默认行为。\n- **可安全回滚不能默认相信。**（unverified）：除非项目明确提供卸载和恢复说明，否则必须先在隔离环境验证。\n- **真实安装后是否与用户当前宿主 AI 版本兼容？**（unverified）：兼容性只能通过实际宿主环境验证。\n- **项目输出质量是否满足用户具体任务？**（unverified）：安装前预览只能展示流程和边界，不能替代真实评测。\n\n### 继续会触碰什么\n\n- **角色选择偏差**：用户对任务应该由哪个专家角色处理的判断。 原因：选错角色会让 AI 从错误专业视角回答，浪费时间或误导决策。\n- **命令执行**：包管理器、网络下载、本地插件目录、项目配置或用户主目录。 原因：运行第一条命令就可能产生环境改动；必须先判断是否值得跑。 证据：`README.md`\n- **本地环境或项目文件**：安装结果、插件缓存、项目配置或本地依赖目录。 原因：安装前无法证明写入范围和回滚方式，需要隔离验证。 证据：`README.md`\n- **宿主 AI 上下文**：AI Context Pack、Prompt Preview、Skill 路由、风险规则和项目事实。 原因：导入上下文会影响宿主 AI 后续判断，必须避免把未验证项包装成事实。\n\n### 最小安全下一步\n\n- **先跑 Prompt Preview**：先用交互式试用验证任务画像和角色匹配，不要先导入整套角色库。（适用：任何项目都适用，尤其是输出质量未知时。）\n- **只在隔离目录或测试账号试装**：避免安装命令污染主力宿主 AI、真实项目或用户主目录。（适用：存在命令执行、插件配置或本地写入线索时。）\n- **安装后只验证一个最小任务**：先验证加载、兼容、输出质量和回滚，再决定是否深用。（适用：准备从试用进入真实工作流时。）\n\n### 退出方式\n\n- **保留安装前状态**：记录原始宿主配置和项目状态，后续才能判断是否可恢复。\n- **保留原始角色选择记录**：如果输出偏题，可以回到任务画像阶段重新选择角色，而不是继续沿着错误角色推进。\n- **记录安装命令和写入路径**：没有明确卸载说明时，至少要知道哪些目录或配置需要手动清理。\n- **如果没有回滚路径，不进入主力环境**：不可回滚是继续前阻断项，不应靠信任或运气继续。\n\n## 哪些只能预览\n\n- 解释项目适合谁和能做什么\n- 基于项目文档演示典型对话流程\n- 帮助用户判断是否值得安装或继续研究\n\n## 哪些必须安装后验证\n\n- 真实安装 Skill、插件或 CLI\n- 执行脚本、修改本地文件或访问外部服务\n- 验证真实输出质量、性能和兼容性\n\n## 边界与风险判断卡\n\n- **把安装前预览误认为真实运行**：用户可能高估项目已经完成的配置、权限和兼容性验证。 处理方式：明确区分 prompt_preview_can_do 与 runtime_required。 Claim：`clm_0006` inferred 0.45\n- **命令执行会修改本地环境**：安装命令可能写入用户主目录、宿主插件目录或项目配置。 处理方式：先在隔离环境或测试账号中运行。 证据：`README.md` Claim：`clm_0007` supported 0.86\n- **待确认**：真实安装后是否与用户当前宿主 AI 版本兼容？。原因：兼容性只能通过实际宿主环境验证。\n- **待确认**：项目输出质量是否满足用户具体任务？。原因：安装前预览只能展示流程和边界，不能替代真实评测。\n- 
**待确认**：安装命令是否需要网络、权限或全局写入？。原因：这影响企业环境和个人环境的安装风险。\n\n## 开工前工作上下文\n\n### 加载顺序\n\n- 先读取 how_to_use.host_ai_instruction，建立安装前判断资产的边界。\n- 读取 claim_graph_summary，确认事实来自 Claim/Evidence Graph，而不是 Human Wiki 叙事。\n- 再读取 intended_users、capabilities 和 quick_start_candidates，判断用户是否匹配。\n- 需要执行具体任务时，优先查 role_skill_index，再查 evidence_index。\n- 遇到真实安装、文件修改、网络访问、性能或兼容性问题时，转入 risk_card 和 boundaries.runtime_required。\n\n### 任务路由\n\n- **命令行启动或安装流程**：先说明这是安装后验证能力，再给出安装前检查清单。 边界：必须真实安装或运行后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n### 上下文规模\n\n- 文件总数：88\n- 重要文件覆盖：40/88\n- 证据索引条目：42\n- 角色 / Skill 条目：26\n\n### 证据不足时的处理\n\n- **missing_evidence**：说明证据不足，要求用户提供目标文件、README 段落或安装后验证记录；不要补全事实。\n- **out_of_scope_request**：说明该任务超出当前 AI Context Pack 证据范围，并建议用户先查看 Human Manual 或真实安装后验证。\n- **runtime_request**：给出安装前检查清单和命令来源，但不要替用户执行命令或声称已执行。\n- **source_conflict**：同时展示冲突来源，标记为待核实，不要强行选择一个版本。\n\n## Prompt Recipes\n\n### 适配判断\n\n- 目标：判断这个项目是否适合用户当前任务。\n- 预期输出：适配结论、关键理由、证据引用、安装前可预览内容、必须安装后验证内容、下一步建议。\n\n```text\n请基于 pytorch-hessian-eigenthings 的 AI Context Pack，先问我 3 个必要问题，然后判断它是否适合我的任务。回答必须包含：适合谁、能做什么、不能做什么、是否值得安装、证据来自哪里。所有项目事实必须引用 evidence_refs、source_paths 或 claim_id。\n```\n\n### 安装前体验\n\n- 目标：让用户在安装前感受核心工作流，同时避免把预览包装成真实能力或营销承诺。\n- 预期输出：一段带边界标签的体验剧本、安装后验证清单和谨慎建议；不含真实运行承诺或强营销表述。\n\n```text\n请把 pytorch-hessian-eigenthings 当作安装前体验资产，而不是已安装工具或真实运行环境。\n\n请严格输出四段：\n1. 先问我 3 个必要问题。\n2. 给出一段“体验剧本”：用 [安装前可预览]、[必须安装后验证]、[证据不足] 三种标签展示它可能如何引导工作流。\n3. 给出安装后验证清单：列出哪些能力只有真实安装、真实宿主加载、真实项目运行后才能确认。\n4. 
给出谨慎建议：只能说“值得继续研究/试装”“先补充信息后再判断”或“不建议继续”，不得替项目背书。\n\n硬性边界：\n- 不要声称已经安装、运行、执行测试、修改文件或产生真实结果。\n- 不要写“自动适配”“确保通过”“完美适配”“强烈建议安装”等承诺性表达。\n- 如果描述安装后的工作方式，必须使用“如果安装成功且宿主正确加载 Skill，它可能会……”这种条件句。\n- 体验剧本只能写成“示例台词/假设流程”：使用“可能会询问/可能会建议/可能会展示”，不要写“已写入、已生成、已通过、正在运行、正在生成”。\n- Prompt Preview 不负责给安装命令；如用户准备试装，只能提示先阅读 Quick Start 和 Risk Card，并在隔离环境验证。\n- 所有项目事实必须来自 supported claim、evidence_refs 或 source_paths；inferred/unverified 只能作风险或待确认项。\n\n```\n\n### 角色 / Skill 选择\n\n- 目标：从项目里的角色或 Skill 中挑选最匹配的资产。\n- 预期输出：候选角色或 Skill 列表，每项包含适用场景、证据路径、风险边界和是否需要安装后验证。\n\n```text\n请读取 role_skill_index，根据我的目标任务推荐 3-5 个最相关的角色或 Skill。每个推荐都要说明适用场景、可能输出、风险边界和 evidence_refs。\n```\n\n### 风险预检\n\n- 目标：安装或引入前识别环境、权限、规则冲突和质量风险。\n- 预期输出：环境、权限、依赖、许可、宿主冲突、质量风险和未知项的检查清单。\n\n```text\n请基于 risk_card、boundaries 和 quick_start_candidates，给我一份安装前风险预检清单。不要替我执行命令，只说明我应该检查什么、为什么检查、失败会有什么影响。\n```\n\n### 宿主 AI 开工指令\n\n- 目标：把项目上下文转成一次对话开始前的宿主 AI 指令。\n- 预期输出：一段边界明确、证据引用明确、适合复制给宿主 AI 的开工前指令。\n\n```text\n请基于 pytorch-hessian-eigenthings 的 AI Context Pack，生成一段我可以粘贴给宿主 AI 的开工前指令。这段指令必须遵守 not_runtime=true，不能声称项目已经安装、运行或产生真实结果。\n```\n\n\n## 角色 / Skill 索引\n\n- 共索引 26 个角色 / Skill / 项目文档条目。\n\n- **pytorch-hessian-eigenthings**（project_doc）：! PyPI https://img.shields.io/pypi/v/hessian-eigenthings.svg https://pypi.org/project/hessian-eigenthings/ ! Documentation https://img.shields.io/badge/docs-noahgolmant.github.io-blue https://noahgolmant.github.io/pytorch-hessian-eigenthings/ ! CI https://github.com/noahgolmant/pytorch-hessian-eigenthings/actions/workflows/ci.yml/badge.svg https://github.com/noahgolmant/pytorch-hessian-eigenthings/actions/workflows/… 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`README.md`\n- **Examples**（project_doc）：Self-contained scripts demonstrating common usage patterns. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/README.md`\n- **Contributing**（project_doc）：Thanks for your interest in contributing. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`CONTRIBUTING.md`\n- **hessian-eigenthings**（project_doc）：Iterative eigendecomposition of curvature operators Hessian, GGN, empirical Fisher for arbitrary PyTorch models, including HuggingFace and TransformerLens transformers. Top eigenvalues via Lanczos or power iteration, trace via Hutch++, and the spectral density via Stochastic Lanczos Quadrature, all matrix-free. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/index.md`\n- **Fused CE Hessian-vector product**（project_doc）：When you analyze a HuggingFace causal LM with GGNOperator and hf lm loss of output , the library uses a fused kernel for the core cross-entropy Hessian-vector product CE HVP instead of relying on autograd to differentiate the loss twice. This page explains why, what the kernel does, and what trade-offs each backend has. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/concepts/fused-ce-hvp.md`\n- **GGN vs Fisher vs Hessian**（project_doc）：These three matrices are easy to conflate and often called \"the curvature\" interchangeably. They are not the same. Read this page before deciding which operator to instantiate; mistaking one for another is the most common pitfall in curvature analysis. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/concepts/ggn-vs-fisher-vs-hessian.md`\n- **Numerical stability**（project_doc）：A short guide to dtype choice, finite-difference $\\varepsilon$ tuning, and reorthogonalization defaults. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/concepts/numerical-stability.md`\n- **Spectral density**（project_doc）：The eigenvalue density or density of states of an $n \\times n$ symmetric operator $H$ is the probability density of its eigenvalues: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/concepts/spectral-density.md`\n- **Top-k eigenvalues**（project_doc）：The two algorithms in this library for computing the top $k$ eigenpairs of a symmetric operator: power iteration with deflation and Lanczos . They share the same EigenResult return type but differ in how they search the spectrum. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/concepts/top-k-eigenvalues.md`\n- **Trace estimation**（project_doc）：The trace of a matrix-free operator can't be read off directly — we don't have the diagonal. Stochastic estimators infer it from a small number of $Hv$ products. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/concepts/trace-estimation.md`\n- **What is the Hessian**（project_doc）：The Hessian of a scalar loss $L \\theta $ with respect to model parameters $\\theta \\in \\mathbb{R}^n$ is the matrix of second partial derivatives: 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/concepts/what-is-the-hessian.md`\n- **Why HVP, not full $H$**（project_doc）：Forming the full Hessian costs $O n^2 $ memory. For a 7B-parameter model that's $\\sim$200 PB. Iterative algorithms power iteration, Lanczos, Hutchinson only ever need to apply $H$ to a vector, never to materialize it — so they cost $O n $ memory. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/concepts/why-hvp-not-full-h.md`\n- **Installation**（project_doc）： 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/getting-started/installation.md`\n- **Quickstart**（project_doc）：A complete top-k Hessian eigenvalue calculation on a small MLP, end to end. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/getting-started/quickstart.md`\n- **Transformers quickstart**（project_doc）：End-to-end on a HuggingFace causal LM. Requires the optional transformers extra. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/getting-started/transformers-quickstart.md`\n- **Analyze a HuggingFace model**（project_doc）：Compute the Hessian spectrum of any HuggingFace causal LM e.g. GPT-2, Llama, Qwen . Requires the optional transformers extra. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/how-to/analyze-a-huggingface-model.md`\n- **Analyze with TransformerLens**（project_doc）：Compute curvature on a TransformerLens https://github.com/TransformerLensOrg/TransformerLens HookedTransformer . Useful for mechanistic-interpretability work where you want to compute Hessian-related quantities while using TLens hooks. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/how-to/analyze-with-transformer-lens.md`\n- **Custom curvature operators**（project_doc）：The algorithms in this library Lanczos, power iteration, Hutchinson, Hutch++, SLQ operate on a CurvatureOperator interface. Subclass it to wire in any matrix-free symmetric operator. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/how-to/custom-curvature-operators.md`\n- **Custom loss functions**（project_doc）：Each operator takes a loss-function callable. The exact signature differs slightly by operator because GGN and empirical Fisher need to know more than just \"the loss\". 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/how-to/custom-loss-functions.md`\n- **Distributed DDP**（project_doc）：DDPHessianOperator averages the Hessian-vector product across torch.distributed ranks. Each rank receives its own shard of the dataset typical pattern: DistributedSampler ; the per-rank HVP is all-reduced before being returned. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/how-to/distributed-ddp.md`\n- **Per-layer Hessian**（project_doc）：Restrict the curvature operator to a subset of parameters using param filter . Operator size shrinks to match, and matvec only differentiates through the selected parameters. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/how-to/per-layer-hessian.md`\n- **Algorithms**（project_doc）：::: hessian eigenthings.algorithms.lanczos.lanczos 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/reference/algorithms.md`\n- **API reference**（project_doc）：The library's public surface is re-exported from the top-level hessian eigenthings package, so from hessian eigenthings import HessianOperator, lanczos, ... works for everything below. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/reference/api.md`\n- **Loss functions**（project_doc）：::: hessian eigenthings.loss fns.standard.supervised loss 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/reference/loss_fns.md`\n- **Operators**（project_doc）：::: hessian eigenthings.operators.base.CurvatureOperator 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/reference/operators.md`\n- **Parameter selection**（project_doc）：::: hessian eigenthings.param utils.select parameters 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/reference/param_utils.md`\n\n## 证据索引\n\n- 共索引 42 条证据。\n\n- **pytorch-hessian-eigenthings**（documentation）：! PyPI https://img.shields.io/pypi/v/hessian-eigenthings.svg https://pypi.org/project/hessian-eigenthings/ ! Documentation https://img.shields.io/badge/docs-noahgolmant.github.io-blue https://noahgolmant.github.io/pytorch-hessian-eigenthings/ ! CI https://github.com/noahgolmant/pytorch-hessian-eigenthings/actions/workflows/ci.yml/badge.svg https://github.com/noahgolmant/pytorch-hessian-eigenthings/actions/workflows/ci.yml ! License https://img.shields.io/badge/license-MIT-green.svg LICENSE 证据：`README.md`\n- **Examples**（documentation）：Self-contained scripts demonstrating common usage patterns. 证据：`examples/README.md`\n- **Contributing**（documentation）：Thanks for your interest in contributing. 
证据：`CONTRIBUTING.md`\n- **License**（source_file）：Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files the \"Software\" , to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 证据：`LICENSE`\n- **hessian-eigenthings**（documentation）：Iterative eigendecomposition of curvature operators Hessian, GGN, empirical Fisher for arbitrary PyTorch models, including HuggingFace and TransformerLens transformers. Top eigenvalues via Lanczos or power iteration, trace via Hutch++, and the spectral density via Stochastic Lanczos Quadrature, all matrix-free. 证据：`docs/index.md`\n- **Fused CE Hessian-vector product**（documentation）：When you analyze a HuggingFace causal LM with GGNOperator and hf lm loss of output , the library uses a fused kernel for the core cross-entropy Hessian-vector product CE HVP instead of relying on autograd to differentiate the loss twice. This page explains why, what the kernel does, and what trade-offs each backend has. 证据：`docs/concepts/fused-ce-hvp.md`\n- **GGN vs Fisher vs Hessian**（documentation）：These three matrices are easy to conflate and often called \"the curvature\" interchangeably. They are not the same. Read this page before deciding which operator to instantiate; mistaking one for another is the most common pitfall in curvature analysis. 证据：`docs/concepts/ggn-vs-fisher-vs-hessian.md`\n- **Numerical stability**（documentation）：A short guide to dtype choice, finite-difference $\\varepsilon$ tuning, and reorthogonalization defaults. 
证据：`docs/concepts/numerical-stability.md`\n- **Spectral density**（documentation）：The eigenvalue density or density of states of an $n \\times n$ symmetric operator $H$ is the probability density of its eigenvalues: 证据：`docs/concepts/spectral-density.md`\n- **Top-k eigenvalues**（documentation）：The two algorithms in this library for computing the top $k$ eigenpairs of a symmetric operator: power iteration with deflation and Lanczos . They share the same EigenResult return type but differ in how they search the spectrum. 证据：`docs/concepts/top-k-eigenvalues.md`\n- **Trace estimation**（documentation）：The trace of a matrix-free operator can't be read off directly — we don't have the diagonal. Stochastic estimators infer it from a small number of $Hv$ products. 证据：`docs/concepts/trace-estimation.md`\n- **What is the Hessian**（documentation）：The Hessian of a scalar loss $L \\theta $ with respect to model parameters $\\theta \\in \\mathbb{R}^n$ is the matrix of second partial derivatives: 证据：`docs/concepts/what-is-the-hessian.md`\n- **Why HVP, not full $H$**（documentation）：Forming the full Hessian costs $O n^2 $ memory. For a 7B-parameter model that's $\\sim$200 PB. Iterative algorithms power iteration, Lanczos, Hutchinson only ever need to apply $H$ to a vector, never to materialize it — so they cost $O n $ memory. 证据：`docs/concepts/why-hvp-not-full-h.md`\n- **Installation**（documentation）：Installation Optional extras: To work on the library itself: 证据：`docs/getting-started/installation.md`\n- **Quickstart**（documentation）：A complete top-k Hessian eigenvalue calculation on a small MLP, end to end. 证据：`docs/getting-started/quickstart.md`\n- **Transformers quickstart**（documentation）：End-to-end on a HuggingFace causal LM. Requires the optional transformers extra. 证据：`docs/getting-started/transformers-quickstart.md`\n- **Analyze a HuggingFace model**（documentation）：Compute the Hessian spectrum of any HuggingFace causal LM e.g. GPT-2, Llama, Qwen . 
Requires the optional transformers extra. 证据：`docs/how-to/analyze-a-huggingface-model.md`\n- **Analyze with TransformerLens**（documentation）：Compute curvature on a TransformerLens https://github.com/TransformerLensOrg/TransformerLens HookedTransformer . Useful for mechanistic-interpretability work where you want to compute Hessian-related quantities while using TLens hooks. 证据：`docs/how-to/analyze-with-transformer-lens.md`\n- **Custom curvature operators**（documentation）：The algorithms in this library Lanczos, power iteration, Hutchinson, Hutch++, SLQ operate on a CurvatureOperator interface. Subclass it to wire in any matrix-free symmetric operator. 证据：`docs/how-to/custom-curvature-operators.md`\n- **Custom loss functions**（documentation）：Each operator takes a loss-function callable. The exact signature differs slightly by operator because GGN and empirical Fisher need to know more than just \"the loss\". 证据：`docs/how-to/custom-loss-functions.md`\n- **Distributed DDP**（documentation）：DDPHessianOperator averages the Hessian-vector product across torch.distributed ranks. Each rank receives its own shard of the dataset typical pattern: DistributedSampler ; the per-rank HVP is all-reduced before being returned. 证据：`docs/how-to/distributed-ddp.md`\n- **Per-layer Hessian**（documentation）：Restrict the curvature operator to a subset of parameters using param filter . Operator size shrinks to match, and matvec only differentiates through the selected parameters. 证据：`docs/how-to/per-layer-hessian.md`\n- **Algorithms**（documentation）：::: hessian eigenthings.algorithms.lanczos.lanczos 证据：`docs/reference/algorithms.md`\n- **API reference**（documentation）：The library's public surface is re-exported from the top-level hessian eigenthings package, so from hessian eigenthings import HessianOperator, lanczos, ... works for everything below. 
证据：`docs/reference/api.md`\n- **Loss functions**（documentation）：::: hessian eigenthings.loss fns.standard.supervised loss 证据：`docs/reference/loss_fns.md`\n- **Operators**（documentation）：::: hessian eigenthings.operators.base.CurvatureOperator 证据：`docs/reference/operators.md`\n- **Parameter selection**（documentation）：::: hessian eigenthings.param utils.select parameters 证据：`docs/reference/param_utils.md`\n- **Settings**（structured_config）：{ \"hooks\": { \"PostToolUse\": { \"matcher\": \"Edit Write MultiEdit\", \"hooks\": { \"type\": \"command\", \"command\": \".claude/hooks/check-docs-affected.sh\" } } } } 证据：`.claude/settings.json`\n- **Byte-compiled / optimized / DLL files**（source_file）：Byte-compiled / optimized / DLL files pycache / .py cod $py.class 证据：`.gitignore`\n- **Eager attention: HF defaults to flash/SDPA which lacks a CPU**（source_file）：\"\"\"Top-k Hessian eigenvalues of a HuggingFace causal LM sshleifer/tiny-gpt2 . 证据：`examples/huggingface_tiny_gpt2.py`\n- **Supervised Mlp**（source_file）：\"\"\"Top-k eigenvalues, trace, and spectral density of a small MLP's Hessian. 证据：`examples/supervised_mlp.py`\n- **Transformer Lens Attention Only**（source_file）：\"\"\"Per-block Hessian analysis of a TransformerLens model: attention-only vs MLP-only. 证据：`examples/transformer_lens_attention_only.py`\n- **Init**（source_file）：\"\"\"Iterative eigendecomposition of curvature operators Hessian, GGN, Fisher for PyTorch models.\"\"\" 证据：`hessian_eigenthings/__init__.py`\n- **Batching**（source_file）：\"\"\"Dataloader iteration helpers and the microbatching safety guard. 证据：`hessian_eigenthings/batching.py`\n- **Param Utils**（source_file）：\"\"\"Parameter selection glob/regex and flat-vector ↔ per-param dict conversion.\"\"\" 证据：`hessian_eigenthings/param_utils.py`\n- **Mkdocs**（source_file）：site name: hessian-eigenthings site description: Iterative eigendecomposition of curvature operators for PyTorch models. 
site url: https://noahgolmant.github.io/pytorch-hessian-eigenthings/ repo url: https://github.com/noahgolmant/pytorch-hessian-eigenthings repo name: noahgolmant/pytorch-hessian-eigenthings edit uri: edit/master/docs/ 证据：`mkdocs.yml`\n- **ML conventions: N803/N806 uppercase vars like B, T, H, N , N812**（source_file）：build-system requires = \"hatchling\" build-backend = \"hatchling.build\" 证据：`pyproject.toml`\n- **Free between cells to keep peak-memory stats clean on CUDA.**（source_file）：\"\"\"Microbenchmark: eager vs fused CE HVP peak memory and wall time. 证据：`scripts/bench_fused_ce_hvp.py`\n- **!/usr/bin/env bash**（source_file）：!/usr/bin/env bash Spin up a single A100-80GB GCP VM, rsync the current branch, run the GPU-marked GGN matvec validation tests, capture results, teardown. Usage: ./scripts/gpu validate.sh Requires: gcloud configured, project set via PROJECT env var. 证据：`scripts/gpu_validate.sh`\n- **!/usr/bin/env bash**（source_file）：!/usr/bin/env bash Spin up an A100-80GB GCP VM, rsync the feat/fused-ce-hvp branch, run the fused CE HVP tests including the Triton path that skips without CUDA , run the microbenchmark for memory + wall-time numbers, capture results, teardown. 证据：`scripts/gpu_validate_fused.sh`\n- **---------------------------------------------------------------------------**（source_file）：\"\"\"Minimal CPU repro for the GGNOperator memory blowup. 
证据：`scripts/repro_ggn_oom.py`\n- **Conftest**（source_file）：def pytest collection modifyitems config: pytest.Config, items: list pytest.Item - None: try: import curvlinops noqa: F401 except ImportError: skip = pytest.mark.skip reason=\"curvlinops not installed\" for item in items: if \"curvlinops\" in item.keywords: item.add marker skip 证据：`tests/conftest.py`\n\n## 宿主 AI 必须遵守的规则\n\n- **把本资产当作开工前上下文，而不是运行环境。**：AI Context Pack 只包含证据化项目理解，不包含目标项目的可执行状态。 证据：`README.md`, `examples/README.md`, `CONTRIBUTING.md`\n- **回答用户时区分可预览内容与必须安装后才能验证的内容。**：安装前体验的消费者价值来自降低误装和误判，而不是伪装成真实运行。 证据：`README.md`, `examples/README.md`, `CONTRIBUTING.md`\n\n## 用户开工前应该回答的问题\n\n- 你准备在哪个宿主 AI 或本地环境中使用它？\n- 你只是想先体验工作流，还是准备真实安装？\n- 你最在意的是安装成本、输出质量、还是和现有规则的冲突？\n\n## 验收标准\n\n- 所有能力声明都能回指到 evidence_refs 中的文件路径。\n- AI_CONTEXT_PACK.md 没有把预览包装成真实运行。\n- 用户能在 3 分钟内看懂适合谁、能做什么、如何开始和风险边界。\n\n---\n\n## Doramagic Context Augmentation\n\n下面内容用于强化 Repomix/AI Context Pack 主体。Human Manual 只提供阅读骨架；踩坑日志会被转成宿主 AI 必须遵守的工作约束。\n\n## Human Manual 骨架\n\n使用规则：这里只是项目阅读路线和显著性信号，不是事实权威。具体事实仍必须回到 repo evidence / Claim Graph。\n\n宿主 AI 硬性规则：\n- 不得把页标题、章节顺序、摘要或 importance 当作项目事实证据。\n- 解释 Human Manual 骨架时，必须明确说它只是阅读路线/显著性信号。\n- 能力、安装、兼容性、运行状态和风险判断必须引用 repo evidence、source path 或 Claim Graph。\n\n- **Introduction to hessian-eigenthings**：importance `high`\n  - source_paths: README.md, hessian_eigenthings/__init__.py\n- **Installation Guide**：importance `high`\n  - source_paths: pyproject.toml, CONTRIBUTING.md\n- **Curvature Matrices Explained**：importance `high`\n  - source_paths: docs/concepts/ggn-vs-fisher-vs-hessian.md, docs/concepts/what-is-the-hessian.md\n- **Why Hessian-Vector Products**：importance `high`\n  - source_paths: docs/concepts/why-hvp-not-full-h.md, hessian_eigenthings/operators/hessian.py\n- **System Architecture**：importance `high`\n  - source_paths: hessian_eigenthings/operators/base.py, hessian_eigenthings/algorithms/__init__.py, hessian_eigenthings/linalg/__init__.py\n- **Curvature Operators**：importance `high`\n 
 - source_paths: hessian_eigenthings/operators/base.py, hessian_eigenthings/operators/hessian.py, hessian_eigenthings/operators/ggn.py, hessian_eigenthings/operators/fisher.py, hessian_eigenthings/operators/__init__.py\n- **Eigendecomposition Algorithms**：importance `high`\n  - source_paths: hessian_eigenthings/algorithms/lanczos.py, hessian_eigenthings/algorithms/power_iteration.py, hessian_eigenthings/algorithms/trace.py, hessian_eigenthings/algorithms/spectral_density.py, hessian_eigenthings/algorithms/result.py\n- **Loss Functions**：importance `medium`\n  - source_paths: hessian_eigenthings/loss_fns/__init__.py, hessian_eigenthings/loss_fns/standard.py, hessian_eigenthings/loss_fns/huggingface.py, hessian_eigenthings/loss_fns/transformer_lens.py, hessian_eigenthings/loss_fns/_fused_ce_hvp.py\n\n## Repo Inspection Evidence / 源码检查证据\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `eb04d493f7da48aa5fb19b58f057bb34ad57789a`\n- inspected_files: `pyproject.toml`, `README.md`, `uv.lock`, `docs/index.md`, `docs/how-to/distributed-ddp.md`, `docs/how-to/custom-curvature-operators.md`, `docs/how-to/analyze-a-huggingface-model.md`, `docs/how-to/per-layer-hessian.md`, `docs/how-to/analyze-with-transformer-lens.md`, `docs/how-to/custom-loss-functions.md`, `docs/reference/loss_fns.md`, `docs/reference/api.md`, `docs/reference/param_utils.md`, `docs/reference/operators.md`, `docs/reference/algorithms.md`, `docs/concepts/ggn-vs-fisher-vs-hessian.md`, `docs/concepts/fused-ce-hvp.md`, `docs/concepts/top-k-eigenvalues.md`, `docs/concepts/why-hvp-not-full-h.md`, `docs/concepts/trace-estimation.md`\n\n宿主 AI 硬性规则：\n- 没有 repo_clone_verified=true 时，不得声称已经读过源码。\n- 没有 repo_inspection_verified=true 时，不得把 README/docs/package 文件判断写成事实。\n- 没有 quick_start_verified=true 时，不得声称 Quick Start 已跑通。\n\n## Doramagic Pitfall Constraints / 踩坑约束\n\n这些规则来自 Doramagic 发现、验证或编译过程中的项目专属坑点。宿主 AI 必须把它们当作工作约束，而不是普通说明文字。\n\n### Constraint 1: 仓库名和安装名不一致\n\n- Trigger: 仓库名 
`pytorch-hessian-eigenthings` 与安装入口 `hessian-eigenthings` 不完全一致。\n- Host AI rule: 在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。\n- Why it matters: 用户照着仓库名搜索包或照着包名找仓库时容易走错入口。\n- Evidence: identity.distribution | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | repo=pytorch-hessian-eigenthings; install=hessian-eigenthings\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 2: 来源证据：Python Error: the following arguments are required: experimentname\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Python Error: the following arguments are required: experimentname\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_24f46464d79f4ae3830f046c077a2574 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/39 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 3: 来源证据：v1.0.0a2 — packaging fix\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a2 — packaging fix\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_7540a696b30c46cdba07c12f33388567 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a2 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 4: 来源证据：v1.0.0a3 — fix lanczos OOM\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a3 — fix lanczos OOM\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_e5e68e2f24e1436cb8f3c2f11cefe326 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a3 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 5: 来源证据：v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a4 — 
backend handles CPU-generator + CUDA-tensor combo\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_914b7653aa8b4ef2844a2b4690fab2ad | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a4 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 6: 来源证据：v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_509356ab9b68434992d7237219952ba6 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a5 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 7: 来源证据：RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_f79a3a34cbab435cb3730b7ae17cf492 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/30 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 8: 来源证据：ValueError: PENet on the Kitti benchmark suite\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：ValueError: PENet on the Kitti benchmark suite\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_850f8cf0010c4d269ab71d864610097a | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/41 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 
不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 9: 能力判断依赖假设\n\n- Trigger: README/documentation is current enough for a first validation pass.\n- Host AI rule: 将假设转成下游验证清单。\n- Why it matters: 假设不成立时，用户拿不到承诺的能力。\n- Evidence: capability.assumptions | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | README/documentation is current enough for a first validation pass.\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 10: 来源证据：AttributeError: 'HVPOperator' object has no attribute 'zero_grad'\n\n- Trigger: GitHub 社区证据显示该项目存在一个运行相关的待验证问题：AttributeError: 'HVPOperator' object has no attribute 'zero_grad'\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_b515f5c06a5744b19b667bcbc8123348 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/38 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n",
      "summary": "给宿主 AI 的上下文和工作边界。",
      "title": "AI Context Pack / 带给我的 AI"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card / 安装前决策卡\n\n项目：noahgolmant/pytorch-hessian-eigenthings\n\n## Doramagic 试用结论\n\n当前结论：可以进入发布前推荐检查；首次使用仍应从最小权限、临时目录和可回滚配置开始。\n\n## 用户现在可以做\n\n- 可以先阅读 Human Manual，理解项目目的和主要工作流。\n- 可以复制 Prompt Preview 做安装前体验；这只验证交互感，不代表真实运行。\n- 可以把官方 Quick Start 命令放到隔离环境中验证，不要直接进主力环境。\n\n## 现在不要做\n\n- 不要把 Prompt Preview 当成项目实际运行结果。\n- 不要把 metadata-only validation 当成沙箱安装验证。\n- 不要把未验证能力写成“已支持、已跑通、可放心安装”。\n- 不要在首次试用时交出生产数据、私人文件、真实密钥或主力配置目录。\n\n## 安装前检查\n\n- 宿主 AI 是否匹配：local_cli\n- 官方安装入口状态：已发现官方入口\n- 是否在临时目录、临时宿主或容器中验证：必须是\n- 是否能回滚配置改动：必须能\n- 是否需要 API Key、网络访问、读写文件或修改宿主配置：未确认前按高风险处理\n- 是否记录了安装命令、实际输出和失败日志：必须记录\n\n## 当前阻塞项\n\n- 无阻塞项。\n\n## 项目专属踩坑\n\n- 仓库名和安装名不一致（medium）：用户照着仓库名搜索包或照着包名找仓库时容易走错入口。 建议检查：在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。\n- 来源证据：Python Error: the following arguments are required: experimentname（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：v1.0.0a2 — packaging fix（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：v1.0.0a3 — fix lanczos OOM（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n\n## 风险与权限提示\n\n- no_demo: medium\n\n## 证据缺口\n\n- 暂未发现结构化证据缺口。\n",
      "summary": "安装、权限、验证和推荐前风险。",
      "title": "Boundary & Risk Card / 边界与风险卡"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/noahgolmant/pytorch-hessian-eigenthings 项目说明书\n\n生成时间：2026-05-16 15:14:47 UTC\n\n## 目录\n\n- [Introduction to hessian-eigenthings](#introduction)\n- [Installation Guide](#installation)\n- [Curvature Matrices Explained](#curvature-matrices)\n- [Why Hessian-Vector Products](#hvp-approach)\n- [System Architecture](#architecture)\n- [Curvature Operators](#curvature-operators)\n- [Eigendecomposition Algorithms](#algorithms)\n- [Loss Functions](#loss-functions)\n- [Parameter Utilities](#parameter-utilities)\n- [Distributed Computing with DDP](#distributed-computing)\n\n<a id='introduction'></a>\n\n## Introduction to hessian-eigenthings\n\n### 相关页面\n\n相关主题：[Curvature Matrices Explained](#curvature-matrices), [System Architecture](#architecture)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/README.md)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [hessian_eigenthings/algorithms/lanczos.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n- [hessian_eigenthings/algorithms/trace.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n- [hessian_eigenthings/loss_fns/huggingface.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n- [hessian_eigenthings/loss_fns/standard.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/standard.py)\n- 
[examples/huggingface_tiny_gpt2.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/huggingface_tiny_gpt2.py)\n- [mkdocs.yml](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/mkdocs.yml)\n</details>\n\n# Introduction to hessian-eigenthings\n\n## Overview\n\n`hessian-eigenthings` is a PyTorch library that provides efficient and scalable computation of eigendecomposition for the Hessian matrix and related curvature operators in neural networks. The library enables practitioners to compute top eigenvalues and eigenvectors via Lanczos or stochastic power iteration, trace estimates via Hutch++, and spectral density via Stochastic Lanczos Quadrature.\n\n资料来源：[README.md:1]()\n\nThe project targets researchers and engineers studying generalization properties of neural networks, where Hessian eigenvalues and eigenvectors have been implicated in understanding flat minima and model robustness.\n\n## Core Concepts\n\n### What is a Hessian?\n\nThe Hessian matrix is the second-order partial derivatives of a loss function with respect to model parameters. For a neural network with parameters θ and loss L, the Hessian H is defined as:\n\n```\nH[θ][i,j] = ∂²L / ∂θ[i]∂θ[j]\n```\n\nFor modern large-scale models, the Hessian is prohibitively expensive to compute explicitly—it has O(n²) entries where n is the number of parameters (e.g., billions for large language models).\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-50]()\n\n### Curvature Operators\n\nInstead of computing the full Hessian matrix, this library works with **curvature operators** that implement matrix-vector products (matvecs). 
Given a vector v, these operators efficiently compute:\n\n```\nH @ v → operator.matvec(v)\n```\n\nThis approach reduces memory from O(n²) to O(n), making analysis feasible for models with billions of parameters.\n\n### Supported Curvature Matrices\n\n| Operator | Description | Use Case |\n|----------|-------------|----------|\n| `HessianOperator` | Full Hessian of the loss | General curvature analysis |\n| `GGNOperator` | Generalized Gauss-Newton approximation | More stable than raw Hessian; equals Fisher for cross-entropy + softmax |\n| Custom Operators | User-defined curvature operators | Extend to other matrices |\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-30]()\n\n## Architecture\n\nThe library follows a clean separation of concerns with three main layers:\n\n```mermaid\ngraph TD\n    A[User Code] --> B[Algorithms]\n    A --> C[Loss Functions]\n    B --> D[Curvature Operators]\n    C --> D\n    D --> E[LinAlgBackend]\n    \n    B --> B1[Lanczos]\n    B --> B2[Stochastic Power Iteration]\n    B --> B3[Trace Estimation]\n    \n    D --> D1[HessianOperator]\n    D --> D2[GGNOperator]\n    D --> D3[Custom Operators]\n    \n    E --> E1[SingleDeviceBackend]\n    E --> E2[Distributed Backends]\n```\n\n### Component Layers\n\n1. **Algorithms Layer** (`hessian_eigenthings/algorithms/`): Eigenvalue/eigenvector computation methods that operate on any `CurvatureOperator`.\n\n2. **Operators Layer** (`hessian_eigenthings/operators/`): Implementations of various curvature matrices that provide the `matvec()` interface.\n\n3. **Loss Functions Layer** (`hessian_eigenthings/loss_fns/`): Pre-built loss functions with analytical Hessian-vector products for common use cases.\n\n4. 
**Backend Layer** (`hessian_eigenthings/backends/`): Abstraction for linear algebra operations supporting single-device and distributed execution.\n\n资料来源：[CONTRIBUTING.md:1-30]()\n\n## Algorithms\n\n### Lanczos Eigendecomposition\n\nThe Lanczos algorithm computes the top k eigenvalues and eigenvectors of a symmetric matrix using only matrix-vector products. It achieves this through k iterations of tridiagonal matrix construction.\n\n```mermaid\ngraph LR\n    A[Start Vector v₀] --> B[Iterate i = 1 to k]\n    B --> C[Compute βᵢvᵢ₊₁ = Avᵢ - αᵢvᵢ - βᵢ₋₁vᵢ₋₁]\n    C --> D[Compute αᵢ = vᵢᵀAvᵢ]\n    D --> E[Build Tridiagonal T]\n    E --> F{Eigenvalues of T ≈ eigenvalues of A?}\n    F -->|Yes| G[Eigenpairs Converged]\n```\n\nKey parameters for the Lanczos algorithm:\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `k` | int | required | Number of eigenpairs to compute |\n| `max_iter` | int | 100 | Maximum Lanczos iterations |\n| `tol` | float | 1e-6 | Convergence tolerance |\n| `seed` | int | None | Random seed for reproducibility |\n| `which` | str | \"LM\" | Which eigenvalues: \"LM\" (largest magnitude), \"LA\" (largest algebraic), \"SA\" (smallest algebraic) |\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:1-80]()\n\n### Trace Estimation\n\nThe trace of a matrix can be estimated using stochastic methods without forming the full matrix:\n\n**Hutchinson's Estimator:**\n```\ntrace(A) ≈ (1/m) Σᵢ vᵢᵀ A vᵢ\n```\nwhere vᵢ are random probe vectors.\n\n**Hutch++ Estimator:**\nAn improved estimator with lower variance:\n```\ntrace(A) ≈ (2/m) Σᵢ vᵢᵀ A vᵢ - (1/m) Σⱼ wⱼᵀ A wⱼ\n```\n\n| Method | `num_matvecs` | Variance | Use Case |\n|--------|---------------|----------|----------|\n| `hutchinson` | 100 | Higher | Quick estimates |\n| `hutch++` | 30 | Lower | Production estimates |\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:1-60]()\n\n## Operators\n\n### HessianOperator\n\nThe primary operator for computing Hessian 
eigendecomposition. It supports two HVP computation methods:\n\n| Method | Description | Memory | Precision |\n|--------|-------------|--------|-----------|\n| `\"autograd\"` (default) | Exact double-backward via `torch.autograd.grad` with `create_graph=True` | Higher | Numerically exact |\n| `\"finite_difference\"` | Central-difference approximation | Lower | O(ε²) bias |\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos\n\n# Basic usage\nop = HessianOperator(model=model, dataloader=dataloader, loss_fn=loss_fn)\neigenvalues, eigenvectors = lanczos(op, k=10)\n\n# With parameter filtering (subset of parameters)\nop = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"transformer.h.*.attn.*\"),\n)\n```\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-100]()\n\n### GGNOperator\n\nThe Generalized Gauss-Newton (GGN) operator provides a more numerically stable approximation to the Hessian. For cross-entropy + softmax classification, the GGN equals the Fisher information matrix.\n\n```python\nfrom hessian_eigenthings import GGNOperator\n\nop = GGNOperator(\n    model=model,\n    dataloader=dataloader,\n    forward_fn=model_forward,\n    loss_of_output_fn=loss_of_output_fn,\n)\n```\n\n**Two matvec implementations:**\n\n| Implementation | Description | Memory Footprint |\n|----------------|-------------|------------------|\n| `\"analytical\"` (default) | Finite-difference JVP + analytical loss-Hessian-vec product | Matches one training step |\n| `\"autograd\"` | Full `torch.func.jvp` + autograd double-backward | Scales badly with output size |\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-80]()\n\n## Loss Functions\n\nThe library provides optimized loss functions with closed-form Hessian-vector products for common use cases.\n\n### Standard Loss Functions\n\n```python\nfrom hessian_eigenthings.loss_fns.standard import cross_entropy_loss_of_output\n\nloss_fn = 
cross_entropy_loss_of_output()  # Returns loss_of_output_fn\n```\n\nThe closed-form cross-entropy HVP is:\n```\nH @ u = (p * u - p * <p, u>) / n\n```\nwhere `p = softmax(output)` and `n` is the number of valid positions.\n\n资料来源：[hessian_eigenthings/loss_fns/standard.py:1-60]()\n\n### HuggingFace Transformers Loss\n\nSpecialized support for HuggingFace models with fused CUDA kernels:\n\n```python\nfrom hessian_eigenthings.loss_fns.huggingface import hf_lm_loss\n\nloss_fn = hf_lm_loss()  # For language modeling\n```\n\n**Fused backend options:**\n\n| Backend | Device | Speedup | Memory |\n|---------|--------|---------|--------|\n| `\"triton\"` | CUDA | ~3.4x faster | 2x reduction |\n| `\"compile\"` | Any | ~2.6x faster | 2x reduction |\n| `\"eager\"` | Any | Baseline | Baseline |\n| `\"auto\"` | Auto-detect | Best available | Best available |\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-80]()\n\n## Usage Examples\n\n### Basic Hessian Eigendecomposition\n\n```python\nimport torch\nfrom torch import nn\nfrom hessian_eigenthings import HessianOperator, lanczos\n\n# Define model and data\nmodel = nn.Sequential(nn.Linear(100, 50), nn.ReLU(), nn.Linear(50, 10))\ndataloader = [(torch.randn(32, 100), torch.randint(0, 10, (32,)))]\n\n# Loss function\ndef loss_fn(model, batch):\n    x, y = batch\n    return nn.functional.cross_entropy(model(x), y)\n\n# Compute top 3 eigenvalues\nop = HessianOperator(model=model, dataloader=dataloader, loss_fn=loss_fn)\nresult = lanczos(op, k=3, max_iter=20, tol=1e-3, seed=0)\n\nfor i, val in enumerate(result.eigenvalues):\n    print(f\"λ_{i+1} = {val.item():.4e}\")\n```\n\n资料来源：[examples/huggingface_tiny_gpt2.py:1-50]()\n\n### Analyzing Attention Layers Only\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos\nfrom hessian_eigenthings.util import match_names\n\n# Filter to attention parameters only\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    
param_filter=match_names(\"blocks.*.attn.*\"),\n)\neigenvalues = lanczos(attn_op, k=5)\n```\n\n### Trace Estimation\n\n```python\nfrom hessian_eigenthings import HessianOperator, trace\n\nop = HessianOperator(model=model, dataloader=dataloader, loss_fn=loss_fn)\nresult = trace(op, num_matvecs=30, method=\"hutch++\", seed=0)\nprint(f\"Trace estimate: {result.estimate:.4e}\")\n```\n\n## Performance Considerations\n\n### Memory Management\n\nFor large models, consider these strategies:\n\n1. **Parameter Filtering**: Analyze only relevant subsets of parameters\n   ```python\n   op = HessianOperator(model, dataloader, loss_fn, param_filter=filter_func)\n   ```\n\n2. **Microbatching**: Process data in smaller chunks\n   ```python\n   op = HessianOperator(model, dataloader, loss_fn, microbatch_size=8)\n   ```\n\n3. **Finite Difference Method**: Use `\"finite_difference\"` for lower memory with FSDP/HSDP/TP\n   ```python\n   op = HessianOperator(model, dataloader, loss_fn, method=\"finite_difference\")\n   ```\n\n### Scalability\n\n| Model Size | Recommended Method | Notes |\n|------------|-------------------|-------|\n| < 1B params | `\"autograd\"` HVP | Numerically exact |\n| 1B - 7B params | `\"analytical\"` GGN | Good memory efficiency |\n| > 7B params | `\"finite_difference\"` HVP | Works with distributed training |\n\n### Computation Cost\n\nThe primary cost driver is the number of matrix-vector products (`matvecs`):\n\n- **Lanczos**: ~k × max_iter matvecs for k eigenpairs\n- **Trace (Hutch++)**: num_matvecs matvecs\n- **Spectral Density**: num_steps × num_random_start matvecs\n\n## API Reference\n\n### Core Functions\n\n| Function | Module | Description |\n|----------|--------|-------------|\n| `lanczos` | `hessian_eigenthings.algorithms` | Lanczos eigendecomposition |\n| `stochastic_power_iteration` | `hessian_eigenthings.algorithms` | Stochastic power iteration |\n| `trace` | `hessian_eigenthings.algorithms` | Trace estimation |\n| `spectral_density` | 
`hessian_eigenthings.algorithms` | Stochastic Lanczos Quadrature |\n\n### Operators\n\n| Class | Module | Description |\n|-------|--------|-------------|\n| `HessianOperator` | `hessian_eigenthings.operators` | Full Hessian operator |\n| `GGNOperator` | `hessian_eigenthings.operators` | Generalized Gauss-Newton operator |\n| `CurvatureOperator` | `hessian_eigenthings.operators` | Base class for custom operators |\n\n### Utility Functions\n\n| Function | Description |\n|----------|-------------|\n| `match_names(glob_pattern)` | Create parameter filter from glob pattern |\n| `SingleDeviceBackend` | Linear algebra backend for single-device execution |\n\n## Project Information\n\n### Acknowledgements\n\nThe original 2018 implementation was developed by Noah Golmant, Zhewei Yao, Amir Gholami, Michael Mahoney, and Joseph Gonzalez at UC Berkeley's RISELab.\n\nThe deflated power iteration is based on code from [HessianFlow](https://github.com/amirgholami/HessianFlow) (Z. Yao, A. Gholami, Q. Lei, K. Keutzer, M. Mahoney. *\"Hessian-based Analysis of Large Batch Training and Robustness to Adversaries\"*, NeurIPS 2018).\n\nAccelerated stochastic power iteration is from C. 
De Sa et al.\n\n### Citation\n\n```bibtex\n@misc{hessian-eigenthings,\n    author       = {Noah Golmant and Zhewei Yao and Amir Gholami and Michael Mahoney and Joseph Gonzalez},\n    title        = {pytorch-hessian-eigenthings: efficient PyTorch Hessian eigendecomposition},\n    month        = oct,\n    year         = 2018,\n    version      = {1.0},\n    url          = {https://github.com/noahgolmant/pytorch-hessian-eigenthings}\n}\n```\n\n### Installation\n\n```bash\n# From PyPI (stable release)\npip install hessian-eigenthings\n\n# Development setup\ngit clone https://github.com/noahgolmant/pytorch-hessian-eigenthings\ncd pytorch-hessian-eigenthings\nuv sync --group dev --group docs\n```\n\n### Documentation\n\nFull documentation is available at [noahgolmant.github.io/pytorch-hessian-eigenthings/](https://noahgolmant.github.io/pytorch-hessian-eigenthings/).\n\n资料来源：[README.md:1-100]()\n资料来源：[CONTRIBUTING.md:1-60]()\n资料来源：[mkdocs.yml:1-50]()\n\n---\n\n<a id='installation'></a>\n\n## Installation Guide\n\n### 相关页面\n\n相关主题：[Introduction to hessian-eigenthings](#introduction)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [CONTRIBUTING.md](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/CONTRIBUTING.md)\n- [README.md](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/README.md)\n- [pyproject.toml](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/pyproject.toml)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [examples/transformer_lens_attention_only.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/transformer_lens_attention_only.py)\n- 
[examples/huggingface_tiny_gpt2.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/huggingface_tiny_gpt2.py)\n</details>\n\n# Installation Guide\n\n## Overview\n\nThis guide covers all aspects of setting up the `hessian-eigenthings` library for computing Hessian eigendecomposition and related curvature matrix operations in PyTorch models.\n\nThe library provides efficient methods for computing top eigenvalues and eigenvectors via Lanczos or stochastic power iteration, trace estimates via Hutch++, and spectral density via Stochastic Lanczos Quadrature. 资料来源：[README.md:1-20]()\n\n## Installation Methods\n\n### PyPI Release (Recommended for Users)\n\nThe latest stable release is available on PyPI:\n\n```bash\npip install hessian-eigenthings\n```\n\nThis installs the core library without optional dependencies for transformer and curvlinops integrations. 资料来源：[README.md:1-10]()\n\n### Development Installation (For Contributors)\n\nFor development, clone the repository and install with all optional dependency groups:\n\n```bash\ngit clone https://github.com/noahgolmant/pytorch-hessian-eigenthings\ncd pytorch-hessian-eigenthings\nuv sync --group dev --group docs --extra transformers --extra transformer-lens --extra curvlinops\n```\n\n资料来源：[CONTRIBUTING.md:5-12]()\n\n## Optional Dependency Groups\n\nThe library uses optional dependency groups defined in `pyproject.toml` to enable specialized functionality:\n\n| Group | Purpose | Typical Use Case |\n|-------|---------|------------------|\n| `dev` | Testing, linting, type checking | Running CI checks locally |\n| `docs` | Building documentation | `mkdocs build --strict` |\n| `transformers` | HuggingFace Transformers integration | `GGNOperator` with HF models |\n| `transformer-lens` | TransformerLens integration | Attention-only Hessian analysis |\n| `curvlinops` | Cross-library validation tests | Testing against external oracle |\n\n资料来源：[CONTRIBUTING.md:8-10]()\n\n## Development Environment 
Setup\n\n### Prerequisites\n\n| Requirement | Version | Purpose |\n|-------------|---------|---------|\n| Python | ≥3.10 | Core runtime |\n| uv | Latest | Package manager |\n| PyTorch | ≥2.0 | Backend tensor operations |\n| CUDA (optional) | 11.8+ | GPU acceleration for large models |\n\n### Setup Workflow\n\n```mermaid\ngraph TD\n    A[Clone Repository] --> B[Install uv if needed]\n    B --> C[Run uv sync with groups]\n    C --> D[Verify Installation]\n    D --> E{Which workflow?}\n    E -->|Development| F[Run linting checks]\n    E -->|Testing| G[Run pytest]\n    E -->|Documentation| H[Build docs]\n    F --> I[Ready to contribute]\n    G --> I\n    H --> I\n```\n\n### Verification Commands\n\nAfter installation, verify the setup by running the full check suite:\n\n```bash\nuv run ruff check .\nuv run black --check .\nuv run mypy\nuv run pytest\nuv run mkdocs build --strict\n```\n\n资料来源：[CONTRIBUTING.md:14-23]()\n\n## CUDA/GPU Support\n\nThe library provides optimized CUDA kernels for specific operations:\n\n### Triton Kernels (CUDA Only)\n\nThe `hessian_eigenthings.loss_fns._fused_ce_hvp` module includes a hand-written Triton CUDA kernel for fused CE HVP computation. 
This kernel:\n\n- Eliminates zero `(N, V)` intermediates (output buffer only)\n- Provides ~3.4x speedup over eager mode\n- Reduces peak memory by 2x compared to eager\n- Falls back to `torch.compile` if Triton/CUDA is unavailable\n\n资料来源：[hessian_eigenthings/loss_fns/_fused_ce_hvp.py:50-80]()\n\n### Backend Selection\n\nFor HuggingFace language model loss functions, the `fused` parameter controls kernel selection:\n\n| Setting | Behavior |\n|---------|----------|\n| `\"auto\"` (default) | Picks fastest available: Triton on CUDA (~3.4x speedup), else `torch.compile` |\n| `\"eager\"` | Plain PyTorch implementation, useful for debugging |\n| `\"compile\"` | `torch.compile`-fused via Inductor, works on CPU/CUDA/MPS |\n| `\"triton\"` | Hand-written CUDA Triton kernel (CUDA only) |\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-50]()\n\n## Operator-Specific Dependencies\n\nDifferent curvature operators have different computational requirements:\n\n### HessianOperator\n\nTwo HVP methods are supported:\n\n| Method | Memory Profile | Precision | FSDP/TP Compatible |\n|--------|---------------|-----------|-------------------|\n| `\"autograd\"` (default) | Higher (requires `create_graph=True`) | Numerically exact | No (requires special handling) |\n| `\"finite_difference\"` | Matches one training step | O(ε²) truncation bias | Yes |\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-40]()\n\n### GGNOperator\n\nFor the Generalized Gauss-Newton operator, two matvec implementations are available:\n\n| Implementation | Memory | Use Case |\n|----------------|--------|----------|\n| `\"analytical\"` (default) | Matches one training step | LM-scale use, prevents OOM |\n| `\"autograd\"` | Scales with output size | Losses without analytical `.hvp` |\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-60]()\n\n## CI/CD Verification\n\nThe repository uses GitHub Actions for continuous integration. 
CI runs:\n\n- All linting and type checks\n- Full pytest test suite\n- Example scripts execution\n- Documentation codeblock tests\n\n资料来源：[CONTRIBUTING.md:23-26]()\n\n## Troubleshooting\n\n### Common Issues\n\n| Issue | Solution |\n|-------|----------|\n| Memory OOM with GGNOperator | Use `loss_hvp=\"analytical\"` (default in recent versions) |\n| FSDP/TP compatibility issues | Use `method=\"finite_difference\"` for HessianOperator |\n| Triton not available | Falls back to `torch.compile` automatically |\n| Type checking failures | Run `uv run mypy` locally before submitting PR |\n\n### Diagnostic Scripts\n\nThe repository includes diagnostic scripts for troubleshooting:\n\n- `scripts/repro_ggn_oom.py`: CPU-side memory regression test for GGNOperator OOM issues\n- `scripts/bench_fused_ce_hvp.py`: Microbenchmark for eager vs fused CE HVP performance\n\n资料来源：[scripts/repro_ggn_oom.py:1-40]()\n资料来源：[scripts/bench_fused_ce_hvp.py:1-50]()\n\n## Package Metadata\n\n| Property | Value |\n|----------|-------|\n| Package Name | `hessian-eigenthings` |\n| License | MIT |\n| Documentation | [noahgolmant.github.io/pytorch-hessian-eigenthings](https://noahgolmant.github.io/pytorch-hessian-eigenthings/) |\n| CI Status | [![CI](https://github.com/noahgolmant/pytorch-hessian-eigenthings/actions/workflows/ci.yml/badge.svg)](https://github.com/noahgolmant/pytorch-hessian-eigenthings/actions/workflows/ci.yml) |\n\n资料来源：[README.md:1-20]()\n\n---\n\n<a id='curvature-matrices'></a>\n\n## Curvature Matrices Explained\n\n### 相关页面\n\n相关主题：[Why Hessian-Vector Products](#hvp-approach), [Curvature Operators](#curvature-operators)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- 
[hessian_eigenthings/operators/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n- [hessian_eigenthings/operators/fisher.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/fisher.py)\n- [hessian_eigenthings/algorithms/lanczos.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n- [hessian_eigenthings/algorithms/trace.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n- [hessian_eigenthings/loss_fns/huggingface.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n- [hessian_eigenthings/loss_fns/_fused_ce_hvp.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/_fused_ce_hvp.py)\n</details>\n\n# Curvature Matrices Explained\n\n## Overview\n\nCurvature matrices characterize the second-order behavior of loss functions in neural networks, providing critical information about optimization landscapes, generalization properties, and model robustness. The `hessian-eigenthings` library provides efficient, matrix-free computation of eigendecompositions for three key curvature matrices: the **Hessian**, the **Generalized Gauss-Newton (GGN)**, and the **Empirical Fisher**.\n\nThese curvature operators serve as the foundation for analyzing flat minima, understanding generalization, and performing second-order optimization. 
The library implements matrix-vector products (matvecs) directly, avoiding explicit matrix construction which would be computationally infeasible for large neural networks with billions of parameters.\n\n## Curvature Matrices Architecture\n\n```mermaid\ngraph TD\n    A[Loss Function Lθ] --> B[Hessian H = ∇²L]\n    A --> C[Generalized Gauss-Newton G]\n    A --> D[Empirical Fisher F]\n    \n    B --> E[Matrix-Free MatVec]\n    C --> E\n    D --> E\n    \n    E --> F[Lanczos Eigendecomposition]\n    E --> G[Trace Estimation]\n    E --> H[Spectral Density]\n    \n    F --> I[Top-k Eigenpairs]\n    G --> J[Trace Estimate ± SE]\n    H --> K[Spectral Density Plot]\n```\n\n## The Hessian Matrix\n\n### Definition and Role\n\nThe Hessian matrix `H = ∇²L(θ)` is the second derivative of the loss with respect to parameters. It captures the exact local curvature of the loss landscape, making it the most precise but also most computationally expensive curvature matrix.\n\nThe Hessian is symmetric by construction and its eigenvalues reveal critical properties:\n\n- **Large positive eigenvalues** indicate sharp curvature, suggesting the model is in a narrow minimum\n- **Small eigenvalues** indicate flat regions associated with better generalization\n- **Negative eigenvalues** signal instability and potential divergence\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-30](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n### HessianOperator Implementation\n\nThe `HessianOperator` class provides two methods for computing Hessian-vector products (HVPs):\n\n```python\nclass HessianOperator(CurvatureOperator):\n    \"\"\"Hessian of `loss_fn(model, batch)` averaged over batches in dataloader.\"\"\"\n    \n    def __init__(\n        self,\n        model: nn.Module,\n        dataloader: Iterable[Any],\n        loss_fn: LossFn,\n        *,\n        param_filter: ParamFilter | None = None,\n        full_dataset: bool = 
True,\n        num_batches: int | None = None,\n        microbatch_size: int | None = None,\n        method: HvpMethod = \"autograd\",\n        fd_eps: float | None = None,\n        backend: LinAlgBackend[torch.Tensor] | None = None,\n    ) -> None:\n```\n\n**HVP Computation Methods:**\n\n| Method | Description | Memory | Precision |\n|--------|-------------|--------|-----------|\n| `\"autograd\"` | Exact double-backward via `torch.autograd.grad` with `create_graph=True` | Higher (scales with model size) | Numerically exact to rounding |\n| `\"finite_difference\"` | Central-difference `(∇L(θ+εv) − ∇L(θ−εv)) / 2ε` | Lower (two forward+backward passes) | O(ε²) truncation bias |\n\nThe finite-difference method uses dtype-specific epsilon values for optimal precision:\n\n| dtype | Epsilon |\n|-------|---------|\n| float64 | 6e-6 |\n| float32 | 5e-3 |\n| bfloat16 | 0.2 |\n| float16 | 5e-2 |\n\n资料来源：[hessian_eigenthings/operators/hessian.py:30-55](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n### When to Use the Hessian\n\nThe Hessian is ideal for:\n- Single-device analysis of models up to ~7B parameters\n- Scenarios requiring exact curvature information\n- Research on loss landscape topology\n- Verifying approximations against ground truth\n\n## Generalized Gauss-Newton (GGN)\n\n### Definition and Mathematical Foundation\n\nThe Generalized Gauss-Newton matrix `G` is a positive semi-definite (PSD) approximation to the Hessian. 
For a loss of the form `L = (1/n) Σ l(f(xᵢ;θ), yᵢ)`, the GGN is defined as:\n\n```\nG = Jᵀ · H_loss · J\n```\n\nWhere:\n- `J` is the Jacobian of model outputs with respect to parameters\n- `H_loss` is the Hessian of the loss with respect to model outputs\n\nThe GGN is always PSD because `G = Jᵀ · H_loss · J` and `H_loss` is PSD for convex losses.\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-40](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n### GGNOperator Implementation\n\nThe `GGNOperator` class provides two matvec implementations:\n\n```python\nclass GGNOperator(CurvatureOperator):\n    def __init__(\n        self,\n        model: nn.Module,\n        dataloader: Iterable[Any],\n        forward_fn: ForwardFn,\n        loss_of_output_fn: LossOfOutputFn,\n        *,\n        loss_hvp: Literal[\"analytical\", \"autograd\"] = \"analytical\",\n    ) -> None:\n```\n\n| Method | Description | Memory | Use Case |\n|--------|-------------|--------|----------|\n| `\"analytical\"` | Finite-difference JVP + analytical loss-Hessian-vector product | Matches one training step | LM-scale use, OOM-safe |\n| `\"autograd\"` | `torch.func.jvp` + autograd double-backward + `vjp` | Scales with output size | Exact for arbitrary losses |\n\nFor cross-entropy + softmax classification, `G` equals the Fisher information matrix, making the GGN and Fisher equivalent in this common case.\n\n资料来源：[hessian_eigenthings/operators/ggn.py:40-80](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n### Two-Function API Design\n\nThe GGNOperator uses a separation between `forward_fn` and `loss_of_output_fn`:\n\n```python\nForwardFn = Callable[[nn.Module, Any], torch.Tensor]\nLossOfOutputFn = Callable[[torch.Tensor, Any], torch.Tensor]\n```\n\nThis design enables computing `J·v`, `H_loss·(J·v)`, and `Jᵀ·(H_loss·J·v)` without coupling to loss internals.\n\n### Closed-Form 
Cross-Entropy HVP\n\nFor mean-reduced cross-entropy with softmax, the library provides an optimized analytical HVP:\n\n```\nH_loss @ u = (p * u - p * ⟨p, u⟩) / n\n```\n\nWhere `p = softmax(logits)` and `n` is the count of non-ignored positions.\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n### Fused CE HVP Implementations\n\nThe library provides three backend implementations for the cross-entropy HVP:\n\n| Backend | Description | Memory | Speedup |\n|---------|-------------|--------|---------|\n| `\"eager\"` | Plain PyTorch reference | Highest | 1x baseline |\n| `\"compile\"` | `torch.compile`-fused; Inductor fuses operations | Reduced | ~2.6x faster |\n| `\"triton\"` | Hand-written CUDA Triton kernel | Minimal (output buffer only) | ~3.4x faster |\n\n资料来源：[hessian_eigenthings/loss_fns/_fused_ce_hvp.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/_fused_ce_hvp.py)\n\n## Empirical Fisher\n\n### Definition\n\nThe Empirical Fisher matrix `F` is defined as:\n\n```\nF = (1/n) Σ ∇lᵢ · ∇lᵢᵀ\n```\n\nWhere the expectation over data is replaced by the empirical average over batches. 
It is always PSD and serves as an approximation to the Fisher Information Matrix.\n\n### EmpiricalFisherOperator\n\n```python\nclass EmpiricalFisherOperator(CurvatureOperator):\n    def __init__(\n        self,\n        model: nn.Module,\n        dataloader: Iterable[Any],\n        loss_fn: LossFn,\n        *,\n        per_sample: bool = False,\n    ) -> None:\n```\n\n资料来源：[hessian_eigenthings/operators/fisher.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/fisher.py)\n\n## Curvature Operator Interface\n\nAll curvature matrices implement the `CurvatureOperator` base class:\n\n```python\nclass CurvatureOperator(ABC):\n    @property\n    @abstractmethod\n    def size(self) -> int:\n        \"\"\"Number of parameters (matrix dimension).\"\"\"\n        ...\n    \n    @property\n    def dtype(self) -> torch.dtype:\n        ...\n    \n    @property\n    def device(self) -> torch.device:\n        ...\n    \n    @abstractmethod\n    def matvec(self, v: torch.Tensor) -> torch.Tensor:\n        \"\"\"Compute matrix-vector product A @ v.\"\"\"\n        ...\n```\n\n资料来源：[hessian_eigenthings/operators/base.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/base.py)\n\n## Parameter Filtering\n\nCurvature operators support computing curvature only over a subset of parameters using `ParamFilter`:\n\n```python\ndef match_names(*patterns: str) -> ParamFilter:\n    \"\"\"Match parameters by name patterns.\"\"\"\n    \ndef match_regex(pattern: str) -> ParamFilter:\n    \"\"\"Match parameters by regex pattern.\"\"\"\n```\n\nThis enables analysis of specific components:\n\n```python\n# Analyze only attention weights\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.attn.*\"),\n)\n```\n\n## Algorithmic Foundations\n\n### Lanczos Eigendecomposition\n\nThe Lanczos algorithm computes eigenvalues and 
eigenvectors of large sparse matrices using only matrix-vector products:\n\n```python\ndef lanczos(\n    operator: CurvatureOperator,\n    k: int = 10,\n    max_iter: int = 100,\n    tol: float = 1e-6,\n    which: str = \"LM\",\n) -> EigenResult:\n```\n\nThe algorithm:\n1. Builds a tridiagonal matrix `T` from matvec operations\n2. Computes eigenvalues of `T` as Ritz approximations\n3. Accumulates eigenvectors directly via rank-1 outer-product updates\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:1-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n\n### Trace Estimation\n\nTrace estimation uses stochastic probing to estimate `tr(A)` without computing the full matrix:\n\n| Method | Samples | Variance | Description |\n|--------|---------|----------|-------------|\n| Hutchinson | m | O(1/√m) | `(1/m) Σ vᵢᵀ A vᵢ` with Rademacher/Gaussian vectors |\n| Hutch++ | m | O(1/m) | Improved estimator with better constant factors |\n\n```python\ndef trace(\n    operator: CurvatureOperator,\n    *,\n    num_matvecs: int = 100,\n    method: Method = \"hutch++\",\n) -> TraceResult:\n```\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n\n## Operator Selection Guide\n\n```mermaid\ngraph LR\n    A[Need Curvature?] 
--> B{Exact Hessian?}\n    B -->|Yes, small model| C[HessianOperator<br/>method=autograd]\n    B -->|No| D{Need Fisher/GGN?}\n    D -->|Yes, cross-entropy| E[GGNOperator<br/>loss_hvp=analytical]\n    D -->|Yes, other loss| F[GGNOperator<br/>loss_hvp=autograd]\n    D -->|Empirical Fisher| G[EmpiricalFisherOperator]\n    \n    C --> H[Use lanczos for eigenvalues]\n    E --> H\n    F --> H\n    G --> H\n```\n\n| Scenario | Recommended Operator | Method |\n|----------|----------------------|--------|\n| Exact Hessian, single GPU | `HessianOperator` | `method=\"autograd\"` |\n| Large model, distributed | `HessianOperator` | `method=\"finite_difference\"` |\n| Language modeling, cross-entropy | `GGNOperator` | `loss_hvp=\"analytical\"` |\n| Custom loss, need exact | `GGNOperator` | `loss_hvp=\"autograd\"` |\n| Natural gradient optimization | `EmpiricalFisherOperator` | Default |\n\n## Module Exports\n\n```python\nfrom hessian_eigenthings.operators import (\n    CurvatureOperator,\n    HessianOperator,\n    GGNOperator,\n    EmpiricalFisherOperator,\n    DDPHessianOperator,  # For DistributedDataParallel\n    LambdaOperator,      # Custom curvature wrappers\n)\n\nfrom hessian_eigenthings.algorithms import (\n    lanczos,              # Top-k eigendecomposition\n    trace,                # Trace estimation\n    spectral_density,    # Density plot via SLQ\n    deflated_power_iteration,\n)\n```\n\n资料来源：[hessian_eigenthings/operators/__init__.py:1-25](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n\n## Summary\n\nThe `hessian-eigenthings` library provides a unified interface for computing curvature information in neural networks:\n\n- **Hessian**: Exact local curvature via autograd or finite-difference approximation\n- **GGN**: Positive semi-definite approximation ideal for large-scale analysis\n- **Empirical Fisher**: Sample-based Fisher approximation for natural gradient methods\n\nAll operators provide 
matrix-free matvec implementations, enabling eigendecomposition and trace estimation for models with billions of parameters without explicit matrix construction.\n\n---\n\n<a id='hvp-approach'></a>\n\n## Why Hessian-Vector Products\n\n### 相关页面\n\n相关主题：[Curvature Matrices Explained](#curvature-matrices), [Eigendecomposition Algorithms](#algorithms)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [hessian_eigenthings/loss_fns/_fused_ce_hvp.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/_fused_ce_hvp.py)\n- [hessian_eigenthings/loss_fns/huggingface.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n- [hessian_eigenthings/loss_fns/standard.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/standard.py)\n- [hessian_eigenthings/algorithms/lanczos.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n- [hessian_eigenthings/algorithms/trace.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n</details>\n\n# Why Hessian-Vector Products\n\nHessian-vector products (HVPs) are the computational foundation of this library. 
Understanding *why* we use HVPs instead of computing the full Hessian matrix is essential for appreciating the design and capabilities of `hessian-eigenthings`.\n\n## The Full Hessian Problem\n\nThe Hessian matrix $H$ of a neural network's loss function is a second-order partial derivative matrix with dimensions $[n \\times n]$, where $n$ is the number of parameters. For modern large-scale models:\n\n| Model | Parameters | Hessian Size | Memory (fp32) |\n|-------|-----------|--------------|---------------|\n| BERT-Base | 110M | 110M × 110M | ~48 PB |\n| GPT-2 | 1.5B | 1.5B × 1.5B | ~9 EB |\n| LLaMA-7B | 7B | 7B × 7B | ~196 EB |\n\nStoring the full Hessian is fundamentally infeasible. Even computing it via automatic differentiation requires $O(n^2)$ operations and memory that scales quadratically with model size. 资料来源：[README.md:1-30]()\n\n## What is a Hessian-Vector Product?\n\nA Hessian-vector product computes $Hv$ for a given vector $v$ without ever constructing $H$ explicitly. The operation takes $O(n)$ time and memory—linear in the number of parameters.\n\nFormally, given:\n- Loss function $\\mathcal{L}(\\theta)$\n- Parameter vector $\\theta \\in \\mathbb{R}^n$\n- Direction vector $v \\in \\mathbb{R}^n$\n\nThe HVP is:\n$$Hv = \\nabla_\\theta^2 \\mathcal{L} \\cdot v = \\frac{\\partial}{\\partial \\theta} \\left( \\nabla_\\theta \\mathcal{L} \\cdot v \\right)$$\n\nThis is implemented as a double-backward pass:\n1. Forward pass → compute loss\n2. Backward pass → compute gradient $\\nabla_\\theta \\mathcal{L}$\n3. Second backward pass → differentiate the scalar $\\nabla_\\theta \\mathcal{L} \\cdot v$, yielding $H \\cdot v$\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-50]()\n\n## Why HVPs Enable Scalable Curvature Analysis\n\nBy avoiding explicit Hessian construction, HVP-based algorithms can operate on models of any size. 
This library provides several key algorithms that all rely on HVP as their primitive operation:\n\n```mermaid\ngraph TD\n    A[Hessian-Vector Product] --> B[ Lanczos Eigendecomposition]\n    A --> C[ Hutchinson Trace Estimation]\n    A --> D[ Hutch++ Trace Estimation]\n    A --> E[ Stochastic Lanczos Quadrature]\n    \n    B --> F[Top-k Eigenvalues & Eigenvectors]\n    C --> G[Trace Estimation]\n    D --> G\n    E --> H[Spectral Density Plot]\n```\n\n### Eigendecomposition via Lanczos\n\nThe Lanczos algorithm iteratively builds an orthogonal basis that tridiagonalizes the operator. It requires only matrix-vector products, making it perfect for HVP-based curvature analysis:\n\n| Property | Full Eigendecomp | Lanczos + HVP |\n|----------|------------------|---------------|\n| Memory | $O(n^2)$ | $O(n \\cdot k)$ |\n| Time | $O(n^3)$ | $O(n \\cdot k^2)$ |\n| Storage | Entire matrix | $k$ Lanczos vectors |\n\nWhere $k$ is the number of desired eigenpairs (typically 1-20). 资料来源：[hessian_eigenthings/algorithms/lanczos.py:1-60]()\n\n### Trace Estimation via Hutchinson's Method\n\nThe trace of the Hessian can be estimated without constructing the full matrix:\n\n$$\\text{tr}(H) \\approx \\frac{1}{m} \\sum_{i=1}^{m} v_i^T H v_i$$\n\nwhere $v_i$ are random probe vectors (typically Rademacher or Gaussian). Each term $v_i^T H v_i$ is a single HVP plus a dot product. 
资料来源：[hessian_eigenthings/algorithms/trace.py:1-45]()\n\n## HVP Implementation Strategies\n\nThe library provides two distinct methods for computing HVPs, each with different trade-offs:\n\n### Method 1: Autograd (Default)\n\nUses `torch.autograd.grad` with `create_graph=True` for exact double-backward computation:\n\n```python\ndef __init__(\n    self,\n    model: nn.Module,\n    dataloader: Iterable[Any],\n    loss_fn: LossFn,\n    *,\n    method: HvpMethod = \"autograd\",  # Default\n    ...\n) -> None:\n```\n\n**Advantages:**\n- Numerically exact (to floating-point rounding)\n- Works with any differentiable loss function\n- Simple implementation\n\n**Disadvantages:**\n- Builds the full computation graph for second derivatives\n- Memory scales with model complexity and output size\n\n资料来源：[hessian_eigenthings/operators/hessian.py:20-45]()\n\n### Method 2: Finite Difference\n\nUses central-difference approximation:\n$$\\frac{\\nabla_\\theta \\mathcal{L}(\\theta + \\epsilon v) - \\nabla_\\theta \\mathcal{L}(\\theta - \\epsilon v)}{2\\epsilon}$$\n\n**Advantages:**\n- No second-backward graph → lower memory footprint\n- Compatible with distributed training (FSDP/HSDP/TP) without special handling\n\n**Disadvantages:**\n- $O(\\epsilon^2)$ truncation bias\n- Precision-dependent roundoff (~1e-5 fp32, ~1e-2 bf16)\n\n资料来源：[hessian_eigenthings/operators/hessian.py:30-40]()\n\n## Fused HVP for Cross-Entropy Losses\n\nFor language models with large vocabulary softmax heads, computing $H_{\\text{loss}} \\cdot u$ naively allocates multiple $(N, V)$ intermediate tensors (where $V$ is vocabulary size). 
This is addressed with fused implementations:\n\n| Backend | Speedup | Memory Reduction | Requirements |\n|---------|---------|-------------------|--------------|\n| `eager` | 1× (baseline) | 1× | Any |\n| `compile` | ~2.6× | ~2× | torch.compile |\n| `triton` | ~3.4× | ~2× | CUDA + Triton |\n\nThe fused computation computes:\n$$H_{\\text{loss}} \\cdot u = \\frac{p \\odot u - p \\odot \\langle p, u \\rangle}{n} \\odot \\text{mask}$$\n\nWhere $p = \\text{softmax}(\\text{logits})$ and the implementation avoids materializing the full $(N, V)$ softmax output. 资料来源：[hessian_eigenthings/loss_fns/_fused_ce_hvp.py:1-50]()\n\n## Generalized Gauss-Newton (GGN) Approximation\n\nFor optimization-focused curvature analysis, the GGN matrix $G$ provides a positive semi-definite (PSD) approximation to the Hessian:\n\n$$G = J^T \\cdot H_{\\text{loss}} \\cdot J$$\n\nWhere $J$ is the Jacobian of the model outputs with respect to parameters. For cross-entropy + softmax classification, $G$ equals the Fisher information matrix. The GGN is always PSD by construction, making it suitable for optimization algorithms. 资料来源：[hessian_eigenthings/operators/ggn.py:1-40]()\n\n### GGN Matvec Implementation\n\nThe GGNOperator supports two matvec paths:\n\n1. **Analytical (default):** Finite-difference JVP + analytical loss-Hessian-vector product + single normal backward. Memory footprint matches one normal training step.\n\n2. **Autograd:** Original `torch.func.jvp` + autograd double-backward + `torch.func.vjp`. 
Numerically exact but scales badly with vocabulary size.\n\n资料来源：[hessian_eigenthings/operators/ggn.py:25-45]()\n\n## Practical Implications\n\nThe HVP approach enables:\n\n| Capability | HVP-Based | Full Hessian |\n|------------|-----------|--------------|\n| 7B parameter model | ✅ ~hours | ❌ impossible |\n| Top-10 eigenpairs | ✅ | ❌ |\n| Trace estimation | ✅ | ❌ |\n| Spectral density | ✅ | ❌ |\n| FSDP compatibility | ✅ (finite-diff) | ❌ |\n\nThe eigenvalues and eigenvectors of the Hessian have been implicated in generalization properties of neural networks. Researchers hypothesize that \"flat minima\" generalize better, that Hessians of large models are very low-rank, and that curvature analysis can guide optimization. 资料来源：[README.md:25-35]()\n\n## Summary\n\nHessian-vector products are the fundamental building block that makes large-scale curvature analysis possible:\n\n1. **Memory efficiency:** $O(n)$ vs $O(n^2)$ for the full Hessian\n2. **Computational efficiency:** $O(n)$ per matvec vs $O(n^2)$ for full computation\n3. **Scalability:** Works with models of any size via iterative algorithms\n4. 
**Flexibility:** Supports exact (autograd) or memory-efficient (finite-difference) computation\n\nThe `hessian-eigenthings` library provides production-ready implementations of HVP computation and HVP-based algorithms for practical curvature analysis in PyTorch.\n\n---\n\n<a id='architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[Curvature Operators](#curvature-operators), [Eigendecomposition Algorithms](#algorithms), [Loss Functions](#loss-functions)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/operators/base.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/base.py)\n- [hessian_eigenthings/algorithms/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/__init__.py)\n- [hessian_eigenthings/linalg/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/linalg/__init__.py)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [hessian_eigenthings/algorithms/lanczos.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n- [hessian_eigenthings/algorithms/trace.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n- [hessian_eigenthings/loss_fns/huggingface.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n</details>\n\n# System Architecture\n\nThe `pytorch-hessian-eigenthings` library provides an efficient and scalable framework for computing eigendecompositions of curvature matrices—including the Hessian, Generalized 
Gauss-Newton (GGN) matrix, and empirical Fisher—for arbitrary PyTorch models. The architecture is designed around three core abstractions: **Curvature Operators**, **Algorithms**, and **Linear Algebra Backends**.\n\n## High-Level Architecture Overview\n\nThe library implements a layered architecture that separates mathematical curvature computations from numerical algorithms:\n\n```mermaid\ngraph TD\n    subgraph \"User Layer\"\n        U[User Code]\n    end\n    \n    subgraph \"Algorithm Layer\"\n        LA[Lanczos]\n        TR[Trace Estimation]\n        SP[Stochastic Power Iteration]\n    end\n    \n    subgraph \"Operator Layer\"\n        HO[HessianOperator]\n        GGN[GGNOperator]\n        FO[FisherOperator]\n    end\n    \n    subgraph \"Backend Layer\"\n        B[LinAlgBackend]\n        SD[SingleDeviceBackend]\n    end\n    \n    subgraph \"PyTorch Core\"\n        PT[PyTorch Autograd]\n    end\n    \n    U -->|uses| LA\n    U -->|uses| TR\n    U -->|uses| HO\n    LA -->|operates on| HO\n    TR -->|operates on| HO\n    HO -->|implemented via| B\n    B -->|delegates to| PT\n```\n\n## Core Components\n\n### 1. Curvature Operators\n\nCurvature operators are the foundation of the library. 
They abstract away the details of how matrix-vector products (matvecs) with curvature matrices are computed, providing a unified interface for algorithms to work with.\n\n#### Base Interface\n\nAll operators inherit from `CurvatureOperator`, which defines the contract for curvature computations:\n\n| Property/Method | Type | Description |\n|-----------------|------|-------------|\n| `size` | `int` | Total number of parameters in the curvature matrix |\n| `dtype` | `torch.dtype` | Data type of the operator |\n| `device` | `torch.device` | Device where computations run |\n| `matvec(v)` | `Callable` | Computes `A @ v` for input vector `v` |\n\n#### Hessian Operator\n\nThe `HessianOperator` computes the Hessian of a loss function with respect to model parameters:\n\n```python\nHessianOperator(\n    model: nn.Module,\n    dataloader: Iterable[Any],\n    loss_fn: LossFn,\n    *,\n    param_filter: ParamFilter | None = None,\n    method: HvpMethod = \"autograd\"  # or \"finite_difference\"\n)\n```\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-50]()\n\nTwo HVP computation methods are supported:\n\n| Method | Description | Use Case |\n|--------|-------------|----------|\n| `\"autograd\"` (default) | Exact double-backward via `torch.autograd.grad` | Up to ~7B parameters |\n| `\"finite_difference\"` | Central-difference approximation | FSDP/HSDP/TP at scale |\n\nThe finite difference method uses the approximation:\n\n```\nH(v) ≈ (∇L(θ+εv) − ∇L(θ−εv)) / 2ε\n```\n\nThis avoids second-backward graph entirely, making it compatible with distributed training setups.\n\n#### GGN Operator\n\nThe `GGNOperator` implements the Generalized Gauss-Newton matrix, which is always positive semi-definite:\n\n```python\nGGNOperator(\n    model: nn.Module,\n    dataloader: Iterable[Any],\n    forward_fn: ForwardFn,\n    loss_of_output_fn: LossOfOutputFn,\n    *,\n    loss_hvp: Literal[\"analytical\", \"autograd\"] = 
\"analytical\"\n)\n```\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-80]()\n\nThe GGN decomposes as `G = J^T · H_loss · J` where:\n- `J` is the Jacobian of the model output with respect to parameters\n- `H_loss` is the Hessian of the loss with respect to the output\n\nFor cross-entropy + softmax classification, `G` equals the Fisher information matrix.\n\n### 2. Algorithms Layer\n\nAlgorithms operate on any `CurvatureOperator` via its `matvec` interface, enabling eigenvalue computation, trace estimation, and spectral density analysis.\n\n#### Lanczos Eigensolver\n\nThe Lanczos algorithm computes the top-k eigenvalues and eigenvectors of a symmetric matrix using only matrix-vector products:\n\n```python\nlanczos(\n    operator: CurvatureOperator,\n    k: int = 10,\n    max_iter: int = 100,\n    tol: float = 1e-3,\n    which: str = \"LA\"  # LA, SA, or LM\n) -> EigendecompositionResult\n```\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:1-50]()\n\n**Key features:**\n\n- **Ritz vector accumulation**: Directly accumulates Ritz vectors into final `(k, n)` layout via rank-1 outer-product updates, avoiding transient `(n, k)` transpose copies\n- **Convergence tracking**: Monitors residual norms `|β_k · s_{k}|` to determine convergence\n- **Eigenvalue selection**: Supports \"LA\" (largest algebraic), \"SA\" (smallest algebraic), and \"LM\" (largest magnitude)\n\n#### Trace Estimation\n\nThe library provides multiple trace estimation methods:\n\n| Method | Description | Samples Required |\n|--------|-------------|------------------|\n| `hutchinson` | Classical Hutchinson: `(1/m) Σ vᵢᵀ A vᵢ` | Higher variance |\n| `hutch++` | Improved estimator with lower variance | ~30 recommended |\n\n```python\ntrace(\n    operator: CurvatureOperator,\n    num_matvecs: int = 30,\n    method: str = \"hutch++\",\n    seed: int | None = None\n) -> TraceResult\n```\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:1-40]()\n\nThe Hutch++ estimator achieves lower variance by using both 
query and reply vectors.\n\n### 3. Backend Layer\n\nThe `LinAlgBackend` abstract interface decouples linear algebra operations from specific device implementations:\n\n```mermaid\nclassDiagram\n    class LinAlgBackend~T~ {\n        <<abstract>>\n        +matmul(a, b) T\n        +dot(a, b) T\n        +norm(v) T\n        +fill(v, value) T\n        +copy(v) T\n    }\n    \n    class SingleDeviceBackend {\n        +matmul(a, b) Tensor\n        +dot(a, b) Tensor\n        +norm(v) Tensor\n    }\n    \n    LinAlgBackend <|-- SingleDeviceBackend\n```\n\nBackends provide:\n- Vector arithmetic operations (dot product, norm, fill, copy)\n- Device-specific optimizations\n- Memory allocation strategies\n\n## Data Flow\n\n### Eigendecomposition Workflow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Operator as CurvatureOperator\n    participant Backend as LinAlgBackend\n    participant Algo as Lanczos Algorithm\n    participant PyTorch as PyTorch Autograd\n    \n    User->>Operator: Instantiate with model, dataloader\n    User->>Algo: Call lanczos(operator, k)\n    Algo->>Operator: Request matvec(v)\n    Operator->>Backend: Allocate probe vector\n    Backend->>PyTorch: Create tensor\n    Operator->>PyTorch: Forward pass + backward\n    PyTorch-->>Operator: Return HVP result\n    Operator-->>Algo: Return Av\n    Algo->>Algo: Repeat for m iterations\n    Algo-->>User: Return eigenvalues, eigenvectors\n```\n\n### Loss Function Integration\n\nThe library supports two loss function patterns:\n\n```mermaid\ngraph LR\n    subgraph \"Single Function API\"\n        L1[loss_fn<br/>model, batch → scalar]\n    end\n    \n    subgraph \"Two Function API (for GGN)\"\n        F1[forward_fn<br/>model, batch → output]\n        L2[loss_of_output_fn<br/>output, batch → scalar]\n    end\n    \n    L1 --> HO[HessianOperator]\n    F1 --> GGN[GGNOperator]\n    L2 --> GGN\n```\n\n#### HuggingFace Integration\n\nFor language models, the library provides optimized loss 
functions:\n\n```python\nhf_lm_loss(fused=\"auto\")  # Auto-selects Triton or torch.compile\n```\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-30]()\n\nThe fused implementation:\n- Uses Triton kernels on CUDA (~3.4x speedup, 2x peak-memory reduction)\n- Falls back to `torch.compile` (~2.6x speedup, 2x peak-memory reduction)\n- Eliminates most `(N, V)` intermediates\n\n## Configuration Options\n\n### HessianOperator Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `model` | `nn.Module` | Required | PyTorch model |\n| `dataloader` | `Iterable` | Required | Data batches |\n| `loss_fn` | `LossFn` | Required | Loss computation function |\n| `param_filter` | `ParamFilter` | `None` | Filter parameters by name |\n| `method` | `HvpMethod` | `\"autograd\"` | HVP computation method |\n| `fd_eps` | `float` | `None` | Finite difference epsilon |\n\n### GGNOperator Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `loss_hvp` | `str` | `\"analytical\"` | `\"analytical\"` or `\"autograd\"` |\n| `full_dataset` | `bool` | `True` | Average over full dataset |\n| `num_batches` | `int` | `None` | Limit to first N batches |\n| `microbatch_size` | `int` | `None` | Process in smaller chunks |\n\n### Lanczos Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `k` | `int` | `10` | Number of eigenvalues to compute |\n| `max_iter` | `int` | `100` | Maximum Lanczos iterations |\n| `tol` | `float` | `1e-3` | Convergence tolerance |\n| `which` | `str` | `\"LA\"` | Which eigenvalues (\"LA\", \"SA\", \"LM\") |\n| `reorthogonalize` | `bool` | `False` | Full reorthogonalization |\n\n## Usage Patterns\n\n### Basic Hessian Eigenvalue Computation\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos\n\n# Create operator\nhessian_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    
loss_fn=lambda m, b: torch.nn.functional.cross_entropy(m(b[0]), b[1])\n)\n\n# Compute top eigenvalues\neig_result = lanczos(hessian_op, k=10, max_iter=100)\nprint(eig_result.eigenvalues)\n```\n\n### Parameter-Filtered Analysis\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos\n\n# Analyze only attention parameters\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.attn.*\")\n)\neig_attn = lanczos(attn_op, k=3)\n```\n\n### Trace Estimation\n\n```python\nfrom hessian_eigenthings import HessianOperator, trace\n\ntrace_result = trace(\n    hessian_op,\n    num_matvecs=30,\n    method=\"hutch++\",\n    seed=42\n)\nprint(f\"Trace estimate: {trace_result.estimate:.4e}\")\n```\n\n## Architecture Benefits\n\n| Benefit | Description |\n|---------|-------------|\n| **Separation of Concerns** | Operators define \"what\" to compute; algorithms define \"how\" |\n| **Flexibility** | Any operator can use any algorithm |\n| **Scalability** | Backends enable device-specific optimizations |\n| **Composability** | Easy to add new operators or algorithms |\n| **Memory Efficiency** | Matrix-free design avoids explicit matrix storage |\n\n## Extension Points\n\n### Adding Custom Curvature Operators\n\nNew operators should subclass `CurvatureOperator` and implement the `matvec` method:\n\n```python\nclass CustomCurvatureOperator(CurvatureOperator):\n    def __init__(self, model, dataloader):\n        super().__init__()\n        self.model = model\n        self.dataloader = dataloader\n        # Register parameters\n    \n    def _matvec(self, v: torch.Tensor) -> torch.Tensor:\n        # Implement A @ v\n        return custom_computation(v)\n```\n\n### Adding New Algorithms\n\nAlgorithms should accept any `CurvatureOperator` and use the backend exclusively:\n\n```python\ndef custom_algorithm(\n    operator: CurvatureOperator,\n    backend: LinAlgBackend | None = None\n) -> 
SomeResult:\n    backend = backend or SingleDeviceBackend()\n    # Use backend for all vector operations\n```\n\n## Summary\n\nThe system architecture of `pytorch-hessian-eigenthings` follows a clean, modular design that separates curvature matrix computation (operators), numerical algorithms (Lanczos, trace estimation), and linear algebra primitives (backends). This design enables efficient Hessian and GGN eigendecomposition for models ranging from small MLPs to large language models, with support for distributed training and optimized fused computations.\n\n---\n\n<a id='curvature-operators'></a>\n\n## Curvature Operators\n\n### 相关页面\n\n相关主题：[System Architecture](#architecture), [Distributed Computing with DDP](#distributed-computing)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/operators/base.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/base.py)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [hessian_eigenthings/operators/fisher.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/fisher.py)\n- [hessian_eigenthings/operators/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n</details>\n\n# Curvature Operators\n\n## Overview\n\nCurvature Operators in `hessian-eigenthings` provide a matrix-free abstraction for computing Hessian eigendecomposition and related curvature matrices for arbitrary PyTorch models. 
They implement the `CurvatureOperator` base class interface, enabling efficient computation of eigenvalues, eigenvectors, traces, and spectral densities without explicitly forming potentially massive matrices.\n\nThe core abstraction allows algorithms (Lanczos, power iteration, Hutch++) to operate on any curvature matrix through a unified `matvec(v)` interface that computes $Av$ for any vector $v$, enabling scalability to large models with billions of parameters.\n\n资料来源：[hessian_eigenthings/__init__.py:1-10](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/__init__.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    subgraph \"Curvature Operators\"\n        Base[CurvatureOperator<br/>Base Class]\n        Hessian[HessianOperator]\n        GGN[GGNOperator]\n        Fisher[EmpiricalFisherOperator]\n        Lambda[LambdaOperator]\n        DDP[DDPHessianOperator]\n    end\n    \n    subgraph \"Algorithms\"\n        Lanczos[Lanczos Eigendecomposition]\n        Power[Power Iteration]\n        Trace[Trace Estimation<br/>Hutch++/Hutchinson]\n        Spectral[Spectral Density<br/>Stochastic Lanczos Quadrature]\n    end\n    \n    Base --> Hessian\n    Base --> GGN\n    Base --> Fisher\n    Base --> Lambda\n    Base --> DDP\n    \n    Hessian --> Lanczos\n    GGN --> Lanczos\n    Fisher --> Lanczos\n    Lambda --> Lanczos\n    \n    Hessian --> Trace\n    GGN --> Trace\n    Fisher --> Trace\n    \n    Hessian --> Power\n    GGN --> Power\n    Fisher --> Power\n    \n    Hessian --> Spectral\n    GGN --> Spectral\n    Fisher --> Spectral\n```\n\n## Base Class: CurvatureOperator\n\nAll curvature operators inherit from `CurvatureOperator`, which defines the contract that subclasses must fulfill.\n\n### Core Interface\n\n| Method | Description |\n|--------|-------------|\n| `matvec(v)` | Compute $Av$ where $A$ is the curvature matrix |\n| `size` | Total number of parameters in the operator's scope |\n| `dtype`, `device` | Tensor dtype and 
device for vector operations |\n\n资料来源：[hessian_eigenthings/operators/base.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/base.py)\n\n### Parameter Filtering\n\nCurvature operators can be restricted to subsets of model parameters using `param_filter`, enabling analysis of specific components (e.g., attention layers only).\n\n```python\nfrom hessian_eigenthings import HessianOperator, match_names\n\n# Filter to attention parameters only\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.attn.*\")\n)\n```\n\n资料来源：[hessian_eigenthings/__init__.py:35-38](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/__init__.py)\n\n## HessianOperator\n\nComputes the Hessian $\\nabla_{\\theta}^2 \\mathcal{L}$ of the loss function with respect to model parameters.\n\n### Key Features\n\n- **Two HVP methods**: `autograd` (exact double-backward via `torch.autograd.grad`) and `finite_difference` (central difference for FSDP/TP compatibility)\n- **Batched computation**: Automatically averages over multiple batches from the dataloader\n- **Microbatch support**: For large models, process batches in smaller microbatches\n\n### Constructor Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `model` | `nn.Module` | Required | PyTorch model |\n| `dataloader` | `Iterable` | Required | Data batches |\n| `loss_fn` | `LossFn` | Required | Loss computation function |\n| `param_filter` | `ParamFilter \\| None` | `None` | Parameter name filter |\n| `full_dataset` | `bool` | `True` | Average over full dataset |\n| `num_batches` | `int \\| None` | `None` | Limit batches for stochastic estimate |\n| `microbatch_size` | `int \\| None` | `None` | Split batches into smaller microbatches |\n| `method` | `HvpMethod` | `\"autograd\"` | HVP computation method |\n| `fd_eps` | 
`float \\| None` | `None` | Finite difference epsilon |\n| `backend` | `LinAlgBackend \\| None` | `None` | Linear algebra backend |\n\n### HVP Method Comparison\n\n| Method | Accuracy | Memory | FSDP/TP Compatible | Speed |\n|--------|----------|--------|-------------------|-------|\n| `autograd` | Exact (to rounding) | High | No | Fast |\n| `finite_difference` | $O(\\epsilon^2)$ bias | Low | Yes | 2x passes |\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n### Finite Difference Epsilon Table\n\n| Dtype | Optimal $\\epsilon$ |\n|-------|-------------------|\n| `float64` | `6e-6` |\n| `float32` | `5e-3` |\n| `bfloat16` | `0.2` |\n| `float16` | `5e-2` |\n\n资料来源：[hessian_eigenthings/operators/hessian.py:34-40](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n## GGNOperator\n\nThe Generalized Gauss-Newton (GGN) matrix $G = J^T H_{loss} J$ provides a PSD approximation to the Hessian that is computationally cheaper while preserving the eigenvalues that matter for optimization.\n\n### Key Features\n\n- **Always PSD**: Unlike the exact Hessian, the GGN is positive semi-definite by construction\n- **Analytical HVP path**: For losses with known HVP (e.g., cross-entropy), uses analytical computation\n- **For cross-entropy + softmax**: $G$ equals the Fisher information matrix\n\n### Two Matvec Implementations\n\n| `loss_hvp` | Description | Memory | Use Case |\n|------------|-------------|--------|----------|\n| `\"analytical\"` (default) | FD JVP + analytical loss-Hessian-vec + one backward | Matches one training step | LM-scale, large vocab |\n| `\"autograd\"` | `torch.func.jvp` + double-backward + `torch.func.vjp` | Scales with output size | Exact, small vocab 
|\n\n资料来源：[hessian_eigenthings/operators/ggn.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n### Fused Cross-Entropy HVP\n\nFor language model training, the GGN operator includes a fused kernel for the CE HVP computation:\n\n```python\n# Auto-selects fastest backend: Triton > torch.compile > eager\nhf_lm_loss_of_output(..., fused=\"auto\")\n```\n\nThe fused implementation reduces peak memory by 2x compared to eager, with Triton providing ~3.4x speedup on CUDA.\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-30](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n## EmpiricalFisherOperator\n\nComputes the empirical Fisher information matrix $F = \\frac{1}{N} \\sum_{i=1}^N \\nabla_{\\theta} \\log p(y_i|x_i) \\nabla_{\\theta} \\log p(y_i|x_i)^T$.\n\nFor classification with cross-entropy loss, the empirical Fisher equals the GGN when using the model distribution's expectation.\n\n资料来源：[hessian_eigenthings/operators/fisher.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/fisher.py)\n\n## LambdaOperator\n\nCreates custom curvature operators from lambda functions for testing or custom curvature definitions.\n\n```python\nfrom hessian_eigenthings import LambdaOperator\n\n# Custom operator that always returns a scaled vector\ncustom_op = LambdaOperator(\n    size=1000,\n    matvec=lambda v: 2.0 * v  # Represents 2*I\n)\n```\n\n## DDPHessianOperator\n\nDistributed Data Parallel-aware Hessian operator that handles gradient synchronization across processes.\n\n资料来源：[hessian_eigenthings/operators/__init__.py:15-18](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n\n## Common Usage Patterns\n\n### Computing Top Eigenvalues\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos\n\noperator = 
HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn\n)\n\nresult = lanczos(operator, k=10, max_iter=100)\nprint(f\"Top eigenvalues: {result.eigenvalues}\")\n```\n\n### Estimating Trace\n\n```python\nfrom hessian_eigenthings import GGNOperator, trace\n\noperator = GGNOperator(\n    model=model,\n    dataloader=dataloader,\n    forward_fn=model_forward,\n    loss_of_output_fn=loss_fn\n)\n\nresult = trace(operator, num_matvecs=100, method=\"hutch++\")\nprint(f\"Trace estimate: {result.estimate:.4e} ± {result.stderr:.4e}\")\n```\n\n### Component-Specific Analysis\n\n```python\nfrom hessian_eigenthings import HessianOperator, match_regex\n\n# Analyze only attention weights in transformer\nattn_only = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_regex(r\"blocks\\.\\d+\\.attn\\.\")\n)\n\n# Analyze only MLP weights\nmlp_only = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_regex(r\"blocks\\.\\d+\\.mlp\\.\")\n)\n```\n\n## Linear Algebra Backends\n\nThe operators use pluggable `LinAlgBackend` for vector operations, enabling support for different hardware configurations and precision requirements.\n\n| Backend | Use Case |\n|---------|----------|\n| `SingleDeviceBackend` | Single GPU/CPU |\n| (Distributed backends) | Multi-GPU via FSDP/TP |\n\n## Module Exports\n\n```python\nfrom hessian_eigenthings.operators import (\n    CurvatureOperator,\n    DDPHessianOperator,\n    EmpiricalFisherOperator,\n    GGNOperator,\n    HessianOperator,\n    LambdaOperator,\n)\n```\n\n资料来源：[hessian_eigenthings/operators/__init__.py:1-20](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n\n---\n\n<a id='algorithms'></a>\n\n## Eigendecomposition Algorithms\n\n### 相关页面\n\n相关主题：[System Architecture](#architecture), [Why Hessian-Vector 
Products](#hvp-approach)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/algorithms/lanczos.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/lanczos.py)\n- [hessian_eigenthings/algorithms/power_iteration.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/power_iteration.py)\n- [hessian_eigenthings/algorithms/trace.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/trace.py)\n- [hessian_eigenthings/algorithms/spectral_density.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/spectral_density.py)\n- [hessian_eigenthings/algorithms/result.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/result.py)\n- [hessian_eigenthings/algorithms/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/algorithms/__init__.py)\n</details>\n\n# Eigendecomposition Algorithms\n\nThe `hessian-eigenthings` library provides a suite of efficient iterative algorithms for computing eigendecompositions of curvature matrices (Hessian, Generalized Gauss-Newton, and Fisher) in PyTorch models. These algorithms enable analysis of neural network loss landscapes by extracting eigenvalues, eigenvectors, spectral densities, and trace estimates without explicitly constructing the full curvature matrix—a critical capability for modern large-scale models.\n\n## Overview\n\nComputing eigendecompositions of curvature matrices is fundamental to understanding generalization properties, flat minima, and training dynamics of neural networks. 
However, these curvature matrices are prohibitively large (n × n where n is the number of parameters), making explicit construction impossible for modern models.\n\nThe library implements Krylov subspace methods that only require matrix-vector products, enabling efficient computation of:\n\n| Capability | Algorithm | Use Case |\n|------------|-----------|----------|\n| Top-k eigenvalues/eigenvectors | Lanczos, Power Iteration | Finding most-curved directions |\n| Trace estimation | Hutchinson, Hutch++ | Computing average curvature |\n| Spectral density | Stochastic Lanczos Quadrature | Visualizing eigenvalue distribution |\n\n资料来源：[hessian_eigenthings/algorithms/__init__.py:1-29]()\n\n## Algorithm Architecture\n\nThe algorithms in this module follow a consistent design pattern: they accept any `CurvatureOperator` and use the `LinAlgBackend` exclusively for vector arithmetic, ensuring portability across single-device and distributed settings.\n\n```mermaid\ngraph TD\n    A[CurvatureOperator] --> B[Lanczos Algorithm]\n    A --> C[Power Iteration]\n    A --> D[Trace Estimation]\n    A --> E[Spectral Density]\n    \n    B --> F[EigenResult]\n    C --> F\n    D --> G[TraceResult]\n    E --> H[SpectralDensityResult]\n    \n    F --> I[eigenvalues: Tensor]\n    F --> J[eigenvectors: Tensor]\n    F --> K[residuals: Tensor]\n```\n\n资料来源：[hessian_eigenthings/algorithms/result.py]()\n\n## Lanczos Algorithm\n\nThe Lanczos algorithm is the primary method for computing eigenvalues and eigenvectors of symmetric matrices. 
It builds a Krylov subspace through repeated matrix-vector products, then solves the small tridiagonal eigenvalue problem.\n\n### Symmetric Lanczos Implementation\n\nThe in-house Lanczos implementation provides optional full reorthogonalization to address the loss-of-orthogonality issues classical Lanczos is known for (Paige 1976).\n\n```python\ndef lanczos_tridiagonal(\n    operator: CurvatureOperator,\n    v0: torch.Tensor,\n    max_iter: int,\n    *,\n    reorthogonalize: bool = True,\n    backend: LinAlgBackend[torch.Tensor] | None = None,\n) -> LanczosTridiag\n```\n\n**Key characteristics:**\n\n- **Default reorthogonalization**: Enabled for `max_iter <= 50` to suppress ghost eigenvalues\n- **Computational tradeoff**: For larger Krylov dimensions, reorthogonalization becomes O(m²n); users analyzing near-degenerate spectra should re-enable it\n- **Memory efficiency**: Accumulates Ritz vectors directly via rank-1 outer-product updates, avoiding transient (n, k) → (k, n) transpose copies\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:30-58]()\n\n### Lanczos Output Structure\n\n```python\n@dataclass(frozen=True)\nclass LanczosTridiag:\n    \"\"\"Output of one Lanczos run: tridiagonal coefficients + the basis used to build them.\"\"\"\n    alphas: torch.Tensor      # (m,) diagonal\n    betas: torch.Tensor       # (m-1,) off-diagonal\n    basis: list[torch.Tensor] # length m, each (n,)\n    last_beta: float          # ||r_m|| residual norm at termination\n    iterations: int          # m, the actual number of Lanczos steps completed\n```\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:30-43]()\n\n### Full Lanczos Eigensolver\n\nThe high-level `lanczos()` function computes top-k eigenvalues with configurable eigenpair selection:\n\n```python\ndef lanczos(\n    operator: CurvatureOperator,\n    k: int = 1,\n    max_iter: int = 100,\n    *,\n    which: Which = \"LM\",\n    tol: float = 1e-8,\n    seed: int | None = None,\n    reorthogonalize: bool | None = 
None,\n    backend: LinAlgBackend[torch.Tensor] | None = None,\n) -> EigenResult\n```\n\n**Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `operator` | `CurvatureOperator` | required | The curvature matrix operator |\n| `k` | `int` | 1 | Number of eigenpairs to compute |\n| `max_iter` | `int` | 100 | Maximum Lanczos iterations |\n| `which` | `Literal[\"LM\", \"LA\", \"SA\"]` | \"LM\" | Which eigenvalues: LM=largest magnitude, LA=largest algebraic, SA=smallest algebraic |\n| `tol` | `float` | 1e-8 | Convergence tolerance |\n| `seed` | `int \\| None` | None | Random seed for reproducibility |\n| `reorthogonalize` | `bool \\| None` | None | Override default reorthogonalization setting |\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:58-100]()\n\n### Eigenvalue Selection Logic\n\nThe algorithm selects eigenvalues based on the `which` parameter:\n\n```python\nif which == \"LM\":\n    order = torch.argsort(theta.abs(), descending=True)\nelif which == \"LA\":\n    order = torch.argsort(theta, descending=True)\nelif which == \"SA\":\n    order = torch.argsort(theta, descending=False)\n```\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:75-83]()\n\n## Power Iteration\n\nPower iteration is a simpler method for finding the dominant eigenvalue. 
The library implements deflated power iteration to compute multiple eigenpairs sequentially by projecting out previously found directions.\n\n### Single Power Iteration\n\n```python\ndef power_iteration_one(\n    operator: CurvatureOperator,\n    v0: torch.Tensor,\n    max_iter: int,\n    tol: float = 1e-6,\n    backend: LinAlgBackend | None = None,\n) -> tuple[torch.Tensor, torch.Tensor]\n```\n\n资料来源：[hessian_eigenthings/algorithms/power_iteration.py]()\n\n### Deflated Power Iteration\n\nDeflated power iteration extends the basic method to compute multiple eigenpairs:\n\n```python\ndef deflated_power_iteration(\n    operator: CurvatureOperator,\n    num_eigs: int,\n    max_iter: int,\n    tol: float = 1e-6,\n    seed: int | None = None,\n    backend: LinAlgBackend | None = None,\n) -> EigenResult\n```\n\nThe deflation process removes previously found eigenvectors from the subspace before searching for the next eigenpair, preventing convergence to already-computed directions.\n\n资料来源：[hessian_eigenthings/algorithms/power_iteration.py]()\n\n## Trace Estimation\n\nTrace estimation provides a way to compute the average eigenvalue (trace / dimension) without full eigendecomposition. This is computationally much cheaper and useful for understanding overall curvature magnitude.\n\n### Hutchinson's Estimator\n\nHutchinson's method estimates the trace using random probe vectors:\n\n```python\ndef hutchinson(\n    operator: CurvatureOperator,\n    *,\n    num_samples: int = 100,\n    distribution: Distribution = \"rademacher\",\n    seed: int | None = None,\n    backend: LinAlgBackend[torch.Tensor] | None = None,\n) -> TraceResult\n```\n\nThe estimator computes: `(1/m) Σ vᵢᵀ A vᵢ`\n\nWhere vᵢ are random vectors from the specified distribution. 
Rademacher distribution provides lower variance than Gaussian.\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:48-71]()\n\n### Hutch++ Estimator\n\nHutch++ is an improved estimator with better convergence properties:\n\n```python\ndef hutch_plus_plus(\n    operator: CurvatureOperator,\n    *,\n    num_matvecs: int = 30,\n    seed: int | None = None,\n    backend: LinAlgBackend[torch.Tensor] | None = None,\n) -> TraceResult\n```\n\nHutch++ uses a structured random sampling approach that achieves lower variance than standard Hutchinson with the same number of matrix-vector products.\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:22-47]()\n\n### Unified Trace Interface\n\n```python\ndef trace(\n    operator: CurvatureOperator,\n    num_matvecs: int = 30,\n    *,\n    method: Literal[\"hutchinson\", \"hutch++\"] = \"hutch++\",\n    seed: int | None = None,\n    backend: LinAlgBackend[torch.Tensor] | None = None,\n) -> TraceResult\n```\n\n**Validation:** The `num_matvecs` parameter is validated to be at least 1.\n\n资料来源：[hessian_eigenthings/algorithms/trace.py:71-84]()\n\n### Trace Result Structure\n\n```python\n@dataclass\nclass TraceResult:\n    estimate: float      # The trace estimate\n    stderr: float       # Standard error of the estimate\n    num_matvecs: int     # Number of matrix-vector products used\n    operator_size: int   # Dimension of the operator\n```\n\n资料来源：[hessian_eigenthings/algorithms/result.py]()\n\n## Spectral Density\n\nSpectral density estimation computes the eigenvalue distribution (density function) across the spectrum, enabling visualization and analysis of the full eigenvalue structure.\n\n### Stochastic Lanczos Quadrature\n\nThe `spectral_density()` function implements Stochastic Lanczos Quadrature (SLQ) to compute the spectral density:\n\n```python\ndef spectral_density(\n    operator: CurvatureOperator,\n    num_runs: int = 16,\n    lanczos_steps: int = 50,\n    seed: int | None = None,\n    backend: LinAlgBackend[torch.Tensor] | 
None = None,\n) -> SpectralDensityResult\n```\n\n**Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `operator` | `CurvatureOperator` | required | The curvature matrix operator |\n| `num_runs` | `int` | 16 | Number of randomized runs for averaging |\n| `lanczos_steps` | `int` | 50 | Lanczos iterations per run |\n| `seed` | `int \\| None` | None | Random seed |\n\n资料来源：[hessian_eigenthings/algorithms/spectral_density.py]()\n\n### Spectral Density Result\n\n```python\n@dataclass\nclass SpectralDensityResult:\n    grid: torch.Tensor      # Eigenvalue grid points\n    density: torch.Tensor   # Density values at each grid point\n    eigenvalues: list[torch.Tensor]  # Eigenvalues from each run\n    eigenvectors: list[list[torch.Tensor]]  # Corresponding eigenvectors\n```\n\nThe spectral density integrates to 1: `∫ density(λ) dλ ≈ 1`, which can be verified using numerical integration.\n\n资料来源：[hessian_eigenthings/algorithms/result.py]()\n\n## Common Result Types\n\nAll algorithms return standardized result objects that encapsulate the computed quantities along with metadata about the computation.\n\n```python\n@dataclass\nclass EigenResult:\n    eigenvalues: torch.Tensor       # (k,) tensor of eigenvalues\n    eigenvectors: torch.Tensor      # (k, n) matrix of eigenvectors\n    residuals: torch.Tensor          # (k,) convergence residuals\n    iterations: int                 # Number of iterations run\n    converged: bool                 # Whether all eigenpairs converged\n```\n\n资料来源：[hessian_eigenthings/algorithms/result.py]()\n\n## Algorithm Selection Guide\n\n```mermaid\ngraph LR\n    A[Goal] --> B{Eigenpairs?}\n    B -->|Yes, top-k| C[How many?]\n    C -->|1-10| D[Lanczos]\n    C -->|Many| E{Orthogonality critical?}\n    E -->|Yes| D\n    E -->|No| F[Deflated Power Iteration]\n    \n    B -->|Trace only| G{Accuracy priority?}\n    G -->|High| H[Hutch++]\n    G -->|Standard| I[Hutchinson]\n    \n    B 
-->|Full distribution| J[Spectral Density]\n    \n    D --> K[EigenResult]\n    F --> K\n    H --> L[TraceResult]\n    I --> L\n    J --> M[SpectralDensityResult]\n```\n\n### Decision Criteria\n\n| Scenario | Recommended Algorithm | Notes |\n|----------|----------------------|-------|\n| Top eigenvalues for single/large batch | `lanczos()` | Best accuracy, moderate cost |\n| Quick dominant eigenvalue | `deflated_power_iteration()` | Lower memory, less accurate |\n| Trace with limited matvecs | `hutch_plus_plus()` | Better convergence than Hutchinson |\n| Trace estimation | `hutchinson()` | Simpler, more matvecs needed |\n| Eigenvalue histogram/distribution | `spectral_density()` | Visualize full spectrum |\n\n## Integration with Curvature Operators\n\nThe algorithms are designed to work with any `CurvatureOperator` implementation, including:\n\n- `HessianOperator`: Exact Hessian via autograd or finite differences\n- `GGNOperator`: Generalized Gauss-Newton matrix\n- `EmpiricalFisherOperator`: Empirical Fisher information matrix\n- `DDPHessianOperator`: Distributed Data Parallel Hessian\n\nThis abstraction allows the same algorithm code to work across different curvature definitions without modification.\n\n资料来源：[hessian_eigenthings/algorithms/__init__.py:1-29]()\n\n## Performance Considerations\n\n### Memory Efficiency in Lanczos\n\nThe Lanczos implementation optimizes memory for large-scale models by:\n\n1. Avoiding allocation of full (n, m) basis matrix\n2. Using rank-1 outer-product updates for eigenvector accumulation\n3. Computing Ritz vectors directly into final (k, n) layout\n\n### Reorthogonalization Tradeoffs\n\n| Setting | Memory | Computation | Accuracy |\n|---------|--------|-------------|----------|\n| `reorthogonalize=True` | O(mn) | O(m²n) | High orthogonality |\n| `reorthogonalize=False` | O(mn) basis list | O(mn) | May have ghost eigenvalues |\n\nFor `max_iter <= 50`, reorthogonalization is enabled by default. 
For larger Krylov dimensions, it defaults off to maintain acceptable performance.\n\n资料来源：[hessian_eigenthings/algorithms/lanczos.py:23-28]()\n\n## Example Usage\n\n```python\nfrom hessian_eigenthings import HessianOperator, lanczos, trace, spectral_density\n\n# Create Hessian operator\noperator = HessianOperator(model, dataloader, loss_fn)\n\n# Compute top-5 eigenvalues and eigenvectors\neig_result = lanczos(operator, k=5, max_iter=40, tol=1e-7, seed=0)\nprint(f\"Top eigenvalue: {eig_result.eigenvalues[0]}\")\n\n# Estimate trace with Hutch++\ntrace_result = trace(operator, num_matvecs=99, method=\"hutch++\", seed=0)\nprint(f\"Trace estimate: {trace_result.estimate}\")\n\n# Compute spectral density\ndensity_result = spectral_density(operator, num_runs=8, lanczos_steps=40, seed=0)\n# Visualize with: plt.plot(density_result.grid, density_result.density)\n```\n\n资料来源：[examples/supervised_mlp.py:1-50]()\n\n---\n\n<a id='loss-functions'></a>\n\n## Loss Functions\n\n### 相关页面\n\n相关主题：[Curvature Operators](#curvature-operators), [Parameter Utilities](#parameter-utilities)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/loss_fns/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/__init__.py)\n- [hessian_eigenthings/loss_fns/standard.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/standard.py)\n- [hessian_eigenthings/loss_fns/huggingface.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n- [hessian_eigenthings/loss_fns/transformer_lens.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/transformer_lens.py)\n- [hessian_eigenthings/loss_fns/_fused_ce_hvp.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/_fused_ce_hvp.py)\n- 
[hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n</details>\n\n# Loss Functions\n\nLoss functions in this repository serve as the bridge between model outputs and curvature operators (Hessian and Generalized Gauss-Newton matrices). They provide the necessary computations for Hessian-vector products and support multiple backend implementations optimized for different use cases.\n\n## Overview\n\nThe loss functions module (`hessian_eigenthings/loss_fns/`) provides two distinct function signatures depending on the target operator:\n\n| Function Type | Signature | Used By |\n|---------------|-----------|---------|\n| `loss_fn` | `(model: nn.Module, batch: Any) -> torch.Tensor` | `HessianOperator` |\n| `loss_of_output_fn` | `(output: torch.Tensor, batch: Any) -> torch.Tensor` | `GGNOperator` |\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n资料来源：[hessian_eigenthings/operators/ggn.py:1-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Loss Function Entry Points] --> B[Standard Losses]\n    A --> C[HuggingFace Losses]\n    A --> D[TransformerLens Losses]\n    \n    B --> B1[MSE Loss]\n    B --> B2[Cross-Entropy Loss with HVP]\n    \n    C --> C1[Autoregressive LM Loss]\n    C --> C2[Shifted CE with Analytical HVP]\n    C --> C3[Fused CE HVP Backends]\n    \n    D --> D1[TransformerLens HookedModel Loss]\n    \n    C3 --> C3a[Triton Kernel]\n    C3 --> C3b[torch.compile]\n    C3 --> C3c[Eager Reference]\n```\n\n## Standard Loss Functions\n\nThe `standard.py` module provides loss functions for 
common supervised learning scenarios with closed-form Hessian-vector products.\n\n资料来源：[hessian_eigenthings/loss_fns/standard.py:1-80](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/standard.py)\n\n### MSE Loss\n\nReturns a wrapper compatible with `GGNOperator` for mean-squared error loss:\n\n```python\ndef mse_loss_of_output() -> Callable[[torch.Tensor, tuple[torch.Tensor, torch.Tensor]], torch.Tensor]:\n    \"\"\"Make a `loss_of_output_fn` for `GGNOperator` from a (output, target) criterion.\"\"\"\n```\n\n### Cross-Entropy Loss with Analytical HVP\n\nThe cross-entropy implementation includes a closed-form Hessian-vector product for efficient computation:\n\n```python\ndef _ce_hvp(\n    output: torch.Tensor, batch: tuple[torch.Tensor, torch.Tensor], u: torch.Tensor\n) -> torch.Tensor:\n    \"\"\"Closed-form H @ u for mean-reduced softmax + cross-entropy.\n    \n    `output` has shape `(N, C)` (logits). For each row,\n    `H_row = (diag(p) - p p^T) / N` where `p = softmax(output)`.\n    \"\"\"\n```\n\n**Mathematical Foundation:**\n\nFor mean-reduced softmax + cross-entropy, the Hessian takes the form:\n\n```\nH_row = (diag(p) - p·p^T) / N\n```\n\nwhere:\n- `p = softmax(output)` is the predicted probability distribution\n- `N` is the number of samples\n- `u` is the input vector\n\n资料来源：[hessian_eigenthings/loss_fns/standard.py:40-55](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/standard.py)\n\n## HuggingFace Transformers Integration\n\nThe `huggingface.py` module provides loss functions specifically designed for HuggingFace Transformers models. 
These handle the internal loss computation that occurs when `labels` are present in the batch.\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:60-90](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n### Autoregressive Language Model Loss\n\n```python\ndef hf_lm_loss() -> Callable[[nn.Module, dict[str, Any]], torch.Tensor]:\n    \"\"\"For autoregressive LMs: `loss_fn(model, batch)` calls `model(**batch).loss`.\"\"\"\n```\n\nThe batch must include `labels` so HuggingFace computes the loss internally. For causal language models, this is typically `labels=input_ids` with the standard internal shift.\n\n### Shifted Cross-Entropy with Analytical HVP\n\nFor large-scale language model analysis, a shifted cross-entropy variant provides both the loss function and its analytical Hessian-vector product:\n\n```python\ndef hf_lm_shifted_ce(fused: FusedCEHvpBackend = \"auto\") -> _LossOfOutputWithHvp:\n    \"\"\"Shifted CE loss with analytical H @ u for autoregressive LMs.\"\"\"\n```\n\n**Shift Mechanism:**\n- The loss shifts logits left (discards last position) and labels right (discards first position)\n- Matches how `cross_entropy(ignore_index=-100)` handles gradient computation\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n## Fused Cross-Entropy HVP Backends\n\nThe `_fused_ce_hvp.py` module implements optimized backends for computing the cross-entropy Hessian-vector product.\n\n资料来源：[hessian_eigenthings/loss_fns/_fused_ce_hvp.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/_fused_ce_hvp.py)\n\n### Backend Selection\n\n| Backend | Description | Performance | Availability |\n|---------|-------------|-------------|--------------|\n| `\"auto\"` | Auto-select fastest available | Optimal | Default |\n| `\"triton\"` | 
Hand-written CUDA Triton kernel | ~3.4x speedup, 2x memory reduction | CUDA + Triton |\n| `\"compile\"` | `torch.compile`-fused | ~2.6x speedup, 2x memory reduction | torch >= 2.0 |\n| `\"eager\"` | Plain PyTorch reference | Baseline | Always |\n\n```python\nFusedCEHvpBackend = Literal[\"auto\", \"eager\", \"compile\", \"triton\"]\n```\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:25-35](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n### Backend Resolution Logic\n\n```mermaid\ngraph LR\n    A[Backend: \"auto\"] --> B{Device: CUDA?}\n    B -->|Yes + Triton available| C[Triton Kernel]\n    B -->|No| D{torch.compile available?}\n    D -->|Yes| E[torch.compile Backend]\n    D -->|No| F[Eager Backend]\n```\n\nThe resolution checks:\n1. If `backend != \"auto\"`, use the specified backend\n2. If `\"auto\"` and CUDA + Triton available → Triton\n3. If `\"auto\"` and torch.compile available → compile\n4. Otherwise → eager\n\n**Important:** The Triton kernel asserts `logits.is_cuda`, so on CUDA-equipped hosts running CPU inputs, the system falls back to `compile`.\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:40-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n### Memory Optimization\n\nAt LM scale (e.g., B=64, T=256, V=50304, fp32):\n\n| Implementation | Memory Footprint | Intermediate Tensors |\n|----------------|------------------|----------------------|\n| Eager | ~19.6 GB | ~6 (N, V) tensors |\n| Compile | ~3.3 GB | ~1 (N, V) tensor |\n| Target | ~3.3 GB | Output buffer only |\n\nThe fused implementations eliminate intermediates by computing:\n\n```\nout_flat = (p * u - p * <p, u>) * mask / n_valid\n```\n\nwith shape `(N, V)` in a single kernel pass.\n\n资料来源：[scripts/bench_fused_ce_hvp.py:1-60](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/scripts/bench_fused_ce_hvp.py)\n\n## Loss Function 
Wrapper\n\nThe `_LossOfOutputWithHvp` class wraps a loss function with its analytical Hessian-vector product:\n\n```python\nclass _LossOfOutputWithHvp:\n    \"\"\"Loss-of-output callable that also carries an analytical `.hvp` method.\n    \n    Wraps a plain `(output, batch) -> loss` function and a `(output, batch, u)\n    -> H_loss @ u` function in a single callable. `GGNOperator` checks for the\n    presence of `.hvp` and uses it as the loss-Hessian-vector product, skipping\n    the autograd `create_graph=True` double-backward path entirely.\n    \"\"\"\n```\n\n资料来源：[hessian_eigenthings/loss_fns/huggingface.py:100-120](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/huggingface.py)\n\n## GGN Operator Integration\n\nThe `GGNOperator` automatically detects an analytical HVP when the loss function provides one: it picks up the `.hvp` attribute and skips the autograd double-backward.\n\nTwo implementations of the matvec are available via `loss_hvp=`:\n\n* `\"analytical\"` (default): finite-difference JVP + analytical loss-Hessian-vec\n  product (read from `loss_of_output_fn.hvp`, which must be present) + a single\n  normal backward to apply `J^T`. Memory footprint matches one normal training\n  step. Required for LM-scale use.\n\n* `\"autograd\"`: the original `torch.func.jvp` + autograd double-backward +\n  `torch.func.vjp` path. 
Numerically exact and supports any loss, but memory\nscales badly with output size.\n\n资料来源：[hessian_eigenthings/operators/ggn.py:10-30](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n## TransformerLens Integration\n\nFor TransformerLens `HookedModel` architectures, a dedicated loss function handles the hook-based forward pass:\n\n资料来源：[hessian_eigenthings/loss_fns/transformer_lens.py:1-40](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/transformer_lens.py)\n\n```python\ndef tlens_loss() -> Callable[[nn.Module, Any], torch.Tensor]:\n    \"\"\"Loss function for TransformerLens HookedModel.\"\"\"\n```\n\n## Workflow: Choosing a Loss Function\n\n```mermaid\ngraph TD\n    A[Start] --> B{Model Type?}\n    B -->|Standard MLP/CNN| C[Use HessianOperator]\n    B -->|HuggingFace Transformers| D[Use GGNOperator]\n    B -->|TransformerLens| E[Use HessianOperator]\n    \n    C --> F[standard.mse_loss_of_output]\n    C --> G[standard.cross_entropy_loss_of_output]\n    \n    D --> H{Scale?}\n    H -->|Small model| I[hf_lm_loss with GGNOperator]\n    H -->|Large model| J[hf_lm_shifted_ce with GGNOperator]\n    \n    E --> K[tlens_loss with HessianOperator]\n    \n    J --> L[Choose HVP Backend]\n    L --> M{Device?}\n    M -->|CUDA + Triton| N[Use Triton backend]\n    M -->|CPU/MPS| O[Use compile or eager]\n```\n\n## Complete API Reference\n\n### Standard Module\n\n| Function | Returns | HVP Available |\n|----------|---------|---------------|\n| `mse_loss_of_output()` | `loss_of_output_fn` | No |\n| `cross_entropy_loss_of_output()` | `loss_of_output_fn` | Yes |\n| `_ce_hvp()` | Analytical HVP | - |\n\n### HuggingFace Module\n\n| Function | Returns | HVP Available |\n|----------|---------|---------------|\n| `hf_lm_loss()` | `loss_fn` | Via GGNOperator |\n| `hf_lm_shifted_ce(fused)` | `_LossOfOutputWithHvp` | Yes |\n| `_LossOfOutputWithHvp` | Wrapper class | Via 
`.hvp` attribute |\n\n### Fused CE HVP Module\n\n| Function | Description |\n|----------|-------------|\n| `_ce_hvp_reference()` | Eager reference implementation |\n| `_get_compiled_impl()` | Returns `torch.compile` wrapped version |\n| `compiled_ce_hvp()` | Compiled backend entry point |\n| `triton_ce_hvp()` | Triton kernel entry point |\n\n## Usage Examples\n\n### Standard Classification\n\n```python\nfrom hessian_eigenthings.operators import HessianOperator\nfrom hessian_eigenthings.loss_fns.standard import cross_entropy_loss_of_output\n\noperator = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=lambda m, b: torch.nn.functional.cross_entropy(m(b[0]), b[1]),\n)\n```\n\n### HuggingFace Large Model (Memory-Optimized)\n\n```python\nfrom hessian_eigenthings.operators import GGNOperator\nfrom hessian_eigenthings.loss_fns.huggingface import hf_lm_shifted_ce\n\nloss_fn = hf_lm_shifted_ce(fused=\"auto\")  # Auto-selects best backend\n\noperator = GGNOperator(\n    model=model,\n    dataloader=dataloader,\n    forward_fn=lambda m, b: m(**b).logits,\n    loss_of_output_fn=loss_fn,\n    loss_hvp=\"analytical\",  # Default, uses .hvp attribute\n)\n```\n\n资料来源：[hessian_eigenthings/loss_fns/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/loss_fns/__init__.py)\n\n---\n\n<a id='parameter-utilities'></a>\n\n## Parameter Utilities\n\n### 相关页面\n\n相关主题：[Curvature Operators](#curvature-operators)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/param_utils.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/param_utils.py)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n- 
[examples/transformer_lens_attention_only.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/transformer_lens_attention_only.py)\n- [examples/huggingface_tiny_gpt2.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/huggingface_tiny_gpt2.py)\n</details>\n\n# Parameter Utilities\n\nThe parameter utilities module (`param_utils.py`) provides essential infrastructure for managing, filtering, and manipulating PyTorch model parameters within the Hessian eigendecomposition pipeline. These utilities form the foundation that connects curvature operators to the underlying model parameters, enabling efficient computation of Hessian-vector products and eigendecomposition across arbitrary subsets of model parameters.\n\n## Overview\n\nWhen working with large neural networks, it is often necessary to compute curvature information for only a subset of parameters. The parameter utilities support this use case through a flexible filtering mechanism combined with utilities for parameter vectorization, reshaping, and batch management.\n\nThe core responsibilities of the parameter utilities include:\n\n1. **Parameter Extraction** - Gathering named parameters from PyTorch modules\n2. **Parameter Filtering** - Selecting subsets of parameters based on name patterns or custom predicates\n3. **Vectorization** - Flattening parameters into vectors and reshaping vectors back to parameter shapes\n4. 
**Size Tracking** - Maintaining offset mappings for efficient vector-to-parameter conversions\n\n资料来源：[hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n## Core Types and Interfaces\n\n### ParamFilter Type\n\nThe `ParamFilter` type alias defines the contract for parameter selection functions:\n\n```python\nParamFilter = Callable[[str, nn.Parameter], bool]\n```\n\nA `ParamFilter` is a callable that takes two arguments:\n- `name: str` - The fully-qualified parameter name within the model\n- `param: nn.Parameter` - The parameter tensor itself\n\nThe function returns `True` if the parameter should be included in the operation, `False` otherwise.\n\n资料来源：[hessian_eigenthings/operators/hessian.py:1-50](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n### Parameter Collection Utilities\n\nThe module provides functions for extracting and organizing model parameters:\n\n| Function | Purpose |\n|----------|---------|\n| `get_param_names(model)` | Returns list of parameter names as fully-qualified strings |\n| `get_param_list(model)` | Returns list of parameter tensors |\n| `get_param_sizes(model)` | Returns list of parameter tensor sizes |\n| `get_filtered_params(model, param_filter)` | Returns filtered parameter names and tensors |\n\nThese functions work together to build the data structures required by curvature operators.\n\n资料来源：[hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n## Parameter Vectorization\n\n### Flattening Parameters to Vectors\n\nThe utilities support bidirectional conversion between parameter dictionaries and flat vectors. 
This is essential for Lanczos-based eigendecomposition algorithms that operate on vector spaces.\n\n```python\ndef flatten_params(param_dict: dict[str, Tensor]) -> Tensor:\n    \"\"\"Flatten all parameters into a single 1D tensor.\"\"\"\n```\n\nThe flattening process concatenates all parameter tensors in a deterministic order, preserving the mapping between parameter names and vector offsets.\n\n### Reshaping Vectors Back to Parameters\n\n```python\ndef unflatten_params(\n    vec: Tensor,\n    param_names: list[str],\n    param_list: list[Tensor],\n    sizes: list[torch.Size]\n) -> dict[str, Tensor]:\n    \"\"\"Reshape a flat vector back to parameter dictionary.\"\"\"\n```\n\nThe unflattening operation uses offset tracking to slice the vector and reshape each slice to match the original parameter shape:\n\n```python\nfor name, param, size in zip(param_names, param_list, sizes, strict=True):\n    out[name] = vec[offset : offset + size].reshape_as(param)\n    offset += size\n```\n\n资料来源：[hessian_eigenthings/operators/ggn.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/ggn.py)\n\n## Parameter Filtering Patterns\n\n### Name-Based Filtering with `match_names`\n\nThe most common filtering pattern uses glob-style matching against parameter names. 
The `match_names` function creates a `ParamFilter` from a list of name patterns:\n\n```python\ndef match_names(*patterns: str) -> ParamFilter:\n    \"\"\"Create a filter matching parameter names against glob patterns.\"\"\"\n```\n\n**Example usage:**\n\n```python\nfrom hessian_eigenthings.operators import HessianOperator\nfrom hessian_eigenthings.param_utils import match_names\n\n# Filter attention parameters only\nattn_filter = match_names(\"blocks.*.attn.*\")\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=attn_filter\n)\n\n# Filter MLP parameters only\nmlp_filter = match_names(\"blocks.*.mlp.*\")\nmlp_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=mlp_filter\n)\n```\n\n资料来源：[examples/transformer_lens_attention_only.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/transformer_lens_attention_only.py)\n\n### Multiple Pattern Matching\n\nThe `match_names` function supports multiple patterns, useful for targeting disjoint parameter groups:\n\n```python\n# Match multiple parameter groups\nfilter_fn = match_names(\n    \"transformer.h.*.attn.*\",\n    \"transformer.h.*.mlp.*\"\n)\n```\n\n### HuggingFace-Specific Patterns\n\nWhen working with HuggingFace transformers, parameter names follow a predictable structure:\n\n```python\n# Attention parameters in GPT-2\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=hf_lm_loss(),\n    param_filter=match_names(\"transformer.h.*.attn.*\"),\n)\n```\n\n资料来源：[examples/huggingface_tiny_gpt2.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/huggingface_tiny_gpt2.py)\n\n## Integration with Curvature Operators\n\n### Operator Size and Parameter Tracking\n\nCurvature operators maintain internal state about the parameters they operate on:\n\n| Attribute | Type | Description 
|\n|-----------|------|-------------|\n| `_param_names` | `list[str]` | Names of parameters in the filtered set |\n| `_param_list` | `list[Tensor]` | Parameter tensors |\n| `_sizes` | `list[torch.Size]` | Original tensor shapes for reshaping |\n| `size` | `int` | Total number of parameters (sum of all parameter elements) |\n\nThe `size` property is computed as:\n\n```python\nself.size = sum(p.numel() for p in self._param_list)\n```\n\nThis total parameter count determines the dimensionality of the vector space in which eigendecomposition occurs.\n\n资料来源：[hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n\n### Data Flow Diagram\n\n```mermaid\ngraph TD\n    A[PyTorch Model] --> B[get_param_names]\n    A --> C[get_param_list]\n    A --> D[get_param_sizes]\n    B --> E[ParamFilter Application]\n    C --> E\n    D --> E\n    E --> F[Filtered Parameter Collections]\n    F --> G[Curvature Operator]\n    G --> H[matvec Operations]\n    H --> I[Eigendecomposition Results]\n    \n    J[Input Vector] --> K[unflatten_params]\n    F --> K\n    K --> L[Parameter Dict]\n    L --> H\n```\n\n## Practical Examples\n\n### Computing Attention-Only Hessian Eigendecomposition\n\n```python\nfrom hessian_eigenthings.algorithms import lanczos\nfrom hessian_eigenthings.operators import HessianOperator\nfrom hessian_eigenthings.param_utils import match_names\n\n# Create attention-only operator\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.attn.*\")\n)\n\nprint(f\"Attention-only Hessian size: {attn_op.size} parameters\")\n\n# Compute top-3 eigenvalues\neig_attn = lanczos(attn_op, k=3, max_iter=20, tol=1e-3, seed=0)\nfor i, val in enumerate(eig_attn.eigenvalues):\n    print(f\"  λ_{i + 1} = {val.item(): .4e}\")\n```\n\n### Comparing Block-Specific Curvature\n\n```python\n# Full model Hessian\nfull_op = 
HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn\n)\n\n# Attention block only\nattn_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.attn.*\")\n)\n\n# MLP block only\nmlp_op = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"blocks.*.mlp.*\")\n)\n\n# Compare eigenvalue spectra\nfull_eig = lanczos(full_op, k=10)\nattn_eig = lanczos(attn_op, k=10)\nmlp_eig = lanczos(mlp_op, k=10)\n```\n\n资料来源：[examples/transformer_lens_attention_only.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/examples/transformer_lens_attention_only.py)\n\n## Advanced Filtering\n\n### Custom Filter Functions\n\nFor complex filtering logic beyond glob matching, implement a custom `ParamFilter`:\n\n```python\ndef custom_filter(name: str, param: nn.Parameter) -> bool:\n    # Include only parameters with > 1000 elements\n    if param.numel() < 1000:\n        return False\n    # Exclude certain modules\n    if \"embedding\" in name:\n        return False\n    # Include based on naming patterns\n    return \"layer\" in name or \"head\" in name\n\noperator = HessianOperator(\n    model=model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=custom_filter\n)\n```\n\n### Filter Composition\n\nFilters can be combined using standard Python patterns:\n\n```python\n# Intersection of patterns\ncombined_filter = lambda name, param: (\n    match_names(\"blocks.*.*.*\")(name, param) and\n    param.dtype == torch.float32\n)\n\n# Negation\nexclude_ln = lambda name, param: not (\n    match_names(\".*layernorm.*\", \".*ln.*\")(name, param)\n)\n```\n\n## Performance Considerations\n\n### Parameter Access Patterns\n\nThe parameter utilities maintain strict ordering between name lists and tensor lists to enable efficient offset-based indexing. 
When iterating over parameters in performance-critical paths:\n\n1. Use the pre-computed `_sizes` list to avoid repeated `param.shape` calls\n2. Leverage the `strict=True` zip when all lists are guaranteed to be aligned\n3. Prefer in-place reshaping over copies when possible\n\n### Memory Implications\n\n| Operation | Memory Pattern |\n|-----------|----------------|\n| `flatten_params` | Allocates new tensor of size `sum(numel)` |\n| `unflatten_params` | Creates dict, views from original vector |\n| `matvec` | No parameter data copies; uses VJP/JVP chains |\n\nThe vectorization maintains a view relationship with original parameters where possible, minimizing memory overhead during iterative algorithms.\n\n## API Reference\n\n### `match_names`\n\n```python\ndef match_names(*patterns: str) -> ParamFilter:\n    \"\"\"Create a ParamFilter matching parameter names against glob patterns.\"\"\"\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `patterns` | `str` | Glob patterns to match against parameter names |\n\n**Returns:** A callable `ParamFilter` that returns `True` for parameters matching any of the provided patterns.\n\n**Supported Glob Patterns:**\n- `*` - Matches any sequence of characters within a path component\n- `**` - Matches any sequence of path components (if supported)\n- `?` - Matches a single character\n- `[abc]` - Matches any character in the set\n\n### Parameter Extraction Functions\n\n```python\ndef get_param_names(model: nn.Module) -> list[str]:\n    \"\"\"Extract fully-qualified parameter names from a model.\"\"\"\n\ndef get_param_list(model: nn.Module) -> list[Tensor]:\n    \"\"\"Extract parameter tensors from a model.\"\"\"\n\ndef get_filtered_params(\n    model: nn.Module,\n    param_filter: ParamFilter | None\n) -> tuple[list[str], list[Tensor]]:\n    \"\"\"Extract filtered parameter names and 
tensors.\"\"\"\n```\n\n资料来源：[hessian_eigenthings/param_utils.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/param_utils.py)\n\n---\n\n<a id='distributed-computing'></a>\n\n## Distributed Computing with DDP\n\n### 相关页面\n\n相关主题：[Curvature Operators](#curvature-operators)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [hessian_eigenthings/operators/distributed/ddp.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/distributed/ddp.py)\n- [hessian_eigenthings/operators/distributed/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/distributed/__init__.py)\n- [hessian_eigenthings/operators/hessian.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/hessian.py)\n- [hessian_eigenthings/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/__init__.py)\n- [hessian_eigenthings/operators/__init__.py](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/__init__.py)\n</details>\n\n# Distributed Computing with DDP\n\nThe `hessian-eigenthings` library provides native support for distributed training scenarios through `DDPHessianOperator`, a specialized curvature operator that extends the base `HessianOperator` to work correctly with PyTorch's `DistributedDataParallel` (DDP) wrapper.\n\n## Overview\n\nIn distributed training environments, the Hessian eigenvalue computations must account for how DDP synchronizes gradients across multiple processes. 
The `DDPHessianOperator` handles this synchronization transparently, ensuring that the Hessian-vector products (HVPs) computed across different ranks are properly averaged.\n\n**Key characteristics:**\n\n- Subclass of `HessianOperator` with distributed awareness\n- Automatically averages HVPs across all data-parallel ranks\n- Compatible with standard DDP-wrapped models\n- Supports the same API as the base `HessianOperator`\n- Handles the autograd graph complexity introduced by DDP's all-reduce operations\n\n## Architecture\n\n### Class Hierarchy\n\n```\nCurvatureOperator (base interface)\n    └── HessianOperator (base implementation)\n            └── DDPHessianOperator (DDP-aware extension)\n```\n\n### Data Flow Diagram\n\n```mermaid\ngraph TD\n    A[Model wrapped with DDP] --> B[DDPHessianOperator]\n    B --> C[Per-rank HVP Computation]\n    C --> D[Autograd-aware All-Reduce]\n    D --> E[Synchronized HVP across Ranks]\n    \n    F[torch.autograd.grad calls] --> G[Regular .backward hooks]\n    G --> H[No explicit all-reduce]\n    \n    I[Expected HVP] --> J[Actual HVP without DDPHessianOperator]\n    \n    K[Expected HVP] --> L[Actual HVP with DDPHessianOperator]\n    \n    style D fill:#90EE90\n    style H fill:#FFB6C1\n    style L fill:#90EE90\n```\n\n### DDP Behavior Explanation\n\nThe core challenge addressed by `DDPHessianOperator` stems from how PyTorch's `DistributedDataParallel` handles gradient synchronization:\n\n1. **DDP's all-reduce mechanism**: DDP normally fires its all-reduce operation inside the autograd graph during `loss.backward()`, synchronizing gradients across all ranks\n2. **Standard HessianOperator limitation**: When using `torch.autograd.grad` directly (as the base `HessianOperator` does), the DDP hooks fire on `.grad` accumulation rather than on `autograd.grad`'s return value\n3. 
**Resulting discrepancy**: Without explicit handling, the computed HVP does not match the single-process HVP computed on the union of all per-rank batches\n\nThe `DDPHessianOperator` resolves this by adding an explicit autograd-aware all-reduce after each gradient computation call, ensuring the resulting HVP equals the single-process HVP.\n\n## API Reference\n\n### DDPHessianOperator\n\n```python\nclass DDPHessianOperator(HessianOperator):\n    \"\"\"HessianOperator that all-reduces the HVP across torch.distributed ranks.\"\"\"\n```\n\n#### Constructor Parameters\n\n| Parameter | Type | Required | Default | Description |\n|-----------|------|----------|---------|-------------|\n| `model` | `nn.Module` | Yes | - | Model (may be DDP-wrapped; params are read directly) |\n| `dataloader` | `Iterable[Any]` | Yes | - | Data loader providing batches to average over |\n| `loss_fn` | `LossFn` | Yes | - | Loss function `forward_fn(...) -> loss` |\n| `param_filter` | `ParamFilter \\| None` | No | `None` | Optional filter for subset of parameters |\n| `full_dataset` | `bool` | No | `True` | Whether to compute Hessian over full dataset |\n| `num_batches` | `int \\| None` | No | `None` | Number of batches to sample if not full dataset |\n| `microbatch_size` | `int \\| None` | No | `None` | Chunk batch into micro-batches for memory |\n| `microbatch_unsafe` | `bool` | No | `False` | Skip gradient accumulation safety checks |\n| `method` | `HvpMethod` | No | `\"autograd\"` | HVP computation method |\n| `fd_eps` | `float \\| None` | No | `None` | Finite difference epsilon |\n| `backend` | `LinAlgBackend[torch.Tensor] \\| None` | No | `None` | Linear algebra backend |\n\nInherits all parameters from `HessianOperator` base class.\n\n#### Inherited Methods\n\n| Method | Description |\n|--------|-------------|\n| `matvec(v)` | Compute H·v where H is the Hessian averaged over batches |\n| `size` | Total number of parameters in the filtered parameter set |\n| `dtype` | Data type of 
parameters |\n| `device` | Device of parameters |\n\n### Import Location\n\n```python\nfrom hessian_eigenthings.operators import DDPHessianOperator\n```\n\nOr via the distributed submodule:\n\n```python\nfrom hessian_eigenthings.operators.distributed import DDPHessianOperator\n```\n\n资料来源：[hessian_eigenthings/operators/distributed/__init__.py:1-3](https://github.com/noahgolmant/pytorch-hessian-eigenthings/blob/main/hessian_eigenthings/operators/distributed/__init__.py)\n\n## Usage Patterns\n\n### Basic Usage with DDP-wrapped Model\n\n```python\nimport torch\nimport torch.distributed as dist\nfrom torch.nn.parallel import DistributedDataParallel as DDP\nfrom hessian_eigenthings.operators import DDPHessianOperator\nfrom hessian_eigenthings.algorithms import lanczos\n\n# Assume model, dataloader, and loss_fn are already set up\nddp_model = DDP(model)\n\n# Create the distributed Hessian operator\nhessian_op = DDPHessianOperator(\n    model=ddp_model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n)\n\n# Compute eigenvalues using Lanczos algorithm\neigenvalues, eigenvectors = lanczos(hessian_op, k=10, max_iter=50)\n```\n\n### With Parameter Filtering\n\n```python\nfrom hessian_eigenthings.param_utils import match_names\n\n# Focus on specific layer parameters\nhessian_op = DDPHessianOperator(\n    model=ddp_model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    param_filter=match_names(\"layer.4.*\"),\n)\n```\n\n### Using with Different HVP Methods\n\n```python\n# Using finite difference method (more memory-efficient)\nhessian_op_fd = DDPHessianOperator(\n    model=ddp_model,\n    dataloader=dataloader,\n    loss_fn=loss_fn,\n    method=\"finite_difference\",\n    fd_eps=1e-5,\n)\n```\n\n## Key Design Decisions\n\n### Autograd-aware All-Reduce\n\nThe `DDPHessianOperator` adds an explicit all-reduce operation that integrates with PyTorch's autograd engine. 
This ensures:\n\n- The all-reduce operation is included in the autograd graph when needed\n- Gradient flows correctly through the distributed computation\n- The final HVP is properly synchronized across all ranks\n\n### Parameter Access\n\nThe operator reads parameters directly from the model, whether or not it is wrapped with DDP:\n\n```python\n# From the source:\n# \"The model passed in may already be wrapped with\n#  torch.nn.parallel.DistributedDataParallel; we read params from it directly.\"\n```\n\nThis design allows seamless usage with existing DDP-wrapped models without modification.\n\n### Batch Distribution\n\nEach rank should receive its own shard of the dataset:\n\n> \"Each rank should be receiving its own shard of the dataset (typical pattern: a `torch.utils.data.distributed.DistributedSampler`).\"\n\n资料来源：[hessian_eigenthings/operators/distributed/ddp.py:21-25]()\n\n## Comparison with Single-Process HessianOperator\n\n| Aspect | `HessianOperator` | `DDPHessianOperator` |\n|--------|-------------------|----------------------|\n| Use case | Single GPU / CPU | Multi-GPU distributed |\n| Gradient sync | Manual handling required | Automatic via all-reduce |\n| DDP compatibility | May produce incorrect HVPs | Correct by design |\n| API | Identical | Identical |\n| Performance overhead | None | Single all-reduce per HVP |\n\n## Relationship to Other Operators\n\nThe `hessian_eigenthings` package provides multiple curvature operators:\n\n| Operator | Description | Distributed Support |\n|----------|-------------|---------------------|\n| `HessianOperator` | Full Hessian computation | Not DDP-aware |\n| `DDPHessianOperator` | Full Hessian with DDP sync | DDP-aware |\n| `GGNOperator` | Generalized Gauss-Newton | Not DDP-aware (as of v1.0) |\n| `EmpiricalFisherOperator` | Empirical Fisher matrix | Not DDP-aware |\n\n资料来源：[hessian_eigenthings/__init__.py:15-24]()\n\n## Limitations and Considerations\n\n1. 
**Current scope**: Only `HessianOperator` has a DDP-aware counterpart; other operators like `GGNOperator` and `EmpiricalFisherOperator` do not yet have distributed variants\n2. **Gradient hooks**: The operator does not currently support all DDP gradient hook mechanisms\n3. **Multi-node training**: While the operator uses standard `torch.distributed` primitives, performance at very large scale (>8 nodes) has not been extensively benchmarked\n4. **Mixed precision**: When using fp16/bf16 training, ensure consistent dtype across all ranks\n\n## Error Handling\n\nThe operator relies on standard PyTorch distributed error handling:\n\n- If `torch.distributed` is not initialized, standard errors will be raised\n- Mismatched tensor shapes across ranks will result in collective operation errors\n- Device mismatch (e.g., some ranks on CUDA, some on CPU) is not supported\n\n## Testing and Validation\n\nThe DDP functionality should be tested in a true distributed environment. Basic validation includes:\n\n1. **Consistency check**: HVP computed via `DDPHessianOperator` should equal the single-process HVP when aggregating all batch shards\n2. **Numerical accuracy**: Eigenvalues computed with DDP should match single-GPU results within floating-point tolerance\n3. **Scaling**: Computation time should scale sub-linearly with number of GPUs for large models\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：noahgolmant/pytorch-hessian-eigenthings\n\n摘要：发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：身份坑 - 仓库名和安装名不一致。\n\n## 1. 身份坑 · 仓库名和安装名不一致\n\n- 严重度：medium\n- 证据强度：runtime_trace\n- 发现：仓库名 `pytorch-hessian-eigenthings` 与安装入口 `hessian-eigenthings` 不完全一致。\n- 对用户的影响：用户照着仓库名搜索包或照着包名找仓库时容易走错入口。\n- 建议检查：在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。\n- 复现命令：`pip install hessian-eigenthings`\n- 防护动作：页面必须同时展示 repo 名和真实安装入口，避免用户搜索错包。\n- 证据：identity.distribution | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | repo=pytorch-hessian-eigenthings; install=hessian-eigenthings\n\n## 2. 
安装坑 · 来源证据：Python Error: the following arguments are required: experimentname\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Python Error: the following arguments are required: experimentname\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_24f46464d79f4ae3830f046c077a2574 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/39 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：v1.0.0a2 — packaging fix\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a2 — packaging fix\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_7540a696b30c46cdba07c12f33388567 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a2 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 4. 安装坑 · 来源证据：v1.0.0a3 — fix lanczos OOM\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a3 — fix lanczos OOM\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e5e68e2f24e1436cb8f3c2f11cefe326 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a3 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 5. 安装坑 · 来源证据：v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_914b7653aa8b4ef2844a2b4690fab2ad | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a4 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 6. 
安装坑 · 来源证据：v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_509356ab9b68434992d7237219952ba6 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a5 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 7. 配置坑 · 来源证据：RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f79a3a34cbab435cb3730b7ae17cf492 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/30 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 配置坑 · 来源证据：ValueError: PENet on the Kitti benchmark suite\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：ValueError: PENet on the Kitti benchmark suite\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_850f8cf0010c4d269ab71d864610097a | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/41 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 9. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | README/documentation is current enough for a first validation pass.\n\n## 10. 
运行坑 · 来源证据：AttributeError: 'HVPOperator' object has no attribute 'zero_grad'\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：AttributeError: 'HVPOperator' object has no attribute 'zero_grad'\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b515f5c06a5744b19b667bcbc8123348 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/38 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 11. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | last_activity_observed missing\n\n## 12. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | no_demo; severity=medium\n\n## 13. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | no_demo; severity=medium\n\n## 14. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | issue_or_pr_quality=unknown\n\n## 15. 
维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | release_recency=unknown\n\n<!-- canonical_name: noahgolmant/pytorch-hessian-eigenthings; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "Human Manual / 人类版说明书"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log / 踩坑日志\n\n项目：noahgolmant/pytorch-hessian-eigenthings\n\n摘要：发现 15 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：身份坑 - 仓库名和安装名不一致。\n\n## 1. 身份坑 · 仓库名和安装名不一致\n\n- 严重度：medium\n- 证据强度：runtime_trace\n- 发现：仓库名 `pytorch-hessian-eigenthings` 与安装入口 `hessian-eigenthings` 不完全一致。\n- 对用户的影响：用户照着仓库名搜索包或照着包名找仓库时容易走错入口。\n- 建议检查：在 npm/PyPI/GitHub 上确认包名映射和官方 README 说明。\n- 复现命令：`pip install hessian-eigenthings`\n- 防护动作：页面必须同时展示 repo 名和真实安装入口，避免用户搜索错包。\n- 证据：identity.distribution | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | repo=pytorch-hessian-eigenthings; install=hessian-eigenthings\n\n## 2. 安装坑 · 来源证据：Python Error: the following arguments are required: experimentname\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Python Error: the following arguments are required: experimentname\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_24f46464d79f4ae3830f046c077a2574 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/39 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：v1.0.0a2 — packaging fix\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a2 — packaging fix\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_7540a696b30c46cdba07c12f33388567 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a2 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 4. 
安装坑 · 来源证据：v1.0.0a3 — fix lanczos OOM\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a3 — fix lanczos OOM\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e5e68e2f24e1436cb8f3c2f11cefe326 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a3 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 5. 安装坑 · 来源证据：v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a4 — backend handles CPU-generator + CUDA-tensor combo\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_914b7653aa8b4ef2844a2b4690fab2ad | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a4 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 6. 安装坑 · 来源证据：v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v1.0.0a5 — comprehensive LLM-scale memory fixes + regression tests\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_509356ab9b68434992d7237219952ba6 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/releases/tag/v1.0.0a5 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 7. 
配置坑 · 来源证据：RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：RuntimeError: One of the differentiated Tensors appears to not have been used in the graph.\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f79a3a34cbab435cb3730b7ae17cf492 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/30 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 配置坑 · 来源证据：ValueError: PENet on the Kitti benchmark suite\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：ValueError: PENet on the Kitti benchmark suite\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_850f8cf0010c4d269ab71d864610097a | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/41 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 9. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | README/documentation is current enough for a first validation pass.\n\n## 10. 运行坑 · 来源证据：AttributeError: 'HVPOperator' object has no attribute 'zero_grad'\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：AttributeError: 'HVPOperator' object has no attribute 'zero_grad'\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b515f5c06a5744b19b667bcbc8123348 | https://github.com/noahgolmant/pytorch-hessian-eigenthings/issues/38 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 11. 
维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | last_activity_observed missing\n\n## 12. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | no_demo; severity=medium\n\n## 13. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | no_demo; severity=medium\n\n## 14. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | issue_or_pr_quality=unknown\n\n## 15. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | hn_item:48132232 | https://news.ycombinator.com/item?id=48132232 | release_recency=unknown\n",
      "summary": "用户实践前最可能遇到的身份、安装、配置、运行和安全坑。",
      "title": "Pitfall Log / 踩坑日志"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# pytorch-hessian-eigenthings - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for noahgolmant/pytorch-hessian-eigenthings.\n\nProject:\n- Name: pytorch-hessian-eigenthings\n- Repository: https://github.com/noahgolmant/pytorch-hessian-eigenthings\n- Summary: pytorch-hessian-eigenthings\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: pytorch-hessian-eigenthings\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: pytorch-hessian-eigenthings\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. introduction: Introduction to hessian-eigenthings. Produce one small intermediate artifact and wait for confirmation.\n2. curvature-matrices: Curvature Matrices Explained. Produce one small intermediate artifact and wait for confirmation.\n3. hvp-approach: Why Hessian-Vector Products. Produce one small intermediate artifact and wait for confirmation.\n4. architecture: System Architecture. 
Produce one small intermediate artifact and wait for confirmation.\n5. curvature-operators: Curvature Operators. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://news.ycombinator.com/item?id=48132232\n- https://github.com/noahgolmant/pytorch-hessian-eigenthings#readme\n- README.md\n- hessian_eigenthings/__init__.py\n- docs/concepts/ggn-vs-fisher-vs-hessian.md\n- docs/concepts/what-is-the-hessian.md\n- docs/concepts/why-hvp-not-full-h.md\n- hessian_eigenthings/operators/hessian.py\n- hessian_eigenthings/operators/base.py\n- hessian_eigenthings/algorithms/__init__.py\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "summary": "不安装项目也能感受能力节奏的安全试用 Prompt。",
      "title": "Prompt Preview / 安装前试用 Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / 官方入口\n\n项目：noahgolmant/pytorch-hessian-eigenthings\n\n## 官方安装入口\n\n### Python / pip · 官方安装入口\n\n```bash\npip install hessian-eigenthings\n```\n\n来源：https://github.com/noahgolmant/pytorch-hessian-eigenthings#readme\n\n## 来源\n\n- hn: https://news.ycombinator.com/item?id=48132232\n- docs: https://github.com/noahgolmant/pytorch-hessian-eigenthings#readme\n",
      "summary": "从项目官方 README 或安装文档提取的开工入口。",
      "title": "Quick Start / 官方入口"
    }
  },
  "validation_id": "dval_8a294b814b3b4037a843c14c510c47df"
}
