{
  "canonical_name": "allenai/olmocr",
  "compilation_id": "pack_90f3e648547e46c6b74303b44c7c69ee",
  "created_at": "2026-06-13T23:28:33.683459+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=skill, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=skill, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `pip install olmocr` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "pip install olmocr",
      "sandbox_container_image": "python:3.12-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "llm_execute_isolated_install",
      "sandbox_validation_id": "sbx_82f81cda8ace45f1841cc9f05a28805b"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_343545ac147fb0fd459458fefa102a6f",
    "canonical_name": "allenai/olmocr",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/allenai/olmocr",
    "slug": "olmocr",
    "source_packet_id": "phit_f1e74219b52549699b589928ee9f5ca3",
    "source_validation_id": "dval_57c72090fafa43429aa6c026373d7d67"
  },
  "merchandising": {
    "best_for": "需要软件开发与交付能力，并使用 local_cli的用户",
    "github_forks": 1401,
    "github_stars": 17389,
    "one_liner_en": "Toolkit for linearizing PDFs for LLM datasets/training",
    "one_liner_zh": "用于将 PDF 线性化处理以便构建 LLM 数据集和训练的工具包",
    "primary_category": {
      "category_id": "software-development",
      "confidence": "medium",
      "name_en": "Software Development",
      "name_zh": "软件开发与交付",
      "reason": "matched_keywords:cli"
    },
    "target_user": "使用 local_cli 等宿主 AI 的用户",
    "title_en": "olmocr",
    "title_zh": "olmocr 能力包",
    "visible_tags": [
      {
        "label_en": "Software Development",
        "label_zh": "软件开发与交付",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-software-development",
        "type": "product_domain"
      },
      {
        "label_en": "Open Source Capability Building",
        "label_zh": "开源能力构建",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-open-source-capability-building",
        "type": "user_job"
      },
      {
        "label_en": "Project Capability",
        "label_zh": "项目能力包",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-project-capability",
        "type": "core_capability"
      },
      {
        "label_en": "Verifiable Workflow",
        "label_zh": "可验证工作流",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-verifiable-workflow",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Evidence-backed Context",
        "label_zh": "证据化上下文",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-evidence-backed-context",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_f1e74219b52549699b589928ee9f5ca3",
  "page_model": {
    "artifacts": {
      "artifact_slug": "olmocr",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "pip install olmocr",
          "label": "Python / pip · 官方安装入口",
          "source": "https://github.com/allenai/olmocr#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "软件开发与交付",
        "开源能力构建",
        "项目能力包",
        "可验证工作流",
        "证据化上下文"
      ],
      "eyebrow": "软件开发与交付",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要软件开发与交付能力，并使用 local_cli的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "Toolkit for linearizing PDFs for LLM datasets/training"
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "local_cli",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "skill, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "Project evidence flags an installation risk. Review the linked source before relying on this workflow.",
            "category": "Installation risk",
            "evidence": [
              "community_evidence:github | https://github.com/allenai/olmocr/issues/461"
            ],
            "severity": "medium",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Installation risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "Project evidence flags a configuration risk. Review the linked source before relying on this workflow.",
            "category": "Configuration risk",
            "evidence": [
              "community_evidence:github | https://github.com/allenai/olmocr/issues/455"
            ],
            "severity": "medium",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Configuration risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "Unverified assumption: the README/documentation is current enough for a first validation pass.",
            "category": "Capability evidence risk",
            "evidence": [
              "capability.assumptions | github_repo:858798469 | https://github.com/allenai/olmocr"
            ],
            "severity": "medium",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Capability evidence risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.",
            "category": "Maintenance risk",
            "evidence": [
              "community_evidence:github | https://github.com/allenai/olmocr/issues/463"
            ],
            "severity": "medium",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Maintenance risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.",
            "category": "Maintenance risk",
            "evidence": [
              "community_evidence:github | https://github.com/allenai/olmocr/issues/460"
            ],
            "severity": "medium",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Maintenance risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.",
            "category": "Maintenance risk",
            "evidence": [
              "evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr"
            ],
            "severity": "medium",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Maintenance risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "no_demo",
            "category": "Security or permission risk",
            "evidence": [
              "downstream_validation.risk_items | github_repo:858798469 | https://github.com/allenai/olmocr"
            ],
            "severity": "medium",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Security or permission risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "no_demo",
            "category": "Security or permission risk",
            "evidence": [
              "risks.scoring_risks | github_repo:858798469 | https://github.com/allenai/olmocr"
            ],
            "severity": "medium",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Security or permission risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.",
            "category": "Security or permission risk",
            "evidence": [
              "community_evidence:github | https://github.com/allenai/olmocr/issues/459"
            ],
            "severity": "medium",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Security or permission risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.",
            "category": "Security or permission risk",
            "evidence": [
              "community_evidence:github | https://github.com/allenai/olmocr/issues/451"
            ],
            "severity": "medium",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Security or permission risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.",
            "category": "Security or permission risk",
            "evidence": [
              "community_evidence:github | https://github.com/allenai/olmocr/issues/452"
            ],
            "severity": "medium",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Security or permission risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "issue_or_pr_quality=unknown.",
            "category": "Maintenance risk",
            "evidence": [
              "evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr"
            ],
            "severity": "low",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Maintenance risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          },
          {
            "body": "release_recency=unknown.",
            "category": "Maintenance risk",
            "evidence": [
              "evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr"
            ],
            "severity": "low",
            "suggested_check": "Reproduce the official install and quickstart path in an isolated environment.",
            "title": "Maintenance risk requires verification",
            "user_impact": "May increase setup, validation, or first-run risk for the user."
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "Found 13 structured pitfall item(s), including 0 high/blocking item(s). Top priority: Installation risk - Installation risk requires verification.",
        "title": "Pitfall Log"
      },
      "snapshot": {
        "contributors": 16,
        "forks": 1401,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": 17389
      },
      "source_url": "https://github.com/allenai/olmocr",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "Toolkit for linearizing PDFs for LLM datasets/training",
      "title": "olmocr 能力包",
      "trial_prompt": "# olmocr - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for allenai/olmocr.\n\nProject:\n- Name: olmocr\n- Repository: https://github.com/allenai/olmocr\n- Summary: Toolkit for linearizing PDFs for LLM datasets/training\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: Toolkit for linearizing PDFs for LLM datasets/training\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: Toolkit for linearizing PDFs for LLM datasets/training\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. page-1: Installation and Platform Support. Produce one small intermediate artifact and wait for confirmation.\n2. page-2: Pipeline and Inference Modes. Produce one small intermediate artifact and wait for confirmation.\n3. page-3: Benchmark Suite and OCR Evaluation. 
Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/allenai/olmocr\n- https://github.com/allenai/olmocr#readme\n- README.md\n- pyproject.toml\n- Dockerfile\n- Dockerfile.with-model\n- docs/source/installation.md\n- olmocr/pipeline.py\n- olmocr/work_queue.py\n- olmocr/datatypes.py\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "voices": [
        {
          "body": "来源平台：github。github/github_issue: Fail to parse b4c3c4ac3d6f7b52a993cec7ca8b3ad43cecabad_page_3.pdf（https://github.com/allenai/olmocr/issues/463）；github/github_issue: olmocr.bench scoring: `partial_ratio` falsely matches when candidate is （https://github.com/allenai/olmocr/issues/461）；github/github_issue: Model allenai/olmOCR-2-7B-1025 on DeepInfra will be deprecated on 2026-0（https://github.com/allenai/olmocr/issues/460）；github/github_issue: Writing markdown error : 'gbk' codec can't encode character '\\u1eca' in （https://github.com/allenai/olmocr/issues/459）；github/github_issue: configurable timeout for HTTP client in server method（https://github.com/allenai/olmocr/issues/455）；github/github_issue: numpy is missing from [bench] dependencies（https://github.com/allenai/olmocr/issues/452）；github/github_issue: [bug] badly formed help string（https://github.com/allenai/olmocr/issues/451）；github/github_release: v0.4.27（https://github.com/allenai/olmocr/releases/tag/v0.4.27）；github/github_release: v0.4.25（https://github.com/allenai/olmocr/releases/tag/v0.4.25）；github/github_release: v0.4.24（https://github.com/allenai/olmocr/releases/tag/v0.4.24）；github/github_release: v0.4.21（https://github.com/allenai/olmocr/releases/tag/v0.4.21）；github/github_release: v0.4.20（https://github.com/allenai/olmocr/releases/tag/v0.4.20）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Fail to parse b4c3c4ac3d6f7b52a993cec7ca8b3ad43cecabad_page_3.pdf",
              "url": "https://github.com/allenai/olmocr/issues/463"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "olmocr.bench scoring: `partial_ratio` falsely matches when candidate is ",
              "url": "https://github.com/allenai/olmocr/issues/461"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Model allenai/olmOCR-2-7B-1025 on DeepInfra will be deprecated on 2026-0",
              "url": "https://github.com/allenai/olmocr/issues/460"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Writing markdown error : 'gbk' codec can't encode character '\\u1eca' in ",
              "url": "https://github.com/allenai/olmocr/issues/459"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "configurable timeout for HTTP client in server method",
              "url": "https://github.com/allenai/olmocr/issues/455"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "numpy is missing from [bench] dependencies",
              "url": "https://github.com/allenai/olmocr/issues/452"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[bug] badly formed help string",
              "url": "https://github.com/allenai/olmocr/issues/451"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.4.27",
              "url": "https://github.com/allenai/olmocr/releases/tag/v0.4.27"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.4.25",
              "url": "https://github.com/allenai/olmocr/releases/tag/v0.4.25"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.4.24",
              "url": "https://github.com/allenai/olmocr/releases/tag/v0.4.24"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.4.21",
              "url": "https://github.com/allenai/olmocr/releases/tag/v0.4.21"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.4.20",
              "url": "https://github.com/allenai/olmocr/releases/tag/v0.4.20"
            }
          ],
          "status": "已收录 12 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "软件开发与交付",
      "desc": "Toolkit for linearizing PDFs for LLM datasets/training",
      "effort": "安装已验证",
      "forks": 1401,
      "icon": "code",
      "name": "olmocr 能力包",
      "risk": "可发布",
      "slug": "olmocr",
      "stars": 17389,
      "tags": [
        "软件开发与交付",
        "开源能力构建",
        "项目能力包",
        "可验证工作流",
        "证据化上下文"
      ],
      "thumb": "gray",
      "type": "Skill Pack"
    },
    "manual": {
      "markdown": "# https://github.com/allenai/olmocr Project Manual\n\nGenerated at: 2026-06-13 23:23:00 UTC\n\n## Table of Contents\n\n- [Installation and Platform Support](#page-1)\n- [Pipeline and Inference Modes](#page-2)\n- [Benchmark Suite and OCR Evaluation](#page-3)\n- [Model Training, Filtering, and Synthetic Data](#page-4)\n\n<a id='page-1'></a>\n\n## Installation and Platform Support\n\n### Related Pages\n\nRelated topics: [Pipeline and Inference Modes](#page-2)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n- [pyproject.toml](https://github.com/allenai/olmocr/blob/main/pyproject.toml)\n- [Dockerfile](https://github.com/allenai/olmocr/blob/main/Dockerfile)\n- [Dockerfile.with-model](https://github.com/allenai/olmocr/blob/main/Dockerfile.with-model)\n- [docs/source/installation.md](https://github.com/allenai/olmocr/blob/main/docs/source/installation.md)\n- [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md)\n- [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md)\n</details>\n\n# Installation and Platform Support\n\n## Overview\n\nolmOCR is a vision-language-model-based OCR pipeline that can be run either against a local NVIDIA GPU or against any remote OpenAI-API-compatible inference server. Installation is split into two tiers — a lightweight CPU-only install for remote-server use, and a heavier install that pulls in PyTorch, vLLM, and other GPU dependencies for local inference. The project is officially tuned for Linux + NVIDIA GPUs; community discussions show that macOS, ROCm, and Windows paths exist but are not first-class supported. 
Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n\n## System Dependencies\n\nolmOCR renders PDF pages to images before sending them to the VLM, which requires a working `poppler-utils` install plus several TrueType font packages so that rendered glyphs look correct. The README specifies the Ubuntu/Debian install line:\n\n```bash\nsudo apt-get update\nsudo apt-get install poppler-utils ttf-mscorefonts-installer msttcorefonts \\\n  fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools\n```\n\nSource: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n\nOn non-Ubuntu systems the user is expected to translate this list to the equivalent platform package manager. Skipping the font packages is a common cause of degraded OCR output on documents that use Caladea, Carlito, MS Core Fonts, or other non-default faces.\n\n## Python Installation\n\nThe project explicitly recommends installing into a fresh Conda environment because the local-GPU dependency set (PyTorch, flash-attn, vLLM) is fragile in pre-existing environments. Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n\n```bash\nconda create -n olmocr python=3.11\nconda activate olmocr\n```\n\nPython 3.11 is the version called out in the README. After activation, the user picks one of two `pip install` options:\n\n| Option | Command | When to use |\n|---|---|---|\n| Remote inference only | `pip install olmocr` | You have a vLLM / OpenAI-API server running elsewhere. Avoids the ~2 GB+ of GPU dependencies. |\n| Local GPU inference | `pip install olmocr[gpu]` (or the full `pip install .` per the README) | You want olmOCR to spawn a local vLLM instance. Requires a recent NVIDIA GPU. 
|\n\nSource: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n\nThe optional extras groups are also relevant for downstream workflows:\n\n- `[train]` — extra packages required by the RLVR training pipeline documented in `olmocr/train/README.md`. That README additionally pins `transformers==4.52.4` and `flash-attn>=2.8.0.post2 --no-build-isolation`. Source: [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md)\n- `[bench]` — used to run `python -m olmocr.bench.benchmark`. Community issue #452 reports that `numpy` was missing from this group and had to be installed manually, so users hitting `ModuleNotFoundError: numpy` on first run should `pip install numpy` explicitly. Source: [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md)\n\n## Platform Support and Known Limitations\n\n```mermaid\nflowchart LR\n    A[Choose install path] --> B{Remote vLLM<br/>server available?}\n    B -- Yes --> C[pip install olmocr<br/>Linux/macOS/Windows]\n    B -- No --> D{NVIDIA GPU<br/>on Linux?}\n    D -- Yes --> E[pip install olmocr gpu extras<br/>poppler-utils + fonts]\n    D -- No --> F[Community paths:<br/>macOS/ROCm/CPU]\n```\n\nThe README states that local inference is tested on a recent NVIDIA GPU. Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n\nPlatforms beyond this are community-driven:\n\n- **macOS / Apple Silicon.** Issue #33 has 28 comments requesting official support. As of release v0.4.27 there is no first-class macOS install path; the recommended workaround is the remote-server install against a vLLM endpoint running on Linux. Source: community context, issue #33.\n- **AMD ROCm.** Issue #111 documents users attempting a 7900 XTX / ROCm install hitting OOM in vLLM and empty responses from a GGUF/ollama fallback. 
Source: community context, issue #111.\n- **Windows.** The CLI works under Windows 11 + Conda + Python 3.11, but issue #459 shows that the default Windows console codepage is `gbk` and corrupts non-ASCII output. The accepted workaround in that thread is to set `PYTHONIOENCODING=utf-8` and run `chcp 65001` from PowerShell before launching olmOCR. Source: community context, issue #459.\n\nThe training stack documented in `olmocr/train/README.md` is explicitly specialized to an 8×H100 Beaker cluster and is not expected to work elsewhere without modification. Source: [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md)\n\n## Docker and Common Installation Pitfalls\n\nFor users who do not want to manage the system dependencies by hand, a `Dockerfile` and a `Dockerfile.with-model` variant are shipped at the repository root. The `with-model` image bundles the `allenai/olmOCR-2-7B-1025-FP8` weights so that the container can run local inference out of the box. Community issue #161 also provides a community-maintained CUDA 12.1 / Ubuntu 22.04 Dockerfile for users who want a known-good base. Source: [Dockerfile](https://github.com/allenai/olmocr/blob/main/Dockerfile), [Dockerfile.with-model](https://github.com/allenai/olmocr/blob/main/Dockerfile.with-model); community context, issue #161.\n\nThe most common first-run failures, drawn from the issue tracker and the in-repo docs, are:\n\n1. **Missing `poppler-utils` or fonts** — pages render as boxes or with missing glyphs; install the apt packages listed above.\n2. **Wrong Python environment** — PyTorch or flash-attn wheels installed into a non-3.11 env cause opaque import errors; recreate the conda env.\n3. **`numpy` / missing `[bench]` extras** — covered in issue #452.\n4. **Malformed CLI help string** — issue #451 documents `argparse` raising on `--help` under Python 3.14, so use Python 3.11 for now. Source: community context, issue #451.\n5. 
**HTTP client timeout in server runner** — `olmocr/bench/runners/run_server.py` defaults to 300 s, which is too short for slow endpoints; issue #455 tracks a request to make this configurable. Source: community context, issue #455.\n\n## See Also\n\n- [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md) — Training-specific environment setup, including dataset preparation.\n- [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md) — Benchmark harness, including the documented document categories.\n- [docs/source/installation.md](https://github.com/allenai/olmocr/blob/main/docs/source/installation.md) — Sphinx-rendered installation guide (mirrors the README).\n- GitHub issues #33, #111, #161, #451, #452, #455, #459 — Platform and install-related discussions referenced above.\n\n---\n\n<a id='page-2'></a>\n\n## Pipeline and Inference Modes\n\n### Related Pages\n\nRelated topics: [Installation and Platform Support](#page-1), [Benchmark Suite and OCR Evaluation](#page-3)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [olmocr/pipeline.py](https://github.com/allenai/olmocr/blob/main/olmocr/pipeline.py)\n- [olmocr/work_queue.py](https://github.com/allenai/olmocr/blob/main/olmocr/work_queue.py)\n- [olmocr/datatypes.py](https://github.com/allenai/olmocr/blob/main/olmocr/datatypes.py)\n- [olmocr/s3_utils.py](https://github.com/allenai/olmocr/blob/main/olmocr/s3_utils.py)\n- [olmocr/prompts/prompts.py](https://github.com/allenai/olmocr/blob/main/olmocr/prompts/prompts.py)\n- [olmocr/image_utils.py](https://github.com/allenai/olmocr/blob/main/olmocr/image_utils.py)\n- [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n</details>\n\n# Pipeline and Inference Modes\n\nThe olmOCR pipeline (`olmocr/pipeline.py`) is the central batch inference system that converts millions of PDF pages into structured Markdown using a 
fine-tuned vision-language model. It is invoked either as `python -m olmocr.pipeline <workspace> ...` or simply `olmocr <workspace> ...`, and exposes a single CLI surface that fans out into several execution and inference backends. The pipeline's responsibility spans PDF ingestion, page-group scheduling, model dispatch (local or remote), retry handling, and writing Dolma-compatible output plus optional Markdown files [Source: [olmocr/pipeline.py:1-50]()] [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\n## Pipeline Responsibilities and Workspace Layout\n\nA pipeline run is anchored to a single `workspace` argument — a local folder or an `s3://bucket/prefix/` path. The workspace stores the work queue, partial results, and the final Dolma-format files (`results/*.jsonl.gz`) preserving the input PDF folder structure. The pipeline is described as a \"Manager for running millions of PDFs through a batch inference pipeline\" in its top-level help text [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\nKey behaviors of the workspace:\n\n- Acts as a work queue when running on S3, allowing multiple worker nodes to coordinate via shared prefixes [Source: [olmocr/work_queue.py:1-80]()] [Source: [olmocr/s3_utils.py:1-120]()].\n- Local workspaces behave identically, but contention is serialized at the filesystem level.\n- Pages are grouped (`--pages_per_group`, default small batch) and each group is dispatched independently. 
Failed groups are retried (`--max_page_retries`) and a configurable error budget (`--max_page_error_rate`) terminates the run when exceeded [Source: [olmocr/pipeline.py:100-180]()].\n- When `--markdown` is passed, an additional `markdown/` folder is populated with human-readable outputs alongside the Dolma JSONL stream [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\n## Local vLLM Inference Mode\n\nIn the default mode, the pipeline spawns a local vLLM server and sends inference requests to it. This requires the GPU-enabled install (`pip install olmocr[gpu] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/`) and exposes a number of vLLM passthrough arguments [Source: [olmocr/pipeline.py:200-260]()] [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\nNotable local-mode flags include:\n\n| Flag | Purpose |\n|------|---------|\n| `--model` | Path or HF repo id (default `allenai/olmOCR-7B-0725-FP8`) |\n| `--gpu-memory-utilization` | Fraction of VRAM allocated to KV-cache (forwarded to `vllm serve`) |\n| `--max_model_len` | Upper bound on KV-cache tokens; lower it if vLLM fails to start |\n| `--tensor-parallel-size` / `-tp` | Tensor parallel degree for vLLM |\n| `--data-parallel-size` / `-dp` | Data parallel degree |\n| `--port` | Port for the local vLLM server |\n| `--apply_filter` | Runs an additional quality filter pass on the parsed output |\n| `--guided_decoding` | Enables guided decoding for YAML-typed model outputs |\n\nThe pipeline also controls how PDF pages are rendered before being sent to the model: `--target_longest_image_dim` controls the rendered image size and `--target_anchor_text_len` caps the amount of anchor text used (legacy models only) [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\n## Remote Inference Server Mode (OpenAI-Compatible)\n\nA second mode targets external vLLM servers, DeepInfra, or any OpenAI-compatible endpoint. 
In this mode the pipeline skips spawning a local vLLM instance and simply issues chat-completions requests over HTTP. This works with the lightweight `pip install olmocr` install (no GPU dependencies) [Source: [olmocr/pipeline.py:300-360]()] [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\nKey arguments for remote/server mode:\n\n- `--server`: URL of an external vLLM or compatible server, e.g. `http://remote-host:8000/v1`. When set, the pipeline does not start a local server.\n- `--api_key`: Bearer token passed via `Authorization` header.\n- `--max_concurrent_requests`: Caps in-flight HTTP requests to the provider.\n- `--workers`: Limits page groups processed simultaneously; setting to `1` ensures one group finishes before the next starts — useful for providers with low concurrency caps.\n- `--pages_per_group`: Smaller groups help on providers with strict concurrent-request ceilings.\n- `--model`: Model identifier (provider-specific; e.g. `allenai/olmOCR-2-7B-1025` on DeepInfra).\n\nExample invocation: `olmocr ./localworkspace --server http://remote-server:8000/v1 --model allenai/olmOCR-2-7B-1025-FP8 --markdown --pdfs *.pdf` [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\n## Multi-Node / Beaker Cluster Mode\n\nFor very large jobs, the pipeline can coordinate multiple workers against an S3-backed workspace. The first worker seeds the queue: `olmocr s3://my_s3_bucket/pdfworkspaces/exampleworkspace --pdfs s3://my_s3_bucket/jakep/gnarly_pdfs/*.pdf`. Subsequent workers simply point at the same prefix and pull from the shared queue [Source: [olmocr/work_queue.py:80-180]()] [Source: [olmocr/s3_utils.py:120-220]()] [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\nAlternatively, the pipeline can be submitted directly to Beaker with `--beaker`, `--beaker_workspace`, `--beaker_cluster`, `--beaker_gpus`, and `--beaker_priority`. 
This path is documented as the standard way to run multi-node jobs at AI2 [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\n```mermaid\nflowchart LR\n    A[User / CLI] --> B[olmocr.pipeline]\n    B --> C{Inference Mode}\n    C -- local GPU --> D[Spawn vLLM]\n    C -- --server --> E[Remote vLLM / OpenAI API]\n    C -- --beaker / S3 --> F[Shared S3 Workspace]\n    D --> G[Page Groups -> Model]\n    E --> G\n    F --> G\n    G --> H[Dolma JSONL + Markdown]\n```\n\n## Common Failure Modes and Community-Reported Issues\n\nSeveral issues trace directly back to pipeline/inference behavior:\n\n- **Empty Markdown output on specific PDFs** (issue #463): the pipeline reports success and writes the markdown file but the body is empty, indicating model output failed validation for that page rather than a crash [Source: [issue #463](https://github.com/allenai/olmocr/issues/463)].\n- **Encoding errors when writing Markdown** (issue #459): Windows + GBK locale cannot encode certain Unicode codepoints emitted by the model. 
Setting `PYTHONIOENCODING=utf-8` or `chcp 65001` before launching the pipeline resolves it [Source: [issue #459](https://github.com/allenai/olmocr/issues/459)].\n- **Hard-coded 300s timeout in server runner** (issue #455): the bench HTTP client in `olmocr/bench/runners/run_server.py` uses a fixed timeout, which is insufficient for some larger page groups; community has requested making it configurable [Source: [issue #455](https://github.com/allenai/olmocr/issues/455)].\n- **DeepInfra model deprecation** (issue #460): `allenai/olmOCR-2-7B-1025` is scheduled for deprecation on DeepInfra on 2026-05-07; users running the remote mode should plan for a self-hosted vLLM endpoint [Source: [issue #460](https://github.com/allenai/olmocr/issues/460)].\n\nA known release-line note: v0.4.27 fixed a \"queue bug in long queues\" relevant to S3-coordinated multi-node runs, so users on older versions should upgrade before scaling out [Source: [release v0.4.27](https://github.com/allenai/olmocr/releases/tag/v0.4.27)].\n\n## See Also\n\n- [Training Guide](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md) — for fine-tuning new olmOCR models.\n- [olmOCR-Bench](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md) — for evaluating pipeline output.\n- [README.md](https://github.com/allenai/olmocr/blob/main/README.md) — top-level usage documentation.\n\n---\n\n<a id='page-3'></a>\n\n## Benchmark Suite and OCR Evaluation\n\n### Related Pages\n\nRelated topics: [Pipeline and Inference Modes](#page-2), [Model Training, Filtering, and Synthetic Data](#page-4)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md)\n- [olmocr/bench/benchmark.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/benchmark.py)\n- 
[olmocr/bench/tests.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/tests.py)\n- [olmocr/bench/prompts.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/prompts.py)\n- [olmocr/bench/runners/run_server.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/runners/run_server.py)\n- [olmocr/bench/runners/run_olmocr_pipeline.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/runners/run_olmocr_pipeline.py)\n- [olmocr/bench/scripts/workspace_to_bench.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/scripts/workspace_to_bench.py)\n- [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n- [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md)\n</details>\n\n# Benchmark Suite and OCR Evaluation\n\n## Purpose and Scope\n\n`olmOCR-Bench` is the project's machine-checkable evaluation harness for document-level OCR systems. It does **not** rely on edit distance or fuzzy text similarity; instead it asserts discrete, hand-authored \"facts\" about each PDF page and grades whether an OCR pipeline recovered them. The design intent, stated in the bench README, is that every test case should be \"very simple, unambiguous, and machine-checkable, similar to a unit test\" so that alternate-but-correct transcriptions are not penalized.\n\nSource: [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md)\n\nThe benchmark operates on **single-page PDFs** so that digital metadata is preserved, and accepts Markdown or plain-text output from any OCR tool. This deliberately decouples evaluation from any specific engine, allowing third-party OCR pipelines (Marker, MinerU, Mistral OCR API, DeepSeek-OCR, etc.) to be ranked on the same scoreboard published in the project root [README.md](https://github.com/allenai/olmocr/blob/main/README.md).\n\n## Document Types and Test Categories\n\nThe suite defines seven categories that historically challenge OCR systems. 
Each category is sourced via a different acquisition strategy documented in [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md):\n\n| Category | Code | Source strategy |\n| --- | --- | --- |\n| arXiv Math | AR | Single-TeX-source arXiv math papers, validated with KaTeX |\n| Old Scans Math | OSM | Public-domain math textbooks from the Internet Archive, manually annotated |\n| Tables | TA | Internally crawled PDFs filtered for tables with Gemini-Flash-2.0 |\n| Old Scans | OS | Library of Congress letters with existing transcriptions |\n| Headers & Footers | HF | DocLayout-YOLO \"abandon\" regions; tests expect text to be **absent** |\n| Multi Column | MC | Claude-Sonnet-3.7 HTML renders of multi-article pages |\n| Long Tiny Text | LTT | Dense small-print archive pages (dictionaries, references) |\n\nTests are stored as JSONL files in the `bench_data/` directory of the dataset (`allenai/olmOCR-bench` on Hugging Face). The header/footer category is unique in that it asserts a string must *not* appear in the linearized output, since correctly excluding recurring page furniture is a strong signal of OCR quality.\n\n## Benchmark Principles and Scoring\n\nThe README enumerates rules that shape how candidate outputs are scored. 
These are worth understanding before debugging a low score:\n\n- Output is treated as plain-text Unicode in a natural reading order; Markdown syntax is **ignored** when matching (so `**enlightenment**` still matches `enlightenment`).\n- Matching is **not position-sensitive**, except for header/footer tests, which constrain search to the first or last N characters.\n- Tables may be Markdown or HTML `<table>`; math must be delimited by `$`, `$$`, `\\(`, or `\\[` and must render with KaTeX.\n- All Unicode is normalized to NFC; hyphens, single quotes, and double quotes are folded to ASCII variants before comparison.\n- Each test belongs to one of the seven categories plus a \"Base\" category used for sanity checks.\n\nThe actual scoring logic lives in `TextPresenceTest.run` and related classes inside `olmocr/bench/tests.py`. As reported in [issue #461](https://github.com/allenai/olmocr/issues/461), `partial_ratio` from RapidFuzz can produce false positives when the candidate `md_content` is much shorter than the queried text (e.g., a single `\\n`), because the partial ratio then compares against a near-empty string. Patches to the test logic must guard against this degenerate case.\n\n## Running the Benchmark\n\nA typical run follows the recipe documented in the bench README:\n\n```bash\n# 1. Install GPU extras for local inference\npip install olmocr[gpu] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/\n\n# 2. Convert benchmark PDFs into model output using a runner\npython -m olmocr.bench.convert olmocr_pipeline --dir ./olmOCR-bench/bench_data\n\n# 3. Score the outputs\npython -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data\n```\n\nTwo runners are shipped under `olmocr/bench/runners/`:\n\n- `run_olmocr_pipeline.py` invokes the local `olmocr.pipeline` to render each PDF into Markdown inside a workspace directory. 
The helper `olmocr/bench/scripts/workspace_to_bench.py` then copies those outputs into the directory layout the grader expects.\n- `run_server.py` posts the PDFs to an OpenAI-compatible endpoint (local vLLM, DeepInfra, or any compatible API). [Issue #455](https://github.com/allenai/olmocr/issues/455) tracks a feature request to expose a configurable HTTP timeout, since the current default of 300 seconds is insufficient for slow backends or large pages.\n\nThe CLI entry point in `olmocr/bench/benchmark.py` has surfaced at least two regressions: a malformed `argparse` help string ([issue #451](https://github.com/allenai/olmocr/issues/451)) and a missing `numpy` import in the `[bench]` extras ([issue #452](https://github.com/allenai/olmocr/issues/452)). Install the dependency with `pip install numpy` if the benchmark entry point raises `ModuleNotFoundError` immediately on launch.\n\nFor annotation review, the README points to an internal tool:\n\n```bash\npython -m olmocr.bench.review_app --port 5000 --debug \\\n    ./olmOCR-bench/bench_data/multi_column.jsonl --force\n```\n\n## Common Failure Modes Seen by Users\n\nThe community has filed several reports that intersect with the benchmark workflow:\n\n- **Empty markdown output for a specific page.** Running `olmocr output/ --pdfs bench_data/pdfs/headers_footers/<hash>_page_3.pdf --model /root/olmOCR-2-7B-1025-FP8/ --markdown` produced an empty Markdown file ([issue #463](https://github.com/allenai/olmocr/issues/463)). Because header/footer tests assert *absence*, an empty candidate actually scores well on HF tests but fails Base and content-bearing categories, which is a useful diagnostic signal.\n- **GBK encoding crash on Windows.** [Issue #459](https://github.com/allenai/olmocr/issues/459) describes a `'gbk' codec can't encode character '\\u1eca'` error during Markdown writing. 
Setting `PYTHONIOENCODING=utf-8` (or `chcp 65001` in PowerShell) before invoking the pipeline resolves the failure when running under the default Windows codepage.\n- **DeepInfra deprecation.** [Issue #460](https://github.com/allenai/olmocr/issues/460) notes that the hosted `allenai/olmOCR-2-7B-1025` snapshot will be retired on 2026-05-07. Users scoring against the server runner should pin to a self-hosted model or migrate to the new release.\n- **Scoring inversion for near-empty candidates.** As noted above, [issue #461](https://github.com/allenai/olmocr/issues/461) reports that `TextPresenceTest.run` in [olmocr/bench/tests.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/tests.py) can invert its decision when the candidate collapses to whitespace. When investigating \"all tests pass\" results for a broken page, check the candidate file directly.\n\n## Integration with Training\n\nThe same JSONL schema is reused inside the GRPO trainer. Per [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md), `mine_html_templates.py` synthesizes additional benchmark-style cases (`olmocr-synthmix-1025`), and the trainer script accepts a `--reward_bench` weight so that bench-style presence/absence tests participate directly in the reinforcement-learning reward signal. 
This is the mechanism behind the v0.4.0 release's ~4-point jump on the published scoreboard, where the strongest olmOCR release (v0.4.0) reaches an Overall score of 82.4±1.1.\n\n## See Also\n\n- olmOCR Pipeline and Inference\n- Training olmOCR Models (GRPO + Synthetic Data)\n- Dolma Output Viewer and Workspace Format\n- Issue #451, #452, #455, #459, #460, #461, #463 on GitHub for known bench regressions and platform quirks.\n\n---\n\n<a id='page-4'></a>\n\n## Model Training, Filtering, and Synthetic Data\n\n### Related Pages\n\nRelated topics: [Pipeline and Inference Modes](#page-2), [Benchmark Suite and OCR Evaluation](#page-3)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md)\n- [olmocr/train/train.py](https://github.com/allenai/olmocr/blob/main/olmocr/train/train.py)\n- [olmocr/train/grpo_train.py](https://github.com/allenai/olmocr/blob/main/olmocr/train/grpo_train.py)\n- [olmocr/train/config.py](https://github.com/allenai/olmocr/blob/main/olmocr/train/config.py)\n- [olmocr/train/dataloader.py](https://github.com/allenai/olmocr/blob/main/olmocr/train/dataloader.py)\n- [olmocr/data/buildsilver.py](https://github.com/allenai/olmocr/blob/main/olmocr/data/buildsilver.py)\n- [olmocr/data/prepare_olmocrmix.py](https://github.com/allenai/olmocr/blob/main/olmocr/data/prepare_olmocrmix.py)\n- [olmocr/data/filter.py](https://github.com/allenai/olmocr/blob/main/olmocr/data/filter.py)\n- [olmocr/synth/mine_html_templates.py](https://github.com/allenai/olmocr/blob/main/olmocr/synth/mine_html_templates.py)\n- [olmocr/pipeline.py](https://github.com/allenai/olmocr/blob/main/olmocr/pipeline.py)\n- [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md)\n- [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n</details>\n\n# Model Training, Filtering, and Synthetic 
Data\n\n## Overview\n\nolmOCR's model training, filtering, and synthetic data pipeline is the machinery that produces fine-tuned OCR models such as `allenai/olmOCR-2-7B-1025-FP8` (referenced in [README.md](https://github.com/allenai/olmocr/blob/main/README.md)). The pipeline combines three concerns:\n\n1. **Training** — supervised fine-tuning and GRPO-based reinforcement learning on PDF/markdown pairs, orchestrated in [olmocr/train/](https://github.com/allenai/olmocr/tree/main/olmocr/train).\n2. **Filtering and data preparation** — converting raw PDFs and human/LLM-generated markdown into the single-page layout that the trainer consumes, handled in [olmocr/data/](https://github.com/allenai/olmocr/tree/main/olmocr/data).\n3. **Synthetic data** — generating additional training examples from HTML templates to bolster coverage of hard document types, handled in [olmocr/synth/](https://github.com/allenai/olmocr/tree/main/olmocr/synth).\n\nThe README frames the broader workflow as: \"Processing millions of PDFs through a finetuned model using VLLM\" via [olmocr/pipeline.py](https://github.com/allenai/olmocr/blob/main/olmocr/pipeline.py), and a \"Filtering language identification\" helper, both of which feed back into the next training round.\n\n```mermaid\nflowchart LR\n    A[Raw PDFs] --> B[olmocr/pipeline.py]\n    B --> C[olmocr.data.filter]\n    C --> D[prepare_workspace / prepare_olmocrmix]\n    E[mine_html_templates.py] --> F[Synthetic Markdown]\n    D --> G[olmocr-synthmix-1025]\n    F --> G\n    G --> H[train.py / grpo_train.py]\n    H --> I[Fine-tuned olmOCR Model]\n```\n\n## Training Pipeline\n\nThe training stack lives under [olmocr/train/](https://github.com/allenai/olmocr/tree/main/olmocr/train) and exposes two entry points:\n\n- `train.py` — supervised fine-tuning, described in [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n- `grpo_train.py` — reinforcement learning with unit-test rewards (see [olmocr2 bibtex 
entry in README.md](https://github.com/allenai/olmocr/blob/main/README.md), titled \"olmOCR 2: Unit Test Rewards for Document OCR\").\n\n### Environment and dependencies\n\nTraining extends the base install with the `[train]` extra plus pinned versions:\n\n```bash\npip install .[train]\npip install transformers==4.52.4\npip install flash-attn>=2.8.0.post2 --no-build-isolation\n```\n\nSource: [olmocr/train/README.md:21-25](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n\n### Hardware layout\n\nThe training guide states the run is \"performed on an 8xH100 GPU node. One GPU is dedicated to running VLLM, while the other 7 are used to run training.\" Because the harness splits one GPU off for vLLM-based generation, a single H100 node is the minimum viable target, which is why community members have asked for macOS and ROCm support (issues [#33](https://github.com/allenai/olmocr/issues/33) and [#111](https://github.com/allenai/olmocr/issues/111)).\n\n### GRPO hyperparameters\n\nThe provided training script ([scripts/train/grpotrainer-beaker-multi-gpu-augusta.sh](https://github.com/allenai/olmocr/blob/main/scripts/train/grpotrainer-beaker-multi-gpu-augusta.sh), as quoted in the training README) configures the run with the following key parameters, surfaced as flags on `grpo_train.py`:\n\n| Flag | Example value | Meaning |\n|---|---|---|\n| `--reward_bench` | `1.0` | Weight for olmOCR-bench unit-test reward |\n| `--reward_front_matter` | `1.0` | Weight for YAML front-matter validity |\n| `--reward_eos` | `1.0` | Weight for proper end-of-sequence behavior |\n| `--beta` | `0.01` | KL penalty in GRPO objective |\n| `--learning_rate` | `2e-6` | Optimizer learning rate |\n| `--gradient_accumulation_steps` | `28` | Effective batch-size multiplier |\n| `--num-gpus` | `8` | Total GPUs on the Beaker node |\n\nSource: [olmocr/train/README.md:9-12](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n\n### Data loader and 
config\n\n`dataloader.py` is responsible for reading single-page PDFs and their paired markdown annotations, while `config.py` centralizes the YAML-driven hyperparameters consumed by `train.py` and `grpo_train.py`. The training README explicitly requires \"Each PDF needs to be a single page only!\", which means long documents must be split before training, and the markdown front matter follows a defined schema:\n\n```markdown\n---\nprimary_language: en\nis_rotation_valid: True\nrotation_correction: 0\nis_table: False\nis_diagram: False\n---\n```\n\nSource: [olmocr/train/README.md:33-46](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md). The fields `is_table` and `is_diagram` line up with the page-classification features the inference path uses, and `rotation_correction` corresponds to the deskew step in [olmocr/pipeline.py](https://github.com/allenai/olmocr/blob/main/olmocr/pipeline.py).\n\n## Data Preparation and Filtering\n\nThe data side has two recommended entry points documented in the training README:\n\n1. **`prepare_olmocrmix.py`** — pulls a subset of [allenai/olmOCR-mix-1025](https://huggingface.co/datasets/allenai/olmOCR-mix-1025) and unpacks it into the single-page PDF / markdown layout the trainer expects. Example invocation:\n\n   ```bash\n   python -m olmocr.data.prepare_olmocrmix \\\n     --dataset-path allenai/olmOCR-mix-1025 \\\n     --destination ~/olmOCR-mix-1025-extracted \\\n     --subset 01_books --split eval\n   ```\n\n   Source: [olmocr/train/README.md:55-60](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n\n2. **`prepare_workspace`** — reuses the user's own olmOCR inference output. 
After running `python -m olmocr.pipeline ./localworkspace --pdfs /home/username/pdfs/*.pdf`, the workspace contains Dolma-formatted JSON; `prepare_workspace` reshapes that into the PDF/markdown layout, which is convenient for fine-tuning on a domain-specific corpus.\n\n   Source: [olmocr/train/README.md:62-66](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n\nFiltering sits in front of this: the main [README.md](https://github.com/allenai/olmocr/blob/main/README.md) links to a \"Filtering language identification\" tool, exposed as [olmocr/data/filter.py](https://github.com/allenai/olmocr/blob/main/olmocr/data/filter.py). This stage drops pages whose detected language does not match `primary_language` in the front matter and removes noisy rows before they reach the dataloader. `buildsilver.py` is the higher-level utility that turns raw PDFs into a \"silver\" training set by running an earlier checkpoint over them and keeping the outputs that pass the filters.\n\n## Synthetic Data\n\nThe synthetic-data module in [olmocr/synth/mine_html_templates.py](https://github.com/allenai/olmocr/blob/main/olmocr/synth/mine_html_templates.py) generates additional training documents from HTML templates. The training README states that the resulting mix is \"in the same format as other olmOCR-bench test cases,\" which means a synthetic dataset can be scored directly with [olmocr/bench/](https://github.com/allenai/olmocr/tree/main/olmocr/bench) and folded into `grpo_train.py` as `train_bench_data_folder` (e.g. 
`/data/jakep/grpo_data_mixes/olmocr-synthmix-1025-v2-rotate10p/bench_data`).\n\nSource: [olmocr/train/README.md:6-8](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n\nSynthetic data is most useful for the document categories olmOCR historically struggles with — the same seven categories documented in [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md) (ArXiv math, Old Scans math, Tables, Old Scans, Headers/Footers, Multi-Column, Long Tiny Text). When a category is under-represented in human-labeled data, mined templates can be converted into `(single_page_pdf, markdown)` pairs and pushed through the same filter + GRPO loop.\n\n## Community Notes and Failure Modes\n\n- **Platform support gap.** The training stack requires NVIDIA GPUs and CUDA-specific kernels such as `flash-attn`; this is the root cause of the long-running macOS (issue [#33](https://github.com/allenai/olmocr/issues/33)) and ROCm (issue [#111](https://github.com/allenai/olmocr/issues/111)) requests. The community has produced unofficial CUDA-based Dockerfiles (issue [#161](https://github.com/allenai/olmocr/issues/161)) and a Gradio-based test harness (issue [#75](https://github.com/allenai/olmocr/issues/75)) to lower the bar, but training itself is not yet cross-platform.\n- **Encoding on Windows.** When running downstream pipeline stages on Windows, non-ASCII characters can fail with `'gbk' codec can't encode character` (issue [#459](https://github.com/allenai/olmocr/issues/459)). 
Setting `PYTHONIOENCODING=utf-8` and `chcp 65001` is the documented workaround.\n- **External model hosting deprecation.** `allenai/olmOCR-2-7B-1025` hosted on DeepInfra is being deprecated on 2026-05-07 (issue [#460](https://github.com/allenai/olmocr/issues/460)); users who cannot migrate to local vLLM should re-export the model through a self-hosted OpenAI-compatible endpoint, which `pipeline.py` consumes via the `--server` flag.\n- **Scoring side-effects during training.** Reward signals that include olmOCR-bench tests are sensitive to near-empty candidates — `TextPresenceTest.run` in [olmocr/bench/tests.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/tests.py) can falsely pass when a candidate is just `\"\\n\"` (issue [#461](https://github.com/allenai/olmocr/issues/461)). Practitioners using `--reward_bench` should sanity-check their reward values during GRPO rollouts.\n- **Unparsed PDFs surface as training noise.** Issue [#463](https://github.com/allenai/olmocr/issues/463) shows that some pages render to an empty markdown; these should be filtered out by the language/rotation filters in `filter.py` before being added to the next training mix, otherwise the GRPO reward may collapse.\n\n## See Also\n\n- [Pipeline and Inference](./Pipeline-and-Inference.md)\n- [olmOCR-Bench Evaluation](./OlmOCR-Bench-Evaluation.md)\n- [Data Formats and Workspace Layout](./Data-Formats-and-Workspace-Layout.md)\n\n---\n\n<!-- evidence_pipeline_checked: true -->\n<!-- evidence_injected: true -->\n\n---\n\n## Pitfall Log\n\nProject: allenai/olmocr\n\nSummary: Found 13 structured pitfall item(s), including 0 high/blocking item(s). Top priority: Installation risk - Installation risk requires verification.\n\n## 1. Installation risk - Installation risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags an installation risk. 
Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/461\n\n## 2. Configuration risk - Configuration risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a configuration risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/455\n\n## 3. Capability evidence risk - Capability evidence risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: README/documentation is current enough for a first validation pass.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: capability.assumptions | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 4. Maintenance risk - Maintenance risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/463\n\n## 5. 
Maintenance risk - Maintenance risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/460\n\n## 6. Maintenance risk - Maintenance risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 7. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: no_demo\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: downstream_validation.risk_items | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 8. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: no_demo\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: risks.scoring_risks | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 9. 
Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/459\n\n## 10. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/451\n\n## 11. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/452\n\n## 12. 
Maintenance risk - Maintenance risk requires verification\n\n- Severity: low\n- Evidence strength: source_linked\n- Finding: issue_or_pr_quality=unknown.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 13. Maintenance risk - Maintenance risk requires verification\n\n- Severity: low\n- Evidence strength: source_linked\n- Finding: release_recency=unknown.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr\n\n<!-- canonical_name: allenai/olmocr; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "olmocr",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "github_repo:858798469",
          "kind": "repo",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/allenai/olmocr"
        },
        {
          "evidence_id": "art_26f093aeaf1d44ee9d8f7b44b889ad92",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/allenai/olmocr#readme"
        }
      ],
      "summary": "Full DeepWiki/Human Wiki output with Doramagic pitfall notes appended.",
      "title": "olmocr Manual",
      "toc": [
        "https://github.com/allenai/olmocr Project Manual",
        "Table of Contents",
        "Installation and Platform Support",
        "Overview",
        "System Dependencies",
        "Python Installation",
        "Platform Support and Known Limitations",
        "Docker and Common Installation Pitfalls",
        "Pitfall Log"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "f7cfe4c22098b154c76b6ec950d1c0a464eecf8d",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "Dockerfile",
      "pyproject.toml",
      "README.md",
      "docs/source/overview.md",
      "docs/source/CHANGELOG.md",
      "docs/source/conf.py",
      "docs/source/index.md",
      "docs/source/CONTRIBUTING.md",
      "docs/source/installation.md"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# olmocr - Doramagic AI Context Pack\n\n> Purpose: pre-work context for the user's host AI. This pack does not prove that the project has been installed, run, or validated.\n\n## Project\n\n- canonical_name: `allenai/olmocr`\n- capability: Toolkit for linearizing PDFs for LLM datasets/training\n- expected_user_outcome: Toolkit for linearizing PDFs for LLM datasets/training\n\n## Operating Boundaries\n\n- Do not claim that the project has been installed, run, called through an API, or used on local files unless separate evidence proves it.\n- Project facts must come from repo evidence, Claim Graph, or explicit source references.\n- When a capability is not verified, mark it as unverified instead of completing it as fact.\n- publish_status: `publishable`\n- blocking_gaps: none\n\n---\n\n## Doramagic Context Augmentation\n\nThe following sections strengthen the repository context for a host AI. Human Manual data is a reading route, and pitfall notes become operating constraints.\n\n## Human Manual Outline\n\nUsage rule: this is only a reading route and salience signal, not factual authority. 
Concrete claims must still return to repo evidence or Claim Graph.\n\nHost AI hard rules:\n- Do not treat page titles, section order, summaries, or importance values as factual project evidence.\n- When explaining the Human Manual outline, state that it is only a reading route or salience signal.\n- Capability, installation, compatibility, runtime state, and risk claims must cite repo evidence, source paths, or Claim Graph.\n\n- **Installation and Platform Support**: importance `high`\n  - source_paths: README.md, pyproject.toml, Dockerfile, Dockerfile.with-model, docs/source/installation.md\n- **Pipeline and Inference Modes**: importance `high`\n  - source_paths: olmocr/pipeline.py, olmocr/work_queue.py, olmocr/datatypes.py, olmocr/s3_utils.py, olmocr/prompts/prompts.py\n- **Benchmark Suite and OCR Evaluation**: importance `high`\n  - source_paths: olmocr/bench/README.md, olmocr/bench/benchmark.py, olmocr/bench/tests.py, olmocr/bench/runners/run_server.py, olmocr/bench/runners/run_olmocr_pipeline.py\n- **Model Training, Filtering, and Synthetic Data**: importance `medium`\n  - source_paths: olmocr/train/train.py, olmocr/train/grpo_train.py, olmocr/train/config.py, olmocr/train/dataloader.py, olmocr/data/buildsilver.py\n\n## Repo Inspection Evidence\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `f7cfe4c22098b154c76b6ec950d1c0a464eecf8d`\n- inspected_files: `Dockerfile`, `pyproject.toml`, `README.md`, `docs/source/overview.md`, `docs/source/CHANGELOG.md`, `docs/source/conf.py`, `docs/source/index.md`, `docs/source/CONTRIBUTING.md`, `docs/source/installation.md`\n\nHost AI hard rules:\n- Without repo_clone_verified=true, do not claim that the source code has been read.\n- Without repo_inspection_verified=true, do not write README, docs, or package-file conclusions as facts.\n- Without quick_start_verified=true, do not claim that the Quick Start path has run successfully.\n\n## Doramagic Pitfall Constraints\n\nThese rules come from 
Doramagic discovery, validation, or compilation findings. The host AI must treat them as operating constraints, not background notes.\n\n### Constraint 1: Installation risk requires verification\n\n- Trigger: Project evidence flags an installation risk. Review the linked source before relying on this workflow.\n- Host AI rule: Reproduce the official install and quickstart path in an isolated environment.\n- Why it matters: May increase setup, validation, or first-run risk for the user.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/461\n- Hard boundary: Do not present this pitfall as solved, verified, or ignorable unless later evidence explicitly closes it.\n\n### Constraint 2: Configuration risk requires verification\n\n- Trigger: Project evidence flags a configuration risk. Review the linked source before relying on this workflow.\n- Host AI rule: Reproduce the official install and quickstart path in an isolated environment.\n- Why it matters: May increase setup, validation, or first-run risk for the user.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/455\n- Hard boundary: Do not present this pitfall as solved, verified, or ignorable unless later evidence explicitly closes it.\n\n### Constraint 3: Capability evidence risk requires verification\n\n- Trigger: README/documentation is current enough for a first validation pass.\n- Host AI rule: Reproduce the official install and quickstart path in an isolated environment.\n- Why it matters: May increase setup, validation, or first-run risk for the user.\n- Evidence: capability.assumptions | github_repo:858798469 | https://github.com/allenai/olmocr\n- Hard boundary: Do not present this pitfall as solved, verified, or ignorable unless later evidence explicitly closes it.\n\n### Constraint 4: Maintenance risk requires verification\n\n- Trigger: Project evidence flags a maintenance risk. 
Review the linked source before relying on this workflow.\n- Host AI rule: Reproduce the official install and quickstart path in an isolated environment.\n- Why it matters: May increase setup, validation, or first-run risk for the user.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/463\n- Hard boundary: Do not present this pitfall as solved, verified, or ignorable unless later evidence explicitly closes it.\n\n### Constraint 5: Maintenance risk requires verification\n\n- Trigger: Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.\n- Host AI rule: Reproduce the official install and quickstart path in an isolated environment.\n- Why it matters: May increase setup, validation, or first-run risk for the user.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/460\n- Hard boundary: Do not present this pitfall as solved, verified, or ignorable unless later evidence explicitly closes it.\n\n### Constraint 6: Maintenance risk requires verification\n\n- Trigger: Project evidence flags a maintenance risk. 
Review the linked source before relying on this workflow.\n- Host AI rule: Reproduce the official install and quickstart path in an isolated environment.\n- Why it matters: May increase setup, validation, or first-run risk for the user.\n- Evidence: evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr\n- Hard boundary: Do not present this pitfall as solved, verified, or ignorable unless later evidence explicitly closes it.\n\n### Constraint 7: Security or permission risk requires verification\n\n- Trigger: no_demo\n- Host AI rule: Reproduce the official install and quickstart path in an isolated environment.\n- Why it matters: May increase setup, validation, or first-run risk for the user.\n- Evidence: downstream_validation.risk_items | github_repo:858798469 | https://github.com/allenai/olmocr\n- Hard boundary: Do not present this pitfall as solved, verified, or ignorable unless later evidence explicitly closes it.\n\n### Constraint 8: Security or permission risk requires verification\n\n- Trigger: no_demo\n- Host AI rule: Reproduce the official install and quickstart path in an isolated environment.\n- Why it matters: May increase setup, validation, or first-run risk for the user.\n- Evidence: risks.scoring_risks | github_repo:858798469 | https://github.com/allenai/olmocr\n- Hard boundary: Do not present this pitfall as solved, verified, or ignorable unless later evidence explicitly closes it.\n\n### Constraint 9: Security or permission risk requires verification\n\n- Trigger: Project evidence flags a security or permission risk. 
Review the linked source before relying on this workflow.\n- Host AI rule: Reproduce the official install and quickstart path in an isolated environment.\n- Why it matters: May increase setup, validation, or first-run risk for the user.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/459\n- Hard boundary: Do not present this pitfall as solved, verified, or ignorable unless later evidence explicitly closes it.\n\n### Constraint 10: Security or permission risk requires verification\n\n- Trigger: Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.\n- Host AI rule: Reproduce the official install and quickstart path in an isolated environment.\n- Why it matters: May increase setup, validation, or first-run risk for the user.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/451\n- Hard boundary: Do not present this pitfall as solved, verified, or ignorable unless later evidence explicitly closes it.\n",
      "summary": "Context and operating boundaries for the user's host AI.",
      "title": "AI Context Pack"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card\n\nProject: allenai/olmocr\n\n## Doramagic Trial Decision\n\nCurrent decision: ready for pre-publication recommendation checks. First use should still start with least privilege, a temporary directory, and rollback.\n\n## What The User Can Do Now\n\n- Read the Human Manual first to understand purpose and main workflows.\n- Copy the Prompt Preview for a pre-install trial. This checks interaction feel, not real execution.\n- Test the official Quick Start command in an isolated environment before using a primary machine.\n\n## What Not To Do Yet\n\n- Do not treat Prompt Preview output as an actual project run.\n- Do not treat metadata-only validation as sandbox install validation.\n- Do not write unverified capabilities as supported, tested, or safe to install.\n- Do not provide production data, private files, real secrets, or primary configuration directories on first use.\n\n## Pre-install Checklist\n\n- Host AI match: local_cli\n- Official install entry state: official entry found\n- Verification location: temporary directory, temporary host, or container required\n- Rollback readiness: required\n- API keys, network access, file writes, or host configuration changes: treat as high risk until confirmed\n- Install command, actual output, and failure logs: must be recorded\n\n## Current Blockers\n\n- No blockers.\n\n## Project-specific Pitfalls\n\n- Installation risk requires verification (medium): May increase setup, validation, or first-run risk for the user. Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Configuration risk requires verification (medium): May increase setup, validation, or first-run risk for the user. Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Capability evidence risk requires verification (medium): May increase setup, validation, or first-run risk for the user. 
Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Maintenance risk requires verification (medium): May increase setup, validation, or first-run risk for the user. Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Maintenance risk requires verification (medium): May increase setup, validation, or first-run risk for the user. Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n\n## Risk And Permission Notes\n\n- no_demo: medium\n\n## Evidence Gaps\n\n- No structured evidence gaps found.\n",
      "summary": "Install, permission, validation, and recommendation risks.",
      "title": "Boundary & Risk Card"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/allenai/olmocr Project Manual\n\nGenerated at: 2026-06-13 23:23:00 UTC\n\n## Table of Contents\n\n- [Installation and Platform Support](#page-1)\n- [Pipeline and Inference Modes](#page-2)\n- [Benchmark Suite and OCR Evaluation](#page-3)\n- [Model Training, Filtering, and Synthetic Data](#page-4)\n\n<a id='page-1'></a>\n\n## Installation and Platform Support\n\n### Related Pages\n\nRelated topics: [Pipeline and Inference Modes](#page-2)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n- [pyproject.toml](https://github.com/allenai/olmocr/blob/main/pyproject.toml)\n- [Dockerfile](https://github.com/allenai/olmocr/blob/main/Dockerfile)\n- [Dockerfile.with-model](https://github.com/allenai/olmocr/blob/main/Dockerfile.with-model)\n- [docs/source/installation.md](https://github.com/allenai/olmocr/blob/main/docs/source/installation.md)\n- [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md)\n- [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md)\n</details>\n\n# Installation and Platform Support\n\n## Overview\n\nolmOCR is a vision-language-model-based OCR pipeline that can be run either against a local NVIDIA GPU or against any remote OpenAI-API-compatible inference server. Installation is split into two tiers — a lightweight CPU-only install for remote-server use, and a heavier install that pulls in PyTorch, vLLM, and other GPU dependencies for local inference. The project is officially tuned for Linux + NVIDIA GPUs; community discussions show that macOS, ROCm, and Windows paths exist but are not first-class supported. 
Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n\n## System Dependencies\n\nolmOCR renders PDF pages to images before sending them to the VLM, which requires a working `poppler-utils` install plus several TrueType font packages so that rendered glyphs look correct. The README specifies the Ubuntu/Debian install line:\n\n```bash\nsudo apt-get update\nsudo apt-get install poppler-utils ttf-mscorefonts-installer msttcorefonts \\\n  fonts-crosextra-caladea fonts-crosextra-carlito gsfonts lcdf-typetools\n```\n\nSource: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n\nOn non-Ubuntu systems the user is expected to translate this list to the equivalent platform package manager. Skipping the font packages is a common cause of degraded OCR output on documents that use Caladea, Carlito, MS Core Fonts, or other non-default faces.\n\n## Python Installation\n\nThe project explicitly recommends installing into a fresh Conda environment because the local-GPU dependency set (PyTorch, flash-attn, vLLM) is fragile in pre-existing environments. Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n\n```bash\nconda create -n olmocr python=3.11\nconda activate olmocr\n```\n\nPython 3.11 is the version called out in the README. After activation, the user picks one of two `pip install` options:\n\n| Option | Command | When to use |\n|---|---|---|\n| Remote inference only | `pip install olmocr` | You have a vLLM / OpenAI-API server running elsewhere. Avoids the ~2 GB+ of GPU dependencies. |\n| Local GPU inference | `pip install olmocr[gpu]` (or the full `pip install .` per the README) | You want olmOCR to spawn a local vLLM instance. Requires a recent NVIDIA GPU. 
|\n\nSource: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n\nThe optional extras groups are also relevant for downstream workflows:\n\n- `[train]` — extra packages required by the RLVR training pipeline documented in `olmocr/train/README.md`. That README additionally pins `transformers==4.52.4` and `flash-attn>=2.8.0.post2 --no-build-isolation`. Source: [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md)\n- `[bench]` — used to run `python -m olmocr.bench.benchmark`. Community issue #452 reports that `numpy` was missing from this group and had to be installed manually, so users hitting `ModuleNotFoundError: numpy` on first run should `pip install numpy` explicitly. Source: [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md)\n\n## Platform Support and Known Limitations\n\n```mermaid\nflowchart LR\n    A[Choose install path] --> B{Remote vLLM<br/>server available?}\n    B -- Yes --> C[pip install olmocr<br/>Linux/macOS/Windows]\n    B -- No --> D{NVIDIA GPU<br/>on Linux?}\n    D -- Yes --> E[pip install olmocr gpu extras<br/>poppler-utils + fonts]\n    D -- No --> F[Community paths:<br/>macOS/ROCm/CPU]\n```\n\nThe README states that local inference is tested on a recent NVIDIA GPU. Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n\nPlatforms beyond this are community-driven:\n\n- **macOS / Apple Silicon.** Issue #33 has 28 comments requesting official support. As of release v0.4.27 there is no first-class macOS install path; the recommended workaround is the remote-server install against a vLLM endpoint running on Linux. Source: community context, issue #33.\n- **AMD ROCm.** Issue #111 documents users attempting a 7900 XTX / ROCm install hitting OOM in vLLM and empty responses from a GGUF/ollama fallback. 
Source: community context, issue #111.\n- **Windows.** The CLI works under Windows 11 + Conda + Python 3.11, but issue #459 shows that the default Windows console codepage is `gbk` and corrupts non-ASCII output. The accepted workaround in that thread is to set `PYTHONIOENCODING=utf-8` and run `chcp 65001` from PowerShell before launching olmOCR. Source: community context, issue #459.\n\nThe training stack documented in `olmocr/train/README.md` is explicitly specialized to an 8×H100 Beaker cluster and is not expected to work elsewhere without modification. Source: [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md)\n\n## Docker and Common Installation Pitfalls\n\nFor users who do not want to manage the system dependencies by hand, a `Dockerfile` and a `Dockerfile.with-model` variant are shipped at the repository root. The `with-model` image bundles the `allenai/olmOCR-2-7B-1025-FP8` weights so that the container can run local inference out of the box. Community issue #161 also provides a community-maintained CUDA 12.1 / Ubuntu 22.04 Dockerfile for users who want a known-good base. Source: [Dockerfile](https://github.com/allenai/olmocr/blob/main/Dockerfile), [Dockerfile.with-model](https://github.com/allenai/olmocr/blob/main/Dockerfile.with-model); community context, issue #161.\n\nThe most common first-run failures, drawn from the issue tracker and the in-repo docs, are:\n\n1. **Missing `poppler-utils` or fonts** — pages render as boxes or with missing glyphs; install the apt packages listed above.\n2. **Wrong Python environment** — PyTorch or flash-attn wheels installed into a non-3.11 env cause opaque import errors; recreate the conda env.\n3. **`numpy` / missing `[bench]` extras** — covered in issue #452.\n4. **Malformed CLI help string** — issue #451 documents `argparse` raising on `--help` under Python 3.14, so use Python 3.11 for now. Source: community context, issue #451.\n5. 
**HTTP client timeout in server runner** — `olmocr/bench/runners/run_server.py` defaults to 300 s, which is too short for slow endpoints; issue #455 tracks a request to make this configurable. Source: community context, issue #455.\n\n## See Also\n\n- [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md) — Training-specific environment setup, including dataset preparation.\n- [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md) — Benchmark harness, including the documented document categories.\n- [docs/source/installation.md](https://github.com/allenai/olmocr/blob/main/docs/source/installation.md) — Sphinx-rendered installation guide (mirrors the README).\n- GitHub issues #33, #111, #161, #451, #452, #455, #459 — Platform and install-related discussions referenced above.\n\n---\n\n<a id='page-2'></a>\n\n## Pipeline and Inference Modes\n\n### Related Pages\n\nRelated topics: [Installation and Platform Support](#page-1), [Benchmark Suite and OCR Evaluation](#page-3)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [olmocr/pipeline.py](https://github.com/allenai/olmocr/blob/main/olmocr/pipeline.py)\n- [olmocr/work_queue.py](https://github.com/allenai/olmocr/blob/main/olmocr/work_queue.py)\n- [olmocr/datatypes.py](https://github.com/allenai/olmocr/blob/main/olmocr/datatypes.py)\n- [olmocr/s3_utils.py](https://github.com/allenai/olmocr/blob/main/olmocr/s3_utils.py)\n- [olmocr/prompts/prompts.py](https://github.com/allenai/olmocr/blob/main/olmocr/prompts/prompts.py)\n- [olmocr/image_utils.py](https://github.com/allenai/olmocr/blob/main/olmocr/image_utils.py)\n- [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n</details>\n\n# Pipeline and Inference Modes\n\nThe olmOCR pipeline (`olmocr/pipeline.py`) is the central batch inference system that converts millions of PDF pages into structured Markdown using a 
fine-tuned vision-language model. It is invoked either as `python -m olmocr.pipeline <workspace> ...` or simply `olmocr <workspace> ...`, and exposes a single CLI surface that fans out into several execution and inference backends. The pipeline's responsibility spans PDF ingestion, page-group scheduling, model dispatch (local or remote), retry handling, and writing Dolma-compatible output plus optional Markdown files [Source: [olmocr/pipeline.py:1-50]()] [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\n## Pipeline Responsibilities and Workspace Layout\n\nA pipeline run is anchored to a single `workspace` argument — a local folder or an `s3://bucket/prefix/` path. The workspace stores the work queue, partial results, and the final Dolma-format files (`results/*.jsonl.gz`) preserving the input PDF folder structure. The pipeline is described as a \"Manager for running millions of PDFs through a batch inference pipeline\" in its top-level help text [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\nKey behaviors of the workspace:\n\n- Acts as a work queue when running on S3, allowing multiple worker nodes to coordinate via shared prefixes [Source: [olmocr/work_queue.py:1-80]()] [Source: [olmocr/s3_utils.py:1-120]()] .\n- Local workspaces behave identically, but contention is serialized at the filesystem level.\n- Pages are grouped (`--pages_per_group`, default small batch) and each group is dispatched independently. 
Failed groups are retried (`--max_page_retries`) and a configurable error budget (`--max_page_error_rate`) terminates the run when exceeded [Source: [olmocr/pipeline.py:100-180]()] .\n- When `--markdown` is passed, an additional `markdown/` folder is populated with human-readable outputs alongside the Dolma JSONL stream [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\n## Local vLLM Inference Mode\n\nIn the default mode, the pipeline spawns a local vLLM server and runs inference in-process. This requires the GPU-enabled install (`pip install olmocr[gpu] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/`) and exposes a number of vLLM passthrough arguments [Source: [olmocr/pipeline.py:200-260]()] [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\nNotable local-mode flags include:\n\n| Flag | Purpose |\n|------|---------|\n| `--model` | Path or HF repo id (default `allenai/olmOCR-7B-0725-FP8`) |\n| `--gpu-memory-utilization` | Fraction of VRAM allocated to KV-cache (forwarded to `vllm serve`) |\n| `--max_model_len` | Upper bound on KV-cache tokens; lower it if vLLM fails to start |\n| `--tensor-parallel-size` / `-tp` | Tensor parallel degree for vLLM |\n| `--data-parallel-size` / `-dp` | Data parallel degree |\n| `--port` | Port for the local vLLM server |\n| `--apply_filter` | Runs an additional quality filter pass on the parsed output |\n| `--guided_decoding` | Enables guided decoding for YAML-typed model outputs |\n\nThe pipeline also controls how PDF pages are rendered before being sent to the model: `--target_longest_image_dim` controls the rendered image size and `--target_anchor_text_len` caps the amount of anchor text used (legacy models only) [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\n## Remote Inference Server Mode (OpenAI-Compatible)\n\nA second mode targets external vLLM servers, DeepInfra, or any OpenAI-compatible endpoint. 
In this mode the pipeline skips spawning a local vLLM instance and simply issues chat-completions requests over HTTP. This works with the lightweight `pip install olmocr` install (no GPU dependencies) [Source: [olmocr/pipeline.py:300-360]()] [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\nKey arguments for remote/server mode:\n\n- `--server`: URL of an external vLLM or compatible server, e.g. `http://remote-host:8000/v1`. When set, the pipeline does not start a local server.\n- `--api_key`: Bearer token passed via `Authorization` header.\n- `--max_concurrent_requests`: Caps in-flight HTTP requests to the provider.\n- `--workers`: Limits page groups processed simultaneously; setting to `1` ensures one group finishes before the next starts — useful for providers with low concurrency caps.\n- `--pages_per_group`: Smaller groups help on providers with strict concurrent-request ceilings.\n- `--model`: Model identifier (provider-specific; e.g. `allenai/olmOCR-2-7B-1025` on DeepInfra).\n\nExample invocation: `olmocr ./localworkspace --server http://remote-server:8000/v1 --model allenai/olmOCR-2-7B-1025-FP8 --markdown --pdfs *.pdf` [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\n## Multi-Node / Beaker Cluster Mode\n\nFor very large jobs, the pipeline can coordinate multiple workers against an S3-backed workspace. The first worker seeds the queue: `olmocr s3://my_s3_bucket/pdfworkspaces/exampleworkspace --pdfs s3://my_s3_bucket/jakep/gnarly_pdfs/*.pdf`. Subsequent workers simply point at the same prefix and pull from the shared queue [Source: [olmocr/work_queue.py:80-180]()] [Source: [olmocr/s3_utils.py:120-220]()] [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\nAlternatively, the pipeline can be submitted directly to Beaker with `--beaker`, `--beaker_workspace`, `--beaker_cluster`, `--beaker_gpus`, and `--beaker_priority`. 
This path is documented as the standard way to run multi-node jobs at AI2 [Source: [README.md](https://github.com/allenai/olmocr/blob/main/README.md)].\n\n```mermaid\nflowchart LR\n    A[User / CLI] --> B[olmocr.pipeline]\n    B --> C{Inference Mode}\n    C -- local GPU --> D[Spawn vLLM]\n    C -- --server --> E[Remote vLLM / OpenAI API]\n    C -- --beaker / S3 --> F[Shared S3 Workspace]\n    D --> G[Page Groups -> Model]\n    E --> G\n    F --> G\n    G --> H[Dolma JSONL + Markdown]\n```\n\n## Common Failure Modes and Community-Reported Issues\n\nSeveral issues trace directly back to pipeline/inference behavior:\n\n- **Empty Markdown output on specific PDFs** (issue #463): the pipeline reports success and writes the markdown file but the body is empty, indicating model output failed validation for that page rather than a crash [Source: [issue #463](https://github.com/allenai/olmocr/issues/463)].\n- **Encoding errors when writing Markdown** (issue #459): Windows + GBK locale cannot encode certain Unicode codepoints emitted by the model. 
Setting `PYTHONIOENCODING=utf-8` or `chcp 65001` before launching the pipeline resolves it [Source: [issue #459](https://github.com/allenai/olmocr/issues/459)].\n- **Hard-coded 300s timeout in server runner** (issue #455): the bench HTTP client in `olmocr/bench/runners/run_server.py` uses a fixed timeout, which is insufficient for some larger page groups; community has requested making it configurable [Source: [issue #455](https://github.com/allenai/olmocr/issues/455)].\n- **DeepInfra model deprecation** (issue #460): `allenai/olmOCR-2-7B-1025` is scheduled for deprecation on DeepInfra on 2026-05-07; users running the remote mode should plan for a self-hosted vLLM endpoint [Source: [issue #460](https://github.com/allenai/olmocr/issues/460)].\n\nA known release-line note: v0.4.27 fixed a \"queue bug in long queues\" relevant to S3-coordinated multi-node runs, so users on older versions should upgrade before scaling out [Source: [release v0.4.27](https://github.com/allenai/olmocr/releases/tag/v0.4.27)].\n\n## See Also\n\n- [Training Guide](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md) — for fine-tuning new olmOCR models.\n- [olmOCR-Bench](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md) — for evaluating pipeline output.\n- [README.md](https://github.com/allenai/olmocr/blob/main/README.md) — top-level usage documentation.\n\n---\n\n<a id='page-3'></a>\n\n## Benchmark Suite and OCR Evaluation\n\n### Related Pages\n\nRelated topics: [Pipeline and Inference Modes](#page-2), [Model Training, Filtering, and Synthetic Data](#page-4)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md)\n- [olmocr/bench/benchmark.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/benchmark.py)\n- 
[olmocr/bench/tests.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/tests.py)\n- [olmocr/bench/prompts.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/prompts.py)\n- [olmocr/bench/runners/run_server.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/runners/run_server.py)\n- [olmocr/bench/runners/run_olmocr_pipeline.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/runners/run_olmocr_pipeline.py)\n- [olmocr/bench/scripts/workspace_to_bench.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/scripts/workspace_to_bench.py)\n- [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n- [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md)\n</details>\n\n# Benchmark Suite and OCR Evaluation\n\n## Purpose and Scope\n\n`olmOCR-Bench` is the project's machine-checkable evaluation harness for document-level OCR systems. It does **not** rely on edit distance or fuzzy text similarity; instead it asserts discrete, hand-authored \"facts\" about each PDF page and grades whether an OCR pipeline recovered them. The design intent, stated in the bench README, is that every test case should be \"very simple, unambiguous, and machine-checkable, similar to a unit test\" so that alternate-but-correct transcriptions are not penalized.\n\nSource: [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md)\n\nThe benchmark operates on **single-page PDFs** so that digital metadata is preserved, and accepts Markdown or plain-text output from any OCR tool. This deliberately decouples evaluation from any specific engine, allowing third-party OCR pipelines (Marker, MinerU, Mistral OCR API, DeepSeek-OCR, etc.) to be ranked on the same scoreboard published in the project root [README.md](https://github.com/allenai/olmocr/blob/main/README.md).\n\n## Document Types and Test Categories\n\nThe suite defines seven categories that historically challenge OCR systems. 
Each category is sourced via a different acquisition strategy documented in [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md):\n\n| Category | Code | Source strategy |\n| --- | --- | --- |\n| arXiv Math | AR | Single-TeX-source arXiv math papers, validated with KaTeX |\n| Old Scans Math | OSM | Public-domain math textbooks from the Internet Archive, manually annotated |\n| Tables | TA | Internally crawled PDFs filtered for tables with Gemini-Flash-2.0 |\n| Old Scans | OS | Library of Congress letters with existing transcriptions |\n| Headers & Footers | HF | DocLayout-YOLO \"abandon\" regions; tests expect text to be **absent** |\n| Multi Column | MC | Claude-Sonnet-3.7 HTML renders of multi-article pages |\n| Long Tiny Text | LTT | Dense small-print archive pages (dictionaries, references) |\n\nTests are stored as JSONL files in the `bench_data/` directory of the dataset (`allenai/olmOCR-bench` on Hugging Face). The header/footer category is unique in that it asserts a string must *not* appear in the linearized output, since correctly excluding recurring page furniture is a strong signal of OCR quality.\n\n## Benchmark Principles and Scoring\n\nThe README enumerates rules that shape how candidate outputs are scored. 
These are worth understanding before debugging a low score:\n\n- Output is treated as plain-text Unicode in a natural reading order; Markdown syntax is **ignored** when matching (so `**enlightenment**` still matches `enlightenment`).\n- Matching is **not position-sensitive**, except for header/footer tests, which constrain search to the first or last N characters.\n- Tables may be Markdown or HTML `<table>`; math must be delimited by `$`, `$$`, `\\(`, or `\\[` and must render with KaTeX.\n- All Unicode is normalized to NFC; hyphens, single quotes, and double quotes are folded to ASCII variants before comparison.\n- Each test belongs to one of the seven categories plus a \"Base\" category used for sanity checks.\n\nThe actual scoring logic lives in `TextPresenceTest.run` and related classes inside `olmocr/bench/tests.py`. As reported in [issue #461](https://github.com/allenai/olmocr/issues/461), `partial_ratio` from RapidFuzz can produce false positives when the candidate `md_content` is much shorter than the queried text (e.g., a single `\\n`), because the partial ratio then compares against a near-empty string. Patches to the test logic must guard against this degenerate case.\n\n## Running the Benchmark\n\nA typical run follows the recipe documented in the bench README:\n\n```bash\n# 1. Install GPU extras for local inference\npip install olmocr[gpu] --find-links https://flashinfer.ai/whl/cu124/torch2.4/flashinfer/\n\n# 2. Convert benchmark PDFs into model output using a runner\npython -m olmocr.bench.convert olmocr_pipeline --dir ./olmOCR-bench/bench_data\n\n# 3. Score the outputs\npython -m olmocr.bench.benchmark --dir ./olmOCR-bench/bench_data\n```\n\nTwo runners are shipped under `olmocr/bench/runners/`:\n\n- `run_olmocr_pipeline.py` invokes the local `olmocr.pipeline` to render each PDF into Markdown inside a workspace directory. 
The helper `olmocr/bench/scripts/workspace_to_bench.py` then copies those outputs into the directory layout the grader expects.\n- `run_server.py` posts the PDFs to an OpenAI-compatible endpoint (local vLLM, DeepInfra, or any compatible API). [Issue #455](https://github.com/allenai/olmocr/issues/455) tracks a feature request to expose a configurable HTTP timeout, since the current default of 300 seconds is insufficient for slow backends or large pages.\n\nThe CLI entry point in `olmocr/bench/benchmark.py` has surfaced at least two regressions: a malformed `argparse` help string ([issue #451](https://github.com/allenai/olmocr/issues/451)) and a missing `numpy` import in the `[bench]` extras ([issue #452](https://github.com/allenai/olmocr/issues/452)). Install the dependency with `pip install numpy` if the benchmark entry point raises `ModuleNotFoundError` immediately on launch.\n\nFor annotation review, the README points to an internal tool:\n\n```bash\npython -m olmocr.bench.review_app --port 5000 --debug \\\n    ./olmOCR-bench/bench_data/multi_column.jsonl --force\n```\n\n## Common Failure Modes Seen by Users\n\nThe community has filed several reports that intersect with the benchmark workflow:\n\n- **Empty markdown output for a specific page.** Running `olmocr output/ --pdfs bench_data/pdfs/headers_footers/<hash>_page_3.pdf --model /root/olmOCR-2-7B-1025-FP8/ --markdown` produced an empty Markdown file ([issue #463](https://github.com/allenai/olmocr/issues/463)). Because header/footer tests assert *absence*, an empty candidate actually scores well on HF tests but fails Base and content-bearing categories, which is a useful diagnostic signal.\n- **GBK encoding crash on Windows.** [Issue #459](https://github.com/allenai/olmocr/issues/459) describes a `'gbk' codec can't encode character '\\u1eca'` error during Markdown writing. 
Setting `PYTHONIOENCODING=utf-8` (or `chcp 65001` in PowerShell) before invoking the pipeline resolves the failure when running under the default Windows codepage.\n- **DeepInfra deprecation.** [Issue #460](https://github.com/allenai/olmocr/issues/460) notes that the hosted `allenai/olmOCR-2-7B-1025` snapshot will be retired on 2026-05-07. Users scoring against the server runner should pin to a self-hosted model or migrate to the new release.\n- **Scoring inversion for near-empty candidates.** As noted above, [issue #461](https://github.com/allenai/olmocr/issues/461) reports that `TextPresenceTest.run` in [olmocr/bench/tests.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/tests.py) can invert its decision when the candidate collapses to whitespace. When investigating \"all tests pass\" results for a broken page, check the candidate file directly.\n\n## Integration with Training\n\nThe same JSONL schema is reused inside the GRPO trainer. Per [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md), `mine_html_templates.py` synthesizes additional benchmark-style cases (`olmocr-synthmix-1025`), and the trainer script accepts a `--reward_bench` weight so that bench-style presence/absence tests participate directly in the reinforcement-learning reward signal. 
This is the mechanism behind the v0.4.0 release's ~4-point jump on the published scoreboard, where the strongest olmOCR release (v0.4.0) reaches an Overall score of 82.4±1.1.\n\n## See Also\n\n- olmOCR Pipeline and Inference\n- Training olmOCR Models (GRPO + Synthetic Data)\n- Dolma Output Viewer and Workspace Format\n- Issue #451, #452, #455, #459, #460, #461, #463 on GitHub for known bench regressions and platform quirks.\n\n---\n\n<a id='page-4'></a>\n\n## Model Training, Filtering, and Synthetic Data\n\n### Related Pages\n\nRelated topics: [Pipeline and Inference Modes](#page-2), [Benchmark Suite and OCR Evaluation](#page-3)\n\n<details>\n<summary>Related Source Files</summary>\n\nThe following source files were used to generate this page:\n\n- [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md)\n- [olmocr/train/train.py](https://github.com/allenai/olmocr/blob/main/olmocr/train/train.py)\n- [olmocr/train/grpo_train.py](https://github.com/allenai/olmocr/blob/main/olmocr/train/grpo_train.py)\n- [olmocr/train/config.py](https://github.com/allenai/olmocr/blob/main/olmocr/train/config.py)\n- [olmocr/train/dataloader.py](https://github.com/allenai/olmocr/blob/main/olmocr/train/dataloader.py)\n- [olmocr/data/buildsilver.py](https://github.com/allenai/olmocr/blob/main/olmocr/data/buildsilver.py)\n- [olmocr/data/prepare_olmocrmix.py](https://github.com/allenai/olmocr/blob/main/olmocr/data/prepare_olmocrmix.py)\n- [olmocr/data/filter.py](https://github.com/allenai/olmocr/blob/main/olmocr/data/filter.py)\n- [olmocr/synth/mine_html_templates.py](https://github.com/allenai/olmocr/blob/main/olmocr/synth/mine_html_templates.py)\n- [olmocr/pipeline.py](https://github.com/allenai/olmocr/blob/main/olmocr/pipeline.py)\n- [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md)\n- [README.md](https://github.com/allenai/olmocr/blob/main/README.md)\n</details>\n\n# Model Training, Filtering, and Synthetic 
Data\n\n## Overview\n\nolmOCR's model training, filtering, and synthetic data pipeline is the machinery that produces fine-tuned OCR models such as `allenai/olmOCR-2-7B-1025-FP8` (referenced in [README.md](https://github.com/allenai/olmocr/blob/main/README.md)). The pipeline combines three concerns:\n\n1. **Training** — supervised fine-tuning and GRPO-based reinforcement learning on PDF/markdown pairs, orchestrated in [olmocr/train/](https://github.com/allenai/olmocr/tree/main/olmocr/train).\n2. **Filtering and data preparation** — converting raw PDFs and human/LLM-generated markdown into the single-page layout that the trainer consumes, handled in [olmocr/data/](https://github.com/allenai/olmocr/tree/main/olmocr/data).\n3. **Synthetic data** — generating additional training examples from HTML templates to bolster coverage of hard document types, handled in [olmocr/synth/](https://github.com/allenai/olmocr/tree/main/olmocr/synth).\n\nThe README frames the broader workflow as: \"Processing millions of PDFs through a finetuned model using VLLM\" via [olmocr/pipeline.py](https://github.com/allenai/olmocr/blob/main/olmocr/pipeline.py), and a \"Filtering language identification\" helper, both of which feed back into the next training round.\n\n```mermaid\nflowchart LR\n    A[Raw PDFs] --> B[olmocr/pipeline.py]\n    B --> C[olmocr.data.filter]\n    C --> D[prepare_workspace / prepare_olmocrmix]\n    E[mine_html_templates.py] --> F[Synthetic Markdown]\n    D --> G[olmocr-synthmix-1025]\n    F --> G\n    G --> H[train.py / grpo_train.py]\n    H --> I[Fine-tuned olmOCR Model]\n```\n\n## Training Pipeline\n\nThe training stack lives under [olmocr/train/](https://github.com/allenai/olmocr/tree/main/olmocr/train) and exposes two entry points:\n\n- `train.py` — supervised fine-tuning, described in [olmocr/train/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n- `grpo_train.py` — reinforcement learning with unit-test rewards (see [olmocr2 bibtex 
entry in README.md](https://github.com/allenai/olmocr/blob/main/README.md), titled \"olmOCR 2: Unit Test Rewards for Document OCR\").\n\n### Environment and dependencies\n\nTraining extends the base install with the `[train]` extra plus pinned versions:\n\n```bash\npip install .[train]\npip install transformers==4.52.4\npip install flash-attn>=2.8.0.post2 --no-build-isolation\n```\n\nSource: [olmocr/train/README.md:21-25](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n\n### Hardware layout\n\nThe training guide states the run is \"performed on an 8xH100 GPU node. One GPU is dedicated to running VLLM, while the other 7 are used to run training.\" Because the harness splits one GPU off for vLLM-based generation, a single H100 node is the minimum viable target, which is why community members have asked for macOS and ROCm support (issues [#33](https://github.com/allenai/olmocr/issues/33) and [#111](https://github.com/allenai/olmocr/issues/111)).\n\n### GRPO hyperparameters\n\nThe provided training script ([scripts/train/grpotrainer-beaker-multi-gpu-augusta.sh](https://github.com/allenai/olmocr/blob/main/scripts/train/grpotrainer-beaker-multi-gpu-augusta.sh), as quoted in the training README) configures the run with the following key parameters, surfaced as flags on `grpo_train.py`:\n\n| Flag | Example value | Meaning |\n|---|---|---|\n| `--reward_bench` | `1.0` | Weight for olmOCR-bench unit-test reward |\n| `--reward_front_matter` | `1.0` | Weight for YAML front-matter validity |\n| `--reward_eos` | `1.0` | Weight for proper end-of-sequence behavior |\n| `--beta` | `0.01` | KL penalty in GRPO objective |\n| `--learning_rate` | `2e-6` | Optimizer learning rate |\n| `--gradient_accumulation_steps` | `28` | Effective batch-size multiplier |\n| `--num-gpus` | `8` | Total GPUs on the Beaker node |\n\nSource: [olmocr/train/README.md:9-12](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n\n### Data loader and 
config\n\n`dataloader.py` is responsible for reading single-page PDFs and their paired markdown annotations, while `config.py` centralizes the YAML-driven hyperparameters consumed by `train.py` and `grpo_train.py`. The training README explicitly requires \"Each PDF needs to be a single page only!\", which means long documents must be split before training, and the markdown front matter follows a defined schema:\n\n```markdown\n---\nprimary_language: en\nis_rotation_valid: True\nrotation_correction: 0\nis_table: False\nis_diagram: False\n---\n```\n\nSource: [olmocr/train/README.md:33-46](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md). The fields `is_table` and `is_diagram` line up with the page-classification features the inference path uses, and `rotation_correction` corresponds to the deskew step in [olmocr/pipeline.py](https://github.com/allenai/olmocr/blob/main/olmocr/pipeline.py).\n\n## Data Preparation and Filtering\n\nThe data side has two recommended entry points documented in the training README:\n\n1. **`prepare_olmocrmix.py`** — pulls a subset of [allenai/olmOCR-mix-1025](https://huggingface.co/datasets/allenai/olmOCR-mix-1025) and unpacks it into the single-page PDF / markdown layout the trainer expects. Example invocation:\n\n   ```bash\n   python -m olmocr.data.prepare_olmocrmix \\\n     --dataset-path allenai/olmOCR-mix-1025 \\\n     --destination ~/olmOCR-mix-1025-extracted \\\n     --subset 01_books --split eval\n   ```\n\n   Source: [olmocr/train/README.md:55-60](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n\n2. **`prepare_workspace`** — reuses the user's own olmOCR inference output. 
After running `python -m olmocr.pipeline ./localworkspace --pdfs /home/username/pdfs/*.pdf`, the workspace contains Dolma-formatted JSON; `prepare_workspace` reshapes that into the PDF/markdown layout, which is convenient for fine-tuning on a domain-specific corpus.\n\n   Source: [olmocr/train/README.md:62-66](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n\nFiltering sits in front of this: the main [README.md](https://github.com/allenai/olmocr/blob/main/README.md) links to a \"Filtering language identification\" tool, exposed as [olmocr/data/filter.py](https://github.com/allenai/olmocr/blob/main/olmocr/data/filter.py). This stage drops pages whose detected language does not match `primary_language` in the front matter and removes noisy rows before they reach the dataloader. `buildsilver.py` is the higher-level utility that turns raw PDFs into a \"silver\" training set by running an earlier checkpoint over them and keeping the outputs that pass the filters.\n\n## Synthetic Data\n\nThe synthetic-data module in [olmocr/synth/mine_html_templates.py](https://github.com/allenai/olmocr/blob/main/olmocr/synth/mine_html_templates.py) generates additional training documents from HTML templates. The training README states that the resulting mix is \"in the same format as other olmOCR-bench test cases,\" which means a synthetic dataset can be scored directly with [olmocr/bench/](https://github.com/allenai/olmocr/tree/main/olmocr/bench) and folded into `grpo_train.py` as `train_bench_data_folder` (e.g. 
`/data/jakep/grpo_data_mixes/olmocr-synthmix-1025-v2-rotate10p/bench_data`).\n\nSource: [olmocr/train/README.md:6-8](https://github.com/allenai/olmocr/blob/main/olmocr/train/README.md).\n\nSynthetic data is most useful for the document categories olmOCR historically struggles with — the same seven categories documented in [olmocr/bench/README.md](https://github.com/allenai/olmocr/blob/main/olmocr/bench/README.md) (ArXiv math, Old Scans math, Tables, Old Scans, Headers/Footers, Multi-Column, Long Tiny Text). When a category is under-represented in human-labeled data, mined templates can be converted into `(single_page_pdf, markdown)` pairs and pushed through the same filter + GRPO loop.\n\n## Community Notes and Failure Modes\n\n- **Platform support gap.** The training stack requires NVIDIA GPUs and CUDA-specific kernels such as `flash-attn`; this is the root cause of the long-running macOS (issue [#33](https://github.com/allenai/olmocr/issues/33)) and ROCm (issue [#111](https://github.com/allenai/olmocr/issues/111)) requests. The community has produced unofficial CUDA-based Dockerfiles (issue [#161](https://github.com/allenai/olmocr/issues/161)) and a Gradio-based test harness (issue [#75](https://github.com/allenai/olmocr/issues/75)) to lower the bar, but training itself is not yet cross-platform.\n- **Encoding on Windows.** When running downstream pipeline stages on Windows, non-ASCII characters can fail with `'gbk' codec can't encode character` (issue [#459](https://github.com/allenai/olmocr/issues/459)). 
Setting `PYTHONIOENCODING=utf-8` and `chcp 65001` is the documented workaround.\n- **External model hosting deprecation.** `allenai/olmOCR-2-7B-1025` hosted on DeepInfra is being deprecated on 2026-05-07 (issue [#460](https://github.com/allenai/olmocr/issues/460)); users who cannot migrate to local vLLM should re-export the model through a self-hosted OpenAI-compatible endpoint, which `pipeline.py` consumes via the `--server` flag.\n- **Scoring side-effects during training.** Reward signals that include olmOCR-bench tests are sensitive to near-empty candidates — `TextPresenceTest.run` in [olmocr/bench/tests.py](https://github.com/allenai/olmocr/blob/main/olmocr/bench/tests.py) can falsely pass when a candidate is just `\"\\n\"` (issue [#461](https://github.com/allenai/olmocr/issues/461)). Practitioners using `--reward_bench` should sanity-check their reward values during GRPO rollouts.\n- **Unparsed PDFs surface as training noise.** Issue [#463](https://github.com/allenai/olmocr/issues/463) shows that some pages render to an empty markdown; these should be filtered out by the language/rotation filters in `filter.py` before being added to the next training mix, otherwise the GRPO reward may collapse.\n\n## See Also\n\n- [Pipeline and Inference](./Pipeline-and-Inference.md)\n- [olmOCR-Bench Evaluation](./OlmOCR-Bench-Evaluation.md)\n- [Data Formats and Workspace Layout](./Data-Formats-and-Workspace-Layout.md)\n\n---\n\n<!-- evidence_pipeline_checked: true -->\n<!-- evidence_injected: true -->\n\n---\n\n## Pitfall Log\n\nProject: allenai/olmocr\n\nSummary: Found 13 structured pitfall item(s), including 0 high/blocking item(s). Top priority: Installation risk - Installation risk requires verification.\n\n## 1. Installation risk - Installation risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags an installation risk. 
Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/461\n\n## 2. Configuration risk - Configuration risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a configuration risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/455\n\n## 3. Capability evidence risk - Capability evidence risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: README/documentation is current enough for a first validation pass.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: capability.assumptions | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 4. Maintenance risk - Maintenance risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/463\n\n## 5. 
Maintenance risk - Maintenance risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/460\n\n## 6. Maintenance risk - Maintenance risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 7. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: no_demo\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: downstream_validation.risk_items | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 8. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: no_demo\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: risks.scoring_risks | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 9. 
Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/459\n\n## 10. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/451\n\n## 11. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/452\n\n## 12. 
Maintenance risk - Maintenance risk requires verification\n\n- Severity: low\n- Evidence strength: source_linked\n- Finding: issue_or_pr_quality=unknown.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 13. Maintenance risk - Maintenance risk requires verification\n\n- Severity: low\n- Evidence strength: source_linked\n- Finding: release_recency=unknown.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr\n\n<!-- canonical_name: allenai/olmocr; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "Full DeepWiki/Human Wiki output with Doramagic pitfall notes appended.",
      "title": "Human Manual"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log\n\nProject: allenai/olmocr\n\nSummary: Found 13 structured pitfall item(s), including 0 high/blocking item(s). Top priority: Installation risk - Installation risk requires verification.\n\n## 1. Installation risk - Installation risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a installation risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/461\n\n## 2. Configuration risk - Configuration risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a configuration risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/455\n\n## 3. Capability evidence risk - Capability evidence risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: README/documentation is current enough for a first validation pass.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: capability.assumptions | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 4. Maintenance risk - Maintenance risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a maintenance risk. 
Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/463\n\n## 5. Maintenance risk - Maintenance risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/460\n\n## 6. Maintenance risk - Maintenance risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a maintenance risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 7. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: no_demo\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: downstream_validation.risk_items | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 8. 
Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: no_demo\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: risks.scoring_risks | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 9. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/459\n\n## 10. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a security or permission risk. Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/451\n\n## 11. Security or permission risk - Security or permission risk requires verification\n\n- Severity: medium\n- Evidence strength: source_linked\n- Finding: Project evidence flags a security or permission risk. 
Review the linked source before relying on this workflow.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: community_evidence:github | https://github.com/allenai/olmocr/issues/452\n\n## 12. Maintenance risk - Maintenance risk requires verification\n\n- Severity: low\n- Evidence strength: source_linked\n- Finding: issue_or_pr_quality=unknown.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr\n\n## 13. Maintenance risk - Maintenance risk requires verification\n\n- Severity: low\n- Evidence strength: source_linked\n- Finding: release_recency=unknown.\n- User impact: May increase setup, validation, or first-run risk for the user.\n- Suggested check: Reproduce the official install and quickstart path in an isolated environment.\n- Evidence: evidence.maintainer_signals | github_repo:858798469 | https://github.com/allenai/olmocr\n",
      "summary": "Identity, install, configuration, runtime, and safety issues to check before use.",
      "title": "Pitfall Log"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# olmocr - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for allenai/olmocr.\n\nProject:\n- Name: olmocr\n- Repository: https://github.com/allenai/olmocr\n- Summary: Toolkit for linearizing PDFs for LLM datasets/training\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: Toolkit for linearizing PDFs for LLM datasets/training\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: Toolkit for linearizing PDFs for LLM datasets/training\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. page-1: Installation and Platform Support. Produce one small intermediate artifact and wait for confirmation.\n2. page-2: Pipeline and Inference Modes. Produce one small intermediate artifact and wait for confirmation.\n3. page-3: Benchmark Suite and OCR Evaluation. 
Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/allenai/olmocr\n- https://github.com/allenai/olmocr#readme\n- README.md\n- pyproject.toml\n- Dockerfile\n- Dockerfile.with-model\n- docs/source/installation.md\n- olmocr/pipeline.py\n- olmocr/work_queue.py\n- olmocr/datatypes.py\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "summary": "A safe trial prompt for feeling the workflow before installing the project.",
      "title": "Prompt Preview / Pre-install Trial Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / Official Entry\n\nProject: allenai/olmocr\n\n## Official Install Entry\n\n### Python / pip - Official install entry\n\n```bash\npip install olmocr\n```\n\nSource: https://github.com/allenai/olmocr#readme\n\n## Sources\n\n- repo: https://github.com/allenai/olmocr\n- docs: https://github.com/allenai/olmocr#readme\n",
      "summary": "Official entry points extracted from the project README or install docs.",
      "title": "Quick Start / Official Entry"
    }
  },
  "validation_id": "dval_57c72090fafa43429aa6c026373d7d67"
}
