{
  "canonical_name": "neuml/txtai",
  "compilation_id": "pack_8936044c23a54b4db0f6a9917891a79b",
  "created_at": "2026-05-16T23:19:16.970693+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=skill, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=skill, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `pip install txtai` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "pip install txtai",
      "sandbox_container_image": "python:3.12-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "llm_execute_isolated_install",
      "sandbox_validation_id": "sbx_e775b5f84bd349b5bde798d510ae5f39"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_7305b996eaf797994230a61ac09ab3d0",
    "canonical_name": "neuml/txtai",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/neuml/txtai",
    "slug": "txtai",
    "source_packet_id": "phit_5dab3b1b35cc4f41b0d3a07d112b355c",
    "source_validation_id": "dval_0113bd6941264ce5bd0bac2c55fff767"
  },
  "merchandising": {
    "best_for": "需要软件开发与交付能力，并使用 local_cli的用户",
    "github_forks": 826,
    "github_stars": 12570,
    "one_liner_en": "💡 All-in-one AI framework for semantic search, LLM orchestration and language model workflows",
    "one_liner_zh": "💡 All-in-one AI framework for semantic search, LLM orchestration and language model workflows",
    "primary_category": {
      "category_id": "software-development",
      "confidence": "medium",
      "name_en": "Software Development",
      "name_zh": "软件开发与交付",
      "reason": "matched_keywords:git, cli"
    },
    "target_user": "使用 local_cli 等宿主 AI 的用户",
    "title_en": "txtai",
    "title_zh": "txtai 能力包",
    "visible_tags": [
      {
        "label_en": "MCP Tools",
        "label_zh": "MCP 工具",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-mcp-tools",
        "type": "product_domain"
      },
      {
        "label_en": "Knowledge Base Q&A",
        "label_zh": "知识库问答",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-knowledge-base-q-a",
        "type": "user_job"
      },
      {
        "label_en": "Structured Data Extraction",
        "label_zh": "结构化数据提取",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-structured-data-extraction",
        "type": "core_capability"
      },
      {
        "label_en": "Node-based Workflow",
        "label_zh": "节点式流程编排",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-node-based-workflow",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Open Source Tool",
        "label_zh": "开源工具",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-open-source-tool",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_5dab3b1b35cc4f41b0d3a07d112b355c",
  "page_model": {
    "artifacts": {
      "artifact_slug": "txtai",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "pip install txtai",
          "label": "Python / pip · 官方安装入口",
          "source": "https://github.com/neuml/txtai#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "MCP 工具",
        "知识库问答",
        "结构化数据提取",
        "节点式流程编排",
        "开源工具"
      ],
      "eyebrow": "软件开发与交付",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要软件开发与交付能力，并使用 local_cli的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "💡 All-in-one AI framework for semantic search, LLM orchestration and language model workflows"
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "local_cli",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "skill, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add `txtai_minimal` package",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_4e4050b59fdf4d51ac21e82d248e589e | https://github.com/neuml/txtai/issues/1090 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Add `txtai_minimal` package",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Captions implementation",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_ff6fe05065564e85a549d7656b85a852 | https://github.com/neuml/txtai/issues/1084 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Add custom Captions implementation",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Questions pipeline implementation",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_fa68ea29089d497bb8278c3c360c363c | https://github.com/neuml/txtai/issues/1087 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Add custom Questions pipeline implementation",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Sequences pipeline implementation",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_0099afb385ef498d8020521bbf3e9e64 | https://github.com/neuml/txtai/issues/1086 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Add custom Sequences pipeline implementation",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Summary pipeline implementation",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_9d8f75c31fc6450d8659b310f25d0bf4 | https://github.com/neuml/txtai/issues/1085 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Add custom Summary pipeline implementation",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add minimal Docker build script",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_38e1e56a991b412bbc749d2c0b687d54 | https://github.com/neuml/txtai/issues/1092 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Add minimal Docker build script",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Make `transformers` package optional for minimal install",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_d755829200de4d93b2f4d2f71b36aaf1 | https://github.com/neuml/txtai/issues/1091 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Make `transformers` package optional for minimal install",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Reduce dependencies to just `numpy` for minimal install",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_395b735968ab4ec8ad849070d43cd18f | https://github.com/neuml/txtai/issues/1093 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Reduce dependencies to just `numpy` for minimal install",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support minimal install for edge devices",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_3ec8e039baf4412888ed3ac543ea1361 | https://github.com/neuml/txtai/issues/1089 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Support minimal install for edge devices",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Various training pipeline fixes for v5",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_b47325d7496248a993d980c3d59f3269 | https://github.com/neuml/txtai/issues/1088 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Various training pipeline fixes for v5",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Zero dependency minimal install",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_7839a465e7e64f94acfff001c1da978c | https://github.com/neuml/txtai/issues/1094 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Zero dependency minimal install",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v9.9.0",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_8b842ce68a1c4c1ab0a40a3ed1fd213c | https://github.com/neuml/txtai/releases/tag/v9.9.0 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v9.9.0",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "README/documentation is current enough for a first validation pass.",
            "category": "能力坑",
            "evidence": [
              "capability.assumptions | github_repo:286301447 | https://github.com/neuml/txtai | README/documentation is current enough for a first validation pass."
            ],
            "severity": "medium",
            "suggested_check": "将假设转成下游验证清单。",
            "title": "能力判断依赖假设",
            "user_impact": "假设不成立时，用户拿不到承诺的能力。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个运行相关的待验证问题：v9.7.0",
            "category": "运行坑",
            "evidence": [
              "community_evidence:github | cevd_e3f61e41f6e84b9b9ee33d5e18dbff63 | https://github.com/neuml/txtai/releases/tag/v9.7.0 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v9.7.0",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "未记录 last_activity_observed。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | github_repo:286301447 | https://github.com/neuml/txtai | last_activity_observed missing"
            ],
            "severity": "medium",
            "suggested_check": "补 GitHub 最近 commit、release、issue/PR 响应信号。",
            "title": "维护活跃度未知",
            "user_impact": "新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "downstream_validation.risk_items | github_repo:286301447 | https://github.com/neuml/txtai | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "进入安全/权限治理复核队列。",
            "title": "下游验证发现风险项",
            "user_impact": "下游已经要求复核，不能在页面中弱化。"
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "发现 22 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：Add `txtai_minimal` package。",
        "title": "踩坑日志"
      },
      "snapshot": {
        "contributors": 23,
        "forks": 826,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": 12570
      },
      "source_url": "https://github.com/neuml/txtai",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "💡 All-in-one AI framework for semantic search, LLM orchestration and language model workflows",
      "title": "txtai 能力包",
      "trial_prompt": "# txtai - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for neuml/txtai.\n\nProject:\n- Name: txtai\n- Repository: https://github.com/neuml/txtai\n- Summary: 💡 All-in-one AI framework for semantic search, LLM orchestration and language model workflows\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: 💡 All-in-one AI framework for semantic search, LLM orchestration and language model workflows\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: 💡 All-in-one AI framework for semantic search, LLM orchestration and language model workflows\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. introduction: Introduction to txtai. Produce one small intermediate artifact and wait for confirmation.\n2. getting-started: Getting Started. Produce one small intermediate artifact and wait for confirmation.\n3. architecture: System Architecture. Produce one small intermediate artifact and wait for confirmation.\n4. embeddings-core: Embeddings Database. Produce one small intermediate artifact and wait for confirmation.\n5. pipelines: Pipelines. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/neuml/txtai\n- https://github.com/neuml/txtai#readme\n- README.md\n- src/python/txtai/__init__.py\n- docs/index.md\n- pyproject.toml\n- setup.py\n- docs/install.md\n- examples/01_Introducing_txtai.ipynb\n- src/python/txtai/embeddings/base.py\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "voices": [
        {
          "body": "来源平台：github。github/github_issue: Add LiteRT-LM LLM（https://github.com/neuml/txtai/issues/1095）；github/github_issue: Zero dependency minimal install（https://github.com/neuml/txtai/issues/1094）；github/github_issue: Reduce dependencies to just `numpy` for minimal install（https://github.com/neuml/txtai/issues/1093）；github/github_issue: Make `transformers` package optional for minimal install（https://github.com/neuml/txtai/issues/1091）；github/github_issue: Add minimal Docker build script（https://github.com/neuml/txtai/issues/1092）；github/github_issue: Add `txtai_minimal` package（https://github.com/neuml/txtai/issues/1090）；github/github_issue: Various training pipeline fixes for v5（https://github.com/neuml/txtai/issues/1088）；github/github_issue: Add custom Questions pipeline implementation（https://github.com/neuml/txtai/issues/1087）；github/github_issue: Add custom Sequences pipeline implementation（https://github.com/neuml/txtai/issues/1086）；github/github_issue: Add custom Summary pipeline implementation（https://github.com/neuml/txtai/issues/1085）；github/github_issue: Add custom Captions implementation（https://github.com/neuml/txtai/issues/1084）；github/github_issue: Support minimal install for edge devices（https://github.com/neuml/txtai/issues/1089）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Add LiteRT-LM LLM",
              "url": "https://github.com/neuml/txtai/issues/1095"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Zero dependency minimal install",
              "url": "https://github.com/neuml/txtai/issues/1094"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Reduce dependencies to just `numpy` for minimal install",
              "url": "https://github.com/neuml/txtai/issues/1093"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Make `transformers` package optional for minimal install",
              "url": "https://github.com/neuml/txtai/issues/1091"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Add minimal Docker build script",
              "url": "https://github.com/neuml/txtai/issues/1092"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Add `txtai_minimal` package",
              "url": "https://github.com/neuml/txtai/issues/1090"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Various training pipeline fixes for v5",
              "url": "https://github.com/neuml/txtai/issues/1088"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Add custom Questions pipeline implementation",
              "url": "https://github.com/neuml/txtai/issues/1087"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Add custom Sequences pipeline implementation",
              "url": "https://github.com/neuml/txtai/issues/1086"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Add custom Summary pipeline implementation",
              "url": "https://github.com/neuml/txtai/issues/1085"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Add custom Captions implementation",
              "url": "https://github.com/neuml/txtai/issues/1084"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Support minimal install for edge devices",
              "url": "https://github.com/neuml/txtai/issues/1089"
            }
          ],
          "status": "已收录 12 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "软件开发与交付",
      "desc": "💡 All-in-one AI framework for semantic search, LLM orchestration and language model workflows",
      "effort": "安装已验证",
      "forks": 826,
      "icon": "code",
      "name": "txtai 能力包",
      "risk": "可发布",
      "slug": "txtai",
      "stars": 12570,
      "tags": [
        "MCP 工具",
        "知识库问答",
        "结构化数据提取",
        "节点式流程编排",
        "开源工具"
      ],
      "thumb": "gray",
      "type": "Skill Pack"
    },
    "manual": {
      "markdown": "# https://github.com/neuml/txtai 项目说明书\n\n生成时间：2026-05-16 22:47:03 UTC\n\n## 目录\n\n- [Introduction to txtai](#introduction)\n- [Getting Started](#getting-started)\n- [System Architecture](#architecture)\n- [Embeddings Database](#embeddings-core)\n- [Pipelines](#pipelines)\n- [Workflows](#workflows)\n- [Agents](#agents)\n- [Database Integration](#database-integration)\n- [Graph Networks](#graph-networks)\n- [Scoring and Retrieval Algorithms](#scoring-algorithms)\n\n<a id='introduction'></a>\n\n## Introduction to txtai\n\n### 相关页面\n\n相关主题：[System Architecture](#architecture), [Getting Started](#getting-started)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/neuml/txtai/blob/main/README.md)\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n- [src/python/txtai/pipeline/data/textractor.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/data/textractor.py)\n- [src/python/txtai/pipeline/llm/llm.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n- [src/python/txtai/workflow/task/template.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/template.py)\n- [examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n- [src/python/txtai/agent/tool/read.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/read.py)\n</details>\n\n# Introduction to txtai\n\ntxtai is an **all-in-one AI framework** for semantic search, large language model (LLM) orchestration, and language model workflows. It provides a unified platform that combines vector search capabilities with traditional database features, enabling developers to build sophisticated AI-powered applications without managing multiple disparate systems.\n\n## Overview\n\nThe central innovation of txtai is its **embeddings database** - a hybrid data store that unifies:\n\n- **Sparse vector indexes** for keyword-based search\n- **Dense vector indexes** for semantic similarity\n- **Graph networks** for relationship modeling\n- **Relational databases** for structured data storage\n\nThis architecture enables txtai to function both as a standalone vector search engine and as a powerful knowledge source for RAG (Retrieval Augmented Generation) applications. 
资料来源：[README.md:1-20]()\n\n## Architecture\n\nThe txtai architecture follows a layered design that separates concerns while maintaining tight integration between components.\n\n```mermaid\ngraph TD\n    subgraph \"txtai Architecture\"\n        A[Application Layer] --> B[Workflow Engine]\n        B --> C[Pipeline System]\n        C --> D[Embeddings Database]\n        D --> E[(Vector Index)]\n        D --> F[(Graph Network)]\n        D --> G[(Relational DB)]\n    end\n    \n    subgraph \"Pipelines\"\n        H[LLM Pipeline] \n        I[Data Pipeline]\n        J[Text Pipeline]\n        K[Audio Pipeline]\n    end\n    \n    C --> H\n    C --> I\n    C --> J\n    C --> K\n```\n\n### Core Components\n\n| Component | Purpose | Key Features |\n|-----------|---------|--------------|\n| **Embeddings** | Vector database | Content storage, semantic search, hybrid queries |\n| **Workflows** | Task orchestration | Parallel/sequential execution, function routing |\n| **Pipelines** | Model execution | LLM, data processing, text operations |\n| **Agents** | Autonomous execution | Tool-based reasoning, MCP integration |\n\n## Key Features\n\n### Vector Search Capabilities\n\ntxtai provides comprehensive vector search functionality including:\n\n- Semantic/similarity/vector/neural search 资料来源：[README.md:45-50]()\n- SQL integration for hybrid queries\n- Object storage support\n- Topic modeling\n- Graph analysis\n- Multimodal indexing (text, audio, images, video)\n\n### Pipeline System\n\nThe pipeline system enables flexible model execution:\n\n| Pipeline Type | Capabilities |\n|---------------|--------------|\n| **LLM** | Text generation, chat, vision support 资料来源：[src/python/txtai/pipeline/llm/llm.py:1-50]() |\n| **Data** | Text extraction, document processing, chunking 资料来源：[src/python/txtai/pipeline/data/textractor.py:1-50]() |\n| **Text** | NER, transcription, translation |\n| **Audio** | Speech recognition, text-to-speech |\n| **Train** | Model fine-tuning, ONNX conversion |\n\n### Workflow Engine\n\nWorkflows orchestrate complex AI tasks using a template-based system:\n\n```mermaid\ngraph LR\n    A[Input] --> B[Template Task]\n    B --> C[LLM Pipeline]\n    C --> D[Output]\n    \n    B -.->|config| E[Template Rules]\n    E -->|format| B\n```\n\nThe template system supports named parameters, positional arguments, and strict/conservative parsing modes. 
资料来源：[src/python/txtai/workflow/task/template.py:1-40]()\n\n## Installation\n\n### Core Dependencies\n\n```python\n# Default installation includes\ndefault = [\n    \"faiss-cpu>=1.7.1.post2\",\n    \"huggingface-hub>=0.34.0\",\n    \"msgpack>=1.0.7\",\n    \"numpy>=1.18.4\",\n    \"regex>=2022.8.17\",\n    \"pyyaml>=5.3\",\n    \"safetensors>=0.4.5\",\n    \"torch>=2.4\",\n    \"transformers>=4.56.2\",\n]\n```\n\n资料来源：[setup.py:18-30]()\n\n### Optional Installations\n\n| Extra | Purpose | Command |\n|-------|---------|---------|\n| `pipeline` | All pipeline dependencies | `pip install txtai[pipeline]` |\n| `vectors` | Embedding models | `pip install txtai[vectors]` |\n| `api` | REST API server | `pip install txtai[api]` |\n| `agent` | Autonomous agents | `pip install txtai[agent]` |\n| `all` | Complete installation | `pip install txtai[all]` |\n\n资料来源：[setup.py:55-150]()\n\n## Basic Usage\n\n### Semantic Search\n\n```python\nimport txtai\n\nembeddings = txtai.Embeddings()\nembeddings.index([\"Correct\", \"Not what we hoped\"])\nresult = embeddings.search(\"positive\", 1)\n# Returns: [(0, 0.29862046241760254)]\n```\n\n资料来源：[README.md:35-42]()\n\n### Text Extraction\n\n```python\nfrom txtai.pipeline import Textractor\n\ntextractor = Textractor(backend=\"docling\", sections=True)\nfor chunk in textractor(\"document.pdf\"):\n    process(chunk)\n```\n\n资料来源：[examples/rag_quickstart.py:1-50]()\n\n### RAG Pipeline\n\n```python\nfrom txtai import Embeddings, RAG\nfrom txtai.pipeline import Textractor\n\n# Build embeddings database\nembeddings = Embeddings(content=True, path=\"Qwen/Qwen3-Embedding-0.6B\")\nembeddings.index(chunks)\n\n# Create RAG pipeline\ntemplate = \"\"\"\n  Answer the following question using the provided context.\n  Question: {question}\n  Context: {context}\n\"\"\"\n\nrag = RAG(\n    embeddings,\n    \"Qwen/Qwen3-0.6B\",\n    system=\"You are a friendly assistant\",\n    template=template,\n)\n\nresult = rag(\"Summarize the main advancements made by BERT\")\n```\n\n资料来源：[examples/rag_quickstart.py:50-80]()\n\n## Use Cases\n\n### Semantic Search\n\nTraditional keyword-based search systems match exact terms. Semantic search understands natural language and identifies results with similar meaning, regardless of keyword overlap. 资料来源：[README.md:45-55]()\n\n### RAG Applications\n\nThe embeddings database serves as a knowledge source for LLM applications:\n\n```mermaid\ngraph TD\n    A[User Query] --> B[Embeddings Search]\n    B --> C[Retrieve Context]\n    C --> D[LLM Generation]\n    D --> E[Response]\n    \n    F[(Embeddings DB)] -.->|retrieval| C\n```\n\n### Autonomous Agents\n\nAgents utilize tools to perform complex tasks:\n\n```python\nfrom txtai.agent.tool.read import ReadTool\n\ntool = ReadTool(maxlength=40000)\ncontent = tool.forward(\"path/to/file.txt\")\n```\n\n资料来源：[src/python/txtai/agent/tool/read.py:1-60]()\n\n## API Server\n\ntxtai includes a built-in REST API for cross-language compatibility:\n\n```yaml\n# app.yml\nembeddings:\n    path: sentence-transformers/all-MiniLM-L6-v2\n```\n\n```bash\nCONFIG=app.yml uvicorn \"txtai.api:app\"\ncurl -X GET \"http://localhost:8000/search?query=positive\"\n```\n\n资料来源：[README.md:50-60]()\n\n## Design Principles\n\n1. **Low footprint** - Install additional dependencies and scale up when needed 资料来源：[README.md:60-70]()\n2. **Local execution** - No need to ship data to remote services\n3. **Model flexibility** - Work with micromodels up to large language models\n4. 
**Modular design** - Use individual components or the full stack\n\n## Requirements\n\n- Python 3.10+\n- PyTorch 2.4+\n- Transformers 4.56.2+\n\nThe framework supports Hugging Face models, llama.cpp, Ollama, vLLM, and more for embeddings and LLM operations. 资料来源：[setup.py:18-30]()\n\n---\n\n<a id='getting-started'></a>\n\n## Getting Started\n\n### 相关页面\n\n相关主题：[Introduction to txtai](#introduction)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n- [README.md](https://github.com/neuml/txtai/blob/main/README.md)\n- [examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n- [examples/wiki.py](https://github.com/neuml/txtai/blob/main/examples/wiki.py)\n- [examples/workflows.py](https://github.com/neuml/txtai/blob/main/examples/workflows.py)\n- [src/python/txtai/pipeline/llm/llm.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n</details>\n\n# Getting Started\n\ntxtai is an all-in-one AI framework for semantic search, LLM orchestration and language model workflows. The Getting Started guide provides the essential knowledge needed to install, configure, and begin using txtai effectively.\n\n## Overview\n\ntxtai provides a unified interface for building AI applications with:\n\n- **Vector search** with SQL, object storage, topic modeling, graph analysis and multimodal indexing\n- **Embeddings** for text, documents, audio, images and video\n- **Pipelines** powered by language models for prompts, question-answering, labeling, transcription, and translation\n- **Workflows** that orchestrate multi-step processes\n- **Agents** for autonomous decision-making\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Installation\n\n### Requirements\n\n| Requirement | Minimum Version | Notes |\n|-------------|-----------------|-------|\n| Python | 3.10+ | Required runtime |\n| PyTorch | 2.4+ | Core deep learning framework |\n| Transformers | 4.56.2+ | Model loading and inference |\n| NumPy | 1.18.4+ | Numerical operations |\n\n资料来源：[setup.py:27-37](https://github.com/neuml/txtai/blob/main/setup.py)\n\n### Installation Methods\n\n#### Standard Installation\n\nStandard installation includes all default dependencies:\n\n```bash\npip install txtai\n```\n\nThis installs the core package with default dependencies including FAISS, HuggingFace Hub, msgpack, numpy, regex, PyYAML, safetensors, PyTorch, and transformers.\n\n#### Minimal Installation\n\nFor environments with limited resources, install a minimal version:\n\n```bash\nMINIMAL=1 pip install txtai\n```\n\n资料来源：[setup.py:18-25](https://github.com/neuml/txtai/blob/main/setup.py)\n\n### Optional Extras\n\ntxtai provides modular extras to scale functionality based on needs:\n\n| Extra | Purpose | Key Dependencies |\n|-------|---------|------------------|\n| `api` | REST API hosting | fastapi, uvicorn, aiohttp |\n| `pipeline-audio` | Audio processing | scipy, soundfile, webrtcvad |\n| `pipeline-data` | Data extraction | beautifulsoup4, docling, tika |\n| `pipeline-image` | Image processing | pillow, timm, imagehash |\n| `pipeline-llm` | LLM inference | litellm, llama-cpp-python |\n| `pipeline-text` | Text processing | gliner, sentencepiece |\n| `pipeline-train` | Model training | accelerate, peft, onnx |\n| `vectors` | Vector embeddings | sentence-transformers |\n| `ann` | ANN indexes | faiss, hnswlib, annoy |\n| `workflow` | Workflow tasks | requests, pandas, openpyxl |\n| `agent` | 
Agent framework | smolagents, mcpadapt |\n| `all` | Everything | All above extras |\n\n资料来源：[setup.py:38-98](https://github.com/neuml/txtai/blob/main/setup.py)\n\nInstall with extras:\n\n```bash\npip install txtai[api,workflow]\npip install txtai[all]\n```\n\n## Core Concepts\n\n### Embeddings Database\n\nThe central component of txtai is the **embeddings database**, which combines:\n\n- Vector indexes (sparse and dense)\n- Graph networks\n- Relational databases\n\nThis foundation enables vector search and serves as a knowledge source for LLM applications.\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n```mermaid\ngraph TD\n    A[Embeddings Database] --> B[Vector Indexes]\n    A --> C[Graph Networks]\n    A --> D[Relational Database]\n    B --> E[Dense Vectors]\n    B --> F[Sparse Vectors]\n```\n\n### Pipeline Architecture\n\nPipelines are language model powered components that perform specific tasks:\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Pipeline]\n    B --> C[LLM Pipeline]\n    B --> D[Data Pipeline]\n    B --> E[Text Pipeline]\n    C --> F[Generated Content]\n    D --> G[Extracted Data]\n    E --> H[Processed Text]\n```\n\nKey pipeline types:\n\n| Pipeline | Function |\n|----------|----------|\n| `Summary` | Text summarization |\n| `Textractor` | Text extraction from files |\n| `RAG` | Retrieval augmented generation |\n| `Transcription` | Audio to text |\n| `Translation` | Language translation |\n\n资料来源：[src/python/txtai/pipeline/llm/llm.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n\n### Workflow System\n\nWorkflows orchestrate multi-step processes by connecting tasks:\n\n```mermaid\ngraph TD\n    A[Workflow Input] --> B[Task 1]\n    B --> C[Task 2]\n    C --> D[Task N]\n    D --> E[Workflow Output]\n    \n    F[Template Task] -->|formats| B\n    G[URL Task] -->|fetches| B\n```\n\n资料来源：[examples/workflows.py](https://github.com/neuml/txtai/blob/main/examples/workflows.py)\n\n## Quick Start\n\n### Basic Semantic Search\n\nThe simplest way to get started with txtai is semantic search:\n\n```python\nimport txtai\n\nembeddings = txtai.Embeddings()\nembeddings.index([\"Correct\", \"Not what we hoped\"])\nembeddings.search(\"positive\", 1)\n# Returns: [(0, 0.29862046241760254)]\n```\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n### Building a RAG Application\n\nRetrieval Augmented Generation combines embeddings search with LLM inference:\n\n```python\nfrom txtai import Embeddings, RAG\nfrom txtai.pipeline import Textractor\n\n# Step 1: Extract text from documents\ntextractor = Textractor()\nchunks = []\nfor f in [\"document1.pdf\", \"document2.pdf\"]:\n    for chunk in textractor(f):\n        chunks.append((f, chunk))\n\n# Step 2: Build embeddings database\nembeddings = Embeddings(content=True, path=\"Qwen/Qwen3-Embedding-0.6B\", maxlength=2048)\nembeddings.index(chunks)\n\n# Step 3: Create RAG pipeline\ntemplate = \"\"\"\n  Answer the following question using the provided context.\n\n  Question:\n  {question}\n\n  Context:\n  {context}\n\"\"\"\n\nrag = RAG(\n    embeddings,\n    \"Qwen/Qwen3-0.6B\",\n    system=\"You are a friendly assistant\",\n    template=template,\n    output=\"flatten\",\n)\n\n# Step 4: Query\nquestion = \"Summarize the main advancements made by BERT\"\nprint(rag(question, maxlength=2048, stripthink=True))\n```\n\n资料来源：[examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n\n### Wikipedia Summarization\n\nExample 
combining external API with summarization:\n\n```python\nimport requests\nfrom txtai.pipeline import Summary\n\nclass Application:\n    SEARCH_TEMPLATE = \"https://en.wikipedia.org/w/api.php?action=opensearch&search=%s&limit=1&namespace=0&format=json\"\n    CONTENT_TEMPLATE = \"https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&redirects=1&titles=%s\"\n\n    def __init__(self):\n        self.summary = Summary(\"sshleifer/distilbart-cnn-12-6\")\n\n    def query(self, query):\n        # Search Wikipedia\n        data = requests.get(self.SEARCH_TEMPLATE % query).json()\n        if data and data[1]:\n            page = data[1][0]\n            content = requests.get(self.CONTENT_TEMPLATE % page).json()\n            content = list(content[\"query\"][\"pages\"].values())[0][\"extract\"]\n            return self.summary(content)\n        return None\n```\n\n资料来源：[examples/wiki.py](https://github.com/neuml/txtai/blob/main/examples/wiki.py)\n\n## Configuration\n\n### Embeddings Configuration\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `path` | str | None | Model path (HuggingFace, llama.cpp, Ollama, vLLM) |\n| `content` | bool | False | Enable content storage |\n| `maxlength` | int | None | Maximum sequence length |\n\n资料来源：[examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n\n### LLM Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `maxlength` | int | None | Maximum sequence length |\n| `stream` | bool | False | Enable streaming response |\n| `stop` | list | None | List of stop strings |\n| `stripthink` | bool | varies | Strip thinking tags from output |\n| `defaultrole` | str | \"auto\" | Default role for text inputs |\n\n资料来源：[src/python/txtai/pipeline/llm/llm.py:18-34](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n\n### RAG Configuration\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `embeddings` | Embeddings | Embeddings database instance |\n| `path` | str | LLM model path |\n| `system` | str | System prompt for the LLM |\n| `template` | str | Prompt template with `{question}` and `{context}` |\n| `output` | str | Output format (\"flatten\", etc.) |\n\n## API Service\n\ntxtai includes a built-in API for language-agnostic applications:\n\n```yaml\n# app.yml\nembeddings:\n    path: sentence-transformers/all-MiniLM-L6-v2\n```\n\n```bash\nCONFIG=app.yml uvicorn \"txtai.api:app\"\ncurl -X GET \"http://localhost:8000/search?query=positive\"\n```\n\nBenefits of the API service:\n- Work with your programming language of choice\n- Run local - no need to ship data to remote services\n- Supports models from micromodels to large language models\n- Low footprint - scale up when needed\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Next Steps\n\nAfter completing the Getting Started guide, explore:\n\n1. **Example Notebooks** - Over 70 example notebooks covering all functionality\n2. **Workflows** - Build complex multi-step pipelines\n3. **Agents** - Create autonomous AI agents with tool use\n4. **Graph Analysis** - Network and relationship analysis\n5. 
**Model Training** - Fine-tune models for your use case\n\nAll examples are available at: https://neuml.github.io/txtai/examples\n\n---\n\n<a id='architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[Embeddings Database](#embeddings-core), [Database Integration](#database-integration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/neuml/txtai/blob/main/README.md)\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n- [src/python/txtai/pipeline/data/textractor.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/data/textractor.py)\n- [src/python/txtai/pipeline/llm/llm.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n- [src/python/txtai/agent/tool/function.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/function.py)\n- [src/python/txtai/agent/tool/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/factory.py)\n- [src/python/txtai/workflow/task/template.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/template.py)\n- [src/python/txtai/cloud/hub.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/cloud/hub.py)\n</details>\n\n# System Architecture\n\ntxtai is an **all-in-one AI framework** designed for semantic search, LLM orchestration, and language model workflows. The architecture is built around a modular, extensible design that allows developers to scale from micromodels to large language models with minimal footprint.\n\n## Architecture Overview\n\nThe txtai architecture consists of several interconnected layers that work together to provide comprehensive AI capabilities.\n\n```mermaid\ngraph TB\n    subgraph \"Client Layer\"\n        API[\"REST API\"]\n        Console[\"Console\"]\n        Library[\"Python Library\"]\n    end\n    \n    subgraph \"Core Engine\"\n        Embeddings[\"Embeddings Database\"]\n        Workflow[\"Workflow Engine\"]\n        Agent[\"Agent System\"]\n    end\n    \n    subgraph \"Pipeline Layer\"\n        LLMPipeline[\"LLM Pipeline\"]\n        DataPipeline[\"Data Pipeline\"]\n        TextPipeline[\"Text Pipeline\"]\n        TrainPipeline[\"Train Pipeline\"]\n        AudioPipeline[\"Audio Pipeline\"]\n        ImagePipeline[\"Image Pipeline\"]\n    end\n    \n    subgraph \"Index Layer\"\n        ANN[\"ANN Index\"]\n        Documents[\"Document Store\"]\n        Graph[\"Graph Network\"]\n        Database[\"Relational DB\"]\n    end\n    \n    API --> Embeddings\n    API --> Workflow\n    API --> Agent\n    Embeddings --> ANN\n    Embeddings --> Documents\n    Embeddings --> Graph\n    Embeddings --> Database\n```\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Core Components\n\n### Embeddings Database\n\nThe **embeddings database** is the foundational component of txtai. 
It serves as a union of multiple index types:\n\n| Index Type | Purpose | Description |\n|------------|---------|-------------|\n| Vector Index (Sparse) | Sparse embeddings | Traditional BM25-style scoring |\n| Vector Index (Dense) | Dense embeddings | Neural network-based semantic vectors |\n| Graph Networks | Relationship mapping | NetworkX-based graph structures |\n| Relational Databases | Structured data | SQLAlchemy-based data storage |\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\nThe embeddings database enables vector search capabilities and serves as a powerful knowledge source for large language model (LLM) applications.\n\n### Pipeline System\n\nPipelines are the processing units of txtai, each designed for specific tasks:\n\n```mermaid\ngraph LR\n    Input[\"Input Data\"] --> Pipeline[\"Pipeline\"]\n    \n    subgraph \"Pipeline Types\"\n        Audio[\"Audio Pipeline\"]\n        Data[\"Data Pipeline\"]\n        Image[\"Image Pipeline\"]\n        LLM[\"LLM Pipeline\"]\n        Text[\"Text Pipeline\"]\n        Train[\"Train Pipeline\"]\n    end\n    \n    Pipeline --> Output[\"Output Data\"]\n```\n\n#### Data Pipeline\n\nThe Data Pipeline handles text extraction and processing:\n\n| Component | Function |\n|-----------|----------|\n| `Textractor` | Extracts text from files and URLs |\n| `FileToHTML` | Converts files to HTML format |\n| `HTMLToMarkdown` | Transforms HTML to Markdown |\n| `Segmentation` | Handles text segmentation (sentences, lines, paragraphs) |\n\nThe `Textractor` class supports multiple backends for file extraction:\n\n```python\ndef __init__(\n    self,\n    sentences=False,\n    lines=False,\n    paragraphs=False,\n    minlength=None,\n    join=False,\n    sections=False,\n    cleantext=True,\n    chunker=None,\n    headers=None,\n    backend=\"available\",\n    safeopen=False,\n    **kwargs,\n):\n```\n\n资料来源：[src/python/txtai/pipeline/data/textractor.py:19-37](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/data/textractor.py)\n\n#### LLM Pipeline\n\nThe LLM Pipeline provides LLM generation capabilities:\n\n```python\ndef __call__(\n    self,\n    text,\n    maxlength=None,\n    stream=False,\n    stop=None,\n    defaultrole=\"auto\",\n    stripthink=None,\n    **kwargs,\n):\n```\n\nKey features:\n- Supports streaming responses\n- Configurable stop sequences\n- Built-in thinking tag stripping\n- Chat and vision model support\n\n资料来源：[src/python/txtai/pipeline/llm/llm.py:1-50](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n\n### Agent System\n\nThe Agent System enables autonomous AI agents with tool use capabilities:\n\n```mermaid\ngraph TD\n    User[\"User Input\"] --> Agent[\"Agent\"]\n    Agent --> Tools[\"Tool Collection\"]\n    Tools --> EmbeddingsTool[\"Embeddings Tool\"]\n    Tools --> FunctionTool[\"Function Tool\"]\n    Tools --> DefaultTools[\"Default Tools\"]\n    \n    subgraph \"Tool Factory\"\n        create[\"create()\"]\n        createtool[\"createtool()\"]\n    end\n    \n    Tools --> Result[\"Result\"]\n    Result --> Agent\n```\n\n#### FunctionTool\n\nA `FunctionTool` wraps descriptive configuration with a target function for LLM prompts:\n\n```python\nclass FunctionTool(Tool):\n    def __init__(self, config):\n        self.name = config[\"name\"]\n        self.description = config[\"description\"]\n        self.inputs = config[\"inputs\"]\n        self.output_type = config.get(\"output\", config.get(\"output_type\", \"any\"))\n        self.target = 
config[\"target\"]\n```\n\n资料来源：[src/python/txtai/agent/tool/function.py:1-50](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/function.py)\n\n#### Tool Factory\n\nThe `ToolFactory` creates tools from various sources:\n\n| Input Type | Creation Method |\n|------------|------------------|\n| Tool instance | Direct pass-through |\n| Function/Method | Auto-wrapped with `createtool()` |\n| Dictionary | Config-based creation |\n| String alias | Lookup in `DEFAULTS` registry |\n| HTTP URL | MCP tool collection import |\n\n资料来源：[src/python/txtai/agent/tool/factory.py:1-100](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/factory.py)\n\n### Workflow Engine\n\nThe Workflow Engine processes tasks through a series of steps:\n\n```mermaid\ngraph LR\n    Task[\"Task Input\"] --> Template[\"Template Task\"]\n    Template --> Process[\"Process\"]\n    Process --> Output[\"Task Output\"]\n```\n\n#### Template Task\n\nThe `TemplateTask` generates text from templates, supporting LLM prompt preparation:\n\n```python\nclass TemplateTask(Task):\n    def register(self, template=None, rules=None, strict=True):\n        self.template = template if template else self.defaulttemplate()\n        self.rules = rules if rules else self.defaultrules()\n        self.formatter = TemplateFormatter() if strict else Formatter()\n```\n\nTemplate processing supports three input types:\n\n| Input Type | Processing Method |\n|------------|-------------------|\n| Dictionary | Named parameters from keys |\n| Tuple | Numbered parameters (arg0, arg1, ...) |\n| Other | Default `{text}` parameter |\n\n资料来源：[src/python/txtai/workflow/task/template.py:1-50](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/template.py)\n\n## Module Organization\n\n### Package Structure\n\n```\ntxtai/\n├── agent/           # Agent system and tools\n├── api/             # REST API implementation\n├── cloud/           # Cloud integration (Hub)\n├── embeddings/      # Embeddings database\n├── scoring/         # Scoring/ranking system\n├── pipeline/       # Pipeline components\n│   ├── audio/      # Audio processing\n│   ├── data/       # Data extraction\n│   ├── image/      # Image processing\n│   ├── llm/        # LLM generation\n│   ├── text/       # Text processing\n│   └── train/      # Model training\n└── workflow/       # Workflow engine\n```\n\n### Dependency Groups\n\nThe project uses optional dependency groups for modular installation:\n\n| Extra | Purpose | Key Dependencies |\n|-------|---------|------------------|\n| `default` | Core functionality | faiss-cpu, numpy, torch, transformers |\n| `api` | REST API | fastapi, uvicorn, aiohttp |\n| `pipeline` | All pipelines | Combination of all pipeline extras |\n| `agent` | Agent system | smolagents, mcpadapt, jinja2 |\n| `vectors` | Vector models | sentence-transformers, litellm |\n\n资料来源：[setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n\n## Data Flow Architecture\n\n### Semantic Search Flow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Embeddings\n    participant ANN\n    participant Documents\n    \n    User->>Embeddings: index(documents)\n    Embeddings->>ANN: build(vector)\n    Embeddings->>Documents: store(data)\n    \n    User->>Embeddings: search(query)\n    Embeddings->>ANN: query(vector)\n    ANN-->>Embeddings: top-k results\n    Embeddings->>Documents: retrieve(ids)\n    Documents-->>Embeddings: document content\n    Embeddings-->>User: ranked results\n```\n\n### RAG Pipeline 
Flow\n\n```mermaid\ngraph TD\n    Question[\"User Question\"] --> Embeddings[\"Embeddings DB\"]\n    Embeddings --> Context[\"Retrieved Context\"]\n    Context --> LLM[\"LLM Pipeline\"]\n    LLM --> Answer[\"Generated Answer\"]\n    \n    subgraph \"Indexing Phase\"\n        Documents[\"Documents\"] --> Textractor[\"Textractor\"]\n        Textractor --> Chunks[\"Text Chunks\"]\n        Chunks --> Embeddings\n    end\n```\n\n## Cloud Integration\n\n### Hub Module\n\nThe Hub module enables model and data sharing via Hugging Face:\n\n```python\ndef upload(self, path):\n    with open(path, \"r\", encoding=\"utf-8\") as f:\n        content = f.read()\n    \n    if \"embeddings \" not in content:\n        content += \"documents filter=lfs diff=lfs merge=lfs -text\\n\"\n        content += \"embeddings filter=lfs diff=lfs merge=lfs -text\\n\"\n    \n    huggingface_hub.upload_file(...)\n```\n\nFeatures:\n- Automatic LFS tracking for embeddings\n- Token-based authentication\n- Remote index management\n\n资料来源：[src/python/txtai/cloud/hub.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/cloud/hub.py)\n\n## Use Cases Built on Architecture\n\n| Use Case | Architecture Components |\n|----------|------------------------|\n| Semantic Search | Embeddings + ANN Index + Workflow |\n| RAG | Embeddings + LLM Pipeline + Template Task |\n| LLM Orchestration | Agent System + Tool Factory + LLM Pipeline |\n| Document Processing | Data Pipeline + Embeddings + Workflow |\n| Multi-model Workflows | Pipeline Composition + Workflow Engine |\n\n## Summary\n\nThe txtai architecture provides a flexible, extensible foundation for AI applications:\n\n1. **Embeddings Database** - Unified vector/graph/relational indexing\n2. **Pipeline System** - Modular processing for diverse data types\n3. **Agent System** - Tool-augmented autonomous AI\n4. **Workflow Engine** - Task orchestration and automation\n5. 
**Cloud Integration** - Model sharing and deployment\n\nThis modular design enables developers to start with minimal installations and scale up capabilities as needed, supporting everything from simple semantic search to complex multi-model RAG systems with autonomous agents.\n\n---\n\n<a id='embeddings-core'></a>\n\n## Embeddings Database\n\n### 相关页面\n\n相关主题：[System Architecture](#architecture), [Scoring and Retrieval Algorithms](#scoring-algorithms)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python/txtai/embeddings/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/__init__.py)\n- [src/python/txtai/embeddings/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/base.py)\n- [src/python/txtai/embeddings/index/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/index/__init__.py)\n- [src/python/txtai/embeddings/search/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/search/__init__.py)\n- [src/python/txtai/vectors/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/vectors/__init__.py)\n- [docs/embeddings/index.md](https://github.com/neuml/txtai/blob/main/docs/embeddings/index.md)\n</details>\n\n# Embeddings Database\n\nThe Embeddings Database is the foundational component of txtai, providing unified vector search capabilities that combine sparse and dense vector indexing, graph networks, and relational database functionality.\n\n## Overview\n\nThe txtai embeddings database serves as a comprehensive solution for semantic search and vector-based data retrieval. It extends traditional database capabilities by incorporating neural network-based embedding generation and similarity search.\n\n**Core Architecture:**\n\n```mermaid\ngraph TD\n    A[Embeddings Database] --> B[Vector Indexes]\n    A --> C[Graph Networks]\n    A --> D[Relational Database]\n    B --> B1[Dense Vectors]\n    B --> B2[Sparse Vectors]\n```\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Key Features\n\n| Feature | Description |\n|---------|-------------|\n| Vector Search | Semantic similarity search using dense and sparse embeddings |\n| SQL Support | Traditional database queries combined with vector search |\n| Content Storage | Built-in document content storage |\n| Graph Analysis | Integration with graph networks for relationship-based queries |\n| Multimodal Indexing | Support for text, documents, audio, images, and video |\n| Topic Modeling | Built-in topic modeling capabilities |\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Installation and Setup\n\n### Basic Installation\n\n```python\nimport txtai\n\nembeddings = txtai.Embeddings()\n```\n\n### Configuration Options\n\nThe embeddings database can be configured with the following key parameters:\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `path` | string | None | Model path (Hugging Face, llama.cpp, Ollama, vLLM) |\n| `content` | bool | False | Enable content storage |\n| `maxlength` | int | None | Maximum sequence length |\n| `quantize` | bool | False | Enable model quantization |\n| `gpu` | bool | True | Enable GPU inference |\n\n资料来源：[examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n\n## Core Operations\n\n### Indexing Documents\n\n```python\nembeddings = txtai.Embeddings()\nembeddings.index([\"Correct\", \"Not what we 
hoped\"])\n```\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n### Semantic Search\n\n```python\nresults = embeddings.search(\"positive\", 1)\n# Returns: [(0, 0.29862046241760254)]\n```\n\nThe search method returns tuples of (index, score) ordered by similarity.\n\n## Architecture Components\n\n### Vector Indexes\n\nThe embeddings database supports multiple vector index backends:\n\n| Backend | Package | Configuration |\n|---------|---------|---------------|\n| FAISS | `faiss-cpu` | Default backend |\n| Annoy | `annoy` | Via `ann` extra |\n| HNSWLib | `hnswlib` | Via `ann` extra |\n| pgvector | `pgvector` | Via `ann` extra |\n| sqlite-vec | `sqlite-vec` | Via `ann` extra |\n\n资料来源：[setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n\n### Vector Models\n\nSupported embedding model providers:\n\n- **Sentence Transformers**: `sentence-transformers>=5.0.0`\n- **ONNX Models**: `onnx>=1.11.0`, `onnxruntime>=1.11.0`\n- **llama.cpp**: `llama-cpp-python>=0.2.75`\n- **LiteLLM**: `litellm>=1.37.16`\n- **Model2Vec**: `model2vec>=0.3.0`\n- **Static Vectors**: `staticvectors>=0.2.0`\n\n资料来源：[setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n\n## Data Flow\n\n```mermaid\ngraph LR\n    A[Input Text] --> B[Tokenizer]\n    B --> C[Embedding Model]\n    C --> D[Vector Index]\n    D --> E[Search Query]\n    E --> F[Similarity Score]\n    F --> G[Results]\n    \n    H[(Content Store)] --> G\n```\n\n## RAG Integration\n\nThe embeddings database serves as the knowledge source for Retrieval Augmented Generation (RAG) workflows:\n\n```python\nfrom txtai import Embeddings, RAG\n\n# Build embeddings database\nembeddings = Embeddings(content=True, path=\"Qwen/Qwen3-Embedding-0.6B\", maxlength=2048)\nembeddings.index(chunks)\n\n# Create RAG pipeline\nrag = RAG(\n    embeddings,\n    \"Qwen/Qwen3-0.6B\",\n    system=\"You are a friendly assistant\",\n    template=template,\n    output=\"flatten\",\n)\n```\n\n资料来源：[examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n\n### RAG Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `similarity` | Embeddings/Similarity | Knowledge source database |\n| `path` | string | LLM model path |\n| `quantize` | bool | Enable model quantization |\n| `gpu` | bool | Enable GPU inference |\n| `template` | string | Prompt template |\n| `context` | int | Maximum context length |\n| `minscore` | float | Minimum similarity score |\n| `mintokens` | int | Minimum token count |\n\n资料来源：[src/python/txtai/pipeline/llm/rag.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/rag.py)\n\n## API Deployment\n\nThe embeddings database can be exposed as a REST API:\n\n```yaml\n# app.yml\nembeddings:\n    path: sentence-transformers/all-MiniLM-L6-v2\n```\n\n```bash\nCONFIG=app.yml uvicorn \"txtai.api:app\"\ncurl -X GET \"http://localhost:8000/search?query=positive\"\n```\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Workflow Integration\n\nThe embeddings database integrates with txtai workflows for processing pipelines:\n\n```python\nfrom txtai.workflow import Workflow, Task\n\n# Text extraction workflow\ntextractor = Textractor()\nchunks = []\nfor f, chunk in textractor(files):\n    chunks.append((f, chunk))\n\n# Index extracted content\nembeddings = Embeddings(content=True, 
path=\"Qwen/Qwen3-Embedding-0.6B\")\nembeddings.index(chunks)\n```\n\n资料来源：[examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n\n## Dependencies\n\n### Core Dependencies\n\n| Package | Version | Purpose |\n|---------|---------|---------|\n| `faiss-cpu` | >=1.7.1.post2 | Default vector index |\n| `torch` | >=2.4 | Tensor operations |\n| `transformers` | >=4.56.2 | Model inference |\n| `huggingface-hub` | >=0.34.0 | Model management |\n| `numpy` | >=1.18.4 | Numerical operations |\n| `safetensors` | >=0.4.5 | Model serialization |\n\n资料来源：[setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n\n### Optional Dependencies\n\n| Extra | Packages | Use Case |\n|-------|----------|----------|\n| `ann` | annoy, hnswlib, pgvector | Alternative indexes |\n| `vectors` | sentence-transformers, litellm | Vector models |\n| `similarity` | ann + vectors | Full search capabilities |\n\n## Use Cases\n\n### Semantic Search\n\nTraditional search systems use keywords to find data. Semantic search understands natural language and identifies results with the same meaning, not necessarily the same keywords.\n\n### Knowledge Base for LLMs\n\nThe embeddings database serves as a powerful knowledge source for LLM applications, enabling retrieval augmented generation (RAG) processes.\n\n### Multimodal Applications\n\n- **Text**: Document embeddings for text classification and clustering\n- **Images**: Visual similarity search\n- **Audio**: Speech-to-text with semantic search\n- **Video**: Frame-level semantic indexing\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Performance Considerations\n\n- **Index Size**: Vector indexes scale with document count and embedding dimensionality\n- **GPU Acceleration**: Enable `gpu=True` for faster inference on compatible hardware\n- **Quantization**: Use `quantize=True` to reduce memory footprint with minimal accuracy loss\n- **Batch Processing**: Index documents in batches for optimal throughput\n\n## See Also\n\n- [RAG Pipeline](https://neuml.github.io/txtai/pipeline/text/rag) - Retrieval Augmented Generation\n- [Similarity Pipeline](https://neuml.github.io/txtai/pipeline/text/similarity) - Text similarity scoring\n- [Workflows](https://neuml.github.io/txtai/workflows) - Pipeline orchestration\n- [Agent Framework](https://neuml.github.io/txtai/agent) - Autonomous AI agents\n\n---\n\n<a id='pipelines'></a>\n\n## Pipelines\n\n### 相关页面\n\n相关主题：[Workflows](#workflows), [Agents](#agents)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python/txtai/pipeline/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/__init__.py)\n- [src/python/txtai/pipeline/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/factory.py)\n- [src/python/txtai/pipeline/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/base.py)\n- [src/python/txtai/pipeline/text/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/text/__init__.py)\n- [src/python/txtai/pipeline/audio/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/audio/__init__.py)\n- [src/python/txtai/pipeline/llm/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/__init__.py)\n- [docs/pipeline/index.md](https://github.com/neuml/txtai/blob/main/docs/pipeline/index.md)\n</details>\n\n# Pipelines\n\n## Overview\n\nPipelines are the core processing components in txtai that power language 
model workflows. They provide a unified interface for various text processing, transformation, and generation tasks including text extraction, summarization, transcription, translation, and LLM inference.\n\nThe pipeline system enables txtai to work with text, documents, audio, images and video by providing modular, composable processing units that can be chained together to build complex workflows. 资料来源：[setup.py](setup.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    subgraph \"Pipeline Categories\"\n        Text[Text Pipelines]\n        Audio[Audio Pipelines]\n        LLM[LLM Pipelines]\n        Data[Data Pipelines]\n    end\n    \n    subgraph \"Base Infrastructure\"\n        Base[Pipeline Base Class]\n        Factory[Pipeline Factory]\n    end\n    \n    Text --> Base\n    Audio --> Base\n    LLM --> Base\n    Data --> Base\n    Base --> Factory\n```\n\n## Pipeline Categories\n\n### Text Pipelines\n\nText pipelines handle natural language processing tasks such as entity recognition, text classification, and language detection.\n\n| Pipeline | Description | Key Features |\n|----------|-------------|--------------|\n| Labels | Text classification and labeling | Supports GLiNER-based models for NER and classification |\n| Translation | Language translation | Powered by transformers |\n| Segmentation | Text splitting | Sentence, line, paragraph, and section segmentation |\n\n资料来源：[src/python/txtai/pipeline/text/__init__.py](src/python/txtai/pipeline/text/__init__.py)\n\n### Audio Pipelines\n\nAudio pipelines enable speech-to-text transcription and processing of audio content.\n\n| Component | Description |\n|-----------|-------------|\n| Transcription | Convert audio to text using webrtcvad for voice activity detection |\n| Audio processing | Support for sounddevice and soundfile libraries |\n\nThese pipelines require `scipy`, `sounddevice`, `soundfile`, and `webrtcvad-wheels` dependencies. 资料来源：[setup.py](setup.py)\n\n### LLM Pipelines\n\nLLM (Large Language Model) pipelines provide interfaces for text generation, chat completion, and vision-capable model interactions.\n\n```mermaid\ngraph LR\n    A[Input Text/List/Dict] --> B[Generator]\n    B --> C[stream]\n    B --> D[batch]\n    C --> E[Streaming Response]\n    D --> F[Complete Response]\n```\n\nThe LLM pipeline supports multiple backends including HuggingFace transformers and liteLLM for unified LLM access. 
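As a rough sketch of how backend selection surfaces in code, the snippet below loads a local model through the transformers backend; the model name is a placeholder and the commented LiteLLM line is an assumption based on the LLM pipeline documentation rather than a verified call.\n\n```python\nfrom txtai import LLM\n\n# Local transformers backend, selected automatically from the model path (placeholder model)\nllm = LLM("Qwen/Qwen3-0.6B")\n\n# Hosted models can be routed through the LiteLLM backend; the method argument shown here\n# is an assumption based on the LLM pipeline docs and may differ by version\n# llm = LLM("gpt-4o-mini", method="litellm")\n\nprint(llm("Say hello in one sentence."))\n```\n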
Key capabilities include:\n\n- **Chat completion**: Supports both raw prompts and structured chat templates\n- **Vision support**: Can process image inputs with vision-capable models\n- **Streaming**: Real-time token-by-token response streaming\n- **Stop sequences**: Configurable stop tokens for controlled generation\n\n资料来源：[src/python/txtai/pipeline/llm/llm.py](src/python/txtai/pipeline/llm/llm.py), [src/python/txtai/pipeline/llm/huggingface.py](src/python/txtai/pipeline/llm/huggingface.py)\n\n### Data Pipelines\n\nData pipelines handle document parsing, text extraction, and content conversion.\n\n#### Textractor\n\nThe Textractor class extracts text from files using various backends:\n\n```python\nTextractor(\n    sentences=False,      # Split into sentences\n    lines=False,          # Split into lines\n    paragraphs=False,     # Split into paragraphs\n    minlength=None,       # Minimum text length filter\n    join=False,           # Join segments\n    sections=False,       # Extract sections\n    cleantext=True,       # Clean extracted text\n    chunker=None,         # Custom text chunker\n    headers=None,         # HTTP headers for URLs\n    backend=\"available\",  # \"tika\", \"docling\", or \"available\"\n    safeopen=False        # Restrict to safe URLs only\n)\n```\n\nSupported backends:\n- **Tika**: Apache Tika for comprehensive document parsing\n- **Docling**: Alternative backend using docling library\n\n资料来源：[src/python/txtai/pipeline/data/textractor.py](src/python/txtai/pipeline/data/textractor.py)\n\n#### FileToHTML\n\nConverts various file formats to HTML for further processing.\n\n```mermaid\ngraph LR\n    A[File Input] --> B[Backend Selection]\n    B --> C{Tika Available?}\n    C -->|Yes| D[Tika Backend]\n    C -->|No| E{Docling Available?}\n    E -->|Yes| F[Docling Backend]\n    E -->|No| G[Return None]\n    D --> H[HTML Output]\n    F --> H\n```\n\nThe backend detection follows this priority:\n1. Tika (if `tika` package is installed)\n2. Docling (if `docling` package is installed)\n3. Returns `None` if no backend available\n\n资料来源：[src/python/txtai/pipeline/data/filetohtml.py](src/python/txtai/pipeline/data/filetohtml.py)\n\n#### HTMLToMarkdown\n\nConverts HTML content to Markdown format, supporting paragraph and section-level processing.\n\n## Pipeline Factory\n\nThe pipeline factory pattern enables dynamic pipeline instantiation based on configuration:\n\n```python\n# Via factory\npipeline = PipelineFactory.create(config)\n\n# Direct instantiation\npipeline = Summary(\"model-name\")\npipeline = Textractor()\n```\n\nThis allows pipelines to be defined in YAML configuration files and instantiated programmatically. 
资料来源：[src/python/txtai/pipeline/factory.py](src/python/txtai/pipeline/factory.py)\n\n## Common Patterns\n\n### Text Processing Chain\n\n```mermaid\ngraph TD\n    A[File/URL] --> B[Textractor]\n    B --> C[Segmentation]\n    C --> D[Text Clean]\n    D --> E[Ready for Embeddings]\n```\n\n### LLM Generation Flow\n\n```python\nfrom txtai.pipeline import LLM\n\n# Direct model loading\nllm = LLM(\"model-name\")\n\n# Generate with various input formats\nresult = llm(\"What is artificial intelligence?\")\nresult = llm([{\"role\": \"user\", \"content\": \"Hello\"}])\nresult = llm(\"Generate a summary\", stream=True)\n```\n\nParameters supported by LLM pipelines:\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| text | str/list/dict | required | Input text or messages |\n| maxlength | int | 512 | Maximum sequence length |\n| stream | bool | False | Enable streaming response |\n| stop | list | None | Stop sequences |\n| defaultrole | str | \"auto\" | Default role for text inputs |\n| stripthink | bool | None | Strip thinking tags |\n\n资料来源：[src/python/txtai/pipeline/llm/llm.py](src/python/txtai/pipeline/llm/llm.py)\n\n## Dependencies\n\nPipelines are organized into optional dependency groups:\n\n```python\nextras[\"pipeline-audio\"]   # Audio processing\nextras[\"pipeline-data\"]     # Document parsing\nextras[\"pipeline-image\"]    # Image processing\nextras[\"pipeline-llm\"]      # LLM inference\nextras[\"pipeline-text\"]     # NLP tasks\nextras[\"pipeline-train\"]    # Model training\n```\n\nFull pipeline installation:\n```bash\npip install txtai[pipeline]\n```\n\n资料来源：[setup.py](setup.py)\n\n## Integration with Workflows\n\nPipelines integrate seamlessly with the workflow system:\n\n```python\nfrom txtai.workflow import Workflow, Task\nfrom txtai.pipeline import Summary, Textractor\n\n# Create a pipeline-based workflow\nworkflow = Workflow([\n    Task(Textractor(paragraphs=True)),\n    Task(Summary())\n])\n\n# Process documents\nresults = workflow([\"path/to/document.pdf\"])\n```\n\nThe TemplateTask enables prompt template processing within workflows, supporting string formatting with named or positional parameters. 
资料来源：[src/python/txtai/workflow/task/template.py](src/python/txtai/workflow/task/template.py)\n\n## See Also\n\n- [Workflows](workflow.md) - Chaining pipelines together\n- [Embeddings](embeddings.md) - Vector indexing and search\n- [Agents](agent.md) - LLM-powered autonomous agents\n\n---\n\n<a id='workflows'></a>\n\n## Workflows\n\n### 相关页面\n\n相关主题：[Pipelines](#pipelines)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python/txtai/workflow/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/__init__.py)\n- [src/python/txtai/workflow/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/base.py)\n- [src/python/txtai/workflow/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/factory.py)\n- [src/python/txtai/workflow/task/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/__init__.py)\n- [src/python/txtai/workflow/task/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/base.py)\n- [src/python/txtai/workflow/task/template.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/template.py)\n- [examples/workflow_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/workflow_quickstart.py)\n</details>\n\n# Workflows\n\n## Overview\n\nWorkflows in txtai provide a declarative, pipeline-based approach to processing data through a series of connected tasks. They enable developers to chain together multiple processing components—including text extraction, summarization, translation, and large language model (LLM) calls—into cohesive data processing pipelines.\n\nThe workflow system supports:\n\n- **Deterministic pipelines**: Define processing steps that execute in sequence\n- **LLM-powered workflows**: Integrate language model prompts for intelligent processing\n- **Parallel execution**: Configure thread or process-based concurrency\n- **Flexible data transformation**: Handle one-to-many and many-to-one data transformations\n- **Template-based prompts**: Generate prompts dynamically using template formatting\n\n资料来源：[examples/workflow_quickstart.py:1-25]()\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Input Data] --> B[Workflow]\n    B --> C[Task 1: Textractor]\n    C --> D[Task 2: Summary]\n    D --> E[Task N: Translation/LLM]\n    E --> F[Output Results]\n    \n    G[Task Configuration] --> B\n    H[Initialize Action] --> C\n    I[Finalize Action] --> E\n```\n\n### Core Components\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| `Workflow` | `workflow/__init__.py` | Main workflow orchestration class |\n| `WorkflowBase` | `workflow/base.py` | Base class with core workflow logic |\n| `Task` | `workflow/task/base.py` | Base class for all workflow tasks |\n| `TemplateTask` | `workflow/task/template.py` | Task with template prompt support |\n| `WorkflowFactory` | `workflow/factory.py` | Factory for workflow instantiation |\n\n资料来源：[src/python/txtai/workflow/__init__.py:1-50]()\n\n## Workflow Configuration\n\n### Python API\n\n```python\nfrom txtai import Workflow\nfrom txtai.workflow import Task\n\nworkflow = Workflow([\n    Task(textractor),\n    Task(summary),\n    Task(translate)\n])\n\n# Execute workflow\nresults = list(workflow(input_data))\n```\n\n### YAML Configuration\n\n```yaml\nworkflow:\n  tasks:\n    - action: textractor\n      task: textractor\n    - action: summary\n      task: summary\n```\n\n资料来源：[src/python/txtai/workflow/factory.py:1-30]()\n\n## Task System\n\n### Task 
Base Class\n\nThe `Task` class is the fundamental building block of workflows. Each task defines:\n\n- **Action**: Callable function or list of functions to execute\n- **Select**: Filter functions to select specific data\n- **Concurrency**: Thread or process-based execution\n- **Data handling**: Unpacking, column selection, and merging strategies\n\n```mermaid\ngraph LR\n    A[Input Element] --> B{Selection Filter}\n    B -->|Pass| C[Action Execution]\n    B -->|Fail| D[Skip Element]\n    C --> E[One-to-Many Transform]\n    E --> F[Output Elements]\n```\n\n#### Task Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `action` | callable/list | `None` | Action(s) to execute on each data element |\n| `select` | callable/list | `None` | Filter(s) to select data to process |\n| `unpack` | bool | `True` | Unwrap data from (id, data, tag) tuples |\n| `column` | int | `None` | Column index to select from tuple elements |\n| `merge` | str | `\"hstack\"` | Merge mode for multi-action outputs |\n| `initialize` | callable | `None` | Action executed before processing |\n| `finalize` | callable | `None` | Action executed after processing |\n| `concurrency` | str | `None` | `\"thread\"` or `\"process\"` for parallel execution |\n| `onetomany` | bool | `True` | Enable one-to-many data transformations |\n\n资料来源：[src/python/txtai/workflow/task/base.py:15-45]()\n\n### TemplateTask\n\n`TemplateTask` extends the base `Task` class with template processing capabilities for generating LLM prompts.\n\n```python\nfrom txtai.workflow.task import TemplateTask\n\ndef summary_template(text):\n    return f\"\"\"Summarize the following text in 40 words or less.\n    \n{text}\n\"\"\"\n\ndef translate_template(text, language):\n    return f\"\"\"Translate the following text to {language}.\n    \n{text}\n\"\"\"\n\nworkflow = Workflow([\n    Task(textractor),\n    Task(llm, action=lambda inputs: llm([summary_template(x) for x in inputs])),\n    Task(llm, action=lambda inputs: llm([translate_template(x, \"fr\") for x in inputs]))\n])\n```\n\n#### Template Processing\n\n| Element Type | Template Behavior |\n|--------------|-------------------|\n| `dict` | Pass as named template parameters |\n| `tuple` | Pass as `arg0`...`argN` template parameters |\n| `str` | Pass as `{text}` parameter |\n\n资料来源：[src/python/txtai/workflow/task/template.py:1-40]()\n\n## Workflow Execution\n\n### Basic Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Workflow\n    participant Task1\n    participant TaskN\n    \n    User->>Workflow: workflow(data)\n    Workflow->>Task1: process(input)\n    Task1->>Task1: select()\n    Task1->>Task1: action()\n    Task1->>Task1: onetomany transform\n    Task1-->>TaskN: intermediate results\n    TaskN->>TaskN: select()\n    TaskN->>TaskN: action()\n    TaskN-->>User: final results\n```\n\n### Concurrency Modes\n\n| Mode | Use Case | Configuration |\n|------|----------|---------------|\n| `thread` | I/O-bound tasks | `concurrency=\"thread\"` |\n| `process` | CPU-bound tasks | `concurrency=\"process\"` |\n| `None` | Sequential execution | Default |\n\n资料来源：[src/python/txtai/workflow/base.py:1-60]()\n\n## Pipeline Components\n\n### Available Pipeline Tasks\n\n| Pipeline | Purpose | Example Usage |\n|----------|---------|---------------|\n| `Textractor` | Extract text from URLs/files | `Task(textractor)` |\n| `Summary` | Generate text summaries | `Task(summary)` |\n| `Translation` | Translate between languages | 
`Task(translate)` |\n| `LLM` | Execute LLM prompts | `Task(llm)` |\n| `Segmentation` | Split text into segments | `Task(segmentation)` |\n| `Transcription` | Audio to text | `Task(transcription)` |\n\n### Quick Start Example\n\n```python\nfrom txtai import LLM, Workflow\nfrom txtai.pipeline import Summary, Textractor, Translation\nfrom txtai.workflow import Task\n\n# Step 1: Define available pipelines\ntextractor = Textractor(backend=\"docling\", headers={\"user-agent\": \"Mozilla/5.0\"})\nsummary = Summary()\ntranslate = Translation()\n\n# Step 2: Define workflow tasks\nworkflow = Workflow([\n    Task(textractor),           # Extract text from URL\n    Task(summary),              # Generate summary\n    Task(lambda inputs: [translate(x, \"fr\") for x in inputs])  # Translate to French\n])\n\n# Step 3: Run the workflow\nresults = list(workflow([\"https://neuml.com\"]))\n```\n\n资料来源：[examples/workflow_quickstart.py:1-30]()\n\n### LLM-Powered Workflow Example\n\n```python\nfrom txtai import LLM, Workflow\nfrom txtai.pipeline import Textractor\nfrom txtai.workflow import Task\n\ntextractor = Textractor(backend=\"docling\")\nllm = LLM(\"Qwen/Qwen3-4B-Instruct-2507\")\n\ndef summary(text):\n    return f\"\"\"Summarize the following text in 40 words or less.\n{text}\n\"\"\"\n\ndef translate(text, language):\n    return f\"\"\"Translate the following text to {language}.\n{text}\n\"\"\"\n\nworkflow = Workflow([\n    Task(textractor),\n    Task(lambda inputs: llm([summary(x) for x in inputs], maxlength=25000)),\n    Task(lambda inputs: llm([translate(x, \"fr\") for x in inputs], maxlength=25000))\n])\n\nresults = list(workflow([\"https://neuml.com\"]))\n```\n\n资料来源：[examples/workflow_quickstart.py:35-60]()\n\n## Data Flow\n\n```mermaid\ngraph LR\n    A[(Input Data)] --> B[Workflow]\n    B --> C[Task 1: Textractor]\n    C --> D[Intermediate Data]\n    D --> E[Task 2: Summary]\n    E --> F[Intermediate Data]\n    F --> G[Task N: Custom Action]\n    G --> H[(Output Results)]\n    \n    I[URL Input] -->|UrlTask| C\n    J[File Input] -->|FileTask| C\n    K[Stream Input] -->|StreamTask| C\n```\n\n### Input/Output Handling\n\n- **URL input**: Processed via `UrlTask` wrapper\n- **File input**: Processed via `FileTask` wrapper\n- **Stream input**: Processed via `StreamTask` wrapper\n- **Tuple unpacking**: Supports `(id, data, tag)` format for labeled data\n\n资料来源：[src/python/txtai/workflow/base.py:60-100]()\n\n## Advanced Configuration\n\n### Custom Task Actions\n\n```python\nfrom txtai.workflow import Task\n\ndef custom_filter(element):\n    \"\"\"Filter elements before processing\"\"\"\n    return len(element) > 10\n\ndef custom_action(element):\n    \"\"\"Process single element\"\"\"\n    return element.upper()\n\ndef custom_finalize(results):\n    \"\"\"Aggregate results after processing\"\"\"\n    return sorted(results, key=lambda x: x[1], reverse=True)\n\nworkflow = Workflow([\n    Task(\n        action=custom_action,\n        select=custom_filter,\n        finalize=custom_finalize,\n        concurrency=\"thread\"\n    )\n])\n```\n\n### Workflow Factory\n\n```python\nfrom txtai.workflow import WorkflowFactory\n\n# Create workflow from configuration\nworkflow = WorkflowFactory.create(config)\n\n# Run with data\nresults = list(workflow(data))\n```\n\n资料来源：[src/python/txtai/workflow/factory.py:1-50]()\n\n## Best Practices\n\n1. **Use appropriate concurrency**: Set `concurrency=\"thread\"` for I/O-bound tasks (API calls, file operations) and `concurrency=\"process\"` for CPU-intensive tasks.\n\n2. 
**Leverage one-to-many transforms**: Tasks can produce multiple outputs from single inputs; enable this with `onetomany=True`.\n\n3. **Template formatting**: Use `TemplateTask` for consistent prompt formatting across LLM calls.\n\n4. **Error handling**: Implement `select` filters to skip invalid data before reaching action processing.\n\n5. **Pipeline selection**: Only install necessary pipeline extras, for example `pip install txtai[pipeline-data]` for text extraction workflows.\n\n资料来源：[examples/workflow_quickstart.py:1-25]()\n\n## See Also\n\n- [Embeddings](embeddings.md) - Vector search and similarity indexing\n- [Pipelines](pipelines.md) - Language model pipelines\n- [LLM Configuration](llm.md) - Large language model setup\n- [API Documentation](api.md) - REST API reference\n\n---\n\n<a id='agents'></a>\n\n## Agents\n\n### 相关页面\n\n相关主题：[Pipelines](#pipelines), [Workflows](#workflows)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python/txtai/agent/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/__init__.py)\n- [src/python/txtai/agent/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/base.py)\n- [src/python/txtai/agent/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/factory.py)\n- [src/python/txtai/agent/tool/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/__init__.py)\n- [src/python/txtai/agent/tool/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/factory.py)\n- [src/python/txtai/agent/tool/function.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/function.py)\n- [docs/agent/index.md](https://github.com/neuml/txtai/blob/main/docs/agent/index.md)\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py) (agent dependencies)\n</details>\n\n# Agents\n\nAgents in txtai enable autonomous task execution by combining Large Language Models (LLMs) with a flexible tool system. The agent framework allows building AI-powered workflows that can reason, plan, and execute actions through configurable tools.
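\n\nAs a quick first look before the details, here is a minimal sketch of creating and running an agent. It only uses the `tools` and `llm` options documented in the configuration table below; the model path, the `webview` tool alias and the prompt are illustrative placeholders, so adjust them to your environment.\n\n```python\nfrom txtai import Agent\n\n# Minimal agent sketch: wire an LLM to a tool list (values are placeholders)\nagent = Agent(\n    tools=["webview"],                 # tool alias resolved by the ToolFactory\n    llm="Qwen/Qwen3-4B-Instruct-2507"  # any LLM pipeline-compatible model path\n)\n\n# The agent plans tool calls, runs them and returns a final answer\nprint(agent("Read https://github.com/neuml/txtai and summarize what txtai does."))\n```\n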
\n\n## Overview\n\ntxtai agents are built on top of [smolagents](https://github.com/huggingface/smolagents) and provide:\n\n- **Tool-based execution**: Agents use tools to interact with external systems and data sources\n- **LLM orchestration**: Seamless integration with various LLM backends including local models, APIs, and managed services\n- **MCP support**: Integration with Model Context Protocol for extended tool collections\n- **Configurable behavior**: Flexible configuration for tool selection, prompt templates, and execution parameters\n\nThe agent architecture separates concerns between tool definition, tool management, and agent execution, enabling modular and extensible agent systems.\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[User Input] --> B[Agent Core]\n    B --> C[LLM Engine]\n    C --> D[Tool Selection]\n    D --> E[Tool Execution]\n    E --> F[Results]\n    F --> C\n    C --> G[Final Response]\n    \n    H[ToolFactory] --> I[FunctionTool]\n    H --> J[EmbeddingsTool]\n    H --> K[MCPTools]\n    H --> L[DefaultTools]\n    \n    subgraph Tool System\n        I\n        J\n        K\n        L\n    end\n```\n\n### Core Components\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| Agent Base | `agent/base.py` | Core agent logic and execution loop |\n| Agent Factory | `agent/factory.py` | Factory for creating configured agents |\n| Tool Factory | `agent/tool/factory.py` | Creates and manages tool instances |\n| Function Tool | `agent/tool/function.py` | Wraps Python functions as LLM tools |\n| Tool Init | `agent/tool/__init__.py` | Tool abstractions and shared interfaces |\n\n## Tool System\n\n### Tool Types\n\nThe tool system supports multiple tool creation patterns:\n\n| Type | Description | Source |\n|------|-------------|--------|\n| `FunctionTool` | Wraps a Python function with config | `agent/tool/function.py:1` |\n| `EmbeddingsTool` | Embeddings-based search and retrieval | `agent/tool/factory.py` |\n| `MCP Tools` | External MCP tool collections | `agent/tool/factory.py` |\n| Default Tools | Built-in system tools | `agent/tool/factory.py` |\n\n### FunctionTool\n\nThe `FunctionTool` class wraps descriptive configuration and injects it along with a target function into an LLM prompt.\n\n```python\nfrom smolagents import Tool\n\nclass FunctionTool(Tool):\n    \"\"\"\n    A FunctionTool takes descriptive configuration and injects it along with a target function into an LLM prompt.\n    \"\"\"\n```\n\n**Configuration Schema:**\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `name` | string | Yes | Tool identifier |\n| `description` | string | Yes | Human-readable tool description for LLM |\n| `inputs` | dict | Yes | Input parameter definitions |\n| `output` | string | No | Output type (default: `\"any\"`) |\n| `output_type` | string | No | Alternative output type field |\n| `target` | callable | Yes | Function to execute |\n\n### Tool Factory\n\nThe `ToolFactory` class handles tool creation from various configuration formats:\n\n```python\n@staticmethod\ndef create(config):\n    \"\"\"\n    Creates a new list of tools. This method iterates over the `tools` configuration option and creates a Tool instance\n    for each entry. 
This supports the following:\n\n      - Tool instance\n      - Dictionary with `name`, `description`, `inputs`, `output` and `target` function configuration\n      - String with a tool alias name\n\n    Returns:\n        list of tools\n    \"\"\"\n```\n\n**Supported Tool Input Formats:**\n\n1. **Tool Instance**: Direct `smolagents.Tool` objects\n2. **Dictionary**: Config with `name`, `description`, `inputs`, `output`, and `target`\n3. **String Alias**: Named shortcuts (e.g., `\"webview\"`, `\"defaults\"`)\n4. **MCP URL**: HTTP URLs for MCP tool collections\n\n### Default Tools\n\n| Tool Name | Description |\n|-----------|-------------|\n| `webview` | Web content reading and extraction |\n| `read` | File and content reading capabilities |\n| `defaults` | Loads all default tools |\n\n## Agent Configuration\n\n### Dependencies\n\nThe agent module requires the following dependencies (specified in `setup.py:17`):\n\n```python\nextras[\"agent\"] = [\n    \"jinja2>=3.1.6\",\n    \"mcpadapt>=0.1.0\",\n    \"smolagents>=1.23\",\n]\n```\n\n| Dependency | Version | Purpose |\n|------------|---------|---------|\n| `jinja2` | >=3.1.6 | Template processing for prompts |\n| `mcpadapt` | >=0.1.0 | MCP tool protocol adapter |\n| `smolagents` | >=1.23 | Core agent framework |\n\n### Configuration Options\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `llm` | dict | LLM configuration with model path and parameters |\n| `tools` | list | List of tool configurations or aliases |\n| `template` | string | Prompt template for agent instructions |\n| `max iterations` | int | Maximum agent execution iterations |\n| `tool` | string | Primary execution tool/function |\n\n## Agent Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant U as User\n    participant A as Agent\n    participant L as LLM\n    participant T as Tool Manager\n    \n    U->>A: Execute Task\n    A->>L: Generate Tool Call\n    L-->>A: Tool Selection\n    A->>T: Execute Tool\n    T-->>A: Tool Result\n    A->>L: Process Result\n    L-->>A: Response\n    A-->>U: Final Answer\n```\n\n## Integration with LLM Pipelines\n\nAgents integrate with txtai's LLM pipeline system:\n\n```python\n# From llm.py - Agent uses LLM generation\ndef generator(self, text, maxlength, stream, stop, defaultrole, stripthink, **kwargs):\n    \"\"\"\n    Runs LLM generation for agent reasoning and tool calls.\n    \"\"\"\n    return self.generator(text, maxlength, stream, stop, defaultrole, stripthink, **kwargs)\n```\n\n### LLM Integration Methods\n\n| Method | Purpose |\n|--------|---------|\n| `ischat()` | Check if the LLM supports chat mode |\n| `isvision()` | Check if the LLM supports vision/multimodal |\n| `generator()` | Execute LLM generation |\n\n## Tool Execution\n\nThe tool execution is handled by the `forward` method in `FunctionTool`:\n\n```python\ndef forward(self, *args, **kwargs):\n    \"\"\"\n    Runs target function.\n\n    Args:\n        args: positional args\n        kwargs: keyword args\n\n    Returns:\n        result\n    \"\"\"\n    return self.target(*args, **kwargs)\n```\n\n### Execution Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `args` | tuple | Positional arguments for tool function |\n| `kwargs` | dict | Keyword arguments for tool function |\n\n## Best Practices\n\n1. **Tool Descriptions**: Provide clear, detailed descriptions for each tool to help the LLM understand when and how to use it\n2. 
**Input Schemas**: Define complete input schemas with types and descriptions\n3. **Error Handling**: Tools should handle errors gracefully and return meaningful error messages\n4. **Configuration Validation**: Use the factory pattern to validate tool configurations before execution\n\n## See Also\n\n- [Pipeline LLM](../pipeline/llm/) - LLM configuration and execution\n- [Workflows](../workflow/) - Task orchestration\n- [Embeddings](../embeddings/) - Vector search capabilities\n\n---\n\n<a id='database-integration'></a>\n\n## Database Integration\n\n### 相关页面\n\n相关主题：[Embeddings Database](#embeddings-core), [Scoring and Retrieval Algorithms](#scoring-algorithms)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n- [src/python/txtai/pipeline/data/textractor.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/data/textractor.py)\n- [src/python/txtai/embeddings/index/documents.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/index/documents.py)\n- [src/python/txtai/cloud/hub.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/cloud/hub.py)\n- [examples/workflows.py](https://github.com/neuml/txtai/blob/main/examples/workflows.py)\n- [src/python/txtai/workflow/task/template.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/template.py)\n</details>\n\n# Database Integration\n\n## Overview\n\ntxtai provides comprehensive database integration capabilities that enable vector search combined with structured data storage. The database layer is a core component of the embeddings database architecture, which unifies vector indexes (sparse and dense), graph networks, and relational databases into a single powerful system.\n\n资料来源：[setup.py:42-44]()\n\nThe database integration serves as the foundation for:\n\n- Persistent storage of indexed documents and embeddings\n- SQL-based querying alongside vector similarity search\n- Integration with external data sources and cloud storage\n- Workflow task management and state persistence\n\n## Architecture\n\nThe database layer in txtai follows a modular architecture with several key components:\n\n```mermaid\ngraph TD\n    A[Embeddings Database] --> B[Database Layer]\n    B --> C[SQLAlchemy Backend]\n    B --> D[DuckDB Backend]\n    C --> E[Schema Models]\n    D --> E\n    B --> F[Database Factory]\n    F --> G[Database Client]\n    G --> H[Query Engine]\n    H --> I[Document Storage]\n    H --> J[Vector Indexes]\n```\n\n### Key Components\n\n| Component | Purpose | Source |\n|-----------|---------|--------|\n| Database Base | Abstract base class defining database operations | [src/python/txtai/database/base.py]() |\n| Database Client | Client interface for executing queries | [src/python/txtai/database/client.py]() |\n| Database Factory | Factory pattern for creating database instances | [src/python/txtai/database/factory.py]() |\n| Schema Models | SQLAlchemy ORM models for data structures | [src/python/txtai/database/schema/__init__.py]() |\n\n## Supported Databases\n\n### SQLite/PostgreSQL with SQLAlchemy\n\ntxtai supports traditional relational databases through SQLAlchemy, providing cross-database compatibility. 
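A short sketch of content storage with the default SQLite backend is shown below; the commented PostgreSQL line is an assumption based on the configuration docs (content accepting a SQLAlchemy connection string) and should be verified against your txtai version.\n\n```python\nfrom txtai import Embeddings\n\n# content=True stores document content in the default SQLite database alongside the vector index\nembeddings = Embeddings(path="sentence-transformers/all-MiniLM-L6-v2", content=True)\n\n# For PostgreSQL, content is assumed to accept a SQLAlchemy connection string instead, e.g.\n# content="postgresql+psycopg2://user:pass@localhost:5432/txtai"\n\nembeddings.index(["Correct", "Not what we hoped"])\n\n# With content storage enabled, search results can include the stored text, not just ids and scores\nprint(embeddings.search("positive", 1))\n```\n\n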
This enables seamless integration with:\n\n- **SQLite**: Lightweight, file-based database for local development\n- **PostgreSQL**: Production-grade relational database with advanced features\n- **pgvector**: PostgreSQL extension for vector similarity search\n\n资料来源：[setup.py:42-44]()\n\n```python\n# Example configuration\nembeddings:\n    path: sentence-transformers/all-MiniLM-L6-v2\n    database:\n        url: postgresql+psycopg2://user:pass@localhost:5432/txtai\n```\n\n### DuckDB\n\nDuckDB is an embedded analytical database that provides high-performance OLAP capabilities. txtai includes DuckDB as a database extra for analytical workloads.\n\n资料来源：[setup.py:42]()\n\n```python\nextras[\"database\"] = [\"duckdb>=0.8.0\", \"pillow>=7.1.2\", \"sqlalchemy>=2.0.20\"]\n```\n\n## Database Configuration\n\n### Configuration Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `url` | string | `sqlite:///txtai.db` | Database connection URL |\n| `path` | string | None | Path for file-based databases |\n| `indexed` | bool | True | Enable full-text indexing |\n| `content` | bool | True | Store original document content |\n| `objects` | string | `storage` | Object storage backend |\n| `embeddings` | string | `embeddings` | Embeddings storage directory |\n\n### Connection String Formats\n\n```\n# SQLite (default)\nsqlite:///txtai.db\n\n# PostgreSQL with pgvector\npostgresql://user:pass@localhost:5432/txtai\n\n# DuckDB\nduckdb:///path/to/database.duckdb\n\n# MySQL\nmysql+pymysql://user:pass@localhost:3306/txtai\n```\n\n## Document Storage System\n\nThe document storage system provides efficient handling of indexed content with support for batch operations and streaming.\n\n### Core Operations\n\n```python\nclass DocumentStorage:\n    def add(self, documents):\n        \"\"\"Add documents to storage\"\"\"\n        \n    def close(self):\n        \"\"\"Close and persist storage\"\"\"\n        \n    def delete(self, ids):\n        \"\"\"Remove documents by ID\"\"\"\n        \n    def get(self, ids):\n        \"\"\"Retrieve documents by ID\"\"\"\n```\n\n资料来源：[src/python/txtai/embeddings/index/documents.py:1-50]()\n\n### Batch Processing\n\nDocuments are processed in batches for optimal memory usage and performance:\n\n```python\n# Batch-based document indexing\nself.serializer.savestream(documents, self.documents)\nself.batch += 1\nself.size += len(documents)\n```\n\n## Integration with Pipeline Data Processing\n\nThe Textractor module demonstrates database integration with external data sources, supporting extraction from various file formats and URLs.\n\n资料来源：[src/python/txtai/pipeline/data/textractor.py:1-60]()\n\n### Supported Input Methods\n\n| Method | Description |\n|--------|-------------|\n| URL Fetching | Extract text from web pages via HTTP |\n| File Processing | Handle local and remote file paths |\n| Cloud Storage | Integration with cloud object storage |\n| Database Queries | Direct content retrieval from databases |\n\n### Safe Open Mode\n\nFor security-sensitive environments, Textractor supports safe open mode that restricts access to local temporary directories and non-private URLs only:\n\n```python\nself.safeopen = os.path.realpath(\n    tempfile.gettempdir() if isinstance(safeopen, bool) else safeopen\n) if safeopen else safeopen\n```\n\n## Cloud Storage Integration\n\ntxtai extends database capabilities with cloud storage integration for managing large embeddings and document collections.\n\n### HuggingFace Hub Integration\n\nThe 
cloud hub module provides seamless upload and download of index files to HuggingFace repositories:\n\n资料来源：[src/python/txtai/cloud/hub.py:1-60]()\n\n```python\n# Automatic LFS tracking for large files\nif \"embeddings \" not in content:\n    content += \"documents filter=lfs diff=lfs merge=lfs -text\\n\"\n    content += \"embeddings filter=lfs diff=lfs merge=lfs -text\\n\"\n```\n\n### Supported Cloud Providers\n\n| Provider | Package | Features |\n|----------|---------|----------|\n| HuggingFace Hub | `huggingface-hub` | Model hosting, datasets |\n| S3 Compatible | `apache-libcloud` | Object storage |\n| Google Cloud | `apache-libcloud` | Cloud storage |\n| Azure | `apache-libcloud` | Blob storage |\n\n## Workflow Database Tasks\n\nWorkflows can include database operations as part of complex data processing pipelines.\n\n### Task Configuration\n\n```python\nelif wtype == \"tabular\":\n    data[wtype] = component\n    tasks.append({\"action\": wtype})\n```\n\n资料来源：[examples/workflows.py:1-100]()\n\n### Workflow Task Types\n\n| Task Type | Description | Data Format |\n|-----------|-------------|-------------|\n| `tabular` | Database table operations | CSV, DataFrame |\n| `service` | Database connection pooling | Configuration |\n| `extract` | Data extraction to database | Various sources |\n\n## Query Patterns\n\n### Vector + SQL Hybrid Queries\n\ntxtai supports combining vector similarity search with traditional SQL filtering:\n\n```python\n# Semantic search with SQL filter\nresults = embeddings.search(\n    \"Find documents about machine learning\",\n    filter=\"category = 'technical' AND year > 2020\"\n)\n```\n\n### Batch Query Optimization\n\nFor high-throughput applications:\n\n```python\n# Batch query for multiple inputs\nqueries = [\"query1\", \"query2\", \"query3\"]\nresults = embeddings.batchsearch(queries, limit=10)\n```\n\n## Performance Considerations\n\n### Database Selection Guide\n\n| Use Case | Recommended Database |\n|----------|----------------------|\n| Local development | SQLite |\n| Production workloads | PostgreSQL + pgvector |\n| Analytical queries | DuckDB |\n| Large-scale embeddings | S3 + PostgreSQL hybrid |\n\n### Optimization Tips\n\n1. **Index Management**: Regular index rebuilding for maintaining query performance\n2. **Batch Operations**: Use batch operations for bulk inserts and updates\n3. **Connection Pooling**: Configure appropriate pool sizes for concurrent access\n4. 
**Object Storage**: Offload large embeddings to object storage while keeping metadata in database\n\n## Installation\n\nInstall database dependencies using extras:\n\n```bash\n# Basic database support\npip install txtai[database]\n\n# With all database backends\npip install txtai[all]\n\n# Individual backends\npip install txtai[duckdb]      # DuckDB analytics\npip install txtai[ann]         # Vector indexes with pgvector\n```\n\n## See Also\n\n- [Embeddings Configuration](embeddings/configuration/database.md) - Detailed configuration guide\n- [Vector Search](../vectors/overview.md) - Combining database with vector search\n- [Workflow Tasks](../workflow/overview.md) - Database tasks in workflows\n- [Cloud Storage](../cloud/hub.md) - HuggingFace Hub integration\n\n---\n\n<a id='graph-networks'></a>\n\n## Graph Networks\n\n### 相关页面\n\n相关主题：[Embeddings Database](#embeddings-core), [Database Integration](#database-integration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python/txtai/graph/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/graph/__init__.py)\n- [src/python/txtai/graph/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/graph/base.py)\n- [src/python/txtai/graph/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/graph/factory.py)\n- [src/python/txtai/graph/networkx.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/graph/networkx.py)\n- [docs/embeddings/configuration/graph.md](https://github.com/neuml/txtai/blob/main/docs/embeddings/configuration/graph.md)\n</details>\n\n# Graph Networks\n\nGraph Networks in txtai provide a powerful mechanism for building and traversing graph-based knowledge structures. This module enables semantic relationship mapping, topic modeling, and community detection capabilities within the broader txtai ecosystem.\n\n## Overview\n\nGraph Networks are a core component of txtai's architecture that combines vector search capabilities with graph-based relationship analysis. The graph module creates network representations that can be queried, traversed, and analyzed for patterns and communities.\n\n```mermaid\ngraph TB\n    subgraph \"txtai Graph Module\"\n        GB[Graph Base Class]\n        GX[NetworkX Backend]\n        GF[Graph Factory]\n        GT[Topics Module]\n    end\n    \n    GB --> GX\n    GB --> GT\n    GF --> GB\n    \n    GX --> CD[Community Detection]\n    GX --> NT[Node Traversal]\n    GX --> RL[Relationship Links]\n```\n\n资料来源：[src/python/txtai/graph/base.py:1-20]()\n\n## Architecture\n\n### Base Graph Class\n\nThe `Graph` class serves as the foundational abstraction for all graph implementations. It provides a consistent interface for graph operations regardless of the underlying backend.\n\n```python\nclass Graph:\n    \"\"\"\n    Base class for Graph instances. This class builds graph networks. 
Supports topic modeling\n    and relationship traversal.\n    \"\"\"\n```\n\n**Key Attributes:**\n\n| Attribute | Type | Description |\n|-----------|------|-------------|\n| `config` | dict | Graph configuration settings |\n| `backend` | object | Graph backend implementation |\n| `categories` | list | Topic modeling categories |\n| `topics` | object | Topics instance for topic analysis |\n| `text` | str | Column name for text data (default: \"text\") |\n| `object` | str | Column name for object data (default: \"object\") |\n| `copyattributes` | bool | Flag to copy all attributes when True |\n| `relationships` | str | Column name for manually-provided edges |\n| `relations` | dict | Stored relationships dictionary |\n\n资料来源：[src/python/txtai/graph/base.py:26-43]()\n\n### NetworkX Backend\n\nThe NetworkX backend (`NetworkX` class) provides the actual graph implementation using the NetworkX library. This backend supports:\n\n- **Community Detection**: Uses Louvain algorithm for partition detection\n- **Graph Operations**: Node/edge addition, removal, and querying\n- **Serialization**: Pickle-based serialization for persistence\n- **Distance Computation**: Weight-based distance calculations between nodes\n\n资料来源：[src/python/txtai/graph/networkx.py:1-30]()\n\n## Core Methods\n\n### Graph Creation and Management\n\n```python\ndef create(self):\n    \"\"\"Creates the graph network.\"\"\"\n    raise NotImplementedError\n\ndef count(self):\n    \"\"\"Returns the total number of nodes in graph.\"\"\"\n    raise NotImplementedError\n```\n\n| Method | Description | Returns |\n|--------|-------------|---------|\n| `create()` | Initializes and returns a new graph network | Graph instance |\n| `count()` | Returns total number of nodes | int |\n| `scan(attribute, data)` | Iterates over nodes matching criteria | Node iterator or tuples |\n| `node(nodeid)` | Retrieves node attributes | dict or None |\n\n资料来源：[src/python/txtai/graph/base.py:45-65]()\n\n### Node Operations\n\n| Method | Description |\n|--------|-------------|\n| `addnode(node, **attrs)` | Adds a single node with attributes |\n| `addnodes(nodes)` | Adds multiple nodes from iterable |\n| `removenode(node)` | Removes a node from the graph |\n| `hasnode(node)` | Checks if node exists |\n\n资料来源：[src/python/txtai/graph/networkx.py:55-75]()\n\n## Community Detection\n\nThe NetworkX backend implements advanced community detection using the Louvain algorithm:\n\n```python\ndef communities(self, config):\n    \"\"\"\n    Runs community detection on graph.\n\n    Args:\n        config: topic configuration\n\n    Returns:\n        list of [ids] per community\n    \"\"\"\n    level = config.get(\"level\", \"best\")\n    results = list(louvain_partitions(self.backend, weight=\"weight\", resolution=config.get(\"resolution\", 100), seed=0))\n    return results[0] if level == \"first\" else results[-1]\n```\n\n**Configuration Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `level` | str | \"best\" | Partition level to use (\"first\" or \"best\") |\n| `resolution` | int | 100 | Louvain resolution parameter |\n\n资料来源：[src/python/txtai/graph/networkx.py:35-55]()\n\n## Distance Computation\n\nThe graph module calculates distances between nodes based on edge weights:\n\n```python\ndef distance(self, source, target, attrs):\n    \"\"\"\n    Computes distance between source and target nodes using weight.\n\n    Returns:\n        distance between source and target\n    \"\"\"\n    distance = max(1.0 - 
attrs[\"weight\"], 0.0)\n    return distance if distance >= 0.15 else 1.00\n```\n\n**Distance Algorithm:**\n- Distance = `1 - weight` (inverted weight)\n- Minimum threshold: 0.15 (edges below this are considered near-duplicates)\n- Maximum distance cap: 1.00\n\n资料来源：[src/python/txtai/graph/networkx.py:60-75]()\n\n## Serialization\n\n### TAR File Loading (Legacy Format)\n\nThe NetworkX backend supports loading graphs from legacy TAR archives:\n\n```python\ndef loadtar(self, path):\n    \"\"\"\n    Loads a graph from the legacy TAR file.\n\n    Args:\n        path: path to graph\n    \"\"\"\n    serializer = SerializeFactory.create(\"pickle\")\n    \n    with TemporaryDirectory() as directory:\n        archive = ArchiveFactory.create(directory)\n        archive.load(path, \"tar\")\n        self.backend = serializer.load(f\"{directory}/graph\")\n```\n\n**Serialization Features:**\n- Pickle-based serialization for backwards compatibility\n- TAR archive extraction to temporary directory\n- Optional category loading from separate file\n\n资料来源：[src/python/txtai/graph/networkx.py:80-100]()\n\n## Configuration\n\nGraph networks are configured through a dictionary passed to the constructor:\n\n```yaml\ngraph:\n    columns:\n        text: \"content\"        # Column containing text data\n        object: \"data\"        # Column containing object data\n        relationships: \"edges\" # Column for manual edge definitions\n    copyattributes: false      # Copy all attributes when True\n```\n\n**Column Configuration Options:**\n\n| Key | Default | Description |\n|-----|---------|-------------|\n| `text` | \"text\" | Name of the text column |\n| `object` | \"object\" | Name of the object column |\n| `relationships` | \"relationships\" | Name of the relationships column |\n\n资料来源：[docs/embeddings/configuration/graph.md]() and [src/python/txtai/graph/base.py:38-40]()\n\n## Topic Modeling Integration\n\nThe Graph module integrates with txtai's topic modeling capabilities:\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Graph Indexing]\n    B --> C[Topic Modeling]\n    C --> D[Community Detection]\n    D --> E[Category Assignment]\n    \n    C --> F[Topics Module]\n    F --> G[Topic Analysis]\n```\n\n**Topics Integration:**\n- `self.categories`: Stores discovered topic categories\n- `self.topics`: Topics instance for analysis operations\n- Community detection feeds into topic assignment\n\n资料来源：[src/python/txtai/graph/base.py:30-32]()\n\n## Error Handling\n\nThe Graph module requires the NetworkX library for operation:\n\n```python\nif not NETWORKX:\n    raise ImportError('NetworkX is not available - install \"graph\" extra to enable')\n```\n\nTo enable Graph Networks functionality, install the graph extra:\n\n```bash\npip install txtai[graph]\n```\n\n资料来源：[src/python/txtai/graph/networkx.py:23-25]()\n\n## Usage Example\n\n```python\nfrom txtai import Graph\n\n# Create graph with default configuration\ngraph = Graph({})\n\n# Create the graph network\ngraph.create()\n\n# Add nodes with attributes\ngraph.addnode(1, text=\"example node\", weight=0.5)\ngraph.addnode(2, text=\"another node\", weight=0.7)\n\n# Query nodes\nnodes = list(graph.scan())\ncount = graph.count()\n\n# Get community detection results\nconfig = {\"level\": \"best\", \"resolution\": 100}\ncommunities = graph.communities(config)\n```\n\n## Related Components\n\n| Component | Description |\n|-----------|-------------|\n| `Embeddings` | Vector index that may utilize graphs for relationship mapping |\n| `Topics` | Topic modeling module 
integrated with graph analysis |\n| `Archive` | Serialization system for graph persistence |\n| `Workflow` | Orchestration that can incorporate graph operations |\n\n---\n\n<a id='scoring-algorithms'></a>\n\n## Scoring and Retrieval Algorithms\n\n### 相关页面\n\n相关主题：[Embeddings Database](#embeddings-core), [Database Integration](#database-integration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n- [src/python/txtai/embeddings/index/documents.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/index/documents.py)\n- [src/python/txtai/api/cluster.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/api/cluster.py)\n- [src/python/txtai/pipeline/llm/llm.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n- [src/python/txtai/embeddings/database/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/database/__init__.py)\n</details>\n\n# Scoring and Retrieval Algorithms\n\n## Overview\n\ntxtai provides a sophisticated scoring and retrieval system that powers its semantic search capabilities. At its core, the system combines vector-based similarity search with traditional BM25 sparse retrieval, enabling hybrid search approaches that leverage both semantic understanding and exact term matching.\n\nThe scoring module is an integral part of the embeddings database architecture, which unifies vector indexes, graph networks, and relational databases into a single powerful knowledge source for LLM applications.\n\n资料来源：[README.md]()\n\n## Architecture\n\nThe scoring and retrieval system follows a modular factory pattern that allows different scoring algorithms to be plugged in based on configuration. The architecture is designed to support both standalone scoring operations and integrated retrieval within the embeddings database.\n\n```mermaid\ngraph TD\n    A[Query Input] --> B[Query Processing]\n    B --> C{Scoring Algorithm}\n    C --> D[BM25 Scoring]\n    C --> E[Vector Similarity]\n    C --> F[Hybrid Scoring]\n    D --> G[Results Aggregation]\n    E --> G\n    F --> G\n    G --> H[Ranked Results]\n```\n\n资料来源：[setup.py](), [src/python/txtai/scoring/factory.py]()\n\n## Scoring Module Structure\n\nThe scoring subsystem is defined in the `txtai.scoring` package and includes base classes and concrete implementations for different retrieval algorithms.\n\n### Key Components\n\n| Component | Purpose | Location |\n|-----------|---------|----------|\n| `Base` | Abstract base class defining scoring interface | `scoring/base.py` |\n| `BM25` | Traditional sparse retrieval algorithm | `scoring/bm25.py` |\n| `Factory` | Factory for creating scoring instances | `scoring/factory.py` |\n| Package `__init__` | Module exports and configuration | `scoring/__init__.py` |\n\n资料来源：[setup.py:50-51]()\n\n## BM25 Retrieval Algorithm\n\nBM25 (Best Matching 25) is a probabilistic ranking function used for information retrieval. It extends the binary independence model to incorporate term frequency and document length normalization.\n\n### Configuration Options\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `k1` | float | 1.5 | Term frequency saturation parameter |\n| `b` | float | 0.75 | Length normalization factor |\n| `avgdl` | float | auto | Average document length |\n\n### How BM25 Works\n\n1. 
**Term Frequency (TF)**: Measures how often a term appears in a document, with saturation to prevent over-weighting\n2. **Inverse Document Frequency (IDF)**: Weights terms by their rarity across the corpus\n3. **Document Length Normalization**: Adjusts scores based on document length relative to average\n\n资料来源：[src/python/txtai/scoring/bm25.py](), [src/python/txtai/scoring/base.py]()\n\n## Integration with Embeddings Database\n\nThe scoring algorithms are tightly integrated with the embeddings database system, which handles document indexing, storage, and retrieval.\n\n```mermaid\ngraph LR\n    A[Documents] --> B[Indexing Pipeline]\n    B --> C[Vector Index]\n    B --> D[BM25 Index]\n    B --> E[Document Store]\n    F[Query] --> G[Hybrid Search]\n    C --> G\n    D --> G\n    E --> G\n    G --> H[Scored Results]\n```\n\n### Document Storage\n\nDocuments are managed through a streaming serializer that writes to temporary files, enabling efficient handling of large document collections:\n\n```python\n# Add batch of documents\nself.serializer.savestream(documents, self.documents)\nself.batch += 1\nself.size += len(documents)\n```\n\n资料来源：[src/python/txtai/embeddings/index/documents.py:18-23]()\n\n## Search API\n\nThe scoring system exposes search capabilities through both local and cluster APIs, supporting single queries, batch searches, and hybrid scoring with configurable weights.\n\n### Search Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `query` | str | Search query string |\n| `limit` | int | Maximum number of results (default: 10) |\n| `weights` | list | Hybrid score weights for combining methods |\n| `index` | str | Specific index to search |\n| `parameters` | dict | Named parameters for advanced queries |\n| `graph` | bool | Include graph network results |\n\n### Query Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant Client\n    participant API\n    participant Scorer\n    participant Index\n    \n    Client->>API: search(query, limit)\n    API->>Scorer: execute(query)\n    Scorer->>Index: score(query)\n    Index-->>Scorer: raw scores\n    Scorer-->>API: scored results\n    API->>API: aggregate(query, results)\n    API->>API: sort and limit\n    API-->>Client: ranked results\n```\n\n资料来源：[src/python/txtai/api/cluster.py:85-105]()\n\n### Batch Search\n\nFor processing multiple queries efficiently, the API supports batch search operations:\n\n```python\ndef batchsearch(self, queries, limit=None, weights=None, index=None, parameters=None, graph=False):\n    \"\"\"\n    Finds documents most similar to the input queries.\n    \"\"\"\n```\n\n资料来源：[src/python/txtai/api/cluster.py:107-127]()\n\n## Hybrid Scoring\n\ntxtai supports hybrid scoring that combines multiple retrieval methods. 
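A minimal sketch is shown below; it assumes the `hybrid` flag and the `weights` search argument described in this section, and uses equal weights so the exact weight ordering (covered under Weight Configuration) does not matter.\n\n```python\nfrom txtai import Embeddings\n\n# Hybrid index sketch: dense vector index plus a sparse BM25 index\nembeddings = Embeddings(hybrid=True, content=True)\nembeddings.index(["Correct", "Not what we hoped"])\n\n# weights balances the two scoring methods; see the Weight Configuration table for the convention\nresults = embeddings.search("positive", limit=1, weights=[0.5, 0.5])\nprint(results)\n```\n\n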
This is particularly valuable when you want to leverage both semantic similarity (vector-based) and exact term matching (BM25).\n\n### Weight Configuration\n\nThe `weights` parameter controls how different scoring methods are combined:\n\n| Weights | Effect |\n|---------|--------|\n| `[0.0, 1.0]` | Pure vector search |\n| `[1.0, 0.0]` | Pure BM25 search |\n| `[0.5, 0.5]` | Equal contribution |\n| `[0.7, 0.3]` | Vector-biased hybrid |\n\n### Result Aggregation\n\nResults from multiple scoring methods are combined using weighted aggregation and sorted by the combined score:\n\n```python\n# Combine aggregate functions and sort\nresults = self.aggregate(query, results)\n# Limit results\nreturn results[: (limit if limit else 10)]\n```\n\n资料来源：[src/python/txtai/api/cluster.py:97-99]()\n\n## Configuration Reference\n\n### Scoring Extra Dependencies\n\nTo use the scoring module, install the `scoring` extra:\n\n```bash\npip install txtai[scoring]\n```\n\nRequired dependencies from `setup.py`:\n\n| Package | Version | Purpose |\n|---------|---------|---------|\n| sqlalchemy | >=2.0.20 | Database operations |\n\n资料来源：[setup.py:50-51]()\n\n### Complete Installation Options\n\nFor full functionality including scoring:\n\n```bash\npip install txtai[all]\n```\n\nThis includes:\n- `ann` - Approximate nearest neighbor indexes\n- `vectors` - Vector similarity components\n- `scoring` - Retrieval algorithms\n- `api` - REST API service\n\n资料来源：[setup.py:66-81]()\n\n## Usage Examples\n\n### Basic Embeddings Search\n\n```python\nimport txtai\n\nembeddings = txtai.Embeddings()\nembeddings.index([\"Correct\", \"Not what we hoped\"])\nembeddings.search(\"positive\", 1)\n# Returns: [(0, 0.29862046241760254)]\n```\n\n### Configuration via YAML\n\n```yaml\n# app.yml\nembeddings:\n    path: sentence-transformers/all-MiniLM-L6-v2\n```\n\n资料来源：[README.md]()\n\n## Related Components\n\n| Component | Relationship |\n|-----------|--------------|\n| **Embeddings Database** | Primary consumer of scoring algorithms |\n| **Vector Indexes** | Works alongside scoring for hybrid search |\n| **API Service** | Exposes scoring via REST endpoints |\n| **Workflow Engine** | Uses scoring for document routing |\n\nThe scoring and retrieval algorithms form the computational backbone of txtai's search capabilities, enabling both high-precision keyword matching through BM25 and semantic understanding through vector similarity. This combination allows applications to benefit from the strengths of both traditional information retrieval and modern neural embeddings.\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：neuml/txtai\n\n摘要：发现 22 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：Add `txtai_minimal` package。\n\n## 1. 安装坑 · 来源证据：Add `txtai_minimal` package\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add `txtai_minimal` package\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_4e4050b59fdf4d51ac21e82d248e589e | https://github.com/neuml/txtai/issues/1090 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 安装坑 · 来源证据：Add custom Captions implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Captions implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ff6fe05065564e85a549d7656b85a852 | https://github.com/neuml/txtai/issues/1084 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 3. 
安装坑 · 来源证据：Add custom Questions pipeline implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Questions pipeline implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fa68ea29089d497bb8278c3c360c363c | https://github.com/neuml/txtai/issues/1087 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 4. 安装坑 · 来源证据：Add custom Sequences pipeline implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Sequences pipeline implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_0099afb385ef498d8020521bbf3e9e64 | https://github.com/neuml/txtai/issues/1086 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 5. 安装坑 · 来源证据：Add custom Summary pipeline implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Summary pipeline implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_9d8f75c31fc6450d8659b310f25d0bf4 | https://github.com/neuml/txtai/issues/1085 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 6. 安装坑 · 来源证据：Add minimal Docker build script\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add minimal Docker build script\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_38e1e56a991b412bbc749d2c0b687d54 | https://github.com/neuml/txtai/issues/1092 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 7. 安装坑 · 来源证据：Make `transformers` package optional for minimal install\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Make `transformers` package optional for minimal install\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d755829200de4d93b2f4d2f71b36aaf1 | https://github.com/neuml/txtai/issues/1091 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 8. 安装坑 · 来源证据：Reduce dependencies to just `numpy` for minimal install\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Reduce dependencies to just `numpy` for minimal install\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_395b735968ab4ec8ad849070d43cd18f | https://github.com/neuml/txtai/issues/1093 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 9. 安装坑 · 来源证据：Support minimal install for edge devices\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support minimal install for edge devices\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_3ec8e039baf4412888ed3ac543ea1361 | https://github.com/neuml/txtai/issues/1089 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：Various training pipeline fixes for v5\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Various training pipeline fixes for v5\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b47325d7496248a993d980c3d59f3269 | https://github.com/neuml/txtai/issues/1088 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 11. 
安装坑 · 来源证据：Zero dependency minimal install\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Zero dependency minimal install\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_7839a465e7e64f94acfff001c1da978c | https://github.com/neuml/txtai/issues/1094 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 12. 安装坑 · 来源证据：v9.9.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v9.9.0\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8b842ce68a1c4c1ab0a40a3ed1fd213c | https://github.com/neuml/txtai/releases/tag/v9.9.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 13. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:286301447 | https://github.com/neuml/txtai | README/documentation is current enough for a first validation pass.\n\n## 14. 运行坑 · 来源证据：v9.7.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：v9.7.0\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e3f61e41f6e84b9b9ee33d5e18dbff63 | https://github.com/neuml/txtai/releases/tag/v9.7.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:286301447 | https://github.com/neuml/txtai | last_activity_observed missing\n\n## 16. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:286301447 | https://github.com/neuml/txtai | no_demo; severity=medium\n\n## 17. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:286301447 | https://github.com/neuml/txtai | no_demo; severity=medium\n\n## 18. 安全/权限坑 · 来源证据：Add LiteRT-LM LLM\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Add LiteRT-LM LLM\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8731971fd8404df082f75ecf565fef8b | https://github.com/neuml/txtai/issues/1095 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 19. 安全/权限坑 · 来源证据：v9.6.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v9.6.0\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fe429d6152304930bda149c4fbd35318 | https://github.com/neuml/txtai/releases/tag/v9.6.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 20. 
安全/权限坑 · 来源证据：v9.8.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v9.8.0\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8bebd6701b454baaae22d73522ce9704 | https://github.com/neuml/txtai/releases/tag/v9.8.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 21. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:286301447 | https://github.com/neuml/txtai | issue_or_pr_quality=unknown\n\n## 22. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:286301447 | https://github.com/neuml/txtai | release_recency=unknown\n\n<!-- canonical_name: neuml/txtai; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "txtai",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "github_repo:286301447",
          "kind": "repo",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/neuml/txtai"
        },
        {
          "evidence_id": "art_aa63afbfafb94c67827adad079c717f2",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/neuml/txtai#readme"
        }
      ],
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "txtai 说明书",
      "toc": [
        "https://github.com/neuml/txtai 项目说明书",
        "目录",
        "Introduction to txtai",
        "Overview",
        "Architecture",
        "Key Features",
        "Installation",
        "Default installation includes",
        "Doramagic 踩坑日志"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "57de5b3a94e888219d543e84fe4b0abdaed94c28",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "pyproject.toml",
      "README.md",
      "docs/further.md",
      "docs/why.md",
      "docs/index.md",
      "docs/usecases.md",
      "docs/install.md",
      "docs/poweredby.md",
      "docs/examples.md",
      "docs/cloud.md",
      "docs/faq.md",
      "docs/models.md",
      "docs/observability.md",
      "docs/api/customization.md",
      "docs/api/security.md",
      "docs/api/index.md",
      "docs/api/configuration.md",
      "docs/api/methods.md",
      "docs/api/mcp.md",
      "docs/api/openai.md",
      "docs/api/cluster.md",
      "docs/embeddings/index.md",
      "docs/embeddings/indexing.md",
      "docs/embeddings/format.md",
      "docs/embeddings/methods.md",
      "docs/embeddings/query.md",
      "docs/workflow/schedule.md",
      "docs/workflow/index.md",
      "docs/pipeline/index.md",
      "docs/agent/index.md",
      "docs/agent/configuration.md",
      "docs/agent/methods.md",
      "docs/embeddings/configuration/scoring.md",
      "docs/embeddings/configuration/index.md",
      "docs/embeddings/configuration/ann.md",
      "docs/embeddings/configuration/database.md",
      "docs/embeddings/configuration/cloud.md",
      "docs/embeddings/configuration/graph.md",
      "docs/embeddings/configuration/vectors.md",
      "docs/embeddings/configuration/general.md"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# txtai - Doramagic AI Context Pack\n\n> 定位：安装前体验与判断资产。它帮助宿主 AI 有一个好的开始，但不代表已经安装、执行或验证目标项目。\n\n## 充分原则\n\n- **充分原则，不是压缩原则**：AI Context Pack 应该充分到让宿主 AI 在开工前理解项目价值、能力边界、使用入口、风险和证据来源；它可以分层组织，但不以最短摘要为目标。\n- **压缩策略**：只压缩噪声和重复内容，不压缩会影响判断和开工质量的上下文。\n\n## 给宿主 AI 的使用方式\n\n你正在读取 Doramagic 为 txtai 编译的 AI Context Pack。请把它当作开工前上下文：帮助用户理解适合谁、能做什么、如何开始、哪些必须安装后验证、风险在哪里。不要声称你已经安装、运行或执行了目标项目。\n\n## Claim 消费规则\n\n- **事实来源**：Repo Evidence + Claim/Evidence Graph；Human Wiki 只提供显著性、术语和叙事结构。\n- **事实最低状态**：`supported`\n- `supported`：可以作为项目事实使用，但回答中必须引用 claim_id 和证据路径。\n- `weak`：只能作为低置信度线索，必须要求用户继续核实。\n- `inferred`：只能用于风险提示或待确认问题，不能包装成项目事实。\n- `unverified`：不得作为事实使用，应明确说证据不足。\n- `contradicted`：必须展示冲突来源，不得替用户强行选择一个版本。\n\n## 它最适合谁\n\n- **AI 研究者或研究型 Agent 构建者**：README 明确围绕研究、实验或论文工作流展开。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- **正在使用 Claude/Codex/Cursor/Gemini 等宿主 AI 的开发者**：README 或插件配置提到多个宿主 AI。 证据：`README.md` Claim：`clm_0003` supported 0.86\n\n## 它能做什么\n\n- **命令行启动或安装流程**（需要安装后验证）：项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 证据：`README.md`, `docs/install.md` Claim：`clm_0001` supported 0.86\n\n## 怎么开始\n\n- `curl -X GET \"http://localhost:8000/search?query=positive\"` 证据：`README.md` Claim：`clm_0004` supported 0.86\n- `pip install txtai` 证据：`README.md` Claim：`clm_0005` supported 0.86, `clm_0006` supported 0.86, `clm_0007` supported 0.86, `clm_0008` supported 0.86 等\n- `pip install txtai[all]` 证据：`docs/install.md` Claim：`clm_0006` supported 0.86\n- `pip install txtai[ann]` 证据：`docs/install.md` Claim：`clm_0007` supported 0.86\n- `pip install txtai[api]` 证据：`docs/install.md` Claim：`clm_0008` supported 0.86\n- `pip install txtai[cloud]` 证据：`docs/install.md` Claim：`clm_0009` supported 0.86\n- `pip install txtai[console]` 证据：`docs/install.md` Claim：`clm_0010` supported 0.86\n- `pip install txtai[database]` 证据：`docs/install.md` Claim：`clm_0011` supported 0.86\n- `pip install txtai[graph]` 证据：`docs/install.md` Claim：`clm_0012` supported 0.86\n- `pip install txtai[model]` 证据：`docs/install.md` Claim：`clm_0013` supported 0.86\n\n## 继续前判断卡\n\n- **当前建议**：先做角色匹配试用\n- **为什么**：这个项目更像角色库，核心风险是选错角色或把角色文案当执行能力；先用 Prompt Preview 试角色匹配，再决定是否沙盒导入。\n\n### 30 秒判断\n\n- **现在怎么做**：先做角色匹配试用\n- **最小安全下一步**：先用 Prompt Preview 试角色匹配；满意后再隔离导入\n- **先别相信**：角色质量和任务匹配不能直接相信。\n- **继续会触碰**：角色选择偏差、命令执行、本地环境或项目文件\n\n### 现在可以相信\n\n- **适合人群线索：AI 研究者或研究型 Agent 构建者**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- **适合人群线索：正在使用 Claude/Codex/Cursor/Gemini 等宿主 AI 的开发者**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0003` supported 0.86\n- **能力存在：命令行启动或安装流程**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`README.md`, `docs/install.md` Claim：`clm_0001` supported 0.86\n- **存在 Quick Start / 安装命令线索**（supported）：可以相信项目文档出现过启动或安装入口；不要因此直接在主力环境运行。 证据：`README.md` Claim：`clm_0004` supported 0.86\n\n### 现在还不能相信\n\n- **角色质量和任务匹配不能直接相信。**（unverified）：角色库证明有很多角色，不证明每个角色都适合你的具体任务，也不证明角色能产生高质量结果。\n- **不能把角色文案当成真实执行能力。**（unverified）：安装前只能判断角色描述和任务画像是否匹配，不能证明它能在宿主 AI 里完成任务。\n- **真实输出质量不能在安装前相信。**（unverified）：Prompt Preview 只能展示引导方式，不能证明真实项目中的结果质量。\n- **宿主 AI 版本兼容性不能在安装前相信。**（unverified）：Claude、Cursor、Codex、Gemini 等宿主加载规则和版本差异必须在真实环境验证。\n- **不会污染现有宿主 AI 行为，不能直接相信。**（inferred）：Skill、plugin、AGENTS/CLAUDE/GEMINI 指令可能改变宿主 AI 的默认行为。\n- **可安全回滚不能默认相信。**（unverified）：除非项目明确提供卸载和恢复说明，否则必须先在隔离环境验证。\n- **真实安装后是否与用户当前宿主 AI 版本兼容？**（unverified）：兼容性只能通过实际宿主环境验证。\n- **项目输出质量是否满足用户具体任务？**（unverified）：安装前预览只能展示流程和边界，不能替代真实评测。\n\n### 继续会触碰什么\n\n- **角色选择偏差**：用户对任务应该由哪个专家角色处理的判断。 原因：选错角色会让 AI 
从错误专业视角回答，浪费时间或误导决策。\n- **命令执行**：包管理器、网络下载、本地插件目录、项目配置或用户主目录。 原因：运行第一条命令就可能产生环境改动；必须先判断是否值得跑。 证据：`README.md`, `docs/install.md`\n- **本地环境或项目文件**：安装结果、插件缓存、项目配置或本地依赖目录。 原因：安装前无法证明写入范围和回滚方式，需要隔离验证。 证据：`README.md`, `docs/install.md`\n- **宿主 AI 上下文**：AI Context Pack、Prompt Preview、Skill 路由、风险规则和项目事实。 原因：导入上下文会影响宿主 AI 后续判断，必须避免把未验证项包装成事实。\n\n### 最小安全下一步\n\n- **先跑 Prompt Preview**：先用交互式试用验证任务画像和角色匹配，不要先导入整套角色库。（适用：任何项目都适用，尤其是输出质量未知时。）\n- **只在隔离目录或测试账号试装**：避免安装命令污染主力宿主 AI、真实项目或用户主目录。（适用：存在命令执行、插件配置或本地写入线索时。）\n- **安装后只验证一个最小任务**：先验证加载、兼容、输出质量和回滚，再决定是否深用。（适用：准备从试用进入真实工作流时。）\n\n### 退出方式\n\n- **保留安装前状态**：记录原始宿主配置和项目状态，后续才能判断是否可恢复。\n- **保留原始角色选择记录**：如果输出偏题，可以回到任务画像阶段重新选择角色，而不是继续沿着错误角色推进。\n- **记录安装命令和写入路径**：没有明确卸载说明时，至少要知道哪些目录或配置需要手动清理。\n- **如果没有回滚路径，不进入主力环境**：不可回滚是继续前阻断项，不应靠信任或运气继续。\n\n## 哪些只能预览\n\n- 解释项目适合谁和能做什么\n- 基于项目文档演示典型对话流程\n- 帮助用户判断是否值得安装或继续研究\n\n## 哪些必须安装后验证\n\n- 真实安装 Skill、插件或 CLI\n- 执行脚本、修改本地文件或访问外部服务\n- 验证真实输出质量、性能和兼容性\n\n## 边界与风险判断卡\n\n- **把安装前预览误认为真实运行**：用户可能高估项目已经完成的配置、权限和兼容性验证。 处理方式：明确区分 prompt_preview_can_do 与 runtime_required。 Claim：`clm_0023` inferred 0.45\n- **命令执行会修改本地环境**：安装命令可能写入用户主目录、宿主插件目录或项目配置。 处理方式：先在隔离环境或测试账号中运行。 证据：`README.md`, `docs/install.md` Claim：`clm_0024` supported 0.86\n- **待确认**：真实安装后是否与用户当前宿主 AI 版本兼容？。原因：兼容性只能通过实际宿主环境验证。\n- **待确认**：项目输出质量是否满足用户具体任务？。原因：安装前预览只能展示流程和边界，不能替代真实评测。\n- **待确认**：安装命令是否需要网络、权限或全局写入？。原因：这影响企业环境和个人环境的安装风险。\n\n## 开工前工作上下文\n\n### 加载顺序\n\n- 先读取 how_to_use.host_ai_instruction，建立安装前判断资产的边界。\n- 读取 claim_graph_summary，确认事实来自 Claim/Evidence Graph，而不是 Human Wiki 叙事。\n- 再读取 intended_users、capabilities 和 quick_start_candidates，判断用户是否匹配。\n- 需要执行具体任务时，优先查 role_skill_index，再查 evidence_index。\n- 遇到真实安装、文件修改、网络访问、性能或兼容性问题时，转入 risk_card 和 boundaries.runtime_required。\n\n### 任务路由\n\n- **命令行启动或安装流程**：先说明这是安装后验证能力，再给出安装前检查清单。 边界：必须真实安装或运行后验证。 证据：`README.md`, `docs/install.md` Claim：`clm_0001` supported 0.86\n\n### 上下文规模\n\n- 文件总数：567\n- 重要文件覆盖：40/567\n- 证据索引条目：80\n- 角色 / Skill 条目：76\n\n### 证据不足时的处理\n\n- **missing_evidence**：说明证据不足，要求用户提供目标文件、README 段落或安装后验证记录；不要补全事实。\n- **out_of_scope_request**：说明该任务超出当前 AI Context Pack 证据范围，并建议用户先查看 Human Manual 或真实安装后验证。\n- **runtime_request**：给出安装前检查清单和命令来源，但不要替用户执行命令或声称已执行。\n- **source_conflict**：同时展示冲突来源，标记为待核实，不要强行选择一个版本。\n\n## Prompt Recipes\n\n### 适配判断\n\n- 目标：判断这个项目是否适合用户当前任务。\n- 预期输出：适配结论、关键理由、证据引用、安装前可预览内容、必须安装后验证内容、下一步建议。\n\n```text\n请基于 txtai 的 AI Context Pack，先问我 3 个必要问题，然后判断它是否适合我的任务。回答必须包含：适合谁、能做什么、不能做什么、是否值得安装、证据来自哪里。所有项目事实必须引用 evidence_refs、source_paths 或 claim_id。\n```\n\n### 安装前体验\n\n- 目标：让用户在安装前感受核心工作流，同时避免把预览包装成真实能力或营销承诺。\n- 预期输出：一段带边界标签的体验剧本、安装后验证清单和谨慎建议；不含真实运行承诺或强营销表述。\n\n```text\n请把 txtai 当作安装前体验资产，而不是已安装工具或真实运行环境。\n\n请严格输出四段：\n1. 先问我 3 个必要问题。\n2. 给出一段“体验剧本”：用 [安装前可预览]、[必须安装后验证]、[证据不足] 三种标签展示它可能如何引导工作流。\n3. 给出安装后验证清单：列出哪些能力只有真实安装、真实宿主加载、真实项目运行后才能确认。\n4. 
给出谨慎建议：只能说“值得继续研究/试装”“先补充信息后再判断”或“不建议继续”，不得替项目背书。\n\n硬性边界：\n- 不要声称已经安装、运行、执行测试、修改文件或产生真实结果。\n- 不要写“自动适配”“确保通过”“完美适配”“强烈建议安装”等承诺性表达。\n- 如果描述安装后的工作方式，必须使用“如果安装成功且宿主正确加载 Skill，它可能会……”这种条件句。\n- 体验剧本只能写成“示例台词/假设流程”：使用“可能会询问/可能会建议/可能会展示”，不要写“已写入、已生成、已通过、正在运行、正在生成”。\n- Prompt Preview 不负责给安装命令；如用户准备试装，只能提示先阅读 Quick Start 和 Risk Card，并在隔离环境验证。\n- 所有项目事实必须来自 supported claim、evidence_refs 或 source_paths；inferred/unverified 只能作风险或待确认项。\n\n```\n\n### 角色 / Skill 选择\n\n- 目标：从项目里的角色或 Skill 中挑选最匹配的资产。\n- 预期输出：候选角色或 Skill 列表，每项包含适用场景、证据路径、风险边界和是否需要安装后验证。\n\n```text\n请读取 role_skill_index，根据我的目标任务推荐 3-5 个最相关的角色或 Skill。每个推荐都要说明适用场景、可能输出、风险边界和 evidence_refs。\n```\n\n### 风险预检\n\n- 目标：安装或引入前识别环境、权限、规则冲突和质量风险。\n- 预期输出：环境、权限、依赖、许可、宿主冲突、质量风险和未知项的检查清单。\n\n```text\n请基于 risk_card、boundaries 和 quick_start_candidates，给我一份安装前风险预检清单。不要替我执行命令，只说明我应该检查什么、为什么检查、失败会有什么影响。\n```\n\n### 宿主 AI 开工指令\n\n- 目标：把项目上下文转成一次对话开始前的宿主 AI 指令。\n- 预期输出：一段边界明确、证据引用明确、适合复制给宿主 AI 的开工前指令。\n\n```text\n请基于 txtai 的 AI Context Pack，生成一段我可以粘贴给宿主 AI 的开工前指令。这段指令必须遵守 not_runtime=true，不能声称项目已经安装、运行或产生真实结果。\n```\n\n\n## 角色 / Skill 索引\n\n- 共索引 76 个角色 / Skill / 项目文档条目。\n\n- **Installation**（project_doc）：! install images/install.png only-light ! install images/install-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/install.md`\n- **Why txtai?**（project_doc）：txtai is an all-in-one AI framework for semantic search, LLM orchestration and language model workflows. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`README.md`\n- **Cloud**（project_doc）：! cloud images/cloud.png only-light ! cloud images/cloud-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/cloud.md`\n- **Examples**（project_doc）：! examples images/examples.png only-light ! examples images/examples-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/examples.md`\n- **FAQ**（project_doc）：Below is a list of frequently asked questions and common issues encountered. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/faq.md`\n- **Further reading**（project_doc）：! further images/further.png only-light ! further images/further-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/further.md`\n- **Index**（project_doc）：txtai is an all-in-one AI framework for semantic search, LLM orchestration and language model workflows. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/index.md`\n- **Model guide**（project_doc）：See the table below for the current recommended models. These models all allow commercial use and offer a blend of speed and performance. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/models.md`\n- **Observability**（project_doc）：! agent https://raw.githubusercontent.com/neuml/mlflow-txtai/master/images/agent.png 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/observability.md`\n- **Powered by txtai**（project_doc）：The following applications are powered by txtai. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/poweredby.md`\n- **Use Cases**（project_doc）：The following sections introduce common txtai use cases. A comprehensive set of over 70 example notebooks and applications ../examples are also available. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/usecases.md`\n- **Why txtai?**（project_doc）：! why images/why.png only-light ! why images/why-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/why.md`\n- **Configuration**（project_doc）：An agent takes two main arguments, an LLM and a list of tools. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/agent/configuration.md`\n- **Agent**（project_doc）：An agent automatically creates workflows to answer multi-faceted user requests. 
Agents iteratively prompt and/or interface with tools to step through a process and ultimately come to an answer for a request. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/agent/index.md`\n- **Methods**（project_doc）：::: txtai.agent.base.Agent. init ::: txtai.agent.base.Agent. call 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/agent/methods.md`\n- **Distributed embeddings clusters**（project_doc）：The API supports combining multiple API instances into a single logical embeddings index. An example configuration is shown below. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/api/cluster.md`\n- **Configuration**（project_doc）：Configuration is set through YAML. In most cases, YAML keys map to fields names in Python. The example in the previous section ../ gave a full-featured example covering a wide array of configuration options. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/api/configuration.md`\n- **Customization**（project_doc）：The txtai API has a number of features out of the box that are designed to help get started quickly. API services can also be augmented with custom code and functionality. The two main ways to do this are with extensions and dependencies. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/api/customization.md`\n- **API**（project_doc）：! api ../images/api.png only-light ! api ../images/api-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/api/index.md`\n- **Model Context Protocol**（project_doc）：The Model Context Protocol MCP https://modelcontextprotocol.io/introduction is an open standard that enables developers to build secure, two-way connections between their data sources and AI-powered tools. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/api/mcp.md`\n- **Methods**（project_doc）：::: txtai.api.API options: inherited members: true filters: - \"! del \" - \"!flows\" - \"!function\" - \"!indexes\" - \"!limit\" - \"!pipes\" - \"!read\" - \"!resolve\" - \"!weights\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/api/methods.md`\n- **OpenAI-compatible API**（project_doc）：The API can be configured to serve an OpenAI-compatible API as shown below. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/api/openai.md`\n- **Security**（project_doc）：The default implementation of an API service runs via HTTP and is fully open. If the service is being run as a prototype on an internal network, that may be fine. In most scenarios, the connection should at least be encrypted. Authorization is another built-in feature that requires a valid API token with each request. See below for more. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/api/security.md`\n- **ANN**（project_doc）：Approximate Nearest Neighbor ANN index configuration for storing vector embeddings. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/configuration/ann.md`\n- **Cloud**（project_doc）：The following describes parameters used to sync indexes with cloud storage. Cloud object storage, the Hugging Face Hub https://huggingface.co/models and custom providers are all supported. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/configuration/cloud.md`\n- **Database**（project_doc）：Databases store metadata, text and binary content. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/configuration/database.md`\n- **General**（project_doc）：Enables sparse keyword indexing for this embeddings. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/configuration/general.md`\n- **Graph**（project_doc）：Enable graph storage via the graph parameter. This component requires the graph ../../../install/ graph extras package. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/configuration/graph.md`\n- **Configuration**（project_doc）：The following describes available embeddings configuration. These parameters are set in the Embeddings constructor ../methods txtai.embeddings.base.Embeddings. init via either the config parameter or as keyword arguments. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/configuration/index.md`\n- **Scoring**（project_doc）：Enable scoring support via the scoring parameter. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/configuration/scoring.md`\n- **Vectors**（project_doc）：The following covers available vector model configuration options. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/configuration/vectors.md`\n- **Index format**（project_doc）：! format ../images/format.png only-light ! format ../images/format-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/format.md`\n- **Embeddings**（project_doc）：! embeddings ../images/embeddings.png only-light ! embeddings ../images/embeddings-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/index.md`\n- **Index guide**（project_doc）：! indexing ../images/indexing.png only-light ! indexing ../images/indexing-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/indexing.md`\n- **Methods**（project_doc）：::: txtai.embeddings.Embeddings options: filters: - \"!columns\" - \"!createann\" - \"!createcloud\" - \"!createdatabase\" - \"!creategraph\" - \"!createids\" - \"!createindexes\" - \"!createscoring\" - \"!checkarchive\" - \"!configure\" - \"!defaultallowed\" - \"!defaults\" - \"!initindex\" - \"!loadquery\" - \"!loadvectors\" 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/methods.md`\n- **Query guide**（project_doc）：! query ../images/query.png only-light ! query ../images/query-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/embeddings/query.md`\n- **Audio Mixer**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/audio/audiomixer.md`\n- **Audio Stream**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/audio/audiostream.md`\n- **Microphone**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/audio/microphone.md`\n- **Text To Audio**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/audio/texttoaudio.md`\n- **Text To Speech**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/audio/texttospeech.md`\n- **Transcription**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/audio/transcription.md`\n- **File To HTML**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/data/filetohtml.md`\n- **HTML To Markdown**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/data/htmltomd.md`\n- **Segmentation**（project_doc）：! 
pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/data/segmentation.md`\n- **Tabular**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/data/tabular.md`\n- **Textractor**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/data/textractor.md`\n- **Tokenizer**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/data/tokenizer.md`\n- **Caption**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/image/caption.md`\n- **ImageHash**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/image/imagehash.md`\n- **Objects**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/image/objects.md`\n- **Pipeline**（project_doc）：! pipeline ../images/pipeline.png only-light ! pipeline ../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/index.md`\n- **LLM**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/llm/llm.md`\n- **RAG**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/llm/rag.md`\n- **Entity**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/text/entity.md`\n- **Labels**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/text/labels.md`\n- **Reranker**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/text/reranker.md`\n- **Similarity**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/text/similarity.md`\n- **Summary**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/text/summary.md`\n- **Translation**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/text/translation.md`\n- **HFOnnx**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/train/hfonnx.md`\n- **MLOnnx**（project_doc）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/train/mlonnx.md`\n- **HFTrainer**（project_doc）：! pipeline ../../images/pipeline.png only-light ! 
pipeline ../../images/pipeline-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/pipeline/train/trainer.md`\n- **Workflow**（project_doc）：! workflow ../images/workflow.png only-light ! workflow ../images/workflow-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/index.md`\n- **Schedule**（project_doc）：! schedule ../images/schedule.png only-light ! schedule ../images/schedule-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/schedule.md`\n- **Console Task**（project_doc）：! task ../../images/task.png only-light ! task ../../images/task-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/task/console.md`\n- **Export Task**（project_doc）：! task ../../images/task.png only-light ! task ../../images/task-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/task/export.md`\n- **File Task**（project_doc）：! task ../../images/task.png only-light ! task ../../images/task-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/task/file.md`\n- **Image Task**（project_doc）：! task ../../images/task.png only-light ! task ../../images/task-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/task/image.md`\n- **Tasks**（project_doc）：! task ../../images/task.png only-light ! task ../../images/task-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/task/index.md`\n- **Retrieve Task**（project_doc）：! task ../../images/task.png only-light ! task ../../images/task-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/task/retrieve.md`\n- **Service Task**（project_doc）：! task ../../images/task.png only-light ! task ../../images/task-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/task/service.md`\n- **Storage Task**（project_doc）：! task ../../images/task.png only-light ! task ../../images/task-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/task/storage.md`\n- **Template Task**（project_doc）：! task ../../images/task.png only-light ! task ../../images/task-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/task/template.md`\n- **Url Task**（project_doc）：! task ../../images/task.png only-light ! task ../../images/task-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/task/url.md`\n- **Workflow Task**（project_doc）：! task ../../images/task.png only-light ! task ../../images/task-dark.png only-dark 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/workflow/task/workflow.md`\n\n## 证据索引\n\n- 共索引 80 条证据。\n\n- **Installation**（documentation）：! install images/install.png only-light ! install images/install-dark.png only-dark 证据：`docs/install.md`\n- **Why txtai?**（documentation）：txtai is an all-in-one AI framework for semantic search, LLM orchestration and language model workflows. 证据：`README.md`\n- **License**（source_file）：Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ 证据：`LICENSE`\n- **Cloud**（documentation）：! cloud images/cloud.png only-light ! cloud images/cloud-dark.png only-dark 证据：`docs/cloud.md`\n- **Examples**（documentation）：! examples images/examples.png only-light ! examples images/examples-dark.png only-dark 证据：`docs/examples.md`\n- **FAQ**（documentation）：Below is a list of frequently asked questions and common issues encountered. 证据：`docs/faq.md`\n- **Further reading**（documentation）：! further images/further.png only-light ! further images/further-dark.png only-dark 证据：`docs/further.md`\n- **Index**（documentation）：txtai is an all-in-one AI framework for semantic search, LLM orchestration and language model workflows. 
证据：`docs/index.md`\n- **Model guide**（documentation）：See the table below for the current recommended models. These models all allow commercial use and offer a blend of speed and performance. 证据：`docs/models.md`\n- **Observability**（documentation）：! agent https://raw.githubusercontent.com/neuml/mlflow-txtai/master/images/agent.png 证据：`docs/observability.md`\n- **Powered by txtai**（documentation）：The following applications are powered by txtai. 证据：`docs/poweredby.md`\n- **Use Cases**（documentation）：The following sections introduce common txtai use cases. A comprehensive set of over 70 example notebooks and applications ../examples are also available. 证据：`docs/usecases.md`\n- **Why txtai?**（documentation）：! why images/why.png only-light ! why images/why-dark.png only-dark 证据：`docs/why.md`\n- **Configuration**（documentation）：An agent takes two main arguments, an LLM and a list of tools. 证据：`docs/agent/configuration.md`\n- **Agent**（documentation）：An agent automatically creates workflows to answer multi-faceted user requests. Agents iteratively prompt and/or interface with tools to step through a process and ultimately come to an answer for a request. 证据：`docs/agent/index.md`\n- **Methods**（documentation）：::: txtai.agent.base.Agent. init ::: txtai.agent.base.Agent. call 证据：`docs/agent/methods.md`\n- **Distributed embeddings clusters**（documentation）：The API supports combining multiple API instances into a single logical embeddings index. An example configuration is shown below. 证据：`docs/api/cluster.md`\n- **Configuration**（documentation）：Configuration is set through YAML. In most cases, YAML keys map to fields names in Python. The example in the previous section ../ gave a full-featured example covering a wide array of configuration options. 证据：`docs/api/configuration.md`\n- **Customization**（documentation）：The txtai API has a number of features out of the box that are designed to help get started quickly. API services can also be augmented with custom code and functionality. The two main ways to do this are with extensions and dependencies. 证据：`docs/api/customization.md`\n- **API**（documentation）：! api ../images/api.png only-light ! api ../images/api-dark.png only-dark 证据：`docs/api/index.md`\n- **Model Context Protocol**（documentation）：The Model Context Protocol MCP https://modelcontextprotocol.io/introduction is an open standard that enables developers to build secure, two-way connections between their data sources and AI-powered tools. 证据：`docs/api/mcp.md`\n- **Methods**（documentation）：::: txtai.api.API options: inherited members: true filters: - \"! del \" - \"!flows\" - \"!function\" - \"!indexes\" - \"!limit\" - \"!pipes\" - \"!read\" - \"!resolve\" - \"!weights\" 证据：`docs/api/methods.md`\n- **OpenAI-compatible API**（documentation）：The API can be configured to serve an OpenAI-compatible API as shown below. 证据：`docs/api/openai.md`\n- **Security**（documentation）：The default implementation of an API service runs via HTTP and is fully open. If the service is being run as a prototype on an internal network, that may be fine. In most scenarios, the connection should at least be encrypted. Authorization is another built-in feature that requires a valid API token with each request. See below for more. 证据：`docs/api/security.md`\n- **ANN**（documentation）：Approximate Nearest Neighbor ANN index configuration for storing vector embeddings. 证据：`docs/embeddings/configuration/ann.md`\n- **Cloud**（documentation）：The following describes parameters used to sync indexes with cloud storage. 
Cloud object storage, the Hugging Face Hub https://huggingface.co/models and custom providers are all supported. 证据：`docs/embeddings/configuration/cloud.md`\n- **Database**（documentation）：Databases store metadata, text and binary content. 证据：`docs/embeddings/configuration/database.md`\n- **General**（documentation）：Enables sparse keyword indexing for this embeddings. 证据：`docs/embeddings/configuration/general.md`\n- **Graph**（documentation）：Enable graph storage via the graph parameter. This component requires the graph ../../../install/ graph extras package. 证据：`docs/embeddings/configuration/graph.md`\n- **Configuration**（documentation）：The following describes available embeddings configuration. These parameters are set in the Embeddings constructor ../methods txtai.embeddings.base.Embeddings. init via either the config parameter or as keyword arguments. 证据：`docs/embeddings/configuration/index.md`\n- **Scoring**（documentation）：Enable scoring support via the scoring parameter. 证据：`docs/embeddings/configuration/scoring.md`\n- **Vectors**（documentation）：The following covers available vector model configuration options. 证据：`docs/embeddings/configuration/vectors.md`\n- **Index format**（documentation）：! format ../images/format.png only-light ! format ../images/format-dark.png only-dark 证据：`docs/embeddings/format.md`\n- **Embeddings**（documentation）：! embeddings ../images/embeddings.png only-light ! embeddings ../images/embeddings-dark.png only-dark 证据：`docs/embeddings/index.md`\n- **Index guide**（documentation）：! indexing ../images/indexing.png only-light ! indexing ../images/indexing-dark.png only-dark 证据：`docs/embeddings/indexing.md`\n- **Methods**（documentation）：::: txtai.embeddings.Embeddings options: filters: - \"!columns\" - \"!createann\" - \"!createcloud\" - \"!createdatabase\" - \"!creategraph\" - \"!createids\" - \"!createindexes\" - \"!createscoring\" - \"!checkarchive\" - \"!configure\" - \"!defaultallowed\" - \"!defaults\" - \"!initindex\" - \"!loadquery\" - \"!loadvectors\" 证据：`docs/embeddings/methods.md`\n- **Query guide**（documentation）：! query ../images/query.png only-light ! query ../images/query-dark.png only-dark 证据：`docs/embeddings/query.md`\n- **Audio Mixer**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/audio/audiomixer.md`\n- **Audio Stream**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/audio/audiostream.md`\n- **Microphone**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/audio/microphone.md`\n- **Text To Audio**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/audio/texttoaudio.md`\n- **Text To Speech**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/audio/texttospeech.md`\n- **Transcription**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/audio/transcription.md`\n- **File To HTML**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/data/filetohtml.md`\n- **HTML To Markdown**（documentation）：! pipeline ../../images/pipeline.png only-light ! 
pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/data/htmltomd.md`\n- **Segmentation**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/data/segmentation.md`\n- **Tabular**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/data/tabular.md`\n- **Textractor**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/data/textractor.md`\n- **Tokenizer**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/data/tokenizer.md`\n- **Caption**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/image/caption.md`\n- **ImageHash**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/image/imagehash.md`\n- **Objects**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/image/objects.md`\n- **Pipeline**（documentation）：! pipeline ../images/pipeline.png only-light ! pipeline ../images/pipeline-dark.png only-dark 证据：`docs/pipeline/index.md`\n- **LLM**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/llm/llm.md`\n- **RAG**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/llm/rag.md`\n- **Entity**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/text/entity.md`\n- **Labels**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/text/labels.md`\n- **Reranker**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/text/reranker.md`\n- **Similarity**（documentation）：! pipeline ../../images/pipeline.png only-light ! pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/text/similarity.md`\n- **Summary**（documentation）：! pipeline ../../images/pipeline.png only-light ! 
pipeline ../../images/pipeline-dark.png only-dark 证据：`docs/pipeline/text/summary.md`\n- 其余 20 条证据见 `AI_CONTEXT_PACK.json` 或 `EVIDENCE_INDEX.json`。\n\n## 宿主 AI 必须遵守的规则\n\n- **把本资产当作开工前上下文，而不是运行环境。**：AI Context Pack 只包含证据化项目理解，不包含目标项目的可执行状态。 证据：`docs/install.md`, `README.md`, `LICENSE`\n- **回答用户时区分可预览内容与必须安装后才能验证的内容。**：安装前体验的消费者价值来自降低误装和误判，而不是伪装成真实运行。 证据：`docs/install.md`, `README.md`, `LICENSE`\n\n## 用户开工前应该回答的问题\n\n- 你准备在哪个宿主 AI 或本地环境中使用它？\n- 你只是想先体验工作流，还是准备真实安装？\n- 你最在意的是安装成本、输出质量、还是和现有规则的冲突？\n\n## 验收标准\n\n- 所有能力声明都能回指到 evidence_refs 中的文件路径。\n- AI_CONTEXT_PACK.md 没有把预览包装成真实运行。\n- 用户能在 3 分钟内看懂适合谁、能做什么、如何开始和风险边界。\n\n---\n\n## Doramagic Context Augmentation\n\n下面内容用于强化 Repomix/AI Context Pack 主体。Human Manual 只提供阅读骨架；踩坑日志会被转成宿主 AI 必须遵守的工作约束。\n\n## Human Manual 骨架\n\n使用规则：这里只是项目阅读路线和显著性信号，不是事实权威。具体事实仍必须回到 repo evidence / Claim Graph。\n\n宿主 AI 硬性规则：\n- 不得把页标题、章节顺序、摘要或 importance 当作项目事实证据。\n- 解释 Human Manual 骨架时，必须明确说它只是阅读路线/显著性信号。\n- 能力、安装、兼容性、运行状态和风险判断必须引用 repo evidence、source path 或 Claim Graph。\n\n- **Introduction to txtai**：importance `high`\n  - source_paths: README.md, src/python/txtai/__init__.py, docs/index.md\n- **Getting Started**：importance `high`\n  - source_paths: pyproject.toml, setup.py, docs/install.md, examples/01_Introducing_txtai.ipynb\n- **System Architecture**：importance `high`\n  - source_paths: src/python/txtai/embeddings/base.py, src/python/txtai/ann/__init__.py, src/python/txtai/ann/dense/factory.py, src/python/txtai/scoring/__init__.py, docs/api/index.md\n- **Embeddings Database**：importance `high`\n  - source_paths: src/python/txtai/embeddings/__init__.py, src/python/txtai/embeddings/base.py, src/python/txtai/embeddings/index/__init__.py, src/python/txtai/embeddings/search/__init__.py, src/python/txtai/vectors/__init__.py\n- **Pipelines**：importance `high`\n  - source_paths: src/python/txtai/pipeline/__init__.py, src/python/txtai/pipeline/factory.py, src/python/txtai/pipeline/base.py, src/python/txtai/pipeline/text/__init__.py, src/python/txtai/pipeline/audio/__init__.py\n- **Workflows**：importance `high`\n  - source_paths: src/python/txtai/workflow/__init__.py, src/python/txtai/workflow/base.py, src/python/txtai/workflow/factory.py, src/python/txtai/workflow/task/__init__.py, docs/workflow/index.md\n- **Agents**：importance `medium`\n  - source_paths: src/python/txtai/agent/__init__.py, src/python/txtai/agent/base.py, src/python/txtai/agent/factory.py, src/python/txtai/agent/tool/__init__.py, docs/agent/index.md\n- **Database Integration**：importance `high`\n  - source_paths: src/python/txtai/database/__init__.py, src/python/txtai/database/base.py, src/python/txtai/database/client.py, src/python/txtai/database/factory.py, src/python/txtai/database/schema/__init__.py\n\n## Repo Inspection Evidence / 源码检查证据\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `57de5b3a94e888219d543e84fe4b0abdaed94c28`\n- inspected_files: `pyproject.toml`, `README.md`, `docs/further.md`, `docs/why.md`, `docs/index.md`, `docs/usecases.md`, `docs/install.md`, `docs/poweredby.md`, `docs/examples.md`, `docs/cloud.md`, `docs/faq.md`, `docs/models.md`, `docs/observability.md`, `docs/api/customization.md`, `docs/api/security.md`, `docs/api/index.md`, `docs/api/configuration.md`, `docs/api/methods.md`, `docs/api/mcp.md`, `docs/api/openai.md`\n\n宿主 AI 硬性规则：\n- 没有 repo_clone_verified=true 时，不得声称已经读过源码。\n- 没有 repo_inspection_verified=true 时，不得把 README/docs/package 文件判断写成事实。\n- 没有 quick_start_verified=true 时，不得声称 Quick Start 已跑通。\n\n## Doramagic Pitfall Constraints / 
踩坑约束\n\n这些规则来自 Doramagic 发现、验证或编译过程中的项目专属坑点。宿主 AI 必须把它们当作工作约束，而不是普通说明文字。\n\n### Constraint 1: 来源证据：Add `txtai_minimal` package\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add `txtai_minimal` package\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_4e4050b59fdf4d51ac21e82d248e589e | https://github.com/neuml/txtai/issues/1090 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 2: 来源证据：Add custom Captions implementation\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Captions implementation\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_ff6fe05065564e85a549d7656b85a852 | https://github.com/neuml/txtai/issues/1084 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 3: 来源证据：Add custom Questions pipeline implementation\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Questions pipeline implementation\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_fa68ea29089d497bb8278c3c360c363c | https://github.com/neuml/txtai/issues/1087 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 4: 来源证据：Add custom Sequences pipeline implementation\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Sequences pipeline implementation\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_0099afb385ef498d8020521bbf3e9e64 | https://github.com/neuml/txtai/issues/1086 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 5: 来源证据：Add custom Summary pipeline implementation\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Summary pipeline implementation\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_9d8f75c31fc6450d8659b310f25d0bf4 | https://github.com/neuml/txtai/issues/1085 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 6: 来源证据：Add minimal Docker build script\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add minimal Docker build script\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_38e1e56a991b412bbc749d2c0b687d54 | https://github.com/neuml/txtai/issues/1092 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 7: 来源证据：Make `transformers` package optional for minimal install\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Make `transformers` package optional for minimal install\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_d755829200de4d93b2f4d2f71b36aaf1 | https://github.com/neuml/txtai/issues/1091 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 8: 来源证据：Reduce dependencies to just `numpy` for minimal install\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Reduce dependencies to just `numpy` for minimal install\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: 
community_evidence:github | cevd_395b735968ab4ec8ad849070d43cd18f | https://github.com/neuml/txtai/issues/1093 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 9: 来源证据：Support minimal install for edge devices\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support minimal install for edge devices\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_3ec8e039baf4412888ed3ac543ea1361 | https://github.com/neuml/txtai/issues/1089 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 10: 来源证据：Various training pipeline fixes for v5\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Various training pipeline fixes for v5\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_b47325d7496248a993d980c3d59f3269 | https://github.com/neuml/txtai/issues/1088 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n",
      "summary": "给宿主 AI 的上下文和工作边界。",
      "title": "AI Context Pack / 带给我的 AI"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card / 安装前决策卡\n\n项目：neuml/txtai\n\n## Doramagic 试用结论\n\n当前结论：可以进入发布前推荐检查；首次使用仍应从最小权限、临时目录和可回滚配置开始。\n\n## 用户现在可以做\n\n- 可以先阅读 Human Manual，理解项目目的和主要工作流。\n- 可以复制 Prompt Preview 做安装前体验；这只验证交互感，不代表真实运行。\n- 可以把官方 Quick Start 命令放到隔离环境中验证，不要直接进主力环境。\n\n## 现在不要做\n\n- 不要把 Prompt Preview 当成项目实际运行结果。\n- 不要把 metadata-only validation 当成沙箱安装验证。\n- 不要把未验证能力写成“已支持、已跑通、可放心安装”。\n- 不要在首次试用时交出生产数据、私人文件、真实密钥或主力配置目录。\n\n## 安装前检查\n\n- 宿主 AI 是否匹配：local_cli\n- 官方安装入口状态：已发现官方入口\n- 是否在临时目录、临时宿主或容器中验证：必须是\n- 是否能回滚配置改动：必须能\n- 是否需要 API Key、网络访问、读写文件或修改宿主配置：未确认前按高风险处理\n- 是否记录了安装命令、实际输出和失败日志：必须记录\n\n## 当前阻塞项\n\n- 无阻塞项。\n\n## 项目专属踩坑\n\n- 来源证据：Add `txtai_minimal` package（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：Add custom Captions implementation（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：Add custom Questions pipeline implementation（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：Add custom Sequences pipeline implementation（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 来源证据：Add custom Summary pipeline implementation（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n\n## 风险与权限提示\n\n- no_demo: medium\n\n## 证据缺口\n\n- 暂未发现结构化证据缺口。\n",
      "summary": "安装、权限、验证和推荐前风险。",
      "title": "Boundary & Risk Card / 边界与风险卡"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/neuml/txtai 项目说明书\n\n生成时间：2026-05-16 22:47:03 UTC\n\n## 目录\n\n- [Introduction to txtai](#introduction)\n- [Getting Started](#getting-started)\n- [System Architecture](#architecture)\n- [Embeddings Database](#embeddings-core)\n- [Pipelines](#pipelines)\n- [Workflows](#workflows)\n- [Agents](#agents)\n- [Database Integration](#database-integration)\n- [Graph Networks](#graph-networks)\n- [Scoring and Retrieval Algorithms](#scoring-algorithms)\n\n<a id='introduction'></a>\n\n## Introduction to txtai\n\n### 相关页面\n\n相关主题：[System Architecture](#architecture), [Getting Started](#getting-started)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/neuml/txtai/blob/main/README.md)\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n- [src/python/txtai/pipeline/data/textractor.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/data/textractor.py)\n- [src/python/txtai/pipeline/llm/llm.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n- [src/python/txtai/workflow/task/template.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/template.py)\n- [examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n- [src/python/txtai/agent/tool/read.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/read.py)\n</details>\n\n# Introduction to txtai\n\ntxtai is an **all-in-one AI framework** for semantic search, large language model (LLM) orchestration, and language model workflows. It provides a unified platform that combines vector search capabilities with traditional database features, enabling developers to build sophisticated AI-powered applications without managing multiple disparate systems.\n\n## Overview\n\nThe central innovation of txtai is its **embeddings database** - a hybrid data store that unifies:\n\n- **Sparse vector indexes** for keyword-based search\n- **Dense vector indexes** for semantic similarity\n- **Graph networks** for relationship modeling\n- **Relational databases** for structured data storage\n\nThis architecture enables txtai to function both as a standalone vector search engine and as a powerful knowledge source for RAG (Retrieval Augmented Generation) applications. 
资料来源：[README.md:1-20]()\n\n## Architecture\n\nThe txtai architecture follows a layered design that separates concerns while maintaining tight integration between components.\n\n```mermaid\ngraph TD\n    subgraph \"txtai Architecture\"\n        A[Application Layer] --> B[Workflow Engine]\n        B --> C[Pipeline System]\n        C --> D[Embeddings Database]\n        D --> E[(Vector Index)]\n        D --> F[(Graph Network)]\n        D --> G[(Relational DB)]\n    end\n    \n    subgraph \"Pipelines\"\n        H[LLM Pipeline] \n        I[Data Pipeline]\n        J[Text Pipeline]\n        K[Audio Pipeline]\n    end\n    \n    C --> H\n    C --> I\n    C --> J\n    C --> K\n```\n\n### Core Components\n\n| Component | Purpose | Key Features |\n|-----------|---------|--------------|\n| **Embeddings** | Vector database | Content storage, semantic search, hybrid queries |\n| **Workflows** | Task orchestration | Parallel/sequential execution, function routing |\n| **Pipelines** | Model execution | LLM, data processing, text operations |\n| **Agents** | Autonomous execution | Tool-based reasoning, MCP integration |\n\n## Key Features\n\n### Vector Search Capabilities\n\ntxtai provides comprehensive vector search functionality including:\n\n- Semantic/similarity/vector/neural search 资料来源：[README.md:45-50]()\n- SQL integration for hybrid queries\n- Object storage support\n- Topic modeling\n- Graph analysis\n- Multimodal indexing (text, audio, images, video)\n\n### Pipeline System\n\nThe pipeline system enables flexible model execution:\n\n| Pipeline Type | Capabilities |\n|---------------|--------------|\n| **LLM** | Text generation, chat, vision support 资料来源：[src/python/txtai/pipeline/llm/llm.py:1-50]() |\n| **Data** | Text extraction, document processing, chunking 资料来源：[src/python/txtai/pipeline/data/textractor.py:1-50]() |\n| **Text** | NER, transcription, translation |\n| **Audio** | Speech recognition, text-to-speech |\n| **Train** | Model fine-tuning, ONNX conversion |\n\n### Workflow Engine\n\nWorkflows orchestrate complex AI tasks using a template-based system:\n\n```mermaid\ngraph LR\n    A[Input] --> B[Template Task]\n    B --> C[LLM Pipeline]\n    C --> D[Output]\n    \n    B -.->|config| E[Template Rules]\n    E -->|format| B\n```\n\nThe template system supports named parameters, positional arguments, and strict/conservative parsing modes. 
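\n\nTo make those parameter rules concrete, here is a minimal sketch (the identity action is a placeholder for a real pipeline such as an LLM and is not part of txtai; dict keys map to named template parameters as described above):\n\n```python\nfrom txtai.workflow import Workflow\nfrom txtai.workflow.task import TemplateTask\n\n# Dict elements are passed to the template as named parameters\ntask = TemplateTask(\n    template=\"Answer the question '{question}' using this context: {context}\",\n    action=lambda prompts: prompts,  # placeholder; normally an LLM pipeline\n)\n\nworkflow = Workflow([task])\nprint(list(workflow([{\"question\": \"What is txtai?\", \"context\": \"txtai is an all-in-one AI framework\"}])))\n```\n\n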
资料来源：[src/python/txtai/workflow/task/template.py:1-40]()\n\n## Installation\n\n### Core Dependencies\n\n```python\n# Default installation includes\ndefault = [\n    \"faiss-cpu>=1.7.1.post2\",\n    \"huggingface-hub>=0.34.0\",\n    \"msgpack>=1.0.7\",\n    \"numpy>=1.18.4\",\n    \"regex>=2022.8.17\",\n    \"pyyaml>=5.3\",\n    \"safetensors>=0.4.5\",\n    \"torch>=2.4\",\n    \"transformers>=4.56.2\",\n]\n```\n\n资料来源：[setup.py:18-30]()\n\n### Optional Installations\n\n| Extra | Purpose | Command |\n|-------|---------|---------|\n| `pipeline` | All pipeline dependencies | `pip install txtai[pipeline]` |\n| `vectors` | Embedding models | `pip install txtai[vectors]` |\n| `api` | REST API server | `pip install txtai[api]` |\n| `agent` | Autonomous agents | `pip install txtai[agent]` |\n| `all` | Complete installation | `pip install txtai[all]` |\n\n资料来源：[setup.py:55-150]()\n\n## Basic Usage\n\n### Semantic Search\n\n```python\nimport txtai\n\nembeddings = txtai.Embeddings()\nembeddings.index([\"Correct\", \"Not what we hoped\"])\nresult = embeddings.search(\"positive\", 1)\n# Returns: [(0, 0.29862046241760254)]\n```\n\n资料来源：[README.md:35-42]()\n\n### Text Extraction\n\n```python\nfrom txtai.pipeline import Textractor\n\ntextractor = Textractor(backend=\"docling\", sections=True)\nfor chunk in textractor(\"document.pdf\"):\n    process(chunk)\n```\n\n资料来源：[examples/rag_quickstart.py:1-50]()\n\n### RAG Pipeline\n\n```python\nfrom txtai import Embeddings, RAG\nfrom txtai.pipeline import Textractor\n\n# Build embeddings database\nembeddings = Embeddings(content=True, path=\"Qwen/Qwen3-Embedding-0.6B\")\nembeddings.index(chunks)\n\n# Create RAG pipeline\ntemplate = \"\"\"\n  Answer the following question using the provided context.\n  Question: {question}\n  Context: {context}\n\"\"\"\n\nrag = RAG(\n    embeddings,\n    \"Qwen/Qwen3-0.6B\",\n    system=\"You are a friendly assistant\",\n    template=template,\n)\n\nresult = rag(\"Summarize the main advancements made by BERT\")\n```\n\n资料来源：[examples/rag_quickstart.py:50-80]()\n\n## Use Cases\n\n### Semantic Search\n\nTraditional keyword-based search systems match exact terms. Semantic search understands natural language and identifies results with similar meaning, regardless of keyword overlap. 资料来源：[README.md:45-55]()\n\n### RAG Applications\n\nThe embeddings database serves as a knowledge source for LLM applications:\n\n```mermaid\ngraph TD\n    A[User Query] --> B[Embeddings Search]\n    B --> C[Retrieve Context]\n    C --> D[LLM Generation]\n    D --> E[Response]\n    \n    F[(Embeddings DB)] -.->|retrieval| C\n```\n\n### Autonomous Agents\n\nAgents utilize tools to perform complex tasks:\n\n```python\nfrom txtai.agent.tool.read import ReadTool\n\ntool = ReadTool(maxlength=40000)\ncontent = tool.forward(\"path/to/file.txt\")\n```\n\n资料来源：[src/python/txtai/agent/tool/read.py:1-60]()\n\n## API Server\n\ntxtai includes a built-in REST API for cross-language compatibility:\n\n```yaml\n# app.yml\nembeddings:\n    path: sentence-transformers/all-MiniLM-L6-v2\n```\n\n```bash\nCONFIG=app.yml uvicorn \"txtai.api:app\"\ncurl -X GET \"http://localhost:8000/search?query=positive\"\n```\n\n资料来源：[README.md:50-60]()\n\n## Design Principles\n\n1. **Low footprint** - Install additional dependencies and scale up when needed 资料来源：[README.md:60-70]()\n2. **Local execution** - No need to ship data to remote services\n3. **Model flexibility** - Work with micromodels up to large language models\n4. 
**Modular design** - Use individual components or the full stack\n\n## Requirements\n\n- Python 3.10+\n- PyTorch 2.4+\n- Transformers 4.56.2+\n\nThe framework supports Hugging Face models, llama.cpp, Ollama, vLLM, and more for embeddings and LLM operations. 资料来源：[setup.py:18-30]()\n\n---\n\n<a id='getting-started'></a>\n\n## Getting Started\n\n### 相关页面\n\n相关主题：[Introduction to txtai](#introduction)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n- [README.md](https://github.com/neuml/txtai/blob/main/README.md)\n- [examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n- [examples/wiki.py](https://github.com/neuml/txtai/blob/main/examples/wiki.py)\n- [examples/workflows.py](https://github.com/neuml/txtai/blob/main/examples/workflows.py)\n- [src/python/txtai/pipeline/llm/llm.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n</details>\n\n# Getting Started\n\ntxtai is an all-in-one AI framework for semantic search, LLM orchestration and language model workflows. The Getting Started guide provides the essential knowledge needed to install, configure, and begin using txtai effectively.\n\n## Overview\n\ntxtai provides a unified interface for building AI applications with:\n\n- **Vector search** with SQL, object storage, topic modeling, graph analysis and multimodal indexing\n- **Embeddings** for text, documents, audio, images and video\n- **Pipelines** powered by language models for prompts, question-answering, labeling, transcription, and translation\n- **Workflows** that orchestrate multi-step processes\n- **Agents** for autonomous decision-making\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Installation\n\n### Requirements\n\n| Requirement | Minimum Version | Notes |\n|-------------|-----------------|-------|\n| Python | 3.10+ | Required runtime |\n| PyTorch | 2.4+ | Core deep learning framework |\n| Transformers | 4.56.2+ | Model loading and inference |\n| NumPy | 1.18.4+ | Numerical operations |\n\n资料来源：[setup.py:27-37](https://github.com/neuml/txtai/blob/main/setup.py)\n\n### Installation Methods\n\n#### Standard Installation\n\nStandard installation includes all default dependencies:\n\n```bash\npip install txtai\n```\n\nThis installs the core package with default dependencies including FAISS, HuggingFace Hub, msgpack, numpy, regex, PyYAML, safetensors, PyTorch, and transformers.\n\n#### Minimal Installation\n\nFor environments with limited resources, install a minimal version:\n\n```bash\nMINIMAL=1 pip install txtai\n```\n\n资料来源：[setup.py:18-25](https://github.com/neuml/txtai/blob/main/setup.py)\n\n### Optional Extras\n\ntxtai provides modular extras to scale functionality based on needs:\n\n| Extra | Purpose | Key Dependencies |\n|-------|---------|------------------|\n| `api` | REST API hosting | fastapi, uvicorn, aiohttp |\n| `pipeline-audio` | Audio processing | scipy, soundfile, webrtcvad |\n| `pipeline-data` | Data extraction | beautifulsoup4, docling, tika |\n| `pipeline-image` | Image processing | pillow, timm, imagehash |\n| `pipeline-llm` | LLM inference | litellm, llama-cpp-python |\n| `pipeline-text` | Text processing | gliner, sentencepiece |\n| `pipeline-train` | Model training | accelerate, peft, onnx |\n| `vectors` | Vector embeddings | sentence-transformers |\n| `ann` | ANN indexes | faiss, hnswlib, annoy |\n| `workflow` | Workflow tasks | requests, pandas, openpyxl |\n| `agent` | 
Agent framework | smolagents, mcpadapt |\n| `all` | Everything | All above extras |\n\n资料来源：[setup.py:38-98](https://github.com/neuml/txtai/blob/main/setup.py)\n\nInstall with extras:\n\n```bash\npip install txtai[api,workflow]\npip install txtai[all]\n```\n\n## Core Concepts\n\n### Embeddings Database\n\nThe central component of txtai is the **embeddings database**, which combines:\n\n- Vector indexes (sparse and dense)\n- Graph networks\n- Relational databases\n\nThis foundation enables vector search and serves as a knowledge source for LLM applications.\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n```mermaid\ngraph TD\n    A[Embeddings Database] --> B[Vector Indexes]\n    A --> C[Graph Networks]\n    A --> D[Relational Database]\n    B --> E[Dense Vectors]\n    B --> F[Sparse Vectors]\n```\n\n### Pipeline Architecture\n\nPipelines are language model powered components that perform specific tasks:\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Pipeline]\n    B --> C[LLM Pipeline]\n    B --> D[Data Pipeline]\n    B --> E[Text Pipeline]\n    C --> F[Generated Content]\n    D --> G[Extracted Data]\n    E --> H[Processed Text]\n```\n\nKey pipeline types:\n\n| Pipeline | Function |\n|----------|----------|\n| `Summary` | Text summarization |\n| `Textractor` | Text extraction from files |\n| `RAG` | Retrieval augmented generation |\n| `Transcription` | Audio to text |\n| `Translation` | Language translation |\n\n资料来源：[src/python/txtai/pipeline/llm/llm.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n\n### Workflow System\n\nWorkflows orchestrate multi-step processes by connecting tasks:\n\n```mermaid\ngraph TD\n    A[Workflow Input] --> B[Task 1]\n    B --> C[Task 2]\n    C --> D[Task N]\n    D --> E[Workflow Output]\n    \n    F[Template Task] -->|formats| B\n    G[URL Task] -->|fetches| B\n```\n\n资料来源：[examples/workflows.py](https://github.com/neuml/txtai/blob/main/examples/workflows.py)\n\n## Quick Start\n\n### Basic Semantic Search\n\nThe simplest way to get started with txtai is semantic search:\n\n```python\nimport txtai\n\nembeddings = txtai.Embeddings()\nembeddings.index([\"Correct\", \"Not what we hoped\"])\nembeddings.search(\"positive\", 1)\n# Returns: [(0, 0.29862046241760254)]\n```\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n### Building a RAG Application\n\nRetrieval Augmented Generation combines embeddings search with LLM inference:\n\n```python\nfrom txtai import Embeddings, RAG\nfrom txtai.pipeline import Textractor\n\n# Step 1: Extract text from documents\ntextractor = Textractor()\nchunks = []\nfor f in [\"document1.pdf\", \"document2.pdf\"]:\n    for chunk in textractor(f):\n        chunks.append((f, chunk))\n\n# Step 2: Build embeddings database\nembeddings = Embeddings(content=True, path=\"Qwen/Qwen3-Embedding-0.6B\", maxlength=2048)\nembeddings.index(chunks)\n\n# Step 3: Create RAG pipeline\ntemplate = \"\"\"\n  Answer the following question using the provided context.\n\n  Question:\n  {question}\n\n  Context:\n  {context}\n\"\"\"\n\nrag = RAG(\n    embeddings,\n    \"Qwen/Qwen3-0.6B\",\n    system=\"You are a friendly assistant\",\n    template=template,\n    output=\"flatten\",\n)\n\n# Step 4: Query\nquestion = \"Summarize the main advancements made by BERT\"\nprint(rag(question, maxlength=2048, stripthink=True))\n```\n\n资料来源：[examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n\n### Wikipedia Summarization\n\nExample 
combining external API with summarization:\n\n```python\nimport requests\nfrom txtai.pipeline import Summary\n\nclass Application:\n    SEARCH_TEMPLATE = \"https://en.wikipedia.org/w/api.php?action=opensearch&search=%s&limit=1&namespace=0&format=json\"\n    CONTENT_TEMPLATE = \"https://en.wikipedia.org/w/api.php?format=json&action=query&prop=extracts&exintro&explaintext&redirects=1&titles=%s\"\n\n    def __init__(self):\n        self.summary = Summary(\"sshleifer/distilbart-cnn-12-6\")\n\n    def query(self, query):\n        # Search Wikipedia\n        data = requests.get(self.SEARCH_TEMPLATE % query).json()\n        if data and data[1]:\n            page = data[1][0]\n            content = requests.get(self.CONTENT_TEMPLATE % page).json()\n            content = list(content[\"query\"][\"pages\"].values())[0][\"extract\"]\n            return self.summary(content)\n        return None\n```\n\n资料来源：[examples/wiki.py](https://github.com/neuml/txtai/blob/main/examples/wiki.py)\n\n## Configuration\n\n### Embeddings Configuration\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `path` | str | None | Model path (HuggingFace, llama.cpp, Ollama, vLLM) |\n| `content` | bool | False | Enable content storage |\n| `maxlength` | int | None | Maximum sequence length |\n\n资料来源：[examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n\n### LLM Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `maxlength` | int | None | Maximum sequence length |\n| `stream` | bool | False | Enable streaming response |\n| `stop` | list | None | List of stop strings |\n| `stripthink` | bool | varies | Strip thinking tags from output |\n| `defaultrole` | str | \"auto\" | Default role for text inputs |\n\n资料来源：[src/python/txtai/pipeline/llm/llm.py:18-34](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n\n### RAG Configuration\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `embeddings` | Embeddings | Embeddings database instance |\n| `path` | str | LLM model path |\n| `system` | str | System prompt for the LLM |\n| `template` | str | Prompt template with `{question}` and `{context}` |\n| `output` | str | Output format (\"flatten\", etc.) |\n\n## API Service\n\ntxtai includes a built-in API for language-agnostic applications:\n\n```yaml\n# app.yml\nembeddings:\n    path: sentence-transformers/all-MiniLM-L6-v2\n```\n\n```bash\nCONFIG=app.yml uvicorn \"txtai.api:app\"\ncurl -X GET \"http://localhost:8000/search?query=positive\"\n```\n\nBenefits of the API service:\n- Work with your programming language of choice\n- Run local - no need to ship data to remote services\n- Supports models from micromodels to large language models\n- Low footprint - scale up when needed\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Next Steps\n\nAfter completing the Getting Started guide, explore:\n\n1. **Example Notebooks** - Over 70 example notebooks covering all functionality\n2. **Workflows** - Build complex multi-step pipelines\n3. **Agents** - Create autonomous AI agents with tool use\n4. **Graph Analysis** - Network and relationship analysis\n5. 
**Model Training** - Fine-tune models for your use case\n\nAll examples are available at: https://neuml.github.io/txtai/examples\n\n---\n\n<a id='architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[Embeddings Database](#embeddings-core), [Database Integration](#database-integration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/neuml/txtai/blob/main/README.md)\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n- [src/python/txtai/pipeline/data/textractor.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/data/textractor.py)\n- [src/python/txtai/pipeline/llm/llm.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n- [src/python/txtai/agent/tool/function.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/function.py)\n- [src/python/txtai/agent/tool/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/factory.py)\n- [src/python/txtai/workflow/task/template.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/template.py)\n- [src/python/txtai/cloud/hub.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/cloud/hub.py)\n</details>\n\n# System Architecture\n\ntxtai is an **all-in-one AI framework** designed for semantic search, LLM orchestration, and language model workflows. The architecture is built around a modular, extensible design that allows developers to scale from micromodels to large language models with minimal footprint.\n\n## Architecture Overview\n\nThe txtai architecture consists of several interconnected layers that work together to provide comprehensive AI capabilities.\n\n```mermaid\ngraph TB\n    subgraph \"Client Layer\"\n        API[\"REST API\"]\n        Console[\"Console\"]\n        Library[\"Python Library\"]\n    end\n    \n    subgraph \"Core Engine\"\n        Embeddings[\"Embeddings Database\"]\n        Workflow[\"Workflow Engine\"]\n        Agent[\"Agent System\"]\n    end\n    \n    subgraph \"Pipeline Layer\"\n        LLMPipeline[\"LLM Pipeline\"]\n        DataPipeline[\"Data Pipeline\"]\n        TextPipeline[\"Text Pipeline\"]\n        TrainPipeline[\"Train Pipeline\"]\n        AudioPipeline[\"Audio Pipeline\"]\n        ImagePipeline[\"Image Pipeline\"]\n    end\n    \n    subgraph \"Index Layer\"\n        ANN[\"ANN Index\"]\n        Documents[\"Document Store\"]\n        Graph[\"Graph Network\"]\n        Database[\"Relational DB\"]\n    end\n    \n    API --> Embeddings\n    API --> Workflow\n    API --> Agent\n    Embeddings --> ANN\n    Embeddings --> Documents\n    Embeddings --> Graph\n    Embeddings --> Database\n```\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Core Components\n\n### Embeddings Database\n\nThe **embeddings database** is the foundational component of txtai. 
It serves as a union of multiple index types:\n\n| Index Type | Purpose | Description |\n|------------|---------|-------------|\n| Vector Index (Sparse) | Sparse embeddings | Traditional BM25-style scoring |\n| Vector Index (Dense) | Dense embeddings | Neural network-based semantic vectors |\n| Graph Networks | Relationship mapping | NetworkX-based graph structures |\n| Relational Databases | Structured data | SQLAlchemy-based data storage |\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\nThe embeddings database enables vector search capabilities and serves as a powerful knowledge source for large language model (LLM) applications.\n\n### Pipeline System\n\nPipelines are the processing units of txtai, each designed for specific tasks:\n\n```mermaid\ngraph LR\n    Input[\"Input Data\"] --> Pipeline[\"Pipeline\"]\n    \n    subgraph \"Pipeline Types\"\n        Audio[\"Audio Pipeline\"]\n        Data[\"Data Pipeline\"]\n        Image[\"Image Pipeline\"]\n        LLM[\"LLM Pipeline\"]\n        Text[\"Text Pipeline\"]\n        Train[\"Train Pipeline\"]\n    end\n    \n    Pipeline --> Output[\"Output Data\"]\n```\n\n#### Data Pipeline\n\nThe Data Pipeline handles text extraction and processing:\n\n| Component | Function |\n|-----------|----------|\n| `Textractor` | Extracts text from files and URLs |\n| `FileToHTML` | Converts files to HTML format |\n| `HTMLToMarkdown` | Transforms HTML to Markdown |\n| `Segmentation` | Handles text segmentation (sentences, lines, paragraphs) |\n\nThe `Textractor` class supports multiple backends for file extraction:\n\n```python\ndef __init__(\n    self,\n    sentences=False,\n    lines=False,\n    paragraphs=False,\n    minlength=None,\n    join=False,\n    sections=False,\n    cleantext=True,\n    chunker=None,\n    headers=None,\n    backend=\"available\",\n    safeopen=False,\n    **kwargs,\n):\n```\n\n资料来源：[src/python/txtai/pipeline/data/textractor.py:19-37](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/data/textractor.py)\n\n#### LLM Pipeline\n\nThe LLM Pipeline provides LLM generation capabilities:\n\n```python\ndef __call__(\n    self,\n    text,\n    maxlength=None,\n    stream=False,\n    stop=None,\n    defaultrole=\"auto\",\n    stripthink=None,\n    **kwargs,\n):\n```\n\nKey features:\n- Supports streaming responses\n- Configurable stop sequences\n- Built-in thinking tag stripping\n- Chat and vision model support\n\n资料来源：[src/python/txtai/pipeline/llm/llm.py:1-50](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n\n### Agent System\n\nThe Agent System enables autonomous AI agents with tool use capabilities:\n\n```mermaid\ngraph TD\n    User[\"User Input\"] --> Agent[\"Agent\"]\n    Agent --> Tools[\"Tool Collection\"]\n    Tools --> EmbeddingsTool[\"Embeddings Tool\"]\n    Tools --> FunctionTool[\"Function Tool\"]\n    Tools --> DefaultTools[\"Default Tools\"]\n    \n    subgraph \"Tool Factory\"\n        create[\"create()\"]\n        createtool[\"createtool()\"]\n    end\n    \n    Tools --> Result[\"Result\"]\n    Result --> Agent\n```\n\n#### FunctionTool\n\nA `FunctionTool` wraps descriptive configuration with a target function for LLM prompts:\n\n```python\nclass FunctionTool(Tool):\n    def __init__(self, config):\n        self.name = config[\"name\"]\n        self.description = config[\"description\"]\n        self.inputs = config[\"inputs\"]\n        self.output_type = config.get(\"output\", config.get(\"output_type\", \"any\"))\n        self.target = 
config[\"target\"]\n```\n\n资料来源：[src/python/txtai/agent/tool/function.py:1-50](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/function.py)\n\n#### Tool Factory\n\nThe `ToolFactory` creates tools from various sources:\n\n| Input Type | Creation Method |\n|------------|------------------|\n| Tool instance | Direct pass-through |\n| Function/Method | Auto-wrapped with `createtool()` |\n| Dictionary | Config-based creation |\n| String alias | Lookup in `DEFAULTS` registry |\n| HTTP URL | MCP tool collection import |\n\n资料来源：[src/python/txtai/agent/tool/factory.py:1-100](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/factory.py)\n\n### Workflow Engine\n\nThe Workflow Engine processes tasks through a series of steps:\n\n```mermaid\ngraph LR\n    Task[\"Task Input\"] --> Template[\"Template Task\"]\n    Template --> Process[\"Process\"]\n    Process --> Output[\"Task Output\"]\n```\n\n#### Template Task\n\nThe `TemplateTask` generates text from templates, supporting LLM prompt preparation:\n\n```python\nclass TemplateTask(Task):\n    def register(self, template=None, rules=None, strict=True):\n        self.template = template if template else self.defaulttemplate()\n        self.rules = rules if rules else self.defaultrules()\n        self.formatter = TemplateFormatter() if strict else Formatter()\n```\n\nTemplate processing supports three input types:\n\n| Input Type | Processing Method |\n|------------|-------------------|\n| Dictionary | Named parameters from keys |\n| Tuple | Numbered parameters (arg0, arg1, ...) |\n| Other | Default `{text}` parameter |\n\n资料来源：[src/python/txtai/workflow/task/template.py:1-50](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/template.py)\n\n## Module Organization\n\n### Package Structure\n\n```\ntxtai/\n├── agent/           # Agent system and tools\n├── api/             # REST API implementation\n├── cloud/           # Cloud integration (Hub)\n├── embeddings/      # Embeddings database\n├── scoring/         # Scoring/ranking system\n├── pipeline/       # Pipeline components\n│   ├── audio/      # Audio processing\n│   ├── data/       # Data extraction\n│   ├── image/      # Image processing\n│   ├── llm/        # LLM generation\n│   ├── text/       # Text processing\n│   └── train/      # Model training\n└── workflow/       # Workflow engine\n```\n\n### Dependency Groups\n\nThe project uses optional dependency groups for modular installation:\n\n| Extra | Purpose | Key Dependencies |\n|-------|---------|------------------|\n| `default` | Core functionality | faiss-cpu, numpy, torch, transformers |\n| `api` | REST API | fastapi, uvicorn, aiohttp |\n| `pipeline` | All pipelines | Combination of all pipeline extras |\n| `agent` | Agent system | smolagents, mcpadapt, jinja2 |\n| `vectors` | Vector models | sentence-transformers, litellm |\n\n资料来源：[setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n\n## Data Flow Architecture\n\n### Semantic Search Flow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Embeddings\n    participant ANN\n    participant Documents\n    \n    User->>Embeddings: index(documents)\n    Embeddings->>ANN: build(vector)\n    Embeddings->>Documents: store(data)\n    \n    User->>Embeddings: search(query)\n    Embeddings->>ANN: query(vector)\n    ANN-->>Embeddings: top-k results\n    Embeddings->>Documents: retrieve(ids)\n    Documents-->>Embeddings: document content\n    Embeddings-->>User: ranked results\n```\n\n### RAG Pipeline 
Flow\n\n```mermaid\ngraph TD\n    Question[\"User Question\"] --> Embeddings[\"Embeddings DB\"]\n    Embeddings --> Context[\"Retrieved Context\"]\n    Context --> LLM[\"LLM Pipeline\"]\n    LLM --> Answer[\"Generated Answer\"]\n    \n    subgraph \"Indexing Phase\"\n        Documents[\"Documents\"] --> Textractor[\"Textractor\"]\n        Textractor --> Chunks[\"Text Chunks\"]\n        Chunks --> Embeddings\n    end\n```\n\n## Cloud Integration\n\n### Hub Module\n\nThe Hub module enables model and data sharing via Hugging Face:\n\n```python\ndef upload(self, path):\n    with open(path, \"r\", encoding=\"utf-8\") as f:\n        content = f.read()\n    \n    if \"embeddings \" not in content:\n        content += \"documents filter=lfs diff=lfs merge=lfs -text\\n\"\n        content += \"embeddings filter=lfs diff=lfs merge=lfs -text\\n\"\n    \n    huggingface_hub.upload_file(...)\n```\n\nFeatures:\n- Automatic LFS tracking for embeddings\n- Token-based authentication\n- Remote index management\n\n资料来源：[src/python/txtai/cloud/hub.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/cloud/hub.py)\n\n## Use Cases Built on Architecture\n\n| Use Case | Architecture Components |\n|----------|------------------------|\n| Semantic Search | Embeddings + ANN Index + Workflow |\n| RAG | Embeddings + LLM Pipeline + Template Task |\n| LLM Orchestration | Agent System + Tool Factory + LLM Pipeline |\n| Document Processing | Data Pipeline + Embeddings + Workflow |\n| Multi-model Workflows | Pipeline Composition + Workflow Engine |\n\n## Summary\n\nThe txtai architecture provides a flexible, extensible foundation for AI applications:\n\n1. **Embeddings Database** - Unified vector/graph/relational indexing\n2. **Pipeline System** - Modular processing for diverse data types\n3. **Agent System** - Tool-augmented autonomous AI\n4. **Workflow Engine** - Task orchestration and automation\n5. 
**Cloud Integration** - Model sharing and deployment\n\nThis modular design enables developers to start with minimal installations and scale up capabilities as needed, supporting everything from simple semantic search to complex multi-model RAG systems with autonomous agents.\n\n---\n\n<a id='embeddings-core'></a>\n\n## Embeddings Database\n\n### 相关页面\n\n相关主题：[System Architecture](#architecture), [Scoring and Retrieval Algorithms](#scoring-algorithms)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python/txtai/embeddings/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/__init__.py)\n- [src/python/txtai/embeddings/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/base.py)\n- [src/python/txtai/embeddings/index/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/index/__init__.py)\n- [src/python/txtai/embeddings/search/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/search/__init__.py)\n- [src/python/txtai/vectors/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/vectors/__init__.py)\n- [docs/embeddings/index.md](https://github.com/neuml/txtai/blob/main/docs/embeddings/index.md)\n</details>\n\n# Embeddings Database\n\nThe Embeddings Database is the foundational component of txtai, providing unified vector search capabilities that combine sparse and dense vector indexing, graph networks, and relational database functionality.\n\n## Overview\n\nThe txtai embeddings database serves as a comprehensive solution for semantic search and vector-based data retrieval. It extends traditional database capabilities by incorporating neural network-based embedding generation and similarity search.\n\n**Core Architecture:**\n\n```mermaid\ngraph TD\n    A[Embeddings Database] --> B[Vector Indexes]\n    A --> C[Graph Networks]\n    A --> D[Relational Database]\n    B --> B1[Dense Vectors]\n    B --> B2[Sparse Vectors]\n```\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Key Features\n\n| Feature | Description |\n|---------|-------------|\n| Vector Search | Semantic similarity search using dense and sparse embeddings |\n| SQL Support | Traditional database queries combined with vector search |\n| Content Storage | Built-in document content storage |\n| Graph Analysis | Integration with graph networks for relationship-based queries |\n| Multimodal Indexing | Support for text, documents, audio, images, and video |\n| Topic Modeling | Built-in topic modeling capabilities |\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Installation and Setup\n\n### Basic Installation\n\n```python\nimport txtai\n\nembeddings = txtai.Embeddings()\n```\n\n### Configuration Options\n\nThe embeddings database can be configured with the following key parameters:\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `path` | string | None | Model path (Hugging Face, llama.cpp, Ollama, vLLM) |\n| `content` | bool | False | Enable content storage |\n| `maxlength` | int | None | Maximum sequence length |\n| `quantize` | bool | False | Enable model quantization |\n| `gpu` | bool | True | Enable GPU inference |\n\n资料来源：[examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n\n## Core Operations\n\n### Indexing Documents\n\n```python\nembeddings = txtai.Embeddings()\nembeddings.index([\"Correct\", \"Not what we 
hoped\"])\n```\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n### Semantic Search\n\n```python\nresults = embeddings.search(\"positive\", 1)\n# Returns: [(0, 0.29862046241760254)]\n```\n\nThe search method returns tuples of (index, score) ordered by similarity.\n\n## Architecture Components\n\n### Vector Indexes\n\nThe embeddings database supports multiple vector index backends:\n\n| Backend | Package | Configuration |\n|---------|---------|---------------|\n| FAISS | `faiss-cpu` | Default backend |\n| Annoy | `annoy` | Via `ann` extra |\n| HNSWLib | `hnswlib` | Via `ann` extra |\n| pgvector | `pgvector` | Via `ann` extra |\n| sqlite-vec | `sqlite-vec` | Via `ann` extra |\n\n资料来源：[setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n\n### Vector Models\n\nSupported embedding model providers:\n\n- **Sentence Transformers**: `sentence-transformers>=5.0.0`\n- **ONNX Models**: `onnx>=1.11.0`, `onnxruntime>=1.11.0`\n- **llama.cpp**: `llama-cpp-python>=0.2.75`\n- **LiteLLM**: `litellm>=1.37.16`\n- **Model2Vec**: `model2vec>=0.3.0`\n- **Static Vectors**: `staticvectors>=0.2.0`\n\n资料来源：[setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n\n## Data Flow\n\n```mermaid\ngraph LR\n    A[Input Text] --> B[Tokenizer]\n    B --> C[Embedding Model]\n    C --> D[Vector Index]\n    D --> E[Search Query]\n    E --> F[Similarity Score]\n    F --> G[Results]\n    \n    H[(Content Store)] --> G\n```\n\n## RAG Integration\n\nThe embeddings database serves as the knowledge source for Retrieval Augmented Generation (RAG) workflows:\n\n```python\nfrom txtai import Embeddings, RAG\n\n# Build embeddings database\nembeddings = Embeddings(content=True, path=\"Qwen/Qwen3-Embedding-0.6B\", maxlength=2048)\nembeddings.index(chunks)\n\n# Create RAG pipeline\nrag = RAG(\n    embeddings,\n    \"Qwen/Qwen3-0.6B\",\n    system=\"You are a friendly assistant\",\n    template=template,\n    output=\"flatten\",\n)\n```\n\n资料来源：[examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n\n### RAG Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `similarity` | Embeddings/Similarity | Knowledge source database |\n| `path` | string | LLM model path |\n| `quantize` | bool | Enable model quantization |\n| `gpu` | bool | Enable GPU inference |\n| `template` | string | Prompt template |\n| `context` | int | Maximum context length |\n| `minscore` | float | Minimum similarity score |\n| `mintokens` | int | Minimum token count |\n\n资料来源：[src/python/txtai/pipeline/llm/rag.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/rag.py)\n\n## API Deployment\n\nThe embeddings database can be exposed as a REST API:\n\n```yaml\n# app.yml\nembeddings:\n    path: sentence-transformers/all-MiniLM-L6-v2\n```\n\n```bash\nCONFIG=app.yml uvicorn \"txtai.api:app\"\ncurl -X GET \"http://localhost:8000/search?query=positive\"\n```\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Workflow Integration\n\nThe embeddings database integrates with txtai workflows for processing pipelines:\n\n```python\nfrom txtai import Embeddings\nfrom txtai.pipeline import Textractor\n\n# Text extraction workflow\ntextractor = Textractor()\nchunks = []\nfor f in files:\n    for chunk in textractor(f):\n        chunks.append((f, chunk))\n\n# Index extracted content\nembeddings = Embeddings(content=True, 
path=\"Qwen/Qwen3-Embedding-0.6B\")\nembeddings.index(chunks)\n```\n\n资料来源：[examples/rag_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/rag_quickstart.py)\n\n## Dependencies\n\n### Core Dependencies\n\n| Package | Version | Purpose |\n|---------|---------|---------|\n| `faiss-cpu` | >=1.7.1.post2 | Default vector index |\n| `torch` | >=2.4 | Tensor operations |\n| `transformers` | >=4.56.2 | Model inference |\n| `huggingface-hub` | >=0.34.0 | Model management |\n| `numpy` | >=1.18.4 | Numerical operations |\n| `safetensors` | >=0.4.5 | Model serialization |\n\n资料来源：[setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n\n### Optional Dependencies\n\n| Extra | Packages | Use Case |\n|-------|----------|----------|\n| `ann` | annoy, hnswlib, pgvector | Alternative indexes |\n| `vectors` | sentence-transformers, litellm | Vector models |\n| `similarity` | ann + vectors | Full search capabilities |\n\n## Use Cases\n\n### Semantic Search\n\nTraditional search systems use keywords to find data. Semantic search understands natural language and identifies results with the same meaning, not necessarily the same keywords.\n\n### Knowledge Base for LLMs\n\nThe embeddings database serves as a powerful knowledge source for LLM applications, enabling retrieval augmented generation (RAG) processes.\n\n### Multimodal Applications\n\n- **Text**: Document embeddings for text classification and clustering\n- **Images**: Visual similarity search\n- **Audio**: Speech-to-text with semantic search\n- **Video**: Frame-level semantic indexing\n\n资料来源：[README.md](https://github.com/neuml/txtai/blob/main/README.md)\n\n## Performance Considerations\n\n- **Index Size**: Vector indexes scale with document count and embedding dimensionality\n- **GPU Acceleration**: Enable `gpu=True` for faster inference on compatible hardware\n- **Quantization**: Use `quantize=True` to reduce memory footprint with minimal accuracy loss\n- **Batch Processing**: Index documents in batches for optimal throughput\n\n## See Also\n\n- [RAG Pipeline](https://neuml.github.io/txtai/pipeline/text/rag) - Retrieval Augmented Generation\n- [Similarity Pipeline](https://neuml.github.io/txtai/pipeline/text/similarity) - Text similarity scoring\n- [Workflows](https://neuml.github.io/txtai/workflows) - Pipeline orchestration\n- [Agent Framework](https://neuml.github.io/txtai/agent) - Autonomous AI agents\n\n---\n\n<a id='pipelines'></a>\n\n## Pipelines\n\n### 相关页面\n\n相关主题：[Workflows](#workflows), [Agents](#agents)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python/txtai/pipeline/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/__init__.py)\n- [src/python/txtai/pipeline/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/factory.py)\n- [src/python/txtai/pipeline/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/base.py)\n- [src/python/txtai/pipeline/text/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/text/__init__.py)\n- [src/python/txtai/pipeline/audio/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/audio/__init__.py)\n- [src/python/txtai/pipeline/llm/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/__init__.py)\n- [docs/pipeline/index.md](https://github.com/neuml/txtai/blob/main/docs/pipeline/index.md)\n</details>\n\n# Pipelines\n\n## Overview\n\nPipelines are the core processing components in txtai that power language 
model workflows. They provide a unified interface for various text processing, transformation, and generation tasks including text extraction, summarization, transcription, translation, and LLM inference.\n\nThe pipeline system enables txtai to work with text, documents, audio, images and video by providing modular, composable processing units that can be chained together to build complex workflows. 资料来源：[setup.py](setup.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    subgraph \"Pipeline Categories\"\n        Text[Text Pipelines]\n        Audio[Audio Pipelines]\n        LLM[LLM Pipelines]\n        Data[Data Pipelines]\n    end\n    \n    subgraph \"Base Infrastructure\"\n        Base[Pipeline Base Class]\n        Factory[Pipeline Factory]\n    end\n    \n    Text --> Base\n    Audio --> Base\n    LLM --> Base\n    Data --> Base\n    Base --> Factory\n```\n\n## Pipeline Categories\n\n### Text Pipelines\n\nText pipelines handle natural language processing tasks such as entity recognition, text classification, and language detection.\n\n| Pipeline | Description | Key Features |\n|----------|-------------|--------------|\n| Labels | Text classification and labeling | Supports GLiNER-based models for NER and classification |\n| Translation | Language translation | Powered by transformers |\n| Segmentation | Text splitting | Sentence, line, paragraph, and section segmentation |\n\n资料来源：[src/python/txtai/pipeline/text/__init__.py](src/python/txtai/pipeline/text/__init__.py)\n\n### Audio Pipelines\n\nAudio pipelines enable speech-to-text transcription and processing of audio content.\n\n| Component | Description |\n|-----------|-------------|\n| Transcription | Convert audio to text using webrtcvad for voice activity detection |\n| Audio processing | Support for sounddevice and soundfile libraries |\n\nThese pipelines require `scipy`, `sounddevice`, `soundfile`, and `webrtcvad-wheels` dependencies. 资料来源：[setup.py](setup.py)\n\n### LLM Pipelines\n\nLLM (Large Language Model) pipelines provide interfaces for text generation, chat completion, and vision-capable model interactions.\n\n```mermaid\ngraph LR\n    A[Input Text/List/Dict] --> B[Generator]\n    B --> C[stream]\n    B --> D[batch]\n    C --> E[Streaming Response]\n    D --> F[Complete Response]\n```\n\nThe LLM pipeline supports multiple backends including HuggingFace transformers and liteLLM for unified LLM access. 
Key capabilities include:\n\n- **Chat completion**: Supports both raw prompts and structured chat templates\n- **Vision support**: Can process image inputs with vision-capable models\n- **Streaming**: Real-time token-by-token response streaming\n- **Stop sequences**: Configurable stop tokens for controlled generation\n\n资料来源：[src/python/txtai/pipeline/llm/llm.py](src/python/txtai/pipeline/llm/llm.py), [src/python/txtai/pipeline/llm/huggingface.py](src/python/txtai/pipeline/llm/huggingface.py)\n\n### Data Pipelines\n\nData pipelines handle document parsing, text extraction, and content conversion.\n\n#### Textractor\n\nThe Textractor class extracts text from files using various backends:\n\n```python\nTextractor(\n    sentences=False,      # Split into sentences\n    lines=False,          # Split into lines\n    paragraphs=False,     # Split into paragraphs\n    minlength=None,       # Minimum text length filter\n    join=False,           # Join segments\n    sections=False,       # Extract sections\n    cleantext=True,       # Clean extracted text\n    chunker=None,         # Custom text chunker\n    headers=None,         # HTTP headers for URLs\n    backend=\"available\",  # \"tika\", \"docling\", or \"available\"\n    safeopen=False        # Restrict to safe URLs only\n)\n```\n\nSupported backends:\n- **Tika**: Apache Tika for comprehensive document parsing\n- **Docling**: Alternative backend using docling library\n\n资料来源：[src/python/txtai/pipeline/data/textractor.py](src/python/txtai/pipeline/data/textractor.py)\n\n#### FileToHTML\n\nConverts various file formats to HTML for further processing.\n\n```mermaid\ngraph LR\n    A[File Input] --> B[Backend Selection]\n    B --> C{Tika Available?}\n    C -->|Yes| D[Tika Backend]\n    C -->|No| E{Docling Available?}\n    E -->|Yes| F[Docling Backend]\n    E -->|No| G[Return None]\n    D --> H[HTML Output]\n    F --> H\n```\n\nThe backend detection follows this priority:\n1. Tika (if `tika` package is installed)\n2. Docling (if `docling` package is installed)\n3. Returns `None` if no backend available\n\n资料来源：[src/python/txtai/pipeline/data/filetohtml.py](src/python/txtai/pipeline/data/filetohtml.py)\n\n#### HTMLToMarkdown\n\nConverts HTML content to Markdown format, supporting paragraph and section-level processing.\n\n## Pipeline Factory\n\nThe pipeline factory pattern enables dynamic pipeline instantiation based on configuration:\n\n```python\n# Via factory\npipeline = PipelineFactory.create(config)\n\n# Direct instantiation\npipeline = Summary(\"model-name\")\npipeline = Textractor()\n```\n\nThis allows pipelines to be defined in YAML configuration files and instantiated programmatically. 
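\n\nAs an illustrative sketch of that configuration-driven flow (the file name `config.yml` and the workflow name `summarize` are placeholders; the summarization model is the one used in the Wikipedia example earlier in this manual):\n\n```yaml\n# config.yml\nsummary:\n    path: sshleifer/distilbart-cnn-12-6\n\nworkflow:\n    summarize:\n        tasks:\n            - action: summary\n```\n\n```python\nfrom txtai import Application\n\n# Application builds the pipelines and workflows declared in the YAML config\napp = Application(\"config.yml\")\nprint(list(app.workflow(\"summarize\", [\"Long article text to summarize goes here\"])))\n```\n\n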
资料来源：[src/python/txtai/pipeline/factory.py](src/python/txtai/pipeline/factory.py)\n\n## Common Patterns\n\n### Text Processing Chain\n\n```mermaid\ngraph TD\n    A[File/URL] --> B[Textractor]\n    B --> C[Segmentation]\n    C --> D[Text Clean]\n    D --> E[Ready for Embeddings]\n```\n\n### LLM Generation Flow\n\n```python\nfrom txtai.pipeline import LLM\n\n# Direct model loading\nllm = LLM(\"model-name\")\n\n# Generate with various input formats\nresult = llm(\"What is artificial intelligence?\")\nresult = llm([{\"role\": \"user\", \"content\": \"Hello\"}])\nresult = llm(\"Generate a summary\", stream=True)\n```\n\nParameters supported by LLM pipelines:\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| text | str/list/dict | required | Input text or messages |\n| maxlength | int | 512 | Maximum sequence length |\n| stream | bool | False | Enable streaming response |\n| stop | list | None | Stop sequences |\n| defaultrole | str | \"auto\" | Default role for text inputs |\n| stripthink | bool | None | Strip thinking tags |\n\n资料来源：[src/python/txtai/pipeline/llm/llm.py](src/python/txtai/pipeline/llm/llm.py)\n\n## Dependencies\n\nPipelines are organized into optional dependency groups:\n\n```python\nextras[\"pipeline-audio\"]   # Audio processing\nextras[\"pipeline-data\"]     # Document parsing\nextras[\"pipeline-image\"]    # Image processing\nextras[\"pipeline-llm\"]      # LLM inference\nextras[\"pipeline-text\"]     # NLP tasks\nextras[\"pipeline-train\"]    # Model training\n```\n\nFull pipeline installation:\n```bash\npip install txtai[pipeline]\n```\n\n资料来源：[setup.py](setup.py)\n\n## Integration with Workflows\n\nPipelines integrate seamlessly with the workflow system:\n\n```python\nfrom txtai.workflow import Workflow, Task\nfrom txtai.pipeline import Summary, Textractor\n\n# Create a pipeline-based workflow\nworkflow = Workflow([\n    Task(Textractor(paragraphs=True)),\n    Task(Summary())\n])\n\n# Process documents\nresults = workflow([\"path/to/document.pdf\"])\n```\n\nThe TemplateTask enables prompt template processing within workflows, supporting string formatting with named or positional parameters. 
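\n\nFor example, a TemplateTask can prepare prompts that an LLM task then executes. A minimal sketch, reusing the quickstart model (the prompt wording is illustrative):\n\n```python\nfrom txtai import LLM, Workflow\nfrom txtai.workflow.task import TemplateTask\n\nllm = LLM(\"Qwen/Qwen3-0.6B\")\n\n# The template formats each input into a prompt, then the LLM action generates the response\nworkflow = Workflow([\n    TemplateTask(\n        template=\"Summarize the following text in 40 words or less.\\n\\n{text}\",\n        action=llm,\n    )\n])\n\nprint(list(workflow([\"txtai is an all-in-one AI framework for semantic search, LLM orchestration and language model workflows.\"])))\n```\n\n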
资料来源：[src/python/txtai/workflow/task/template.py](src/python/txtai/workflow/task/template.py)\n\n## See Also\n\n- [Workflows](workflow.md) - Chaining pipelines together\n- [Embeddings](embeddings.md) - Vector indexing and search\n- [Agents](agent.md) - LLM-powered autonomous agents\n\n---\n\n<a id='workflows'></a>\n\n## Workflows\n\n### 相关页面\n\n相关主题：[Pipelines](#pipelines)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python/txtai/workflow/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/__init__.py)\n- [src/python/txtai/workflow/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/base.py)\n- [src/python/txtai/workflow/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/factory.py)\n- [src/python/txtai/workflow/task/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/__init__.py)\n- [src/python/txtai/workflow/task/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/base.py)\n- [src/python/txtai/workflow/task/template.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/template.py)\n- [examples/workflow_quickstart.py](https://github.com/neuml/txtai/blob/main/examples/workflow_quickstart.py)\n</details>\n\n# Workflows\n\n## Overview\n\nWorkflows in txtai provide a declarative, pipeline-based approach to processing data through a series of connected tasks. They enable developers to chain together multiple processing components—including text extraction, summarization, translation, and large language model (LLM) calls—into cohesive data processing pipelines.\n\nThe workflow system supports:\n\n- **Deterministic pipelines**: Define processing steps that execute in sequence\n- **LLM-powered workflows**: Integrate language model prompts for intelligent processing\n- **Parallel execution**: Configure thread or process-based concurrency\n- **Flexible data transformation**: Handle one-to-many and many-to-one data transformations\n- **Template-based prompts**: Generate prompts dynamically using template formatting\n\n资料来源：[examples/workflow_quickstart.py:1-25]()\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Input Data] --> B[Workflow]\n    B --> C[Task 1: Textractor]\n    C --> D[Task 2: Summary]\n    D --> E[Task N: Translation/LLM]\n    E --> F[Output Results]\n    \n    G[Task Configuration] --> B\n    H[Initialize Action] --> C\n    I[Finalize Action] --> E\n```\n\n### Core Components\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| `Workflow` | `workflow/__init__.py` | Main workflow orchestration class |\n| `WorkflowBase` | `workflow/base.py` | Base class with core workflow logic |\n| `Task` | `workflow/task/base.py` | Base class for all workflow tasks |\n| `TemplateTask` | `workflow/task/template.py` | Task with template prompt support |\n| `WorkflowFactory` | `workflow/factory.py` | Factory for workflow instantiation |\n\n资料来源：[src/python/txtai/workflow/__init__.py:1-50]()\n\n## Workflow Configuration\n\n### Python API\n\n```python\nfrom txtai import Workflow\nfrom txtai.workflow import Task\n\nworkflow = Workflow([\n    Task(textractor),\n    Task(summary),\n    Task(translate)\n])\n\n# Execute workflow\nresults = list(workflow(input_data))\n```\n\n### YAML Configuration\n\n```yaml\nworkflow:\n  tasks:\n    - action: textractor\n      task: textractor\n    - action: summary\n      task: summary\n```\n\n资料来源：[src/python/txtai/workflow/factory.py:1-30]()\n\n## Task System\n\n### Task 
Base Class\n\nThe `Task` class is the fundamental building block of workflows. Each task defines:\n\n- **Action**: Callable function or list of functions to execute\n- **Select**: Filter functions to select specific data\n- **Concurrency**: Thread or process-based execution\n- **Data handling**: Unpacking, column selection, and merging strategies\n\n```mermaid\ngraph LR\n    A[Input Element] --> B{Selection Filter}\n    B -->|Pass| C[Action Execution]\n    B -->|Fail| D[Skip Element]\n    C --> E[One-to-Many Transform]\n    E --> F[Output Elements]\n```\n\n#### Task Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `action` | callable/list | `None` | Action(s) to execute on each data element |\n| `select` | callable/list | `None` | Filter(s) to select data to process |\n| `unpack` | bool | `True` | Unwrap data from (id, data, tag) tuples |\n| `column` | int | `None` | Column index to select from tuple elements |\n| `merge` | str | `\"hstack\"` | Merge mode for multi-action outputs |\n| `initialize` | callable | `None` | Action executed before processing |\n| `finalize` | callable | `None` | Action executed after processing |\n| `concurrency` | str | `None` | `\"thread\"` or `\"process\"` for parallel execution |\n| `onetomany` | bool | `True` | Enable one-to-many data transformations |\n\n资料来源：[src/python/txtai/workflow/task/base.py:15-45]()\n\n### TemplateTask\n\n`TemplateTask` extends the base `Task` class with template processing capabilities for generating LLM prompts.\n\n```python\nfrom txtai.workflow.task import TemplateTask\n\ndef summary_template(text):\n    return f\"\"\"Summarize the following text in 40 words or less.\n    \n{text}\n\"\"\"\n\ndef translate_template(text, language):\n    return f\"\"\"Translate the following text to {language}.\n    \n{text}\n\"\"\"\n\nworkflow = Workflow([\n    Task(textractor),\n    Task(lambda inputs: llm([summary_template(x) for x in inputs])),\n    Task(lambda inputs: llm([translate_template(x, \"fr\") for x in inputs]))\n])\n```\n\n#### Template Processing\n\n| Element Type | Template Behavior |\n|--------------|-------------------|\n| `dict` | Pass as named template parameters |\n| `tuple` | Pass as `arg0`...`argN` template parameters |\n| `str` | Pass as `{text}` parameter |\n\n资料来源：[src/python/txtai/workflow/task/template.py:1-40]()\n\n## Workflow Execution\n\n### Basic Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant User\n    participant Workflow\n    participant Task1\n    participant TaskN\n    \n    User->>Workflow: workflow(data)\n    Workflow->>Task1: process(input)\n    Task1->>Task1: select()\n    Task1->>Task1: action()\n    Task1->>Task1: onetomany transform\n    Task1-->>TaskN: intermediate results\n    TaskN->>TaskN: select()\n    TaskN->>TaskN: action()\n    TaskN-->>User: final results\n```\n\n### Concurrency Modes\n\n| Mode | Use Case | Configuration |\n|------|----------|---------------|\n| `thread` | I/O-bound tasks | `concurrency=\"thread\"` |\n| `process` | CPU-bound tasks | `concurrency=\"process\"` |\n| `None` | Sequential execution | Default |\n\n资料来源：[src/python/txtai/workflow/base.py:1-60]()\n\n## Pipeline Components\n\n### Available Pipeline Tasks\n\n| Pipeline | Purpose | Example Usage |\n|----------|---------|---------------|\n| `Textractor` | Extract text from URLs/files | `Task(textractor)` |\n| `Summary` | Generate text summaries | `Task(summary)` |\n| `Translation` | Translate between languages | 
`Task(translate)` |\n| `LLM` | Execute LLM prompts | `Task(llm)` |\n| `Segmentation` | Split text into segments | `Task(segmentation)` |\n| `Transcription` | Audio to text | `Task(transcription)` |\n\n### Quick Start Example\n\n```python\nfrom txtai import LLM, Workflow\nfrom txtai.pipeline import Summary, Textractor, Translation\nfrom txtai.workflow import Task\n\n# Step 1: Define available pipelines\ntextractor = Textractor(backend=\"docling\", headers={\"user-agent\": \"Mozilla/5.0\"})\nsummary = Summary()\ntranslate = Translation()\n\n# Step 2: Define workflow tasks\nworkflow = Workflow([\n    Task(textractor),           # Extract text from URL\n    Task(summary),              # Generate summary\n    Task(lambda inputs: [translate(x, \"fr\") for x in inputs])  # Translate to French\n])\n\n# Step 3: Run the workflow\nresults = list(workflow([\"https://neuml.com\"]))\n```\n\n资料来源：[examples/workflow_quickstart.py:1-30]()\n\n### LLM-Powered Workflow Example\n\n```python\nfrom txtai import LLM, Workflow\nfrom txtai.pipeline import Textractor\nfrom txtai.workflow import Task\n\ntextractor = Textractor(backend=\"docling\")\nllm = LLM(\"Qwen/Qwen3-4B-Instruct-2507\")\n\ndef summary(text):\n    return f\"\"\"Summarize the following text in 40 words or less.\n{text}\n\"\"\"\n\ndef translate(text, language):\n    return f\"\"\"Translate the following text to {language}.\n{text}\n\"\"\"\n\nworkflow = Workflow([\n    Task(textractor),\n    Task(lambda inputs: llm([summary(x) for x in inputs], maxlength=25000)),\n    Task(lambda inputs: llm([translate(x, \"fr\") for x in inputs], maxlength=25000))\n])\n\nresults = list(workflow([\"https://neuml.com\"]))\n```\n\n资料来源：[examples/workflow_quickstart.py:35-60]()\n\n## Data Flow\n\n```mermaid\ngraph LR\n    A[(Input Data)] --> B[Workflow]\n    B --> C[Task 1: Textractor]\n    C --> D[Intermediate Data]\n    D --> E[Task 2: Summary]\n    E --> F[Intermediate Data]\n    F --> G[Task N: Custom Action]\n    G --> H[(Output Results)]\n    \n    I[URL Input] -->|UrlTask| C\n    J[File Input] -->|FileTask| C\n    K[Stream Input] -->|StreamTask| C\n```\n\n### Input/Output Handling\n\n- **URL input**: Processed via `UrlTask` wrapper\n- **File input**: Processed via `FileTask` wrapper\n- **Stream input**: Processed via `StreamTask` wrapper\n- **Tuple unpacking**: Supports `(id, data, tag)` format for labeled data\n\n资料来源：[src/python/txtai/workflow/base.py:60-100]()\n\n## Advanced Configuration\n\n### Custom Task Actions\n\n```python\nfrom txtai.workflow import Task\n\ndef custom_filter(element):\n    \"\"\"Filter elements before processing\"\"\"\n    return len(element) > 10\n\ndef custom_action(element):\n    \"\"\"Process single element\"\"\"\n    return element.upper()\n\ndef custom_finalize(results):\n    \"\"\"Aggregate results after processing\"\"\"\n    return sorted(results, key=lambda x: x[1], reverse=True)\n\nworkflow = Workflow([\n    Task(\n        action=custom_action,\n        select=custom_filter,\n        finalize=custom_finalize,\n        concurrency=\"thread\"\n    )\n])\n```\n\n### Workflow Factory\n\n```python\nfrom txtai.workflow import WorkflowFactory\n\n# Create workflow from configuration\nworkflow = WorkflowFactory.create(config)\n\n# Run with data\nresults = list(workflow(data))\n```\n\n资料来源：[src/python/txtai/workflow/factory.py:1-50]()\n\n## Best Practices\n\n1. **Use appropriate concurrency**: Set `concurrency=\"thread\"` for I/O-bound tasks (API calls, file operations) and `concurrency=\"process\"` for CPU-intensive tasks.\n\n2. 
**Leverage one-to-many transforms**: Tasks can produce multiple outputs from a single input; enable this with `onetomany=True`.\n\n3. **Template formatting**: Use `TemplateTask` for consistent prompt formatting across LLM calls.\n\n4. **Error handling**: Implement `select` filters to skip invalid data before it reaches the action.\n\n5. **Pipeline selection**: Only install the pipeline extras you need (e.g., `pip install txtai[pipeline-data]` for text extraction workflows).\n\n资料来源：[examples/workflow_quickstart.py:1-25]()\n\n## See Also\n\n- [Embeddings](embeddings.md) - Vector search and similarity indexing\n- [Pipelines](pipelines.md) - Language model pipelines\n- [LLM Configuration](llm.md) - Large language model setup\n- [API Documentation](api.md) - REST API reference\n\n---\n\n<a id='agents'></a>\n\n## Agents\n\n### 相关页面\n\n相关主题：[Pipelines](#pipelines), [Workflows](#workflows)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python/txtai/agent/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/__init__.py)\n- [src/python/txtai/agent/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/base.py)\n- [src/python/txtai/agent/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/factory.py)\n- [src/python/txtai/agent/tool/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/__init__.py)\n- [src/python/txtai/agent/tool/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/factory.py)\n- [src/python/txtai/agent/tool/function.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/agent/tool/function.py)\n- [docs/agent/index.md](https://github.com/neuml/txtai/blob/main/docs/agent/index.md)\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py) (agent dependencies)\n</details>\n\n# Agents\n\nAgents in txtai enable autonomous task execution by combining Large Language Models (LLMs) with a flexible tool system. 
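\n\nAs a quick orientation, the sketch below wires a built-in tool alias and a plain Python function into an agent using the dictionary tool format and configuration options described later on this page. It is a minimal sketch, not verified output: the `population` function, the model path, and the exact keyword names (`tools`, `llm`) are illustrative and should be checked against the installed txtai version.\n\n```python\nfrom txtai import Agent\n\ndef population(country):\n    \"\"\"Stand-in lookup for a real data source.\"\"\"\n    return {\"France\": 68_000_000}.get(country, \"unknown\")\n\nagent = Agent(\n    tools=[\n        \"webview\",  # built-in tool alias (see Default Tools below)\n        {\n            \"name\": \"population\",\n            \"description\": \"Returns the population for a country\",\n            \"inputs\": {\"country\": {\"type\": \"string\", \"description\": \"Country name\"}},\n            \"target\": population,\n        },\n    ],\n    llm=\"Qwen/Qwen3-4B-Instruct-2507\",  # illustrative model path\n)\n\nprint(agent(\"What is the population of France?\"))\n```\n\n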
The agent framework allows building AI-powered workflows that can reason, plan, and execute actions through configurable tools.\n\n## Overview\n\ntxtai agents are built on top of [smolagents](https://github.com/huggingface/smolagents) and provide:\n\n- **Tool-based execution**: Agents use tools to interact with external systems and data sources\n- **LLM orchestration**: Seamless integration with various LLM backends including local models, APIs, and managed services\n- **MCP support**: Integration with Model Context Protocol for extended tool collections\n- **Configurable behavior**: Flexible configuration for tool selection, prompt templates, and execution parameters\n\nThe agent architecture separates concerns between tool definition, tool management, and agent execution, enabling modular and extensible agent systems.\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[User Input] --> B[Agent Core]\n    B --> C[LLM Engine]\n    C --> D[Tool Selection]\n    D --> E[Tool Execution]\n    E --> F[Results]\n    F --> C\n    C --> G[Final Response]\n    \n    H[ToolFactory] --> I[FunctionTool]\n    H --> J[EmbeddingsTool]\n    H --> K[MCPTools]\n    H --> L[DefaultTools]\n    \n    subgraph Tool System\n        I\n        J\n        K\n        L\n    end\n```\n\n### Core Components\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| Agent Base | `agent/base.py` | Core agent logic and execution loop |\n| Agent Factory | `agent/factory.py` | Factory for creating configured agents |\n| Tool Factory | `agent/tool/factory.py` | Creates and manages tool instances |\n| Function Tool | `agent/tool/function.py` | Wraps Python functions as LLM tools |\n| Tool Init | `agent/tool/__init__.py` | Tool abstractions and shared interfaces |\n\n## Tool System\n\n### Tool Types\n\nThe tool system supports multiple tool creation patterns:\n\n| Type | Description | Source |\n|------|-------------|--------|\n| `FunctionTool` | Wraps a Python function with config | `agent/tool/function.py:1` |\n| `EmbeddingsTool` | Embeddings-based search and retrieval | `agent/tool/factory.py` |\n| `MCP Tools` | External MCP tool collections | `agent/tool/factory.py` |\n| Default Tools | Built-in system tools | `agent/tool/factory.py` |\n\n### FunctionTool\n\nThe `FunctionTool` class wraps descriptive configuration and injects it along with a target function into an LLM prompt.\n\n```python\nfrom smolagents import Tool\n\nclass FunctionTool(Tool):\n    \"\"\"\n    A FunctionTool takes descriptive configuration and injects it along with a target function into an LLM prompt.\n    \"\"\"\n```\n\n**Configuration Schema:**\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `name` | string | Yes | Tool identifier |\n| `description` | string | Yes | Human-readable tool description for LLM |\n| `inputs` | dict | Yes | Input parameter definitions |\n| `output` | string | No | Output type (default: `\"any\"`) |\n| `output_type` | string | No | Alternative output type field |\n| `target` | callable | Yes | Function to execute |\n\n### Tool Factory\n\nThe `ToolFactory` class handles tool creation from various configuration formats:\n\n```python\n@staticmethod\ndef create(config):\n    \"\"\"\n    Creates a new list of tools. This method iterates over the `tools` configuration option and creates a Tool instance\n    for each entry. 
This supports the following:\n\n      - Tool instance\n      - Dictionary with `name`, `description`, `inputs`, `output` and `target` function configuration\n      - String with a tool alias name\n\n    Returns:\n        list of tools\n    \"\"\"\n```\n\n**Supported Tool Input Formats:**\n\n1. **Tool Instance**: Direct `smolagents.Tool` objects\n2. **Dictionary**: Config with `name`, `description`, `inputs`, `output`, and `target`\n3. **String Alias**: Named shortcuts (e.g., `\"webview\"`, `\"defaults\"`)\n4. **MCP URL**: HTTP URLs for MCP tool collections\n\n### Default Tools\n\n| Tool Name | Description |\n|-----------|-------------|\n| `webview` | Web content reading and extraction |\n| `read` | File and content reading capabilities |\n| `defaults` | Loads all default tools |\n\n## Agent Configuration\n\n### Dependencies\n\nThe agent module requires the following dependencies (specified in `setup.py:17`):\n\n```python\nextras[\"agent\"] = [\n    \"jinja2>=3.1.6\",\n    \"mcpadapt>=0.1.0\",\n    \"smolagents>=1.23\",\n]\n```\n\n| Dependency | Version | Purpose |\n|------------|---------|---------|\n| `jinja2` | >=3.1.6 | Template processing for prompts |\n| `mcpadapt` | >=0.1.0 | MCP tool protocol adapter |\n| `smolagents` | >=1.23 | Core agent framework |\n\n### Configuration Options\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `llm` | dict | LLM configuration with model path and parameters |\n| `tools` | list | List of tool configurations or aliases |\n| `template` | string | Prompt template for agent instructions |\n| `max iterations` | int | Maximum agent execution iterations |\n| `tool` | string | Primary execution tool/function |\n\n## Agent Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant U as User\n    participant A as Agent\n    participant L as LLM\n    participant T as Tool Manager\n    \n    U->>A: Execute Task\n    A->>L: Generate Tool Call\n    L-->>A: Tool Selection\n    A->>T: Execute Tool\n    T-->>A: Tool Result\n    A->>L: Process Result\n    L-->>A: Response\n    A-->>U: Final Answer\n```\n\n## Integration with LLM Pipelines\n\nAgents integrate with txtai's LLM pipeline system:\n\n```python\n# From llm.py - Agent uses LLM generation\ndef generator(self, text, maxlength, stream, stop, defaultrole, stripthink, **kwargs):\n    \"\"\"\n    Runs LLM generation for agent reasoning and tool calls.\n    \"\"\"\n    return self.generator(text, maxlength, stream, stop, defaultrole, stripthink, **kwargs)\n```\n\n### LLM Integration Methods\n\n| Method | Purpose |\n|--------|---------|\n| `ischat()` | Check if the LLM supports chat mode |\n| `isvision()` | Check if the LLM supports vision/multimodal |\n| `generator()` | Execute LLM generation |\n\n## Tool Execution\n\nThe tool execution is handled by the `forward` method in `FunctionTool`:\n\n```python\ndef forward(self, *args, **kwargs):\n    \"\"\"\n    Runs target function.\n\n    Args:\n        args: positional args\n        kwargs: keyword args\n\n    Returns:\n        result\n    \"\"\"\n    return self.target(*args, **kwargs)\n```\n\n### Execution Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `args` | tuple | Positional arguments for tool function |\n| `kwargs` | dict | Keyword arguments for tool function |\n\n## Best Practices\n\n1. **Tool Descriptions**: Provide clear, detailed descriptions for each tool to help the LLM understand when and how to use it\n2. 
**Input Schemas**: Define complete input schemas with types and descriptions\n3. **Error Handling**: Tools should handle errors gracefully and return meaningful error messages\n4. **Configuration Validation**: Use the factory pattern to validate tool configurations before execution\n\n## See Also\n\n- [Pipeline LLM](../pipeline/llm/) - LLM configuration and execution\n- [Workflows](../workflow/) - Task orchestration\n- [Embeddings](../embeddings/) - Vector search capabilities\n\n---\n\n<a id='database-integration'></a>\n\n## Database Integration\n\n### 相关页面\n\n相关主题：[Embeddings Database](#embeddings-core), [Scoring and Retrieval Algorithms](#scoring-algorithms)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n- [src/python/txtai/pipeline/data/textractor.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/data/textractor.py)\n- [src/python/txtai/embeddings/index/documents.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/index/documents.py)\n- [src/python/txtai/cloud/hub.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/cloud/hub.py)\n- [examples/workflows.py](https://github.com/neuml/txtai/blob/main/examples/workflows.py)\n- [src/python/txtai/workflow/task/template.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/workflow/task/template.py)\n</details>\n\n# Database Integration\n\n## Overview\n\ntxtai provides comprehensive database integration capabilities that enable vector search combined with structured data storage. The database layer is a core component of the embeddings database architecture, which unifies vector indexes (sparse and dense), graph networks, and relational databases into a single powerful system.\n\n资料来源：[setup.py:42-44]()\n\nThe database integration serves as the foundation for:\n\n- Persistent storage of indexed documents and embeddings\n- SQL-based querying alongside vector similarity search\n- Integration with external data sources and cloud storage\n- Workflow task management and state persistence\n\n## Architecture\n\nThe database layer in txtai follows a modular architecture with several key components:\n\n```mermaid\ngraph TD\n    A[Embeddings Database] --> B[Database Layer]\n    B --> C[SQLAlchemy Backend]\n    B --> D[DuckDB Backend]\n    C --> E[Schema Models]\n    D --> E\n    B --> F[Database Factory]\n    F --> G[Database Client]\n    G --> H[Query Engine]\n    H --> I[Document Storage]\n    H --> J[Vector Indexes]\n```\n\n### Key Components\n\n| Component | Purpose | Source |\n|-----------|---------|--------|\n| Database Base | Abstract base class defining database operations | [src/python/txtai/database/base.py]() |\n| Database Client | Client interface for executing queries | [src/python/txtai/database/client.py]() |\n| Database Factory | Factory pattern for creating database instances | [src/python/txtai/database/factory.py]() |\n| Schema Models | SQLAlchemy ORM models for data structures | [src/python/txtai/database/schema/__init__.py]() |\n\n## Supported Databases\n\n### SQLite/PostgreSQL with SQLAlchemy\n\ntxtai supports traditional relational databases through SQLAlchemy, providing cross-database compatibility. 
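\n\nAs a minimal sketch of the default setup, the example below enables content storage (SQLite unless another backend is configured) and runs a search; the model path and sample records are illustrative:\n\n```python\nfrom txtai import Embeddings\n\n# content=True stores document text and metadata in the document database (SQLite by default)\nembeddings = Embeddings(path=\"sentence-transformers/all-MiniLM-L6-v2\", content=True)\n\nembeddings.index([\n    (0, \"US tops 5 million confirmed virus cases\", None),\n    (1, \"Canada's last fully intact ice shelf has suddenly collapsed\", None),\n])\n\n# With content enabled, results include the stored text alongside the score\nprint(embeddings.search(\"climate change\", 1))\n```\n\nThe same configuration applies when the document database points at any SQLAlchemy-supported backend.\n\n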
This enables seamless integration with:\n\n- **SQLite**: Lightweight, file-based database for local development\n- **PostgreSQL**: Production-grade relational database with advanced features\n- **pgvector**: PostgreSQL extension for vector similarity search\n\n资料来源：[setup.py:42-44]()\n\n```python\n# Example configuration\nembeddings:\n    path: sentence-transformers/all-MiniLM-L6-v2\n    database:\n        url: postgresql+psycopg2://user:pass@localhost:5432/txtai\n```\n\n### DuckDB\n\nDuckDB is an embedded analytical database that provides high-performance OLAP capabilities. txtai includes DuckDB as a database extra for analytical workloads.\n\n资料来源：[setup.py:42]()\n\n```python\nextras[\"database\"] = [\"duckdb>=0.8.0\", \"pillow>=7.1.2\", \"sqlalchemy>=2.0.20\"]\n```\n\n## Database Configuration\n\n### Configuration Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `url` | string | `sqlite:///txtai.db` | Database connection URL |\n| `path` | string | None | Path for file-based databases |\n| `indexed` | bool | True | Enable full-text indexing |\n| `content` | bool | True | Store original document content |\n| `objects` | string | `storage` | Object storage backend |\n| `embeddings` | string | `embeddings` | Embeddings storage directory |\n\n### Connection String Formats\n\n```\n# SQLite (default)\nsqlite:///txtai.db\n\n# PostgreSQL with pgvector\npostgresql://user:pass@localhost:5432/txtai\n\n# DuckDB\nduckdb:///path/to/database.duckdb\n\n# MySQL\nmysql+pymysql://user:pass@localhost:3306/txtai\n```\n\n## Document Storage System\n\nThe document storage system provides efficient handling of indexed content with support for batch operations and streaming.\n\n### Core Operations\n\n```python\nclass DocumentStorage:\n    def add(self, documents):\n        \"\"\"Add documents to storage\"\"\"\n        \n    def close(self):\n        \"\"\"Close and persist storage\"\"\"\n        \n    def delete(self, ids):\n        \"\"\"Remove documents by ID\"\"\"\n        \n    def get(self, ids):\n        \"\"\"Retrieve documents by ID\"\"\"\n```\n\n资料来源：[src/python/txtai/embeddings/index/documents.py:1-50]()\n\n### Batch Processing\n\nDocuments are processed in batches for optimal memory usage and performance:\n\n```python\n# Batch-based document indexing\nself.serializer.savestream(documents, self.documents)\nself.batch += 1\nself.size += len(documents)\n```\n\n## Integration with Pipeline Data Processing\n\nThe Textractor module demonstrates database integration with external data sources, supporting extraction from various file formats and URLs.\n\n资料来源：[src/python/txtai/pipeline/data/textractor.py:1-60]()\n\n### Supported Input Methods\n\n| Method | Description |\n|--------|-------------|\n| URL Fetching | Extract text from web pages via HTTP |\n| File Processing | Handle local and remote file paths |\n| Cloud Storage | Integration with cloud object storage |\n| Database Queries | Direct content retrieval from databases |\n\n### Safe Open Mode\n\nFor security-sensitive environments, Textractor supports safe open mode that restricts access to local temporary directories and non-private URLs only:\n\n```python\nself.safeopen = os.path.realpath(\n    tempfile.gettempdir() if isinstance(safeopen, bool) else safeopen\n) if safeopen else safeopen\n```\n\n## Cloud Storage Integration\n\ntxtai extends database capabilities with cloud storage integration for managing large embeddings and document collections.\n\n### HuggingFace Hub Integration\n\nThe 
cloud hub module provides seamless upload and download of index files to HuggingFace repositories:\n\n资料来源：[src/python/txtai/cloud/hub.py:1-60]()\n\n```python\n# Automatic LFS tracking for large files\nif \"embeddings \" not in content:\n    content += \"documents filter=lfs diff=lfs merge=lfs -text\\n\"\n    content += \"embeddings filter=lfs diff=lfs merge=lfs -text\\n\"\n```\n\n### Supported Cloud Providers\n\n| Provider | Package | Features |\n|----------|---------|----------|\n| HuggingFace Hub | `huggingface-hub` | Model hosting, datasets |\n| S3 Compatible | `apache-libcloud` | Object storage |\n| Google Cloud | `apache-libcloud` | Cloud storage |\n| Azure | `apache-libcloud` | Blob storage |\n\n## Workflow Database Tasks\n\nWorkflows can include database operations as part of complex data processing pipelines.\n\n### Task Configuration\n\n```python\nelif wtype == \"tabular\":\n    data[wtype] = component\n    tasks.append({\"action\": wtype})\n```\n\n资料来源：[examples/workflows.py:1-100]()\n\n### Workflow Task Types\n\n| Task Type | Description | Data Format |\n|-----------|-------------|-------------|\n| `tabular` | Database table operations | CSV, DataFrame |\n| `service` | Database connection pooling | Configuration |\n| `extract` | Data extraction to database | Various sources |\n\n## Query Patterns\n\n### Vector + SQL Hybrid Queries\n\ntxtai supports combining vector similarity search with traditional SQL filtering:\n\n```python\n# Semantic search with SQL filter\nresults = embeddings.search(\n    \"Find documents about machine learning\",\n    filter=\"category = 'technical' AND year > 2020\"\n)\n```\n\n### Batch Query Optimization\n\nFor high-throughput applications:\n\n```python\n# Batch query for multiple inputs\nqueries = [\"query1\", \"query2\", \"query3\"]\nresults = embeddings.batchsearch(queries, limit=10)\n```\n\n## Performance Considerations\n\n### Database Selection Guide\n\n| Use Case | Recommended Database |\n|----------|----------------------|\n| Local development | SQLite |\n| Production workloads | PostgreSQL + pgvector |\n| Analytical queries | DuckDB |\n| Large-scale embeddings | S3 + PostgreSQL hybrid |\n\n### Optimization Tips\n\n1. **Index Management**: Regular index rebuilding for maintaining query performance\n2. **Batch Operations**: Use batch operations for bulk inserts and updates\n3. **Connection Pooling**: Configure appropriate pool sizes for concurrent access\n4. 
**Object Storage**: Offload large embeddings to object storage while keeping metadata in database\n\n## Installation\n\nInstall database dependencies using extras:\n\n```bash\n# Basic database support\npip install txtai[database]\n\n# With all database backends\npip install txtai[all]\n\n# Individual backends\npip install txtai[duckdb]      # DuckDB analytics\npip install txtai[ann]         # Vector indexes with pgvector\n```\n\n## See Also\n\n- [Embeddings Configuration](embeddings/configuration/database.md) - Detailed configuration guide\n- [Vector Search](../vectors/overview.md) - Combining database with vector search\n- [Workflow Tasks](../workflow/overview.md) - Database tasks in workflows\n- [Cloud Storage](../cloud/hub.md) - HuggingFace Hub integration\n\n---\n\n<a id='graph-networks'></a>\n\n## Graph Networks\n\n### 相关页面\n\n相关主题：[Embeddings Database](#embeddings-core), [Database Integration](#database-integration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python/txtai/graph/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/graph/__init__.py)\n- [src/python/txtai/graph/base.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/graph/base.py)\n- [src/python/txtai/graph/factory.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/graph/factory.py)\n- [src/python/txtai/graph/networkx.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/graph/networkx.py)\n- [docs/embeddings/configuration/graph.md](https://github.com/neuml/txtai/blob/main/docs/embeddings/configuration/graph.md)\n</details>\n\n# Graph Networks\n\nGraph Networks in txtai provide a powerful mechanism for building and traversing graph-based knowledge structures. This module enables semantic relationship mapping, topic modeling, and community detection capabilities within the broader txtai ecosystem.\n\n## Overview\n\nGraph Networks are a core component of txtai's architecture that combines vector search capabilities with graph-based relationship analysis. The graph module creates network representations that can be queried, traversed, and analyzed for patterns and communities.\n\n```mermaid\ngraph TB\n    subgraph \"txtai Graph Module\"\n        GB[Graph Base Class]\n        GX[NetworkX Backend]\n        GF[Graph Factory]\n        GT[Topics Module]\n    end\n    \n    GB --> GX\n    GB --> GT\n    GF --> GB\n    \n    GX --> CD[Community Detection]\n    GX --> NT[Node Traversal]\n    GX --> RL[Relationship Links]\n```\n\n资料来源：[src/python/txtai/graph/base.py:1-20]()\n\n## Architecture\n\n### Base Graph Class\n\nThe `Graph` class serves as the foundational abstraction for all graph implementations. It provides a consistent interface for graph operations regardless of the underlying backend.\n\n```python\nclass Graph:\n    \"\"\"\n    Base class for Graph instances. This class builds graph networks. 
Supports topic modeling\n    and relationship traversal.\n    \"\"\"\n```\n\n**Key Attributes:**\n\n| Attribute | Type | Description |\n|-----------|------|-------------|\n| `config` | dict | Graph configuration settings |\n| `backend` | object | Graph backend implementation |\n| `categories` | list | Topic modeling categories |\n| `topics` | object | Topics instance for topic analysis |\n| `text` | str | Column name for text data (default: \"text\") |\n| `object` | str | Column name for object data (default: \"object\") |\n| `copyattributes` | bool | Flag to copy all attributes when True |\n| `relationships` | str | Column name for manually-provided edges |\n| `relations` | dict | Stored relationships dictionary |\n\n资料来源：[src/python/txtai/graph/base.py:26-43]()\n\n### NetworkX Backend\n\nThe NetworkX backend (`NetworkX` class) provides the actual graph implementation using the NetworkX library. This backend supports:\n\n- **Community Detection**: Uses Louvain algorithm for partition detection\n- **Graph Operations**: Node/edge addition, removal, and querying\n- **Serialization**: Pickle-based serialization for persistence\n- **Distance Computation**: Weight-based distance calculations between nodes\n\n资料来源：[src/python/txtai/graph/networkx.py:1-30]()\n\n## Core Methods\n\n### Graph Creation and Management\n\n```python\ndef create(self):\n    \"\"\"Creates the graph network.\"\"\"\n    raise NotImplementedError\n\ndef count(self):\n    \"\"\"Returns the total number of nodes in graph.\"\"\"\n    raise NotImplementedError\n```\n\n| Method | Description | Returns |\n|--------|-------------|---------|\n| `create()` | Initializes and returns a new graph network | Graph instance |\n| `count()` | Returns total number of nodes | int |\n| `scan(attribute, data)` | Iterates over nodes matching criteria | Node iterator or tuples |\n| `node(nodeid)` | Retrieves node attributes | dict or None |\n\n资料来源：[src/python/txtai/graph/base.py:45-65]()\n\n### Node Operations\n\n| Method | Description |\n|--------|-------------|\n| `addnode(node, **attrs)` | Adds a single node with attributes |\n| `addnodes(nodes)` | Adds multiple nodes from iterable |\n| `removenode(node)` | Removes a node from the graph |\n| `hasnode(node)` | Checks if node exists |\n\n资料来源：[src/python/txtai/graph/networkx.py:55-75]()\n\n## Community Detection\n\nThe NetworkX backend implements advanced community detection using the Louvain algorithm:\n\n```python\ndef communities(self, config):\n    \"\"\"\n    Runs community detection on graph.\n\n    Args:\n        config: topic configuration\n\n    Returns:\n        list of [ids] per community\n    \"\"\"\n    level = config.get(\"level\", \"best\")\n    results = list(louvain_partitions(self.backend, weight=\"weight\", resolution=config.get(\"resolution\", 100), seed=0))\n    return results[0] if level == \"first\" else results[-1]\n```\n\n**Configuration Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `level` | str | \"best\" | Partition level to use (\"first\" or \"best\") |\n| `resolution` | int | 100 | Louvain resolution parameter |\n\n资料来源：[src/python/txtai/graph/networkx.py:35-55]()\n\n## Distance Computation\n\nThe graph module calculates distances between nodes based on edge weights:\n\n```python\ndef distance(self, source, target, attrs):\n    \"\"\"\n    Computes distance between source and target nodes using weight.\n\n    Returns:\n        distance between source and target\n    \"\"\"\n    distance = max(1.0 - 
attrs[\"weight\"], 0.0)\n    return distance if distance >= 0.15 else 1.00\n```\n\n**Distance Algorithm:**\n- Distance = `1 - weight` (inverted weight)\n- Minimum threshold: 0.15 (edges below this are considered near-duplicates)\n- Maximum distance cap: 1.00\n\n资料来源：[src/python/txtai/graph/networkx.py:60-75]()\n\n## Serialization\n\n### TAR File Loading (Legacy Format)\n\nThe NetworkX backend supports loading graphs from legacy TAR archives:\n\n```python\ndef loadtar(self, path):\n    \"\"\"\n    Loads a graph from the legacy TAR file.\n\n    Args:\n        path: path to graph\n    \"\"\"\n    serializer = SerializeFactory.create(\"pickle\")\n    \n    with TemporaryDirectory() as directory:\n        archive = ArchiveFactory.create(directory)\n        archive.load(path, \"tar\")\n        self.backend = serializer.load(f\"{directory}/graph\")\n```\n\n**Serialization Features:**\n- Pickle-based serialization for backwards compatibility\n- TAR archive extraction to temporary directory\n- Optional category loading from separate file\n\n资料来源：[src/python/txtai/graph/networkx.py:80-100]()\n\n## Configuration\n\nGraph networks are configured through a dictionary passed to the constructor:\n\n```yaml\ngraph:\n    columns:\n        text: \"content\"        # Column containing text data\n        object: \"data\"        # Column containing object data\n        relationships: \"edges\" # Column for manual edge definitions\n    copyattributes: false      # Copy all attributes when True\n```\n\n**Column Configuration Options:**\n\n| Key | Default | Description |\n|-----|---------|-------------|\n| `text` | \"text\" | Name of the text column |\n| `object` | \"object\" | Name of the object column |\n| `relationships` | \"relationships\" | Name of the relationships column |\n\n资料来源：[docs/embeddings/configuration/graph.md]() and [src/python/txtai/graph/base.py:38-40]()\n\n## Topic Modeling Integration\n\nThe Graph module integrates with txtai's topic modeling capabilities:\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Graph Indexing]\n    B --> C[Topic Modeling]\n    C --> D[Community Detection]\n    D --> E[Category Assignment]\n    \n    C --> F[Topics Module]\n    F --> G[Topic Analysis]\n```\n\n**Topics Integration:**\n- `self.categories`: Stores discovered topic categories\n- `self.topics`: Topics instance for analysis operations\n- Community detection feeds into topic assignment\n\n资料来源：[src/python/txtai/graph/base.py:30-32]()\n\n## Error Handling\n\nThe Graph module requires the NetworkX library for operation:\n\n```python\nif not NETWORKX:\n    raise ImportError('NetworkX is not available - install \"graph\" extra to enable')\n```\n\nTo enable Graph Networks functionality, install the graph extra:\n\n```bash\npip install txtai[graph]\n```\n\n资料来源：[src/python/txtai/graph/networkx.py:23-25]()\n\n## Usage Example\n\n```python\nfrom txtai import Graph\n\n# Create graph with default configuration\ngraph = Graph({})\n\n# Create the graph network\ngraph.create()\n\n# Add nodes with attributes\ngraph.addnode(1, text=\"example node\", weight=0.5)\ngraph.addnode(2, text=\"another node\", weight=0.7)\n\n# Query nodes\nnodes = list(graph.scan())\ncount = graph.count()\n\n# Get community detection results\nconfig = {\"level\": \"best\", \"resolution\": 100}\ncommunities = graph.communities(config)\n```\n\n## Related Components\n\n| Component | Description |\n|-----------|-------------|\n| `Embeddings` | Vector index that may utilize graphs for relationship mapping |\n| `Topics` | Topic modeling module 
integrated with graph analysis |\n| `Archive` | Serialization system for graph persistence |\n| `Workflow` | Orchestration that can incorporate graph operations |\n\n---\n\n<a id='scoring-algorithms'></a>\n\n## Scoring and Retrieval Algorithms\n\n### 相关页面\n\n相关主题：[Embeddings Database](#embeddings-core), [Database Integration](#database-integration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [setup.py](https://github.com/neuml/txtai/blob/main/setup.py)\n- [src/python/txtai/embeddings/index/documents.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/index/documents.py)\n- [src/python/txtai/api/cluster.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/api/cluster.py)\n- [src/python/txtai/pipeline/llm/llm.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/pipeline/llm/llm.py)\n- [src/python/txtai/embeddings/database/__init__.py](https://github.com/neuml/txtai/blob/main/src/python/txtai/embeddings/database/__init__.py)\n</details>\n\n# Scoring and Retrieval Algorithms\n\n## Overview\n\ntxtai provides a sophisticated scoring and retrieval system that powers its semantic search capabilities. At its core, the system combines vector-based similarity search with traditional BM25 sparse retrieval, enabling hybrid search approaches that leverage both semantic understanding and exact term matching.\n\nThe scoring module is an integral part of the embeddings database architecture, which unifies vector indexes, graph networks, and relational databases into a single powerful knowledge source for LLM applications.\n\n资料来源：[README.md]()\n\n## Architecture\n\nThe scoring and retrieval system follows a modular factory pattern that allows different scoring algorithms to be plugged in based on configuration. The architecture is designed to support both standalone scoring operations and integrated retrieval within the embeddings database.\n\n```mermaid\ngraph TD\n    A[Query Input] --> B[Query Processing]\n    B --> C{Scoring Algorithm}\n    C --> D[BM25 Scoring]\n    C --> E[Vector Similarity]\n    C --> F[Hybrid Scoring]\n    D --> G[Results Aggregation]\n    E --> G\n    F --> G\n    G --> H[Ranked Results]\n```\n\n资料来源：[setup.py](), [src/python/txtai/scoring/factory.py]()\n\n## Scoring Module Structure\n\nThe scoring subsystem is defined in the `txtai.scoring` package and includes base classes and concrete implementations for different retrieval algorithms.\n\n### Key Components\n\n| Component | Purpose | Location |\n|-----------|---------|----------|\n| `Base` | Abstract base class defining scoring interface | `scoring/base.py` |\n| `BM25` | Traditional sparse retrieval algorithm | `scoring/bm25.py` |\n| `Factory` | Factory for creating scoring instances | `scoring/factory.py` |\n| Package `__init__` | Module exports and configuration | `scoring/__init__.py` |\n\n资料来源：[setup.py:50-51]()\n\n## BM25 Retrieval Algorithm\n\nBM25 (Best Matching 25) is a probabilistic ranking function used for information retrieval. It extends the binary independence model to incorporate term frequency and document length normalization.\n\n### Configuration Options\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `k1` | float | 1.5 | Term frequency saturation parameter |\n| `b` | float | 0.75 | Length normalization factor |\n| `avgdl` | float | auto | Average document length |\n\n### How BM25 Works\n\n1. 
**Term Frequency (TF)**: Measures how often a term appears in a document, with saturation to prevent over-weighting\n2. **Inverse Document Frequency (IDF)**: Weights terms by their rarity across the corpus\n3. **Document Length Normalization**: Adjusts scores based on document length relative to average\n\n资料来源：[src/python/txtai/scoring/bm25.py](), [src/python/txtai/scoring/base.py]()\n\n## Integration with Embeddings Database\n\nThe scoring algorithms are tightly integrated with the embeddings database system, which handles document indexing, storage, and retrieval.\n\n```mermaid\ngraph LR\n    A[Documents] --> B[Indexing Pipeline]\n    B --> C[Vector Index]\n    B --> D[BM25 Index]\n    B --> E[Document Store]\n    F[Query] --> G[Hybrid Search]\n    C --> G\n    D --> G\n    E --> G\n    G --> H[Scored Results]\n```\n\n### Document Storage\n\nDocuments are managed through a streaming serializer that writes to temporary files, enabling efficient handling of large document collections:\n\n```python\n# Add batch of documents\nself.serializer.savestream(documents, self.documents)\nself.batch += 1\nself.size += len(documents)\n```\n\n资料来源：[src/python/txtai/embeddings/index/documents.py:18-23]()\n\n## Search API\n\nThe scoring system exposes search capabilities through both local and cluster APIs, supporting single queries, batch searches, and hybrid scoring with configurable weights.\n\n### Search Parameters\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `query` | str | Search query string |\n| `limit` | int | Maximum number of results (default: 10) |\n| `weights` | list | Hybrid score weights for combining methods |\n| `index` | str | Specific index to search |\n| `parameters` | dict | Named parameters for advanced queries |\n| `graph` | bool | Include graph network results |\n\n### Query Execution Flow\n\n```mermaid\nsequenceDiagram\n    participant Client\n    participant API\n    participant Scorer\n    participant Index\n    \n    Client->>API: search(query, limit)\n    API->>Scorer: execute(query)\n    Scorer->>Index: score(query)\n    Index-->>Scorer: raw scores\n    Scorer-->>API: scored results\n    API->>API: aggregate(query, results)\n    API->>API: sort and limit\n    API-->>Client: ranked results\n```\n\n资料来源：[src/python/txtai/api/cluster.py:85-105]()\n\n### Batch Search\n\nFor processing multiple queries efficiently, the API supports batch search operations:\n\n```python\ndef batchsearch(self, queries, limit=None, weights=None, index=None, parameters=None, graph=False):\n    \"\"\"\n    Finds documents most similar to the input queries.\n    \"\"\"\n```\n\n资料来源：[src/python/txtai/api/cluster.py:107-127]()\n\n## Hybrid Scoring\n\ntxtai supports hybrid scoring that combines multiple retrieval methods. 
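\n\nA minimal sketch, assuming the `hybrid` configuration flag and the `weights` search parameter described on this page; the weight ordering between sparse and dense scores should be verified against the installed version:\n\n```python\nfrom txtai import Embeddings\n\n# hybrid=True builds both a sparse (BM25-style) and a dense vector index\nembeddings = Embeddings(hybrid=True, content=True)\nembeddings.index([\"Correct\", \"Not what we hoped\"])\n\n# weights shifts the combined score between the two retrieval methods\nprint(embeddings.search(\"positive\", limit=1, weights=[0.7, 0.3]))\n```\n\nHybrid scoring merges the two result sets into a single ranking. 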
This is particularly valuable when you want to leverage both semantic similarity (vector-based) and exact term matching (BM25).\n\n### Weight Configuration\n\nThe `weights` parameter controls how different scoring methods are combined:\n\n| Weights | Effect |\n|---------|--------|\n| `[0.0, 1.0]` | Pure vector search |\n| `[1.0, 0.0]` | Pure BM25 search |\n| `[0.5, 0.5]` | Equal contribution |\n| `[0.7, 0.3]` | Vector-biased hybrid |\n\n### Result Aggregation\n\nResults from multiple scoring methods are combined using weighted aggregation and sorted by the combined score:\n\n```python\n# Combine aggregate functions and sort\nresults = self.aggregate(query, results)\n# Limit results\nreturn results[: (limit if limit else 10)]\n```\n\n资料来源：[src/python/txtai/api/cluster.py:97-99]()\n\n## Configuration Reference\n\n### Scoring Extra Dependencies\n\nTo use the scoring module, install the `scoring` extra:\n\n```bash\npip install txtai[scoring]\n```\n\nRequired dependencies from `setup.py`:\n\n| Package | Version | Purpose |\n|---------|---------|---------|\n| sqlalchemy | >=2.0.20 | Database operations |\n\n资料来源：[setup.py:50-51]()\n\n### Complete Installation Options\n\nFor full functionality including scoring:\n\n```bash\npip install txtai[all]\n```\n\nThis includes:\n- `ann` - Approximate nearest neighbor indexes\n- `vectors` - Vector similarity components\n- `scoring` - Retrieval algorithms\n- `api` - REST API service\n\n资料来源：[setup.py:66-81]()\n\n## Usage Examples\n\n### Basic Embeddings Search\n\n```python\nimport txtai\n\nembeddings = txtai.Embeddings()\nembeddings.index([\"Correct\", \"Not what we hoped\"])\nembeddings.search(\"positive\", 1)\n# Returns: [(0, 0.29862046241760254)]\n```\n\n### Configuration via YAML\n\n```yaml\n# app.yml\nembeddings:\n    path: sentence-transformers/all-MiniLM-L6-v2\n```\n\n资料来源：[README.md]()\n\n## Related Components\n\n| Component | Relationship |\n|-----------|--------------|\n| **Embeddings Database** | Primary consumer of scoring algorithms |\n| **Vector Indexes** | Works alongside scoring for hybrid search |\n| **API Service** | Exposes scoring via REST endpoints |\n| **Workflow Engine** | Uses scoring for document routing |\n\nThe scoring and retrieval algorithms form the computational backbone of txtai's search capabilities, enabling both high-precision keyword matching through BM25 and semantic understanding through vector similarity. This combination allows applications to benefit from the strengths of both traditional information retrieval and modern neural embeddings.\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：neuml/txtai\n\n摘要：发现 22 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：Add `txtai_minimal` package。\n\n## 1. 安装坑 · 来源证据：Add `txtai_minimal` package\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add `txtai_minimal` package\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_4e4050b59fdf4d51ac21e82d248e589e | https://github.com/neuml/txtai/issues/1090 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 安装坑 · 来源证据：Add custom Captions implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Captions implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ff6fe05065564e85a549d7656b85a852 | https://github.com/neuml/txtai/issues/1084 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 3. 
安装坑 · 来源证据：Add custom Questions pipeline implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Questions pipeline implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fa68ea29089d497bb8278c3c360c363c | https://github.com/neuml/txtai/issues/1087 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 4. 安装坑 · 来源证据：Add custom Sequences pipeline implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Sequences pipeline implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_0099afb385ef498d8020521bbf3e9e64 | https://github.com/neuml/txtai/issues/1086 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 5. 安装坑 · 来源证据：Add custom Summary pipeline implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Summary pipeline implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_9d8f75c31fc6450d8659b310f25d0bf4 | https://github.com/neuml/txtai/issues/1085 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 6. 安装坑 · 来源证据：Add minimal Docker build script\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add minimal Docker build script\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_38e1e56a991b412bbc749d2c0b687d54 | https://github.com/neuml/txtai/issues/1092 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 7. 安装坑 · 来源证据：Make `transformers` package optional for minimal install\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Make `transformers` package optional for minimal install\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d755829200de4d93b2f4d2f71b36aaf1 | https://github.com/neuml/txtai/issues/1091 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 8. 安装坑 · 来源证据：Reduce dependencies to just `numpy` for minimal install\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Reduce dependencies to just `numpy` for minimal install\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_395b735968ab4ec8ad849070d43cd18f | https://github.com/neuml/txtai/issues/1093 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 9. 安装坑 · 来源证据：Support minimal install for edge devices\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support minimal install for edge devices\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_3ec8e039baf4412888ed3ac543ea1361 | https://github.com/neuml/txtai/issues/1089 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：Various training pipeline fixes for v5\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Various training pipeline fixes for v5\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b47325d7496248a993d980c3d59f3269 | https://github.com/neuml/txtai/issues/1088 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 11. 
安装坑 · 来源证据：Zero dependency minimal install\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Zero dependency minimal install\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_7839a465e7e64f94acfff001c1da978c | https://github.com/neuml/txtai/issues/1094 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 12. 安装坑 · 来源证据：v9.9.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v9.9.0\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8b842ce68a1c4c1ab0a40a3ed1fd213c | https://github.com/neuml/txtai/releases/tag/v9.9.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 13. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:286301447 | https://github.com/neuml/txtai | README/documentation is current enough for a first validation pass.\n\n## 14. 运行坑 · 来源证据：v9.7.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：v9.7.0\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e3f61e41f6e84b9b9ee33d5e18dbff63 | https://github.com/neuml/txtai/releases/tag/v9.7.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:286301447 | https://github.com/neuml/txtai | last_activity_observed missing\n\n## 16. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:286301447 | https://github.com/neuml/txtai | no_demo; severity=medium\n\n## 17. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:286301447 | https://github.com/neuml/txtai | no_demo; severity=medium\n\n## 18. 安全/权限坑 · 来源证据：Add LiteRT-LM LLM\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Add LiteRT-LM LLM\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8731971fd8404df082f75ecf565fef8b | https://github.com/neuml/txtai/issues/1095 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 19. 安全/权限坑 · 来源证据：v9.6.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v9.6.0\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fe429d6152304930bda149c4fbd35318 | https://github.com/neuml/txtai/releases/tag/v9.6.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 20. 
安全/权限坑 · 来源证据：v9.8.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v9.8.0\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8bebd6701b454baaae22d73522ce9704 | https://github.com/neuml/txtai/releases/tag/v9.8.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 21. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:286301447 | https://github.com/neuml/txtai | issue_or_pr_quality=unknown\n\n## 22. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:286301447 | https://github.com/neuml/txtai | release_recency=unknown\n\n<!-- canonical_name: neuml/txtai; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "Human Manual / 人类版说明书"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log / 踩坑日志\n\n项目：neuml/txtai\n\n摘要：发现 22 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：安装坑 - 来源证据：Add `txtai_minimal` package。\n\n## 1. 安装坑 · 来源证据：Add `txtai_minimal` package\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add `txtai_minimal` package\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_4e4050b59fdf4d51ac21e82d248e589e | https://github.com/neuml/txtai/issues/1090 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 安装坑 · 来源证据：Add custom Captions implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Captions implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ff6fe05065564e85a549d7656b85a852 | https://github.com/neuml/txtai/issues/1084 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 3. 安装坑 · 来源证据：Add custom Questions pipeline implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Questions pipeline implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fa68ea29089d497bb8278c3c360c363c | https://github.com/neuml/txtai/issues/1087 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 4. 安装坑 · 来源证据：Add custom Sequences pipeline implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Sequences pipeline implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_0099afb385ef498d8020521bbf3e9e64 | https://github.com/neuml/txtai/issues/1086 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 5. 安装坑 · 来源证据：Add custom Summary pipeline implementation\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add custom Summary pipeline implementation\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_9d8f75c31fc6450d8659b310f25d0bf4 | https://github.com/neuml/txtai/issues/1085 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 6. 安装坑 · 来源证据：Add minimal Docker build script\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Add minimal Docker build script\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_38e1e56a991b412bbc749d2c0b687d54 | https://github.com/neuml/txtai/issues/1092 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 7. 安装坑 · 来源证据：Make `transformers` package optional for minimal install\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Make `transformers` package optional for minimal install\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d755829200de4d93b2f4d2f71b36aaf1 | https://github.com/neuml/txtai/issues/1091 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 8. 
安装坑 · 来源证据：Reduce dependencies to just `numpy` for minimal install\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Reduce dependencies to just `numpy` for minimal install\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_395b735968ab4ec8ad849070d43cd18f | https://github.com/neuml/txtai/issues/1093 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 9. 安装坑 · 来源证据：Support minimal install for edge devices\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support minimal install for edge devices\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_3ec8e039baf4412888ed3ac543ea1361 | https://github.com/neuml/txtai/issues/1089 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：Various training pipeline fixes for v5\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Various training pipeline fixes for v5\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b47325d7496248a993d980c3d59f3269 | https://github.com/neuml/txtai/issues/1088 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 11. 安装坑 · 来源证据：Zero dependency minimal install\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Zero dependency minimal install\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_7839a465e7e64f94acfff001c1da978c | https://github.com/neuml/txtai/issues/1094 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 12. 安装坑 · 来源证据：v9.9.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v9.9.0\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8b842ce68a1c4c1ab0a40a3ed1fd213c | https://github.com/neuml/txtai/releases/tag/v9.9.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 13. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:286301447 | https://github.com/neuml/txtai | README/documentation is current enough for a first validation pass.\n\n## 14. 运行坑 · 来源证据：v9.7.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：v9.7.0\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e3f61e41f6e84b9b9ee33d5e18dbff63 | https://github.com/neuml/txtai/releases/tag/v9.7.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:286301447 | https://github.com/neuml/txtai | last_activity_observed missing\n\n## 16. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:286301447 | https://github.com/neuml/txtai | no_demo; severity=medium\n\n## 17. 
安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:286301447 | https://github.com/neuml/txtai | no_demo; severity=medium\n\n## 18. 安全/权限坑 · 来源证据：Add LiteRT-LM LLM\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Add LiteRT-LM LLM\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8731971fd8404df082f75ecf565fef8b | https://github.com/neuml/txtai/issues/1095 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 19. 安全/权限坑 · 来源证据：v9.6.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v9.6.0\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fe429d6152304930bda149c4fbd35318 | https://github.com/neuml/txtai/releases/tag/v9.6.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 20. 安全/权限坑 · 来源证据：v9.8.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v9.8.0\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8bebd6701b454baaae22d73522ce9704 | https://github.com/neuml/txtai/releases/tag/v9.8.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 21. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:286301447 | https://github.com/neuml/txtai | issue_or_pr_quality=unknown\n\n## 22. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:286301447 | https://github.com/neuml/txtai | release_recency=unknown\n",
      "summary": "用户实践前最可能遇到的身份、安装、配置、运行和安全坑。",
      "title": "Pitfall Log / 踩坑日志"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# txtai - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for neuml/txtai.\n\nProject:\n- Name: txtai\n- Repository: https://github.com/neuml/txtai\n- Summary: 💡 All-in-one AI framework for semantic search, LLM orchestration and language model workflows\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: 💡 All-in-one AI framework for semantic search, LLM orchestration and language model workflows\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: 💡 All-in-one AI framework for semantic search, LLM orchestration and language model workflows\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. introduction: Introduction to txtai. Produce one small intermediate artifact and wait for confirmation.\n2. getting-started: Getting Started. Produce one small intermediate artifact and wait for confirmation.\n3. architecture: System Architecture. Produce one small intermediate artifact and wait for confirmation.\n4. embeddings-core: Embeddings Database. Produce one small intermediate artifact and wait for confirmation.\n5. pipelines: Pipelines. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/neuml/txtai\n- https://github.com/neuml/txtai#readme\n- README.md\n- src/python/txtai/__init__.py\n- docs/index.md\n- pyproject.toml\n- setup.py\n- docs/install.md\n- examples/01_Introducing_txtai.ipynb\n- src/python/txtai/embeddings/base.py\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "summary": "不安装项目也能感受能力节奏的安全试用 Prompt。",
      "title": "Prompt Preview / 安装前试用 Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / 官方入口\n\n项目：neuml/txtai\n\n## 官方安装入口\n\n### Python / pip · 官方安装入口\n\n```bash\npip install txtai\n```\n\n来源：https://github.com/neuml/txtai#readme\n\n## 来源\n\n- repo: https://github.com/neuml/txtai\n- docs: https://github.com/neuml/txtai#readme\n",
      "summary": "从项目官方 README 或安装文档提取的开工入口。",
      "title": "Quick Start / 官方入口"
    }
  },
  "validation_id": "dval_0113bd6941264ce5bd0bac2c55fff767"
}
