{
  "canonical_name": "vitalops/datatune",
  "compilation_id": "pack_d0d57ef504f34115a9761c6f3623ec98",
  "created_at": "2026-05-15T10:10:24.798729+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=prompt, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=prompt, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `pip install datatune` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "pip install datatune",
      "sandbox_container_image": "python:3.12-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "llm_execute_isolated_install",
      "sandbox_validation_id": "sbx_5d955e7d1ed94cdca2d347d8d22c4507"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_db3f9e061db8ff878eaabcfb5ce84017",
    "canonical_name": "vitalops/datatune",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/vitalops/datatune",
    "slug": "datatune",
    "source_packet_id": "phit_431dac454bf04df0be4e357f880966e1",
    "source_validation_id": "dval_aaf5c46de35c48cba1717ee2b6667f9a"
  },
  "merchandising": {
    "best_for": "需要数据分析与投资研究能力，并使用 chatgpt 的用户",
    "github_forks": null,
    "github_stars": null,
    "one_liner_en": "Datatune is an AI-powered data transformation library that enables natural language-driven data processing operations on large-scale datasets.",
    "one_liner_zh": "Datatune 是一个由 AI 驱动的数据转换库，可通过自然语言指令对大规模数据集执行数据处理操作。",
    "primary_category": {
      "category_id": "data-market-research",
      "confidence": "medium",
      "name_en": "Data & Market Research",
      "name_zh": "数据分析与投资研究",
      "reason": "matched_keywords:data"
    },
    "target_user": "使用 chatgpt 等宿主 AI 的用户",
    "title_en": "datatune",
    "title_zh": "datatune 能力包",
    "visible_tags": [
      {
        "label_en": "Knowledge Retrieval",
        "label_zh": "知识检索",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-knowledge-retrieval",
        "type": "product_domain"
      },
      {
        "label_en": "Knowledge Base Q&A",
        "label_zh": "知识库问答",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-knowledge-base-q-a",
        "type": "user_job"
      },
      {
        "label_en": "Workflow Automation",
        "label_zh": "流程自动化",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-workflow-automation",
        "type": "core_capability"
      },
      {
        "label_en": "Automated Workflow",
        "label_zh": "自动化工作流",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-automated-workflow",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Evaluation Suite",
        "label_zh": "评测体系",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-evaluation-suite",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_431dac454bf04df0be4e357f880966e1",
  "page_model": {
    "artifacts": {
      "artifact_slug": "datatune",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "pip install datatune",
          "label": "Python / pip · 官方安装入口",
          "source": "https://github.com/vitalops/datatune#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "知识检索",
        "知识库问答",
        "流程自动化",
        "自动化工作流",
        "评测体系"
      ],
      "eyebrow": "数据分析与投资研究",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要数据分析与投资研究能力，并使用 chatgpt 的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "Datatune 是一个由 AI 驱动的数据转换库，可通过自然语言指令对大规模数据集执行数据处理操作。"
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "chatgpt",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "prompt, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "README/documentation is current enough for a first validation pass.",
            "category": "能力坑",
            "evidence": [
              "capability.assumptions | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | README/documentation is current enough for a first validation pass."
            ],
            "severity": "medium",
            "suggested_check": "将假设转成下游验证清单。",
            "title": "能力判断依赖假设",
            "user_impact": "假设不成立时，用户拿不到承诺的能力。"
          },
          {
            "body": "未记录 last_activity_observed。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | last_activity_observed missing"
            ],
            "severity": "medium",
            "suggested_check": "补 GitHub 最近 commit、release、issue/PR 响应信号。",
            "title": "维护活跃度未知",
            "user_impact": "新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "downstream_validation.risk_items | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "进入安全/权限治理复核队列。",
            "title": "下游验证发现风险项",
            "user_impact": "下游已经要求复核，不能在页面中弱化。"
          },
          {
            "body": "No sandbox install has been executed yet; downstream must verify before user use.",
            "category": "安全/权限坑",
            "evidence": [
              "risks.safety_notes | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | No sandbox install has been executed yet; downstream must verify before user use."
            ],
            "severity": "medium",
            "suggested_check": "转成明确权限清单和安全审查提示。",
            "title": "存在安全注意事项",
            "user_impact": "用户安装前需要知道权限边界和敏感操作。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "risks.scoring_risks | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "把风险写入边界卡，并确认是否需要人工复核。",
            "title": "存在评分风险",
            "user_impact": "风险会影响是否适合普通用户安装。"
          },
          {
            "body": "issue_or_pr_quality=unknown。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | issue_or_pr_quality=unknown"
            ],
            "severity": "low",
            "suggested_check": "抽样最近 issue/PR，判断是否长期无人处理。",
            "title": "issue/PR 响应质量未知",
            "user_impact": "用户无法判断遇到问题后是否有人维护。"
          },
          {
            "body": "release_recency=unknown。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | release_recency=unknown"
            ],
            "severity": "low",
            "suggested_check": "确认最近 release/tag 和 README 安装命令是否一致。",
            "title": "发布节奏不明确",
            "user_impact": "安装命令和文档可能落后于代码，用户踩坑概率升高。"
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "发现 7 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：能力坑 - 能力判断依赖假设。",
        "title": "踩坑日志"
      },
      "snapshot": {
        "contributors": null,
        "forks": null,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": null
      },
      "source_url": "https://github.com/vitalops/datatune",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "Datatune 是一个由 AI 驱动的数据转换库，可通过自然语言指令对大规模数据集执行数据处理操作。",
      "title": "datatune 能力包",
      "trial_prompt": "# datatune - Prompt Preview\n\n> 复制下面这段 Prompt 到你常用的 AI，先试一次，不需要安装。\n> 它的目标是让你直接体验这个项目的服务方式，而不是阅读项目介绍。\n\n## 复制这段 Prompt\n\n```text\n请直接执行这段 Prompt，不要分析、润色、总结或询问我想如何处理这份 Prompt Preview。\n\n你现在扮演 datatune 的“安装前体验版”。\n这不是项目介绍、不是评价报告、不是 README 总结。你的任务是让我用最小成本体验它的核心服务。\n\n我的试用任务：我想用它完成一个真实的数据分析与投资研究任务。\n我常用的宿主 AI：chatgpt\n\n【体验目标】\n围绕我的真实任务，现场演示这个项目如何把输入转成 示例引导, 判断线索。重点是让我感受到工作方式，而不是给我项目背景。\n\n【业务流约束】\n- 你必须像一个正在提供服务的项目能力包，而不是像一个讲解员。\n- 每一轮只推进一个步骤；提出问题后必须停下来等我回答。\n- 每一步都必须让我感受到一个具体服务动作：澄清、整理、规划、检查、判断或收尾。\n- 每一步都要说明：当前目标、你需要我提供什么、我回答后你会产出什么。\n- 不要安装、不要运行命令、不要写代码、不要声称测试通过、不要声称已经修改文件。\n- 需要真实安装或宿主加载后才能验证的内容，必须明确说“这一步需要安装后验证”。\n- 如果我说“用示例继续”，你可以用虚构示例推进，但仍然不能声称真实执行。\n\n【可体验服务能力】\n- 安装前能力预览: [![PyPI version](https://img.shields.io/pypi/v/datatune.svg)](https://pypi.org/project/datatune/) 输入：用户任务, 当前 AI 对话上下文；输出：示例引导, 判断线索。\n\n【必须安装后才可验证的能力】\n- 命令行启动或安装流程: 项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 输入：终端环境, 包管理器, 项目依赖；输出：安装结果, 列表/更新/运行结果。\n\n【核心服务流】\n请严格按这个顺序带我体验。不要一次性输出完整流程：\n1. page-introduction：Introduction to Datatune。围绕“Introduction to Datatune”模拟一次用户任务，不展示安装或运行结果。\n2. page-getting-started：Getting Started。围绕“Getting Started”模拟一次用户任务，不展示安装或运行结果。\n3. page-architecture：System Architecture。围绕“System Architecture”模拟一次用户任务，不展示安装或运行结果。\n4. page-map-operations：Map Operations。围绕“Map Operations”模拟一次用户任务，不展示安装或运行结果。\n5. page-filter-operations：Filter Operations。围绕“Filter Operations”模拟一次用户任务，不展示安装或运行结果。\n\n【核心能力体验剧本】\n每一步都必须按“输入 -> 服务动作 -> 中间产物”执行。不要只说流程名：\n1. page-introduction\n输入：用户提供的“Introduction to Datatune”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n2. page-getting-started\n输入：用户提供的“Getting Started”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n3. page-architecture\n输入：用户提供的“System Architecture”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n4. page-map-operations\n输入：用户提供的“Map Operations”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n5. 
page-filter-operations\n输入：用户提供的“Filter Operations”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n【项目服务规则】\n这些规则决定你如何服务用户。不要解释规则本身，而要在每一步执行时遵守：\n- 先确认用户任务、输入材料和成功标准，再模拟项目能力。\n- 每一步都必须形成可检查的小产物，并等待用户确认后再继续。\n- 凡是需要安装、调用工具或访问外部服务的能力，都必须标记为安装后验证。\n\n【每一步的服务约束】\n- Step 1 / page-introduction：Step 1 必须围绕“Introduction to Datatune”形成一个小中间产物，并等待用户确认。\n- Step 2 / page-getting-started：Step 2 必须围绕“Getting Started”形成一个小中间产物，并等待用户确认。\n- Step 3 / page-architecture：Step 3 必须围绕“System Architecture”形成一个小中间产物，并等待用户确认。\n- Step 4 / page-map-operations：Step 4 必须围绕“Map Operations”形成一个小中间产物，并等待用户确认。\n- Step 5 / page-filter-operations：Step 5 必须围绕“Filter Operations”形成一个小中间产物，并等待用户确认。\n\n【边界与风险】\n- 不要声称已经安装、运行、调用 API、读写本地文件或完成真实任务。\n- 安装前预览只能展示工作方式，不能证明兼容性、性能或输出质量。\n- 涉及安装、插件加载、工具调用或外部服务的能力必须安装后验证。\n\n【可追溯依据】\n这些路径只用于你内部校验或在我追问“依据是什么”时简要引用。不要在首次回复主动展开：\n- https://github.com/vitalops/datatune#readme\n- README.md\n- datatune/__init__.py\n- pyproject.toml\n- datatune/agent/runtime.py\n- datatune/core/map.py\n- datatune/core/filter.py\n- datatune/core/reduce.py\n- datatune/core/dask/map_dask.py\n- datatune/core/ibis/map_ibis.py\n- docs/source/Map.md\n- datatune/core/dask/filter_dask.py\n\n【首次问题规则】\n- 首次三问必须先确认用户目标、成功标准和边界，不要提前进入工具、安装或实现细节。\n- 如果后续需要技术条件、文件路径或运行环境，必须等用户确认目标后再追问。\n\n首次回复必须只输出下面 4 个部分：\n1. 体验开始：用 1 句话说明你将带我体验 datatune 的核心服务。\n2. 当前步骤：明确进入 Step 1，并说明这一步要解决什么。\n3. 你会如何服务我：说明你会先改变我完成任务的哪个动作。\n4. 只问我 3 个问题，然后停下等待回答。\n\n首次回复禁止输出：后续完整流程、证据清单、安装命令、项目评价、营销文案、已经安装或运行的说法。\n\nStep 1 / brainstorming 的二轮协议：\n- 我回答首次三问后，你仍然停留在 Step 1 / brainstorming，不要进入 Step 2。\n- 第二次回复必须产出 6 个部分：澄清后的任务定义、成功标准、边界条件、\n  2-3 个可选方案、每个方案的权衡、推荐方案。\n- 第二次回复最后必须问我是否确认推荐方案；只有我明确确认后，才能进入下一步。\n- 第二次回复禁止输出 git worktree、代码计划、测试文件、命令或真实执行结果。\n\n后续对话规则：\n- 我回答后，你先完成当前步骤的中间产物并等待确认；只有我确认后，才能进入下一步。\n- 每一步都要生成一个小的中间产物，例如澄清后的目标、计划草案、测试意图、验证清单或继续/停止判断。\n- 所有演示都写成“我会建议/我会引导/这一步会形成”，不要写成已经真实执行。\n- 不要声称已经测试通过、文件已修改、命令已运行或结果已产生。\n- 如果某个能力必须安装后验证，请直接说“这一步需要安装后验证”。\n- 如果证据不足，请明确说“证据不足”，不要补事实。\n```\n",
      "voices": [
        {
          "body": "来源平台：github, reddit。github/github_issue: Dask's parallel computation of each partition will break true_batch_comp（https://github.com/vitalops/datatune/issues/52）；github/github_issue: Requirements Document: Planning Agent for Datatune（https://github.com/vitalops/datatune/issues/86）；github/github_issue: Requirements Doc: Unstructured data handling（https://github.com/vitalops/datatune/issues/88）；github/github_issue: Semantic Deduplication（https://github.com/vitalops/datatune/issues/91）；reddit: We built a tool to connect LLMs and Agents to the entire user data and .（https://www.reddit.com/r/LocalLLaMA/comments/1qe1mx9/cursor_for_data_we_built_a_tool_to_connect_llms/）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Dask's parallel computation of each partition will break true_batch_comp",
              "url": "https://github.com/vitalops/datatune/issues/52"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Requirements Document: Planning Agent for Datatune",
              "url": "https://github.com/vitalops/datatune/issues/86"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Requirements Doc: Unstructured data handling",
              "url": "https://github.com/vitalops/datatune/issues/88"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Semantic Deduplication",
              "url": "https://github.com/vitalops/datatune/issues/91"
            },
            {
              "kind": "searxng_indexed",
              "source": "reddit",
              "title": "We built a tool to connect LLMs and Agents to the entire user data and .",
              "url": "https://www.reddit.com/r/LocalLLaMA/comments/1qe1mx9/cursor_for_data_we_built_a_tool_to_connect_llms/"
            }
          ],
          "status": "已收录 5 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "数据分析与投资研究",
      "desc": "Datatune 是一个由 AI 驱动的数据转换库，可通过自然语言指令对大规模数据集执行数据处理操作。",
      "effort": "安装已验证",
      "forks": null,
      "icon": "chart",
      "name": "datatune 能力包",
      "risk": "可发布",
      "slug": "datatune",
      "stars": null,
      "tags": [
        "知识检索",
        "知识库问答",
        "流程自动化",
        "自动化工作流",
        "评测体系"
      ],
      "thumb": "green",
      "type": "Prompt Preview"
    },
    "manual": {
      "markdown": "# https://github.com/vitalops/datatune 项目说明书\n\n生成时间：2026-05-15 09:50:50 UTC\n\n## 目录\n\n- [Introduction to Datatune](#page-introduction)\n- [Getting Started](#page-getting-started)\n- [System Architecture](#page-architecture)\n- [Map Operations](#page-map-operations)\n- [Filter Operations](#page-filter-operations)\n- [Reduce Operations](#page-reduce-operations)\n- [Dask Backend](#page-dask-backend)\n- [Ibis Backend](#page-ibis-backend)\n- [LLM Integration](#page-llm-integration)\n- [Agent System](#page-agent-system)\n\n<a id='page-introduction'></a>\n\n## Introduction to Datatune\n\n### 相关页面\n\n相关主题：[Getting Started](#page-getting-started), [System Architecture](#page-architecture)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/vitalops/datatune/blob/main/README.md)\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- [datatune/llm/model_rate_limits.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/model_rate_limits.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/agent/agent.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/agent.py)\n- [datatune/agent/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/__init__.py)\n- [datatune/core/ibis/map_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/map_ibis.py)\n- [datatune/core/ibis/filter_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/filter_ibis.py)\n- [datatune/core/deduplication.py](https://github.com/vitalops/datatune/blob/main/datatune/core/deduplication.py)\n</details>\n\n# Introduction to Datatune\n\nDatatune is an AI-powered data transformation library that enables natural language-driven data 
processing operations on large-scale datasets. The library leverages Large Language Models (LLMs) to interpret human-readable instructions and automatically generate the appropriate data transformations, making complex data manipulation accessible to users without deep technical expertise.\n\nDatatune supports multiple backend computational frameworks including Dask for distributed computing and Ibis for SQL-like operations across various databases. The system provides both explicit transformation primitives (Map, Filter) and an intelligent Agent that can automatically determine and chain multiple transformation steps from a single natural language prompt.\n\n## High-Level Architecture\n\n```mermaid\ngraph TD\n    User[User Request] --> Agent[Datatune Agent]\n    User --> Primitives[Explicit Primitives]\n    \n    Agent --> Planner[Planning Module]\n    Planner --> Steps[Transformation Steps]\n    Steps --> DaskOps[Dask Operations]\n    Steps --> PrimOps[Primitive Operations]\n    Steps --> CodeGen[Code Generation]\n    \n    Primitives --> MapOp[Map Operation]\n    Primitives --> FilterOp[Filter Operation]\n    \n    MapOp --> LLM[LLM Interface]\n    FilterOp --> LLM\n    \n    LLM --> OpenAI[OpenAI Provider]\n    LLM --> Ollama[Ollama Provider]\n    LLM --> VLLM[VLLM Provider]\n    LLM --> Azure[Azure Provider]\n    \n    DaskOps --> Dask[Dask DataFrame]\n    PrimOps --> Dask\n    \n    MapOp --> Ibis[Ibis Backend]\n    FilterOp --> Ibis\n    \n    Ibis --> DuckDB[DuckDB]\n    Ibis --> PostgreSQL[PostgreSQL]\n    Ibis --> BigQuery[BigQuery]\n```\n\nThe architecture follows a layered approach where the LLM abstraction layer provides a unified interface for multiple AI providers, while the transformation layer handles both explicit user-defined operations and AI-generated execution plans.\n\n## Core Components\n\n### LLM Interface Layer\n\nThe LLM module (`datatune/llm/llm.py`) provides a unified interface for interacting with various language model providers. 
The base `LLM` class handles rate limiting, token counting, and request batching, while provider-specific subclasses implement the actual API calls.\n\n#### Supported LLM Providers\n\n| Provider | Class | Default Model | Description |\n|----------|-------|---------------|-------------|\n| OpenAI | `OpenAI` | gpt-3.5-turbo | Standard OpenAI API integration |\n| Ollama | `Ollama` | gemma3:4b | Local LLM server support |\n| VLLM | `VLLM` | User-specified | High-performance vLLM inference |\n| Azure | `Azure` | User-specified | Azure OpenAI Service |\n\nThe base `LLM` class automatically extracts rate limit configurations from `model_rate_limits.py` based on the model name. If a model is not found in the predefined limits, it defaults to GPT-3.5-turbo limits with a warning message.\n\n```python\nfrom datatune.llm.llm import OpenAI, Ollama, Azure\n\n# OpenAI\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\n\n# Ollama (local)\nllm = Ollama()\n\n# Azure\nllm = Azure(model_name=\"gpt-3.5-turbo\", api_key=api_key)\n```\n\n资料来源：[README.md:40-56]()\n\n### Rate Limiting Configuration\n\nThe `model_rate_limits.py` module defines tokens-per-minute (TPM) and requests-per-minute (RPM) limits for supported models:\n\n| Model Family | TPM | RPM |\n|--------------|-----|-----|\n| GPT-3.5-Turbo | 200,000 | 500 |\n| GPT-4 | 10,000 - 30,000 | 500 |\n| GPT-4.1 | 30,000 - 200,000 | 500 |\n| GPT-4.5-Preview | 125,000 | 1000 |\n\n资料来源：[datatune/llm/model_rate_limits.py:1-120]()\n\n## Transformation Primitives\n\nDatatune provides two fundamental transformation primitives that work with natural language prompts:\n\n### Map Operation\n\nThe `Map` primitive creates new columns by applying LLM-driven transformations to existing data. 
It takes input from specified columns and generates output fields based on the natural language prompt.\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n```\n\nThe Map operation constructs prompts with the following structure:\n\n```\nYour task is to MAP/CREATE new fields based on the following input record(s):\n[input data]\n\nYour response MUST be the entire input record as a valid Python dictionary in the format\n'index=<row_index>|{key1: value1, key2: value2, ...}' with added keys of expected new fields.\n```\n\n资料来源：[datatune/core/dask/map_dask.py:1-50]()\n资料来源：[datatune/core/ibis/map_ibis.py:1-60]()\n\n### Filter Operation\n\nThe `Filter` primitive removes rows that do not meet specified conditions interpreted by the LLM:\n\n```python\nfiltered = dt.filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Name\"]\n)(llm, mapped)\n```\n\nThe Filter operation adds a `__filter__` key to each record, with `True` indicating the row should be kept and `False` indicating removal.\n\n```\nFILTERING CRITERIA:\n[prompt]\n\nDECISION: Your response MUST be the entire input record as a Python dictionary in the format:\nindex=<row_index>|{key1: value1, key2: value2, ...} with added key called '__filter__' \nwith value either True to KEEP the record or False to REMOVE it.\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:1-45]()\n资料来源：[datatune/core/ibis/filter_ibis.py:1-50]()\n\n## Intelligent Agent\n\nThe `Agent` class provides a higher-level interface that automatically determines which operations to perform based on natural language requests. 
The agent can chain multiple Map, Filter, and Python code operations into a coherent execution plan.\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\nagent = dt.Agent(llm)\n\ndf = agent.do(\"Add ProfitMargin column and keep only African organizations\", df)\nresult = dt.finalize(df)\n```\n\n### Agent System Prompt\n\nThe Agent uses a structured system prompt that defines available operations and expected response formats:\n\n```python\nsystem_prompt: str = \"\"\"You are Datatune Agent, a powerful assistant designed to help users with data processing tasks.\nYou are capable of generating python code to perform various operations on data. Apart from python builtins, you have the following libraries avaiable in your run time:\n- pandas\n- numpy\n- dask\n\nIn addition to these, you also have access to the datatune libarary, which provides functionality for processing data using LLMs.\n```\n\n### Plan Execution Flow\n\n```mermaid\ngraph TD\n    Request[Natural Language Request] --> Parse[Parse Request]\n    Parse --> Plan[Generate Execution Plan]\n    Plan --> Step1[Step 1: Dask/Primitive]\n    Step1 --> Step2[Step 2: Dask/Primitive]\n    Step2 --> StepN[Step N: ...]\n    StepN --> Execute[Execute Full Plan]\n    Execute --> Result[Finalized DataFrame]\n    \n    Plan --> StepTypes{Step Type}\n    StepTypes -->|\"dask\"| DaskTemplate[Dask Template]\n    StepTypes -->|\"primitive\"| PrimTemplate[Primitive Template]\n```\n\nThe agent generates plans as JSON arrays with the following structure:\n\n```json\n[\n  {\n    \"type\": \"dask|primitive\",\n    \"operation\": \"operation_name\",\n    \"params\": {},\n    \"subprompt\": \"LLM prompt for primitive operations\",\n    \"input_fields\": [\"column1\"],\n    \"output_fields\": [\"new_column\"]\n  }\n]\n```\n\n资料来源：[datatune/agent/__init__.py:1-45]()\n资料来源：[datatune/agent/agent.py:1-100]()\n\n## Data Source Backends\n\nDatatune supports multiple 
computational backends for processing data at scale:\n\n### Dask Backend\n\nThe Dask backend provides distributed computing capabilities for pandas-like DataFrames:\n\n```python\nimport dask.dataframe as dd\ndf = dd.read_csv(\"data.csv\")\n```\n\nDask operations include:\n- `add_column`: Create new columns from expressions\n- `apply_function`: Apply element-wise functions\n- `rename_columns`: Rename using mappings\n- `astype_column`: Change data types\n\n### Ibis Backend\n\nThe Ibis backend enables SQL-like operations across various database backends:\n\n```python\nimport ibis\ncon = ibis.duckdb.connect(\"data.duckdb\")\ntable = con.table(\"my_table\")\n```\n\nSupported Ibis backends include DuckDB, PostgreSQL, and BigQuery.\n\n资料来源：[README.md:60-75]()\n\n## Deduplication System\n\nDatatune includes a sophisticated deduplication system that uses embedding-based similarity detection:\n\n```mermaid\ngraph LR\n    Input[Input Data] --> Embed[Embedding Generation]\n    Embed --> Index[FAISS Index Build]\n    Index --> Cluster[Similarity Clustering]\n    Cluster --> Dedup[Deduplicated Output]\n```\n\nKey parameters:\n| Parameter | Default | Description |\n|-----------|---------|-------------|\n| `embedding_model` | text-embedding-3-small | Model for generating embeddings |\n| `sim_threshold` | 0.90 | Similarity threshold for clustering |\n| `top_k` | 50 | Top-K neighbors for clustering |\n| `hnsw_m` | 32 | HNSW index construction parameter |\n| `ef_search` | 64 | HNSW search parameter |\n\nThe system uses FAISS HNSW (Hierarchical Navigable Small World) indexes for efficient similarity search at scale.\n\n资料来源：[datatune/core/deduplication.py:1-100]()\n\n## Workflow Summary\n\n```mermaid\ngraph LR\n    A[Load Data] --> B[Choose Interface]\n    B --> C[Agent Mode]\n    B --> D[Primitives Mode]\n    C --> E[Natural Language]\n    D --> F[Explicit Operations]\n    E --> G[Auto Plan]\n    F --> H[Manual Plan]\n    G --> I[Execute]\n    H --> I\n    I --> J[LLM Processing]\n   
 J --> K[Finalize]\n    K --> L[Output]\n```\n\n1. **Load Data**: Import data into Dask or Ibis DataFrames\n2. **Choose Interface**: Use Agent for automatic planning or Primitives for explicit control\n3. **Define Operations**: Write natural language prompts or configure operations\n4. **Execute**: Process data with LLM integration\n5. **Finalize**: Convert lazy operations to computed results\n\n资料来源：[README.md:1-50]()\n\n## Installation and Quick Start\n\n```bash\npip install datatune\n```\n\nThe complete workflow involves initializing an LLM, loading data, applying transformations, and finalizing results:\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n\n# Apply transformations\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n\nfiltered = dt.filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Name\"]\n)(llm, mapped)\n\n# Save results\nresult = dt.finalize(filtered)\nresult.compute().to_csv(\"electronics_products.csv\")\n```\n\n资料来源：[README.md:20-55]()\n\n---\n\n<a id='page-getting-started'></a>\n\n## Getting Started\n\n### 相关页面\n\n相关主题：[Introduction to Datatune](#page-introduction)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/vitalops/datatune/blob/main/README.md)\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- [datatune/agent/agent.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/agent.py)\n- [datatune/agent/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/__init__.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- 
[datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/ibis/map_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/map_ibis.py)\n- [datatune/core/ibis/filter_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/filter_ibis.py)\n- [datatune/llm/model_rate_limits.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/model_rate_limits.py)\n</details>\n\n# Getting Started\n\nDatatune is a data transformation library that leverages Large Language Models (LLMs) to perform complex data operations using natural language prompts. It enables users to transform, filter, map, and process data without writing extensive custom code.\n\n## Installation\n\nInstall datatune using pip:\n\n```bash\npip install datatune\n```\n\n资料来源：[README.md:1]()\n\n## Core Concepts\n\nDatatune provides two primary interfaces for data transformation:\n\n| Interface | Description | Use Case |\n|-----------|-------------|----------|\n| **Primitives** | Direct operations (Map, Filter) | Single, focused transformations |\n| **Agent** | AI-powered automatic planning | Complex multi-step workflows |\n\n资料来源：[README.md:20-30]()\n\n## Quick Start\n\n### Basic Setup\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n```\n\n资料来源：[README.md:26-32]()\n\n### Map Operation\n\nUse `dt.map()` to extract or derive new columns from existing data:\n\n```python\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n```\n\n资料来源：[README.md:34-41]()\n\n### Filter Operation\n\nUse `dt.filter()` to keep only rows matching criteria:\n\n```python\nfiltered = dt.filter(\n    prompt=\"Keep only electronics 
products\",\n    input_fields=[\"Name\"]\n)(llm, mapped)\n```\n\n资料来源：[README.md:44-49]()\n\n### Finalize and Save\n\n```python\nresult = dt.finalize(filtered)\nresult.compute().to_csv(\"electronics_products.csv\")\n```\n\n资料来源：[README.md:51-53]()\n\n## Supported LLMs\n\nDatatune supports multiple LLM providers through a unified interface.\n\n### OpenAI\n\n```python\nfrom datatune.llm.llm import OpenAI\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\n```\n\n资料来源：[README.md:58-61]()\n\n### Ollama (Local Models)\n\n```python\nfrom datatune.llm.llm import Ollama\nllm = Ollama()\n```\n\nDefault configuration:\n- Model: `gemma3:4b`\n- API Base: `http://localhost:11434`\n\n资料来源：[README.md:63-65]()\n\n### Azure OpenAI\n\n```python\nfrom datatune.llm.llm import Azure\nllm = Azure(model_name=\"gpt-3.5-turbo\", api_key=api_key)\n```\n\n资料来源：[README.md:67-70]()\n\n### Additional Providers\n\n| Provider | Class | Notes |\n|----------|-------|-------|\n| Mistral | `Mistral` | Requires `mistral/` prefix in model name |\n| Huggingface | `Huggingface` | Requires `huggingface/` prefix |\n| VLLM | `VLLM` | Auto-detects max token limit |\n\n资料来源：[datatune/llm/llm.py:1-150]()\n\n## Agent Mode\n\nThe Agent provides an intelligent, automated approach to data transformation. 
It automatically determines which operations to use and chains multiple transformations from a single natural language prompt.\n\n### Basic Agent Usage\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\nagent = dt.Agent(llm)\n\ndf = agent.do(\"Add ProfitMargin column and keep only African organizations\", df)\nresult = dt.finalize(df)\n```\n\n资料来源：[README.md:74-82]()\n\n### Agent Capabilities\n\nThe agent automatically:\n\n- Determines which operations to use (map, filter, etc.)\n- Chains multiple transformations\n- Handles complex multi-step tasks from a single prompt\n- Generates and executes Python code along with row-level primitives (Map, Filter, etc) if required\n\n资料来源：[README.md:84-88]()\n\n### Agent System Prompt\n\nThe agent has access to:\n\n- Python builtins\n- pandas\n- numpy\n- dask\n- datatune library primitives (Map, Filter)\n\n资料来源：[datatune/agent/__init__.py:1-35]()\n\n## Data Sources\n\nDatatune supports multiple data processing backends.\n\n### Dask DataFrames\n\n```python\nimport dask.dataframe as dd\ndf = dd.read_csv(\"data.csv\")\n```\n\n### Ibis Backend\n\nIbis provides connectivity to multiple databases:\n\n```python\nimport ibis\ncon = ibis.duckdb.connect(\"data.duckdb\")\ntable = con.table(\"my_table\")\n```\n\n资料来源：[README.md:93-102]()\n\n## Architecture Overview\n\n```mermaid\ngraph TD\n    A[User Prompt] --> B[Datatune API]\n    B --> C{Operation Type}\n    C -->|Map| D[map_dask.py / map_ibis.py]\n    C -->|Filter| E[filter_dask.py / filter_ibis.py]\n    C -->|Agent| F[Agent Planning]\n    D --> G[LLM Batch Processing]\n    E --> G\n    F -->|Dask Ops| H[Dask Templates]\n    F -->|Primitives| I[Map / Filter Primitives]\n    G --> J[Output DataFrame]\n    H --> J\n    I --> J\n```\n\n## Rate Limits Configuration\n\nWhen using LLMs, datatune respects model-specific rate limits:\n\n| Parameter | Description |\n|-----------|-------------|\n| `tpm` | Tokens per minute |\n| 
`rpm` | Requests per minute |\n\n资料来源：[datatune/llm/model_rate_limits.py:1-50]()\n\nDefault rate limits for unknown models fall back to `gpt-3.5-turbo` settings with a warning logged.\n\n资料来源：[datatune/llm/llm.py:20-35]()\n\n## Primitive Operations\n\n### Map Operation\n\nCreates new columns by applying LLM-based transformations to input columns.\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `prompt` | str | Natural language description of the transformation |\n| `input_fields` | list | Input column names |\n| `output_fields` | list | Names for new columns |\n\n### Filter Operation\n\nRemoves rows that don't meet the specified criteria.\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `prompt` | str | Natural language filtering criteria |\n| `input_fields` | list | Columns to evaluate |\n\n## Next Steps\n\n- **[Documentation](https://docs.datatune.ai/)** - Complete guides and API reference\n- **[Examples](https://github.com/vitalops/datatune/tree/main/examples)** - Real-world use cases\n- **[Discord](https://discord.gg/3RKA5AryQX)** - Community support\n- **[Issues](https://github.com/vitalops/datatune/issues)** - Report bugs and request features\n\n资料来源：[README.md:105-113]()\n\n---\n\n<a id='page-architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[Dask Backend](#page-dask-backend), [Ibis Backend](#page-ibis-backend), [LLM Integration](#page-llm-integration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- [datatune/agent/agent.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/agent.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- 
[datatune/core/ibis/map_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/map_ibis.py)\n- [datatune/core/ibis/filter_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/filter_ibis.py)\n- [datatune/core/deduplication.py](https://github.com/vitalops/datatune/blob/main/datatune/core/deduplication.py)\n- [datatune/llm/model_rate_limits.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/model_rate_limits.py)\n- [datatune/agent/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/__init__.py)\n</details>\n\n# System Architecture\n\n## Overview\n\nDatatune is a data transformation framework that leverages Large Language Models (LLMs) to perform semantic operations on distributed data. The system provides a declarative interface for common data manipulation tasks like mapping, filtering, and deduplication while abstracting the complexity of distributed computing and LLM interaction.\n\nThe architecture follows a modular design with three primary layers:\n\n| Layer | Purpose | Key Components |\n|-------|---------|----------------|\n| **LLM Layer** | Model abstraction and rate limiting | `LLM`, `OpenAI`, `Ollama`, `VLLM`, `Azure` |\n| **Core Operations Layer** | Data transformation primitives | `Map`, `Filter`, `Reduce`, `Deduplication` |\n| **Agent Layer** | High-level orchestration and planning | `Agent`, Plan execution engine |\n\n资料来源：[datatune/llm/llm.py:1-100]()\n\n## High-Level Architecture Diagram\n\n```mermaid\ngraph TB\n    subgraph \"Client Layer\"\n        User[User Code]\n    end\n    \n    subgraph \"LLM Layer\"\n        LLM[Base LLM Class]\n        OpenAI[OpenAI Provider]\n        Ollama[Ollama Provider]\n        VLLM[VLLM Provider]\n        Azure[Azure Provider]\n        RateLimits[model_rate_limits]\n    end\n    \n    subgraph \"Core Operations Layer\"\n        Map[Map Operation]\n        Filter[Filter Operation]\n        Reduce[Reduce Operation]\n        
Dedupe[Deduplication]\n    end\n    \n    subgraph \"Data Backend Layer\"\n        Dask[Dask DataFrame]\n        Ibis[\"Ibis (DuckDB, PostgreSQL, BigQuery)\"]\n    end\n    \n    subgraph \"Agent Layer\"\n        Agent[Agent Orchestrator]\n        Planner[Plan Generator]\n        Executor[Plan Executor]\n    end\n    \n    User --> Agent\n    User --> Map\n    User --> Filter\n    \n    Agent --> Planner\n    Agent --> Executor\n    Agent --> LLM\n    \n    Map --> LLM\n    Filter --> LLM\n    \n    Map --> Dask\n    Map --> Ibis\n    Filter --> Dask\n    Filter --> Ibis\n    \n    LLM --> RateLimits\n```\n\n## LLM Layer\n\nThe LLM layer provides a unified interface for interacting with various LLM providers while handling rate limiting and token counting.\n\n### Class Hierarchy\n\n```mermaid\ngraph TD\n    LLM[LLM Base Class] --> Ollama[Ollama]\n    LLM --> OpenAI[OpenAI]\n    LLM --> VLLM[VLLM]\n    LLM --> Azure[Azure]\n```\n\n### Base LLM Class\n\nThe `LLM` class serves as the foundation for all LLM providers. 
It handles model naming, rate limit configuration, and request batching.\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `model_name` | `str` | Model identifier in format `provider/model` (e.g., `openai/gpt-3.5-turbo`) |\n| `tpm` | `int` | Tokens per minute limit |\n| `rpm` | `int` | Requests per minute limit |\n| `api_key` | `str` | API authentication key |\n| `api_base` | `str` | Base URL for API endpoint |\n\n资料来源：[datatune/llm/llm.py:70-95]()\n\n### Provider Implementations\n\n**OpenAI Provider**\n\n```python\nclass OpenAI(LLM):\n    def __init__(\n        self, model_name: str = \"gpt-3.5-turbo\", api_key: Optional[str] = None, **kwargs\n    ):\n        kwargs.update({\"api_key\": api_key})\n        super().__init__(model_name=f\"openai/{model_name}\", **kwargs)\n```\n\n资料来源：[datatune/llm/llm.py:96-103]()\n\n**Ollama Provider (Local)**\n\n```python\nclass Ollama(LLM):\n    def __init__(\n        self, model_name=\"gemma3:4b\", api_base=\"http://localhost:11434\", **kwargs\n    ) -> None:\n        super().__init__(\n            model_name=f\"ollama_chat/{model_name}\", api_base=api_base, **kwargs\n        )\n```\n\n资料来源：[datatune/llm/llm.py:81-87]()\n\n**VLLM Provider**\n\nThe VLLM provider automatically retrieves the `max_model_len` from the API and configures token limits accordingly.\n\n```python\nclass VLLM(LLM):\n    def __init__(self, model_name: str, api_base: str = \"http://localhost:8000/v1\", **kwargs):\n        import httpx\n        resp = httpx.get(f\"{api_base}/models\")\n        max_model_len = resp.json()[\"data\"][0][\"max_model_len\"]\n        kwargs.update({\"api_base\": api_base, \"api_key\": \"dummy\", \"max_tokens\": max_model_len})\n        super().__init__(model_name=f\"openai/{model_name}\", **kwargs)\n```\n\n资料来源：[datatune/llm/llm.py:105-118]()\n\n### Rate Limiting\n\nThe system maintains predefined rate limits for various models in `model_rate_limits.py`:\n\n| Model Family | TPM | RPM 
|\n|--------------|-----|-----|\n| `gpt-3.5-turbo` | 200,000 | 500 |\n| `gpt-4` | 10,000 | 500 |\n| `gpt-4-turbo` | 30,000 | 500 |\n| `gpt-4.1-mini/nano` | 200,000 | 500 |\n\n资料来源：[datatune/llm/model_rate_limits.py:1-50]()\n\n## Core Operations Layer\n\nThe core operations layer provides primitive transformations that can be applied to distributed data.\n\n### Map Operation\n\nThe `Map` operation creates new columns by applying LLM-based transformations to existing data.\n\n```mermaid\ngraph LR\n    A[Input DataFrame] --> B[Serialize Rows]\n    B --> C[Batch to LLM]\n    C --> D[Parse Responses]\n    D --> E[Extract New Columns]\n    E --> F[Output DataFrame]\n```\n\n**Input Format**: Rows are serialized to Python dictionaries with the following structure:\n\n```\nindex=<row_index>|{key1: value1, key2: value2, ...}\n```\n\n**Output Format**: LLM returns dictionaries with original keys plus new field keys:\n\n```\nindex=<row_index>|{key1: value1, key2: value2, new_field: new_value}\n```\n\n资料来源：[datatune/core/dask/map_dask.py:1-50]()\n\n**Key Features**:\n\n- Handles duplicate rows via canonical mapping\n- Supports batched API calls for efficiency\n- Falls back to canonical row values for duplicates\n\n### Filter Operation\n\nThe `Filter` operation removes rows that do not meet specified criteria using LLM-based evaluation.\n\n```mermaid\ngraph LR\n    A[Input DataFrame] --> B[Serialize Rows]\n    B --> C[LLM Decision]\n    C --> D{Keep Row?}\n    D -->|Yes| E[Include in Output]\n    D -->|No| F[Exclude Row]\n```\n\n**Decision Format**: Each row receives a `__filter__` key:\n\n```\nindex=<row_index>|{..., __filter__: True/False}\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:1-50]()\n\n### Deduplication\n\nThe deduplication system uses embeddings and FAISS for efficient similarity search:\n\n| Component | Description |\n|-----------|-------------|\n| `embedding_model` | Model for generating text embeddings |\n| `sim_threshold` | Minimum similarity score (default: 
0.90) |\n| `top_k` | Number of neighbors to search |\n| `hnsw_m` | HNSW index parameter for m |\n| `ef_search` | HNSW search parameter |\n\n**Workflow**:\n\n1. Embed column values in batches of 256\n2. Build FAISS HNSW index per partition\n3. Search for similar records above threshold\n4. Cluster duplicates with canonical ID\n\n资料来源：[datatune/core/deduplication.py:1-80]()\n\n## Agent Layer\n\nThe Agent layer provides high-level orchestration, automatically determining which operations to use and generating execution plans.\n\n```mermaid\ngraph TD\n    A[User Goal] --> B[Plan Generator]\n    B --> C[JSON Plan]\n    C --> D[Plan Executor]\n    D --> E[Step 1: Operation]\n    D --> F[Step 2: Operation]\n    D --> G[Step N: Operation]\n    E --> H[Finalize]\n    F --> H\n    G --> H\n    H --> I[Computed Result]\n```\n\n### Plan Structure\n\nPlans are generated as JSON arrays of steps:\n\n```json\n[\n  {\n    \"type\": \"primitive\",\n    \"operation\": \"map\",\n    \"params\": {\n      \"subprompt\": \"Extract category from industry\",\n      \"input_fields\": [\"Industry\"],\n      \"output_fields\": [\"Category\"]\n    }\n  },\n  {\n    \"type\": \"dask\",\n    \"operation\": \"add_column\",\n    \"params\": {\n      \"new_column\": \"Year\",\n      \"expression\": \"df['Date'].dt.year\"\n    }\n  }\n]\n```\n\n资料来源：[datatune/agent/agent.py:80-120]()\n\n### Step Types\n\n| Type | Description | Operations |\n|------|-------------|------------|\n| `primitive` | LLM-based transformations | `map`, `filter` |\n| `dask` | Native Dask operations | `add_column`, `group_by`, `apply_function`, `rename_columns`, `astype_column` |\n\n### Template System\n\nTemplates define how operations are translated to executable code:\n\n```python\nTEMPLATE = {\n    \"primitive\": {\n        \"Map\": \"df = dt.Map(...)(self.llm, df)\",\n        \"Filter\": \"df = dt.Filter(...)(self.llm, df)\"\n    },\n    \"dask\": {\n        \"add_column\": \"df = df.assign({new_column}=lambda x: 
{expression})\",\n        \"group_by_agg\": \"df = df.groupby({group_columns}).agg({aggregations})\"\n    }\n}\n```\n\n资料来源：[datatune/agent/agent.py:40-70]()\n\n## Data Backend Support\n\n### Dask Integration\n\nDatatune primarily uses Dask for distributed computing. Operations are executed partition-by-partition with lazy evaluation:\n\n```python\n# Example: Map operation on Dask DataFrame\ndf = dd.read_csv(\"data.csv\")\nmapped = dt.map(\n    prompt=\"Extract sentiment from text\",\n    output_fields=[\"sentiment\"],\n    input_fields=[\"text\"]\n)(llm, df)\n\n# Finalize and compute\nresult = dt.finalize(mapped).compute()\n```\n\n### Ibis Integration\n\nIbis provides support for multiple backends (DuckDB, PostgreSQL, BigQuery):\n\n```python\n# DuckDB example\nimport ibis\ncon = ibis.duckdb.connect(\"data.duckdb\")\ntable = con.table(\"my_table\")\n\n# Apply map operation\nmapped_table = dt.map(\n    prompt=\"Extract category\",\n    output_fields=[\"category\"],\n    input_fields=[\"description\"]\n)(llm, table)\n```\n\n资料来源：[datatune/core/ibis/map_ibis.py:1-50]()\n\n## Request Batching\n\nThe LLM layer implements intelligent batching to optimize API usage:\n\n```mermaid\ngraph TD\n    A[Input Rows] --> B[Calculate Token Budget]\n    B --> C{Within Limits?}\n    C -->|Yes| D[Batch All]\n    C -->|No| E[Split into Batches]\n    E --> F[Execute Batch 1]\n    E --> G[Execute Batch 2]\n    F --> H[Merge Results]\n    G --> H\n    D --> H\n```\n\n**Token Calculation**:\n\n- Prefix/suffix tokens are calculated via `token_counter`\n- `nrows_per_api_call` determines optimal batch sizes\n- Rate limits (TPM/RPM) are enforced per model\n\n资料来源：[datatune/llm/llm.py:30-70]()\n\n## Error Handling\n\nThe system implements robust error handling at multiple levels:\n\n| Layer | Error Handling |\n|-------|----------------|\n| **Agent** | Catches execution errors, returns step number and error message |\n| **Core Operations** | Gracefully handles malformed LLM responses with 
fallback to empty dict |\n| **LLM** | Retries with exponential backoff on transient failures |\n\n```python\ndef parse_filter_output(output: Union[str, Exception], err: bool = True) -> Optional[bool]:\n    try:\n        d = ast.literal_eval(str(output).strip())\n        if d:\n            last_key = list(d.keys())[-1]\n            flag = d[last_key]\n            return flag\n    except Exception:\n        return None\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:50-65]()\n\n## Configuration Options\n\n### LLM Configuration\n\n| Option | Default | Description |\n|--------|---------|-------------|\n| `model_name` | `gpt-3.5-turbo` | Model identifier |\n| `tpm` | Model-specific | Tokens per minute limit |\n| `rpm` | Model-specific | Requests per minute limit |\n| `api_key` | `None` | API authentication |\n| `api_base` | Provider-specific | API endpoint URL |\n\n### Operation Configuration\n\n| Option | Description |\n|--------|-------------|\n| `prompt` | Natural language instruction |\n| `input_fields` | Columns to use as input |\n| `output_fields` | New columns to create (Map only) |\n| `optimized` | Enable batching optimization |\n\n## Summary\n\nDatatune's system architecture provides:\n\n1. **Provider Abstraction**: Unified interface across OpenAI, Ollama, VLLM, and Azure\n2. **Rate Limiting**: Automatic enforcement of TPM/RPM limits per model\n3. **Distributed Execution**: Partition-based processing for large datasets\n4. **Multi-Backend Support**: Native support for Dask and Ibis backends\n5. **Intelligent Planning**: Agent-based orchestration for complex multi-step transformations\n6. **Semantic Operations**: LLM-powered Map and Filter for text understanding\n7. 
**Deduplication**: Embedding-based similarity search with FAISS\n\n---\n\n<a id='page-map-operations'></a>\n\n## Map Operations\n\n### 相关页面\n\n相关主题：[Filter Operations](#page-filter-operations), [Reduce Operations](#page-reduce-operations), [Dask Backend](#page-dask-backend)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/core/map.py](https://github.com/vitalops/datatune/blob/main/datatune/core/map.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- [datatune/core/ibis/map_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/map_ibis.py)\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- [datatune/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/__init__.py)\n- [datatune/agent/agent.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/agent.py)\n</details>\n\n# Map Operations\n\nMap Operations in Datatune enable LLM-powered column transformations on distributed dataframes. Users describe desired transformations in natural language, and the system generates new columns by leveraging large language models to process each row semantically.\n\n## Overview\n\nThe Map operation is a core primitive in Datatune that bridges natural language instructions with data transformations. It accepts a user-defined prompt describing what new columns to create, identifies relevant input columns, and produces one or more output columns populated by LLM inference. 
资料来源：[datatune/__init__.py:3]()\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n```\n\n资料来源：[README.md:18-29]()\n\n## Architecture\n\nThe Map system is designed with a pluggable backend architecture, supporting both Dask and Ibis execution contexts. Each backend implements the same logical interface while adapting to its specific dataframe API.\n\n### Component Flow\n\n```mermaid\ngraph TD\n    A[User Prompt + Input/Output Fields] --> B[map Function Entry Point]\n    B --> C{Backend Type}\n    C -->|Dask| D[map_dask.py]\n    C -->|Ibis| E[map_ibis.py]\n    D --> F[LLM Batch Processor]\n    E --> F\n    F --> G[Parse LLM Output]\n    G --> H[Merge Results to DataFrame]\n    H --> I[Output DataFrame with New Columns]\n```\n\n### Backend Implementations\n\n| Backend | File | Purpose |\n|---------|------|---------|\n| Dask | `datatune/core/dask/map_dask.py` | Distributed DataFrame operations using Dask |\n| Ibis | `datatune/core/ibis/map_ibis.py` | SQL-like backend supporting DuckDB, PostgreSQL, BigQuery |\n\n## API Reference\n\n### Function Signature\n\n```python\ndef map(\n    prompt: str,\n    output_fields: List[str],\n    input_fields: Optional[List[str]] = None,\n    **kwargs\n) -> Callable\n```\n\n### Parameters\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `prompt` | `str` | Yes | Natural language description of the transformation to apply |\n| `output_fields` | `List[str]` | Yes | Names of columns to create in the output |\n| `input_fields` | `List[str]` | Optional | Input columns to pass to the LLM; if omitted, all columns are used 
|\n\n资料来源：[datatune/core/dask/map_dask.py:1-50]()\n\n### Return Value\n\nReturns a callable that accepts `(llm, dataframe)` and returns a transformed dataframe with the new columns appended.\n\n## Dask Backend Implementation\n\n### Data Flow\n\n```mermaid\ngraph LR\n    A[Partitioned DataFrame] --> B[Serialize Input Columns]\n    B --> C[Batch Rows to LLM]\n    C --> D[LLM Inference]\n    D --> E[Parse Dictionary Output]\n    E --> F[Assign to Output Columns]\n    F --> G[Merge Canonical Results to Duplicates]\n    G --> H[Return Partitioned DataFrame]\n```\n\n### Prompt Construction\n\nThe Dask implementation constructs prompts using a prefix-suffix pattern to ensure consistent LLM responses.\n\n**Prefix Structure:**\n```\nUse the following context to create new columns from existing columns.\n```\n\n资料来源：[datatune/core/dask/map_dask.py:10-15]()\n\n**Suffix Structure:**\n```python\nsuffix = (\n    f\"{os.linesep}{os.linesep}\"\n    \"Your response MUST be the entire input record as a valid Python dictionary in the format\"\n    \"'index=<row_index>|{key1: value1, key2: value2, ...}'  with added keys of expected new fields if any.\"\n     \n    \"ALWAYS START YOUR RESPONSE WITH 'index=<row_index>|' WHERE <row_index> IS THE INDEX OF THE ROW.\" \\\n    \"IF A VALUE FOR A COLUMN DOES NOT EXIST SET IT TO null\" \\\n    \"'index=<row_index>|{key1: None, key2: value2, ...}'\"\n)\n```\n\n资料来源：[datatune/core/dask/map_dask.py:22-30]()\n\n### Duplicate Handling\n\nThe Dask Map implementation includes sophisticated duplicate handling for semantic deduplication scenarios. 
When clusters of duplicate rows exist, only the canonical row is sent to the LLM, and results are propagated to duplicates.\n\n```python\ndup_to_canon = {\n    dup: c[\"canonical_id\"]\n    for c in clusters\n    for dup in c[\"duplicate_ids\"]\n}\n\ncanonical_idx = input_series.index.difference(dup_to_canon.keys())\ncanonical_input = input_series.loc[canonical_idx]\n\nllm_out = llm(canonical_input, prefix, prompt, suffix, optimized=True)\n\ndf.loc[canonical_idx, llm_output_column] = llm_out\n\nfor dup, canon in dup_to_canon.items():\n    df.loc[dup, llm_output_column] = df.loc[canon, llm_output_column]\n```\n\n资料来源：[datatune/core/dask/map_dask.py:35-50]()\n\n### Output Parsing\n\nThe LLM output is parsed using `parse_llm_output`, which converts the string response to a Python dictionary.\n\n```python\ndef parse_llm_output(llm_output: Union[str, Exception]) -> Union[Dict, Exception]:\n    \"\"\"\n    Parses the LLM output string into a Python dictionary.\n    \"\"\"\n```\n\n资料来源：[datatune/core/dask/map_dask.py:75-85]()\n\n## Ibis Backend Implementation\n\n### Data Flow\n\n```mermaid\ngraph LR\n    A[Ibis Table] --> B[Add Row ID Column]\n    B --> C[Execute to Local Data]\n    C --> D[Convert to List]\n    D --> E[Batch to LLM]\n    E --> F[Parse Results]\n    F --> G[Create MemTable]\n    G --> H[Join Back to Original Table]\n```\n\n### Row Indexing\n\nIbis operations require explicit row indexing since SQL tables don't maintain native pandas-like indices.\n\n```python\nindexed_table = table.mutate(_ROW_ID_=ibis.row_number().cast(\"int64\"))\n\nlocal_data = indexed_table.select(\"_ROW_ID_\", input_col).execute()\n    \ninput_list = local_data[input_col].tolist()\n```\n\n资料来源：[datatune/core/ibis/map_ibis.py:25-32]()\n\n### Result Processing\n\nRaw LLM outputs are parsed and converted to JSON strings for storage:\n\n```python\nprocessed_results = []\nfor res in raw_results:\n    try:\n        py_dict = ast.literal_eval(str(res).strip())\n        
processed_results.append(json.dumps(py_dict))\n    except Exception:\n        processed_results.append(\"{}\")\n```\n\n资料来源：[datatune/core/ibis/map_ibis.py:45-52]()\n\n## LLM Integration\n\n### Supported LLM Providers\n\n| Provider | Class | Default Model |\n|----------|-------|---------------|\n| OpenAI | `OpenAI` | `gpt-3.5-turbo` |\n| Ollama | `Ollama` | `gemma3:4b` |\n| Azure | `Azure` | Configurable |\n| vLLM | `VLLM` | Configurable |\n\n资料来源：[datatune/llm/llm.py:1-60]()\n\n### Batching and Rate Limiting\n\nThe LLM layer implements token and request rate limiting based on model-specific constraints defined in `model_rate_limits.py`.\n\n```python\nmodel_rate_limits = {\n    \"gpt-3.5-turbo\": {\n        \"tpm\": 200_000,\n        \"rpm\": 500,\n    },\n    \"gpt-4\": {\n        \"tpm\": 10_000,\n        \"rpm\": 500,\n    },\n    \"gpt-4o\": {\n        \"tpm\": 30_000,\n        \"rpm\": 500,\n    },\n}\n```\n\n资料来源：[datatune/llm/model_rate_limits.py:1-20]()\n\n### Batch Processing\n\nThe LLM batches multiple rows into single API calls for efficiency:\n\n```python\nfor i, prompt in enumerate(input_rows):\n    ...  # Batch construction continues...\n```\n\n资料来源：[datatune/llm/llm.py:30-45]()\n\n## Agent Integration\n\nThe Datatune Agent can automatically generate Map operations as part of multi-step data transformation pipelines.\n\n### Agent System Prompt\n\n```python\nclass Agent(ABC):\n    system_prompt: str = \"\"\"You are Datatune Agent, a powerful assistant designed to help users with data processing tasks.\n    You are capable of generating python code to perform various operations on data. 
Apart from python builtins, you have the following libraries avaiable in your run time:\n    - pandas\n    - numpy\n    - dask\n\n    In addition to these, you also have access to the datatune libarary, which provides functionality for processing data using LLMs.\n    Map Example:\n    ```python\n    import datatune as dt\n    import dask.dataframe as dd\n    df = dd.read_csv(\"path/to/data.csv\")\n    map = dt.Map(prompt=\"Your prompt here\")\n    llm = dt.LLM(model_name=\"gpt-3.5-turbo\")\n    mapped_df = map(llm, df)\n    mapped_df = dt.finalize(mapped_df)\n    ```\n    \"\"\"\n```\n\n资料来源：[datatune/agent/__init__.py:1-30]()\n\n### Plan Generation\n\nWhen the Agent generates a plan requiring Map operations, it creates steps with the following structure:\n\n```python\n{\n    \"type\": \"primitive\",\n    \"operation\": \"map\",\n    \"params\": {\n        \"subprompt\": \"Extract category and sub-category from industry\",\n        \"input_fields\": [\"Industry\"],\n        \"output_fields\": [\"Category\",\"Sub-Category\"]\n    },\n}\n```\n\n资料来源：[datatune/agent/agent.py:50-65]()\n\n## Usage Examples\n\n### Basic Column Extraction\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-4\")\ndf = dd.read_csv(\"customers.csv\")\n\n# Extract sentiment and category from review text\nmapped = dt.map(\n    prompt=\"Analyze the customer feedback and extract sentiment (positive/negative/neutral) and main topic\",\n    output_fields=[\"Sentiment\", \"Topic\"],\n    input_fields=[\"CustomerFeedback\"]\n)(llm, df)\n\nresult = dt.finalize(mapped)\nresult.compute().to_csv(\"enriched_customers.csv\")\n```\n\n### Multi-Column Generation\n\n```python\n# Generate multiple derived columns in a single Map operation\nmapped = dt.map(\n    prompt=\"Parse the product description to identify: 1) the material, 2) primary color, 3) target demographic\",\n    output_fields=[\"Material\", \"Color\", 
\"Demographic\"],\n    input_fields=[\"ProductDescription\", \"ProductName\"]\n)(llm, df)\n```\n\n### With Agent\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\n\nllm = OpenAI(model_name=\"gpt-4\")\nagent = dt.Agent(llm)\n\ndf = dd.read_csv(\"data.csv\")\n# Agent automatically determines when to use Map operations\nresult = agent.do(\"Add ProfitMargin column and categorize by revenue tier\", df)\n```\n\n资料来源：[README.md:35-45]()\n\n## Best Practices\n\n### Input Field Selection\n\n| Practice | Rationale |\n|----------|-----------|\n| Provide specific input fields | Reduces token usage and improves accuracy |\n| Include context-rich columns | Helps LLM understand the domain |\n| Avoid overly large inputs | Prevents token limit issues |\n\n### Prompt Engineering\n\n- Be explicit about the expected output format in output field names\n- Include examples in prompts for complex transformations\n- Break complex extractions into multiple Map operations when needed\n\n### Performance Considerations\n\n| Factor | Impact | Recommendation |\n|--------|--------|----------------|\n| Batch size | Throughput vs. latency | Use default batching for most cases |\n| Model selection | Quality vs. 
speed | Use `gpt-3.5-turbo` for bulk, `gpt-4` for complex tasks |\n| Partition count | Parallelism | Match to cluster size for Dask operations |\n\n## See Also\n\n- [Filter Operations](#page-filter-operations) - Row-level filtering using natural language\n- [Agent](Agent.md) - Automatic operation selection and chaining\n- [Deduplication](Deduplication.md) - Semantic deduplication with Map integration\n- [Supported LLMs](LLMs.md) - Complete list of supported providers\n\n---\n\n<a id='page-filter-operations'></a>\n\n## Filter Operations\n\n### 相关页面\n\n相关主题：[Map Operations](#page-map-operations), [Reduce Operations](#page-reduce-operations), [Ibis Backend](#page-ibis-backend)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/core/filter.py](https://github.com/vitalops/datatune/blob/main/datatune/core/filter.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/ibis/filter_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/filter_ibis.py)\n- [datatune/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/__init__.py)\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n</details>\n\n# Filter Operations\n\nFilter Operations in Datatune enable intelligent, LLM-powered row filtering based on natural language criteria. Instead of writing explicit boolean conditions, users describe what rows to keep in plain English, and the LLM determines which records match the specified criteria.\n\n## Overview\n\nThe Filter module provides a declarative interface for semantic row-level filtering of data. 
It supports multiple backends (Dask DataFrames and Ibis tables) and uses LLMs to make filtering decisions based on natural language prompts.\n\nKey capabilities:\n- Natural language filter criteria specification\n- Multi-backend support (Dask, Ibis)\n- Semantic deduplication integration\n- Parallelized LLM inference\n- Structured output parsing\n\n## Architecture\n\nThe Filter system follows a dispatcher pattern that routes to backend-specific implementations based on the input data type.\n\n```mermaid\ngraph TD\n    A[User calls filter prompt, input_fields] --> B[datatune.filter function]\n    B --> C{Data Type Detection}\n    C -->|dask.dataframe.DataFrame| D[_filter_dask]\n    C -->|ibis.Table| E[_filter_ibis]\n    D --> F[LLM Batch Processing]\n    E --> F\n    F --> G[Parse Filter Output]\n    G --> H[Apply Boolean Mask]\n    H --> I[Filtered DataFrame/Table]\n```\n\n### Component Overview\n\n| Component | File | Responsibility |\n|-----------|------|----------------|\n| `filter()` | `datatune/core/filter.py` | Main entry point, type dispatch |\n| `_filter_dask()` | `datatune/core/dask/filter_dask.py` | Dask DataFrame filtering |\n| `_filter_ibis()` | `datatune/core/ibis/filter_ibis.py` | Ibis table filtering |\n| LLM Integration | `datatune/llm/llm.py` | Batch inference and token management |\n\n## API Reference\n\n### `dt.filter()`\n\nMain filter function exported from `datatune`.\n\n```python\ndef filter(*, prompt, input_fields=None, clusters=None):\n    def apply(llm, data):\n        # Implementation dispatch\n        ...\n    return apply\n```\n\n#### Parameters\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `prompt` | `str` | Yes | Natural language description of filter criteria |\n| `input_fields` | `List[str]` | No | Columns to use for filtering decisions |\n| `clusters` | `List[Dict]` | No | Semantic deduplication clusters for propagating filter decisions |\n\n#### Returns\n\nReturns a callable that 
accepts `(llm, data)` and returns a filtered DataFrame or Table.\n\n## Dask Implementation\n\nThe Dask backend processes rows in parallel across partitions, leveraging the Dask execution graph for distributed computation.\n\n### Source Files\n\n- Main implementation: `datatune/core/dask/filter_dask.py`\n- Dispatcher: `datatune/core/filter.py`\n\n### Workflow\n\n```mermaid\ngraph LR\n    A[Input DataFrame] --> B[Serialize Input Columns]\n    B --> C[Handle Clusters]\n    C --> D[Extract Canonical Indices]\n    D --> E[LLM Batch Inference]\n    E --> F[Parse Boolean Output]\n    F --> G[Propagate to Duplicates]\n    G --> H[Apply Boolean Mask]\n```\n\n### Key Functions\n\n#### `_filter_dask()`\n\nCore filtering function for Dask DataFrames:\n\n```python\ndef _filter_dask(\n    prompt: str,\n    input_fields: Optional[List[str]] = None,\n    clusters: Optional[List[Dict]] = None,\n):\n    # Validates inputs, constructs prompts, invokes LLM\n```\n\n#### `parse_filter_output()`\n\nParses LLM output string into boolean values:\n\n```python\ndef parse_filter_output(\n    output: Union[str, Exception], \n    err: bool = True\n) -> Optional[bool]:\n```\n\n## Ibis Implementation\n\nThe Ibis backend executes filtering operations at the database level, enabling pushdown to various SQL backends (DuckDB, PostgreSQL, BigQuery, etc.).\n\n### Source Files\n\n- Implementation: `datatune/core/ibis/filter_ibis.py`\n\n### Workflow\n\n```mermaid\ngraph TD\n    A[Input Table] --> B[Add Row ID Column]\n    B --> C[Select Input Columns]\n    C --> D[Execute to Local]\n    D --> E[LLM Batch Inference]\n    E --> F[Parse Filter Results]\n    F --> G[Build Boolean Filter]\n    G --> H[Filter Original Table]\n    H --> I[Remove Row ID Column]\n```\n\n### Key Functions\n\n#### `_filter_ibis()`\n\nCore filtering function for Ibis tables:\n\n```python\ndef _filter_ibis(\n    table: ibis.Table,\n    prompt: str,\n    input_fields: Optional[List[str]] = None,\n):\n    indexed_table = 
table.mutate(_ROW_ID_=ibis.row_number().cast(\"int64\"))\n```\n\nUses `ibis.row_number()` to assign sequential row IDs that match LLM response indexing. 资料来源：[datatune/core/ibis/filter_ibis.py:28]()\n\n## LLM Prompt Format\n\nThe Filter module constructs specialized prompts to guide LLM responses for filtering decisions.\n\n### Prompt Structure\n\n```\nFILTERING CRITERIA:\n<prompt>\n\n[input record dictionary]\n\nDECISION: Your response MUST be the entire input record as Python dictionary in the format:\nindex=<row_index>|{key1: value1, ...}<endofrow> with added key called '__filter__' \nwith value either True to KEEP the record or False to REMOVE it.\n```\n\n### Output Format\n\nThe LLM must return records with an added `__filter__` key:\n\n```python\nindex=0|{'Name': 'Acme Corp', 'Country': 'USA', '__filter__': True}<endofrow>\nindex=1|{'Name': 'Bad Inc', 'Country': 'UK', '__filter__': False}<endofrow>\n```\n\n#### Response Requirements\n\n| Requirement | Description |\n|-------------|-------------|\n| Index prefix | Each response starts with `index=<row_index>\\|` |\n| Complete record | Include all original keys plus `__filter__` |\n| Boolean value | `__filter__` must be `True` (keep) or `False` (remove) |\n| Missing values | Set to `None` if column value does not exist |\n\n资料来源：[datatune/core/dask/filter_dask.py:14-19]()\n\n## Semantic Deduplication Integration\n\nWhen `clusters` parameter is provided, filter decisions propagate from canonical records to their duplicates.\n\n### Cluster Structure\n\n```python\nclusters = [\n    {\n        \"canonical_id\": 5,\n        \"duplicate_ids\": [12, 45, 78]\n    },\n    ...\n]\n```\n\n### Propagation Logic\n\n```mermaid\ngraph TD\n    A[Clusters Input] --> B[Build dup_to_canon Map]\n    B --> C[Extract Canonical Indices]\n    C --> D[Filter Only Canonical Rows]\n    D --> E[LLM Inference on Canonical]\n    E --> F[Propagate Decisions to Duplicates]\n    F --> G[Merge Results]\n```\n\nThe mapping 
builds:\n```python\ndup_to_canon = {\n    dup: c[\"canonical_id\"]\n    for c in clusters\n    for dup in c[\"duplicate_ids\"]\n}\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:41-46]()\n\n## Usage Examples\n\n### Basic Dask Filtering\n\n```python\nimport datatune as dt\nimport dask.dataframe as dd\nfrom datatune.llm.llm import OpenAI\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n\n# Keep only electronics products\nfiltered = dt.filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Name\", \"Description\"]\n)(llm, df)\n\nresult = dt.finalize(filtered)\n```\n\n### Ibis Table Filtering\n\n```python\nimport datatune as dt\nimport ibis\n\ncon = ibis.duckdb.connect(\"data.duckdb\")\ntable = con.table(\"customers\")\n\n# Filter using natural language\nfiltered = dt.filter(\n    prompt=\"Keep only customers from African countries\",\n    input_fields=[\"Country\", \"Region\"]\n)(llm, table)\n```\n\n### With Semantic Deduplication\n\n```python\n# First deduplicate to find clusters\ndeduplicator = dt.SemanticDeduplicator(llm=llm)\nclusters = deduplicator.find_duplicates(df, columns=[\"Name\", \"Address\"])\n\n# Filter with cluster awareness\nfiltered = dt.filter(\n    prompt=\"Keep only verified organizations\",\n    input_fields=[\"Status\", \"VerificationDate\"],\n    clusters=clusters\n)(llm, df)\n```\n\n## Error Handling\n\nThe `parse_filter_output()` function handles various error cases:\n\n| Error Type | Handling | Return Value |\n|------------|----------|--------------|\n| Parse failure | Returns `None` if `err=False` | `None` |\n| Exception | Returns exception if `err=True` | Original exception |\n| Empty output | Returns `None` | `None` |\n| Malformed dict | Falls back to `None` | `None` |\n\n资料来源：[datatune/core/dask/filter_dask.py:64-79]()\n\n## Rate Limiting\n\nFilter operations respect LLM rate limits defined in `datatune/llm/model_rate_limits.py`:\n\n| Model | TPM (tokens/min) | RPM (requests/min) 
|\n|-------|------------------|---------------------|\n| gpt-3.5-turbo | 200,000 | 500 |\n| gpt-4 | 10,000 | 500 |\n| gpt-4-turbo | 30,000 | 500 |\n\nThe batching system automatically respects these limits during batch LLM inference.\n\n## Performance Considerations\n\n### Batching Strategy\n\nRows are batched for LLM inference to optimize throughput:\n\n```python\n# From datatune/llm/llm.py\nfor i, prompt in enumerate(input_rows):\n    # Batch prompts based on token limits\n    ...  # Accumulate until batch size reached\n```\n\n### Partition Parallelism (Dask)\n\nDask DataFrames process partitions in parallel:\n\n```mermaid\ngraph LR\n    A[Partition 0] --> E[LLM Batch]\n    B[Partition 1] --> F[LLM Batch]\n    C[Partition 2] --> G[LLM Batch]\n    E --> H[Results Merged]\n    F --> H\n    G --> H\n```\n\n### Backend Selection\n\n| Backend | Best For | Limitations |\n|---------|----------|-------------|\n| Dask | Large datasets, local processing | Memory-bound |\n| Ibis | Database pushdown, SQL backends | Requires database connection |\n\n## See Also\n\n- [Map Operations](./Map.md) - Creating new columns via LLM\n- [Semantic Deduplication](./SemanticDeduplication.md) - Finding duplicate records\n- [Agent](./Agent.md) - Automatic operation planning\n- [LLM Configuration](../llm/llm.md) - LLM setup and backends\n\n---\n\n<a id='page-reduce-operations'></a>\n\n## Reduce Operations\n\n### 相关页面\n\n相关主题：[Map Operations](#page-map-operations), [Filter Operations](#page-filter-operations)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/core/reduce.py](https://github.com/vitalops/datatune/blob/main/datatune/core/reduce.py)\n- [datatune/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/__init__.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- 
[datatune/core/deduplication.py](https://github.com/vitalops/datatune/blob/main/datatune/core/deduplication.py)\n</details>\n\n# Reduce Operations\n\n## Overview\n\nReduce Operations in datatune provide a mechanism for aggregating and condensing data using configurable action handlers. The reduce system follows a plugin-based architecture where specific aggregation or reduction behaviors are registered as actions and invoked through a unified interface.\n\nThe core purpose of reduce operations is to transform a DataFrame into aggregated results by applying registered reduction actions. This enables users to perform complex aggregation tasks that go beyond standard group-by operations, leveraging LLM-powered semantic understanding when needed.\n\n资料来源：[datatune/core/reduce.py:1-22]()\n\n## Architecture\n\n### Action Registry Pattern\n\nThe reduce system implements a decorator-based registry pattern for action registration:\n\n```mermaid\ngraph TD\n    A[User calls reduce df action] --> B[reduce function invoked]\n    B --> C[get_action looks up registry]\n    C --> D{Action found?}\n    D -->|Yes| E[Instantiate action class]\n    D -->|No| F[Raise ValueError]\n    E --> G[Execute reduction on DataFrame]\n    G --> H[Return aggregated result]\n```\n\nThe `_ACTIONS` dictionary serves as a central registry mapping action names to their implementation classes. 
The `register_action` decorator allows new actions to be dynamically registered without modifying core logic.\n\n资料来源：[datatune/core/reduce.py:1-22]()\n\n### Class Diagram\n\n```mermaid\nclassDiagram\n    class _ACTIONS {\n        Dict[str, Type]\n    }\n    \n    class register_action {\n        +decorator(name: str)\n    }\n    \n    class get_action {\n        +function(name: str) Type\n    }\n    \n    class reduce {\n        +function(df, action, **kwargs)\n    }\n    \n    register_action ..> _ACTIONS : registers\n    get_action ..> _ACTIONS : queries\n    reduce ..> get_action : uses\n```\n\n## API Reference\n\n### Main Function\n\n```python\ndef reduce(df, *, action: str, **kwargs):\n    cls = get_action(action)\n    reducer = cls(**kwargs)   \n    return reducer(df)\n```\n\n#### Parameters\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `df` | `dask.dataframe.DataFrame` | Yes | The input Dask DataFrame to reduce |\n| `action` | `str` | Yes | The name of the registered reduction action to execute |\n| `**kwargs` | Various | No | Additional keyword arguments passed to the action constructor |\n\n#### Returns\n\n| Return Type | Description |\n|-------------|-------------|\n| Any | Returns the result of the specific reduction action (varies by registered action) |\n\n#### Exceptions\n\n| Exception | Condition |\n|-----------|------------|\n| `ValueError` | When the specified action name is not registered |\n\n资料来源：[datatune/core/reduce.py:19-22]()\n\n### Register Action Decorator\n\n```python\ndef register_action(name):\n    def decorator(cls):\n        _ACTIONS[name] = cls\n        return cls\n    return decorator\n```\n\n#### Usage Pattern\n\n```python\n@register_action(\"my_reducer\")\nclass MyReducer:\n    def __init__(self, **kwargs):\n        # Initialize reducer configuration\n        pass\n    \n    def __call__(self, df):\n        # Perform reduction\n        return result\n```\n\n### Get Action 
Function\n\n```python\ndef get_action(name):\n    try:\n        return _ACTIONS[name]\n    except KeyError:\n        raise ValueError(f\"Unknown action: {name}\")\n```\n\n## Workflow Execution\n\n### Execution Flow\n\n```mermaid\ngraph TD\n    A[Start] --> B[Import reduce module]\n    B --> C[Call reduce df action]\n    C --> D[get_action retrieves class]\n    D --> E[Instantiate with kwargs]\n    E --> F[Call reducer on DataFrame]\n    F --> G[Return result]\n    G --> H[End]\n```\n\n### Integration with Datatune Core\n\nThe reduce operation integrates with other datatune components through the public API:\n\n```python\nimport datatune as dt\n\n# reduce is exported from datatune namespace\nresult = dt.reduce(df, action=\"specific_action\", param=value)\n```\n\n资料来源：[datatune/__init__.py:1-7]()\n\n## Registered Actions\n\nBased on the repository structure, reduce operations support extensibility through custom registered actions. The system is designed to work with Dask DataFrames and can integrate with:\n\n- **SemanticDeduplicator**: For deduplication using embedding-based similarity matching\n- **Custom aggregations**: User-defined reduction logic via the decorator pattern\n\n### SemanticDeduplicator Integration\n\nWhile primarily a deduplication tool, the `SemanticDeduplicator` class in `datatune/core/deduplication.py` demonstrates how reduce-style operations work:\n\n```python\nclass SemanticDeduplicator:\n    def __init__(\n        self,\n        llm,\n        embedding_model: str = \"text-embedding-3-small\",\n        sim_threshold: float = 0.90,\n        top_k: int = 50,\n        hnsw_m: int = 32,\n        ef_search: int = 64,\n        return_df: bool = False,\n    ):\n        # Configuration initialization\n```\n\nThe deduplicator processes data through embedding generation, FAISS index building, and similarity search phases, demonstrating how reduce operations handle complex multi-stage 
transformations.\n\n资料来源：[datatune/core/deduplication.py:1-150]()\n\n## Extension Points\n\n### Creating Custom Reduce Actions\n\nTo create a custom reduction action:\n\n1. Define a class with appropriate `__init__` and `__call__` methods\n2. Decorate with `@register_action(\"action_name\")`\n3. The class should accept `**kwargs` for flexible configuration\n4. Implement `__call__(self, df)` to receive the DataFrame and return the result\n\n```python\nfrom datatune.core.reduce import register_action\n\n@register_action(\"custom_aggregate\")\nclass CustomAggregateReducer:\n    def __init__(self, group_by_column, agg_column, operation=\"sum\", **kwargs):\n        self.group_by_column = group_by_column\n        self.agg_column = agg_column\n        self.operation = operation\n    \n    def __call__(self, df):\n        # Custom reduction logic\n        return result_df\n```\n\n### Best Practices\n\n| Practice | Rationale |\n|----------|-----------|\n| Use descriptive action names | Ensures clear identification in `get_action` calls |\n| Accept `**kwargs` in `__init__` | Maintains compatibility with the reduce function signature |\n| Return consistent types | Enables predictable downstream processing |\n| Handle empty DataFrames | Prevents errors during edge case processing |\n\n## Related Components\n\n| Component | File | Relationship |\n|-----------|------|--------------|\n| Filter | `datatune/core/dask/filter_dask.py` | Similar pattern for row-level operations |\n| Map | `datatune/core/dask/map_dask.py` | Column transformation counterpart |\n| Finalize | `datatune/core/dask/op.py` | Completes lazy Dask computations |\n| Agent | `datatune/agent/agent.py` | Orchestrates multi-step transformations |\n\n资料来源：[datatune/core/dask/filter_dask.py:1-50]()\n资料来源：[datatune/core/dask/map_dask.py:1-50]()\n\n## Summary\n\nReduce Operations provide a flexible, extensible mechanism for aggregating and transforming data in datatune. 
The action registry pattern enables:\n\n- **Dynamic action registration** via the `@register_action` decorator\n- **Centralized action lookup** through the `get_action` function\n- **Unified API** through the `reduce` function interface\n- **Custom extensibility** for domain-specific reduction logic\n\nThe architecture follows proven design patterns that separate action registration from execution, allowing the core system to remain stable while supporting diverse reduction behaviors through plugin-style extensions.\n\n---\n\n<a id='page-dask-backend'></a>\n\n## Dask Backend\n\n### 相关页面\n\n相关主题：[Ibis Backend](#page-ibis-backend), [System Architecture](#page-architecture)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- [datatune/core/dask/op.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/op.py)\n- [datatune/core/dask/constants.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/constants.py)\n- [datatune/core/map.py](https://github.com/vitalops/datatune/blob/main/datatune/core/map.py)\n- [datatune/core/filter.py](https://github.com/vitalops/datatune/blob/main/datatune/core/filter.py)\n</details>\n\n# Dask Backend\n\nThe Dask Backend is the primary data processing engine in datatune, enabling large-scale, distributed data transformations using Large Language Models (LLMs). It extends the popular Dask library with semantic understanding capabilities, allowing users to perform complex data operations through natural language prompts while maintaining Dask's efficient parallel and out-of-core computation model.\n\n## Architecture Overview\n\nThe Dask Backend follows a modular architecture with three main layers:\n\n1. 
**API Layer** (`datatune/core/map.py`, `datatune/core/filter.py`) - Entry points that dispatch to appropriate backends\n2. **Operation Layer** (`datatune/core/dask/`) - Core transformations for Dask DataFrames\n3. **LLM Integration Layer** - Handles communication with language models for semantic operations\n\n```mermaid\ngraph TD\n    A[User Data] --> B[Dask DataFrame]\n    B --> C{Operation Type}\n    C -->|Map| D[map_dask.py]\n    C -->|Filter| E[filter_dask.py]\n    D --> F[LLM API]\n    E --> F\n    F --> G[Parse & Validate]\n    G --> H[New Columns Added]\n    H --> I[Lazy Dask DataFrame]\n    I --> J[finalize]\n    J --> K[Computed Result]\n    \n    L[Clusters] -.->|Optional| D\n    L -.->|Optional| E\n```\n\n## Core Operations\n\n### Map Operation\n\nThe map operation uses an LLM to generate new columns from existing data through natural language understanding. It is designed for semantic extraction, classification, and content transformation tasks.\n\n**Source**: `datatune/core/dask/map_dask.py`\n\n#### How Map Works\n\n```mermaid\ngraph LR\n    A[Input DataFrame] --> B[Serialize Rows]\n    B --> C[Cluster Deduplication]\n    C --> D[Batch for LLM]\n    D --> E[LLM Processing]\n    E --> F[Parse Output]\n    F --> G[Expand to Columns]\n    G --> H[Copy to Duplicates]\n    H --> I[Output DataFrame]\n```\n\n#### Key Components\n\n| Component | Description |\n|-----------|-------------|\n| `serialized_input_column` | Temporary column storing row data as Python dictionaries |\n| `llm_output_column` | Temporary column storing raw LLM responses |\n| `clusters` | Optional list of duplicate clusters for deduplication |\n\n#### Processing Flow\n\n1. Each row is serialized to a dictionary format\n2. If clusters are provided, duplicate handling is applied - only canonical rows are sent to LLM\n3. LLM receives batched prompts with mapping instructions\n4. Responses are parsed and new columns are extracted\n5. 
Results are propagated back to duplicate rows\n\n**Key code snippet from map_dask.py**:\n```python\ndf.loc[canonical_idx, llm_output_column] = llm_out\n\nfor dup, canon in dup_to_canon.items():\n    df.loc[dup, llm_output_column] = df.loc[canon, llm_output_column]\n```\n\nThis ensures duplicate handling by copying canonical results to all duplicate rows.\n\n资料来源：[datatune/core/dask/map_dask.py:34-48]()\n\n### Filter Operation\n\nThe filter operation uses an LLM to determine which rows to keep or remove based on natural language criteria.\n\n**Source**: `datatune/core/dask/filter_dask.py`\n\n#### Filter Output Format\n\nThe LLM returns decisions in a specific format with an added `__filter__` key:\n\n```python\nindex=<row_index>|{key1: value1, ..., '__filter__': True/False}<endofrow>\n```\n\n| Decision | Behavior |\n|----------|----------|\n| `True` | Keep the row |\n| `False` | Remove the row |\n\n#### Processing Pipeline\n\n```mermaid\ngraph TD\n    A[Input DataFrame] --> B[Serialize to Dictionaries]\n    B --> C[Apply Cluster Deduplication]\n    C --> D[Send to LLM with Filter Prompt]\n    D --> E[Parse __filter__ Value]\n    E --> F[Filter DataFrame]\n    F --> G[Output DataFrame]\n```\n\n**Key code snippet for filter decision parsing**:\n```python\nsuffix = (\n    f\"{os.linesep}{os.linesep}\"\n    \"DECISION:Your response MUST be the entire input record as Python dictionary...\"\n    \"ALWAYS STICK TO THE FORMAT index=<row_index>|{...} with added key called '__filter__'...\"\n)\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:24-35]()\n\n### Finalize Operation\n\nThe `finalize` operation computes the lazy Dask DataFrame and converts it to a pandas DataFrame.\n\n**Source**: `datatune/core/dask/op.py`\n\n```python\ndef finalize(df: dd.DataFrame) -> pd.DataFrame:\n    \"\"\"Compute the lazy Dask DataFrame and return a pandas DataFrame.\"\"\"\n    return df.compute()\n```\n\n资料来源：[datatune/core/dask/op.py]()\n\n## Semantic Deduplication\n\nBoth map and filter 
operations support semantic deduplication to reduce LLM API calls and costs.\n\n**Source**: `datatune/core/deduplication.py`\n\n### How Deduplication Works\n\n```mermaid\ngraph TD\n    A[Input Series] --> B[Find Duplicate Clusters]\n    B --> C{dup_to_canon mapping}\n    C --> D[Extract Canonical Index]\n    D --> E[Send Only Canonical to LLM]\n    E --> F[Copy Results to Duplicates]\n    \n    style C fill:#f9f,color:#000\n```\n\n### Cluster Format\n\n```python\nclusters = [\n    {\"canonical_id\": 5, \"duplicate_ids\": [12, 23, 45]},\n    {\"canonical_id\": 10, \"duplicate_ids\": [15, 20]}\n]\n```\n\nThis approach sends only unique/canonical rows to the LLM, then propagates results to all duplicates.\n\n资料来源：[datatune/core/dask/map_dask.py:34-40]()\n\n## API Reference\n\n### Map Function\n\n```python\nfrom datatune.core.map import map\n\nresult = map(\n    prompt=\"Extract categories from the description\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"],\n    clusters=None  # Optional deduplication clusters\n)(llm, dask_dataframe)\n```\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `prompt` | str | Yes | Natural language instructions for column generation |\n| `output_fields` | List[str] | Yes | Names of new columns to create |\n| `input_fields` | List[str] | No | Columns to use as input (defaults to all) |\n| `clusters` | List[Dict] | No | Duplicate clusters for deduplication |\n\n### Filter Function\n\n```python\nfrom datatune.core.filter import filter\n\nresult = filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Name\", \"Description\"],\n    clusters=None  # Optional deduplication clusters\n)(llm, dask_dataframe)\n```\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `prompt` | str | Yes | Natural language criteria for filtering |\n| `input_fields` | List[str] | No | Columns to 
evaluate against |\n| `clusters` | List[Dict] | No | Duplicate clusters for deduplication |\n\n### Finalize Function\n\n```python\nfrom datatune.core.dask.op import finalize\n\npandas_df = finalize(lazy_dask_dataframe)\n```\n\n资料来源：[datatune/core/dask/op.py]()\n\n## LLM Output Format\n\nThe Dask Backend requires LLM responses in a specific parseable format for reliable operation.\n\n### Map Output Format\n\n```\nindex=<row_index>|{key1: value1, key2: value2, ..., 'new_field1': val, 'new_field2': val}<endofrow>\n```\n\n### Filter Output Format\n\n```\nindex=<row_index>|{key1: value1, key2: value2, ..., '__filter__': True/False}<endofrow>\n```\n\n### Parsing Rules\n\n| Rule | Description |\n|------|-------------|\n| String values | Must be enclosed in double quotes |\n| Missing values | Set to `None` or `null` |\n| Index | Must match row index exactly |\n| Format | Must be valid Python literal (dictionary) |\n\n资料来源：[datatune/core/dask/map_dask.py:20-30]()\n\n## Usage Examples\n\n### Example 1: Category Extraction\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n\n# Extract categories using natural language\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n\n# Save results\nresult = dt.finalize(mapped)\nresult.compute().to_csv(\"categorized_products.csv\")\n```\n\n资料来源：[README.md]()\n\n### Example 2: Text-Based Filtering\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"companies.csv\")\n\n# Filter based on semantic criteria\nfiltered = dt.filter(\n    prompt=\"Keep only organizations in Africa\",\n    input_fields=[\"Country\", \"Description\"]\n)(llm, 
df)\n\nresult = dt.finalize(filtered)\n```\n\n### Example 3: With Deduplication\n\n```python\n# Assuming clusters from semantic deduplication step\nclusters = [\n    {\"canonical_id\": 5, \"duplicate_ids\": [12, 23, 45]},\n    {\"canonical_id\": 10, \"duplicate_ids\": [15, 20]}\n]\n\nmapped = dt.map(\n    prompt=\"Classify the industry sector\",\n    output_fields=[\"Industry\", \"Sector\"],\n    input_fields=[\"CompanyName\", \"Description\"],\n    clusters=clusters\n)(llm, df)\n```\n\n## Internal Data Flow\n\n### Serialization Process\n\nThe backend converts DataFrame rows to dictionary format for LLM processing:\n\n```mermaid\ngraph LR\n    A[DataFrame Row] --> B[Convert to Dict]\n    B --> C[Replace NaN with None]\n    C --> D[\"String: {col1: val1, col2: val2}\"]\n    D --> E[LLM Prompt]\n```\n\n**Implementation from map_dask.py**:\n```python\ndf = df.astype(object).where(df.notna(), None)\ndf[serialized_input_column] = [\n    str(row.to_dict()) for _, row in df.iterrows()\n]\n```\n\n资料来源：[datatune/core/dask/map_dask.py:50-56]()\n\n## Error Handling\n\nThe backend includes robust error handling for LLM parsing failures:\n\n```python\ndef parse_llm_output(llm_output: Union[str, Exception]) -> Union[Dict, Exception]:\n    \"\"\"\n    Parses the LLM output string into a Python dictionary.\n    \"\"\"\n    # Returns dict on success, Exception on failure\n```\n\nFailed parsing attempts return empty dictionaries `{}`, ensuring the pipeline continues rather than crashing.\n\n## Backend Dispatch Mechanism\n\nThe system automatically dispatches to the Dask backend when a Dask DataFrame is detected:\n\n```mermaid\ngraph TD\n    A[Input Data] --> B{Is Dask DataFrame?}\n    B -->|Yes| C[Dask Backend]\n    B -->|No| D{Is Ibis Table?}\n    D -->|Yes| E[Ibis Backend]\n    D -->|No| F[TypeError]\n    \n    C --> G[_map_dask / _filter_dask]\n    E --> H[_map_ibis / _filter_ibis]\n```\n\n**Dispatch logic from core/map.py**:\n```python\ndef _is_dask_df(obj):\n    try:\n        
import dask.dataframe as dd\n        return isinstance(obj, dd.DataFrame)\n    except ImportError:\n        return False\n```\n\n资料来源：[datatune/core/map.py:1-17]()\n\n## Configuration Considerations\n\n### Memory Efficiency\n\nThe Dask backend processes data partition by partition, making it suitable for datasets larger than memory. Consider:\n\n| Setting | Recommendation |\n|---------|----------------|\n| Dask partitions | Set based on available memory |\n| Batch size | Adjust based on LLM rate limits |\n| Clusters | Use for high-duplicate datasets |\n\n### LLM Rate Limits\n\nThe backend implements batching to respect LLM API rate limits while maximizing throughput.\n\n## Summary\n\nThe Dask Backend provides a powerful, scalable mechanism for performing LLM-powered data transformations on Dask DataFrames. Key capabilities include:\n\n- **Map**: Generate new columns through semantic understanding\n- **Filter**: Remove rows based on natural language criteria  \n- **Deduplication**: Reduce costs by processing only canonical rows\n- **Lazy Evaluation**: Maintain Dask's efficient computation model until finalization\n\nAll operations follow a consistent pattern: serialize data, send to LLM with clear instructions, parse responses, and update the DataFrame.\n\n---\n\n<a id='page-ibis-backend'></a>\n\n## Ibis Backend\n\n### 相关页面\n\n相关主题：[Dask Backend](#page-dask-backend), [System Architecture](#page-architecture)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/core/ibis/filter_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/filter_ibis.py)\n- [datatune/core/ibis/map_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/map_ibis.py)\n</details>\n\n# Ibis Backend\n\n## Overview\n\nThe Ibis Backend in datatune provides integration with the Ibis project, enabling LLM-powered data transformations on various database backends including DuckDB, PostgreSQL, BigQuery, and more. 
This backend extends datatune's core functionality (map, filter) to work with Ibis tables, leveraging the composability and SQL-generation capabilities of Ibis while incorporating LLM-based semantic transformations.\n\nThe Ibis Backend serves as a critical abstraction layer that:\n\n- Accepts Ibis table objects as input\n- Serializes table data for LLM consumption\n- Executes batch inference operations via LLM calls\n- Parses and applies LLM outputs back to the table structure\n- Returns transformed Ibis tables for continued processing\n\n资料来源：[datatune/core/ibis/filter_ibis.py:1-20](datatune/core/ibis/filter_ibis.py)\n资料来源：[datatune/core/ibis/map_ibis.py:1-25](datatune/core/ibis/map_ibis.py)\n\n## Architecture\n\n### System Components\n\nThe Ibis Backend consists of several interconnected components that work together to provide seamless LLM-powered transformations:\n\n```mermaid\ngraph TD\n    A[Ibis Table Input] --> B[add_serialized_col]\n    B --> C[llm_batch_inference]\n    C --> D[LLM API Call]\n    D --> E[Parse LLM Output]\n    E --> F[apply_llm_updates / Filter Logic]\n    F --> G[Transformed Ibis Table]\n    \n    H[_map_ibis Class] --> I[Map Operations]\n    J[_filter_ibis Class] --> K[Filter Operations]\n```\n\n### Component Responsibilities\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| `add_serialized_col` | filter_ibis.py, map_ibis.py | Serializes Ibis table columns into JSON strings for LLM input |\n| `llm_batch_inference` | filter_ibis.py, map_ibis.py | Manages batch inference with LLM, handles retries and error cases |\n| `_map_ibis` | map_ibis.py | Handles mapping operations (creating new columns via LLM) |\n| `_filter_ibis` | filter_ibis.py | Handles filtering operations (row selection via LLM) |\n\n资料来源：[datatune/core/ibis/filter_ibis.py:11-32](datatune/core/ibis/filter_ibis.py)\n资料来源：[datatune/core/ibis/map_ibis.py:12-38](datatune/core/ibis/map_ibis.py)\n\n## Core Functions\n\n### add_serialized_col\n\nThe 
`add_serialized_col` function transforms Ibis table columns into a serialized JSON format suitable for LLM processing. This function is fundamental to both map and filter operations.\n\n**Function Signature:**\n\n```python\ndef add_serialized_col(table, target_col: str, input_fields: Optional[List[str]] = []):\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `table` | `ibis.Table` | The input Ibis table |\n| `target_col` | `str` | Name of the column to store serialized data |\n| `input_fields` | `Optional[List[str]]` | List of column names to include in serialization. If empty, includes all columns except target |\n\n**Behavior:**\n\n1. Filters columns to include only relevant fields (excluding target column and non-selected fields)\n2. Constructs a JSON string expression by concatenating column names and values\n3. Returns a new Ibis table with the serialized column added via `mutate()`\n\n资料来源：[datatune/core/ibis/map_ibis.py:40-53](datatune/core/ibis/map_ibis.py)\n资料来源：[datatune/core/ibis/filter_ibis.py:37-51](datatune/core/ibis/filter_ibis.py)\n\n### llm_batch_inference\n\nThe `llm_batch_inference` function manages the core LLM interaction for transforming data. 
It handles the complete workflow from prompt construction to result collection.\n\n**Function Signature:**\n\n```python\ndef llm_batch_inference(\n    table,\n    llm: Callable, \n    input_col: str, \n    output_col: str,   \n    prompt: str,\n    expected_new_fields: list[str] = []\n):\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `table` | `ibis.Table` | The input Ibis table |\n| `llm` | `Callable` | LLM callable (e.g., OpenAI, Ollama instance) |\n| `input_col` | `str` | Column containing serialized input data |\n| `output_col` | `str` | Column name for storing LLM output |\n| `prompt` | `str` | Transformation prompt for the LLM |\n| `expected_new_fields` | `list[str]` | Optional list of expected output field names |\n\n**Workflow:**\n\n```mermaid\nsequenceDiagram\n    participant Table as Ibis Table\n    participant Func as llm_batch_inference\n    participant LLM as LLM API\n    participant Parse as Output Parser\n    \n    Table->>Func: Execute table.select() to local DataFrame\n    Func->>Table: Get input_list from input_col\n    Func->>LLM: Call llm(input_list, prefix, prompt, suffix)\n    LLM-->>Func: Raw LLM responses\n    Func->>Parse: Process each response\n    Parse-->>Func: Validated JSON strings\n    Func->>Table: Create mapping DataFrame\n    Table-->>Func: Updated table with output_col\n```\n\n资料来源：[datatune/core/ibis/map_ibis.py:55-82](datatune/core/ibis/map_ibis.py)\n资料来源：[datatune/core/ibis/filter_ibis.py:54-78](datatune/core/ibis/filter_ibis.py)\n\n## Map Operations\n\n### _map_ibis Class\n\nThe `_map_ibis` class implements the mapping transformation for Ibis tables, enabling the creation of new columns based on LLM-driven transformations.\n\n**Class Definition:**\n\n```python\nclass _map_ibis:\n    def __init__(\n        self,\n        prompt: str,\n        input_fields: Optional[List[str]] = None,\n        output_fields: Optional[List[str]] = None,\n    ):\n        self.prompt = prompt\n  
      self.input_fields = input_fields or []\n        self.output_fields = output_fields or []\n        self.llm_output_column = \"MAP_LLM_OUTPUT__DATATUNE__\"\n        self.serialized_input_column = \"MAP_SERIALIZED_INPUT__DATATUNE__\"\n```\n\n**Key Attributes:**\n\n| Attribute | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `prompt` | `str` | Required | Natural language prompt describing the transformation |\n| `input_fields` | `Optional[List[str]]` | `[]` | Columns to use as input for transformation |\n| `output_fields` | `Optional[List[str]]` | `[]` | Names of new columns to create |\n| `llm_output_column` | `str` | System-generated | Internal column for LLM response storage |\n| `serialized_input_column` | `str` | System-generated | Internal column for serialized input |\n\n**Execution Flow:**\n\n1. Validates that all `input_fields` exist in the table schema\n2. Creates serialized input column via `add_serialized_col`\n3. Executes batch inference via `llm_batch_inference`\n4. Applies LLM updates to create output columns\n5. 
Selects final columns (removing internal columns)\n\n资料来源：[datatune/core/ibis/map_ibis.py:122-160](datatune/core/ibis/map_ibis.py)\n\n### Prompt Format for Map\n\nThe LLM receives structured prompts for mapping operations:\n\n```\nMap and transform the input according to the mapping criteria below.\nReplace or Create new fields or values as per the prompt.\nExpected new fields: [output_fields]\n\nMAPPING CRITERIA:\n{prompt}\n\nYour response MUST be the entire input record as a valid Python dictionary\nin the format 'index=<row_index>|{key1: value1, key2: value2, ...}' with added keys\nof expected new fields if any.\n```\n\n资料来源：[datatune/core/ibis/map_ibis.py:56-75](datatune/core/ibis/map_ibis.py)\n\n## Filter Operations\n\n### _filter_ibis Class\n\nThe `_filter_ibis` class implements filtering transformations for Ibis tables, enabling row-level selection based on LLM-driven criteria.\n\n**Class Definition:**\n\n```python\nclass _filter_ibis:\n    def __init__(\n        self,\n        prompt: str,\n        input_fields: Optional[List[str]] = None,\n    ):\n        self.prompt = prompt\n        self.input_fields = input_fields or []\n        self.llm_output_column = \"FILTER_LLM_OUTPUT__DATATUNE__\"\n        self.serialized_input_column = \"FILTER_SERIALIZED_INPUT__DATATUNE__\"\n```\n\n**Key Attributes:**\n\n| Attribute | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `prompt` | `str` | Required | Natural language criteria for filtering |\n| `input_fields` | `Optional[List[str]]` | `[]` | Columns to consider for filtering decision |\n| `llm_output_column` | `str` | System-generated | Internal column for LLM response storage |\n| `serialized_input_column` | `str` | System-generated | Internal column for serialized input |\n\n**Execution Flow:**\n\n```mermaid\ngraph TD\n    A[Input Ibis Table] --> B[Add Serialized Column]\n    B --> C[Execute to Local DataFrame]\n    C --> D[LLM Batch Inference]\n    D --> E[Parse Filter Decisions]\n   
 E --> F[Extract __filter__ Flag]\n    F --> G[Apply Boolean Filter]\n    G --> H[Final Ibis Table]\n```\n\n资料来源：[datatune/core/ibis/filter_ibis.py:80-120](datatune/core/ibis/filter_ibis.py)\n\n### Prompt Format for Filter\n\nThe LLM receives structured prompts for filtering operations:\n\n```\nYou are filtering a dataset. Your task is to determine whether each data record\nshould be KEPT or REMOVED based on the filtering criteria below.\nReturn the entire input data record with an added key called '__filter__' with\nvalue either True to KEEP the record or False to REMOVE it.\n\nFILTERING CRITERIA:\n{prompt}\n\nYour response MUST be the entire input record as a Python dictionary in the format:\nindex=<row_index>|{key1: value1, key2: value2, ...}<endofrow> with added key called\n'__filter__' with value either True to KEEP the record or False to REMOVE it.\n```\n\n资料来源：[datatune/core/ibis/filter_ibis.py:56-68](datatune/core/ibis/filter_ibis.py)\n\n## Output Parsing\n\n### parse_filter_output\n\nThe `parse_filter_output` function extracts boolean filter decisions from LLM responses.\n\n**Function Signature:**\n\n```python\ndef parse_filter_output(\n    output: Union[str, Exception], err: bool = True\n) -> Optional[bool]:\n```\n\n**Parsing Logic:**\n\n1. Attempts to evaluate the LLM output as a Python literal using `ast.literal_eval`\n2. Extracts the last key-value pair from the dictionary\n3. Returns the boolean value associated with the `__filter__` key\n4. 
Returns `None` if parsing fails\n\n资料来源：[datatune/core/ibis/filter_ibis.py:104-130](datatune/core/ibis/filter_ibis.py)\n\n### Output Format Requirements\n\nThe Ibis Backend enforces strict output format requirements for LLM responses:\n\n| Requirement | Format | Example |\n|-------------|--------|---------|\n| Index prefix | `index=<row_index>\\|` | `index=0\\|` |\n| Data format | Python dictionary literal | `{'col1': 'value1', 'col2': 'value2'}` |\n| String quoting | Double quotes only | `\"string\"` not `'string'` |\n| Null values | `None` | `{'col': None}` |\n| Row terminator | `<endofrow>` | `index=0\\|{...}<endofrow>` |\n\n资料来源：[datatune/core/ibis/map_ibis.py:70-75](datatune/core/ibis/map_ibis.py)\n资料来源：[datatune/core/ibis/filter_ibis.py:64-67](datatune/core/ibis/filter_ibis.py)\n\n## Usage Examples\n\n### Basic Map Operation\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport ibis\n\n# Connect to DuckDB\ncon = ibis.duckdb.connect(\"data.duckdb\")\ntable = con.table(\"products\")\n\n# Initialize LLM\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\n\n# Create mapping operation\nmapped = dt.map(\n    prompt=\"Extract product category and subcategory from description\",\n    input_fields=[\"description\", \"name\"],\n    output_fields=[\"Category\", \"Subcategory\"]\n)(llm, table)\n```\n\n### Basic Filter Operation\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import Ollama\nimport ibis\n\n# Connect to PostgreSQL via Ibis\ncon = ibis.postgres.connect(...)\ntable = con.table(\"orders\")\n\n# Initialize local LLM\nllm = Ollama(model_name=\"gemma3:4b\")\n\n# Create filter operation\nfiltered = dt.filter(\n    prompt=\"Keep only orders from enterprise customers with value over $10,000\",\n    input_fields=[\"customer_type\", \"order_value\"]\n)(llm, table)\n```\n\n## Error Handling\n\n### Schema Validation\n\nThe Ibis Backend performs input schema validation before processing:\n\n```python\ndef __call__(self, llm: Callable, table: Table) -> 
Table:\n    if self.input_fields:\n        missing = [f for f in self.input_fields if f not in table.columns]\n        if missing:\n            error_msg = (\n                f\"[datatune] Schema mismatch: The following input_fields were not found: {missing}. \"\n                f\"Available columns: {list(table.columns)}\"\n            )\n            raise ValueError(error_msg)\n```\n\n**Validation Rules:**\n\n| Check | Condition | Action on Failure |\n|-------|-----------|-------------------|\n| Input fields exist | `input_fields` subset of `table.columns` | Raise `ValueError` |\n| LLM call success | No exception from `llm()` | Return error string |\n| Output parsing | Valid Python literal | Return empty dict `{}` |\n\n资料来源：[datatune/core/ibis/map_ibis.py:130-140](datatune/core/ibis/map_ibis.py)\n\n## Internal Column Naming\n\nThe Ibis Backend uses prefixed column names to avoid conflicts with user data:\n\n| Operation | Serialized Column | Output Column |\n|-----------|-------------------|---------------|\n| Map | `MAP_SERIALIZED_INPUT__DATATUNE__` | `MAP_LLM_OUTPUT__DATATUNE__` |\n| Filter | `FILTER_SERIALIZED_INPUT__DATATUNE__` | `FILTER_LLM_OUTPUT__DATATUNE__` |\n\nThese columns are automatically removed from the final output table, ensuring clean results.\n\n## Integration with Core API\n\nThe Ibis Backend is invoked through the unified datatune API via the `filter.py` module:\n\n```python\ndef filter(*, prompt, input_fields=None, clusters=None):\n    def apply(llm, data):\n        if _is_ibis_table(data):\n            from .ibis.filter_ibis import _filter_ibis\n            return _filter_ibis(\n                prompt=prompt,\n                input_fields=input_fields,\n            )(llm, data)\n        # ... 
other backends\n```\n\n资料来源：[datatune/core/filter.py:15-28](datatune/core/filter.py)\n\n## Supported Database Backends\n\nThe Ibis Backend supports any database backend compatible with Ibis:\n\n| Backend | Connection Method | Example |\n|---------|-------------------|---------|\n| DuckDB | `ibis.duckdb.connect()` | Local analytical queries |\n| PostgreSQL | `ibis.postgres.connect()` | Enterprise data |\n| BigQuery | `ibis.bigquery.connect()` | Cloud data warehousing |\n| SQLite | `ibis.sqlite.connect()` | Lightweight embedded DB |\n| MySQL | `ibis.mysql.connect()` | Web applications |\n\n## Performance Considerations\n\n### Batch Processing\n\nThe Ibis Backend processes data in batches determined by:\n\n1. **Local execution boundary**: Data is materialized to a local DataFrame via `.execute()`\n2. **LLM rate limits**: Respects TPM (tokens per minute) and RPM (requests per minute) limits\n3. **Memory constraints**: Batch sizes are controlled by the underlying LLM integration\n\n### Optimization Strategies\n\n| Strategy | Description | Applicable Operation |\n|----------|-------------|---------------------|\n| Select specific fields | Use `input_fields` to minimize data transfer | Map, Filter |\n| Pushdown predicates | Filter data in the database before LLM processing | Filter |\n| Column pruning | Remove unused columns early in the pipeline | Map |\n\n## See Also\n\n- [Dask Backend](#page-dask-backend) - Alternative backend for distributed computing\n- [LLM Integrations](#page-llm-integration) - Supported LLM providers\n- [Core API](../core/api.md) - Unified transformation API\n\n---\n\n<a id='page-llm-integration'></a>\n\n## LLM Integration\n\n### 相关页面\n\n相关主题：[Agent System](#page-agent-system), [Map Operations](#page-map-operations), [Filter Operations](#page-filter-operations)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- 
[datatune/llm/model_rate_limits.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/model_rate_limits.py)\n- [datatune/llm/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/__init__.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n</details>\n\n# LLM Integration\n\nThe LLM Integration module in datatune provides a unified abstraction layer for interfacing with various Large Language Model providers. It handles rate limiting, token management, request batching, and provider-specific configurations, enabling consistent data transformation operations across different LLM backends.\n\n## Architecture Overview\n\nThe LLM module uses a class hierarchy where a base `LLM` class provides common functionality, while provider-specific subclasses handle individual vendor implementations.\n\n```mermaid\ngraph TD\n    A[LLM Base Class] --> B[OpenAI]\n    A --> C[Ollama]\n    A --> D[VLLM]\n    A --> E[Azure]\n    A --> F[Mistral]\n    A --> G[Huggingface]\n    A --> H[Gemini]\n    \n    I[litellm] --> A\n    J[Rate Limits] --> A\n    K[Token Counter] --> A\n```\n\n## Core Components\n\n### LLM Base Class\n\nThe `LLM` class serves as the foundation for all provider implementations. 
It manages rate limiting, token counting, and prompt batching.\n\n#### Initialization Parameters\n\n| Parameter | Type | Description | Default |\n|-----------|------|-------------|---------|\n| `model_name` | `str` | Full model identifier (provider/model format) | Required |\n| `rpm` | `int` | Requests per minute limit | Provider default |\n| `tpm` | `int` | Tokens per minute limit | Provider default |\n| `max_tokens` | `int` | Maximum tokens in response | Provider default |\n| `temperature` | `float` | Sampling temperature | `0.0` |\n\n#### Key Methods\n\n| Method | Description |\n|--------|-------------|\n| `_completion(prompt)` | Execute a single completion request |\n| `_create_batched_prompts(input_rows, user_batch_prefix, prompt_per_row)` | Create optimized batched prompts |\n| `get_max_tokens()` | Retrieve max tokens for current model |\n\n资料来源：[datatune/llm/llm.py:40-90]()\n\n### Provider Implementations\n\n#### OpenAI\n\n```python\nfrom datatune.llm.llm import OpenAI\n\nllm = OpenAI(\n    model_name=\"gpt-3.5-turbo\",\n    api_key=\"your-api-key\"\n)\n```\n\nConfigures models in the `openai/model-name` format for LiteLLM compatibility.\n\n资料来源：[datatune/llm/llm.py:95-102]()\n\n#### Ollama (Local)\n\n```python\nfrom datatune.llm.llm import Ollama\n\nllm = Ollama(\n    model_name=\"gemma3:4b\",\n    api_base=\"http://localhost:11434\"\n)\n```\n\nEnables local model inference with Ollama. 
The `api_base` defaults to `http://localhost:11434`.\n\n资料来源：[datatune/llm/llm.py:74-82]()\n\n#### VLLM\n\n```python\nfrom datatune.llm.llm import VLLM\n\nllm = VLLM(\n    model_name=\"your-model\",\n    api_base=\"http://localhost:8000/v1\"\n)\n```\n\nQueries the `/models` endpoint to automatically determine `max_model_len` from the server.\n\n资料来源：[datatune/llm/llm.py:104-122]()\n\n#### Azure\n\n```python\nfrom datatune.llm.llm import Azure\n\nllm = Azure(\n    model_name=\"gpt-3.5-turbo\",\n    api_key=\"your-key\",\n    api_base=\"your-endpoint\",\n    api_version=\"2024-01-01\"\n)\n```\n\nSupports Azure OpenAI Service deployments with additional configuration options.\n\n#### Mistral\n\n```python\nfrom datatune.llm.llm import Mistral\n\nllm = Mistral(\n    model_name=\"mistral-tiny\",\n    api_key=\"your-api-key\"\n)\n```\n\nWraps Mistral AI models with automatic `mistral/` prefix handling.\n\n#### Huggingface\n\n```python\nfrom datatune.llm.llm import Huggingface\n\nllm = Huggingface(\n    model_name=\"meta-llama/Llama-2-70b\",\n    api_key=\"your-api-key\"\n)\n```\n\nProvides access to Hugging Face Inference API models.\n\n## Rate Limiting\n\nThe module enforces two types of rate limits to prevent API throttling:\n\n| Limit Type | Description | Applied At |\n|------------|-------------|------------|\n| **TPM** | Tokens per minute | Per-request token counting |\n| **RPM** | Requests per minute | Per-API-call counting |\n\n### Model Rate Limits Configuration\n\nPredefined limits for major models are stored in `datatune/llm/model_rate_limits.py`:\n\n| Model | TPM | RPM |\n|-------|-----|-----|\n| `gpt-3.5-turbo` | 200,000 | 500 |\n| `gpt-4` | 10,000 | 500 |\n| `gpt-4-turbo` | 30,000 | 500 |\n| `gpt-4o` | 30,000 | 500 |\n| `gpt-4.1-mini` | 200,000 | 500 |\n| `gpt-4.1-nano` | 200,000 | 500 |\n| `gpt-4.5-preview` | 125,000 | 1,000 |\n\n资料来源：[datatune/llm/model_rate_limits.py:1-90]()\n\n### Fallback Behavior\n\nWhen an unknown model is used, the system falls back 
to `gpt-3.5-turbo` limits and logs a warning:\n\n```python\nif self._base_model_name in model_rate_limits:\n    model_limits = model_rate_limits[self._base_model_name]\nelse:\n    model_limits = model_rate_limits[DEFAULT_MODEL]\n    logger.warning(\n        f\"REQUESTS-PER-MINUTE limits for model '{model_name}' not found. \"\n        f\"Defaulting to '{DEFAULT_MODEL}' limits\"\n    )\n```\n\n资料来源：[datatune/llm/llm.py:45-57]()\n\n## Prompt Formatting\n\nThe LLM module enforces strict output format requirements for parsing reliability.\n\n### Output Format Specification\n\nAll responses must follow this pattern:\n\n```\nindex=<row_index>|{python_literal}<endofrow>\n```\n\nSupported Python literals:\n- Lists: `index=0|['item1', 'item2', 'item3']<endofrow>`\n- Dicts: `index=1|{'key1': 'value1', 'key2': 2}<endofrow>`\n- Strings: `index=2|\"This is a string answer\"<endofrow>`\n\n### Prompt Components\n\n| Component | Purpose |\n|-----------|---------|\n| `prefix` | Instructions for row processing |\n| `user_batch_prefix` | Custom user instructions |\n| `suffix` | Output format requirements |\n\n资料来源：[datatune/llm/llm.py:1-20]()\n\n## Batch Processing\n\nThe batching system optimizes LLM usage by grouping multiple rows into single API requests while respecting token limits.\n\n```mermaid\ngraph LR\n    A[Input Rows] --> B[Token Counting]\n    B --> C{Within Limits?}\n    C -->|Yes| D[Batch Request]\n    C -->|No| E[Split Batch]\n    D --> F[LLM API]\n    E --> D\n    F --> G[Parse Output]\n    G --> H[Row Results]\n```\n\n### Batching Logic\n\n1. Calculate prefix/suffix token overhead using `token_counter`\n2. Iterate through input rows, accumulating tokens\n3. Split batches when approaching `max_tokens` limit\n4. 
Track `nrows_per_api_call` for output reconstruction\n\n资料来源：[datatune/llm/llm.py:30-70]()\n\n## Integration with Core Modules\n\n### Filter Operations\n\nThe filter module uses the LLM to evaluate row-level boolean conditions:\n\n```python\ndef filter_rows(llm, df, input_column, prompt, output_column):\n    prefix = \"Row data:\"\n    suffix = \"Your response MUST be... with added key '__filter__'\"\n    \n    llm_out = llm(\n        input_series,\n        prefix,\n        prompt,\n        suffix,\n        optimized=True\n    )\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:15-40]()\n\n### Map Operations\n\nThe map module leverages the LLM for column transformations:\n\n```python\ndef map_rows(llm, df, input_columns, prompt, output_columns):\n    suffix = \"Your response MUST be... with added keys of expected new fields\"\n    \n    llm_out = llm(\n        canonical_input,\n        prefix,\n        prompt,\n        suffix,\n        optimized=True\n    )\n```\n\n资料来源：[datatune/core/dask/map_dask.py:20-45]()\n\n## Usage Example\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\n# Initialize LLM\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\n\n# Use with datatune operations\ndf = dd.read_csv(\"products.csv\")\n\nmapped = dt.map(\n    prompt=\"Extract categories from description\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\"]\n)(llm, df)\n\nfiltered = dt.filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Category\"]\n)(llm, mapped)\n```\n\n## Module Exports\n\nThe `datatune.llm` package exports all provider classes:\n\n```python\nfrom datatune.llm import *\n\n# Available: LLM, OpenAI, Ollama, VLLM, Azure, Mistral, Huggingface\n```\n\n资料来源：[datatune/llm/__init__.py:1]()\n\n## Dependencies\n\n| Library | Purpose |\n|---------|---------|\n| `litellm` | Unified LLM API interface |\n| `token_counter` | Token estimation |\n| `get_max_tokens` | Model context limits 
|\n\n资料来源：[datatune/llm/llm.py:10-11]()\n\n---\n\n<a id='page-agent-system'></a>\n\n## Agent System\n\n### 相关页面\n\n相关主题：[LLM Integration](#page-llm-integration), [Map Operations](#page-map-operations), [Filter Operations](#page-filter-operations)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/agent/agent.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/agent.py)\n- [datatune/agent/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/__init__.py)\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n</details>\n\n# Agent System\n\nThe Agent System is the core intelligent orchestration layer in datatune that automatically determines, plans, and executes data transformation workflows based on natural language prompts. 
It abstracts the complexity of selecting between Dask operations and LLM-powered primitives (Map, Filter), allowing users to describe desired transformations in plain English while the system handles the implementation details.\n\n## Architecture Overview\n\nThe Agent System follows a planner-executor pattern where the LLM acts as the planner, generating a JSON-based execution plan, and the Agent class handles the runtime execution of that plan against Dask DataFrames.\n\n```mermaid\ngraph TD\n    A[User Natural Language Goal] --> B[Agent.do method]\n    B --> C[LLM Planner]\n    C --> D[JSON Execution Plan]\n    D --> E[Agent._execute_plan]\n    E --> F{Step Type}\n    F -->|dask| G[Dask Operations]\n    F -->|primitive| H[LLM Primitives<br/>Map/Filter]\n    G --> I[Runtime Execution]\n    H --> I\n    I --> J[Result DataFrame]\n```\n\n资料来源：[datatune/agent/agent.py:1-50]()\n\n## Core Components\n\n### Agent Base Class\n\nThe `Agent` class serves as the abstract base for all agent implementations and provides the foundational system prompt that defines the agent's capabilities.\n\n资料来源：[datatune/agent/__init__.py:1-50]()\n\n| Component | Description |\n|-----------|-------------|\n| `system_prompt` | Defines available libraries (pandas, numpy, dask, datatune) and example usage patterns |\n| `ABC` | Abstract base class pattern ensuring implementers provide core methods |\n| `abstractmethod` | Decorator marking `do` method as required implementation |\n\n```python\nfrom abc import ABC, abstractmethod\n\nclass Agent(ABC):\n    system_prompt: str = \"\"\"You are Datatune Agent, a powerful assistant designed to help users with data processing tasks.\n    You are capable of generating python code to perform various operations on data. 
Apart from python builtins, you have the following libraries avaiable in your run time:\n    - pandas\n    - numpy\n    - dask\n\n    In addition to these, you also have access to the datatune libarary, which provides functionality for processing data using LLMs.\n    ...\"\"\"\n```\n\n### Agent Implementation\n\nThe concrete `Agent` class in `datatune/agent/agent.py` implements the planner-executor logic.\n\n资料来源：[datatune/agent/agent.py:1-100]()\n\n| Attribute | Type | Purpose |\n|-----------|------|---------|\n| `llm` | `LLM` | Language model used for planning and primitive operations |\n| `history` | `List[Dict]` | Execution history for debugging and state tracking |\n| `verbose` | `bool` | Enable debug-level logging when True |\n| `TEMPLATE` | `dict` | Operation templates for code generation |\n\n```python\ndef __init__(self, llm: LLM, verbose: bool = False):\n    self.llm = llm\n    self.history: List[Dict[str, Any]] = []\n    self.verbose = verbose\n    if self.verbose:\n        logger.setLevel(logging.DEBUG)\n    else:\n        logger.setLevel(logging.INFO)\n```\n\n## Plan Structure\n\nThe Agent uses a JSON-based plan format where each step represents a single transformation operation. 
Plans are generated by the LLM based on user goals and the current DataFrame schema.\n\n资料来源：[datatune/agent/agent.py:100-150]()\n\n### Plan Step Schema\n\n| Field | Type | Required | Description |\n|-------|------|----------|-------------|\n| `type` | `string` | Yes | Either `\"dask\"` or `\"primitive\"` |\n| `operation` | `string` | Yes | Operation name within the step type |\n| `params` | `dict` | No | Parameters for Dask templates |\n| `subprompt` | `string` | No | LLM prompt for primitive operations |\n| `input_fields` | `list` | No | Input column names for the operation |\n| `output_fields` | `list` | No | Output column names (Map only) |\n\n### Step Types\n\n**Dask Operations** execute server-side Python code using Dask's computational graph:\n\n| Operation | Description |\n|-----------|-------------|\n| `add_column` | Create a new column from an expression |\n| `apply_function` | Apply an element-wise function to a column |\n| `rename_columns` | Rename columns using a mapping |\n| `astype_column` | Change a column's data type |\n| `group_by` | Group data and aggregate |\n\n资料来源：[datatune/agent/agent.py:50-100]()\n\n**Primitive Operations** use the LLM for row-level transformations:\n\n| Operation | Description | Use Case |\n|-----------|-------------|----------|\n| `Map` | Create new columns via LLM transformation | Semantic extraction, classification, interpretation |\n| `Filter` | Remove rows based on LLM criteria | Complex conditional filtering |\n\n资料来源：[datatune/core/dask/map_dask.py:1-50](), [datatune/core/dask/filter_dask.py:1-50]()\n\n## Plan Execution Flow\n\n```mermaid\ngraph TD\n    A[_execute_plan called] --> B[Set DataFrame in runtime]\n    B --> C[Initialize code_lines list]\n    C --> D{For each step in plan}\n    D -->|Has steps| E[Format step template]\n    E --> F[Log start message]\n    F --> G[Execute map_partitions with log_primitive]\n    G --> H[Increment step_num]\n    H --> I[Execute operation template]\n    I --> J[Log end 
message]\n    J --> D\n    D -->|No more steps| K[Call dt.finalize]\n    K --> L[Compute DataFrame]\n    L --> M[Return None, n_steps on success]\n    \n    N[Exception during execution] --> O[Return error, failed_step]\n```\n\n资料来源：[datatune/agent/agent.py:150-200]()\n\n### Execution Methods\n\n#### `_execute_plan(plan: List[Dict])`\n\nExecutes a complete multi-step plan sequentially.\n\n```python\ndef _execute_plan(self, plan: List[Dict]):\n    \"\"\"Execute a sequence of plan steps (Dask or primitive).\"\"\"\n    \n    self._set_df(self.df)\n    self.runtime.update({\"step_num\": 0, \"plan\": plan})\n    code_lines = []\n    n_steps = len(plan)\n    \n    for i, step in enumerate(plan, start=1):\n        # ... step formatting and execution ...\n    \n    # Final execution\n    full_code = (\n        \"\\n\".join(code_lines)\n        + \"\\n\\n\"\n        + \"df = dt.finalize(df)\\n\"\n        \"df = df.compute()\"\n    )\n    self.runtime.execute(full_code)\n    return None, n_steps\n```\n\n#### `_execute_step(step: Dict)`\n\nExecutes a single step with detailed error handling.\n\n```python\ndef _execute_step(self, step: Dict):\n    try:\n        if step[\"type\"] == \"dask\":\n            template = self.TEMPLATE[\"dask\"][step[\"operation\"]].format(**step[\"params\"])\n            self.runtime.execute(template + \"\\n_ = df.head()\")\n        elif step[\"type\"] == \"primitive\":\n            template = self.TEMPLATE[\"primitive\"][step[\"operation\"]].format(**step[\"params\"])\n            self.runtime.execute(template)\n        else:\n            raise ValueError(f\"Unknown step type: {step['type']}\")\n    except Exception as e:\n        error_msg = f\"{type(e).__name__}: {str(e)}\\n{traceback.format_exc()}\"\n        return error_msg\n```\n\n资料来源：[datatune/agent/agent.py:200-250]()\n\n## LLM Integration\n\nThe Agent integrates with the LLM system through `datatune/llm/llm.py` to generate execution plans and power primitive 
operations.\n\n资料来源：[datatune/llm/llm.py:1-50]()\n\n### Supported LLM Providers\n\n| Provider | Class | Default Model |\n|----------|-------|---------------|\n| OpenAI | `OpenAI` | `gpt-3.5-turbo` |\n| Ollama | `Ollama` | `gemma3:4b` |\n| Azure | `Azure` | Configurable |\n| vLLM | `VLLM` | Configurable |\n\n```python\nclass OpenAI(LLM):\n    def __init__(\n        self, model_name: str = \"gpt-3.5-turbo\", api_key: Optional[str] = None, **kwargs\n    ):\n        kwargs.update({\"api_key\": api_key})\n        super().__init__(model_name=f\"openai/{model_name}\", **kwargs)\n\nclass Ollama(LLM):\n    def __init__(\n        self, model_name=\"gemma3:4b\", api_base=\"http://localhost:11434\", **kwargs\n    ) -> None:\n        super().__init__(\n            model_name=f\"ollama_chat/{model_name}\", api_base=api_base, **kwargs\n        )\n```\n\n### Rate Limiting\n\nThe system includes model rate limits configuration to prevent API throttling:\n\n| Model | TPM | RPM |\n|-------|-----|-----|\n| `gpt-3.5-turbo` | 200,000 | 500 |\n| `gpt-4-turbo` | 30,000 | 500 |\n| `gpt-4o` | 30,000 | 500 |\n\n资料来源：[datatune/llm/model_rate_limits.py:1-50]()\n\n## Prompt Generation\n\nThe Agent constructs prompts dynamically based on the current state and goal.\n\n资料来源：[datatune/agent/agent.py:250-300]()\n\n### Prompt Components\n\n| Method | Purpose |\n|--------|---------|\n| `get_persona_prompt(goal)` | Generates the system instructions for plan generation |\n| `get_schema_prompt(df)` | Includes current DataFrame schema |\n| `get_error_prompt(error_msg, failed_step)` | Context for retry after failures |\n| `get_full_prompt(df, goal)` | Combines all components |\n\n```python\ndef get_schema_prompt(self, df: dd.DataFrame) -> str:\n    schema_prompt: str = f\"\"\"The current schema of the dataframe df is as follows:\n    {df.dtypes.to_string()}\n    \"\"\"\n    return schema_prompt\n\ndef get_error_prompt(self, error_msg: str, failed_step: Dict) -> str:\n    error_prompt: str = f\"\"\"\n    
The previous code execution failed with the following error:\n    Error: {error_msg}\n    \n    Failed step:\n    {failed_step}\n    \n    Provide the json plan with the corrected step. Make sure to:\n    1. Address the specific error mentioned above.\n    2. DO NOT include any explanations or comments.\n    \"\"\"\n    return error_prompt\n```\n\n## Primitive Operations\n\n### Map Operation\n\nThe Map primitive uses the LLM to create new columns by applying transformations to each row.\n\n资料来源：[datatune/core/dask/map_dask.py:1-80]()\n\n```python\n# Example: Extract categories from description\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n```\n\n**Output Format Requirements:**\n- Each response must be: `index=<index>|{key1: value1, key2: value2, ...}<endofrow>`\n- Strings must use double quotes\n- Missing values should be `null`\n\n### Filter Operation\n\nThe Filter primitive uses the LLM to evaluate row-level conditions.\n\n资料来源：[datatune/core/dask/filter_dask.py:1-80]()\n\n```python\n# Example: Keep only electronics products\nfiltered = dt.filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Name\"]\n)(llm, mapped)\n```\n\n**Decision Format:**\n- Each response must include `__filter__: True` (keep) or `__filter__: False` (remove)\n\n## Error Handling\n\nThe Agent implements comprehensive error handling at multiple levels:\n\n| Level | Handler | Action |\n|-------|---------|--------|\n| Step formatting | `ValueError` | Log and return error with step number |\n| Template lookup | `KeyError` | Raise with available operation names |\n| Runtime execution | `Exception` | Capture traceback, return formatted error |\n\n```python\nexcept Exception as e:\n    step_num = self.runtime[\"step_num\"]\n    logger.error(f\"❌ Step {step_num}/{n_steps} failed with error: {e}\")\n    return str(e), 
step_num - 1\n```\n\n资料来源：[datatune/logger.py:1-50]()\n\n## Usage Examples\n\n### Basic Agent Usage\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\nagent = dt.Agent(llm)\n\ndf = dd.read_csv(\"products.csv\")\nresult = agent.do(\"Add ProfitMargin column and keep only African organizations\", df)\nresult.compute().to_csv(\"african_products.csv\")\n```\n\n### Verbose Mode for Debugging\n\n```python\nagent = dt.Agent(llm, verbose=True)\nresult = agent.do(\"Complex transformation task\", df)\n```\n\nWith `verbose=True`, the Agent sets the logger to `DEBUG` level, outputting detailed execution information including step progress and intermediate states.\n\n### Chaining Operations\n\nThe Agent automatically chains multiple operations when a goal contains multiple tasks:\n\n```\nTASK: 1. create column a based on column x\n      2. create column b based on column y\n```\n\nThese tasks are combined into a single Map primitive step rather than executed as separate steps.\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：vitalops/datatune\n\n摘要：发现 7 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：能力坑 - 能力判断依赖假设。\n\n## 1. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | README/documentation is current enough for a first validation pass.\n\n## 2. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | last_activity_observed missing\n\n## 3. 
安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | no_demo; severity=medium\n\n## 4. 安全/权限坑 · 存在安全注意事项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：No sandbox install has been executed yet; downstream must verify before user use.\n- 对用户的影响：用户安装前需要知道权限边界和敏感操作。\n- 建议检查：转成明确权限清单和安全审查提示。\n- 防护动作：安全注意事项必须面向用户前置展示。\n- 证据：risks.safety_notes | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | No sandbox install has been executed yet; downstream must verify before user use.\n\n## 5. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | no_demo; severity=medium\n\n## 6. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | issue_or_pr_quality=unknown\n\n## 7. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | release_recency=unknown\n\n<!-- canonical_name: vitalops/datatune; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "datatune",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "art_fcb598a7a7ed4b2482ca92474f06efe2",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/vitalops/datatune#readme"
        }
      ],
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "datatune 说明书",
      "toc": [
        "https://github.com/vitalops/datatune 项目说明书",
        "目录",
        "Introduction to Datatune",
        "High-Level Architecture",
        "Core Components",
        "OpenAI",
        "Ollama (local)",
        "Azure",
        "Doramagic 踩坑日志"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "7f822d92c9c6a95c236ffcab31953f7c073445af",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "pyproject.toml",
      "README.md",
      "docs/source/Agent.md",
      "docs/source/index.md",
      "docs/source/Filter.md",
      "docs/source/Op.md",
      "docs/source/DataLoaders.md",
      "docs/source/Map.md",
      "docs/source/conf.py",
      "docs/source/Reduce.md",
      "docs/source/LLM.md"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# datatune - Doramagic AI Context Pack\n\n> 定位：安装前体验与判断资产。它帮助宿主 AI 有一个好的开始，但不代表已经安装、执行或验证目标项目。\n\n## 充分原则\n\n- **充分原则，不是压缩原则**：AI Context Pack 应该充分到让宿主 AI 在开工前理解项目价值、能力边界、使用入口、风险和证据来源；它可以分层组织，但不以最短摘要为目标。\n- **压缩策略**：只压缩噪声和重复内容，不压缩会影响判断和开工质量的上下文。\n\n## 给宿主 AI 的使用方式\n\n你正在读取 Doramagic 为 datatune 编译的 AI Context Pack。请把它当作开工前上下文：帮助用户理解适合谁、能做什么、如何开始、哪些必须安装后验证、风险在哪里。不要声称你已经安装、运行或执行了目标项目。\n\n## Claim 消费规则\n\n- **事实来源**：Repo Evidence + Claim/Evidence Graph；Human Wiki 只提供显著性、术语和叙事结构。\n- **事实最低状态**：`supported`\n- `supported`：可以作为项目事实使用，但回答中必须引用 claim_id 和证据路径。\n- `weak`：只能作为低置信度线索，必须要求用户继续核实。\n- `inferred`：只能用于风险提示或待确认问题，不能包装成项目事实。\n- `unverified`：不得作为事实使用，应明确说证据不足。\n- `contradicted`：必须展示冲突来源，不得替用户强行选择一个版本。\n\n## 它最适合谁\n\n- **想在安装前理解开源项目价值和边界的用户**：当前证据主要来自项目文档。 证据：`README.md` Claim：`clm_0002` supported 0.86\n\n## 它能做什么\n\n- **命令行启动或安装流程**（需要安装后验证）：项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n## 怎么开始\n\n- `pip install datatune` 证据：`README.md` Claim：`clm_0003` supported 0.86\n\n## 继续前判断卡\n\n- **当前建议**：仅建议沙盒试装\n- **为什么**：项目存在安装命令、宿主配置或本地写入线索，不建议直接进入主力环境，应先在隔离环境试装。\n\n### 30 秒判断\n\n- **现在怎么做**：仅建议沙盒试装\n- **最小安全下一步**：先跑 Prompt Preview；若仍要安装，只在隔离环境试装\n- **先别相信**：真实输出质量不能在安装前相信。\n- **继续会触碰**：命令执行、本地环境或项目文件、宿主 AI 上下文\n\n### 现在可以相信\n\n- **适合人群线索：想在安装前理解开源项目价值和边界的用户**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- **能力存在：命令行启动或安装流程**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n- **存在 Quick Start / 安装命令线索**（supported）：可以相信项目文档出现过启动或安装入口；不要因此直接在主力环境运行。 证据：`README.md` Claim：`clm_0003` supported 0.86\n\n### 现在还不能相信\n\n- **真实输出质量不能在安装前相信。**（unverified）：Prompt Preview 只能展示引导方式，不能证明真实项目中的结果质量。\n- **宿主 AI 版本兼容性不能在安装前相信。**（unverified）：Claude、Cursor、Codex、Gemini 等宿主加载规则和版本差异必须在真实环境验证。\n- **不会污染现有宿主 AI 行为，不能直接相信。**（inferred）：Skill、plugin、AGENTS/CLAUDE/GEMINI 指令可能改变宿主 AI 的默认行为。\n- 
**可安全回滚不能默认相信。**（unverified）：除非项目明确提供卸载和恢复说明，否则必须先在隔离环境验证。\n- **真实安装后是否与用户当前宿主 AI 版本兼容？**（unverified）：兼容性只能通过实际宿主环境验证。\n- **项目输出质量是否满足用户具体任务？**（unverified）：安装前预览只能展示流程和边界，不能替代真实评测。\n- **安装命令是否需要网络、权限或全局写入？**（unverified）：这影响企业环境和个人环境的安装风险。 证据：`README.md`\n\n### 继续会触碰什么\n\n- **命令执行**：包管理器、网络下载、本地插件目录、项目配置或用户主目录。 原因：运行第一条命令就可能产生环境改动；必须先判断是否值得跑。 证据：`README.md`\n- **本地环境或项目文件**：安装结果、插件缓存、项目配置或本地依赖目录。 原因：安装前无法证明写入范围和回滚方式，需要隔离验证。 证据：`README.md`\n- **宿主 AI 上下文**：AI Context Pack、Prompt Preview、Skill 路由、风险规则和项目事实。 原因：导入上下文会影响宿主 AI 后续判断，必须避免把未验证项包装成事实。\n\n### 最小安全下一步\n\n- **先跑 Prompt Preview**：用安装前交互式试用判断工作方式是否匹配，不需要授权或改环境。（适用：任何项目都适用，尤其是输出质量未知时。）\n- **只在隔离目录或测试账号试装**：避免安装命令污染主力宿主 AI、真实项目或用户主目录。（适用：存在命令执行、插件配置或本地写入线索时。）\n- **安装后只验证一个最小任务**：先验证加载、兼容、输出质量和回滚，再决定是否深用。（适用：准备从试用进入真实工作流时。）\n\n### 退出方式\n\n- **保留安装前状态**：记录原始宿主配置和项目状态，后续才能判断是否可恢复。\n- **记录安装命令和写入路径**：没有明确卸载说明时，至少要知道哪些目录或配置需要手动清理。\n- **如果没有回滚路径，不进入主力环境**：不可回滚是继续前阻断项，不应靠信任或运气继续。\n\n## 哪些只能预览\n\n- 解释项目适合谁和能做什么\n- 基于项目文档演示典型对话流程\n- 帮助用户判断是否值得安装或继续研究\n\n## 哪些必须安装后验证\n\n- 真实安装 Skill、插件或 CLI\n- 执行脚本、修改本地文件或访问外部服务\n- 验证真实输出质量、性能和兼容性\n\n## 边界与风险判断卡\n\n- **把安装前预览误认为真实运行**：用户可能高估项目已经完成的配置、权限和兼容性验证。 处理方式：明确区分 prompt_preview_can_do 与 runtime_required。 Claim：`clm_0004` inferred 0.45\n- **命令执行会修改本地环境**：安装命令可能写入用户主目录、宿主插件目录或项目配置。 处理方式：先在隔离环境或测试账号中运行。 证据：`README.md` Claim：`clm_0005` supported 0.86\n- **待确认**：真实安装后是否与用户当前宿主 AI 版本兼容？。原因：兼容性只能通过实际宿主环境验证。\n- **待确认**：项目输出质量是否满足用户具体任务？。原因：安装前预览只能展示流程和边界，不能替代真实评测。\n- **待确认**：安装命令是否需要网络、权限或全局写入？。原因：这影响企业环境和个人环境的安装风险。\n\n## 开工前工作上下文\n\n### 加载顺序\n\n- 先读取 how_to_use.host_ai_instruction，建立安装前判断资产的边界。\n- 读取 claim_graph_summary，确认事实来自 Claim/Evidence Graph，而不是 Human Wiki 叙事。\n- 再读取 intended_users、capabilities 和 quick_start_candidates，判断用户是否匹配。\n- 需要执行具体任务时，优先查 role_skill_index，再查 evidence_index。\n- 遇到真实安装、文件修改、网络访问、性能或兼容性问题时，转入 risk_card 和 boundaries.runtime_required。\n\n### 任务路由\n\n- **命令行启动或安装流程**：先说明这是安装后验证能力，再给出安装前检查清单。 边界：必须真实安装或运行后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n### 
上下文规模\n\n- 文件总数：50\n- 重要文件覆盖：23/50\n- 证据索引条目：23\n- 角色 / Skill 条目：9\n\n### 证据不足时的处理\n\n- **missing_evidence**：说明证据不足，要求用户提供目标文件、README 段落或安装后验证记录；不要补全事实。\n- **out_of_scope_request**：说明该任务超出当前 AI Context Pack 证据范围，并建议用户先查看 Human Manual 或真实安装后验证。\n- **runtime_request**：给出安装前检查清单和命令来源，但不要替用户执行命令或声称已执行。\n- **source_conflict**：同时展示冲突来源，标记为待核实，不要强行选择一个版本。\n\n## Prompt Recipes\n\n### 适配判断\n\n- 目标：判断这个项目是否适合用户当前任务。\n- 预期输出：适配结论、关键理由、证据引用、安装前可预览内容、必须安装后验证内容、下一步建议。\n\n```text\n请基于 datatune 的 AI Context Pack，先问我 3 个必要问题，然后判断它是否适合我的任务。回答必须包含：适合谁、能做什么、不能做什么、是否值得安装、证据来自哪里。所有项目事实必须引用 evidence_refs、source_paths 或 claim_id。\n```\n\n### 安装前体验\n\n- 目标：让用户在安装前感受核心工作流，同时避免把预览包装成真实能力或营销承诺。\n- 预期输出：一段带边界标签的体验剧本、安装后验证清单和谨慎建议；不含真实运行承诺或强营销表述。\n\n```text\n请把 datatune 当作安装前体验资产，而不是已安装工具或真实运行环境。\n\n请严格输出四段：\n1. 先问我 3 个必要问题。\n2. 给出一段“体验剧本”：用 [安装前可预览]、[必须安装后验证]、[证据不足] 三种标签展示它可能如何引导工作流。\n3. 给出安装后验证清单：列出哪些能力只有真实安装、真实宿主加载、真实项目运行后才能确认。\n4. 给出谨慎建议：只能说“值得继续研究/试装”“先补充信息后再判断”或“不建议继续”，不得替项目背书。\n\n硬性边界：\n- 不要声称已经安装、运行、执行测试、修改文件或产生真实结果。\n- 不要写“自动适配”“确保通过”“完美适配”“强烈建议安装”等承诺性表达。\n- 如果描述安装后的工作方式，必须使用“如果安装成功且宿主正确加载 Skill，它可能会……”这种条件句。\n- 体验剧本只能写成“示例台词/假设流程”：使用“可能会询问/可能会建议/可能会展示”，不要写“已写入、已生成、已通过、正在运行、正在生成”。\n- Prompt Preview 不负责给安装命令；如用户准备试装，只能提示先阅读 Quick Start 和 Risk Card，并在隔离环境验证。\n- 所有项目事实必须来自 supported claim、evidence_refs 或 source_paths；inferred/unverified 只能作风险或待确认项。\n\n```\n\n### 角色 / Skill 选择\n\n- 目标：从项目里的角色或 Skill 中挑选最匹配的资产。\n- 预期输出：候选角色或 Skill 列表，每项包含适用场景、证据路径、风险边界和是否需要安装后验证。\n\n```text\n请读取 role_skill_index，根据我的目标任务推荐 3-5 个最相关的角色或 Skill。每个推荐都要说明适用场景、可能输出、风险边界和 evidence_refs。\n```\n\n### 风险预检\n\n- 目标：安装或引入前识别环境、权限、规则冲突和质量风险。\n- 预期输出：环境、权限、依赖、许可、宿主冲突、质量风险和未知项的检查清单。\n\n```text\n请基于 risk_card、boundaries 和 quick_start_candidates，给我一份安装前风险预检清单。不要替我执行命令，只说明我应该检查什么、为什么检查、失败会有什么影响。\n```\n\n### 宿主 AI 开工指令\n\n- 目标：把项目上下文转成一次对话开始前的宿主 AI 指令。\n- 预期输出：一段边界明确、证据引用明确、适合复制给宿主 AI 的开工前指令。\n\n```text\n请基于 datatune 的 AI Context Pack，生成一段我可以粘贴给宿主 AI 的开工前指令。这段指令必须遵守 not_runtime=true，不能声称项目已经安装、运行或产生真实结果。\n```\n\n\n## 角色 / 
Skill 索引\n\n- 共索引 9 个角色 / Skill / 项目文档条目。\n\n- **🎵 Datatune**（project_doc）：! PyPI version https://img.shields.io/pypi/v/datatune.svg https://pypi.org/project/datatune/ ! License https://img.shields.io/github/license/vitalops/datatune https://github.com/vitalops/datatune/blob/main/LICENSE ! PyPI Downloads https://static.pepy.tech/badge/datatune https://pepy.tech/projects/datatune ! Docs https://img.shields.io/badge/docs-docs.datatune.ai-blue https://docs.datatune.ai ! Discord https://img.sh… 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`README.md`\n- **Agents**（project_doc）：Datatune Agent allows large language models LLMs to autonomously plan and execute data transformation steps using natural language prompts. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/source/Agent.md`\n- **Data Loaders**（project_doc）：Datatune supports multiple data loading backends to work with your data efficiently. Choose the backend that best fits your use case and infrastructure. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/source/DataLoaders.md`\n- **Filter**（project_doc）：The Filter operation in Datatune uses LLMs to evaluate and filter rows in a dataset based on natural language criteria. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/source/Filter.md`\n- **LLM**（project_doc）：The LLM module in Datatune provides classes for interfacing with various Language Model providers. It handles the connection to and inference from LLMs with a unified API structure, supporting both single and batch inference, with the help of LiteLLM. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/source/LLM.md`\n- **Map**（project_doc）：The Map operation in Datatune uses LLMs to transform data in a dataset by generating new fields or modifying existing ones based on natural language instructions. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/source/Map.md`\n- **Op**（project_doc）：The Op module provides the base operation class and utility functions for Datatune's data transformation pipeline. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/source/Op.md`\n- **Reduce**（project_doc）：The Reduce operation in Datatune reduces the size of a dataset by grouping, deduplicating, or otherwise collapsing rows based on a specified action. It is typically used to minimize downstream processing cost e.g. LLM calls by identifying canonical rows and eliminating redundant ones. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/source/Reduce.md`\n- **Documentation**（project_doc）：Perform transformations on your data with natural language using LLMs 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/source/index.md`\n\n## 证据索引\n\n- 共索引 23 条证据。\n\n- **🎵 Datatune**（documentation）：! PyPI version https://img.shields.io/pypi/v/datatune.svg https://pypi.org/project/datatune/ ! License https://img.shields.io/github/license/vitalops/datatune https://github.com/vitalops/datatune/blob/main/LICENSE ! PyPI Downloads https://static.pepy.tech/badge/datatune https://pepy.tech/projects/datatune ! Docs https://img.shields.io/badge/docs-docs.datatune.ai-blue https://docs.datatune.ai ! Discord https://img.shields.io/badge/Discord-7289da?logo=discord&logoColor=white https://discord.gg/3RKA5AryQX 证据：`README.md`\n- **License**（source_file）：Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files the \"Software\" , to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 证据：`LICENSE`\n- **Agents**（documentation）：Datatune Agent allows large language models LLMs to autonomously plan and execute data transformation steps using natural language prompts. 证据：`docs/source/Agent.md`\n- **Data Loaders**（documentation）：Datatune supports multiple data loading backends to work with your data efficiently. 
Choose the backend that best fits your use case and infrastructure. 证据：`docs/source/DataLoaders.md`\n- **Filter**（documentation）：The Filter operation in Datatune uses LLMs to evaluate and filter rows in a dataset based on natural language criteria. 证据：`docs/source/Filter.md`\n- **LLM**（documentation）：The LLM module in Datatune provides classes for interfacing with various Language Model providers. It handles the connection to and inference from LLMs with a unified API structure, supporting both single and batch inference, with the help of LiteLLM. 证据：`docs/source/LLM.md`\n- **Map**（documentation）：The Map operation in Datatune uses LLMs to transform data in a dataset by generating new fields or modifying existing ones based on natural language instructions. 证据：`docs/source/Map.md`\n- **Op**（documentation）：The Op module provides the base operation class and utility functions for Datatune's data transformation pipeline. 证据：`docs/source/Op.md`\n- **Reduce**（documentation）：The Reduce operation in Datatune reduces the size of a dataset by grouping, deduplicating, or otherwise collapsing rows based on a specified action. It is typically used to minimize downstream processing cost e.g. LLM calls by identifying canonical rows and eliminating redundant ones. 
证据：`docs/source/Reduce.md`\n- **Documentation**（documentation）：Perform transformations on your data with natural language using LLMs 证据：`docs/source/index.md`\n- **Byte-compiled / optimized / DLL files**（source_file）：/tests/test data/ .DS Store Byte-compiled / optimized / DLL files pycache / .py cod $py.class 证据：`.gitignore`\n- **Cname**（source_file）：docs.datatune.ai 证据：`CNAME`\n- **Init**（source_file）：from datatune.agent.agent import Agent from datatune.core.filter import filter from datatune.core.map import map from datatune.core.dask.op import finalize from datatune.core.deduplication import SemanticDeduplicator from datatune.core.reduce import reduce all = \"map\", \"filter\", \"finalize\", \"Agent\", \"reduce\" 证据：`datatune/__init__.py`\n- **Logger**（source_file）：import logging import threading import time import sys 证据：`datatune/logger.py`\n- **Minimal makefile for Sphinx documentation**（source_file）：Minimal makefile for Sphinx documentation 证据：`docs/Makefile`\n- **Make**（source_file）：REM Command file for Sphinx documentation 证据：`docs/make.bat`\n- **Getting Started**（source_file）：{ \"cells\": { \"cell type\": \"markdown\", \"metadata\": { \"id\": \"OHg0Xg632UxE\" }, \"source\": \" Getting started with Datatune\" }, { \"cell type\": \"markdown\", \"metadata\": { \"id\": \"TjDiMfzi2UxF\" }, \"source\": \" Setup and Installation \\n\", \"First, let's install the necessary packages:\" }, { \"cell type\": \"code\", \"execution count\": null, \"metadata\": { \"vscode\": { \"languageId\": \"plaintext\" }, \"id\": \"aISH8Ve42UxF\" }, \"outputs\": , \"source\": \"!pip install datatune\" }, { \"cell type\": \"markdown\", \"metadata\": { \"id\": \"Gd16ZTSG2UxG\" }, \"source\": \" Import Required Libraries\\n\" }, { \"cell type\": \"code\", \"execution count\": null, \"metadata\": { \"id\": \"x2GlLBRv2UxG\" }, \"outputs\": , \"source\": \"import pandas as pd\\n\", \"i… 证据：`examples/Getting_started.ipynb`\n- **Agents**（source_file）：{ \"nbformat\": 4, \"nbformat minor\": 
0, \"metadata\": { \"colab\": { \"provenance\": }, \"kernelspec\": { \"name\": \"python3\", \"display name\": \"Python 3\" }, \"language info\": { \"name\": \"python\" } }, \"cells\": { \"cell type\": \"code\", \"execution count\": null, \"metadata\": { \"id\": \"0olYqjotD-UR\" }, \"outputs\": , \"source\": \" Let's start by installing libraries\" }, { \"cell type\": \"code\", \"source\": \"!pip install datatune\" , \"metadata\": { \"collapsed\": true, \"id\": \"S5lfcBmNHwz1\" }, \"execution count\": null, \"outputs\": }, { \"cell type\": \"code\", \"source\": \"import os\\n\", \"import dask.dataframe as dd\\n\", \"import datatune as dt\\n\", \"from datatune.llm.llm import Azure\" , \"metadata\": { \"id\": \"-NMdOHHzHsRa\" }, \"execution co… 证据：`examples/agents.ipynb`\n- **Data Anonymization**（source_file）：{ \"nbformat\": 4, \"nbformat minor\": 0, \"metadata\": { \"colab\": { \"provenance\": }, \"kernelspec\": { \"name\": \"python3\", \"display name\": \"Python 3\" }, \"language info\": { \"name\": \"python\" } }, \"cells\": { \"cell type\": \"code\", \"source\": \"!pip install datatune\" , \"metadata\": { \"colab\": { \"base uri\": \"https://localhost:8080/\" }, \"collapsed\": true, \"id\": \"VCfT3UnVOXrb\", \"outputId\": \"491f3d3e-8157-4336-e157-f6dbced4d555\" }, \"execution count\": 1, \"outputs\": { \"output type\": \"stream\", \"name\": \"stdout\", \"text\": \"Collecting datatune\\n\", \" Downloading datatune-0.0.4-py3-none-any.whl.metadata 10 kB \\n\", \"Requirement already satisfied: numpy in /usr/local/lib/python3.12/dist-packages from datatune 2.0.2 \\n\", \"… 证据：`examples/data_anonymization.ipynb`\n- **Deduplication**（source_file）：{ \"cells\": { \"cell type\": \"markdown\", \"id\": \"9d96f064\", \"metadata\": {}, \"source\": \" Datatune with Semantic Deduplication\" }, { \"cell type\": \"markdown\", \"id\": \"e16bfaa5\", \"metadata\": {}, \"source\": \"Let's start by installing dependencies. 
We will be using duckdb as our database backend and OpenAI LLM API.\" }, { \"cell type\": \"code\", \"execution count\": null, \"id\": \"4f08aea7\", \"metadata\": {}, \"outputs\": , \"source\": \"!pip install datatune\" }, { \"cell type\": \"markdown\", \"id\": \"6b326c27\", \"metadata\": {}, \"source\": \" Import required libraries\" }, { \"cell type\": \"code\", \"execution count\": null, \"id\": \"5169a466\", \"metadata\": {}, \"outputs\": , \"source\": \"import datatune as dt\\n\", \"import seaborn as sns\\… 证据：`examples/deduplication.ipynb`\n- **Ibis Core**（source_file）：{ \"cells\": { \"cell type\": \"markdown\", \"id\": \"9d96f064\", \"metadata\": {}, \"source\": \" Datatune with Ibis\" }, { \"cell type\": \"markdown\", \"id\": \"e16bfaa5\", \"metadata\": {}, \"source\": \"Let's start by installing dependencies. We will be using duckdb as our database backend and OpenAI LLM API.\" }, { \"cell type\": \"code\", \"execution count\": null, \"id\": \"4f08aea7\", \"metadata\": {}, \"outputs\": , \"source\": \"!pip install -e ../. 
ibis \" }, { \"cell type\": \"markdown\", \"id\": \"6b326c27\", \"metadata\": {}, \"source\": \" Import required libraries\" }, { \"cell type\": \"code\", \"execution count\": null, \"id\": \"5169a466\", \"metadata\": {}, \"outputs\": , \"source\": \"import ibis\\n\", \"import datatune as dt\\n\", \"import seaborn as… 证据：`examples/ibis_core.ipynb`\n- **Pyproject**（source_file）：build-system requires = \"setuptools =61.0\", \"wheel\" build-backend = \"setuptools.build meta\" 证据：`pyproject.toml`\n- **Ibis Core Test**（source_file）：import pytest import ibis import pandas as pd import datatune as dt 证据：`tests/ibis_core_test.py`\n\n## 宿主 AI 必须遵守的规则\n\n- **把本资产当作开工前上下文，而不是运行环境。**：AI Context Pack 只包含证据化项目理解，不包含目标项目的可执行状态。 证据：`README.md`, `LICENSE`, `docs/source/Agent.md`\n- **回答用户时区分可预览内容与必须安装后才能验证的内容。**：安装前体验的消费者价值来自降低误装和误判，而不是伪装成真实运行。 证据：`README.md`, `LICENSE`, `docs/source/Agent.md`\n\n## 用户开工前应该回答的问题\n\n- 你准备在哪个宿主 AI 或本地环境中使用它？\n- 你只是想先体验工作流，还是准备真实安装？\n- 你最在意的是安装成本、输出质量、还是和现有规则的冲突？\n\n## 验收标准\n\n- 所有能力声明都能回指到 evidence_refs 中的文件路径。\n- AI_CONTEXT_PACK.md 没有把预览包装成真实运行。\n- 用户能在 3 分钟内看懂适合谁、能做什么、如何开始和风险边界。\n\n---\n\n## Doramagic Context Augmentation\n\n下面内容用于强化 Repomix/AI Context Pack 主体。Human Manual 只提供阅读骨架；踩坑日志会被转成宿主 AI 必须遵守的工作约束。\n\n## Human Manual 骨架\n\n使用规则：这里只是项目阅读路线和显著性信号，不是事实权威。具体事实仍必须回到 repo evidence / Claim Graph。\n\n宿主 AI 硬性规则：\n- 不得把页标题、章节顺序、摘要或 importance 当作项目事实证据。\n- 解释 Human Manual 骨架时，必须明确说它只是阅读路线/显著性信号。\n- 能力、安装、兼容性、运行状态和风险判断必须引用 repo evidence、source path 或 Claim Graph。\n\n- **Introduction to Datatune**：importance `high`\n  - source_paths: README.md, datatune/__init__.py\n- **Getting Started**：importance `high`\n  - source_paths: pyproject.toml, README.md\n- **System Architecture**：importance `high`\n  - source_paths: datatune/agent/runtime.py, datatune/core/map.py, datatune/core/filter.py, datatune/core/reduce.py\n- **Map Operations**：importance `high`\n  - source_paths: datatune/core/map.py, datatune/core/dask/map_dask.py, datatune/core/ibis/map_ibis.py, 
docs/source/Map.md\n- **Filter Operations**：importance `high`\n  - source_paths: datatune/core/filter.py, datatune/core/dask/filter_dask.py, datatune/core/ibis/filter_ibis.py, docs/source/Filter.md\n- **Reduce Operations**：importance `medium`\n  - source_paths: datatune/core/reduce.py, docs/source/Reduce.md\n- **Dask Backend**：importance `high`\n  - source_paths: datatune/core/dask/filter_dask.py, datatune/core/dask/map_dask.py, datatune/core/dask/op.py, datatune/core/dask/constants.py\n- **Ibis Backend**：importance `high`\n  - source_paths: datatune/core/ibis/filter_ibis.py, datatune/core/ibis/map_ibis.py, docs/source/Ibis.md\n\n## Repo Inspection Evidence / 源码检查证据\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `7f822d92c9c6a95c236ffcab31953f7c073445af`\n- inspected_files: `pyproject.toml`, `README.md`, `docs/source/Agent.md`, `docs/source/index.md`, `docs/source/Filter.md`, `docs/source/Op.md`, `docs/source/DataLoaders.md`, `docs/source/Map.md`, `docs/source/conf.py`, `docs/source/Reduce.md`, `docs/source/LLM.md`\n\n宿主 AI 硬性规则：\n- 没有 repo_clone_verified=true 时，不得声称已经读过源码。\n- 没有 repo_inspection_verified=true 时，不得把 README/docs/package 文件判断写成事实。\n- 没有 quick_start_verified=true 时，不得声称 Quick Start 已跑通。\n\n## Doramagic Pitfall Constraints / 踩坑约束\n\n这些规则来自 Doramagic 发现、验证或编译过程中的项目专属坑点。宿主 AI 必须把它们当作工作约束，而不是普通说明文字。\n\n### Constraint 1: 能力判断依赖假设\n\n- Trigger: README/documentation is current enough for a first validation pass.\n- Host AI rule: 将假设转成下游验证清单。\n- Why it matters: 假设不成立时，用户拿不到承诺的能力。\n- Evidence: capability.assumptions | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | README/documentation is current enough for a first validation pass.\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 2: 维护活跃度未知\n\n- Trigger: 未记录 last_activity_observed。\n- Host AI rule: 补 GitHub 最近 commit、release、issue/PR 响应信号。\n- Why it matters: 新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- Evidence: 
evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | last_activity_observed missing\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 3: 下游验证发现风险项\n\n- Trigger: no_demo\n- Host AI rule: 进入安全/权限治理复核队列。\n- Why it matters: 下游已经要求复核，不能在页面中弱化。\n- Evidence: downstream_validation.risk_items | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | no_demo; severity=medium\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 4: 存在安全注意事项\n\n- Trigger: No sandbox install has been executed yet; downstream must verify before user use.\n- Host AI rule: 转成明确权限清单和安全审查提示。\n- Why it matters: 用户安装前需要知道权限边界和敏感操作。\n- Evidence: risks.safety_notes | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | No sandbox install has been executed yet; downstream must verify before user use.\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 5: 存在评分风险\n\n- Trigger: no_demo\n- Host AI rule: 把风险写入边界卡，并确认是否需要人工复核。\n- Why it matters: 风险会影响是否适合普通用户安装。\n- Evidence: risks.scoring_risks | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | no_demo; severity=medium\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 6: issue/PR 响应质量未知\n\n- Trigger: issue_or_pr_quality=unknown。\n- Host AI rule: 抽样最近 issue/PR，判断是否长期无人处理。\n- Why it matters: 用户无法判断遇到问题后是否有人维护。\n- Evidence: evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | issue_or_pr_quality=unknown\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 7: 发布节奏不明确\n\n- Trigger: release_recency=unknown。\n- Host AI rule: 确认最近 release/tag 和 README 安装命令是否一致。\n- Why it matters: 安装命令和文档可能落后于代码，用户踩坑概率升高。\n- Evidence: evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme 
| release_recency=unknown\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n",
      "summary": "给宿主 AI 的上下文和工作边界。",
      "title": "AI Context Pack / 带给我的 AI"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card / 安装前决策卡\n\n项目：vitalops/datatune\n\n## Doramagic 试用结论\n\n当前结论：可以进入发布前推荐检查；首次使用仍应从最小权限、临时目录和可回滚配置开始。\n\n## 用户现在可以做\n\n- 可以先阅读 Human Manual，理解项目目的和主要工作流。\n- 可以复制 Prompt Preview 做安装前体验；这只验证交互感，不代表真实运行。\n- 可以把官方 Quick Start 命令放到隔离环境中验证，不要直接进主力环境。\n\n## 现在不要做\n\n- 不要把 Prompt Preview 当成项目实际运行结果。\n- 不要把 metadata-only validation 当成沙箱安装验证。\n- 不要把未验证能力写成“已支持、已跑通、可放心安装”。\n- 不要在首次试用时交出生产数据、私人文件、真实密钥或主力配置目录。\n\n## 安装前检查\n\n- 宿主 AI 是否匹配：chatgpt\n- 官方安装入口状态：已发现官方入口\n- 是否在临时目录、临时宿主或容器中验证：必须是\n- 是否能回滚配置改动：必须能\n- 是否需要 API Key、网络访问、读写文件或修改宿主配置：未确认前按高风险处理\n- 是否记录了安装命令、实际输出和失败日志：必须记录\n\n## 当前阻塞项\n\n- 无阻塞项。\n\n## 项目专属踩坑\n\n- 能力判断依赖假设（medium）：假设不成立时，用户拿不到承诺的能力。 建议检查：将假设转成下游验证清单。\n- 维护活跃度未知（medium）：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 下游验证发现风险项（medium）：下游已经要求复核，不能在页面中弱化。 建议检查：进入安全/权限治理复核队列。\n- 存在安全注意事项（medium）：用户安装前需要知道权限边界和敏感操作。 建议检查：转成明确权限清单和安全审查提示。\n- 存在评分风险（medium）：风险会影响是否适合普通用户安装。 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n\n## 风险与权限提示\n\n- no_demo: medium\n\n## 证据缺口\n\n- 暂未发现结构化证据缺口。\n",
      "summary": "安装、权限、验证和推荐前风险。",
      "title": "Boundary & Risk Card / 边界与风险卡"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/vitalops/datatune 项目说明书\n\n生成时间：2026-05-15 09:50:50 UTC\n\n## 目录\n\n- [Introduction to Datatune](#page-introduction)\n- [Getting Started](#page-getting-started)\n- [System Architecture](#page-architecture)\n- [Map Operations](#page-map-operations)\n- [Filter Operations](#page-filter-operations)\n- [Reduce Operations](#page-reduce-operations)\n- [Dask Backend](#page-dask-backend)\n- [Ibis Backend](#page-ibis-backend)\n- [LLM Integration](#page-llm-integration)\n- [Agent System](#page-agent-system)\n\n<a id='page-introduction'></a>\n\n## Introduction to Datatune\n\n### 相关页面\n\n相关主题：[Getting Started](#page-getting-started), [System Architecture](#page-architecture)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/vitalops/datatune/blob/main/README.md)\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- [datatune/llm/model_rate_limits.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/model_rate_limits.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/agent/agent.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/agent.py)\n- [datatune/agent/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/__init__.py)\n- [datatune/core/ibis/map_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/map_ibis.py)\n- [datatune/core/ibis/filter_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/filter_ibis.py)\n- [datatune/core/deduplication.py](https://github.com/vitalops/datatune/blob/main/datatune/core/deduplication.py)\n</details>\n\n# Introduction to Datatune\n\nDatatune is an AI-powered data transformation library that enables natural language-driven data 
processing operations on large-scale datasets. The library leverages Large Language Models (LLMs) to interpret human-readable instructions and automatically generate the appropriate data transformations, making complex data manipulation accessible to users without deep technical expertise.\n\nDatatune supports multiple backend computational frameworks including Dask for distributed computing and Ibis for SQL-like operations across various databases. The system provides both explicit transformation primitives (Map, Filter) and an intelligent Agent that can automatically determine and chain multiple transformation steps from a single natural language prompt.\n\n## High-Level Architecture\n\n```mermaid\ngraph TD\n    User[User Request] --> Agent[Datatune Agent]\n    User --> Primitives[Explicit Primitives]\n    \n    Agent --> Planner[Planning Module]\n    Planner --> Steps[Transformation Steps]\n    Steps --> DaskOps[Dask Operations]\n    Steps --> PrimOps[Primitive Operations]\n    Steps --> CodeGen[Code Generation]\n    \n    Primitives --> MapOp[Map Operation]\n    Primitives --> FilterOp[Filter Operation]\n    \n    MapOp --> LLM[LLM Interface]\n    FilterOp --> LLM\n    \n    LLM --> OpenAI[OpenAI Provider]\n    LLM --> Ollama[Ollama Provider]\n    LLM --> VLLM[VLLM Provider]\n    LLM --> Azure[Azure Provider]\n    \n    DaskOps --> Dask[Dask DataFrame]\n    PrimOps --> Dask\n    \n    MapOp --> Ibis[Ibis Backend]\n    FilterOp --> Ibis\n    \n    Ibis --> DuckDB[DuckDB]\n    Ibis --> PostgreSQL[PostgreSQL]\n    Ibis --> BigQuery[BigQuery]\n```\n\nThe architecture follows a layered approach where the LLM abstraction layer provides a unified interface for multiple AI providers, while the transformation layer handles both explicit user-defined operations and AI-generated execution plans.\n\n## Core Components\n\n### LLM Interface Layer\n\nThe LLM module (`datatune/llm/llm.py`) provides a unified interface for interacting with various language model providers. 
The base `LLM` class handles rate limiting, token counting, and request batching, while provider-specific subclasses implement the actual API calls.\n\n#### Supported LLM Providers\n\n| Provider | Class | Default Model | Description |\n|----------|-------|---------------|-------------|\n| OpenAI | `OpenAI` | gpt-3.5-turbo | Standard OpenAI API integration |\n| Ollama | `Ollama` | gemma3:4b | Local LLM server support |\n| VLLM | `VLLM` | User-specified | High-performance vLLM inference |\n| Azure | `Azure` | User-specified | Azure OpenAI Service |\n\nThe base `LLM` class automatically extracts rate limit configurations from `model_rate_limits.py` based on the model name. If a model is not found in the predefined limits, it defaults to GPT-3.5-turbo limits with a warning message.\n\n```python\nfrom datatune.llm.llm import OpenAI, Ollama, Azure\n\n# OpenAI\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\n\n# Ollama (local)\nllm = Ollama()\n\n# Azure\nllm = Azure(model_name=\"gpt-3.5-turbo\", api_key=api_key)\n```\n\n资料来源：[README.md:40-56]()\n\n### Rate Limiting Configuration\n\nThe `model_rate_limits.py` module defines tokens-per-minute (TPM) and requests-per-minute (RPM) limits for supported models:\n\n| Model Family | TPM | RPM |\n|--------------|-----|-----|\n| GPT-3.5-Turbo | 200,000 | 500 |\n| GPT-4 | 10,000 - 30,000 | 500 |\n| GPT-4.1 | 30,000 - 200,000 | 500 |\n| GPT-4.5-Preview | 125,000 | 1000 |\n\n资料来源：[datatune/llm/model_rate_limits.py:1-120]()\n\n## Transformation Primitives\n\nDatatune provides two fundamental transformation primitives that work with natural language prompts:\n\n### Map Operation\n\nThe `Map` primitive creates new columns by applying LLM-driven transformations to existing data. 
It takes input from specified columns and generates output fields based on the natural language prompt.\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n```\n\nThe Map operation constructs prompts with the following structure:\n\n```\nYour task is to MAP/CREATE new fields based on the following input record(s):\n[input data]\n\nYour response MUST be the entire input record as a valid Python dictionary in the format\n'index=<row_index>|{key1: value1, key2: value2, ...}' with added keys of expected new fields.\n```\n\n资料来源：[datatune/core/dask/map_dask.py:1-50]()\n资料来源：[datatune/core/ibis/map_ibis.py:1-60]()\n\n### Filter Operation\n\nThe `Filter` primitive removes rows that do not meet specified conditions interpreted by the LLM:\n\n```python\nfiltered = dt.filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Name\"]\n)(llm, mapped)\n```\n\nThe Filter operation adds a `__filter__` key to each record, with `True` indicating the row should be kept and `False` indicating removal.\n\n```\nFILTERING CRITERIA:\n[prompt]\n\nDECISION: Your response MUST be the entire input record as a Python dictionary in the format:\nindex=<row_index>|{key1: value1, key2: value2, ...} with added key called '__filter__' \nwith value either True to KEEP the record or False to REMOVE it.\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:1-45]()\n资料来源：[datatune/core/ibis/filter_ibis.py:1-50]()\n\n## Intelligent Agent\n\nThe `Agent` class provides a higher-level interface that automatically determines which operations to perform based on natural language requests. 
The agent can chain multiple Map, Filter, and Python code operations into a coherent execution plan.\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\nagent = dt.Agent(llm)\n\ndf = agent.do(\"Add ProfitMargin column and keep only African organizations\", df)\nresult = dt.finalize(df)\n```\n\n### Agent System Prompt\n\nThe Agent uses a structured system prompt that defines available operations and expected response formats:\n\n```python\nsystem_prompt: str = \"\"\"You are Datatune Agent, a powerful assistant designed to help users with data processing tasks.\nYou are capable of generating python code to perform various operations on data. Apart from python builtins, you have the following libraries avaiable in your run time:\n- pandas\n- numpy\n- dask\n\nIn addition to these, you also have access to the datatune libarary, which provides functionality for processing data using LLMs.\n```\n\n### Plan Execution Flow\n\n```mermaid\ngraph TD\n    Request[Natural Language Request] --> Parse[Parse Request]\n    Parse --> Plan[Generate Execution Plan]\n    Plan --> Step1[Step 1: Dask/Primitive]\n    Step1 --> Step2[Step 2: Dask/Primitive]\n    Step2 --> StepN[Step N: ...]\n    StepN --> Execute[Execute Full Plan]\n    Execute --> Result[Finalized DataFrame]\n    \n    Plan --> StepTypes{Step Type}\n    StepTypes -->|\"dask\"| DaskTemplate[Dask Template]\n    StepTypes -->|\"primitive\"| PrimTemplate[Primitive Template]\n```\n\nThe agent generates plans as JSON arrays with the following structure:\n\n```json\n[\n  {\n    \"type\": \"dask|primitive\",\n    \"operation\": \"operation_name\",\n    \"params\": {},\n    \"subprompt\": \"LLM prompt for primitive operations\",\n    \"input_fields\": [\"column1\"],\n    \"output_fields\": [\"new_column\"]\n  }\n]\n```\n\n资料来源：[datatune/agent/__init__.py:1-45]()\n资料来源：[datatune/agent/agent.py:1-100]()\n\n## Data Source Backends\n\nDatatune supports multiple 
computational backends for processing data at scale:\n\n### Dask Backend\n\nThe Dask backend provides distributed computing capabilities for pandas-like DataFrames:\n\n```python\nimport dask.dataframe as dd\ndf = dd.read_csv(\"data.csv\")\n```\n\nDask operations include:\n- `add_column`: Create new columns from expressions\n- `apply_function`: Apply element-wise functions\n- `rename_columns`: Rename using mappings\n- `astype_column`: Change data types\n\n### Ibis Backend\n\nThe Ibis backend enables SQL-like operations across various database backends:\n\n```python\nimport ibis\ncon = ibis.duckdb.connect(\"data.duckdb\")\ntable = con.table(\"my_table\")\n```\n\nSupported Ibis backends include DuckDB, PostgreSQL, and BigQuery.\n\n资料来源：[README.md:60-75]()\n\n## Deduplication System\n\nDatatune includes a sophisticated deduplication system that uses embedding-based similarity detection:\n\n```mermaid\ngraph LR\n    Input[Input Data] --> Embed[Embedding Generation]\n    Embed --> Index[FAISS Index Build]\n    Index --> Cluster[Similarity Clustering]\n    Cluster --> Dedup[Deduplicated Output]\n```\n\nKey parameters:\n\n| Parameter | Default | Description |\n|-----------|---------|-------------|\n| `embedding_model` | text-embedding-3-small | Model for generating embeddings |\n| `sim_threshold` | 0.90 | Similarity threshold for clustering |\n| `top_k` | 50 | Top-K neighbors for clustering |\n| `hnsw_m` | 32 | HNSW index construction parameter |\n| `ef_search` | 64 | HNSW search parameter |\n\nThe system uses FAISS HNSW (Hierarchical Navigable Small World) indexes for efficient similarity search at scale.\n\n资料来源：[datatune/core/deduplication.py:1-100]()\n\n## Workflow Summary\n\n```mermaid\ngraph LR\n    A[Load Data] --> B[Choose Interface]\n    B --> C[Agent Mode]\n    B --> D[Primitives Mode]\n    C --> E[Natural Language]\n    D --> F[Explicit Operations]\n    E --> G[Auto Plan]\n    F --> H[Manual Plan]\n    G --> I[Execute]\n    H --> I\n    I --> J[LLM Processing]\n   
 J --> K[Finalize]\n    K --> L[Output]\n```\n\n1. **Load Data**: Import data into Dask or Ibis DataFrames\n2. **Choose Interface**: Use Agent for automatic planning or Primitives for explicit control\n3. **Define Operations**: Write natural language prompts or configure operations\n4. **Execute**: Process data with LLM integration\n5. **Finalize**: Convert lazy operations to computed results\n\n资料来源：[README.md:1-50]()\n\n## Installation and Quick Start\n\n```bash\npip install datatune\n```\n\nThe complete workflow involves initializing an LLM, loading data, applying transformations, and finalizing results:\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n\n# Apply transformations\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n\nfiltered = dt.filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Name\"]\n)(llm, mapped)\n\n# Save results\nresult = dt.finalize(filtered)\nresult.compute().to_csv(\"electronics_products.csv\")\n```\n\n资料来源：[README.md:20-55]()\n\n---\n\n<a id='page-getting-started'></a>\n\n## Getting Started\n\n### 相关页面\n\n相关主题：[Introduction to Datatune](#page-introduction)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/vitalops/datatune/blob/main/README.md)\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- [datatune/agent/agent.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/agent.py)\n- [datatune/agent/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/__init__.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- 
[datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/ibis/map_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/map_ibis.py)\n- [datatune/core/ibis/filter_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/filter_ibis.py)\n- [datatune/llm/model_rate_limits.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/model_rate_limits.py)\n</details>\n\n# Getting Started\n\nDatatune is a data transformation library that leverages Large Language Models (LLMs) to perform complex data operations using natural language prompts. It enables users to transform, filter, map, and process data without writing extensive custom code.\n\n## Installation\n\nInstall datatune using pip:\n\n```bash\npip install datatune\n```\n\n资料来源：[README.md:1]()\n\n## Core Concepts\n\nDatatune provides two primary interfaces for data transformation:\n\n| Interface | Description | Use Case |\n|-----------|-------------|----------|\n| **Primitives** | Direct operations (Map, Filter) | Single, focused transformations |\n| **Agent** | AI-powered automatic planning | Complex multi-step workflows |\n\n资料来源：[README.md:20-30]()\n\n## Quick Start\n\n### Basic Setup\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n```\n\n资料来源：[README.md:26-32]()\n\n### Map Operation\n\nUse `dt.map()` to extract or derive new columns from existing data:\n\n```python\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n```\n\n资料来源：[README.md:34-41]()\n\n### Filter Operation\n\nUse `dt.filter()` to keep only rows matching criteria:\n\n```python\nfiltered = dt.filter(\n    prompt=\"Keep only electronics 
products\",\n    input_fields=[\"Name\"]\n)(llm, mapped)\n```\n\n资料来源：[README.md:44-49]()\n\n### Finalize and Save\n\n```python\nresult = dt.finalize(filtered)\nresult.compute().to_csv(\"electronics_products.csv\")\n```\n\n资料来源：[README.md:51-53]()\n\n## Supported LLMs\n\nDatatune supports multiple LLM providers through a unified interface.\n\n### OpenAI\n\n```python\nfrom datatune.llm.llm import OpenAI\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\n```\n\n资料来源：[README.md:58-61]()\n\n### Ollama (Local Models)\n\n```python\nfrom datatune.llm.llm import Ollama\nllm = Ollama()\n```\n\nDefault configuration:\n- Model: `gemma3:4b`\n- API Base: `http://localhost:11434`\n\n资料来源：[README.md:63-65]()\n\n### Azure OpenAI\n\n```python\nfrom datatune.llm.llm import Azure\nllm = Azure(model_name=\"gpt-3.5-turbo\", api_key=api_key)\n```\n\n资料来源：[README.md:67-70]()\n\n### Additional Providers\n\n| Provider | Class | Notes |\n|----------|-------|-------|\n| Mistral | `Mistral` | Requires `mistral/` prefix in model name |\n| Huggingface | `Huggingface` | Requires `huggingface/` prefix |\n| VLLM | `VLLM` | Auto-detects max token limit |\n\n资料来源：[datatune/llm/llm.py:1-150]()\n\n## Agent Mode\n\nThe Agent provides an intelligent, automated approach to data transformation. 
It automatically determines which operations to use and chains multiple transformations from a single natural language prompt.\n\n### Basic Agent Usage\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\nagent = dt.Agent(llm)\n\ndf = agent.do(\"Add ProfitMargin column and keep only African organizations\", df)\nresult = dt.finalize(df)\n```\n\n资料来源：[README.md:74-82]()\n\n### Agent Capabilities\n\nThe agent automatically:\n\n- Determines which operations to use (map, filter, etc.)\n- Chains multiple transformations\n- Handles complex multi-step tasks from a single prompt\n- Generates and executes Python code along with row-level primitives (Map, Filter, etc) if required\n\n资料来源：[README.md:84-88]()\n\n### Agent System Prompt\n\nThe agent has access to:\n\n- Python builtins\n- pandas\n- numpy\n- dask\n- datatune library primitives (Map, Filter)\n\n资料来源：[datatune/agent/__init__.py:1-35]()\n\n## Data Sources\n\nDatatune supports multiple data processing backends.\n\n### Dask DataFrames\n\n```python\nimport dask.dataframe as dd\ndf = dd.read_csv(\"data.csv\")\n```\n\n### Ibis Backend\n\nIbis provides connectivity to multiple databases:\n\n```python\nimport ibis\ncon = ibis.duckdb.connect(\"data.duckdb\")\ntable = con.table(\"my_table\")\n```\n\n资料来源：[README.md:93-102]()\n\n## Architecture Overview\n\n```mermaid\ngraph TD\n    A[User Prompt] --> B[Datatune API]\n    B --> C{Operation Type}\n    C -->|Map| D[map_dask.py / map_ibis.py]\n    C -->|Filter| E[filter_dask.py / filter_ibis.py]\n    C -->|Agent| F[Agent Planning]\n    D --> G[LLM Batch Processing]\n    E --> G\n    F -->|Dask Ops| H[Dask Templates]\n    F -->|Primitives| I[Map / Filter Primitives]\n    G --> J[Output DataFrame]\n    H --> J\n    I --> J\n```\n\n## Rate Limits Configuration\n\nWhen using LLMs, datatune respects model-specific rate limits:\n\n| Parameter | Description |\n|-----------|-------------|\n| `tpm` | Tokens per minute |\n| 
`rpm` | Requests per minute |\n\n资料来源：[datatune/llm/model_rate_limits.py:1-50]()\n\nDefault rate limits for unknown models fall back to `gpt-3.5-turbo` settings with a warning logged.\n\n资料来源：[datatune/llm/llm.py:20-35]()\n\n## Primitive Operations\n\n### Map Operation\n\nCreates new columns by applying LLM-based transformations to input columns.\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `prompt` | str | Natural language description of the transformation |\n| `input_fields` | list | Input column names |\n| `output_fields` | list | Names for new columns |\n\n### Filter Operation\n\nRemoves rows that don't meet the specified criteria.\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `prompt` | str | Natural language filtering criteria |\n| `input_fields` | list | Columns to evaluate |\n\n## Next Steps\n\n- **[Documentation](https://docs.datatune.ai/)** - Complete guides and API reference\n- **[Examples](https://github.com/vitalops/datatune/tree/main/examples)** - Real-world use cases\n- **[Discord](https://discord.gg/3RKA5AryQX)** - Community support\n- **[Issues](https://github.com/vitalops/datatune/issues)** - Report bugs and request features\n\n资料来源：[README.md:105-113]()\n\n---\n\n<a id='page-architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[Dask Backend](#page-dask-backend), [Ibis Backend](#page-ibis-backend), [LLM Integration](#page-llm-integration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- [datatune/agent/agent.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/agent.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- 
[datatune/core/ibis/map_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/map_ibis.py)\n- [datatune/core/ibis/filter_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/filter_ibis.py)\n- [datatune/core/deduplication.py](https://github.com/vitalops/datatune/blob/main/datatune/core/deduplication.py)\n- [datatune/llm/model_rate_limits.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/model_rate_limits.py)\n- [datatune/agent/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/__init__.py)\n</details>\n\n# System Architecture\n\n## Overview\n\nDatatune is a data transformation framework that leverages Large Language Models (LLMs) to perform semantic operations on distributed data. The system provides a declarative interface for common data manipulation tasks like mapping, filtering, and deduplication while abstracting the complexity of distributed computing and LLM interaction.\n\nThe architecture follows a modular design with three primary layers:\n\n| Layer | Purpose | Key Components |\n|-------|---------|----------------|\n| **LLM Layer** | Model abstraction and rate limiting | `LLM`, `OpenAI`, `Ollama`, `VLLM`, `Azure` |\n| **Core Operations Layer** | Data transformation primitives | `Map`, `Filter`, `Reduce`, `Deduplication` |\n| **Agent Layer** | High-level orchestration and planning | `Agent`, Plan execution engine |\n\n资料来源：[datatune/llm/llm.py:1-100]()\n\n## High-Level Architecture Diagram\n\n```mermaid\ngraph TB\n    subgraph \"Client Layer\"\n        User[User Code]\n    end\n    \n    subgraph \"LLM Layer\"\n        LLM[Base LLM Class]\n        OpenAI[OpenAI Provider]\n        Ollama[Ollama Provider]\n        VLLM[VLLM Provider]\n        Azure[Azure Provider]\n        RateLimits[model_rate_limits]\n    end\n    \n    subgraph \"Core Operations Layer\"\n        Map[Map Operation]\n        Filter[Filter Operation]\n        Reduce[Reduce Operation]\n        
Dedupe[Deduplication]\n    end\n    \n    subgraph \"Data Backend Layer\"\n        Dask[Dask DataFrame]\n        Ibis[\"Ibis (DuckDB, PostgreSQL, BigQuery)\"]\n    end\n    \n    subgraph \"Agent Layer\"\n        Agent[Agent Orchestrator]\n        Planner[Plan Generator]\n        Executor[Plan Executor]\n    end\n    \n    User --> Agent\n    User --> Map\n    User --> Filter\n    \n    Agent --> Planner\n    Agent --> Executor\n    Agent --> LLM\n    \n    Map --> LLM\n    Filter --> LLM\n    \n    Map --> Dask\n    Map --> Ibis\n    Filter --> Dask\n    Filter --> Ibis\n    \n    LLM --> RateLimits\n```\n\n## LLM Layer\n\nThe LLM layer provides a unified interface for interacting with various LLM providers while handling rate limiting and token counting.\n\n### Class Hierarchy\n\n```mermaid\ngraph TD\n    LLM[LLM Base Class] --> Ollama[Ollama]\n    LLM --> OpenAI[OpenAI]\n    LLM --> VLLM[VLLM]\n    LLM --> Azure[Azure]\n```\n\n### Base LLM Class\n\nThe `LLM` class serves as the foundation for all LLM providers. 
It handles model naming, rate limit configuration, and request batching.\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `model_name` | `str` | Model identifier in format `provider/model` (e.g., `openai/gpt-3.5-turbo`) |\n| `tpm` | `int` | Tokens per minute limit |\n| `rpm` | `int` | Requests per minute limit |\n| `api_key` | `str` | API authentication key |\n| `api_base` | `str` | Base URL for API endpoint |\n\n资料来源：[datatune/llm/llm.py:70-95]()\n\n### Provider Implementations\n\n**OpenAI Provider**\n\n```python\nclass OpenAI(LLM):\n    def __init__(\n        self, model_name: str = \"gpt-3.5-turbo\", api_key: Optional[str] = None, **kwargs\n    ):\n        kwargs.update({\"api_key\": api_key})\n        super().__init__(model_name=f\"openai/{model_name}\", **kwargs)\n```\n\n资料来源：[datatune/llm/llm.py:96-103]()\n\n**Ollama Provider (Local)**\n\n```python\nclass Ollama(LLM):\n    def __init__(\n        self, model_name=\"gemma3:4b\", api_base=\"http://localhost:11434\", **kwargs\n    ) -> None:\n        super().__init__(\n            model_name=f\"ollama_chat/{model_name}\", api_base=api_base, **kwargs\n        )\n```\n\n资料来源：[datatune/llm/llm.py:81-87]()\n\n**VLLM Provider**\n\nThe VLLM provider automatically retrieves the `max_model_len` from the API and configures token limits accordingly.\n\n```python\nclass VLLM(LLM):\n    def __init__(self, model_name: str, api_base: str = \"http://localhost:8000/v1\", **kwargs):\n        import httpx\n        resp = httpx.get(f\"{api_base}/models\")\n        max_model_len = resp.json()[\"data\"][0][\"max_model_len\"]\n        kwargs.update({\"api_base\": api_base, \"api_key\": \"dummy\", \"max_tokens\": max_model_len})\n        super().__init__(model_name=f\"openai/{model_name}\", **kwargs)\n```\n\n资料来源：[datatune/llm/llm.py:105-118]()\n\n### Rate Limiting\n\nThe system maintains predefined rate limits for various models in `model_rate_limits.py`:\n\n| Model Family | TPM | RPM 
|\n|--------------|-----|-----|\n| `gpt-3.5-turbo` | 200,000 | 500 |\n| `gpt-4` | 10,000 | 500 |\n| `gpt-4-turbo` | 30,000 | 500 |\n| `gpt-4.1-mini/nano` | 200,000 | 500 |\n\n资料来源：[datatune/llm/model_rate_limits.py:1-50]()\n\n## Core Operations Layer\n\nThe core operations layer provides primitive transformations that can be applied to distributed data.\n\n### Map Operation\n\nThe `Map` operation creates new columns by applying LLM-based transformations to existing data.\n\n```mermaid\ngraph LR\n    A[Input DataFrame] --> B[Serialize Rows]\n    B --> C[Batch to LLM]\n    C --> D[Parse Responses]\n    D --> E[Extract New Columns]\n    E --> F[Output DataFrame]\n```\n\n**Input Format**: Rows are serialized to Python dictionaries with the following structure:\n\n```\nindex=<row_index>|{key1: value1, key2: value2, ...}\n```\n\n**Output Format**: LLM returns dictionaries with original keys plus new field keys:\n\n```\nindex=<row_index>|{key1: value1, key2: value2, new_field: new_value}\n```\n\n资料来源：[datatune/core/dask/map_dask.py:1-50]()\n\n**Key Features**:\n\n- Handles duplicate rows via canonical mapping\n- Supports batched API calls for efficiency\n- Falls back to canonical row values for duplicates\n\n### Filter Operation\n\nThe `Filter` operation removes rows that do not meet specified criteria using LLM-based evaluation.\n\n```mermaid\ngraph LR\n    A[Input DataFrame] --> B[Serialize Rows]\n    B --> C[LLM Decision]\n    C --> D{Keep Row?}\n    D -->|Yes| E[Include in Output]\n    D -->|No| F[Exclude Row]\n```\n\n**Decision Format**: Each row receives a `__filter__` key:\n\n```\nindex=<row_index>|{..., __filter__: True/False}\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:1-50]()\n\n### Deduplication\n\nThe deduplication system uses embeddings and FAISS for efficient similarity search:\n\n| Component | Description |\n|-----------|-------------|\n| `embedding_model` | Model for generating text embeddings |\n| `sim_threshold` | Minimum similarity score (default: 
0.90) |\n| `top_k` | Number of neighbors to search |\n| `hnsw_m` | HNSW index parameter for m |\n| `ef_search` | HNSW search parameter |\n\n**Workflow**:\n\n1. Embed column values in batches of 256\n2. Build FAISS HNSW index per partition\n3. Search for similar records above threshold\n4. Cluster duplicates with canonical ID\n\n资料来源：[datatune/core/deduplication.py:1-80]()\n\n## Agent Layer\n\nThe Agent layer provides high-level orchestration, automatically determining which operations to use and generating execution plans.\n\n```mermaid\ngraph TD\n    A[User Goal] --> B[Plan Generator]\n    B --> C[JSON Plan]\n    C --> D[Plan Executor]\n    D --> E[Step 1: Operation]\n    D --> F[Step 2: Operation]\n    D --> G[Step N: Operation]\n    E --> H[Finalize]\n    F --> H\n    G --> H\n    H --> I[Computed Result]\n```\n\n### Plan Structure\n\nPlans are generated as JSON arrays of steps:\n\n```json\n[\n  {\n    \"type\": \"primitive\",\n    \"operation\": \"map\",\n    \"params\": {\n      \"subprompt\": \"Extract category from industry\",\n      \"input_fields\": [\"Industry\"],\n      \"output_fields\": [\"Category\"]\n    }\n  },\n  {\n    \"type\": \"dask\",\n    \"operation\": \"add_column\",\n    \"params\": {\n      \"new_column\": \"Year\",\n      \"expression\": \"df['Date'].dt.year\"\n    }\n  }\n]\n```\n\n资料来源：[datatune/agent/agent.py:80-120]()\n\n### Step Types\n\n| Type | Description | Operations |\n|------|-------------|------------|\n| `primitive` | LLM-based transformations | `map`, `filter` |\n| `dask` | Native Dask operations | `add_column`, `group_by`, `apply_function`, `rename_columns`, `astype_column` |\n\n### Template System\n\nTemplates define how operations are translated to executable code:\n\n```python\nTEMPLATE = {\n    \"primitive\": {\n        \"Map\": \"df = dt.Map(...)(self.llm, df)\",\n        \"Filter\": \"df = dt.Filter(...)(self.llm, df)\"\n    },\n    \"dask\": {\n        \"add_column\": \"df = df.assign({new_column}=lambda x: 
{expression})\",\n        \"group_by_agg\": \"df = df.groupby({group_columns}).agg({aggregations})\"\n    }\n}\n```\n\n资料来源：[datatune/agent/agent.py:40-70]()\n\n## Data Backend Support\n\n### Dask Integration\n\nDatatune primarily uses Dask for distributed computing. Operations are executed partition-by-partition with lazy evaluation:\n\n```python\n# Example: Map operation on Dask DataFrame\ndf = dd.read_csv(\"data.csv\")\nmapped = dt.map(\n    prompt=\"Extract sentiment from text\",\n    output_fields=[\"sentiment\"],\n    input_fields=[\"text\"]\n)(llm, df)\n\n# Finalize and compute\nresult = dt.finalize(mapped).compute()\n```\n\n### Ibis Integration\n\nIbis provides support for multiple backends (DuckDB, PostgreSQL, BigQuery):\n\n```python\n# DuckDB example\nimport ibis\ncon = ibis.duckdb.connect(\"data.duckdb\")\ntable = con.table(\"my_table\")\n\n# Apply map operation\nmapped_table = dt.map(\n    prompt=\"Extract category\",\n    output_fields=[\"category\"],\n    input_fields=[\"description\"]\n)(llm, table)\n```\n\n资料来源：[datatune/core/ibis/map_ibis.py:1-50]()\n\n## Request Batching\n\nThe LLM layer implements intelligent batching to optimize API usage:\n\n```mermaid\ngraph TD\n    A[Input Rows] --> B[Calculate Token Budget]\n    B --> C{Within Limits?}\n    C -->|Yes| D[Batch All]\n    C -->|No| E[Split into Batches]\n    E --> F[Execute Batch 1]\n    E --> G[Execute Batch 2]\n    F --> H[Merge Results]\n    G --> H\n    D --> H\n```\n\n**Token Calculation**:\n\n- Prefix/suffix tokens are calculated via `token_counter`\n- `nrows_per_api_call` determines optimal batch sizes\n- Rate limits (TPM/RPM) are enforced per model\n\n资料来源：[datatune/llm/llm.py:30-70]()\n\n## Error Handling\n\nThe system implements robust error handling at multiple levels:\n\n| Layer | Error Handling |\n|-------|----------------|\n| **Agent** | Catches execution errors, returns step number and error message |\n| **Core Operations** | Gracefully handles malformed LLM responses with 
fallback to empty dict |\n| **LLM** | Retries with exponential backoff on transient failures |\n\n```python\ndef parse_filter_output(output: Union[str, Exception], err: bool = True) -> Optional[bool]:\n    try:\n        d = ast.literal_eval(str(output).strip())\n        if d:\n            last_key = list(d.keys())[-1]\n            flag = d[last_key]\n            return flag\n    except Exception:\n        return None\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:50-65]()\n\n## Configuration Options\n\n### LLM Configuration\n\n| Option | Default | Description |\n|--------|---------|-------------|\n| `model_name` | `gpt-3.5-turbo` | Model identifier |\n| `tpm` | Model-specific | Tokens per minute limit |\n| `rpm` | Model-specific | Requests per minute limit |\n| `api_key` | `None` | API authentication |\n| `api_base` | Provider-specific | API endpoint URL |\n\n### Operation Configuration\n\n| Option | Description |\n|--------|-------------|\n| `prompt` | Natural language instruction |\n| `input_fields` | Columns to use as input |\n| `output_fields` | New columns to create (Map only) |\n| `optimized` | Enable batching optimization |\n\n## Summary\n\nDatatune's system architecture provides:\n\n1. **Provider Abstraction**: Unified interface across OpenAI, Ollama, VLLM, and Azure\n2. **Rate Limiting**: Automatic enforcement of TPM/RPM limits per model\n3. **Distributed Execution**: Partition-based processing for large datasets\n4. **Multi-Backend Support**: Native support for Dask and Ibis backends\n5. **Intelligent Planning**: Agent-based orchestration for complex multi-step transformations\n6. **Semantic Operations**: LLM-powered Map and Filter for text understanding\n7. 
**Deduplication**: Embedding-based similarity search with FAISS\n\n---\n\n<a id='page-map-operations'></a>\n\n## Map Operations\n\n### 相关页面\n\n相关主题：[Filter Operations](#page-filter-operations), [Reduce Operations](#page-reduce-operations), [Dask Backend](#page-dask-backend)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/core/map.py](https://github.com/vitalops/datatune/blob/main/datatune/core/map.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- [datatune/core/ibis/map_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/map_ibis.py)\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- [datatune/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/__init__.py)\n- [datatune/agent/agent.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/agent.py)\n</details>\n\n# Map Operations\n\nMap Operations in Datatune enable LLM-powered column transformations on distributed dataframes. Users describe desired transformations in natural language, and the system generates new columns by leveraging large language models to process each row semantically.\n\n## Overview\n\nThe Map operation is a core primitive in Datatune that bridges natural language instructions with data transformations. It accepts a user-defined prompt describing what new columns to create, identifies relevant input columns, and produces one or more output columns populated by LLM inference. 
资料来源：[datatune/__init__.py:3]()\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n```\n\n资料来源：[README.md:18-29]()\n\n## Architecture\n\nThe Map system is designed with a pluggable backend architecture, supporting both Dask and Ibis execution contexts. Each backend implements the same logical interface while adapting to its specific dataframe API.\n\n### Component Flow\n\n```mermaid\ngraph TD\n    A[User Prompt + Input/Output Fields] --> B[map Function Entry Point]\n    B --> C{Backend Type}\n    C -->|Dask| D[map_dask.py]\n    C -->|Ibis| E[map_ibis.py]\n    D --> F[LLM Batch Processor]\n    E --> F\n    F --> G[Parse LLM Output]\n    G --> H[Merge Results to DataFrame]\n    H --> I[Output DataFrame with New Columns]\n```\n\n### Backend Implementations\n\n| Backend | File | Purpose |\n|---------|------|---------|\n| Dask | `datatune/core/dask/map_dask.py` | Distributed DataFrame operations using Dask |\n| Ibis | `datatune/core/ibis/map_ibis.py` | SQL-like backend supporting DuckDB, PostgreSQL, BigQuery |\n\n## API Reference\n\n### Function Signature\n\n```python\ndef map(\n    prompt: str,\n    output_fields: List[str],\n    input_fields: Optional[List[str]] = None,\n    **kwargs\n) -> Callable\n```\n\n### Parameters\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `prompt` | `str` | Yes | Natural language description of the transformation to apply |\n| `output_fields` | `List[str]` | Yes | Names of columns to create in the output |\n| `input_fields` | `List[str]` | Optional | Input columns to pass to the LLM; if omitted, all columns are used 
|\n\n资料来源：[datatune/core/dask/map_dask.py:1-50]()\n\n### Return Value\n\nReturns a callable that accepts `(llm, dataframe)` and returns a transformed dataframe with the new columns appended.\n\n## Dask Backend Implementation\n\n### Data Flow\n\n```mermaid\ngraph LR\n    A[Partitioned DataFrame] --> B[Serialize Input Columns]\n    B --> C[Batch Rows to LLM]\n    C --> D[LLM Inference]\n    D --> E[Parse Dictionary Output]\n    E --> F[Assign to Output Columns]\n    F --> G[Merge Canonical Results to Duplicates]\n    G --> H[Return Partitioned DataFrame]\n```\n\n### Prompt Construction\n\nThe Dask implementation constructs prompts using a prefix-suffix pattern to ensure consistent LLM responses.\n\n**Prefix Structure:**\n```\nUse the following context to create new columns from existing columns.\n```\n\n资料来源：[datatune/core/dask/map_dask.py:10-15]()\n\n**Suffix Structure:**\n```python\nsuffix = (\n    f\"{os.linesep}{os.linesep}\"\n    \"Your response MUST be the entire input record as a valid Python dictionary in the format\"\n    \"'index=<row_index>|{key1: value1, key2: value2, ...}'  with added keys of expected new fields if any.\"\n     \n    \"ALWAYS START YOUR RESPONSE WITH 'index=<row_index>|' WHERE <row_index> IS THE INDEX OF THE ROW.\" \\\n    \"IF A VALUE FOR A COLUMN DOES NOT EXIST SET IT TO null\" \\\n    \"'index=<row_index>|{key1: None, key2: value2, ...}'\"\n)\n```\n\n资料来源：[datatune/core/dask/map_dask.py:22-30]()\n\n### Duplicate Handling\n\nThe Dask Map implementation includes sophisticated duplicate handling for semantic deduplication scenarios. 
When clusters of duplicate rows exist, only the canonical row is sent to the LLM, and results are propagated to duplicates.\n\n```python\ndup_to_canon = {\n    dup: c[\"canonical_id\"]\n    for c in clusters\n    for dup in c[\"duplicate_ids\"]\n}\n\ncanonical_idx = input_series.index.difference(dup_to_canon.keys())\ncanonical_input = input_series.loc[canonical_idx]\n\nllm_out = llm(canonical_input, prefix, prompt, suffix, optimized=True)\n\ndf.loc[canonical_idx, llm_output_column] = llm_out\n\nfor dup, canon in dup_to_canon.items():\n    df.loc[dup, llm_output_column] = df.loc[canon, llm_output_column]\n```\n\n资料来源：[datatune/core/dask/map_dask.py:35-50]()\n\n### Output Parsing\n\nThe LLM output is parsed using `parse_llm_output`, which converts the string response to a Python dictionary.\n\n```python\ndef parse_llm_output(llm_output: Union[str, Exception]) -> Union[Dict, Exception]:\n    \"\"\"\n    Parses the LLM output string into a Python dictionary.\n    \"\"\"\n```\n\n资料来源：[datatune/core/dask/map_dask.py:75-85]()\n\n## Ibis Backend Implementation\n\n### Data Flow\n\n```mermaid\ngraph LR\n    A[Ibis Table] --> B[Add Row ID Column]\n    B --> C[Execute to Local Data]\n    C --> D[Convert to List]\n    D --> E[Batch to LLM]\n    E --> F[Parse Results]\n    F --> G[Create MemTable]\n    G --> H[Join Back to Original Table]\n```\n\n### Row Indexing\n\nIbis operations require explicit row indexing since SQL tables don't maintain native pandas-like indices.\n\n```python\nindexed_table = table.mutate(_ROW_ID_=ibis.row_number().cast(\"int64\"))\n\nlocal_data = indexed_table.select(\"_ROW_ID_\", input_col).execute()\n    \ninput_list = local_data[input_col].tolist()\n```\n\n资料来源：[datatune/core/ibis/map_ibis.py:25-32]()\n\n### Result Processing\n\nRaw LLM outputs are parsed and converted to JSON strings for storage:\n\n```python\nprocessed_results = []\nfor res in raw_results:\n    try:\n        py_dict = ast.literal_eval(str(res).strip())\n        
processed_results.append(json.dumps(py_dict))\n    except Exception:\n        processed_results.append(\"{}\")\n```\n\n资料来源：[datatune/core/ibis/map_ibis.py:45-52]()\n\n## LLM Integration\n\n### Supported LLM Providers\n\n| Provider | Class | Default Model |\n|----------|-------|---------------|\n| OpenAI | `OpenAI` | `gpt-3.5-turbo` |\n| Ollama | `Ollama` | `gemma3:4b` |\n| Azure | `Azure` | Configurable |\n| vLLM | `VLLM` | Configurable |\n\n资料来源：[datatune/llm/llm.py:1-60]()\n\n### Batching and Rate Limiting\n\nThe LLM layer implements token and request rate limiting based on model-specific constraints defined in `model_rate_limits.py`.\n\n```python\nmodel_rate_limits = {\n    \"gpt-3.5-turbo\": {\n        \"tpm\": 200_000,\n        \"rpm\": 500,\n    },\n    \"gpt-4\": {\n        \"tpm\": 10_000,\n        \"rpm\": 500,\n    },\n    \"gpt-4o\": {\n        \"tpm\": 30_000,\n        \"rpm\": 500,\n    },\n}\n```\n\n资料来源：[datatune/llm/model_rate_limits.py:1-20]()\n\n### Batch Processing\n\nThe LLM batches multiple rows into single API calls for efficiency:\n\n```python\nfor i, prompt in enumerate(input_rows):\n    q  # Batch construction continues...\n```\n\n资料来源：[datatune/llm/llm.py:30-45]()\n\n## Agent Integration\n\nThe Datatune Agent can automatically generate Map operations as part of multi-step data transformation pipelines.\n\n### Agent System Prompt\n\n```python\nclass Agent(ABC):\n    system_prompt: str = \"\"\"You are Datatune Agent, a powerful assistant designed to help users with data processing tasks.\n    You are capable of generating python code to perform various operations on data. 
Apart from python builtins, you have the following libraries avaiable in your run time:\n    - pandas\n    - numpy\n    - dask\n\n    In addition to these, you also have access to the datatune libarary, which provides functionality for processing data using LLMs.\n    Map Example:\n    ```python\n    import datatune as dt\n    import dask.dataframe as dd\n    df = dd.read_csv(\"path/to/data.csv\")\n    map = dt.Map(prompt=\"Your prompt here\")\n    llm = dt.LLM(model_name=\"gpt-3.5-turbo\")\n    mapped_df = map(llm, df)\n    mapped_df = dt.finalize(mapped_df)\n    ```\n    \"\"\"\n```\n\n资料来源：[datatune/agent/__init__.py:1-30]()\n\n### Plan Generation\n\nWhen the Agent generates a plan requiring Map operations, it creates steps with the following structure:\n\n```python\n{\n    \"type\": \"primitive\",\n    \"operation\": \"map\",\n    \"params\": {\n        \"subprompt\": \"Extract category and sub-category from industry\",\n        \"input_fields\": [\"Industry\"],\n        \"output_fields\": [\"Category\",\"Sub-Category\"]\n    },\n}\n```\n\n资料来源：[datatune/agent/agent.py:50-65]()\n\n## Usage Examples\n\n### Basic Column Extraction\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-4\")\ndf = dd.read_csv(\"customers.csv\")\n\n# Extract sentiment and category from review text\nmapped = dt.map(\n    prompt=\"Analyze the customer feedback and extract sentiment (positive/negative/neutral) and main topic\",\n    output_fields=[\"Sentiment\", \"Topic\"],\n    input_fields=[\"CustomerFeedback\"]\n)(llm, df)\n\nresult = dt.finalize(mapped)\nresult.compute().to_csv(\"enriched_customers.csv\")\n```\n\n### Multi-Column Generation\n\n```python\n# Generate multiple derived columns in a single Map operation\nmapped = dt.map(\n    prompt=\"Parse the product description to identify: 1) the material, 2) primary color, 3) target demographic\",\n    output_fields=[\"Material\", \"Color\", 
\"Demographic\"],\n    input_fields=[\"ProductDescription\", \"ProductName\"]\n)(llm, df)\n```\n\n### With Agent\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\n\nllm = OpenAI(model_name=\"gpt-4\")\nagent = dt.Agent(llm)\n\ndf = dd.read_csv(\"data.csv\")\n# Agent automatically determines when to use Map operations\nresult = agent.do(\"Add ProfitMargin column and categorize by revenue tier\", df)\n```\n\n资料来源：[README.md:35-45]()\n\n## Best Practices\n\n### Input Field Selection\n\n| Practice | Rationale |\n|----------|-----------|\n| Provide specific input fields | Reduces token usage and improves accuracy |\n| Include context-rich columns | Helps LLM understand the domain |\n| Avoid overly large inputs | Prevents token limit issues |\n\n### Prompt Engineering\n\n- Be explicit about the expected output format in output field names\n- Include examples in prompts for complex transformations\n- Break complex extractions into multiple Map operations when needed\n\n### Performance Considerations\n\n| Factor | Impact | Recommendation |\n|--------|--------|----------------|\n| Batch size | Throughput vs. latency | Use default batching for most cases |\n| Model selection | Quality vs. 
speed | Use `gpt-3.5-turbo` for bulk, `gpt-4` for complex tasks |\n| Partition count | Parallelism | Match to cluster size for Dask operations |\n\n## See Also\n\n- [Filter Operations](#page-filter-operations) - Row-level filtering using natural language\n- [Agent](Agent.md) - Automatic operation selection and chaining\n- [Deduplication](Deduplication.md) - Semantic deduplication with Map integration\n- [Supported LLMs](LLMs.md) - Complete list of supported providers\n\n---\n\n<a id='page-filter-operations'></a>\n\n## Filter Operations\n\n### 相关页面\n\n相关主题：[Map Operations](#page-map-operations), [Reduce Operations](#page-reduce-operations), [Ibis Backend](#page-ibis-backend)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/core/filter.py](https://github.com/vitalops/datatune/blob/main/datatune/core/filter.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/ibis/filter_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/filter_ibis.py)\n- [datatune/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/__init__.py)\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n</details>\n\n# Filter Operations\n\nFilter Operations in Datatune enable intelligent, LLM-powered row filtering based on natural language criteria. Instead of writing explicit boolean conditions, users describe what rows to keep in plain English, and the LLM determines which records match the specified criteria.\n\n## Overview\n\nThe Filter module provides a declarative interface for semantic row-level filtering of data. 
It supports multiple backends (Dask DataFrames and Ibis tables) and uses LLMs to make filtering decisions based on natural language prompts.\n\nKey capabilities:\n- Natural language filter criteria specification\n- Multi-backend support (Dask, Ibis)\n- Semantic deduplication integration\n- Parallelized LLM inference\n- Structured output parsing\n\n## Architecture\n\nThe Filter system follows a dispatcher pattern that routes to backend-specific implementations based on the input data type.\n\n```mermaid\ngraph TD\n    A[User calls filter prompt, input_fields] --> B[datatune.filter function]\n    B --> C{Data Type Detection}\n    C -->|dask.dataframe.DataFrame| D[_filter_dask]\n    C -->|ibis.Table| E[_filter_ibis]\n    D --> F[LLM Batch Processing]\n    E --> F\n    F --> G[Parse Filter Output]\n    G --> H[Apply Boolean Mask]\n    H --> I[Filtered DataFrame/Table]\n```\n\n### Component Overview\n\n| Component | File | Responsibility |\n|-----------|------|----------------|\n| `filter()` | `datatune/core/filter.py` | Main entry point, type dispatch |\n| `_filter_dask()` | `datatune/core/dask/filter_dask.py` | Dask DataFrame filtering |\n| `_filter_ibis()` | `datatune/core/ibis/filter_ibis.py` | Ibis table filtering |\n| LLM Integration | `datatune/llm/llm.py` | Batch inference and token management |\n\n## API Reference\n\n### `dt.filter()`\n\nMain filter function exported from `datatune`.\n\n```python\ndef filter(*, prompt, input_fields=None, clusters=None):\n    def apply(llm, data):\n        # Implementation dispatch\n        ...\n    return apply\n```\n\n#### Parameters\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `prompt` | `str` | Yes | Natural language description of filter criteria |\n| `input_fields` | `List[str]` | No | Columns to use for filtering decisions |\n| `clusters` | `List[Dict]` | No | Semantic deduplication clusters for propagating filter decisions |\n\n#### Returns\n\nReturns a callable that 
accepts `(llm, data)` and returns a filtered DataFrame or Table.\n\n## Dask Implementation\n\nThe Dask backend processes rows in parallel across partitions, leveraging the Dask execution graph for distributed computation.\n\n### Source Files\n\n- Main implementation: `datatune/core/dask/filter_dask.py`\n- Dispatcher: `datatune/core/filter.py`\n\n### Workflow\n\n```mermaid\ngraph LR\n    A[Input DataFrame] --> B[Serialize Input Columns]\n    B --> C[Handle Clusters]\n    C --> D[Extract Canonical Indices]\n    D --> E[LLM Batch Inference]\n    E --> F[Parse Boolean Output]\n    F --> G[Propagate to Duplicates]\n    G --> H[Apply Boolean Mask]\n```\n\n### Key Functions\n\n#### `_filter_dask()`\n\nCore filtering function for Dask DataFrames:\n\n```python\ndef _filter_dask(\n    prompt: str,\n    input_fields: Optional[List[str]] = None,\n    clusters: Optional[List[Dict]] = None,\n):\n    # Validates inputs, constructs prompts, invokes LLM\n```\n\n#### `parse_filter_output()`\n\nParses LLM output string into boolean values:\n\n```python\ndef parse_filter_output(\n    output: Union[str, Exception], \n    err: bool = True\n) -> Optional[bool]:\n```\n\n## Ibis Implementation\n\nThe Ibis backend executes filtering operations at the database level, enabling pushdown to various SQL backends (DuckDB, PostgreSQL, BigQuery, etc.).\n\n### Source Files\n\n- Implementation: `datatune/core/ibis/filter_ibis.py`\n\n### Workflow\n\n```mermaid\ngraph TD\n    A[Input Table] --> B[Add Row ID Column]\n    B --> C[Select Input Columns]\n    C --> D[Execute to Local]\n    D --> E[LLM Batch Inference]\n    E --> F[Parse Filter Results]\n    F --> G[Build Boolean Filter]\n    G --> H[Filter Original Table]\n    H --> I[Remove Row ID Column]\n```\n\n### Key Functions\n\n#### `_filter_ibis()`\n\nCore filtering function for Ibis tables:\n\n```python\ndef _filter_ibis(\n    table: ibis.Table,\n    prompt: str,\n    input_fields: Optional[List[str]] = None,\n):\n    indexed_table = 
table.mutate(_ROW_ID_=ibis.row_number().cast(\"int64\"))\n```\n\nUses `ibis.row_number()` to assign sequential row IDs that match LLM response indexing. 资料来源：[datatune/core/ibis/filter_ibis.py:28]()\n\n## LLM Prompt Format\n\nThe Filter module constructs specialized prompts to guide LLM responses for filtering decisions.\n\n### Prompt Structure\n\n```\nFILTERING CRITERIA:\n<prompt>\n\n[input record dictionary]\n\nDECISION: Your response MUST be the entire input record as Python dictionary in the format:\nindex=<row_index>|{key1: value1, ...}<endofrow> with added key called '__filter__' \nwith value either True to KEEP the record or False to REMOVE it.\n```\n\n### Output Format\n\nThe LLM must return records with an added `__filter__` key:\n\n```python\nindex=0|{'Name': 'Acme Corp', 'Country': 'USA', '__filter__': True}<endofrow>\nindex=1|{'Name': 'Bad Inc', 'Country': 'UK', '__filter__': False}<endofrow>\n```\n\n#### Response Requirements\n\n| Requirement | Description |\n|-------------|-------------|\n| Index prefix | Each response starts with `index=<row_index>\\|` |\n| Complete record | Include all original keys plus `__filter__` |\n| Boolean value | `__filter__` must be `True` (keep) or `False` (remove) |\n| Missing values | Set to `None` if column value does not exist |\n\n资料来源：[datatune/core/dask/filter_dask.py:14-19]()\n\n## Semantic Deduplication Integration\n\nWhen `clusters` parameter is provided, filter decisions propagate from canonical records to their duplicates.\n\n### Cluster Structure\n\n```python\nclusters = [\n    {\n        \"canonical_id\": 5,\n        \"duplicate_ids\": [12, 45, 78]\n    },\n    ...\n]\n```\n\n### Propagation Logic\n\n```mermaid\ngraph TD\n    A[Clusters Input] --> B[Build dup_to_canon Map]\n    B --> C[Extract Canonical Indices]\n    C --> D[Filter Only Canonical Rows]\n    D --> E[LLM Inference on Canonical]\n    E --> F[Propagate Decisions to Duplicates]\n    F --> G[Merge Results]\n```\n\nThe mapping 
builds:\n```python\ndup_to_canon = {\n    dup: c[\"canonical_id\"]\n    for c in clusters\n    for dup in c[\"duplicate_ids\"]\n}\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:41-46]()\n\n## Usage Examples\n\n### Basic Dask Filtering\n\n```python\nimport datatune as dt\nimport dask.dataframe as dd\nfrom datatune.llm.llm import OpenAI\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n\n# Keep only electronics products\nfiltered = dt.filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Name\", \"Description\"]\n)(llm, df)\n\nresult = dt.finalize(filtered)\n```\n\n### Ibis Table Filtering\n\n```python\nimport datatune as dt\nimport ibis\n\ncon = ibis.duckdb.connect(\"data.duckdb\")\ntable = con.table(\"customers\")\n\n# Filter using natural language\nfiltered = dt.filter(\n    prompt=\"Keep only customers from African countries\",\n    input_fields=[\"Country\", \"Region\"]\n)(llm, table)\n```\n\n### With Semantic Deduplication\n\n```python\n# First deduplicate to find clusters\ndeduplicator = dt.SemanticDeduplicator(llm=llm)\nclusters = deduplicator.find_duplicates(df, columns=[\"Name\", \"Address\"])\n\n# Filter with cluster awareness\nfiltered = dt.filter(\n    prompt=\"Keep only verified organizations\",\n    input_fields=[\"Status\", \"VerificationDate\"],\n    clusters=clusters\n)(llm, df)\n```\n\n## Error Handling\n\nThe `parse_filter_output()` function handles various error cases:\n\n| Error Type | Handling | Return Value |\n|------------|----------|--------------|\n| Parse failure | Returns `None` if `err=False` | `None` |\n| Exception | Returns exception if `err=True` | Original exception |\n| Empty output | Returns `None` | `None` |\n| Malformed dict | Falls back to `None` | `None` |\n\n资料来源：[datatune/core/dask/filter_dask.py:64-79]()\n\n## Rate Limiting\n\nFilter operations respect LLM rate limits defined in `datatune/llm/model_rate_limits.py`:\n\n| Model | TPM (tokens/min) | RPM (requests/min) 
|\n|-------|------------------|---------------------|\n| gpt-3.5-turbo | 200,000 | 500 |\n| gpt-4 | 10,000 | 500 |\n| gpt-4-turbo | 30,000 | 500 |\n\nThe batching system automatically respects these limits during batch LLM inference.\n\n## Performance Considerations\n\n### Batching Strategy\n\nRows are batched for LLM inference to optimize throughput:\n\n```python\n# From datatune/llm/llm.py\nfor i, prompt in enumerate(input_rows):\n    # Batch prompts based on token limits\n    q  # Accumulate until batch size reached\n```\n\n### Partition Parallelism (Dask)\n\nDask DataFrames process partitions in parallel:\n\n```mermaid\ngraph LR\n    A[Partition 0] --> E[LLM Batch]\n    B[Partition 1] --> F[LLM Batch]\n    C[Partition 2] --> G[LLM Batch]\n    E --> H[Results Merged]\n    F --> H\n    G --> H\n```\n\n### Backend Selection\n\n| Backend | Best For | Limitations |\n|---------|----------|-------------|\n| Dask | Large datasets, local processing | Memory-bound |\n| Ibis | Database pushdown, SQL backends | Requires database connection |\n\n## See Also\n\n- [Map Operations](#page-map-operations) - Creating new columns via LLM\n- [Semantic Deduplication](./SemanticDeduplication.md) - Finding duplicate records\n- [Agent](./Agent.md) - Automatic operation planning\n- [LLM Configuration](../llm/llm.md) - LLM setup and backends\n\n---\n\n<a id='page-reduce-operations'></a>\n\n## Reduce Operations\n\n### 相关页面\n\n相关主题：[Map Operations](#page-map-operations), [Filter Operations](#page-filter-operations)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/core/reduce.py](https://github.com/vitalops/datatune/blob/main/datatune/core/reduce.py)\n- [datatune/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/__init__.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- 
[datatune/core/deduplication.py](https://github.com/vitalops/datatune/blob/main/datatune/core/deduplication.py)\n</details>\n\n# Reduce Operations\n\n## Overview\n\nReduce Operations in datatune provide a mechanism for aggregating and condensing data using configurable action handlers. The reduce system follows a plugin-based architecture where specific aggregation or reduction behaviors are registered as actions and invoked through a unified interface.\n\nThe core purpose of reduce operations is to transform a DataFrame into aggregated results by applying registered reduction actions. This enables users to perform complex aggregation tasks that go beyond standard group-by operations, leveraging LLM-powered semantic understanding when needed.\n\n资料来源：[datatune/core/reduce.py:1-22]()\n\n## Architecture\n\n### Action Registry Pattern\n\nThe reduce system implements a decorator-based registry pattern for action registration:\n\n```mermaid\ngraph TD\n    A[User calls reduce df action] --> B[reduce function invoked]\n    B --> C[get_action looks up registry]\n    C --> D{Action found?}\n    D -->|Yes| E[Instantiate action class]\n    D -->|No| F[Raise ValueError]\n    E --> G[Execute reduction on DataFrame]\n    G --> H[Return aggregated result]\n```\n\nThe `_ACTIONS` dictionary serves as a central registry mapping action names to their implementation classes. 
The `register_action` decorator allows new actions to be dynamically registered without modifying core logic.\n\n资料来源：[datatune/core/reduce.py:1-22]()\n\n### Class Diagram\n\n```mermaid\nclassDiagram\n    class _ACTIONS {\n        Dict[str, Type]\n    }\n    \n    class register_action {\n        +decorator(name: str)\n    }\n    \n    class get_action {\n        +function(name: str) Type\n    }\n    \n    class reduce {\n        +function(df, action, **kwargs)\n    }\n    \n    register_action ..> _ACTIONS : registers\n    get_action ..> _ACTIONS : queries\n    reduce ..> get_action : uses\n```\n\n## API Reference\n\n### Main Function\n\n```python\ndef reduce(df, *, action: str, **kwargs):\n    cls = get_action(action)\n    reducer = cls(**kwargs)   \n    return reducer(df)\n```\n\n#### Parameters\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `df` | `dask.dataframe.DataFrame` | Yes | The input Dask DataFrame to reduce |\n| `action` | `str` | Yes | The name of the registered reduction action to execute |\n| `**kwargs` | Various | No | Additional keyword arguments passed to the action constructor |\n\n#### Returns\n\n| Return Type | Description |\n|-------------|-------------|\n| Any | Returns the result of the specific reduction action (varies by registered action) |\n\n#### Exceptions\n\n| Exception | Condition |\n|-----------|------------|\n| `ValueError` | When the specified action name is not registered |\n\n资料来源：[datatune/core/reduce.py:19-22]()\n\n### Register Action Decorator\n\n```python\ndef register_action(name):\n    def decorator(cls):\n        _ACTIONS[name] = cls\n        return cls\n    return decorator\n```\n\n#### Usage Pattern\n\n```python\n@register_action(\"my_reducer\")\nclass MyReducer:\n    def __init__(self, **kwargs):\n        # Initialize reducer configuration\n        pass\n    \n    def __call__(self, df):\n        # Perform reduction\n        return result\n```\n\n### Get Action 
Function\n\n```python\ndef get_action(name):\n    try:\n        return _ACTIONS[name]\n    except KeyError:\n        raise ValueError(f\"Unknown action: {name}\")\n```\n\n## Workflow Execution\n\n### Execution Flow\n\n```mermaid\ngraph TD\n    A[Start] --> B[Import reduce module]\n    B --> C[Call reduce df action]\n    C --> D[get_action retrieves class]\n    D --> E[Instantiate with kwargs]\n    E --> F[Call reducer on DataFrame]\n    F --> G[Return result]\n    G --> H[End]\n```\n\n### Integration with Datatune Core\n\nThe reduce operation integrates with other datatune components through the public API:\n\n```python\nimport datatune as dt\n\n# reduce is exported from datatune namespace\nresult = dt.reduce(df, action=\"specific_action\", param=value)\n```\n\n资料来源：[datatune/__init__.py:1-7]()\n\n## Registered Actions\n\nBased on the repository structure, reduce operations support extensibility through custom registered actions. The system is designed to work with Dask DataFrames and can integrate with:\n\n- **SemanticDeduplicator**: For deduplication using embedding-based similarity matching\n- **Custom aggregations**: User-defined reduction logic via the decorator pattern\n\n### SemanticDeduplicator Integration\n\nWhile primarily a deduplication tool, the `SemanticDeduplicator` class in `datatune/core/deduplication.py` demonstrates how reduce-style operations work:\n\n```python\nclass SemanticDeduplicator:\n    def __init__(\n        self,\n        llm,\n        embedding_model: str = \"text-embedding-3-small\",\n        sim_threshold: float = 0.90,\n        top_k: int = 50,\n        hnsw_m: int = 32,\n        ef_search: int = 64,\n        return_df: bool = False,\n    ):\n        # Configuration initialization\n```\n\nThe deduplicator processes data through embedding generation, FAISS index building, and similarity search phases, demonstrating how reduce operations handle complex multi-stage 
transformations.\n\n资料来源：[datatune/core/deduplication.py:1-150]()\n\n## Extension Points\n\n### Creating Custom Reduce Actions\n\nTo create a custom reduction action:\n\n1. Define a class with appropriate `__init__` and `__call__` methods\n2. Decorate with `@register_action(\"action_name\")`\n3. The class should accept `**kwargs` for flexible configuration\n4. Implement `__call__(self, df)` to receive the DataFrame and return the result\n\n```python\nfrom datatune.core.reduce import register_action\n\n@register_action(\"custom_aggregate\")\nclass CustomAggregateReducer:\n    def __init__(self, group_by_column, agg_column, operation=\"sum\", **kwargs):\n        self.group_by_column = group_by_column\n        self.agg_column = agg_column\n        self.operation = operation\n    \n    def __call__(self, df):\n        # Custom reduction logic\n        return result_df\n```\n\n### Best Practices\n\n| Practice | Rationale |\n|----------|-----------|\n| Use descriptive action names | Ensures clear identification in `get_action` calls |\n| Accept `**kwargs` in `__init__` | Maintains compatibility with the reduce function signature |\n| Return consistent types | Enables predictable downstream processing |\n| Handle empty DataFrames | Prevents errors during edge case processing |\n\n## Related Components\n\n| Component | File | Relationship |\n|-----------|------|--------------|\n| Filter | `datatune/core/dask/filter_dask.py` | Similar pattern for row-level operations |\n| Map | `datatune/core/dask/map_dask.py` | Column transformation counterpart |\n| Finalize | `datatune/core/dask/op.py` | Completes lazy Dask computations |\n| Agent | `datatune/agent/agent.py` | Orchestrates multi-step transformations |\n\n资料来源：[datatune/core/dask/filter_dask.py:1-50]()\n资料来源：[datatune/core/dask/map_dask.py:1-50]()\n\n## Summary\n\nReduce Operations provide a flexible, extensible mechanism for aggregating and transforming data in datatune. 
The action registry pattern enables:\n\n- **Dynamic action registration** via the `@register_action` decorator\n- **Centralized action lookup** through the `get_action` function\n- **Unified API** through the `reduce` function interface\n- **Custom extensibility** for domain-specific reduction logic\n\nThe architecture follows proven design patterns that separate action registration from execution, allowing the core system to remain stable while supporting diverse reduction behaviors through plugin-style extensions.\n\n---\n\n<a id='page-dask-backend'></a>\n\n## Dask Backend\n\n### 相关页面\n\n相关主题：[Ibis Backend](#page-ibis-backend), [System Architecture](#page-architecture)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n- [datatune/core/dask/op.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/op.py)\n- [datatune/core/dask/constants.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/constants.py)\n- [datatune/core/map.py](https://github.com/vitalops/datatune/blob/main/datatune/core/map.py)\n- [datatune/core/filter.py](https://github.com/vitalops/datatune/blob/main/datatune/core/filter.py)\n</details>\n\n# Dask Backend\n\nThe Dask Backend is the primary data processing engine in datatune, enabling large-scale, distributed data transformations using Large Language Models (LLMs). It extends the popular Dask library with semantic understanding capabilities, allowing users to perform complex data operations through natural language prompts while maintaining Dask's efficient parallel and out-of-core computation model.\n\n## Architecture Overview\n\nThe Dask Backend follows a modular architecture with three main layers:\n\n1. 
**API Layer** (`datatune/core/map.py`, `datatune/core/filter.py`) - Entry points that dispatch to appropriate backends\n2. **Operation Layer** (`datatune/core/dask/`) - Core transformations for Dask DataFrames\n3. **LLM Integration Layer** - Handles communication with language models for semantic operations\n\n```mermaid\ngraph TD\n    A[User Data] --> B[Dask DataFrame]\n    B --> C{Operation Type}\n    C -->|Map| D[map_dask.py]\n    C -->|Filter| E[filter_dask.py]\n    D --> F[LLM API]\n    E --> F\n    F --> G[Parse & Validate]\n    G --> H[New Columns Added]\n    H --> I[Lazy Dask DataFrame]\n    I --> J[finalize]\n    J --> K[Computed Result]\n    \n    L[Clusters] -.->|Optional| D\n    L -.->|Optional| E\n```\n\n## Core Operations\n\n### Map Operation\n\nThe map operation uses an LLM to generate new columns from existing data through natural language understanding. It is designed for semantic extraction, classification, and content transformation tasks.\n\n**Source**: `datatune/core/dask/map_dask.py`\n\n#### How Map Works\n\n```mermaid\ngraph LR\n    A[Input DataFrame] --> B[Serialize Rows]\n    B --> C[Cluster Deduplication]\n    C --> D[Batch for LLM]\n    D --> E[LLM Processing]\n    E --> F[Parse Output]\n    F --> G[Expand to Columns]\n    G --> H[Copy to Duplicates]\n    H --> I[Output DataFrame]\n```\n\n#### Key Components\n\n| Component | Description |\n|-----------|-------------|\n| `serialized_input_column` | Temporary column storing row data as Python dictionaries |\n| `llm_output_column` | Temporary column storing raw LLM responses |\n| `clusters` | Optional list of duplicate clusters for deduplication |\n\n#### Processing Flow\n\n1. Each row is serialized to a dictionary format\n2. If clusters are provided, duplicate handling is applied - only canonical rows are sent to LLM\n3. LLM receives batched prompts with mapping instructions\n4. Responses are parsed and new columns are extracted\n5. 
Results are propagated back to duplicate rows\n\n**Key code snippet from map_dask.py**:\n```python\ndf.loc[canonical_idx, llm_output_column] = llm_out\n\nfor dup, canon in dup_to_canon.items():\n    df.loc[dup, llm_output_column] = df.loc[canon, llm_output_column]\n```\n\nThis ensures duplicate handling by copying canonical results to all duplicate rows.\n\n资料来源：[datatune/core/dask/map_dask.py:34-48]()\n\n### Filter Operation\n\nThe filter operation uses an LLM to determine which rows to keep or remove based on natural language criteria.\n\n**Source**: `datatune/core/dask/filter_dask.py`\n\n#### Filter Output Format\n\nThe LLM returns decisions in a specific format with an added `__filter__` key:\n\n```python\nindex=<row_index>|{key1: value1, ..., '__filter__': True/False}<endofrow>\n```\n\n| Decision | Behavior |\n|----------|----------|\n| `True` | Keep the row |\n| `False` | Remove the row |\n\n#### Processing Pipeline\n\n```mermaid\ngraph TD\n    A[Input DataFrame] --> B[Serialize to Dictionaries]\n    B --> C[Apply Cluster Deduplication]\n    C --> D[Send to LLM with Filter Prompt]\n    D --> E[Parse __filter__ Value]\n    E --> F[Filter DataFrame]\n    F --> G[Output DataFrame]\n```\n\n**Key code snippet for filter decision parsing**:\n```python\nsuffix = (\n    f\"{os.linesep}{os.linesep}\"\n    \"DECISION:Your response MUST be the entire input record as Python dictionary...\"\n    \"ALWAYS STICK TO THE FORMAT index=<row_index>|{...} with added key called '__filter__'...\"\n)\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:24-35]()\n\n### Finalize Operation\n\nThe `finalize` operation computes the lazy Dask DataFrame and converts it to a pandas DataFrame.\n\n**Source**: `datatune/core/dask/op.py`\n\n```python\ndef finalize(df: dd.DataFrame) -> pd.DataFrame:\n    \"\"\"Compute the lazy Dask DataFrame and return a pandas DataFrame.\"\"\"\n    return df.compute()\n```\n\n资料来源：[datatune/core/dask/op.py]()\n\n## Semantic Deduplication\n\nBoth map and filter 
operations support semantic deduplication to reduce LLM API calls and costs.\n\n**Source**: `datatune/core/deduplication.py`\n\n### How Deduplication Works\n\n```mermaid\ngraph TD\n    A[Input Series] --> B[Find Duplicate Clusters]\n    B --> C{dup_to_canon mapping}\n    C --> D[Extract Canonical Index]\n    D --> E[Send Only Canonical to LLM]\n    E --> F[Copy Results to Duplicates]\n    \n    style C fill:#f9f,color:#000\n```\n\n### Cluster Format\n\n```python\nclusters = [\n    {\"canonical_id\": 5, \"duplicate_ids\": [12, 23, 45]},\n    {\"canonical_id\": 10, \"duplicate_ids\": [15, 20]}\n]\n```\n\nThis approach sends only unique/canonical rows to the LLM, then propagates results to all duplicates.\n\n资料来源：[datatune/core/dask/map_dask.py:34-40]()\n\n## API Reference\n\n### Map Function\n\n```python\nfrom datatune.core.map import map\n\nresult = map(\n    prompt=\"Extract categories from the description\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"],\n    clusters=None  # Optional deduplication clusters\n)(llm, dask_dataframe)\n```\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `prompt` | str | Yes | Natural language instructions for column generation |\n| `output_fields` | List[str] | Yes | Names of new columns to create |\n| `input_fields` | List[str] | No | Columns to use as input (defaults to all) |\n| `clusters` | List[Dict] | No | Duplicate clusters for deduplication |\n\n### Filter Function\n\n```python\nfrom datatune.core.filter import filter\n\nresult = filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Name\", \"Description\"],\n    clusters=None  # Optional deduplication clusters\n)(llm, dask_dataframe)\n```\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `prompt` | str | Yes | Natural language criteria for filtering |\n| `input_fields` | List[str] | No | Columns to 
evaluate against |\n| `clusters` | List[Dict] | No | Duplicate clusters for deduplication |\n\n### Finalize Function\n\n```python\nfrom datatune.core.dask.op import finalize\n\npandas_df = finalize(lazy_dask_dataframe)\n```\n\n资料来源：[datatune/core/dask/op.py]()\n\n## LLM Output Format\n\nThe Dask Backend requires LLM responses in a specific parseable format for reliable operation.\n\n### Map Output Format\n\n```\nindex=<row_index>|{key1: value1, key2: value2, ..., 'new_field1': val, 'new_field2': val}<endofrow>\n```\n\n### Filter Output Format\n\n```\nindex=<row_index>|{key1: value1, key2: value2, ..., '__filter__': True/False}<endofrow>\n```\n\n### Parsing Rules\n\n| Rule | Description |\n|------|-------------|\n| String values | Must be enclosed in double quotes |\n| Missing values | Set to `None` or `null` |\n| Index | Must match row index exactly |\n| Format | Must be valid Python literal (dictionary) |\n\n资料来源：[datatune/core/dask/map_dask.py:20-30]()\n\n## Usage Examples\n\n### Example 1: Category Extraction\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"products.csv\")\n\n# Extract categories using natural language\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n\n# Save results\nresult = dt.finalize(mapped)\nresult.compute().to_csv(\"categorized_products.csv\")\n```\n\n资料来源：[README.md]()\n\n### Example 2: Text-Based Filtering\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\ndf = dd.read_csv(\"companies.csv\")\n\n# Filter based on semantic criteria\nfiltered = dt.filter(\n    prompt=\"Keep only organizations in Africa\",\n    input_fields=[\"Country\", \"Description\"]\n)(llm, 
df)\n\nresult = dt.finalize(filtered)\n```\n\n### Example 3: With Deduplication\n\n```python\n# Assuming clusters from semantic deduplication step\nclusters = [\n    {\"canonical_id\": 5, \"duplicate_ids\": [12, 23, 45]},\n    {\"canonical_id\": 10, \"duplicate_ids\": [15, 20]}\n]\n\nmapped = dt.map(\n    prompt=\"Classify the industry sector\",\n    output_fields=[\"Industry\", \"Sector\"],\n    input_fields=[\"CompanyName\", \"Description\"],\n    clusters=clusters\n)(llm, df)\n```\n\n## Internal Data Flow\n\n### Serialization Process\n\nThe backend converts DataFrame rows to dictionary format for LLM processing:\n\n```mermaid\ngraph LR\n    A[DataFrame Row] --> B[Convert to Dict]\n    B --> C[Replace NaN with None]\n    C --> D[\"String: {col1: val1, col2: val2}\"]\n    D --> E[LLM Prompt]\n```\n\n**Implementation from map_dask.py**:\n```python\ndf = df.astype(object).where(df.notna(), None)\ndf[serialized_input_column] = [\n    str(row.to_dict()) for _, row in df.iterrows()\n]\n```\n\n资料来源：[datatune/core/dask/map_dask.py:50-56]()\n\n## Error Handling\n\nThe backend includes robust error handling for LLM parsing failures:\n\n```python\ndef parse_llm_output(llm_output: Union[str, Exception]) -> Union[Dict, Exception]:\n    \"\"\"\n    Parses the LLM output string into a Python dictionary.\n    \"\"\"\n    # Returns dict on success, Exception on failure\n```\n\nFailed parsing attempts return the `Exception` object instead of raising it, ensuring the pipeline continues rather than crashing.\n\n## Backend Dispatch Mechanism\n\nThe system automatically dispatches to the Dask backend when a Dask DataFrame is detected:\n\n```mermaid\ngraph TD\n    A[Input Data] --> B{Is Dask DataFrame?}\n    B -->|Yes| C[Dask Backend]\n    B -->|No| D{Is Ibis Table?}\n    D -->|Yes| E[Ibis Backend]\n    D -->|No| F[TypeError]\n    \n    C --> G[_map_dask / _filter_dask]\n    E --> H[_map_ibis / _filter_ibis]\n```\n\n**Dispatch logic from core/map.py**:\n```python\ndef _is_dask_df(obj):\n    try:\n        
import dask.dataframe as dd\n        return isinstance(obj, dd.DataFrame)\n    except ImportError:\n        return False\n```\n\n资料来源：[datatune/core/map.py:1-17]()\n\n## Configuration Considerations\n\n### Memory Efficiency\n\nThe Dask backend processes data partition by partition, making it suitable for datasets larger than memory. Consider:\n\n| Setting | Recommendation |\n|---------|----------------|\n| Dask partitions | Set based on available memory |\n| Batch size | Adjust based on LLM rate limits |\n| Clusters | Use for high-duplicate datasets |\n\n### LLM Rate Limits\n\nThe backend implements batching to respect LLM API rate limits while maximizing throughput.\n\n## Summary\n\nThe Dask Backend provides a powerful, scalable mechanism for performing LLM-powered data transformations on Dask DataFrames. Key capabilities include:\n\n- **Map**: Generate new columns through semantic understanding\n- **Filter**: Remove rows based on natural language criteria  \n- **Deduplication**: Reduce costs by processing only canonical rows\n- **Lazy Evaluation**: Maintain Dask's efficient computation model until finalization\n\nAll operations follow a consistent pattern: serialize data, send to LLM with clear instructions, parse responses, and update the DataFrame.\n\n---\n\n<a id='page-ibis-backend'></a>\n\n## Ibis Backend\n\n### 相关页面\n\n相关主题：[Dask Backend](#page-dask-backend), [System Architecture](#page-architecture)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/core/ibis/filter_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/filter_ibis.py)\n- [datatune/core/ibis/map_ibis.py](https://github.com/vitalops/datatune/blob/main/datatune/core/ibis/map_ibis.py)\n</details>\n\n# Ibis Backend\n\n## Overview\n\nThe Ibis Backend in datatune provides integration with the Ibis project, enabling LLM-powered data transformations on various database backends including DuckDB, PostgreSQL, BigQuery, and more. 
This backend extends datatune's core functionality (map, filter) to work with Ibis tables, leveraging the composability and SQL-generation capabilities of Ibis while incorporating LLM-based semantic transformations.\n\nThe Ibis Backend serves as a critical abstraction layer that:\n\n- Accepts Ibis table objects as input\n- Serializes table data for LLM consumption\n- Executes batch inference operations via LLM calls\n- Parses and applies LLM outputs back to the table structure\n- Returns transformed Ibis tables for continued processing\n\n资料来源：[datatune/core/ibis/filter_ibis.py:1-20](datatune/core/ibis/filter_ibis.py)\n资料来源：[datatune/core/ibis/map_ibis.py:1-25](datatune/core/ibis/map_ibis.py)\n\n## Architecture\n\n### System Components\n\nThe Ibis Backend consists of several interconnected components that work together to provide seamless LLM-powered transformations:\n\n```mermaid\ngraph TD\n    A[Ibis Table Input] --> B[add_serialized_col]\n    B --> C[llm_batch_inference]\n    C --> D[LLM API Call]\n    D --> E[Parse LLM Output]\n    E --> F[apply_llm_updates / Filter Logic]\n    F --> G[Transformed Ibis Table]\n    \n    H[_map_ibis Class] --> I[Map Operations]\n    J[_filter_ibis Class] --> K[Filter Operations]\n```\n\n### Component Responsibilities\n\n| Component | File | Purpose |\n|-----------|------|---------|\n| `add_serialized_col` | filter_ibis.py, map_ibis.py | Serializes Ibis table columns into JSON strings for LLM input |\n| `llm_batch_inference` | filter_ibis.py, map_ibis.py | Manages batch inference with LLM, handles retries and error cases |\n| `_map_ibis` | map_ibis.py | Handles mapping operations (creating new columns via LLM) |\n| `_filter_ibis` | filter_ibis.py | Handles filtering operations (row selection via LLM) |\n\n资料来源：[datatune/core/ibis/filter_ibis.py:11-32](datatune/core/ibis/filter_ibis.py)\n资料来源：[datatune/core/ibis/map_ibis.py:12-38](datatune/core/ibis/map_ibis.py)\n\n## Core Functions\n\n### add_serialized_col\n\nThe 
`add_serialized_col` function transforms Ibis table columns into a serialized JSON format suitable for LLM processing. This function is fundamental to both map and filter operations.\n\n**Function Signature:**\n\n```python\ndef add_serialized_col(table, target_col: str, input_fields: Optional[List[str]] = []):\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `table` | `ibis.Table` | The input Ibis table |\n| `target_col` | `str` | Name of the column to store serialized data |\n| `input_fields` | `Optional[List[str]]` | List of column names to include in serialization. If empty, includes all columns except target |\n\n**Behavior:**\n\n1. Filters columns to include only relevant fields (excluding target column and non-selected fields)\n2. Constructs a JSON string expression by concatenating column names and values\n3. Returns a new Ibis table with the serialized column added via `mutate()`\n\n资料来源：[datatune/core/ibis/map_ibis.py:40-53](datatune/core/ibis/map_ibis.py)\n资料来源：[datatune/core/ibis/filter_ibis.py:37-51](datatune/core/ibis/filter_ibis.py)\n\n### llm_batch_inference\n\nThe `llm_batch_inference` function manages the core LLM interaction for transforming data. 
It handles the complete workflow from prompt construction to result collection.\n\n**Function Signature:**\n\n```python\ndef llm_batch_inference(\n    table,\n    llm: Callable, \n    input_col: str, \n    output_col: str,   \n    prompt: str,\n    expected_new_fields: list[str] = []\n):\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `table` | `ibis.Table` | The input Ibis table |\n| `llm` | `Callable` | LLM callable (e.g., OpenAI, Ollama instance) |\n| `input_col` | `str` | Column containing serialized input data |\n| `output_col` | `str` | Column name for storing LLM output |\n| `prompt` | `str` | Transformation prompt for the LLM |\n| `expected_new_fields` | `list[str]` | Optional list of expected output field names |\n\n**Workflow:**\n\n```mermaid\nsequenceDiagram\n    participant Table as Ibis Table\n    participant Func as llm_batch_inference\n    participant LLM as LLM API\n    participant Parse as Output Parser\n    \n    Table->>Func: Execute table.select() to local DataFrame\n    Func->>Table: Get input_list from input_col\n    Func->>LLM: Call llm(input_list, prefix, prompt, suffix)\n    LLM-->>Func: Raw LLM responses\n    Func->>Parse: Process each response\n    Parse-->>Func: Validated JSON strings\n    Func->>Table: Create mapping DataFrame\n    Table-->>Func: Updated table with output_col\n```\n\n资料来源：[datatune/core/ibis/map_ibis.py:55-82](datatune/core/ibis/map_ibis.py)\n资料来源：[datatune/core/ibis/filter_ibis.py:54-78](datatune/core/ibis/filter_ibis.py)\n\n## Map Operations\n\n### _map_ibis Class\n\nThe `_map_ibis` class implements the mapping transformation for Ibis tables, enabling the creation of new columns based on LLM-driven transformations.\n\n**Class Definition:**\n\n```python\nclass _map_ibis:\n    def __init__(\n        self,\n        prompt: str,\n        input_fields: Optional[List[str]] = None,\n        output_fields: Optional[List[str]] = None,\n    ):\n        self.prompt = prompt\n  
      self.input_fields = input_fields or []\n        self.output_fields = output_fields or []\n        self.llm_output_column = \"MAP_LLM_OUTPUT__DATATUNE__\"\n        self.serialized_input_column = \"MAP_SERIALIZED_INPUT__DATATUNE__\"\n```\n\n**Key Attributes:**\n\n| Attribute | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `prompt` | `str` | Required | Natural language prompt describing the transformation |\n| `input_fields` | `Optional[List[str]]` | `[]` | Columns to use as input for transformation |\n| `output_fields` | `Optional[List[str]]` | `[]` | Names of new columns to create |\n| `llm_output_column` | `str` | System-generated | Internal column for LLM response storage |\n| `serialized_input_column` | `str` | System-generated | Internal column for serialized input |\n\n**Execution Flow:**\n\n1. Validates that all `input_fields` exist in the table schema\n2. Creates serialized input column via `add_serialized_col`\n3. Executes batch inference via `llm_batch_inference`\n4. Applies LLM updates to create output columns\n5. 
Selects final columns (removing internal columns)\n\n资料来源：[datatune/core/ibis/map_ibis.py:122-160](datatune/core/ibis/map_ibis.py)\n\n### Prompt Format for Map\n\nThe LLM receives structured prompts for mapping operations:\n\n```\nMap and transform the input according to the mapping criteria below.\nReplace or Create new fields or values as per the prompt.\nExpected new fields: [output_fields]\n\nMAPPING CRITERIA:\n{prompt}\n\nYour response MUST be the entire input record as a valid Python dictionary\nin the format 'index=<row_index>|{key1: value1, key2: value2, ...}' with added keys\nof expected new fields if any.\n```\n\n资料来源：[datatune/core/ibis/map_ibis.py:56-75](datatune/core/ibis/map_ibis.py)\n\n## Filter Operations\n\n### _filter_ibis Class\n\nThe `_filter_ibis` class implements filtering transformations for Ibis tables, enabling row-level selection based on LLM-driven criteria.\n\n**Class Definition:**\n\n```python\nclass _filter_ibis:\n    def __init__(\n        self,\n        prompt: str,\n        input_fields: Optional[List[str]] = None,\n    ):\n        self.prompt = prompt\n        self.input_fields = input_fields or []\n        self.llm_output_column = \"FILTER_LLM_OUTPUT__DATATUNE__\"\n        self.serialized_input_column = \"FILTER_SERIALIZED_INPUT__DATATUNE__\"\n```\n\n**Key Attributes:**\n\n| Attribute | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `prompt` | `str` | Required | Natural language criteria for filtering |\n| `input_fields` | `Optional[List[str]]` | `[]` | Columns to consider for filtering decision |\n| `llm_output_column` | `str` | System-generated | Internal column for LLM response storage |\n| `serialized_input_column` | `str` | System-generated | Internal column for serialized input |\n\n**Execution Flow:**\n\n```mermaid\ngraph TD\n    A[Input Ibis Table] --> B[Add Serialized Column]\n    B --> C[Execute to Local DataFrame]\n    C --> D[LLM Batch Inference]\n    D --> E[Parse Filter Decisions]\n   
 E --> F[Extract __filter__ Flag]\n    F --> G[Apply Boolean Filter]\n    G --> H[Final Ibis Table]\n```\n\n资料来源：[datatune/core/ibis/filter_ibis.py:80-120](datatune/core/ibis/filter_ibis.py)\n\n### Prompt Format for Filter\n\nThe LLM receives structured prompts for filtering operations:\n\n```\nYou are filtering a dataset. Your task is to determine whether each data record\nshould be KEPT or REMOVED based on the filtering criteria below.\nReturn the entire input data record with an added key called '__filter__' with\nvalue either True to KEEP the record or False to REMOVE it.\n\nFILTERING CRITERIA:\n{prompt}\n\nYour response MUST be the entire input record as a Python dictionary in the format:\nindex=<row_index>|{key1: value1, key2: value2, ...}<endofrow> with added key called\n'__filter__' with value either True to KEEP the record or False to REMOVE it.\n```\n\n资料来源：[datatune/core/ibis/filter_ibis.py:56-68](datatune/core/ibis/filter_ibis.py)\n\n## Output Parsing\n\n### parse_filter_output\n\nThe `parse_filter_output` function extracts boolean filter decisions from LLM responses.\n\n**Function Signature:**\n\n```python\ndef parse_filter_output(\n    output: Union[str, Exception], err: bool = True\n) -> Optional[bool]:\n```\n\n**Parsing Logic:**\n\n1. Attempts to evaluate the LLM output as a Python literal using `ast.literal_eval`\n2. Extracts the last key-value pair from the dictionary\n3. Returns the boolean value associated with the `__filter__` key\n4. 
Returns `None` if parsing fails\n\n资料来源：[datatune/core/ibis/filter_ibis.py:104-130](datatune/core/ibis/filter_ibis.py)\n\n### Output Format Requirements\n\nThe Ibis Backend enforces strict output format requirements for LLM responses:\n\n| Requirement | Format | Example |\n|-------------|--------|---------|\n| Index prefix | `index=<row_index>\\|` | `index=0\\|` |\n| Data format | Python dictionary literal | `{'col1': 'value1', 'col2': 'value2'}` |\n| String quoting | Double quotes only | `\"string\"` not `'string'` |\n| Null values | `None` | `{'col': None}` |\n| Row terminator | `<endofrow>` | `index=0\\|{...}<endofrow>` |\n\n资料来源：[datatune/core/ibis/map_ibis.py:70-75](datatune/core/ibis/map_ibis.py)\n资料来源：[datatune/core/ibis/filter_ibis.py:64-67](datatune/core/ibis/filter_ibis.py)\n\n## Usage Examples\n\n### Basic Map Operation\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport ibis\n\n# Connect to DuckDB\ncon = ibis.duckdb.connect(\"data.duckdb\")\ntable = con.table(\"products\")\n\n# Initialize LLM\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\n\n# Create mapping operation\nmapped = dt.map(\n    prompt=\"Extract product category and subcategory from description\",\n    input_fields=[\"description\", \"name\"],\n    output_fields=[\"Category\", \"Subcategory\"]\n)(llm, table)\n```\n\n### Basic Filter Operation\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import Ollama\nimport ibis\n\n# Connect to PostgreSQL via Ibis\ncon = ibis.postgres.connect(...)\ntable = con.table(\"orders\")\n\n# Initialize local LLM\nllm = Ollama(model_name=\"gemma3:4b\")\n\n# Create filter operation\nfiltered = dt.filter(\n    prompt=\"Keep only orders from enterprise customers with value over $10,000\",\n    input_fields=[\"customer_type\", \"order_value\"]\n)(llm, table)\n```\n\n## Error Handling\n\n### Schema Validation\n\nThe Ibis Backend performs input schema validation before processing:\n\n```python\ndef __call__(self, llm: Callable, table: Table) -> 
Table:\n    if self.input_fields:\n        missing = [f for f in self.input_fields if f not in table.columns]\n        if missing:\n            error_msg = (\n                f\"[datatune] Schema mismatch: The following input_fields were not found: {missing}. \"\n                f\"Available columns: {list(table.columns)}\"\n            )\n            raise ValueError(error_msg)\n```\n\n**Validation Rules:**\n\n| Check | Condition | Action on Failure |\n|-------|-----------|-------------------|\n| Input fields exist | `input_fields` subset of `table.columns` | Raise `ValueError` |\n| LLM call success | No exception from `llm()` | Return error string |\n| Output parsing | Valid Python literal | Return empty dict `{}` |\n\n资料来源：[datatune/core/ibis/map_ibis.py:130-140](datatune/core/ibis/map_ibis.py)\n\n## Internal Column Naming\n\nThe Ibis Backend uses prefixed column names to avoid conflicts with user data:\n\n| Operation | Serialized Column | Output Column |\n|-----------|-------------------|---------------|\n| Map | `MAP_SERIALIZED_INPUT__DATATUNE__` | `MAP_LLM_OUTPUT__DATATUNE__` |\n| Filter | `FILTER_SERIALIZED_INPUT__DATATUNE__` | `FILTER_LLM_OUTPUT__DATATUNE__` |\n\nThese columns are automatically removed from the final output table, ensuring clean results.\n\n## Integration with Core API\n\nThe Ibis Backend is invoked through the unified datatune API via the `filter.py` module:\n\n```python\ndef filter(*, prompt, input_fields=None, clusters=None):\n    def apply(llm, data):\n        if _is_ibis_table(data):\n            from .ibis.filter_ibis import _filter_ibis\n            return _filter_ibis(\n                prompt=prompt,\n                input_fields=input_fields,\n            )(llm, data)\n        # ... 
other backends\n```\n\n资料来源：[datatune/core/filter.py:15-28](datatune/core/filter.py)\n\n## Supported Database Backends\n\nThe Ibis Backend supports any database backend compatible with Ibis:\n\n| Backend | Connection Method | Example |\n|---------|-------------------|---------|\n| DuckDB | `ibis.duckdb.connect()` | Local analytical queries |\n| PostgreSQL | `ibis.postgres.connect()` | Enterprise data |\n| BigQuery | `ibis.bigquery.connect()` | Cloud data warehousing |\n| SQLite | `ibis.sqlite.connect()` | Lightweight embedded DB |\n| MySQL | `ibis.mysql.connect()` | Web applications |\n\n## Performance Considerations\n\n### Batch Processing\n\nThe Ibis Backend processes data in batches determined by:\n\n1. **Local execution boundary**: Data is materialized to a local DataFrame via `.execute()`\n2. **LLM rate limits**: Respects TPM (tokens per minute) and RPM (requests per minute) limits\n3. **Memory constraints**: Batch sizes are controlled by the underlying LLM integration\n\n### Optimization Strategies\n\n| Strategy | Description | Applicable Operation |\n|----------|-------------|---------------------|\n| Select specific fields | Use `input_fields` to minimize data transfer | Map, Filter |\n| Pushdown predicates | Filter data in the database before LLM processing | Filter |\n| Column pruning | Remove unused columns early in the pipeline | Map |\n\n## See Also\n\n- [Dask Backend](../dask/backend.md) - Alternative backend for distributed computing\n- [LLM Integrations](../../llm/llm.md) - Supported LLM providers\n- [Core API](../core/api.md) - Unified transformation API\n\n---\n\n<a id='page-llm-integration'></a>\n\n## LLM Integration\n\n### 相关页面\n\n相关主题：[Agent System](#page-agent-system), [Map Operations](#page-map-operations), [Filter Operations](#page-filter-operations)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- 
[datatune/llm/model_rate_limits.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/model_rate_limits.py)\n- [datatune/llm/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/__init__.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n</details>\n\n# LLM Integration\n\nThe LLM Integration module in datatune provides a unified abstraction layer for interfacing with various Large Language Model providers. It handles rate limiting, token management, request batching, and provider-specific configurations, enabling consistent data transformation operations across different LLM backends.\n\n## Architecture Overview\n\nThe LLM module uses a class hierarchy where a base `LLM` class provides common functionality, while provider-specific subclasses handle individual vendor implementations.\n\n```mermaid\ngraph TD\n    A[LLM Base Class] --> B[OpenAI]\n    A --> C[Ollama]\n    A --> D[VLLM]\n    A --> E[Azure]\n    A --> F[Mistral]\n    A --> G[Huggingface]\n    A --> H[Gemini]\n    \n    I[litellm] --> A\n    J[Rate Limits] --> A\n    K[Token Counter] --> A\n```\n\n## Core Components\n\n### LLM Base Class\n\nThe `LLM` class serves as the foundation for all provider implementations. 
It manages rate limiting, token counting, and prompt batching.\n\n#### Initialization Parameters\n\n| Parameter | Type | Description | Default |\n|-----------|------|-------------|---------|\n| `model_name` | `str` | Full model identifier (provider/model format) | Required |\n| `rpm` | `int` | Requests per minute limit | Provider default |\n| `tpm` | `int` | Tokens per minute limit | Provider default |\n| `max_tokens` | `int` | Maximum tokens in response | Provider default |\n| `temperature` | `float` | Sampling temperature | `0.0` |\n\n#### Key Methods\n\n| Method | Description |\n|--------|-------------|\n| `_completion(prompt)` | Execute a single completion request |\n| `_create_batched_prompts(input_rows, user_batch_prefix, prompt_per_row)` | Create optimized batched prompts |\n| `get_max_tokens()` | Retrieve max tokens for current model |\n\n资料来源：[datatune/llm/llm.py:40-90]()\n\n### Provider Implementations\n\n#### OpenAI\n\n```python\nfrom datatune.llm.llm import OpenAI\n\nllm = OpenAI(\n    model_name=\"gpt-3.5-turbo\",\n    api_key=\"your-api-key\"\n)\n```\n\nConfigures models in the `openai/model-name` format for LiteLLM compatibility.\n\n资料来源：[datatune/llm/llm.py:95-102]()\n\n#### Ollama (Local)\n\n```python\nfrom datatune.llm.llm import Ollama\n\nllm = Ollama(\n    model_name=\"gemma3:4b\",\n    api_base=\"http://localhost:11434\"\n)\n```\n\nEnables local model inference with Ollama. 
The `api_base` defaults to `http://localhost:11434`.\n\n资料来源：[datatune/llm/llm.py:74-82]()\n\n#### VLLM\n\n```python\nfrom datatune.llm.llm import VLLM\n\nllm = VLLM(\n    model_name=\"your-model\",\n    api_base=\"http://localhost:8000/v1\"\n)\n```\n\nQueries the `/models` endpoint to automatically determine `max_model_len` from the server.\n\n资料来源：[datatune/llm/llm.py:104-122]()\n\n#### Azure\n\n```python\nfrom datatune.llm.llm import Azure\n\nllm = Azure(\n    model_name=\"gpt-3.5-turbo\",\n    api_key=\"your-key\",\n    api_base=\"your-endpoint\",\n    api_version=\"2024-01-01\"\n)\n```\n\nSupports Azure OpenAI Service deployments with additional configuration options.\n\n#### Mistral\n\n```python\nfrom datatune.llm.llm import Mistral\n\nllm = Mistral(\n    model_name=\"mistral-tiny\",\n    api_key=\"your-api-key\"\n)\n```\n\nWraps Mistral AI models with automatic `mistral/` prefix handling.\n\n#### Huggingface\n\n```python\nfrom datatune.llm.llm import Huggingface\n\nllm = Huggingface(\n    model_name=\"meta-llama/Llama-2-70b\",\n    api_key=\"your-api-key\"\n)\n```\n\nProvides access to Hugging Face Inference API models.\n\n## Rate Limiting\n\nThe module enforces two types of rate limits to prevent API throttling:\n\n| Limit Type | Description | Applied At |\n|------------|-------------|------------|\n| **TPM** | Tokens per minute | Per-request token counting |\n| **RPM** | Requests per minute | Per-API-call counting |\n\n### Model Rate Limits Configuration\n\nPredefined limits for major models are stored in `datatune/llm/model_rate_limits.py`:\n\n| Model | TPM | RPM |\n|-------|-----|-----|\n| `gpt-3.5-turbo` | 200,000 | 500 |\n| `gpt-4` | 10,000 | 500 |\n| `gpt-4-turbo` | 30,000 | 500 |\n| `gpt-4o` | 30,000 | 500 |\n| `gpt-4.1-mini` | 200,000 | 500 |\n| `gpt-4.1-nano` | 200,000 | 500 |\n| `gpt-4.5-preview` | 125,000 | 1,000 |\n\n资料来源：[datatune/llm/model_rate_limits.py:1-90]()\n\n### Fallback Behavior\n\nWhen an unknown model is used, the system falls back 
to `gpt-3.5-turbo` limits and logs a warning:\n\n```python\nif self._base_model_name in model_rate_limits:\n    model_limits = model_rate_limits[self._base_model_name]\nelse:\n    model_limits = model_rate_limits[DEFAULT_MODEL]\n    logger.warning(\n        f\"REQUESTS-PER-MINUTE limits for model '{model_name}' not found. \"\n        f\"Defaulting to '{DEFAULT_MODEL}' limits\"\n    )\n```\n\n资料来源：[datatune/llm/llm.py:45-57]()\n\n## Prompt Formatting\n\nThe LLM module enforces strict output format requirements for parsing reliability.\n\n### Output Format Specification\n\nAll responses must follow this pattern:\n\n```\nindex=<row_index>|{python_literal}<endofrow>\n```\n\nSupported Python literals:\n- Lists: `index=0|['item1', 'item2', 'item3']<endofrow>`\n- Dicts: `index=1|{'key1': 'value1', 'key2': 2}<endofrow>`\n- Strings: `index=2|\"This is a string answer\"<endofrow>`\n\n### Prompt Components\n\n| Component | Purpose |\n|-----------|---------|\n| `prefix` | Instructions for row processing |\n| `user_batch_prefix` | Custom user instructions |\n| `suffix` | Output format requirements |\n\n资料来源：[datatune/llm/llm.py:1-20]()\n\n## Batch Processing\n\nThe batching system optimizes LLM usage by grouping multiple rows into single API requests while respecting token limits.\n\n```mermaid\ngraph LR\n    A[Input Rows] --> B[Token Counting]\n    B --> C{Within Limits?}\n    C -->|Yes| D[Batch Request]\n    C -->|No| E[Split Batch]\n    D --> F[LLM API]\n    E --> D\n    F --> G[Parse Output]\n    G --> H[Row Results]\n```\n\n### Batching Logic\n\n1. Calculate prefix/suffix token overhead using `token_counter`\n2. Iterate through input rows, accumulating tokens\n3. Split batches when approaching `max_tokens` limit\n4. 
Track `nrows_per_api_call` for output reconstruction\n\n资料来源：[datatune/llm/llm.py:30-70]()\n\n## Integration with Core Modules\n\n### Filter Operations\n\nThe filter module uses the LLM to evaluate row-level boolean conditions:\n\n```python\ndef filter_rows(llm, df, input_column, prompt, output_column):\n    prefix = \"Row data:\"\n    suffix = \"Your response MUST be... with added key '__filter__'\"\n    \n    llm_out = llm(\n        input_series,\n        prefix,\n        prompt,\n        suffix,\n        optimized=True\n    )\n```\n\n资料来源：[datatune/core/dask/filter_dask.py:15-40]()\n\n### Map Operations\n\nThe map module leverages the LLM for column transformations:\n\n```python\ndef map_rows(llm, df, input_columns, prompt, output_columns):\n    suffix = \"Your response MUST be... with added keys of expected new fields\"\n    \n    llm_out = llm(\n        canonical_input,\n        prefix,\n        prompt,\n        suffix,\n        optimized=True\n    )\n```\n\n资料来源：[datatune/core/dask/map_dask.py:20-45]()\n\n## Usage Example\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\n# Initialize LLM\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\n\n# Use with datatune operations\ndf = dd.read_csv(\"products.csv\")\n\nmapped = dt.map(\n    prompt=\"Extract categories from description\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\"]\n)(llm, df)\n\nfiltered = dt.filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Category\"]\n)(llm, mapped)\n```\n\n## Module Exports\n\nThe `datatune.llm` package exports all provider classes:\n\n```python\nfrom datatune.llm import *\n\n# Available: LLM, OpenAI, Ollama, VLLM, Azure, Mistral, Huggingface\n```\n\n资料来源：[datatune/llm/__init__.py:1]()\n\n## Dependencies\n\n| Library | Purpose |\n|---------|---------|\n| `litellm` | Unified LLM API interface |\n| `token_counter` | Token estimation |\n| `get_max_tokens` | Model context limits 
|\n\n资料来源：[datatune/llm/llm.py:10-11]()\n\n---\n\n<a id='page-agent-system'></a>\n\n## Agent System\n\n### 相关页面\n\n相关主题：[LLM Integration](#page-llm-integration), [Map Operations](#page-map-operations), [Filter Operations](#page-filter-operations)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [datatune/agent/agent.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/agent.py)\n- [datatune/agent/__init__.py](https://github.com/vitalops/datatune/blob/main/datatune/agent/__init__.py)\n- [datatune/llm/llm.py](https://github.com/vitalops/datatune/blob/main/datatune/llm/llm.py)\n- [datatune/core/dask/filter_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/filter_dask.py)\n- [datatune/core/dask/map_dask.py](https://github.com/vitalops/datatune/blob/main/datatune/core/dask/map_dask.py)\n</details>\n\n# Agent System\n\nThe Agent System is the core intelligent orchestration layer in datatune that automatically determines, plans, and executes data transformation workflows based on natural language prompts. 
It abstracts the complexity of selecting between Dask operations and LLM-powered primitives (Map, Filter), allowing users to describe desired transformations in plain English while the system handles the implementation details.\n\n## Architecture Overview\n\nThe Agent System follows a planner-executor pattern where the LLM acts as the planner, generating a JSON-based execution plan, and the Agent class handles the runtime execution of that plan against Dask DataFrames.\n\n```mermaid\ngraph TD\n    A[User Natural Language Goal] --> B[Agent.do method]\n    B --> C[LLM Planner]\n    C --> D[JSON Execution Plan]\n    D --> E[Agent._execute_plan]\n    E --> F{Step Type}\n    F -->|dask| G[Dask Operations]\n    F -->|primitive| H[LLM Primitives<br/>Map/Filter]\n    G --> I[Runtime Execution]\n    H --> I\n    I --> J[Result DataFrame]\n```\n\n资料来源：[datatune/agent/agent.py:1-50]()\n\n## Core Components\n\n### Agent Base Class\n\nThe `Agent` class serves as the abstract base for all agent implementations and provides the foundational system prompt that defines the agent's capabilities.\n\n资料来源：[datatune/agent/__init__.py:1-50]()\n\n| Component | Description |\n|-----------|-------------|\n| `system_prompt` | Defines available libraries (pandas, numpy, dask, datatune) and example usage patterns |\n| `ABC` | Abstract base class pattern ensuring implementers provide core methods |\n| `abstractmethod` | Decorator marking `do` method as required implementation |\n\n```python\nfrom abc import ABC, abstractmethod\n\nclass Agent(ABC):\n    system_prompt: str = \"\"\"You are Datatune Agent, a powerful assistant designed to help users with data processing tasks.\n    You are capable of generating python code to perform various operations on data. 
Apart from python builtins, you have the following libraries avaiable in your run time:\n    - pandas\n    - numpy\n    - dask\n\n    In addition to these, you also have access to the datatune libarary, which provides functionality for processing data using LLMs.\n    ...\"\"\"\n```\n\n### Agent Implementation\n\nThe concrete `Agent` class in `datatune/agent/agent.py` implements the planner-executor logic.\n\n资料来源：[datatune/agent/agent.py:1-100]()\n\n| Attribute | Type | Purpose |\n|-----------|------|---------|\n| `llm` | `LLM` | Language model used for planning and primitive operations |\n| `history` | `List[Dict]` | Execution history for debugging and state tracking |\n| `verbose` | `bool` | Enable debug-level logging when True |\n| `TEMPLATE` | `dict` | Operation templates for code generation |\n\n```python\ndef __init__(self, llm: LLM, verbose: bool = False):\n    self.llm = llm\n    self.history: List[Dict[str, Any]] = []\n    self.verbose = verbose\n    if self.verbose:\n        logger.setLevel(logging.DEBUG)\n    else:\n        logger.setLevel(logging.INFO)\n```\n\n## Plan Structure\n\nThe Agent uses a JSON-based plan format where each step represents a single transformation operation. 
Plans are generated by the LLM based on user goals and the current DataFrame schema.\n\n资料来源：[datatune/agent/agent.py:100-150]()\n\n### Plan Step Schema\n\n| Field | Type | Required | Description |\n|-------|------|----------|-------------|\n| `type` | `string` | Yes | Either `\"dask\"` or `\"primitive\"` |\n| `operation` | `string` | Yes | Operation name within the step type |\n| `params` | `dict` | No | Parameters for Dask templates |\n| `subprompt` | `string` | No | LLM prompt for primitive operations |\n| `input_fields` | `list` | No | Input column names for the operation |\n| `output_fields` | `list` | No | Output column names (Map only) |\n\n### Step Types\n\n**Dask Operations** execute server-side Python code using Dask's computational graph:\n\n| Operation | Description |\n|-----------|-------------|\n| `add_column` | Create a new column from an expression |\n| `apply_function` | Apply an element-wise function to a column |\n| `rename_columns` | Rename columns using a mapping |\n| `astype_column` | Change a column's data type |\n| `group_by` | Group data and aggregate |\n\n资料来源：[datatune/agent/agent.py:50-100]()\n\n**Primitive Operations** use the LLM for row-level transformations:\n\n| Operation | Description | Use Case |\n|-----------|-------------|----------|\n| `Map` | Create new columns via LLM transformation | Semantic extraction, classification, interpretation |\n| `Filter` | Remove rows based on LLM criteria | Complex conditional filtering |\n\n资料来源：[datatune/core/dask/map_dask.py:1-50](), [datatune/core/dask/filter_dask.py:1-50]()\n\n## Plan Execution Flow\n\n```mermaid\ngraph TD\n    A[_execute_plan called] --> B[Set DataFrame in runtime]\n    B --> C[Initialize code_lines list]\n    C --> D{For each step in plan}\n    D -->|Has steps| E[Format step template]\n    E --> F[Log start message]\n    F --> G[Execute map_partitions with log_primitive]\n    G --> H[Increment step_num]\n    H --> I[Execute operation template]\n    I --> J[Log end 
message]\n    J --> D\n    D -->|No more steps| K[Call dt.finalize]\n    K --> L[Compute DataFrame]\n    L --> M[Return None, n_steps on success]\n    \n    N[Exception during execution] --> O[Return error, failed_step]\n```\n\n资料来源：[datatune/agent/agent.py:150-200]()\n\n### Execution Methods\n\n#### `_execute_plan(plan: List[Dict])`\n\nExecutes a complete multi-step plan sequentially.\n\n```python\ndef _execute_plan(self, plan: List[Dict]):\n    \"\"\"Execute a sequence of plan steps (Dask or primitive).\"\"\"\n    \n    self._set_df(self.df)\n    self.runtime.update({\"step_num\": 0, \"plan\": plan})\n    code_lines = []\n    n_steps = len(plan)\n    \n    for i, step in enumerate(plan, start=1):\n        # ... step formatting and execution ...\n    \n    # Final execution\n    full_code = (\n        \"\\n\".join(code_lines)\n        + \"\\n\\n\"\n        + \"df = dt.finalize(df)\\n\"\n        \"df = df.compute()\"\n    )\n    self.runtime.execute(full_code)\n    return None, n_steps\n```\n\n#### `_execute_step(step: Dict)`\n\nExecutes a single step with detailed error handling.\n\n```python\ndef _execute_step(self, step: Dict):\n    try:\n        if step[\"type\"] == \"dask\":\n            template = self.TEMPLATE[\"dask\"][step[\"operation\"]].format(**step[\"params\"])\n            self.runtime.execute(template + \"\\n_ = df.head()\")\n        elif step[\"type\"] == \"primitive\":\n            template = self.TEMPLATE[\"primitive\"][step[\"operation\"]].format(**step[\"params\"])\n            self.runtime.execute(template)\n        else:\n            raise ValueError(f\"Unknown step type: {step['type']}\")\n    except Exception as e:\n        error_msg = f\"{type(e).__name__}: {str(e)}\\n{traceback.format_exc()}\"\n        return error_msg\n```\n\n资料来源：[datatune/agent/agent.py:200-250]()\n\n## LLM Integration\n\nThe Agent integrates with the LLM system through `datatune/llm/llm.py` to generate execution plans and power primitive 
operations.\n\n资料来源：[datatune/llm/llm.py:1-50]()\n\n### Supported LLM Providers\n\n| Provider | Class | Default Model |\n|----------|-------|---------------|\n| OpenAI | `OpenAI` | `gpt-3.5-turbo` |\n| Ollama | `Ollama` | `gemma3:4b` |\n| Azure | `Azure` | Configurable |\n| vLLM | `VLLM` | Configurable |\n\n```python\nclass OpenAI(LLM):\n    def __init__(\n        self, model_name: str = \"gpt-3.5-turbo\", api_key: Optional[str] = None, **kwargs\n    ):\n        kwargs.update({\"api_key\": api_key})\n        super().__init__(model_name=f\"openai/{model_name}\", **kwargs)\n\nclass Ollama(LLM):\n    def __init__(\n        self, model_name=\"gemma3:4b\", api_base=\"http://localhost:11434\", **kwargs\n    ) -> None:\n        super().__init__(\n            model_name=f\"ollama_chat/{model_name}\", api_base=api_base, **kwargs\n        )\n```\n\n### Rate Limiting\n\nThe system includes model rate limits configuration to prevent API throttling:\n\n| Model | TPM | RPM |\n|-------|-----|-----|\n| `gpt-3.5-turbo` | 200,000 | 500 |\n| `gpt-4-turbo` | 30,000 | 500 |\n| `gpt-4o` | 30,000 | 500 |\n\n资料来源：[datatune/llm/model_rate_limits.py:1-50]()\n\n## Prompt Generation\n\nThe Agent constructs prompts dynamically based on the current state and goal.\n\n资料来源：[datatune/agent/agent.py:250-300]()\n\n### Prompt Components\n\n| Method | Purpose |\n|--------|---------|\n| `get_persona_prompt(goal)` | Generates the system instructions for plan generation |\n| `get_schema_prompt(df)` | Includes current DataFrame schema |\n| `get_error_prompt(error_msg, failed_step)` | Context for retry after failures |\n| `get_full_prompt(df, goal)` | Combines all components |\n\n```python\ndef get_schema_prompt(self, df: dd.DataFrame) -> str:\n    schema_prompt: str = f\"\"\"The current schema of the dataframe df is as follows:\n    {df.dtypes.to_string()}\n    \"\"\"\n    return schema_prompt\n\ndef get_error_prompt(self, error_msg: str, failed_step: Dict) -> str:\n    error_prompt: str = f\"\"\"\n    
The previous code execution failed with the following error:\n    Error: {error_msg}\n    \n    Failed step:\n    {failed_step}\n    \n    Provide the json plan with the corrected step. Make sure to:\n    1. Address the specific error mentioned above.\n    2. DO NOT include any explanations or comments.\n    \"\"\"\n    return error_prompt\n```\n\n## Primitive Operations\n\n### Map Operation\n\nThe Map primitive uses the LLM to create new columns by applying transformations to each row.\n\n资料来源：[datatune/core/dask/map_dask.py:1-80]()\n\n```python\n# Example: Extract categories from description\nmapped = dt.map(\n    prompt=\"Extract categories from the description and name of product.\",\n    output_fields=[\"Category\", \"Subcategory\"],\n    input_fields=[\"Description\", \"Name\"]\n)(llm, df)\n```\n\n**Output Format Requirements:**\n- Each response must be: `index=<index>|{key1: value1, key2: value2, ...}<endofrow>`\n- Strings must use double quotes\n- Missing values should be `null`\n\n### Filter Operation\n\nThe Filter primitive uses the LLM to evaluate row-level conditions.\n\n资料来源：[datatune/core/dask/filter_dask.py:1-80]()\n\n```python\n# Example: Keep only electronics products\nfiltered = dt.filter(\n    prompt=\"Keep only electronics products\",\n    input_fields=[\"Name\"]\n)(llm, mapped)\n```\n\n**Decision Format:**\n- Each response must include `__filter__: True` (keep) or `__filter__: False` (remove)\n\n## Error Handling\n\nThe Agent implements comprehensive error handling at multiple levels:\n\n| Level | Handler | Action |\n|-------|---------|--------|\n| Step formatting | `ValueError` | Log and return error with step number |\n| Template lookup | `KeyError` | Raise with available operation names |\n| Runtime execution | `Exception` | Capture traceback, return formatted error |\n\n```python\nexcept Exception as e:\n    step_num = self.runtime[\"step_num\"]\n    logger.error(f\"❌ Step {step_num}/{n_steps} failed with error: {e}\")\n    return str(e), 
step_num - 1\n```\n\n资料来源：[datatune/logger.py:1-50]()\n\n## Usage Examples\n\n### Basic Agent Usage\n\n```python\nimport datatune as dt\nfrom datatune.llm.llm import OpenAI\nimport dask.dataframe as dd\n\nllm = OpenAI(model_name=\"gpt-3.5-turbo\")\nagent = dt.Agent(llm)\n\ndf = dd.read_csv(\"products.csv\")\nresult = agent.do(\"Add ProfitMargin column and keep only African organizations\", df)\nresult.compute().to_csv(\"african_products.csv\")\n```\n\n### Verbose Mode for Debugging\n\n```python\nagent = dt.Agent(llm, verbose=True)\nresult = agent.do(\"Complex transformation task\", df)\n```\n\nWith `verbose=True`, the Agent sets logger to `DEBUG` level, outputting detailed execution information including step progress and intermediate states.\n\n### Chaining Operations\n\nThe Agent automatically chains multiple operations when a goal contains multiple tasks:\n\n```\nTASK: 1. create column a based on column x\n      2. create column b based on column y\n```\n\nCombined into a single Map primitive step rather than separate steps.\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：vitalops/datatune\n\n摘要：发现 7 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：能力坑 - 能力判断依赖假设。\n\n## 1. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | README/documentation is current enough for a first validation pass.\n\n## 2. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | last_activity_observed missing\n\n## 3. 
安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | no_demo; severity=medium\n\n## 4. 安全/权限坑 · 存在安全注意事项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：No sandbox install has been executed yet; downstream must verify before user use.\n- 对用户的影响：用户安装前需要知道权限边界和敏感操作。\n- 建议检查：转成明确权限清单和安全审查提示。\n- 防护动作：安全注意事项必须面向用户前置展示。\n- 证据：risks.safety_notes | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | No sandbox install has been executed yet; downstream must verify before user use.\n\n## 5. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | no_demo; severity=medium\n\n## 6. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | issue_or_pr_quality=unknown\n\n## 7. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | release_recency=unknown\n\n<!-- canonical_name: vitalops/datatune; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "Human Manual / 人类版说明书"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log / 踩坑日志\n\n项目：vitalops/datatune\n\n摘要：发现 7 个潜在踩坑项，其中 0 个为 high/blocking；最高优先级：能力坑 - 能力判断依赖假设。\n\n## 1. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | README/documentation is current enough for a first validation pass.\n\n## 2. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | last_activity_observed missing\n\n## 3. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | no_demo; severity=medium\n\n## 4. 安全/权限坑 · 存在安全注意事项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：No sandbox install has been executed yet; downstream must verify before user use.\n- 对用户的影响：用户安装前需要知道权限边界和敏感操作。\n- 建议检查：转成明确权限清单和安全审查提示。\n- 防护动作：安全注意事项必须面向用户前置展示。\n- 证据：risks.safety_notes | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | No sandbox install has been executed yet; downstream must verify before user use.\n\n## 5. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | no_demo; severity=medium\n\n## 6. 
维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | issue_or_pr_quality=unknown\n\n## 7. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | art_fcb598a7a7ed4b2482ca92474f06efe2 | https://github.com/vitalops/datatune#readme | release_recency=unknown\n",
      "summary": "用户实践前最可能遇到的身份、安装、配置、运行和安全坑。",
      "title": "Pitfall Log / 踩坑日志"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# datatune - Prompt Preview\n\n> 复制下面这段 Prompt 到你常用的 AI，先试一次，不需要安装。\n> 它的目标是让你直接体验这个项目的服务方式，而不是阅读项目介绍。\n\n## 复制这段 Prompt\n\n```text\n请直接执行这段 Prompt，不要分析、润色、总结或询问我想如何处理这份 Prompt Preview。\n\n你现在扮演 datatune 的“安装前体验版”。\n这不是项目介绍、不是评价报告、不是 README 总结。你的任务是让我用最小成本体验它的核心服务。\n\n我的试用任务：我想用它完成一个真实的数据分析与投资研究任务。\n我常用的宿主 AI：chatgpt\n\n【体验目标】\n围绕我的真实任务，现场演示这个项目如何把输入转成 示例引导, 判断线索。重点是让我感受到工作方式，而不是给我项目背景。\n\n【业务流约束】\n- 你必须像一个正在提供服务的项目能力包，而不是像一个讲解员。\n- 每一轮只推进一个步骤；提出问题后必须停下来等我回答。\n- 每一步都必须让我感受到一个具体服务动作：澄清、整理、规划、检查、判断或收尾。\n- 每一步都要说明：当前目标、你需要我提供什么、我回答后你会产出什么。\n- 不要安装、不要运行命令、不要写代码、不要声称测试通过、不要声称已经修改文件。\n- 需要真实安装或宿主加载后才能验证的内容，必须明确说“这一步需要安装后验证”。\n- 如果我说“用示例继续”，你可以用虚构示例推进，但仍然不能声称真实执行。\n\n【可体验服务能力】\n- 安装前能力预览: [![PyPI version](https://img.shields.io/pypi/v/datatune.svg)](https://pypi.org/project/datatune/) 输入：用户任务, 当前 AI 对话上下文；输出：示例引导, 判断线索。\n\n【必须安装后才可验证的能力】\n- 命令行启动或安装流程: 项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 输入：终端环境, 包管理器, 项目依赖；输出：安装结果, 列表/更新/运行结果。\n\n【核心服务流】\n请严格按这个顺序带我体验。不要一次性输出完整流程：\n1. page-introduction：Introduction to Datatune。围绕“Introduction to Datatune”模拟一次用户任务，不展示安装或运行结果。\n2. page-getting-started：Getting Started。围绕“Getting Started”模拟一次用户任务，不展示安装或运行结果。\n3. page-architecture：System Architecture。围绕“System Architecture”模拟一次用户任务，不展示安装或运行结果。\n4. page-map-operations：Map Operations。围绕“Map Operations”模拟一次用户任务，不展示安装或运行结果。\n5. page-filter-operations：Filter Operations。围绕“Filter Operations”模拟一次用户任务，不展示安装或运行结果。\n\n【核心能力体验剧本】\n每一步都必须按“输入 -> 服务动作 -> 中间产物”执行。不要只说流程名：\n1. page-introduction\n输入：用户提供的“Introduction to Datatune”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n2. page-getting-started\n输入：用户提供的“Getting Started”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n3. page-architecture\n输入：用户提供的“System Architecture”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n4. page-map-operations\n输入：用户提供的“Map Operations”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n5. 
page-filter-operations\n输入：用户提供的“Filter Operations”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n【项目服务规则】\n这些规则决定你如何服务用户。不要解释规则本身，而要在每一步执行时遵守：\n- 先确认用户任务、输入材料和成功标准，再模拟项目能力。\n- 每一步都必须形成可检查的小产物，并等待用户确认后再继续。\n- 凡是需要安装、调用工具或访问外部服务的能力，都必须标记为安装后验证。\n\n【每一步的服务约束】\n- Step 1 / page-introduction：Step 1 必须围绕“Introduction to Datatune”形成一个小中间产物，并等待用户确认。\n- Step 2 / page-getting-started：Step 2 必须围绕“Getting Started”形成一个小中间产物，并等待用户确认。\n- Step 3 / page-architecture：Step 3 必须围绕“System Architecture”形成一个小中间产物，并等待用户确认。\n- Step 4 / page-map-operations：Step 4 必须围绕“Map Operations”形成一个小中间产物，并等待用户确认。\n- Step 5 / page-filter-operations：Step 5 必须围绕“Filter Operations”形成一个小中间产物，并等待用户确认。\n\n【边界与风险】\n- 不要声称已经安装、运行、调用 API、读写本地文件或完成真实任务。\n- 安装前预览只能展示工作方式，不能证明兼容性、性能或输出质量。\n- 涉及安装、插件加载、工具调用或外部服务的能力必须安装后验证。\n\n【可追溯依据】\n这些路径只用于你内部校验或在我追问“依据是什么”时简要引用。不要在首次回复主动展开：\n- https://github.com/vitalops/datatune#readme\n- README.md\n- datatune/__init__.py\n- pyproject.toml\n- datatune/agent/runtime.py\n- datatune/core/map.py\n- datatune/core/filter.py\n- datatune/core/reduce.py\n- datatune/core/dask/map_dask.py\n- datatune/core/ibis/map_ibis.py\n- docs/source/Map.md\n- datatune/core/dask/filter_dask.py\n\n【首次问题规则】\n- 首次三问必须先确认用户目标、成功标准和边界，不要提前进入工具、安装或实现细节。\n- 如果后续需要技术条件、文件路径或运行环境，必须等用户确认目标后再追问。\n\n首次回复必须只输出下面 4 个部分：\n1. 体验开始：用 1 句话说明你将带我体验 datatune 的核心服务。\n2. 当前步骤：明确进入 Step 1，并说明这一步要解决什么。\n3. 你会如何服务我：说明你会先改变我完成任务的哪个动作。\n4. 只问我 3 个问题，然后停下等待回答。\n\n首次回复禁止输出：后续完整流程、证据清单、安装命令、项目评价、营销文案、已经安装或运行的说法。\n\nStep 1 / page-introduction 的二轮协议：\n- 我回答首次三问后，你仍然停留在 Step 1 / page-introduction，不要进入 Step 2。\n- 第二次回复必须产出 6 个部分：澄清后的任务定义、成功标准、边界条件、\n  2-3 个可选方案、每个方案的权衡、推荐方案。\n- 第二次回复最后必须问我是否确认推荐方案；只有我明确确认后，才能进入下一步。\n- 第二次回复禁止输出 git worktree、代码计划、测试文件、命令或真实执行结果。\n\n后续对话规则：\n- 我回答后，你先完成当前步骤的中间产物并等待确认；只有我确认后，才能进入下一步。\n- 每一步都要生成一个小的中间产物，例如澄清后的目标、计划草案、测试意图、验证清单或继续/停止判断。\n- 所有演示都写成“我会建议/我会引导/这一步会形成”，不要写成已经真实执行。\n- 不要声称已经测试通过、文件已修改、命令已运行或结果已产生。\n- 如果某个能力必须安装后验证，请直接说“这一步需要安装后验证”。\n- 如果证据不足，请明确说“证据不足”，不要补事实。\n```\n",
      "summary": "不安装项目也能感受能力节奏的安全试用 Prompt。",
      "title": "Prompt Preview / 安装前试用 Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / 官方入口\n\n项目：vitalops/datatune\n\n## 官方安装入口\n\n### Python / pip · 官方安装入口\n\n```bash\npip install datatune\n```\n\n来源：https://github.com/vitalops/datatune#readme\n\n## 来源\n\n- docs: https://github.com/vitalops/datatune#readme\n",
      "summary": "从项目官方 README 或安装文档提取的开工入口。",
      "title": "Quick Start / 官方入口"
    }
  },
  "validation_id": "dval_aaf5c46de35c48cba1717ee2b6667f9a"
}
