{
  "canonical_name": "pathwaycom/pathway",
  "compilation_id": "pack_5fd0856749b04e30b6ef9bc79c7d4261",
  "created_at": "2026-05-16T21:11:32.569851+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=skill, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=skill, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `pip install -U pathway` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "pip install -U pathway",
      "sandbox_container_image": "python:3.12-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "deterministic_isolated_install",
      "sandbox_validation_id": "sbx_201d177b776449bf8ebb301268561a08"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_a6d2fff6b478ba4fbfe579de3aaab457",
    "canonical_name": "pathwaycom/pathway",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/pathwaycom/pathway",
    "slug": "pathway",
    "source_packet_id": "phit_b1ea0938e4c0431dacce0edfc388bf24",
    "source_validation_id": "dval_6b1c86605d304988ae43163ed46c8441"
  },
  "merchandising": {
    "best_for": "需要信息检索与知识管理能力，并使用 local_cli的用户",
    "github_forks": 1672,
    "github_stars": 63307,
    "one_liner_en": "Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.",
    "one_liner_zh": "Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.",
    "primary_category": {
      "category_id": "research-knowledge",
      "confidence": "high",
      "name_en": "Research & Knowledge",
      "name_zh": "信息检索与知识管理",
      "reason": "strong category phrase match from project identity and outcome"
    },
    "target_user": "使用 local_cli 等宿主 AI 的用户",
    "title_en": "pathway",
    "title_zh": "pathway 能力包",
    "visible_tags": [
      {
        "label_en": "MCP Tools",
        "label_zh": "MCP 工具",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-mcp-tools",
        "type": "product_domain"
      },
      {
        "label_en": "Knowledge Base Q&A",
        "label_zh": "知识库问答",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-knowledge-base-q-a",
        "type": "user_job"
      },
      {
        "label_en": "Natural-language Web Actions",
        "label_zh": "自然语言网页操作",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-natural-language-web-actions",
        "type": "core_capability"
      },
      {
        "label_en": "Multi-role Workflow",
        "label_zh": "多角色协作流程",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-multi-role-workflow",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Structured Data Extraction",
        "label_zh": "结构化数据提取",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-structured-data-extraction",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_b1ea0938e4c0431dacce0edfc388bf24",
  "page_model": {
    "artifacts": {
      "artifact_slug": "pathway",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "pip install -U pathway",
          "label": "Python / pip · 官方安装入口",
          "source": "https://github.com/pathwaycom/pathway#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "MCP 工具",
        "知识库问答",
        "自然语言网页操作",
        "多角色协作流程",
        "结构化数据提取"
      ],
      "eyebrow": "信息检索与知识管理",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要信息检索与知识管理能力，并使用 local_cli的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG."
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "local_cli",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "skill, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: TypeError: Cannot instantiate typing.Any",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_d63b2462b18449c4a2c02bb5e02a96c8 | https://github.com/pathwaycom/pathway/issues/227 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug]: TypeError: Cannot instantiate typing.Any",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Entra Authentication Support (credential handler and/or password callback)",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_e7c65f4d8bce4b1a9c99accbf0457633 | https://github.com/pathwaycom/pathway/issues/230 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Entra Authentication Support (credential handler and/or password callback)",
            "user_impact": "可能影响授权、密钥配置或安全边界。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Automated schema exploration in input connectors",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_74b91feb391f4b0298216d2a48fe5abe | https://github.com/pathwaycom/pathway/issues/224 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Automated schema exploration in input connectors",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_ab5d2b2076034999bfaed612a3d95d60 | https://github.com/pathwaycom/pathway/issues/232 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Improve watermarks in POSIX-like objects tracker",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_38f29b1fba3b41cc99b1364d15ef7213 | https://github.com/pathwaycom/pathway/issues/225 | 来源讨论提到 node 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Improve watermarks in POSIX-like objects tracker",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Persistence in `iterate` operator",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_e642d639ebbc4382b242028ef89ec236 | https://github.com/pathwaycom/pathway/issues/214 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Persistence in `iterate` operator",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support LEANN for RAG pipelines",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_5d99dd1cfc794b299633b6c5dc9dd33b | https://github.com/pathwaycom/pathway/issues/173 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Support LEANN for RAG pipelines",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support MongoDB Atlas in pw.io.mongodb",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_fa1d6e2e977a45d99c414724681f0f32 | https://github.com/pathwaycom/pathway/issues/221 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Support MongoDB Atlas in pw.io.mongodb",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：feat: Add Microsoft SQL Server (MSSQL) connector",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_370bcc864a314734b602f01d3d444ef9 | https://github.com/pathwaycom/pathway/issues/204 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：feat: Add Microsoft SQL Server (MSSQL) connector",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.27.0",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_8d6905d8719c488cbcbb821e794add26 | https://github.com/pathwaycom/pathway/releases/tag/v0.27.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.27.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.29.0",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_b25babd3e3cc49108e56daaaaae8c5bf | https://github.com/pathwaycom/pathway/releases/tag/v0.29.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.29.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.30.0",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_0291ee068e4e4d38aa86d0fd433b960a | https://github.com/pathwaycom/pathway/releases/tag/v0.30.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.30.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.28.0",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_708184d9c8ac445d923c82d38af7bd19 | https://github.com/pathwaycom/pathway/releases/tag/v0.28.0 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.28.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.30.1",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_e7a19bd03a49499e987781160e4b76f0 | https://github.com/pathwaycom/pathway/releases/tag/v0.30.1 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.30.1",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "README/documentation is current enough for a first validation pass.",
            "category": "能力坑",
            "evidence": [
              "capability.assumptions | github_repo:571186647 | https://github.com/pathwaycom/pathway | README/documentation is current enough for a first validation pass."
            ],
            "severity": "medium",
            "suggested_check": "将假设转成下游验证清单。",
            "title": "能力判断依赖假设",
            "user_impact": "假设不成立时，用户拿不到承诺的能力。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个运行相关的待验证问题：`pw.io.mssql.read` needs LSN persistence",
            "category": "运行坑",
            "evidence": [
              "community_evidence:github | cevd_59d927a667a843ccb7d0a4cd147a1dc0 | https://github.com/pathwaycom/pathway/issues/218 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：`pw.io.mssql.read` needs LSN persistence",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "发现 22 个潜在踩坑项，其中 2 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any。",
        "title": "踩坑日志"
      },
      "snapshot": {
        "contributors": 39,
        "forks": 1672,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": 63307
      },
      "source_url": "https://github.com/pathwaycom/pathway",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.",
      "title": "pathway 能力包",
      "trial_prompt": "# pathway - Prompt Preview\n\n> 复制下面这段 Prompt 到你常用的 AI，先试一次，不需要安装。\n> 它的目标是让你直接体验这个项目的服务方式，而不是阅读项目介绍。\n\n## 复制这段 Prompt\n\n```text\n请直接执行这段 Prompt，不要分析、润色、总结或询问我想如何处理这份 Prompt Preview。\n\n你现在扮演 pathway 的“安装前体验版”。\n这不是项目介绍、不是评价报告、不是 README 总结。你的任务是让我用最小成本体验它的核心服务。\n\n我的试用任务：我想快速理解一组资料，并得到结构化摘要、对比和继续研究的问题。\n我常用的宿主 AI：Local CLI\n\n【体验目标】\n围绕我的真实任务，现场演示这个项目如何把输入转成 示例引导, 判断线索。重点是让我感受到工作方式，而不是给我项目背景。\n\n【业务流约束】\n- 你必须像一个正在提供服务的项目能力包，而不是像一个讲解员。\n- 每一轮只推进一个步骤；提出问题后必须停下来等我回答。\n- 每一步都必须让我感受到一个具体服务动作：澄清、整理、规划、检查、判断或收尾。\n- 每一步都要说明：当前目标、你需要我提供什么、我回答后你会产出什么。\n- 不要安装、不要运行命令、不要写代码、不要声称测试通过、不要声称已经修改文件。\n- 需要真实安装或宿主加载后才能验证的内容，必须明确说“这一步需要安装后验证”。\n- 如果我说“用示例继续”，你可以用虚构示例推进，但仍然不能声称真实执行。\n\n【可体验服务能力】\n- 安装前能力预览: Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG. 输入：用户任务, 当前 AI 对话上下文；输出：示例引导, 判断线索。\n\n【必须安装后才可验证的能力】\n- 命令行启动或安装流程: 项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 输入：终端环境, 包管理器, 项目依赖；输出：安装结果, 列表/更新/运行结果。\n\n【核心服务流】\n请严格按这个顺序带我体验。不要一次性输出完整流程：\n1. overview-introduction：Pathway简介。围绕“Pathway简介”模拟一次用户任务，不展示安装或运行结果。\n2. overview-installation：安装指南。围绕“安装指南”模拟一次用户任务，不展示安装或运行结果。\n3. quickstart-basic-example：基础示例。围绕“基础示例”模拟一次用户任务，不展示安装或运行结果。\n4. arch-engine-overview：引擎架构。围绕“引擎架构”模拟一次用户任务，不展示安装或运行结果。\n5. connectors-overview：连接器概述。围绕“连接器概述”模拟一次用户任务，不展示安装或运行结果。\n\n【核心能力体验剧本】\n每一步都必须按“输入 -> 服务动作 -> 中间产物”执行。不要只说流程名：\n1. overview-introduction\n输入：用户提供的“Pathway简介”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n2. overview-installation\n输入：用户提供的“安装指南”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n3. quickstart-basic-example\n输入：用户提供的“基础示例”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n4. arch-engine-overview\n输入：用户提供的“引擎架构”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n5. 
connectors-overview\n输入：用户提供的“连接器概述”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n【项目服务规则】\n这些规则决定你如何服务用户。不要解释规则本身，而要在每一步执行时遵守：\n- 先确认用户任务、输入材料和成功标准，再模拟项目能力。\n- 每一步都必须形成可检查的小产物，并等待用户确认后再继续。\n- 凡是需要安装、调用工具或访问外部服务的能力，都必须标记为安装后验证。\n\n【每一步的服务约束】\n- Step 1 / overview-introduction：Step 1 必须围绕“Pathway简介”形成一个小中间产物，并等待用户确认。\n- Step 2 / overview-installation：Step 2 必须围绕“安装指南”形成一个小中间产物，并等待用户确认。\n- Step 3 / quickstart-basic-example：Step 3 必须围绕“基础示例”形成一个小中间产物，并等待用户确认。\n- Step 4 / arch-engine-overview：Step 4 必须围绕“引擎架构”形成一个小中间产物，并等待用户确认。\n- Step 5 / connectors-overview：Step 5 必须围绕“连接器概述”形成一个小中间产物，并等待用户确认。\n\n【边界与风险】\n- 不要声称已经安装、运行、调用 API、读写本地文件或完成真实任务。\n- 安装前预览只能展示工作方式，不能证明兼容性、性能或输出质量。\n- 涉及安装、插件加载、工具调用或外部服务的能力必须安装后验证。\n\n【可追溯依据】\n这些路径只用于你内部校验或在我追问“依据是什么”时简要引用。不要在首次回复主动展开：\n- https://github.com/pathwaycom/pathway\n- https://github.com/pathwaycom/pathway#readme\n- README.md\n- pyproject.toml\n- rust-toolchain.toml\n- docs/2.developers/4.user-guide/10.introduction/20.installation.md\n- examples/notebooks/tutorials/installation_first_steps.ipynb\n- python/pathway/__init__.py\n- src/engine/mod.rs\n- src/engine/dataflow.rs\n- src/engine/graph.rs\n- external/differential-dataflow/src/lib.rs\n\n【首次问题规则】\n- 首次三问必须先确认用户目标、成功标准和边界，不要提前进入工具、安装或实现细节。\n- 如果后续需要技术条件、文件路径或运行环境，必须等用户确认目标后再追问。\n\n首次回复必须只输出下面 4 个部分：\n1. 体验开始：用 1 句话说明你将带我体验 pathway 的核心服务。\n2. 当前步骤：明确进入 Step 1，并说明这一步要解决什么。\n3. 你会如何服务我：说明你会先改变我完成任务的哪个动作。\n4. 只问我 3 个问题，然后停下等待回答。\n\n首次回复禁止输出：后续完整流程、证据清单、安装命令、项目评价、营销文案、已经安装或运行的说法。\n\nStep 1 / overview-introduction 的二轮协议：\n- 我回答首次三问后，你仍然停留在 Step 1 / overview-introduction，不要进入 Step 2。\n- 第二次回复必须产出 6 个部分：澄清后的任务定义、成功标准、边界条件、\n  2-3 个可选方案、每个方案的权衡、推荐方案。\n- 第二次回复最后必须问我是否确认推荐方案；只有我明确确认后，才能进入下一步。\n- 第二次回复禁止输出 git worktree、代码计划、测试文件、命令或真实执行结果。\n\n后续对话规则：\n- 我回答后，你先完成当前步骤的中间产物并等待确认；只有我确认后，才能进入下一步。\n- 每一步都要生成一个小的中间产物，例如澄清后的目标、计划草案、测试意图、验证清单或继续/停止判断。\n- 所有演示都写成“我会建议/我会引导/这一步会形成”，不要写成已经真实执行。\n- 不要声称已经测试通过、文件已修改、命令已运行或结果已产生。\n- 如果某个能力必须安装后验证，请直接说“这一步需要安装后验证”。\n- 如果证据不足，请明确说“证据不足”，不要补事实。\n```\n",
      "voices": [
        {
          "body": "来源平台：github。github/github_issue: Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in（https://github.com/pathwaycom/pathway/issues/232）；github/github_issue: Entra Authentication Support (credential handler and/or password callbac（https://github.com/pathwaycom/pathway/issues/230）；github/github_issue: [Bug]: TypeError: Cannot instantiate typing.Any（https://github.com/pathwaycom/pathway/issues/227）；github/github_issue: Improve watermarks in POSIX-like objects tracker（https://github.com/pathwaycom/pathway/issues/225）；github/github_issue: Automated schema exploration in input connectors（https://github.com/pathwaycom/pathway/issues/224）；github/github_issue: Support encryption in Kinesis connectors（https://github.com/pathwaycom/pathway/issues/223）；github/github_issue: Support external replication slots in `pw.io.postgres.read`; add persist（https://github.com/pathwaycom/pathway/issues/222）；github/github_issue: Support MongoDB Atlas in pw.io.mongodb（https://github.com/pathwaycom/pathway/issues/221）；github/github_issue: Support LEANN for RAG pipelines（https://github.com/pathwaycom/pathway/issues/173）；github/github_issue: feat: Add Microsoft SQL Server (MSSQL) connector（https://github.com/pathwaycom/pathway/issues/204）；github/github_issue: Persistence in `iterate` operator（https://github.com/pathwaycom/pathway/issues/214）；github/github_issue: `pw.io.mssql.read` needs LSN persistence（https://github.com/pathwaycom/pathway/issues/218）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in",
              "url": "https://github.com/pathwaycom/pathway/issues/232"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Entra Authentication Support (credential handler and/or password callbac",
              "url": "https://github.com/pathwaycom/pathway/issues/230"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: TypeError: Cannot instantiate typing.Any",
              "url": "https://github.com/pathwaycom/pathway/issues/227"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Improve watermarks in POSIX-like objects tracker",
              "url": "https://github.com/pathwaycom/pathway/issues/225"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Automated schema exploration in input connectors",
              "url": "https://github.com/pathwaycom/pathway/issues/224"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Support encryption in Kinesis connectors",
              "url": "https://github.com/pathwaycom/pathway/issues/223"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Support external replication slots in `pw.io.postgres.read`; add persist",
              "url": "https://github.com/pathwaycom/pathway/issues/222"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Support MongoDB Atlas in pw.io.mongodb",
              "url": "https://github.com/pathwaycom/pathway/issues/221"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Support LEANN for RAG pipelines",
              "url": "https://github.com/pathwaycom/pathway/issues/173"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "feat: Add Microsoft SQL Server (MSSQL) connector",
              "url": "https://github.com/pathwaycom/pathway/issues/204"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Persistence in `iterate` operator",
              "url": "https://github.com/pathwaycom/pathway/issues/214"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "`pw.io.mssql.read` needs LSN persistence",
              "url": "https://github.com/pathwaycom/pathway/issues/218"
            }
          ],
          "status": "已收录 12 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "信息检索与知识管理",
      "desc": "Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.",
      "effort": "安装已验证",
      "forks": 1672,
      "icon": "search",
      "name": "pathway 能力包",
      "risk": "可发布",
      "slug": "pathway",
      "stars": 63307,
      "tags": [
        "MCP 工具",
        "知识库问答",
        "自然语言网页操作",
        "多角色协作流程",
        "结构化数据提取"
      ],
      "thumb": "blue",
      "type": "Skill Pack"
    },
    "manual": {
      "markdown": "# https://github.com/pathwaycom/pathway 项目说明书\n\n生成时间：2026-05-16 20:44:44 UTC\n\n## 目录\n\n- [Pathway简介](#overview-introduction)\n- [安装指南](#overview-installation)\n- [基础示例](#quickstart-basic-example)\n- [引擎架构](#arch-engine-overview)\n- [Python与Rust集成](#arch-python-rust-integration)\n- [连接器概述](#connectors-overview)\n- [数据库连接器](#connectors-database)\n- [流数据连接器](#connectors-streaming)\n- [表操作](#trans-table-operations)\n- [连接与聚合](#trans-joins-aggregations)\n\n<a id='overview-introduction'></a>\n\n## Pathway简介\n\n### 相关页面\n\n相关主题：[引擎架构](#arch-engine-overview), [安装指南](#overview-installation)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n- [external/differential-dataflow/src/trace/implementations/ord.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/implementations/ord.rs)\n- [external/timely-dataflow/timely/src/progress/subgraph.rs](https://github.com/pathwaycom/pathway/blob/main/external/timely-dataflow/timely/src/progress/subgraph.rs)\n- [external/differential-dataflow/src/trace/wrappers/freeze.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/wrappers/freeze.rs)\n- [src/connectors/postgres.rs](https://github.com/pathwaycom/pathway/blob/main/src/connectors/postgres.rs)\n- [examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n</details>\n\n# Pathway简介\n\n## 项目概述\n\nPathway是一个Python ETL框架，专为流处理（stream processing）、实时分析（real-time analytics）、LLM管道（LLM pipelines）和检索增强生成（RAG）场景设计。资料来源：[README.md:1]()\n\nPathway的核心优势在于其**易用的Python 
API**，允许用户无缝集成最喜欢的Python机器学习库。同时，Pathway代码具有极高的通用性和鲁棒性，可以同时用于开发和生产环境，有效处理批处理（batch）和流式数据（streaming data）。资料来源：[README.md:3]()\n\nPathway的一个显著特点是**代码一致性**：同一套代码可用于本地开发、CI/CD测试、运行批处理任务、处理流回放（stream replays）以及处理数据流。资料来源：[README.md:4]()\n\n## 核心技术架构\n\nPathway的技术架构采用分层设计，融合了多种分布式计算范式：\n\n```mermaid\ngraph TD\n    A[Python API层] --> B[PyO3绑定层]\n    B --> C[Rust执行引擎]\n    C --> D[Timely Dataflow分布式计算]\n    C --> E[Differential Dataflow增量计算]\n    D --> F[增量数据流处理]\n    E --> F\n```\n\n### Python API层\n\nPathway提供简洁的Python接口，通过PyO3实现Python与Rust的双向绑定。Python API层支持多种数据类型和连接器配置。资料来源：[src/python_api.rs:1-80]()\n\n核心Python绑定类包括：\n\n| 类名 | 功能说明 |\n|------|----------|\n| `PythonSubject` | Python主题定义，支持start、read、seek等回调 |\n| `ValueField` | 值字段定义，包含名称、类型、来源和默认值 |\n| `Table` | 数据表核心抽象 |\n| `ConnectorProperties` | 连接器通用属性 |\n| `CsvParserSettings` | CSV解析器配置 |\n| `AwsS3Settings` | AWS S3连接配置 |\n\n### Rust执行引擎\n\nPathway的执行引擎由Rust实现，基于Differential Dataflow和Timely Dataflow构建。Rust引擎负责高性能的数据处理和计算。资料来源：[README.md:5]()\n\n引擎通过`pymodule`导出Python可调用的类和函数，实现Python语法的易用性与Rust的高性能完美结合。资料来源：[src/python_api.rs:100-150]()\n\n### Timely Dataflow分布式计算\n\nPathway使用Timely Dataflow作为底层分布式计算框架。Timely Dataflow提供了数据流图的处理能力，支持并行和分布式计算。资料来源：[external/timely-dataflow/timely/src/progress/subgraph.rs:1-30]()\n\nTimely Dataflow的核心组件包括：\n\n```mermaid\ngraph LR\n    A[数据输入] --> B[Scope子图]\n    B --> C[PerOperatorState]\n    C --> D[Antichain进度跟踪]\n    D --> E[ChangeBatch消息]\n    E --> F[数据输出]\n```\n\n关键数据结构：\n- **Activations**：共享激活状态管理\n- **MutableAntichain**：可变反链，用于跟踪进度边界\n- **ChangeBatch**：变化批次，用于记录数据变化\n\n### Differential Dataflow增量计算\n\nDifferential Dataflow是Pathway实现增量计算的核心技术。与传统批处理不同，Differential Dataflow能够智能处理数据的变化，仅重新计算受影响的部分。资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:1-30]()\n\n#### 轨迹追踪器（Trace）\n\nPathway使用轨迹（Trace）来记录历史数据状态，支持时间旅行查询和增量更新。资料来源：[external/differential-dataflow/src/trace/wrappers/freeze.rs:1-30]()\n\n```mermaid\ngraph TD\n    A[数据变化] --> B[Trace记录]\n    B --> 
C[时间戳管理]\n    C --> D[Antichain since边界]\n    D --> E[增量计算引擎]\n    E --> F[输出更新]\n```\n\n#### 有序批处理\n\nPathway实现了高效的有序批处理结构`OrdKeyBatch`和`OrdValBatch`，用于存储键值对的时间序列数据。资料来源：[external/differential-dataflow/src/trace/implementations/ord.rs:1-50]()\n\n| 组件 | 说明 |\n|------|------|\n| `OrdKeyCursor` | 有序键游标，用于遍历键数据 |\n| `OrdValCursor` | 有序键值游标，用于遍历键值对 |\n| `BatchContainer` | 批量容器接口 |\n| `MergeBuilder` | 合并构建器 |\n\n## 主要功能特性\n\n### 连接器支持\n\nPathway提供丰富的连接器支持，覆盖多种数据源和数据格式：\n\n| 连接器类型 | 支持的数据源 |\n|------------|--------------|\n| 文件系统 | CSV、Parquet等 |\n| 云存储 | AWS S3、Azure Blob Storage |\n| 数据库 | PostgreSQL（包括复制支持） |\n| 搜索引擎 | Elasticsearch |\n| 流处理 | MQTT、Kafka、Debezium |\n| 其他 | Schema Registry、Iceberg |\n\n资料来源：[src/python_api.rs:80-120]()\n\n### PostgreSQL连接器\n\nPathway的PostgreSQL连接器支持完整的二进制协议，实现高效的数据读写：\n\n- 支持多种数据类型映射（BOOL、INT、Float、TEXT、JSONB等）\n- 二进制数组格式写入\n- 复制槽（Replication Slot）支持逻辑复制\n- Debezium CDC集成\n\n资料来源：[src/connectors/postgres.rs:1-60]()\n\n### 数据格式与解析\n\nPathway提供灵活的数据格式配置，包括：\n\n```python\nCsvParserSettings(\n    name=field_name,\n    type_=field_type,\n    source=FieldSource.Payload,\n    default=None\n)\n```\n\n支持的数据类型包括：INT、FLOAT、STRING、BOOL、TIMESTAMP、JSON等。\n\n### 持久化与状态管理\n\nPathway支持灵活的数据持久化配置：\n\n| 配置项 | 说明 |\n|--------|------|\n| `PersistenceConfig` | 持久化基础配置 |\n| `BackfillingThreshold` | 回填阈值配置 |\n| `PySnapshotAccess` | 快照访问模式 |\n| `PySnapshotEvent` | 快照事件类型 |\n\n## 应用场景\n\n### ETL管道\n\nPathway特别适合构建ETL（Extract-Transform-Load）管道，支持从多种数据源提取数据、进行转换处理、加载到目标系统。\n\n```mermaid\ngraph LR\n    A[数据源] -->|Extract| B[Pathway管道]\n    B -->|Transform| C[增量处理]\n    C -->|Load| D[目标系统]\n```\n\n### 实时分析\n\n基于Differential Dataflow的增量计算能力，Pathway能够高效处理实时数据分析任务，仅处理数据流中的增量变化部分。\n\n### LLM管道与RAG\n\nPathway为构建LLM（Large Language Model）管道和RAG（Retrieval-Augmented Generation）应用提供专用支持，能够实时索引和检索外部数据源。\n\n## 开发与部署\n\n### 开发环境\n\nPathway支持本地开发环境，开发者可以使用熟悉的Python工具链进行开发。资料来源：[examples/templates/el-pipeline/README.md:1-20]()\n\n### 许可证\n\nPathway采用BSL（Business Source 
License）许可证发布，允许非商业使用和评估。商业使用需要获取正式许可证。资料来源：[README.md:45]()\n\n### 许可证配置\n\n部署时需要设置环境变量：\n\n```bash\nexport PATHWAY_LICENSE_KEY=your_pathway_key\n```\n\n## 技术优势总结\n\n| 优势 | 描述 |\n|------|------|\n| **高性能** | Rust引擎提供接近原生的执行效率 |\n| **增量计算** | Differential Dataflow避免重复计算 |\n| **统一API** | Python接口简化开发，Rust保证性能 |\n| **多数据源** | 丰富的内置连接器支持 |\n| **可扩展性** | 基于Timely Dataflow支持分布式部署 |\n| **开发友好** | 支持本地开发和生产环境一致体验 |\n\n---\n\n<a id='overview-installation'></a>\n\n## 安装指南\n\n### 相关页面\n\n相关主题：[基础示例](#quickstart-basic-example)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [pyproject.toml](https://github.com/pathwaycom/pathway/blob/main/pyproject.toml)\n- [rust-toolchain.toml](https://github.com/pathwaycom/pathway/blob/main/rust-toolchain.toml)\n- [docs/2.developers/4.user-guide/10.introduction/20.installation.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/10.introduction/20.installation.md)\n</details>\n\n# 安装指南\n\nPathway 是一个基于 Python 的 ETL 框架，由 Rust 引擎驱动，支持流处理和实时分析。本指南将帮助你在不同环境下正确安装和配置 Pathway。\n\n## 系统要求\n\n### 硬件和操作系统要求\n\n| 要求类型 | 最低配置 | 推荐配置 |\n|---------|---------|---------|\n| 处理器 | 2 核 | 4 核或以上 |\n| 内存 | 4 GB | 8 GB 或以上 |\n| 磁盘空间 | 500 MB | 1 GB 或以上 |\n| 操作系统 | Linux/macOS/Windows | Linux/macOS |\n\n### Python 版本要求\n\nPathway 需要 Python 3.10 或更高版本。建议使用 Python 3.11 以获得最佳性能。\n\n| Python 版本 | 支持状态 |\n|------------|---------|\n| 3.10 | ✅ 支持 |\n| 3.11 | ✅ 推荐 |\n| 3.12 | ✅ 支持 |\n\n## 安装方式\n\n### 使用 pip 安装\n\n通过 pip 包管理器安装 Pathway 是最简单的方式：\n\n```bash\npip install pathway\n```\n\n### 从源码安装\n\n对于需要最新功能或进行开发调试的用户，可以从源码编译安装：\n\n```bash\n# 克隆仓库\ngit clone https://github.com/pathwaycom/pathway.git\ncd pathway\n\n# 安装 Rust 依赖和 Python 包\npip install -e .\n```\n\n## 依赖环境\n\n### Rust 工具链\n\nPathway 的核心引擎使用 Rust 编写，需要 Rust 工具链支持。\n\n#### 安装 Rust\n\n```bash\n# Linux/macOS\ncurl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh\n\n# Windows - 使用 rustup 安装程序\nwinget install Rustlang.Rustup\n```\n\n#### 验证 Rust 
安装\n\n```bash\nrustc --version\ncargo --version\n```\n\n项目使用 `rust-toolchain.toml` 文件指定 Rust 版本，确保编译器版本一致性。\n\n### Python 依赖\n\nPathway 的 Python 依赖在 `pyproject.toml` 中定义，主要包括：\n\n| 依赖包 | 版本要求 | 用途 |\n|-------|---------|------|\n| pyo3 | 最新稳定版 | Python 绑定 |\n| timely-dataflow | 外部依赖 | 数据流处理 |\n| differential-dataflow | 外部依赖 | 增量计算 |\n\n## 环境配置\n\n### 环境变量\n\nPathway 运行可能需要的可选环境变量：\n\n```bash\n# 设置 Pathway 许可证密钥（商业功能需要）\nexport PATHWAY_LICENSE_KEY=your_pathway_key\n\n# 可选：设置 Python 路径\nexport PYTHONPATH=/path/to/pathway:$PYTHONPATH\n```\n\n### 验证安装\n\n安装完成后，可以通过以下方式验证：\n\n```python\nimport pathway as pw\n\n# 打印版本信息\nprint(pw.__version__)\n\n# 创建一个简单的数据流测试\ninput_session = pw.demo.demo_reduce_operator(show_progress=False)\nprint(\"Pathway 安装成功！\")\n```\n\n## 平台特定说明\n\n### Linux\n\nLinux 环境需要确保以下系统包已安装：\n\n```bash\n# Debian/Ubuntu\nsudo apt-get update\nsudo apt-get install -y python3-dev python3-pip cargo\n```\n\n### macOS\n\nmacOS 用户建议使用 Homebrew 安装依赖：\n\n```bash\nbrew install python rust\npip3 install pathway\n```\n\n### Windows\n\nWindows 用户需要安装 Visual C++ Build Tools 以编译 Rust 依赖：\n\n1. 下载并安装 [Visual Studio Build Tools](https://visualstudio.microsoft.com/visual-cpp-build-tools/)\n2. 确保安装 \"C++ 生成工具\" 工作负载\n3. 通过 PowerShell 或命令提示符安装 Pathway\n\n## 常见问题\n\n### 编译错误\n\n如果遇到编译错误，确保 Rust 工具链是最新版本：\n\n```bash\nrustup update\ncargo clean\npip install --force-reinstall pathway\n```\n\n### 导入错误\n\n如果遇到 `ModuleNotFoundError`，检查 Python 路径和安装状态：\n\n```bash\npip show pathway\npython -c \"import pathway; print(pathway.__file__)\"\n```\n\n## 下一步\n\n安装完成后，你可以开始使用 Pathway：\n\n1. 阅读[快速开始指南](./quick-start.md)了解基本用法\n2. 查看[用户指南](../user-guide/introduction/welcome)深入学习\n3. 
参考 [API 文档](https://pathway.com/developers/api-docs/pathway)了解完整功能\n\n---\n\n<a id='quickstart-basic-example'></a>\n\n## 基础示例\n\n### 相关页面\n\n相关主题：[表操作](#trans-table-operations), [连接器概述](#connectors-overview)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md)\n- [examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md)\n- [examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n</details>\n\n# 基础示例\n\nPathway 是一个 Python ETL 框架，用于流处理、实时分析、LLM 管道和 RAG（检索增强生成）应用开发。本文档介绍 Pathway 的基础概念、核心组件和入门示例。\n\n## 概述\n\nPathway 由 Python 前端和 Rust 后端引擎组成，通过 PyO3 实现 Python 与 Rust 的互操作。Python API 提供了简洁易用的接口，而 Rust 引擎基于 Differential Dataflow 进行增量计算，确保高性能和一致性。\n\n资料来源：[README.md:1-15]()\n\n## 核心概念\n\n### 数据流架构\n\n```mermaid\ngraph TD\n    A[数据源 Connectors] --> B[PythonSubject 输入]\n    B --> C[Rust 引擎处理]\n    C --> D[增量计算 Differential Dataflow]\n    D --> E[PyExportedTable 输出]\n    E --> F[快照 Snapshot]\n```\n\nPathway 的核心数据流遵循：数据源 → PythonSubject → Rust 引擎 → 增量计算 → ExportedTable 的模式。\n\n资料来源：[src/python_api.rs:35-45]()\n\n### PythonSubject 核心类\n\n`PythonSubject` 是 Python 端的数据输入接口，用于定义数据源的读取行为：\n\n| 参数 | 类型 | 说明 |\n|------|------|------|\n| `start` | `Py<PyAny>` | 起始处理函数 |\n| `read` | `Py<PyAny>` | 读取数据函数 |\n| `seek` | `Py<PyAny>` | 定位函数 |\n| `on_persisted_run` | `Py<PyAny>` | 持久化运行回调 |\n| `end` | `Py<PyAny>` | 结束处理函数 |\n| `is_internal` | `bool` | 是否为内部数据源 |\n| `deletions_enabled` | `bool` | 是否启用删除 
|\n\n资料来源：[src/python_api.rs:25-33]()\n\n### ValueField 模式字段\n\n`ValueField` 定义表的 schema 字段结构：\n\n| 属性 | 类型 | 说明 |\n|------|------|------|\n| `name` | `String` | 字段名称 |\n| `type_` | `Type` | 数据类型 |\n| `source` | `FieldSource` | 字段来源（默认 Payload） |\n| `default` | `Option<Value>` | 默认值 |\n| `metadata` | `Option<String>` | 元数据 |\n\n资料来源：[src/python_api.rs:52-61]()\n\n### PyExportedTable 导出表\n\n`PyExportedTable` 提供数据输出和快照功能：\n\n| 方法 | 返回类型 | 说明 |\n|------|----------|------|\n| `frontier()` | `TotalFrontier<Timestamp>` | 获取当前前沿 |\n| `snapshot_at(frontier)` | `Vec<(Key, Vec<Value>)>` | 在指定前沿获取快照 |\n| `failed()` | `bool` | 检查处理是否失败 |\n\n资料来源：[src/python_api.rs:106-115]()\n\n### Done 标记值\n\n`Done` 是时间线末端的标记值，用于表示计算完成状态。它是一个冻结的 Python 对象，具有特殊的比较和哈希行为：\n\n- 与 `Done` 比较时返回 `Equal`\n- 与整数比较时返回 `Greater`\n- 支持自定义哈希实现\n\n资料来源：[src/python_api.rs:176-196]()\n\n## 时间戳与前沿\n\n### Timestamp 时间戳\n\nPathway 使用 `Timestamp` 进行时间管理，实现 `OriginalOrRetraction` 和 `NextRetractionTime` trait：\n\n```mermaid\ngraph LR\n    A[原始数据] --> B[偶数时间戳]\n    A --> C[ retraction 数据]\n    C --> D[奇数时间戳]\n    D --> E[下一 retraction 时间]\n```\n\n- `is_original()`: 偶数时间戳表示原始数据\n- `next_retraction_time()`: 计算下一个 retraction 时间（当前时间 + 1）\n\n资料来源：[src/engine/timestamp.rs:45-55]()\n\n### TotalFrontier 前沿状态\n\n`TotalFrontier<T>` 是表示计算前沿的枚举类型：\n\n| 变体 | 说明 |\n|------|------|\n| `At(T)` | 在指定时间点 |\n| `Done` | 计算已完成 |\n\n该类型实现了 `FromPyObject` 和 `IntoPyObject`，支持 Python 和 Rust 间的无缝转换。\n\n资料来源：[src/python_api.rs:140-175]()\n\n## 基础使用模式\n\n### 典型项目结构\n\n```\nproject/\n├── app.py              # 主程序\n├── data/               # 数据目录\n│   ├── docs/           # 文档输入\n│   └── output/         # 结果输出\n├── .env                # 环境变量\n└── requirements.txt    # 依赖\n```\n\n资料来源：[examples/projects/question-answering-rag/README.md:1-30]()\n\n### 环境配置\n\n安装 Pathway：\n\n```bash\npip install pathway[xpack-llm]\n```\n\n设置环境变量：\n\n```bash\nexport PATHWAY_LICENSE_KEY=your_pathway_key\n```\n\n或创建 `.env` 
文件：\n\n```\nOPENAI_API_KEY=your_openai_api_key_here\n```\n\n资料来源：[examples/projects/question-answering-rag/README.md:25-30]()\n\n### 运行应用\n\n直接运行 Python 脚本：\n\n```bash\npython app.py\n```\n\n使用 Pathway CLI：\n\n```bash\npathway spawn python main.py\n```\n\n多线程运行：\n\n```bash\npathway spawn --threads 3 python main.py\n```\n\n资料来源：[README.md:120-135]()\n\n### HTTP 服务集成\n\nPathway 提供 HTTP 服务器用于接收查询和返回结果：\n\n```python\nwebserver = pw.io.http.PathwayWebserver(host=\"0.0.0.0\", port=8011)\n```\n\n提交查询：\n\n```bash\ncurl --data '{ \"messages\": \"What is the value of X?\"}' http://localhost:8011\n```\n\n资料来源：[examples/projects/question-answering-rag/README.md:15-20]()\n\n## 注册的 Python API 类\n\nPathway 通过 `add_class` 注册以下核心类到 `pathway.engine` 模块：\n\n| 类别 | 类名 |\n|------|------|\n| 连接器设置 | `AwsS3Settings`, `AzureBlobStorageSettings`, `MqttSettings`, `PsqlReplicationSettings` |\n| 解析器 | `CsvParserSettings`, `PySchemaRegistrySettings` |\n| 存储格式 | `DataStorage`, `DataFormat`, `PersistenceConfig` |\n| 索引 | `IcebergCatalogSettings`, `ElasticSearchParams` |\n| 内部类 | `PythonSubject`, `ValueField`, `PyExportedTable`, `Done`, `Trace` |\n| 配置 | `ConnectorProperties`, `ColumnProperties`, `TableProperties`, `TelemetryConfig` |\n| 快照 | `PySnapshotAccess`, `PySnapshotEvent`, `PyPersistenceMode` |\n| 向量索引 | `PyExternalIndexFactory`, `PyExternalIndexData`, `PyExternalIndexQuery` |\n\n资料来源：[src/python_api.rs:220-250]()\n\n## 文档与支持\n\n完整文档访问：[pathway.com/developers/](https://pathway.com/developers/user-guide/introduction/welcome)\n\n如有问题，可通过以下方式获取帮助：\n\n- [GitHub Issues](https://github.com/pathwaycom/pathway/issues)\n- [Discord 社区](https://discord.com/invite/pathway)\n- 邮箱：contact@pathway.com\n\n资料来源：[examples/templates/el-pipeline/README.md:45-50]()\n\n## 下一步\n\n- 学习 Pathway 的连接器配置\n- 了解实时数据处理的高级特性\n- 探索 RAG 和 LLM 集成的最佳实践\n- 查看 `examples/projects/` 目录下的完整示例项目\n\n---\n\n<a id='arch-engine-overview'></a>\n\n## 引擎架构\n\n### 
相关页面\n\n相关主题：[Python与Rust集成](#arch-python-rust-integration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/engine/dataflow.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/dataflow.rs)\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n- [external/differential-dataflow/src/operators/join.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/join.rs)\n- [external/differential-dataflow/src/operators/reduce.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/reduce.rs)\n- [external/differential-dataflow/src/trace/wrappers/freeze.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/wrappers/freeze.rs)\n- [external/differential-dataflow/src/operators/arrange/writer.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/writer.rs)\n- [external/timely-dataflow/timely/src/progress/change_batch.rs](https://github.com/pathwaycom/pathway/blob/main/external/timely-dataflow/timely/src/progress/change_batch.rs)\n</details>\n\n# 引擎架构\n\n## 概述\n\nPathway 的引擎架构是一个基于 **Rust 的高性能计算引擎**，构建于 Differential Dataflow 和 Timely Dataflow 之上。这一架构使得 Pathway 能够以 Python API 的形式提供简洁易用的接口，同时在底层实现增量计算和流式处理。引擎的核心设计理念是将 Python 层的灵活表达与 Rust 引擎的高性能执行相结合，通过 PyO3 实现 Python 与 Rust 之间的无缝桥接。\n\nPathway 引擎支持多种数据类型和处理模式，包括静态数据（batch）和动态数据（streaming），能够在同一个代码库中处理历史数据和实时数据流。引擎内置的增量计算机制确保只有发生变化的数据才会被重新处理，从而大幅提升计算效率。\n\n## 核心架构组件\n\n### 计算引擎层次结构\n\nPathway 引擎采用三层架构设计，从上到下依次为：\n\n| 层次 | 组件 | 职责 |\n|------|------|------|\n| Python API 层 | `pathway.engine` 模块 | 提供 Python 接口，暴露核心类和函数 |\n| PyO3 绑定层 | `src/python_api.rs` | 实现 Python 对象到 Rust 对象的转换 |\n| Rust 核心层 | `src/engine/` | 实现 Differential 
Dataflow 计算引擎 |\n\n资料来源：[src/python_api.rs:1-50]()\n\n### 核心类结构\n\nPathway 引擎通过 PyO3 将以下核心 Rust 结构暴露给 Python：\n\n```python\n# Python API 层暴露的核心类\nTable           # 数据表，表示一组带时间戳的记录\nColumn          # 列，表示数据表中的一列\nUniverse        # 宇宙，表示一组唯一标识符\nContext         # 上下文，管理数据流计算的执行环境\nScope           # 作用域，定义计算的数据流范围\nDataRow         # 数据行，表示表中的一行数据\nComputer        # 计算器，定义如何计算派生列\n```\n\n资料来源：[src/python_api.rs:85-110]()\n\n## 数据流计算模型\n\n### Differential Dataflow 集成\n\nPathway 引擎的核心计算模型基于 **Differential Dataflow**，这是一种用于增量数据流计算的编程模型。Differential Dataflow 能够高效处理数据的插入、更新和删除操作，通过追踪数据变化而非重新计算整个结果来实现高性能。\n\n```mermaid\ngraph TD\n    A[输入数据流] --> B[Input Session]\n    B --> C[Differential Collection]\n    C --> D[Arrangement]\n    D --> E[Operators<br/>join/reduce/count]\n    E --> F[输出 Collection]\n    F --> G[Incremental 结果]\n    \n    H[变化追踪] --> C\n```\n\n资料来源：[external/differential-dataflow/src/operators/reduce.rs:1-50]()\n\n### 时间戳与版本管理\n\nPathway 使用复合时间戳来追踪数据变化，每个数据记录都带有时间戳和差异值（multiplicity）。时间戳必须实现 `Lattice` trait，支持部分有序比较，这使得系统能够正确处理并发和乱序数据。\n\n```rust\n// 时间戳必须是 Lattice，用于支持增量计算的合并\nimpl<G, T1> JoinCore<G, T1::Key, T1::Val, T1::R> for Arranged<G, T1>\nwhere\n    G: Scope,\n    G::Timestamp: Lattice+Ord+Debug,\n    // ...\n```\n\n资料来源：[external/differential-dataflow/src/operators/join.rs:1-30]()\n\n### 数据流图结构\n\nPathway 引擎内部维护一个数据流图（Dataflow Graph），用于协调各个算子之间的数据传递：\n\n```mermaid\ngraph LR\n    subgraph DataflowGraphInner\n        A[BeforeIterate] --> B[Scope]\n        B --> C[Child Scope]\n        C --> D[Operators]\n        D --> E[AfterIterate]\n    end\n    \n    F[持久化数据] --> A\n    A --> G[表创建]\n    G --> D\n```\n\n资料来源：[src/engine/dataflow.rs:1-60]()\n\n## 核心算子实现\n\n### Join 算子\n\nJoin 是 Pathway 中最常用的算子之一，用于合并两个数据集。引擎使用 `join_core_internal_unsafe` 方法实现高效的连接操作：\n\n```rust\n// Join 操作的核心实现\nfn join_core<Tr2,I,L>(&self, stream2: &Arranged<G,Tr2>, result: L) -> Collection<G,D,ROut>\nwhere\n    Tr2: TraceReader<Key=K, Time=G::Timestamp>+Clone+'static,\n    // ... 
配置连接参数\n{\n    self.arrange_by_key().join_core_internal_unsafe(stream2, result)\n}\n```\n\n资料来源：[external/differential-dataflow/src/operators/join.rs:20-35]()\n\n### Reduce 算子\n\nReduce 算子用于对数据进行聚合操作，支持自定义归约器实现：\n\n```rust\nimpl<S: MaybeTotalScope, R: ReducerImpl> DataflowReducer<S> for R\nwhere\n    Collection<S, (Key, Option<<R as ReducerImpl>::State>)>:\n        Into<PersistableCollection<S>> + From<PersistableCollection<S>>,\n{\n    fn reduce(\n        self: Rc<Self>,\n        values: &Collection<S, (Key, Key, Vec<Value>)>,\n        error_logger: Rc<dyn LogError>,\n        _trace: Trace,\n        graph: &mut DataflowGraphInner<S>,\n    ) -> Result<Values<S>> {\n        // 实现聚合逻辑\n    }\n}\n```\n\n资料来源：[src/engine/dataflow.rs:80-110]()\n\n### Count 算子\n\nCount 是 Differential Dataflow 内置的聚合操作，用于计算每个键的出现次数：\n\n```rust\n// Count 扩展 trait 实现\npub trait Count<G: Scope, K: Data, R: Semigroup> where G::Timestamp: Lattice+Ord {\n    fn count(&self) -> Collection<G, (K, R), isize> {\n        self.count_core()\n    }\n    \n    fn count_core<R2: Abelian + From<i8>>(&self) -> Collection<G, (K, R), R2> {\n        self.reduce_abelian::<_,DefaultValTrace<_,_,_,_>>(\n            \"Count\", \n            |_k, s, t| t.push((s[0].1.clone(), R2::from(1i8)))\n        ).as_collection(|k, c| (k.clone(), c.clone()))\n    }\n}\n```\n\n资料来源：[external/differential-dataflow/src/operators/reduce.rs:30-80]()\n\n## Arrangement 数据结构\n\n### Arrangement 概念\n\nArrangement 是 Differential Dataflow 中的核心数据结构，用于组织和索引数据以支持高效的后续操作。Arrangement 本质上是一个按 key 排序的 trace 结构：\n\n```mermaid\ngraph TD\n    A[输入数据] --> B[ArrangeByKey]\n    B --> C[Trace Agent]\n    C --> D[Trace Spine]\n    D --> E[多个 Batch]\n    \n    F[查询] --> G[Cursor 遍历]\n    G --> D\n```\n\n资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:1-40]()\n\n### Trace Writer 实现\n\nTrace Writer 负责向 arrangement 中写入数据批次：\n\n```rust\npub fn new(\n    upper: Vec<Tr::Time>,\n    trace: Weak<RefCell<TraceBox<Tr>>>,\n    queues: 
Rc<RefCell<Vec<TraceAgentQueueWriter<Tr>>>>\n) -> Self {\n    let mut temp = Antichain::new();\n    temp.extend(upper.into_iter());\n    Self { upper: temp, trace, queues }\n}\n\npub fn exert(&mut self, fuel: &mut isize) {\n    if let Some(trace) = self.trace.upgrade() {\n        trace.borrow_mut().trace.exert(fuel);\n    }\n}\n```\n\n资料来源：[external/differential-dataflow/src/operators/arrange/writer.rs:1-40]()\n\n### Trace Freeze 包装器\n\nTrace Freeze 是一个特殊的 trace 包装器，用于在特定条件下冻结时间：\n\n```rust\nimpl<Tr, F> TraceReader for TraceFreeze<Tr, F>\nwhere\n    Tr: TraceReader,\n    Tr::Time: Lattice+Clone+'static,\n    F: Fn(&Tr::Time)->Option<Tr::Time>+'static,\n{\n    type Key = Tr::Key;\n    type Val = Tr::Val;\n    type Time = Tr::Time;\n    type R = Tr::R;\n    \n    type Batch = BatchFreeze<Tr::Batch, F>;\n    type Cursor = CursorFreeze<Tr::Cursor, F>;\n}\n```\n\n资料来源：[external/differential-dataflow/src/trace/wrappers/freeze.rs:1-50]()\n\n## 增量计算机制\n\n### 变化批次处理\n\nPathway 使用 `ChangeBatch` 来追踪数据的变化，每个变化包含时间戳和差异值：\n\n```rust\npub fn into_inner(mut self) -> Vec<(T, i64)> {\n    self.compact();\n    self.updates\n}\n\npub fn iter(&mut self) -> ::std::slice::Iter<(T, i64)> {\n    self.compact();\n    self.updates.iter()\n}\n```\n\n资料来源：[external/timely-dataflow/timely/src/progress/change_batch.rs:60-90]()\n\n### 持久化与状态管理\n\n引擎支持对计算状态进行持久化，通过 `PersistenceConfig` 配置持久化行为：\n\n| 配置项 | 类型 | 说明 |\n|--------|------|------|\n| `persistence_mode` | `PyPersistenceMode` | 持久化模式（禁用/异步/同步） |\n| `snapshot_access` | `PySnapshotAccess` | 快照访问策略 |\n| `snapshot_event` | `PySnapshotEvent` | 快照触发事件 |\n\n资料来源：[src/python_api.rs:1-30]()\n\n## Python 绑定实现\n\n### PyO3 类导出\n\nPathway 通过 PyO3 将 Rust 结构导出为 Python 类：\n\n```rust\n#[pymethods]\nimpl ValueField {\n    #[new]\n    #[pyo3(signature = (name, type_, source = FieldSource::Payload))]\n    pub fn new(name: String, type_: Type, source: FieldSource) -> Self {\n        Self {\n            name,\n            type_,\n            source,\n        }\n 
   }\n}\n```\n\n资料来源：[src/python_api.rs:140-160]()\n\n### PythonSubject 绑定\n\nPythonSubject 允许 Python 代码定义自定义的数据源：\n\n```rust\n#[pyclass(module = \"pathway.engine\")]\npub struct PythonSubject {\n    pub start: Py<PyAny>,\n    pub read: Py<PyAny>,\n    pub seek: Py<PyAny>,\n    pub on_persisted_run: Py<PyAny>,\n    pub end: Py<PyAny>,\n    pub is_internal: bool,\n    pub deletions_enabled: bool,\n}\n```\n\n资料来源：[src/python_api.rs:95-130]()\n\n## 数据模型\n\n### 表结构\n\n| 组件 | 说明 |\n|------|------|\n| `TableProperties` | 表级别的属性配置 |\n| `ColumnProperties` | 列级别的属性配置 |\n| `ConnectorProperties` | 连接器配置 |\n| `Trace` | 追踪配置 |\n| `Done` | 完成标记 |\n\n资料来源：[src/python_api.rs:25-40]()\n\n### 值字段定义\n\n```rust\n#[pyclass(module = \"pathway.engine\")]\n#[derive(Clone)]\npub struct ValueField {\n    #[pyo3(get)]\n    pub name: String,\n    #[pyo3(get)]\n    pub type_: Type,\n    #[pyo3(get)]\n    pub source: FieldSource,\n    #[pyo3(get)]\n    pub default: Option<Value>,\n    #[pyo3(get)]\n    pub metadata: Option<String>,\n}\n```\n\n资料来源：[src/python_api.rs:145-165]()\n\n## 关键配置选项\n\n### 连接器模式\n\n| 模式 | 说明 |\n|------|------|\n| `STATIC` | 静态数据源，读取一次 |\n| `STREAMING` | 流式数据源，持续监听 |\n| `REPLAY` | 回放模式，重演历史数据 |\n\n### 监控级别\n\n| 级别 | 说明 |\n|------|------|\n| `OFF` | 禁用监控 |\n| `BASIC` | 基础指标 |\n| `DETAILED` | 详细指标 |\n\n资料来源：[src/python_api.rs:85-100]()\n\n## 工作流程图\n\n```mermaid\ngraph TD\n    A[Python 代码定义数据流] --> B[PyO3 调用 Rust 引擎]\n    B --> C[创建 DataflowGraph]\n    C --> D[初始化 Scope 和 Universe]\n    D --> E[注册数据源和算子]\n    E --> F[启动数据流执行]\n    \n    F --> G{数据输入模式}\n    G -->|批量| H[Batch Input]\n    G -->|流式| I[Stream Input]\n    \n    H --> J[计算 operators]\n    I --> J\n    \n    J --> K[生成 Arrangement]\n    K --> L[输出结果]\n    \n    J --> M{状态变化}\n    M -->|有变化| N[触发增量计算]\n    M -->|无变化| O[跳过重算]\n    N --> K\n```\n\n## 总结\n\nPathway 的引擎架构通过深度整合 Differential Dataflow 和 Timely Dataflow，实现了一个高性能、可扩展的增量计算引擎。核心设计特点包括：\n\n1. **双层语言架构**：Python API 提供易用性，Rust 引擎保证性能\n2. 
**增量计算模型**：通过时间戳和差异值追踪变化，避免重复计算\n3. **灵活的 Arrangement**：支持高效的 join、reduce 等操作\n4. **完整的持久化支持**：内置状态管理和快照机制\n5. **丰富的连接器**：支持多种数据源和输出目标\n\n这种架构使 Pathway 能够同时满足开发和生产环境的需求，支持从本地开发到大规模分布式部署的多种场景。\n\n---\n\n<a id='arch-python-rust-integration'></a>\n\n## Python与Rust集成\n\n### 相关页面\n\n相关主题：[引擎架构](#arch-engine-overview)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- [external/differential-dataflow/src/trace/wrappers/freeze.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/wrappers/freeze.rs)\n- [external/timely-dataflow/mdbook/src/chapter_2/chapter_2_5.md](https://github.com/pathwaycom/pathway/blob/main/external/timely-dataflow/mdbook/src/chapter_2/chapter_2_5.md)\n</details>\n\n# Python与Rust集成\n\nPathway是一个基于Python的ETL框架，但其核心计算引擎由Rust实现。这种架构通过PyO3库实现Python与Rust的无缝集成，使得开发者可以使用简洁的Python API编写数据处理逻辑，同时享受Rust带来的高性能和内存安全性。\n\n## 技术架构概述\n\nPathway采用双层架构设计：上层是用户编写的Python代码，下层是Rust实现的增量计算引擎。Differential Dataflow算法驱动核心数据流处理，而Timely Dataflow提供分布式计算能力。\n\n```mermaid\ngraph TD\n    A[Python用户代码] --> B[PyO3绑定层]\n    B --> C[Rust引擎核心]\n    C --> D[Differential Dataflow]\n    C --> E[Timely Dataflow]\n    D --> F[增量计算]\n    E --> G[分布式数据流]\n    F --> H[结果返回Python]\n    G --> H\n```\n\n## PyO3绑定机制\n\nPathway使用PyO3库实现Python与Rust之间的双向通信。PyO3提供了从Rust向Python暴露类型和函数的能力，同时也支持从Python代码调用Rust函数。\n\n### 核心数据结构绑定\n\n#### PythonSubject结构体\n\n`PythonSubject`是Pathway中用于连接外部数据源的关键结构体，它将Python端的回调函数包装为Rust可调用的对象：\n\n```rust\n#[pyclass(module = \"pathway.engine\")]\n#[derive(Clone)]\npub struct PythonSubject {\n    pub start: Py<PyAny>,\n    pub read: Py<PyAny>,\n    pub seek: 
Py<PyAny>,\n    pub on_persisted_run: Py<PyAny>,\n    pub end: Py<PyAny>,\n    pub is_internal: bool,\n    pub deletions_enabled: bool,\n}\n```\n\n资料来源：[src/python_api.rs:1-15]()\n\n| 参数 | 类型 | 说明 |\n|------|------|------|\n| `start` | `Py<PyAny>` | 起始位置回调函数 |\n| `read` | `Py<PyAny>` | 读取数据回调函数 |\n| `seek` | `Py<PyAny>` | 定位回调函数 |\n| `on_persisted_run` | `Py<PyAny>` | 持久化运行回调 |\n| `end` | `Py<PyAny>` | 结束位置回调 |\n| `is_internal` | `bool` | 是否为内部数据源 |\n| `deletions_enabled` | `bool` | 是否启用删除操作 |\n\n#### ValueField结构体\n\n`ValueField`用于定义表字段的元数据，包括字段名、类型、来源和默认值：\n\n```rust\n#[pyclass(module = \"pathway.engine\", frozen)]\n#[derive(Clone)]\npub struct ValueField {\n    #[pyo3(get)]\n    pub name: String,\n    #[pyo3(get)]\n    pub type_: Type,\n    #[pyo3(get)]\n    pub source: FieldSource,\n    #[pyo3(get)]\n    pub default: Option<Value>,\n    #[pyo3(get)]\n    pub metadata: Option<String>,\n}\n```\n\n资料来源：[src/python_api.rs:36-48]()\n\n### PyO3类型转换\n\nPathway实现了Python与Rust之间的自动类型转换，通过`IntoPyObject`和`FromPyObject`trait实现。\n\n#### Timestamp类型转换\n\n```rust\nimpl<'py> IntoPyObject<'py> for Timestamp {\n    type Target = PyAny;\n    type Output = Bound<'py, Self::Target>;\n    type Error = PyErr;\n    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {\n        self.0.into_bound_py_any(py)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:40-50]()\n\n#### TotalFrontier枚举转换\n\n`TotalFrontier`表示计算的前沿边界，包含`At`和`Done`两种状态：\n\n```rust\nimpl<'py, T> FromPyObject<'py> for TotalFrontier<T>\nwhere\n    T: FromPyObject<'py>,\n{\n    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {\n        if ob.is_instance_of::<Done>() {\n            Ok(TotalFrontier::Done)\n        } else {\n            Ok(TotalFrontier::At(ob.extract()?))\n        }\n    }\n}\n```\n\n资料来源：[src/python_api.rs:180-192]()\n\n## 错误处理机制\n\nPathway通过PyO3暴露多种异常类型供Python端捕获和处理。\n\n### 异常类型体系\n\n| 异常类型 | 说明 | 使用场景 |\n|----------|------|----------|\n| `EngineError` | 基础引擎错误 | 通用错误处理 
|\n| `EngineErrorWithTrace` | 带调用栈的错误 | 调试和问题定位 |\n| `OtherWorkerError` | 其他工作进程错误 | 分布式环境错误处理 |\n| `Error` | 通用错误类型 | 自定义错误 |\n| `Pending` | 操作未完成状态 | 异步操作状态查询 |\n\n资料来源：[src/python_api.rs:200-230]()\n\n## 表导出与快照机制\n\n### PyExportedTable导出表\n\n`PyExportedTable`允许Python代码访问Rust引擎计算的表数据：\n\n```rust\n#[pyclass(module = \"pathway.engine\", frozen, name = \"ExportedTable\")]\npub struct PyExportedTable {\n    inner: Arc<dyn ExportedTable>,\n}\n\n#[pymethods]\nimpl PyExportedTable {\n    fn frontier(&self) -> TotalFrontier<Timestamp> {\n        self.inner.frontier()\n    }\n\n    fn snapshot_at(&self, frontier: TotalFrontier<Timestamp>) -> Vec<(Key, Vec<Value>)> {\n        self.inner.snapshot_at(frontier)\n    }\n\n    fn failed(&self) -> bool {\n        self.inner.failed()\n    }\n}\n```\n\n资料来源：[src/python_api.rs:115-135]()\n\n### Done标记类型\n\n`Done`是一个特殊的冻结类型，用于表示计算已完成的状态：\n\n```rust\nmod done {\n    use once_cell::sync::Lazy;\n    use pyo3::prelude::*;\n\n    #[pyclass(module = \"pathway.engine\", frozen)]\n    pub struct Done(InnerDone);\n\n    pub static DONE: Lazy<Py<Done>> = Lazy::new(|| {\n        Python::with_gil(|py| Py::new(py, Done(InnerDone)).expect(\"creating DONE should not fail\"))\n    });\n}\n```\n\n资料来源：[src/python_api.rs:165-177]()\n\n## 计算上下文\n\n### Context作用域管理\n\n`Context`结构体提供了当前计算行的上下文信息：\n\n```rust\n#[pyclass(module = \"pathway.engine\", frozen)]\npub struct Context(SendWrapper<ScopedContext>);\n\n#[pymethods]\nimpl Context {\n    #[getter]\n    fn this_row(&self) -> PyResult<Key> {\n        self.0\n            .with(|context| context.this_row())\n            .ok_or_else(|| PyValueError::new_err(\"context out of scope\"))\n    }\n}\n```\n\n资料来源：[src/python_api.rs:235-250]()\n\n### Computer计算器\n\n`Computer`用于执行用户定义的Python函数：\n\n```rust\n#[pyclass]\npub struct Computer {\n    pub fun: Py<PyAny>,\n    pub dtype: Py<PyAny>,\n    pub is_output: bool,\n    pub is_method: bool,\n    pub universe: Py<Universe>,\n    pub data: Value,\n    pub 
data_column: Option<Py<Column>>,\n}\n\n#[pymethods]\nimpl Computer {\n    #[staticmethod]\n    #[pyo3(signature = (fun, dtype, is_output, is_method, universe, data = Value::None, data_column = None))]\n    pub fn from_raising_fun(\n        py: Python,\n        fun: Py<PyAny>,\n        dtype: Py<PyAny>,\n        is_output: bool,\n        is_method: bool,\n        universe: Py<Universe>,\n        data: Value,\n        data_column: Option<Py<Column>>,\n    ) -> PyResult<Py<Self>> { ... }\n}\n```\n\n资料来源：[src/python_api.rs:260-290]()\n\n## Differential Dataflow集成\n\n### Arrangement导入机制\n\nPathway通过Differential Dataflow的arrangement机制在数据流之间共享计算结果：\n\n```rust\n/// Same as `import`, but allows to name the source.\npub fn import_named<G>(&mut self, scope: &G, name: &str) -> Arranged<G, TraceAgent<Tr>>\nwhere\n    G: Scope<Timestamp=Tr::Time>,\n    Tr::Time: Timestamp,\n{\n    // Drop ShutdownButton and return only the arrangement.\n    self.import_core(scope, name).0\n}\n```\n\n资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:35-45]()\n\n### Trace Freeze包装器\n\n`TraceFreeze`包装器允许延迟计算trace直到被使用时才执行：\n\n```rust\npub struct TraceFreeze<Tr, F>\nwhere\n    Tr: TraceReader,\n    Tr::Time: Lattice+Clone+'static,\n    F: Fn(&Tr::Time)->Option<Tr::Time>,\n{\n    trace: Tr,\n    func: Rc<F>,\n}\n```\n\n资料来源：[external/differential-dataflow/src/trace/wrappers/freeze.rs:15-21]()\n\n## 时间戳与顺序语义\n\n### Timestamp特征实现\n\nPathway实现了Differential Dataflow所需的时间戳特征：\n\n```rust\nimpl OriginalOrRetraction for Timestamp {\n    fn is_original(&self) -> bool {\n        self.0.is_multiple_of(2)\n    }\n}\n\nimpl NextRetractionTime for Timestamp {\n    fn next_retraction_time(&self) -> Self {\n        Self(self.0 + 1)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:60-70]()\n\n### 时间戳顺序关系\n\nTimestamp支持时间戳间的偏序关系计算：\n\n```rust\npub fn followed_by(&self, other: &Self) -> Option<Self> {\n    self.0.followed_by(&other.0).map(Self)\n}\n```\n\n## 
注册的Python类\n\nPathway通过`add_class`方法将以下Rust结构体暴露给Python：\n\n| 类名 | 功能 |\n|------|------|\n| `AwsS3Settings` | AWS S3连接配置 |\n| `AzureBlobStorageSettings` | Azure Blob存储配置 |\n| `ElasticSearchParams` | Elasticsearch参数 |\n| `CsvParserSettings` | CSV解析设置 |\n| `DataStorage` | 数据存储配置 |\n| `DataFormat` | 数据格式定义 |\n| `PersistenceConfig` | 持久化配置 |\n| `PythonSubject` | Python数据源主题 |\n| `MqttSettings` | MQTT连接设置 |\n| `IcebergCatalogSettings` | Iceberg目录配置 |\n| `ConnectorProperties` | 连接器属性 |\n| `ColumnProperties` | 列属性 |\n| `TableProperties` | 表属性 |\n| `PyExportedTable` | 导出表 |\n| `PyExternalIndexFactory` | 外部索引工厂 |\n| `PyUSearchMetricKind` | USearch度量类型 |\n| `PyBruteForceKnnMetricKind` | 暴力KNN度量类型 |\n\n资料来源：[src/python_api.rs:300-330]()\n\n## 数据流处理流程\n\n```mermaid\nsequenceDiagram\n    participant Python用户代码\n    participant PyO3绑定层\n    participant Rust引擎\n    participant DifferentialDataflow\n    participant TimelyDataflow\n    \n    Python用户代码->>PyO3绑定层: 调用Python API\n    PyO3绑定层->>Rust引擎: 传递计算请求\n    Rust引擎->>DifferentialDataflow: 创建数据流操作符\n    DifferentialDataflow->>TimelyDataflow: 分布式执行\n    TimelyDataflow-->>DifferentialDataflow: 返回计算结果\n    DifferentialDataflow-->>Rust引擎: 增量更新结果\n    Rust引擎-->>PyO3绑定层: 转换结果类型\n    PyO3绑定层-->>Python用户代码: 返回Python对象\n```\n\n## 多工作进程支持\n\nPathway支持通过Timely Dataflow的多工作进程实现并行计算。可以通过命令行参数指定工作进程数量：\n\n```bash\ncargo run --example wordcount -- -w2\n```\n\n这将启动两个工作进程并行处理数据，每个进程独立消费输入数据并产生部分结果。\n\n资料来源：[external/timely-dataflow/mdbook/src/chapter_2/chapter_2_5.md:1-25]()\n\n## 总结\n\nPathway的Python与Rust集成架构充分发挥了两种语言的优势：\n\n- **Python端**：提供简洁的声明式API，开发者使用Python编写数据处理逻辑\n- **Rust端**：实现高性能的增量计算引擎，基于Differential Dataflow提供高效的流式处理\n- **PyO3**：作为桥梁，实现Python与Rust之间的无缝类型转换和函数调用\n- **Timely Dataflow**：提供分布式计算能力，支持多工作进程并行处理\n\n---\n\n<a id='connectors-overview'></a>\n\n## 连接器概述\n\n### 相关页面\n\n相关主题：[数据库连接器](#connectors-database), [流数据连接器](#connectors-streaming)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- 
[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs) — Python API 绑定定义，包括连接器相关的 Rust 类型映射\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md) — 项目总体介绍\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md) — RAG 示例项目\n- [examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md) — 多智能体 RAG 示例\n- [examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md) — 流水线模板文档\n\n</details>\n\n# 连接器概述\n\n## 1. 概述\n\nPathway 是一个 Python ETL 框架，专为流处理、实时分析、LLM 管道和 RAG（检索增强生成）而设计 [资料来源：README.md](README.md)。连接器（Connector）是 Pathway 数据管道中的核心组件，负责数据的输入与输出操作。\n\nPathway 连接器系统具有以下特点：\n\n- **多源数据支持**：支持文件系统、数据库、云存储、消息队列等多种数据源\n- **实时与批处理**：同一套 API 支持静态批处理和流式实时处理\n- **Rust 引擎驱动**：底层由 Rust 编写的可扩展引擎驱动，基于 Differential Dataflow 实现增量计算\n- **Python 友好**：通过 Python API 提供简洁易用的接口\n\n## 2. 
连接器架构\n\n### 2.1 核心组件关系\n\n```mermaid\ngraph TD\n    subgraph \"Python API 层\"\n        IO[\"pw.io 模块\"]\n        DS[\"DataSource\"]\n        DK[\"DataSink\"]\n    end\n    \n    subgraph \"Rust 引擎层\"\n        CP[\"ConnectorProperties\"]\n        CM[\"ConnectorMode\"]\n        Engine[\"Pathway Engine\"]\n    end\n    \n    subgraph \"数据源类型\"\n        S3[\"AWS S3\"]\n        Azure[\"Azure Blob\"]\n        CSV[\"CSV Parser\"]\n        MQTT[\"MQTT\"]\n        Kafka[\"Kafka/Confluent\"]\n    end\n    \n    IO --> DS\n    IO --> DK\n    DS --> CP\n    DK --> CP\n    CP --> CM\n    CM --> Engine\n    S3 --> CP\n    Azure --> CP\n    CSV --> CP\n    MQTT --> CP\n```\n\n### 2.2 连接器模式\n\nPathway 支持两种连接器模式，用于区分数据处理方式 [资料来源：src/python_api.rs:320-329](src/python_api.rs)：\n\n| 模式 | 枚举值 | 说明 |\n|------|--------|------|\n| 静态模式 | `ConnectorMode.STATIC` | 批处理模式，一次性加载所有数据 |\n| 流式模式 | `ConnectorMode.STREAMING` | 实时流处理，持续监听数据变化 |\n\n```rust\n#[pymethods]\nimpl PyConnectorMode {\n    #[classattr]\n    pub const STATIC: ConnectorMode = ConnectorMode::Static;\n    #[classattr]\n    pub const STREAMING: ConnectorMode = ConnectorMode::Streaming;\n}\n```\n\n## 3. 
数据源（DataSource）\n\n数据源是连接器系统中负责从外部系统读取数据的组件。在 Pathway 中，数据源通过 Python Subject 接口与 Rust 引擎交互。\n\n### 3.1 PythonSubject 结构\n\n`PythonSubject` 是 Python 层与 Rust 引擎之间的桥梁，封装了数据读取所需的所有回调函数 [资料来源：src/python_api.rs:15-39](src/python_api.rs)：\n\n```rust\n#[pyclass(module = \"pathway.engine\")]\n#[derive(Clone)]\npub struct PythonSubject {\n    pub start: Py<PyAny>,              // 初始化回调\n    pub read: Py<PyAny>,               // 读取数据回调\n    pub seek: Py<PyAny>,               // 寻址回调（可选）\n    pub on_persisted_run: Py<PyAny>,    // 持久化运行回调\n    pub end: Py<PyAny>,                // 结束回调\n    pub is_internal: bool,             // 是否为内部数据源\n    pub deletions_enabled: bool,       // 是否启用删除操作\n}\n```\n\n### 3.2 数据读取流程\n\n```mermaid\nsequenceDiagram\n    participant App as 应用代码\n    participant DS as DataSource\n    participant Subject as PythonSubject\n    participant Engine as Pathway Engine\n    \n    App->>DS: 创建数据源实例\n    DS->>Subject: 调用 start() 初始化\n    Engine->>Subject: 调用 read() 读取数据\n    Subject-->>Engine: 返回数据批次\n    loop 流式处理\n        Engine->>Subject: 调用 read() 轮询新数据\n        Subject-->>Engine: 返回更新数据\n    end\n    Engine->>Subject: 调用 end() 清理资源\n```\n\n## 4. 数据输出（DataSink）\n\n数据 Sink 负责将处理后的数据输出到外部系统。Pathway 提供了多种输出连接器，包括文件系统、数据库和消息队列等。\n\n### 4.1 输出配置参数\n\n| 参数 | 类型 | 说明 |\n|------|------|------|\n| `host` | string | 服务器主机地址 |\n| `port` | int | 服务器端口 |\n| `format` | DataFormat | 输出数据格式 |\n| `storage` | DataStorage | 存储配置 |\n| `create_table` | bool | 是否创建输出表 |\n\n## 5. 
支持的连接器类型\n\n### 5.1 云存储连接器\n\nPathway 原生支持主流云存储服务：\n\n| 服务 | 设置类 | 说明 |\n|------|--------|------|\n| AWS S3 | `AwsS3Settings` | 亚马逊 S3 对象存储 |\n| Azure Blob | `AzureBlobStorageSettings` | 微软 Azure Blob 存储 |\n\n### 5.2 消息队列连接器\n\n| 消息系统 | 设置类 | 说明 |\n|----------|--------|------|\n| MQTT | `MqttSettings` | IoT 场景常用的轻量级消息协议 |\n| Kafka | `PySchemaRegistrySettings` | Confluent Schema Registry 集成 |\n\n### 5.3 数据库连接器\n\n| 数据库 | 设置类 | 说明 |\n|--------|--------|------|\n| PostgreSQL | `PsqlReplicationSettings` | 支持 CDC（变更数据捕获）复制 |\n\n### 5.4 其他专用连接器\n\n- **ElasticSearch**：`ElasticSearchParams` / `ElasticSearchAuth` — 全文搜索引擎集成\n- **Iceberg**：`IcebergCatalogSettings` — Apache Iceberg 表格式支持\n- **Schema Registry**：`PySchemaRegistrySettings` — Avro/Protobuf Schema 管理\n\n### 5.5 解析器设置\n\n```rust\nm.add_class::<CsvParserSettings>()?;\nm.add_class::<ValueField>()?;\nm.add_class::<DataStorage>()?;\nm.add_class::<DataFormat>()?;\n```\n\n## 6. 连接器配置属性\n\n### 6.1 ConnectorProperties\n\n连接器属性类定义了数据源和输出端的通用配置选项 [资料来源：src/python_api.rs:450-465](src/python_api.rs)：\n\n| 属性 | 说明 |\n|------|------|\n| `columns` | 列定义列表 |\n| `primary_key` | 主键列 |\n| `date_format` | 日期格式字符串 |\n| `value_fields` | 值字段列表 |\n\n### 6.2 ColumnProperties\n\n列属性用于定义单个列的元数据：\n\n```python\npw.ColumnProperties(\n    name=\"column_name\",\n    type_=pw.Type,\n    source=pw.FieldSource.Payload,\n    default=None,\n    metadata=None\n)\n```\n\n## 7. 
使用示例\n\n### 7.1 RAG 应用中的连接器使用\n\n在问答型 RAG 应用中，Pathway 连接器用于实时索引文档并提供检索服务 [资料来源：examples/projects/question-answering-rag/README.md](examples/projects/question-answering-rag/README.md)：\n\n```python\nimport pathway as pw\n\n# 创建 HTTP 服务器用于接收查询\nwebserver = pw.io.http.PathwayWebserver(host=\"0.0.0.0\", port=8011)\n\n# 数据源：文件系统连接器监听 ./data/ 目录\n# 文档被分块、向量化后存储\n# 输出：通过 REST API 提供 /v1/retrieve 检索接口\n```\n\n### 7.2 多智能体 RAG 中的连接器\n\n在多智能体 RAG 架构中，连接器同时承担文档索引和智能体间通信的任务 [资料来源：examples/projects/ag2-multiagent-rag/README.md](examples/projects/ag2-multiagent-rag/README.md)：\n\n```mermaid\ngraph LR\n    A[\"Documents (./data/)\"] -->|实时索引| B[\"Pathway VectorStoreServer\"]\n    B -->|HTTP /v1/retrieve| C[\"AG2 Researcher Agent\"]\n    C -->|search_documents| B\n    B -->|检索结果| D[\"AG2 Analyst Agent\"]\n    D -->|综合回答| E[\"用户\"]\n```\n\n## 8. 连接器的高级特性\n\n### 8.1 持久化配置\n\nPathway 支持连接器级别的持久化配置，用于故障恢复和状态管理：\n\n```rust\nm.add_class::<PersistenceConfig>()?;\nm.add_class::<PyPersistenceMode>()?;\nm.add_class::<PySnapshotAccess>()?;\nm.add_class::<PySnapshotEvent>()?;\n```\n\n### 8.2 会话类型\n\nPathway 引擎支持不同类型的会话处理 [资料来源：src/python_api.rs:334-345](src/python_api.rs)：\n\n| 会话类型 | 枚举值 | 使用场景 |\n|----------|--------|----------|\n| 原生会话 | `SessionType.NATIVE` | 标准批处理和流处理 |\n| UPSERT 会话 | `SessionType.UPSERT` | 增量更新场景 |\n\n### 8.3 监控与追踪\n\nPathway 提供了完善的监控机制，包括：\n\n- **TelemetryConfig**：遥测数据收集配置\n- **BackfillingThreshold**：回填数据阈值控制\n- **EngineTrace**：引擎执行追踪\n\n## 9. 最佳实践\n\n1. **选择合适的连接器模式**：静态数据使用 `STATIC` 模式，实时数据使用 `STREAMING` 模式\n2. **配置主键**：为数据源设置主键以支持增量更新和去重\n3. **设置超时和重试**：生产环境应配置适当的重试策略\n4. **监控连接器状态**：使用 Pathway Dashboard 监控消息数量和延迟\n5. **处理删除操作**：根据业务需求通过 `deletions_enabled` 参数控制是否处理数据删除\n\n## 10. 
相关文档\n\n- [入门指南](https://pathway.com/developers/user-guide/introduction/welcome)\n- [API 文档](https://pathway.com/developers/api-docs/pathway)\n- [模板示例](https://pathway.com/developers/templates?tab=ai-pipelines)\n- [部署指南](#deployment)\n\n---\n\n<a id='connectors-database'></a>\n\n## 数据库连接器\n\n### 相关页面\n\n相关主题：[连接器概述](#connectors-overview), [流数据连接器](#connectors-streaming)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n</details>\n\n# 数据库连接器\n\n## 概述\n\nPathway 的数据库连接器模块提供了与各类数据库系统交互的能力，支持从外部数据源读取数据并将处理结果写入目标数据库。作为 Pathway ETL 框架的核心组件之一，数据库连接器实现了实时的增量数据同步机制，能够在流处理和批处理场景下保持数据的一致性和时效性。\n\nPathway 的数据库连接器基于 Rust 引擎构建，结合 Differential Dataflow 计算模型，实现了高效的增量计算能力。这意味着当源数据库中的数据发生变化时，连接器能够自动检测变更并仅处理发生变化的记录，而不是重新处理整个数据集。\n\n## 架构设计\n\n### 连接器类型体系\n\nPathway 在 `src/python_api.rs` 中定义了一套统一的连接器配置接口，通过 PyO3 绑定暴露给 Python 用户。连接器配置通过 `ConnectorProperties`、`ColumnProperties` 和 `TableProperties` 等类进行封装，支持对数据源的精细化配置。\n\n```rust\nm.add_class::<ConnectorProperties>()?;\nm.add_class::<ColumnProperties>()?;\nm.add_class::<TableProperties>()?;\n```\n\n资料来源：[src/python_api.rs:行](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n### 连接器模式\n\nPathway 支持两种连接器运行模式，通过 `ConnectorMode` 枚举定义：\n\n| 模式 | 说明 |\n|------|------|\n| `STATIC` | 静态模式，适用于批处理场景，数据源仅读取一次 |\n| `STREAMING` | 流式模式，适用于实时数据处理，支持持续监听数据变更 |\n\n```rust\n#[pymethods]\nimpl PyConnectorMode {\n    #[classattr]\n    pub const STATIC: ConnectorMode = ConnectorMode::Static;\n    #[classattr]\n    pub const STREAMING: ConnectorMode = ConnectorMode::Streaming;\n}\n```\n\n资料来源：[src/python_api.rs:行](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## 支持的数据库类型\n\n### 关系型数据库\n\nPathway 原生支持多种主流关系型数据库的连接：\n\n- **PostgreSQL**：通过 `PsqlReplicationSettings` 提供复制槽支持，用于捕获数据库变更\n- **MySQL**：支持标准的读取连接配置\n- **Microsoft SQL Server**：通过 MSSQL 
连接器支持企业级应用\n\n### NoSQL 数据库\n\n- **MongoDB**：支持文档型数据库的实时读取和写入\n- **Elasticsearch**：提供全文检索场景下的数据索引能力\n\n### 云存储与服务\n\nPathway 还集成了多种云服务的连接配置：\n\n- **AWS S3**：`AwsS3Settings` 支持从 S3 存储桶读取数据文件\n- **Azure Blob Storage**：`AzureBlobStorageSettings` 支持 Azure 云存储\n- **Iceberg Catalog**：`IcebergCatalogSettings` 支持数据湖架构\n\n```rust\nm.add_class::<AwsS3Settings>()?;\nm.add_class::<AzureBlobStorageSettings>()?;\nm.add_class::<ElasticSearchParams>()?;\nm.add_class::<IcebergCatalogSettings>()?;\nm.add_class::<PsqlReplicationSettings>()?;\n```\n\n资料来源：[src/python_api.rs:行](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## 数据处理流程\n\n```mermaid\ngraph TD\n    A[外部数据库] --> B[连接器读取层]\n    B --> C[变更捕获]\n    C --> D[Rust 引擎处理]\n    D --> E[增量计算]\n    E --> F[下游输出]\n    \n    G[Schema 定义] --> D\n    H[连接配置] --> B\n```\n\n### 读取流程\n\n1. **连接初始化**：连接器根据配置参数建立与目标数据库的连接\n2. **Schema 映射**：通过 `ValueField` 定义输入数据的字段映射和类型转换\n3. **变更捕获**：在流式模式下持续监听数据变更事件\n4. **数据传输**：变更数据通过 Pathway 引擎进行增量处理\n\n### 写入流程\n\n1. **输出表定义**：通过 `is_output=True` 标记输出计算节点\n2. **目标配置**：指定目标数据库的连接信息和写入策略\n3. 
**批量写入**：引擎自动将处理结果批量写入目标系统\n\n## Schema 定义\n\nPathway 使用 `ValueField` 结构体定义数据的 Schema 映射：\n\n```rust\n#[pyclass(module = \"pathway.engine\")]\n#[derive(Clone)]\npub struct ValueField {\n    #[pyo3(get)]\n    pub name: String,\n    #[pyo3(get)]\n    pub type_: Type,\n    #[pyo3(get)]\n    pub source: FieldSource,\n    #[pyo3(get)]\n    pub default: Option<Value>,\n    #[pyo3(get)]\n    pub metadata: Option<String>,\n}\n```\n\n资料来源：[src/python_api.rs:行](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n字段属性说明：\n\n| 属性 | 类型 | 说明 |\n|------|------|------|\n| name | String | 字段名称 |\n| type_ | Type | 数据类型 |\n| source | FieldSource | 字段来源（Payload/Materialized） |\n| default | Option<Value> | 默认值 |\n| metadata | Option<String> | 元数据信息 |\n\n## 时间戳与前沿管理\n\nPathway 通过 `TotalFrontier<T>` 管理数据处理的时间戳前沿，实现精确的增量计算：\n\n```rust\n#[pymethods]\nimpl PyExportedTable {\n    fn frontier(&self) -> TotalFrontier<Timestamp> {\n        self.inner.frontier()\n    }\n\n    fn snapshot_at(&self, frontier: TotalFrontier<Timestamp>) -> Vec<(Key, Vec<Value>)> {\n        self.inner.snapshot_at(frontier)\n    }\n}\n```\n\n资料来源：[src/python_api.rs:行](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n`TotalFrontier` 有两种状态：\n\n- **`At(t)`**：表示处理到达时间戳 `t`，继续处理后续数据\n- **`Done`**：表示数据处理完成，不再有新数据到达\n\n```rust\nimpl<'py, T> FromPyObject<'py> for TotalFrontier<T> {\n    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {\n        if ob.is_instance_of::<Done>() {\n            Ok(TotalFrontier::Done)\n        } else {\n            Ok(TotalFrontier::At(ob.extract()?))\n        }\n    }\n}\n```\n\n资料来源：[src/python_api.rs:行](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## 使用示例\n\n### 基本配置流程\n\n```python\nimport pathway as pw\n\n# 定义输入连接器\nclass InputSchema(pw.Schema):\n    id: int\n    value: str\n\n# 配置 PostgreSQL 输入\ninput_connector = pw.io.postgres.read(\n    host=\"localhost\",\n    port=5432,\n    database=\"mydb\",\n    user=\"user\",\n 
   password=\"password\",\n    table=\"source_table\",\n    schema=InputSchema,\n    mode=pw.ConnectorMode.STREAMING\n)\n\n# 定义处理逻辑\nresult = input_connector.groupby(pw.this.id).reduce(\n    id=pw.this.id,\n    count=pw.reducers.count()\n)\n\n# 配置输出连接器\npw.io.postgres.write(\n    result,\n    host=\"localhost\",\n    port=5432,\n    database=\"mydb\",\n    user=\"user\",\n    password=\"password\",\n    table=\"output_table\"\n)\n\n# 运行管道\npw.run()\n```\n\n### 与向量存储集成\n\nPathway 的数据库连接器可以与向量索引功能结合使用，实现 RAG（检索增强生成）场景：\n\n```mermaid\ngraph LR\n    A[PostgreSQL/MySQL] --> B[Pathway 引擎]\n    B --> C[向量嵌入]\n    C --> D[向量索引]\n    D --> E[LLM 查询]\n```\n\n## 监控与故障处理\n\nPathway 的 `PyExportedTable` 提供了内置的监控接口：\n\n```rust\n#[pymethods]\nimpl PyExportedTable {\n    fn failed(&self) -> bool {\n        self.inner.failed()\n    }\n}\n```\n\n资料来源：[src/python_api.rs:行](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n通过检查 `failed()` 状态，用户可以及时发现数据处理过程中的异常情况，并采取相应的恢复措施。\n\n## 性能优化建议\n\n1. **批量处理**：利用 Pathway 的增量计算特性，避免全量数据重处理\n2. **索引优化**：确保源数据库的相关字段已建立索引\n3. **连接池**：合理配置连接池大小以平衡资源使用和吞吐量\n4. 
**Schema 精简**：仅提取必要的字段，减少数据传输量\n\n---\n\n<a id='connectors-streaming'></a>\n\n## 流数据连接器\n\n### 相关页面\n\n相关主题：[连接器概述](#connectors-overview)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs) - Python绑定和Rust连接器接口定义\n- [examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md) - RAG项目中的连接器使用示例\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md) - 问答RAG系统架构\n- [examples/projects/web-scraping/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/web-scraping/README.md) - Web数据抓取连接器实现\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md) - Pathway框架总体架构\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs) - 时间戳和增量计算引擎\n</details>\n\n# 流数据连接器\n\n## 概述\n\nPathway的流数据连接器（Stream Connectors）是Framework与外部数据源之间进行实时数据交换的核心组件。这些连接器使得Pathway能够从Kafka、NATS、MQTT、RabbitMQ、Redpanda等消息队列系统实时读取数据，并将处理结果写入各种数据sink。\n\nPathway的连接器系统构建在Differential Dataflow增量计算引擎之上，支持两种主要的运行模式：静态模式（Static）和流式模式（Streaming）。静态模式适用于批处理场景，而流式模式则专为实时数据处理设计，能够处理持续到来的数据更新。连接器系统还支持增量计算的语义，允许系统只处理自上次处理以来的变更，而不是重新处理整个数据集。\n\n## 核心架构\n\n### 连接器模式体系\n\nPathway定义了一套完整的连接器模式枚举，用于区分不同的数据处理语义：\n\n```python\nclass ConnectorMode(Enum):\n    STATIC = auto()    # 静态批处理模式\n    STREAMING = auto() # 实时流处理模式\n```\n\n每种模式对应不同的数据处理策略。静态模式下，连接器假设数据源是有限的，处理完成后系统进入完成状态。流式模式下，连接器持续监听数据源，系统永远不会自然结束，除非外部终止。\n\n### PythonSubject核心接口\n\n所有自定义数据源连接器都继承自`PythonSubject`基类，该类定义了数据源的核心接口：\n\n```python\nclass PythonSubject:\n    def __init__(\n        self,\n        start: Callable[[], None],           # 启动数据源\n        read: Callable[[], Iterator[Row]],   # 读取数据行\n        seek: Callable[[Any], None],         # 定位到指定位置\n        on_persisted_run: Callable[[], None], # 
持久化运行回调\n        end: Callable[[], bool],             # 检查是否结束\n        is_internal: bool,                   # 是否内部数据源\n        deletions_enabled: bool              # 是否启用删除\n    )\n```\n\n资料来源：[src/python_api.rs:21-31]()\n\n`PythonSubject`的设计体现了Pathway连接器的核心哲学：数据源被抽象为一个可以随时启动、读取、定位和检查完成状态的可迭代对象。这种设计使得连接器能够与Differential Dataflow的计算模型无缝集成。\n\n### 字段定义与数据类型\n\n`ValueField`类定义了表中每个字段的元信息，包括名称、类型、来源和默认值：\n\n```python\nclass ValueField:\n    name: str           # 字段名称\n    type_: Type         # 数据类型\n    source: FieldSource # 字段来源 (默认: Payload)\n    default: Optional[Value]  # 默认值\n    metadata: Optional[str]   # 元数据\n```\n\n`FieldSource`枚举标识了数据的来源类型，最常用的是`FieldSource.Payload`，表示数据来自消息的有效载荷。在复杂的流处理场景中，字段来源还可以标识时间戳、主键等特殊字段。\n\n## 工作原理\n\n### 数据流处理管道\n\nPathway的流数据连接器遵循一个清晰的数据处理管道：\n\n```mermaid\ngraph TD\n    A[外部数据源] --> B[Connector启动]\n    B --> C[数据读取循环]\n    C --> D{数据有效?}\n    D -->|是| E[解析消息]\n    D -->|否| F[跳过/日志]\n    E --> G[转换为Pathway Row]\n    G --> H[输入到Differential Dataflow]\n    H --> I[增量计算]\n    I --> J[结果输出到Sink]\n    J --> C\n    F --> C\n```\n\n### 增量计算与时间戳\n\nPathway的连接器系统深度集成了Differential Dataflow的增量计算能力。每个数据更新都带有时间戳，系统只处理自上次检查点以来的变更。`Timestamp`类型实现了`OriginalOrRetraction`特性，用于区分原始数据和撤回数据：\n\n```python\nclass Timestamp:\n    def is_original(self) -> bool:\n        # 偶数时间戳表示原始数据\n        return self.0.is_multiple_of(2)\n    \n    def next_retraction_time(self) -> Self:\n        # 奇数时间戳表示撤回\n        return Self(self.0 + 1)\n```\n\n资料来源：[src/engine/timestamp.rs:45-53]()\n\n这种设计允许连接器高效处理数据的插入、更新和删除操作，而无需重新处理整个数据集。\n\n### 持久化与状态恢复\n\n连接器支持持久化运行模式，通过`on_persisted_run`回调函数实现故障恢复：\n\n1. **检查点保存**：系统在适当的时间点保存当前的处理状态\n2. **故障恢复**：重新启动时，从最后一个检查点恢复\n3. 
**数据重放**：根据保存的seek位置，重新从数据源读取未处理的数据\n\n这种机制确保了在分布式环境中，即使发生节点故障，数据也不会丢失，处理可以透明地继续。\n\n## 预置连接器\n\nPathway提供了多个开箱即用的流数据连接器，覆盖了主流的消息队列系统：\n\n| 连接器 | 协议 | 典型用途 | 文档链接 |\n|--------|------|----------|----------|\n| Kafka | TCP二进制协议 | 大规模流处理 | [kafka/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/kafka/__init__.py) |\n| Redpanda | Kafka兼容API | 高性能流处理 | [redpanda/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/redpanda/__init__.py) |\n| NATS | NATS协议 | 轻量级消息传递 | [nats/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/nats/__init__.py) |\n| MQTT | MQTT协议 | IoT设备数据采集 | [mqtt/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/mqtt/__init__.py) |\n| RabbitMQ | AMQP协议 | 企业消息队列 | [rabbitmq/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/rabbitmq/__init__.py) |\n\n### 连接器配置参数\n\n每种连接器都支持一组通用的配置参数：\n\n| 参数 | 类型 | 说明 | 默认值 |\n|------|------|------|--------|\n| `mode` | ConnectorMode | 运行模式 | STREAMING |\n| `deletions_enabled` | bool | 是否处理删除事件 | false |\n| `autocommit_duration` | timedelta | 自动提交间隔 | None |\n| `timeout` | timedelta | 连接超时时间 | 30秒 |\n| `schema` | ValueField[] | 输出表模式 | 自动推断 |\n\n## 使用场景\n\n### 实时文档索引\n\nPathway的连接器常用于构建实时RAG（检索增强生成）系统。以下架构展示了典型的用法：\n\n```mermaid\ngraph LR\n    A[文档数据源] --> B[Pathway连接器]\n    B --> C[数据预处理]\n    C --> D[文本分块]\n    D --> E[嵌入模型]\n    E --> F[向量索引]\n    F --> G[HTTP API]\n    G --> H[LLM查询]\n```\n\n资料来源：[examples/projects/ag2-multiagent-rag/README.md:12-20]()\n\n在ag2-multiagent-rag项目中，Pathway连接器负责监听文件系统中的文档变化，实时索引新文档并更新向量存储。外部AI代理通过REST API查询相关文档，系统保证始终返回最新的索引结果。\n\n### Web数据抓取管道\n\nPathway连接器也广泛用于构建数据采集管道：\n\n```mermaid\ngraph TD\n    A[新闻网站] --> B[Web爬虫连接器]\n    B --> C[内容解析]\n    C --> D[元数据提取]\n    D --> E[数据清洗]\n    E --> F[JSON Lines输出]\n    F --> 
G[下游处理]\n```\n\n资料来源：[examples/projects/web-scraping/README.md:1-50]()\n\n自定义的`NewsScraperSubject`继承自Pathway的`ConnectorSubject`，实现了文章URL作为主键的数据表结构，支持增量更新和去重。\n\n### 企业数据集成\n\n对于企业级应用，Pathway的RabbitMQ和Kafka连接器能够与现有的消息基础设施无缝集成：\n\n```python\nimport pathway as pw\n\nclass MyConnectorSubject(pw.io.ConnectorSubject):\n    def __init__(self):\n        super().__init__()\n        self._client = create_rabbitmq_client()\n    \n    def start(self):\n        self._client.connect()\n    \n    def read(self):\n        while message := self._client.consume():\n            yield self._parse_message(message)\n    \n    def end(self):\n        return self._client.is_idle_timeout()\n```\n\n## 与Differential Dataflow的集成\n\nPathway的连接器系统是Differential Dataflow在Python生态系统中的桥接层。Rust核心负责：\n\n1. **数据流的组织**：使用Trace数据结构维护历史状态\n2. **增量计算**：只处理实际发生的变更，而非全量数据\n3. **多 worker 支持**：通过`pathway spawn --threads N`启用多线程处理\n\nPython API提供了高级抽象，降低了使用门槛，同时保持了Rust引擎的高性能特性。这种设计使得开发者能够用Python编写流处理逻辑，而底层执行效率与原生Rust程序相当。\n\n## 最佳实践\n\n### 模式定义\n\n在创建连接器时，应显式定义输出表的schema，包括所有字段的名称、类型和默认值：\n\n```python\nschema = pw.Schema(\n    fields={\n        \"id\": pw.FieldType.STRING,\n        \"content\": pw.FieldType.STRING,\n        \"timestamp\": pw.FieldType.DATETIME,\n    },\n    primary_key=[\"id\"]\n)\n```\n\n### 错误处理\n\n连接器应实现健壮的错误处理机制：\n\n```python\ndef read(self):\n    try:\n        yield from self._fetch_data()\n    except TransientError:\n        # 临时错误，等待后重试\n        time.sleep(1)\n        yield from self.read()\n    except PermanentError:\n        # 永久错误，标记连接器失败\n        self._mark_failed()\n```\n\n### 性能优化\n\n1. **批量处理**：积累多条消息后再提交，减少I/O次数\n2. **背压控制**：当处理速度跟不上数据到达速度时，使用`BackfillingThreshold`配置\n3. 
**连接池复用**：在多worker场景下复用连接，避免资源耗尽\n\n## 总结\n\nPathway的流数据连接器系统提供了一套完整的数据集成解决方案。通过统一的抽象接口，开发者可以轻松地将各种外部数据源接入Pathway的处理管道。连接器与Differential Dataflow的深度集成确保了即使在处理大规模实时数据时，也能保持高效的增量计算能力。\n\n预置的Kafka、Redpanda、NATS、MQTT和RabbitMQ连接器覆盖了大多数流数据场景，而PythonSubject基类则允许开发者实现完全自定义的数据源连接器，满足各种特殊的集成需求。\n\n---\n\n<a id='trans-table-operations'></a>\n\n## 表操作\n\n### 相关页面\n\n相关主题：[连接与聚合](#trans-joins-aggregations), [基础示例](#quickstart-basic-example)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/internals/table.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/table.py)\n- [python/pathway/internals/column.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/column.py)\n- [python/pathway/internals/expression.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/expression.py)\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n</details>\n\n# 表操作\n\n## 概述\n\nPathway 是一个基于 Python 的 ETL 框架，其核心数据模型围绕**表（Table）**展开。表操作是 Pathway 数据处理流水线的基本构建单元，提供了对结构化数据进行转换、过滤、聚合和连接的能力。\n\nPathway 的表操作具有以下特性：\n\n- **统一批流处理**：同一套 API 可处理静态数据集和实时流数据\n- **增量计算**：基于 Rust 的 Differential Dataflow 引擎实现高效增量更新\n- **声明式语法**：通过 Python 表达式定义数据转换逻辑\n\n资料来源：[README.md:1-20]()\n\n## 核心数据结构\n\n### Table 类\n\n`Table` 是 Pathway 中的核心数据结构，代表一个具有特定模式的分布式表格。\n\n```mermaid\nclassDiagram\n    class Table {\n        +scope: Py[Scope]\n        +handle: TableHandle\n        +_is_deleted: bool\n        +_app_id: str\n        +__getitem__(key) TableSlice\n        +select(*args) Table\n        +update_rows(*args) Table\n        +update_cells(*args) Table\n        +filter(predicate) Table\n        +groupby(*args) Table\n        +join(*args) Table\n        +join_inner(*args) Table\n        +concat(*tables) Table\n    }\n    
\n    class LegacyTable {\n        +universe: Universe\n        +columns: Vec~Column~\n        +to_engine() (UniverseHandle, Vec~ColumnHandle~)\n        +from_handles(scope, handles) LegacyTable\n    }\n    \n    Table --> Scope : 关联\n    LegacyTable --> Universe : 关联\n    LegacyTable --> Column : 包含\n```\n\n### 列（Column）与字段（ValueField）\n\n每张表由多个列组成，每个列具有名称、类型和来源信息。\n\n```python\n# ValueField 定义 - 资料来源：src/python_api.rs:40-55\n@dataclass\nclass ValueField:\n    name: str                    # 列名\n    type_: Type                  # 数据类型\n    source: FieldSource          # 字段来源（Payload/Materialized）\n    default: Optional[Value]     # 默认值\n    metadata: Optional[str]      # 元数据\n```\n\n### 类型系统\n\n| 类型 | 说明 | 对应 Rust 类型 |\n|------|------|---------------|\n| `INT` | 64位整数 | `i64` |\n| `FLOAT` | 双精度浮点 | `f64` |\n| `STR` | 字符串 | `String` |\n| `BOOL` | 布尔值 | `bool` |\n| `ANY` | 任意类型 | - |\n\n资料来源：[src/python_api.rs:40-55]()\n\n## 表的创建与初始化\n\n### 从输入连接器创建\n\nPathway 提供多种连接器用于从不同数据源创建表：\n\n```python\n# 从文件系统创建表\ntable = pw.io.fs.read(\n    \"./data\",\n    schema=MySchema,\n    format=\"csv\"\n)\n\n# 从 HTTP 流读取\ntable = pw.io.http.read(\n    \"http://api.example.com/stream\",\n    schema=MySchema,\n    json_field_paths={\"data\": \"$.records[*]\"}\n)\n```\n\n### 表的内部状态\n\n```mermaid\ngraph TD\n    A[输入数据] --> B[Python Subject]\n    B --> C[Raw Updates]\n    C --> D[Rust Engine]\n    D --> E[Universe Handle]\n    D --> F[Column Handles]\n    E --> G[Table]\n    F --> G\n    G --> H[快照 Snapshot]\n```\n\n表的内部状态由 `Universe` 和 `Column` 集合组成：\n\n```python\n# LegacyTable 内部结构 - 资料来源：src/python_api.rs:180-210\nclass LegacyTable:\n    def __init__(self, universe, columns):\n        self.universe = universe    # 行的唯一标识集合\n        self.columns = columns      # 列的列表\n    \n    def to_engine(self):\n        \"\"\"转换为引擎句柄\"\"\"\n        universe = self.universe.get()\n        column_handles = [c.get().handle for c in self.columns]\n        return (universe.handle, 
column_handles)\n```\n\n## 基础表操作\n\n### 选择操作（select）\n\n`select` 用于从表中提取指定列，可进行计算和重命名：\n\n```python\n# 基本选择\nresult = table.select(\n    id=table.user_id,\n    name=table.name,\n    full_name=table.first_name + \" \" + table.last_name  # 表达式计算\n)\n```\n\n### 过滤操作（filter）\n\n`filter` 根据条件筛选行：\n\n```python\n# 过滤条件\nactive_users = users.filter(users.status == \"active\")\npremium_orders = orders.filter(orders.amount > 100)\n```\n\n### 行更新（update_rows）\n\n向表中添加或更新行：\n\n```python\n# 添加新行\nupdated_table = table.update_rows(\n    pw.table.builder.make_table(\n        id=[1, 2],\n        name=[\"Alice\", \"Bob\"],\n        value=[10, 20]\n    )\n)\n```\n\n### 单元格更新（update_cells）\n\n更新特定列的值：\n\n```python\n# 更新特定列\nresult = table.update_cells(\n    status=pw.this.status.upper(),  # 将 status 列转为大写\n    _id=table.id                     # 保留原 id 列\n)\n```\n\n资料来源：[python/pathway/internals/table.py]()\n\n## 聚合与分组\n\n### 分组聚合（groupby）\n\n`groupby` 按照指定键分组并执行聚合操作：\n\n```mermaid\ngraph LR\n    A[输入表] --> B[GroupBy Key]\n    B --> C[每组聚合]\n    C --> D[聚合函数]\n    D --> E[输出表]\n    \n    subgraph 聚合函数\n        F[count]\n        G[sum]\n        H[min/max]\n        I[collect]\n    end\n    D --> F\n    D --> G\n    D --> H\n    D --> I\n```\n\n```python\n# 按类别聚合\ncategory_stats = orders.groupby(orders.category).reduce(\n    orders.category,\n    total_amount=pw.reducers.sum(orders.amount),\n    order_count=pw.reducers.count(),\n    avg_price=pw.reducers.mean(orders.price)\n)\n```\n\n### 可用聚合函数\n\n| 函数 | 说明 | 示例 |\n|------|------|------|\n| `count()` | 计数 | `count()` |\n| `sum(expr)` | 求和 | `sum(table.amount)` |\n| `mean(expr)` | 平均值 | `mean(table.score)` |\n| `min(expr)` | 最小值 | `min(table.latency)` |\n| `max(expr)` | 最大值 | `max(table.price)` |\n| `collect(expr)` | 收集为列表 | `collect(table.items)` |\n| `sorted_tuple(expr)` | 排序元组 | `sorted_tuple(table.events)` |\n\n## 表连接操作\n\n### 连接的种类\n\n| 连接类型 | 说明 | 空值处理 |\n|----------|------|----------|\n| `join` / `join_inner` | 内连接 | 丢弃无匹配的行 |\n| 
`join_left` | 左外连接 | 保留左表所有行 |\n| `join_right` | 右外连接 | 保留右表所有行 |\n| `join_outer` | 全外连接 | 保留所有行 |\n\n### 基本连接语法\n\n```python\n# 内连接\nresult = table1.join(\n    table2,\n    table1.key == table2.key\n)\n\n# 指定输出列\nresult = table1.join(\n    table2,\n    table1.id == table2.user_id,\n    left=pw.left.id,\n    right=pw.right.user_id,\n    prefix=\"left_\"\n)\n```\n\n### 复杂连接场景\n\n```python\n# 多条件连接\norders_enriched = orders.join(\n    customers,\n    (orders.customer_id == customers.id) & \n    (orders.region == customers.region)\n)\n```\n\n## 时间与版本控制\n\n### 时间戳机制\n\nPathway 使用特殊的时间戳系统来追踪数据的版本和变更：\n\n```python\n# Timestamp 实现 - 资料来源：src/engine/timestamp.rs:1-30\nimpl Timestamp {\n    fn followed_by(&self, other: &Self) -> Option<Self> {\n        self.0.followed_by(&other.0).map(Self)\n    }\n}\n```\n\n### 数据变更追踪\n\n```\n时间线:  ─────────────────────────────────────────────►\n         t0      t1      t2      t3      t4\n         │       │       │       │       │\n         ▼       ▼       ▼       ▼       ▼\n数据:   [v0]   [v1]   [v2]   [撤回]   [v4]\n              (修改)  (修改)  (删除)\n```\n\nPathway 使用 **Differential Dataflow** 实现增量更新，仅处理发生变化的数据。\n\n## 表的快照与导出\n\n### 快照操作（snapshot）\n\n获取表在特定时间点的状态：\n\n```python\n# 获取当前快照\nsnapshot_data = table.snapshot()\n\n# 在特定前沿时间获取快照 - 资料来源：src/python_api.rs:220-240\nfrontier = TotalFrontier::At(timestamp)\nsnapshot = exported_table.snapshot_at(frontier)\n```\n\n### 外部索引集成\n\n```python\n# 创建向量索引用于 RAG\ntable = pw.Table.from_columns(\n    id=range(1000),\n    content=texts,\n    vector=embeddings\n)\n\n# 构建外部索引\ntable = table.groupby(table.id).reduce(\n    table.id,\n    content=pw.reducers.torch(\n        \"embedding\",\n        vector=table.vector,\n        mode=\"mean\"\n    )\n)\n```\n\n## 最佳实践\n\n### 性能优化\n\n1. **合理设计模式**：避免过度规范化\n2. **使用适当的数据类型**：优先使用原生类型而非字符串\n3. **批量操作**：优先使用 `update_rows` 而非逐行更新\n4. 
**索引列**：在频繁连接的列上建立索引\n\n### 常见模式\n\n```python\n# 模式1：流式处理 + 批处理混合\nstream_table = pw.io.kafka.read(...)\nbatch_table = pw.io.fs.read(\"./backup\", ...)\n\ncombined = stream_table.concat(batch_table)\n```\n\n```python\n# 模式2：滑动窗口聚合\nwindowed = events.groupby(\n    events.session_id\n).reduce(\n    events.session_id,\n    time_range=pw.reducers.count(),\n    events_list=pw.reducers.collect(\n        pw.this.event_data\n    )[-100:]  # 最近100个事件\n)\n```\n\n## 与其他模块的关系\n\n```mermaid\ngraph TD\n    Table[表操作] --> Column[列操作]\n    Table --> Expression[表达式]\n    Table --> Connector[连接器]\n    Column --> Expression\n    Expression --> Compute[计算引擎]\n    Connector --> Input[输入数据]\n    Compute --> Output[输出数据]\n    Compute --> Persistence[持久化]\n```\n\n- **列操作**（`column.py`）：定义列级别的表达式和转换\n- **表达式**（`expression.py`）：处理数据计算和条件逻辑\n- **连接器**（`pw.io.*`）：负责数据的输入输出\n\n## 错误处理\n\n### 失败状态检测\n\n```python\n# 检查表是否处于失败状态\nif table.failed():\n    logger.error(\"数据处理失败，请检查输入数据\")\n```\n\n### 追踪与调试\n\n```python\n# 获取错误追踪信息\nclass Trace:\n    file_name: str\n    line_number: int\n    line: str\n    function: str\n```\n\n## 小结\n\nPathway 的表操作模块构成了整个框架的核心，提供了：\n\n- **统一的数据模型**：通过 `Table` 类实现批流统一的表格操作\n- **声明式的转换语法**：使用 Python 表达式定义复杂的数据处理逻辑\n- **高效的增量计算**：基于 Differential Dataflow 的增量更新机制\n- **丰富的连接和聚合能力**：支持多种连接类型和聚合函数\n\n熟练掌握表操作是构建高效 Pathway 数据处理流水线的基础。\n\n---\n\n<a id='trans-joins-aggregations'></a>\n\n## 连接与聚合\n\n### 相关页面\n\n相关主题：[表操作](#trans-table-operations)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/internals/joins.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/joins.py)\n- [python/pathway/internals/groupbys.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/groupbys.py)\n- [python/pathway/internals/reducers.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/reducers.py)\n- 
[python/pathway/internals/custom_reducers.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/custom_reducers.py)\n</details>\n\n# 连接与聚合\n\nPathway 是一个基于 Python 的 ETL 框架，其核心由 Rust 引擎驱动，底层采用 Differential Dataflow 实现增量计算。**连接（Join）与聚合（Aggregation）** 是 Pathway 数据处理管线中的两个核心操作，它们使得实时流处理和批处理能够以统一的方式进行。\n\n---\n\n## 1. 概述\n\n在 Pathway 中，连接与聚合承担着数据管道中的关键转换角色：\n\n| 功能 | 作用 | 典型场景 |\n|------|------|----------|\n| **连接（Join）** | 将两个或多个数据源按指定键进行关联，产生新的组合记录 | 实时关联用户行为与配置文件 |\n| **聚合（Aggregation）** | 按分组键对数据进行汇总计算（计数、求和、平均等） | 统计每小时的访问量 |\n| **自定义聚合（Custom Reducers）** | 支持用户定义的聚合逻辑 | 复杂业务规则下的数据汇总 |\n\nPathway 的连接与聚合操作具有以下特性：\n\n- **增量计算**：仅处理自上次计算以来发生变化的数据，而非全量重算\n- **幂等性**：相同输入产生相同输出，支持重放（replay）\n- **统一 API**：流式数据和静态数据使用相同代码\n- **Rust 引擎加速**：核心计算在 Rust 层执行，保证高性能\n\n> 资料来源：[README.md]() 中关于 \"incremental computation\" 和 \"stream processing\" 的描述\n\n---\n\n## 2. 架构设计\n\n### 2.1 核心层次结构\n\n```mermaid\ngraph TD\n    A[Python API 层] --> B[Python Internals 层]\n    B --> C[Rust 引擎层]\n    C --> D[Differential Dataflow]\n    D --> E[Timely Dataflow]\n    \n    F[joins.py] --> B\n    G[groupbys.py] --> B\n    H[reducers.py] --> B\n    I[custom_reducers.py] --> B\n    \n    J[python_api.rs] --> C\n    K[dataflow.rs] --> C\n```\n\nPathway 的连接与聚合功能跨越三个主要层次：\n\n| 层次 | 文件/模块 | 职责 |\n|------|-----------|------|\n| **Python API** | `pathway/internals/*.py` | 提供用户友好的 Python 接口 |\n| **Python Internals** | `joins.py`, `groupbys.py`, `reducers.py` | 实现连接、分组、聚合的 Python 逻辑 |\n| **Rust 引擎** | `src/python_api.rs`, `src/engine/dataflow.rs` | 底层计算引擎绑定 |\n\n### 2.2 连接操作的数据流\n\n```mermaid\ngraph LR\n    A[Table A] --> D[Join 操作]\n    B[Table B] --> D\n    D --> E[Arrangement 索引]\n    E --> F[匹配键值对]\n    F --> G[结果 Collection]\n    \n    H[增量更新] --> E\n    I[键删除] --> F\n```\n\nPathway 使用 Differential Dataflow 的 `join_core` 方法实现连接。连接过程涉及数据的**排列（Arrangement）**，这是实现高效增量连接的关键数据结构。\n\n> 资料来源：[external/differential-dataflow/src/operators/join.rs:28-42]() 中 `join_core` 方法定义\n\n---\n\n## 3. 
连接（Join）操作详解\n\n### 3.1 Join 方法签名与参数\n\n根据 Pathway 的架构，Join 操作在 Rust 层暴露为 Python 可调用的接口：\n\n| 参数 | 类型 | 说明 |\n|------|------|------|\n| `stream2` | `Arranged<G, Tr2>` | 第二个输入流（已排列） |\n| `result` | `FnMut(&K, &V, &Tr2::Val) -> I` | 结果生成函数 |\n| `key` | - | 连接键提取（隐含在方法中） |\n\n```rust\n// 资料来源：external/differential-dataflow/src/operators/join.rs\nfn join_core<Tr2, I, L>(&self, stream2: &Arranged<G, Tr2>, result: L) -> Collection<G, I::Item, R::Output>\nwhere\n    Tr2: TraceReader<Key=K, Time=G::Timestamp> + Clone + 'static,\n    R: Multiply<Tr2::R>,\n    L: FnMut(&K, &V, &Tr2::Val) -> I + 'static,\n```\n\n### 3.2 不安全变体：join_core_internal_unsafe\n\n除了标准的 `join_core`，Differential Dataflow 还提供了**不安全变体**，允许更灵活的处理：\n\n```rust\n// 资料来源：external/differential-dataflow/src/operators/join.rs\nfn join_core_internal_unsafe<Tr2, I, D, ROut, L>(\n    &self,\n    stream2: &Arranged<G, Tr2>,\n    result: L\n) -> Collection<G, D, ROut>\nwhere\n    // ... 允许返回 (data, time, diff) 三元组\n    L: FnMut(&K, &V, &Tr2::Val, &G::Timestamp, &R, &Tr2::R) -> I + 'static,\n```\n\n此方法允许回调函数访问时间戳和差分（diff）信息，适用于需要精细控制的场景。\n\n### 3.3 Arrangement 机制\n\nArrangement 是 Pathway 高效连接的核心。每个参与连接的表都需要预先建立索引：\n\n```mermaid\ngraph TD\n    A[输入数据] --> B[ArrangeByKey]\n    B --> C[OrderedLayer 存储结构]\n    C --> D[键索引 BTreeMap]\n    D --> E[值列表 Vec]\n    \n    F[新数据到达] --> G[增量更新 Arrangement]\n    G --> C\n```\n\nPathway 在 `ordered.rs` 中扩展了 Differential Dataflow 的有序层结构：\n\n```rust\n// 资料来源：external/differential-dataflow/src/trace/layers/ordered.rs\nfn step(&mut self, storage: &OrderedLayer<K, L, O, C>) {\n    // [Pathway extension]: pos_nonnegative indicates that index was moved to \n    // [Pathway extension]: negative value, here we undo it\n    if self.pos_nonnegative { \n        self.pos += 1; \n    } else {\n        self.pos_nonnegative = true;\n    }\n    // ...\n}\n```\n\n> 资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs]() 中 `import_named` 方法\n\n---\n\n## 4. 
聚合（Aggregation）操作详解\n\n### 4.1 聚合方法概述\n\nPathway 通过 `reduce` 操作族实现聚合功能。Differential Dataflow 提供了多种聚合变体：\n\n| 方法 | 用途 |\n|------|------|\n| `reduce` | 基础聚合操作 |\n| `reduce_abelian` | 阿贝尔群（Abelian）聚合，优化数学运算 |\n| `threshold` | 阈值过滤后聚合 |\n| `count` | 计数聚合 |\n\n### 4.2 Count 聚合实现\n\n计数是常用的聚合操作，Pathway 的实现如下：\n\n```rust\n// 资料来源：external/differential-dataflow/src/operators/reduce.rs\nfn count(&self) -> Collection<G, (K, R), isize> {\n    self.count_core()\n}\n```\n\n### 4.3 Threshold 聚合\n\n阈值聚合允许在聚合前进行条件过滤：\n\n```rust\n// 资料来源：external/differential-dataflow/src/operators/reduce.rs\nfn threshold_named<R2: Abelian, F: FnMut(&K, &R1) -> R2 + 'static>(\n    &self,\n    name: &str,\n    mut thresh: F\n) -> Collection<G, K, R2> {\n    self.reduce_abelian::<_, DefaultKeyTrace<_, _, _>>(\n        name,\n        move |k, s, t| t.push(((), thresh(k, &s[0].1)))\n    )\n    .as_collection(|k, _| k.clone())\n}\n```\n\n---\n\n## 5. Universe 与 Keys 机制\n\n### 5.1 Universe 的作用\n\nUniverse 是 Pathway 引擎中管理数据键空间的核心结构：\n\n```rust\n// 资料来源：src/engine/dataflow.rs\nstruct Universe<S: MaybeTotalScope> {\n    data: Rc<UniverseData<S>>,\n}\n```\n\nUniverse 提供了三种数据来源方式：\n\n| 来源类型 | 说明 |\n|----------|------|\n| `FromCollection` | 从 Collection 直接创建 |\n| `FromArranged` | 从已排列的数据创建 |\n| `Consolidated` | 合并后的数据 |\n\n```rust\n// 资料来源：src/engine/dataflow.rs\nimpl<S: MaybeTotalScope> Universe<S> {\n    fn from_collection(keys: Keys<S>) -> Self {\n        let data = Rc::new(UniverseData::from_collection(keys));\n        Self::new(data)\n    }\n\n    fn from_arranged(keys: KeysArranged<S>) -> Self {\n        let data = Rc::new(UniverseData::from_arranged(keys));\n        Self::new(data)\n    }\n}\n```\n\n### 5.2 Keys 数据结构\n\nKeys 结构体提供了统一的接口来访问数据键：\n\n```mermaid\ngraph TD\n    A[Keys 枚举] --> B[FromCollection]\n    A --> C[FromArranged]\n    \n    B --> D[Keys::collection]\n    B --> E[Keys::arranged]\n    B --> F[Keys::consolidated]\n    \n    C --> D\n    C --> E\n    \n    E --> G[KeysArranged]\n    G --> 
H[Arranged 索引]\n```\n\n---\n\n## 6. Python API 层\n\n### 6.1 Computer 结构\n\n在 Rust-Python 绑定层，聚合逻辑通过 `Computer` 结构表示：\n\n```rust\n// 资料来源：src/python_api.rs\n#[pyclass(module = \"pathway.engine\")]\nstruct Computer {\n    fun: Py<PyAny>,\n    dtype: Py<PyAny>,\n    is_output: bool,\n    is_method: bool,\n    universe: Py<Universe>,\n    data: Value,\n    data_column: Option<Py<Column>>,\n}\n```\n\n`Computer` 的创建通过 `from_raising_fun` 方法：\n\n```rust\n// 资料来源：src/python_api.rs\n#[pymethods]\nimpl Computer {\n    #[new]\n    #[pyo3(signature = (\n        fun,\n        dtype,\n        is_output,\n        is_method,\n        universe,\n        data = Value::None,\n        data_column = None,\n    ))]\n    pub fn from_raising_fun(\n        py: Python,\n        fun: Py<PyAny>,\n        dtype: Py<PyAny>,\n        is_output: bool,\n        is_method: bool,\n        universe: Py<Universe>,\n        data: Value,\n        data_column: Option<Py<Column>>,\n    ) -> PyResult<Py<Self>> {\n        // ...\n    }\n}\n```\n\n### 6.2 时间戳与增量计算\n\nPathway 使用 `Timestamp` 类型管理增量计算的时间维度：\n\n```rust\n// 资料来源：src/engine/timestamp.rs\nimpl OriginalOrRetraction for Timestamp {\n    fn is_original(&self) -> bool {\n        self.0.is_multiple_of(2)\n    }\n}\n\nimpl NextRetractionTime for Timestamp {\n    fn next_retraction_time(&self) -> Self {\n        Self(self.0 + 1)\n    }\n}\n```\n\n时间戳设计遵循以下规则：\n- **偶数时间戳**：表示原始记录（original）\n- **奇数时间戳**：表示删除/撤回记录（retraction）\n\n---\n\n## 7. 
使用示例\n\n### 7.1 基础连接\n\n```python\nimport pathway as pw\n\n# 定义两个输入表\norders = pw.io.csv.read(\"./orders.csv\", schema=OrderSchema)\nproducts = pw.io.csv.read(\"./products.csv\", schema=ProductSchema)\n\n# 连接两个表\nresult = orders.join(\n    products,\n    orders.product_id == products.id\n).select(\n    order_id=orders.id,\n    product_name=products.name,\n    quantity=orders.quantity\n)\n```\n\n### 7.2 分组聚合\n\n```python\n# 按产品分组统计订单数量\nsales_summary = orders.groupby(orders.product_id).reduce(\n    orders.product_id,\n    total_quantity=pw.reducers.sum(orders.quantity),\n    order_count=pw.reducers.count()\n)\n```\n\n### 7.3 自定义聚合\n\n```python\nimport pathway as pw\n\n@pw.reducer\ndef weighted_average(values, weights):\n    total_weight = sum(weights)\n    if total_weight == 0:\n        return 0.0\n    return sum(v * w for v, w in zip(values, weights)) / total_weight\n\n# 使用自定义聚合\nresult = data.groupby(data.category).reduce(\n    category=data.category,\n    avg_rating=weighted_average(data.rating, data.importance)\n)\n```\n\n---\n\n## 8. 性能优化建议\n\n### 8.1 Arrangement 复用\n\n对于需要多次连接的表，预先建立 Arrangement 可以显著提升性能：\n\n| 策略 | 适用场景 | 收益 |\n|------|----------|------|\n| 预排序（Pre-arrange） | 同一表参与多次连接 | 减少重复索引开销 |\n| 键压缩 | 大键值数据 | 降低内存占用 |\n| 分区（Partitioning） | 超大规模数据 | 并行处理加速 |\n\n### 8.2 增量计算优势\n\nPathway 的增量计算模型特别适合以下场景：\n\n```mermaid\ngraph LR\n    A[新数据到达] --> B{检查变更}\n    B --> C[仅处理受影响键]\n    C --> D[更新结果]\n    D --> E[增量输出]\n    \n    F[传统批处理] --> G[全量重算]\n    G --> H[完整输出]\n```\n\n---\n\n## 9. 
相关源码文件索引\n\n| 文件路径 | 功能描述 |\n|----------|----------|\n| `python/pathway/internals/joins.py` | Python 层 Join 操作实现 |\n| `python/pathway/internals/groupbys.py` | Python 层分组操作实现 |\n| `python/pathway/internals/reducers.py` | 内置聚合函数 |\n| `python/pathway/internals/custom_reducers.py` | 自定义聚合装饰器 |\n| `src/python_api.rs` | Rust-Python 绑定层 |\n| `src/engine/dataflow.rs` | 引擎核心数据结构 |\n| `src/engine/timestamp.rs` | 时间戳管理 |\n| `external/differential-dataflow/src/operators/join.rs` | 底层 Join 操作 |\n| `external/differential-dataflow/src/operators/reduce.rs` | 底层聚合操作 |\n| `external/differential-dataflow/src/operators/arrange/agent.rs` | Arrangement 管理 |\n\n---\n\n## 10. 总结\n\nPathway 的连接与聚合系统是一个精心设计的分层架构：\n\n1. **Python API 层**提供直观易用的接口\n2. **Python Internals 层**处理业务逻辑与类型转换\n3. **Rust 引擎层**基于 Differential Dataflow 实现高效增量计算\n4. **Timely Dataflow 层**提供底层并行计算支持\n\n这种设计使得用户能够以简洁的 Python 代码表达复杂的数据处理逻辑，同时享受 Rust 引擎带来的高性能和增量计算的效率优势。\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：pathwaycom/pathway\n\n摘要：发现 22 个潜在踩坑项，其中 2 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any。\n\n## 1. 安装坑 · 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: TypeError: Cannot instantiate typing.Any\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d63b2462b18449c4a2c02bb5e02a96c8 | https://github.com/pathwaycom/pathway/issues/227 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 
安全/权限坑 · 来源证据：Entra Authentication Support (credential handler and/or password callback)\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Entra Authentication Support (credential handler and/or password callback)\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7c65f4d8bce4b1a9c99accbf0457633 | https://github.com/pathwaycom/pathway/issues/230 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：Automated schema exploration in input connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Automated schema exploration in input connectors\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_74b91feb391f4b0298216d2a48fe5abe | https://github.com/pathwaycom/pathway/issues/224 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 4. 安装坑 · 来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ab5d2b2076034999bfaed612a3d95d60 | https://github.com/pathwaycom/pathway/issues/232 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 
安装坑 · 来源证据：Improve watermarks in POSIX-like objects tracker\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Improve watermarks in POSIX-like objects tracker\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_38f29b1fba3b41cc99b1364d15ef7213 | https://github.com/pathwaycom/pathway/issues/225 | 来源讨论提到 node 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：Persistence in `iterate` operator\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Persistence in `iterate` operator\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e642d639ebbc4382b242028ef89ec236 | https://github.com/pathwaycom/pathway/issues/214 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 7. 安装坑 · 来源证据：Support LEANN for RAG pipelines\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support LEANN for RAG pipelines\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_5d99dd1cfc794b299633b6c5dc9dd33b | https://github.com/pathwaycom/pathway/issues/173 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 安装坑 · 来源证据：Support MongoDB Atlas in pw.io.mongodb\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support MongoDB Atlas in pw.io.mongodb\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fa1d6e2e977a45d99c414724681f0f32 | https://github.com/pathwaycom/pathway/issues/221 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 9. 
安装坑 · 来源证据：feat: Add Microsoft SQL Server (MSSQL) connector\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：feat: Add Microsoft SQL Server (MSSQL) connector\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_370bcc864a314734b602f01d3d444ef9 | https://github.com/pathwaycom/pathway/issues/204 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：v0.27.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.27.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8d6905d8719c488cbcbb821e794add26 | https://github.com/pathwaycom/pathway/releases/tag/v0.27.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 安装坑 · 来源证据：v0.29.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.29.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b25babd3e3cc49108e56daaaaae8c5bf | https://github.com/pathwaycom/pathway/releases/tag/v0.29.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 12. 安装坑 · 来源证据：v0.30.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.30.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_0291ee068e4e4d38aa86d0fd433b960a | https://github.com/pathwaycom/pathway/releases/tag/v0.30.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 13. 配置坑 · 来源证据：v0.28.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.28.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_708184d9c8ac445d923c82d38af7bd19 | https://github.com/pathwaycom/pathway/releases/tag/v0.28.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 
配置坑 · 来源证据：v0.30.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.30.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7a19bd03a49499e987781160e4b76f0 | https://github.com/pathwaycom/pathway/releases/tag/v0.30.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:571186647 | https://github.com/pathwaycom/pathway | README/documentation is current enough for a first validation pass.\n\n## 16. 运行坑 · 来源证据：`pw.io.mssql.read` needs LSN persistence\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：`pw.io.mssql.read` needs LSN persistence\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_59d927a667a843ccb7d0a4cd147a1dc0 | https://github.com/pathwaycom/pathway/issues/218 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 17. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | last_activity_observed missing\n\n## 18. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 19. 
安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 20. 安全/权限坑 · 来源证据：Support encryption in Kinesis connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Support encryption in Kinesis connectors\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f29097c3eced4a209e7c80971aeea40c | https://github.com/pathwaycom/pathway/issues/223 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 21. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | issue_or_pr_quality=unknown\n\n## 22. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | release_recency=unknown\n\n<!-- canonical_name: pathwaycom/pathway; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "pathway",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "github_repo:571186647",
          "kind": "repo",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/pathwaycom/pathway"
        },
        {
          "evidence_id": "art_60def7e0e2da413bb22cca132cca079f",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/pathwaycom/pathway#readme"
        }
      ],
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "pathway 说明书",
      "toc": [
        "https://github.com/pathwaycom/pathway 项目说明书",
        "目录",
        "Pathway简介",
        "项目概述",
        "核心技术架构",
        "主要功能特性",
        "应用场景",
        "开发与部署",
        "Doramagic 踩坑日志"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "510c6daf92bd76e21b7279cee5f588b1b64e7b0f",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "pyproject.toml",
      "README.md",
      "docs/2.developers/7.templates/30.configure-yaml.md",
      "docs/2.developers/7.templates/35.custom-components.md",
      "docs/2.developers/7.templates/20.run-a-template.md",
      "docs/2.developers/7.templates/38.licensing-guide.md",
      "docs/2.developers/7.templates/1.index.md",
      "docs/2.developers/4.user-guide/.changelog.md",
      "docs/2.developers/7.templates/rag/1008.template-demo-document-indexing.md",
      "docs/2.developers/7.templates/rag/1000.demo-question-answering.md",
      "docs/2.developers/7.templates/rag/1001.template-adaptive-rag.md",
      "docs/2.developers/7.templates/rag/1010.template-slides-search.md",
      "docs/2.developers/7.templates/rag/5.unstructured-to-structured.md",
      "docs/2.developers/7.templates/rag/1009.drive-alert.md",
      "docs/2.developers/7.templates/rag/1003.template-multimodal-rag.md",
      "docs/2.developers/7.templates/rag/1002.template-private-rag.md",
      "docs/2.developers/7.templates/rag/.navigation.yml",
      "docs/2.developers/7.templates/40.rag-customization/20.custom-prompt.md",
      "docs/2.developers/7.templates/40.rag-customization/10.REST-API.md",
      "docs/2.developers/7.templates/40.rag-customization/40.splitters.md",
      "docs/2.developers/7.templates/40.rag-customization/50.embedders.md",
      "docs/2.developers/7.templates/40.rag-customization/30.parsers.md",
      "docs/2.developers/7.templates/40.rag-customization/60.llm-chats.md",
      "docs/2.developers/7.templates/ETL/2.twitter.md",
      "docs/2.developers/7.templates/ETL/180.kafka-alternative.md",
      "docs/2.developers/7.templates/ETL/150.etl-python-airbyte.md",
      "docs/2.developers/7.templates/ETL/5.linear_regression_with_kafka.md",
      "docs/2.developers/7.templates/ETL/10.el-pipeline.md",
      "docs/2.developers/7.templates/ETL/7.realtime-log-monitoring.md",
      "docs/2.developers/7.templates/ETL/175.delta_lake_etl.md",
      "docs/2.developers/7.templates/ETL/4.logistics.md",
      "docs/2.developers/7.templates/ETL/.navigation.yml",
      "docs/2.developers/7.templates/ETL/140.kafka-etl.md",
      "docs/2.developers/7.templates/60.deploy/20.aws-fargate-deploy.md",
      "docs/2.developers/7.templates/60.deploy/10.cloud-deployment.md",
      "docs/2.developers/7.templates/60.deploy/15.gcp-deploy.md",
      "docs/2.developers/7.templates/60.deploy/25.azure-aci-deploy.md",
      "docs/2.developers/7.templates/39.yaml-snippets/20.rag-configuration-examples.md",
      "docs/2.developers/7.templates/39.yaml-snippets/10.data-sources-examples.md",
      "docs/2.developers/7.templates/39.yaml-snippets/.navigation.yml"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# frontend - Doramagic AI Context Pack\n\n> 定位：安装前体验与判断资产。它帮助宿主 AI 有一个好的开始，但不代表已经安装、执行或验证目标项目。\n\n## 充分原则\n\n- **充分原则，不是压缩原则**：AI Context Pack 应该充分到让宿主 AI 在开工前理解项目价值、能力边界、使用入口、风险和证据来源；它可以分层组织，但不以最短摘要为目标。\n- **压缩策略**：只压缩噪声和重复内容，不压缩会影响判断和开工质量的上下文。\n\n## 给宿主 AI 的使用方式\n\n你正在读取 Doramagic 为 frontend 编译的 AI Context Pack。请把它当作开工前上下文：帮助用户理解适合谁、能做什么、如何开始、哪些必须安装后验证、风险在哪里。不要声称你已经安装、运行或执行了目标项目。\n\n## Claim 消费规则\n\n- **事实来源**：Repo Evidence + Claim/Evidence Graph；Human Wiki 只提供显著性、术语和叙事结构。\n- **事实最低状态**：`supported`\n- `supported`：可以作为项目事实使用，但回答中必须引用 claim_id 和证据路径。\n- `weak`：只能作为低置信度线索，必须要求用户继续核实。\n- `inferred`：只能用于风险提示或待确认问题，不能包装成项目事实。\n- `unverified`：不得作为事实使用，应明确说证据不足。\n- `contradicted`：必须展示冲突来源，不得替用户强行选择一个版本。\n\n## 它最适合谁\n\n- **想在安装前理解开源项目价值和边界的用户**：当前证据主要来自项目文档。 证据：`README.md` Claim：`clm_0002` supported 0.86\n\n## 它能做什么\n\n- **命令行启动或安装流程**（需要安装后验证）：项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n## 怎么开始\n\n- `pip install -U pathway` 证据：`README.md` Claim：`clm_0003` supported 0.86\n\n## 继续前判断卡\n\n- **当前建议**：先做角色匹配试用\n- **为什么**：这个项目更像角色库，核心风险是选错角色或把角色文案当执行能力；先用 Prompt Preview 试角色匹配，再决定是否沙盒导入。\n\n### 30 秒判断\n\n- **现在怎么做**：先做角色匹配试用\n- **最小安全下一步**：先用 Prompt Preview 试角色匹配；满意后再隔离导入\n- **先别相信**：角色质量和任务匹配不能直接相信。\n- **继续会触碰**：角色选择偏差、命令执行、本地环境或项目文件\n\n### 现在可以相信\n\n- **适合人群线索：想在安装前理解开源项目价值和边界的用户**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- **能力存在：命令行启动或安装流程**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n- **存在 Quick Start / 安装命令线索**（supported）：可以相信项目文档出现过启动或安装入口；不要因此直接在主力环境运行。 证据：`README.md` Claim：`clm_0003` supported 0.86\n\n### 现在还不能相信\n\n- **角色质量和任务匹配不能直接相信。**（unverified）：角色库证明有很多角色，不证明每个角色都适合你的具体任务，也不证明角色能产生高质量结果。\n- **不能把角色文案当成真实执行能力。**（unverified）：安装前只能判断角色描述和任务画像是否匹配，不能证明它能在宿主 AI 里完成任务。\n- **真实输出质量不能在安装前相信。**（unverified）：Prompt Preview 只能展示引导方式，不能证明真实项目中的结果质量。\n- **宿主 AI 
版本兼容性不能在安装前相信。**（unverified）：Claude、Cursor、Codex、Gemini 等宿主加载规则和版本差异必须在真实环境验证。\n- **不会污染现有宿主 AI 行为，不能直接相信。**（inferred）：Skill、plugin、AGENTS/CLAUDE/GEMINI 指令可能改变宿主 AI 的默认行为。\n- **可安全回滚不能默认相信。**（unverified）：除非项目明确提供卸载和恢复说明，否则必须先在隔离环境验证。\n- **真实安装后是否与用户当前宿主 AI 版本兼容？**（unverified）：兼容性只能通过实际宿主环境验证。\n- **项目输出质量是否满足用户具体任务？**（unverified）：安装前预览只能展示流程和边界，不能替代真实评测。\n\n### 继续会触碰什么\n\n- **角色选择偏差**：用户对任务应该由哪个专家角色处理的判断。 原因：选错角色会让 AI 从错误专业视角回答，浪费时间或误导决策。\n- **命令执行**：包管理器、网络下载、本地插件目录、项目配置或用户主目录。 原因：运行第一条命令就可能产生环境改动；必须先判断是否值得跑。 证据：`README.md`\n- **本地环境或项目文件**：安装结果、插件缓存、项目配置或本地依赖目录。 原因：安装前无法证明写入范围和回滚方式，需要隔离验证。 证据：`README.md`\n- **宿主 AI 上下文**：AI Context Pack、Prompt Preview、Skill 路由、风险规则和项目事实。 原因：导入上下文会影响宿主 AI 后续判断，必须避免把未验证项包装成事实。\n\n### 最小安全下一步\n\n- **先跑 Prompt Preview**：先用交互式试用验证任务画像和角色匹配，不要先导入整套角色库。（适用：任何项目都适用，尤其是输出质量未知时。）\n- **只在隔离目录或测试账号试装**：避免安装命令污染主力宿主 AI、真实项目或用户主目录。（适用：存在命令执行、插件配置或本地写入线索时。）\n- **安装后只验证一个最小任务**：先验证加载、兼容、输出质量和回滚，再决定是否深用。（适用：准备从试用进入真实工作流时。）\n\n### 退出方式\n\n- **保留安装前状态**：记录原始宿主配置和项目状态，后续才能判断是否可恢复。\n- **保留原始角色选择记录**：如果输出偏题，可以回到任务画像阶段重新选择角色，而不是继续沿着错误角色推进。\n- **记录安装命令和写入路径**：没有明确卸载说明时，至少要知道哪些目录或配置需要手动清理。\n- **如果没有回滚路径，不进入主力环境**：不可回滚是继续前阻断项，不应靠信任或运气继续。\n\n## 哪些只能预览\n\n- 解释项目适合谁和能做什么\n- 基于项目文档演示典型对话流程\n- 帮助用户判断是否值得安装或继续研究\n\n## 哪些必须安装后验证\n\n- 真实安装 Skill、插件或 CLI\n- 执行脚本、修改本地文件或访问外部服务\n- 验证真实输出质量、性能和兼容性\n\n## 边界与风险判断卡\n\n- **把安装前预览误认为真实运行**：用户可能高估项目已经完成的配置、权限和兼容性验证。 处理方式：明确区分 prompt_preview_can_do 与 runtime_required。 Claim：`clm_0004` inferred 0.45\n- **命令执行会修改本地环境**：安装命令可能写入用户主目录、宿主插件目录或项目配置。 处理方式：先在隔离环境或测试账号中运行。 证据：`README.md` Claim：`clm_0005` supported 0.86\n- **待确认**：真实安装后是否与用户当前宿主 AI 版本兼容？。原因：兼容性只能通过实际宿主环境验证。\n- **待确认**：项目输出质量是否满足用户具体任务？。原因：安装前预览只能展示流程和边界，不能替代真实评测。\n- **待确认**：安装命令是否需要网络、权限或全局写入？。原因：这影响企业环境和个人环境的安装风险。\n\n## 开工前工作上下文\n\n### 加载顺序\n\n- 先读取 how_to_use.host_ai_instruction，建立安装前判断资产的边界。\n- 读取 claim_graph_summary，确认事实来自 Claim/Evidence Graph，而不是 Human Wiki 叙事。\n- 再读取 intended_users、capabilities 和 quick_start_candidates，判断用户是否匹配。\n- 需要执行具体任务时，优先查 
role_skill_index，再查 evidence_index。\n- 遇到真实安装、文件修改、网络访问、性能或兼容性问题时，转入 risk_card 和 boundaries.runtime_required。\n\n### 任务路由\n\n- **命令行启动或安装流程**：先说明这是安装后验证能力，再给出安装前检查清单。 边界：必须真实安装或运行后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n### 上下文规模\n\n- 文件总数：2326\n- 重要文件覆盖：40/2326\n- 证据索引条目：80\n- 角色 / Skill 条目：77\n\n### 证据不足时的处理\n\n- **missing_evidence**：说明证据不足，要求用户提供目标文件、README 段落或安装后验证记录；不要补全事实。\n- **out_of_scope_request**：说明该任务超出当前 AI Context Pack 证据范围，并建议用户先查看 Human Manual 或真实安装后验证。\n- **runtime_request**：给出安装前检查清单和命令来源，但不要替用户执行命令或声称已执行。\n- **source_conflict**：同时展示冲突来源，标记为待核实，不要强行选择一个版本。\n\n## Prompt Recipes\n\n### 适配判断\n\n- 目标：判断这个项目是否适合用户当前任务。\n- 预期输出：适配结论、关键理由、证据引用、安装前可预览内容、必须安装后验证内容、下一步建议。\n\n```text\n请基于 pathway 的 AI Context Pack，先问我 3 个必要问题，然后判断它是否适合我的任务。回答必须包含：适合谁、能做什么、不能做什么、是否值得安装、证据来自哪里。所有项目事实必须引用 evidence_refs、source_paths 或 claim_id。\n```\n\n### 安装前体验\n\n- 目标：让用户在安装前感受核心工作流，同时避免把预览包装成真实能力或营销承诺。\n- 预期输出：一段带边界标签的体验剧本、安装后验证清单和谨慎建议；不含真实运行承诺或强营销表述。\n\n```text\n请把 pathway 当作安装前体验资产，而不是已安装工具或真实运行环境。\n\n请严格输出四段：\n1. 先问我 3 个必要问题。\n2. 给出一段“体验剧本”：用 [安装前可预览]、[必须安装后验证]、[证据不足] 三种标签展示它可能如何引导工作流。\n3. 给出安装后验证清单：列出哪些能力只有真实安装、真实宿主加载、真实项目运行后才能确认。\n4. 
给出谨慎建议：只能说“值得继续研究/试装”“先补充信息后再判断”或“不建议继续”，不得替项目背书。\n\n硬性边界：\n- 不要声称已经安装、运行、执行测试、修改文件或产生真实结果。\n- 不要写“自动适配”“确保通过”“完美适配”“强烈建议安装”等承诺性表达。\n- 如果描述安装后的工作方式，必须使用“如果安装成功且宿主正确加载 Skill，它可能会……”这种条件句。\n- 体验剧本只能写成“示例台词/假设流程”：使用“可能会询问/可能会建议/可能会展示”，不要写“已写入、已生成、已通过、正在运行、正在生成”。\n- Prompt Preview 不负责给安装命令；如用户准备试装，只能提示先阅读 Quick Start 和 Risk Card，并在隔离环境验证。\n- 所有项目事实必须来自 supported claim、evidence_refs 或 source_paths；inferred/unverified 只能作风险或待确认项。\n\n```\n\n### 角色 / Skill 选择\n\n- 目标：从项目里的角色或 Skill 中挑选最匹配的资产。\n- 预期输出：候选角色或 Skill 列表，每项包含适用场景、证据路径、风险边界和是否需要安装后验证。\n\n```text\n请读取 role_skill_index，根据我的目标任务推荐 3-5 个最相关的角色或 Skill。每个推荐都要说明适用场景、可能输出、风险边界和 evidence_refs。\n```\n\n### 风险预检\n\n- 目标：安装或引入前识别环境、权限、规则冲突和质量风险。\n- 预期输出：环境、权限、依赖、许可、宿主冲突、质量风险和未知项的检查清单。\n\n```text\n请基于 risk_card、boundaries 和 quick_start_candidates，给我一份安装前风险预检清单。不要替我执行命令，只说明我应该检查什么、为什么检查、失败会有什么影响。\n```\n\n### 宿主 AI 开工指令\n\n- 目标：把项目上下文转成一次对话开始前的宿主 AI 指令。\n- 预期输出：一段边界明确、证据引用明确、适合复制给宿主 AI 的开工前指令。\n\n```text\n请基于 pathway 的 AI Context Pack，生成一段我可以粘贴给宿主 AI 的开工前指令。这段指令必须遵守 not_runtime=true，不能声称项目已经安装、运行或产生真实结果。\n```\n\n\n## 角色 / Skill 索引\n\n- 共索引 77 个角色 / Skill / 项目文档条目。\n\n- **Pathway Live Data Framework**（project_doc）：Getting Started Deployment Documentation and Support Blog License 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`README.md`\n- **Pathway examples**（project_doc）：This repository contains examples of data processing using Pathway https://pathway.com/developers/documentation/introduction/welcome , the Python-based programming framework for easy building of realtime and reactive data products. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/README.md`\n- **AG2 Multi-Agent Conversations with Pathway Real-Time RAG**（project_doc）：AG2 Multi-Agent Conversations with Pathway Real-Time RAG 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/ag2-multiagent-rag/README.md`\n- **Azure ACI Deployment Example**（project_doc）：This repository contains an example for the tutorial: \"Running Pathway Program in Azure with Azure Container Instances\" https://pathway.com/developers/user-guide/deployment/azure-aci-deploy . 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/azure-aci-deploy/README.md`\n- **Best-rated movies examples**（project_doc）：The purpose of this project is two-fold: 1. doing an end-to-end application with Pathway to compute the K-best-rated items in a movie dataset, 2. switching from Kafka to Redpanda 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/best-movies-example/README.md`\n- **Best-rated movies example - Kafka version**（project_doc）：Best-rated movies example - Kafka version 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/best-movies-example/kafka-version/README.md`\n- **Best-rated movies example - Redpanda version**（project_doc）：Best-rated movies example - Redpanda version 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/best-movies-example/redpanda-version/README.md`\n- **Make your LLM app sane again: Forgetting incorrect data in real time**（project_doc）：Make your LLM app sane again: Forgetting incorrect data in real time 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/conf42/README.md`\n- **Custom python connector example**（project_doc）：⚠️ Twitter has turned off its free tier streaming API. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/custom-python-connector-twitter/README.md`\n- **Tutorial: From interactive data exploration to deployment**（project_doc）：Tutorial: From interactive data exploration to deployment 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/from_jupyter_to_deploy/README.md`\n- **ETL with Kafka in/Kafka out**（project_doc）：In this project, you build an ETL pipeline with Pathway with Kafka in and Kafka out. Two data sources broadcast date times with different time zones to different Kafka topics. Pathway connects to those topics, extracts E the data, transforms T the date times to timestamps, and loads L the data to a third Kafka topic. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/kafka-ETL/README.md`\n- **Benchmark for Delta Lake S3 messaging as a Kafka replacement**（project_doc）：Benchmark for Delta Lake S3 messaging as a Kafka replacement 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/kafka-alternatives/benchmarks/README.md`\n- **Pathway Monitoring using OpenTelemetry Collector and Grafana Cloud**（project_doc）：Pathway Monitoring using OpenTelemetry Collector and Grafana Cloud 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/monitoring/README.md`\n- **Computing Option Greeks with Pathway and Databento.**（project_doc）：Computing Option Greeks with Pathway and Databento. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/option-greeks/README.md`\n- **Retrieval-Augmented Generation RAG Pipeline with Pathway**（project_doc）：Retrieval-Augmented Generation RAG Pipeline with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/question-answering-rag/README.md`\n- **Realtime monitoring of logs**（project_doc）：The purpose of this project is to do an end-to-end application with Pathway to monitors logs such as nginx logs . It connects Filebeat to Pathway via Kafka and send the alerts to a Slack channel. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/realtime-log-monitoring/filebeat-pathway-slack/README.md`\n- **Realtime monitoring of logs**（project_doc）：The purpose of this project is to do an end-to-end application with Pathway to monitors logs such as nginx logs . It connects Filebeat/Logstash to Pathway via Kafka and send the alerts to ElasticSearch. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/realtime-log-monitoring/logstash-pathway-elastic/README.md`\n- **Sample Pathway program for SharePoint connection testing**（project_doc）：Sample Pathway program for SharePoint connection testing 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/sharepoint-test/README.md`\n- **Data Preparation for Spark Analytics**（project_doc）：Data Preparation for Spark Analytics 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/spark-data-preparation/README.md`\n- **Realtime Twitter Analysis App with Pathway**（project_doc）：Realtime Twitter Analysis App with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/twitter/README.md`\n- **Web Scraping with Pathway**（project_doc）：This project demonstrates how to create a real-time web scraper using Pathway, a powerful data processing framework. The implementation fetches and processes news articles from websites, making it possible to continuously monitor and analyze the web content. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/web-scraping/README.md`\n- **Pathway EL Pipeline**（project_doc）：Pathway's EL pipeline allows you to move data from different data sources Extract to the sinks of your choice Load . By customizing a single YAML file, you can configure and run the pipeline without modifying any Python code. You can learn more about the pipeline in our article https://pathway.com/developers/templates/etl/el-pipeline/ . 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/templates/el-pipeline/README.md`\n- **Worst-case optimal joins and delta queries in differential dataflow**（project_doc）：Worst-case optimal joins and delta queries in differential dataflow 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`external/differential-dataflow/dogsdogsdogs/README.md`\n- **Graph Server**（project_doc）：A differential dataflow server for continually changing graphs 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`external/differential-dataflow/server/README.md`\n- **tpchlike: a TPCH-like evaluation of differential dataflow**（project_doc）：tpchlike: a TPCH-like evaluation of differential dataflow 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`external/differential-dataflow/tpchlike/README.md`\n- **Pathway RAG evals**（project_doc）：Introduction This folder contains the automated continuous integration tests for RAG evaluations. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`integration_tests/rag_evals/README.md`\n- **Pathway LLM Extension Pack**（project_doc）：This XPack contains Pathway pathway.com functions useful in building data processing pipelines involving LLM models: 1. pathway-compatible wrappers UDFs for popular AI and language modeling tasks, such as: - document parsing - text embedding - LLM API calls 2. ready-made document processing pipelines for popular tasks: - document indexing 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`python/pathway/xpacks/llm/README.md`\n- **Contributing to Pathway**（project_doc）：Welcome! This guide is intended to help developers that are new to the community to make contributions to the Pathway project. Please be sure to also read our code of conduct CODE OF CONDUCT.md . We work hard to make this a welcoming community for all, and we're excited to have you on board! 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`CONTRIBUTING.md`\n- **Contributing to Pathway**（project_doc）：Welcome! This guide is intended to help developers that are new to the community to make contributions to the Pathway project. 
Please be sure to also read our code of conduct CODE OF CONDUCT.md . We work hard to make this a welcoming community for all, and we're excited to have you on board! 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/CONTRIBUTING.md`\n- **.Changelog**（project_doc）：This page references the Changelog of Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/.changelog.md`\n- **Welcome to Pathway Developer Documentation!**（project_doc）：Welcome to the Pathway developer hub 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/10.welcome.md`\n- **Installation**（project_doc）：Installation page of Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/20.installation.md`\n- **Quick Overview of Pathway**（project_doc）：Quick introduction to Pathway operators. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/30.pathway-overview.md`\n- **Pathway in Minutes: Quick ETL Examples**（project_doc）：A step-by-step guide to build a real-time pipeline with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/40.first_realtime_app_with_pathway.md`\n- **Core Concepts**（project_doc）：A review of the core concepts behind the Pathway programming framework. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/50.concepts.md`\n- **The Easiest Solution for Python Stream Processing, Data Indexing, and Real-Time AI Analytics.**（project_doc）：Why you should use Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/60.why-pathway.md`\n- **Pathway Framework Licensing Guide**（project_doc）：Quick Licensing Guide of Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/65.licensing-guide.md`\n- **Batch Processing in Python: Why Pathway Is Ideal for Both Streaming and Static Workloads**（project_doc）：Why Pathway makes sense for batch processing? 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/80.batch-processing.md`\n- **Connectors**（project_doc）：Pathway supported data sources 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/10.supported-data-sources.md`\n- **Connectors in Pathway**（project_doc）：Presentation of Pathway connectors. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/30.connectors-in-pathway.md`\n- **Pathway connectors**（project_doc）：Pathway supported data sources 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/35.pathway-connectors.md`\n- **Data Types and Schemas**（project_doc）：Defining schema in Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/40.schema.md`\n- **Artificial Data Streams with the demo Module**（project_doc）：How to generate artificial data streams using the demo module 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/50.artificial-streams.md`\n- **Switching from Batch to Streaming**（project_doc）：How to switch from batch to streaming with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/80.switch-from-batch-to-streaming.md`\n- **Making Your Python Web Scraping Go Live with Pathway**（project_doc）：Python Web Scraping with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/90.python-web-scraping.md`\n- **Using CSV connectors**（project_doc）：Tutorial on CSV connectors 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/10.csv_connectors.md`\n- **Sending alerts to Slack**（project_doc）：Tutorial on the connector for sending alerts to Slack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/100.slack_send_alerts.md`\n- **Airbyte connectors available in Pathway**（project_doc）：List of Airbyte connectors you can use with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 
证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/110.airbyte-connectors.md`\n- **Using Pathway Debezium Connector for MongoDB**（project_doc）：How to use the Debezium connector in Pathway to perform change data capture from MongoDB 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/115.mongodb-debezium.md`\n- **Using database connectors**（project_doc）：Tutorial on Database connectors 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/20.database-connectors.md`\n- **Creating a custom Python connector**（project_doc）：Tutorial on how to create custom Python connector 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/30.custom-python-connectors.md`\n- **Using Kafka connectors**（project_doc）：Tutorial on how to use Kafka connectors 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/30.kafka_connectors.md`\n- **Using NATS connectors**（project_doc）：Tutorial on how to use NATS connectors 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/33.nats-connectors.md`\n- **Subscribing to changes with Python function**（project_doc）：Tutorial on how to subscribe to changes with Python callback 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/35.python-output-connectors.md`\n- **Google Drive connector**（project_doc）：Tutorial on Google Drive connector 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/70.gdrive-connector.md`\n- **Switching from Kafka to Redpanda**（project_doc）：Tutorial on how to use Redpanda instead of Kafka 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/80.switching-to-redpanda.md`\n- **Consuming WebSockets streams**（project_doc）：Creating a custom WebSockets connector in Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 
证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/90.websockets-connectors.md`\n- **Table Operations: An Overview**（project_doc）：Overview of the basic transformations available in Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/30.data-transformation/10.table-operations.md`\n- **Taking You to the SQL API Documentation**（project_doc）：Pathway SQL API 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/30.data-transformation/80.sql.md`\n- **Understanding Temporal Behavior in Pathway**（project_doc）：How to define the behavior of your streamed data in Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/40.temporal-data/20.behaviors.md`\n- **Bridging Financial Data Streams: A Look at ASOF Join in Pathway**（project_doc）：Tutorial about ASOF Joins in Pathway. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/40.temporal-data/40.asof-join.md`\n- **LLM Chats**（project_doc）：LLM Wrappers available through Pathway xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.chats/llm-chats.md`\n- **Embedders**（project_doc）：Embedders available through the Pathway xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.embedders/embedders.md`\n- **LLM Examples**（project_doc）：LLM examples made with Pathway LLM tooling 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.llm-examples.md`\n- **Parsers**（project_doc）：Article about Pathway's parsers. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.parsers/parsers.md`\n- **Rerankers**（project_doc）：Rerankers available through the Pathway xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.rerankers/rerankers.md`\n- **Chunking**（project_doc）：Splitters available through the Pathway xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.splitters/splitters.md`\n- **Introduction to the LLM xpack**（project_doc）：Introduction to the Pathway LLM xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/10.overview.md`\n- **Create your own real-time RAG with Pathway**（project_doc）：Create your own RAG Pipeline with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/20.llm-app-pathway.md`\n- **Document Indexing**（project_doc）：Introduction to the Pathway LLM xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/30.docs-indexing.md`\n- **Pathway MCP Server**（project_doc）：Tutorial about how to set up and use Pathway MCP Server 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/40.pathway_mcp_server.md`\n- **Using Pathway MCP Server with Claude Desktop**（project_doc）：Tutorial on connecting Pathway MCP Server to Claude Desktop for real-time data access 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/41.pathway-mcp-claude-desktop.md`\n- **Cloud Deployment of Pathway**（project_doc）：A guide about how to deploy Pathway using the cloud 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/60.deployment/10.cloud-deployment.md`\n- **Deploy to Google Cloud Platform using Cloud Run: a step-by-step guide.**（project_doc）：A guide about how to deploy Pathway to GCP using Google Cloud Run 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/60.deployment/15.gcp-deploy.md`\n- **Running Pathway Program in AWS Cloud with 
Fargate**（project_doc）：How to deploy Pathway in the cloud with AWS Fargate 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/60.deployment/20.aws-fargate-deploy.md`\n- **Deploying Pathway Programs on Azure Made Easy**（project_doc）：How to deploy Pathway in the cloud within Azure ecosystem 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/60.deployment/25.azure-aci-deploy.md`\n- **Deploy with Render: a Step-by-step Guide.**（project_doc）：A guide about how to deploy Pathway using the Render 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/60.deployment/30.render-deploy.md`\n\n## 证据索引\n\n- 共索引 80 条证据。\n\n- **Pathway Live Data Framework**（documentation）：Getting Started Deployment Documentation and Support Blog License 证据：`README.md`\n- **Pathway examples**（documentation）：This repository contains examples of data processing using Pathway https://pathway.com/developers/documentation/introduction/welcome , the Python-based programming framework for easy building of realtime and reactive data products. 证据：`examples/README.md`\n- **AG2 Multi-Agent Conversations with Pathway Real-Time RAG**（documentation）：AG2 Multi-Agent Conversations with Pathway Real-Time RAG 证据：`examples/projects/ag2-multiagent-rag/README.md`\n- **Azure ACI Deployment Example**（documentation）：This repository contains an example for the tutorial: \"Running Pathway Program in Azure with Azure Container Instances\" https://pathway.com/developers/user-guide/deployment/azure-aci-deploy . 证据：`examples/projects/azure-aci-deploy/README.md`\n- **Best-rated movies examples**（documentation）：The purpose of this project is two-fold: 1. doing an end-to-end application with Pathway to compute the K-best-rated items in a movie dataset, 2. 
switching from Kafka to Redpanda 证据：`examples/projects/best-movies-example/README.md`\n- **Best-rated movies example - Kafka version**（documentation）：Best-rated movies example - Kafka version 证据：`examples/projects/best-movies-example/kafka-version/README.md`\n- **Best-rated movies example - Redpanda version**（documentation）：Best-rated movies example - Redpanda version 证据：`examples/projects/best-movies-example/redpanda-version/README.md`\n- **Make your LLM app sane again: Forgetting incorrect data in real time**（documentation）：Make your LLM app sane again: Forgetting incorrect data in real time 证据：`examples/projects/conf42/README.md`\n- **Custom python connector example**（documentation）：⚠️ Twitter has turned off its free tier streaming API. 证据：`examples/projects/custom-python-connector-twitter/README.md`\n- **Tutorial: From interactive data exploration to deployment**（documentation）：Tutorial: From interactive data exploration to deployment 证据：`examples/projects/from_jupyter_to_deploy/README.md`\n- **ETL with Kafka in/Kafka out**（documentation）：In this project, you build an ETL pipeline with Pathway with Kafka in and Kafka out. Two data sources broadcast date times with different time zones to different Kafka topics. Pathway connects to those topics, extracts (E) the data, transforms (T) the date times to timestamps, and loads (L) the data to a third Kafka topic. 证据：`examples/projects/kafka-ETL/README.md`\n- **Benchmark for Delta Lake S3 messaging as a Kafka replacement**（documentation）：Benchmark for Delta Lake S3 messaging as a Kafka replacement 证据：`examples/projects/kafka-alternatives/benchmarks/README.md`\n- **Pathway Monitoring using OpenTelemetry Collector and Grafana Cloud**（documentation）：Pathway Monitoring using OpenTelemetry Collector and Grafana Cloud 证据：`examples/projects/monitoring/README.md`\n- **Computing Option Greeks with Pathway and Databento.**（documentation）：Computing Option Greeks with Pathway and Databento. 
证据：`examples/projects/option-greeks/README.md`\n- **Retrieval-Augmented Generation RAG Pipeline with Pathway**（documentation）：Retrieval-Augmented Generation RAG Pipeline with Pathway 证据：`examples/projects/question-answering-rag/README.md`\n- **Realtime monitoring of logs**（documentation）：The purpose of this project is to do an end-to-end application with Pathway to monitors logs such as nginx logs . It connects Filebeat to Pathway via Kafka and send the alerts to a Slack channel. 证据：`examples/projects/realtime-log-monitoring/filebeat-pathway-slack/README.md`\n- **Realtime monitoring of logs**（documentation）：The purpose of this project is to do an end-to-end application with Pathway to monitors logs such as nginx logs . It connects Filebeat/Logstash to Pathway via Kafka and send the alerts to ElasticSearch. 证据：`examples/projects/realtime-log-monitoring/logstash-pathway-elastic/README.md`\n- **Sample Pathway program for SharePoint connection testing**（documentation）：Sample Pathway program for SharePoint connection testing 证据：`examples/projects/sharepoint-test/README.md`\n- **Data Preparation for Spark Analytics**（documentation）：Data Preparation for Spark Analytics 证据：`examples/projects/spark-data-preparation/README.md`\n- **Realtime Twitter Analysis App with Pathway**（documentation）：Realtime Twitter Analysis App with Pathway 证据：`examples/projects/twitter/README.md`\n- **Web Scraping with Pathway**（documentation）：This project demonstrates how to create a real-time web scraper using Pathway, a powerful data processing framework. The implementation fetches and processes news articles from websites, making it possible to continuously monitor and analyze the web content. 证据：`examples/projects/web-scraping/README.md`\n- **Pathway EL Pipeline**（documentation）：Pathway's EL pipeline allows you to move data from different data sources Extract to the sinks of your choice Load . 
By customizing a single YAML file, you can configure and run the pipeline without modifying any Python code. You can learn more about the pipeline in our article https://pathway.com/developers/templates/etl/el-pipeline/ . 证据：`examples/templates/el-pipeline/README.md`\n- **Worst-case optimal joins and delta queries in differential dataflow**（documentation）：Worst-case optimal joins and delta queries in differential dataflow 证据：`external/differential-dataflow/dogsdogsdogs/README.md`\n- **Graph Server**（documentation）：A differential dataflow server for continually changing graphs 证据：`external/differential-dataflow/server/README.md`\n- **tpchlike: a TPCH-like evaluation of differential dataflow**（documentation）：tpchlike: a TPCH-like evaluation of differential dataflow 证据：`external/differential-dataflow/tpchlike/README.md`\n- **Pathway RAG evals**（documentation）：Introduction This folder contains the automated continuous integration tests for RAG evaluations. 证据：`integration_tests/rag_evals/README.md`\n- **Pathway LLM Extension Pack**（documentation）：This XPack contains Pathway pathway.com functions useful in building data processing pipelines involving LLM models: 1. pathway-compatible wrappers UDFs for popular AI and language modeling tasks, such as: - document parsing - text embedding - LLM API calls 2. ready-made document processing pipelines for popular tasks: - document indexing 证据：`python/pathway/xpacks/llm/README.md`\n- **Contributing to Pathway**（documentation）：Welcome! This guide is intended to help developers that are new to the community to make contributions to the Pathway project. Please be sure to also read our code of conduct CODE OF CONDUCT.md . We work hard to make this a welcoming community for all, and we're excited to have you on board! 证据：`CONTRIBUTING.md`\n- **Contributing to Pathway**（documentation）：Welcome! This guide is intended to help developers that are new to the community to make contributions to the Pathway project. 
Please be sure to also read our code of conduct CODE OF CONDUCT.md . We work hard to make this a welcoming community for all, and we're excited to have you on board! 证据：`examples/CONTRIBUTING.md`\n- **Package**（package_manifest）：{ \"name\": \"frontend\", \"version\": \"1.0.0\", \"description\": \"\", \"main\": \"index.js\", \"scripts\": { \"webpack-dev-server\": \"webpack-dev-server\", \"dev\": \"webpack-dev-server --mode=development\", \"prod\": \"webpack --mode=production\" }, \"keywords\": [], \"author\": \"\", \"license\": \"ISC\", \"dependencies\": { \"@emotion/react\": \"^11.10.4\", \"@emotion/styled\": \"^11.10.4\", \"@mui/material\": \"^5.10.3\", \"@mui/x-data-grid\": \"^5.17.1\", \"@mui/x-date-pickers\": \"^5.0.0\", \"css-loader\": \"^6.7.1\", \"deck.gl\": \"^8.8.9\", \"file-loader\": \"^6.2.0\", \"lodash\": \"^4.17.21\", \"mapbox-gl\": \"^2.10.0\", \"moment\": \"^2.29.4\", \"moment-timezone\": \"^0.5.37\", \"react\": \"^18.2.0\", \"react-dom\": \"^18.2.0\", \"react-map-gl\": \"^7.0.19\", \"style-loader\": \"^3… 证据：`examples/projects/twitter/services/frontend/package.json`\n- **License**（source_file）：Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 证据：`examples/LICENSE`\n- **License**（source_file）：Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \"Software\"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to 
do so, subject to the following conditions: 证据：`python/pathway/third_party/airbyte_serverless/LICENSE`\n- **.Changelog**（documentation）：--- title: Changelog description: 'This page references the Changelog of Pathway' toc: false --- 证据：`docs/2.developers/4.user-guide/.changelog.md`\n- **Welcome to Pathway Developer Documentation!**（documentation）：Welcome to Pathway Developer Documentation! 证据：`docs/2.developers/4.user-guide/10.introduction/10.welcome.md`\n- **Installation**（documentation）：To quickly get started with Pathway, you can install it via pip with the following command: 证据：`docs/2.developers/4.user-guide/10.introduction/20.installation.md`\n- **Quick Overview of Pathway**（documentation）：Quick Overview of Pathway This page is a summary of what you need to know about to start programming with Pathway. If you want to learn more about the core concepts of Pathway, you can read the dedicated article /developers/user-guide/introduction/concepts . 证据：`docs/2.developers/4.user-guide/10.introduction/30.pathway-overview.md`\n- **Pathway in Minutes: Quick ETL Examples**（documentation）：Pathway in Minutes: Quick ETL Examples 证据：`docs/2.developers/4.user-guide/10.introduction/40.first_realtime_app_with_pathway.md`\n- **Core Concepts**（documentation）：Core Concepts A review of the core concepts behind data representation and data transformation in the programming layer of Pathway. 证据：`docs/2.developers/4.user-guide/10.introduction/50.concepts.md`\n- **The Easiest Solution for Python Stream Processing, Data Indexing, and Real-Time AI Analytics.**（documentation）：The Easiest Solution for Python Stream Processing, Data Indexing, and Real-Time AI Analytics. 证据：`docs/2.developers/4.user-guide/10.introduction/60.why-pathway.md`\n- **Pathway Framework Licensing Guide**（documentation）：Pathway offers a flexible licensing model designed to accommodate both community users and enterprise deployments. 
This guide explains the different licensing options, usage limitations, and key considerations for developers. 证据：`docs/2.developers/4.user-guide/10.introduction/65.licensing-guide.md`\n- **Batch Processing in Python: Why Pathway Is Ideal for Both Streaming and Static Workloads**（documentation）：Batch Processing in Python: Why Pathway Is Ideal for Both Streaming and Static Workloads 证据：`docs/2.developers/4.user-guide/10.introduction/80.batch-processing.md`\n- **Connectors**（documentation）：--- title: \"Supported Data Sources\" description: \"Pathway supported data sources\" navigation: false layout: default aside: true toc: false single: true redirect: \"/developers/user-guide/connect/pathway-connectors\" --- Connectors 证据：`docs/2.developers/4.user-guide/20.connect/10.supported-data-sources.md`\n- **Connectors in Pathway**（documentation）：In order to use Pathway, one of the first things you need to do is to access the data you want to manipulate. In Pathway, accessing the data is done using connectors . 证据：`docs/2.developers/4.user-guide/20.connect/30.connectors-in-pathway.md`\n- **Pathway connectors**（documentation）：Pathway has a long list of connectors for connecting to external data sources at input and connectors for outputting the data outside of Pathway. Since Pathway is a streaming-first framework, all connectors operate in streaming mode , automatically updating the results in real-time with every change. Explore the list below to find the connector you need. 证据：`docs/2.developers/4.user-guide/20.connect/35.pathway-connectors.md`\n- **Data Types and Schemas**（documentation）：Data Types and Schemas In this guide, you will explore how to effectively utilize data types and schemas. 
证据：`docs/2.developers/4.user-guide/20.connect/40.schema.md`\n- **Artificial Data Streams with the demo Module**（documentation）：Artificial Data Streams with the demo Module With Pathway's demo /developers/api-docs/pathway-demo module, you can create custom data streams from scratch or by utilizing a CSV file. This feature empowers you to effectively test and debug your Pathway implementation using realtime data. 证据：`docs/2.developers/4.user-guide/20.connect/50.artificial-streams.md`\n- **Switching from Batch to Streaming**（documentation）：Switching from Batch to Streaming Easily switching from batch to streaming is a core feature of Pathway. In this article, you will see how easy it is to change your static pipeline to make it run with streaming data. 证据：`docs/2.developers/4.user-guide/20.connect/80.switch-from-batch-to-streaming.md`\n- **Making Your Python Web Scraping Go Live with Pathway**（documentation）：Making Your Python Web Scraping Go Live with Pathway 证据：`docs/2.developers/4.user-guide/20.connect/90.python-web-scraping.md`\n- **Using CSV connectors**（documentation）：Using CSV connectors Pathway provides a pw.io.csv /developers/api-docs/pathway-io/csv module with connectors to read and write data streams using CSV files. 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/10.csv_connectors.md`\n- **Sending alerts to Slack**（documentation）：This tutorial will guide you through connecting Pathway to Slack and sending alerts to a specific channel. 
证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/100.slack_send_alerts.md`\n- **Airbyte connectors available in Pathway**（documentation）：Airbyte connectors available in Pathway 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/110.airbyte-connectors.md`\n- **Using Pathway Debezium Connector for MongoDB**（documentation）：Using Pathway Debezium Connector for MongoDB Connect Pathway on top of your MongoDB/Debezium database using pw.io.debezium.read /developers/api-docs/pathway-io/debezium pathway.io.debezium.read and pw.io.mongodb.write /developers/api-docs/pathway-io/mongodb . 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/115.mongodb-debezium.md`\n- **Using database connectors**（documentation）：Using database connectors Connect Pathway on top of your PostgreSQL/Debezium database using pw.io.debezium.read /developers/api-docs/pathway-io/debezium pathway.io.debezium.read and pw.io.postgres.write /developers/api-docs/pathway-io/postgres pathway.io.postgres.write . 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/20.database-connectors.md`\n- **Creating a custom Python connector**（documentation）：In this tutorial, you will learn how to create a Python connector that will allow you to connect to your custom data source and feed data directly into Pathway. 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/30.custom-python-connectors.md`\n- **Using Kafka connectors**（documentation）：Using Kafka connectors Pathway provides a pw.io.kafka /developers/api-docs/pathway-io/kafka module with connectors to read and send messages from a Kafka instance. 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/30.kafka_connectors.md`\n- **Using NATS connectors**（documentation）：Using NATS connectors Pathway provides a pw.io.nats /developers/api-docs/pathway-io/nats module with connectors to read and send messages from a NATS instance. 
证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/33.nats-connectors.md`\n- **Subscribing to changes with Python function**（documentation）：Subscribing to changes with Python function 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/35.python-output-connectors.md`\n- **Google Drive connector**（documentation）：Google Drive connector This tutorial will guide you through connecting Pathway to your data stored on Google Drive. For detailed information about Google Drive connector, refer to the API documentation /developers/api-docs/pathway-io/gdrive . 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/70.gdrive-connector.md`\n- **Switching from Kafka to Redpanda**（documentation）：Switching from Kafka to Redpanda Not a fan of the JVM and ZooKeeper? In this article, you will learn how to switch from Kafka to Redpanda and how to adapt your Pathway project to Redpanda. The change is easier than you might think: your Pathway code remains the same! 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/80.switching-to-redpanda.md`\n- **Consuming WebSockets streams**（documentation）：In this tutorial, you will be guided through creating a custom WebSocket connector. It will allow you to interact with WebSocket data streams and process them as needed. 
证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/90.websockets-connectors.md`\n- 其余 20 条证据见 `AI_CONTEXT_PACK.json` 或 `EVIDENCE_INDEX.json`。\n\n## 宿主 AI 必须遵守的规则\n\n- **把本资产当作开工前上下文，而不是运行环境。**：AI Context Pack 只包含证据化项目理解，不包含目标项目的可执行状态。 证据：`README.md`, `examples/README.md`, `examples/projects/ag2-multiagent-rag/README.md`\n- **回答用户时区分可预览内容与必须安装后才能验证的内容。**：安装前体验的消费者价值来自降低误装和误判，而不是伪装成真实运行。 证据：`README.md`, `examples/README.md`, `examples/projects/ag2-multiagent-rag/README.md`\n\n## 用户开工前应该回答的问题\n\n- 你准备在哪个宿主 AI 或本地环境中使用它？\n- 你只是想先体验工作流，还是准备真实安装？\n- 你最在意的是安装成本、输出质量、还是和现有规则的冲突？\n\n## 验收标准\n\n- 所有能力声明都能回指到 evidence_refs 中的文件路径。\n- AI_CONTEXT_PACK.md 没有把预览包装成真实运行。\n- 用户能在 3 分钟内看懂适合谁、能做什么、如何开始和风险边界。\n\n---\n\n## Doramagic Context Augmentation\n\n下面内容用于强化 Repomix/AI Context Pack 主体。Human Manual 只提供阅读骨架；踩坑日志会被转成宿主 AI 必须遵守的工作约束。\n\n## Human Manual 骨架\n\n使用规则：这里只是项目阅读路线和显著性信号，不是事实权威。具体事实仍必须回到 repo evidence / Claim Graph。\n\n宿主 AI 硬性规则：\n- 不得把页标题、章节顺序、摘要或 importance 当作项目事实证据。\n- 解释 Human Manual 骨架时，必须明确说它只是阅读路线/显著性信号。\n- 能力、安装、兼容性、运行状态和风险判断必须引用 repo evidence、source path 或 Claim Graph。\n\n- **Pathway简介**：importance `high`\n  - source_paths: README.md, pyproject.toml\n- **安装指南**：importance `high`\n  - source_paths: pyproject.toml, rust-toolchain.toml, docs/2.developers/4.user-guide/10.introduction/20.installation.md\n- **基础示例**：importance `high`\n  - source_paths: examples/notebooks/tutorials/installation_first_steps.ipynb, python/pathway/__init__.py\n- **引擎架构**：importance `high`\n  - source_paths: src/engine/mod.rs, src/engine/dataflow.rs, src/engine/graph.rs, external/differential-dataflow/src/lib.rs, external/timely-dataflow/timely/src/lib.rs\n- **Python与Rust集成**：importance `medium`\n  - source_paths: src/python_api.rs, python/pathway/_engine_finder.py, src/lib.rs\n- **连接器概述**：importance `high`\n  - source_paths: python/pathway/io/__init__.py, python/pathway/internals/datasource.py, python/pathway/internals/datasink.py, src/connectors/mod.rs\n- **数据库连接器**：importance 
`high`\n  - source_paths: python/pathway/io/postgres/__init__.py, python/pathway/io/mongodb/__init__.py, python/pathway/io/mysql/__init__.py, python/pathway/io/mssql/__init__.py, src/connectors/postgres.rs\n- **流数据连接器**：importance `high`\n  - source_paths: python/pathway/io/kafka/__init__.py, python/pathway/io/nats/__init__.py, python/pathway/io/mqtt/__init__.py, python/pathway/io/rabbitmq/__init__.py, python/pathway/io/redpanda/__init__.py\n\n## Repo Inspection Evidence / 源码检查证据\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `510c6daf92bd76e21b7279cee5f588b1b64e7b0f`\n- inspected_files: `pyproject.toml`, `README.md`, `docs/2.developers/7.templates/30.configure-yaml.md`, `docs/2.developers/7.templates/35.custom-components.md`, `docs/2.developers/7.templates/20.run-a-template.md`, `docs/2.developers/7.templates/38.licensing-guide.md`, `docs/2.developers/7.templates/1.index.md`, `docs/2.developers/4.user-guide/.changelog.md`, `docs/2.developers/7.templates/rag/1008.template-demo-document-indexing.md`, `docs/2.developers/7.templates/rag/1000.demo-question-answering.md`, `docs/2.developers/7.templates/rag/1001.template-adaptive-rag.md`, `docs/2.developers/7.templates/rag/1010.template-slides-search.md`, `docs/2.developers/7.templates/rag/5.unstructured-to-structured.md`, `docs/2.developers/7.templates/rag/1009.drive-alert.md`, `docs/2.developers/7.templates/rag/1003.template-multimodal-rag.md`, `docs/2.developers/7.templates/rag/1002.template-private-rag.md`, `docs/2.developers/7.templates/rag/.navigation.yml`, `docs/2.developers/7.templates/40.rag-customization/20.custom-prompt.md`, `docs/2.developers/7.templates/40.rag-customization/10.REST-API.md`, `docs/2.developers/7.templates/40.rag-customization/40.splitters.md`\n\n宿主 AI 硬性规则：\n- 没有 repo_clone_verified=true 时，不得声称已经读过源码。\n- 没有 repo_inspection_verified=true 时，不得把 README/docs/package 文件判断写成事实。\n- 没有 quick_start_verified=true 时，不得声称 Quick Start 已跑通。\n\n## Doramagic Pitfall 
Constraints / 踩坑约束\n\n这些规则来自 Doramagic 发现、验证或编译过程中的项目专属坑点。宿主 AI 必须把它们当作工作约束，而不是普通说明文字。\n\n### Constraint 1: 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: TypeError: Cannot instantiate typing.Any\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_d63b2462b18449c4a2c02bb5e02a96c8 | https://github.com/pathwaycom/pathway/issues/227 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 2: 来源证据：Entra Authentication Support (credential handler and/or password callback)\n\n- Trigger: GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Entra Authentication Support (credential handler and/or password callback)\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能影响授权、密钥配置或安全边界。\n- Evidence: community_evidence:github | cevd_e7c65f4d8bce4b1a9c99accbf0457633 | https://github.com/pathwaycom/pathway/issues/230 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 3: 来源证据：Automated schema exploration in input connectors\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Automated schema exploration in input connectors\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_74b91feb391f4b0298216d2a48fe5abe | https://github.com/pathwaycom/pathway/issues/224 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 4: 来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能阻塞安装或首次运行。\n- Evidence: 
community_evidence:github | cevd_ab5d2b2076034999bfaed612a3d95d60 | https://github.com/pathwaycom/pathway/issues/232 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 5: 来源证据：Improve watermarks in POSIX-like objects tracker\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Improve watermarks in POSIX-like objects tracker\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能阻塞安装或首次运行。\n- Evidence: community_evidence:github | cevd_38f29b1fba3b41cc99b1364d15ef7213 | https://github.com/pathwaycom/pathway/issues/225 | 来源讨论提到 node 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 6: 来源证据：Persistence in `iterate` operator\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Persistence in `iterate` operator\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_e642d639ebbc4382b242028ef89ec236 | https://github.com/pathwaycom/pathway/issues/214 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 7: 来源证据：Support LEANN for RAG pipelines\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support LEANN for RAG pipelines\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_5d99dd1cfc794b299633b6c5dc9dd33b | https://github.com/pathwaycom/pathway/issues/173 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 8: 来源证据：Support MongoDB Atlas in pw.io.mongodb\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support MongoDB Atlas in pw.io.mongodb\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_fa1d6e2e977a45d99c414724681f0f32 | https://github.com/pathwaycom/pathway/issues/221 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 
不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 9: 来源证据：feat: Add Microsoft SQL Server (MSSQL) connector\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：feat: Add Microsoft SQL Server (MSSQL) connector\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能影响升级、迁移或版本选择。\n- Evidence: community_evidence:github | cevd_370bcc864a314734b602f01d3d444ef9 | https://github.com/pathwaycom/pathway/issues/204 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 10: 来源证据：v0.27.0\n\n- Trigger: GitHub 发布记录显示该项目存在一个与升级/版本选择相关的待验证变更：v0.27.0（来源是 release 页面，而不是 issue）\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能影响升级、迁移或版本选择。\n- Evidence: community_evidence:github | cevd_8d6905d8719c488cbcbb821e794add26 | https://github.com/pathwaycom/pathway/releases/tag/v0.27.0 | 来源发布说明提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n",
      "summary": "给宿主 AI 的上下文和工作边界。",
      "title": "AI Context Pack / 带给我的 AI"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card / 安装前决策卡\n\n项目：pathwaycom/pathway\n\n## Doramagic 试用结论\n\n当前结论：可以进入发布前推荐检查；首次使用仍应从最小权限、临时目录和可回滚配置开始。\n\n## 用户现在可以做\n\n- 可以先阅读 Human Manual，理解项目目的和主要工作流。\n- 可以复制 Prompt Preview 做安装前体验；这只验证交互感，不代表真实运行。\n- 可以把官方 Quick Start 命令放到隔离环境中验证，不要直接进主力环境。\n\n## 现在不要做\n\n- 不要把 Prompt Preview 当成项目实际运行结果。\n- 不要把 metadata-only validation 当成沙箱安装验证。\n- 不要把未验证能力写成“已支持、已跑通、可放心安装”。\n- 不要在首次试用时交出生产数据、私人文件、真实密钥或主力配置目录。\n\n## 安装前检查\n\n- 宿主 AI 是否匹配：local_cli\n- 官方安装入口状态：已发现官方入口\n- 是否在临时目录、临时宿主或容器中验证：必须是\n- 是否能回滚配置改动：必须能\n- 是否需要 API Key、网络访问、读写文件或修改宿主配置：未确认前按高风险处理\n- 是否记录了安装命令、实际输出和失败日志：必须记录\n\n## 当前阻塞项\n\n- 无阻塞项。\n\n## 项目专属踩坑\n\n- 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any（high）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：Entra Authentication Support (credential handler and/or password callback)（high）：可能影响授权、密钥配置或安全边界。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：Automated schema exploration in input connectors（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions（medium）：可能阻塞安装或首次运行。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：Improve watermarks in POSIX-like objects tracker（medium）：可能阻塞安装或首次运行。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n\n## 风险与权限提示\n\n- no_demo: medium\n\n## 证据缺口\n\n- 暂未发现结构化证据缺口。\n",
      "summary": "安装、权限、验证和推荐前风险。",
      "title": "Boundary & Risk Card / 边界与风险卡"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/pathwaycom/pathway 项目说明书\n\n生成时间：2026-05-16 20:44:44 UTC\n\n## 目录\n\n- [Pathway简介](#overview-introduction)\n- [安装指南](#overview-installation)\n- [基础示例](#quickstart-basic-example)\n- [引擎架构](#arch-engine-overview)\n- [Python与Rust集成](#arch-python-rust-integration)\n- [连接器概述](#connectors-overview)\n- [数据库连接器](#connectors-database)\n- [流数据连接器](#connectors-streaming)\n- [表操作](#trans-table-operations)\n- [连接与聚合](#trans-joins-aggregations)\n\n<a id='overview-introduction'></a>\n\n## Pathway简介\n\n### 相关页面\n\n相关主题：[引擎架构](#arch-engine-overview), [安装指南](#overview-installation)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n- [external/differential-dataflow/src/trace/implementations/ord.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/implementations/ord.rs)\n- [external/timely-dataflow/timely/src/progress/subgraph.rs](https://github.com/pathwaycom/pathway/blob/main/external/timely-dataflow/timely/src/progress/subgraph.rs)\n- [external/differential-dataflow/src/trace/wrappers/freeze.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/wrappers/freeze.rs)\n- [src/connectors/postgres.rs](https://github.com/pathwaycom/pathway/blob/main/src/connectors/postgres.rs)\n- [examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n</details>\n\n# Pathway简介\n\n## 项目概述\n\nPathway是一个Python ETL框架，专为流处理（stream processing）、实时分析（real-time analytics）、LLM管道（LLM pipelines）和检索增强生成（RAG）场景设计。资料来源：[README.md:1]()\n\nPathway的核心优势在于其**易用的Python 
API**，允许用户无缝集成最喜欢的Python机器学习库。同时，Pathway代码具有极高的通用性和鲁棒性，可以同时用于开发和生产环境，有效处理批处理（batch）和流式数据（streaming data）。资料来源：[README.md:3]()\n\nPathway的一个显著特点是**代码一致性**：同一套代码可用于本地开发、CI/CD测试、运行批处理任务、处理流回放（stream replays）以及处理数据流。资料来源：[README.md:4]()\n\n## 核心技术架构\n\nPathway的技术架构采用分层设计，融合了多种分布式计算范式：\n\n```mermaid\ngraph TD\n    A[Python API层] --> B[PyO3绑定层]\n    B --> C[Rust执行引擎]\n    C --> D[Timely Dataflow分布式计算]\n    C --> E[Differential Dataflow增量计算]\n    D --> F[增量数据流处理]\n    E --> F\n```\n\n### Python API层\n\nPathway提供简洁的Python接口，通过PyO3实现Python与Rust的双向绑定。Python API层支持多种数据类型和连接器配置。资料来源：[src/python_api.rs:1-80]()\n\n核心Python绑定类包括：\n\n| 类名 | 功能说明 |\n|------|----------|\n| `PythonSubject` | Python主题定义，支持start、read、seek等回调 |\n| `ValueField` | 值字段定义，包含名称、类型、来源和默认值 |\n| `Table` | 数据表核心抽象 |\n| `ConnectorProperties` | 连接器通用属性 |\n| `CsvParserSettings` | CSV解析器配置 |\n| `AwsS3Settings` | AWS S3连接配置 |\n\n### Rust执行引擎\n\nPathway的执行引擎由Rust实现，基于Differential Dataflow和Timely Dataflow构建。Rust引擎负责高性能的数据处理和计算。资料来源：[README.md:5]()\n\n引擎通过`pymodule`导出Python可调用的类和函数，实现Python语法的易用性与Rust的高性能完美结合。资料来源：[src/python_api.rs:100-150]()\n\n### Timely Dataflow分布式计算\n\nPathway使用Timely Dataflow作为底层分布式计算框架。Timely Dataflow提供了数据流图的处理能力，支持并行和分布式计算。资料来源：[external/timely-dataflow/timely/src/progress/subgraph.rs:1-30]()\n\nTimely Dataflow的核心组件包括：\n\n```mermaid\ngraph LR\n    A[数据输入] --> B[Scope子图]\n    B --> C[PerOperatorState]\n    C --> D[Antichain进度跟踪]\n    D --> E[ChangeBatch消息]\n    E --> F[数据输出]\n```\n\n关键数据结构：\n- **Activations**：共享激活状态管理\n- **MutableAntichain**：可变反链，用于跟踪进度边界\n- **ChangeBatch**：变化批次，用于记录数据变化\n\n### Differential Dataflow增量计算\n\nDifferential Dataflow是Pathway实现增量计算的核心技术。与传统批处理不同，Differential Dataflow能够智能处理数据的变化，仅重新计算受影响的部分。资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:1-30]()\n\n#### 轨迹追踪器（Trace）\n\nPathway使用轨迹（Trace）来记录历史数据状态，支持时间旅行查询和增量更新。资料来源：[external/differential-dataflow/src/trace/wrappers/freeze.rs:1-30]()\n\n```mermaid\ngraph TD\n    A[数据变化] --> B[Trace记录]\n    B --> 
C[时间戳管理]\n    C --> D[Antichain since边界]\n    D --> E[增量计算引擎]\n    E --> F[输出更新]\n```\n\n#### 有序批处理\n\nPathway实现了高效的有序批处理结构`OrdKeyBatch`和`OrdValBatch`，用于存储键值对的时间序列数据。资料来源：[external/differential-dataflow/src/trace/implementations/ord.rs:1-50]()\n\n| 组件 | 说明 |\n|------|------|\n| `OrdKeyCursor` | 有序键游标，用于遍历键数据 |\n| `OrdValCursor` | 有序键值游标，用于遍历键值对 |\n| `BatchContainer` | 批量容器接口 |\n| `MergeBuilder` | 合并构建器 |\n\n## 主要功能特性\n\n### 连接器支持\n\nPathway提供丰富的连接器支持，覆盖多种数据源和数据格式：\n\n| 连接器类型 | 支持的数据源 |\n|------------|--------------|\n| 文件系统 | CSV、Parquet等 |\n| 云存储 | AWS S3、Azure Blob Storage |\n| 数据库 | PostgreSQL（包括复制支持） |\n| 搜索引擎 | Elasticsearch |\n| 流处理 | MQTT、Kafka、Debezium |\n| 其他 | Schema Registry、Iceberg |\n\n资料来源：[src/python_api.rs:80-120]()\n\n### PostgreSQL连接器\n\nPathway的PostgreSQL连接器支持完整的二进制协议，实现高效的数据读写：\n\n- 支持多种数据类型映射（BOOL、INT、Float、TEXT、JSONB等）\n- 二进制数组格式写入\n- 复制槽（Replication Slot）支持逻辑复制\n- Debezium CDC集成\n\n资料来源：[src/connectors/postgres.rs:1-60]()\n\n### 数据格式与解析\n\nPathway提供灵活的数据格式配置，例如通过`ValueField`定义每个字段的名称、类型、来源和默认值：\n\n```python\nValueField(\n    name=field_name,\n    type_=field_type,\n    source=FieldSource.Payload,\n    default=None\n)\n```\n\n支持的数据类型包括：INT、FLOAT、STRING、BOOL、TIMESTAMP、JSON等。\n\n### 持久化与状态管理\n\nPathway支持灵活的数据持久化配置：\n\n| 配置项 | 说明 |\n|--------|------|\n| `PersistenceConfig` | 持久化基础配置 |\n| `BackfillingThreshold` | 回填阈值配置 |\n| `PySnapshotAccess` | 快照访问模式 |\n| `PySnapshotEvent` | 快照事件类型 |\n\n## 应用场景\n\n### ETL管道\n\nPathway特别适合构建ETL（Extract-Transform-Load）管道，支持从多种数据源提取数据、进行转换处理、加载到目标系统。\n\n```mermaid\ngraph LR\n    A[数据源] -->|Extract| B[Pathway管道]\n    B -->|Transform| C[增量处理]\n    C -->|Load| D[目标系统]\n```\n\n### 实时分析\n\n基于Differential Dataflow的增量计算能力，Pathway能够高效处理实时数据分析任务，仅处理数据流中的增量变化部分。\n\n### LLM管道与RAG\n\nPathway为构建LLM（Large Language Model）管道和RAG（Retrieval-Augmented Generation）应用提供专用支持，能够实时索引和检索外部数据源。\n\n## 开发与部署\n\n### 开发环境\n\nPathway支持本地开发环境，开发者可以使用熟悉的Python工具链进行开发。资料来源：[examples/templates/el-pipeline/README.md:1-20]()\n\n### 许可证\n\nPathway采用BSL（Business Source 
License）许可证发布，允许非商业使用和评估。商业使用需要获取正式许可证。资料来源：[README.md:45]()\n\n### 许可证配置\n\n部署时需要设置环境变量：\n\n```bash\nexport PATHWAY_LICENSE_KEY=your_pathway_key\n```\n\n## 技术优势总结\n\n| 优势 | 描述 |\n|------|------|\n| **高性能** | Rust引擎提供接近原生的执行效率 |\n| **增量计算** | Differential Dataflow避免重复计算 |\n| **统一API** | Python接口简化开发，Rust保证性能 |\n| **多数据源** | 丰富的内置连接器支持 |\n| **可扩展性** | 基于Timely Dataflow支持分布式部署 |\n| **开发友好** | 支持本地开发和生产环境一致体验 |\n\n---\n\n<a id='overview-installation'></a>\n\n## 安装指南\n\n### 相关页面\n\n相关主题：[基础示例](#quickstart-basic-example)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [pyproject.toml](https://github.com/pathwaycom/pathway/blob/main/pyproject.toml)\n- [rust-toolchain.toml](https://github.com/pathwaycom/pathway/blob/main/rust-toolchain.toml)\n- [docs/2.developers/4.user-guide/10.introduction/20.installation.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/10.introduction/20.installation.md)\n</details>\n\n# 安装指南\n\nPathway 是一个基于 Python 的 ETL 框架，由 Rust 引擎驱动，支持流处理和实时分析。本指南将帮助你在不同环境下正确安装和配置 Pathway。\n\n## 系统要求\n\n### 硬件和操作系统要求\n\n| 要求类型 | 最低配置 | 推荐配置 |\n|---------|---------|---------|\n| 处理器 | 2 核 | 4 核或以上 |\n| 内存 | 4 GB | 8 GB 或以上 |\n| 磁盘空间 | 500 MB | 1 GB 或以上 |\n| 操作系统 | Linux/macOS/Windows | Linux/macOS |\n\n### Python 版本要求\n\nPathway 需要 Python 3.10 或更高版本。建议使用 Python 3.11 以获得最佳性能。\n\n| Python 版本 | 支持状态 |\n|------------|---------|\n| 3.10 | ✅ 支持 |\n| 3.11 | ✅ 推荐 |\n| 3.12 | ✅ 支持 |\n\n## 安装方式\n\n### 使用 pip 安装\n\n通过 pip 包管理器安装 Pathway 是最简单的方式：\n\n```bash\npip install pathway\n```\n\n### 从源码安装\n\n对于需要最新功能或进行开发调试的用户，可以从源码编译安装：\n\n```bash\n# 克隆仓库\ngit clone https://github.com/pathwaycom/pathway.git\ncd pathway\n\n# 安装 Rust 依赖和 Python 包\npip install -e .\n```\n\n## 依赖环境\n\n### Rust 工具链\n\nPathway 的核心引擎使用 Rust 编写，需要 Rust 工具链支持。\n\n#### 安装 Rust\n\n```bash\n# Linux/macOS\ncurl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh\n\n# Windows - 使用 rustup 安装程序\nwinget install Rustlang.Rustup\n```\n\n#### 验证 Rust 
安装\n\n```bash\nrustc --version\ncargo --version\n```\n\n项目使用 `rust-toolchain.toml` 文件指定 Rust 版本，确保编译器版本一致性。\n\n### Python 依赖\n\nPathway 的 Python 依赖在 `pyproject.toml` 中定义；引擎侧的核心依赖是以下 Rust crate（由 Cargo 管理，而不是 pip 包）：\n\n| 依赖包 | 版本要求 | 用途 |\n|-------|---------|------|\n| pyo3 | 最新稳定版 | Python 绑定 |\n| timely-dataflow | 外部依赖 | 数据流处理 |\n| differential-dataflow | 外部依赖 | 增量计算 |\n\n## 环境配置\n\n### 环境变量\n\nPathway 运行可能需要的可选环境变量：\n\n```bash\n# 设置 Pathway 许可证密钥（商业功能需要）\nexport PATHWAY_LICENSE_KEY=your_pathway_key\n\n# 可选：设置 Python 路径\nexport PYTHONPATH=/path/to/pathway:$PYTHONPATH\n```\n\n### 验证安装\n\n安装完成后，可以通过以下方式验证：\n\n```python\nimport pathway as pw\n\n# 打印版本信息\nprint(pw.__version__)\n\n# 创建一个简单的静态表并计算输出，验证引擎可以运行\ntable = pw.debug.table_from_markdown('''\na | b\n1 | 2\n''')\npw.debug.compute_and_print(table)\nprint(\"Pathway 安装成功！\")\n```\n\n## 平台特定说明\n\n### Linux\n\nLinux 环境需要确保以下系统包已安装：\n\n```bash\n# Debian/Ubuntu\nsudo apt-get update\nsudo apt-get install -y python3-dev python3-pip cargo\n```\n\n### macOS\n\nmacOS 用户建议使用 Homebrew 安装依赖：\n\n```bash\nbrew install python rust\npip3 install pathway\n```\n\n### Windows\n\nPathway 官方仅支持 macOS 和 Linux，目前不提供原生 Windows 版本。Windows 用户可以通过以下方式运行 Pathway：\n\n1. 使用 WSL（Windows Subsystem for Linux），并按上面的 Linux 步骤安装\n2. 使用 Docker 容器运行 Pathway\n3. 使用 Linux 虚拟机\n\n## 常见问题\n\n### 编译错误\n\n如果遇到编译错误，确保 Rust 工具链是最新版本：\n\n```bash\nrustup update\ncargo clean\npip install --force-reinstall pathway\n```\n\n### 导入错误\n\n如果遇到 `ModuleNotFoundError`，检查 Python 路径和安装状态：\n\n```bash\npip show pathway\npython -c \"import pathway; print(pathway.__file__)\"\n```\n\n## 下一步\n\n安装完成后，你可以开始使用 Pathway：\n\n1. 阅读[快速开始指南](./quick-start.md)了解基本用法\n2. 查看[用户指南](../user-guide/introduction/welcome)深入学习\n3. 
参考 [API 文档](https://pathway.com/developers/api-docs/pathway)了解完整功能\n\n---\n\n<a id='quickstart-basic-example'></a>\n\n## 基础示例\n\n### 相关页面\n\n相关主题：[表操作](#trans-table-operations), [连接器概述](#connectors-overview)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md)\n- [examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md)\n- [examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n</details>\n\n# 基础示例\n\nPathway 是一个 Python ETL 框架，用于流处理、实时分析、LLM 管道和 RAG（检索增强生成）应用开发。本文档介绍 Pathway 的基础概念、核心组件和入门示例。\n\n## 概述\n\nPathway 由 Python 前端和 Rust 后端引擎组成，通过 PyO3 实现 Python 与 Rust 的互操作。Python API 提供了简洁易用的接口，而 Rust 引擎基于 Differential Dataflow 进行增量计算，确保高性能和一致性。\n\n资料来源：[README.md:1-15]()\n\n## 核心概念\n\n### 数据流架构\n\n```mermaid\ngraph TD\n    A[数据源 Connectors] --> B[PythonSubject 输入]\n    B --> C[Rust 引擎处理]\n    C --> D[增量计算 Differential Dataflow]\n    D --> E[PyExportedTable 输出]\n    E --> F[快照 Snapshot]\n```\n\nPathway 的核心数据流遵循：数据源 → PythonSubject → Rust 引擎 → 增量计算 → ExportedTable 的模式。\n\n资料来源：[src/python_api.rs:35-45]()\n\n### PythonSubject 核心类\n\n`PythonSubject` 是 Python 端的数据输入接口，用于定义数据源的读取行为：\n\n| 参数 | 类型 | 说明 |\n|------|------|------|\n| `start` | `Py<PyAny>` | 起始处理函数 |\n| `read` | `Py<PyAny>` | 读取数据函数 |\n| `seek` | `Py<PyAny>` | 定位函数 |\n| `on_persisted_run` | `Py<PyAny>` | 持久化运行回调 |\n| `end` | `Py<PyAny>` | 结束处理函数 |\n| `is_internal` | `bool` | 是否为内部数据源 |\n| `deletions_enabled` | `bool` | 是否启用删除 
|\n\n资料来源：[src/python_api.rs:25-33]()\n\n### ValueField 模式字段\n\n`ValueField` 定义表的 schema 字段结构：\n\n| 属性 | 类型 | 说明 |\n|------|------|------|\n| `name` | `String` | 字段名称 |\n| `type_` | `Type` | 数据类型 |\n| `source` | `FieldSource` | 字段来源（默认 Payload） |\n| `default` | `Option<Value>` | 默认值 |\n| `metadata` | `Option<String>` | 元数据 |\n\n资料来源：[src/python_api.rs:52-61]()\n\n### PyExportedTable 导出表\n\n`PyExportedTable` 提供数据输出和快照功能：\n\n| 方法 | 返回类型 | 说明 |\n|------|----------|------|\n| `frontier()` | `TotalFrontier<Timestamp>` | 获取当前前沿 |\n| `snapshot_at(frontier)` | `Vec<(Key, Vec<Value>)>` | 在指定前沿获取快照 |\n| `failed()` | `bool` | 检查处理是否失败 |\n\n资料来源：[src/python_api.rs:106-115]()\n\n### Done 标记值\n\n`Done` 是时间线末端的标记值，用于表示计算完成状态。它是一个冻结的 Python 对象，具有特殊的比较和哈希行为：\n\n- 与 `Done` 比较时返回 `Equal`\n- 与整数比较时返回 `Greater`\n- 支持自定义哈希实现\n\n资料来源：[src/python_api.rs:176-196]()\n\n## 时间戳与前沿\n\n### Timestamp 时间戳\n\nPathway 使用 `Timestamp` 进行时间管理，实现 `OriginalOrRetraction` 和 `NextRetractionTime` trait：\n\n```mermaid\ngraph LR\n    A[原始数据] --> B[偶数时间戳]\n    A --> C[ retraction 数据]\n    C --> D[奇数时间戳]\n    D --> E[下一 retraction 时间]\n```\n\n- `is_original()`: 偶数时间戳表示原始数据\n- `next_retraction_time()`: 计算下一个 retraction 时间（当前时间 + 1）\n\n资料来源：[src/engine/timestamp.rs:45-55]()\n\n### TotalFrontier 前沿状态\n\n`TotalFrontier<T>` 是表示计算前沿的枚举类型：\n\n| 变体 | 说明 |\n|------|------|\n| `At(T)` | 在指定时间点 |\n| `Done` | 计算已完成 |\n\n该类型实现了 `FromPyObject` 和 `IntoPyObject`，支持 Python 和 Rust 间的无缝转换。\n\n资料来源：[src/python_api.rs:140-175]()\n\n## 基础使用模式\n\n### 典型项目结构\n\n```\nproject/\n├── app.py              # 主程序\n├── data/               # 数据目录\n│   ├── docs/           # 文档输入\n│   └── output/         # 结果输出\n├── .env                # 环境变量\n└── requirements.txt    # 依赖\n```\n\n资料来源：[examples/projects/question-answering-rag/README.md:1-30]()\n\n### 环境配置\n\n安装 Pathway：\n\n```bash\npip install pathway[xpack-llm]\n```\n\n设置环境变量：\n\n```bash\nexport PATHWAY_LICENSE_KEY=your_pathway_key\n```\n\n或创建 `.env` 
文件：\n\n```\nOPENAI_API_KEY=your_openai_api_key_here\n```\n\n资料来源：[examples/projects/question-answering-rag/README.md:25-30]()\n\n### 运行应用\n\n直接运行 Python 脚本：\n\n```bash\npython app.py\n```\n\n使用 Pathway CLI：\n\n```bash\npathway spawn python main.py\n```\n\n多线程运行：\n\n```bash\npathway spawn --threads 3 python main.py\n```\n\n资料来源：[README.md:120-135]()\n\n### HTTP 服务集成\n\nPathway 提供 HTTP 服务器用于接收查询和返回结果：\n\n```python\nwebserver = pw.io.http.PathwayWebserver(host=\"0.0.0.0\", port=8011)\n```\n\n提交查询：\n\n```bash\ncurl --data '{ \"messages\": \"What is the value of X?\"}' http://localhost:8011\n```\n\n资料来源：[examples/projects/question-answering-rag/README.md:15-20]()\n\n## 注册的 Python API 类\n\nPathway 通过 `add_class` 注册以下核心类到 `pathway.engine` 模块：\n\n| 类别 | 类名 |\n|------|------|\n| 连接器设置 | `AwsS3Settings`, `AzureBlobStorageSettings`, `MqttSettings`, `PsqlReplicationSettings` |\n| 解析器 | `CsvParserSettings`, `PySchemaRegistrySettings` |\n| 存储格式 | `DataStorage`, `DataFormat`, `PersistenceConfig` |\n| 索引 | `IcebergCatalogSettings`, `ElasticSearchParams` |\n| 内部类 | `PythonSubject`, `ValueField`, `PyExportedTable`, `Done`, `Trace` |\n| 配置 | `ConnectorProperties`, `ColumnProperties`, `TableProperties`, `TelemetryConfig` |\n| 快照 | `PySnapshotAccess`, `PySnapshotEvent`, `PyPersistenceMode` |\n| 向量索引 | `PyExternalIndexFactory`, `PyExternalIndexData`, `PyExternalIndexQuery` |\n\n资料来源：[src/python_api.rs:220-250]()\n\n## 文档与支持\n\n完整文档访问：[pathway.com/developers/](https://pathway.com/developers/user-guide/introduction/welcome)\n\n如有问题，可通过以下方式获取帮助：\n\n- [GitHub Issues](https://github.com/pathwaycom/pathway/issues)\n- [Discord 社区](https://discord.com/invite/pathway)\n- 邮箱：contact@pathway.com\n\n资料来源：[examples/templates/el-pipeline/README.md:45-50]()\n\n## 下一步\n\n- 学习 Pathway 的连接器配置\n- 了解实时数据处理的高级特性\n- 探索 RAG 和 LLM 集成的最佳实践\n- 查看 `examples/projects/` 目录下的完整示例项目\n\n---\n\n<a id='arch-engine-overview'></a>\n\n## 引擎架构\n\n### 
相关页面\n\n相关主题：[Python与Rust集成](#arch-python-rust-integration)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/engine/dataflow.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/dataflow.rs)\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n- [external/differential-dataflow/src/operators/join.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/join.rs)\n- [external/differential-dataflow/src/operators/reduce.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/reduce.rs)\n- [external/differential-dataflow/src/trace/wrappers/freeze.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/wrappers/freeze.rs)\n- [external/differential-dataflow/src/operators/arrange/writer.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/writer.rs)\n- [external/timely-dataflow/timely/src/progress/change_batch.rs](https://github.com/pathwaycom/pathway/blob/main/external/timely-dataflow/timely/src/progress/change_batch.rs)\n</details>\n\n# 引擎架构\n\n## 概述\n\nPathway 的引擎架构是一个基于 **Rust 的高性能计算引擎**，构建于 Differential Dataflow 和 Timely Dataflow 之上。这一架构使得 Pathway 能够以 Python API 的形式提供简洁易用的接口，同时在底层实现增量计算和流式处理。引擎的核心设计理念是将 Python 层的灵活表达与 Rust 引擎的高性能执行相结合，通过 PyO3 实现 Python 与 Rust 之间的无缝桥接。\n\nPathway 引擎支持多种数据类型和处理模式，包括静态数据（batch）和动态数据（streaming），能够在同一个代码库中处理历史数据和实时数据流。引擎内置的增量计算机制确保只有发生变化的数据才会被重新处理，从而大幅提升计算效率。\n\n## 核心架构组件\n\n### 计算引擎层次结构\n\nPathway 引擎采用三层架构设计，从上到下依次为：\n\n| 层次 | 组件 | 职责 |\n|------|------|------|\n| Python API 层 | `pathway.engine` 模块 | 提供 Python 接口，暴露核心类和函数 |\n| PyO3 绑定层 | `src/python_api.rs` | 实现 Python 对象到 Rust 对象的转换 |\n| Rust 核心层 | `src/engine/` | 实现 Differential 
Dataflow 计算引擎 |\n\n资料来源：[src/python_api.rs:1-50]()\n\n### 核心类结构\n\nPathway 引擎通过 PyO3 将以下核心 Rust 结构暴露给 Python：\n\n```python\n# Python API 层暴露的核心类\nTable           # 数据表，表示一组带时间戳的记录\nColumn          # 列，表示数据表中的一列\nUniverse        # 宇宙，表示一组唯一标识符\nContext         # 上下文，管理数据流计算的执行环境\nScope           # 作用域，定义计算的数据流范围\nDataRow         # 数据行，表示表中的一行数据\nComputer        # 计算器，定义如何计算派生列\n```\n\n资料来源：[src/python_api.rs:85-110]()\n\n## 数据流计算模型\n\n### Differential Dataflow 集成\n\nPathway 引擎的核心计算模型基于 **Differential Dataflow**，这是一种用于增量数据流计算的编程模型。Differential Dataflow 能够高效处理数据的插入、更新和删除操作，通过追踪数据变化而非重新计算整个结果来实现高性能。\n\n```mermaid\ngraph TD\n    A[输入数据流] --> B[Input Session]\n    B --> C[Differential Collection]\n    C --> D[Arrangement]\n    D --> E[Operators<br/>join/reduce/count]\n    E --> F[输出 Collection]\n    F --> G[Incremental 结果]\n    \n    H[变化追踪] --> C\n```\n\n资料来源：[external/differential-dataflow/src/operators/reduce.rs:1-50]()\n\n### 时间戳与版本管理\n\nPathway 使用复合时间戳来追踪数据变化，每个数据记录都带有时间戳和差异值（multiplicity）。时间戳必须实现 `Lattice` trait，支持部分有序比较，这使得系统能够正确处理并发和乱序数据。\n\n```rust\n// 时间戳必须是 Lattice，用于支持增量计算的合并\nimpl<G, T1> JoinCore<G, T1::Key, T1::Val, T1::R> for Arranged<G, T1>\nwhere\n    G: Scope,\n    G::Timestamp: Lattice+Ord+Debug,\n    // ...\n```\n\n资料来源：[external/differential-dataflow/src/operators/join.rs:1-30]()\n\n### 数据流图结构\n\nPathway 引擎内部维护一个数据流图（Dataflow Graph），用于协调各个算子之间的数据传递：\n\n```mermaid\ngraph LR\n    subgraph DataflowGraphInner\n        A[BeforeIterate] --> B[Scope]\n        B --> C[Child Scope]\n        C --> D[Operators]\n        D --> E[AfterIterate]\n    end\n    \n    F[持久化数据] --> A\n    A --> G[表创建]\n    G --> D\n```\n\n资料来源：[src/engine/dataflow.rs:1-60]()\n\n## 核心算子实现\n\n### Join 算子\n\nJoin 是 Pathway 中最常用的算子之一，用于合并两个数据集。引擎使用 `join_core_internal_unsafe` 方法实现高效的连接操作：\n\n```rust\n// Join 操作的核心实现\nfn join_core<Tr2,I,L>(&self, stream2: &Arranged<G,Tr2>, result: L) -> Collection<G,D,ROut>\nwhere\n    Tr2: TraceReader<Key=K, Time=G::Timestamp>+Clone+'static,\n    // ... 
配置连接参数\n{\n    self.arrange_by_key().join_core_internal_unsafe(stream2, result)\n}\n```\n\n资料来源：[external/differential-dataflow/src/operators/join.rs:20-35]()\n\n### Reduce 算子\n\nReduce 算子用于对数据进行聚合操作，支持自定义归约器实现：\n\n```rust\nimpl<S: MaybeTotalScope, R: ReducerImpl> DataflowReducer<S> for R\nwhere\n    Collection<S, (Key, Option<<R as ReducerImpl>::State>)>:\n        Into<PersistableCollection<S>> + From<PersistableCollection<S>>,\n{\n    fn reduce(\n        self: Rc<Self>,\n        values: &Collection<S, (Key, Key, Vec<Value>)>,\n        error_logger: Rc<dyn LogError>,\n        _trace: Trace,\n        graph: &mut DataflowGraphInner<S>,\n    ) -> Result<Values<S>> {\n        // 实现聚合逻辑\n    }\n}\n```\n\n资料来源：[src/engine/dataflow.rs:80-110]()\n\n### Count 算子\n\nCount 是 Differential Dataflow 内置的聚合操作，用于计算每个键的出现次数：\n\n```rust\n// Count 扩展 trait 实现\npub trait Count<G: Scope, K: Data, R: Semigroup> where G::Timestamp: Lattice+Ord {\n    fn count(&self) -> Collection<G, (K, R), isize> {\n        self.count_core()\n    }\n    \n    fn count_core<R2: Abelian + From<i8>>(&self) -> Collection<G, (K, R), R2> {\n        self.reduce_abelian::<_,DefaultValTrace<_,_,_,_>>(\n            \"Count\", \n            |_k, s, t| t.push((s[0].1.clone(), R2::from(1i8)))\n        ).as_collection(|k, c| (k.clone(), c.clone()))\n    }\n}\n```\n\n资料来源：[external/differential-dataflow/src/operators/reduce.rs:30-80]()\n\n## Arrangement 数据结构\n\n### Arrangement 概念\n\nArrangement 是 Differential Dataflow 中的核心数据结构，用于组织和索引数据以支持高效的后续操作。Arrangement 本质上是一个按 key 排序的 trace 结构：\n\n```mermaid\ngraph TD\n    A[输入数据] --> B[ArrangeByKey]\n    B --> C[Trace Agent]\n    C --> D[Trace Spine]\n    D --> E[多个 Batch]\n    \n    F[查询] --> G[Cursor 遍历]\n    G --> D\n```\n\n资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:1-40]()\n\n### Trace Writer 实现\n\nTrace Writer 负责向 arrangement 中写入数据批次：\n\n```rust\npub fn new(\n    upper: Vec<Tr::Time>,\n    trace: Weak<RefCell<TraceBox<Tr>>>,\n    queues: 
Rc<RefCell<Vec<TraceAgentQueueWriter<Tr>>>>\n) -> Self {\n    let mut temp = Antichain::new();\n    temp.extend(upper.into_iter());\n    Self { upper: temp, trace, queues }\n}\n\npub fn exert(&mut self, fuel: &mut isize) {\n    if let Some(trace) = self.trace.upgrade() {\n        trace.borrow_mut().trace.exert(fuel);\n    }\n}\n```\n\n资料来源：[external/differential-dataflow/src/operators/arrange/writer.rs:1-40]()\n\n### Trace Freeze 包装器\n\nTrace Freeze 是一个特殊的 trace 包装器，用于在特定条件下冻结时间：\n\n```rust\nimpl<Tr, F> TraceReader for TraceFreeze<Tr, F>\nwhere\n    Tr: TraceReader,\n    Tr::Time: Lattice+Clone+'static,\n    F: Fn(&Tr::Time)->Option<Tr::Time>+'static,\n{\n    type Key = Tr::Key;\n    type Val = Tr::Val;\n    type Time = Tr::Time;\n    type R = Tr::R;\n    \n    type Batch = BatchFreeze<Tr::Batch, F>;\n    type Cursor = CursorFreeze<Tr::Cursor, F>;\n}\n```\n\n资料来源：[external/differential-dataflow/src/trace/wrappers/freeze.rs:1-50]()\n\n## 增量计算机制\n\n### 变化批次处理\n\nPathway 使用 `ChangeBatch` 来追踪数据的变化，每个变化包含时间戳和差异值：\n\n```rust\npub fn into_inner(mut self) -> Vec<(T, i64)> {\n    self.compact();\n    self.updates\n}\n\npub fn iter(&mut self) -> ::std::slice::Iter<(T, i64)> {\n    self.compact();\n    self.updates.iter()\n}\n```\n\n资料来源：[external/timely-dataflow/timely/src/progress/change_batch.rs:60-90]()\n\n### 持久化与状态管理\n\n引擎支持对计算状态进行持久化，通过 `PersistenceConfig` 配置持久化行为：\n\n| 配置项 | 类型 | 说明 |\n|--------|------|------|\n| `persistence_mode` | `PyPersistenceMode` | 持久化模式（禁用/异步/同步） |\n| `snapshot_access` | `PySnapshotAccess` | 快照访问策略 |\n| `snapshot_event` | `PySnapshotEvent` | 快照触发事件 |\n\n资料来源：[src/python_api.rs:1-30]()\n\n## Python 绑定实现\n\n### PyO3 类导出\n\nPathway 通过 PyO3 将 Rust 结构导出为 Python 类：\n\n```rust\n#[pymethods]\nimpl ValueField {\n    #[new]\n    #[pyo3(signature = (name, type_, source = FieldSource::Payload))]\n    pub fn new(name: String, type_: Type, source: FieldSource) -> Self {\n        Self {\n            name,\n            type_,\n            source,\n        }\n 
   }\n}\n```\n\n资料来源：[src/python_api.rs:140-160]()\n\n### PythonSubject 绑定\n\nPythonSubject 允许 Python 代码定义自定义的数据源：\n\n```rust\n#[pyclass(module = \"pathway.engine\")]\npub struct PythonSubject {\n    pub start: Py<PyAny>,\n    pub read: Py<PyAny>,\n    pub seek: Py<PyAny>,\n    pub on_persisted_run: Py<PyAny>,\n    pub end: Py<PyAny>,\n    pub is_internal: bool,\n    pub deletions_enabled: bool,\n}\n```\n\n资料来源：[src/python_api.rs:95-130]()\n\n## 数据模型\n\n### 表结构\n\n| 组件 | 说明 |\n|------|------|\n| `TableProperties` | 表级别的属性配置 |\n| `ColumnProperties` | 列级别的属性配置 |\n| `ConnectorProperties` | 连接器配置 |\n| `Trace` | 追踪配置 |\n| `Done` | 完成标记 |\n\n资料来源：[src/python_api.rs:25-40]()\n\n### 值字段定义\n\n```rust\n#[pyclass(module = \"pathway.engine\")]\n#[derive(Clone)]\npub struct ValueField {\n    #[pyo3(get)]\n    pub name: String,\n    #[pyo3(get)]\n    pub type_: Type,\n    #[pyo3(get)]\n    pub source: FieldSource,\n    #[pyo3(get)]\n    pub default: Option<Value>,\n    #[pyo3(get)]\n    pub metadata: Option<String>,\n}\n```\n\n资料来源：[src/python_api.rs:145-165]()\n\n## 关键配置选项\n\n### 连接器模式\n\n| 模式 | 说明 |\n|------|------|\n| `STATIC` | 静态数据源，读取一次 |\n| `STREAMING` | 流式数据源，持续监听 |\n| `REPLAY` | 回放模式，重演历史数据 |\n\n### 监控级别\n\n| 级别 | 说明 |\n|------|------|\n| `OFF` | 禁用监控 |\n| `BASIC` | 基础指标 |\n| `DETAILED` | 详细指标 |\n\n资料来源：[src/python_api.rs:85-100]()\n\n## 工作流程图\n\n```mermaid\ngraph TD\n    A[Python 代码定义数据流] --> B[PyO3 调用 Rust 引擎]\n    B --> C[创建 DataflowGraph]\n    C --> D[初始化 Scope 和 Universe]\n    D --> E[注册数据源和算子]\n    E --> F[启动数据流执行]\n    \n    F --> G{数据输入模式}\n    G -->|批量| H[Batch Input]\n    G -->|流式| I[Stream Input]\n    \n    H --> J[计算 operators]\n    I --> J\n    \n    J --> K[生成 Arrangement]\n    K --> L[输出结果]\n    \n    J --> M{状态变化}\n    M -->|有变化| N[触发增量计算]\n    M -->|无变化| O[跳过重算]\n    N --> K\n```\n\n## 总结\n\nPathway 的引擎架构通过深度整合 Differential Dataflow 和 Timely Dataflow，实现了一个高性能、可扩展的增量计算引擎。核心设计特点包括：\n\n1. **双层语言架构**：Python API 提供易用性，Rust 引擎保证性能\n2. 
**增量计算模型**：通过时间戳和差异值追踪变化，避免重复计算\n3. **灵活的 Arrangement**：支持高效的 join、reduce 等操作\n4. **完整的持久化支持**：内置状态管理和快照机制\n5. **丰富的连接器**：支持多种数据源和输出目标\n\n这种架构使 Pathway 能够同时满足开发和生产环境的需求，支持从本地开发到大规模分布式部署的多种场景。\n\n---\n\n<a id='arch-python-rust-integration'></a>\n\n## Python与Rust集成\n\n### 相关页面\n\n相关主题：[引擎架构](#arch-engine-overview)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- [external/differential-dataflow/src/trace/wrappers/freeze.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/wrappers/freeze.rs)\n- [external/timely-dataflow/mdbook/src/chapter_2/chapter_2_5.md](https://github.com/pathwaycom/pathway/blob/main/external/timely-dataflow/mdbook/src/chapter_2/chapter_2_5.md)\n</details>\n\n# Python与Rust集成\n\nPathway是一个基于Python的ETL框架，但其核心计算引擎由Rust实现。这种架构通过PyO3库实现Python与Rust的无缝集成，使得开发者可以使用简洁的Python API编写数据处理逻辑，同时享受Rust带来的高性能和内存安全性。\n\n## 技术架构概述\n\nPathway采用双层架构设计：上层是用户编写的Python代码，下层是Rust实现的增量计算引擎。Differential Dataflow算法驱动核心数据流处理，而Timely Dataflow提供分布式计算能力。\n\n```mermaid\ngraph TD\n    A[Python用户代码] --> B[PyO3绑定层]\n    B --> C[Rust引擎核心]\n    C --> D[Differential Dataflow]\n    C --> E[Timely Dataflow]\n    D --> F[增量计算]\n    E --> G[分布式数据流]\n    F --> H[结果返回Python]\n    G --> H\n```\n\n## PyO3绑定机制\n\nPathway使用PyO3库实现Python与Rust之间的双向通信。PyO3提供了从Rust向Python暴露类型和函数的能力，同时也支持从Python代码调用Rust函数。\n\n### 核心数据结构绑定\n\n#### PythonSubject结构体\n\n`PythonSubject`是Pathway中用于连接外部数据源的关键结构体，它将Python端的回调函数包装为Rust可调用的对象：\n\n```rust\n#[pyclass(module = \"pathway.engine\")]\n#[derive(Clone)]\npub struct PythonSubject {\n    pub start: Py<PyAny>,\n    pub read: Py<PyAny>,\n    pub seek: 
Py<PyAny>,\n    pub on_persisted_run: Py<PyAny>,\n    pub end: Py<PyAny>,\n    pub is_internal: bool,\n    pub deletions_enabled: bool,\n}\n```\n\n资料来源：[src/python_api.rs:1-15]()\n\n| 参数 | 类型 | 说明 |\n|------|------|------|\n| `start` | `Py<PyAny>` | 起始位置回调函数 |\n| `read` | `Py<PyAny>` | 读取数据回调函数 |\n| `seek` | `Py<PyAny>` | 定位回调函数 |\n| `on_persisted_run` | `Py<PyAny>` | 持久化运行回调 |\n| `end` | `Py<PyAny>` | 结束位置回调 |\n| `is_internal` | `bool` | 是否为内部数据源 |\n| `deletions_enabled` | `bool` | 是否启用删除操作 |\n\n#### ValueField结构体\n\n`ValueField`用于定义表字段的元数据，包括字段名、类型、来源和默认值：\n\n```rust\n#[pyclass(module = \"pathway.engine\", frozen)]\n#[derive(Clone)]\npub struct ValueField {\n    #[pyo3(get)]\n    pub name: String,\n    #[pyo3(get)]\n    pub type_: Type,\n    #[pyo3(get)]\n    pub source: FieldSource,\n    #[pyo3(get)]\n    pub default: Option<Value>,\n    #[pyo3(get)]\n    pub metadata: Option<String>,\n}\n```\n\n资料来源：[src/python_api.rs:36-48]()\n\n### PyO3类型转换\n\nPathway实现了Python与Rust之间的自动类型转换，通过`IntoPyObject`和`FromPyObject`trait实现。\n\n#### Timestamp类型转换\n\n```rust\nimpl<'py> IntoPyObject<'py> for Timestamp {\n    type Target = PyAny;\n    type Output = Bound<'py, Self::Target>;\n    type Error = PyErr;\n    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {\n        self.0.into_bound_py_any(py)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:40-50]()\n\n#### TotalFrontier枚举转换\n\n`TotalFrontier`表示计算的前沿边界，包含`At`和`Done`两种状态：\n\n```rust\nimpl<'py, T> FromPyObject<'py> for TotalFrontier<T>\nwhere\n    T: FromPyObject<'py>,\n{\n    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {\n        if ob.is_instance_of::<Done>() {\n            Ok(TotalFrontier::Done)\n        } else {\n            Ok(TotalFrontier::At(ob.extract()?))\n        }\n    }\n}\n```\n\n资料来源：[src/python_api.rs:180-192]()\n\n## 错误处理机制\n\nPathway通过PyO3暴露多种异常类型供Python端捕获和处理。\n\n### 异常类型体系\n\n| 异常类型 | 说明 | 使用场景 |\n|----------|------|----------|\n| `EngineError` | 基础引擎错误 | 通用错误处理 
|\n| `EngineErrorWithTrace` | 带调用栈的错误 | 调试和问题定位 |\n| `OtherWorkerError` | 其他工作进程错误 | 分布式环境错误处理 |\n| `Error` | 通用错误类型 | 自定义错误 |\n| `Pending` | 操作未完成状态 | 异步操作状态查询 |\n\n资料来源：[src/python_api.rs:200-230]()\n\n## 表导出与快照机制\n\n### PyExportedTable导出表\n\n`PyExportedTable`允许Python代码访问Rust引擎计算的表数据：\n\n```rust\n#[pyclass(module = \"pathway.engine\", frozen, name = \"ExportedTable\")]\npub struct PyExportedTable {\n    inner: Arc<dyn ExportedTable>,\n}\n\n#[pymethods]\nimpl PyExportedTable {\n    fn frontier(&self) -> TotalFrontier<Timestamp> {\n        self.inner.frontier()\n    }\n\n    fn snapshot_at(&self, frontier: TotalFrontier<Timestamp>) -> Vec<(Key, Vec<Value>)> {\n        self.inner.snapshot_at(frontier)\n    }\n\n    fn failed(&self) -> bool {\n        self.inner.failed()\n    }\n}\n```\n\n资料来源：[src/python_api.rs:115-135]()\n\n### Done标记类型\n\n`Done`是一个特殊的冻结类型，用于表示计算已完成的状态：\n\n```rust\nmod done {\n    use once_cell::sync::Lazy;\n    use pyo3::prelude::*;\n\n    #[pyclass(module = \"pathway.engine\", frozen)]\n    pub struct Done(InnerDone);\n\n    pub static DONE: Lazy<Py<Done>> = Lazy::new(|| {\n        Python::with_gil(|py| Py::new(py, Done(InnerDone)).expect(\"creating DONE should not fail\"))\n    });\n}\n```\n\n资料来源：[src/python_api.rs:165-177]()\n\n## 计算上下文\n\n### Context作用域管理\n\n`Context`结构体提供了当前计算行的上下文信息：\n\n```rust\n#[pyclass(module = \"pathway.engine\", frozen)]\npub struct Context(SendWrapper<ScopedContext>);\n\n#[pymethods]\nimpl Context {\n    #[getter]\n    fn this_row(&self) -> PyResult<Key> {\n        self.0\n            .with(|context| context.this_row())\n            .ok_or_else(|| PyValueError::new_err(\"context out of scope\"))\n    }\n}\n```\n\n资料来源：[src/python_api.rs:235-250]()\n\n### Computer计算器\n\n`Computer`用于执行用户定义的Python函数：\n\n```rust\n#[pyclass]\npub struct Computer {\n    pub fun: Py<PyAny>,\n    pub dtype: Py<PyAny>,\n    pub is_output: bool,\n    pub is_method: bool,\n    pub universe: Py<Universe>,\n    pub data: Value,\n    pub 
data_column: Option<Py<Column>>,\n}\n\n#[pymethods]\nimpl Computer {\n    #[staticmethod]\n    #[pyo3(signature = (fun, dtype, is_output, is_method, universe, data = Value::None, data_column = None))]\n    pub fn from_raising_fun(\n        py: Python,\n        fun: Py<PyAny>,\n        dtype: Py<PyAny>,\n        is_output: bool,\n        is_method: bool,\n        universe: Py<Universe>,\n        data: Value,\n        data_column: Option<Py<Column>>,\n    ) -> PyResult<Py<Self>> { ... }\n}\n```\n\n资料来源：[src/python_api.rs:260-290]()\n\n## Differential Dataflow集成\n\n### Arrangement导入机制\n\nPathway通过Differential Dataflow的arrangement机制在数据流之间共享计算结果：\n\n```rust\n/// Same as `import`, but allows to name the source.\npub fn import_named<G>(&mut self, scope: &G, name: &str) -> Arranged<G, TraceAgent<Tr>>\nwhere\n    G: Scope<Timestamp=Tr::Time>,\n    Tr::Time: Timestamp,\n{\n    // Drop ShutdownButton and return only the arrangement.\n    self.import_core(scope, name).0\n}\n```\n\n资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:35-45]()\n\n### Trace Freeze包装器\n\n`TraceFreeze`包装器允许延迟计算trace直到被使用时才执行：\n\n```rust\npub struct TraceFreeze<Tr, F>\nwhere\n    Tr: TraceReader,\n    Tr::Time: Lattice+Clone+'static,\n    F: Fn(&Tr::Time)->Option<Tr::Time>,\n{\n    trace: Tr,\n    func: Rc<F>,\n}\n```\n\n资料来源：[external/differential-dataflow/src/trace/wrappers/freeze.rs:15-21]()\n\n## 时间戳与顺序语义\n\n### Timestamp特征实现\n\nPathway实现了Differential Dataflow所需的时间戳特征：\n\n```rust\nimpl OriginalOrRetraction for Timestamp {\n    fn is_original(&self) -> bool {\n        self.0.is_multiple_of(2)\n    }\n}\n\nimpl NextRetractionTime for Timestamp {\n    fn next_retraction_time(&self) -> Self {\n        Self(self.0 + 1)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:60-70]()\n\n### 时间戳顺序关系\n\nTimestamp支持时间戳间的偏序关系计算：\n\n```rust\npub fn followed_by(&self, other: &Self) -> Option<Self> {\n    self.0.followed_by(&other.0).map(Self)\n}\n```\n\n## 
注册的Python类\n\nPathway通过`add_class`方法将以下Rust结构体暴露给Python：\n\n| 类名 | 功能 |\n|------|------|\n| `AwsS3Settings` | AWS S3连接配置 |\n| `AzureBlobStorageSettings` | Azure Blob存储配置 |\n| `ElasticSearchParams` | Elasticsearch参数 |\n| `CsvParserSettings` | CSV解析设置 |\n| `DataStorage` | 数据存储配置 |\n| `DataFormat` | 数据格式定义 |\n| `PersistenceConfig` | 持久化配置 |\n| `PythonSubject` | Python数据源主题 |\n| `MqttSettings` | MQTT连接设置 |\n| `IcebergCatalogSettings` | Iceberg目录配置 |\n| `ConnectorProperties` | 连接器属性 |\n| `ColumnProperties` | 列属性 |\n| `TableProperties` | 表属性 |\n| `PyExportedTable` | 导出表 |\n| `PyExternalIndexFactory` | 外部索引工厂 |\n| `PyUSearchMetricKind` | USearch度量类型 |\n| `PyBruteForceKnnMetricKind` | 暴力KNN度量类型 |\n\n资料来源：[src/python_api.rs:300-330]()\n\n## 数据流处理流程\n\n```mermaid\nsequenceDiagram\n    participant Python用户代码\n    participant PyO3绑定层\n    participant Rust引擎\n    participant DifferentialDataflow\n    participant TimelyDataflow\n    \n    Python用户代码->>PyO3绑定层: 调用Python API\n    PyO3绑定层->>Rust引擎: 传递计算请求\n    Rust引擎->>DifferentialDataflow: 创建数据流操作符\n    DifferentialDataflow->>TimelyDataflow: 分布式执行\n    TimelyDataflow-->>DifferentialDataflow: 返回计算结果\n    DifferentialDataflow-->>Rust引擎: 增量更新结果\n    Rust引擎-->>PyO3绑定层: 转换结果类型\n    PyO3绑定层-->>Python用户代码: 返回Python对象\n```\n\n## 多工作进程支持\n\nPathway支持通过Timely Dataflow的多工作进程实现并行计算。可以通过命令行参数指定工作进程数量：\n\n```bash\ncargo run --example wordcount -- -w2\n```\n\n这将启动两个工作进程并行处理数据，每个进程独立消费输入数据并产生部分结果。\n\n资料来源：[external/timely-dataflow/mdbook/src/chapter_2/chapter_2_5.md:1-25]()\n\n## 总结\n\nPathway的Python与Rust集成架构充分发挥了两种语言的优势：\n\n- **Python端**：提供简洁的声明式API，开发者使用Python编写数据处理逻辑\n- **Rust端**：实现高性能的增量计算引擎，基于Differential Dataflow提供高效的流式处理\n- **PyO3**：作为桥梁，实现Python与Rust之间的无缝类型转换和函数调用\n- **Timely Dataflow**：提供分布式计算能力，支持多工作进程并行处理\n\n---\n\n<a id='connectors-overview'></a>\n\n## 连接器概述\n\n### 相关页面\n\n相关主题：[数据库连接器](#connectors-database), [流数据连接器](#connectors-streaming)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- 
[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs) — Python API 绑定定义，包括连接器相关的 Rust 类型映射\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md) — 项目总体介绍\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md) — RAG 示例项目\n- [examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md) — 多智能体 RAG 示例\n- [examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md) — 流水线模板文档\n\n</details>\n\n# 连接器概述\n\n## 1. 概述\n\nPathway 是一个 Python ETL 框架，专为流处理、实时分析、LLM 管道和 RAG（检索增强生成）而设计 [资料来源：README.md](README.md)。连接器（Connector）是 Pathway 数据管道中的核心组件，负责数据的输入与输出操作。\n\nPathway 连接器系统具有以下特点：\n\n- **多源数据支持**：支持文件系统、数据库、云存储、消息队列等多种数据源\n- **实时与批处理**：同一套 API 支持静态批处理和流式实时处理\n- **Rust 引擎驱动**：底层由 Rust 编写的可扩展引擎驱动，基于 Differential Dataflow 实现增量计算\n- **Python 友好**：通过 Python API 提供简洁易用的接口\n\n## 2. 
连接器架构\n\n### 2.1 核心组件关系\n\n```mermaid\ngraph TD\n    subgraph \"Python API 层\"\n        IO[\"pw.io 模块\"]\n        DS[\"DataSource\"]\n        DK[\"DataSink\"]\n    end\n    \n    subgraph \"Rust 引擎层\"\n        CP[\"ConnectorProperties\"]\n        CM[\"ConnectorMode\"]\n        Engine[\"Pathway Engine\"]\n    end\n    \n    subgraph \"数据源类型\"\n        S3[\"AWS S3\"]\n        Azure[\"Azure Blob\"]\n        CSV[\"CSV Parser\"]\n        MQTT[\"MQTT\"]\n        Kafka[\"Kafka/Confluent\"]\n    end\n    \n    IO --> DS\n    IO --> DK\n    DS --> CP\n    DK --> CP\n    CP --> CM\n    CM --> Engine\n    S3 --> CP\n    Azure --> CP\n    CSV --> CP\n    MQTT --> CP\n```\n\n### 2.2 连接器模式\n\nPathway 支持两种连接器模式，用于区分数据处理方式 [资料来源：src/python_api.rs:320-329](src/python_api.rs)：\n\n| 模式 | 枚举值 | 说明 |\n|------|--------|------|\n| 静态模式 | `ConnectorMode.STATIC` | 批处理模式，一次性加载所有数据 |\n| 流式模式 | `ConnectorMode.STREAMING` | 实时流处理，持续监听数据变化 |\n\n```rust\n#[pymethods]\nimpl PyConnectorMode {\n    #[classattr]\n    pub const STATIC: ConnectorMode = ConnectorMode::Static;\n    #[classattr]\n    pub const STREAMING: ConnectorMode = ConnectorMode::Streaming;\n}\n```\n\n## 3. 
数据源（DataSource）\n\n数据源是连接器系统中负责从外部系统读取数据的组件。在 Pathway 中，数据源通过 Python Subject 接口与 Rust 引擎交互。\n\n### 3.1 PythonSubject 结构\n\n`PythonSubject` 是 Python 层与 Rust 引擎之间的桥梁，封装了数据读取所需的所有回调函数 [资料来源：src/python_api.rs:15-39](src/python_api.rs)：\n\n```rust\n#[pyclass(module = \"pathway.engine\")]\n#[derive(Clone)]\npub struct PythonSubject {\n    pub start: Py<PyAny>,              // 初始化回调\n    pub read: Py<PyAny>,               // 读取数据回调\n    pub seek: Py<PyAny>,               // 寻址回调（可选）\n    pub on_persisted_run: Py<PyAny>,    // 持久化运行回调\n    pub end: Py<PyAny>,                // 结束回调\n    pub is_internal: bool,             // 是否为内部数据源\n    pub deletions_enabled: bool,       // 是否启用删除操作\n}\n```\n\n### 3.2 数据读取流程\n\n```mermaid\nsequenceDiagram\n    participant App as 应用代码\n    participant DS as DataSource\n    participant Subject as PythonSubject\n    participant Engine as Pathway Engine\n    \n    App->>DS: 创建数据源实例\n    DS->>Subject: 调用 start() 初始化\n    Engine->>Subject: 调用 read() 读取数据\n    Subject-->>Engine: 返回数据批次\n    loop 流式处理\n        Engine->>Subject: 调用 read() 轮询新数据\n        Subject-->>Engine: 返回更新数据\n    end\n    Engine->>Subject: 调用 end() 清理资源\n```\n\n## 4. 数据输出（DataSink）\n\n数据 Sink 负责将处理后的数据输出到外部系统。Pathway 提供了多种输出连接器，包括文件系统、数据库和消息队列等。\n\n### 4.1 输出配置参数\n\n| 参数 | 类型 | 说明 |\n|------|------|------|\n| `host` | string | 服务器主机地址 |\n| `port` | int | 服务器端口 |\n| `format` | DataFormat | 输出数据格式 |\n| `storage` | DataStorage | 存储配置 |\n| `create_table` | bool | 是否创建输出表 |\n\n## 5. 
支持的连接器类型\n\n### 5.1 云存储连接器\n\nPathway 原生支持主流云存储服务：\n\n| 服务 | 设置类 | 说明 |\n|------|--------|------|\n| AWS S3 | `AwsS3Settings` | 亚马逊 S3 对象存储 |\n| Azure Blob | `AzureBlobStorageSettings` | 微软 Azure Blob 存储 |\n\n### 5.2 消息队列连接器\n\n| 消息系统 | 设置类 | 说明 |\n|----------|--------|------|\n| MQTT | `MqttSettings` | IoT 场景常用的轻量级消息协议 |\n| Kafka | SchemaRegistrySettings | Confluent Schema Registry 集成 |\n\n### 5.3 数据库连接器\n\n| 数据库 | 设置类 | 说明 |\n|--------|--------|------|\n| PostgreSQL | `PsqlReplicationSettings` | 支持 CDC（变更数据捕获）复制 |\n\n### 5.4 其他专用连接器\n\n- **ElasticSearch**：`ElasticSearchParams` / `ElasticSearchAuth` — 全文搜索引擎集成\n- **Iceberg**：`IcebergCatalogSettings` — Apache Iceberg 表格式支持\n- **Schema Registry**：`PySchemaRegistrySettings` — Avro/Protobuf Schema 管理\n\n### 5.5 解析器设置\n\n```rust\nm.add_class::<CsvParserSettings>()?;\nm.add_class::<ValueField>()?;\nm.add_class::<DataStorage>()?;\nm.add_class::<DataFormat>()?;\n```\n\n## 6. 连接器配置属性\n\n### 6.1 ConnectorProperties\n\n连接器属性类定义了数据源和输出端的通用配置选项 [资料来源：src/python_api.rs:450-465](src/python_api.rs)：\n\n| 属性 | 说明 |\n|------|------|\n| `columns` | 列定义列表 |\n| `primary_key` | 主键列 |\n| `date_format` | 日期格式字符串 |\n| `value_fields` | 值字段列表 |\n\n### 6.2 ColumnProperties\n\n列属性用于定义单个列的元数据：\n\n```python\npw.ColumnProperties(\n    name=\"column_name\",\n    type_=pw.Type,\n    source=pw.FieldSource.Payload,\n    default=None,\n    metadata=None\n)\n```\n\n## 7. 
使用示例\n\n### 7.1 RAG 应用中的连接器使用\n\n在问答型 RAG 应用中，Pathway 连接器用于实时索引文档并提供检索服务 [资料来源：examples/projects/question-answering-rag/README.md](examples/projects/question-answering-rag/README.md)：\n\n```python\nimport pathway as pw\n\n# 创建 HTTP 服务器用于接收查询\nwebserver = pw.io.http.PathwayWebserver(host=\"0.0.0.0\", port=8011)\n\n# 数据源：文件系统连接器监听 ./data/ 目录\n# 文档被分块、向量化后存储\n# 输出：通过 REST API 提供 /v1/retrieve 检索接口\n```\n\n### 7.2 多智能体 RAG 中的连接器\n\n在多智能体 RAG 架构中，连接器同时承担文档索引和智能体间通信的任务 [资料来源：examples/projects/ag2-multiagent-rag/README.md](examples/projects/ag2-multiagent-rag/README.md)：\n\n```mermaid\ngraph LR\n    A[\"Documents (./data/)\"] -->|实时索引| B[\"Pathway VectorStoreServer\"]\n    B -->|HTTP /v1/retrieve| C[\"AG2 Researcher Agent\"]\n    C -->|search_documents| B\n    B -->|检索结果| D[\"AG2 Analyst Agent\"]\n    D -->|综合回答| E[\"用户\"]\n```\n\n## 8. 连接器的高级特性\n\n### 8.1 持久化配置\n\nPathway 支持连接器级别的持久化配置，用于故障恢复和状态管理：\n\n```rust\nm.add_class::<PersistenceConfig>()?;\nm.add_class::<PyPersistenceMode>()?;\nm.add_class::<PySnapshotAccess>()?;\nm.add_class::<PySnapshotEvent>()?;\n```\n\n### 8.2 会话类型\n\nPathway 引擎支持不同类型的会话处理 [资料来源：src/python_api.rs:334-345](src/python_api.rs)：\n\n| 会话类型 | 枚举值 | 使用场景 |\n|----------|--------|----------|\n| 原生会话 | `SessionType.NATIVE` | 标准批处理和流处理 |\n| UPSERT 会话 | `SessionType.UPSERT` | 增量更新场景 |\n\n### 8.3 监控与追踪\n\nPathway 提供了完善的监控机制，包括：\n\n- **TelemetryConfig**：遥测数据收集配置\n- **BackfillingThreshold**：回填数据阈值控制\n- **EngineTrace**：引擎执行追踪\n\n## 9. 最佳实践\n\n1. **选择合适的连接器模式**：静态数据使用 `STATIC` 模式，实时数据使用 `STREAMING` 模式\n2. **配置主键**：为数据源设置主键以支持增量更新和去重\n3. **设置超时和重试**：生产环境应配置适当的重试策略\n4. **监控连接器状态**：使用 Pathway Dashboard 监控消息数量和延迟\n5. **处理删除操作**：根据业务需求通过 `deletions_enabled` 参数控制是否处理数据删除\n\n## 10. 
相关文档\n\n- [入门指南](https://pathway.com/developers/user-guide/introduction/welcome)\n- [API 文档](https://pathway.com/developers/api-docs/pathway)\n- [模板示例](https://pathway.com/developers/templates?tab=ai-pipelines)\n- [部署指南](#deployment)\n\n---\n\n<a id='connectors-database'></a>\n\n## 数据库连接器\n\n### 相关页面\n\n相关主题：[连接器概述](#connectors-overview), [流数据连接器](#connectors-streaming)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n</details>\n\n# 数据库连接器\n\n## 概述\n\nPathway 的数据库连接器模块提供了与各类数据库系统交互的能力，支持从外部数据源读取数据并将处理结果写入目标数据库。作为 Pathway ETL 框架的核心组件之一，数据库连接器实现了实时的增量数据同步机制，能够在流处理和批处理场景下保持数据的一致性和时效性。\n\nPathway 的数据库连接器基于 Rust 引擎构建，结合 Differential Dataflow 计算模型，实现了高效的增量计算能力。这意味着当源数据库中的数据发生变化时，连接器能够自动检测变更并仅处理发生变化的记录，而不是重新处理整个数据集。\n\n## 架构设计\n\n### 连接器类型体系\n\nPathway 在 `src/python_api.rs` 中定义了一套统一的连接器配置接口，通过 PyO3 绑定暴露给 Python 用户。连接器配置通过 `ConnectorProperties`、`ColumnProperties` 和 `TableProperties` 等类进行封装，支持对数据源的精细化配置。\n\n```rust\nm.add_class::<ConnectorProperties>()?;\nm.add_class::<ColumnProperties>()?;\nm.add_class::<TableProperties>()?;\n```\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n### 连接器模式\n\nPathway 支持两种连接器运行模式，通过 `ConnectorMode` 枚举定义：\n\n| 模式 | 说明 |\n|------|------|\n| `STATIC` | 静态模式，适用于批处理场景，数据源仅读取一次 |\n| `STREAMING` | 流式模式，适用于实时数据处理，支持持续监听数据变更 |\n\n```rust\n#[pymethods]\nimpl PyConnectorMode {\n    #[classattr]\n    pub const STATIC: ConnectorMode = ConnectorMode::Static;\n    #[classattr]\n    pub const STREAMING: ConnectorMode = ConnectorMode::Streaming;\n}\n```\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## 支持的数据库类型\n\n### 关系型数据库\n\nPathway 原生支持多种主流关系型数据库的连接：\n\n- **PostgreSQL**：通过 `PsqlReplicationSettings` 提供复制槽支持，用于捕获数据库变更\n- **MySQL**：支持标准的读取连接配置\n- **Microsoft SQL Server**：通过 MSSQL 
连接器支持企业级应用\n\n### NoSQL 数据库\n\n- **MongoDB**：支持文档型数据库的实时读取和写入\n- **Elasticsearch**：提供全文检索场景下的数据索引能力\n\n### 云存储与服务\n\nPathway 还集成了多种云服务的连接配置：\n\n- **AWS S3**：`AwsS3Settings` 支持从 S3 存储桶读取数据文件\n- **Azure Blob Storage**：`AzureBlobStorageSettings` 支持 Azure 云存储\n- **Iceberg Catalog**：`IcebergCatalogSettings` 支持数据湖架构\n\n```rust\nm.add_class::<AwsS3Settings>()?;\nm.add_class::<AzureBlobStorageSettings>()?;\nm.add_class::<ElasticSearchParams>()?;\nm.add_class::<IcebergCatalogSettings>()?;\nm.add_class::<PsqlReplicationSettings>()?;\n```\n\n资料来源：[src/python_api.rs:行](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## 数据处理流程\n\n```mermaid\ngraph TD\n    A[外部数据库] --> B[连接器读取层]\n    B --> C[变更捕获]\n    C --> D[Rust 引擎处理]\n    D --> E[增量计算]\n    E --> F[下游输出]\n    \n    G[Schema 定义] --> D\n    H[连接配置] --> B\n```\n\n### 读取流程\n\n1. **连接初始化**：连接器根据配置参数建立与目标数据库的连接\n2. **Schema 映射**：通过 `ValueField` 定义输入数据的字段映射和类型转换\n3. **变更捕获**：在流式模式下持续监听数据变更事件\n4. **数据传输**：变更数据通过 Pathway 引擎进行增量处理\n\n### 写入流程\n\n1. **输出表定义**：通过 `is_output=True` 标记输出计算节点\n2. **目标配置**：指定目标数据库的连接信息和写入策略\n3. 
**批量写入**：引擎自动将处理结果批量写入目标系统\n\n## Schema 定义\n\nPathway 使用 `ValueField` 结构体定义数据的 Schema 映射：\n\n```rust\n#[pyclass(module = \"pathway.engine\")]\n#[derive(Clone)]\npub struct ValueField {\n    #[pyo3(get)]\n    pub name: String,\n    #[pyo3(get)]\n    pub type_: Type,\n    #[pyo3(get)]\n    pub source: FieldSource,\n    #[pyo3(get)]\n    pub default: Option<Value>,\n    #[pyo3(get)]\n    pub metadata: Option<String>,\n}\n```\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n字段属性说明：\n\n| 属性 | 类型 | 说明 |\n|------|------|------|\n| `name` | `String` | 字段名称 |\n| `type_` | `Type` | 数据类型 |\n| `source` | `FieldSource` | 字段来源（Payload/Materialized） |\n| `default` | `Option<Value>` | 默认值 |\n| `metadata` | `Option<String>` | 元数据信息 |\n\n## 时间戳与前沿管理\n\nPathway 通过 `TotalFrontier<T>` 管理数据处理的时间戳前沿，实现精确的增量计算：\n\n```rust\n#[pymethods]\nimpl PyExportedTable {\n    fn frontier(&self) -> TotalFrontier<Timestamp> {\n        self.inner.frontier()\n    }\n\n    fn snapshot_at(&self, frontier: TotalFrontier<Timestamp>) -> Vec<(Key, Vec<Value>)> {\n        self.inner.snapshot_at(frontier)\n    }\n}\n```\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n`TotalFrontier` 有两种状态：\n\n- **`At(t)`**：表示处理到达时间戳 `t`，继续处理后续数据\n- **`Done`**：表示数据处理完成，不再有新数据到达\n\n```rust\nimpl<'py, T> FromPyObject<'py> for TotalFrontier<T> {\n    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {\n        if ob.is_instance_of::<Done>() {\n            Ok(TotalFrontier::Done)\n        } else {\n            Ok(TotalFrontier::At(ob.extract()?))\n        }\n    }\n}\n```\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## 使用示例\n\n### 基本配置流程\n\n```python\nimport pathway as pw\n\n# 定义输入连接器\nclass InputSchema(pw.Schema):\n    id: int\n    value: str\n\n# 配置 PostgreSQL 输入\ninput_connector = pw.io.postgres.read(\n    host=\"localhost\",\n    port=5432,\n    database=\"mydb\",\n    user=\"user\",\n 
   password=\"password\",\n    table=\"source_table\",\n    schema=InputSchema,\n    mode=pw.ConnectorMode.STREAMING\n)\n\n# 定义处理逻辑\nresult = input_connector.groupby(pw.this.id).reduce(\n    id=pw.this.id,\n    count=pw.reducers.count()\n)\n\n# 配置输出连接器\npw.io.postgres.write(\n    result,\n    host=\"localhost\",\n    port=5432,\n    database=\"mydb\",\n    user=\"user\",\n    password=\"password\",\n    table=\"output_table\"\n)\n\n# 运行管道\npw.run()\n```\n\n### 与向量存储集成\n\nPathway 的数据库连接器可以与向量索引功能结合使用，实现 RAG（检索增强生成）场景：\n\n```mermaid\ngraph LR\n    A[PostgreSQL/MySQL] --> B[Pathway 引擎]\n    B --> C[向量嵌入]\n    C --> D[向量索引]\n    D --> E[LLM 查询]\n```\n\n## 监控与故障处理\n\nPathway 的 `PyExportedTable` 提供了内置的监控接口：\n\n```rust\n#[pymethods]\nimpl PyExportedTable {\n    fn failed(&self) -> bool {\n        self.inner.failed()\n    }\n}\n```\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n通过检查 `failed()` 状态，用户可以及时发现数据处理过程中的异常情况，并采取相应的恢复措施。\n\n## 性能优化建议\n\n1. **批量处理**：利用 Pathway 的增量计算特性，避免全量数据重处理\n2. **索引优化**：确保源数据库的相关字段已建立索引\n3. **连接池**：合理配置连接池大小以平衡资源使用和吞吐量\n4. 
**Schema 精简**：仅提取必要的字段，减少数据传输量\n\n---\n\n<a id='connectors-streaming'></a>\n\n## 流数据连接器\n\n### 相关页面\n\n相关主题：[连接器概述](#connectors-overview)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs) - Python绑定和Rust连接器接口定义\n- [examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md) - RAG项目中的连接器使用示例\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md) - 问答RAG系统架构\n- [examples/projects/web-scraping/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/web-scraping/README.md) - Web数据抓取连接器实现\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md) - Pathway框架总体架构\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs) - 时间戳和增量计算引擎\n</details>\n\n# 流数据连接器\n\n## 概述\n\nPathway的流数据连接器（Stream Connectors）是Framework与外部数据源之间进行实时数据交换的核心组件。这些连接器使得Pathway能够从Kafka、NATS、MQTT、RabbitMQ、Redpanda等消息队列系统实时读取数据，并将处理结果写入各种数据sink。\n\nPathway的连接器系统构建在Differential Dataflow增量计算引擎之上，支持两种主要的运行模式：静态模式（Static）和流式模式（Streaming）。静态模式适用于批处理场景，而流式模式则专为实时数据处理设计，能够处理持续到来的数据更新。连接器系统还支持增量计算的语义，允许系统只处理自上次处理以来的变更，而不是重新处理整个数据集。\n\n## 核心架构\n\n### 连接器模式体系\n\nPathway定义了一套完整的连接器模式枚举，用于区分不同的数据处理语义：\n\n```python\nclass ConnectorMode(Enum):\n    STATIC = auto()    # 静态批处理模式\n    STREAMING = auto() # 实时流处理模式\n```\n\n每种模式对应不同的数据处理策略。静态模式下，连接器假设数据源是有限的，处理完成后系统进入完成状态。流式模式下，连接器持续监听数据源，系统永远不会自然结束，除非外部终止。\n\n### PythonSubject核心接口\n\n所有自定义数据源连接器都继承自`PythonSubject`基类，该类定义了数据源的核心接口：\n\n```python\nclass PythonSubject:\n    def __init__(\n        self,\n        start: Callable[[], None],           # 启动数据源\n        read: Callable[[], Iterator[Row]],   # 读取数据行\n        seek: Callable[[Any], None],         # 定位到指定位置\n        on_persisted_run: Callable[[], None], # 
持久化运行回调\n        end: Callable[[], bool],             # 检查是否结束\n        is_internal: bool,                   # 是否内部数据源\n        deletions_enabled: bool              # 是否启用删除\n    )\n```\n\n资料来源：[src/python_api.rs:21-31]()\n\n`PythonSubject`的设计体现了Pathway连接器的核心哲学：数据源被抽象为一个可以随时启动、读取、定位和检查完成状态的可迭代对象。这种设计使得连接器能够与Differential Dataflow的计算模型无缝集成。\n\n### 字段定义与数据类型\n\n`ValueField`类定义了表中每个字段的元信息，包括名称、类型、来源和默认值：\n\n```python\nclass ValueField:\n    name: str           # 字段名称\n    type_: Type         # 数据类型\n    source: FieldSource # 字段来源 (默认: Payload)\n    default: Optional[Value]  # 默认值\n    metadata: Optional[str]   # 元数据\n```\n\n`FieldSource`枚举标识了数据的来源类型，最常用的是`FieldSource.Payload`，表示数据来自消息的有效载荷。在复杂的流处理场景中，字段来源还可以标识时间戳、主键等特殊字段。\n\n## 工作原理\n\n### 数据流处理管道\n\nPathway的流数据连接器遵循一个清晰的数据处理管道：\n\n```mermaid\ngraph TD\n    A[外部数据源] --> B[Connector启动]\n    B --> C[数据读取循环]\n    C --> D{数据有效?}\n    D -->|是| E[解析消息]\n    D -->|否| F[跳过/日志]\n    E --> G[转换为Pathway Row]\n    G --> H[输入到Differential Dataflow]\n    H --> I[增量计算]\n    I --> J[结果输出到Sink]\n    J --> C\n    F --> C\n```\n\n### 增量计算与时间戳\n\nPathway的连接器系统深度集成了Differential Dataflow的增量计算能力。每个数据更新都带有时间戳，系统只处理自上次检查点以来的变更。`Timestamp`类型实现了`OriginalOrRetraction`特性，用于区分原始数据和撤回数据：\n\n```python\nclass Timestamp:\n    def is_original(self) -> bool:\n        # 偶数时间戳表示原始数据\n        return self.0.is_multiple_of(2)\n    \n    def next_retraction_time(self) -> Self:\n        # 奇数时间戳表示撤回\n        return Self(self.0 + 1)\n```\n\n资料来源：[src/engine/timestamp.rs:45-53]()\n\n这种设计允许连接器高效处理数据的插入、更新和删除操作，而无需重新处理整个数据集。\n\n### 持久化与状态恢复\n\n连接器支持持久化运行模式，通过`on_persisted_run`回调函数实现故障恢复：\n\n1. **检查点保存**：系统在适当的时间点保存当前的处理状态\n2. **故障恢复**：重新启动时，从最后一个检查点恢复\n3. 
**数据重放**：根据保存的seek位置，重新从数据源读取未处理的数据\n\n这种机制确保了在分布式环境中，即使发生节点故障，数据也不会丢失，处理可以透明地继续。\n\n## 预置连接器\n\nPathway提供了多个开箱即用的流数据连接器，覆盖了主流的消息队列系统：\n\n| 连接器 | 协议 | 典型用途 | 文档链接 |\n|--------|------|----------|----------|\n| Kafka | TCP二进制协议 | 大规模流处理 | [kafka/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/kafka/__init__.py) |\n| Redpanda | Kafka兼容API | 高性能流处理 | [redpanda/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/redpanda/__init__.py) |\n| NATS | NATS协议 | 轻量级消息传递 | [nats/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/nats/__init__.py) |\n| MQTT | MQTT协议 | IoT设备数据采集 | [mqtt/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/mqtt/__init__.py) |\n| RabbitMQ | AMQP协议 | 企业消息队列 | [rabbitmq/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/rabbitmq/__init__.py) |\n\n### 连接器配置参数\n\n每种连接器都支持一组通用的配置参数：\n\n| 参数 | 类型 | 说明 | 默认值 |\n|------|------|------|--------|\n| `mode` | ConnectorMode | 运行模式 | STREAMING |\n| `deletions_enabled` | bool | 是否处理删除事件 | false |\n| `autocommit_duration` | timedelta | 自动提交间隔 | None |\n| `timeout` | timedelta | 连接超时时间 | 30秒 |\n| `schema` | ValueField[] | 输出表模式 | 自动推断 |\n\n## 使用场景\n\n### 实时文档索引\n\nPathway的连接器常用于构建实时RAG（检索增强生成）系统。以下架构展示了典型的用法：\n\n```mermaid\ngraph LR\n    A[文档数据源] --> B[Pathway连接器]\n    B --> C[数据预处理]\n    C --> D[文本分块]\n    D --> E[嵌入模型]\n    E --> F[向量索引]\n    F --> G[HTTP API]\n    G --> H[LLM查询]\n```\n\n资料来源：[examples/projects/ag2-multiagent-rag/README.md:12-20]()\n\n在ag2-multiagent-rag项目中，Pathway连接器负责监听文件系统中的文档变化，实时索引新文档并更新向量存储。外部AI代理通过REST API查询相关文档，系统保证始终返回最新的索引结果。\n\n### Web数据抓取管道\n\nPathway连接器也广泛用于构建数据采集管道：\n\n```mermaid\ngraph TD\n    A[新闻网站] --> B[Web爬虫连接器]\n    B --> C[内容解析]\n    C --> D[元数据提取]\n    D --> E[数据清洗]\n    E --> F[JSON Lines输出]\n    F --> 
G[下游处理]\n```\n\n资料来源：[examples/projects/web-scraping/README.md:1-50]()\n\n自定义的`NewsScraperSubject`继承自Pathway的`ConnectorSubject`，实现了文章URL作为主键的数据表结构，支持增量更新和去重。\n\n### 企业数据集成\n\n对于企业级应用，Pathway的RabbitMQ和Kafka连接器能够与现有的消息基础设施无缝集成：\n\n```python\nimport pathway as pw\n\nclass MyConnectorSubject(pw.io.ConnectorSubject):\n    def __init__(self):\n        super().__init__()\n        self._client = create_rabbitmq_client()\n    \n    def start(self):\n        self._client.connect()\n    \n    def read(self):\n        while message := self._client.consume():\n            yield self._parse_message(message)\n    \n    def end(self):\n        return self._client.is_idle_timeout()\n```\n\n## 与Differential Dataflow的集成\n\nPathway的连接器系统是Differential Dataflow在Python生态系统中的桥接层。Rust核心负责：\n\n1. **数据流的组织**：使用Trace数据结构维护历史状态\n2. **增量计算**：只处理实际发生的变更，而非全量数据\n3. **多 worker 支持**：通过`pathway spawn --threads N`启用多线程处理\n\nPython API提供了高级抽象，降低了使用门槛，同时保持了Rust引擎的高性能特性。这种设计使得开发者能够用Python编写流处理逻辑，而底层执行效率与原生Rust程序相当。\n\n## 最佳实践\n\n### 模式定义\n\n在创建连接器时，应显式定义输出表的schema，包括所有字段的名称、类型和默认值：\n\n```python\nschema = pw.Schema(\n    fields={\n        \"id\": pw.FieldType.STRING,\n        \"content\": pw.FieldType.STRING,\n        \"timestamp\": pw.FieldType.DATETIME,\n    },\n    primary_key=[\"id\"]\n)\n```\n\n### 错误处理\n\n连接器应实现健壮的错误处理机制：\n\n```python\ndef read(self):\n    try:\n        yield from self._fetch_data()\n    except TransientError:\n        # 临时错误，等待后重试\n        time.sleep(1)\n        yield from self.read()\n    except PermanentError:\n        # 永久错误，标记连接器失败\n        self._mark_failed()\n```\n\n### 性能优化\n\n1. **批量处理**：积累多条消息后再提交，减少I/O次数\n2. **背压控制**：当处理速度跟不上数据到达速度时，使用`BackfillingThreshold`配置\n3. 
**连接池复用**：在多worker场景下复用连接，避免资源耗尽\n\n## 总结\n\nPathway的流数据连接器系统提供了一套完整的数据集成解决方案。通过统一的抽象接口，开发者可以轻松地将各种外部数据源接入Pathway的处理管道。连接器与Differential Dataflow的深度集成确保了即使在处理大规模实时数据时，也能保持高效的增量计算能力。\n\n预置的Kafka、Redpanda、NATS、MQTT和RabbitMQ连接器覆盖了大多数流数据场景，而PythonSubject基类则允许开发者实现完全自定义的数据源连接器，满足各种特殊的集成需求。\n\n---\n\n<a id='trans-table-operations'></a>\n\n## 表操作\n\n### 相关页面\n\n相关主题：[连接与聚合](#trans-joins-aggregations), [基础示例](#quickstart-basic-example)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/internals/table.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/table.py)\n- [python/pathway/internals/column.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/column.py)\n- [python/pathway/internals/expression.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/expression.py)\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n</details>\n\n# 表操作\n\n## 概述\n\nPathway 是一个基于 Python 的 ETL 框架，其核心数据模型围绕**表（Table）**展开。表操作是 Pathway 数据处理流水线的基本构建单元，提供了对结构化数据进行转换、过滤、聚合和连接的能力。\n\nPathway 的表操作具有以下特性：\n\n- **统一批流处理**：同一套 API 可处理静态数据集和实时流数据\n- **增量计算**：基于 Rust 的 Differential Dataflow 引擎实现高效增量更新\n- **声明式语法**：通过 Python 表达式定义数据转换逻辑\n\n资料来源：[README.md:1-20]()\n\n## 核心数据结构\n\n### Table 类\n\n`Table` 是 Pathway 中的核心数据结构，代表一个具有特定模式的分布式表格。\n\n```mermaid\nclassDiagram\n    class Table {\n        +scope: Py[Scope]\n        +handle: TableHandle\n        +_is_deleted: bool\n        +_app_id: str\n        +__getitem__(key) TableSlice\n        +select(*args) Table\n        +update_rows(*args) Table\n        +update_cells(*args) Table\n        +filter(predicate) Table\n        +groupby(*args) Table\n        +join(*args) Table\n        +join_inner(*args) Table\n        +concat(*tables) Table\n    }\n    
\n    class LegacyTable {\n        +universe: Universe\n        +columns: Vec~Column~\n        +to_engine() (UniverseHandle, Vec~ColumnHandle~)\n        +from_handles(scope, handles) LegacyTable\n    }\n    \n    Table --> Scope : 关联\n    LegacyTable --> Universe : 关联\n    LegacyTable --> Column : 包含\n```\n\n### 列（Column）与字段（ValueField）\n\n每张表由多个列组成，每个列具有名称、类型和来源信息。\n\n```python\n# ValueField 定义 - 资料来源：src/python_api.rs:40-55\n@dataclass\nclass ValueField:\n    name: str                    # 列名\n    type_: Type                  # 数据类型\n    source: FieldSource          # 字段来源（Payload/Materialized）\n    default: Optional[Value]     # 默认值\n    metadata: Optional[str]      # 元数据\n```\n\n### 类型系统\n\n| 类型 | 说明 | 对应 Rust 类型 |\n|------|------|---------------|\n| `INT` | 64位整数 | `i64` |\n| `FLOAT` | 双精度浮点 | `f64` |\n| `STR` | 字符串 | `String` |\n| `BOOL` | 布尔值 | `bool` |\n| `ANY` | 任意类型 | - |\n\n资料来源：[src/python_api.rs:40-55]()\n\n## 表的创建与初始化\n\n### 从输入连接器创建\n\nPathway 提供多种连接器用于从不同数据源创建表：\n\n```python\n# 从文件系统创建表\ntable = pw.io.fs.read(\n    \"./data\",\n    schema=MySchema,\n    format=\"csv\"\n)\n\n# 从 HTTP 流读取\ntable = pw.io.http.read(\n    \"http://api.example.com/stream\",\n    schema=MySchema,\n    json_field_paths={\"data\": \"$.records[*]\"}\n)\n```\n\n### 表的内部状态\n\n```mermaid\ngraph TD\n    A[输入数据] --> B[Python Subject]\n    B --> C[Raw Updates]\n    C --> D[Rust Engine]\n    D --> E[Universe Handle]\n    D --> F[Column Handles]\n    E --> G[Table]\n    F --> G\n    G --> H[快照 Snapshot]\n```\n\n表的内部状态由 `Universe` 和 `Column` 集合组成：\n\n```python\n# LegacyTable 内部结构 - 资料来源：src/python_api.rs:180-210\nclass LegacyTable:\n    def __init__(self, universe, columns):\n        self.universe = universe    # 行的唯一标识集合\n        self.columns = columns      # 列的列表\n    \n    def to_engine(self):\n        \"\"\"转换为引擎句柄\"\"\"\n        universe = self.universe.get()\n        column_handles = [c.get().handle for c in self.columns]\n        return (universe.handle, 
column_handles)\n```\n\n## 基础表操作\n\n### 选择操作（select）\n\n`select` 用于从表中提取指定列，可进行计算和重命名：\n\n```python\n# 基本选择\nresult = table.select(\n    id=table.user_id,\n    name=table.name,\n    full_name=table.first_name + \" \" + table.last_name  # 表达式计算\n)\n```\n\n### 过滤操作（filter）\n\n`filter` 根据条件筛选行：\n\n```python\n# 过滤条件\nactive_users = users.filter(users.status == \"active\")\npremium_orders = orders.filter(orders.amount > 100)\n```\n\n### 行更新（update_rows）\n\n向表中添加或更新行：\n\n```python\n# 添加新行\nupdated_table = table.update_rows(\n    pw.table.builder.make_table(\n        id=[1, 2],\n        name=[\"Alice\", \"Bob\"],\n        value=[10, 20]\n    )\n)\n```\n\n### 单元格更新（update_cells）\n\n更新特定列的值：\n\n```python\n# 更新特定列\nresult = table.update_cells(\n    status=pw.this.status.upper(),  # 将 status 列转为大写\n    _id=table.id                     # 保留原 id 列\n)\n```\n\n资料来源：[python/pathway/internals/table.py]()\n\n## 聚合与分组\n\n### 分组聚合（groupby）\n\n`groupby` 按照指定键分组并执行聚合操作：\n\n```mermaid\ngraph LR\n    A[输入表] --> B[GroupBy Key]\n    B --> C[每组聚合]\n    C --> D[聚合函数]\n    D --> E[输出表]\n    \n    subgraph 聚合函数\n        F[count]\n        G[sum]\n        H[min/max]\n        I[collect]\n    end\n    D --> F\n    D --> G\n    D --> H\n    D --> I\n```\n\n```python\n# 按类别聚合\ncategory_stats = orders.groupby(orders.category).reduce(\n    orders.category,\n    total_amount=pw.reducers.sum(orders.amount),\n    order_count=pw.reducers.count(),\n    avg_price=pw.reducers.mean(orders.price)\n)\n```\n\n### 可用聚合函数\n\n| 函数 | 说明 | 示例 |\n|------|------|------|\n| `count()` | 计数 | `count()` |\n| `sum(expr)` | 求和 | `sum(table.amount)` |\n| `mean(expr)` | 平均值 | `mean(table.score)` |\n| `min(expr)` | 最小值 | `min(table.latency)` |\n| `max(expr)` | 最大值 | `max(table.price)` |\n| `collect(expr)` | 收集为列表 | `collect(table.items)` |\n| `sorted_tuple(expr)` | 排序元组 | `sorted_tuple(table.events)` |\n\n## 表连接操作\n\n### 连接的种类\n\n| 连接类型 | 说明 | 空值处理 |\n|----------|------|----------|\n| `join` / `join_inner` | 内连接 | 丢弃无匹配的行 |\n| 
`join_left` | 左外连接 | 保留左表所有行 |\n| `join_right` | 右外连接 | 保留右表所有行 |\n| `join_outer` | 全外连接 | 保留所有行 |\n\n### 基本连接语法\n\n```python\n# 内连接\nresult = table1.join(\n    table2,\n    table1.key == table2.key\n)\n\n# 指定输出列\nresult = table1.join(\n    table2,\n    table1.id == table2.user_id,\n    left=pw.left.id,\n    right=pw.right.user_id,\n    prefix=\"left_\"\n)\n```\n\n### 复杂连接场景\n\n```python\n# 多条件连接\norders_enriched = orders.join(\n    customers,\n    (orders.customer_id == customers.id) & \n    (orders.region == customers.region)\n)\n```\n\n## 时间与版本控制\n\n### 时间戳机制\n\nPathway 使用特殊的时间戳系统来追踪数据的版本和变更：\n\n```rust\n// Timestamp 实现 - 资料来源：src/engine/timestamp.rs:1-30\nimpl Timestamp {\n    fn followed_by(&self, other: &Self) -> Option<Self> {\n        self.0.followed_by(&other.0).map(Self)\n    }\n}\n```\n\n### 数据变更追踪\n\n```\n时间线:  ─────────────────────────────────────────────►\n         t0      t1      t2      t3      t4\n         │       │       │       │       │\n         ▼       ▼       ▼       ▼       ▼\n数据:   [v0]   [v1]   [v2]   [撤回]   [v4]\n              (修改)  (修改)  (删除)\n```\n\nPathway 使用 **Differential Dataflow** 实现增量更新，仅处理发生变化的数据。\n\n## 表的快照与导出\n\n### 快照操作（snapshot）\n\n获取表在特定时间点的状态：\n\n```python\n# 获取当前快照\nsnapshot_data = table.snapshot()\n\n# 在特定前沿时间获取快照 - 资料来源：src/python_api.rs:220-240\nfrontier = TotalFrontier::At(timestamp)\nsnapshot = exported_table.snapshot_at(frontier)\n```\n\n### 外部索引集成\n\n```python\n# 创建向量索引用于 RAG\ntable = pw.Table.from_columns(\n    id=range(1000),\n    content=texts,\n    vector=embeddings\n)\n\n# 构建外部索引\ntable = table.groupby(table.id).reduce(\n    table.id,\n    content=pw.reducers.torch(\n        \"embedding\",\n        vector=table.vector,\n        mode=\"mean\"\n    )\n)\n```\n\n## 最佳实践\n\n### 性能优化\n\n1. **合理设计模式**：避免过度规范化\n2. **使用适当的数据类型**：优先使用原生类型而非字符串\n3. **批量操作**：优先使用 `update_rows` 而非逐行更新\n4. 
**索引列**：在频繁连接的列上建立索引\n\n### 常见模式\n\n```python\n# 模式1：流式处理 + 批处理混合\nstream_table = pw.io.kafka.read(...)\nbatch_table = pw.io.fs.read(\"./backup\", ...)\n\ncombined = stream_table.concat(batch_table)\n```\n\n```python\n# 模式2：滑动窗口聚合\nwindowed = events.groupby(\n    events.session_id\n).reduce(\n    events.session_id,\n    time_range=pw.reducers.count(),\n    events_list=pw.reducers.collect(\n        pw.this.event_data\n    )[-100:]  # 最近100个事件\n)\n```\n\n## 与其他模块的关系\n\n```mermaid\ngraph TD\n    Table[表操作] --> Column[列操作]\n    Table --> Expression[表达式]\n    Table --> Connector[连接器]\n    Column --> Expression\n    Expression --> Compute[计算引擎]\n    Connector --> Input[输入数据]\n    Compute --> Output[输出数据]\n    Compute --> Persistence[持久化]\n```\n\n- **列操作**（`column.py`）：定义列级别的表达式和转换\n- **表达式**（`expression.py`）：处理数据计算和条件逻辑\n- **连接器**（`pw.io.*`）：负责数据的输入输出\n\n## 错误处理\n\n### 失败状态检测\n\n```python\n# 检查表是否处于失败状态\nif table.failed():\n    logger.error(\"数据处理失败，请检查输入数据\")\n```\n\n### 追踪与调试\n\n```python\n# 获取错误追踪信息\nclass Trace:\n    file_name: str\n    line_number: int\n    line: str\n    function: str\n```\n\n## 小结\n\nPathway 的表操作模块构成了整个框架的核心，提供了：\n\n- **统一的数据模型**：通过 `Table` 类实现批流统一的表格操作\n- **声明式的转换语法**：使用 Python 表达式定义复杂的数据处理逻辑\n- **高效的增量计算**：基于 Differential Dataflow 的增量更新机制\n- **丰富的连接和聚合能力**：支持多种连接类型和聚合函数\n\n熟练掌握表操作是构建高效 Pathway 数据处理流水线的基础。\n\n---\n\n<a id='trans-joins-aggregations'></a>\n\n## 连接与聚合\n\n### 相关页面\n\n相关主题：[表操作](#trans-table-operations)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/internals/joins.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/joins.py)\n- [python/pathway/internals/groupbys.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/groupbys.py)\n- [python/pathway/internals/reducers.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/reducers.py)\n- 
[python/pathway/internals/custom_reducers.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/custom_reducers.py)\n</details>\n\n# 连接与聚合\n\nPathway 是一个基于 Python 的 ETL 框架，其核心由 Rust 引擎驱动，底层采用 Differential Dataflow 实现增量计算。**连接（Join）与聚合（Aggregation）** 是 Pathway 数据处理管线中的两个核心操作，它们使得实时流处理和批处理能够以统一的方式进行。\n\n---\n\n## 1. 概述\n\n在 Pathway 中，连接与聚合承担着数据管道中的关键转换角色：\n\n| 功能 | 作用 | 典型场景 |\n|------|------|----------|\n| **连接（Join）** | 将两个或多个数据源按指定键进行关联，产生新的组合记录 | 实时关联用户行为与配置文件 |\n| **聚合（Aggregation）** | 按分组键对数据进行汇总计算（计数、求和、平均等） | 统计每小时的访问量 |\n| **自定义聚合（Custom Reducers）** | 支持用户定义的聚合逻辑 | 复杂业务规则下的数据汇总 |\n\nPathway 的连接与聚合操作具有以下特性：\n\n- **增量计算**：仅处理自上次计算以来发生变化的数据，而非全量重算\n- **幂等性**：相同输入产生相同输出，支持重放（replay）\n- **统一 API**：流式数据和静态数据使用相同代码\n- **Rust 引擎加速**：核心计算在 Rust 层执行，保证高性能\n\n> 资料来源：[README.md]() 中关于 \"incremental computation\" 和 \"stream processing\" 的描述\n\n---\n\n## 2. 架构设计\n\n### 2.1 核心层次结构\n\n```mermaid\ngraph TD\n    A[Python API 层] --> B[Python Internals 层]\n    B --> C[Rust 引擎层]\n    C --> D[Differential Dataflow]\n    D --> E[Timely Dataflow]\n    \n    F[joins.py] --> B\n    G[groupbys.py] --> B\n    H[reducers.py] --> B\n    I[custom_reducers.py] --> B\n    \n    J[python_api.rs] --> C\n    K[dataflow.rs] --> C\n```\n\nPathway 的连接与聚合功能跨越三个主要层次：\n\n| 层次 | 文件/模块 | 职责 |\n|------|-----------|------|\n| **Python API** | `pathway/internals/*.py` | 提供用户友好的 Python 接口 |\n| **Python Internals** | `joins.py`, `groupbys.py`, `reducers.py` | 实现连接、分组、聚合的 Python 逻辑 |\n| **Rust 引擎** | `src/python_api.rs`, `src/engine/dataflow.rs` | 底层计算引擎绑定 |\n\n### 2.2 连接操作的数据流\n\n```mermaid\ngraph LR\n    A[Table A] --> D[Join 操作]\n    B[Table B] --> D\n    D --> E[Arrangement 索引]\n    E --> F[匹配键值对]\n    F --> G[结果 Collection]\n    \n    H[增量更新] --> E\n    I[键删除] --> F\n```\n\nPathway 使用 Differential Dataflow 的 `join_core` 方法实现连接。连接过程涉及数据的**排列（Arrangement）**，这是实现高效增量连接的关键数据结构。\n\n> 资料来源：[external/differential-dataflow/src/operators/join.rs:28-42]() 中 `join_core` 方法定义\n\n---\n\n## 3. 
连接（Join）操作详解\n\n### 3.1 Join 方法签名与参数\n\n根据 Pathway 的架构，Join 操作在 Rust 层暴露为 Python 可调用的接口：\n\n| 参数 | 类型 | 说明 |\n|------|------|------|\n| `stream2` | `Arranged<G, Tr2>` | 第二个输入流（已排列） |\n| `result` | `FnMut(&K, &V, &Tr2::Val) -> I` | 结果生成函数 |\n| `key` | - | 连接键提取（隐含在方法中） |\n\n```rust\n// 资料来源：external/differential-dataflow/src/operators/join.rs\nfn join_core<Tr2, I, L>(&self, stream2: &Arranged<G, Tr2>, result: L) -> Collection<G, I::Item, R::Output>\nwhere\n    Tr2: TraceReader<Key=K, Time=G::Timestamp> + Clone + 'static,\n    R: Multiply<Tr2::R>,\n    L: FnMut(&K, &V, &Tr2::Val) -> I + 'static,\n```\n\n### 3.2 不安全变体：join_core_internal_unsafe\n\n除了标准的 `join_core`，Differential Dataflow 还提供了**不安全变体**，允许更灵活的处理：\n\n```rust\n// 资料来源：external/differential-dataflow/src/operators/join.rs\nfn join_core_internal_unsafe<Tr2, I, D, ROut, L>(\n    &self,\n    stream2: &Arranged<G, Tr2>,\n    result: L\n) -> Collection<G, D, ROut>\nwhere\n    // ... 允许返回 (data, time, diff) 三元组\n    L: FnMut(&K, &V, &Tr2::Val, &G::Timestamp, &R, &Tr2::R) -> I + 'static,\n```\n\n此方法允许回调函数访问时间戳和差分（diff）信息，适用于需要精细控制的场景。\n\n### 3.3 Arrangement 机制\n\nArrangement 是 Pathway 高效连接的核心。每个参与连接的表都需要预先建立索引：\n\n```mermaid\ngraph TD\n    A[输入数据] --> B[ArrangeByKey]\n    B --> C[OrderedLayer 存储结构]\n    C --> D[键索引 BTreeMap]\n    D --> E[值列表 Vec]\n    \n    F[新数据到达] --> G[增量更新 Arrangement]\n    G --> C\n```\n\nPathway 在 `ordered.rs` 中扩展了 Differential Dataflow 的有序层结构：\n\n```rust\n// 资料来源：external/differential-dataflow/src/trace/layers/ordered.rs\nfn step(&mut self, storage: &OrderedLayer<K, L, O, C>) {\n    // [Pathway extension]: pos_nonnegative indicates that index was moved to \n    // [Pathway extension]: negative value, here we undo it\n    if self.pos_nonnegative { \n        self.pos += 1; \n    } else {\n        self.pos_nonnegative = true;\n    }\n    // ...\n}\n```\n\n> 资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs]() 中 `import_named` 方法\n\n---\n\n## 4. 
聚合（Aggregation）操作详解\n\n### 4.1 聚合方法概述\n\nPathway 通过 `reduce` 操作族实现聚合功能。Differential Dataflow 提供了多种聚合变体：\n\n| 方法 | 用途 |\n|------|------|\n| `reduce` | 基础聚合操作 |\n| `reduce_abelian` | 阿贝尔群（Abelian）聚合，优化数学运算 |\n| `threshold` | 阈值过滤后聚合 |\n| `count` | 计数聚合 |\n\n### 4.2 Count 聚合实现\n\n计数是常用的聚合操作，Pathway 的实现如下：\n\n```rust\n// 资料来源：external/differential-dataflow/src/operators/reduce.rs\nfn count(&self) -> Collection<G, (K, R), isize> {\n    self.count_core()\n}\n```\n\n### 4.3 Threshold 聚合\n\n阈值聚合允许在聚合前进行条件过滤：\n\n```rust\n// 资料来源：external/differential-dataflow/src/operators/reduce.rs\nfn threshold_named<R2: Abelian, F: FnMut(&K, &R1) -> R2 + 'static>(\n    &self,\n    name: &str,\n    mut thresh: F\n) -> Collection<G, K, R2> {\n    self.reduce_abelian::<_, DefaultKeyTrace<_, _, _>>(\n        name,\n        move |k, s, t| t.push(((), thresh(k, &s[0].1)))\n    )\n    .as_collection(|k, _| k.clone())\n}\n```\n\n---\n\n## 5. Universe 与 Keys 机制\n\n### 5.1 Universe 的作用\n\nUniverse 是 Pathway 引擎中管理数据键空间的核心结构：\n\n```rust\n// 资料来源：src/engine/dataflow.rs\nstruct Universe<S: MaybeTotalScope> {\n    data: Rc<UniverseData<S>>,\n}\n```\n\nUniverse 提供了三种数据来源方式：\n\n| 来源类型 | 说明 |\n|----------|------|\n| `FromCollection` | 从 Collection 直接创建 |\n| `FromArranged` | 从已排列的数据创建 |\n| `Consolidated` | 合并后的数据 |\n\n```rust\n// 资料来源：src/engine/dataflow.rs\nimpl<S: MaybeTotalScope> Universe<S> {\n    fn from_collection(keys: Keys<S>) -> Self {\n        let data = Rc::new(UniverseData::from_collection(keys));\n        Self::new(data)\n    }\n\n    fn from_arranged(keys: KeysArranged<S>) -> Self {\n        let data = Rc::new(UniverseData::from_arranged(keys));\n        Self::new(data)\n    }\n}\n```\n\n### 5.2 Keys 数据结构\n\nKeys 结构体提供了统一的接口来访问数据键：\n\n```mermaid\ngraph TD\n    A[Keys 枚举] --> B[FromCollection]\n    A --> C[FromArranged]\n    \n    B --> D[Keys::collection]\n    B --> E[Keys::arranged]\n    B --> F[Keys::consolidated]\n    \n    C --> D\n    C --> E\n    \n    E --> G[KeysArranged]\n    G --> 
H[Arranged 索引]\n```\n\n---\n\n## 6. Python API 层\n\n### 6.1 Computer 结构\n\n在 Rust-Python 绑定层，聚合逻辑通过 `Computer` 结构表示：\n\n```rust\n// 资料来源：src/python_api.rs\n#[pyclass(module = \"pathway.engine\")]\nstruct Computer {\n    fun: Py<PyAny>,\n    dtype: Py<PyAny>,\n    is_output: bool,\n    is_method: bool,\n    universe: Py<Universe>,\n    data: Value,\n    data_column: Option<Py<Column>>,\n}\n```\n\n`Computer` 的创建通过 `from_raising_fun` 方法：\n\n```rust\n// 资料来源：src/python_api.rs\n#[pymethods]\nimpl Computer {\n    #[new]\n    #[pyo3(signature = (\n        fun,\n        dtype,\n        is_output,\n        is_method,\n        universe,\n        data = Value::None,\n        data_column = None,\n    ))]\n    pub fn from_raising_fun(\n        py: Python,\n        fun: Py<PyAny>,\n        dtype: Py<PyAny>,\n        is_output: bool,\n        is_method: bool,\n        universe: Py<Universe>,\n        data: Value,\n        data_column: Option<Py<Column>>,\n    ) -> PyResult<Py<Self>> {\n        // ...\n    }\n}\n```\n\n### 6.2 时间戳与增量计算\n\nPathway 使用 `Timestamp` 类型管理增量计算的时间维度：\n\n```rust\n// 资料来源：src/engine/timestamp.rs\nimpl OriginalOrRetraction for Timestamp {\n    fn is_original(&self) -> bool {\n        self.0.is_multiple_of(2)\n    }\n}\n\nimpl NextRetractionTime for Timestamp {\n    fn next_retraction_time(&self) -> Self {\n        Self(self.0 + 1)\n    }\n}\n```\n\n时间戳设计遵循以下规则：\n- **偶数时间戳**：表示原始记录（original）\n- **奇数时间戳**：表示删除/撤回记录（retraction）\n\n---\n\n## 7. 
使用示例\n\n### 7.1 基础连接\n\n```python\nimport pathway as pw\n\n# 定义两个输入表\norders = pw.io.csv.read(\"./orders.csv\", schema=OrderSchema)\nproducts = pw.io.csv.read(\"./products.csv\", schema=ProductSchema)\n\n# 连接两个表\nresult = orders.join(\n    products,\n    orders.product_id == products.id\n).select(\n    order_id=orders.id,\n    product_name=products.name,\n    quantity=orders.quantity\n)\n```\n\n### 7.2 分组聚合\n\n```python\n# 按产品分组统计订单数量\nsales_summary = orders.groupby(orders.product_id).reduce(\n    orders.product_id,\n    total_quantity=pw.reducers.sum(orders.quantity),\n    order_count=pw.reducers.count()\n)\n```\n\n### 7.3 自定义聚合\n\n```python\nimport pathway as pw\n\n@pw.reducer\ndef weighted_average(values, weights):\n    total_weight = sum(weights)\n    if total_weight == 0:\n        return 0.0\n    return sum(v * w for v, w in zip(values, weights)) / total_weight\n\n# 使用自定义聚合\nresult = data.groupby(data.category).reduce(\n    category=data.category,\n    avg_rating=weighted_average(data.rating, data.importance)\n)\n```\n\n---\n\n## 8. 性能优化建议\n\n### 8.1 Arrangement 复用\n\n对于需要多次连接的表，预先建立 Arrangement 可以显著提升性能：\n\n| 策略 | 适用场景 | 收益 |\n|------|----------|------|\n| 预排序（Pre-arrange） | 同一表参与多次连接 | 减少重复索引开销 |\n| 键压缩 | 大键值数据 | 降低内存占用 |\n| 分区（Partitioning） | 超大规模数据 | 并行处理加速 |\n\n### 8.2 增量计算优势\n\nPathway 的增量计算模型特别适合以下场景：\n\n```mermaid\ngraph LR\n    A[新数据到达] --> B{检查变更}\n    B --> C[仅处理受影响键]\n    C --> D[更新结果]\n    D --> E[增量输出]\n    \n    F[传统批处理] --> G[全量重算]\n    G --> H[完整输出]\n```\n\n---\n\n## 9. 
相关源码文件索引\n\n| 文件路径 | 功能描述 |\n|----------|----------|\n| `python/pathway/internals/joins.py` | Python 层 Join 操作实现 |\n| `python/pathway/internals/groupbys.py` | Python 层分组操作实现 |\n| `python/pathway/internals/reducers.py` | 内置聚合函数 |\n| `python/pathway/internals/custom_reducers.py` | 自定义聚合装饰器 |\n| `src/python_api.rs` | Rust-Python 绑定层 |\n| `src/engine/dataflow.rs` | 引擎核心数据结构 |\n| `src/engine/timestamp.rs` | 时间戳管理 |\n| `external/differential-dataflow/src/operators/join.rs` | 底层 Join 操作 |\n| `external/differential-dataflow/src/operators/reduce.rs` | 底层聚合操作 |\n| `external/differential-dataflow/src/operators/arrange/agent.rs` | Arrangement 管理 |\n\n---\n\n## 10. 总结\n\nPathway 的连接与聚合系统是一个精心设计的分层架构：\n\n1. **Python API 层**提供直观易用的接口\n2. **Python Internals 层**处理业务逻辑与类型转换\n3. **Rust 引擎层**基于 Differential Dataflow 实现高效增量计算\n4. **Timely Dataflow 层**提供底层并行计算支持\n\n这种设计使得用户能够以简洁的 Python 代码表达复杂的数据处理逻辑，同时享受 Rust 引擎带来的高性能和增量计算的效率优势。\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：pathwaycom/pathway\n\n摘要：发现 22 个潜在踩坑项，其中 2 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any。\n\n## 1. 安装坑 · 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: TypeError: Cannot instantiate typing.Any\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d63b2462b18449c4a2c02bb5e02a96c8 | https://github.com/pathwaycom/pathway/issues/227 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 
安全/权限坑 · 来源证据：Entra Authentication Support (credential handler and/or password callback)\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Entra Authentication Support (credential handler and/or password callback)\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7c65f4d8bce4b1a9c99accbf0457633 | https://github.com/pathwaycom/pathway/issues/230 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：Automated schema exploration in input connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Automated schema exploration in input connectors\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_74b91feb391f4b0298216d2a48fe5abe | https://github.com/pathwaycom/pathway/issues/224 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 4. 安装坑 · 来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ab5d2b2076034999bfaed612a3d95d60 | https://github.com/pathwaycom/pathway/issues/232 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 
安装坑 · 来源证据：Improve watermarks in POSIX-like objects tracker\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Improve watermarks in POSIX-like objects tracker\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_38f29b1fba3b41cc99b1364d15ef7213 | https://github.com/pathwaycom/pathway/issues/225 | 来源讨论提到 node 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：Persistence in `iterate` operator\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Persistence in `iterate` operator\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e642d639ebbc4382b242028ef89ec236 | https://github.com/pathwaycom/pathway/issues/214 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 7. 安装坑 · 来源证据：Support LEANN for RAG pipelines\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support LEANN for RAG pipelines\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_5d99dd1cfc794b299633b6c5dc9dd33b | https://github.com/pathwaycom/pathway/issues/173 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 安装坑 · 来源证据：Support MongoDB Atlas in pw.io.mongodb\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support MongoDB Atlas in pw.io.mongodb\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fa1d6e2e977a45d99c414724681f0f32 | https://github.com/pathwaycom/pathway/issues/221 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 9. 
安装坑 · 来源证据：feat: Add Microsoft SQL Server (MSSQL) connector\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：feat: Add Microsoft SQL Server (MSSQL) connector\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_370bcc864a314734b602f01d3d444ef9 | https://github.com/pathwaycom/pathway/issues/204 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：v0.27.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.27.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8d6905d8719c488cbcbb821e794add26 | https://github.com/pathwaycom/pathway/releases/tag/v0.27.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 安装坑 · 来源证据：v0.29.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.29.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b25babd3e3cc49108e56daaaaae8c5bf | https://github.com/pathwaycom/pathway/releases/tag/v0.29.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 12. 安装坑 · 来源证据：v0.30.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.30.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_0291ee068e4e4d38aa86d0fd433b960a | https://github.com/pathwaycom/pathway/releases/tag/v0.30.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 13. 配置坑 · 来源证据：v0.28.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.28.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_708184d9c8ac445d923c82d38af7bd19 | https://github.com/pathwaycom/pathway/releases/tag/v0.28.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 
配置坑 · 来源证据：v0.30.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.30.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7a19bd03a49499e987781160e4b76f0 | https://github.com/pathwaycom/pathway/releases/tag/v0.30.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:571186647 | https://github.com/pathwaycom/pathway | README/documentation is current enough for a first validation pass.\n\n## 16. 运行坑 · 来源证据：`pw.io.mssql.read` needs LSN persistence\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：`pw.io.mssql.read` needs LSN persistence\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_59d927a667a843ccb7d0a4cd147a1dc0 | https://github.com/pathwaycom/pathway/issues/218 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 17. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | last_activity_observed missing\n\n## 18. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 19. 
安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 20. 安全/权限坑 · 来源证据：Support encryption in Kinesis connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Support encryption in Kinesis connectors\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f29097c3eced4a209e7c80971aeea40c | https://github.com/pathwaycom/pathway/issues/223 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 21. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | issue_or_pr_quality=unknown\n\n## 22. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | release_recency=unknown\n\n<!-- canonical_name: pathwaycom/pathway; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "Human Manual / 人类版说明书"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log / 踩坑日志\n\n项目：pathwaycom/pathway\n\n摘要：发现 22 个潜在踩坑项，其中 2 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any。\n\n## 1. 安装坑 · 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: TypeError: Cannot instantiate typing.Any\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d63b2462b18449c4a2c02bb5e02a96c8 | https://github.com/pathwaycom/pathway/issues/227 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 安全/权限坑 · 来源证据：Entra Authentication Support (credential handler and/or password callback)\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Entra Authentication Support (credential handler and/or password callback)\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7c65f4d8bce4b1a9c99accbf0457633 | https://github.com/pathwaycom/pathway/issues/230 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：Automated schema exploration in input connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Automated schema exploration in input connectors\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_74b91feb391f4b0298216d2a48fe5abe | https://github.com/pathwaycom/pathway/issues/224 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 4. 
安装坑 · 来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ab5d2b2076034999bfaed612a3d95d60 | https://github.com/pathwaycom/pathway/issues/232 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 安装坑 · 来源证据：Improve watermarks in POSIX-like objects tracker\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Improve watermarks in POSIX-like objects tracker\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_38f29b1fba3b41cc99b1364d15ef7213 | https://github.com/pathwaycom/pathway/issues/225 | 来源讨论提到 node 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：Persistence in `iterate` operator\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Persistence in `iterate` operator\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e642d639ebbc4382b242028ef89ec236 | https://github.com/pathwaycom/pathway/issues/214 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 7. 安装坑 · 来源证据：Support LEANN for RAG pipelines\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support LEANN for RAG pipelines\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_5d99dd1cfc794b299633b6c5dc9dd33b | https://github.com/pathwaycom/pathway/issues/173 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 
安装坑 · 来源证据：Support MongoDB Atlas in pw.io.mongodb\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support MongoDB Atlas in pw.io.mongodb\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fa1d6e2e977a45d99c414724681f0f32 | https://github.com/pathwaycom/pathway/issues/221 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 9. 安装坑 · 来源证据：feat: Add Microsoft SQL Server (MSSQL) connector\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：feat: Add Microsoft SQL Server (MSSQL) connector\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_370bcc864a314734b602f01d3d444ef9 | https://github.com/pathwaycom/pathway/issues/204 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：v0.27.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.27.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8d6905d8719c488cbcbb821e794add26 | https://github.com/pathwaycom/pathway/releases/tag/v0.27.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 安装坑 · 来源证据：v0.29.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.29.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b25babd3e3cc49108e56daaaaae8c5bf | https://github.com/pathwaycom/pathway/releases/tag/v0.29.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 12. 
安装坑 · 来源证据：v0.30.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.30.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_0291ee068e4e4d38aa86d0fd433b960a | https://github.com/pathwaycom/pathway/releases/tag/v0.30.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 13. 配置坑 · 来源证据：v0.28.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.28.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_708184d9c8ac445d923c82d38af7bd19 | https://github.com/pathwaycom/pathway/releases/tag/v0.28.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 配置坑 · 来源证据：v0.30.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.30.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7a19bd03a49499e987781160e4b76f0 | https://github.com/pathwaycom/pathway/releases/tag/v0.30.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:571186647 | https://github.com/pathwaycom/pathway | README/documentation is current enough for a first validation pass.\n\n## 16. 运行坑 · 来源证据：`pw.io.mssql.read` needs LSN persistence\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：`pw.io.mssql.read` needs LSN persistence\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_59d927a667a843ccb7d0a4cd147a1dc0 | https://github.com/pathwaycom/pathway/issues/218 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 17. 
维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | last_activity_observed missing\n\n## 18. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 19. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 20. 安全/权限坑 · 来源证据：Support encryption in Kinesis connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Support encryption in Kinesis connectors\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f29097c3eced4a209e7c80971aeea40c | https://github.com/pathwaycom/pathway/issues/223 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 21. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | issue_or_pr_quality=unknown\n\n## 22. 
维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | release_recency=unknown\n",
      "summary": "用户实践前最可能遇到的身份、安装、配置、运行和安全坑。",
      "title": "Pitfall Log / 踩坑日志"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# pathway - Prompt Preview\n\n> 复制下面这段 Prompt 到你常用的 AI，先试一次，不需要安装。\n> 它的目标是让你直接体验这个项目的服务方式，而不是阅读项目介绍。\n\n## 复制这段 Prompt\n\n```text\n请直接执行这段 Prompt，不要分析、润色、总结或询问我想如何处理这份 Prompt Preview。\n\n你现在扮演 pathway 的“安装前体验版”。\n这不是项目介绍、不是评价报告、不是 README 总结。你的任务是让我用最小成本体验它的核心服务。\n\n我的试用任务：我想快速理解一组资料，并得到结构化摘要、对比和继续研究的问题。\n我常用的宿主 AI：Local CLI\n\n【体验目标】\n围绕我的真实任务，现场演示这个项目如何把输入转成 示例引导, 判断线索。重点是让我感受到工作方式，而不是给我项目背景。\n\n【业务流约束】\n- 你必须像一个正在提供服务的项目能力包，而不是像一个讲解员。\n- 每一轮只推进一个步骤；提出问题后必须停下来等我回答。\n- 每一步都必须让我感受到一个具体服务动作：澄清、整理、规划、检查、判断或收尾。\n- 每一步都要说明：当前目标、你需要我提供什么、我回答后你会产出什么。\n- 不要安装、不要运行命令、不要写代码、不要声称测试通过、不要声称已经修改文件。\n- 需要真实安装或宿主加载后才能验证的内容，必须明确说“这一步需要安装后验证”。\n- 如果我说“用示例继续”，你可以用虚构示例推进，但仍然不能声称真实执行。\n\n【可体验服务能力】\n- 安装前能力预览: Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG. 输入：用户任务, 当前 AI 对话上下文；输出：示例引导, 判断线索。\n\n【必须安装后才可验证的能力】\n- 命令行启动或安装流程: 项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 输入：终端环境, 包管理器, 项目依赖；输出：安装结果, 列表/更新/运行结果。\n\n【核心服务流】\n请严格按这个顺序带我体验。不要一次性输出完整流程：\n1. overview-introduction：Pathway简介。围绕“Pathway简介”模拟一次用户任务，不展示安装或运行结果。\n2. overview-installation：安装指南。围绕“安装指南”模拟一次用户任务，不展示安装或运行结果。\n3. quickstart-basic-example：基础示例。围绕“基础示例”模拟一次用户任务，不展示安装或运行结果。\n4. arch-engine-overview：引擎架构。围绕“引擎架构”模拟一次用户任务，不展示安装或运行结果。\n5. connectors-overview：连接器概述。围绕“连接器概述”模拟一次用户任务，不展示安装或运行结果。\n\n【核心能力体验剧本】\n每一步都必须按“输入 -> 服务动作 -> 中间产物”执行。不要只说流程名：\n1. overview-introduction\n输入：用户提供的“Pathway简介”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n2. overview-installation\n输入：用户提供的“安装指南”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n3. quickstart-basic-example\n输入：用户提供的“基础示例”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n4. arch-engine-overview\n输入：用户提供的“引擎架构”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n5. 
connectors-overview\n输入：用户提供的“连接器概述”相关信息。\n服务动作：模拟项目在这一步的核心判断和整理方式。\n中间产物：一个可检查的小结果。\n\n【项目服务规则】\n这些规则决定你如何服务用户。不要解释规则本身，而要在每一步执行时遵守：\n- 先确认用户任务、输入材料和成功标准，再模拟项目能力。\n- 每一步都必须形成可检查的小产物，并等待用户确认后再继续。\n- 凡是需要安装、调用工具或访问外部服务的能力，都必须标记为安装后验证。\n\n【每一步的服务约束】\n- Step 1 / overview-introduction：Step 1 必须围绕“Pathway简介”形成一个小中间产物，并等待用户确认。\n- Step 2 / overview-installation：Step 2 必须围绕“安装指南”形成一个小中间产物，并等待用户确认。\n- Step 3 / quickstart-basic-example：Step 3 必须围绕“基础示例”形成一个小中间产物，并等待用户确认。\n- Step 4 / arch-engine-overview：Step 4 必须围绕“引擎架构”形成一个小中间产物，并等待用户确认。\n- Step 5 / connectors-overview：Step 5 必须围绕“连接器概述”形成一个小中间产物，并等待用户确认。\n\n【边界与风险】\n- 不要声称已经安装、运行、调用 API、读写本地文件或完成真实任务。\n- 安装前预览只能展示工作方式，不能证明兼容性、性能或输出质量。\n- 涉及安装、插件加载、工具调用或外部服务的能力必须安装后验证。\n\n【可追溯依据】\n这些路径只用于你内部校验或在我追问“依据是什么”时简要引用。不要在首次回复主动展开：\n- https://github.com/pathwaycom/pathway\n- https://github.com/pathwaycom/pathway#readme\n- README.md\n- pyproject.toml\n- rust-toolchain.toml\n- docs/2.developers/4.user-guide/10.introduction/20.installation.md\n- examples/notebooks/tutorials/installation_first_steps.ipynb\n- python/pathway/__init__.py\n- src/engine/mod.rs\n- src/engine/dataflow.rs\n- src/engine/graph.rs\n- external/differential-dataflow/src/lib.rs\n\n【首次问题规则】\n- 首次三问必须先确认用户目标、成功标准和边界，不要提前进入工具、安装或实现细节。\n- 如果后续需要技术条件、文件路径或运行环境，必须等用户确认目标后再追问。\n\n首次回复必须只输出下面 4 个部分：\n1. 体验开始：用 1 句话说明你将带我体验 pathway 的核心服务。\n2. 当前步骤：明确进入 Step 1，并说明这一步要解决什么。\n3. 你会如何服务我：说明你会先改变我完成任务的哪个动作。\n4. 只问我 3 个问题，然后停下等待回答。\n\n首次回复禁止输出：后续完整流程、证据清单、安装命令、项目评价、营销文案、已经安装或运行的说法。\n\nStep 1 / overview-introduction 的二轮协议：\n- 我回答首次三问后，你仍然停留在 Step 1 / overview-introduction，不要进入 Step 2。\n- 第二次回复必须产出 6 个部分：澄清后的任务定义、成功标准、边界条件、\n  2-3 个可选方案、每个方案的权衡、推荐方案。\n- 第二次回复最后必须问我是否确认推荐方案；只有我明确确认后，才能进入下一步。\n- 第二次回复禁止输出 git worktree、代码计划、测试文件、命令或真实执行结果。\n\n后续对话规则：\n- 我回答后，你先完成当前步骤的中间产物并等待确认；只有我确认后，才能进入下一步。\n- 每一步都要生成一个小的中间产物，例如澄清后的目标、计划草案、测试意图、验证清单或继续/停止判断。\n- 所有演示都写成“我会建议/我会引导/这一步会形成”，不要写成已经真实执行。\n- 不要声称已经测试通过、文件已修改、命令已运行或结果已产生。\n- 如果某个能力必须安装后验证，请直接说“这一步需要安装后验证”。\n- 如果证据不足，请明确说“证据不足”，不要补事实。\n```\n",
      "summary": "不安装项目也能感受能力节奏的安全试用 Prompt。",
      "title": "Prompt Preview / 安装前试用 Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / 官方入口\n\n项目：pathwaycom/pathway\n\n## 官方安装入口\n\n### Python / pip · 官方安装入口\n\n```bash\npip install -U pathway\n```\n\n来源：https://github.com/pathwaycom/pathway#readme\n\n## 来源\n\n- repo: https://github.com/pathwaycom/pathway\n- docs: https://github.com/pathwaycom/pathway#readme\n",
      "summary": "从项目官方 README 或安装文档提取的开工入口。",
      "title": "Quick Start / 官方入口"
    }
  },
  "validation_id": "dval_6b1c86605d304988ae43163ed46c8441"
}