{
  "canonical_name": "pathwaycom/pathway",
  "compilation_id": "pack_69a0c7a76b3749d9861b76893c0cb2b2",
  "created_at": "2026-05-16T21:38:13.680103+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=skill, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=skill, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `pip install -U pathway` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "pip install -U pathway",
      "sandbox_container_image": "python:3.12-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "deterministic_isolated_install",
      "sandbox_validation_id": "sbx_201d177b776449bf8ebb301268561a08"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_a6d2fff6b478ba4fbfe579de3aaab457",
    "canonical_name": "pathwaycom/pathway",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/pathwaycom/pathway",
    "slug": "pathway",
    "source_packet_id": "phit_b1ea0938e4c0431dacce0edfc388bf24",
    "source_validation_id": "dval_6b1c86605d304988ae43163ed46c8441"
  },
  "merchandising": {
    "best_for": "需要信息检索与知识管理能力，并使用 local_cli的用户",
    "github_forks": 1672,
    "github_stars": 63307,
    "one_liner_en": "Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.",
    "one_liner_zh": "Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.",
    "primary_category": {
      "category_id": "research-knowledge",
      "confidence": "high",
      "name_en": "Research & Knowledge",
      "name_zh": "信息检索与知识管理",
      "reason": "strong category phrase match from project identity and outcome"
    },
    "target_user": "使用 local_cli 等宿主 AI 的用户",
    "title_en": "pathway",
    "title_zh": "pathway 能力包",
    "visible_tags": [
      {
        "label_en": "MCP Tools",
        "label_zh": "MCP 工具",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-mcp-tools",
        "type": "product_domain"
      },
      {
        "label_en": "Knowledge Base Q&A",
        "label_zh": "知识库问答",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-knowledge-base-q-a",
        "type": "user_job"
      },
      {
        "label_en": "Natural-language Web Actions",
        "label_zh": "自然语言网页操作",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-natural-language-web-actions",
        "type": "core_capability"
      },
      {
        "label_en": "Checkpoint Resume",
        "label_zh": "断点恢复流程",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-checkpoint-resume",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Structured Data Extraction",
        "label_zh": "结构化数据提取",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-structured-data-extraction",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_b1ea0938e4c0431dacce0edfc388bf24",
  "page_model": {
    "artifacts": {
      "artifact_slug": "pathway",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "pip install -U pathway",
          "label": "Python / pip · 官方安装入口",
          "source": "https://github.com/pathwaycom/pathway#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "MCP 工具",
        "知识库问答",
        "自然语言网页操作",
        "断点恢复流程",
        "结构化数据提取"
      ],
      "eyebrow": "信息检索与知识管理",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要信息检索与知识管理能力，并使用 local_cli的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG."
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "local_cli",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "skill, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: TypeError: Cannot instantiate typing.Any",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_d63b2462b18449c4a2c02bb5e02a96c8 | https://github.com/pathwaycom/pathway/issues/227 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：[Bug]: TypeError: Cannot instantiate typing.Any",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Entra Authentication Support (credential handler and/or password callback)",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_e7c65f4d8bce4b1a9c99accbf0457633 | https://github.com/pathwaycom/pathway/issues/230 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Entra Authentication Support (credential handler and/or password callback)",
            "user_impact": "可能影响授权、密钥配置或安全边界。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Automated schema exploration in input connectors",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_74b91feb391f4b0298216d2a48fe5abe | https://github.com/pathwaycom/pathway/issues/224 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Automated schema exploration in input connectors",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_ab5d2b2076034999bfaed612a3d95d60 | https://github.com/pathwaycom/pathway/issues/232 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Improve watermarks in POSIX-like objects tracker",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_38f29b1fba3b41cc99b1364d15ef7213 | https://github.com/pathwaycom/pathway/issues/225 | 来源讨论提到 node 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Improve watermarks in POSIX-like objects tracker",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Persistence in `iterate` operator",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_e642d639ebbc4382b242028ef89ec236 | https://github.com/pathwaycom/pathway/issues/214 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Persistence in `iterate` operator",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support LEANN for RAG pipelines",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_5d99dd1cfc794b299633b6c5dc9dd33b | https://github.com/pathwaycom/pathway/issues/173 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Support LEANN for RAG pipelines",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support MongoDB Atlas in pw.io.mongodb",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_fa1d6e2e977a45d99c414724681f0f32 | https://github.com/pathwaycom/pathway/issues/221 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Support MongoDB Atlas in pw.io.mongodb",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：feat: Add Microsoft SQL Server (MSSQL) connector",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_370bcc864a314734b602f01d3d444ef9 | https://github.com/pathwaycom/pathway/issues/204 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：feat: Add Microsoft SQL Server (MSSQL) connector",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.27.0",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_8d6905d8719c488cbcbb821e794add26 | https://github.com/pathwaycom/pathway/releases/tag/v0.27.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.27.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.29.0",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_b25babd3e3cc49108e56daaaaae8c5bf | https://github.com/pathwaycom/pathway/releases/tag/v0.29.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.29.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.30.0",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_0291ee068e4e4d38aa86d0fd433b960a | https://github.com/pathwaycom/pathway/releases/tag/v0.30.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.30.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.28.0",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_708184d9c8ac445d923c82d38af7bd19 | https://github.com/pathwaycom/pathway/releases/tag/v0.28.0 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.28.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.30.1",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_e7a19bd03a49499e987781160e4b76f0 | https://github.com/pathwaycom/pathway/releases/tag/v0.30.1 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.30.1",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "README/documentation is current enough for a first validation pass.",
            "category": "能力坑",
            "evidence": [
              "capability.assumptions | github_repo:571186647 | https://github.com/pathwaycom/pathway | README/documentation is current enough for a first validation pass."
            ],
            "severity": "medium",
            "suggested_check": "将假设转成下游验证清单。",
            "title": "能力判断依赖假设",
            "user_impact": "假设不成立时，用户拿不到承诺的能力。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个运行相关的待验证问题：`pw.io.mssql.read` needs LSN persistence",
            "category": "运行坑",
            "evidence": [
              "community_evidence:github | cevd_59d927a667a843ccb7d0a4cd147a1dc0 | https://github.com/pathwaycom/pathway/issues/218 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：`pw.io.mssql.read` needs LSN persistence",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "发现 22 个潜在踩坑项，其中 2 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any。",
        "title": "踩坑日志"
      },
      "snapshot": {
        "contributors": 39,
        "forks": 1672,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": 63307
      },
      "source_url": "https://github.com/pathwaycom/pathway",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.",
      "title": "pathway 能力包",
      "trial_prompt": "# pathway - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for pathwaycom/pathway.\n\nProject:\n- Name: pathway\n- Repository: https://github.com/pathwaycom/pathway\n- Summary: Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. page-introduction: Pathway Introduction. Produce one small intermediate artifact and wait for confirmation.\n2. page-concepts: Core Concepts. Produce one small intermediate artifact and wait for confirmation.\n3. page-architecture: System Architecture. Produce one small intermediate artifact and wait for confirmation.\n4. 
page-python-api: Python API Structure. Produce one small intermediate artifact and wait for confirmation.\n5. page-data-model: Data Model and Type System. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/pathwaycom/pathway\n- https://github.com/pathwaycom/pathway#readme\n- README.md\n- python/pathway/__init__.py\n- pyproject.toml\n- python/pathway/internals/schema.py\n- python/pathway/internals/table.py\n- python/pathway/internals/datasource.py\n- docs/2.developers/4.user-guide/10.introduction/50.concepts.md\n- docs/2.developers/4.user-guide/10.introduction/70.streaming-and-static-modes.md\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "voices": [
        {
          "body": "来源平台：github。github/github_issue: Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in（https://github.com/pathwaycom/pathway/issues/232）；github/github_issue: Entra Authentication Support (credential handler and/or password callbac（https://github.com/pathwaycom/pathway/issues/230）；github/github_issue: [Bug]: TypeError: Cannot instantiate typing.Any（https://github.com/pathwaycom/pathway/issues/227）；github/github_issue: Improve watermarks in POSIX-like objects tracker（https://github.com/pathwaycom/pathway/issues/225）；github/github_issue: Automated schema exploration in input connectors（https://github.com/pathwaycom/pathway/issues/224）；github/github_issue: Support encryption in Kinesis connectors（https://github.com/pathwaycom/pathway/issues/223）；github/github_issue: Support external replication slots in `pw.io.postgres.read`; add persist（https://github.com/pathwaycom/pathway/issues/222）；github/github_issue: Support MongoDB Atlas in pw.io.mongodb（https://github.com/pathwaycom/pathway/issues/221）；github/github_issue: Support LEANN for RAG pipelines（https://github.com/pathwaycom/pathway/issues/173）；github/github_issue: feat: Add Microsoft SQL Server (MSSQL) connector（https://github.com/pathwaycom/pathway/issues/204）；github/github_issue: Persistence in `iterate` operator（https://github.com/pathwaycom/pathway/issues/214）；github/github_issue: `pw.io.mssql.read` needs LSN persistence（https://github.com/pathwaycom/pathway/issues/218）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in",
              "url": "https://github.com/pathwaycom/pathway/issues/232"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Entra Authentication Support (credential handler and/or password callbac",
              "url": "https://github.com/pathwaycom/pathway/issues/230"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "[Bug]: TypeError: Cannot instantiate typing.Any",
              "url": "https://github.com/pathwaycom/pathway/issues/227"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Improve watermarks in POSIX-like objects tracker",
              "url": "https://github.com/pathwaycom/pathway/issues/225"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Automated schema exploration in input connectors",
              "url": "https://github.com/pathwaycom/pathway/issues/224"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Support encryption in Kinesis connectors",
              "url": "https://github.com/pathwaycom/pathway/issues/223"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Support external replication slots in `pw.io.postgres.read`; add persist",
              "url": "https://github.com/pathwaycom/pathway/issues/222"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Support MongoDB Atlas in pw.io.mongodb",
              "url": "https://github.com/pathwaycom/pathway/issues/221"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Support LEANN for RAG pipelines",
              "url": "https://github.com/pathwaycom/pathway/issues/173"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "feat: Add Microsoft SQL Server (MSSQL) connector",
              "url": "https://github.com/pathwaycom/pathway/issues/204"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Persistence in `iterate` operator",
              "url": "https://github.com/pathwaycom/pathway/issues/214"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "`pw.io.mssql.read` needs LSN persistence",
              "url": "https://github.com/pathwaycom/pathway/issues/218"
            }
          ],
          "status": "已收录 12 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "信息检索与知识管理",
      "desc": "Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.",
      "effort": "安装已验证",
      "forks": 1672,
      "icon": "search",
      "name": "pathway 能力包",
      "risk": "可发布",
      "slug": "pathway",
      "stars": 63307,
      "tags": [
        "MCP 工具",
        "知识库问答",
        "自然语言网页操作",
        "断点恢复流程",
        "结构化数据提取"
      ],
      "thumb": "blue",
      "type": "Skill Pack"
    },
    "manual": {
      "markdown": "# https://github.com/pathwaycom/pathway 项目说明书\n\n生成时间：2026-05-16 21:24:03 UTC\n\n## 目录\n\n- [Pathway Introduction](#page-introduction)\n- [Core Concepts](#page-concepts)\n- [System Architecture](#page-architecture)\n- [Python API Structure](#page-python-api)\n- [Data Model and Type System](#page-data-model)\n- [Data Connectors](#page-connectors)\n- [Custom Connectors](#page-connectors-custom)\n- [Data Transformations](#page-transformations)\n- [Temporal and Time-Based Processing](#page-temporal-processing)\n- [LLM xPack](#page-llm-xpack)\n\n<a id='page-introduction'></a>\n\n## Pathway Introduction\n\n### 相关页面\n\n相关主题：[Core Concepts](#page-concepts), [System Architecture](#page-architecture)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md)\n- [examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n</details>\n\n# Pathway Introduction\n\nPathway is a Python ETL framework designed for stream processing, real-time analytics, LLM pipelines, and Retrieval-Augmented Generation (RAG). 
It provides an easy-to-use Python API that seamlessly integrates with popular Python ML libraries, enabling developers to build robust data processing pipelines that work in both development and production environments.\n\n## Overview\n\nPathway combines the flexibility of Python with the performance of a scalable Rust engine built on Differential Dataflow. The framework performs incremental computation, allowing it to efficiently handle both batch and streaming data with the same code base.\n\n### Key Characteristics\n\n| Feature | Description |\n|---------|-------------|\n| **Language** | Python with Rust engine |\n| **Processing Model** | Incremental computation via Differential Dataflow |\n| **Data Modes** | Batch processing and streaming |\n| **Deployment** | Local development, CI/CD, Docker, production |\n| **License** | BSL (Business Source License) |\n\n资料来源：[README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n## Architecture\n\nPathway's architecture leverages a multi-layered design that separates the Python interface from the high-performance Rust computation engine.\n\n```mermaid\ngraph TD\n    subgraph Python_Interface[\"Python Layer\"]\n        A[User Code] --> B[pathway API]\n        B --> C[Python Subject]\n        B --> D[Python Connectors]\n    end\n    \n    subgraph Rust_Engine[\"Rust Engine\"]\n        E[Expression Engine] --> F[Differential Dataflow]\n        F --> G[Timely Dataflow]\n        C --> E\n        D --> E\n    end\n    \n    subgraph Output[\"Output Layer\"]\n        G --> H[Exported Tables]\n        H --> I[Snapshot Data]\n    end\n```\n\n### Core Components\n\n| Component | Role |\n|-----------|------|\n| **PythonSubject** | Wraps Python callbacks for data input |\n| **ValueField** | Defines schema fields with types and defaults |\n| **PyExpression** | Encapsulates expressions with optional GIL release |\n| **PyExportedTable** | Provides snapshot and frontier access to output data |\n| **ConnectorMode** | 
Distinguishes between STATIC and STREAMING modes |\n\n资料来源：[src/python_api.rs:25-35](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs#L25-L35)\n\n## Data Model\n\n### Schema Fields\n\nPathway uses `ValueField` to define table schemas with the following properties:\n\n```python\n@dataclass\nclass ValueField:\n    name: str           # Field identifier\n    type_: Type         # Data type\n    source: FieldSource # Origin of the field (Payload, Key, etc.)\n    default: Optional[Value]  # Default value if not present\n    metadata: Optional[str]   # Additional metadata as JSON string\n```\n\nThe `source` field determines how a field is populated:\n\n| FieldSource | Description |\n|-------------|-------------|\n| `FieldSource.Payload` | Field comes from the input data |\n| `FieldSource.Key` | Field is used as a key identifier |\n| `FieldSource.PSEUDO` | Synthetic field generated by the system |\n\n资料来源：[src/python_api.rs:47-55](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs#L47-L55)\n\n### Timestamps and Ordering\n\nPathway implements timestamp-based ordering for incremental computation. 
Each timestamp has special semantics for distinguishing original data from retractions.\n\n```rust\nimpl OriginalOrRetraction for Timestamp {\n    fn is_original(&self) -> bool {\n        self.0.is_multiple_of(2)\n    }\n}\n\nimpl NextRetractionTime for Timestamp {\n    fn next_retraction_time(&self) -> Self {\n        Self(self.0 + 1)\n    }\n}\n```\n\nThe timestamp system ensures that even-numbered timestamps represent original data while odd-numbered timestamps represent retractions, enabling efficient differential updates.\n\n资料来源：[src/engine/timestamp.rs:48-60](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs#L48-L60)\n\n## Processing Modes\n\nPathway supports two distinct connector modes:\n\n### Static Mode\n\nIn static mode, data is processed in batches and the pipeline completes once all input is consumed.\n\n```python\npw.io.http.PathwayWebserver(mode=pw.io.http.SocketWriterMode.STATIC)\n```\n\n### Streaming Mode\n\nIn streaming mode, the pipeline remains active and continuously processes incoming data updates.\n\n```python\npw.io.http.PathwayWebserver(mode=pw.io.http.SocketWriterMode.STREAMING)\n```\n\n| Mode | Use Case | Behavior |\n|------|----------|----------|\n| `STATIC` | Batch jobs, one-off processing | Exits after all data processed |\n| `STREAMING` | Live data, real-time analytics | Continuously processes updates |\n\n资料来源：[src/python_api.rs:280-290](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs#L280-L290)\n\n## Expression Engine\n\nPathway's expression engine supports both unary and binary operations on data. 
The engine is implemented in Rust for performance but callable from Python.\n\n```mermaid\ngraph LR\n    A[PyExpression] -->|unary_op!| B[Unary Expression]\n    A -->|binary_op!| C[Binary Expression]\n    B --> D[Result Expression]\n    C --> D\n```\n\n### Supported Operations\n\nThe `PyExpression` class wraps Rust expressions and tracks whether the GIL (Global Interpreter Lock) should be held during evaluation:\n\n```rust\npub struct PyExpression {\n    inner: Arc<Expression>,\n    gil: bool,  // Whether GIL is required\n}\n```\n\nOperations automatically set the `gil` flag based on whether any operand requires the Python GIL, optimizing performance when Python code is not involved.\n\n资料来源：[src/python_api.rs:360-375](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs#L360-L375)\n\n## Frontier and Snapshots\n\nPathway uses the concept of \"frontiers\" to track progress in incremental computation. The `TotalFrontier` enum represents the state of computation:\n\n```rust\npub enum TotalFrontier<T> {\n    At(T),      // Computation at specific timestamp\n    Done,       // Computation complete\n}\n```\n\n### Snapshot Access\n\nThe `PyExportedTable` provides methods to access data at specific frontiers:\n\n| Method | Description |\n|--------|-------------|\n| `frontier()` | Returns the current computation frontier |\n| `snapshot_at(frontier)` | Returns all key-value pairs up to the specified frontier |\n| `failed()` | Indicates if the computation has failed |\n\n```python\n# Access snapshot at current frontier\ncurrent_frontier = table.frontier()\ndata = table.snapshot_at(current_frontier)\n```\n\n资料来源：[src/python_api.rs:200-220](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs#L200-L220)\n\n## Differential Dataflow Integration\n\nPathway's Rust engine is built on Differential Dataflow, an extension of Timely Dataflow that supports incremental computation with efficient handling of updates and retractions.\n\n```mermaid\ngraph TD\n    
subgraph Timely_Dataflow[\"Timely Dataflow\"]\n        A[Input] --> B[Operator]\n        B --> C[Probe]\n    end\n    \n    subgraph Differential_Extension[\"Differential Dataflow\"]\n        B --> D[Arrangement]\n        D --> E[Trace Agent]\n        E --> F[Collection]\n    end\n```\n\nThe `arrange` operator imports arranged data into a computation scope:\n\n```rust\npub fn import<G>(&mut self, scope: &G) -> Arranged<G, TraceAgent<Tr>>\nwhere\n    G: Scope<Timestamp=Tr::Time>,\n    Tr::Time: Timestamp,\n{\n    self.import_named(scope, \"ArrangedSource\")\n}\n```\n\n资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:45-55](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs#L45-L55)\n\n## Common Use Cases\n\n### Real-time RAG Pipelines\n\nPathway excels at building Retrieval-Augmented Generation pipelines that require real-time document indexing and querying.\n\n```\nDocuments --> Pathway VectorStoreServer --> REST API /v1/retrieve\n                                                    |\nUser Query --> HTTP POST --> Pathway --> LLM --> Response\n```\n\nExample configuration:\n\n```python\nimport pathway as pw\n\n# Start the vector store server\nwebserver = pw.io.http.PathwayWebserver(host=\"0.0.0.0\", port=8011)\n\n# Query endpoint\ncurl --data '{ \"messages\": \"What is the value of X?\"}' http://localhost:8011\n```\n\n资料来源：[examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md)\n\n### Multi-Agent RAG Systems\n\nPathway supports multi-agent architectures where different agents collaborate to answer complex queries:\n\n```mermaid\ngraph LR\n    A[Documents] --> B[Pathway VectorStoreServer]\n    B --> C[REST API]\n    C --> D[UserProxy Agent]\n    D --> E[Researcher Agent]\n    D --> F[Analyst Agent]\n    E & F --> G[Grounded 
Answers]\n```\n\n资料来源：[examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md)\n\n## Running Pathway Applications\n\n### Local Execution\n\nRun Pathway applications like standard Python scripts:\n\n```bash\npython main.py\n```\n\n### Multi-threaded Execution\n\nPathway natively supports multithreading:\n\n```bash\npathway spawn --threads 3 python main.py\n```\n\n### Docker Deployment\n\nPathway provides official Docker images for containerized execution:\n\n```dockerfile\nFROM pathwaycom/pathway:latest\n\nWORKDIR /app\nCOPY requirements.txt ./\nRUN pip install --no-cache-dir -r requirements.txt\nCOPY . .\n\nCMD [ \"python\", \"./your-script.py\" ]\n```\n\n资料来源：[README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n## Summary\n\nPathway provides a powerful framework for building data processing pipelines with the following advantages:\n\n- **Python-first API**: Write pipelines in familiar Python syntax\n- **Rust performance**: Leverage Differential Dataflow for efficient incremental computation\n- **Unified code**: Same code works for batch and streaming scenarios\n- **Production-ready**: Built-in monitoring, logging, and Docker support\n- **LLM integration**: First-class support for RAG and AI pipelines\n\nThe framework is particularly well-suited for applications requiring real-time data processing, such as live document indexing, dynamic analytics, and AI-powered search systems.\n\n---\n\n<a id='page-concepts'></a>\n\n## Core Concepts\n\n### 相关页面\n\n相关主题：[Pathway Introduction](#page-introduction), [Data Model and Type System](#page-data-model)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/internals/schema.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/schema.py)\n- [python/pathway/internals/table.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/table.py)\n- 
[python/pathway/internals/datasource.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/datasource.py)\n- [docs/2.developers/4.user-guide/10.introduction/50.concepts.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/10.introduction/50.concepts.md)\n- [docs/2.developers/4.user-guide/10.introduction/70.streaming-and-static-modes.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/10.introduction/70.streaming-and-static-modes.md)\n</details>\n\n# Core Concepts\n\nPathway is a Python ETL framework built on a scalable Rust engine that leverages Differential Dataflow for incremental computation. Understanding its core concepts is essential for building robust data processing pipelines that work seamlessly in both development and production environments.\n\n## Architecture Overview\n\nPathway's architecture combines Python usability with Rust performance through a carefully designed abstraction layer. The Python API provides the user-facing interface, while the Rust engine handles the actual computation using Differential Dataflow's incremental processing model.\n\n```mermaid\ngraph TB\n    subgraph Python_API[\"Python API Layer\"]\n        Schema[\"Schema\"]\n        Table[\"Table\"]\n        Connector[\"Connectors\"]\n        UDF[\"User-Defined Functions\"]\n    end\n    \n    subgraph Rust_Engine[\"Rust Engine\"]\n        DiffDataflow[\"Differential Dataflow\"]\n        IncrementalComp[\"Incremental Computation\"]\n        Persistence[\"Persistence Layer\"]\n    end\n    \n    subgraph DataStreams[\"Data Streams\"]\n        Static[\"Static/Batch Data\"]\n        Streaming[\"Real-time Streams\"]\n    end\n    \n    Python_API --> Rust_Engine\n    DataStreams --> Rust_Engine\n```\n\nThe framework is designed so that the same code can run in multiple execution modes without modification, supporting local development, CI/CD tests, batch jobs, stream replays, and live data 
processing.\n\n## Tables\n\nTables are the fundamental data structure in Pathway, representing collections of typed rows with a defined schema. Every table has a primary key that uniquely identifies each row.\n\n### Table Structure\n\nFrom the Rust engine perspective, a `Table` consists of:\n\n| Component | Description | Rust Type |\n|-----------|-------------|-----------|\n| Scope | Execution scope for the table | `Scope` |\n| Handle | Internal reference to table data | `TableHandle` |\n| Columns | Typed data columns | `Vec<Column>` |\n| Universe | Set of all keys in the table | `Universe` |\n\nThe `Table` class wraps the Rust engine's table representation and caches Python instances to avoid redundant object creation.\n\n### Table Operations\n\nTables support a rich set of operations:\n\n- **Selection**: Filter rows based on conditions\n- **Projection**: Select and transform columns\n- **Joins**: Combine tables based on keys\n- **Aggregations**: Group and aggregate data\n- **Updates**: Modify existing rows\n\nAll operations are lazy—they don't execute until the computation is triggered by `pw.run()`.\n\n## Schemas\n\nSchemas define the structure of tables, specifying column names, types, and metadata. 
They serve as the contract between data sources and the processing logic.\n\n### Schema Definition\n\nSchemas are typically defined by subclassing `pw.Schema` and declaring columns with `pw.column_definition`:\n\n```python\nclass DocumentSchema(pw.Schema):\n    id = pw.column_definition(type_=pw.Type.STRING)\n    content = pw.column_definition(type_=pw.Type.STRING)\n    timestamp = pw.column_definition(type_=pw.Type.DATETIME)\n    embedding = pw.column_definition(type_=pw.Type.FLOAT_ARRAY)\n```\n\n### Column Types\n\n| Type | Python Representation | Use Case |\n|------|---------------------|----------|\n| `STRING` | `str` | Text data, identifiers |\n| `INT` | `int` | Integer values, counts |\n| `FLOAT` | `float` | Decimal numbers |\n| `BOOL` | `bool` | True/false values |\n| `DATETIME` | `datetime` | Timestamps and dates |\n| `FLOAT_ARRAY` | `List[float]` | Embeddings, vectors |\n| `DICT` | `dict` | Nested structured data |\n\n### Primary Keys\n\nEach schema must define a primary key that uniquely identifies rows:\n\n```python\nclass InputSchema(pw.Schema):\n    doc_id = pw.column_definition(type_=pw.Type.STRING)\n    content = pw.column_definition(type_=pw.Type.STRING)\n    \n    @staticmethod\n    def primary_key():\n        return InputSchema.doc_id\n```\n\n## Connectors and Data Sources\n\nConnectors bridge external data sources with Pathway's internal table representation. 
They handle reading data from various sources and writing results to destinations.\n\n### Input Connectors\n\nPathway provides input connectors for multiple data sources:\n\n| Connector | Description | Protocol |\n|-----------|-------------|----------|\n| `pw.io.filesystem.read` | File system reading | Batch/Streaming |\n| `pw.io.http.read` | HTTP endpoint polling | Streaming |\n| `pw.io.rdkafka.read` | Kafka consumer | Streaming |\n| `pw.io.amqp.read` | AMQP message queue | Streaming |\n| `pw.io.gcp_storage.read` | Google Cloud Storage | Batch |\n| `pw.io.s3.read` | Amazon S3 | Batch |\n| `pw.io.azure_blob.read` | Azure Blob Storage | Batch |\n| `pw.io.postgres.read` | PostgreSQL database | Batch/Streaming |\n\n### Output Connectors\n\nOutput connectors write processed data to external systems:\n\n```python\npw.io.json_lines.write(\n    table=result_table,\n    path=\"./output/results.jsonl\"\n)\n```\n\n### PythonSubject for Custom Sources\n\nFor custom data sources, Pathway provides the `PythonSubject` class that wraps Python callback functions:\n\n```python\npw.io.python.connector(\n    schema=MySchema,\n    start_function=start_callback,\n    read_function=read_callback,\n    seek_function=seek_callback,\n    stop_function=stop_callback\n)\n```\n\nThe subject handles the lifecycle of data reading with callbacks for initialization, data retrieval, and seeking to specific positions.\n\n## Data Flow and Processing\n\nPathway processes data through a directed acyclic graph (DAG) of operations. Each operation transforms input tables into output tables.\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Source Connector]\n    B --> C[Raw Table]\n    C --> D[Transformations]\n    D --> E[Joins & Aggregations]\n    E --> F[Computed Table]\n    F --> G[Output Connector]\n    G --> H[Destination]\n    \n    style C fill:#e1f5fe\n    style F fill:#e8f5e8\n```\n\n### Lazy Evaluation\n\nAll Pathway operations are lazily evaluated. 
The computation graph is built incrementally as operations are added, but no actual data processing occurs until `pw.run()` is called.\n\n### Incremental Computation\n\nThe Rust engine uses Differential Dataflow to perform incremental computation. When input data changes, only the affected portions of the computation are re-executed. This is particularly valuable for streaming scenarios where data arrives continuously.\n\n## Execution Modes\n\nPathway supports two primary execution modes that determine how data is processed over time.\n\n### Static Mode (Batch Processing)\n\nIn static mode, Pathway processes finite datasets completely before producing results. The computation terminates when all input has been consumed.\n\n```mermaid\ngraph LR\n    A[All Input Data] --> B[Process Entire Dataset]\n    B --> C[Single Output Result]\n    \n    style A fill:#fff3e0\n    style C fill:#e8f5e8\n```\n\nCharacteristics:\n- Processes data in batches\n- Terminates after completion\n- Suitable for ETL jobs\n- Optimized for throughput\n\n### Streaming Mode\n\nIn streaming mode, Pathway processes data continuously as it arrives, updating results incrementally.\n\n```mermaid\ngraph LR\n    A[Data Stream] --> B[Continuous Processing]\n    B --> C[Incremental Updates]\n    C --> D[Live Results]\n    D --> B\n    \n    style A fill:#e3f2fd\n    style D fill:#e8f5e8\n```\n\nCharacteristics:\n- Processes data as it arrives\n- Results update continuously\n- Suitable for real-time analytics\n- Maintains state over time\n\n### Unified API\n\nThe same Pathway code works in both modes:\n\n```python\n# Works identically in both modes\nresult = (\n    input_table\n    .filter(lambda row: row.value > threshold)\n    .groupby(lambda row: row.category)\n    .reduce(lambda key, rows: (key, sum(r.value for r in rows)))\n)\n```\n\nSwitching between modes is achieved by changing the connector configuration rather than the transformation logic.\n\n## The Rust Engine\n\nPathway's Rust engine provides 
the computational foundation, built on Timely Dataflow and Differential Dataflow.\n\n### Key Components\n\n| Component | Purpose | Location |\n|-----------|---------|----------|\n| `Scope` | Execution context | Rust Engine |\n| `TableHandle` | Internal table reference | Rust Engine |\n| `Column` | Column data structure | Rust Engine |\n| `Universe` | Key set management | Rust Engine |\n| `TraceAgent` | Arrangement tracking | Differential Dataflow |\n\n### Python-Rust Integration\n\nThe integration uses PyO3 to create Python bindings for Rust structures:\n\n```rust\n#[pyclass(module = \"pathway.engine\", frozen)]\npub struct Table {\n    scope: Py<Scope>,\n    handle: TableHandle,\n}\n```\n\nThis allows Python code to work with Rust objects seamlessly while maintaining Python's ease of use.\n\n### Persistence\n\nThe Rust engine supports checkpointing and state persistence:\n\n```python\npw.run(\n    persistence_config=pw.persistence.Config(\n        mode=pw.persistence.Mode.SNAPSHOT,\n        snapshot_path=\"./checkpoints\"\n    )\n)\n```\n\n## User-Defined Functions\n\nPathway allows custom Python functions to be used in transformations, with careful handling of serialization and execution.\n\n### Function Types\n\n| Type | Execution | Use Case |\n|------|-----------|----------|\n| Pure Python | Serialized to Rust | Simple transformations |\n| Stateful | Managed by Pathway | Complex aggregations |\n| External | Called externally | LLM integrations |\n\n### Execution Safety\n\nFunctions executed on streaming data must be deterministic and side-effect free to ensure correct incremental computation. 
Pathway validates function behavior during development and enforces constraints at runtime.\n\n## Monitoring and Debugging\n\nPathway provides built-in monitoring capabilities through its dashboard.\n\n### Available Metrics\n\n- Message counts per connector\n- Processing latency\n- System resource utilization\n- Error rates and logs\n\n### Debugging Features\n\nThe `Trace` class captures execution stack traces for debugging:\n\n```rust\n#[pyclass(module = \"pathway.engine\", frozen)]\npub struct Trace {\n    file_name: String,\n    line_number: usize,\n    line: String,\n    function: String,\n}\n```\n\n## Summary\n\nPathway's core concepts form a cohesive system for building data processing pipelines:\n\n| Concept | Role |\n|---------|------|\n| **Tables** | Primary data structure for collections of typed rows |\n| **Schemas** | Define table structure, types, and primary keys |\n| **Connectors** | Bridge external data sources with Pathway tables |\n| **Lazy Evaluation** | Build computation graphs without immediate execution |\n| **Incremental Computation** | Efficient updates through Differential Dataflow |\n| **Execution Modes** | Unified API for batch and streaming processing |\n| **Rust Engine** | High-performance computation layer |\n\nThese concepts work together to provide a framework that is both developer-friendly and production-ready, capable of handling the full spectrum from simple batch ETL to complex real-time streaming analytics.\n\n---\n\n<a id='page-architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[Python API Structure](#page-python-api), [Pathway Introduction](#page-introduction)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [src/engine/mod.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/mod.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- 
[src/connectors/postgres.rs](https://github.com/pathwaycom/pathway/blob/main/src/connectors/postgres.rs)\n- [external/differential-dataflow/src/lib.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/lib.rs)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n- [external/differential-dataflow/src/trace/wrappers/freeze.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/wrappers/freeze.rs)\n- [external/differential-dataflow/src/trace/implementations/ord.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/implementations/ord.rs)\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n</details>\n\n# System Architecture\n\nPathway is a Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG applications. The system provides an easy-to-use Python API while being powered by a scalable Rust engine based on Differential Dataflow for incremental computation. 
资料来源：[README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n## High-Level Architecture Overview\n\nPathway follows a layered architecture that separates the Python user interface from the high-performance Rust computation engine.\n\n```mermaid\ngraph TD\n    subgraph Python_Layer[\"Python API Layer\"]\n        User_Code[\"User Python Code\"]\n        Python_API[\"Pathway Python API<br/>pw.Table, pw.io.*, pw.connectors\"]\n    end\n\n    subgraph Rust_Engine[\"Rust Engine Layer\"]\n        PyO3_Bindings[\"PyO3 Bindings\"]\n        Timely_Runtime[\"Timely Dataflow Runtime\"]\n        Diff_Computations[\"Differential Dataflow<br/>Incremental Computations\"]\n        Trace_Agent[\"Trace Agent & Arrangements\"]\n    end\n\n    subgraph Data_Storage[\"Data Storage Layer\"]\n        In_Memory_State[\"In-Memory State\"]\n        Persistence[\"Persistence Layer\"]\n        Connectors[\"External Connectors<br/>SQL, S3, HTTP, etc.\"]\n    end\n\n    User_Code --> Python_API\n    Python_API --> PyO3_Bindings\n    PyO3_Bindings --> Timely_Runtime\n    Timely_Runtime --> Diff_Computations\n    Diff_Computations --> Trace_Agent\n    Trace_Agent --> In_Memory_State\n    In_Memory_State --> Persistence\n    Diff_Computations --> Connectors\n```\n\n资料来源：[src/python_api.rs:1-50](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs#L1-L50)\n\n## Core Architectural Components\n\n### Python API Layer\n\nThe Python API layer provides the public interface for Pathway users. 
It exposes various I/O connectors, table operations, and transformation functions.\n\n**Key Python Classes Exposed via PyO3:**\n\n| Class | Purpose | Source |\n|-------|---------|--------|\n| `Table` | Primary data container for rows | `src/python_api.rs` |\n| `Universe` | Represents a set of row keys | `src/python_api.rs` |\n| `Column` | Column descriptor with type info | `src/python_api.rs` |\n| `Scope` | Computation scope context | `src/python_api.rs` |\n| `Context` | Runtime context for expressions | `src/python_api.rs` |\n| `Pointer` | Reference to table data | `src/python_api.rs` |\n| `ValueField` | Schema field definition | `src/python_api.rs` |\n| `PythonSubject` | Python-based data source | `src/python_api.rs` |\n\n资料来源：[src/python_api.rs:150-200]()\n\n### Rust Engine Layer\n\nThe Rust engine implements the core computation logic using Timely Dataflow and Differential Dataflow.\n\n#### Timely Dataflow\n\nTimely Dataflow provides the low-level infrastructure for parallel and distributed dataflow computation. Pathway extends the external timely-dataflow crate to support:\n\n- **Multi-worker execution**: Enables parallel processing across CPU cores\n- **Progress tracking**: Monitors dataflow completeness\n- **Dynamic reconfiguration**: Supports changing dataflow graphs\n\n资料来源：[external/timely-dataflow/mdbook/src/chapter_2/chapter_2_5.md]()\n\n#### Differential Dataflow\n\nDifferential Dataflow builds on Timely Dataflow to provide incremental computation capabilities. 
This is the heart of Pathway's ability to efficiently handle streaming data with updates and deletions.\n\n**Key Differential Dataflow Components:**\n\n| Component | Purpose |\n|-----------|---------|\n| `TraceAgent` | Manages persistent trace data for arrangements |\n| `Arranged` | Wrapper combining stream with trace |\n| `TraceReader` | Interface for reading from traces |\n| `Batch` | Immutable collection of updates |\n\n资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:1-30]()\n\n#### Trace Implementation\n\nPathway uses `OrdValBatch` and `OrdKeyBatch` for efficient in-memory storage of keyed data with timestamps.\n\n**OrdValBatch Structure:**\n```\nOrdValBatch<K, V, T, R, O, CK, CV>\n├── K: Key type (Ord + Clone)\n├── V: Value type (Ord + Clone)\n├── T: Timestamp type (Lattice + Ord + Clone)\n├── R: Weight type (Semigroup)\n├── O: Offset type for arrays\n├── CK: Container for keys\n└── CV: Container for values\n```\n\n资料来源：[external/differential-dataflow/src/trace/implementations/ord.rs:1-50]()\n\n### Timestamp System\n\nPathway's timestamp system is crucial for handling streaming data and maintaining temporal ordering.\n\n```mermaid\nclassDiagram\n    class Timestamp {\n        +u64 inner\n        +followed_by(other: Timestamp) Timestamp\n        +is_original() bool\n        +next_retraction_time() Timestamp\n    }\n    \n    class TotalFrontier {\n        <<enumeration>>\n        At(Timestamp)\n        Done\n    }\n    \n    class OriginalOrRetraction {\n        +is_original() bool\n    }\n    \n    class NextRetractionTime {\n        +next_retraction_time() Timestamp\n    }\n    \n    Timestamp ..|> OriginalOrRetraction\n    Timestamp ..|> NextRetractionTime\n```\n\n**Key Timestamp Properties:**\n\n- **Even timestamps** (`is_multiple_of(2)`): Represent original data\n- **Odd timestamps**: Represent retractions\n- **Lattice properties**: Enable merging of partial time information\n\n资料来源：[src/engine/timestamp.rs:1-50]()\n\n## Data Flow 
Architecture\n\n### Input Processing Flow\n\n```mermaid\ngraph LR\n    subgraph Input_Sources\n        Files[\"Files\"]\n        HTTP[\"HTTP/Websocket\"]\n        SQL[\"SQL Databases\"]\n        Kafka[\"Kafka/MQTT\"]\n    end\n\n    subgraph Processing\n        Parser[\"Parser/Deserializer\"]\n        Transformer[\"Transformer\"]\n        Arranger[\"Arranger\"]\n    end\n\n    subgraph Storage\n        Trace[\"Trace Agent\"]\n        Table[\"Table State\"]\n    end\n\n    Files --> Parser\n    HTTP --> Parser\n    SQL --> Parser\n    Kafka --> Parser\n    Parser --> Transformer\n    Transformer --> Arranger\n    Arranger --> Trace\n    Trace --> Table\n```\n\n### Table Operations Flow\n\nTables in Pathway are the primary data abstraction. Each table maintains:\n\n1. **Schema**: Column definitions with types\n2. **Keys**: Unique row identifiers\n3. **Values**: Column data\n4. **Timestamps**: For temporal ordering\n5. **Diffs**: Change weights (+1 for insert, -1 for delete)\n\n**Table Properties:**\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `id` | Key | Unique row identifier |\n| `values` | Map[Column, Value] | Row data |\n| `time` | Timestamp | Event timestamp |\n| `diff` | i64 | Change weight |\n\n资料来源：[src/python_api.rs:200-250]()\n\n## Connectors Architecture\n\nPathway provides connectors for various data sources and sinks. 
The connector system is designed with extensibility in mind.\n\n### Connector Base Infrastructure\n\n**ConnectorProperties Class Hierarchy:**\n\n```mermaid\nclassDiagram\n    class ConnectorProperties {\n        +name: String\n        +storage: DataStorage\n        +format: DataFormat\n    }\n    \n    class ColumnProperties {\n        +name: String\n        +type_: PathwayType\n        +source: FieldSource\n        +default: Option~Value~\n    }\n    \n    class TableProperties {\n        +columns: Vec~ColumnProperties~\n        +primary_key: Option~Vec~String~~\n    }\n    \n    class CsvParserSettings {\n        +delimiter: char\n        +quotechar: char\n        +header: bool\n    }\n    \n    ConnectorProperties --> TableProperties\n    TableProperties --> ColumnProperties\n```\n\n### Database Connectors\n\nThe PostgreSQL connector demonstrates the connector implementation pattern:\n\n**Type Mapping:**\n\n| PostgreSQL Type | Pathway Type | OID |\n|-----------------|--------------|-----|\n| BOOL | BOOLEAN | 16 |\n| INT2 | INT | 21 |\n| INT4 | INT | 23 |\n| INT8 | INT | 20 |\n| FLOAT4 | FLOAT | 700 |\n| FLOAT8 | FLOAT | 701 |\n| TEXT/VARCHAR | STRING | 25 |\n| BYTEA | BINARY | 17 |\n| JSONB | JSON | 3802 |\n| TIMESTAMP | DATETIME | 1114 |\n| UUID | UUID | 2950 |\n\n**Postgres Binary Array Format:**\n- `i32`: ndim (dimensions)\n- `i32`: has_nulls flag\n- `u32`: element OID\n- Per dimension: size, lower_bound\n- Per element: length (-1 for NULL), bytes\n\n资料来源：[src/connectors/postgres.rs:1-80]()\n\n### Other Supported Connectors\n\n| Connector | Settings Class | Description |\n|-----------|---------------|-------------|\n| AWS S3 | `AwsS3Settings` | S3 object storage |\n| Azure Blob | `AzureBlobStorageSettings` | Azure blob storage |\n| Elasticsearch | `ElasticSearchParams` | Search and indexing |\n| MQTT | `MqttSettings` | IoT messaging |\n| Kafka | `KafkaSettings` | Stream messaging |\n| Schema Registry | `PySchemaRegistrySettings` | Avro schema management |\n| 
Iceberg | `IcebergCatalogSettings` | Lakehouse tables |\n| PostgreSQL Replication | `PsqlReplicationSettings` | CDC from Postgres |\n\n资料来源：[src/python_api.rs:250-300]()\n\n## Persistence and State Management\n\n### Persistence Configuration\n\nPathway supports configurable persistence for fault tolerance and state recovery.\n\n```mermaid\ngraph TD\n    subgraph Application_Layer\n        User_Query[\"User Query\"]\n    end\n\n    subgraph Persistence_Layer\n        Config[\"PersistenceConfig\"]\n        Mode[\"PyPersistenceMode<br/>ON_STARTUP<br/>MANUAL<br/>periodic\"]\n        Storage[\"DataStorage\"]\n    end\n\n    subgraph State_Management\n        Snapshot[\"PySnapshotAccess\"]\n        Event[\"PySnapshotEvent\"]\n        Frontier[\"TotalFrontier\"]\n    end\n\n    User_Query --> Config\n    Config --> Mode\n    Config --> Storage\n    Mode --> Snapshot\n    Snapshot --> Event\n    Event --> Frontier\n```\n\n**Persistence Modes:**\n\n| Mode | Behavior |\n|------|----------|\n| `ON_STARTUP` | Automatically restore state on application start |\n| `MANUAL` | Explicit checkpoint management via API |\n| `PERIODIC` | Automatic snapshots at intervals |\n\n**Snapshot Access Methods:**\n\n| Method | Return Type | Description |\n|--------|-------------|-------------|\n| `snapshot_at(frontier)` | `Vec<(Key, Vec<Value>)>` | State at specific frontier |\n| `frontier()` | `TotalFrontier` | Current progress frontier |\n| `failed()` | `bool` | Whether computation failed |\n\n资料来源：[src/python_api.rs:100-150]()\n\n### Trace Freeze Wrapper\n\nThe `TraceFreeze` wrapper provides read-only access to historical trace data:\n\n```rust\npub struct TraceFreeze<Tr, F>\nwhere\n    Tr: TraceReader,\n    Tr::Time: Lattice+Clone+'static,\n    F: Fn(&Tr::Time)->Option<Tr::Time>,\n{\n    trace: Tr,\n    func: Rc<F>,\n}\n```\n\nThis enables efficient point-in-time queries without blocking ongoing updates.\n\n资料来源：[external/differential-dataflow/src/trace/wrappers/freeze.rs:1-30]()\n\n## 
Expression and Computation System\n\n### Python Expression Evaluation\n\nPathway allows Python functions to be used for data transformations through the `PyExpression` system.\n\n**Expression System Components:**\n\n| Component | Purpose |\n|-----------|---------|\n| `PyExpression` | Serialized Python expression |\n| `PyExpressionData` | Runtime data for evaluation |\n| `PyReducer` | Aggregation functions |\n| `PyUnaryOperator` | Single-input operators |\n| `PyBinaryOperator` | Two-input operators |\n\n**Computation Execution Model:**\n\n```mermaid\ngraph LR\n    subgraph Definition\n        Expr[\"Expression Definition\"]\n        Schema[\"Schema Definition\"]\n    end\n\n    subgraph Compilation\n        Parse[\"Parse & Validate\"]\n        Optimize[\"Optimize\"]\n        Generate[\"Generate Operators\"]\n    end\n\n    subgraph Execution\n        Compute[\"Compute\"]\n        Index[\"Index Results\"]\n        Output[\"Output to Tables\"]\n    end\n\n    Expr --> Parse\n    Schema --> Parse\n    Parse --> Optimize\n    Optimize --> Generate\n    Generate --> Compute\n    Compute --> Index\n    Index --> Output\n```\n\n资料来源：[src/python_api.rs:300-350]()\n\n### Computer and Scope System\n\nThe `Computer` and `Scope` classes manage computation execution:\n\n| Class | Responsibility |\n|-------|----------------|\n| `Scope` | Defines a computation boundary |\n| `Context` | Provides runtime evaluation context |\n| `Computer` | Executes transformation logic |\n| `Table` | Stores results |\n\n```mermaid\nsequenceDiagram\n    participant User as User Code\n    participant Scope as Scope\n    participant Context as Context\n    participant Computer as Computer\n    participant Table as Table\n    \n    User->>Scope: Create scope\n    User->>Context: Define operations\n    Context->>Computer: Create computer\n    Computer->>Table: Write results\n    Table-->>User: Return handle\n```\n\n## Monitoring and Telemetry\n\nPathway includes comprehensive monitoring capabilities 
through the `TelemetryConfig` and `PyMonitoringLevel` classes.\n\n**Monitoring Levels:**\n\n| Level | Description |\n|-------|-------------|\n| `OFF` | No telemetry |\n| `ERROR` | Only errors |\n| `WARN` | Warnings and errors |\n| `INFO` | Informational logs |\n| `DEBUG` | Detailed debugging |\n\n**Trace and Done Classes:**\n\nThe `Trace` class provides structured logging and tracing:\n\n```python\nclass Trace:\n    \"\"\"Structured logging and tracing\"\"\"\n    def log(level, message, context)\n    def span(name, fn)\n```\n\nThe `Done` class signals completion of operations and is used for synchronization.\n\n资料来源：[src/python_api.rs:350-400]()\n\n## External Integrations\n\nPathway integrates with various external systems and frameworks:\n\n| Integration | Description |\n|-------------|-------------|\n| LangChain | LLM pipeline integration |\n| LlamaIndex | RAG framework integration |\n| MinIO | S3-compatible object storage |\n| PaddleOCR | OCR capabilities |\n| Databento | Market data feeds |\n\nThese integrations are typically implemented as Pathway connectors or external index providers.\n\n资料来源：[README.md]()\n\n## Key Design Patterns\n\n### Incremental Computation Pattern\n\nPathway's core innovation is incremental computation:\n\n1. **Input Change**: A modification arrives at time T\n2. **Propagate**: The change flows through the computation graph\n3. **Differential Update**: Only affected results are updated\n4. 
**Efficient Output**: Minimal recomputation\n\n```mermaid\ngraph LR\n    A[\"Input Δ\"] --> B[\"Compute Δ\"]\n    B --> C[\"Update Trace\"]\n    C --> D[\"Emit Δ\"]\n    D --> A\n    \n    style A fill:#f96\n    style D fill:#96f\n```\n\n### Arrangement Pattern\n\nArrangements are a fundamental concept in Differential Dataflow:\n\n- An **arrangement** = a stream + its indexed state\n- Enables efficient joins and aggregations\n- Maintains multiple versions for time-travel queries\n\n```mermaid\ngraph TD\n    subgraph Arrangement\n        Stream[\"Stream Input\"]\n        Index[\"Trace Index\"]\n        Merge[\"Merge Operator\"]\n    end\n    \n    Stream --> Merge\n    Index --> Merge\n    Merge --> Output[\"Arranged Data\"]\n```\n\n## Summary\n\nPathway's architecture is designed around three key principles:\n\n1. **Python-First API**: Users write Python, but computation happens in Rust\n2. **Incremental Everywhere**: All operations support efficient updates\n3. **Unified Batch/Stream**: The same code works for both paradigms\n\nThe system successfully abstracts the complexity of parallel and distributed computation behind a simple Python interface, making real-time analytics and LLM pipelines accessible to Python developers.\n\n---\n\n<a id='page-python-api'></a>\n\n## Python API Structure\n\n### 相关页面\n\n相关主题：[System Architecture](#page-architecture), [Data Transformations](#page-transformations)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- [examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md)\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md)\n- 
[examples/projects/web-scraping/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/web-scraping/README.md)\n</details>\n\n# Python API Structure\n\n## Overview\n\nPathway is a Python ETL framework for stream processing and real-time analytics, powered by a **scalable Rust engine** based on Differential Dataflow. The Python API Structure defines the interface layer that bridges Python user code with the high-performance Rust computation engine through PyO3 bindings. This architecture enables Python developers to write data pipelines while leveraging Rust's performance characteristics for incremental computation.\n\nThe framework allows seamless integration with Python ML libraries, supports both batch and streaming data, and can be used across development, CI/CD, and production environments with identical code. 资料来源：[README.md:1-15]()\n\n## Architecture Overview\n\n```mermaid\ngraph TD\n    A[Python User Code] --> B[Python API Layer]\n    B --> C[PyO3 Bindings]\n    C --> D[Rust Engine]\n    D --> E[Differential Dataflow]\n    E --> F[Incremental Computation]\n    \n    G[PythonSubject] --> B\n    H[ValueField] --> B\n    I[PyExpression] --> B\n    J[PyExportedTable] --> B\n```\n\n## Core Components\n\n### PythonSubject\n\nThe `PythonSubject` class serves as the primary interface for connecting external data sources to the Pathway engine. 
It wraps Python callback functions that the Rust engine invokes during data processing.\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `start` | `Py<PyAny>` | Callback invoked when a new session starts |\n| `read` | `Py<PyAny>` | Callback for reading data from the source |\n| `seek` | `Py<PyAny>` | Callback for seeking within the data stream |\n| `on_persisted_run` | `Py<PyAny>` | Callback triggered on persisted runs |\n| `end` | `Py<PyAny>` | Callback invoked when the source ends |\n| `is_internal` | `bool` | Flag indicating internal source usage |\n| `deletions_enabled` | `bool` | Flag enabling deletion tracking |\n\n资料来源：[src/python_api.rs:1-35]()\n\n```rust\n#[pymethods]\nimpl PythonSubject {\n    #[new]\n    #[pyo3(signature = (start, read, seek, on_persisted_run, end, is_internal, deletions_enabled))]\n    fn new(\n        start: Py<PyAny>,\n        read: Py<PyAny>,\n        seek: Py<PyAny>,\n        on_persisted_run: Py<PyAny>,\n        end: Py<PyAny>,\n        is_internal: bool,\n        deletions_enabled: bool,\n    ) -> Self {\n        Self { ... 
}\n    }\n}\n```\n\n### ValueField\n\n`ValueField` defines schema fields within Pathway tables, specifying the structure of data including type, source, default values, and metadata.\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `name` | `String` | Field identifier |\n| `type_` | `Type` | Data type specification |\n| `source` | `FieldSource` | Origin of the field data |\n| `default` | `Option<Value>` | Default value if none provided |\n| `metadata` | `Option<String>` | Additional metadata as JSON string |\n\n资料来源：[src/python_api.rs:40-55]()\n\n### FieldSource Enum\n\nThe `FieldSource` enum specifies where field data originates:\n\n| Value | Description |\n|-------|-------------|\n| `Payload` | Data from the input payload (default) |\n| `NewId` | System-generated identifier |\n| `USB` | User-specified binding |\n\n资料来源：[src/python_api.rs:200-210]()\n\n## Engine Integration Layer\n\n### TotalFrontier\n\n`TotalFrontier` represents the completion state of data processing across all workers. 
It uses an enum with two states:\n\n```mermaid\nstateDiagram-v2\n    [*] --> At\n    At --> Done: All workers complete\n    Done --> [*]\n    \n    TotalFrontier: At(timestamp) - Processing pending\n    TotalFrontier: Done - All complete\n```\n\n| State | Type | Meaning |\n|-------|------|---------|\n| `At(i)` | `Timestamp` | Frontier positioned at timestamp `i` |\n| `Done` | singleton | All processing completed |\n\n资料来源：[src/python_api.rs:90-130]()\n\n### PyExportedTable\n\n`PyExportedTable` exposes table data to Python consumers through three key methods:\n\n| Method | Return Type | Description |\n|--------|-------------|-------------|\n| `frontier()` | `TotalFrontier<Timestamp>` | Current frontier position |\n| `snapshot_at(frontier)` | `Vec<(Key, Vec<Value>)>` | Point-in-time table snapshot |\n| `failed()` | `bool` | Indicates if table processing failed |\n\n资料来源：[src/python_api.rs:140-165]()\n\n```rust\n#[pymethods]\nimpl PyExportedTable {\n    fn frontier(&self) -> TotalFrontier<Timestamp> {\n        self.inner.frontier()\n    }\n\n    fn snapshot_at(&self, frontier: TotalFrontier<Timestamp>) -> Vec<(Key, Vec<Value>)> {\n        self.inner.snapshot_at(frontier)\n    }\n\n    fn failed(&self) -> bool {\n        self.inner.failed()\n    }\n}\n```\n\n## Mode and Type Enums\n\n### ConnectorMode\n\nDefines data processing behavior for connectors:\n\n| Constant | Value | Use Case |\n|----------|-------|----------|\n| `STATIC` | `ConnectorMode::Static` | Batch processing, fixed dataset |\n| `STREAMING` | `ConnectorMode::Streaming` | Continuous data streams |\n\n资料来源：[src/python_api.rs:220-240]()\n\n### SessionType\n\nSpecifies how the engine handles sessionization:\n\n| Constant | Value | Description |\n|----------|-------|-------------|\n| `NATIVE` | `SessionType::Native` | Native differential dataflow processing |\n| `UPSERT` | `SessionType::Upsert` | Upsert-based session handling |\n\n资料来源：[src/python_api.rs:245-260]()\n\n## Expression System\n\n### 
PyExpression\n\nThe `PyExpression` class wraps Rust `Expression` objects and tracks whether Python GIL (Global Interpreter Lock) is required for evaluation:\n\n| Property | Type | Purpose |\n|----------|------|---------|\n| `inner` | `Arc<Expression>` | Rust expression AST |\n| `gil` | `bool` | Whether GIL acquisition needed |\n\n资料来源：[src/python_api.rs:280-310]()\n\n```rust\npub struct PyExpression {\n    inner: Arc<Expression>,\n    gil: bool,\n}\n\nimpl PyExpression {\n    fn new(inner: Arc<Expression>, gil: bool) -> Self {\n        Self { inner, gil }\n    }\n}\n```\n\n### Binary Operators\n\nBinary operations are implemented using macros that automatically handle GIL tracking:\n\n```rust\nmacro_rules! binary_op {\n    ($expression:path, $lhs:expr, $rhs:expr $(, $arg:expr)*) => {\n        Self::new(\n            Arc::new(Expression::from($expression(\n                $lhs.inner.clone(),\n                $rhs.inner.clone(),\n                $($arg,)*\n            ))),\n            $lhs.gil || $rhs.gil,\n        )\n    };\n}\n```\n\nThis pattern ensures that if either operand requires Python GIL, the entire expression is marked as requiring GIL during evaluation.\n\n## Timestamp System\n\nPathway uses a specialized timestamp system that supports incremental computation through Differential Dataflow:\n\n```mermaid\ngraph LR\n    A[Input] --> B[Timestamp Assignment]\n    B --> C[Frontier Computation]\n    C --> D[Incremental Update]\n    D --> E[Result]\n    \n    F[Retraction] -.->|Original| D\n    F[Retraction] -.->|Retraction| D\n```\n\n### OriginalOrRetraction Trait\n\nTimestamps determine whether a change is an original value or a retraction:\n\n```rust\nimpl OriginalOrRetraction for Timestamp {\n    fn is_original(&self) -> bool {\n        self.0.is_multiple_of(2)\n    }\n}\n```\n\n### NextRetractionTime Trait\n\nComputes the next retraction timestamp:\n\n```rust\nimpl NextRetractionTime for Timestamp {\n    fn next_retraction_time(&self) -> Self {\n        
Self(self.0 + 1)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:1-50]()\n\n## Wakeup Handler\n\nThe `WakeupHandler` manages file descriptor-based wakeups for coordinating Rust async operations with Python's event loop:\n\n```rust\nstruct WakeupHandler<'py> {\n    _fd: OwnedFd,\n    set_wakeup_fd: Bound<'py, PyAny>,\n    old_wakeup_fd: Bound<'py, PyAny>,\n}\n```\n\nThis enables Pathway to integrate with Python's async ecosystem while maintaining efficient I/O handling in Rust.\n\n## Trace and Error Handling\n\n### EngineTrace\n\n`EngineTrace` captures source location information for debugging and error reporting:\n\n| Field | Type | Description |\n|-------|------|-------------|\n| `file_name` | `String` | Source file path |\n| `line_number` | `u32` | Line number in source |\n| `line` | `String` | Source line content |\n| `function` | `String` | Function/method name |\n\n资料来源：[src/python_api.rs:180-200]()\n\nThe trace system integrates with Python's exception handling, converting Rust trace data into Python traceback objects:\n\n```rust\nimpl<'py> IntoPyObject<'py> for EngineTrace {\n    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {\n        match self {\n            Self::Empty => Ok(py.None().into_bound(py)),\n            Self::Frame { ... } => Trace { ... 
}.into_bound_py_any(py),\n        }\n    }\n}\n```\n\n## Data Flow Example\n\n```mermaid\ngraph TD\n    A[Data Source] -->|PythonSubject| B[Rust Engine]\n    B -->|Processing| C[Timestamp Assignment]\n    C -->|Frontier Update| D[TotalFrontier]\n    D -->|Snapshot| E[PyExportedTable]\n    E -->|Query| F[Python Application]\n    \n    G[ConnectorMode] -->|Configuration| B\n    H[SessionType] -->|Configuration| B\n```\n\n## Usage Patterns\n\n### Creating a Data Pipeline\n\nPathway pipelines are defined in Python and executed by the Rust engine:\n\n```python\nimport pathway as pw\n\n# Define schema\nclass InputSchema(pw.Schema):\n    id: int\n    data: str\n\n# Create connector\nconnector = pw.io.http.PathwayWebserver(host=\"0.0.0.0\", port=8011)\n\n# Run pipeline\npw.run()\n```\n\n资料来源：[examples/projects/question-answering-rag/README.md:30-45]()\n\n### Web Scraping Pipeline\n\nReal-time web scraping demonstrates the streaming capabilities:\n\n```python\n# Connector implementation\nclass NewsScraperSubject(pw.ConnectorSubject):\n    def __init__(self, url, interval=60):\n        # Configure callbacks\n        self.url = url\n        self.interval = interval\n```\n\n资料来源：[examples/projects/web-scraping/README.md:50-65]()\n\n### Multi-Agent RAG Pipeline\n\nThe integration with agent frameworks shows advanced API usage:\n\n```mermaid\ngraph LR\n    A[Documents] -->|Index| B[Pathway VectorStoreServer]\n    B -->|Retrieve| C[AG2 Agents]\n    C -->|Query| D[User]\n    \n    E[REST API /v1/retrieve] --> B\n```\n\n资料来源：[examples/projects/ag2-multiagent-rag/README.md:20-40]()\n\n## Summary\n\nThe Python API Structure in Pathway consists of several key layers:\n\n1. **Python Interface Layer**: PyO3-wrapped classes (`PythonSubject`, `ValueField`, `PyExportedTable`, `PyExpression`) provide idiomatic Python APIs\n2. **Type System**: Enums for `ConnectorMode`, `SessionType`, `FieldSource` configure engine behavior\n3. 
**State Management**: `TotalFrontier` tracks processing progress across distributed workers\n4. **Expression Evaluation**: The `PyExpression` system handles both pure Rust and Python-requiring computations\n5. **Error Handling**: `EngineTrace` and `WakeupHandler` provide debugging and async coordination capabilities\n\nThis architecture enables Python developers to leverage high-performance Rust computation while maintaining the productivity of Python development.\n\n---\n\n<a id='page-data-model'></a>\n\n## Data Model and Type System\n\n### 相关页面\n\n相关主题：[Core Concepts](#page-concepts), [Data Transformations](#page-transformations)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- [src/connectors/postgres.rs](https://github.com/pathwaycom/pathway/blob/main/src/connectors/postgres.rs)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n- [external/differential-dataflow/src/trace/layers/ordered.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/layers/ordered.rs)\n</details>\n\n# Data Model and Type System\n\nPathway's Data Model and Type System form the foundational architecture that enables the framework to handle structured data processing with type safety, schema validation, and efficient data representation. 
Built on a Rust-powered engine, Pathway provides a Python-friendly API while maintaining the performance benefits of differential dataflow computations.\n\n## Architecture Overview\n\nPathway's type system operates at multiple levels: the Python API layer where users define schemas, and the Rust engine layer where types are resolved and processed efficiently.\n\n```mermaid\ngraph TD\n    A[Python API Layer<br/>dtype.py, expression.py] --> B[Type Interpreter<br/>type_interpreter.py]\n    B --> C[Rust Engine<br/>python_api.rs]\n    C --> D[Differential Dataflow<br/>Timely Backend]\n    C --> E[Connectors<br/>postgres.rs, etc.]\n    \n    F[ValueField] --> C\n    G[FieldSource] --> C\n    H[ConnectorMode] --> C\n    I[Timestamp] --> D\n```\n\n## Core Type Components\n\n### Type Enumeration\n\nThe Pathway type system is built around a `Type` enumeration that defines all supported data types. These types map to both Python native types and Rust storage representations.\n\n| Type Category | Supported Types | Description |\n|---------------|-----------------|-------------|\n| Boolean | `BOOL` | True/False values |\n| Integer | `INT2`, `INT4`, `INT8` | 16, 32, and 64-bit signed integers |\n| Float | `FLOAT4`, `FLOAT8` | 32 and 64-bit floating point |\n| String | `TEXT`, `VARCHAR` | Variable-length text |\n| Binary | `BYTEA` | Binary/blob data |\n| JSON | `JSON`, `JSONB` | JSON data with/without binary optimization |\n| Temporal | `TIMESTAMP`, `TIMESTAMPTZ`, `INTERVAL` | Date, time, and duration values |\n| UUID | `UUID` | Universally unique identifiers |\n\n资料来源：[src/connectors/postgres.rs:1-30]()\n\n### ValueField Structure\n\n`ValueField` is the core schema definition unit that encapsulates field metadata:\n\n```python\n@dataclass\nclass ValueField:\n    name: str           # Field identifier\n    type_: Type         # Data type enumeration\n    source: FieldSource # Origin of the field data\n    default: Optional[Value]  # Default value if null\n    metadata: 
Optional[str]    # Additional metadata as JSON string\n```\n\n资料来源：[src/python_api.rs:30-47]()\n\n## Field Source Classification\n\n`FieldSource` defines where data originates within the Pathway computation model:\n\n| Source | Value | Description |\n|--------|-------|-------------|\n| `Payload` | Default | Data from the original input connector |\n| `Key` | - | Primary key field extracted/generated |\n| `Computed` | - | Derived from transformations |\n| `Pointer` | - | Reference to another table entry |\n\n资料来源：[src/python_api.rs:20-28]()\n\nThe `FieldSource` enum enables Pathway to track data lineage and enforce immutability constraints:\n\n```python\nconst PAYLOAD: FieldSource = FieldSource::Payload;\n```\n\n资料来源：[src/python_api.rs:450-451]()\n\n## Connector Modes\n\nPathway supports two distinct data processing modes:\n\n| Mode | Behavior | Use Case |\n|------|----------|----------|\n| `STATIC` | Process data once, complete when input exhausted | Batch processing, static datasets |\n| `STREAMING` | Continuous processing, incremental updates | Real-time data streams, live monitoring |\n\n资料来源：[src/python_api.rs:460-468]()\n\n```python\n#[pymethods]\nimpl PyConnectorMode {\n    #[classattr]\n    pub const STATIC: ConnectorMode = ConnectorMode::Static;\n    #[classattr]\n    pub const STREAMING: ConnectorMode = ConnectorMode::Streaming;\n}\n```\n\n## Session Types\n\nSession type configuration affects how Pathway handles data transactions:\n\n| Type | Description |\n|------|-------------|\n| `NATIVE` | Standard Pathway processing |\n| `UPSERT` | Specialized handling for upsert operations |\n\n资料来源：[src/python_api.rs:470-477]()\n\n## Timestamp System\n\nPathway's timestamp system implements incremental computation semantics using even-odd parity:\n\n```python\nimpl OriginalOrRetraction for Timestamp {\n    fn is_original(&self) -> bool {\n        self.0.is_multiple_of(2)\n    }\n}\n\nimpl NextRetractionTime for Timestamp {\n    fn next_retraction_time(&self) -> 
Self {\n        Self(self.0 + 1)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:25-36]()\n\n### Timestamp Parity Convention\n\n- **Even timestamps**: Original data values\n- **Odd timestamps**: Retractions (deletions/updates of previous values)\n\nThis design enables efficient incremental updates in the differential dataflow computation model.\n\n## Data Arrangement in Differential Dataflow\n\nPathway leverages differential dataflow for efficient state management. The arrangement layer provides key-value storage with ordered indexing:\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[OrderedLayer<br/>Keys + Values]\n    B --> C[Arranged Trace]\n    C --> D[Query Operations]\n    \n    E[Offset Index] --> B\n    F[Child Cursor] --> B\n```\n\n资料来源：[external/differential-dataflow/src/trace/layers/ordered.rs:1-20]()\n\n### Cursor Navigation\n\nThe cursor-based traversal in ordered layers supports efficient seeking and stepping:\n\n```rust\nfn step(&mut self, storage: &OrderedLayer<K, L, O, C>) {\n    // Pathway extension: pos_nonnegative tracks cursor repositioning\n    if self.pos_nonnegative { \n        self.pos += 1; \n    } else {\n        self.pos_nonnegative = true;\n    }\n}\n```\n\n资料来源：[external/differential-dataflow/src/trace/layers/ordered.rs:15-23]()\n\n## Table Representation\n\nPathway represents data using the `LegacyTable` structure:\n\n| Component | Type | Purpose |\n|-----------|------|---------|\n| `universe` | `Universe` | Unique identifier namespace |\n| `columns` | `Vec<Column>` | Collection of typed columns |\n\n资料来源：[src/python_api.rs:100-110]()\n\n### Universe System\n\nThe universe concept provides isolation and partitioning for tables:\n\n```python\npub fn __repr__(&self, py: Python) -> String {\n    format!(\n        \"<LegacyTable universe={:?} columns=[{}]>\",\n        self.universe.borrow(py).handle,\n        self.columns.iter().format_with(\", \", |column, f| {\n            f(&format_args!(\"{:?}\", column.borrow(py).handle))\n        
})\n    )\n}\n```\n\n资料来源：[src/python_api.rs:112-120]()\n\n## Computer and Computation Model\n\nThe `Computer` class wraps Python functions for execution within the Rust engine:\n\n```python\npub struct Computer {\n    fun: Py<PyAny>,           # Python callable\n    dtype: Py<PyAny>,         # Return type annotation\n    is_output: bool,          # Marks output tables\n    is_method: bool,          # Instance vs class method\n    universe: Py<Universe>,   # Computation scope\n    data: Value,              # Closure-captured data\n    data_column: Option<Py<Column>>,  # Optional input column\n}\n```\n\n资料来源：[src/python_api.rs:145-154]()\n\n### Computation Flow\n\n```mermaid\ngraph TD\n    A[Python Function Definition] --> B[Computer Wrapper]\n    B --> C[Rust Engine Execution]\n    C --> D[ScopedContext Injection]\n    D --> E[Result Value]\n    \n    F[Timestamp Parity Check] -.-> C\n```\n\n## Key Generation Policies\n\nPathway supports flexible key generation strategies through `KeyGenerationPolicy`:\n\n| Policy | Behavior |\n|--------|----------|\n| `Auto` | System-generated unique keys |\n| `Manual` | User-provided key values |\n| `Composite` | Keys derived from multiple fields |\n\n资料来源：[src/python_api.rs:520-530]()\n\n## Monitoring and Debugging\n\nThe `EngineTrace` system captures execution context for debugging:\n\n```python\npub enum EngineTrace {\n    Empty,                    # No trace information\n    Frame {                   # Source location\n        file_name: String,\n        line: String,\n        line_number: usize,\n        function: String,\n    }\n}\n```\n\n资料来源：[src/python_api.rs:380-395]()\n\n## Python-Rust Type Conversions\n\nPathway provides bidirectional type conversions between Python and Rust:\n\n### IntoPyObject Implementation\n\n```python\nimpl<'py> IntoPyObject<'py> for Timestamp {\n    type Target = PyAny;\n    type Output = Bound<'py, Self::Target>;\n    type Error = PyErr;\n    \n    fn into_pyobject(self, py: Python<'py>) -> 
Result<Self::Output, Self::Error> {\n        self.0.into_bound_py_any(py)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:8-18]()\n\n### FromPyObject Implementation\n\n```python\nimpl<'py> FromPyObject<'py> for Timestamp {\n    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {\n        ob.extract().map(Self)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:20-24]()\n\n## PostgreSQL Type Mapping\n\nPathway's PostgreSQL connector provides explicit OID mappings for binary protocol:\n\n| PostgreSQL OID | Pathway Type |\n|----------------|--------------|\n| 16 | BOOL |\n| 21 | INT2 |\n| 23 | INT4 |\n| 20 | INT8 |\n| 700 | FLOAT4 |\n| 701 | FLOAT8 |\n| 25 | TEXT/VARCHAR |\n| 17 | BYTEA |\n| 3802 | JSONB |\n| 114 | JSON |\n| 1114 | TIMESTAMP |\n| 1184 | TIMESTAMPTZ |\n| 1186 | INTERVAL |\n| 2950 | UUID |\n\n资料来源：[src/connectors/postgres.rs:10-25]()\n\n## Summary\n\nPathway's Data Model and Type System provides:\n\n1. **Type Safety**: Strong typing from Python definitions through to Rust execution\n2. **Incremental Computation**: Timestamp-based parity for efficient updates\n3. **Flexible Schema**: ValueField with configurable sources and defaults\n4. **Multiple Processing Modes**: Static and streaming connector support\n5. **Differential Execution**: Powered by timely and differential dataflow\n6. 
**Connector Integration**: Native type mappings for external systems like PostgreSQL\n\nThis architecture enables users to write expressive Python pipelines while benefiting from high-performance Rust-based incremental computation.\n\n---\n\n<a id='page-connectors'></a>\n\n## Data Connectors\n\n### 相关页面\n\n相关主题：[Custom Connectors](#page-connectors-custom)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/io/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/__init__.py)\n- [python/pathway/io/csv/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/csv/__init__.py)\n- [python/pathway/io/kafka/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/kafka/__init__.py)\n- [python/pathway/io/postgres/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/postgres/__init__.py)\n- [python/pathway/io/s3/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/s3/__init__.py)\n- [python/pathway/io/http/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/http/__init__.py)\n- [src/connectors/mod.rs](https://github.com/pathwaycom/pathway/blob/main/src/connectors/mod.rs)\n- [docs/2.developers/4.user-guide/20.connect/35.pathway-connectors.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/20.connect/35.pathway-connectors.md)\n</details>\n\n# Data Connectors\n\n## Overview\n\nData Connectors in Pathway are the primary interface for ingesting data from external sources and persisting computed results to external destinations. Pathway provides a unified abstraction that handles both batch and streaming data, allowing developers to write Python code that works seamlessly across different execution modes. 
The connector system is built on top of Pathway's Rust engine, which provides incremental computation capabilities through Differential Dataflow.\n\nPathway connectors abstract the complexity of various data sources—including filesystems, databases, message queues, cloud storage, and HTTP endpoints—behind a consistent Python API. This design enables developers to connect to multiple data sources using the same patterns, whether working in local development, CI/CD tests, or production streaming environments.\n\n资料来源：[examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n\n## Architecture\n\n### Connector System Components\n\nThe Pathway connector system consists of three primary components that work together to provide reliable data movement:\n\n**Subjects** are the data source abstraction in Pathway. A `PythonSubject` wraps user-defined Python functions that handle the mechanics of reading from an external source—connecting, seeking, reading bytes, and managing persistence state. The subject tracks whether the source is internal or external and manages deletion notifications when upstream data is removed.\n\n**Connectors** orchestrate the data flow between subjects and the Pathway engine. They manage the lifecycle of data ingestion, including initialization, incremental updates, and graceful shutdown. Connectors expose configuration options for streaming versus static modes, enabling the same code to work in batch and real-time scenarios.\n\n**Sinks** handle output operations, persisting Pathway tables to external destinations. 
Sinks receive computed results from the engine and write them to files, databases, or streaming systems with exactly-once semantics where supported.\n\n```mermaid\ngraph TD\n    A[External Data Source] --> B[PythonSubject]\n    B --> C[Connector]\n    C --> D[Pathway Engine<br/>Differential Dataflow]\n    D --> E[Sink]\n    E --> F[External Data Destination]\n    \n    G[Schema Definition] --> C\n    H[Connector Mode<br/>Static/Streaming] --> C\n```\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n### Connector Mode\n\nPathway supports two distinct connector modes that determine how data is processed:\n\n| Mode | Description | Use Case |\n|------|-------------|----------|\n| `STATIC` | Process all available data at once, then complete | Batch jobs, one-time imports |\n| `STREAMING` | Continuously monitor for changes and updates | Real-time pipelines, live data |\n\n```python\n# Static mode - processes existing data and exits\npw.io.fs.read(path=\"./data/\", mode=pw ConnectorMode.STATIC)\n\n# Streaming mode - continuously monitors for changes\npw.io.fs.read(path=\"./data/\", mode=pw.ConnectorMode.STREAMING)\n```\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## Input Connectors\n\n### CSV Reader\n\nThe CSV connector reads comma-separated values from files or directories. It supports schema inference, type casting, and automatic handling of headers. 
The connector can operate in both static and streaming modes, re-reading files when they are modified or new files appear in the watched directory.\n\n```python\nimport pathway as pw\n\n# Basic CSV reading\ntable = pw.io.csv.read(\n    path=\"./input_data/\",\n    schema=pw.Schema(\n        id=pw.ColumnDefinition(type=pw.Type.INT),\n        name=pw.ColumnDefinition(type=pw.Type.STR),\n        value=pw.ColumnDefinition(type=pw.Type.FLOAT)\n    ),\n    mode=pw.ConnectorMode.STATIC\n)\n```\n\nConfiguration parameters for CSV reading include the file path, schema definition, whether to use streaming mode, and options for handling malformed rows. The connector tracks file modifications to enable incremental processing in streaming scenarios.\n\n资料来源：[examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n\n### Kafka Connector\n\nThe Kafka connector integrates with Apache Kafka for high-throughput message streaming. It supports both consumer and producer patterns, enabling Pathway pipelines to consume from Kafka topics and write results back to Kafka. The connector handles offset management, partition awareness, and graceful rebalancing when consumer groups change.\n\n```python\nimport pathway as pw\n\n# Kafka consumer\nevents = pw.io.kafka.read(\n    brokers=[\"localhost:9092\"],\n    topic=\"input-events\",\n    schema=pw.Schema(\n        event_id=pw.ColumnDefinition(type=pw.Type.STR),\n        timestamp=pw.ColumnDefinition(type=pw.Type.DATETIME),\n        payload=pw.ColumnDefinition(type=pw.Type.STR)\n    ),\n    format=\"json\",\n    kafka_settings=pw.io.kafka.KafkaReadSettings(\n        consumer_group=\"pathway-consumer\",\n        offset=pw.io.kafka.Offset.latest()\n    )\n)\n```\n\nThe connector supports JSON and Avro message formats, with automatic schema registration for Avro registries. 
It provides exactly-once processing semantics through coordinated offsets with Pathway's checkpointing system.\n\n资料来源：[examples/projects/option-greeks/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/option-greeks/README.md)\n\n### PostgreSQL Connector\n\nPathway provides bidirectional PostgreSQL integration for reading from and writing to PostgreSQL tables. The connector supports both full table scans and change data capture (CDC) when combined with logical replication, enabling real-time streaming from database changes.\n\n```python\nimport pathway as pw\n\n# PostgreSQL read with CDC\ntable = pw.io.postgres.read(\n    host=\"localhost\",\n    port=5432,\n    database=\"mydb\",\n    table_name=\"orders\",\n    user=\"pathway_user\",\n    password=pw.io.postgres.PasswordSecret(\"PG_PASSWORD\"),\n    schema=pw.Schema(\n        order_id=pw.ColumnDefinition(type=pw.Type.INT, primary_key=True),\n        customer_id=pw.ColumnDefinition(type=pw.Type.INT),\n        total=pw.ColumnDefinition(type=pw.Type.FLOAT),\n        created_at=pw.ColumnDefinition(type=pw.Type.DATETIME)\n    )\n)\n```\n\nFor write operations, the PostgreSQL sink supports batch inserts and upserts, using primary key columns to determine insert versus update behavior. Connection pooling ensures efficient resource utilization under high throughput.\n\n### S3 Connector\n\nThe S3 connector provides access to Amazon S3 and S3-compatible object stores (including MinIO). 
It supports reading objects as binary data, CSV, or JSON, and can watch S3 prefixes for new objects in streaming mode.\n\n```python\nimport pathway as pw\n\n# S3 read\ndata = pw.io.s3.read(\n    bucket=\"my-bucket\",\n    prefix=\"input/data-\",\n    access_key=pw.io.s3.AwsAccessKeySecret(\"AWS_ACCESS_KEY\"),\n    secret_key=pw.io.s3.AwsSecretKeySecret(\"AWS_SECRET_KEY\"),\n    region=\"us-east-1\",\n    format=\"csv\",\n    mode=pw.ConnectorMode.STREAMING\n)\n```\n\nThe connector supports AWS IAM roles, instance profiles, and explicit credentials. It handles retries automatically for transient network failures and implements efficient listing with pagination for buckets with many objects.\n\n资料来源：[examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n\n### HTTP Connector\n\nPathway includes HTTP-based connectors for both receiving data via HTTP servers and making outbound HTTP requests. The HTTP input connector starts a web server that accepts incoming data, while the HTTP output connector can send computed results to external webhooks.\n\n```python\nimport pathway as pw\n\n# HTTP input server\ninput_table = pw.io.http.read(\n    host=\"0.0.0.0\",\n    port=8080,\n    schema=pw.Schema(\n        request_id=pw.ColumnDefinition(type=pw.Type.STR),\n        data=pw.ColumnDefinition(type=pw.Type.STR)\n    ),\n    mode=pw.ConnectorMode.STREAMING\n)\n\n# HTTP output\npw.io.http.write(\n    table=result_table,\n    url=\"https://api.example.com/webhook\",\n    api_key=pw.io.http.ApiKeySecret(\"WEBHOOK_API_KEY\")\n)\n```\n\nThe HTTP input connector supports authentication via API keys and basic authentication. 
It processes incoming POST requests with JSON payloads and automatically handles concurrent requests with thread-safe processing.\n\n资料来源：[examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md)\n\n### SharePoint Connector\n\nPathway provides specialized connectors for Microsoft SharePoint that enable reading files from SharePoint document libraries and sites. The connector supports authenticated access using Azure AD application credentials with certificate-based authentication.\n\n```python\nimport pathway as pw\n\n# SharePoint read\ndocuments = pw.io.sharepoint.read(\n    url=\"https://company.sharepoint.com/sites/project\",\n    root_path=\"/documents\",\n    tenant=\"tenant-id\",\n    client_id=\"app-client-id\",\n    thumbprint=\"cert-thumbprint\",\n    cert_path=\"/path/to/certificate.pem\",\n    mode=pw.ConnectorMode.STREAMING\n)\n```\n\nThe connector indexes all files in the specified SharePoint location, monitoring for additions and modifications. It supports various file formats and can extract content for downstream processing.\n\n资料来源：[examples/projects/sharepoint-test/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/sharepoint-test/README.md)\n\n## Output Connectors\n\n### CSV Writer\n\nThe CSV writer persists Pathway tables to CSV files with configurable delimiters, quoting behavior, and header options. In streaming mode, the writer can append to files or create new files based on time windows.\n\n```python\nimport pathway as pw\n\npw.io.csv.write(\n    table=result_table,\n    path=\"./output/results.csv\",\n    include_header=True,\n    separator=\",\"\n)\n```\n\n### Kafka Writer\n\nThe Kafka writer sends Pathway table updates to Kafka topics. 
Each row update is serialized as a message with configurable key extraction for partition routing.\n\n```python\npw.io.kafka.write(\n    table=result_table,\n    brokers=[\"localhost:9092\"],\n    topic=\"output-events\",\n    format=\"json\",\n    key_column=\"event_id\"\n)\n```\n\n### Database Writers\n\nPathway supports writing to PostgreSQL and other databases with upsert semantics. Writers automatically batch updates for efficiency and handle retries for transient failures.\n\n```python\npw.io.postgres.write(\n    table=result_table,\n    host=\"localhost\",\n    port=5432,\n    database=\"output_db\",\n    table_name=\"results\",\n    user=\"writer_user\",\n    password=pw.io.postgres.PasswordSecret(\"PG_PASSWORD\"),\n    upsert=True\n)\n```\n\n## Schema Definition\n\nPathway uses explicit schema definitions to map external data formats to typed columns. Schemas define column names, types, and metadata that guide parsing and validation.\n\n```python\nimport pathway as pw\n\nclass InputSchema(pw.Schema):\n    id: int\n    name: str\n    timestamp: pw.DateTimeUtc\n    metadata: dict\n```\n\n| Schema Feature | Description |\n|----------------|-------------|\n| Column Types | INT, FLOAT, STR, BOOL, DATETIME, BYTES, JSON |\n| Primary Keys | Mark columns as primary keys for upsert operations |\n| Default Values | Provide fallback values for missing data |\n| Nested Types | Support for arrays and dictionaries |\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## Persistence and State Management\n\nConnectors integrate with Pathway's persistence system to maintain state across restarts. 
When a connector's subject tracks offsets, positions, or timestamps, this state is preserved in the persistence backend and restored on restart.\n\n```python\nimport pathway as pw\n\n# Persistence configuration for connector state\npersistence_config = pw.persistence.Config(\n    backend=pw.persistence.Backend.filesystem(\n        path=\"./persistence_storage/\"\n    )\n)\n\npw.run(\n    persistence_config=persistence_config\n)\n```\n\nThe persistence system ensures that streaming connectors resume from their last processed position, preventing data loss and duplicate processing during planned or unplanned shutdowns.\n\n资料来源：[examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n\n## Common Patterns\n\n### Multi-Source Aggregation\n\nPathway excels at combining data from multiple heterogeneous sources into unified computations. Connectors for each source produce tables that can be joined, unioned, and transformed together.\n\n```python\nimport pathway as pw\n\n# Combine CSV and Kafka sources\ncsv_data = pw.io.csv.read(path=\"./static-data/\", schema=CsvSchema)\nkafka_data = pw.io.kafka.read(\n    brokers=[\"localhost:9092\"],\n    topic=\"live-events\",\n    schema=KafkaSchema\n)\n\n# Join and process\nresult = csv_data.join(kafka_data, pw.left.id == pw.right.source_id).select(\n    id=pw.left.id,\n    static_info=pw.left.info,\n    live_update=pw.right.value\n)\n```\n\n### Live Document Indexing\n\nA common pattern for RAG (Retrieval-Augmented Generation) applications involves continuous document indexing with real-time search capability. 
The HTTP connector combined with vector embedding creates a live knowledge base.\n\n```python\nimport pathway as pw\n\n# Document source with automatic reindexing\ndocs = pw.io.fs.read(\n    path=\"./documents/\",\n    format=\"binary\",\n    mode=pw.ConnectorMode.STREAMING\n)\n\n# Web server for queries\npw.io.http.write_server(\n    table=embedded_docs,\n    host=\"0.0.0.0\",\n    port=8011\n)\n```\n\n资料来源：[examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md)\n\n## Error Handling\n\nConnectors provide mechanisms for handling errors during data processing. The engine tracks connector failures through the `failed()` method on exported tables, enabling pipelines to detect and respond to source or destination errors.\n\n```python\nexported = pw.io.csv.read(\n    path=\"./data/\",\n    schema=InputSchema\n)\n\n# Check for connector failures\nif exported.failed():\n    # Handle error - log, alert, or retry\n    pass\n```\n\nConnectors also support dead-letter patterns where invalid records are routed to separate tables for inspection and reprocessing, ensuring that malformed data does not halt the entire pipeline.\n\n## Security Considerations\n\nConnectors handle sensitive data and credentials with appropriate security measures. Secrets can be provided through environment variables, files, or secret management services. 
The connector API uses typed secret classes that prevent accidental logging of credentials.\n\n| Secret Type | Usage |\n|-------------|-------|\n| `PasswordSecret` | Database passwords |\n| `ApiKeySecret` | HTTP API keys |\n| `AwsAccessKeySecret` | AWS access keys |\n| `AwsSecretKeySecret` | AWS secret keys |\n\nPathway recommends using IAM roles, Azure managed identities, or Kubernetes secrets for production deployments rather than embedding credentials in code.\n\n## Monitoring and Observability\n\nPathway's monitoring dashboard tracks connector metrics including message counts, processing latency, and error rates. Each connector reports statistics independently, enabling fine-grained observability across complex pipelines.\n\n```python\npw.run(monitoring=True)\n```\n\nThe dashboard displays per-connector metrics that help identify bottlenecks and failures in data pipelines. Metrics are also exported to Prometheus-compatible endpoints for integration with existing observability infrastructure.\n\n资料来源：[README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n---\n\n<a id='page-connectors-custom'></a>\n\n## Custom Connectors\n\n### 相关页面\n\n相关主题：[Data Connectors](#page-connectors)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/internals/datasource.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/datasource.py)\n- [python/pathway/internals/datasink.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/datasink.py)\n- [docs/2.developers/4.user-guide/20.connect/99.connectors/30.custom-python-connectors.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/20.connect/99.connectors/30.custom-python-connectors.md)\n- [examples/projects/custom-python-connector-twitter/twitter_connector_example.py](https://github.com/pathwaycom/pathway/blob/main/examples/projects/custom-python-connector-twitter/twitter_connector_example.py)\n</details>\n\n# 
Custom Connectors\n\nPathway provides a powerful extensibility mechanism through Custom Connectors, enabling developers to create bespoke data source and data sink implementations tailored to specific data providers or consumers. This capability allows seamless integration with proprietary systems, legacy data sources, or third-party APIs that are not covered by Pathway's built-in connector library.\n\n## Overview\n\nCustom Connectors bridge the gap between Pathway's Rust-powered computation engine and Python-based data sources or sinks. They leverage the `ConnectorSubject` base class for data sources and provide a callback-based mechanism where Python functions handle the actual data reading or writing logic while the Rust engine manages state, persistence, and incremental computation.\n\nThe architecture follows a clean separation of concerns:\n\n- **Python Layer**: Implements the actual data fetching/writing logic\n- **Rust Engine**: Handles data flow, state management, watermarking, and incremental updates\n\n## Architecture\n\n```mermaid\ngraph TD\n    subgraph \"Python User Code\"\n        A[Custom Connector Class] --> B[ConnectorSubject Subclass]\n        A --> C[read callback]\n        A --> D[seek callback]\n        A --> E[start callback]\n        A --> F[end callback]\n    end\n    \n    subgraph \"Pathway Engine (Rust)\"\n        G[PythonSubject Wrapper] --> H[PythonReader]\n        H --> I[State Manager]\n        I --> J[Incremental Computation]\n    end\n    \n    B --> G\n    C --> G\n    D --> G\n    E --> G\n    F --> G\n```\n\n## Core Components\n\n### PythonSubject\n\nThe `PythonSubject` class serves as the bridge between Python connector implementations and the Pathway Rust engine. 
It wraps several Python callbacks that define the connector's behavior.\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `start` | `Py<PyAny>` | Callback invoked when the connector begins processing |\n| `read` | `Py<PyAny>` | Callback to read data entries from the source |\n| `seek` | `Py<PyAny>` | Callback to seek to a specific position in the data stream |\n| `on_persisted_run` | `Py<PyAny>` | Callback triggered when persisted state is restored |\n| `end` | `Py<PyAny>` | Callback invoked when the connector finishes processing |\n| `is_internal` | `bool` | Flag indicating if this is an internal connector |\n| `deletions_enabled` | `bool` | Flag enabling deletion support |\n\n*资料来源：[src/python_api.rs:1-50](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)*\n\n### ConnectorMode\n\nPathway supports two distinct connector modes that determine data processing behavior:\n\n| Mode | Description |\n|------|-------------|\n| `STATIC` | Reads data once without continuous polling. Ideal for batch processing of finite datasets. |\n| `STREAMING` | Enables polling and continuous data monitoring. Supports real-time data ingestion with watermarks. |\n\n*资料来源：[src/connectors/data_storage.rs:1-20](https://github.com/pathwaycom/pathway/blob/main/src/connectors/data_storage.rs)*\n\nThe `is_polling_enabled()` method returns `false` for Static mode and `true` for Streaming mode, controlling whether the engine periodically invokes the connector's read callback.\n\n## Creating a Custom Data Source\n\n### Step 1: Subclass ConnectorSubject\n\nCreate a new class that extends the base `ConnectorSubject` class. 
This defines your connector's schema and behavior.\n\n```python\nimport pathway as pw\n\nclass MyCustomSourceSubject(pw.ConnectorSubject):\n    schema: pw.Schema = pw.schema_from_types(\n        id=str,\n        value=float,\n        timestamp=int,\n    )\n```\n\n### Step 2: Implement Required Callbacks\n\nThe following callbacks must be implemented to define data retrieval behavior:\n\n| Callback | Purpose |\n|----------|---------|\n| `start()` | Initialize connection and set up reader state |\n| `read()` | Return new data entries since last read |\n| `seek()` | Move to a specific position for replay |\n| `end()` | Clean up resources when processing completes |\n\n### Step 3: Configure the Connector\n\nPass your custom subject class to the connector along with mode and configuration:\n\n```python\nclass MyCustomSource(pw.io.KafkaSource):\n    def __init__(self, subject, topics, start_at_timestamp=0):\n        super().__init__(\n            subject=subject,\n            topics=topics,\n            start_at_timestamp=start_at_timestamp,\n        )\n```\n\n## Creating a Custom Data Sink\n\nCustom data sinks follow a similar pattern to sources but focus on outputting processed data. The datasink receives table updates and writes them to the target system.\n\n### Sink Architecture\n\n```mermaid\ngraph LR\n    A[Pathway Table] --> B[DataSink Handler]\n    B --> C[write callback]\n    C --> D[External System]\n```\n\n### Key Properties\n\n| Property | Description |\n|----------|-------------|\n| `table` | The Pathway table to subscribe to for updates |\n| `id_column` | Column used for primary key identification |\n| `username` | Authentication username (optional) |\n| `password` | Authentication password (optional) |\n| `host` | Target system hostname |\n| `port` | Target system port |\n\n## Twitter Connector Example\n\nA complete example demonstrating custom connector implementation is available in the repository. 
The Twitter connector shows how to wrap an external API as a Pathway data source.\n\n### Key Implementation Details\n\n```python\nclass TwitterSubject(pw.ConnectorSubject):\n    schema: pw.Schema = pw.schema_from_types(\n        id=str,\n        text=str,\n        author=str,\n        created_at=int,\n    )\n    \n    def __init__(self, api_key, api_secret, **kwargs):\n        self.api_key = api_key\n        self.api_secret = api_secret\n        super().__init__(**kwargs)\n    \n    def start(self):\n        self.twitter_api = connect_to_twitter(self.api_key, self.api_secret)\n        self.last_read_id = None\n    \n    def read(self):\n        new_tweets = self.twitter_api.get_new_tweets(since_id=self.last_read_id)\n        self.last_read_id = max(t.id for t in new_tweets)\n        return [(t.id, t.text, t.author, t.created_at) for t in new_tweets]\n```\n\n*资料来源：[examples/projects/custom-python-connector-twitter/twitter_connector_example.py](https://github.com/pathwaycom/pathway/blob/main/examples/projects/custom-python-connector-twitter/twitter_connector_example.py)*\n\n## ValueField Schema Definition\n\nWhen defining connector schemas, `ValueField` provides fine-grained control over column properties:\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `name` | `String` | Column identifier |\n| `type_` | `Type` | Data type (INT, FLOAT, STRING, etc.) 
|\n| `source` | `FieldSource` | Origin of the field (Payload, Time, Partition) |\n| `default` | `Option<Value>` | Default value if field is missing |\n| `metadata` | `Option<String>` | Optional metadata string |\n\n*资料来源：[src/python_api.rs:30-60](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)*\n\n## Integration with Pathway Engine\n\n### Reader Builder Pattern\n\nThe `PythonReaderBuilder` constructs reader instances that integrate with the engine:\n\n```rust\npub struct PythonReaderBuilder {\n    subject: Py<PythonSubject>,\n    schema: HashMap<String, Type>,\n}\n```\n\n*资料来源：[src/connectors/data_storage.rs:20-30](https://github.com/pathwaycom/pathway/blob/main/src/connectors/data_storage.rs)*\n\n### State Management\n\nThe `PythonReader` maintains internal state for incremental processing:\n\n| State Field | Purpose |\n|-------------|---------|\n| `total_entries_read` | Counter for statistics and debugging |\n| `current_external_offset` | Position marker for resumable reads |\n| `is_initialized` | Flag indicating startup completion |\n| `is_finished` | Flag indicating processing completion |\n| `python_thread_state` | GIL state for thread-safe Python calls |\n\n## Best Practices\n\n### Error Handling\n\nImplement robust error handling in callbacks. 
The connector should gracefully handle transient failures and support retry mechanisms.\n\n### Checkpointing\n\nFor long-running connectors, implement periodic checkpointing by tracking processed record identifiers or offsets in `seek()` callback state.\n\n### Resource Management\n\nEnsure resources (connections, file handles, API sessions) are properly initialized in `start()` and cleaned up in `end()`.\n\n### Performance Considerations\n\n- Batch data reads when possible to reduce callback overhead\n- Use appropriate polling intervals for streaming connectors\n- Monitor `total_entries_read` for capacity planning\n\n## See Also\n\n- [Pathway Connectors Documentation](https://pathway.com/developers/user-guide/connect/pathway-connectors)\n- [Built-in Connectors Reference](https://pathway.com/developers/api-docs/pathway)\n- [Persistence and State Management](https://pathway.com/developers/user-guide/persistence/)\n- [Real-time Data Processing Guide](https://pathway.com/developers/user-guide/introduction/welcome)\n\n---\n\n<a id='page-transformations'></a>\n\n## Data Transformations\n\n### 相关页面\n\n相关主题：[Temporal and Time-Based Processing](#page-temporal-processing)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs) (核心引擎接口)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs) (时间戳处理)\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md) (项目概述)\n- [examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md) (YAML配置示例)\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md) (RAG管道示例)\n- 
[external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs) (Differential Dataflow操作)\n</details>\n\n# Data Transformations\n\nData Transformations are the core operational layer in Pathway, enabling you to manipulate, combine, filter, and aggregate data as it flows through your pipeline. Built on a scalable Rust engine powered by Differential Dataflow, Pathway performs incremental computation that efficiently handles both batch and streaming data with the same Python API.\n\n## Architecture Overview\n\nPathway's transformation engine operates on **Tables**, which are the fundamental data structures in the framework. Each table consists of:\n\n- **Columns**: Named data fields with typed values\n- **Keys**: Unique identifiers for rows\n- **Timestamps**: Time indicators for stream ordering\n- **Change Metadata**: Track additions, modifications, and retractions\n\n```mermaid\ngraph TD\n    A[Input Connectors] --> B[Table Creation]\n    B --> C[Transformations]\n    C --> D[Aggregations]\n    C --> E[Joins]\n    C --> F[Filters & Mappings]\n    D --> G[Output Connectors]\n    E --> G\n    F --> G\n    G --> H[Results / Persistence]\n    \n    subgraph Rust Engine\n        C\n        D\n        E\n        F\n    end\n```\n\n资料来源：[README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n## Table Operations\n\nTables are immutable once created in Pathway. 
All data transformations produce new tables rather than modifying existing ones, ensuring data consistency and enabling proper change tracking.\n\n### Core Table Structure\n\nFrom the Rust engine perspective, tables are represented as:\n\n| Component | Description |\n|-----------|-------------|\n| `universe` | A unique identifier space for table rows |\n| `columns` | Collection of typed data columns |\n| `handle` | Internal reference for engine operations |\n\n资料来源：[src/python_api.rs:1-100](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n### Supported Data Types\n\nPathway supports a variety of primitive and complex types:\n\n| Category | Types |\n|----------|-------|\n| Primitives | `int`, `float`, `str`, `bool` |\n| Temporal | `datetime`, `date`, `time` |\n| Complex | `bytes`, `JSON`, `pointer` |\n\n### Filtering\n\nFiltering operations select subsets of rows based on predicates:\n\n```python\nfiltered_table = source_table.filter(source_table.column > threshold)\n```\n\nThe filter operation:\n1. Evaluates the predicate for each row\n2. Produces a new table with only matching rows\n3. 
Propagates timestamps from the source\n\n### Column Operations\n\nAdd, rename, or compute new columns:\n\n```python\n# Add computed column\nresult = table.select(\n    table.id,\n    table.value,\n    total=table.value * table.rate\n)\n\n# Rename columns\nrenamed = table.select(\n    new_name=table.old_name,\n    another=table.other\n)\n```\n\n## Join Operations\n\nPathway provides multiple join strategies for combining tables, all powered by the underlying Differential Dataflow computation model.\n\n```mermaid\ngraph LR\n    A[Table A] --> C[Join Operation]\n    B[Table B] --> C\n    C --> D[Result Table]\n    \n    D --> E[Inner Join<br/>Rows with matches in both]\n    D --> F[Left Join<br/>All rows from A]\n    D --> G[Outer Join<br/>All rows from both]\n```\n\n### Join Types\n\n| Join Type | Behavior |\n|-----------|----------|\n| Inner Join | Returns rows with matches in both tables |\n| Left Join | All rows from left table, matched from right |\n| Outer Join | All rows from both tables |\n| Cross Join | Cartesian product with optional filtering |\n\n### Join Syntax\n\n```python\nresult = left_table.join(\n    right_table,\n    left_table.key == right_table.key\n).select(\n    left_table.star,\n    right_table.data\n)\n```\n\n### Incremental Join Behavior\n\nThe Rust engine handles joins incrementally:\n\n```\nsrc/python_api.rs: Table implementation\n├── handle: TableHandle (internal engine reference)\n├── scope: Py<Scope> (computation scope)\n└── Methods for engine-level join operations\n```\n\n资料来源：[src/python_api.rs:200-300](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## Aggregation and Grouping\n\nPathway's aggregation system uses Differential Dataflow's efficient incremental computation model.\n\n### GroupBy Operations\n\nGroup data by one or more columns:\n\n```python\ngrouped = table.groupby(table.category)\n```\n\n### Built-in Reducers\n\n| Reducer | Description | Example |\n|---------|-------------|---------|\n| `count` | 
Number of rows | `t.groupby(k).reduce(count=pw.reducers.count())` |\n| `sum` | Sum of values | `t.groupby(k).reduce(total=pw.reducers.sum(t.val))` |\n| `avg` | Average value | `t.groupby(k).reduce(avg=pw.reducers.avg(t.val))` |\n| `min` | Minimum value | `t.groupby(k).reduce(min=pw.reducers.min(t.val))` |\n| `max` | Maximum value | `t.groupby(k).reduce(max=pw.reducers.max(t.val))` |\n| `collect` | Collect all values | `t.groupby(k).reduce(all=pw.reducers.collect(t.val))` |\n\n### Custom Aggregations\n\nDefine custom aggregation logic:\n\n```python\nresult = table.groupby(table.key).reduce(\n    key=table.key,\n    custom_agg=custom_reducer(table.values)\n)\n```\n\n## Temporal Operations\n\nPathway handles temporal data with a sophisticated timestamp model that supports streaming and batch scenarios.\n\n### Timestamp Semantics\n\n```mermaid\ngraph TD\n    T1[Timestamp 1] --> T2[Timestamp 2]\n    T2 --> T3[Timestamp 3]\n    T3 --> T4[Timestamp N]\n    \n    subgraph Original Values\n        T1\n        T3\n    end\n    \n    subgraph Retractions\n        T2\n        T4\n    end\n```\n\nThe timestamp system distinguishes between:\n- **Original values** (even timestamps)\n- **Retractions** (odd timestamps)\n\n资料来源：[src/engine/timestamp.rs:1-50](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n\n### Time-based Operations\n\n```python\n# Window by time\nresult = table.windowby(table.timestamp).reduce(\n    time_bucket=table.timestamp.dt.floor(\"1h\"),\n    value=pw.reducers.sum(table.amount)\n)\n```\n\n## SQL-style Processing\n\nPathway supports SQL-like query processing for complex transformations:\n\n```python\nresult = pw.sql.processing(\n    \"\"\"\n    SELECT \n        category,\n        SUM(value) as total,\n        AVG(value) as average\n    FROM source_table\n    GROUP BY category\n    HAVING SUM(value) > 100\n    \"\"\"\n)\n```\n\n### Supported SQL Features\n\n| Feature | Support |\n|---------|---------|\n| SELECT | Full projection 
support |\n| WHERE | Filter conditions |\n| GROUP BY | Aggregation grouping |\n| HAVING | Post-aggregation filters |\n| JOIN | Table combinations |\n| Subqueries | Nested queries |\n\n## Differential Dataflow Foundation\n\nPathway's transformation engine is built on **Differential Dataflow**, an extension of Timely Dataflow that efficiently handles incremental updates.\n\n```mermaid\ngraph LR\n    subgraph Timely Dataflow\n        A[Operators] --> B[Scope]\n        B --> C[Channels]\n    end\n    \n    subgraph Differential Dataflow\n        D[Arrangements] --> E[Collections]\n        E --> F[Traces]\n        D --> F\n    end\n    \n    G[Pathway Python API] --> H[Rust Engine]\n    H --> I[Differential Dataflow]\n```\n\n### Key Properties\n\n| Property | Benefit |\n|----------|---------|\n| Incremental | Only recomputes changed portions |\n| Monotonic | Handles retractions naturally |\n| Composable | Complex pipelines from simple operators |\n| Parallel | Scales across threads automatically |\n\n资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:1-50](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n\n## Computation Model\n\nPathway's computation follows an **append-only** model where:\n\n1. **Input**: Data enters via connectors\n2. **Transform**: Operations produce new tables\n3. **Output**: Results flow to sinks\n4. 
**Persistence**: Optional state storage for recovery\n\n```mermaid\ngraph TD\n    A[Connectors] -->|Input Data| B[Engine]\n    B --> C{Transformations}\n    C --> D[Table Operations]\n    C --> E[Joins]\n    C --> F[Aggregations]\n    D --> G[New Tables]\n    E --> G\n    F --> G\n    G --> H[Output Connectors]\n    G --> I[Persistence Layer]\n```\n\n### Running the Pipeline\n\n```python\nimport pathway as pw\n\n# Define pipeline\ntable = pw.io.fs.read(\"./input\", schema=Schema)\nresult = table.groupby(table.key).reduce(count=pw.reducers.count())\n\n# Configure output\npw.io.json_lines.write(result, \"./output\")\n\n# Execute\npw.run()\n```\n\n资料来源：[README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n## Configuration and YAML Integration\n\nPathway supports declarative pipeline definition via YAML:\n\n```yaml\n# Data source\nsource: !pw.io.csv.read\n  path: ./input.csv\n  schema: $schema\n\n# Transformations\nprocessed: !pw.Transformations.groupby\n  table: $source\n  by: [category]\n  reducers:\n    - !pw.reducers.sum\n      column: amount\n\n# Output\noutput: !pw.io.csv.write\n  table: $processed\n  path: ./output.csv\n```\n\n资料来源：[examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n\n## Error Handling\n\nThe engine tracks and propagates errors through the computation graph:\n\n```mermaid\ngraph TD\n    A[Operation] -->|Success| B[Next Operation]\n    A -->|Error| C[ErrorLog]\n    C -->|Recorded| D[Monitoring]\n    D -->|Notification| E[Alert System]\n```\n\n```python\n# Access error information\nerror_log = table.error_history()\n```\n\n资料来源：[src/python_api.rs:300-400](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## Performance Characteristics\n\n### Threading\n\nPathway natively supports multithreading:\n\n```bash\n$ pathway spawn --threads 3 python main.py\n```\n\n### Memory Efficiency\n\n| Strategy | Description 
|\n|----------|-------------|\n| Incremental | Only changed data is reprocessed |\n| Arrangements | Pre-indexed data structures for fast joins |\n| Persistence | Checkpointing to manage memory |\n\n### Scaling\n\nPathway pipelines scale from:\n- Local development (single thread)\n- Multi-threaded processing\n- Distributed deployment via containerization\n\n## See Also\n\n- [Input Connectors](https://pathway.com/developers/user-guide/connect/pathway-connectors) - Data ingestion methods\n- [Output Connectors](https://pathway.com/developers/user-guide/connect/pathway-connectors) - Data delivery methods\n- [Persistence Guide](https://pathway.com/developers/user-guide/persistence/overview) - State management\n- [RAG Pipeline Example](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/) - End-to-end transformation example\n\n---\n\n<a id='page-temporal-processing'></a>\n\n## Temporal and Time-Based Processing\n\n### 相关页面\n\n相关主题：[Data Transformations](#page-transformations)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/stdlib/temporal/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/__init__.py)\n- [python/pathway/stdlib/temporal/_window.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_window.py)\n- [python/pathway/stdlib/temporal/_asof_join.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_asof_join.py)\n- [python/pathway/stdlib/temporal/_interval_join.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_interval_join.py)\n- [docs/2.developers/4.user-guide/40.temporal-data/10.windows-manual.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/40.temporal-data/10.windows-manual.md)\n- 
[docs/2.developers/4.user-guide/40.temporal-data/40.asof-join.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/40.temporal-data/40.asof-join.md)\n</details>\n\n# Temporal and Time-Based Processing\n\n## Overview\n\nPathway's temporal and time-based processing capabilities enable sophisticated stream analytics by leveraging the underlying Differential Dataflow computation model. The framework provides robust mechanisms for windowing operations, temporal joins, and time-based aggregations that work seamlessly across both batch and streaming data scenarios.\n\nPathway processes data with timestamps, maintaining the temporal order and allowing computations to reference historical data at specific time points. The Rust engine manages time progression through frontiers—boundary markers that indicate which timestamps have been fully processed—enabling consistent and deterministic results regardless of data arrival order.\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)  \n资料来源：[src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n\n## Core Concepts\n\n### Timestamps and Time Model\n\nPathway operates on timestamped data where each record carries a timestamp indicating when the event occurred or when it should be processed. The timestamp model supports both event time and processing time semantics.\n\n```python\n# Timestamps are automatically tracked by Pathway\n# Each data record carries temporal information\nresult = table.select(\n    col=table.value,\n    time=table._time_attribute  # System-managed timestamp\n)\n```\n\n### Frontiers and Time Progress\n\nFrontiers in Pathway represent the boundary between processed and unprocessed data. 
The system tracks two critical frontiers:\n\n| Frontier Type | Description |\n|--------------|-------------|\n| Input Frontier | Earliest incomplete input timestamp |\n| Output Frontier | Latest timestamp for which all computations are complete |\n\nThe capability system (borrowed from Timely Dataflow) allows operators to hold references to specific timestamps, ensuring computations only execute when their input data is available.\n\n资料来源：[external/timely-dataflow/timely/src/dataflow/operators/capability.rs](https://github.com/pathwaycom/pathway/blob/main/external/timely-dataflow/timely/src/dataflow/operators/capability.rs)\n\n## Windowing Operations\n\nPathway provides comprehensive windowing capabilities through the `pathway.stdlib.temporal` module. Windows group data based on temporal criteria, enabling aggregations, joins, and transformations over bounded time intervals.\n\n```mermaid\ngraph TD\n    A[Input Stream] --> B[Window Assigner]\n    B --> C[Tumbling Windows]\n    B --> D[Sliding Windows]\n    B --> E[Session Windows]\n    C --> F[Window 1]\n    C --> G[Window 2]\n    D --> H[Overlapping Windows]\n    E --> I[Session 1]\n    E --> J[Session 2]\n```\n\n### Tumbling Windows\n\nTumbling windows are fixed-size, non-overlapping time windows that partition the data stream into discrete, consecutive segments. 
Each event belongs to exactly one window.\n\n```python\nfrom pathway.stdlib.temporal import tumbling_window\n\n# Create tumbling windows of 1 hour\nresult = table.window.tumbling(\n    window_length=timedelta(hours=1),\n    timestamp=table.event_time\n).reduce(\n    count=tools.count(),\n    sum_value=tools.sum(table.value)\n)\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `window_length` | `timedelta` | Duration of each window |\n| `timestamp` | `ColumnExpression` | Column containing event timestamps |\n| `session_gap` | `timedelta` | Optional gap between sessions |\n\n### Sliding Windows\n\nSliding windows move continuously over the data stream with a fixed size and slide interval. Unlike tumbling windows, consecutive windows can overlap.\n\n```python\nfrom pathway.stdlib.temporal import sliding_window\n\n# Create sliding windows of 5 minutes, sliding every 1 minute\nresult = table.window.sliding(\n    window_length=timedelta(minutes=5),\n    slide=timedelta(minutes=1),\n    timestamp=table.event_time\n).reduce(\n    avg_value=tools.aggregate.avg(table.value)\n)\n```\n\n### Session Windows\n\nSession windows detect periods of activity separated by gaps of inactivity. 
They automatically expand as new events arrive within the gap threshold.\n\n```python\nfrom pathway.stdlib.temporal import session_window\n\n# Session windows with 30-minute inactivity gap\nresult = table.window.session(\n    gap=timedelta(minutes=30),\n    timestamp=table.event_time\n).reduce(\n    session_duration=tools.count(),\n    total_amount=tools.sum(table.amount)\n)\n```\n\n资料来源：[python/pathway/stdlib/temporal/_window.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_window.py)  \n资料来源：[docs/2.developers/4.user-guide/40.temporal-data/10.windows-manual.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/40.temporal-data/10.windows-manual.md)\n\n## Temporal Joins\n\n### ASOF Join\n\nASOF (As-Of) join is designed for time-series data where you want to match records from two tables based on the closest timestamp without exceeding a specified threshold. This is particularly useful for correlating events that occur at approximately the same time.\n\n```python\nfrom pathway.stdlib.temporal import asof_join\n\n# Match trades with the most recent quote before each trade\nresult = trades.join(\n    quotes,\n    trades.timestamp >= quotes.timestamp,\n    trades.timestamp < quotes.timestamp + timedelta(seconds=10),\n    selectors={\n        trades.price: \"trade_price\",\n        quotes.price: \"quote_price\",\n    }\n).select(\n    trade_price=result.trade_price,\n    quote_price=result.quote_price,\n    time_diff=result.trades_timestamp - result.quotes_timestamp\n)\n```\n\n**Configuration Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `time_distance` | `timedelta` | Maximum time distance between matched records |\n| `closed_window` | `bool` | Include boundary timestamps in matches |\n| `behavior` | `str` | `\"all\"`, `\"emit_per_right\"`, or `\"emit_per_left\"` |\n| `index` | `Table` | Optional reference table for index-based matching |\n\nThe ASOF join 
supports multiple matching strategies:\n\n```python\n# Emit one result per left record (default)\nresult = left.asof_join(\n    right,\n    left.time >= right.time,\n    left.time < right.time + time_distance\n)\n\n# Emit one result per right record\nresult = left.asof_join(\n    right,\n    left.time >= right.time,\n    left.time < right.time + time_distance,\n    behavior=ASOF_join_behavior.EMIT_PER_RIGHT\n)\n```\n\n资料来源：[python/pathway/stdlib/temporal/_asof_join.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_asof_join.py)  \n资料来源：[docs/2.developers/4.user-guide/40.temporal-data/40.asof-join.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/40.temporal-data/40.asof-join.md)\n\n### Interval Join\n\nInterval join matches records from two tables where the timestamps fall within a specified time interval relative to each other. Unlike ASOF join which finds the closest match, interval join produces all pairs where timestamps satisfy the interval condition.\n\n```python\nfrom pathway.stdlib.temporal import interval_join\n\n# Match orders with shipments that arrive within 1-5 days after ordering\nresult = orders.join(\n    shipments,\n    orders.order_date <= shipments.ship_date,\n    orders.order_date + timedelta(days=5) >= shipments.ship_date,\n    orders.order_date + timedelta(days=1) <= shipments.ship_date,\n    selectors={\n        orders.order_id: \"order_id\",\n        orders.order_date: \"order_date\",\n        shipments.ship_date: \"ship_date\",\n        shipments.tracking: \"tracking_number\"\n    }\n).select(\n    order_id=result.order_id,\n    order_date=result.order_date,\n    ship_date=result.ship_date,\n    delay=result.ship_date - result.order_date\n)\n```\n\n**Key Differences from ASOF Join:**\n\n| Aspect | ASOF Join | Interval Join |\n|--------|----------|---------------|\n| Match Cardinality | One-to-one (closest) | One-to-many |\n| Timestamp Condition | Single bound 
(closest) | Interval range |\n| Use Case | Time-series alignment | Event correlation |\n| Performance | Optimized for single match | May produce more results |\n\n资料来源：[python/pathway/stdlib/temporal/_interval_join.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_interval_join.py)\n\n## Windowed Aggregations\n\nPathway supports aggregations over windowed data with a rich set of built-in functions:\n\n```python\nfrom pathway.stdlib.temporal import window\nfrom pathway.stdlib import tools\n\n# Comprehensive windowed aggregation\nresult = sensor_data.window.tumbling(\n    window_length=timedelta(minutes=15)\n).reduce(\n    # Count aggregations\n    event_count=tools.count(),\n    distinct_sensors=tools.count(distinct=True, by=sensor_data.sensor_id),\n    \n    # Numeric aggregations\n    avg_temperature=tools.avg(sensor_data.temperature),\n    max_reading=tools.max(sensor_data.value),\n    min_reading=tools.min(sensor_data.value),\n    total_value=tools.sum(sensor_data.value),\n    variance_pop=tools.var_pop(sensor_data.value),\n    \n    # Collection aggregations\n    all_readings=tools.collect(sensor_data.value),\n    std_dev=tools.stddev(sensor_data.value)\n)\n```\n\n**Available Aggregation Functions:**\n\n| Function | Description | Applicable Types |\n|----------|-------------|------------------|\n| `count()` | Number of records | All |\n| `sum(column)` | Sum of values | Numeric |\n| `avg(column)` | Arithmetic mean | Numeric |\n| `min(column)` | Minimum value | Comparable |\n| `max(column)` | Maximum value | Comparable |\n| `var_pop(column)` | Population variance | Numeric |\n| `stddev(column)` | Standard deviation | Numeric |\n| `collect(column)` | Collect all values | All |\n| `any(column)` | Return any value | All |\n\n## Time Evolution and State Management\n\nPathway maintains state across time using differential dataflow's incremental computation model. 
The persistence layer tracks state changes and enables recovery from checkpoints.\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Timestamp Assignment]\n    B --> C[Differential Update]\n    C --> D[State Accumulation]\n    D --> E[Frontier Advancement]\n    E --> F[Output Production]\n    E -->|State Checkpoint| G[Persistence Backend]\n    G -->|Recovery| D\n```\n\n### Checkpoint and Recovery\n\nThe persistence system automatically snapshots state at frontier boundaries:\n\n```python\nimport pathway as pw\n\nclass MyApp(pw.Application):\n    # Enable persistence with automatic snapshots\n    persistence_config = pw.PersistenceConfig(\n        backend=pw.persistence.Backends.filesystem(\"./data\"),\n        snapshot_interval=timedelta(minutes=5)\n    )\n    \n    # Tables automatically benefit from persistence\n    @pw.operator\n    def my_operator(self, table):\n        return table.groupby(table.category).reduce(\n            total=pw.reducers.sum(table.value)\n        )\n```\n\n资料来源：[src/persistence/tracker.rs](https://github.com/pathwaycom/pathway/blob/main/src/persistence/tracker.rs)\n\n## API Reference\n\n### Module: pathway.stdlib.temporal\n\n#### Window Creation\n\n```python\npathway.stdlib.temporal.tumbling_window(\n    table,\n    time_column,\n    window_length,\n    hop=None,\n    name=None\n)\n```\n\n#### Join Operations\n\n```python\npathway.stdlib.temporal.asof_join(\n    left_table,\n    right_table,\n    left_time,\n    right_time,\n    time_distance,\n    behavior=ASOF_join_behavior.EMIT_PER_RIGHT,\n    selectors=None\n)\n\npathway.stdlib.temporal.interval_join(\n    left_table,\n    right_table,\n    left_time_lower,\n    left_time_upper,\n    right_time_lower,\n    right_time_upper,\n    selectors=None\n)\n```\n\n### Enum: ASOF_join_behavior\n\n| Value | Description |\n|-------|-------------|\n| `EMIT_PER_RIGHT` | Emit one result per right record matched |\n| `EMIT_PER_LEFT` | Emit one result per left record matched |\n| `EMIT_ALL` | Emit all 
possible matches |\n\n## Best Practices\n\n### Timestamp Column Selection\n\nChoose timestamp columns carefully:\n\n```python\n# Prefer event-time columns for windowing\ntable = pw.Table.read(\n    data,\n    id_column=\"event_id\",\n    time_column=\"event_timestamp\"  # Event time, not processing time\n)\n```\n\n### Performance Considerations\n\n1. **Window Size Tuning**: Smaller windows reduce memory footprint but increase computation frequency\n2. **Session Gap Configuration**: Set gaps based on expected user behavior patterns\n3. **Join Cardinality**: Monitor interval join result sizes to avoid unbounded growth\n4. **Late Data Handling**: Configure watermarks appropriately for your data arrival patterns\n\n### Handling Out-of-Order Data\n\n```python\n# Configure watermark tolerance for late events\nresult = table.window.tumbling(\n    window_length=timedelta(hours=1),\n    timestamp=table.event_time,\n    watermark=timedelta(minutes=5)  # Accept events up to 5 min late\n).reduce(\n    count=tools.count()\n)\n```\n\n## Conclusion\n\nPathway's temporal and time-based processing capabilities provide a powerful foundation for building real-time analytics applications. 
The combination of flexible windowing, specialized temporal joins, and the underlying differential dataflow engine enables processing of streaming data with the same ease as batch processing, while maintaining correctness and determinism across restarts and scaling events.\n\n资料来源：[python/pathway/stdlib/temporal/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/__init__.py)\n\n---\n\n<a id='page-llm-xpack'></a>\n\n## LLM xPack\n\n### 相关页面\n\n相关主题：[Data Connectors](#page-connectors)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/xpacks/llm/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/__init__.py)\n- [python/pathway/xpacks/llm/llms.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/llms.py)\n- [python/pathway/xpacks/llm/embedders.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/embedders.py)\n- [python/pathway/xpacks/llm/parsers.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/parsers.py)\n- [python/pathway/xpacks/llm/splitters.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/splitters.py)\n- [python/pathway/xpacks/llm/rerankers.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/rerankers.py)\n- [docs/2.developers/4.user-guide/50.llm-xpack/10.overview.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/50.llm-xpack/10.overview.md)\n- [docs/2.developers/4.user-guide/50.llm-xpack/70.embedders.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/50.llm-xpack/70.embedders.md)\n</details>\n\n# LLM xPack\n\nThe LLM xPack is a specialized extension module within Pathway that provides comprehensive integration capabilities for Large Language Models (LLMs), embedding models, and retrieval-augmented generation (RAG) workflows. 
This module enables developers to build production-ready LLM pipelines with real-time data processing capabilities.\n\n## Overview\n\nPathway's LLM xPack bridges the gap between traditional ETL stream processing and modern AI-powered applications. It provides Python-native interfaces for interacting with various LLM providers, embedding services, and RAG components while maintaining Pathway's core strengths in handling streaming and batch data with consistency guarantees.\n\nThe module is designed to work seamlessly with Pathway's Rust-powered engine, enabling high-performance inference and data transformation operations that can scale from local development to production deployments.\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Pathway Pipeline] --> B[LLM xPack]\n    B --> C[LLMs Module]\n    B --> D[Embedders Module]\n    B --> E[Rerankers Module]\n    B --> F[Splitters Module]\n    B --> G[Parsers Module]\n    \n    C --> H[OpenAI]\n    C --> I[Azure OpenAI]\n    C --> J[Anthropic]\n    C --> K[Google AI]\n    C --> L[Custom REST]\n    \n    D --> M[OpenAI Embeddings]\n    D --> N[HuggingFace Embeddings]\n    D --> O[Ollama]\n    D --> P[Azure OpenAI Embeddings]\n    \n    H --> Q[Responses]\n    I --> Q\n    J --> Q\n    K --> Q\n    L --> Q\n```\n\n## Module Structure\n\n| Module | Purpose | Primary Classes/Functions |\n|--------|---------|---------------------------|\n| `llms.py` | LLM provider integration | `OpenAILanguageModel`, `AzureOpenAILanguageModel`, `AnthropicLanguageModel` |\n| `embedders.py` | Text embedding generation | `OpenAIEmbedder`, `HuggingFaceEmbedder`, `OllamaEmbedder` |\n| `rerankers.py` | Document reranking for RAG | Cross-encoder based reranking |\n| `splitters.py` | Text document splitting | `TokenSplitter`, `SentenceSplitter` |\n| `parsers.py` | Output parsing and extraction | Response parsing utilities |\n\n## LLMs Module\n\nThe LLMs module provides unified interfaces for interacting with various language model providers. 
All implementations follow a common base interface ensuring portability across different backends.\n\n### Supported Providers\n\n```mermaid\ngraph LR\n    A[Developer Code] --> B[LLM Wrapper Layer]\n    B --> C[OpenAI]\n    B --> D[Azure OpenAI]\n    B --> E[Anthropic]\n    B --> F[Google AI]\n    B --> G[Custom API]\n```\n\n### Base Interface\n\nThe LLM wrappers implement a common pattern where prompts are processed and responses are returned as Pathway-compatible data structures. This enables seamless integration with Pathway's table operations.\n\n```python\n# Conceptual interface pattern\nclass BaseLanguageModel:\n    def __call__(self, prompt: str, **kwargs) -> LLMResponse:\n        ...\n```\n\n### Provider-Specific Configurations\n\n| Provider | Authentication | API Version | Model Selection |\n|----------|---------------|-------------|-----------------|\n| OpenAI | API Key | Latest | Via model parameter |\n| Azure OpenAI | API Key + Endpoint | 2024-02-01 | Deployment name |\n| Anthropic | API Key | Latest | claude-3 family |\n| Google AI | API Key | Latest | Gemini models |\n\n## Embedders Module\n\nEmbedders generate vector representations of text for semantic search and similarity matching. 
The module supports multiple embedding providers with consistent interfaces.\n\n### Supported Embedding Services\n\n| Embedder | Provider | Dimension Support | Batch Processing |\n|----------|----------|-------------------|------------------|\n| `OpenAIEmbedder` | OpenAI | 1536, 3072 | Yes |\n| `HuggingFaceEmbedder` | HuggingFace Inference API | Model-dependent | Yes |\n| `OllamaEmbedder` | Local Ollama server | Model-dependent | Yes |\n| `AzureOpenAIEmbedder` | Azure OpenAI | 1536, 3072 | Yes |\n\n### Embedding Workflow\n\n```mermaid\ngraph TD\n    A[Input Text] --> B[Preprocessing]\n    B --> C[Tokenization]\n    C --> D[API Call / Local Inference]\n    D --> E[Vector Embedding]\n    E --> F[Normalized Output]\n```\n\n### Configuration Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `model` | str | Required | Embedding model identifier |\n| `api_key` | str | Environment | Authentication key |\n| `batch_size` | int | 32 | Documents per batch |\n| `timeout` | float | 60.0 | Request timeout in seconds |\n| `max_retries` | int | 3 | Retry attempts on failure |\n\n## Rerankers Module\n\nRerankers improve retrieval quality by reordering initial search results using cross-encoder models. This is particularly valuable in RAG pipelines where initial vector search may return partially relevant results.\n\n### Reranking Process\n\n```mermaid\ngraph LR\n    A[Query] --> B[Initial Retrieval<br/>Top-K Results]\n    B --> C[Cross-Encoder Scorer]\n    C --> D[Reordered Results]\n    D --> E[Final Output]\n    \n    F[Retrieved Documents] --> C\n```\n\nThe reranking module uses cross-encoder models to score query-document pairs, returning results ordered by relevance score.\n\n## Splitters Module\n\nDocument splitters divide large texts into smaller chunks suitable for embedding and retrieval. 
Proper chunking is critical for RAG pipeline effectiveness.\n\n### Splitting Strategies\n\n| Strategy | Description | Use Case |\n|----------|-------------|----------|\n| `TokenSplitter` | Split by token count | Fixed context windows |\n| `SentenceSplitter` | Split at sentence boundaries | Natural language coherence |\n| `RecursiveSplitter` | Hierarchical splitting | Complex documents |\n\n### Configuration\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `chunk_size` | int | Target chunk size |\n| `chunk_overlap` | int | Overlap between chunks |\n| `separator` | str | Splitting delimiter |\n\n## Parsers Module\n\nThe parsers module provides utilities for extracting structured information from LLM responses. This is essential for turning raw model outputs into usable data.\n\n### Supported Parsing Patterns\n\n- JSON extraction from markdown-formatted responses\n- Structured output parsing with Pydantic models\n- Template-based response extraction\n\n## RAG Pipeline Integration\n\nThe LLM xPack is designed to compose into complete RAG pipelines within Pathway:\n\n```mermaid\ngraph TD\n    A[Data Sources] --> B[Pathway Connectors]\n    B --> C[Document Splitter]\n    C --> D[Embedder]\n    D --> E[Vector Store / Index]\n    E --> F[Query Input]\n    F --> G[Similarity Search]\n    G --> H[Reranker]\n    H --> I[LLM Prompt Assembly]\n    I --> J[LLM Inference]\n    J --> K[Response Parser]\n    K --> L[Structured Output]\n```\n\n## Usage Example\n\n```python\nimport pathway as pw\nfrom pathway.xpacks.llm import OpenAIEmbedder, OpenAILanguageModel\nfrom pathway.xpacks.llm.splitters import TokenSplitter\n\n# Configure embedder\nembedder = OpenAIEmbedder(model=\"text-embedding-3-small\")\n\n# Configure LLM\nllm = OpenAILanguageModel(model=\"gpt-4o\")\n\n# Document splitting\nsplitter = TokenSplitter(chunk_size=512, chunk_overlap=50)\n\n# Pipeline integration\nclass LLMQueryResult(pw.Result):\n    response: str\n    context: 
list[str]\n```\n\n## Error Handling and Resilience\n\nThe LLM xPack implements robust error handling for production deployments:\n\n- **Automatic retry** with exponential backoff for transient failures\n- **Timeout handling** to prevent pipeline stalls\n- **Rate limiting compliance** for API-based providers\n- **Graceful degradation** when optional components are unavailable\n\n## Performance Considerations\n\n| Aspect | Recommendation |\n|--------|----------------|\n| Batch Processing | Enable batching for multiple documents |\n| Caching | Use for repeated queries |\n| Async Operations | Prefer async APIs where supported |\n| Memory | Monitor embedding vector memory usage |\n\n## Configuration Sources\n\nLLM xPack credentials and settings can be configured through:\n\n1. **Environment variables** (recommended for production)\n2. **Pathway config files** (YAML/JSON)\n3. **Runtime parameters** (for development)\n\nCommon environment variables:\n\n```bash\nOPENAI_API_KEY=sk-...\nANTHROPIC_API_KEY=sk-ant-...\nGOOGLE_API_KEY=...\n```\n\n## See Also\n\n- [Pathway Documentation](https://pathway.com/developers/)\n- [LLM xPack User Guide](https://pathway.com/developers/user-guide/llm-xpack/)\n- [RAG Pipeline Examples](https://github.com/pathwaycom/pathway/tree/main/examples)\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：pathwaycom/pathway\n\n摘要：发现 22 个潜在踩坑项，其中 2 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any。\n\n## 1. 安装坑 · 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: TypeError: Cannot instantiate typing.Any\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d63b2462b18449c4a2c02bb5e02a96c8 | https://github.com/pathwaycom/pathway/issues/227 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 
安全/权限坑 · 来源证据：Entra Authentication Support (credential handler and/or password callback)\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Entra Authentication Support (credential handler and/or password callback)\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7c65f4d8bce4b1a9c99accbf0457633 | https://github.com/pathwaycom/pathway/issues/230 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：Automated schema exploration in input connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Automated schema exploration in input connectors\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_74b91feb391f4b0298216d2a48fe5abe | https://github.com/pathwaycom/pathway/issues/224 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 4. 安装坑 · 来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ab5d2b2076034999bfaed612a3d95d60 | https://github.com/pathwaycom/pathway/issues/232 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 
安装坑 · 来源证据：Improve watermarks in POSIX-like objects tracker\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Improve watermarks in POSIX-like objects tracker\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_38f29b1fba3b41cc99b1364d15ef7213 | https://github.com/pathwaycom/pathway/issues/225 | 来源讨论提到 node 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：Persistence in `iterate` operator\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Persistence in `iterate` operator\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e642d639ebbc4382b242028ef89ec236 | https://github.com/pathwaycom/pathway/issues/214 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 7. 安装坑 · 来源证据：Support LEANN for RAG pipelines\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support LEANN for RAG pipelines\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_5d99dd1cfc794b299633b6c5dc9dd33b | https://github.com/pathwaycom/pathway/issues/173 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 安装坑 · 来源证据：Support MongoDB Atlas in pw.io.mongodb\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support MongoDB Atlas in pw.io.mongodb\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fa1d6e2e977a45d99c414724681f0f32 | https://github.com/pathwaycom/pathway/issues/221 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 9. 
安装坑 · 来源证据：feat: Add Microsoft SQL Server (MSSQL) connector\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：feat: Add Microsoft SQL Server (MSSQL) connector\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_370bcc864a314734b602f01d3d444ef9 | https://github.com/pathwaycom/pathway/issues/204 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：v0.27.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.27.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8d6905d8719c488cbcbb821e794add26 | https://github.com/pathwaycom/pathway/releases/tag/v0.27.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 安装坑 · 来源证据：v0.29.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.29.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b25babd3e3cc49108e56daaaaae8c5bf | https://github.com/pathwaycom/pathway/releases/tag/v0.29.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 12. 安装坑 · 来源证据：v0.30.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.30.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_0291ee068e4e4d38aa86d0fd433b960a | https://github.com/pathwaycom/pathway/releases/tag/v0.30.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 13. 配置坑 · 来源证据：v0.28.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.28.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_708184d9c8ac445d923c82d38af7bd19 | https://github.com/pathwaycom/pathway/releases/tag/v0.28.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 
配置坑 · 来源证据：v0.30.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.30.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7a19bd03a49499e987781160e4b76f0 | https://github.com/pathwaycom/pathway/releases/tag/v0.30.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:571186647 | https://github.com/pathwaycom/pathway | README/documentation is current enough for a first validation pass.\n\n## 16. 运行坑 · 来源证据：`pw.io.mssql.read` needs LSN persistence\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：`pw.io.mssql.read` needs LSN persistence\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_59d927a667a843ccb7d0a4cd147a1dc0 | https://github.com/pathwaycom/pathway/issues/218 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 17. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | last_activity_observed missing\n\n## 18. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 19. 
安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 20. 安全/权限坑 · 来源证据：Support encryption in Kinesis connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Support encryption in Kinesis connectors\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f29097c3eced4a209e7c80971aeea40c | https://github.com/pathwaycom/pathway/issues/223 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 21. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | issue_or_pr_quality=unknown\n\n## 22. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | release_recency=unknown\n\n<!-- canonical_name: pathwaycom/pathway; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "pathway",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "github_repo:571186647",
          "kind": "repo",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/pathwaycom/pathway"
        },
        {
          "evidence_id": "art_60def7e0e2da413bb22cca132cca079f",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/pathwaycom/pathway#readme"
        }
      ],
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "pathway 说明书",
      "toc": [
        "https://github.com/pathwaycom/pathway 项目说明书",
        "目录",
        "Pathway Introduction",
        "Overview",
        "Architecture",
        "Data Model",
        "Processing Modes",
        "Expression Engine",
        "Doramagic 踩坑日志"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "510c6daf92bd76e21b7279cee5f588b1b64e7b0f",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "pyproject.toml",
      "README.md",
      "docs/2.developers/7.templates/30.configure-yaml.md",
      "docs/2.developers/7.templates/35.custom-components.md",
      "docs/2.developers/7.templates/20.run-a-template.md",
      "docs/2.developers/7.templates/38.licensing-guide.md",
      "docs/2.developers/7.templates/1.index.md",
      "docs/2.developers/4.user-guide/.changelog.md",
      "docs/2.developers/7.templates/rag/1008.template-demo-document-indexing.md",
      "docs/2.developers/7.templates/rag/1000.demo-question-answering.md",
      "docs/2.developers/7.templates/rag/1001.template-adaptive-rag.md",
      "docs/2.developers/7.templates/rag/1010.template-slides-search.md",
      "docs/2.developers/7.templates/rag/5.unstructured-to-structured.md",
      "docs/2.developers/7.templates/rag/1009.drive-alert.md",
      "docs/2.developers/7.templates/rag/1003.template-multimodal-rag.md",
      "docs/2.developers/7.templates/rag/1002.template-private-rag.md",
      "docs/2.developers/7.templates/rag/.navigation.yml",
      "docs/2.developers/7.templates/40.rag-customization/20.custom-prompt.md",
      "docs/2.developers/7.templates/40.rag-customization/10.REST-API.md",
      "docs/2.developers/7.templates/40.rag-customization/40.splitters.md",
      "docs/2.developers/7.templates/40.rag-customization/50.embedders.md",
      "docs/2.developers/7.templates/40.rag-customization/30.parsers.md",
      "docs/2.developers/7.templates/40.rag-customization/60.llm-chats.md",
      "docs/2.developers/7.templates/ETL/2.twitter.md",
      "docs/2.developers/7.templates/ETL/180.kafka-alternative.md",
      "docs/2.developers/7.templates/ETL/150.etl-python-airbyte.md",
      "docs/2.developers/7.templates/ETL/5.linear_regression_with_kafka.md",
      "docs/2.developers/7.templates/ETL/10.el-pipeline.md",
      "docs/2.developers/7.templates/ETL/7.realtime-log-monitoring.md",
      "docs/2.developers/7.templates/ETL/175.delta_lake_etl.md",
      "docs/2.developers/7.templates/ETL/4.logistics.md",
      "docs/2.developers/7.templates/ETL/.navigation.yml",
      "docs/2.developers/7.templates/ETL/140.kafka-etl.md",
      "docs/2.developers/7.templates/60.deploy/20.aws-fargate-deploy.md",
      "docs/2.developers/7.templates/60.deploy/10.cloud-deployment.md",
      "docs/2.developers/7.templates/60.deploy/15.gcp-deploy.md",
      "docs/2.developers/7.templates/60.deploy/25.azure-aci-deploy.md",
      "docs/2.developers/7.templates/39.yaml-snippets/20.rag-configuration-examples.md",
      "docs/2.developers/7.templates/39.yaml-snippets/10.data-sources-examples.md",
      "docs/2.developers/7.templates/39.yaml-snippets/.navigation.yml"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# frontend - Doramagic AI Context Pack\n\n> 定位：安装前体验与判断资产。它帮助宿主 AI 有一个好的开始，但不代表已经安装、执行或验证目标项目。\n\n## 充分原则\n\n- **充分原则，不是压缩原则**：AI Context Pack 应该充分到让宿主 AI 在开工前理解项目价值、能力边界、使用入口、风险和证据来源；它可以分层组织，但不以最短摘要为目标。\n- **压缩策略**：只压缩噪声和重复内容，不压缩会影响判断和开工质量的上下文。\n\n## 给宿主 AI 的使用方式\n\n你正在读取 Doramagic 为 frontend 编译的 AI Context Pack。请把它当作开工前上下文：帮助用户理解适合谁、能做什么、如何开始、哪些必须安装后验证、风险在哪里。不要声称你已经安装、运行或执行了目标项目。\n\n## Claim 消费规则\n\n- **事实来源**：Repo Evidence + Claim/Evidence Graph；Human Wiki 只提供显著性、术语和叙事结构。\n- **事实最低状态**：`supported`\n- `supported`：可以作为项目事实使用，但回答中必须引用 claim_id 和证据路径。\n- `weak`：只能作为低置信度线索，必须要求用户继续核实。\n- `inferred`：只能用于风险提示或待确认问题，不能包装成项目事实。\n- `unverified`：不得作为事实使用，应明确说证据不足。\n- `contradicted`：必须展示冲突来源，不得替用户强行选择一个版本。\n\n## 它最适合谁\n\n- **想在安装前理解开源项目价值和边界的用户**：当前证据主要来自项目文档。 证据：`README.md` Claim：`clm_0002` supported 0.86\n\n## 它能做什么\n\n- **命令行启动或安装流程**（需要安装后验证）：项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n## 怎么开始\n\n- `pip install -U pathway` 证据：`README.md` Claim：`clm_0003` supported 0.86\n\n## 继续前判断卡\n\n- **当前建议**：先做角色匹配试用\n- **为什么**：这个项目更像角色库，核心风险是选错角色或把角色文案当执行能力；先用 Prompt Preview 试角色匹配，再决定是否沙盒导入。\n\n### 30 秒判断\n\n- **现在怎么做**：先做角色匹配试用\n- **最小安全下一步**：先用 Prompt Preview 试角色匹配；满意后再隔离导入\n- **先别相信**：角色质量和任务匹配不能直接相信。\n- **继续会触碰**：角色选择偏差、命令执行、本地环境或项目文件\n\n### 现在可以相信\n\n- **适合人群线索：想在安装前理解开源项目价值和边界的用户**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- **能力存在：命令行启动或安装流程**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n- **存在 Quick Start / 安装命令线索**（supported）：可以相信项目文档出现过启动或安装入口；不要因此直接在主力环境运行。 证据：`README.md` Claim：`clm_0003` supported 0.86\n\n### 现在还不能相信\n\n- **角色质量和任务匹配不能直接相信。**（unverified）：角色库证明有很多角色，不证明每个角色都适合你的具体任务，也不证明角色能产生高质量结果。\n- **不能把角色文案当成真实执行能力。**（unverified）：安装前只能判断角色描述和任务画像是否匹配，不能证明它能在宿主 AI 里完成任务。\n- **真实输出质量不能在安装前相信。**（unverified）：Prompt Preview 只能展示引导方式，不能证明真实项目中的结果质量。\n- **宿主 AI 
版本兼容性不能在安装前相信。**（unverified）：Claude、Cursor、Codex、Gemini 等宿主加载规则和版本差异必须在真实环境验证。\n- **不会污染现有宿主 AI 行为，不能直接相信。**（inferred）：Skill、plugin、AGENTS/CLAUDE/GEMINI 指令可能改变宿主 AI 的默认行为。\n- **可安全回滚不能默认相信。**（unverified）：除非项目明确提供卸载和恢复说明，否则必须先在隔离环境验证。\n- **真实安装后是否与用户当前宿主 AI 版本兼容？**（unverified）：兼容性只能通过实际宿主环境验证。\n- **项目输出质量是否满足用户具体任务？**（unverified）：安装前预览只能展示流程和边界，不能替代真实评测。\n\n### 继续会触碰什么\n\n- **角色选择偏差**：用户对任务应该由哪个专家角色处理的判断。 原因：选错角色会让 AI 从错误专业视角回答，浪费时间或误导决策。\n- **命令执行**：包管理器、网络下载、本地插件目录、项目配置或用户主目录。 原因：运行第一条命令就可能产生环境改动；必须先判断是否值得跑。 证据：`README.md`\n- **本地环境或项目文件**：安装结果、插件缓存、项目配置或本地依赖目录。 原因：安装前无法证明写入范围和回滚方式，需要隔离验证。 证据：`README.md`\n- **宿主 AI 上下文**：AI Context Pack、Prompt Preview、Skill 路由、风险规则和项目事实。 原因：导入上下文会影响宿主 AI 后续判断，必须避免把未验证项包装成事实。\n\n### 最小安全下一步\n\n- **先跑 Prompt Preview**：先用交互式试用验证任务画像和角色匹配，不要先导入整套角色库。（适用：任何项目都适用，尤其是输出质量未知时。）\n- **只在隔离目录或测试账号试装**：避免安装命令污染主力宿主 AI、真实项目或用户主目录。（适用：存在命令执行、插件配置或本地写入线索时。）\n- **安装后只验证一个最小任务**：先验证加载、兼容、输出质量和回滚，再决定是否深用。（适用：准备从试用进入真实工作流时。）\n\n### 退出方式\n\n- **保留安装前状态**：记录原始宿主配置和项目状态，后续才能判断是否可恢复。\n- **保留原始角色选择记录**：如果输出偏题，可以回到任务画像阶段重新选择角色，而不是继续沿着错误角色推进。\n- **记录安装命令和写入路径**：没有明确卸载说明时，至少要知道哪些目录或配置需要手动清理。\n- **如果没有回滚路径，不进入主力环境**：不可回滚是继续前阻断项，不应靠信任或运气继续。\n\n## 哪些只能预览\n\n- 解释项目适合谁和能做什么\n- 基于项目文档演示典型对话流程\n- 帮助用户判断是否值得安装或继续研究\n\n## 哪些必须安装后验证\n\n- 真实安装 Skill、插件或 CLI\n- 执行脚本、修改本地文件或访问外部服务\n- 验证真实输出质量、性能和兼容性\n\n## 边界与风险判断卡\n\n- **把安装前预览误认为真实运行**：用户可能高估项目已经完成的配置、权限和兼容性验证。 处理方式：明确区分 prompt_preview_can_do 与 runtime_required。 Claim：`clm_0004` inferred 0.45\n- **命令执行会修改本地环境**：安装命令可能写入用户主目录、宿主插件目录或项目配置。 处理方式：先在隔离环境或测试账号中运行。 证据：`README.md` Claim：`clm_0005` supported 0.86\n- **待确认**：真实安装后是否与用户当前宿主 AI 版本兼容？。原因：兼容性只能通过实际宿主环境验证。\n- **待确认**：项目输出质量是否满足用户具体任务？。原因：安装前预览只能展示流程和边界，不能替代真实评测。\n- **待确认**：安装命令是否需要网络、权限或全局写入？。原因：这影响企业环境和个人环境的安装风险。\n\n## 开工前工作上下文\n\n### 加载顺序\n\n- 先读取 how_to_use.host_ai_instruction，建立安装前判断资产的边界。\n- 读取 claim_graph_summary，确认事实来自 Claim/Evidence Graph，而不是 Human Wiki 叙事。\n- 再读取 intended_users、capabilities 和 quick_start_candidates，判断用户是否匹配。\n- 需要执行具体任务时，优先查 
role_skill_index，再查 evidence_index。\n- 遇到真实安装、文件修改、网络访问、性能或兼容性问题时，转入 risk_card 和 boundaries.runtime_required。\n\n### 任务路由\n\n- **命令行启动或安装流程**：先说明这是安装后验证能力，再给出安装前检查清单。 边界：必须真实安装或运行后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n### 上下文规模\n\n- 文件总数：2326\n- 重要文件覆盖：40/2326\n- 证据索引条目：80\n- 角色 / Skill 条目：77\n\n### 证据不足时的处理\n\n- **missing_evidence**：说明证据不足，要求用户提供目标文件、README 段落或安装后验证记录；不要补全事实。\n- **out_of_scope_request**：说明该任务超出当前 AI Context Pack 证据范围，并建议用户先查看 Human Manual 或真实安装后验证。\n- **runtime_request**：给出安装前检查清单和命令来源，但不要替用户执行命令或声称已执行。\n- **source_conflict**：同时展示冲突来源，标记为待核实，不要强行选择一个版本。\n\n## Prompt Recipes\n\n### 适配判断\n\n- 目标：判断这个项目是否适合用户当前任务。\n- 预期输出：适配结论、关键理由、证据引用、安装前可预览内容、必须安装后验证内容、下一步建议。\n\n```text\n请基于 pathway 的 AI Context Pack，先问我 3 个必要问题，然后判断它是否适合我的任务。回答必须包含：适合谁、能做什么、不能做什么、是否值得安装、证据来自哪里。所有项目事实必须引用 evidence_refs、source_paths 或 claim_id。\n```\n\n### 安装前体验\n\n- 目标：让用户在安装前感受核心工作流，同时避免把预览包装成真实能力或营销承诺。\n- 预期输出：一段带边界标签的体验剧本、安装后验证清单和谨慎建议；不含真实运行承诺或强营销表述。\n\n```text\n请把 pathway 当作安装前体验资产，而不是已安装工具或真实运行环境。\n\n请严格输出四段：\n1. 先问我 3 个必要问题。\n2. 给出一段“体验剧本”：用 [安装前可预览]、[必须安装后验证]、[证据不足] 三种标签展示它可能如何引导工作流。\n3. 给出安装后验证清单：列出哪些能力只有真实安装、真实宿主加载、真实项目运行后才能确认。\n4. 
给出谨慎建议：只能说“值得继续研究/试装”“先补充信息后再判断”或“不建议继续”，不得替项目背书。\n\n硬性边界：\n- 不要声称已经安装、运行、执行测试、修改文件或产生真实结果。\n- 不要写“自动适配”“确保通过”“完美适配”“强烈建议安装”等承诺性表达。\n- 如果描述安装后的工作方式，必须使用“如果安装成功且宿主正确加载 Skill，它可能会……”这种条件句。\n- 体验剧本只能写成“示例台词/假设流程”：使用“可能会询问/可能会建议/可能会展示”，不要写“已写入、已生成、已通过、正在运行、正在生成”。\n- Prompt Preview 不负责给安装命令；如用户准备试装，只能提示先阅读 Quick Start 和 Risk Card，并在隔离环境验证。\n- 所有项目事实必须来自 supported claim、evidence_refs 或 source_paths；inferred/unverified 只能作风险或待确认项。\n\n```\n\n### 角色 / Skill 选择\n\n- 目标：从项目里的角色或 Skill 中挑选最匹配的资产。\n- 预期输出：候选角色或 Skill 列表，每项包含适用场景、证据路径、风险边界和是否需要安装后验证。\n\n```text\n请读取 role_skill_index，根据我的目标任务推荐 3-5 个最相关的角色或 Skill。每个推荐都要说明适用场景、可能输出、风险边界和 evidence_refs。\n```\n\n### 风险预检\n\n- 目标：安装或引入前识别环境、权限、规则冲突和质量风险。\n- 预期输出：环境、权限、依赖、许可、宿主冲突、质量风险和未知项的检查清单。\n\n```text\n请基于 risk_card、boundaries 和 quick_start_candidates，给我一份安装前风险预检清单。不要替我执行命令，只说明我应该检查什么、为什么检查、失败会有什么影响。\n```\n\n### 宿主 AI 开工指令\n\n- 目标：把项目上下文转成一次对话开始前的宿主 AI 指令。\n- 预期输出：一段边界明确、证据引用明确、适合复制给宿主 AI 的开工前指令。\n\n```text\n请基于 pathway 的 AI Context Pack，生成一段我可以粘贴给宿主 AI 的开工前指令。这段指令必须遵守 not_runtime=true，不能声称项目已经安装、运行或产生真实结果。\n```\n\n\n## 角色 / Skill 索引\n\n- 共索引 77 个角色 / Skill / 项目文档条目。\n\n- **Pathway Live Data Framework**（project_doc）：Getting Started Deployment Documentation and Support Blog License 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`README.md`\n- **Pathway examples**（project_doc）：This repository contains examples of data processing using Pathway https://pathway.com/developers/documentation/introduction/welcome , the Python-based programming framework for easy building of realtime and reactive data products. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/README.md`\n- **AG2 Multi-Agent Conversations with Pathway Real-Time RAG**（project_doc）：AG2 Multi-Agent Conversations with Pathway Real-Time RAG 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/ag2-multiagent-rag/README.md`\n- **Azure ACI Deployment Example**（project_doc）：This repository contains an example for the tutorial: \"Running Pathway Program in Azure with Azure Container Instances\" https://pathway.com/developers/user-guide/deployment/azure-aci-deploy . 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/azure-aci-deploy/README.md`\n- **Best-rated movies examples**（project_doc）：The purpose of this project is two-fold: 1. doing an end-to-end application with Pathway to compute the K-best-rated items in a movie dataset, 2. switching from Kafka to Redpanda 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/best-movies-example/README.md`\n- **Best-rated movies example - Kafka version**（project_doc）：Best-rated movies example - Kafka version 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/best-movies-example/kafka-version/README.md`\n- **Best-rated movies example - Redpanda version**（project_doc）：Best-rated movies example - Redpanda version 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/best-movies-example/redpanda-version/README.md`\n- **Make your LLM app sane again: Forgetting incorrect data in real time**（project_doc）：Make your LLM app sane again: Forgetting incorrect data in real time 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/conf42/README.md`\n- **Custom python connector example**（project_doc）：⚠️ Twitter has turned off its free tier streaming API. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/custom-python-connector-twitter/README.md`\n- **Tutorial: From interactive data exploration to deployment**（project_doc）：Tutorial: From interactive data exploration to deployment 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/from_jupyter_to_deploy/README.md`\n- **ETL with Kafka in/Kafka out**（project_doc）：In this project, you build an ETL pipeline with Pathway with Kafka in and Kafka out. Two data sources broadcast date times with different time zones to different Kafka topics. Pathway connects to those topics, extracts E the data, transforms T the date times to timestamps, and loads L the data to a third Kafka topic. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/kafka-ETL/README.md`\n- **Benchmark for Delta Lake S3 messaging as a Kafka replacement**（project_doc）：Benchmark for Delta Lake S3 messaging as a Kafka replacement 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/kafka-alternatives/benchmarks/README.md`\n- **Pathway Monitoring using OpenTelemetry Collector and Grafana Cloud**（project_doc）：Pathway Monitoring using OpenTelemetry Collector and Grafana Cloud 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/monitoring/README.md`\n- **Computing Option Greeks with Pathway and Databento.**（project_doc）：Computing Option Greeks with Pathway and Databento. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/option-greeks/README.md`\n- **Retrieval-Augmented Generation RAG Pipeline with Pathway**（project_doc）：Retrieval-Augmented Generation RAG Pipeline with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/question-answering-rag/README.md`\n- **Realtime monitoring of logs**（project_doc）：The purpose of this project is to do an end-to-end application with Pathway to monitors logs such as nginx logs . It connects Filebeat to Pathway via Kafka and send the alerts to a Slack channel. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/realtime-log-monitoring/filebeat-pathway-slack/README.md`\n- **Realtime monitoring of logs**（project_doc）：The purpose of this project is to do an end-to-end application with Pathway to monitors logs such as nginx logs . It connects Filebeat/Logstash to Pathway via Kafka and send the alerts to ElasticSearch. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/realtime-log-monitoring/logstash-pathway-elastic/README.md`\n- **Sample Pathway program for SharePoint connection testing**（project_doc）：Sample Pathway program for SharePoint connection testing 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/sharepoint-test/README.md`\n- **Data Preparation for Spark Analytics**（project_doc）：Data Preparation for Spark Analytics 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/spark-data-preparation/README.md`\n- **Realtime Twitter Analysis App with Pathway**（project_doc）：Realtime Twitter Analysis App with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/twitter/README.md`\n- **Web Scraping with Pathway**（project_doc）：This project demonstrates how to create a real-time web scraper using Pathway, a powerful data processing framework. The implementation fetches and processes news articles from websites, making it possible to continuously monitor and analyze the web content. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/projects/web-scraping/README.md`\n- **Pathway EL Pipeline**（project_doc）：Pathway's EL pipeline allows you to move data from different data sources Extract to the sinks of your choice Load . By customizing a single YAML file, you can configure and run the pipeline without modifying any Python code. You can learn more about the pipeline in our article https://pathway.com/developers/templates/etl/el-pipeline/ . 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/templates/el-pipeline/README.md`\n- **Worst-case optimal joins and delta queries in differential dataflow**（project_doc）：Worst-case optimal joins and delta queries in differential dataflow 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`external/differential-dataflow/dogsdogsdogs/README.md`\n- **Graph Server**（project_doc）：A differential dataflow server for continually changing graphs 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`external/differential-dataflow/server/README.md`\n- **tpchlike: a TPCH-like evaluation of differential dataflow**（project_doc）：tpchlike: a TPCH-like evaluation of differential dataflow 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`external/differential-dataflow/tpchlike/README.md`\n- **Pathway RAG evals**（project_doc）：Introduction This folder contains the automated continuous integration tests for RAG evaluations. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`integration_tests/rag_evals/README.md`\n- **Pathway LLM Extension Pack**（project_doc）：This XPack contains Pathway pathway.com functions useful in building data processing pipelines involving LLM models: 1. pathway-compatible wrappers UDFs for popular AI and language modeling tasks, such as: - document parsing - text embedding - LLM API calls 2. ready-made document processing pipelines for popular tasks: - document indexing 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`python/pathway/xpacks/llm/README.md`\n- **Contributing to Pathway**（project_doc）：Welcome! This guide is intended to help developers that are new to the community to make contributions to the Pathway project. Please be sure to also read our code of conduct CODE OF CONDUCT.md . We work hard to make this a welcoming community for all, and we're excited to have you on board! 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`CONTRIBUTING.md`\n- **Contributing to Pathway**（project_doc）：Welcome! This guide is intended to help developers that are new to the community to make contributions to the Pathway project. 
Please be sure to also read our code of conduct CODE OF CONDUCT.md . We work hard to make this a welcoming community for all, and we're excited to have you on board! 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/CONTRIBUTING.md`\n- **.Changelog**（project_doc）：This page references the Changelog of Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/.changelog.md`\n- **Welcome to Pathway Developer Documentation!**（project_doc）：Welcome to the Pathway developer hub 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/10.welcome.md`\n- **Installation**（project_doc）：Installation page of Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/20.installation.md`\n- **Quick Overview of Pathway**（project_doc）：Quick introduction to Pathway operators. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/30.pathway-overview.md`\n- **Pathway in Minutes: Quick ETL Examples**（project_doc）：A step-by-step guide to build a real-time pipeline with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/40.first_realtime_app_with_pathway.md`\n- **Core Concepts**（project_doc）：A review of the core concepts behind the Pathway programming framework. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/50.concepts.md`\n- **The Easiest Solution for Python Stream Processing, Data Indexing, and Real-Time AI Analytics.**（project_doc）：Why you should use Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/60.why-pathway.md`\n- **Pathway Framework Licensing Guide**（project_doc）：Quick Licensing Guide of Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/65.licensing-guide.md`\n- **Batch Processing in Python: Why Pathway Is Ideal for Both Streaming and Static Workloads**（project_doc）：Why Pathway makes sense for batch processing? 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/10.introduction/80.batch-processing.md`\n- **Connectors**（project_doc）：Pathway supported data sources 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/10.supported-data-sources.md`\n- **Connectors in Pathway**（project_doc）：Presentation of Pathway connectors. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/30.connectors-in-pathway.md`\n- **Pathway connectors**（project_doc）：Pathway supported data sources 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/35.pathway-connectors.md`\n- **Data Types and Schemas**（project_doc）：Defining schema in Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/40.schema.md`\n- **Artificial Data Streams with the demo Module**（project_doc）：How to generate artificial data streams using the demo module 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/50.artificial-streams.md`\n- **Switching from Batch to Streaming**（project_doc）：How to switch from batch to streaming with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/80.switch-from-batch-to-streaming.md`\n- **Making Your Python Web Scraping Go Live with Pathway**（project_doc）：Python Web Scraping with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/90.python-web-scraping.md`\n- **Using CSV connectors**（project_doc）：Tutorial on CSV connectors 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/10.csv_connectors.md`\n- **Sending alerts to Slack**（project_doc）：Tutorial on the connector for sending alerts to Slack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/100.slack_send_alerts.md`\n- **Airbyte connectors available in Pathway**（project_doc）：List of Airbyte connectors you can use with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 
证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/110.airbyte-connectors.md`\n- **Using Pathway Debezium Connector for MongoDB**（project_doc）：How to use the Debezium connector in Pathway to perform change data capture from MongoDB 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/115.mongodb-debezium.md`\n- **Using database connectors**（project_doc）：Tutorial on Database connectors 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/20.database-connectors.md`\n- **Creating a custom Python connector**（project_doc）：Tutorial on how to create custom Python connector 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/30.custom-python-connectors.md`\n- **Using Kafka connectors**（project_doc）：Tutorial on how to use Kafka connectors 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/30.kafka_connectors.md`\n- **Using NATS connectors**（project_doc）：Tutorial on how to use NATS connectors 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/33.nats-connectors.md`\n- **Subscribing to changes with Python function**（project_doc）：Tutorial on how to subscribe to changes with Python callback 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/35.python-output-connectors.md`\n- **Google Drive connector**（project_doc）：Tutorial on Google Drive connector 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/70.gdrive-connector.md`\n- **Switching from Kafka to Redpanda**（project_doc）：Tutorial on how to use Redpanda instead of Kafka 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/80.switching-to-redpanda.md`\n- **Consuming WebSockets streams**（project_doc）：Creating a custom WebSockets connector in Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 
证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/90.websockets-connectors.md`\n- **Table Operations: An Overview**（project_doc）：Overview of the basic transformations available in Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/30.data-transformation/10.table-operations.md`\n- **Taking You to the SQL API Documentation**（project_doc）：Pathway SQL API 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/30.data-transformation/80.sql.md`\n- **Understanding Temporal Behavior in Pathway**（project_doc）：How to define the behavior of your streamed data in Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/40.temporal-data/20.behaviors.md`\n- **Bridging Financial Data Streams: A Look at ASOF Join in Pathway**（project_doc）：Tutorial about ASOF Joins in Pathway. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/40.temporal-data/40.asof-join.md`\n- **LLM Chats**（project_doc）：LLM Wrappers available through Pathway xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.chats/llm-chats.md`\n- **Embedders**（project_doc）：Embedders available through the Pathway xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.embedders/embedders.md`\n- **LLM Examples**（project_doc）：LLM examples made with Pathway LLM tooling 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.llm-examples.md`\n- **Parsers**（project_doc）：Article about Pathway's parsers. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.parsers/parsers.md`\n- **Rerankers**（project_doc）：Rerankers available through the Pathway xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.rerankers/rerankers.md`\n- **Chunking**（project_doc）：Splitters available through the Pathway xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/.splitters/splitters.md`\n- **Introduction to the LLM xpack**（project_doc）：Introduction to the Pathway LLM xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/10.overview.md`\n- **Create your own real-time RAG with Pathway**（project_doc）：Create your own RAG Pipeline with Pathway 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/20.llm-app-pathway.md`\n- **Document Indexing**（project_doc）：Introduction to the Pathway LLM xpack 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/30.docs-indexing.md`\n- **Pathway MCP Server**（project_doc）：Tutorial about how to set up and use Pathway MCP Server 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/40.pathway_mcp_server.md`\n- **Using Pathway MCP Server with Claude Desktop**（project_doc）：Tutorial on connecting Pathway MCP Server to Claude Desktop for real-time data access 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/50.llm-xpack/41.pathway-mcp-claude-desktop.md`\n- **Cloud Deployment of Pathway**（project_doc）：A guide about how to deploy Pathway using the cloud 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/60.deployment/10.cloud-deployment.md`\n- **Deploy to Google Cloud Platform using Cloud Run: a step-by-step guide.**（project_doc）：A guide about how to deploy Pathway to GCP using Google Cloud Run 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/60.deployment/15.gcp-deploy.md`\n- **Running Pathway Program in AWS Cloud with 
Fargate**（project_doc）：How to deploy Pathway in the cloud with AWS Fargate 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/60.deployment/20.aws-fargate-deploy.md`\n- **Deploying Pathway Programs on Azure Made Easy**（project_doc）：How to deploy Pathway in the cloud within Azure ecosystem 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/60.deployment/25.azure-aci-deploy.md`\n- **Deploy with Render: a Step-by-step Guide.**（project_doc）：A guide about how to deploy Pathway using the Render 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`docs/2.developers/4.user-guide/60.deployment/30.render-deploy.md`\n\n## 证据索引\n\n- 共索引 80 条证据。\n\n- **Pathway Live Data Framework**（documentation）：Getting Started Deployment Documentation and Support Blog License 证据：`README.md`\n- **Pathway examples**（documentation）：This repository contains examples of data processing using Pathway https://pathway.com/developers/documentation/introduction/welcome , the Python-based programming framework for easy building of realtime and reactive data products. 证据：`examples/README.md`\n- **AG2 Multi-Agent Conversations with Pathway Real-Time RAG**（documentation）：AG2 Multi-Agent Conversations with Pathway Real-Time RAG 证据：`examples/projects/ag2-multiagent-rag/README.md`\n- **Azure ACI Deployment Example**（documentation）：This repository contains an example for the tutorial: \"Running Pathway Program in Azure with Azure Container Instances\" https://pathway.com/developers/user-guide/deployment/azure-aci-deploy . 证据：`examples/projects/azure-aci-deploy/README.md`\n- **Best-rated movies examples**（documentation）：The purpose of this project is two-fold: 1. doing an end-to-end application with Pathway to compute the K-best-rated items in a movie dataset, 2. 
switching from Kafka to Redpanda 证据：`examples/projects/best-movies-example/README.md`\n- **Best-rated movies example - Kafka version**（documentation）：Best-rated movies example - Kafka version 证据：`examples/projects/best-movies-example/kafka-version/README.md`\n- **Best-rated movies example - Redpanda version**（documentation）：Best-rated movies example - Redpanda version 证据：`examples/projects/best-movies-example/redpanda-version/README.md`\n- **Make your LLM app sane again: Forgetting incorrect data in real time**（documentation）：Make your LLM app sane again: Forgetting incorrect data in real time 证据：`examples/projects/conf42/README.md`\n- **Custom python connector example**（documentation）：⚠️ Twitter has turned off its free tier streaming API. 证据：`examples/projects/custom-python-connector-twitter/README.md`\n- **Tutorial: From interactive data exploration to deployment**（documentation）：Tutorial: From interactive data exploration to deployment 证据：`examples/projects/from_jupyter_to_deploy/README.md`\n- **ETL with Kafka in/Kafka out**（documentation）：In this project, you build an ETL pipeline with Pathway with Kafka in and Kafka out. Two data sources broadcast date times with different time zones to different Kafka topics. Pathway connects to those topics, extracts E the data, transforms T the date times to timestamps, and loads L the data to a third Kafka topic. 证据：`examples/projects/kafka-ETL/README.md`\n- **Benchmark for Delta Lake S3 messaging as a Kafka replacement**（documentation）：Benchmark for Delta Lake S3 messaging as a Kafka replacement 证据：`examples/projects/kafka-alternatives/benchmarks/README.md`\n- **Pathway Monitoring using OpenTelemetry Collector and Grafana Cloud**（documentation）：Pathway Monitoring using OpenTelemetry Collector and Grafana Cloud 证据：`examples/projects/monitoring/README.md`\n- **Computing Option Greeks with Pathway and Databento.**（documentation）：Computing Option Greeks with Pathway and Databento. 
证据：`examples/projects/option-greeks/README.md`\n- **Retrieval-Augmented Generation RAG Pipeline with Pathway**（documentation）：Retrieval-Augmented Generation RAG Pipeline with Pathway 证据：`examples/projects/question-answering-rag/README.md`\n- **Realtime monitoring of logs**（documentation）：The purpose of this project is to do an end-to-end application with Pathway to monitors logs such as nginx logs . It connects Filebeat to Pathway via Kafka and send the alerts to a Slack channel. 证据：`examples/projects/realtime-log-monitoring/filebeat-pathway-slack/README.md`\n- **Realtime monitoring of logs**（documentation）：The purpose of this project is to do an end-to-end application with Pathway to monitors logs such as nginx logs . It connects Filebeat/Logstash to Pathway via Kafka and send the alerts to ElasticSearch. 证据：`examples/projects/realtime-log-monitoring/logstash-pathway-elastic/README.md`\n- **Sample Pathway program for SharePoint connection testing**（documentation）：Sample Pathway program for SharePoint connection testing 证据：`examples/projects/sharepoint-test/README.md`\n- **Data Preparation for Spark Analytics**（documentation）：Data Preparation for Spark Analytics 证据：`examples/projects/spark-data-preparation/README.md`\n- **Realtime Twitter Analysis App with Pathway**（documentation）：Realtime Twitter Analysis App with Pathway 证据：`examples/projects/twitter/README.md`\n- **Web Scraping with Pathway**（documentation）：This project demonstrates how to create a real-time web scraper using Pathway, a powerful data processing framework. The implementation fetches and processes news articles from websites, making it possible to continuously monitor and analyze the web content. 证据：`examples/projects/web-scraping/README.md`\n- **Pathway EL Pipeline**（documentation）：Pathway's EL pipeline allows you to move data from different data sources Extract to the sinks of your choice Load . 
By customizing a single YAML file, you can configure and run the pipeline without modifying any Python code. You can learn more about the pipeline in our article https://pathway.com/developers/templates/etl/el-pipeline/ . 证据：`examples/templates/el-pipeline/README.md`\n- **Worst-case optimal joins and delta queries in differential dataflow**（documentation）：Worst-case optimal joins and delta queries in differential dataflow 证据：`external/differential-dataflow/dogsdogsdogs/README.md`\n- **Graph Server**（documentation）：A differential dataflow server for continually changing graphs 证据：`external/differential-dataflow/server/README.md`\n- **tpchlike: a TPCH-like evaluation of differential dataflow**（documentation）：tpchlike: a TPCH-like evaluation of differential dataflow 证据：`external/differential-dataflow/tpchlike/README.md`\n- **Pathway RAG evals**（documentation）：Introduction This folder contains the automated continuous integration tests for RAG evaluations. 证据：`integration_tests/rag_evals/README.md`\n- **Pathway LLM Extension Pack**（documentation）：This XPack contains Pathway pathway.com functions useful in building data processing pipelines involving LLM models: 1. pathway-compatible wrappers UDFs for popular AI and language modeling tasks, such as: - document parsing - text embedding - LLM API calls 2. ready-made document processing pipelines for popular tasks: - document indexing 证据：`python/pathway/xpacks/llm/README.md`\n- **Contributing to Pathway**（documentation）：Welcome! This guide is intended to help developers that are new to the community to make contributions to the Pathway project. Please be sure to also read our code of conduct CODE OF CONDUCT.md . We work hard to make this a welcoming community for all, and we're excited to have you on board! 证据：`CONTRIBUTING.md`\n- **Contributing to Pathway**（documentation）：Welcome! This guide is intended to help developers that are new to the community to make contributions to the Pathway project. 
Please be sure to also read our code of conduct CODE OF CONDUCT.md . We work hard to make this a welcoming community for all, and we're excited to have you on board! 证据：`examples/CONTRIBUTING.md`\n- **Package**（package_manifest）：{ \"name\": \"frontend\", \"version\": \"1.0.0\", \"description\": \"\", \"main\": \"index.js\", \"scripts\": { \"webpack-dev-server\": \"webpack-dev-server\", \"dev\": \"webpack-dev-server --mode=development\", \"prod\": \"webpack --mode=production\" }, \"keywords\": , \"author\": \"\", \"license\": \"ISC\", \"dependencies\": { \"@emotion/react\": \"^11.10.4\", \"@emotion/styled\": \"^11.10.4\", \"@mui/material\": \"^5.10.3\", \"@mui/x-data-grid\": \"^5.17.1\", \"@mui/x-date-pickers\": \"^5.0.0\", \"css-loader\": \"^6.7.1\", \"deck.gl\": \"^8.8.9\", \"file-loader\": \"^6.2.0\", \"lodash\": \"^4.17.21\", \"mapbox-gl\": \"^2.10.0\", \"moment\": \"^2.29.4\", \"moment-timezone\": \"^0.5.37\", \"react\": \"^18.2.0\", \"react-dom\": \"^18.2.0\", \"react-map-gl\": \"^7.0.19\", \"style-loader\": \"^3… 证据：`examples/projects/twitter/services/frontend/package.json`\n- **License**（source_file）：Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files the \"Software\" , to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 证据：`examples/LICENSE`\n- **License**（source_file）：Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files the \"Software\" , to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to 
do so, subject to the following conditions: 证据：`python/pathway/third_party/airbyte_serverless/LICENSE`\n- **.Changelog**（documentation）：--- title: Changelog description: 'This page references the Changelog of Pathway' toc: false --- 证据：`docs/2.developers/4.user-guide/.changelog.md`\n- **Welcome to Pathway Developer Documentation!**（documentation）：Welcome to Pathway Developer Documentation! 证据：`docs/2.developers/4.user-guide/10.introduction/10.welcome.md`\n- **Installation**（documentation）：To quickly get started with Pathway, you can install it via pip with the following command: 证据：`docs/2.developers/4.user-guide/10.introduction/20.installation.md`\n- **Quick Overview of Pathway**（documentation）：Quick Overview of Pathway This page is a summary of what you need to know about to start programming with Pathway. If you want to learn more about the core concepts of Pathway, you can read the dedicated article /developers/user-guide/introduction/concepts . 证据：`docs/2.developers/4.user-guide/10.introduction/30.pathway-overview.md`\n- **Pathway in Minutes: Quick ETL Examples**（documentation）：Pathway in Minutes: Quick ETL Examples 证据：`docs/2.developers/4.user-guide/10.introduction/40.first_realtime_app_with_pathway.md`\n- **Core Concepts**（documentation）：Core Concepts A review of the core concepts behind data representation and data transformation in the programming layer of Pathway. 证据：`docs/2.developers/4.user-guide/10.introduction/50.concepts.md`\n- **The Easiest Solution for Python Stream Processing, Data Indexing, and Real-Time AI Analytics.**（documentation）：The Easiest Solution for Python Stream Processing, Data Indexing, and Real-Time AI Analytics. 证据：`docs/2.developers/4.user-guide/10.introduction/60.why-pathway.md`\n- **Pathway Framework Licensing Guide**（documentation）：Pathway offers a flexible licensing model designed to accommodate both community users and enterprise deployments. 
This guide explains the different licensing options, usage limitations, and key considerations for developers. 证据：`docs/2.developers/4.user-guide/10.introduction/65.licensing-guide.md`\n- **Batch Processing in Python: Why Pathway Is Ideal for Both Streaming and Static Workloads**（documentation）：Batch Processing in Python: Why Pathway Is Ideal for Both Streaming and Static Workloads 证据：`docs/2.developers/4.user-guide/10.introduction/80.batch-processing.md`\n- **Connectors**（documentation）：--- title: \"Supported Data Sources\" description: \"Pathway supported data sources\" navigation: false layout: default aside: true toc: false single: true redirect: \"/developers/user-guide/connect/pathway-connectors\" --- Connectors 证据：`docs/2.developers/4.user-guide/20.connect/10.supported-data-sources.md`\n- **Connectors in Pathway**（documentation）：In order to use Pathway, one of the first things you need to do is to access the data you want to manipulate. In Pathway, accessing the data is done using connectors . 证据：`docs/2.developers/4.user-guide/20.connect/30.connectors-in-pathway.md`\n- **Pathway connectors**（documentation）：Pathway has a long list of connectors for connecting to external data sources at input and connectors for outputting the data outside of Pathway. Since Pathway is a streaming-first framework, all connectors operate in streaming mode , automatically updating the results in real-time with every change. Explore the list below to find the connector you need. 证据：`docs/2.developers/4.user-guide/20.connect/35.pathway-connectors.md`\n- **Data Types and Schemas**（documentation）：Data Types and Schemas In this guide, you will explore how to effectively utilize data types and schemas. 
证据：`docs/2.developers/4.user-guide/20.connect/40.schema.md`\n- **Artificial Data Streams with the demo Module**（documentation）：Artificial Data Streams with the demo Module With Pathway's demo /developers/api-docs/pathway-demo module, you can create custom data streams from scratch or by utilizing a CSV file. This feature empowers you to effectively test and debug your Pathway implementation using realtime data. 证据：`docs/2.developers/4.user-guide/20.connect/50.artificial-streams.md`\n- **Switching from Batch to Streaming**（documentation）：Switching from Batch to Streaming Easily switching from batch to streaming is a core feature of Pathway. In this article, you will see how easy it is to change your static pipeline to make it run with streaming data. 证据：`docs/2.developers/4.user-guide/20.connect/80.switch-from-batch-to-streaming.md`\n- **Making Your Python Web Scraping Go Live with Pathway**（documentation）：Making Your Python Web Scraping Go Live with Pathway 证据：`docs/2.developers/4.user-guide/20.connect/90.python-web-scraping.md`\n- **Using CSV connectors**（documentation）：Using CSV connectors Pathway provides a pw.io.csv /developers/api-docs/pathway-io/csv module with connectors to read and write data streams using CSV files. 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/10.csv_connectors.md`\n- **Sending alerts to Slack**（documentation）：This tutorial will guide you through connecting Pathway to Slack and sending alerts to a specific channel. 
证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/100.slack_send_alerts.md`\n- **Airbyte connectors available in Pathway**（documentation）：Airbyte connectors available in Pathway 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/110.airbyte-connectors.md`\n- **Using Pathway Debezium Connector for MongoDB**（documentation）：Using Pathway Debezium Connector for MongoDB Connect Pathway on top of your MongoDB/Debezium database using pw.io.debezium.read /developers/api-docs/pathway-io/debezium pathway.io.debezium.read and pw.io.mongodb.write /developers/api-docs/pathway-io/mongodb . 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/115.mongodb-debezium.md`\n- **Using database connectors**（documentation）：Using database connectors Connect Pathway on top of your PostgreSQL/Debezium database using pw.io.debezium.read /developers/api-docs/pathway-io/debezium pathway.io.debezium.read and pw.io.postgres.write /developers/api-docs/pathway-io/postgres pathway.io.postgres.write . 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/20.database-connectors.md`\n- **Creating a custom Python connector**（documentation）：In this tutorial, you will learn how to create a Python connector that will allow you to connect to your custom data source and feed data directly into Pathway. 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/30.custom-python-connectors.md`\n- **Using Kafka connectors**（documentation）：Using Kafka connectors Pathway provides a pw.io.kafka /developers/api-docs/pathway-io/kafka module with connectors to read and send messages from a Kafka instance. 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/30.kafka_connectors.md`\n- **Using NATS connectors**（documentation）：Using NATS connectors Pathway provides a pw.io.nats /developers/api-docs/pathway-io/nats module with connectors to read and send messages from a NATS instance. 
证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/33.nats-connectors.md`\n- **Subscribing to changes with Python function**（documentation）：Subscribing to changes with Python function 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/35.python-output-connectors.md`\n- **Google Drive connector**（documentation）：Google Drive connector This tutorial will guide you through connecting Pathway to your data stored on Google Drive. For detailed information about Google Drive connector, refer to the API documentation /developers/api-docs/pathway-io/gdrive . 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/70.gdrive-connector.md`\n- **Switching from Kafka to Redpanda**（documentation）：Switching from Kafka to Redpanda Not a fan of the JVM and ZooKeeper? In this article, you will learn how to switch from Kafka to Redpanda and how to adapt your Pathway project to Redpanda. The change is easier than you might think: your Pathway code remains the same! 证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/80.switching-to-redpanda.md`\n- **Consuming WebSockets streams**（documentation）：In this tutorial, you will be guided through creating a custom WebSocket connector. It will allow you to interact with WebSocket data streams and process them as needed. 
证据：`docs/2.developers/4.user-guide/20.connect/99.connectors/90.websockets-connectors.md`\n- 其余 20 条证据见 `AI_CONTEXT_PACK.json` 或 `EVIDENCE_INDEX.json`。\n\n## 宿主 AI 必须遵守的规则\n\n- **把本资产当作开工前上下文，而不是运行环境。**：AI Context Pack 只包含证据化项目理解，不包含目标项目的可执行状态。 证据：`README.md`, `examples/README.md`, `examples/projects/ag2-multiagent-rag/README.md`\n- **回答用户时区分可预览内容与必须安装后才能验证的内容。**：安装前体验的消费者价值来自降低误装和误判，而不是伪装成真实运行。 证据：`README.md`, `examples/README.md`, `examples/projects/ag2-multiagent-rag/README.md`\n\n## 用户开工前应该回答的问题\n\n- 你准备在哪个宿主 AI 或本地环境中使用它？\n- 你只是想先体验工作流，还是准备真实安装？\n- 你最在意的是安装成本、输出质量、还是和现有规则的冲突？\n\n## 验收标准\n\n- 所有能力声明都能回指到 evidence_refs 中的文件路径。\n- AI_CONTEXT_PACK.md 没有把预览包装成真实运行。\n- 用户能在 3 分钟内看懂适合谁、能做什么、如何开始和风险边界。\n\n---\n\n## Doramagic Context Augmentation\n\n下面内容用于强化 Repomix/AI Context Pack 主体。Human Manual 只提供阅读骨架；踩坑日志会被转成宿主 AI 必须遵守的工作约束。\n\n## Human Manual 骨架\n\n使用规则：这里只是项目阅读路线和显著性信号，不是事实权威。具体事实仍必须回到 repo evidence / Claim Graph。\n\n宿主 AI 硬性规则：\n- 不得把页标题、章节顺序、摘要或 importance 当作项目事实证据。\n- 解释 Human Manual 骨架时，必须明确说它只是阅读路线/显著性信号。\n- 能力、安装、兼容性、运行状态和风险判断必须引用 repo evidence、source path 或 Claim Graph。\n\n- **Pathway Introduction**：importance `high`\n  - source_paths: README.md, python/pathway/__init__.py, pyproject.toml\n- **Core Concepts**：importance `high`\n  - source_paths: python/pathway/internals/schema.py, python/pathway/internals/table.py, python/pathway/internals/datasource.py, docs/2.developers/4.user-guide/10.introduction/50.concepts.md, docs/2.developers/4.user-guide/10.introduction/70.streaming-and-static-modes.md\n- **System Architecture**：importance `high`\n  - source_paths: src/lib.rs, src/python_api.rs, src/engine/mod.rs, python/pathway/_engine_finder.py, external/differential-dataflow/src/lib.rs\n- **Python API Structure**：importance `high`\n  - source_paths: python/pathway/internals/__init__.py, python/pathway/internals/config.py, python/pathway/internals/run.py, python/pathway/engine.pyi, src/python_api.rs\n- **Data Model and Type System**：importance `high`\n  - 
source_paths: python/pathway/internals/dtype.py, python/pathway/internals/column_properties.py, python/pathway/internals/expression.py, python/pathway/internals/expressions/__init__.py, python/pathway/internals/type_interpreter.py\n- **Data Connectors**：importance `high`\n  - source_paths: python/pathway/io/__init__.py, python/pathway/io/csv/__init__.py, python/pathway/io/kafka/__init__.py, python/pathway/io/postgres/__init__.py, python/pathway/io/s3/__init__.py\n- **Custom Connectors**：importance `medium`\n  - source_paths: python/pathway/internals/datasource.py, python/pathway/internals/datasink.py, docs/2.developers/4.user-guide/20.connect/99.connectors/30.custom-python-connectors.md, examples/projects/custom-python-connector-twitter/twitter_connector_example.py\n- **Data Transformations**：importance `high`\n  - source_paths: python/pathway/internals/table.py, python/pathway/internals/joins.py, python/pathway/internals/groupbys.py, python/pathway/internals/reducers.py, python/pathway/internals/sql/processing.py\n\n## Repo Inspection Evidence / 源码检查证据\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `510c6daf92bd76e21b7279cee5f588b1b64e7b0f`\n- inspected_files: `pyproject.toml`, `README.md`, `docs/2.developers/7.templates/30.configure-yaml.md`, `docs/2.developers/7.templates/35.custom-components.md`, `docs/2.developers/7.templates/20.run-a-template.md`, `docs/2.developers/7.templates/38.licensing-guide.md`, `docs/2.developers/7.templates/1.index.md`, `docs/2.developers/4.user-guide/.changelog.md`, `docs/2.developers/7.templates/rag/1008.template-demo-document-indexing.md`, `docs/2.developers/7.templates/rag/1000.demo-question-answering.md`, `docs/2.developers/7.templates/rag/1001.template-adaptive-rag.md`, `docs/2.developers/7.templates/rag/1010.template-slides-search.md`, `docs/2.developers/7.templates/rag/5.unstructured-to-structured.md`, `docs/2.developers/7.templates/rag/1009.drive-alert.md`, 
`docs/2.developers/7.templates/rag/1003.template-multimodal-rag.md`, `docs/2.developers/7.templates/rag/1002.template-private-rag.md`, `docs/2.developers/7.templates/rag/.navigation.yml`, `docs/2.developers/7.templates/40.rag-customization/20.custom-prompt.md`, `docs/2.developers/7.templates/40.rag-customization/10.REST-API.md`, `docs/2.developers/7.templates/40.rag-customization/40.splitters.md`\n\n宿主 AI 硬性规则：\n- 没有 repo_clone_verified=true 时，不得声称已经读过源码。\n- 没有 repo_inspection_verified=true 时，不得把 README/docs/package 文件判断写成事实。\n- 没有 quick_start_verified=true 时，不得声称 Quick Start 已跑通。\n\n## Doramagic Pitfall Constraints / 踩坑约束\n\n这些规则来自 Doramagic 发现、验证或编译过程中的项目专属坑点。宿主 AI 必须把它们当作工作约束，而不是普通说明文字。\n\n### Constraint 1: 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: TypeError: Cannot instantiate typing.Any\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_d63b2462b18449c4a2c02bb5e02a96c8 | https://github.com/pathwaycom/pathway/issues/227 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 2: 来源证据：Entra Authentication Support (credential handler and/or password callback)\n\n- Trigger: GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Entra Authentication Support (credential handler and/or password callback)\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能影响授权、密钥配置或安全边界。\n- Evidence: community_evidence:github | cevd_e7c65f4d8bce4b1a9c99accbf0457633 | https://github.com/pathwaycom/pathway/issues/230 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 3: 来源证据：Automated schema exploration in input connectors\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Automated schema exploration in input connectors\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: 
community_evidence:github | cevd_74b91feb391f4b0298216d2a48fe5abe | https://github.com/pathwaycom/pathway/issues/224 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 4: 来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能阻塞安装或首次运行。\n- Evidence: community_evidence:github | cevd_ab5d2b2076034999bfaed612a3d95d60 | https://github.com/pathwaycom/pathway/issues/232 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 5: 来源证据：Improve watermarks in POSIX-like objects tracker\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Improve watermarks in POSIX-like objects tracker\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能阻塞安装或首次运行。\n- Evidence: community_evidence:github | cevd_38f29b1fba3b41cc99b1364d15ef7213 | https://github.com/pathwaycom/pathway/issues/225 | 来源讨论提到 node 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 6: 来源证据：Persistence in `iterate` operator\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Persistence in `iterate` operator\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_e642d639ebbc4382b242028ef89ec236 | https://github.com/pathwaycom/pathway/issues/214 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 7: 来源证据：Support LEANN for RAG pipelines\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support LEANN for RAG pipelines\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github 
| cevd_5d99dd1cfc794b299633b6c5dc9dd33b | https://github.com/pathwaycom/pathway/issues/173 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 8: 来源证据：Support MongoDB Atlas in pw.io.mongodb\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support MongoDB Atlas in pw.io.mongodb\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_fa1d6e2e977a45d99c414724681f0f32 | https://github.com/pathwaycom/pathway/issues/221 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 9: 来源证据：feat: Add Microsoft SQL Server (MSSQL) connector\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：feat: Add Microsoft SQL Server (MSSQL) connector\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能影响升级、迁移或版本选择。\n- Evidence: community_evidence:github | cevd_370bcc864a314734b602f01d3d444ef9 | https://github.com/pathwaycom/pathway/issues/204 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 10: 来源证据：v0.27.0\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.27.0\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能影响升级、迁移或版本选择。\n- Evidence: community_evidence:github | cevd_8d6905d8719c488cbcbb821e794add26 | https://github.com/pathwaycom/pathway/releases/tag/v0.27.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n",
      "summary": "给宿主 AI 的上下文和工作边界。",
      "title": "AI Context Pack / 带给我的 AI"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card / 安装前决策卡\n\n项目：pathwaycom/pathway\n\n## Doramagic 试用结论\n\n当前结论：可以进入发布前推荐检查；首次使用仍应从最小权限、临时目录和可回滚配置开始。\n\n## 用户现在可以做\n\n- 可以先阅读 Human Manual，理解项目目的和主要工作流。\n- 可以复制 Prompt Preview 做安装前体验；这只验证交互感，不代表真实运行。\n- 可以把官方 Quick Start 命令放到隔离环境中验证，不要直接进主力环境。\n\n## 现在不要做\n\n- 不要把 Prompt Preview 当成项目实际运行结果。\n- 不要把 metadata-only validation 当成沙箱安装验证。\n- 不要把未验证能力写成“已支持、已跑通、可放心安装”。\n- 不要在首次试用时交出生产数据、私人文件、真实密钥或主力配置目录。\n\n## 安装前检查\n\n- 宿主 AI 是否匹配：local_cli\n- 官方安装入口状态：已发现官方入口\n- 是否在临时目录、临时宿主或容器中验证：必须是\n- 是否能回滚配置改动：必须能\n- 是否需要 API Key、网络访问、读写文件或修改宿主配置：未确认前按高风险处理\n- 是否记录了安装命令、实际输出和失败日志：必须记录\n\n## 当前阻塞项\n\n- 无阻塞项。\n\n## 项目专属踩坑\n\n- 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any（high）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：Entra Authentication Support (credential handler and/or password callback)（high）：可能影响授权、密钥配置或安全边界。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：Automated schema exploration in input connectors（medium）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions（medium）：可能阻塞安装或首次运行。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：Improve watermarks in POSIX-like objects tracker（medium）：可能阻塞安装或首次运行。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n\n## 风险与权限提示\n\n- no_demo: medium\n\n## 证据缺口\n\n- 暂未发现结构化证据缺口。\n",
      "summary": "安装、权限、验证和推荐前风险。",
      "title": "Boundary & Risk Card / 边界与风险卡"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/pathwaycom/pathway 项目说明书\n\n生成时间：2026-05-16 21:24:03 UTC\n\n## 目录\n\n- [Pathway Introduction](#page-introduction)\n- [Core Concepts](#page-concepts)\n- [System Architecture](#page-architecture)\n- [Python API Structure](#page-python-api)\n- [Data Model and Type System](#page-data-model)\n- [Data Connectors](#page-connectors)\n- [Custom Connectors](#page-connectors-custom)\n- [Data Transformations](#page-transformations)\n- [Temporal and Time-Based Processing](#page-temporal-processing)\n- [LLM xPack](#page-llm-xpack)\n\n<a id='page-introduction'></a>\n\n## Pathway Introduction\n\n### 相关页面\n\n相关主题：[Core Concepts](#page-concepts), [System Architecture](#page-architecture)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md)\n- [examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n</details>\n\n# Pathway Introduction\n\nPathway is a Python ETL framework designed for stream processing, real-time analytics, LLM pipelines, and Retrieval-Augmented Generation (RAG). 
It provides an easy-to-use Python API that seamlessly integrates with popular Python ML libraries, enabling developers to build robust data processing pipelines that work in both development and production environments.\n\n## Overview\n\nPathway combines the flexibility of Python with the performance of a scalable Rust engine built on Differential Dataflow. The framework performs incremental computation, allowing it to efficiently handle both batch and streaming data with the same code base.\n\n### Key Characteristics\n\n| Feature | Description |\n|---------|-------------|\n| **Language** | Python with Rust engine |\n| **Processing Model** | Incremental computation via Differential Dataflow |\n| **Data Modes** | Batch processing and streaming |\n| **Deployment** | Local development, CI/CD, Docker, production |\n| **License** | BSL (Business Source License) |\n\n资料来源：[README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n## Architecture\n\nPathway's architecture leverages a multi-layered design that separates the Python interface from the high-performance Rust computation engine.\n\n```mermaid\ngraph TD\n    subgraph Python_Interface[\"Python Layer\"]\n        A[User Code] --> B[pathway API]\n        B --> C[Python Subject]\n        B --> D[Python Connectors]\n    end\n    \n    subgraph Rust_Engine[\"Rust Engine\"]\n        E[Expression Engine] --> F[Differential Dataflow]\n        F --> G[Timely Dataflow]\n        C --> E\n        D --> E\n    end\n    \n    subgraph Output[\"Output Layer\"]\n        G --> H[Exported Tables]\n        H --> I[Snapshot Data]\n    end\n```\n\n### Core Components\n\n| Component | Role |\n|-----------|------|\n| **PythonSubject** | Wraps Python callbacks for data input |\n| **ValueField** | Defines schema fields with types and defaults |\n| **PyExpression** | Encapsulates expressions with optional GIL release |\n| **PyExportedTable** | Provides snapshot and frontier access to output data |\n| **ConnectorMode** | 
Distinguishes between STATIC and STREAMING modes |\n\n资料来源：[src/python_api.rs:25-35](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs#L25-L35)\n\n## Data Model\n\n### Schema Fields\n\nPathway uses `ValueField` to define table schemas with the following properties:\n\n```python\n@dataclass\nclass ValueField:\n    name: str           # Field identifier\n    type_: Type         # Data type\n    source: FieldSource # Origin of the field (Payload, Key, etc.)\n    default: Optional[Value]  # Default value if not present\n    metadata: Optional[str]   # Additional metadata as JSON string\n```\n\nThe `source` field determines how a field is populated:\n\n| FieldSource | Description |\n|-------------|-------------|\n| `FieldSource.Payload` | Field comes from the input data |\n| `FieldSource.Key` | Field is used as a key identifier |\n| `FieldSource.PSEUDO` | Synthetic field generated by the system |\n\n资料来源：[src/python_api.rs:47-55](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs#L47-L55)\n\n### Timestamps and Ordering\n\nPathway implements timestamp-based ordering for incremental computation. 
Each timestamp has special semantics for distinguishing original data from retractions.\n\n```rust\nimpl OriginalOrRetraction for Timestamp {\n    fn is_original(&self) -> bool {\n        self.0.is_multiple_of(2)\n    }\n}\n\nimpl NextRetractionTime for Timestamp {\n    fn next_retraction_time(&self) -> Self {\n        Self(self.0 + 1)\n    }\n}\n```\n\nThe timestamp system ensures that even-numbered timestamps represent original data while odd-numbered timestamps represent retractions, enabling efficient differential updates.\n\n资料来源：[src/engine/timestamp.rs:48-60](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs#L48-L60)\n\n## Processing Modes\n\nPathway supports two distinct connector modes:\n\n### Static Mode\n\nIn static mode, data is processed in batches and the pipeline completes once all input is consumed.\n\n```python\npw.io.http.PathwayWebserver(mode=pw.io.http.SocketWriterMode.STATIC)\n```\n\n### Streaming Mode\n\nIn streaming mode, the pipeline remains active and continuously processes incoming data updates.\n\n```python\npw.io.http.PathwayWebserver(mode=pw.io.http.SocketWriterMode.STREAMING)\n```\n\n| Mode | Use Case | Behavior |\n|------|----------|----------|\n| `STATIC` | Batch jobs, one-off processing | Exits after all data processed |\n| `STREAMING` | Live data, real-time analytics | Continuously processes updates |\n\n资料来源：[src/python_api.rs:280-290](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs#L280-L290)\n\n## Expression Engine\n\nPathway's expression engine supports both unary and binary operations on data. 
The engine is implemented in Rust for performance but callable from Python.\n\n```mermaid\ngraph LR\n    A[PyExpression] -->|unary_op!| B[Unary Expression]\n    A -->|binary_op!| C[Binary Expression]\n    B --> D[Result Expression]\n    C --> D\n```\n\n### Supported Operations\n\nThe `PyExpression` class wraps Rust expressions and tracks whether the GIL (Global Interpreter Lock) should be held during evaluation:\n\n```rust\npub struct PyExpression {\n    inner: Arc<Expression>,\n    gil: bool,  // Whether GIL is required\n}\n```\n\nOperations automatically set the `gil` flag based on whether any operand requires the Python GIL, optimizing performance when Python code is not involved.\n\n资料来源：[src/python_api.rs:360-375](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs#L360-L375)\n\n## Frontier and Snapshots\n\nPathway uses the concept of \"frontiers\" to track progress in incremental computation. The `TotalFrontier` enum represents the state of computation:\n\n```rust\npub enum TotalFrontier<T> {\n    At(T),      // Computation at specific timestamp\n    Done,       // Computation complete\n}\n```\n\n### Snapshot Access\n\nThe `PyExportedTable` provides methods to access data at specific frontiers:\n\n| Method | Description |\n|--------|-------------|\n| `frontier()` | Returns the current computation frontier |\n| `snapshot_at(frontier)` | Returns all key-value pairs up to the specified frontier |\n| `failed()` | Indicates if the computation has failed |\n\n```python\n# Access snapshot at current frontier\ncurrent_frontier = table.frontier()\ndata = table.snapshot_at(current_frontier)\n```\n\n资料来源：[src/python_api.rs:200-220](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs#L200-L220)\n\n## Differential Dataflow Integration\n\nPathway's Rust engine is built on Differential Dataflow, an extension of Timely Dataflow that supports incremental computation with efficient handling of updates and retractions.\n\n```mermaid\ngraph TD\n    
subgraph Timely_Dataflow[\"Timely Dataflow\"]\n        A[Input] --> B[Operator]\n        B --> C[Probe]\n    end\n    \n    subgraph Differential_Extension[\"Differential Dataflow\"]\n        B --> D[Arrangement]\n        D --> E[Trace Agent]\n        E --> F[Collection]\n    end\n```\n\nThe `arrange` operator imports arranged data into a computation scope:\n\n```rust\npub fn import<G>(&mut self, scope: &G) -> Arranged<G, TraceAgent<Tr>>\nwhere\n    G: Scope<Timestamp=Tr::Time>,\n    Tr::Time: Timestamp,\n{\n    self.import_named(scope, \"ArrangedSource\")\n}\n```\n\n资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:45-55](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs#L45-L55)\n\n## Common Use Cases\n\n### Real-time RAG Pipelines\n\nPathway excels at building Retrieval-Augmented Generation pipelines that require real-time document indexing and querying.\n\n```\nDocuments --> Pathway VectorStoreServer --> REST API /v1/retrieve\n                                                    |\nUser Query --> HTTP POST --> Pathway --> LLM --> Response\n```\n\nExample configuration:\n\n```python\nimport pathway as pw\n\n# Start the vector store server\nwebserver = pw.io.http.PathwayWebserver(host=\"0.0.0.0\", port=8011)\n\n# Query endpoint\ncurl --data '{ \"messages\": \"What is the value of X?\"}' http://localhost:8011\n```\n\n资料来源：[examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md)\n\n### Multi-Agent RAG Systems\n\nPathway supports multi-agent architectures where different agents collaborate to answer complex queries:\n\n```mermaid\ngraph LR\n    A[Documents] --> B[Pathway VectorStoreServer]\n    B --> C[REST API]\n    C --> D[UserProxy Agent]\n    D --> E[Researcher Agent]\n    D --> F[Analyst Agent]\n    E & F --> G[Grounded 
Answers]\n```\n\n资料来源：[examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md)\n\n## Running Pathway Applications\n\n### Local Execution\n\nRun Pathway applications like standard Python scripts:\n\n```bash\npython main.py\n```\n\n### Multi-threaded Execution\n\nPathway natively supports multithreading:\n\n```bash\npathway spawn --threads 3 python main.py\n```\n\n### Docker Deployment\n\nPathway provides official Docker images for containerized execution:\n\n```dockerfile\nFROM pathwaycom/pathway:latest\n\nWORKDIR /app\nCOPY requirements.txt ./\nRUN pip install --no-cache-dir -r requirements.txt\nCOPY . .\n\nCMD [ \"python\", \"./your-script.py\" ]\n```\n\n资料来源：[README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n## Summary\n\nPathway provides a powerful framework for building data processing pipelines with the following advantages:\n\n- **Python-first API**: Write pipelines in familiar Python syntax\n- **Rust performance**: Leverage Differential Dataflow for efficient incremental computation\n- **Unified code**: Same code works for batch and streaming scenarios\n- **Production-ready**: Built-in monitoring, logging, and Docker support\n- **LLM integration**: First-class support for RAG and AI pipelines\n\nThe framework is particularly well-suited for applications requiring real-time data processing, such as live document indexing, dynamic analytics, and AI-powered search systems.\n\n---\n\n<a id='page-concepts'></a>\n\n## Core Concepts\n\n### 相关页面\n\n相关主题：[Pathway Introduction](#page-introduction), [Data Model and Type System](#page-data-model)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/internals/schema.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/schema.py)\n- [python/pathway/internals/table.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/table.py)\n- 
[python/pathway/internals/datasource.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/datasource.py)\n- [docs/2.developers/4.user-guide/10.introduction/50.concepts.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/10.introduction/50.concepts.md)\n- [docs/2.developers/4.user-guide/10.introduction/70.streaming-and-static-modes.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/10.introduction/70.streaming-and-static-modes.md)\n</details>\n\n# Core Concepts\n\nPathway is a Python ETL framework built on a scalable Rust engine that leverages Differential Dataflow for incremental computation. Understanding its core concepts is essential for building robust data processing pipelines that work seamlessly in both development and production environments.\n\n## Architecture Overview\n\nPathway's architecture combines Python usability with Rust performance through a carefully designed abstraction layer. The Python API provides the user-facing interface, while the Rust engine handles the actual computation using Differential Dataflow's incremental processing model.\n\n```mermaid\ngraph TB\n    subgraph Python_API[\"Python API Layer\"]\n        Schema[\"Schema\"]\n        Table[\"Table\"]\n        Connector[\"Connectors\"]\n        UDF[\"User-Defined Functions\"]\n    end\n    \n    subgraph Rust_Engine[\"Rust Engine\"]\n        DiffDataflow[\"Differential Dataflow\"]\n        IncrementalComp[\"Incremental Computation\"]\n        Persistence[\"Persistence Layer\"]\n    end\n    \n    subgraph DataStreams[\"Data Streams\"]\n        Static[\"Static/Batch Data\"]\n        Streaming[\"Real-time Streams\"]\n    end\n    \n    Python_API --> Rust_Engine\n    DataStreams --> Rust_Engine\n```\n\nThe framework is designed so that the same code can run in multiple execution modes without modification, supporting local development, CI/CD tests, batch jobs, stream replays, and live data 
processing.\n\n## Tables\n\nTables are the fundamental data structure in Pathway, representing collections of typed rows with a defined schema. Every table has a primary key that uniquely identifies each row.\n\n### Table Structure\n\nFrom the Rust engine perspective, a `Table` consists of:\n\n| Component | Description | Rust Type |\n|-----------|-------------|-----------|\n| Scope | Execution scope for the table | `Scope` |\n| Handle | Internal reference to table data | `TableHandle` |\n| Columns | Typed data columns | `Vec<Column>` |\n| Universe | Set of all keys in the table | `Universe` |\n\nThe `Table` class wraps the Rust engine's table representation and caches Python instances to avoid redundant object creation.\n\n### Table Operations\n\nTables support a rich set of operations:\n\n- **Selection**: Filter rows based on conditions\n- **Projection**: Select and transform columns\n- **Joins**: Combine tables based on keys\n- **Aggregations**: Group and aggregate data\n- **Updates**: Modify existing rows\n\nAll operations are lazy—they don't execute until the computation is triggered by `pw.run()`.\n\n## Schemas\n\nSchemas define the structure of tables, specifying column names, types, and metadata. 
They serve as the contract between data sources and the processing logic.\n\n### Schema Definition\n\nSchemas are typically defined by subclassing `pw.Schema` and declaring columns with `pw.column_definition`:\n\n```python\nclass DocumentSchema(pw.Schema):\n    id = pw.column_definition(type_=pw.Type.STRING)\n    content = pw.column_definition(type_=pw.Type.STRING)\n    timestamp = pw.column_definition(type_=pw.Type.DATETIME)\n    embedding = pw.column_definition(type_=pw.Type.FLOAT_ARRAY)\n```\n\n### Column Types\n\n| Type | Python Representation | Use Case |\n|------|---------------------|----------|\n| `STRING` | `str` | Text data, identifiers |\n| `INT` | `int` | Integer values, counts |\n| `FLOAT` | `float` | Decimal numbers |\n| `BOOL` | `bool` | True/false values |\n| `DATETIME` | `datetime` | Timestamps and dates |\n| `FLOAT_ARRAY` | `List[float]` | Embeddings, vectors |\n| `DICT` | `dict` | Nested structured data |\n\n### Primary Keys\n\nA schema can designate a primary key that uniquely identifies rows:\n\n```python\nclass InputSchema(pw.Schema):\n    doc_id = pw.column_definition(type_=pw.Type.STRING)\n    content = pw.column_definition(type_=pw.Type.STRING)\n    \n    @staticmethod\n    def primary_key():\n        return InputSchema.doc_id\n```\n\n## Connectors and Data Sources\n\nConnectors bridge external data sources with Pathway's internal table representation. 
They handle reading data from various sources and writing results to destinations.\n\n### Input Connectors\n\nPathway provides input connectors for multiple data sources:\n\n| Connector | Description | Protocol |\n|-----------|-------------|----------|\n| `pw.io.filesystem.read` | File system reading | Batch/Streaming |\n| `pw.io.http.read` | HTTP endpoint polling | Streaming |\n| `pw.io.rdkafka.read` | Kafka consumer | Streaming |\n| `pw.io.amqp.read` | AMQP message queue | Streaming |\n| `pw.io.gcp_storage.read` | Google Cloud Storage | Batch |\n| `pw.io.s3.read` | Amazon S3 | Batch |\n| `pw.io.azure_blob.read` | Azure Blob Storage | Batch |\n| `pw.io.postgres.read` | PostgreSQL database | Batch/Streaming |\n\n### Output Connectors\n\nOutput connectors write processed data to external systems:\n\n```python\npw.io.json_lines.write(\n    table=result_table,\n    path=\"./output/results.jsonl\"\n)\n```\n\n### PythonSubject for Custom Sources\n\nFor custom data sources, Pathway provides the `PythonSubject` class that wraps Python callback functions:\n\n```python\npw.io.python.connector(\n    schema=MySchema,\n    start_function=start_callback,\n    read_function=read_callback,\n    seek_function=seek_callback,\n    stop_function=stop_callback\n)\n```\n\nThe subject handles the lifecycle of data reading with callbacks for initialization, data retrieval, and seeking to specific positions.\n\n## Data Flow and Processing\n\nPathway processes data through a directed acyclic graph (DAG) of operations. Each operation transforms input tables into output tables.\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Source Connector]\n    B --> C[Raw Table]\n    C --> D[Transformations]\n    D --> E[Joins & Aggregations]\n    E --> F[Computed Table]\n    F --> G[Output Connector]\n    G --> H[Destination]\n    \n    style C fill:#e1f5fe\n    style F fill:#e8f5e8\n```\n\n### Lazy Evaluation\n\nAll Pathway operations are lazily evaluated. 
The computation graph is built incrementally as operations are added, but no actual data processing occurs until `pw.run()` is called.\n\n### Incremental Computation\n\nThe Rust engine uses Differential Dataflow to perform incremental computation. When input data changes, only the affected portions of the computation are re-executed. This is particularly valuable for streaming scenarios where data arrives continuously.\n\n## Execution Modes\n\nPathway supports two primary execution modes that determine how data is processed over time.\n\n### Static Mode (Batch Processing)\n\nIn static mode, Pathway processes finite datasets completely before producing results. The computation terminates when all input has been consumed.\n\n```mermaid\ngraph LR\n    A[All Input Data] --> B[Process Entire Dataset]\n    B --> C[Single Output Result]\n    \n    style A fill:#fff3e0\n    style C fill:#e8f5e8\n```\n\nCharacteristics:\n- Processes data in batches\n- Terminates after completion\n- Suitable for ETL jobs\n- Optimized for throughput\n\n### Streaming Mode\n\nIn streaming mode, Pathway processes data continuously as it arrives, updating results incrementally.\n\n```mermaid\ngraph LR\n    A[Data Stream] --> B[Continuous Processing]\n    B --> C[Incremental Updates]\n    C --> D[Live Results]\n    D --> B\n    \n    style A fill:#e3f2fd\n    style D fill:#e8f5e8\n```\n\nCharacteristics:\n- Processes data as it arrives\n- Results update continuously\n- Suitable for real-time analytics\n- Maintains state over time\n\n### Unified API\n\nThe same Pathway code works in both modes:\n\n```python\n# Works identically in both modes\nresult = (\n    input_table\n    .filter(lambda row: row.value > threshold)\n    .groupby(lambda row: row.category)\n    .reduce(lambda key, rows: (key, sum(r.value for r in rows)))\n)\n```\n\nSwitching between modes is achieved by changing the connector configuration rather than the transformation logic.\n\n## The Rust Engine\n\nPathway's Rust engine provides 
the computational foundation, built on Timely Dataflow and Differential Dataflow.\n\n### Key Components\n\n| Component | Purpose | Location |\n|-----------|---------|----------|\n| `Scope` | Execution context | Rust Engine |\n| `TableHandle` | Internal table reference | Rust Engine |\n| `Column` | Column data structure | Rust Engine |\n| `Universe` | Key set management | Rust Engine |\n| `TraceAgent` | Arrangement tracking | Differential Dataflow |\n\n### Python-Rust Integration\n\nThe integration uses PyO3 to create Python bindings for Rust structures:\n\n```rust\n#[pyclass(module = \"pathway.engine\", frozen)]\npub struct Table {\n    scope: Py<Scope>,\n    handle: TableHandle,\n}\n```\n\nThis allows Python code to work with Rust objects seamlessly while maintaining Python's ease of use.\n\n### Persistence\n\nThe Rust engine supports checkpointing and state persistence:\n\n```python\npw.run(\n    persistence_config=pw.persistence.Config(\n        mode=pw.persistence.Mode.SNAPSHOT,\n        snapshot_path=\"./checkpoints\"\n    )\n)\n```\n\n## User-Defined Functions\n\nPathway allows custom Python functions to be used in transformations, with careful handling of serialization and execution.\n\n### Function Types\n\n| Type | Execution | Use Case |\n|------|-----------|----------|\n| Pure Python | Serialized to Rust | Simple transformations |\n| Stateful | Managed by Pathway | Complex aggregations |\n| External | Called externally | LLM integrations |\n\n### Execution Safety\n\nFunctions executed on streaming data must be deterministic and side-effect free to ensure correct incremental computation. 
Pathway validates function behavior during development and enforces constraints at runtime.\n\n## Monitoring and Debugging\n\nPathway provides built-in monitoring capabilities through its dashboard.\n\n### Available Metrics\n\n- Message counts per connector\n- Processing latency\n- System resource utilization\n- Error rates and logs\n\n### Debugging Features\n\nThe `Trace` class captures execution stack traces for debugging:\n\n```rust\n#[pyclass(module = \"pathway.engine\", frozen)]\npub struct Trace {\n    file_name: String,\n    line_number: usize,\n    line: String,\n    function: String,\n}\n```\n\n## Summary\n\nPathway's core concepts form a cohesive system for building data processing pipelines:\n\n| Concept | Role |\n|---------|------|\n| **Tables** | Primary data structure for collections of typed rows |\n| **Schemas** | Define table structure, types, and primary keys |\n| **Connectors** | Bridge external data sources with Pathway tables |\n| **Lazy Evaluation** | Build computation graphs without immediate execution |\n| **Incremental Computation** | Efficient updates through Differential Dataflow |\n| **Execution Modes** | Unified API for batch and streaming processing |\n| **Rust Engine** | High-performance computation layer |\n\nThese concepts work together to provide a framework that is both developer-friendly and production-ready, capable of handling the full spectrum from simple batch ETL to complex real-time streaming analytics.\n\n---\n\n<a id='page-architecture'></a>\n\n## System Architecture\n\n### 相关页面\n\n相关主题：[Python API Structure](#page-python-api), [Pathway Introduction](#page-introduction)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [src/engine/mod.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/mod.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- 
[src/connectors/postgres.rs](https://github.com/pathwaycom/pathway/blob/main/src/connectors/postgres.rs)\n- [external/differential-dataflow/src/lib.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/lib.rs)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n- [external/differential-dataflow/src/trace/wrappers/freeze.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/wrappers/freeze.rs)\n- [external/differential-dataflow/src/trace/implementations/ord.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/implementations/ord.rs)\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n</details>\n\n# System Architecture\n\nPathway is a Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG applications. The system provides an easy-to-use Python API while being powered by a scalable Rust engine based on Differential Dataflow for incremental computation. 
资料来源：[README.md]()\n\n## High-Level Architecture Overview\n\nPathway follows a layered architecture that separates the Python user interface from the high-performance Rust computation engine.\n\n```mermaid\ngraph TD\n    subgraph Python_Layer[\"Python API Layer\"]\n        User_Code[\"User Python Code\"]\n        Python_API[\"Pathway Python API<br/>pw.Table, pw.io.*, pw.connectors\"]\n    end\n\n    subgraph Rust_Engine[\"Rust Engine Layer\"]\n        PyO3_Bindings[\"PyO3 Bindings\"]\n        Timely_Runtime[\"Timely Dataflow Runtime\"]\n        Diff_Computations[\"Differential Dataflow<br/>Incremental Computations\"]\n        Trace_Agent[\"Trace Agent & Arrangements\"]\n    end\n\n    subgraph Data_Storage[\"Data Storage Layer\"]\n        In_Memory_State[\"In-Memory State\"]\n        Persistence[\"Persistence Layer\"]\n        Connectors[\"External Connectors<br/>SQL, S3, HTTP, etc.\"]\n    end\n\n    User_Code --> Python_API\n    Python_API --> PyO3_Bindings\n    PyO3_Bindings --> Timely_Runtime\n    Timely_Runtime --> Diff_Computations\n    Diff_Computations --> Trace_Agent\n    Trace_Agent --> In_Memory_State\n    In_Memory_State --> Persistence\n    Diff_Computations --> Connectors\n```\n\n资料来源：[src/python_api.rs:1-50]()\n\n## Core Architectural Components\n\n### Python API Layer\n\nThe Python API layer provides the public interface for Pathway users. 
It exposes various I/O connectors, table operations, and transformation functions.\n\n**Key Python Classes Exposed via PyO3:**\n\n| Class | Purpose | Source |\n|-------|---------|--------|\n| `Table` | Primary data container for rows | `src/python_api.rs` |\n| `Universe` | Represents a set of row keys | `src/python_api.rs` |\n| `Column` | Column descriptor with type info | `src/python_api.rs` |\n| `Scope` | Computation scope context | `src/python_api.rs` |\n| `Context` | Runtime context for expressions | `src/python_api.rs` |\n| `Pointer` | Reference to table data | `src/python_api.rs` |\n| `ValueField` | Schema field definition | `src/python_api.rs` |\n| `PythonSubject` | Python-based data source | `src/python_api.rs` |\n\n资料来源：[src/python_api.rs:150-200]()\n\n### Rust Engine Layer\n\nThe Rust engine implements the core computation logic using Timely Dataflow and Differential Dataflow.\n\n#### Timely Dataflow\n\nTimely Dataflow provides the low-level infrastructure for parallel and distributed dataflow computation. Pathway extends the external timely-dataflow crate to support:\n\n- **Multi-worker execution**: Enables parallel processing across CPU cores\n- **Progress tracking**: Monitors dataflow completeness\n- **Dynamic reconfiguration**: Supports changing dataflow graphs\n\n资料来源：[external/timely-dataflow/mdbook/src/chapter_2/chapter_2_5.md]()\n\n#### Differential Dataflow\n\nDifferential Dataflow builds on Timely Dataflow to provide incremental computation capabilities. 
This is the heart of Pathway's ability to efficiently handle streaming data with updates and deletions.\n\n**Key Differential Dataflow Components:**\n\n| Component | Purpose |\n|-----------|---------|\n| `TraceAgent` | Manages persistent trace data for arrangements |\n| `Arranged` | Wrapper combining stream with trace |\n| `TraceReader` | Interface for reading from traces |\n| `Batch` | Immutable collection of updates |\n\n资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:1-30]()\n\n#### Trace Implementation\n\nPathway uses `OrdValBatch` and `OrdKeyBatch` for efficient in-memory storage of keyed data with timestamps.\n\n**OrdValBatch Structure:**\n```\nOrdValBatch<K, V, T, R, O, CK, CV>\n├── K: Key type (Ord + Clone)\n├── V: Value type (Ord + Clone)\n├── T: Timestamp type (Lattice + Ord + Clone)\n├── R: Weight type (Semigroup)\n├── O: Offset type for arrays\n├── CK: Container for keys\n└── CV: Container for values\n```\n\n资料来源：[external/differential-dataflow/src/trace/implementations/ord.rs:1-50]()\n\n### Timestamp System\n\nPathway's timestamp system is crucial for handling streaming data and maintaining temporal ordering.\n\n```mermaid\nclassDiagram\n    class Timestamp {\n        +u64 inner\n        +followed_by(other: Timestamp) Timestamp\n        +is_original() bool\n        +next_retraction_time() Timestamp\n    }\n    \n    class TotalFrontier {\n        <<enumeration>>\n        At(Timestamp)\n        Done\n    }\n    \n    class OriginalOrRetraction {\n        +is_original() bool\n    }\n    \n    class NextRetractionTime {\n        +next_retraction_time() Timestamp\n    }\n    \n    Timestamp ..|> OriginalOrRetraction\n    Timestamp ..|> NextRetractionTime\n```\n\n**Key Timestamp Properties:**\n\n- **Even timestamps** (`is_multiple_of(2)`): Represent original data\n- **Odd timestamps**: Represent retractions\n- **Lattice properties**: Enable merging of partial time information\n\n资料来源：[src/engine/timestamp.rs:1-50]()\n\n## Data Flow 
Architecture\n\n### Input Processing Flow\n\n```mermaid\ngraph LR\n    subgraph Input_Sources\n        Files[\"Files\"]\n        HTTP[\"HTTP/Websocket\"]\n        SQL[\"SQL Databases\"]\n        Kafka[\"Kafka/MQTT\"]\n    end\n\n    subgraph Processing\n        Parser[\"Parser/Deserializer\"]\n        Transformer[\"Transformer\"]\n        Arranger[\"Arranger\"]\n    end\n\n    subgraph Storage\n        Trace[\"Trace Agent\"]\n        Table[\"Table State\"]\n    end\n\n    Files --> Parser\n    HTTP --> Parser\n    SQL --> Parser\n    Kafka --> Parser\n    Parser --> Transformer\n    Transformer --> Arranger\n    Arranger --> Trace\n    Trace --> Table\n```\n\n### Table Operations Flow\n\nTables in Pathway are the primary data abstraction. Each table maintains:\n\n1. **Schema**: Column definitions with types\n2. **Keys**: Unique row identifiers\n3. **Values**: Column data\n4. **Timestamps**: For temporal ordering\n5. **Diffs**: Change weights (+1 for insert, -1 for delete)\n\n**Table Properties:**\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `id` | Key | Unique row identifier |\n| `values` | Map[Column, Value] | Row data |\n| `time` | Timestamp | Event timestamp |\n| `diff` | i64 | Change weight |\n\n资料来源：[src/python_api.rs:200-250]()\n\n## Connectors Architecture\n\nPathway provides connectors for various data sources and sinks. 
The connector system is designed with extensibility in mind.\n\n### Connector Base Infrastructure\n\n**ConnectorProperties Class Hierarchy:**\n\n```mermaid\nclassDiagram\n    class ConnectorProperties {\n        +name: String\n        +storage: DataStorage\n        +format: DataFormat\n    }\n    \n    class ColumnProperties {\n        +name: String\n        +type_: PathwayType\n        +source: FieldSource\n        +default: Option~Value~\n    }\n    \n    class TableProperties {\n        +columns: Vec~ColumnProperties~\n        +primary_key: Option~Vec~String~~\n    }\n    \n    class CsvParserSettings {\n        +delimiter: char\n        +quotechar: char\n        +header: bool\n    }\n    \n    ConnectorProperties --> TableProperties\n    TableProperties --> ColumnProperties\n```\n\n### Database Connectors\n\nThe PostgreSQL connector demonstrates the connector implementation pattern:\n\n**Type Mapping:**\n\n| PostgreSQL Type | Pathway Type | OID |\n|-----------------|--------------|-----|\n| BOOL | BOOLEAN | 16 |\n| INT2 | INT | 21 |\n| INT4 | INT | 23 |\n| INT8 | INT | 20 |\n| FLOAT4 | FLOAT | 700 |\n| FLOAT8 | FLOAT | 701 |\n| TEXT/VARCHAR | STRING | 25 |\n| BYTEA | BINARY | 17 |\n| JSONB | JSON | 3802 |\n| TIMESTAMP | DATETIME | 1114 |\n| UUID | UUID | 2950 |\n\n**Postgres Binary Array Format:**\n- `i32`: ndim (dimensions)\n- `i32`: has_nulls flag\n- `u32`: element OID\n- Per dimension: size, lower_bound\n- Per element: length (-1 for NULL), bytes\n\n资料来源：[src/connectors/postgres.rs:1-80]()\n\n### Other Supported Connectors\n\n| Connector | Settings Class | Description |\n|-----------|---------------|-------------|\n| AWS S3 | `AwsS3Settings` | S3 object storage |\n| Azure Blob | `AzureBlobStorageSettings` | Azure blob storage |\n| Elasticsearch | `ElasticSearchParams` | Search and indexing |\n| MQTT | `MqttSettings` | IoT messaging |\n| Kafka | `KafkaSettings` | Stream messaging |\n| Schema Registry | `PySchemaRegistrySettings` | Avro schema management |\n| 
Iceberg | `IcebergCatalogSettings` | Lakehouse tables |\n| PostgreSQL Replication | `PsqlReplicationSettings` | CDC from Postgres |\n\n资料来源：[src/python_api.rs:250-300]()\n\n## Persistence and State Management\n\n### Persistence Configuration\n\nPathway supports configurable persistence for fault tolerance and state recovery.\n\n```mermaid\ngraph TD\n    subgraph Application_Layer\n        User_Query[\"User Query\"]\n    end\n\n    subgraph Persistence_Layer\n        Config[\"PersistenceConfig\"]\n        Mode[\"PyPersistenceMode<br/>ON_STARTUP<br/>MANUAL<br/>periodic\"]\n        Storage[\"DataStorage\"]\n    end\n\n    subgraph State_Management\n        Snapshot[\"PySnapshotAccess\"]\n        Event[\"PySnapshotEvent\"]\n        Frontier[\"TotalFrontier\"]\n    end\n\n    User_Query --> Config\n    Config --> Mode\n    Config --> Storage\n    Mode --> Snapshot\n    Snapshot --> Event\n    Event --> Frontier\n```\n\n**Persistence Modes:**\n\n| Mode | Behavior |\n|------|----------|\n| `ON_STARTUP` | Automatically restore state on application start |\n| `MANUAL` | Explicit checkpoint management via API |\n| `PERIODIC` | Automatic snapshots at intervals |\n\n**Snapshot Access Methods:**\n\n| Method | Return Type | Description |\n|--------|-------------|-------------|\n| `snapshot_at(frontier)` | `Vec<(Key, Vec<Value>)>` | State at specific frontier |\n| `frontier()` | `TotalFrontier` | Current progress frontier |\n| `failed()` | `bool` | Whether computation failed |\n\n资料来源：[src/python_api.rs:100-150]()\n\n### Trace Freeze Wrapper\n\nThe `TraceFreeze` wrapper provides read-only access to historical trace data:\n\n```rust\npub struct TraceFreeze<Tr, F>\nwhere\n    Tr: TraceReader,\n    Tr::Time: Lattice+Clone+'static,\n    F: Fn(&Tr::Time)->Option<Tr::Time>,\n{\n    trace: Tr,\n    func: Rc<F>,\n}\n```\n\nThis enables efficient point-in-time queries without blocking ongoing updates.\n\n资料来源：[external/differential-dataflow/src/trace/wrappers/freeze.rs:1-30]()\n\n## 
Expression and Computation System\n\n### Python Expression Evaluation\n\nPathway allows Python functions to be used for data transformations through the `PyExpression` system.\n\n**Expression System Components:**\n\n| Component | Purpose |\n|-----------|---------|\n| `PyExpression` | Serialized Python expression |\n| `PyExpressionData` | Runtime data for evaluation |\n| `PyReducer` | Aggregation functions |\n| `PyUnaryOperator` | Single-input operators |\n| `PyBinaryOperator` | Two-input operators |\n\n**Computation Execution Model:**\n\n```mermaid\ngraph LR\n    subgraph Definition\n        Expr[\"Expression Definition\"]\n        Schema[\"Schema Definition\"]\n    end\n\n    subgraph Compilation\n        Parse[\"Parse & Validate\"]\n        Optimize[\"Optimize\"]\n        Generate[\"Generate Operators\"]\n    end\n\n    subgraph Execution\n        Compute[\"Compute\"]\n        Index[\"Index Results\"]\n        Output[\"Output to Tables\"]\n    end\n\n    Expr --> Parse\n    Schema --> Parse\n    Parse --> Optimize\n    Optimize --> Generate\n    Generate --> Compute\n    Compute --> Index\n    Index --> Output\n```\n\n资料来源：[src/python_api.rs:300-350]()\n\n### Computer and Scope System\n\nThe `Computer` and `Scope` classes manage computation execution:\n\n| Class | Responsibility |\n|-------|----------------|\n| `Scope` | Defines a computation boundary |\n| `Context` | Provides runtime evaluation context |\n| `Computer` | Executes transformation logic |\n| `Table` | Stores results |\n\n```mermaid\nsequenceDiagram\n    participant User as User Code\n    participant Scope as Scope\n    participant Context as Context\n    participant Computer as Computer\n    participant Table as Table\n    \n    User->>Scope: Create scope\n    User->>Context: Define operations\n    Context->>Computer: Create computer\n    Computer->>Table: Write results\n    Table-->>User: Return handle\n```\n\n## Monitoring and Telemetry\n\nPathway includes comprehensive monitoring capabilities 
through the `TelemetryConfig` and `PyMonitoringLevel` classes.\n\n**Monitoring Levels:**\n\n| Level | Description |\n|-------|-------------|\n| `OFF` | No telemetry |\n| `ERROR` | Only errors |\n| `WARN` | Warnings and errors |\n| `INFO` | Informational logs |\n| `DEBUG` | Detailed debugging |\n\n**Trace and Done Classes:**\n\nThe `Trace` class provides structured logging and tracing:\n\n```python\nclass Trace:\n    \"\"\"Structured logging and tracing\"\"\"\n    def log(level, message, context)\n    def span(name, fn)\n```\n\nThe `Done` class signals completion of operations and is used for synchronization.\n\n资料来源：[src/python_api.rs:350-400]()\n\n## External Integrations\n\nPathway integrates with various external systems and frameworks:\n\n| Integration | Description |\n|-------------|-------------|\n| LangChain | LLM pipeline integration |\n| LlamaIndex | RAG framework integration |\n| MinIO | S3-compatible object storage |\n| PaddleOCR | OCR capabilities |\n| Databento | Market data feeds |\n\nThese integrations are typically implemented as Pathway connectors or external index providers.\n\n资料来源：[README.md]()\n\n## Key Design Patterns\n\n### Incremental Computation Pattern\n\nPathway's core innovation is incremental computation:\n\n1. **Input Change**: A modification arrives at time T\n2. **Propagate**: The change flows through the computation graph\n3. **Differential Update**: Only affected results are updated\n4. 
**Efficient Output**: Minimal recomputation\n\n```mermaid\ngraph LR\n    A[\"Input Δ\"] --> B[\"Compute Δ\"]\n    B --> C[\"Update Trace\"]\n    C --> D[\"Emit Δ\"]\n    D --> A\n    \n    style A fill:#f96\n    style D fill:#96f\n```\n\n### Arrangement Pattern\n\nArrangements are a fundamental concept in Differential Dataflow:\n\n- An **arrangement** = a stream + its indexed state\n- Enables efficient joins and aggregations\n- Maintains multiple versions for time-travel queries\n\n```mermaid\ngraph TD\n    subgraph Arrangement\n        Stream[\"Stream Input\"]\n        Index[\"Trace Index\"]\n        Merge[\"Merge Operator\"]\n    end\n    \n    Stream --> Merge\n    Index --> Merge\n    Merge --> Output[\"Arranged Data\"]\n```\n\n## Summary\n\nPathway's architecture is designed around three key principles:\n\n1. **Python-First API**: Users write Python, but computation happens in Rust\n2. **Incremental Everywhere**: All operations support efficient updates\n3. **Unified Batch/Stream**: The same code works for both paradigms\n\nThe system successfully abstracts the complexity of parallel and distributed computation behind a simple Python interface, making real-time analytics and LLM pipelines accessible to Python developers.\n\n---\n\n<a id='page-python-api'></a>\n\n## Python API Structure\n\n### 相关页面\n\n相关主题：[System Architecture](#page-architecture), [Data Transformations](#page-transformations)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- [examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md)\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md)\n- 
[examples/projects/web-scraping/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/web-scraping/README.md)\n</details>\n\n# Python API Structure\n\n## Overview\n\nPathway is a Python ETL framework for stream processing and real-time analytics, powered by a **scalable Rust engine** based on Differential Dataflow. The Python API Structure defines the interface layer that bridges Python user code with the high-performance Rust computation engine through PyO3 bindings. This architecture enables Python developers to write data pipelines while leveraging Rust's performance characteristics for incremental computation.\n\nThe framework allows seamless integration with Python ML libraries, supports both batch and streaming data, and can be used across development, CI/CD, and production environments with identical code. 资料来源：[README.md:1-15]()\n\n## Architecture Overview\n\n```mermaid\ngraph TD\n    A[Python User Code] --> B[Python API Layer]\n    B --> C[PyO3 Bindings]\n    C --> D[Rust Engine]\n    D --> E[Differential Dataflow]\n    E --> F[Incremental Computation]\n    \n    G[PythonSubject] --> B\n    H[ValueField] --> B\n    I[PyExpression] --> B\n    J[PyExportedTable] --> B\n```\n\n## Core Components\n\n### PythonSubject\n\nThe `PythonSubject` class serves as the primary interface for connecting external data sources to the Pathway engine. 
It wraps Python callback functions that the Rust engine invokes during data processing.\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `start` | `Py<PyAny>` | Callback invoked when a new session starts |\n| `read` | `Py<PyAny>` | Callback for reading data from the source |\n| `seek` | `Py<PyAny>` | Callback for seeking within the data stream |\n| `on_persisted_run` | `Py<PyAny>` | Callback triggered on persisted runs |\n| `end` | `Py<PyAny>` | Callback invoked when the source ends |\n| `is_internal` | `bool` | Flag indicating internal source usage |\n| `deletions_enabled` | `bool` | Flag enabling deletion tracking |\n\n资料来源：[src/python_api.rs:1-35]()\n\n```rust\n#[pymethods]\nimpl PythonSubject {\n    #[new]\n    #[pyo3(signature = (start, read, seek, on_persisted_run, end, is_internal, deletions_enabled))]\n    fn new(\n        start: Py<PyAny>,\n        read: Py<PyAny>,\n        seek: Py<PyAny>,\n        on_persisted_run: Py<PyAny>,\n        end: Py<PyAny>,\n        is_internal: bool,\n        deletions_enabled: bool,\n    ) -> Self {\n        Self { ... 
}\n    }\n}\n```\n\n### ValueField\n\n`ValueField` defines schema fields within Pathway tables, specifying the structure of data including type, source, default values, and metadata.\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `name` | `String` | Field identifier |\n| `type_` | `Type` | Data type specification |\n| `source` | `FieldSource` | Origin of the field data |\n| `default` | `Option<Value>` | Default value if none provided |\n| `metadata` | `Option<String>` | Additional metadata as JSON string |\n\n资料来源：[src/python_api.rs:40-55]()\n\n### FieldSource Enum\n\nThe `FieldSource` enum specifies where field data originates:\n\n| Value | Description |\n|-------|-------------|\n| `Payload` | Data from the input payload (default) |\n| `NewId` | System-generated identifier |\n| `USB` | User-specified binding |\n\n资料来源：[src/python_api.rs:200-210]()\n\n## Engine Integration Layer\n\n### TotalFrontier\n\n`TotalFrontier` represents the completion state of data processing across all workers. 
It uses an enum with two states:\n\n```mermaid\nstateDiagram-v2\n    [*] --> At\n    At --> Done: All workers complete\n    Done --> [*]\n    \n    TotalFrontier: At(timestamp) - Processing pending\n    TotalFrontier: Done - All complete\n```\n\n| State | Type | Meaning |\n|-------|------|---------|\n| `At(i)` | `Timestamp` | Frontier positioned at timestamp `i` |\n| `Done` | singleton | All processing completed |\n\n资料来源：[src/python_api.rs:90-130]()\n\n### PyExportedTable\n\n`PyExportedTable` exposes table data to Python consumers through three key methods:\n\n| Method | Return Type | Description |\n|--------|-------------|-------------|\n| `frontier()` | `TotalFrontier<Timestamp>` | Current frontier position |\n| `snapshot_at(frontier)` | `Vec<(Key, Vec<Value>)>` | Point-in-time table snapshot |\n| `failed()` | `bool` | Indicates if table processing failed |\n\n资料来源：[src/python_api.rs:140-165]()\n\n```rust\n#[pymethods]\nimpl PyExportedTable {\n    fn frontier(&self) -> TotalFrontier<Timestamp> {\n        self.inner.frontier()\n    }\n\n    fn snapshot_at(&self, frontier: TotalFrontier<Timestamp>) -> Vec<(Key, Vec<Value>)> {\n        self.inner.snapshot_at(frontier)\n    }\n\n    fn failed(&self) -> bool {\n        self.inner.failed()\n    }\n}\n```\n\n## Mode and Type Enums\n\n### ConnectorMode\n\nDefines data processing behavior for connectors:\n\n| Constant | Value | Use Case |\n|----------|-------|----------|\n| `STATIC` | `ConnectorMode::Static` | Batch processing, fixed dataset |\n| `STREAMING` | `ConnectorMode::Streaming` | Continuous data streams |\n\n资料来源：[src/python_api.rs:220-240]()\n\n### SessionType\n\nSpecifies how the engine handles sessionization:\n\n| Constant | Value | Description |\n|----------|-------|-------------|\n| `NATIVE` | `SessionType::Native` | Native differential dataflow processing |\n| `UPSERT` | `SessionType::Upsert` | Upsert-based session handling |\n\n资料来源：[src/python_api.rs:245-260]()\n\n## Expression System\n\n### 
PyExpression\n\nThe `PyExpression` class wraps Rust `Expression` objects and tracks whether Python GIL (Global Interpreter Lock) is required for evaluation:\n\n| Property | Type | Purpose |\n|----------|------|---------|\n| `inner` | `Arc<Expression>` | Rust expression AST |\n| `gil` | `bool` | Whether GIL acquisition needed |\n\n资料来源：[src/python_api.rs:280-310]()\n\n```rust\npub struct PyExpression {\n    inner: Arc<Expression>,\n    gil: bool,\n}\n\nimpl PyExpression {\n    fn new(inner: Arc<Expression>, gil: bool) -> Self {\n        Self { inner, gil }\n    }\n}\n```\n\n### Binary Operators\n\nBinary operations are implemented using macros that automatically handle GIL tracking:\n\n```rust\nmacro_rules! binary_op {\n    ($expression:path, $lhs:expr, $rhs:expr $(, $arg:expr)*) => {\n        Self::new(\n            Arc::new(Expression::from($expression(\n                $lhs.inner.clone(),\n                $rhs.inner.clone(),\n                $($arg,)*\n            ))),\n            $lhs.gil || $rhs.gil,\n        )\n    };\n}\n```\n\nThis pattern ensures that if either operand requires Python GIL, the entire expression is marked as requiring GIL during evaluation.\n\n## Timestamp System\n\nPathway uses a specialized timestamp system that supports incremental computation through Differential Dataflow:\n\n```mermaid\ngraph LR\n    A[Input] --> B[Timestamp Assignment]\n    B --> C[Frontier Computation]\n    C --> D[Incremental Update]\n    D --> E[Result]\n    \n    F[Retraction] -.->|Original| D\n    F[Retraction] -.->|Retraction| D\n```\n\n### OriginalOrRetraction Trait\n\nTimestamps determine whether a change is an original value or a retraction:\n\n```rust\nimpl OriginalOrRetraction for Timestamp {\n    fn is_original(&self) -> bool {\n        self.0.is_multiple_of(2)\n    }\n}\n```\n\n### NextRetractionTime Trait\n\nComputes the next retraction timestamp:\n\n```rust\nimpl NextRetractionTime for Timestamp {\n    fn next_retraction_time(&self) -> Self {\n        
Self(self.0 + 1)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:1-50]()\n\n## Wakeup Handler\n\nThe `WakeupHandler` manages file descriptor-based wakeups for coordinating Rust async operations with Python's event loop:\n\n```rust\nstruct WakeupHandler<'py> {\n    _fd: OwnedFd,\n    set_wakeup_fd: Bound<'py, PyAny>,\n    old_wakeup_fd: Bound<'py, PyAny>,\n}\n```\n\nThis enables Pathway to integrate with Python's async ecosystem while maintaining efficient I/O handling in Rust.\n\n## Trace and Error Handling\n\n### EngineTrace\n\n`EngineTrace` captures source location information for debugging and error reporting:\n\n| Field | Type | Description |\n|-------|------|-------------|\n| `file_name` | `String` | Source file path |\n| `line_number` | `u32` | Line number in source |\n| `line` | `String` | Source line content |\n| `function` | `String` | Function/method name |\n\n资料来源：[src/python_api.rs:180-200]()\n\nThe trace system integrates with Python's exception handling, converting Rust trace data into Python traceback objects:\n\n```rust\nimpl<'py> IntoPyObject<'py> for EngineTrace {\n    fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {\n        match self {\n            Self::Empty => Ok(py.None().into_bound(py)),\n            Self::Frame { ... } => Trace { ... 
}.into_bound_py_any(py),\n        }\n    }\n}\n```\n\n## Data Flow Example\n\n```mermaid\ngraph TD\n    A[Data Source] -->|PythonSubject| B[Rust Engine]\n    B -->|Processing| C[Timestamp Assignment]\n    C -->|Frontier Update| D[TotalFrontier]\n    D -->|Snapshot| E[PyExportedTable]\n    E -->|Query| F[Python Application]\n    \n    G[ConnectorMode] -->|Configuration| B\n    H[SessionType] -->|Configuration| B\n```\n\n## Usage Patterns\n\n### Creating a Data Pipeline\n\nPathway pipelines are defined in Python and executed by the Rust engine:\n\n```python\nimport pathway as pw\n\n# Define schema\nclass InputSchema(pw.Schema):\n    id: int\n    data: str\n\n# Create connector\nconnector = pw.io.http.PathwayWebserver(host=\"0.0.0.0\", port=8011)\n\n# Run pipeline\npw.run()\n```\n\n资料来源：[examples/projects/question-answering-rag/README.md:30-45]()\n\n### Web Scraping Pipeline\n\nReal-time web scraping demonstrates the streaming capabilities:\n\n```python\n# Connector implementation\nclass NewsScraperSubject(pw.ConnectorSubject):\n    def __init__(self, url, interval=60):\n        # Configure callbacks\n        self.url = url\n        self.interval = interval\n```\n\n资料来源：[examples/projects/web-scraping/README.md:50-65]()\n\n### Multi-Agent RAG Pipeline\n\nThe integration with agent frameworks shows advanced API usage:\n\n```mermaid\ngraph LR\n    A[Documents] -->|Index| B[Pathway VectorStoreServer]\n    B -->|Retrieve| C[AG2 Agents]\n    C -->|Query| D[User]\n    \n    E[REST API /v1/retrieve] --> B\n```\n\n资料来源：[examples/projects/ag2-multiagent-rag/README.md:20-40]()\n\n## Summary\n\nThe Python API Structure in Pathway consists of several key layers:\n\n1. **Python Interface Layer**: PyO3-wrapped classes (`PythonSubject`, `ValueField`, `PyExportedTable`, `PyExpression`) provide idiomatic Python APIs\n2. **Type System**: Enums for `ConnectorMode`, `SessionType`, `FieldSource` configure engine behavior\n3. 
**State Management**: `TotalFrontier` tracks processing progress across distributed workers\n4. **Expression Evaluation**: The `PyExpression` system handles both pure Rust and Python-requiring computations\n5. **Error Handling**: `EngineTrace` and `WakeupHandler` provide debugging and async coordination capabilities\n\nThis architecture enables Python developers to leverage high-performance Rust computation while maintaining the productivity of Python development.\n\n---\n\n<a id='page-data-model'></a>\n\n## Data Model and Type System\n\n### 相关页面\n\n相关主题：[Core Concepts](#page-concepts), [Data Transformations](#page-transformations)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n- [src/connectors/postgres.rs](https://github.com/pathwaycom/pathway/blob/main/src/connectors/postgres.rs)\n- [external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n- [external/differential-dataflow/src/trace/layers/ordered.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/trace/layers/ordered.rs)\n</details>\n\n# Data Model and Type System\n\nPathway's Data Model and Type System form the foundational architecture that enables the framework to handle structured data processing with type safety, schema validation, and efficient data representation. 
Built on a Rust-powered engine, Pathway provides a Python-friendly API while maintaining the performance benefits of differential dataflow computations.\n\n## Architecture Overview\n\nPathway's type system operates at multiple levels: the Python API layer where users define schemas, and the Rust engine layer where types are resolved and processed efficiently.\n\n```mermaid\ngraph TD\n    A[Python API Layer<br/>dtype.py, expression.py] --> B[Type Interpreter<br/>type_interpreter.py]\n    B --> C[Rust Engine<br/>python_api.rs]\n    C --> D[Differential Dataflow<br/>Timely Backend]\n    C --> E[Connectors<br/>postgres.rs, etc.]\n    \n    F[ValueField] --> C\n    G[FieldSource] --> C\n    H[ConnectorMode] --> C\n    I[Timestamp] --> D\n```\n\n## Core Type Components\n\n### Type Enumeration\n\nThe Pathway type system is built around a `Type` enumeration that defines all supported data types. These types map to both Python native types and Rust storage representations.\n\n| Type Category | Supported Types | Description |\n|---------------|-----------------|-------------|\n| Boolean | `BOOL` | True/False values |\n| Integer | `INT2`, `INT4`, `INT8` | 16, 32, and 64-bit signed integers |\n| Float | `FLOAT4`, `FLOAT8` | 32 and 64-bit floating point |\n| String | `TEXT`, `VARCHAR` | Variable-length text |\n| Binary | `BYTEA` | Binary/blob data |\n| JSON | `JSON`, `JSONB` | JSON data with/without binary optimization |\n| Temporal | `TIMESTAMP`, `TIMESTAMPTZ`, `INTERVAL` | Date, time, and duration values |\n| UUID | `UUID` | Universally unique identifiers |\n\n资料来源：[src/connectors/postgres.rs:1-30]()\n\n### ValueField Structure\n\n`ValueField` is the core schema definition unit that encapsulates field metadata:\n\n```python\n@dataclass\nclass ValueField:\n    name: str           # Field identifier\n    type_: Type         # Data type enumeration\n    source: FieldSource # Origin of the field data\n    default: Optional[Value]  # Default value if null\n    metadata: 
Optional[str]    # Additional metadata as JSON string\n```\n\n资料来源：[src/python_api.rs:30-47]()\n\n## Field Source Classification\n\n`FieldSource` defines where data originates within the Pathway computation model:\n\n| Source | Value | Description |\n|--------|-------|-------------|\n| `Payload` | Default | Data from the original input connector |\n| `Key` | - | Primary key field extracted/generated |\n| `Computed` | - | Derived from transformations |\n| `Pointer` | - | Reference to another table entry |\n\n资料来源：[src/python_api.rs:20-28]()\n\nThe `FieldSource` enum enables Pathway to track data lineage and enforce immutability constraints:\n\n```rust\nconst PAYLOAD: FieldSource = FieldSource::Payload;\n```\n\n资料来源：[src/python_api.rs:450-451]()\n\n## Connector Modes\n\nPathway supports two distinct data processing modes:\n\n| Mode | Behavior | Use Case |\n|------|----------|----------|\n| `STATIC` | Process data once, complete when input exhausted | Batch processing, static datasets |\n| `STREAMING` | Continuous processing, incremental updates | Real-time data streams, live monitoring |\n\n资料来源：[src/python_api.rs:460-468]()\n\n```rust\n#[pymethods]\nimpl PyConnectorMode {\n    #[classattr]\n    pub const STATIC: ConnectorMode = ConnectorMode::Static;\n    #[classattr]\n    pub const STREAMING: ConnectorMode = ConnectorMode::Streaming;\n}\n```\n\n## Session Types\n\nSession type configuration affects how Pathway handles data transactions:\n\n| Type | Description |\n|------|-------------|\n| `NATIVE` | Standard Pathway processing |\n| `UPSERT` | Specialized handling for upsert operations |\n\n资料来源：[src/python_api.rs:470-477]()\n\n## Timestamp System\n\nPathway's timestamp system implements incremental computation semantics using even-odd parity:\n\n```rust\nimpl OriginalOrRetraction for Timestamp {\n    fn is_original(&self) -> bool {\n        self.0.is_multiple_of(2)\n    }\n}\n\nimpl NextRetractionTime for Timestamp {\n    fn next_retraction_time(&self) -> 
Self {\n        Self(self.0 + 1)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:25-36]()\n\n### Timestamp Parity Convention\n\n- **Even timestamps**: Original data values\n- **Odd timestamps**: Retractions (deletions/updates of previous values)\n\nThis design enables efficient incremental updates in the differential dataflow computation model.\n\n## Data Arrangement in Differential Dataflow\n\nPathway leverages differential dataflow for efficient state management. The arrangement layer provides key-value storage with ordered indexing:\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[OrderedLayer<br/>Keys + Values]\n    B --> C[Arranged Trace]\n    C --> D[Query Operations]\n    \n    E[Offset Index] --> B\n    F[Child Cursor] --> B\n```\n\n资料来源：[external/differential-dataflow/src/trace/layers/ordered.rs:1-20]()\n\n### Cursor Navigation\n\nThe cursor-based traversal in ordered layers supports efficient seeking and stepping:\n\n```rust\nfn step(&mut self, storage: &OrderedLayer<K, L, O, C>) {\n    // Pathway extension: pos_nonnegative tracks cursor repositioning\n    if self.pos_nonnegative { \n        self.pos += 1; \n    } else {\n        self.pos_nonnegative = true;\n    }\n}\n```\n\n资料来源：[external/differential-dataflow/src/trace/layers/ordered.rs:15-23]()\n\n## Table Representation\n\nPathway represents data using the `LegacyTable` structure:\n\n| Component | Type | Purpose |\n|-----------|------|---------|\n| `universe` | `Universe` | Unique identifier namespace |\n| `columns` | `Vec<Column>` | Collection of typed columns |\n\n资料来源：[src/python_api.rs:100-110]()\n\n### Universe System\n\nThe universe concept provides isolation and partitioning for tables:\n\n```rust\npub fn __repr__(&self, py: Python) -> String {\n    format!(\n        \"<LegacyTable universe={:?} columns=[{}]>\",\n        self.universe.borrow(py).handle,\n        self.columns.iter().format_with(\", \", |column, f| {\n            f(&format_args!(\"{:?}\", column.borrow(py).handle))\n        
})\n    )\n}\n```\n\n资料来源：[src/python_api.rs:112-120]()\n\n## Computer and Computation Model\n\nThe `Computer` class wraps Python functions for execution within the Rust engine:\n\n```rust\npub struct Computer {\n    fun: Py<PyAny>,           // Python callable\n    dtype: Py<PyAny>,         // Return type annotation\n    is_output: bool,          // Marks output tables\n    is_method: bool,          // Instance vs class method\n    universe: Py<Universe>,   // Computation scope\n    data: Value,              // Closure-captured data\n    data_column: Option<Py<Column>>,  // Optional input column\n}\n```\n\n资料来源：[src/python_api.rs:145-154]()\n\n### Computation Flow\n\n```mermaid\ngraph TD\n    A[Python Function Definition] --> B[Computer Wrapper]\n    B --> C[Rust Engine Execution]\n    C --> D[ScopedContext Injection]\n    D --> E[Result Value]\n    \n    F[Timestamp Parity Check] -.-> C\n```\n\n## Key Generation Policies\n\nPathway supports flexible key generation strategies through `KeyGenerationPolicy`:\n\n| Policy | Behavior |\n|--------|----------|\n| `Auto` | System-generated unique keys |\n| `Manual` | User-provided key values |\n| `Composite` | Keys derived from multiple fields |\n\n资料来源：[src/python_api.rs:520-530]()\n\n## Monitoring and Debugging\n\nThe `EngineTrace` system captures execution context for debugging:\n\n```rust\npub enum EngineTrace {\n    Empty,                    // No trace information\n    Frame {                   // Source location\n        file_name: String,\n        line: String,\n        line_number: usize,\n        function: String,\n    }\n}\n```\n\n资料来源：[src/python_api.rs:380-395]()\n\n## Python-Rust Type Conversions\n\nPathway provides bidirectional type conversions between Python and Rust:\n\n### IntoPyObject Implementation\n\n```rust\nimpl<'py> IntoPyObject<'py> for Timestamp {\n    type Target = PyAny;\n    type Output = Bound<'py, Self::Target>;\n    type Error = PyErr;\n    \n    fn into_pyobject(self, py: Python<'py>) -> 
Result<Self::Output, Self::Error> {\n        self.0.into_bound_py_any(py)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:8-18]()\n\n### FromPyObject Implementation\n\n```rust\nimpl<'py> FromPyObject<'py> for Timestamp {\n    fn extract_bound(ob: &Bound<'py, PyAny>) -> PyResult<Self> {\n        ob.extract().map(Self)\n    }\n}\n```\n\n资料来源：[src/engine/timestamp.rs:20-24]()\n\n## PostgreSQL Type Mapping\n\nPathway's PostgreSQL connector provides explicit OID mappings for binary protocol:\n\n| PostgreSQL OID | Pathway Type |\n|----------------|--------------|\n| 16 | BOOL |\n| 21 | INT2 |\n| 23 | INT4 |\n| 20 | INT8 |\n| 700 | FLOAT4 |\n| 701 | FLOAT8 |\n| 25 | TEXT/VARCHAR |\n| 17 | BYTEA |\n| 3802 | JSONB |\n| 114 | JSON |\n| 1114 | TIMESTAMP |\n| 1184 | TIMESTAMPTZ |\n| 1186 | INTERVAL |\n| 2950 | UUID |\n\n资料来源：[src/connectors/postgres.rs:10-25]()\n\n## Summary\n\nPathway's Data Model and Type System provides:\n\n1. **Type Safety**: Strong typing from Python definitions through to Rust execution\n2. **Incremental Computation**: Timestamp-based parity for efficient updates\n3. **Flexible Schema**: ValueField with configurable sources and defaults\n4. **Multiple Processing Modes**: Static and streaming connector support\n5. **Differential Execution**: Powered by timely and differential dataflow\n6. 
**Connector Integration**: Native type mappings for external systems like PostgreSQL\n\nThis architecture enables users to write expressive Python pipelines while benefiting from high-performance Rust-based incremental computation.\n\n---\n\n<a id='page-connectors'></a>\n\n## Data Connectors\n\n### 相关页面\n\n相关主题：[Custom Connectors](#page-connectors-custom)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/io/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/__init__.py)\n- [python/pathway/io/csv/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/csv/__init__.py)\n- [python/pathway/io/kafka/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/kafka/__init__.py)\n- [python/pathway/io/postgres/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/postgres/__init__.py)\n- [python/pathway/io/s3/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/s3/__init__.py)\n- [python/pathway/io/http/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/io/http/__init__.py)\n- [src/connectors/mod.rs](https://github.com/pathwaycom/pathway/blob/main/src/connectors/mod.rs)\n- [docs/2.developers/4.user-guide/20.connect/35.pathway-connectors.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/20.connect/35.pathway-connectors.md)\n</details>\n\n# Data Connectors\n\n## Overview\n\nData Connectors in Pathway are the primary interface for ingesting data from external sources and persisting computed results to external destinations. Pathway provides a unified abstraction that handles both batch and streaming data, allowing developers to write Python code that works seamlessly across different execution modes. 
The connector system is built on top of Pathway's Rust engine, which provides incremental computation capabilities through Differential Dataflow.\n\nPathway connectors abstract the complexity of various data sources—including filesystems, databases, message queues, cloud storage, and HTTP endpoints—behind a consistent Python API. This design enables developers to connect to multiple data sources using the same patterns, whether working in local development, CI/CD tests, or production streaming environments.\n\n资料来源：[examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n\n## Architecture\n\n### Connector System Components\n\nThe Pathway connector system consists of three primary components that work together to provide reliable data movement:\n\n**Subjects** are the data source abstraction in Pathway. A `PythonSubject` wraps user-defined Python functions that handle the mechanics of reading from an external source—connecting, seeking, reading bytes, and managing persistence state. The subject tracks whether the source is internal or external and manages deletion notifications when upstream data is removed.\n\n**Connectors** orchestrate the data flow between subjects and the Pathway engine. They manage the lifecycle of data ingestion, including initialization, incremental updates, and graceful shutdown. Connectors expose configuration options for streaming versus static modes, enabling the same code to work in batch and real-time scenarios.\n\n**Sinks** handle output operations, persisting Pathway tables to external destinations. 
Sinks receive computed results from the engine and write them to files, databases, or streaming systems with exactly-once semantics where supported.\n\n```mermaid\ngraph TD\n    A[External Data Source] --> B[PythonSubject]\n    B --> C[Connector]\n    C --> D[Pathway Engine<br/>Differential Dataflow]\n    D --> E[Sink]\n    E --> F[External Data Destination]\n    \n    G[Schema Definition] --> C\n    H[Connector Mode<br/>Static/Streaming] --> C\n```\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n### Connector Mode\n\nPathway supports two distinct connector modes that determine how data is processed:\n\n| Mode | Description | Use Case |\n|------|-------------|----------|\n| `STATIC` | Process all available data at once, then complete | Batch jobs, one-time imports |\n| `STREAMING` | Continuously monitor for changes and updates | Real-time pipelines, live data |\n\n```python\n# Static mode - processes existing data and exits\npw.io.fs.read(path=\"./data/\", mode=pw.ConnectorMode.STATIC)\n\n# Streaming mode - continuously monitors for changes\npw.io.fs.read(path=\"./data/\", mode=pw.ConnectorMode.STREAMING)\n```\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## Input Connectors\n\n### CSV Reader\n\nThe CSV connector reads comma-separated values from files or directories. It supports schema inference, type casting, and automatic handling of headers. 
The connector can operate in both static and streaming modes, re-reading files when they are modified or new files appear in the watched directory.\n\n```python\nimport pathway as pw\n\n# Basic CSV reading\ntable = pw.io.csv.read(\n    path=\"./input_data/\",\n    schema=pw.Schema(\n        id=pw.ColumnDefinition(type=pw.Type.INT),\n        name=pw.ColumnDefinition(type=pw.Type.STR),\n        value=pw.ColumnDefinition(type=pw.Type.FLOAT)\n    ),\n    mode=pw.ConnectorMode.STATIC\n)\n```\n\nConfiguration parameters for CSV reading include the file path, schema definition, whether to use streaming mode, and options for handling malformed rows. The connector tracks file modifications to enable incremental processing in streaming scenarios.\n\n资料来源：[examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n\n### Kafka Connector\n\nThe Kafka connector integrates with Apache Kafka for high-throughput message streaming. It supports both consumer and producer patterns, enabling Pathway pipelines to consume from Kafka topics and write results back to Kafka. The connector handles offset management, partition awareness, and graceful rebalancing when consumer groups change.\n\n```python\nimport pathway as pw\n\n# Kafka consumer\nevents = pw.io.kafka.read(\n    brokers=[\"localhost:9092\"],\n    topic=\"input-events\",\n    schema=pw.Schema(\n        event_id=pw.ColumnDefinition(type=pw.Type.STR),\n        timestamp=pw.ColumnDefinition(type=pw.Type.DATETIME),\n        payload=pw.ColumnDefinition(type=pw.Type.STR)\n    ),\n    format=\"json\",\n    kafka_settings=pw.io.kafka.KafkaReadSettings(\n        consumer_group=\"pathway-consumer\",\n        offset=pw.io.kafka.Offset.latest()\n    )\n)\n```\n\nThe connector supports JSON and Avro message formats, with automatic schema registration for Avro registries. 
It provides exactly-once processing semantics through coordinated offsets with Pathway's checkpointing system.\n\n资料来源：[examples/projects/option-greeks/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/option-greeks/README.md)\n\n### PostgreSQL Connector\n\nPathway provides bidirectional PostgreSQL integration for reading from and writing to PostgreSQL tables. The connector supports both full table scans and change data capture (CDC) when combined with logical replication, enabling real-time streaming from database changes.\n\n```python\nimport pathway as pw\n\n# PostgreSQL read with CDC\ntable = pw.io.postgres.read(\n    host=\"localhost\",\n    port=5432,\n    database=\"mydb\",\n    table_name=\"orders\",\n    user=\"pathway_user\",\n    password=pw.io.postgres.PasswordSecret(\"PG_PASSWORD\"),\n    schema=pw.Schema(\n        order_id=pw.ColumnDefinition(type=pw.Type.INT, primary_key=True),\n        customer_id=pw.ColumnDefinition(type=pw.Type.INT),\n        total=pw.ColumnDefinition(type=pw.Type.FLOAT),\n        created_at=pw.ColumnDefinition(type=pw.Type.DATETIME)\n    )\n)\n```\n\nFor write operations, the PostgreSQL sink supports batch inserts and upserts, using primary key columns to determine insert versus update behavior. Connection pooling ensures efficient resource utilization under high throughput.\n\n### S3 Connector\n\nThe S3 connector provides access to Amazon S3 and S3-compatible object stores (including MinIO). 
It supports reading objects as binary data, CSV, or JSON, and can watch S3 prefixes for new objects in streaming mode.\n\n```python\nimport pathway as pw\n\n# S3 read\ndata = pw.io.s3.read(\n    bucket=\"my-bucket\",\n    prefix=\"input/data-\",\n    access_key=pw.io.s3.AwsAccessKeySecret(\"AWS_ACCESS_KEY\"),\n    secret_key=pw.io.s3.AwsSecretKeySecret(\"AWS_SECRET_KEY\"),\n    region=\"us-east-1\",\n    format=\"csv\",\n    mode=pw.ConnectorMode.STREAMING\n)\n```\n\nThe connector supports AWS IAM roles, instance profiles, and explicit credentials. It handles retries automatically for transient network failures and implements efficient listing with pagination for buckets with many objects.\n\n资料来源：[examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n\n### HTTP Connector\n\nPathway includes HTTP-based connectors for both receiving data via HTTP servers and making outbound HTTP requests. The HTTP input connector starts a web server that accepts incoming data, while the HTTP output connector can send computed results to external webhooks.\n\n```python\nimport pathway as pw\n\n# HTTP input server\ninput_table = pw.io.http.read(\n    host=\"0.0.0.0\",\n    port=8080,\n    schema=pw.Schema(\n        request_id=pw.ColumnDefinition(type=pw.Type.STR),\n        data=pw.ColumnDefinition(type=pw.Type.STR)\n    ),\n    mode=pw.ConnectorMode.STREAMING\n)\n\n# HTTP output\npw.io.http.write(\n    table=result_table,\n    url=\"https://api.example.com/webhook\",\n    api_key=pw.io.http.ApiKeySecret(\"WEBHOOK_API_KEY\")\n)\n```\n\nThe HTTP input connector supports authentication via API keys and basic authentication. 
It processes incoming POST requests with JSON payloads and automatically handles concurrent requests with thread-safe processing.\n\n资料来源：[examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md)\n\n### SharePoint Connector\n\nPathway provides specialized connectors for Microsoft SharePoint that enable reading files from SharePoint document libraries and sites. The connector supports authenticated access using Azure AD application credentials with certificate-based authentication.\n\n```python\nimport pathway as pw\n\n# SharePoint read\ndocuments = pw.io.sharepoint.read(\n    url=\"https://company.sharepoint.com/sites/project\",\n    root_path=\"/documents\",\n    tenant=\"tenant-id\",\n    client_id=\"app-client-id\",\n    thumbprint=\"cert-thumbprint\",\n    cert_path=\"/path/to/certificate.pem\",\n    mode=pw.ConnectorMode.STREAMING\n)\n```\n\nThe connector indexes all files in the specified SharePoint location, monitoring for additions and modifications. It supports various file formats and can extract content for downstream processing.\n\n资料来源：[examples/projects/sharepoint-test/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/sharepoint-test/README.md)\n\n## Output Connectors\n\n### CSV Writer\n\nThe CSV writer persists Pathway tables to CSV files with configurable delimiters, quoting behavior, and header options. In streaming mode, the writer can append to files or create new files based on time windows.\n\n```python\nimport pathway as pw\n\npw.io.csv.write(\n    table=result_table,\n    path=\"./output/results.csv\",\n    include_header=True,\n    separator=\",\"\n)\n```\n\n### Kafka Writer\n\nThe Kafka writer sends Pathway table updates to Kafka topics. 
Each row update is serialized as a message with configurable key extraction for partition routing.\n\n```python\npw.io.kafka.write(\n    table=result_table,\n    brokers=[\"localhost:9092\"],\n    topic=\"output-events\",\n    format=\"json\",\n    key_column=\"event_id\"\n)\n```\n\n### Database Writers\n\nPathway supports writing to PostgreSQL and other databases with upsert semantics. Writers automatically batch updates for efficiency and handle retries for transient failures.\n\n```python\npw.io.postgres.write(\n    table=result_table,\n    host=\"localhost\",\n    port=5432,\n    database=\"output_db\",\n    table_name=\"results\",\n    user=\"writer_user\",\n    password=pw.io.postgres.PasswordSecret(\"PG_PASSWORD\"),\n    upsert=True\n)\n```\n\n## Schema Definition\n\nPathway uses explicit schema definitions to map external data formats to typed columns. Schemas define column names, types, and metadata that guide parsing and validation.\n\n```python\nimport pathway as pw\n\nclass InputSchema(pw.Schema):\n    id: int\n    name: str\n    timestamp: pw.DateTimeUtc\n    metadata: dict\n```\n\n| Schema Feature | Description |\n|----------------|-------------|\n| Column Types | INT, FLOAT, STR, BOOL, DATETIME, BYTES, JSON |\n| Primary Keys | Mark columns as primary keys for upsert operations |\n| Default Values | Provide fallback values for missing data |\n| Nested Types | Support for arrays and dictionaries |\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## Persistence and State Management\n\nConnectors integrate with Pathway's persistence system to maintain state across restarts. 
When a connector's subject tracks offsets, positions, or timestamps, this state is preserved in the persistence backend and restored on restart.\n\n```python\nimport pathway as pw\n\n# Persistence configuration for connector state\npersistence_config = pw.persistence.Config(\n    backend=pw.persistence.Backend.filesystem(\n        path=\"./persistence_storage/\"\n    )\n)\n\npw.run(\n    persistence_config=persistence_config\n)\n```\n\nThe persistence system ensures that streaming connectors resume from their last processed position, preventing data loss and duplicate processing during planned or unplanned shutdowns.\n\n资料来源：[examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n\n## Common Patterns\n\n### Multi-Source Aggregation\n\nPathway excels at combining data from multiple heterogeneous sources into unified computations. Connectors for each source produce tables that can be joined, unioned, and transformed together.\n\n```python\nimport pathway as pw\n\n# Combine CSV and Kafka sources\ncsv_data = pw.io.csv.read(path=\"./static-data/\", schema=CsvSchema)\nkafka_data = pw.io.kafka.read(\n    brokers=[\"localhost:9092\"],\n    topic=\"live-events\",\n    schema=KafkaSchema\n)\n\n# Join and process\nresult = csv_data.join(kafka_data, pw.left.id == pw.right.source_id).select(\n    id=pw.left.id,\n    static_info=pw.left.info,\n    live_update=pw.right.value\n)\n```\n\n### Live Document Indexing\n\nA common pattern for RAG (Retrieval-Augmented Generation) applications involves continuous document indexing with real-time search capability. 
The HTTP connector combined with vector embedding creates a live knowledge base.\n\n```python\nimport pathway as pw\n\n# Document source with automatic reindexing\ndocs = pw.io.fs.read(\n    path=\"./documents/\",\n    format=\"binary\",\n    mode=pw.ConnectorMode.STREAMING\n)\n\n# Web server for queries\npw.io.http.write_server(\n    table=embedded_docs,\n    host=\"0.0.0.0\",\n    port=8011\n)\n```\n\n资料来源：[examples/projects/ag2-multiagent-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/ag2-multiagent-rag/README.md)\n\n## Error Handling\n\nConnectors provide mechanisms for handling errors during data processing. The engine tracks connector failures through the `failed()` method on exported tables, enabling pipelines to detect and respond to source or destination errors.\n\n```python\nexported = pw.io.csv.read(\n    path=\"./data/\",\n    schema=InputSchema\n)\n\n# Check for connector failures\nif exported.failed():\n    # Handle error - log, alert, or retry\n    pass\n```\n\nConnectors also support dead-letter patterns where invalid records are routed to separate tables for inspection and reprocessing, ensuring that malformed data does not halt the entire pipeline.\n\n## Security Considerations\n\nConnectors handle sensitive data and credentials with appropriate security measures. Secrets can be provided through environment variables, files, or secret management services. 
The connector API uses typed secret classes that prevent accidental logging of credentials.\n\n| Secret Type | Usage |\n|-------------|-------|\n| `PasswordSecret` | Database passwords |\n| `ApiKeySecret` | HTTP API keys |\n| `AwsAccessKeySecret` | AWS access keys |\n| `AwsSecretKeySecret` | AWS secret keys |\n\nPathway recommends using IAM roles, Azure managed identities, or Kubernetes secrets for production deployments rather than embedding credentials in code.\n\n## Monitoring and Observability\n\nPathway's monitoring dashboard tracks connector metrics including message counts, processing latency, and error rates. Each connector reports statistics independently, enabling fine-grained observability across complex pipelines.\n\n```python\npw.run(monitoring=True)\n```\n\nThe dashboard displays per-connector metrics that help identify bottlenecks and failures in data pipelines. Metrics are also exported to Prometheus-compatible endpoints for integration with existing observability infrastructure.\n\n资料来源：[README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n---\n\n<a id='page-connectors-custom'></a>\n\n## Custom Connectors\n\n### 相关页面\n\n相关主题：[Data Connectors](#page-connectors)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/internals/datasource.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/datasource.py)\n- [python/pathway/internals/datasink.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/internals/datasink.py)\n- [docs/2.developers/4.user-guide/20.connect/99.connectors/30.custom-python-connectors.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/20.connect/99.connectors/30.custom-python-connectors.md)\n- [examples/projects/custom-python-connector-twitter/twitter_connector_example.py](https://github.com/pathwaycom/pathway/blob/main/examples/projects/custom-python-connector-twitter/twitter_connector_example.py)\n</details>\n\n# 
Custom Connectors\n\nPathway provides a powerful extensibility mechanism through Custom Connectors, enabling developers to create bespoke data source and data sink implementations tailored to specific data providers or consumers. This capability allows seamless integration with proprietary systems, legacy data sources, or third-party APIs that are not covered by Pathway's built-in connector library.\n\n## Overview\n\nCustom Connectors bridge the gap between Pathway's Rust-powered computation engine and Python-based data sources or sinks. They leverage the `ConnectorSubject` base class for data sources and provide a callback-based mechanism where Python functions handle the actual data reading or writing logic while the Rust engine manages state, persistence, and incremental computation.\n\nThe architecture follows a clean separation of concerns:\n\n- **Python Layer**: Implements the actual data fetching/writing logic\n- **Rust Engine**: Handles data flow, state management, watermarking, and incremental updates\n\n## Architecture\n\n```mermaid\ngraph TD\n    subgraph \"Python User Code\"\n        A[Custom Connector Class] --> B[ConnectorSubject Subclass]\n        A --> C[read callback]\n        A --> D[seek callback]\n        A --> E[start callback]\n        A --> F[end callback]\n    end\n    \n    subgraph \"Pathway Engine (Rust)\"\n        G[PythonSubject Wrapper] --> H[PythonReader]\n        H --> I[State Manager]\n        I --> J[Incremental Computation]\n    end\n    \n    B --> G\n    C --> G\n    D --> G\n    E --> G\n    F --> G\n```\n\n## Core Components\n\n### PythonSubject\n\nThe `PythonSubject` class serves as the bridge between Python connector implementations and the Pathway Rust engine. 
It wraps several Python callbacks that define the connector's behavior.\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `start` | `Py<PyAny>` | Callback invoked when the connector begins processing |\n| `read` | `Py<PyAny>` | Callback to read data entries from the source |\n| `seek` | `Py<PyAny>` | Callback to seek to a specific position in the data stream |\n| `on_persisted_run` | `Py<PyAny>` | Callback triggered when persisted state is restored |\n| `end` | `Py<PyAny>` | Callback invoked when the connector finishes processing |\n| `is_internal` | `bool` | Flag indicating if this is an internal connector |\n| `deletions_enabled` | `bool` | Flag enabling deletion support |\n\n*资料来源：[src/python_api.rs:1-50](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)*\n\n### ConnectorMode\n\nPathway supports two distinct connector modes that determine data processing behavior:\n\n| Mode | Description |\n|------|-------------|\n| `STATIC` | Reads data once without continuous polling. Ideal for batch processing of finite datasets. |\n| `STREAMING` | Enables polling and continuous data monitoring. Supports real-time data ingestion with watermarks. |\n\n*资料来源：[src/connectors/data_storage.rs:1-20](https://github.com/pathwaycom/pathway/blob/main/src/connectors/data_storage.rs)*\n\nThe `is_polling_enabled()` method returns `false` for Static mode and `true` for Streaming mode, controlling whether the engine periodically invokes the connector's read callback.\n\n## Creating a Custom Data Source\n\n### Step 1: Subclass ConnectorSubject\n\nCreate a new class that extends the base `ConnectorSubject` class. 
This defines your connector's schema and behavior.\n\n```python\nimport pathway as pw\n\nclass MyCustomSourceSubject(pw.ConnectorSubject):\n    schema: pw.Schema = pw.schema_from_types(\n        id=str,\n        value=float,\n        timestamp=int,\n    )\n```\n\n### Step 2: Implement Required Callbacks\n\nThe following callbacks must be implemented to define data retrieval behavior:\n\n| Callback | Purpose |\n|----------|---------|\n| `start()` | Initialize connection and set up reader state |\n| `read()` | Return new data entries since last read |\n| `seek()` | Move to a specific position for replay |\n| `end()` | Clean up resources when processing completes |\n\n### Step 3: Configure the Connector\n\nPass your custom subject class to the connector along with mode and configuration:\n\n```python\nclass MyCustomSource(pw.io.KafkaSource):\n    def __init__(self, subject, topics, start_at_timestamp=0):\n        super().__init__(\n            subject=subject,\n            topics=topics,\n            start_at_timestamp=start_at_timestamp,\n        )\n```\n\n## Creating a Custom Data Sink\n\nCustom data sinks follow a similar pattern to sources but focus on outputting processed data. The datasink receives table updates and writes them to the target system.\n\n### Sink Architecture\n\n```mermaid\ngraph LR\n    A[Pathway Table] --> B[DataSink Handler]\n    B --> C[write callback]\n    C --> D[External System]\n```\n\n### Key Properties\n\n| Property | Description |\n|----------|-------------|\n| `table` | The Pathway table to subscribe to for updates |\n| `id_column` | Column used for primary key identification |\n| `username` | Authentication username (optional) |\n| `password` | Authentication password (optional) |\n| `host` | Target system hostname |\n| `port` | Target system port |\n\n## Twitter Connector Example\n\nA complete example demonstrating custom connector implementation is available in the repository. 
The Twitter connector shows how to wrap an external API as a Pathway data source.\n\n### Key Implementation Details\n\n```python\nclass TwitterSubject(pw.ConnectorSubject):\n    schema: pw.Schema = pw.schema_from_types(\n        id=str,\n        text=str,\n        author=str,\n        created_at=int,\n    )\n    \n    def __init__(self, api_key, api_secret, **kwargs):\n        self.api_key = api_key\n        self.api_secret = api_secret\n        super().__init__(**kwargs)\n    \n    def start(self):\n        self.twitter_api = connect_to_twitter(self.api_key, self.api_secret)\n        self.last_read_id = None\n    \n    def read(self):\n        new_tweets = self.twitter_api.get_new_tweets(since_id=self.last_read_id)\n        self.last_read_id = max(t.id for t in new_tweets)\n        return [(t.id, t.text, t.author, t.created_at) for t in new_tweets]\n```\n\n*资料来源：[examples/projects/custom-python-connector-twitter/twitter_connector_example.py](https://github.com/pathwaycom/pathway/blob/main/examples/projects/custom-python-connector-twitter/twitter_connector_example.py)*\n\n## ValueField Schema Definition\n\nWhen defining connector schemas, `ValueField` provides fine-grained control over column properties:\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `name` | `String` | Column identifier |\n| `type_` | `Type` | Data type (INT, FLOAT, STRING, etc.) 
|\n| `source` | `FieldSource` | Origin of the field (Payload, Time, Partition) |\n| `default` | `Option<Value>` | Default value if field is missing |\n| `metadata` | `Option<String>` | Optional metadata string |\n\n*资料来源：[src/python_api.rs:30-60](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)*\n\n## Integration with Pathway Engine\n\n### Reader Builder Pattern\n\nThe `PythonReaderBuilder` constructs reader instances that integrate with the engine:\n\n```rust\npub struct PythonReaderBuilder {\n    subject: Py<PythonSubject>,\n    schema: HashMap<String, Type>,\n}\n```\n\n*资料来源：[src/connectors/data_storage.rs:20-30](https://github.com/pathwaycom/pathway/blob/main/src/connectors/data_storage.rs)*\n\n### State Management\n\nThe `PythonReader` maintains internal state for incremental processing:\n\n| State Field | Purpose |\n|-------------|---------|\n| `total_entries_read` | Counter for statistics and debugging |\n| `current_external_offset` | Position marker for resumable reads |\n| `is_initialized` | Flag indicating startup completion |\n| `is_finished` | Flag indicating processing completion |\n| `python_thread_state` | GIL state for thread-safe Python calls |\n\n## Best Practices\n\n### Error Handling\n\nImplement robust error handling in callbacks. 
The connector should gracefully handle transient failures and support retry mechanisms.\n\n### Checkpointing\n\nFor long-running connectors, implement periodic checkpointing by tracking processed record identifiers or offsets in `seek()` callback state.\n\n### Resource Management\n\nEnsure resources (connections, file handles, API sessions) are properly initialized in `start()` and cleaned up in `end()`.\n\n### Performance Considerations\n\n- Batch data reads when possible to reduce callback overhead\n- Use appropriate polling intervals for streaming connectors\n- Monitor `total_entries_read` for capacity planning\n\n## See Also\n\n- [Pathway Connectors Documentation](https://pathway.com/developers/user-guide/connect/pathway-connectors)\n- [Built-in Connectors Reference](https://pathway.com/developers/api-docs/pathway)\n- [Persistence and State Management](https://pathway.com/developers/user-guide/persistence/)\n- [Real-time Data Processing Guide](https://pathway.com/developers/user-guide/introduction/welcome)\n\n---\n\n<a id='page-transformations'></a>\n\n## Data Transformations\n\n### 相关页面\n\n相关主题：[Temporal and Time-Based Processing](#page-temporal-processing)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs) (核心引擎接口)\n- [src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs) (时间戳处理)\n- [README.md](https://github.com/pathwaycom/pathway/blob/main/README.md) (项目概述)\n- [examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md) (YAML配置示例)\n- [examples/projects/question-answering-rag/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/README.md) (RAG管道示例)\n- 
[external/differential-dataflow/src/operators/arrange/agent.rs](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs) (Differential Dataflow操作)\n</details>\n\n# Data Transformations\n\nData Transformations are the core operational layer in Pathway, enabling you to manipulate, combine, filter, and aggregate data as it flows through your pipeline. Built on a scalable Rust engine powered by Differential Dataflow, Pathway performs incremental computation that efficiently handles both batch and streaming data with the same Python API.\n\n## Architecture Overview\n\nPathway's transformation engine operates on **Tables**, which are the fundamental data structures in the framework. Each table consists of:\n\n- **Columns**: Named data fields with typed values\n- **Keys**: Unique identifiers for rows\n- **Timestamps**: Time indicators for stream ordering\n- **Change Metadata**: Track additions, modifications, and retractions\n\n```mermaid\ngraph TD\n    A[Input Connectors] --> B[Table Creation]\n    B --> C[Transformations]\n    C --> D[Aggregations]\n    C --> E[Joins]\n    C --> F[Filters & Mappings]\n    D --> G[Output Connectors]\n    E --> G\n    F --> G\n    G --> H[Results / Persistence]\n    \n    subgraph Rust Engine\n        C\n        D\n        E\n        F\n    end\n```\n\n资料来源：[README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n## Table Operations\n\nTables are immutable once created in Pathway. 
All data transformations produce new tables rather than modifying existing ones, ensuring data consistency and enabling proper change tracking.\n\n### Core Table Structure\n\nFrom the Rust engine perspective, tables are represented as:\n\n| Component | Description |\n|-----------|-------------|\n| `universe` | A unique identifier space for table rows |\n| `columns` | Collection of typed data columns |\n| `handle` | Internal reference for engine operations |\n\n资料来源：[src/python_api.rs:1-100](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n### Supported Data Types\n\nPathway supports a variety of primitive and complex types:\n\n| Category | Types |\n|----------|-------|\n| Primitives | `int`, `float`, `str`, `bool` |\n| Temporal | `datetime`, `date`, `time` |\n| Complex | `bytes`, `JSON`, `pointer` |\n\n### Filtering\n\nFiltering operations select subsets of rows based on predicates:\n\n```python\nfiltered_table = source_table.filter(source_table.column > threshold)\n```\n\nThe filter operation:\n1. Evaluates the predicate for each row\n2. Produces a new table with only matching rows\n3. 
Propagates timestamps from the source\n\n### Column Operations\n\nAdd, rename, or compute new columns:\n\n```python\n# Add computed column\nresult = table.select(\n    table.id,\n    table.value,\n    total=table.value * table.rate\n)\n\n# Rename columns\nrenamed = table.select(\n    new_name=table.old_name,\n    another=table.other\n)\n```\n\n## Join Operations\n\nPathway provides multiple join strategies for combining tables, all powered by the underlying Differential Dataflow computation model.\n\n```mermaid\ngraph LR\n    A[Table A] --> C[Join Operation]\n    B[Table B] --> C\n    C --> D[Result Table]\n    \n    D --> E[Inner Join<br/>Rows with matches in both]\n    D --> F[Left Join<br/>All rows from A]\n    D --> G[Outer Join<br/>All rows from both]\n```\n\n### Join Types\n\n| Join Type | Behavior |\n|-----------|----------|\n| Inner Join | Returns rows with matches in both tables |\n| Left Join | All rows from left table, matched from right |\n| Outer Join | All rows from both tables |\n| Cross Join | Cartesian product with optional filtering |\n\n### Join Syntax\n\n```python\nresult = left_table.join(\n    right_table,\n    left_table.key == right_table.key\n).select(\n    left_table.star,\n    right_table.data\n)\n```\n\n### Incremental Join Behavior\n\nThe Rust engine handles joins incrementally:\n\n```\nsrc/python_api.rs: Table implementation\n├── handle: TableHandle (internal engine reference)\n├── scope: Py<Scope> (computation scope)\n└── Methods for engine-level join operations\n```\n\n资料来源：[src/python_api.rs:200-300](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## Aggregation and Grouping\n\nPathway's aggregation system uses Differential Dataflow's efficient incremental computation model.\n\n### GroupBy Operations\n\nGroup data by one or more columns:\n\n```python\ngrouped = table.groupby(table.category)\n```\n\n### Built-in Reducers\n\n| Reducer | Description | Example |\n|---------|-------------|---------|\n| `count` | 
Number of rows | `t.groupby(k).reduce(count=pw.reducers.count())` |\n| `sum` | Sum of values | `t.groupby(k).reduce(total=pw.reducers.sum(t.val))` |\n| `avg` | Average value | `t.groupby(k).reduce(avg=pw.reducers.avg(t.val))` |\n| `min` | Minimum value | `t.groupby(k).reduce(min=pw.reducers.min(t.val))` |\n| `max` | Maximum value | `t.groupby(k).reduce(max=pw.reducers.max(t.val))` |\n| `collect` | Collect all values | `t.groupby(k).reduce(all=pw.reducers.collect(t.val))` |\n\n### Custom Aggregations\n\nDefine custom aggregation logic:\n\n```python\nresult = table.groupby(table.key).reduce(\n    key=table.key,\n    custom_agg=custom_reducer(table.values)\n)\n```\n\n## Temporal Operations\n\nPathway handles temporal data with a sophisticated timestamp model that supports streaming and batch scenarios.\n\n### Timestamp Semantics\n\n```mermaid\ngraph TD\n    T1[Timestamp 1] --> T2[Timestamp 2]\n    T2 --> T3[Timestamp 3]\n    T3 --> T4[Timestamp N]\n    \n    subgraph Original Values\n        T1\n        T3\n    end\n    \n    subgraph Retractions\n        T2\n        T4\n    end\n```\n\nThe timestamp system distinguishes between:\n- **Original values** (even timestamps)\n- **Retractions** (odd timestamps)\n\n资料来源：[src/engine/timestamp.rs:1-50](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n\n### Time-based Operations\n\n```python\n# Window by time\nresult = table.windowby(table.timestamp).reduce(\n    time_bucket=table.timestamp.dt.floor(\"1h\"),\n    value=pw.reducers.sum(table.amount)\n)\n```\n\n## SQL-style Processing\n\nPathway supports SQL-like query processing for complex transformations:\n\n```python\nresult = pw.sql.processing(\n    \"\"\"\n    SELECT \n        category,\n        SUM(value) as total,\n        AVG(value) as average\n    FROM source_table\n    GROUP BY category\n    HAVING SUM(value) > 100\n    \"\"\"\n)\n```\n\n### Supported SQL Features\n\n| Feature | Support |\n|---------|---------|\n| SELECT | Full projection 
support |\n| WHERE | Filter conditions |\n| GROUP BY | Aggregation grouping |\n| HAVING | Post-aggregation filters |\n| JOIN | Table combinations |\n| Subqueries | Nested queries |\n\n## Differential Dataflow Foundation\n\nPathway's transformation engine is built on **Differential Dataflow**, an extension of Timely Dataflow that efficiently handles incremental updates.\n\n```mermaid\ngraph LR\n    subgraph Timely Dataflow\n        A[Operators] --> B[Scope]\n        B --> C[Channels]\n    end\n    \n    subgraph Differential Dataflow\n        D[Arrangements] --> E[Collections]\n        E --> F[Traces]\n        D --> F\n    end\n    \n    G[Pathway Python API] --> H[Rust Engine]\n    H --> I[Differential Dataflow]\n```\n\n### Key Properties\n\n| Property | Benefit |\n|----------|---------|\n| Incremental | Only recomputes changed portions |\n| Monotonic | Handles retractions naturally |\n| Composable | Complex pipelines from simple operators |\n| Parallel | Scales across threads automatically |\n\n资料来源：[external/differential-dataflow/src/operators/arrange/agent.rs:1-50](https://github.com/pathwaycom/pathway/blob/main/external/differential-dataflow/src/operators/arrange/agent.rs)\n\n## Computation Model\n\nPathway's computation follows an **append-only** model where:\n\n1. **Input**: Data enters via connectors\n2. **Transform**: Operations produce new tables\n3. **Output**: Results flow to sinks\n4. 
**Persistence**: Optional state storage for recovery\n\n```mermaid\ngraph TD\n    A[Connectors] -->|Input Data| B[Engine]\n    B --> C{Transformations}\n    C --> D[Table Operations]\n    C --> E[Joins]\n    C --> F[Aggregations]\n    D --> G[New Tables]\n    E --> G\n    F --> G\n    G --> H[Output Connectors]\n    G --> I[Persistence Layer]\n```\n\n### Running the Pipeline\n\n```python\nimport pathway as pw\n\n# Define pipeline\ntable = pw.io.fs.read(\"./input\", schema=Schema)\nresult = table.groupby(table.key).reduce(count=pw.reducers.count())\n\n# Configure output\npw.io.json_lines.write(result, \"./output\")\n\n# Execute\npw.run()\n```\n\n资料来源：[README.md](https://github.com/pathwaycom/pathway/blob/main/README.md)\n\n## Configuration and YAML Integration\n\nPathway supports declarative pipeline definition via YAML:\n\n```yaml\n# Data source\nsource: !pw.io.csv.read\n  path: ./input.csv\n  schema: $schema\n\n# Transformations\nprocessed: !pw.Transformations.groupby\n  table: $source\n  by: [category]\n  reducers:\n    - !pw.reducers.sum\n      column: amount\n\n# Output\noutput: !pw.io.csv.write\n  table: $processed\n  path: ./output.csv\n```\n\n资料来源：[examples/templates/el-pipeline/README.md](https://github.com/pathwaycom/pathway/blob/main/examples/templates/el-pipeline/README.md)\n\n## Error Handling\n\nThe engine tracks and propagates errors through the computation graph:\n\n```mermaid\ngraph TD\n    A[Operation] -->|Success| B[Next Operation]\n    A -->|Error| C[ErrorLog]\n    C -->|Recorded| D[Monitoring]\n    D -->|Notification| E[Alert System]\n```\n\n```python\n# Access error information\nerror_log = table.error_history()\n```\n\n资料来源：[src/python_api.rs:300-400](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)\n\n## Performance Characteristics\n\n### Threading\n\nPathway natively supports multithreading:\n\n```bash\n$ pathway spawn --threads 3 python main.py\n```\n\n### Memory Efficiency\n\n| Strategy | Description 
|\n|----------|-------------|\n| Incremental | Only changed data is reprocessed |\n| Arrangements | Pre-indexed data structures for fast joins |\n| Persistence | Checkpointing to manage memory |\n\n### Scaling\n\nPathway pipelines scale from:\n- Local development (single thread)\n- Multi-threaded processing\n- Distributed deployment via containerization\n\n## See Also\n\n- [Input Connectors](https://pathway.com/developers/user-guide/connect/pathway-connectors) - Data ingestion methods\n- [Output Connectors](https://pathway.com/developers/user-guide/connect/pathway-connectors) - Data delivery methods\n- [Persistence Guide](https://pathway.com/developers/user-guide/persistence/overview) - State management\n- [RAG Pipeline Example](https://github.com/pathwaycom/pathway/blob/main/examples/projects/question-answering-rag/) - End-to-end transformation example\n\n---\n\n<a id='page-temporal-processing'></a>\n\n## Temporal and Time-Based Processing\n\n### 相关页面\n\n相关主题：[Data Transformations](#page-transformations)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/stdlib/temporal/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/__init__.py)\n- [python/pathway/stdlib/temporal/_window.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_window.py)\n- [python/pathway/stdlib/temporal/_asof_join.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_asof_join.py)\n- [python/pathway/stdlib/temporal/_interval_join.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_interval_join.py)\n- [docs/2.developers/4.user-guide/40.temporal-data/10.windows-manual.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/40.temporal-data/10.windows-manual.md)\n- 
[docs/2.developers/4.user-guide/40.temporal-data/40.asof-join.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/40.temporal-data/40.asof-join.md)\n</details>\n\n# Temporal and Time-Based Processing\n\n## Overview\n\nPathway's temporal and time-based processing capabilities enable sophisticated stream analytics by leveraging the underlying Differential Dataflow computation model. The framework provides robust mechanisms for windowing operations, temporal joins, and time-based aggregations that work seamlessly across both batch and streaming data scenarios.\n\nPathway processes data with timestamps, maintaining the temporal order and allowing computations to reference historical data at specific time points. The Rust engine manages time progression through frontiers—boundary markers that indicate which timestamps have been fully processed—enabling consistent and deterministic results regardless of data arrival order.\n\n资料来源：[src/python_api.rs](https://github.com/pathwaycom/pathway/blob/main/src/python_api.rs)  \n资料来源：[src/engine/timestamp.rs](https://github.com/pathwaycom/pathway/blob/main/src/engine/timestamp.rs)\n\n## Core Concepts\n\n### Timestamps and Time Model\n\nPathway operates on timestamped data where each record carries a timestamp indicating when the event occurred or when it should be processed. The timestamp model supports both event time and processing time semantics.\n\n```python\n# Timestamps are automatically tracked by Pathway\n# Each data record carries temporal information\nresult = table.select(\n    col=table.value,\n    time=table._time_attribute  # System-managed timestamp\n)\n```\n\n### Frontiers and Time Progress\n\nFrontiers in Pathway represent the boundary between processed and unprocessed data. 
The system tracks two critical frontiers:\n\n| Frontier Type | Description |\n|--------------|-------------|\n| Input Frontier | Earliest incomplete input timestamp |\n| Output Frontier | Latest timestamp for which all computations are complete |\n\nThe capability system (borrowed from Timely Dataflow) allows operators to hold references to specific timestamps, ensuring computations only execute when their input data is available.\n\n资料来源：[external/timely-dataflow/timely/src/dataflow/operators/capability.rs](https://github.com/pathwaycom/pathway/blob/main/external/timely-dataflow/timely/src/dataflow/operators/capability.rs)\n\n## Windowing Operations\n\nPathway provides comprehensive windowing capabilities through the `pathway.stdlib.temporal` module. Windows group data based on temporal criteria, enabling aggregations, joins, and transformations over bounded time intervals.\n\n```mermaid\ngraph TD\n    A[Input Stream] --> B[Window Assigner]\n    B --> C[Tumbling Windows]\n    B --> D[Sliding Windows]\n    B --> E[Session Windows]\n    C --> F[Window 1]\n    C --> G[Window 2]\n    D --> H[Overlapping Windows]\n    E --> I[Session 1]\n    E --> J[Session 2]\n```\n\n### Tumbling Windows\n\nTumbling windows are fixed-size, non-overlapping time windows that partition the data stream into discrete, consecutive segments. 
Each event belongs to exactly one window.\n\n```python\nfrom pathway.stdlib.temporal import tumbling_window\n\n# Create tumbling windows of 1 hour\nresult = table.window.tumbling(\n    window_length=timedelta(hours=1),\n    timestamp=table.event_time\n).reduce(\n    count=tools.count(),\n    sum_value=tools.sum(table.value)\n)\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `window_length` | `timedelta` | Duration of each window |\n| `timestamp` | `ColumnExpression` | Column containing event timestamps |\n| `session_gap` | `timedelta` | Optional gap between sessions |\n\n### Sliding Windows\n\nSliding windows move continuously over the data stream with a fixed size and slide interval. Unlike tumbling windows, consecutive windows can overlap.\n\n```python\nfrom pathway.stdlib.temporal import sliding_window\n\n# Create sliding windows of 5 minutes, sliding every 1 minute\nresult = table.window.sliding(\n    window_length=timedelta(minutes=5),\n    slide=timedelta(minutes=1),\n    timestamp=table.event_time\n).reduce(\n    avg_value=tools.aggregate.avg(table.value)\n)\n```\n\n### Session Windows\n\nSession windows detect periods of activity separated by gaps of inactivity. 
They automatically expand as new events arrive within the gap threshold.\n\n```python\nfrom pathway.stdlib.temporal import session_window\n\n# Session windows with 30-minute inactivity gap\nresult = table.window.session(\n    gap=timedelta(minutes=30),\n    timestamp=table.event_time\n).reduce(\n    session_duration=tools.count(),\n    total_amount=tools.sum(table.amount)\n)\n```\n\n资料来源：[python/pathway/stdlib/temporal/_window.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_window.py)  \n资料来源：[docs/2.developers/4.user-guide/40.temporal-data/10.windows-manual.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/40.temporal-data/10.windows-manual.md)\n\n## Temporal Joins\n\n### ASOF Join\n\nASOF (As-Of) join is designed for time-series data where you want to match records from two tables based on the closest timestamp without exceeding a specified threshold. This is particularly useful for correlating events that occur at approximately the same time.\n\n```python\nfrom pathway.stdlib.temporal import asof_join\n\n# Match trades with the most recent quote before each trade\nresult = trades.join(\n    quotes,\n    trades.timestamp >= quotes.timestamp,\n    trades.timestamp < quotes.timestamp + timedelta(seconds=10),\n    selectors={\n        trades.price: \"trade_price\",\n        quotes.price: \"quote_price\",\n    }\n).select(\n    trade_price=result.trade_price,\n    quote_price=result.quote_price,\n    time_diff=result.trades_timestamp - result.quotes_timestamp\n)\n```\n\n**Configuration Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `time_distance` | `timedelta` | Maximum time distance between matched records |\n| `closed_window` | `bool` | Include boundary timestamps in matches |\n| `behavior` | `str` | `\"all\"`, `\"emit_per_right\"`, or `\"emit_per_left\"` |\n| `index` | `Table` | Optional reference table for index-based matching |\n\nThe ASOF join 
supports multiple matching strategies:\n\n```python\n# Emit one result per left record (default)\nresult = left.asof_join(\n    right,\n    left.time >= right.time,\n    left.time < right.time + time_distance\n)\n\n# Emit one result per right record\nresult = left.asof_join(\n    right,\n    left.time >= right.time,\n    left.time < right.time + time_distance,\n    behavior=ASOF_join_behavior.EMIT_PER_RIGHT\n)\n```\n\n资料来源：[python/pathway/stdlib/temporal/_asof_join.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_asof_join.py)  \n资料来源：[docs/2.developers/4.user-guide/40.temporal-data/40.asof-join.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/40.temporal-data/40.asof-join.md)\n\n### Interval Join\n\nInterval join matches records from two tables where the timestamps fall within a specified time interval relative to each other. Unlike ASOF join which finds the closest match, interval join produces all pairs where timestamps satisfy the interval condition.\n\n```python\nfrom pathway.stdlib.temporal import interval_join\n\n# Match orders with shipments that arrive within 1-5 days after ordering\nresult = orders.join(\n    shipments,\n    orders.order_date <= shipments.ship_date,\n    orders.order_date + timedelta(days=5) >= shipments.ship_date,\n    orders.order_date + timedelta(days=1) <= shipments.ship_date,\n    selectors={\n        orders.order_id: \"order_id\",\n        orders.order_date: \"order_date\",\n        shipments.ship_date: \"ship_date\",\n        shipments.tracking: \"tracking_number\"\n    }\n).select(\n    order_id=result.order_id,\n    order_date=result.order_date,\n    ship_date=result.ship_date,\n    delay=result.ship_date - result.order_date\n)\n```\n\n**Key Differences from ASOF Join:**\n\n| Aspect | ASOF Join | Interval Join |\n|--------|----------|---------------|\n| Match Cardinality | One-to-one (closest) | One-to-many |\n| Timestamp Condition | Single bound 
(closest) | Interval range |\n| Use Case | Time-series alignment | Event correlation |\n| Performance | Optimized for single match | May produce more results |\n\n资料来源：[python/pathway/stdlib/temporal/_interval_join.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/_interval_join.py)\n\n## Windowed Aggregations\n\nPathway supports aggregations over windowed data with a rich set of built-in functions:\n\n```python\nfrom pathway.stdlib.temporal import window\nfrom pathway.stdlib import tools\n\n# Comprehensive windowed aggregation\nresult = sensor_data.window.tumbling(\n    window_length=timedelta(minutes=15)\n).reduce(\n    # Count aggregations\n    event_count=tools.count(),\n    distinct_sensors=tools.count(distinct=True, by=sensor_data.sensor_id),\n    \n    # Numeric aggregations\n    avg_temperature=tools.avg(sensor_data.temperature),\n    max_reading=tools.max(sensor_data.value),\n    min_reading=tools.min(sensor_data.value),\n    total_value=tools.sum(sensor_data.value),\n    variance_pop=tools.var_pop(sensor_data.value),\n    \n    # Collection aggregations\n    all_readings=tools.collect(sensor_data.value),\n    std_dev=tools.stddev(sensor_data.value)\n)\n```\n\n**Available Aggregation Functions:**\n\n| Function | Description | Applicable Types |\n|----------|-------------|------------------|\n| `count()` | Number of records | All |\n| `sum(column)` | Sum of values | Numeric |\n| `avg(column)` | Arithmetic mean | Numeric |\n| `min(column)` | Minimum value | Comparable |\n| `max(column)` | Maximum value | Comparable |\n| `var_pop(column)` | Population variance | Numeric |\n| `stddev(column)` | Standard deviation | Numeric |\n| `collect(column)` | Collect all values | All |\n| `any(column)` | Return any value | All |\n\n## Time Evolution and State Management\n\nPathway maintains state across time using differential dataflow's incremental computation model. 
The persistence layer tracks state changes and enables recovery from checkpoints.\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Timestamp Assignment]\n    B --> C[Differential Update]\n    C --> D[State Accumulation]\n    D --> E[Frontier Advancement]\n    E --> F[Output Production]\n    E -->|State Checkpoint| G[Persistence Backend]\n    G -->|Recovery| D\n```\n\n### Checkpoint and Recovery\n\nThe persistence system automatically snapshots state at frontier boundaries:\n\n```python\nimport pathway as pw\n\nclass MyApp(pw.Application):\n    # Enable persistence with automatic snapshots\n    persistence_config = pw.PersistenceConfig(\n        backend=pw.persistence.Backends.filesystem(\"./data\"),\n        snapshot_interval=timedelta(minutes=5)\n    )\n    \n    # Tables automatically benefit from persistence\n    @pw.operator\n    def my_operator(self, table):\n        return table.groupby(table.category).reduce(\n            total=pw.reducers.sum(table.value)\n        )\n```\n\n资料来源：[src/persistence/tracker.rs](https://github.com/pathwaycom/pathway/blob/main/src/persistence/tracker.rs)\n\n## API Reference\n\n### Module: pathway.stdlib.temporal\n\n#### Window Creation\n\n```python\npathway.stdlib.temporal.tumbling_window(\n    table,\n    time_column,\n    window_length,\n    hop=None,\n    name=None\n)\n```\n\n#### Join Operations\n\n```python\npathway.stdlib.temporal.asof_join(\n    left_table,\n    right_table,\n    left_time,\n    right_time,\n    time_distance,\n    behavior=ASOF_join_behavior.EMIT_PER_RIGHT,\n    selectors=None\n)\n\npathway.stdlib.temporal.interval_join(\n    left_table,\n    right_table,\n    left_time_lower,\n    left_time_upper,\n    right_time_lower,\n    right_time_upper,\n    selectors=None\n)\n```\n\n### Enum: ASOF_join_behavior\n\n| Value | Description |\n|-------|-------------|\n| `EMIT_PER_RIGHT` | Emit one result per right record matched |\n| `EMIT_PER_LEFT` | Emit one result per left record matched |\n| `EMIT_ALL` | Emit all 
possible matches |\n\n## Best Practices\n\n### Timestamp Column Selection\n\nChoose timestamp columns carefully:\n\n```python\n# Prefer event-time columns for windowing\ntable = pw.Table.read(\n    data,\n    id_column=\"event_id\",\n    time_column=\"event_timestamp\"  # Event time, not processing time\n)\n```\n\n### Performance Considerations\n\n1. **Window Size Tuning**: Smaller windows reduce memory footprint but increase computation frequency\n2. **Session Gap Configuration**: Set gaps based on expected user behavior patterns\n3. **Join Cardinality**: Monitor interval join result sizes to avoid unbounded growth\n4. **Late Data Handling**: Configure watermarks appropriately for your data arrival patterns\n\n### Handling Out-of-Order Data\n\n```python\n# Configure watermark tolerance for late events\nresult = table.window.tumbling(\n    window_length=timedelta(hours=1),\n    timestamp=table.event_time,\n    watermark=timedelta(minutes=5)  # Accept events up to 5 min late\n).reduce(\n    count=tools.count()\n)\n```\n\n## Conclusion\n\nPathway's temporal and time-based processing capabilities provide a powerful foundation for building real-time analytics applications. 
The combination of flexible windowing, specialized temporal joins, and the underlying differential dataflow engine enables processing of streaming data with the same ease as batch processing, while maintaining correctness and determinism across restarts and scaling events.\n\n资料来源：[python/pathway/stdlib/temporal/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/stdlib/temporal/__init__.py)\n\n---\n\n<a id='page-llm-xpack'></a>\n\n## LLM xPack\n\n### 相关页面\n\n相关主题：[Data Connectors](#page-connectors)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [python/pathway/xpacks/llm/__init__.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/__init__.py)\n- [python/pathway/xpacks/llm/llms.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/llms.py)\n- [python/pathway/xpacks/llm/embedders.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/embedders.py)\n- [python/pathway/xpacks/llm/parsers.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/parsers.py)\n- [python/pathway/xpacks/llm/splitters.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/splitters.py)\n- [python/pathway/xpacks/llm/rerankers.py](https://github.com/pathwaycom/pathway/blob/main/python/pathway/xpacks/llm/rerankers.py)\n- [docs/2.developers/4.user-guide/50.llm-xpack/10.overview.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/50.llm-xpack/10.overview.md)\n- [docs/2.developers/4.user-guide/50.llm-xpack/70.embedders.md](https://github.com/pathwaycom/pathway/blob/main/docs/2.developers/4.user-guide/50.llm-xpack/70.embedders.md)\n</details>\n\n# LLM xPack\n\nThe LLM xPack is a specialized extension module within Pathway that provides comprehensive integration capabilities for Large Language Models (LLMs), embedding models, and retrieval-augmented generation (RAG) workflows. 
This module enables developers to build production-ready LLM pipelines with real-time data processing capabilities.\n\n## Overview\n\nPathway's LLM xPack bridges the gap between traditional ETL stream processing and modern AI-powered applications. It provides Python-native interfaces for interacting with various LLM providers, embedding services, and RAG components while maintaining Pathway's core strengths in handling streaming and batch data with consistency guarantees.\n\nThe module is designed to work seamlessly with Pathway's Rust-powered engine, enabling high-performance inference and data transformation operations that can scale from local development to production deployments.\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Pathway Pipeline] --> B[LLM xPack]\n    B --> C[LLMs Module]\n    B --> D[Embedders Module]\n    B --> E[Rerankers Module]\n    B --> F[Splitters Module]\n    B --> G[Parsers Module]\n    \n    C --> H[OpenAI]\n    C --> I[Azure OpenAI]\n    C --> J[Anthropic]\n    C --> K[Google AI]\n    C --> L[Custom REST]\n    \n    D --> M[OpenAI Embeddings]\n    D --> N[HuggingFace Embeddings]\n    D --> O[Ollama]\n    D --> P[Azure OpenAI Embeddings]\n    \n    H --> Q[Responses]\n    I --> Q\n    J --> Q\n    K --> Q\n    L --> Q\n```\n\n## Module Structure\n\n| Module | Purpose | Primary Classes/Functions |\n|--------|---------|---------------------------|\n| `llms.py` | LLM provider integration | `OpenAILanguageModel`, `AzureOpenAILanguageModel`, `AnthropicLanguageModel` |\n| `embedders.py` | Text embedding generation | `OpenAIEmbedder`, `HuggingFaceEmbedder`, `OllamaEmbedder` |\n| `rerankers.py` | Document reranking for RAG | Cross-encoder based reranking |\n| `splitters.py` | Text document splitting | `TokenSplitter`, `SentenceSplitter` |\n| `parsers.py` | Output parsing and extraction | Response parsing utilities |\n\n## LLMs Module\n\nThe LLMs module provides unified interfaces for interacting with various language model providers. 
All implementations follow a common base interface ensuring portability across different backends.\n\n### Supported Providers\n\n```mermaid\ngraph LR\n    A[Developer Code] --> B[LLM Wrapper Layer]\n    B --> C[OpenAI]\n    B --> D[Azure OpenAI]\n    B --> E[Anthropic]\n    B --> F[Google AI]\n    B --> G[Custom API]\n```\n\n### Base Interface\n\nThe LLM wrappers implement a common pattern where prompts are processed and responses are returned as Pathway-compatible data structures. This enables seamless integration with Pathway's table operations.\n\n```python\n# Conceptual interface pattern\nclass BaseLanguageModel:\n    def __call__(self, prompt: str, **kwargs) -> LLMResponse:\n        ...\n```\n\n### Provider-Specific Configurations\n\n| Provider | Authentication | API Version | Model Selection |\n|----------|---------------|-------------|-----------------|\n| OpenAI | API Key | Latest | Via model parameter |\n| Azure OpenAI | API Key + Endpoint | 2024-02-01 | Deployment name |\n| Anthropic | API Key | Latest | claude-3 family |\n| Google AI | API Key | Latest | Gemini models |\n\n## Embedders Module\n\nEmbedders generate vector representations of text for semantic search and similarity matching. 
The module supports multiple embedding providers with consistent interfaces.\n\n### Supported Embedding Services\n\n| Embedder | Provider | Dimension Support | Batch Processing |\n|----------|----------|-------------------|------------------|\n| `OpenAIEmbedder` | OpenAI | 1536, 3072 | Yes |\n| `HuggingFaceEmbedder` | HuggingFace Inference API | Model-dependent | Yes |\n| `OllamaEmbedder` | Local Ollama server | Model-dependent | Yes |\n| `AzureOpenAIEmbedder` | Azure OpenAI | 1536, 3072 | Yes |\n\n### Embedding Workflow\n\n```mermaid\ngraph TD\n    A[Input Text] --> B[Preprocessing]\n    B --> C[Tokenization]\n    C --> D[API Call / Local Inference]\n    D --> E[Vector Embedding]\n    E --> F[Normalized Output]\n```\n\n### Configuration Parameters\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `model` | str | Required | Embedding model identifier |\n| `api_key` | str | Environment | Authentication key |\n| `batch_size` | int | 32 | Documents per batch |\n| `timeout` | float | 60.0 | Request timeout in seconds |\n| `max_retries` | int | 3 | Retry attempts on failure |\n\n## Rerankers Module\n\nRerankers improve retrieval quality by reordering initial search results using cross-encoder models. This is particularly valuable in RAG pipelines where initial vector search may return partially relevant results.\n\n### Reranking Process\n\n```mermaid\ngraph LR\n    A[Query] --> B[Initial Retrieval<br/>Top-K Results]\n    B --> C[Cross-Encoder Scorer]\n    C --> D[Reordered Results]\n    D --> E[Final Output]\n    \n    F[Retrieved Documents] --> C\n```\n\nThe reranking module uses cross-encoder models to score query-document pairs, returning results ordered by relevance score.\n\n## Splitters Module\n\nDocument splitters divide large texts into smaller chunks suitable for embedding and retrieval. 
Proper chunking is critical for RAG pipeline effectiveness.\n\n### Splitting Strategies\n\n| Strategy | Description | Use Case |\n|----------|-------------|----------|\n| `TokenSplitter` | Split by token count | Fixed context windows |\n| `SentenceSplitter` | Split at sentence boundaries | Natural language coherence |\n| `RecursiveSplitter` | Hierarchical splitting | Complex documents |\n\n### Configuration\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `chunk_size` | int | Target chunk size |\n| `chunk_overlap` | int | Overlap between chunks |\n| `separator` | str | Splitting delimiter |\n\n## Parsers Module\n\nThe parsers module provides utilities for extracting structured information from LLM responses. This is essential for turning raw model outputs into usable data.\n\n### Supported Parsing Patterns\n\n- JSON extraction from markdown-formatted responses\n- Structured output parsing with Pydantic models\n- Template-based response extraction\n\n## RAG Pipeline Integration\n\nThe LLM xPack is designed to compose into complete RAG pipelines within Pathway:\n\n```mermaid\ngraph TD\n    A[Data Sources] --> B[Pathway Connectors]\n    B --> C[Document Splitter]\n    C --> D[Embedder]\n    D --> E[Vector Store / Index]\n    E --> F[Query Input]\n    F --> G[Similarity Search]\n    G --> H[Reranker]\n    H --> I[LLM Prompt Assembly]\n    I --> J[LLM Inference]\n    J --> K[Response Parser]\n    K --> L[Structured Output]\n```\n\n## Usage Example\n\n```python\nimport pathway as pw\nfrom pathway.xpacks.llm import OpenAIEmbedder, OpenAILanguageModel\nfrom pathway.xpacks.llm.splitters import TokenSplitter\n\n# Configure embedder\nembedder = OpenAIEmbedder(model=\"text-embedding-3-small\")\n\n# Configure LLM\nllm = OpenAILanguageModel(model=\"gpt-4o\")\n\n# Document splitting\nsplitter = TokenSplitter(chunk_size=512, chunk_overlap=50)\n\n# Pipeline integration\nclass LLMQueryResult(pw.Result):\n    response: str\n    context: 
list[str]\n```\n\n## Error Handling and Resilience\n\nThe LLM xPack implements robust error handling for production deployments:\n\n- **Automatic retry** with exponential backoff for transient failures\n- **Timeout handling** to prevent pipeline stalls\n- **Rate limiting compliance** for API-based providers\n- **Graceful degradation** when optional components are unavailable\n\n## Performance Considerations\n\n| Aspect | Recommendation |\n|--------|----------------|\n| Batch Processing | Enable batching for multiple documents |\n| Caching | Use for repeated queries |\n| Async Operations | Prefer async APIs where supported |\n| Memory | Monitor embedding vector memory usage |\n\n## Configuration Sources\n\nLLM xPack credentials and settings can be configured through:\n\n1. **Environment variables** (recommended for production)\n2. **Pathway config files** (YAML/JSON)\n3. **Runtime parameters** (for development)\n\nCommon environment variables:\n\n```bash\nOPENAI_API_KEY=sk-...\nANTHROPIC_API_KEY=sk-ant-...\nGOOGLE_API_KEY=...\n```\n\n## See Also\n\n- [Pathway Documentation](https://pathway.com/developers/)\n- [LLM xPack User Guide](https://pathway.com/developers/user-guide/llm-xpack/)\n- [RAG Pipeline Examples](https://github.com/pathwaycom/pathway/tree/main/examples)\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：pathwaycom/pathway\n\n摘要：发现 22 个潜在踩坑项，其中 2 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any。\n\n## 1. 安装坑 · 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: TypeError: Cannot instantiate typing.Any\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d63b2462b18449c4a2c02bb5e02a96c8 | https://github.com/pathwaycom/pathway/issues/227 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 
安全/权限坑 · 来源证据：Entra Authentication Support (credential handler and/or password callback)\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Entra Authentication Support (credential handler and/or password callback)\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7c65f4d8bce4b1a9c99accbf0457633 | https://github.com/pathwaycom/pathway/issues/230 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：Automated schema exploration in input connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Automated schema exploration in input connectors\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_74b91feb391f4b0298216d2a48fe5abe | https://github.com/pathwaycom/pathway/issues/224 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 4. 安装坑 · 来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ab5d2b2076034999bfaed612a3d95d60 | https://github.com/pathwaycom/pathway/issues/232 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 
安装坑 · 来源证据：Improve watermarks in POSIX-like objects tracker\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Improve watermarks in POSIX-like objects tracker\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_38f29b1fba3b41cc99b1364d15ef7213 | https://github.com/pathwaycom/pathway/issues/225 | 来源讨论提到 node 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：Persistence in `iterate` operator\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Persistence in `iterate` operator\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e642d639ebbc4382b242028ef89ec236 | https://github.com/pathwaycom/pathway/issues/214 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 7. 安装坑 · 来源证据：Support LEANN for RAG pipelines\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support LEANN for RAG pipelines\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_5d99dd1cfc794b299633b6c5dc9dd33b | https://github.com/pathwaycom/pathway/issues/173 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 安装坑 · 来源证据：Support MongoDB Atlas in pw.io.mongodb\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support MongoDB Atlas in pw.io.mongodb\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fa1d6e2e977a45d99c414724681f0f32 | https://github.com/pathwaycom/pathway/issues/221 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 9. 
安装坑 · 来源证据：feat: Add Microsoft SQL Server (MSSQL) connector\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：feat: Add Microsoft SQL Server (MSSQL) connector\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_370bcc864a314734b602f01d3d444ef9 | https://github.com/pathwaycom/pathway/issues/204 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：v0.27.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.27.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8d6905d8719c488cbcbb821e794add26 | https://github.com/pathwaycom/pathway/releases/tag/v0.27.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 安装坑 · 来源证据：v0.29.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.29.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b25babd3e3cc49108e56daaaaae8c5bf | https://github.com/pathwaycom/pathway/releases/tag/v0.29.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 12. 安装坑 · 来源证据：v0.30.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.30.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_0291ee068e4e4d38aa86d0fd433b960a | https://github.com/pathwaycom/pathway/releases/tag/v0.30.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 13. 配置坑 · 来源证据：v0.28.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.28.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_708184d9c8ac445d923c82d38af7bd19 | https://github.com/pathwaycom/pathway/releases/tag/v0.28.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 
配置坑 · 来源证据：v0.30.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.30.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7a19bd03a49499e987781160e4b76f0 | https://github.com/pathwaycom/pathway/releases/tag/v0.30.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:571186647 | https://github.com/pathwaycom/pathway | README/documentation is current enough for a first validation pass.\n\n## 16. 运行坑 · 来源证据：`pw.io.mssql.read` needs LSN persistence\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：`pw.io.mssql.read` needs LSN persistence\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_59d927a667a843ccb7d0a4cd147a1dc0 | https://github.com/pathwaycom/pathway/issues/218 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 17. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | last_activity_observed missing\n\n## 18. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 19. 
安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 20. 安全/权限坑 · 来源证据：Support encryption in Kinesis connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Support encryption in Kinesis connectors\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f29097c3eced4a209e7c80971aeea40c | https://github.com/pathwaycom/pathway/issues/223 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 21. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | issue_or_pr_quality=unknown\n\n## 22. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | release_recency=unknown\n\n<!-- canonical_name: pathwaycom/pathway; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "Human Manual / 人类版说明书"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log / 踩坑日志\n\n项目：pathwaycom/pathway\n\n摘要：发现 22 个潜在踩坑项，其中 2 个为 high/blocking；最高优先级：安装坑 - 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any。\n\n## 1. 安装坑 · 来源证据：[Bug]: TypeError: Cannot instantiate typing.Any\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：[Bug]: TypeError: Cannot instantiate typing.Any\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_d63b2462b18449c4a2c02bb5e02a96c8 | https://github.com/pathwaycom/pathway/issues/227 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 2. 安全/权限坑 · 来源证据：Entra Authentication Support (credential handler and/or password callback)\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Entra Authentication Support (credential handler and/or password callback)\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7c65f4d8bce4b1a9c99accbf0457633 | https://github.com/pathwaycom/pathway/issues/230 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 安装坑 · 来源证据：Automated schema exploration in input connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Automated schema exploration in input connectors\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_74b91feb391f4b0298216d2a48fe5abe | https://github.com/pathwaycom/pathway/issues/224 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 4. 
安装坑 · 来源证据：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Cannot Process Windowed Sessions from Kafka - Crash with \"key missing in output table\" on streaming retractions\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_ab5d2b2076034999bfaed612a3d95d60 | https://github.com/pathwaycom/pathway/issues/232 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 安装坑 · 来源证据：Improve watermarks in POSIX-like objects tracker\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Improve watermarks in POSIX-like objects tracker\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_38f29b1fba3b41cc99b1364d15ef7213 | https://github.com/pathwaycom/pathway/issues/225 | 来源讨论提到 node 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：Persistence in `iterate` operator\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Persistence in `iterate` operator\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e642d639ebbc4382b242028ef89ec236 | https://github.com/pathwaycom/pathway/issues/214 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 7. 安装坑 · 来源证据：Support LEANN for RAG pipelines\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support LEANN for RAG pipelines\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_5d99dd1cfc794b299633b6c5dc9dd33b | https://github.com/pathwaycom/pathway/issues/173 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 8. 
安装坑 · 来源证据：Support MongoDB Atlas in pw.io.mongodb\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Support MongoDB Atlas in pw.io.mongodb\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fa1d6e2e977a45d99c414724681f0f32 | https://github.com/pathwaycom/pathway/issues/221 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 9. 安装坑 · 来源证据：feat: Add Microsoft SQL Server (MSSQL) connector\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：feat: Add Microsoft SQL Server (MSSQL) connector\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_370bcc864a314734b602f01d3d444ef9 | https://github.com/pathwaycom/pathway/issues/204 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 10. 安装坑 · 来源证据：v0.27.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.27.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_8d6905d8719c488cbcbb821e794add26 | https://github.com/pathwaycom/pathway/releases/tag/v0.27.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 安装坑 · 来源证据：v0.29.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.29.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_b25babd3e3cc49108e56daaaaae8c5bf | https://github.com/pathwaycom/pathway/releases/tag/v0.29.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 12. 
安装坑 · 来源证据：v0.30.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.30.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_0291ee068e4e4d38aa86d0fd433b960a | https://github.com/pathwaycom/pathway/releases/tag/v0.30.0 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 13. 配置坑 · 来源证据：v0.28.0\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.28.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_708184d9c8ac445d923c82d38af7bd19 | https://github.com/pathwaycom/pathway/releases/tag/v0.28.0 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 14. 配置坑 · 来源证据：v0.30.1\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：v0.30.1\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e7a19bd03a49499e987781160e4b76f0 | https://github.com/pathwaycom/pathway/releases/tag/v0.30.1 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 15. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:571186647 | https://github.com/pathwaycom/pathway | README/documentation is current enough for a first validation pass.\n\n## 16. 运行坑 · 来源证据：`pw.io.mssql.read` needs LSN persistence\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：`pw.io.mssql.read` needs LSN persistence\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_59d927a667a843ccb7d0a4cd147a1dc0 | https://github.com/pathwaycom/pathway/issues/218 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 17. 
维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | last_activity_observed missing\n\n## 18. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 19. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:571186647 | https://github.com/pathwaycom/pathway | no_demo; severity=medium\n\n## 20. 安全/权限坑 · 来源证据：Support encryption in Kinesis connectors\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Support encryption in Kinesis connectors\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f29097c3eced4a209e7c80971aeea40c | https://github.com/pathwaycom/pathway/issues/223 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 21. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | issue_or_pr_quality=unknown\n\n## 22. 
维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:571186647 | https://github.com/pathwaycom/pathway | release_recency=unknown\n",
      "summary": "用户实践前最可能遇到的身份、安装、配置、运行和安全坑。",
      "title": "Pitfall Log / 踩坑日志"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# pathway - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for pathwaycom/pathway.\n\nProject:\n- Name: pathway\n- Repository: https://github.com/pathwaycom/pathway\n- Summary: Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: Python ETL framework for stream processing, real-time analytics, LLM pipelines, and RAG.\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. page-introduction: Pathway Introduction. Produce one small intermediate artifact and wait for confirmation.\n2. page-concepts: Core Concepts. Produce one small intermediate artifact and wait for confirmation.\n3. page-architecture: System Architecture. Produce one small intermediate artifact and wait for confirmation.\n4. 
page-python-api: Python API Structure. Produce one small intermediate artifact and wait for confirmation.\n5. page-data-model: Data Model and Type System. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/pathwaycom/pathway\n- https://github.com/pathwaycom/pathway#readme\n- README.md\n- python/pathway/__init__.py\n- pyproject.toml\n- python/pathway/internals/schema.py\n- python/pathway/internals/table.py\n- python/pathway/internals/datasource.py\n- docs/2.developers/4.user-guide/10.introduction/50.concepts.md\n- docs/2.developers/4.user-guide/10.introduction/70.streaming-and-static-modes.md\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "summary": "不安装项目也能感受能力节奏的安全试用 Prompt。",
      "title": "Prompt Preview / 安装前试用 Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / 官方入口\n\n项目：pathwaycom/pathway\n\n## 官方安装入口\n\n### Python / pip · 官方安装入口\n\n```bash\npip install -U pathway\n```\n\n来源：https://github.com/pathwaycom/pathway#readme\n\n## 来源\n\n- repo: https://github.com/pathwaycom/pathway\n- docs: https://github.com/pathwaycom/pathway#readme\n",
      "summary": "从项目官方 README 或安装文档提取的开工入口。",
      "title": "Quick Start / 官方入口"
    }
  },
  "validation_id": "dval_6b1c86605d304988ae43163ed46c8441"
}
