{
  "canonical_name": "evidentlyai/evidently",
  "compilation_id": "pack_db514360f4244c49bf4eda4f0756cdcc",
  "created_at": "2026-05-16T21:00:49.442257+00:00",
  "created_by": "project-pack-compiler",
  "feedback": {
    "carrier_selection_notes": [
      "viable_asset_types=skill, recipe, host_instruction, eval, preflight",
      "recommended_asset_types=skill, recipe, host_instruction, eval, preflight"
    ],
    "evidence_delta": {
      "confirmed_claims": [
        "identity_anchor_present",
        "capability_and_host_targets_present",
        "install_path_declared_or_better"
      ],
      "missing_required_fields": [],
      "must_verify_forwarded": [
        "Run or inspect `pip install evidently` in an isolated environment.",
        "Confirm the project exposes the claimed capability to at least one target host."
      ],
      "quickstart_execution_scope": "allowlisted_sandbox_smoke",
      "sandbox_command": "pip install evidently",
      "sandbox_container_image": "python:3.12-slim",
      "sandbox_execution_backend": "docker",
      "sandbox_planner_decision": "llm_execute_isolated_install",
      "sandbox_validation_id": "sbx_11c89fa58d454f31ba9c4457c3c44874"
    },
    "feedback_event_type": "project_pack_compilation_feedback",
    "learning_candidate_reasons": [],
    "template_gaps": []
  },
  "identity": {
    "canonical_id": "project_4a61e3721dc01a4659b3fcbfe9279c15",
    "canonical_name": "evidentlyai/evidently",
    "homepage_url": null,
    "license": "unknown",
    "repo_url": "https://github.com/evidentlyai/evidently",
    "slug": "evidently",
    "source_packet_id": "phit_a034e1beb2e74d109ae3f2cd7483bcbd",
    "source_validation_id": "dval_fa1c64f5f5114e2b94febefe6ae8510a"
  },
  "merchandising": {
    "best_for": "需要软件开发与交付能力，并使用 local_cli的用户",
    "github_forks": 847,
    "github_stars": 7498,
    "one_liner_en": "Evidently is \\u200b\\u200ban open-source ML and LLM observability framework. Evaluate, test, and monitor any AI-powered system or data pipeline. From tabular data to Gen AI. 100+ metrics.",
    "one_liner_zh": "Evidently is \\u200b\\u200ban open-source ML and LLM observability framework. Evaluate, test, and monitor any AI-powered system or data pipeline. From tabular data to Gen AI. 100+ metrics.",
    "primary_category": {
      "category_id": "software-development",
      "confidence": "high",
      "name_en": "Software Development",
      "name_zh": "软件开发与交付",
      "reason": "matched_keywords:test, git, cli"
    },
    "target_user": "使用 local_cli 等宿主 AI 的用户",
    "title_en": "evidently",
    "title_zh": "evidently 能力包",
    "visible_tags": [
      {
        "label_en": "Browser Agents",
        "label_zh": "浏览器 Agent",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "product_domain-browser-agents",
        "type": "product_domain"
      },
      {
        "label_en": "Web Task Automation",
        "label_zh": "网页任务自动化",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "user_job-web-task-automation",
        "type": "user_job"
      },
      {
        "label_en": "Browser Automation",
        "label_zh": "浏览器自动化",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "core_capability-browser-automation",
        "type": "core_capability"
      },
      {
        "label_en": "Checkpoint Resume",
        "label_zh": "断点恢复流程",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "workflow_pattern-checkpoint-resume",
        "type": "workflow_pattern"
      },
      {
        "label_en": "Evaluation Suite",
        "label_zh": "评测体系",
        "source": "repo_evidence_project_characteristics",
        "tag_id": "selection_signal-evaluation-suite",
        "type": "selection_signal"
      }
    ]
  },
  "packet_id": "phit_a034e1beb2e74d109ae3f2cd7483bcbd",
  "page_model": {
    "artifacts": {
      "artifact_slug": "evidently",
      "files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json",
        "REPO_INSPECTION.md",
        "CAPABILITY_CONTRACT.json",
        "EVIDENCE_INDEX.json",
        "CLAIM_GRAPH.json"
      ],
      "required_files": [
        "PROJECT_PACK.json",
        "QUICK_START.md",
        "PROMPT_PREVIEW.md",
        "HUMAN_MANUAL.md",
        "AI_CONTEXT_PACK.md",
        "BOUNDARY_RISK_CARD.md",
        "PITFALL_LOG.md",
        "REPO_INSPECTION.json"
      ]
    },
    "detail": {
      "capability_source": "Project Hit Packet + DownstreamValidationResult",
      "commands": [
        {
          "command": "pip install evidently",
          "label": "Python / pip · 官方安装入口",
          "source": "https://github.com/evidentlyai/evidently#readme",
          "verified": true
        }
      ],
      "display_tags": [
        "浏览器 Agent",
        "网页任务自动化",
        "浏览器自动化",
        "断点恢复流程",
        "评测体系"
      ],
      "eyebrow": "软件开发与交付",
      "glance": [
        {
          "body": "判断自己是不是目标用户。",
          "label": "最适合谁",
          "value": "需要软件开发与交付能力，并使用 local_cli的用户"
        },
        {
          "body": "先理解能力边界，再决定是否继续。",
          "label": "核心价值",
          "value": "Evidently is \\u200b\\u200ban open-source ML and LLM observability framework. Evaluate, test, and monitor any AI-powered system or data pipeline. From tabular data to Gen AI. 100+ metrics."
        },
        {
          "body": "未完成验证前保持审慎。",
          "label": "继续前",
          "value": "publish to Doramagic.ai project surfaces"
        }
      ],
      "guardrail_source": "Boundary & Risk Card",
      "guardrails": [
        {
          "body": "Prompt Preview 只展示流程，不证明项目已安装或运行。",
          "label": "Check 1",
          "value": "不要把试用当真实运行"
        },
        {
          "body": "local_cli",
          "label": "Check 2",
          "value": "确认宿主兼容"
        },
        {
          "body": "publish to Doramagic.ai project surfaces",
          "label": "Check 3",
          "value": "先隔离验证"
        }
      ],
      "mode": "skill, recipe, host_instruction, eval, preflight",
      "pitfall_log": {
        "items": [
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Update scikit-learn version requirement to support v1.6.0",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_e82e24d201134e7f8dfa0b564d73fce8 | https://github.com/evidentlyai/evidently/issues/1407 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Update scikit-learn version requirement to support v1.6.0",
            "user_impact": "可能影响升级、迁移或版本选择。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个配置相关的待验证问题：PromptOptimizer throws OpenAIError when using Vertex AI judge",
            "category": "配置坑",
            "evidence": [
              "community_evidence:github | cevd_abf6f9b183ff4892be78fd198f60d8e8 | https://github.com/evidentlyai/evidently/issues/1856 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：PromptOptimizer throws OpenAIError when using Vertex AI judge",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个运行相关的待验证问题：IndexError in infer_column_type when column contains only null values",
            "category": "运行坑",
            "evidence": [
              "community_evidence:github | cevd_44cdd5f2b657432eb8418fb132959ada | https://github.com/evidentlyai/evidently/issues/1764 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：IndexError in infer_column_type when column contains only null values",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Update evidently hashlib usage for FIPS-Compliant Systems and Security Best Practices",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_872f2b57cd3a4c61ad5c387c8be3208b | https://github.com/evidentlyai/evidently/issues/1410 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "high",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Update evidently hashlib usage for FIPS-Compliant Systems and Security Best Practices",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Numpy 2.x support?",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_2fc84415b55a43d2be4f7f47cf46d7a1 | https://github.com/evidentlyai/evidently/issues/1557 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：Numpy 2.x support?",
            "user_impact": "可能阻塞安装或首次运行。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.12",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_fcf994a3bcf94b0d9b2c5d6159816728 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.12 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.7.12",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.15",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_a9de117bd8e440beab9f7be4af50d710 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.15 | 来源讨论提到 linux 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.7.15",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.20",
            "category": "安装坑",
            "evidence": [
              "community_evidence:github | cevd_f0d67058bf2a40218adfb0eea1f1665f | https://github.com/evidentlyai/evidently/releases/tag/v0.7.20 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.7.20",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.7.19",
            "category": "能力坑",
            "evidence": [
              "community_evidence:github | cevd_52e6ab37edca4901a0aa9fe7b5ba12f1 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.19 | 来源类型 github_release 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.7.19",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.7.21",
            "category": "能力坑",
            "evidence": [
              "community_evidence:github | cevd_fe9f0e6e29564afeb5610928cc9bc834 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.21 | 来源讨论提到 python 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.7.21",
            "user_impact": "可能增加新用户试用和生产接入成本。"
          },
          {
            "body": "README/documentation is current enough for a first validation pass.",
            "category": "能力坑",
            "evidence": [
              "capability.assumptions | github_repo:315977578 | https://github.com/evidentlyai/evidently | README/documentation is current enough for a first validation pass."
            ],
            "severity": "medium",
            "suggested_check": "将假设转成下游验证清单。",
            "title": "能力判断依赖假设",
            "user_impact": "假设不成立时，用户拿不到承诺的能力。"
          },
          {
            "body": "未记录 last_activity_observed。",
            "category": "维护坑",
            "evidence": [
              "evidence.maintainer_signals | github_repo:315977578 | https://github.com/evidentlyai/evidently | last_activity_observed missing"
            ],
            "severity": "medium",
            "suggested_check": "补 GitHub 最近 commit、release、issue/PR 响应信号。",
            "title": "维护活跃度未知",
            "user_impact": "新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "downstream_validation.risk_items | github_repo:315977578 | https://github.com/evidentlyai/evidently | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "进入安全/权限治理复核队列。",
            "title": "下游验证发现风险项",
            "user_impact": "下游已经要求复核，不能在页面中弱化。"
          },
          {
            "body": "no_demo",
            "category": "安全/权限坑",
            "evidence": [
              "risks.scoring_risks | github_repo:315977578 | https://github.com/evidentlyai/evidently | no_demo; severity=medium"
            ],
            "severity": "medium",
            "suggested_check": "把风险写入边界卡，并确认是否需要人工复核。",
            "title": "存在评分风险",
            "user_impact": "风险会影响是否适合普通用户安装。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Protect this repo from AI-generated PRs",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_cfcf13543c3941e88fd7c5f527f95dc3 | https://github.com/evidentlyai/evidently/issues/1879 | 来源类型 github_issue 暴露的待验证使用条件。"
            ],
            "severity": "medium",
            "suggested_check": "来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。",
            "title": "来源证据：Protect this repo from AI-generated PRs",
            "user_impact": "可能影响授权、密钥配置或安全边界。"
          },
          {
            "body": "GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.7.17",
            "category": "安全/权限坑",
            "evidence": [
              "community_evidence:github | cevd_305e0831e389460096b4d4b908731b41 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.17 | 来源讨论提到 api key 相关条件，需在安装/试用前复核。"
            ],
            "severity": "medium",
            "suggested_check": "来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。",
            "title": "来源证据：v0.7.17",
            "user_impact": "可能影响授权、密钥配置或安全边界。"
          }
        ],
        "source": "ProjectPitfallLog + ProjectHitPacket + validation + community signals",
        "summary": "发现 19 个潜在踩坑项，其中 4 个为 high/blocking；最高优先级：安装坑 - 来源证据：Update scikit-learn version requirement to support v1.6.0。",
        "title": "踩坑日志"
      },
      "snapshot": {
        "contributors": 97,
        "forks": 847,
        "license": "unknown",
        "note": "站点快照，非实时质量证明；用于开工前背景判断。",
        "stars": 7498
      },
      "source_url": "https://github.com/evidentlyai/evidently",
      "steps": [
        {
          "body": "不安装项目，先体验能力节奏。",
          "code": "preview",
          "title": "先试 Prompt"
        },
        {
          "body": "理解输入、输出、失败模式和边界。",
          "code": "manual",
          "title": "读说明书"
        },
        {
          "body": "把上下文交给宿主 AI 继续工作。",
          "code": "context",
          "title": "带给 AI"
        },
        {
          "body": "进入主力环境前先完成安装入口与风险边界验证。",
          "code": "verify",
          "title": "沙箱验证"
        }
      ],
      "subtitle": "Evidently is \\u200b\\u200ban open-source ML and LLM observability framework. Evaluate, test, and monitor any AI-powered system or data pipeline. From tabular data to Gen AI. 100+ metrics.",
      "title": "evidently 能力包",
      "trial_prompt": "# evidently - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for evidentlyai/evidently.\n\nProject:\n- Name: evidently\n- Repository: https://github.com/evidentlyai/evidently\n- Summary: Evidently is \\u200b\\u200ban open-source ML and LLM observability framework. Evaluate, test, and monitor any AI-powered system or data pipeline. From tabular data to Gen AI. 100+ metrics.\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: Evidently is \\u200b\\u200ban open-source ML and LLM observability framework. Evaluate, test, and monitor any AI-powered system or data pipeline. From tabular data to Gen AI. 100+ metrics.\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: Evidently is \\u200b\\u200ban open-source ML and LLM observability framework. Evaluate, test, and monitor any AI-powered system or data pipeline. From tabular data to Gen AI. 100+ metrics.\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. arch-overview: Architecture Overview. 
Produce one small intermediate artifact and wait for confirmation.\n2. core-components: Core Components. Produce one small intermediate artifact and wait for confirmation.\n3. data-flow: Data Management and Data Flow. Produce one small intermediate artifact and wait for confirmation.\n4. ml-evaluation: ML Model Evaluation. Produce one small intermediate artifact and wait for confirmation.\n5. llm-evals: LLM Evaluation and Judging. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/evidentlyai/evidently\n- https://github.com/evidentlyai/evidently#readme\n- src/evidently/__init__.py\n- src/evidently/_registry.py\n- src/evidently/sdk/__init__.py\n- src/evidently/legacy/__init__.py\n- src/evidently/future/__init__.py\n- src/evidently/core/base_types.py\n- src/evidently/core/metric_types.py\n- src/evidently/core/report.py\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "voices": [
        {
          "body": "来源平台：github。github/github_issue: IndexError in infer_column_type when column contains only null values（https://github.com/evidentlyai/evidently/issues/1764）；github/github_issue: Protect this repo from AI-generated PRs（https://github.com/evidentlyai/evidently/issues/1879）；github/github_issue: Numpy 2.x support?（https://github.com/evidentlyai/evidently/issues/1557）；github/github_issue: PromptOptimizer throws OpenAIError when using Vertex AI judge（https://github.com/evidentlyai/evidently/issues/1856）；github/github_issue: Update scikit-learn version requirement to support v1.6.0（https://github.com/evidentlyai/evidently/issues/1407）；github/github_issue: Update evidently hashlib usage for FIPS-Compliant Systems and Security B（https://github.com/evidentlyai/evidently/issues/1410）；github/github_release: v0.7.21（https://github.com/evidentlyai/evidently/releases/tag/v0.7.21）；github/github_release: v0.7.20（https://github.com/evidentlyai/evidently/releases/tag/v0.7.20）；github/github_release: v0.7.19（https://github.com/evidentlyai/evidently/releases/tag/v0.7.19）；github/github_release: v0.7.18（https://github.com/evidentlyai/evidently/releases/tag/v0.7.18）；github/github_release: v0.7.17（https://github.com/evidentlyai/evidently/releases/tag/v0.7.17）；github/github_release: v0.7.16（https://github.com/evidentlyai/evidently/releases/tag/v0.7.16）。这些是项目级外部声音，不作为单独质量证明。",
          "items": [
            {
              "kind": "github_issue",
              "source": "github",
              "title": "IndexError in infer_column_type when column contains only null values",
              "url": "https://github.com/evidentlyai/evidently/issues/1764"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Protect this repo from AI-generated PRs",
              "url": "https://github.com/evidentlyai/evidently/issues/1879"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Numpy 2.x support?",
              "url": "https://github.com/evidentlyai/evidently/issues/1557"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "PromptOptimizer throws OpenAIError when using Vertex AI judge",
              "url": "https://github.com/evidentlyai/evidently/issues/1856"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Update scikit-learn version requirement to support v1.6.0",
              "url": "https://github.com/evidentlyai/evidently/issues/1407"
            },
            {
              "kind": "github_issue",
              "source": "github",
              "title": "Update evidently hashlib usage for FIPS-Compliant Systems and Security B",
              "url": "https://github.com/evidentlyai/evidently/issues/1410"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.7.21",
              "url": "https://github.com/evidentlyai/evidently/releases/tag/v0.7.21"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.7.20",
              "url": "https://github.com/evidentlyai/evidently/releases/tag/v0.7.20"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.7.19",
              "url": "https://github.com/evidentlyai/evidently/releases/tag/v0.7.19"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.7.18",
              "url": "https://github.com/evidentlyai/evidently/releases/tag/v0.7.18"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.7.17",
              "url": "https://github.com/evidentlyai/evidently/releases/tag/v0.7.17"
            },
            {
              "kind": "github_release",
              "source": "github",
              "title": "v0.7.16",
              "url": "https://github.com/evidentlyai/evidently/releases/tag/v0.7.16"
            }
          ],
          "status": "已收录 12 条来源",
          "title": "社区讨论"
        }
      ]
    },
    "homepage_card": {
      "category": "软件开发与交付",
      "desc": "Evidently is \\u200b\\u200ban open-source ML and LLM observability framework. Evaluate, test, and monitor any AI-powered system or data pipeline. From tabular data to Gen AI. 100+ metrics.",
      "effort": "安装已验证",
      "forks": 847,
      "icon": "code",
      "name": "evidently 能力包",
      "risk": "可发布",
      "slug": "evidently",
      "stars": 7498,
      "tags": [
        "浏览器 Agent",
        "网页任务自动化",
        "浏览器自动化",
        "断点恢复流程",
        "评测体系"
      ],
      "thumb": "gray",
      "type": "Skill Pack"
    },
    "manual": {
      "markdown": "# https://github.com/evidentlyai/evidently 项目说明书\n\n生成时间：2026-05-16 20:53:23 UTC\n\n## 目录\n\n- [Architecture Overview](#arch-overview)\n- [Core Components](#core-components)\n- [Data Management and Data Flow](#data-flow)\n- [ML Model Evaluation](#ml-evaluation)\n- [LLM Evaluation and Judging](#llm-evals)\n- [Descriptors and Features System](#descriptors-features)\n- [Reports and Test Suites](#reports-test-suites)\n- [Presets and Metric Presets](#presets)\n- [Custom Metrics and Extensibility](#custom-metrics)\n- [UI Service Backend](#ui-service)\n\n<a id='arch-overview'></a>\n\n## Architecture Overview\n\n### 相关页面\n\n相关主题：[Core Components](#core-components), [Data Management and Data Flow](#data-flow)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/ui/workspace.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/ui/workspace.py)\n- [src/evidently/sdk/configs.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/sdk/configs.py)\n- [src/evidently/sdk/adapters.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/sdk/adapters.py)\n- [src/evidently/ui/utils.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/ui/utils.py)\n- [README.md](https://github.com/evidentlyai/evidently/blob/main/README.md)\n</details>\n\n# Architecture Overview\n\n## Introduction\n\nEvidently is an open-source Python framework designed to evaluate, test, and monitor ML and LLM-powered systems. 
The architecture follows a modular design pattern that separates concerns between core evaluation logic, SDK APIs, user interface components, and configuration management.\n\n资料来源：[README.md:1-30]()\n\n## High-Level Architecture\n\nThe Evidently platform consists of several interconnected layers:\n\n```mermaid\ngraph TB\n    subgraph \"User Interface Layer\"\n        UI[\"UI Components<br/>(React/TypeScript)\"]\n    end\n    \n    subgraph \"SDK Layer\"\n        SDK[\"Python SDK<br/>(evidently.sdk)\"]\n        API[\"Cloud Config API\"]\n    end\n    \n    subgraph \"Core Layer\"\n        CORE[\"Core Modules<br/>(metrics, descriptors, reports)\"]\n        LEGACY[\"Legacy Module<br/>(evidently.legacy)\"]\n        FUTURE[\"Future Module<br/>(evidently.future)\"]\n    end\n    \n    subgraph \"Data Layer\"\n        WS[\"Workspace Abstraction\"]\n        CFG[\"Config System\"]\n        ADP[\"Adapters\"]\n    end\n    \n    UI --> SDK\n    SDK --> API\n    SDK --> WS\n    SDK --> CFG\n    SDK --> ADP\n    ADP --> WS\n    CFG --> WS\n```\n\n## Core Module Structure\n\nThe main `evidently` package provides the primary user-facing APIs for ML evaluation:\n\n| Component | Purpose |\n|-----------|---------|\n| `Report` | Generate evaluation reports combining multiple metrics |\n| `Dataset` | Container for evaluation data with descriptors |\n| `DataDefinition` | Schema definition for dataset structure |\n| `Metrics` | Individual evaluation metrics |\n| `Descriptors` | Row-level evaluators (e.g., Sentiment, TextLength) |\n| `Presets` | Pre-configured evaluation suites (e.g., TextEvals) |\n\n资料来源：[README.md:30-55]()\n\n### Key Imports for LLM Evals\n\n```python\nfrom evidently import Report\nfrom evidently import Dataset, DataDefinition\nfrom evidently.descriptors import Sentiment, TextLength, Contains\nfrom evidently.presets import TextEvals\n```\n\n## SDK Architecture\n\nThe SDK layer provides programmatic access to remote configurations and cloud features.\n\n### 
CloudConfigAPI\n\nThe `CloudConfigAPI` class manages remote configuration operations:\n\n| Method | Purpose |\n|--------|---------|\n| `create_config()` | Create a new remote configuration |\n| `update_config()` | Update an existing configuration |\n| `get_config()` | Retrieve a configuration by ID |\n| `list_configs()` | List all configurations for a project |\n| `delete_config()` | Remove a configuration |\n| `create_version()` | Create a new version of a configuration |\n| `list_versions()` | List all versions of a configuration |\n| `get_version()` | Get a specific version |\n\n资料来源：[src/evidently/sdk/configs.py:1-50]()\n\n### Config Version Management\n\nThe configuration system uses a `ConfigVersion` model to track changes:\n\n```mermaid\nclassDiagram\n    class ConfigMetadata {\n        +str created_at\n        +str updated_at\n        +str author\n        +str description\n    }\n    \n    class ConfigVersion {\n        +str id\n        +str artifact_id\n        +int version\n        +Any content\n        +ConfigVersionMetadata metadata\n    }\n    \n    class ConfigVersionMetadata {\n        +str created_at\n        +str updated_at\n        +str author\n        +str comment\n    }\n    \n    ConfigVersion --> ConfigVersionMetadata\n    ConfigVersionMetadata --|> ConfigMetadata\n```\n\n资料来源：[src/evidently/sdk/configs.py:50-150]()\n\n### Adapter Pattern\n\nThe SDK uses adapters to convert between internal config models and domain objects:\n\n```mermaid\ngraph LR\n    A[\"ConfigVersion\"] -->|Adapter| B[\"ArtifactVersion\"]\n    A -->|Adapter| C[\"Descriptor\"]\n    A -->|Adapter| D[\"Prompt\"]\n```\n\n| Adapter Class | Source Config | Target Domain Object |\n|--------------|---------------|---------------------|\n| `ArtifactAdapter` | `ConfigVersion` | `ArtifactVersion` |\n| `DescriptorAdapter` | `ConfigVersion` | `Descriptor` |\n| `PromptAdapter` | `ConfigVersion` | `Prompt` |\n\n资料来源：[src/evidently/sdk/adapters.py:1-100]()\n\n## Workspace 
Architecture\n\nThe workspace abstraction provides a unified interface for project management:\n\n### Abstract Base Class\n\n```mermaid\nclassDiagram\n    class Workspace {\n        <<abstract>>\n        +create_project(name, description, org_id) Project\n        +add_project(project, org_id) Project\n        +get_project(project_id) Optional~Project~\n        +delete_project(project_id)\n        +list_projects(org_id) Sequence~Project~\n    }\n```\n\n### Core Workspace Methods\n\n| Method | Parameters | Returns | Description |\n|--------|------------|---------|-------------|\n| `create_project` | `name: str`, `description: str`, `org_id: Optional[OrgID]` | `Project` | Creates and adds a new project |\n| `add_project` | `project: ProjectModel`, `org_id: Optional[OrgID]` | `Project` | Adds an existing project model |\n| `get_project` | `project_id: STR_UUID` | `Optional[Project]` | Retrieves project by UUID |\n| `delete_project` | `project_id: STR_UUID` | `None` | Removes project from workspace |\n| `list_projects` | `org_id: Optional[OrgID]` | `Sequence[Project]` | Lists projects with optional filtering |\n\n资料来源：[src/evidently/ui/workspace.py:1-100]()\n\n### Project Model\n\nProjects are stored using a `ProjectModel` data structure:\n\n```python\nProjectModel(\n    name=str,\n    description=str,\n    org_id=Optional[OrgID]\n)\n```\n\n## UI Component Architecture\n\nThe frontend is built with React and TypeScript, organized as separate packages:\n\n```mermaid\ngraph TD\n    subgraph \"ui/packages/evidently-ui-lib\"\n        WC[\"Widgets\"]\n        CC[\"Components\"]\n        FP[\"Forms\"]\n        TB[\"Tables\"]\n    end\n    \n    subgraph \"Components\"\n        DT[\"Dashboard\"]\n        TR[\"Traces\"]\n        PR[\"Prompts\"]\n        DS[\"Descriptors\"]\n    end\n    \n    CC --> DT\n    CC --> TR\n    CC --> PR\n    CC --> DS\n    CC --> FP\n    CC --> TB\n```\n\n### Component Categories\n\n| Package Path | Purpose |\n|-------------|---------|\n| 
`src/widgets/` | Dashboard widgets and test suite components |\n| `src/components/Dashboard/` | Dashboard-specific UI elements |\n| `src/components/Traces/` | Trace viewing and management |\n| `src/components/Prompts/` | Prompt template management |\n| `src/components/Descriptors/` | Descriptor configuration forms |\n| `src/components/Utils/` | Shared utility components |\n\n## Template and Link Generation\n\nThe UI layer uses template functions to generate HTML for external links and reports:\n\n| Template | Purpose |\n|----------|---------|\n| `HTML_LINK_WITH_ID_TEMPLATE` | Report links with button and ID display |\n| `FILE_LINK_WITH_ID_TEMPLATE` | File links with ID metadata |\n| `RUNNING_SERVICE_LINK_TEMPLATE` | Service endpoint links |\n| `EVIDENTLY_STYLES_COMMON` | Shared CSS styles for links |\n\n资料来源：[src/evidently/ui/utils.py:1-60]()\n\n### Template Structure\n\n```html\n<div class=\"evidently-links container\">\n    <a target=\"_blank\" href=\"{button_url}\">{button_title}</a>\n    <p><b>{id_title}:</b> <span>{id}</span></p>\n</div>\n```\n\n## Data Flow Architecture\n\n```mermaid\ngraph TB\n    A[\"User Code\"] --> B[\"Dataset Creation\"]\n    B --> C[\"Descriptor Application\"]\n    C --> D[\"Report Generation\"]\n    D --> E[\"Workspace Storage\"]\n    \n    F[\"Cloud API\"] --> G[\"Config Management\"]\n    G --> E\n    \n    E --> H[\"UI Display\"]\n```\n\n## API Reference Documentation\n\nThe project includes automated API documentation generation:\n\n| Command | Description |\n|---------|-------------|\n| `./api-reference/generate.py --local-source-code` | Generate docs from local source |\n| `./api-reference/generate.py --git-revision <ref>` | Generate docs from git revision |\n| `./api-reference/generate.py --additional-modules` | Include extra modules |\n\nDocumentation is output to `api-reference/dist/` organized by revision.\n\n资料来源：[api-reference/README.md:1-50]()\n\n## Summary\n\nThe Evidently architecture is organized into three main 
layers:\n\n1. **Core Layer** - Python packages for evaluation logic (`evidently`, `evidently.core`)\n2. **SDK Layer** - Remote configuration and cloud integration (`evidently.sdk`)\n3. **UI Layer** - React/TypeScript web interface for visualization and management\n\nThe modular design allows users to:\n- Use the Python SDK directly for programmatic evaluation\n- Store and version configurations in the cloud\n- Access results through the web UI\n- Extend functionality through the descriptor and metric system\n\n---\n\n<a id='core-components'></a>\n\n## Core Components\n\n### 相关页面\n\n相关主题：[Architecture Overview](#arch-overview), [Reports and Test Suites](#reports-test-suites), [Custom Metrics and Extensibility](#custom-metrics)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/core/base_types.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/base_types.py)\n- [src/evidently/core/metric_types.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/metric_types.py)\n- [src/evidently/core/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/report.py)\n- [src/evidently/core/tests.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/tests.py)\n- [src/evidently/core/registries/components.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/registries/components.py)\n- [src/evidently/core/registries/metrics.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/registries/metrics.py)\n- [src/evidently/core/registries/metric_tests.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/registries/metric_tests.py)\n</details>\n\n# Core Components\n\n## Overview\n\nThe Core Components form the foundational architecture of the Evidently framework, providing the essential building blocks for evaluation, testing, and monitoring of ML and LLM-powered systems. 
These components establish the base type system, metric definitions, reporting mechanisms, and registry patterns that enable the framework's extensibility and modularity.\n\nThe Core Components encompass several interconnected modules that work together to provide a unified approach to ML evaluation. At the heart of this system is a registration-based architecture that allows metrics, tests, and components to be dynamically discovered, configured, and executed within the Evidently ecosystem.\n\nThis architecture follows a plugin-like pattern where individual components can be registered, versioned, and managed through centralized registries. This design enables users to extend the framework's functionality by implementing custom metrics and tests while maintaining compatibility with the existing evaluation pipeline.\n\n## Base Types Module\n\n### Purpose and Scope\n\nThe base types module defines the fundamental data structures and abstractions that underpin all Evidently components. These types establish common interfaces and data models used throughout the framework, ensuring consistency and interoperability between different modules.\n\nThe base types primarily focus on defining what data flows through the evaluation pipeline, including data definitions, dataset representations, and result containers. This module serves as the foundation upon which all higher-level abstractions are built.\n\n### Key Data Structures\n\nThe base types module defines several critical classes and interfaces that form the backbone of the Evidently type system. These structures provide a standardized way to represent input data, evaluation results, and configuration options across all components.\n\nThe module establishes the `DataDefinition` class, which serves as a schema for describing the structure of datasets being evaluated. This includes column definitions, data types, and metadata that describe the characteristics of the data being analyzed. 
Data definitions enable Evidently to understand the structure of incoming data and apply appropriate transformations and evaluations.\n\nThe module also defines base classes for `Dataset` objects, which encapsulate reference and current data batches used in evaluations. These dataset representations include functionality for data validation, transformation, and slicing based on various criteria.\n\n## Metric Types Module\n\n### Metric Architecture\n\nThe metric types module defines the core abstraction for all metrics within Evidently. Metrics are the primary mechanism for evaluating data quality, model performance, and statistical properties of datasets. The module establishes a class hierarchy that enables both simple single-value metrics and complex multi-dimensional metric calculations.\n\nMetrics in Evidently follow a consistent pattern where each metric is associated with specific columns or data subsets and produces standardized result objects. This design enables metrics to be composed, combined, and analyzed together within reports and dashboards.\n\n### Metric Execution Model\n\nThe metric execution model defines how metrics are calculated, what inputs they receive, and how results are structured. Each metric implementation receives a data context containing the relevant dataset columns and produces a `MetricResult` object that encapsulates the calculated values and metadata.\n\nThe execution model supports both eager and lazy evaluation strategies. Some metrics compute their results immediately upon execution, while others may defer computation until results are actually needed. This flexibility enables optimizations in scenarios where multiple metrics share intermediate calculations or where memory efficiency is a concern.\n\nMetrics can be configured with parameters that control their behavior, including aggregation methods, thresholds, and visualization options. 
The parameter system uses type hints and validation to ensure configuration correctness while providing clear documentation through the API reference.\n\n## Report System\n\n### Report Architecture\n\nThe Report class serves as the primary interface for executing evaluations in Evidently. Reports orchestrate the execution of metrics and tests, collect results, and generate visualizations and summaries. The Report system provides a flexible framework for combining multiple evaluation components into cohesive analysis workflows.\n\nReports maintain state throughout their lifecycle, tracking which metrics and tests have been executed, their results, and any errors or warnings encountered during execution. This state management enables features like incremental recalculation, where only changed components need to be re-evaluated.\n\n```mermaid\ngraph TD\n    A[Create Report] --> B[Add Metrics]\n    B --> C[Add Tests]\n    C --> D[Run Report]\n    D --> E[Collect Results]\n    E --> F[Generate Visualizations]\n    F --> G[Export Report]\n    \n    H[Reference Data] --> D\n    I[Current Data] --> D\n    \n    E --> J[Metric Results]\n    E --> K[Test Results]\n```\n\n### Report Configuration\n\nReports support extensive configuration options that control execution behavior, result aggregation, and output formatting. Configuration options include parallel execution settings, result caching policies, and visualization preferences.\n\nThe Report class provides methods for both synchronous and asynchronous execution, enabling integration with various application architectures. Asynchronous execution is particularly useful for long-running evaluations or when reports are generated as part of batch processing workflows.\n\n### Report Export\n\nThe Report system includes export functionality that generates output in various formats including HTML, JSON, and Python dictionary structures. 
Export options enable customization of included visualizations, result detail levels, and styling preferences.\n\n## Testing Framework\n\n### Test Definition\n\nThe testing module extends Evidently's evaluation capabilities to include pass/fail style assertions. Tests in Evidently are designed to validate specific conditions about data or model behavior, returning boolean results that indicate whether defined criteria are met.\n\nTests can be defined inline within metrics or as standalone components that check specific conditions. This dual approach provides flexibility in how validation logic is structured and reused across different evaluation scenarios.\n\n### Test Execution\n\nTests execute alongside metrics within the Report framework, allowing unified execution of both evaluation and validation logic. The test execution model mirrors the metric execution model, receiving data contexts and producing structured results that include pass/fail status and diagnostic information.\n\nTest results include detailed failure messages that explain why a test did not pass, helping users understand data quality issues or model behavior problems. These diagnostic messages reference specific rows, values, or statistical properties that contributed to the test failure.\n\n## Component Registry System\n\n### Registry Architecture\n\nThe registry system provides a centralized mechanism for managing and discovering Evidently components. Registries maintain mappings between component identifiers and their implementations, enabling dynamic resolution of components at runtime.\n\nEvidently uses registries extensively for metrics, tests, descriptors, and other extensible components. This architecture allows the framework to support third-party extensions while maintaining a consistent interface for component discovery and execution.\n\n### Component Registration\n\nComponents are registered with their respective registries using decorators or explicit registration calls. 
The registration process captures metadata about each component, including its name, version, parameters, and dependencies. This metadata enables automatic documentation generation and configuration interfaces.\n\nRegistration supports versioning, allowing multiple versions of a component to coexist and enabling rollback to previous versions when needed. Version management is particularly important for production deployments where stability and reproducibility are critical.\n\n### Metrics Registry\n\nThe metrics registry extends the component registry pattern specifically for metrics. It provides specialized functionality for metric discovery, parameter validation, and result aggregation.\n\n```mermaid\ngraph TD\n    A[Metrics Registry] --> B[Metric Base Class]\n    A --> C[Metric Parameters]\n    A --> D[Metric Results]\n    \n    B --> E[Column Metrics]\n    B --> F[Dataset Metrics]\n    B --> G[Statistical Metrics]\n    \n    D --> H[Visualization Configs]\n    D --> I[Aggregation Methods]\n```\n\n### Metric Tests Registry\n\nThe metric tests registry manages test definitions that validate metric results or data conditions. This registry enables tests to reference metrics by name and access their results for validation purposes.\n\nTests registered in this registry can be automatically discovered and included in reports based on configuration. This discovery mechanism enables declarative specification of validation requirements without requiring explicit test instantiation.\n\n## Integration Patterns\n\n### SDK Integration\n\nThe Core Components integrate with Evidently's SDK layer, which provides higher-level APIs for cloud and remote deployments. The SDK layer uses the Core Components as its foundation while adding capabilities for remote execution, result storage, and collaborative features.\n\nThe `CloudConfigAPI` class demonstrates this integration, providing methods for managing project configurations, descriptor configs, and artifact versions. 
These high-level APIs delegate core functionality to the Core Components while handling network communication and serialization.\n\n### Adapter Pattern\n\nEvidently uses adapters to bridge different execution contexts and storage backends. Adapters transform between internal data structures and external representations, enabling Evidently to work with various data sources and deployment environments.\n\nThe adapter pattern is particularly important for cloud deployments where configurations and results are stored remotely. Adapters handle serialization, deserialization, and API communication while maintaining compatibility with the Core Component interfaces.\n\n## Configuration Management\n\n### Config Versioning\n\nThe Core Components support configuration versioning through specialized config classes. Each versioned configuration maintains a history of changes, enabling audit trails and rollback capabilities.\n\nConfigurations are organized by project, with separate namespaces for different config types like metrics, tests, and descriptors. This organization enables clear separation of concerns while maintaining relationships between related configurations.\n\n### Remote Configuration\n\nFor deployments requiring centralized configuration management, Evidently supports remote configuration storage through the CloudConfigAPI. Remote configurations can be fetched, updated, and versioned through the SDK, with changes automatically synchronized to connected clients.\n\n## Summary\n\nThe Core Components provide the essential infrastructure for Evidently's evaluation capabilities. Through a combination of base types, metric abstractions, reporting mechanisms, and registry systems, these components establish a flexible and extensible framework for ML evaluation.\n\nThe modular design enables users to leverage individual components for specific use cases or combine them into comprehensive evaluation pipelines. 
The registration-based architecture ensures extensibility while maintaining consistency across the framework.\n\nUnderstanding these core concepts is essential for effectively using Evidently and for extending the framework with custom metrics, tests, and integrations.\n\n---\n\n<a id='data-flow'></a>\n\n## Data Management and Data Flow\n\n### 相关页面\n\n相关主题：[Architecture Overview](#arch-overview), [ML Model Evaluation](#ml-evaluation), [LLM Evaluation and Judging](#llm-evals)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/core/datasets.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/datasets.py)\n- [src/evidently/core/container.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/container.py)\n- [src/evidently/sdk/datasets.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/sdk/datasets.py)\n- [src/evidently/llm/datagen/base.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/datagen/base.py)\n- [src/evidently/llm/rag/index.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/rag/index.py)\n- [src/evidently/llm/rag/splitter.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/rag/splitter.py)\n</details>\n\n# Data Management and Data Flow\n\n## Overview\n\nData Management and Data Flow in Evidently encompasses the mechanisms by which data is ingested, transformed, stored, and processed throughout the evaluation lifecycle. The system provides a unified abstraction layer that handles multiple data sources (pandas DataFrames, CSV files, Parquet files) and integrates with the broader evaluation pipeline including Reports, Test Suites, and LLM-specific processing like RAG (Retrieval-Augmented Generation) systems.\n\nThe architecture separates concerns between core data structures (`Dataset`, `Container`), SDK-level abstractions for cloud/local deployment, and specialized LLM data processing components. 
This modularity allows users to work with familiar Python data structures while benefiting from Evidently's evaluation and monitoring capabilities.\n\n资料来源：[src/evidently/core/datasets.py:1-50]()\n\n## Core Data Abstraction\n\n### The Dataset Class\n\nThe `Dataset` class serves as the primary data container throughout Evidently. It wraps various data sources into a standardized interface that supports:\n\n- Direct creation from pandas DataFrames\n- Automatic type inference via `DataDefinition`\n- Descriptor-based feature computation\n- Metadata and tagging support\n\n```python\n# Basic Dataset creation from pandas\nfrom evidently import Dataset, DataDefinition\n\ndataset = Dataset.from_pandas(\n    dataframe,\n    data_definition=DataDefinition(),\n    metadata={\"source\": \"production\"},\n    tags=[\"eval\", \"2024\"]\n)\n```\n\n资料来源：[src/evidently/core/datasets.py:30-45]()\n\n### Dataset Factory Methods\n\n| Method | Purpose | Input Types |\n|--------|---------|-------------|\n| `from_pandas()` | Create Dataset from pandas DataFrame | `pd.DataFrame` |\n| `from_any()` | Convert various types to Dataset | `pd.DataFrame`, `Dataset` |\n| `as_dataframe()` | Extract underlying DataFrame | N/A (output) |\n| `column()` | Retrieve specific column as `DatasetColumn` | column name |\n| `subdataset()` | Filter dataset by column value | column name, label |\n\n资料来源：[src/evidently/core/datasets.py:50-80]()\n\nThe `from_any()` static method implements a factory pattern that handles type conversion automatically:\n\n```python\n@staticmethod\ndef from_any(dataset: PossibleDatasetTypes) -> \"Dataset\":\n    if isinstance(dataset, Dataset):\n        return dataset\n    if isinstance(dataset, pd.DataFrame):\n        return Dataset.from_pandas(dataset)\n    raise ValueError(f\"Unsupported dataset type: {type(dataset)}\")\n```\n\n资料来源：[src/evidently/core/datasets.py:60-70]()\n\n## Data Flow Architecture\n\n```mermaid\ngraph TD\n    A[Input Data: pd.DataFrame / CSV / Parquet] --> 
B[Dataset.from_any]\n    B --> C{Data Type Check}\n    C -->|pd.DataFrame| D[Dataset.from_pandas]\n    C -->|Already Dataset| E[Return as-is]\n    D --> F[DataDefinition Type Inference]\n    F --> G[Add Descriptors]\n    G --> H[Dataset Object]\n    H --> I[Report.run / TestSuite.run]\n    I --> J[Snapshot with Results]\n    J --> K[Export: JSON / HTML / Dict]\n```\n\n## Container Architecture\n\nThe `Container` class provides the underlying storage and management mechanism for datasets within the Evidently ecosystem. Containers maintain references to data assets and provide CRUD operations for dataset lifecycle management.\n\nKey container responsibilities include:\n\n- Storing dataset references and metadata\n- Managing dataset versioning\n- Providing query and filtering capabilities\n- Handling persistence to storage backends\n\n资料来源：[src/evidently/core/container.py:1-30]()\n\n### Container-Dataset Relationship\n\n```mermaid\ngraph LR\n    A[Container] -->|manages| B[Dataset Registry]\n    B -->|references| C[Dataset v1]\n    B -->|references| D[Dataset v2]\n    B -->|references| E[Dataset vN]\n    C -->|wraps| F[pd.DataFrame]\n    D -->|wraps| G[pd.DataFrame]\n    E -->|wraps| H[pd.DataFrame]\n```\n\n## SDK-Level Data Management\n\nThe SDK layer provides deployment-agnostic data handling through the `CloudConfigAPI` and local storage adapters. 
This abstraction enables consistent data operations whether running locally or connected to Evidently Cloud.\n\n资料来源：[src/evidently/sdk/datasets.py:1-50]()\n\n### SDK Dataset Operations\n\n| Operation | Method Signature | Description |\n|-----------|-----------------|-------------|\n| Create | `add_dataset(project_id, dataset, name, description, link)` | Add dataset to project |\n| Read | `load_dataset(dataset_id)` | Retrieve dataset by UUID |\n| List | `list_datasets(project, origins)` | List all datasets in project |\n| Update | `update_dataset(dataset_id, dataset)` | Modify existing dataset |\n| Delete | `delete_dataset(dataset_id)` | Remove dataset from storage |\n\n资料来源：[src/evidently/ui/workspace.py:50-100]()\n\n## LLM-Specific Data Processing\n\n### RAG Data Pipeline\n\nFor LLM applications using Retrieval-Augmented Generation, Evidently provides specialized data processing components that handle document ingestion, chunking, and indexing.\n\n资料来源：[src/evidently/llm/rag/index.py:1-50]()\n\n```mermaid\ngraph TD\n    A[Raw Documents] --> B[RAG Splitter]\n    B --> C[Document Chunks]\n    C --> D[RAG Index]\n    D --> E[Vector Store]\n    E --> F[Retrieval Query]\n    F --> G[Context + Query]\n    G --> H[LLM Response]\n```\n\n### Document Splitting\n\nThe `splitter.py` module handles text chunking with configurable parameters:\n\n- **Chunk size**: Target size for each text segment\n- **Overlap**: Amount of overlap between consecutive chunks\n- **Separators**: Text boundaries for splitting priority\n\n资料来源：[src/evidently/llm/rag/splitter.py:1-40]()\n\n### RAG Indexing\n\nThe index module manages the vector storage and retrieval:\n\n- Embedding generation for document chunks\n- Vector store integration (FAISS, ChromaDB, etc.)\n- Similarity search capabilities\n- Metadata preservation for filtering\n\n资料来源：[src/evidently/llm/rag/index.py:50-100]()\n\n## Data Generation for LLM Evals\n\nThe `datagen/base.py` module provides infrastructure for synthetic data 
generation, enabling users to create evaluation datasets programmatically.\n\n资料来源：[src/evidently/llm/datagen/base.py:1-50]()\n\n### Data Generation Workflow\n\n```mermaid\ngraph LR\n    A[Seed Data / Templates] --> B[LLM Generator]\n    B --> C[Generated Samples]\n    C --> D[Quality Validation]\n    D -->|Pass| E[Evaluation Dataset]\n    D -->|Fail| F[Regeneration Loop]\n    F --> B\n```\n\n### DataGeneratorBase Class\n\nThe base class defines the contract for data generation:\n\n| Method | Purpose |\n|--------|---------|\n| `generate()` | Create new data samples |\n| `validate()` | Check generated data quality |\n| `expand()` | Augment existing datasets |\n\n资料来源：[src/evidently/llm/datagen/base.py:30-60]()\n\n## Data Flow in Report Execution\n\nReports represent the primary consumer of dataset objects within Evidently. The `run()` method orchestrates the complete data flow from input to output.\n\n```python\ndef run(\n    self,\n    current_data,\n    reference_data=None,\n    additional_data=None,\n    timestamp=None,\n    metadata=None,\n    tags=None,\n    name=None,\n) -> Snapshot:\n    # Data validation\n    if isinstance(current_data, pd.DataFrame) and current_data.empty:\n        raise ValueError(\"current_data must contain at least one column...\")\n    \n    # Convert to Dataset objects\n    current_dataset = Dataset.from_any(current_data)\n    reference_dataset = Dataset.from_any(reference_data) if reference_data else None\n    \n    # Execute metrics and generate snapshot\n    ...\n```\n\n资料来源：[src/evidently/core/report.py:100-130]()\n\n### Data Validation Rules\n\n| Condition | Error Raised |\n|-----------|--------------|\n| Empty current_data DataFrame | `ValueError: current_data must contain at least one column; received an empty DataFrame` |\n| Empty reference_data DataFrame | `ValueError: reference_data must contain at least one column; received an empty DataFrame` |\n| Unsupported dataset type | `ValueError: Unsupported dataset type: 
{type(dataset)}` |\n\n资料来源：[src/evidently/core/datasets.py:65-70]()\n\n## Workspace Dataset Management\n\nThe Workspace class provides high-level dataset management capabilities for organizing evaluations across projects.\n\n```python\nclass Workspace:\n    def add_dataset(\n        self,\n        project_id: STR_UUID,\n        dataset: Dataset,\n        name: str,\n        description: Optional[str] = None,\n        link: Optional[SnapshotLink] = None,\n    ) -> DatasetID:\n        \"\"\"Add a dataset to a project.\"\"\"\n        return self.datasets.add(project_id=project_id, dataset=dataset, ...)\n    \n    def load_dataset(self, dataset_id: DatasetID) -> Dataset:\n        \"\"\"Load a dataset by ID.\"\"\"\n        return self.datasets.load(dataset_id)\n    \n    def list_datasets(\n        self,\n        project: STR_UUID,\n        origins: Optional[List[str]] = None,\n    ) -> DatasetList:\n        \"\"\"List all datasets in a project.\"\"\"\n        return self.datasets.list(project, origins=origins)\n```\n\n资料来源：[src/evidently/ui/workspace.py:50-80]()\n\n### Dataset Metadata Schema\n\n| Field | Type | Required | Description |\n|-------|------|----------|-------------|\n| `name` | `str` | Yes | Human-readable dataset name |\n| `description` | `str` | No | Detailed description |\n| `link` | `SnapshotLink` | No | Associated snapshot reference |\n| `created_at` | `datetime` | Auto | Creation timestamp |\n| `author` | `str` | Auto | Creator identifier |\n\n## Summary\n\nData Management in Evidently follows a consistent pattern across the platform:\n\n1. **Ingestion**: Data enters through `Dataset.from_pandas()` or `Dataset.from_any()` factory methods\n2. **Validation**: Empty DataFrames and unsupported types are rejected early\n3. **Processing**: Descriptors and metrics operate on the standardized Dataset interface\n4. **Output**: Results are packaged into Snapshots with multiple export formats\n5. 
**Persistence**: Datasets can be stored and retrieved through Workspace and SDK APIs\n\nThe separation between core data structures, SDK abstractions, and specialized LLM components enables flexible deployment while maintaining a simple user-facing API based on pandas DataFrames.\n\n资料来源：[src/evidently/core/datasets.py:1-100]()\n资料来源：[src/evidently/core/report.py:100-150]()\n资料来源：[src/evidently/sdk/datasets.py:1-50]()\n资料来源：[src/evidently/llm/rag/index.py:1-100]()\n资料来源：[src/evidently/llm/rag/splitter.py:1-50]()\n资料来源：[src/evidently/llm/datagen/base.py:1-60]()\n\n---\n\n<a id='ml-evaluation'></a>\n\n## ML Model Evaluation\n\n### 相关页面\n\n相关主题：[LLM Evaluation and Judging](#llm-evals), [Descriptors and Features System](#descriptors-features), [Presets and Metric Presets](#presets)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/metrics/classification.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/classification.py)\n- [src/evidently/metrics/regression.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/regression.py)\n- [src/evidently/metrics/data_quality.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/data_quality.py)\n- [src/evidently/metrics/dataset_statistics.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/dataset_statistics.py)\n- [src/evidently/metrics/recsys.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/recsys.py)\n- [src/evidently/metrics/embeddings.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/embeddings.py)\n- [src/evidently/legacy/metrics/classification_performance/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/metrics/classification_performance/__init__.py)\n- 
[src/evidently/legacy/metrics/regression_performance/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/metrics/regression_performance/__init__.py)\n- [src/evidently/legacy/calculations/data_drift.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/calculations/data_drift.py)\n</details>\n\n# ML Model Evaluation\n\n## Overview\n\nML Model Evaluation in Evidently is a comprehensive module that enables data scientists and ML engineers to assess, test, and monitor machine learning models throughout their lifecycle—from experiments to production. The evaluation system supports both predictive tasks (classification, regression) and recommendation systems.\n\nThe evaluation framework is built around the concept of **Metrics** and **Reports**, where metrics are individual evaluation components that can be composed into reports for comprehensive model assessment.\n\n资料来源：[src/evidently/core/report.py:1-50]()\n\n### Core Architecture\n\n```mermaid\ngraph TD\n    A[Dataset] --> B[Report]\n    A --> C[Test Suite]\n    B --> D[Interactive Report]\n    B --> E[JSON/Dict Output]\n    C --> F[Pass/Fail Results]\n    G[Metrics] --> B\n    H[Presets] --> B\n    G --> C\n```\n\n### Supported Evaluation Types\n\n| Evaluation Type | Purpose | Key Metrics |\n|----------------|---------|-------------|\n| Classification | Evaluate classifier performance | Accuracy, Precision, Recall, F1, ROC-AUC |\n| Regression | Evaluate regression models | MAE, MSE, RMSE, R2 |\n| Data Quality | Assess input data integrity | Missing values, duplicates, correlations |\n| Data Drift | Detect distribution changes | PSI, KS test, L Infinity |\n| Recommendation Systems | Evaluate recsys models | Precision@K, Recall@K, NDCG |\n| Embeddings | Evaluate embedding quality | Cosine similarity, drift detection |\n\n## Classification Metrics\n\n### Purpose and Scope\n\nClassification metrics evaluate the performance of classification models by comparing 
predicted labels against actual labels. Evidently supports both binary and multiclass classification evaluation scenarios.\n\n资料来源：[src/evidently/metrics/classification.py:1-100]()\n\n### Available Metrics\n\n| Metric | Description | Applicable Task Types |\n|--------|-------------|----------------------|\n| `Accuracy` | Proportion of correct predictions | Binary, Multiclass |\n| `Precision` | Positive predictive value | Binary, Multiclass |\n| `Recall` | Sensitivity, true positive rate | Binary, Multiclass |\n| `F1Score` | Harmonic mean of precision and recall | Binary, Multiclass |\n| `RocAuc` | Area under the ROC curve | Binary, Multiclass |\n| `ConfusionMatrix` | Cross-tabulation of predictions vs actuals | Binary, Multiclass |\n| `PrecisionRecallCurve` | Precision-Recall tradeoff visualization | Binary |\n| `ClassRepresentation` | Distribution of classes in predictions | Multiclass |\n\n### Usage Example\n\n```python\nfrom evidently import Report\nfrom evidently.metrics import Accuracy, Precision, Recall, F1Score, RocAuc\n\nreport = Report([\n    Accuracy(),\n    Precision(),\n    Recall(),\n    F1Score(),\n    RocAuc()\n])\n\nresult = report.run(current_data=current_dataset, reference_data=reference_dataset)\n```\n\n### Legacy Classification Performance\n\nThe legacy module provides comprehensive classification evaluation with detailed visualizations.\n\n资料来源：[src/evidently/legacy/metrics/classification_performance/__init__.py:1-50]()\n\nKey components include:\n- **ClassificationPerformanceMetrics**: Core metrics container\n- **Visualization widgets**: Confusion matrix plots, ROC curves, PR curves\n- **Per-class analysis**: Detailed breakdown for multiclass scenarios\n\n## Regression Metrics\n\n### Purpose and Scope\n\nRegression metrics assess the performance of regression models by computing various error measures and statistical properties of prediction residuals.\n\n资料来源：[src/evidently/metrics/regression.py:1-100]()\n\n### Available Metrics\n\n| Metric | 
Description | Unit |\n|--------|-------------|------|\n| `MeanError` | Average prediction error | Same as target |\n| `MeanAbsoluteError` | MAE - average absolute error | Same as target |\n| `MeanSquaredError` | MSE - average squared error | Squared target units |\n| `RootMeanSquaredError` | RMSE - square root of MSE | Same as target |\n| `ErrorStd` | Standard deviation of errors | Same as target |\n| `ErrorNormality` | Shapiro-Wilk test for residual normality | p-value |\n| `R2Score` | Coefficient of determination | Dimensionless |\n| `ErrorPercentile` | Percentile-based error analysis | Same as target |\n\n### Usage Example\n\n```python\nfrom evidently import Report\nfrom evidently.metrics import (\n    MeanAbsoluteError,\n    MeanSquaredError,\n    R2Score,\n    ErrorPercentile\n)\n\nreport = Report([\n    MeanAbsoluteError(),\n    MeanSquaredError(),\n    R2Score(),\n    ErrorPercentile(percentile=95)\n])\n\nresult = report.run(current_data=current_dataset, reference_data=reference_dataset)\n```\n\n### Legacy Regression Performance\n\nThe legacy regression performance module provides time-series analysis of predictions.\n\n资料来源：[src/evidently/legacy/metrics/regression_performance/__init__.py:1-50]()\n\nFeatures include:\n- **Predicted vs Actual plots**: Time-series visualization\n- **Residual distribution analysis**: Histograms and QQ plots\n- **Error distribution by feature**: Feature-level error breakdown\n\n```mermaid\ngraph LR\n    A[Current Data] --> B[RegressionReport]\n    C[Reference Data] --> B\n    B --> D[Predicted vs Actual]\n    B --> E[Error Distribution]\n    B --> F[Performance Metrics]\n```\n\n## Data Quality Metrics\n\n### Purpose and Scope\n\nData quality metrics evaluate the integrity and quality of input data. 
These metrics are essential for understanding whether model predictions are reliable and for identifying data pipeline issues.\n\n资料来源：[src/evidently/metrics/data_quality.py:1-100]()\n\n### Available Metrics\n\n| Metric | Description |\n|--------|-------------|\n| `ColumnCount` | Number of columns in dataset |\n| `RowCount` | Number of rows in dataset |\n| `ValueStats` | Statistical summary (min, max, mean, std) |\n| `MissingValuesMetric` | Count and percentage of missing values |\n| `UniqueValuesCount` | Number of unique values per column |\n| `DataInconsistencyMetric` | Detection of data inconsistencies |\n| `TextStats` | Statistics for text columns (length, word count) |\n\n### Usage Example\n\n```python\nfrom evidently import Report, Dataset\nfrom evidently.metrics import ColumnCount, ValueStats, MissingValuesMetric\n\ndataset = Dataset.from_pandas(df, data_definition=DataDefinition())\n\nreport = Report([\n    ColumnCount(),\n    ValueStats(column=\"target\"),\n    MissingValuesMetric()\n])\n\nresult = report.run(dataset, None)\n```\n\n## Dataset Statistics\n\n### Purpose and Scope\n\nDataset statistics provide comprehensive descriptive statistics about datasets, enabling quick overview and comparison between reference and current data distributions.\n\n资料来源：[src/evidently/metrics/dataset_statistics.py:1-100]()\n\n### Available Presets\n\n| Preset | Description |\n|--------|-------------|\n| `DataSummaryPreset` | Complete dataset overview |\n| `DataDriftPreset` | Distribution comparison between reference and current |\n| `TargetDriftPreset` | Target variable drift analysis |\n| `NumTargetDriftPreset` | Numeric target drift analysis |\n| `CatTargetDriftPreset` | Categorical target drift analysis |\n\n### Statistical Tests for Drift Detection\n\n资料来源：[src/evidently/legacy/calculations/data_drift.py:1-100]()\n\n| Test | Applicable Data Type | Description |\n|------|---------------------|-------------|\n| `KS` | Numerical | Kolmogorov-Smirnov test |\n| `ZScore` | 
Numerical | Z-score based drift detection |\n| `TTest` | Numerical | Two-sample t-test for mean comparison |\n| `ChiSquare` | Categorical | Chi-square test for category distribution |\n| `PSI` | Both | Population Stability Index |\n| `L Infinity` | Both | Max absolute difference |\n\n### Data Drift Workflow\n\n```mermaid\ngraph TD\n    A[Reference Dataset] --> F[Drift Detection Engine]\n    B[Current Dataset] --> F\n    F --> C[Statistical Tests]\n    F --> D[Distribution Comparison]\n    C --> E[Drift Report]\n    D --> E\n    E --> G{Drift Detected?}\n    G -->|Yes| H[Alert / Action]\n    G -->|No| I[Continue Monitoring]\n```\n\n## Recommendation System Metrics\n\n### Purpose and Scope\n\nRecommendation system (recsys) metrics evaluate the quality of recommendations generated by recommendation models.\n\n资料来源：[src/evidently/metrics/recsys.py:1-100]()\n\n### Available Metrics\n\n| Metric | Description | K-Parameter |\n|--------|-------------|-------------|\n| `PrecisionAtK` | Precision at top-K recommendations | Required |\n| `RecallAtK` | Recall at top-K recommendations | Required |\n| `NDCGAtK` | Normalized Discounted Cumulative Gain at K | Required |\n| `HitRateAtK` | Hit rate at top-K recommendations | Optional |\n| `MRRAtK` | Mean Reciprocal Rank at K | Required |\n\n### Usage Example\n\n```python\nfrom evidently import Report\nfrom evidently.metrics import PrecisionAtK, RecallAtK, NDCGAtK\n\nreport = Report([\n    PrecisionAtK(k=10),\n    RecallAtK(k=10),\n    NDCGAtK(k=10)\n])\n\nresult = report.run(current_data=recs_dataset, reference_data=ref_recs_dataset)\n```\n\n## Embeddings Metrics\n\n### Purpose and Scope\n\nEmbeddings metrics evaluate the quality and drift of vector embeddings, which are critical for LLM and semantic search applications.\n\n资料来源：[src/evidently/metrics/embeddings.py:1-100]()\n\n### Available Metrics\n\n| Metric | Description |\n|--------|-------------|\n| `Embedding Drift` | Detect drift in embedding distributions |\n| `Cosine 
Similarity` | Average cosine similarity between embeddings |\n| `Retrieval Metrics` | Evaluate RAG and retrieval system quality |\n\n### Key Features\n\n- **Semantic Drift Detection**: Identify changes in embedding space distribution\n- **Cluster Analysis**: Analyze embedding cluster stability\n- **Pairwise Comparison**: Compare individual embedding pairs\n\n## Report Configuration\n\n### Creating a Report\n\n```python\nfrom evidently import Report\nfrom evidently.presets import DataDriftPreset\n\n# Basic report with preset\nreport = Report([DataDriftPreset()])\n\n# Report with custom metadata\nreport = Report(\n    metrics=[...],\n    metadata={\"model_version\": \"v1.2.3\"},\n    tags=[\"production\", \"monthly\"]\n)\n\n# Run with reference data for comparison\nsnapshot = report.run(current_dataset, reference_dataset)\n```\n\n### Report Output Formats\n\n| Format | Method | Use Case |\n|--------|--------|----------|\n| Interactive | Direct notebook display | Exploration, debugging |\n| JSON | `.json()` | API responses, CI/CD |\n| Python Dict | `.dict()` | Programmatic access |\n| HTML | Export functionality | Shareable reports |\n\n## Test Suite Integration\n\nReports can be converted to **Test Suites** by adding pass/fail conditions, enabling automated regression testing.\n\n```python\nfrom evidently.test_suite import TestSuite\nfrom evidently.tests import TestColumnValue\n\nsuite = TestSuite([\n    TestColumnValue(column=\"accuracy\", gt=0.95),\n])\n\nsuite.run(current_data=dataset, reference_data=reference)\n```\n\n## Best Practices\n\n### 1. Always Use Reference Data for Comparison\nFor meaningful evaluation, maintain a reference dataset representing expected data distribution or model performance.\n\n### 2. 
Choose Appropriate Metrics by Task Type\n\n| Task Type | Recommended Metrics |\n|-----------|--------------------|\n| Binary Classification | Accuracy, Precision, Recall, F1, ROC-AUC |\n| Multiclass Classification | Per-class F1, Confusion Matrix, Macro/Micro Avg |\n| Regression | MAE, RMSE, R2, Error Percentiles |\n| Recommendation | Precision@K, Recall@K, NDCG@K |\n| Data Quality | Missing values, duplicates, consistency |\n\n### 3. Set Up Monitoring Schedules\nFor production models, schedule regular evaluations to detect performance degradation and data drift early.\n\n### 4. Use Auto-generated Test Conditions\nEvidently can auto-generate test thresholds from reference data:\n```python\nfrom evidently.presets import DataDriftPreset\n\nsuite = TestSuite.from_reference(reference_data, [DataDriftPreset()])\n```\n\n## Summary\n\nEvidently's ML Model Evaluation module provides a comprehensive, modular framework for evaluating machine learning models across multiple dimensions:\n\n- **Classification**: Binary and multiclass classification metrics with visualizations\n- **Regression**: Comprehensive error metrics and residual analysis\n- **Data Quality**: Input data integrity checks\n- **Data Drift**: Distribution comparison and statistical tests\n- **Recommendation Systems**: Top-K recommendation quality metrics\n- **Embeddings**: Semantic drift detection for LLM applications\n\nThe system supports both one-off evaluations and continuous monitoring, with seamless integration into CI/CD pipelines through Test Suites.\n\n---\n\n<a id='llm-evals'></a>\n\n## LLM Evaluation and Judging\n\n### 相关页面\n\n相关主题：[ML Model Evaluation](#ml-evaluation), [Descriptors and Features System](#descriptors-features)\n\n<details>\n<summary>Relevant Source Files</summary>\n\nThe following source files were used to generate this documentation:\n\n- [src/evidently/descriptors/llm_judges.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/llm_judges.py)\n- 
[src/evidently/descriptors/generated_descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/generated_descriptors.py)\n- [src/evidently/llm/prompts/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/prompts/__init__.py)\n- [src/evidently/llm/utils/parsing.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/utils/parsing.py)\n- [src/evidently/llm/utils/prompt_render.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/utils/prompt_render.py)\n- [src/evidently/llm/optimization/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/optimization/__init__.py)\n- [src/evidently/llm/optimization/scorers.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/optimization/scorers.py)\n- [src/evidently/legacy/descriptors/llm_judges.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/descriptors/llm_judges.py)\n- [src/evidently/legacy/descriptors/sentiment_descriptor.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/descriptors/sentiment_descriptor.py)\n</details>\n\n# LLM Evaluation and Judging\n\n## Overview\n\nEvidently provides a comprehensive framework for evaluating Large Language Model (LLM) powered systems through specialized descriptors called **LLM Judges**. 
These judges leverage LLM APIs to automatically assess and score various quality dimensions of LLM outputs, including relevance, coherence, factual accuracy, and task completion.\n\nLLM Judges serve as evaluators that can be integrated into monitoring pipelines to continuously assess LLM performance without requiring manual human evaluation.\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Dataset with LLM Inputs/Outputs] --> B[LLM Judge Descriptors]\n    B --> C[Prompt Templates]\n    C --> D[LLM Provider API]\n    D --> E[Parsed Evaluation Results]\n    E --> F[Feature Descriptors]\n    F --> G[Metrics & Tests]\n    G --> H[Evidently Reports/Monitors]\n    \n    B1[ContextRelevanceLLMEval] --> B\n    B2[CompletenessLLMEval] --> B\n    B3[ContextQualityLLMEval] --> B\n    B4[GroundednessLLMEval] --> B\n    B5[RelevanceLLMEval] --> B\n    \n    C1[System Prompt] --> C\n    C2[User Query] --> C\n    C3[Evaluation Criteria] --> C\n```\n\n## Core Components\n\n### LLM Judge Types\n\nEvidently implements multiple specialized LLM judges for different evaluation dimensions:\n\n| Judge Type | Purpose | Output |\n|------------|---------|--------|\n| `ContextRelevanceLLMEval` | Measures relevance of retrieved context to the query | Score 0-1, reasoning |\n| `CompletenessLLMEval` | Evaluates completeness of response coverage | Score 0-1, reasoning |\n| `ContextQualityLLMEval` | Assesses quality of context for answering | Score 0-1, reasoning |\n| `GroundednessLLMEval` | Checks if response is grounded in provided context | Score 0-1, reasoning |\n| `RelevanceLLMEval` | Evaluates semantic relevance of response to query | Score 0-1, reasoning |\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py:1-200]()\n\n### Feature Descriptor Pattern\n\nLLM Judges follow the `FeatureDescriptor` pattern, wrapping LLM evaluation features with optional aliasing and test definitions:\n\n```python\nFeatureDescriptor(\n    feature=feature,\n    alias=alias,  # Custom display name\n    
tests=tests   # Optional validation tests\n)\n```\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py:30-35]()\n\n## Configuration Options\n\n### Common Parameters\n\nAll LLM Judge functions share a common parameter structure:\n\n| Parameter | Type | Required | Default | Description |\n|-----------|------|----------|---------|-------------|\n| `column_name` | `str` | Yes | - | Name of the text column to evaluate |\n| `provider` | `str` | No | `\"openai\"` | LLM provider (openai, anthropic, etc.) |\n| `model` | `str` | No | `\"gpt-4o-mini\"` | Model identifier |\n| `additional_columns` | `Dict[str, str]` | No | `None` | Extra context columns for evaluation |\n| `include_category` | `bool` | No | `None` | Include categorical classification in output |\n| `include_score` | `bool` | No | `None` | Include numeric score in output |\n| `include_reasoning` | `bool` | No | `None` | Include explanation in output |\n| `uncertainty` | `Uncertainty` | No | `None` | Uncertainty handling strategy |\n| `alias` | `str` | No | `None` | Custom name for the feature |\n| `tests` | `List` | No | `None` | Tests to apply to the feature |\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py:50-70]()\n\n### Uncertainty Handling\n\nThe uncertainty parameter enables robust evaluation by detecting when the LLM is uncertain about its assessment:\n\n```python\nuncertainty: Optional[Uncertainty] = None\n```\n\nThis allows the system to flag low-confidence evaluations rather than returning potentially incorrect scores.\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py:58]()\n\n## Prompt System\n\n### Prompt Templates\n\nEvidently uses structured prompt templates for LLM evaluation:\n\n```python\n@dataclasses.dataclass\nclass LLMJudgePrompts:\n    system: str\n    user: str\n```\n\nPrompts are rendered with dynamic variables including:\n- Query text\n- Response text\n- Reference context\n- Evaluation criteria\n\n资料来源：[src/evidently/llm/prompts/__init__.py]()\n\n### Prompt 
Rendering\n\nThe prompt rendering system processes templates with type-safe variable substitution:\n\n```python\ndef render_prompt(\n    template: PromptTemplate,\n    **kwargs: Any\n) -> str:\n```\n\n资料来源：[src/evidently/llm/utils/prompt_render.py]()\n\n## Output Parsing\n\n### Response Parsing\n\nEvaluation results are parsed using structured output extraction:\n\n| Parser | Purpose |\n|--------|---------|\n| `parse_json_response()` | Extract JSON-structured evaluations |\n| `parse_score()` | Extract numeric scores |\n| `parse_reasoning()` | Extract explanation text |\n| `parse_category()` | Extract categorical labels |\n\n资料来源：[src/evidently/llm/utils/parsing.py]()\n\n### Score Calculation\n\nScores are calculated based on the evaluation criteria defined in each judge:\n\n- **0.0**: Does not meet criteria\n- **0.5**: Partially meets criteria\n- **1.0**: Fully meets criteria\n\nThe score can be combined with reasoning and category outputs for comprehensive evaluation reports.\n\n## Integration with Evidently Reports\n\n### Usage in Reports\n\nLLM Judges can be added to Evidently reports:\n\n```python\nfrom evidently.llm import RelevanceLLMEval\n\nreport = Report(metrics=[\n    RelevanceLLMEval(\n        column_name=\"response\",\n        include_score=True,\n        include_reasoning=True\n    )\n])\n```\n\n### Usage in Monitoring\n\nFor continuous monitoring, LLM Judges are integrated into dashboard widgets:\n\n```python\nfrom evidently.dashboard import Dashboard\n\ndashboard = Dashboard([\n    RelevanceLLMEval(column_name=\"response\")\n])\n```\n\n资料来源：[ui/packages/evidently-ui-lib/src/components/Descriptors/Features/LLMJudge/template.tsx]()\n\n## UI Components\n\n### LLMJudgeTemplate Component\n\nThe frontend provides visualization for LLM judge configurations:\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `state` | `LLMJudgeState` | Current judge configuration |\n| `errors` | `FormErrors` | Validation errors |\n| `availableTags` | 
`string[]` | Available categorization tags |\n\n资料来源：[ui/packages/evidently-ui-lib/src/components/Descriptors/Features/LLMJudge/template.tsx]()\n\n### Visualization States\n\nThe UI renders different states for judge configurations:\n\n1. **Uncertainty Configuration**: Shows uncertainty handling options\n2. **Multiclass Classification**: Displays class criteria for classification modes\n3. **Criteria Preview**: Shows evaluation criteria as formatted text\n4. **Output Options**: Displays configured output fields (reasoning, category, score)\n\n## Legacy Components\n\n### Legacy LLM Judges\n\nThe legacy implementation in `evidently.legacy` provides backward compatibility:\n\n```python\nfrom evidently.legacy.descriptors.llm_judges import (\n    CompletenessLLMEval as CompletenessLLMEvalV1\n)\n```\n\n资料来源：[src/evidently/legacy/descriptors/llm_judges.py]()\n\n### Sentiment Descriptor\n\nSpecialized descriptor for sentiment analysis:\n\n```python\nfrom evidently.legacy.descriptors.sentiment_descriptor import SentimentDescriptor\n```\n\nFeatures include:\n- Sentiment polarity detection (positive, negative, neutral)\n- Confidence scoring\n- Integration with reporting pipeline\n\n资料来源：[src/evidently/legacy/descriptors/sentiment_descriptor.py]()\n\n## Scorers and Optimization\n\n### LLM Scorers\n\nScorers provide optimized evaluation functions:\n\n```python\nfrom evidently.llm.optimization.scorers import LLMScorer\n```\n\nScorers support:\n- Batch evaluation\n- Caching of results\n- Configurable thresholds\n\n资料来源：[src/evidently/llm/optimization/scorers.py]()\n\n## Best Practices\n\n### 1. Choosing Evaluation Dimensions\n\n| Use Case | Recommended Judges |\n|----------|---------------------|\n| RAG Systems | ContextRelevance, Groundedness, Relevance |\n| Summarization | Completeness, Relevance |\n| Q&A Systems | ContextQuality, Groundedness |\n| General Chat | All dimensions |\n\n### 2. 
Score Interpretation\n\n- **0.0 - 0.3**: Significant issues detected\n- **0.4 - 0.6**: Partial criteria met\n- **0.7 - 1.0**: Criteria well satisfied\n\n### 3. Uncertainty Handling\n\nEnable uncertainty handling when:\n- Operating on edge cases\n- Using smaller models\n- Evaluating ambiguous inputs\n\n## See Also\n\n- [Evidently Documentation](https://docs.evidentlyai.com)\n- [API Reference](https://evidentlyai.github.io/evidently/api-reference)\n- [Dashboard Components](../dashboard/)\n- [Prompt Templates](../llm/prompts/)\n\n---\n\n<a id='descriptors-features'></a>\n\n## Descriptors and Features System\n\n### 相关页面\n\n相关主题：[LLM Evaluation and Judging](#llm-evals), [ML Model Evaluation](#ml-evaluation)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/descriptors/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/__init__.py)\n- [src/evidently/descriptors/_text_length.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/_text_length.py)\n- [src/evidently/descriptors/_custom_descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/_custom_descriptors.py)\n- [src/evidently/descriptors/generated_descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/generated_descriptors.py)\n- [src/evidently/descriptors/text_match.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/text_match.py)\n- [src/evidently/core/registries/descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/registries/descriptors.py)\n- [src/evidently/legacy/descriptors/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/descriptors/__init__.py)\n- [src/evidently/legacy/features/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/features/__init__.py)\n</details>\n\n# Descriptors and Features System\n\n## 
Overview\n\nThe Descriptors and Features System is a core component of the Evidently framework that provides extensible ways to extract, compute, and evaluate characteristics from data columns. This system enables users to define custom descriptors that wrap features with validation tests, making it easy to assess data quality, text properties, and model behavior in a unified evaluation pipeline.\n\nDescriptors serve as wrappers around feature implementations, adding metadata, display names, and optional test configurations. The system supports both legacy v1 features and modern descriptor implementations, providing backward compatibility while enabling new functionality.\n\n## Architecture\n\nThe system follows a layered architecture with clear separation between descriptor definitions, feature implementations, and registry management.\n\n```mermaid\ngraph TD\n    subgraph \"Public API Layer\"\n        PD[Public Descriptors<br/>TextMatch, HuggingFace, etc.]\n        CD[Custom Descriptors<br/>FeatureDescriptor]\n    end\n    \n    subgraph \"Registry Layer\"\n        REG[DescriptorRegistry]\n    end\n    \n    subgraph \"Implementation Layer\"\n        LEG[Legacy Features V1<br/>hf_feature, text_length]\n        NEW[New Features<br/>_text_length, text_match]\n    end\n    \n    subgraph \"Evaluation Layer\"\n        EVAL[Evidently Metrics<br/>and Tests]\n    end\n    \n    PD --> REG\n    CD --> REG\n    REG --> LEG\n    REG --> NEW\n    LEG --> EVAL\n    NEW --> EVAL\n    \n    style PD fill:#90EE90\n    style CD fill:#87CEEB\n    style REG fill:#FFD700\n    style LEG fill:#FFA07A\n    style NEW fill:#DDA0DD\n```\n\n### Directory Structure\n\n```\nsrc/evidently/\n├── descriptors/\n│   ├── __init__.py              # Public API exports\n│   ├── _text_length.py          # Text length descriptor implementation\n│   ├── _custom_descriptors.py   # FeatureDescriptor class\n│   ├── generated_descriptors.py # Generated descriptor functions\n│   └── text_match.py            
# Text matching descriptor\n├── core/\n│   └── registries/\n│       └── descriptors.py       # Descriptor registry implementation\n└── legacy/\n    ├── descriptors/             # Legacy descriptor implementations\n    └── features/                # Legacy feature implementations\n```\n\n## Core Components\n\n### FeatureDescriptor\n\nThe `FeatureDescriptor` class is the central abstraction that wraps a feature with additional metadata and optional tests.\n\n```python\n# Source: src/evidently/descriptors/_custom_descriptors.py\nclass FeatureDescriptor(BaseDescriptor):\n    \"\"\"Descriptor that wraps a feature with tests and metadata.\"\"\"\n    \n    def __init__(\n        self,\n        feature: AnyFeatureType,\n        alias: Optional[str] = None,\n        tests: Optional[List[Union[\"DescriptorTest\", \"GenericTest\"]]] = None,\n    ):\n        self.feature = feature\n        self.alias = alias\n        self.tests = tests or []\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `feature` | `AnyFeatureType` | The underlying feature implementation |\n| `alias` | `Optional[str]` | Display name for the descriptor |\n| `tests` | `Optional[List]` | Optional list of tests to apply |\n\n### DescriptorRegistry\n\nThe registry manages descriptor registration and lookup, enabling dynamic descriptor discovery.\n\n```python\n# Source: src/evidently/core/registries/descriptors.py\nclass DescriptorRegistry:\n    \"\"\"Registry for managing descriptor instances.\"\"\"\n    \n    def __init__(self):\n        self._descriptors: Dict[str, Descriptor] = {}\n    \n    def register(self, name: str, descriptor: Descriptor) -> None:\n        \"\"\"Register a descriptor with a given name.\"\"\"\n        self._descriptors[name] = descriptor\n    \n    def get(self, name: str) -> Optional[Descriptor]:\n        \"\"\"Retrieve a descriptor by name.\"\"\"\n        return self._descriptors.get(name)\n```\n\n## Available Descriptors\n\n### Text 
Length Descriptor\n\nComputes text length statistics for string columns, including character count, word count, and sentence statistics.\n\n**Source:** `src/evidently/descriptors/_text_length.py`\n\n```python\ndef TextLength(\n    column_name: str,\n    alias: Optional[str] = None,\n    mode: str = \"chars\",\n    tests: Optional[List[Union[\"DescriptorTest\", \"GenericTest\"]]] = None,\n):\n    \"\"\"Compute text length metrics for a column.\n    \n    Args:\n        column_name: Name of the text column to analyze.\n        alias: Display name for the descriptor.\n        mode: Measurement mode - \"chars\" or \"words\".\n        tests: Optional list of tests to apply.\n    \"\"\"\n    from evidently.legacy.features.text_length import TextLength as TextLengthV1\n    \n    feature = TextLengthV1(column_name=column_name, mode=mode, display_name=alias)\n    return FeatureDescriptor(feature=feature, alias=alias, tests=tests)\n```\n\n**Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `column_name` | `str` | Required | Column to analyze |\n| `alias` | `Optional[str]` | `None` | Custom display name |\n| `mode` | `str` | `\"chars\"` | `\"chars\"` for characters, `\"words\"` for word count |\n| `tests` | `Optional[List]` | `None` | Tests to attach |\n\n### Text Match Descriptor\n\nMatches text content against patterns or lists, useful for validation and filtering.\n\n**Source:** `src/evidently/descriptors/text_match.py`\n\n```python\ndef TextMatch(\n    column_name: str,\n    alias: str,\n    words_list: List[str],\n    mode: str = \"match\",\n    lemmatize: bool = False,\n    tests: Optional[List[Union[\"DescriptorTest\", \"GenericTest\"]]] = None,\n):\n    \"\"\"Match text against a word list.\n    \n    Args:\n        column_name: Text column to match against.\n        alias: Name for the descriptor.\n        words_list: List of words/patterns to match.\n        mode: Matching mode - \"match\", \"any\", 
\"all\".\n        lemmatize: Whether to apply lemmatization.\n        tests: Optional test list.\n    \"\"\"\n    from evidently.legacy.features.text_features import TextMatch as TextMatchV1\n    \n    feature = TextMatchV1(\n        column_name=column_name,\n        words_list=words_list,\n        mode=mode,\n        lemmatize=lemmatize,\n        display_name=alias\n    )\n    return FeatureDescriptor(feature=feature, alias=alias, tests=tests)\n```\n\n**Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `column_name` | `str` | Required | Target text column |\n| `alias` | `str` | Required | Descriptor name |\n| `words_list` | `List[str]` | Required | Words to match |\n| `mode` | `str` | `\"match\"` | `\"match\"`, `\"any\"`, or `\"all\"` |\n| `lemmatize` | `bool` | `False` | Enable lemmatization |\n| `tests` | `Optional[List]` | `None` | Tests to apply |\n\n### HuggingFace Descriptor\n\nApplies HuggingFace models to text columns for various NLP tasks including toxicity detection.\n\n**Source:** `src/evidently/descriptors/generated_descriptors.py`\n\n```python\ndef HuggingFace(\n    column_name: str,\n    model: str,\n    params: dict,\n    alias: str,\n    tests: Optional[List[Union[\"DescriptorTest\", \"GenericTest\"]]] = None,\n):\n    \"\"\"Apply a HuggingFace model to text column.\n    \n    Args:\n        column_name: Name of the text column to process.\n        model: HuggingFace model name or path.\n        params: Additional parameters for the model.\n        alias: Alias for the descriptor.\n        tests: Optional list of tests to apply.\n    \"\"\"\n    from evidently.legacy.features.hf_feature import HuggingFaceFeature\n    \n    feature = HuggingFaceFeature(\n        column_name=column_name,\n        model=model,\n        params=params,\n        display_name=alias\n    )\n    return FeatureDescriptor(feature=feature, alias=alias, tests=tests)\n```\n\n### HuggingFace Toxicity 
Descriptor\n\nSpecialized descriptor for detecting toxic content using HuggingFace models.\n\n**Source:** `src/evidently/descriptors/generated_descriptors.py:39-56`\n\n```python\ndef HuggingFaceToxicity(\n    column_name: str,\n    alias: str,\n    model: Optional[str] = None,\n    toxic_label: Optional[str] = None,\n    tests: Optional[List[Union[\"DescriptorTest\", \"GenericTest\"]]] = None,\n):\n    \"\"\"Detect toxicity in text using HuggingFace models.\n    \n    Args:\n        column_name: Name of the text column to check.\n        alias: Alias for the descriptor.\n        model: HuggingFace model name or path. If None, uses default.\n        toxic_label: Label for toxic content.\n        tests: Optional list of tests to apply.\n    \"\"\"\n```\n\n## Integration with Evidently Metrics\n\nDescriptors are designed to integrate seamlessly with Evidently's metric and test evaluation system. When a descriptor is included in an evaluation, the underlying feature is computed and the results are automatically formatted for display.\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Descriptor Definition]\n    B --> C[Feature Computation]\n    C --> D[Test Evaluation]\n    D --> E[Metric Results]\n    C --> F[Visualization Data]\n    F --> G[HTML Widgets]\n    \n    style A fill:#87CEEB\n    style E fill:#90EE90\n    style G fill:#FFD700\n```\n\n## Legacy Features System\n\nThe legacy features system (`evidently.legacy.features`) provides backward compatibility for existing integrations. 
These features are wrapped by modern descriptors but can also be used directly.\n\n**Source:** `src/evidently/legacy/features/__init__.py`\n\n### Available Legacy Features\n\n| Feature | Module | Purpose |\n|---------|--------|---------|\n| `TextLength` | `text_length` | Text length statistics |\n| `TextMatch` | `text_features` | Pattern matching in text |\n| `HuggingFaceFeature` | `hf_feature` | HuggingFace model integration |\n\n## Usage Example\n\n```python\nfrom evidently.descriptors import TextLength, TextMatch, HuggingFaceToxicity\n\n# Create descriptors for evaluation\ntext_length_desc = TextLength(\n    column_name=\"review_text\",\n    alias=\"Review Length\",\n    mode=\"words\"\n)\n\ntoxicity_desc = HuggingFaceToxicity(\n    column_name=\"review_text\",\n    alias=\"Toxicity Score\",\n    toxic_label=\"toxic\"\n)\n\n# Use in Evidently Dashboard\ndashboard = Dashboard(metrics=[\n    ColumnMetrics(column_name=\"review_text\")\n        .with_descriptors([\n            text_length_desc,\n            toxicity_desc\n        ])\n])\n```\n\n## Descriptor Tests Integration\n\nDescriptors can be associated with tests that validate the computed values:\n\n```python\nfrom evidently.descriptors import TextLength\nfrom evidently.test_suite import ColumnTestSuite\n\ntest_suite = ColumnTestSuite(\n    tests=[\n        TextLength(\n            column_name=\"description\",\n            alias=\"Description Length\",\n            tests=[\n                # Test within the descriptor\n                GreaterThan(\"Description Length\", threshold=10),\n                LessThan(\"Description Length\", threshold=500)\n            ]\n        )\n    ]\n)\n```\n\n## Summary\n\nThe Descriptors and Features System provides a flexible, extensible mechanism for computing and evaluating column characteristics within Evidently. 
Key aspects include:\n\n- **Wrapper Pattern**: `FeatureDescriptor` wraps features with metadata and tests\n- **Registry Pattern**: Centralized descriptor management via `DescriptorRegistry`\n- **Backward Compatibility**: Legacy features wrapped for v1 compatibility\n- **Test Integration**: Native support for attaching tests to descriptors\n- **Visualization**: Automatic integration with Evidently's HTML rendering pipeline\n\nThis architecture enables users to define custom descriptors for any data transformation or evaluation need while maintaining consistency with the Evidently evaluation framework.\n\n---\n\n<a id='reports-test-suites'></a>\n\n## Reports and Test Suites\n\n### 相关页面\n\n相关主题：[Presets and Metric Presets](#presets), [Custom Metrics and Extensibility](#custom-metrics), [Core Components](#core-components)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/core/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/report.py)\n- [src/evidently/core/tests.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/tests.py)\n- [src/evidently/core/compare.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/compare.py)\n- [src/evidently/core/serialization.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/serialization.py)\n- [src/evidently/legacy/report/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/report/report.py)\n- [src/evidently/legacy/test_suite/test_suite.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/test_suite/test_suite.py)\n- [src/evidently/metrics/_legacy.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/_legacy.py)\n</details>\n\n# Reports and Test Suites\n\n## Overview\n\nEvidently provides two primary evaluation constructs for ML and LLM-powered systems: **Reports** and **Test Suites**. 
Both are designed to evaluate, analyze, and monitor data and model quality, but they serve different purposes and use cases.\n\n| Component | Purpose | Use Case |\n|-----------|---------|----------|\n| **Report** | Generate comprehensive analysis with metrics, visualizations, and insights | Exploratory analysis, documentation, stakeholder communication |\n| **Test Suite** | Run structured checks with pass/fail outcomes | CI/CD pipelines, automated quality gates, regression testing |\n\n资料来源：[src/evidently/legacy/report/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/report/report.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    subgraph \"Core Evaluation Engine\"\n        A[Dataset / DataDefinition] --> B[Report / Test Suite]\n        C[Descriptors / Metrics] --> B\n        D[Test Cases] --> B\n    end\n    \n    subgraph \"Report Output\"\n        B --> E[Metric Results]\n        B --> F[Visualizations]\n        B --> G[JSON/HTML Export]\n    end\n    \n    subgraph \"Test Suite Output\"\n        B --> H[Test Results]\n        H --> I[Pass / Fail Status]\n        H --> J[Failure Details]\n    end\n```\n\n### Core Components\n\nThe evaluation system is built on several key components:\n\n| Component | File Location | Role |\n|-----------|---------------|------|\n| `Report` | `src/evidently/core/report.py` | Main class for generating analytical reports |\n| `TestSuite` | `src/evidently/core/tests.py` | Orchestrates test execution |\n| `Dataset` | Core data structure | Holds data with schema definitions |\n| `DataDefinition` | Schema definition | Defines column types and expectations |\n| `Descriptor` | Feature extraction | Row-level evaluators (e.g., Sentiment, TextLength) |\n| `Metric` | Metric calculation | Computes statistical measures |\n\n资料来源：[src/evidently/core/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/report.py)\n\n## Datasets and Data Definition\n\n### Dataset 
Structure\n\nEvidently uses a `Dataset` object to wrap pandas DataFrames with additional metadata:\n\n```python\nfrom evidently import Dataset, DataDefinition\n\neval_dataset = Dataset.from_pandas(\n    pd.DataFrame(eval_df),\n    data_definition=DataDefinition(),\n    descriptors=[\n        Sentiment(\"answer\", alias=\"Sentiment\"),\n        TextLength(\"answer\", alias=\"Length\"),\n        Contains(\"answer\", items=[\"sorry\", \"apologize\"], mode=\"any\")\n    ]\n)\n```\n\n资料来源：[README.md](https://github.com/evidentlyai/evidently/blob/main/README.md)\n\n### DataDefinition\n\nThe `DataDefinition` class defines the schema for the dataset:\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `column_schema` | dict | Maps column names to data types |\n| `relationships` | list | Defines relationships between datasets |\n| `timestamp_column` | str | Column containing datetime values |\n\n## Descriptors\n\nDescriptors are row-level evaluators that extract features or apply checks to individual rows. 
They can be used within datasets or standalone.\n\n### Available Descriptors\n\n| Descriptor | Purpose | Parameters |\n|------------|---------|------------|\n| `Sentiment` | Detect sentiment in text | `column_name`, `alias` |\n| `TextLength` | Measure text length | `column_name`, `alias` |\n| `Contains` | Check for substring presence | `column_name`, `items`, `mode`, `case_sensitive` |\n| `BeginsWith` | Verify text prefix | `column_name`, `prefix`, `case_sensitive` |\n| `HuggingFace` | Apply HF models | `column_name`, `model`, `params` |\n| `HuggingFaceToxicity` | Detect toxic content | `column_name`, `model`, `toxic_label` |\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/generated_descriptors.py)\n\n### Descriptor Parameters\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `column_name` | str | Yes | Name of the column to evaluate |\n| `alias` | str | No | Display name for the result |\n| `tests` | list | No | Optional test cases to apply |\n| `mode` | str | No | Matching mode: `\"any\"` or `\"all\"` |\n| `case_sensitive` | bool | No | Whether comparison is case-sensitive |\n\n## Reports\n\n### Creating a Report\n\nReports generate comprehensive HTML or JSON output with metrics and visualizations:\n\n```python\nfrom evidently import Report\nfrom evidently.presets import TextEvals\n\nreport = Report(\n    metrics=[\n        TextEvals(),\n    ]\n)\n\nresult = report.run(reference_data=reference_df, current_data=current_df)\nreport.save_html(\"report.html\")\n```\n\n资料来源：[src/evidently/legacy/report/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/report/report.py)\n\n### Report Configuration\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `metrics` | list | Required | List of metrics to compute |\n| `timestamp` | datetime | Now | Report 
generation time |\n| `include_tests` | bool | False | Include test results in report |\n\n### Report Output Formats\n\n| Format | Method | Use Case |\n|--------|--------|----------|\n| HTML | `save_html(path)` | Interactive visualization |\n| JSON | `save_json(path)` or `as_dict()` | Machine parsing, APIs |\n| Python dict | `as_dict()` | Programmatic access |\n\n资料来源：[src/evidently/core/serialization.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/serialization.py)\n\n## Test Suites\n\n### Creating a Test Suite\n\nTest Suites run structured checks and return pass/fail status:\n\n```python\nfrom evidently.test_suite import TestSuite\n\nsuite = TestSuite(tests=[\n    # Test definitions\n])\n\nresult = suite.run(reference_data=reference_df, current_data=current_df)\nsuite.save(\"test_results.json\")\n```\n\n资料来源：[src/evidently/legacy/test_suite/test_suite.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/test_suite/test_suite.py)\n\n### Test Suite Workflow\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Test Suite]\n    B --> C{Execute Tests}\n    C --> D[Test 1]\n    C --> E[Test 2]\n    C --> F[Test N]\n    D --> G[Test Results]\n    E --> G\n    F --> G\n    G --> H{Pass All?}\n    H -->|Yes| I[Success]\n    H -->|No| J[Failure Report]\n```\n\n### Test Results Structure\n\n| Field | Type | Description |\n|-------|------|-------------|\n| `status` | str | `\"PASSED\"`, `\"FAILED\"`, `\"WARNING\"` |\n| `name` | str | Test identifier |\n| `group` | str | Test category |\n| `details` | dict | Additional context and metrics |\n| `timestamp` | datetime | When the test was run |\n\n## LLM Evaluation Features\n\n### LLM-Powered Judgments\n\nEvidently supports LLM-based evaluation through specialized descriptors:\n\n```python\nfrom evidently.descriptors import (\n    ContextQualityLLMEval,\n    CompletenessLLMEval,\n)\n```\n\n| Descriptor | Purpose | Key Parameters 
|\n|------------|---------|----------------|\n| `ContextQualityLLMEval` | Evaluate context relevance | `question`, `provider`, `model` |\n| `CompletenessLLMEval` | Check response completeness | `context`, `provider`, `model`, `include_reasoning` |\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/generated_descriptors.py)\n\n### LLM Provider Configuration\n\n| Provider | Model Default | Configuration |\n|----------|---------------|---------------|\n| `openai` | `gpt-4o-mini` | API key required |\n| `anthropic` | Claude models | API key required |\n| `azure` | Configurable | Endpoint + key |\n\n### Evaluation Options\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `include_category` | bool | Include categorical classification |\n| `include_score` | bool | Include numerical score |\n| `include_reasoning` | bool | Include LLM reasoning |\n| `uncertainty` | Uncertainty | Strategy for handling uncertainty |\n\n## Metrics and Presets\n\n### Preset Configurations\n\nEvidently provides pre-configured metric sets:\n\n| Preset | Description | File |\n|--------|-------------|------|\n| `TextEvals` | Text quality metrics | `src/evidently/presets` |\n| `DataDrift` | Drift detection | Legacy metrics |\n| `DataQuality` | Quality checks | Legacy metrics |\n\n资料来源：[src/evidently/metrics/_legacy.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/_legacy.py)\n\n## Legacy vs. 
Current API\n\nEvidently maintains both legacy and current APIs for backward compatibility:\n\n### Legacy API Structure\n\n```\nsrc/evidently/legacy/\n├── report/\n│   └── report.py          # Legacy Report class\n└── test_suite/\n    └── test_suite.py      # Legacy TestSuite class\n```\n\n### Current API Structure\n\n```\nsrc/evidently/\n├── core/\n│   ├── report.py          # Current Report class\n│   ├── tests.py           # Current test infrastructure\n│   ├── compare.py         # Data comparison utilities\n│   └── serialization.py   # Output serialization\n```\n\n资料来源：[src/evidently/metrics/_legacy.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/_legacy.py)\n\n## Usage Example: LLM Evaluation Pipeline\n\n```python\nimport pandas as pd\nfrom evidently import Report, Dataset, DataDefinition\nfrom evidently.descriptors import Sentiment, TextLength, Contains\nfrom evidently.presets import TextEvals\n\n# 1. Prepare data\neval_df = pd.DataFrame([\n    [\"What is the capital of Japan?\", \"The capital of Japan is Tokyo.\"],\n    [\"Who painted the Mona Lisa?\", \"Leonardo da Vinci.\"],\n    [\"Can you write an essay?\", \"I'm sorry, but I can't assist with homework.\"]],\n    columns=[\"question\", \"answer\"])\n\n# 2. Create dataset with descriptors\neval_dataset = Dataset.from_pandas(\n    pd.DataFrame(eval_df),\n    data_definition=DataDefinition(),\n    descriptors=[\n        Sentiment(\"answer\", alias=\"Sentiment\"),\n        TextLength(\"answer\", alias=\"Length\"),\n        Contains(\"answer\", items=[\"sorry\", \"apologize\"], mode=\"any\")\n    ]\n)\n\n# 3. Run report\nreport = Report(metrics=[TextEvals()])\nresult = report.run(reference_data=None, current_data=eval_dataset)\nreport.save_html(\"llm_eval_report.html\")\n```\n\n资料来源：[README.md](https://github.com/evidentlyai/evidently/blob/main/README.md)\n\n## Comparison: Report vs. 
Test Suite\n\n| Aspect | Report | Test Suite |\n|--------|--------|------------|\n| **Output** | Metrics, visualizations, insights | Pass/fail status, error details |\n| **Use Case** | Exploratory analysis | Automated quality gates |\n| **Integration** | Dashboards, documentation | CI/CD, monitoring alerts |\n| **Thresholds** | Configurable, informative | Strict pass/fail boundaries |\n| **Exit Code** | Always 0 (informational) | 0 (pass) or 1 (fail) |\n\n资料来源：[src/evidently/core/tests.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/tests.py)\n\n## Serialization\n\n### Supported Formats\n\n| Format | Extension | Use Case |\n|--------|-----------|----------|\n| JSON | `.json` | APIs, automation, storage |\n| HTML | `.html` | Human review, sharing |\n| Parquet | `.parquet` | Large-scale data storage |\n\n### Serialization Options\n\n```python\n# Save as JSON\nreport.save_json(\"report.json\")\n\n# Save as HTML\nreport.save_html(\"report.html\")\n\n# Export as dictionary\ndata = report.as_dict()\n```\n\n资料来源：[src/evidently/core/serialization.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/serialization.py)\n\n## Summary\n\nReports and Test Suites in Evidently provide complementary approaches to ML and LLM evaluation:\n\n- **Reports** excel at providing comprehensive, visual analysis suitable for exploration and documentation\n- **Test Suites** provide structured, automated quality checks ideal for CI/CD integration\n- Both share the same underlying dataset and descriptor infrastructure\n- LLM-powered evaluations are available through dedicated descriptors like `ContextQualityLLMEval` and `CompletenessLLMEval`\n- Output can be serialized to JSON, HTML, or accessed programmatically via Python dictionaries\n\n---\n\n<a id='presets'></a>\n\n## Presets and Metric Presets\n\n### 相关页面\n\n相关主题：[Reports and Test Suites](#reports-test-suites), [Custom Metrics and Extensibility](#custom-metrics)\n\n<details>\n<summary>Relevant 
Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/presets/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/__init__.py)\n- [src/evidently/presets/drift.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/drift.py)\n- [src/evidently/presets/classification.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/classification.py)\n- [src/evidently/presets/regression.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/regression.py)\n- [src/evidently/presets/recsys.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/recsys.py)\n- [src/evidently/presets/special.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/special.py)\n- [src/evidently/legacy/metric_preset/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/metric_preset/__init__.py)\n- [src/evidently/legacy/test_preset/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/test_preset/__init__.py)\n</details>\n\n# Presets and Metric Presets\n\n## Overview\n\nPresets in Evidently are pre-configured collections of metrics and evaluations designed to simplify the process of assessing machine learning models and LLM-powered systems. They provide ready-to-use evaluation templates that bundle relevant metrics, thresholds, and reporting components into cohesive units. This abstraction layer allows users to perform comprehensive model assessment without manually configuring individual metrics.\n\nThe preset system follows a modular architecture where each preset type addresses a specific use case domain. 
Users can instantiate presets and pass them directly to the `Report` or `TestSuite` objects, which execute the bundled evaluations and generate structured results.\n\n## Preset Architecture\n\n```mermaid\ngraph TD\n    A[Evidently Presets] --> B[Standard Presets]\n    A --> C[Legacy Presets]\n    \n    B --> B1[DataDriftPreset]\n    B --> B2[TargetDriftPreset]\n    B --> B3[ClassificationPreset]\n    B --> B4[RegressionPreset]\n    B --> B5[RecSysPreset]\n    B --> B6[TextEvals]\n    \n    C --> C1[MetricPreset]\n    C --> C2[TestPreset]\n    \n    D[Report / TestSuite] --> E[Integrates Presets]\n    E --> B\n    E --> C\n```\n\n## Preset Types\n\n### Standard Presets (src/evidently/presets/)\n\nThe modern preset implementation resides in the `src/evidently/presets/` directory. These presets integrate directly with the current Evidently reporting API.\n\n#### DataDriftPreset\n\nThe `DataDriftPreset` evaluates feature-level drift between reference and current datasets. It calculates drift scores for individual features and provides aggregate drift statistics. This preset is particularly useful for monitoring data pipeline changes and detecting distribution shifts that may impact model performance.\n\n```python\nfrom evidently.presets import DataDriftPreset\n\nreport = Report(metrics=[DataDriftPreset()])\nreport.run(reference_data=reference_df, current_data=current_df)\n```\n\n#### TargetDriftPreset\n\nThe `TargetDriftPreset` specifically monitors drift in the target variable distribution. It is essential for supervised learning scenarios where changes in label distribution can signal underlying data quality issues or concept drift.\n\n#### ClassificationPreset\n\nThe `ClassificationPreset` bundles metrics relevant to classification model evaluation. It includes accuracy metrics, confusion matrix analysis, class-level performance indicators, and probability calibration assessments. 
This preset supports both binary and multiclass classification scenarios.\n\n#### RegressionPreset\n\nThe `RegressionPreset` provides comprehensive regression model evaluation including error distribution analysis, quantile-based metrics, and residual diagnostics. It helps identify heteroscedasticity, non-linearity, and systematic prediction biases.\n\n#### RecSysPreset\n\nThe `RecSysPreset` is specialized for recommendation system evaluation. It includes ranking metrics, coverage indicators, and user-item interaction analysis tailored to collaborative filtering and content-based recommendation approaches.\n\n#### TextEvals\n\nThe `TextEvals` preset targets LLM and NLP model evaluation. It encompasses text-specific metrics such as sentiment analysis, text length distributions, and content quality indicators. This preset integrates with the descriptors system for row-level text evaluations.\n\n```python\nfrom evidently.presets import TextEvals\n\neval_dataset = Dataset.from_pandas(\n    pd.DataFrame(eval_df),\n    data_definition=DataDefinition(),\n    descriptors=[\n        Sentiment(\"answer\", alias=\"Sentiment\"),\n        TextLength(\"answer\", alias=\"Length\"),\n        Contains(\"answer\", items=[\"sorry\", \"cannot\"], alias=\"Denial\")\n    ]\n)\n```\n\n### Legacy Presets (src/evidently/legacy/)\n\nThe legacy preset system provided foundational abstractions for metric and test evaluation. These classes served as the original implementation pattern before the current preset architecture.\n\n#### MetricPreset\n\nThe `MetricPreset` class defines the base interface for metric collection presets. It maintains a list of metrics and provides methods for execution and result aggregation.\n\n#### TestPreset\n\nThe `TestPreset` class extends the preset concept to testing scenarios, enabling batch execution of statistical tests against model outputs. 
It provides assertions and threshold-based pass/fail criteria.\n\n## Preset Composition\n\nPresets can be combined within a single report to create comprehensive evaluation suites:\n\n```python\nfrom evidently import Report\nfrom evidently.presets import DataDriftPreset, ClassificationPreset\n\nreport = Report(\n    metrics=[\n        DataDriftPreset(),\n        ClassificationPreset()\n    ]\n)\n```\n\n## Integration with Report API\n\nPresets integrate seamlessly with the Evidently `Report` class:\n\n| Component | Role | Integration |\n|-----------|------|-------------|\n| `Report` | Execution container | Accepts presets as `metrics` parameter |\n| `Dataset` | Data wrapper | Provides reference and current data |\n| `DataDefinition` | Schema definition | Describes columns and their roles |\n| Preset | Metric bundle | Contains metric instances |\n\n## Descriptor System\n\nThe preset system works in conjunction with the descriptor system for row-level evaluations. Descriptors apply per-row transformations and evaluations:\n\n```python\nfrom evidently.descriptors import Sentiment, TextLength, Contains\n```\n\nDescriptors included in the `Dataset` configuration are evaluated alongside preset metrics, enabling both aggregate and granular assessment within a single report.\n\n## Use Cases\n\n### Model Validation\n\nBefore deploying a model, use presets to validate performance on held-out data:\n\n```python\nreport = Report(metrics=[RegressionPreset()])\nreport.run(reference_data=training_df, current_data=validation_df)\n```\n\n### Production Monitoring\n\nContinuously monitor deployed models for performance degradation:\n\n```python\nreport = Report(metrics=[DataDriftPreset(), ClassificationPreset()])\nreport.run(reference_data=baseline_df, current_data=current_df)\n```\n\n### LLM Evaluation\n\nAssess LLM responses using text-specific presets:\n\n```python\nfrom evidently.presets import TextEvals\nfrom evidently import Dataset, DataDefinition\n\neval_dataset = 
Dataset.from_pandas(response_df, data_definition=DataDefinition(), descriptors=[...])\nreport = Report(metrics=[TextEvals()])\n```\n\n## Summary Table: Preset Comparison\n\n| Preset | Domain | Key Metrics | Use Case |\n|--------|--------|-------------|----------|\n| `DataDriftPreset` | Data | Drift scores, feature importance | Data pipeline monitoring |\n| `TargetDriftPreset` | Data | Target distribution, PSI | Concept drift detection |\n| `ClassificationPreset` | ML | Accuracy, F1, ROC-AUC | Classifier evaluation |\n| `RegressionPreset` | ML | MAE, RMSE, R² | Regression analysis |\n| `RecSysPreset` | ML | Ranking metrics, coverage | Recommendation systems |\n| `TextEvals` | NLP/LLM | Sentiment, length, content | Text model evaluation |\n\n## Conclusion\n\nThe preset system in Evidently provides a powerful abstraction for organizing and executing model evaluations. By bundling related metrics into coherent units, presets reduce boilerplate and enable consistent evaluation practices across different model types and deployment scenarios.\n\n---\n\n<a id='custom-metrics'></a>\n\n## Custom Metrics and Extensibility\n\n### 相关页面\n\n相关主题：[Reports and Test Suites](#reports-test-suites), [Presets and Metric Presets](#presets)\n\n<details>\n<summary>Relevant Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/core/registries/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/registries/__init__.py)\n- [src/evidently/core/registries/configs.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/registries/configs.py)\n- [src/evidently/core/registries/bound_tests.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/registries/bound_tests.py)\n- [src/evidently/legacy/metrics/custom_metric.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/metrics/custom_metric.py)\n- 
[src/evidently/legacy/features/custom_feature.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/features/custom_feature.py)\n- [src/evidently/generators/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/generators/__init__.py)\n- [src/evidently/generators/column.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/generators/column.py)\n</details>\n\n# Custom Metrics and Extensibility\n\n## Overview\n\nEvidently provides a comprehensive extensibility system that allows users to create custom metrics, features, and test configurations. The framework follows a plugin-like architecture where custom implementations can be registered and discovered at runtime through the registry system.\n\nThe extensibility model encompasses several key areas:\n\n- **Custom Metrics**: User-defined metrics that compute specific evaluation logic\n- **Custom Features**: Row-level evaluators that extract or compute values from data\n- **Generators**: Automatic metric and feature generation from column specifications\n- **Bound Tests**: Test configurations attached to metrics with threshold definitions\n\nThis architecture enables users to extend the framework's built-in capabilities while maintaining consistency with the existing evaluation pipeline.\n\n## Registry System Architecture\n\nThe registry system serves as the central mechanism for discovering and managing extensions within Evidently. 
It provides a unified interface for registering, retrieving, and invoking custom implementations.\n\n```mermaid\ngraph TD\n    A[User Code] --> B[Registry API]\n    B --> C[MetricRegistry]\n    B --> D[FeatureRegistry]\n    B --> E[TestRegistry]\n    C --> F[Metric Implementations]\n    D --> G[Feature Implementations]\n    E --> H[Test Configurations]\n```\n\n### Core Registry Components\n\n| Component | Purpose | File Location |\n|-----------|---------|---------------|\n| MetricRegistry | Manages custom metric registrations | `src/evidently/core/registries/__init__.py` |\n| ConfigRegistry | Stores configuration metadata | `src/evidently/core/registries/configs.py` |\n| BoundTestRegistry | Handles test bindings and thresholds | `src/evidently/core/registries/bound_tests.py` |\n\nThe registry pattern follows a key-based lookup system where each extension is identified by a unique name. This allows for dynamic discovery and late binding of implementations at runtime.\n\n## Custom Metrics\n\nCustom metrics extend the base `Metric` class to implement domain-specific evaluation logic. The framework provides both legacy and modern approaches for metric creation.\n\n### Metric Structure\n\nA custom metric typically consists of:\n\n1. **Configuration**: Defines the metric's parameters and metadata\n2. **Calculation Logic**: Implements the `calculate` method to produce results\n3. **Visualization**: Provides rendering information through widget definitions\n\n```python\nclass CustomMetric(Metric):\n    def __init__(self, column: str, threshold: float = 0.5):\n        self.column = column\n        self.threshold = threshold\n    \n    def calculate(self, context: Context) -> MetricResult:\n        # Custom calculation logic\n        pass\n```\n\n资料来源：[src/evidently/legacy/metrics/custom_metric.py:1-50]()\n\n### Metric Container System\n\nThe `MetricContainer` abstract base class provides the foundation for grouping related metrics. 
It implements a caching mechanism to avoid redundant metric generation.\n\n```mermaid\ngraph LR\n    A[Context] --> B{Container Fingerprint}\n    B -->|Cache Hit| C[Return Cached Metrics]\n    B -->|Cache Miss| D[generate_metrics]\n    D --> E[Store in Context]\n    E --> C\n```\n\nThe `generate_metrics` method must be implemented by subclasses to define the metrics to be computed:\n\n```python\ndef generate_metrics(self, context: \"Context\") -> Sequence[MetricOrContainer]:\n    \"\"\"Generate metrics based on the container configuration.\n    \n    Args:\n        context: Context containing datasets and configuration.\n    \n    Returns:\n        Sequence of Metric or MetricContainer objects to compute.\n    \"\"\"\n    raise NotImplementedError()\n```\n\n资料来源：[src/evidently/core/container.py:1-80]()\n\n### Rendering and Widgets\n\nMetrics can contribute visualization widgets through the `render` method. The widget system supports hierarchical composition where parent containers aggregate widgets from child metrics:\n\n```python\ndef render(\n    self,\n    context: \"Context\",\n    child_widgets: Optional[List[Tuple[Optional[MetricId], List[BaseWidgetInfo]]]] = None,\n) -> List[BaseWidgetInfo]:\n    \"\"\"Render visualization widgets for this container.\n    \n    Combines widgets from all child metrics/containers.\n    \"\"\"\n```\n\n资料来源：[src/evidently/core/container.py:80-120]()\n\n## Custom Features\n\nCustom features extend the `Descriptor` base class to provide row-level evaluation capabilities. 
They can be applied to individual data points during dataset processing.\n\n### Feature Implementation Pattern\n\nFeatures implement the descriptor pattern, wrapping column values with evaluation logic:\n\n```python\nclass CustomFeature(Descriptor):\n    def __init__(self, column: str, alias: str = None):\n        self.column = column\n        self.alias = alias or column\n    \n    def apply(self, data: pd.Series) -> pd.Series:\n        # Custom feature calculation\n        return result\n```\n\n资料来源：[src/evidently/legacy/features/custom_feature.py:1-40]()\n\n### Integration with Descriptors\n\nThe descriptor system integrates with the data definition framework, allowing features to be declared as part of a dataset configuration:\n\n```python\neval_dataset = Dataset.from_pandas(\n    pd.DataFrame(eval_df),\n    data_definition=DataDefinition(),\n    descriptors=[\n        Sentiment(\"answer\", alias=\"Sentiment\"),\n        TextLength(\"answer\", alias=\"Length\"),\n        Contains(\"answer\", [\"denial\", \"refuse\"], alias=\"ContainsDenial\")\n    ]\n)\n```\n\n资料来源：[README.md:1-50]()\n\n## Generators System\n\nGenerators provide an automated way to create metrics and features based on column specifications. 
This reduces boilerplate when working with large numbers of similar evaluations.\n\n### Column Generator\n\nThe column generator creates standardized metrics for numeric and categorical columns:\n\n```python\ngenerator = ColumnGenerator(\n    columns=[\"feature_1\", \"feature_2\", \"feature_3\"],\n    generators=[\n        MeanGenerator(),\n        StdGenerator(),\n        NullCountGenerator()\n    ]\n)\n```\n\n资料来源：[src/evidently/generators/column.py:1-60]()\n\n### Generator Configuration\n\nGenerators can be configured with:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `columns` | List[str] | Target columns for generation |\n| `include_missing` | bool | Include columns with missing values |\n| `exclude_patterns` | List[str] | Regex patterns for column exclusion |\n| `generators` | List[Generator] | Specific generators to apply |\n\n资料来源：[src/evidently/generators/__init__.py:1-50]()\n\n## Bound Tests and Thresholds\n\nBound tests attach threshold configurations to metrics, enabling automated pass/fail determinations based on metric results.\n\n### Test Binding Pattern\n\nTests are bound to metrics through the registry system:\n\n```python\nclass BoundTest:\n    def __init__(self, metric: MetricId, threshold: TestThreshold):\n        self.metric = metric\n        self.threshold = threshold\n    \n    def evaluate(self, metric_result: MetricResult) -> TestResult:\n        # Compare metric value against threshold\n        pass\n```\n\n资料来源：[src/evidently/core/registries/bound_tests.py:1-50]()\n\n### Threshold Types\n\nThe framework supports multiple threshold configuration types:\n\n| Threshold Type | Description | Use Case |\n|---------------|-------------|----------|\n| `absolute` | Fixed value comparison | Fixed acceptable ranges |\n| `relative` | Percentage-based comparison | Drift detection |\n| `sigma` | Standard deviation bounds | Anomaly detection |\n| `quantile` | Percentile-based thresholds | Distribution extremes 
|\n\n资料来源：[src/evidently/core/registries/bound_tests.py:50-100]()\n\n## Configuration and Versioning\n\nThe extensibility system includes robust configuration management with versioning support for metrics and descriptors.\n\n### Config Version Model\n\nConfigurations are versioned to track changes over time:\n\n```python\n@dataclass\nclass ConfigVersion:\n    id: STR_UUID\n    artifact_id: STR_UUID\n    version: int\n    content: Any\n    metadata: ConfigVersionMetadata\n```\n\n资料来源：[src/evidently/sdk/configs.py:1-80]()\n\n### Metadata Structure\n\n| Field | Type | Description |\n|-------|------|-------------|\n| `created_at` | datetime | Version creation timestamp |\n| `updated_at` | datetime | Last modification timestamp |\n| `author` | str | User who created/modified |\n| `comment` | str | Change description |\n\n资料来源：[src/evidently/sdk/adapters.py:1-100]()\n\n## Workflow Diagram\n\nThe complete extensibility workflow from definition to execution:\n\n```mermaid\ngraph TD\n    A[Define Custom Metric/Feature] --> B[Register in Registry]\n    B --> C[Create Dataset with Descriptors]\n    C --> D[Generate Metrics from Container]\n    D --> E[Calculate Results]\n    E --> F[Bind Tests with Thresholds]\n    F --> G[Evaluate Pass/Fail]\n    G --> H[Render Widgets]\n    H --> I[Display in Report/Dashboard]\n```\n\n## Best Practices\n\n### Performance Considerations\n\n- **Caching**: Use the container fingerprinting mechanism to cache generated metrics and avoid redundant calculations\n- **Lazy Evaluation**: Implement metrics using lazy evaluation patterns when possible to defer expensive computations\n- **Batch Processing**: Design features to operate on pandas Series rather than individual values for vectorized performance\n\n### Extensibility Guidelines\n\n1. **Consistent Naming**: Follow the established naming conventions for custom implementations\n2. **Type Hints**: Include comprehensive type hints for all public interfaces\n3. 
**Documentation**: Document parameters and return values using docstring conventions\n4. **Testing**: Create unit tests for custom metric calculation logic\n\n### Integration Points\n\nCustom extensions integrate with the framework through several standardized interfaces:\n\n- **Metric Protocol**: Implement the `calculate(self, context)` method\n- **Descriptor Protocol**: Implement the `apply(self, data)` method\n- **Widget Protocol**: Return `List[BaseWidgetInfo]` from `render()` method\n- **Test Protocol**: Implement the `evaluate(self, result)` method for bound tests\n\n## API Reference\n\n### Key Classes and Functions\n\n| Class/Function | Module | Purpose |\n|---------------|--------|---------|\n| `MetricContainer` | `evidently.core.container` | Base class for metric containers |\n| `Descriptor` | `evidently.legacy.features` | Base class for features |\n| `ColumnGenerator` | `evidently.generators.column` | Column-based metric generation |\n| `BoundTest` | `evidently.core.registries` | Threshold-bound test configuration |\n| `ConfigVersion` | `evidently.sdk.configs` | Versioned configuration storage |\n\n### Context Management\n\nThe `Context` object provides access to datasets and configuration during metric calculation:\n\n```python\nclass Context:\n    def metrics_container(self, fingerprint: str) -> Optional[List[MetricOrContainer]]:\n        \"\"\"Retrieve cached metrics for container fingerprint.\"\"\"\n    \n    def set_metric_container_data(\n        self, \n        fingerprint: str, \n        metrics: List[MetricOrContainer]\n    ) -> None:\n        \"\"\"Store generated metrics in cache.\"\"\"\n```\n\n资料来源：[src/evidently/core/container.py:50-80]()\n\n## Summary\n\nEvidently's extensibility system provides a comprehensive framework for customizing evaluation logic. The registry-based architecture enables dynamic discovery of custom implementations while maintaining consistency with built-in features. 
Custom metrics and features extend the base classes to implement domain-specific logic, and the generator system automates repetitive metric definitions. The bound test mechanism attaches configurable thresholds to metrics for automated validation, making the system suitable for both exploratory analysis and production monitoring scenarios.\n\n---\n\n<a id='ui-service'></a>\n\n## UI Service Backend\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/ui/workspace.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/ui/workspace.py)\n- [src/evidently/sdk/adapters.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/sdk/adapters.py)\n- [src/evidently/sdk/configs.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/sdk/configs.py)\n- [src/evidently/ui/utils.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/ui/utils.py)\n- [ui/packages/evidently-ui-lib/src/components/Project/ProjectCard.tsx](https://github.com/evidentlyai/evidently/blob/main/ui/packages/evidently-ui-lib/src/components/Project/ProjectCard.tsx)\n- [ui/packages/evidently-ui-lib/src/components/Prompts/PromptsTable.tsx](https://github.com/evidentlyai/evidently/blob/main/ui/packages/evidently-ui-lib/src/components/Prompts/PromptsTable.tsx)\n- [ui/packages/evidently-ui-lib/src/components/Traces/TracesTable.tsx](https://github.com/evidentlyai/evidently/blob/main/ui/packages/evidently-ui-lib/src/components/Traces/TracesTable.tsx)\n- [ui/packages/evidently-ui-lib/src/components/Utils/NameAndDescriptionForm.tsx](https://github.com/evidentlyai/evidently/blob/main/ui/packages/evidently-ui-lib/src/components/Utils/NameAndDescriptionForm.tsx)\n</details>\n\n# UI Service Backend\n\nThe Evidently UI Service Backend is a FastAPI-based REST API layer that powers the Evidently web interface, providing endpoints for managing projects, datasets, prompts, and trace data. 
It serves as the bridge between the React-based frontend and the Evidently SDK's core functionality.\n\n## Architecture Overview\n\nThe UI Service Backend follows a layered architecture:\n\n```mermaid\ngraph TD\n    A[Frontend: React/TypeScript] --> B[UI Service Backend: FastAPI]\n    B --> C[Evidently SDK]\n    C --> D[Workspace Abstraction]\n    C --> E[Cloud Config API]\n    D --> F[(Local Storage)]\n    E --> G[(Remote Backend)]\n```\n\n### Component Stack\n\n| Layer | Technology | Purpose |\n|-------|------------|---------|\n| Frontend | React, TypeScript, MUI | User interface components |\n| Backend API | FastAPI/Python | REST API endpoints |\n| Business Logic | Evidently SDK | Core evaluation logic |\n| Data Layer | Local FS / Remote API | Persistence |\n\n## Core Data Models\n\n### Project Model\n\nProjects are the primary organizational unit in the UI. The `ProjectModel` represents:\n\n```python\nProjectModel:\n    - name: str\n    - description: Optional[str]\n    - org_id: Optional[OrgID]\n```\n\n资料来源：[src/evidently/ui/workspace.py:50-55]()\n\n### Config API Abstraction\n\nThe `ConfigAPI` class provides a generic interface for managing versioned configurations:\n\n| Method | Purpose |\n|--------|---------|\n| `create_config()` | Create a new configuration |\n| `get_config()` | Retrieve a configuration by ID |\n| `list_configs()` | List all configurations |\n| `update_config()` | Update an existing configuration |\n| `delete_config()` | Delete a configuration |\n| `add_version()` | Add a new version to a config |\n| `get_version()` | Get a specific version |\n\n资料来源：[src/evidently/sdk/configs.py:95-145]()\n\n### Descriptor Configuration\n\nThe `Descriptor` type is used to store and version descriptor configurations:\n\n```python\nDescriptorConfigAPI:\n    - add_descriptor() -> ConfigVersion\n    - get_descriptor() -> Descriptor\n```\n\n资料来源：[src/evidently/sdk/configs.py:165-180]()\n\n## Workspace Abstraction\n\nThe `Workspace` abstract class defines the contract 
for project management:\n\n```mermaid\ngraph TD\n    A[Workspace] --> B[LocalWorkspace]\n    A --> C[CloudWorkspace]\n    A --> D[RemoteWorkspace]\n```\n\n### Abstract Methods\n\n| Method | Parameters | Returns | Description |\n|--------|------------|---------|-------------|\n| `create_project()` | name, description, org_id | Project | Creates a new project |\n| `add_project()` | project: ProjectModel, org_id | Project | Adds project to workspace |\n| `get_project()` | project_id: STR_UUID | Optional[Project] | Retrieves project by ID |\n| `delete_project()` | project_id: STR_UUID | None | Removes project from workspace |\n| `list_projects()` | org_id: Optional | Sequence[Project] | Lists all projects |\n\n资料来源：[src/evidently/ui/workspace.py:40-80]()\n\n## Frontend Components\n\n### Project Card Component\n\nThe `ProjectCard` component handles project display and editing:\n\n```typescript\nProjectCardProps:\n    - project: Project\n    - disabled?: boolean\n    - onEditProject: (args: { name: string; description: string }) => void\n    - LinkToProject: ComponentType\n```\n\nFeatures:\n- Toggle between view and edit modes\n- Uses `EditProjectInfoForm` for inline editing\n- Displays `ProjectInfoCard` in view mode\n\n资料来源：[ui/packages/evidently-ui-lib/src/components/Project/ProjectCard.tsx:60-80]()\n\n### Prompts Table\n\nThe `PromptsTable` component renders a sortable, paginated list of prompts:\n\n| Column | Render | Features |\n|--------|--------|----------|\n| ID | ID with copy button | `TextWithCopyIcon` component |\n| Name | Truncated text (max 200px) | Typography component |\n| Created at | Date formatted | `dayjs` locale formatting |\n| Actions | Link + Delete button | Edit/delete operations |\n\n资料来源：[ui/packages/evidently-ui-lib/src/components/Prompts/PromptsTable.tsx:40-75]()\n\n### Traces Table\n\nThe `TracesTable` component displays trace data with extended metadata:\n\n| Column | Features |\n|--------|----------|\n| Tags | `HidedTags` component, 250px min width 
|\n| Metadata | `JsonViewThemed` with clipboard support |\n| Type | Chip showing trace origin |\n| Created at | Sortable date column |\n| Actions | Dataset link, edit dialog, delete |\n\n资料来源：[ui/packages/evidently-ui-lib/src/components/Traces/TracesTable.tsx:50-90]()\n\n## API Endpoint Structure\n\n### Projects API\n\n```\n/projects\n  ├── GET    /              - List all projects\n  ├── POST   /              - Create new project\n  └── GET    /{project_id}  - Get project details\n\n/projects/{project_id}/\n  ├── prompts/\n  │   ├── GET    /          - List prompts\n  │   ├── POST   /          - Create prompt\n  │   └── GET    /{id}      - Get prompt details\n  ├── datasets/\n  │   ├── GET    /          - List datasets\n  │   └── POST   /          - Create dataset\n  └── traces/\n      ├── GET    /          - List traces\n      └── POST   /          - Create trace\n```\n\n### Prompts API\n\n| Endpoint | Method | Purpose |\n|----------|--------|---------|\n| `/prompts` | GET | List all prompts for a project |\n| `/prompts` | POST | Create a new prompt |\n| `/prompts/{id}` | GET | Get prompt details |\n\n资料来源：[ui/service/src/routes/.../index-prompts-list/index-prompts-list-main.tsx:40-60]()\n\n## Configuration Management\n\n### Artifact Version Management\n\nThe `ArtifactConfigAPI` handles versioned artifact storage:\n\n```python\nArtifactConfigAPI:\n    - create_version()      # Create new artifact version\n    - list_versions()       # List all versions\n    - get_version()         # Get specific version\n    - get_version_by_id()   # Get by version ID\n```\n\n资料来源：[src/evidently/sdk/adapters.py:45-70]()\n\n### Version Conversion\n\nBidirectional conversion between SDK and API models:\n\n```mermaid\ngraph LR\n    A[ArtifactVersion] -->|convert| B[ConfigVersion]\n    B -->|convert| A\n```\n\nMethods:\n- `_artifact_version_to_config_version()` - SDK to API\n- `_config_version_to_artifact_version()` - API to SDK\n\n## UI Utilities\n\n### HTML Link Templates\n\nThe `utils.py` 
module provides HTML templates for dashboard rendering:\n\n```python\nHTML_LINK_WITH_ID_TEMPLATE   # Link with button and ID display\nFILE_LINK_WITH_ID_TEMPLATE   # File link with ID\nRUNNING_SERVICE_LINK_TEMPLATE # Service link with label\n```\n\n资料来源：[src/evidently/ui/utils.py:30-55]()\n\n## Workflow: Project Lifecycle\n\n```mermaid\ngraph TD\n    A[Create Project] --> B[Add Datasets]\n    B --> C[Create Prompts]\n    C --> D[Run Evals]\n    D --> E[Store Traces]\n    E --> F[View Results]\n    F --> G[Monitor]\n    \n    A -.->|via| H[Workspace.add_project]\n    C -.->|via| I[Prompts API]\n    E -.->|via| J[Traces API]\n```\n\n## Configuration Options\n\n| Option | Type | Default | Description |\n|--------|------|---------|-------------|\n| `org_id` | UUID | None | Organization identifier |\n| `project_id` | UUID | Auto | Project identifier |\n| `version` | int | \"latest\" | Config version selector |\n\n## Key SDK Classes\n\n| Class | File | Responsibility |\n|-------|------|-----------------|\n| `ConfigAPI` | configs.py | Generic config CRUD operations |\n| `DescriptorConfigAPI` | configs.py | Descriptor-specific operations |\n| `CloudConfigAPI` | configs.py | Remote backend communication |\n| `Workspace` | workspace.py | Abstract workspace interface |\n| `ProjectModel` | workspace.py | Project data structure |\n\n## Technology Stack\n\n| Component | Technology | Version |\n|-----------|------------|---------|\n| Backend Framework | FastAPI | - |\n| SDK Core | Python | 3.11+ |\n| Frontend Framework | React | - |\n| UI Components | MUI | - |\n| State Management | React Hooks | - |\n| Date Handling | dayjs | - |\n\n## Development Workflow\n\n### Running the Service\n\n```bash\n# In ui/service folder\npnpm dev\n```\n\n### Code Quality\n\n```bash\n# In ui folder\npnpm code-check          # Format, sort imports, lint\npnpm code-check --fix    # Apply fixes automatically\n```\n\n资料来源：[ui/README.md:15-25]()\n\n### Building\n\n```bash\npnpm build\n```\n\n## 
Summary\n\nThe Evidently UI Service Backend provides:\n\n1. **REST API layer** for frontend-backend communication\n2. **Project management** via Workspace abstraction\n3. **Versioned configuration storage** using ConfigAPI\n4. **Tracing support** for evaluation history\n5. **Prompts management** for LLM-powered evaluations\n\nThe architecture cleanly separates concerns between the FastAPI backend, Evidently SDK business logic, and React frontend components, enabling modular development and testing.\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：evidentlyai/evidently\n\n摘要：发现 19 个潜在踩坑项，其中 4 个为 high/blocking；最高优先级：安装坑 - 来源证据：Update scikit-learn version requirement to support v1.6.0。\n\n## 1. 安装坑 · 来源证据：Update scikit-learn version requirement to support v1.6.0\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Update scikit-learn version requirement to support v1.6.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e82e24d201134e7f8dfa0b564d73fce8 | https://github.com/evidentlyai/evidently/issues/1407 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 2. 配置坑 · 来源证据：PromptOptimizer throws OpenAIError when using Vertex AI judge\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：PromptOptimizer throws OpenAIError when using Vertex AI judge\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_abf6f9b183ff4892be78fd198f60d8e8 | https://github.com/evidentlyai/evidently/issues/1856 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 
运行坑 · 来源证据：IndexError in infer_column_type when column contains only null values\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：IndexError in infer_column_type when column contains only null values\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_44cdd5f2b657432eb8418fb132959ada | https://github.com/evidentlyai/evidently/issues/1764 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 4. 安全/权限坑 · 来源证据：Update evidently hashlib usage for FIPS-Compliant Systems and Security Best Practices\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Update evidently hashlib usage for FIPS-Compliant Systems and Security Best Practices\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_872f2b57cd3a4c61ad5c387c8be3208b | https://github.com/evidentlyai/evidently/issues/1410 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 安装坑 · 来源证据：Numpy 2.x support?\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Numpy 2.x support?\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_2fc84415b55a43d2be4f7f47cf46d7a1 | https://github.com/evidentlyai/evidently/issues/1557 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：v0.7.12\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.12\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fcf994a3bcf94b0d9b2c5d6159816728 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.12 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 7. 
安装坑 · 来源证据：v0.7.15\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.15\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_a9de117bd8e440beab9f7be4af50d710 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.15 | 来源讨论提到 linux 相关条件，需在安装/试用前复核。\n\n## 8. 安装坑 · 来源证据：v0.7.20\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.20\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f0d67058bf2a40218adfb0eea1f1665f | https://github.com/evidentlyai/evidently/releases/tag/v0.7.20 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 9. 能力坑 · 来源证据：v0.7.19\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.7.19\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_52e6ab37edca4901a0aa9fe7b5ba12f1 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.19 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 10. 能力坑 · 来源证据：v0.7.21\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.7.21\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fe9f0e6e29564afeb5610928cc9bc834 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.21 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:315977578 | https://github.com/evidentlyai/evidently | README/documentation is current enough for a first validation pass.\n\n## 12. 
维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:315977578 | https://github.com/evidentlyai/evidently | last_activity_observed missing\n\n## 13. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:315977578 | https://github.com/evidentlyai/evidently | no_demo; severity=medium\n\n## 14. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:315977578 | https://github.com/evidentlyai/evidently | no_demo; severity=medium\n\n## 15. 安全/权限坑 · 来源证据：Protect this repo from AI-generated PRs\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Protect this repo from AI-generated PRs\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_cfcf13543c3941e88fd7c5f527f95dc3 | https://github.com/evidentlyai/evidently/issues/1879 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 16. 安全/权限坑 · 来源证据：v0.7.17\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.7.17\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_305e0831e389460096b4d4b908731b41 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.17 | 来源讨论提到 api key 相关条件，需在安装/试用前复核。\n\n## 17. 
安全/权限坑 · 来源证据：v0.7.18\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.7.18\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_19a463a1bb2a4fb2bd5bd894c914cc36 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.18 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 18. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:315977578 | https://github.com/evidentlyai/evidently | issue_or_pr_quality=unknown\n\n## 19. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:315977578 | https://github.com/evidentlyai/evidently | release_recency=unknown\n\n<!-- canonical_name: evidentlyai/evidently; human_manual_source: deepwiki_human_wiki -->\n",
      "markdown_key": "evidently",
      "pages": "draft",
      "source_refs": [
        {
          "evidence_id": "github_repo:315977578",
          "kind": "repo",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/evidentlyai/evidently"
        },
        {
          "evidence_id": "art_e0f9cd67e5e547ec994f4c58cc5c9526",
          "kind": "docs",
          "supports_claim_ids": [
            "claim_identity",
            "claim_distribution",
            "claim_capability"
          ],
          "url": "https://github.com/evidentlyai/evidently#readme"
        }
      ],
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "evidently 说明书",
      "toc": [
        "https://github.com/evidentlyai/evidently 项目说明书",
        "目录",
        "Architecture Overview",
        "Introduction",
        "High-Level Architecture",
        "Core Module Structure",
        "SDK Architecture",
        "Workspace Architecture",
        "Doramagic 踩坑日志"
      ]
    }
  },
  "quality_gate": {
    "blocking_gaps": [],
    "category_confidence": "medium",
    "compile_status": "ready_for_review",
    "five_assets_present": true,
    "install_sandbox_verified": true,
    "missing_evidence": [],
    "next_action": "publish to Doramagic.ai project surfaces",
    "prompt_preview_boundary_ok": true,
    "publish_status": "publishable",
    "quick_start_verified": true,
    "repo_clone_verified": true,
    "repo_commit": "a4aa4c2b37fe7a4344cc5031f566deccf3d69e4f",
    "repo_inspection_error": null,
    "repo_inspection_files": [
      "pyproject.toml",
      "README.md",
      "examples/README.md",
      "examples/cookbook/README.md",
      "examples/service/postgres_config.yaml",
      "examples/service/sqlite_config.yaml",
      "examples/service/README.md",
      "examples/service/docker-compose.yml",
      "examples/service/remote_demo_project.py",
      "examples/grafana/README.md",
      "examples/future_examples/future_reviews.py",
      "examples/grafana/grafana_llm_evaluation_dashboard/README.md",
      "examples/grafana/grafana_llm_evaluation_dashboard/docker-compose.yml",
      "examples/grafana/grafana_llm_evaluation_dashboard/evidently_metrics_calculation.py",
      "examples/grafana/grafana_data_drift_dashboard/README.md",
      "examples/grafana/grafana_data_drift_dashboard/docker-compose.yml",
      "examples/grafana/grafana_data_drift_dashboard/evidently_metrics_calculation.py",
      "examples/grafana/grafana_llm_evaluation_dashboard/config/grafana_dashboards.yaml",
      "examples/grafana/grafana_llm_evaluation_dashboard/config/grafana_datasources.yaml",
      "examples/grafana/grafana_llm_evaluation_dashboard/dashboards/chatbot_evals.json",
      "examples/grafana/grafana_data_drift_dashboard/config/grafana_dashboards.yaml",
      "examples/grafana/grafana_data_drift_dashboard/config/grafana_datasources.yaml",
      "examples/grafana/grafana_data_drift_dashboard/dashboards/data_drift.json",
      "src/evidently/telemetry.py",
      "src/evidently/_registry.py",
      "src/evidently/_pydantic_compat.py",
      "src/evidently/_version.py",
      "src/evidently/pydantic_utils.py",
      "src/evidently/errors.py",
      "src/evidently/__init__.py",
      "src/evidently/cli/migrate.py",
      "src/evidently/cli/report.py",
      "src/evidently/cli/main.py",
      "src/evidently/cli/demo_project.py",
      "src/evidently/cli/utils.py",
      "src/evidently/cli/ui.py",
      "src/evidently/cli/legacy_ui.py",
      "src/evidently/cli/__init__.py",
      "src/evidently/cli/__main__.py",
      "src/evidently/widgets/__init__.py"
    ],
    "repo_inspection_verified": true,
    "review_reasons": [],
    "tag_count_ok": true,
    "unsupported_claims": []
  },
  "schema_version": "0.1",
  "user_assets": {
    "ai_context_pack": {
      "asset_id": "ai_context_pack",
      "filename": "AI_CONTEXT_PACK.md",
      "markdown": "# html-visual-testing - Doramagic AI Context Pack\n\n> 定位：安装前体验与判断资产。它帮助宿主 AI 有一个好的开始，但不代表已经安装、执行或验证目标项目。\n\n## 充分原则\n\n- **充分原则，不是压缩原则**：AI Context Pack 应该充分到让宿主 AI 在开工前理解项目价值、能力边界、使用入口、风险和证据来源；它可以分层组织，但不以最短摘要为目标。\n- **压缩策略**：只压缩噪声和重复内容，不压缩会影响判断和开工质量的上下文。\n\n## 给宿主 AI 的使用方式\n\n你正在读取 Doramagic 为 html-visual-testing 编译的 AI Context Pack。请把它当作开工前上下文：帮助用户理解适合谁、能做什么、如何开始、哪些必须安装后验证、风险在哪里。不要声称你已经安装、运行或执行了目标项目。\n\n## Claim 消费规则\n\n- **事实来源**：Repo Evidence + Claim/Evidence Graph；Human Wiki 只提供显著性、术语和叙事结构。\n- **事实最低状态**：`supported`\n- `supported`：可以作为项目事实使用，但回答中必须引用 claim_id 和证据路径。\n- `weak`：只能作为低置信度线索，必须要求用户继续核实。\n- `inferred`：只能用于风险提示或待确认问题，不能包装成项目事实。\n- `unverified`：不得作为事实使用，应明确说证据不足。\n- `contradicted`：必须展示冲突来源，不得替用户强行选择一个版本。\n\n## 它最适合谁\n\n- **想在安装前理解开源项目价值和边界的用户**：当前证据主要来自项目文档。 证据：`README.md` Claim：`clm_0002` supported 0.86\n\n## 它能做什么\n\n- **命令行启动或安装流程**（需要安装后验证）：项目文档中存在可执行命令，真实使用需要在本地或宿主环境中运行这些命令。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n## 怎么开始\n\n- `pip install evidently` 证据：`README.md` Claim：`clm_0003` supported 0.86\n- `pip install virtualenv` 证据：`README.md` Claim：`clm_0004` supported 0.86\n\n## 继续前判断卡\n\n- **当前建议**：仅建议沙盒试装\n- **为什么**：项目存在安装命令、宿主配置或本地写入线索，不建议直接进入主力环境，应先在隔离环境试装。\n\n### 30 秒判断\n\n- **现在怎么做**：仅建议沙盒试装\n- **最小安全下一步**：先跑 Prompt Preview；若仍要安装，只在隔离环境试装\n- **先别相信**：真实输出质量不能在安装前相信。\n- **继续会触碰**：命令执行、本地环境或项目文件、宿主 AI 上下文\n\n### 现在可以相信\n\n- **适合人群线索：想在安装前理解开源项目价值和边界的用户**（supported）：有 supported claim 或项目证据支撑，但仍不等于真实安装效果。 证据：`README.md` Claim：`clm_0002` supported 0.86\n- **能力存在：命令行启动或安装流程**（supported）：可以相信项目包含这类能力线索；是否适合你的具体任务仍要试用或安装后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n- **存在 Quick Start / 安装命令线索**（supported）：可以相信项目文档出现过启动或安装入口；不要因此直接在主力环境运行。 证据：`README.md` Claim：`clm_0003` supported 0.86\n\n### 现在还不能相信\n\n- **真实输出质量不能在安装前相信。**（unverified）：Prompt Preview 只能展示引导方式，不能证明真实项目中的结果质量。\n- **宿主 AI 版本兼容性不能在安装前相信。**（unverified）：Claude、Cursor、Codex、Gemini 等宿主加载规则和版本差异必须在真实环境验证。\n- **不会污染现有宿主 AI 
行为，不能直接相信。**（inferred）：Skill、plugin、AGENTS/CLAUDE/GEMINI 指令可能改变宿主 AI 的默认行为。\n- **可安全回滚不能默认相信。**（unverified）：除非项目明确提供卸载和恢复说明，否则必须先在隔离环境验证。\n- **真实安装后是否与用户当前宿主 AI 版本兼容？**（unverified）：兼容性只能通过实际宿主环境验证。\n- **项目输出质量是否满足用户具体任务？**（unverified）：安装前预览只能展示流程和边界，不能替代真实评测。\n- **安装命令是否需要网络、权限或全局写入？**（unverified）：这影响企业环境和个人环境的安装风险。 证据：`README.md`\n\n### 继续会触碰什么\n\n- **命令执行**：包管理器、网络下载、本地插件目录、项目配置或用户主目录。 原因：运行第一条命令就可能产生环境改动；必须先判断是否值得跑。 证据：`README.md`\n- **本地环境或项目文件**：安装结果、插件缓存、项目配置或本地依赖目录。 原因：安装前无法证明写入范围和回滚方式，需要隔离验证。 证据：`README.md`\n- **宿主 AI 上下文**：AI Context Pack、Prompt Preview、Skill 路由、风险规则和项目事实。 原因：导入上下文会影响宿主 AI 后续判断，必须避免把未验证项包装成事实。\n\n### 最小安全下一步\n\n- **先跑 Prompt Preview**：用安装前交互式试用判断工作方式是否匹配，不需要授权或改环境。（适用：任何项目都适用，尤其是输出质量未知时。）\n- **只在隔离目录或测试账号试装**：避免安装命令污染主力宿主 AI、真实项目或用户主目录。（适用：存在命令执行、插件配置或本地写入线索时。）\n- **安装后只验证一个最小任务**：先验证加载、兼容、输出质量和回滚，再决定是否深用。（适用：准备从试用进入真实工作流时。）\n\n### 退出方式\n\n- **保留安装前状态**：记录原始宿主配置和项目状态，后续才能判断是否可恢复。\n- **记录安装命令和写入路径**：没有明确卸载说明时，至少要知道哪些目录或配置需要手动清理。\n- **如果没有回滚路径，不进入主力环境**：不可回滚是继续前阻断项，不应靠信任或运气继续。\n\n## 哪些只能预览\n\n- 解释项目适合谁和能做什么\n- 基于项目文档演示典型对话流程\n- 帮助用户判断是否值得安装或继续研究\n\n## 哪些必须安装后验证\n\n- 真实安装 Skill、插件或 CLI\n- 执行脚本、修改本地文件或访问外部服务\n- 验证真实输出质量、性能和兼容性\n\n## 边界与风险判断卡\n\n- **把安装前预览误认为真实运行**：用户可能高估项目已经完成的配置、权限和兼容性验证。 处理方式：明确区分 prompt_preview_can_do 与 runtime_required。 Claim：`clm_0005` inferred 0.45\n- **命令执行会修改本地环境**：安装命令可能写入用户主目录、宿主插件目录或项目配置。 处理方式：先在隔离环境或测试账号中运行。 证据：`README.md` Claim：`clm_0006` supported 0.86\n- **待确认**：真实安装后是否与用户当前宿主 AI 版本兼容？。原因：兼容性只能通过实际宿主环境验证。\n- **待确认**：项目输出质量是否满足用户具体任务？。原因：安装前预览只能展示流程和边界，不能替代真实评测。\n- **待确认**：安装命令是否需要网络、权限或全局写入？。原因：这影响企业环境和个人环境的安装风险。\n\n## 开工前工作上下文\n\n### 加载顺序\n\n- 先读取 how_to_use.host_ai_instruction，建立安装前判断资产的边界。\n- 读取 claim_graph_summary，确认事实来自 Claim/Evidence Graph，而不是 Human Wiki 叙事。\n- 再读取 intended_users、capabilities 和 quick_start_candidates，判断用户是否匹配。\n- 需要执行具体任务时，优先查 role_skill_index，再查 evidence_index。\n- 遇到真实安装、文件修改、网络访问、性能或兼容性问题时，转入 risk_card 和 boundaries.runtime_required。\n\n### 任务路由\n\n- 
**命令行启动或安装流程**：先说明这是安装后验证能力，再给出安装前检查清单。 边界：必须真实安装或运行后验证。 证据：`README.md` Claim：`clm_0001` supported 0.86\n\n### 上下文规模\n\n- 文件总数：1217\n- 重要文件覆盖：40/1217\n- 证据索引条目：64\n- 角色 / Skill 条目：15\n\n### 证据不足时的处理\n\n- **missing_evidence**：说明证据不足，要求用户提供目标文件、README 段落或安装后验证记录；不要补全事实。\n- **out_of_scope_request**：说明该任务超出当前 AI Context Pack 证据范围，并建议用户先查看 Human Manual 或真实安装后验证。\n- **runtime_request**：给出安装前检查清单和命令来源，但不要替用户执行命令或声称已执行。\n- **source_conflict**：同时展示冲突来源，标记为待核实，不要强行选择一个版本。\n\n## Prompt Recipes\n\n### 适配判断\n\n- 目标：判断这个项目是否适合用户当前任务。\n- 预期输出：适配结论、关键理由、证据引用、安装前可预览内容、必须安装后验证内容、下一步建议。\n\n```text\n请基于 evidently 的 AI Context Pack，先问我 3 个必要问题，然后判断它是否适合我的任务。回答必须包含：适合谁、能做什么、不能做什么、是否值得安装、证据来自哪里。所有项目事实必须引用 evidence_refs、source_paths 或 claim_id。\n```\n\n### 安装前体验\n\n- 目标：让用户在安装前感受核心工作流，同时避免把预览包装成真实能力或营销承诺。\n- 预期输出：一段带边界标签的体验剧本、安装后验证清单和谨慎建议；不含真实运行承诺或强营销表述。\n\n```text\n请把 evidently 当作安装前体验资产，而不是已安装工具或真实运行环境。\n\n请严格输出四段：\n1. 先问我 3 个必要问题。\n2. 给出一段“体验剧本”：用 [安装前可预览]、[必须安装后验证]、[证据不足] 三种标签展示它可能如何引导工作流。\n3. 给出安装后验证清单：列出哪些能力只有真实安装、真实宿主加载、真实项目运行后才能确认。\n4. 
给出谨慎建议：只能说“值得继续研究/试装”“先补充信息后再判断”或“不建议继续”，不得替项目背书。\n\n硬性边界：\n- 不要声称已经安装、运行、执行测试、修改文件或产生真实结果。\n- 不要写“自动适配”“确保通过”“完美适配”“强烈建议安装”等承诺性表达。\n- 如果描述安装后的工作方式，必须使用“如果安装成功且宿主正确加载 Skill，它可能会……”这种条件句。\n- 体验剧本只能写成“示例台词/假设流程”：使用“可能会询问/可能会建议/可能会展示”，不要写“已写入、已生成、已通过、正在运行、正在生成”。\n- Prompt Preview 不负责给安装命令；如用户准备试装，只能提示先阅读 Quick Start 和 Risk Card，并在隔离环境验证。\n- 所有项目事实必须来自 supported claim、evidence_refs 或 source_paths；inferred/unverified 只能作风险或待确认项。\n\n```\n\n### 角色 / Skill 选择\n\n- 目标：从项目里的角色或 Skill 中挑选最匹配的资产。\n- 预期输出：候选角色或 Skill 列表，每项包含适用场景、证据路径、风险边界和是否需要安装后验证。\n\n```text\n请读取 role_skill_index，根据我的目标任务推荐 3-5 个最相关的角色或 Skill。每个推荐都要说明适用场景、可能输出、风险边界和 evidence_refs。\n```\n\n### 风险预检\n\n- 目标：安装或引入前识别环境、权限、规则冲突和质量风险。\n- 预期输出：环境、权限、依赖、许可、宿主冲突、质量风险和未知项的检查清单。\n\n```text\n请基于 risk_card、boundaries 和 quick_start_candidates，给我一份安装前风险预检清单。不要替我执行命令，只说明我应该检查什么、为什么检查、失败会有什么影响。\n```\n\n### 宿主 AI 开工指令\n\n- 目标：把项目上下文转成一次对话开始前的宿主 AI 指令。\n- 预期输出：一段边界明确、证据引用明确、适合复制给宿主 AI 的开工前指令。\n\n```text\n请基于 evidently 的 AI Context Pack，生成一段我可以粘贴给宿主 AI 的开工前指令。这段指令必须遵守 not_runtime=true，不能声称项目已经安装、运行或产生真实结果。\n```\n\n\n## 角色 / Skill 索引\n\n- 共索引 15 个角色 / Skill / 项目文档条目。\n\n- **:bar chart: What is Evidently?**（project_doc）：An open-source framework to evaluate, test and monitor ML and LLM-powered systems. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`README.md`\n- **Evidently API Reference Documentation**（project_doc）：Evidently API Reference Documentation 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`api-reference/README.md`\n- **Examples**（project_doc）：This folder contains examples of how to use Evidently to evaluate, test, and monitor different types of AI systems: agentic workflows, LLM applications, and traditional ML models. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/README.md`\n- **Evidently UI**（project_doc）：- service service - monitoring UI - standalone standalone - additional package we use for embedding graphics for instance in jupyter notebook and other places 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`ui/README.md`\n- **Cookbook**（project_doc）：This folder contains short, focused examples for specific Evidently functionality. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/cookbook/README.md`\n- **Grafana**（project_doc）：This folder contains examples of using Evidently metrics with Grafana dashboards. 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/grafana/README.md`\n- **Monitoring example**（project_doc）：You need following tools installed: - docker - docker-compose included to Docker Desktop for Mac and Docker Desktop for Windows 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/grafana/grafana_data_drift_dashboard/README.md`\n- **Monitoring example**（project_doc）：You need following tools installed: - docker - docker-compose included to Docker Desktop for Mac and Docker Desktop for Windows 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/grafana/grafana_llm_evaluation_dashboard/README.md`\n- **Evidently UI Demo: Remote Project Example**（project_doc）：Evidently UI Demo: Remote Project Example 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`examples/service/README.md`\n- **Alembic Migrations for SQL Storage**（project_doc）：This directory contains Alembic migration scripts for managing database schema changes in the SQL storage backend. 
激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`src/evidently/ui/service/storage/sql/migrations/README.md`\n- **Readme**（project_doc）： 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`ui/html-visual-testing/README.md`\n- **Evidently UI**（project_doc）：Package common to service and standalone 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`ui/packages/evidently-ui-lib/README.md`\n- **Evidently UI service**（project_doc）： 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`ui/service/README.md`\n- **Evidently standalone**（project_doc）： 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`ui/standalone/README.md`\n- **Contributing to Evidently**（project_doc）：Thank you for considering contributing to Evidently! 激活提示：当用户需要理解项目结构、安装方式或边界时参考。 证据：`CONTRIBUTING.md`\n\n## 证据索引\n\n- 共索引 64 条证据。\n\n- **:bar chart: What is Evidently?**（documentation）：An open-source framework to evaluate, test and monitor ML and LLM-powered systems. 证据：`README.md`\n- **Evidently API Reference Documentation**（documentation）：Evidently API Reference Documentation 证据：`api-reference/README.md`\n- **Examples**（documentation）：This folder contains examples of how to use Evidently to evaluate, test, and monitor different types of AI systems: agentic workflows, LLM applications, and traditional ML models. 证据：`examples/README.md`\n- **Evidently UI**（documentation）：- service service - monitoring UI - standalone standalone - additional package we use for embedding graphics for instance in jupyter notebook and other places 证据：`ui/README.md`\n- **Cookbook**（documentation）：This folder contains short, focused examples for specific Evidently functionality. 证据：`examples/cookbook/README.md`\n- **Grafana**（documentation）：This folder contains examples of using Evidently metrics with Grafana dashboards. 
证据：`examples/grafana/README.md`\n- **Monitoring example**（documentation）：You need following tools installed: - docker - docker-compose included to Docker Desktop for Mac and Docker Desktop for Windows 证据：`examples/grafana/grafana_data_drift_dashboard/README.md`\n- **Monitoring example**（documentation）：You need following tools installed: - docker - docker-compose included to Docker Desktop for Mac and Docker Desktop for Windows 证据：`examples/grafana/grafana_llm_evaluation_dashboard/README.md`\n- **Evidently UI Demo: Remote Project Example**（documentation）：Evidently UI Demo: Remote Project Example 证据：`examples/service/README.md`\n- **Alembic Migrations for SQL Storage**（documentation）：This directory contains Alembic migration scripts for managing database schema changes in the SQL storage backend. 证据：`src/evidently/ui/service/storage/sql/migrations/README.md`\n- **Evidently UI**（documentation）：Package common to service and standalone 证据：`ui/packages/evidently-ui-lib/README.md`\n- **Evidently UI service**（documentation）：Evidently UI service 证据：`ui/service/README.md`\n- **Evidently standalone**（documentation）：Evidently standalone 证据：`ui/standalone/README.md`\n- **Package**（package_manifest）：{ \"version\": \"1.0.0\", \"description\": \"Evidently UI Monorepo\", \"scripts\": { \"type-check\": \"pnpm --workspace-concurrency 1 -r type-check\", \"code-check\": \"pnpm -r code-check\", \"build\": \"pnpm -r build\", \"clean-assets-service\": \"rm -rf ../src/evidently/ui/service/assets/ \", \"clean-assets-standalone\": \"rm -rf ../src/evidently/nbextension/static/index.js\", \"copy-assets-service\": \"cp -r ./service/dist/ ../src/evidently/ui/service/assets\", \"copy-assets-standalone\": \"cp ./standalone/dist/index.js ../src/evidently/nbextension/static\", \"clean-assets-all\": \"pnpm clean-assets-service && pnpm clean-assets-standalone\", \"copy-assets-all\": \"pnpm copy-assets-service && pnpm copy-assets-standalone\", \"full-buil… 证据：`ui/package.json`\n- **Contributing to 
Evidently**（documentation）：Thank you for considering contributing to Evidently! 证据：`CONTRIBUTING.md`\n- **Package**（package_manifest）：{ \"name\": \"html-visual-testing\", \"version\": \"1.0.0\", \"type\": \"module\", \"scripts\": { \"test\": \"playwright test\" }, \"dependencies\": { }, \"devDependencies\": { \"wait-on\": \"^7.1.0\", \"@playwright/test\": \"^1.43.0\" } } 证据：`ui/html-visual-testing/package.json`\n- **Package**（package_manifest）：{ \"name\": \"evidently-ui-lib\", \"version\": \"1.0.0\", \"type\": \"module\", \"scripts\": { \"tsc\": \"tsc\", \"code-check\": \"biome check\" }, \"exports\": { \"./routes-components/ /data\": { \"default\": \"./src/routes-components/ /data.tsx\" }, \"./routes-components/ \": { \"default\": \"./src/routes-components/ /index.tsx\" }, \"./ \": { \"default\": \"./src/ \", \"types\": \"./.tsc-dts/src/ /index.d.ts\", \"./.tsc-dts/src/ .d.ts\" } }, \"peerDependencies\": { \"react\": \"^18.2.0\", \"react-dom\": \"^18.2.0\" }, \"dependencies\": { \"@hookform/resolvers\": \"^3.3.1\", \"@material-table/core\": \"^6.4.4\", \"@mui/icons-material\": \"^7.0.2\", \"@mui/material\": \"^7.0.2\", \"@mui/x-date-pickers\": \"^8.2.0\", \"@mui/x-data-grid\": \"^7.16.0\", \"material-ui-popup-st… 证据：`ui/packages/evidently-ui-lib/package.json`\n- **Package**（package_manifest）：{ \"name\": \"service\", \"version\": \"1.0.0\", \"type\": \"module\", \"scripts\": { \"dev\": \"vite\", \"type-check\": \"tsc --build\", \"build\": \"vite build\", \"code-check\": \"biome check\", \"preview\": \"vite preview\", \"test\": \"playwright test\", \"sync-back\": \"cd ../.. 
&& ./.github/scripts/get-types-from-back.sh\" }, \"dependencies\": { \"evidently-ui-lib\": \"workspace: \", \"dayjs\": \"^1.11.13\", \"react\": \"^18.2.0\", \"react-dom\": \"^18.2.0\", \"tiny-invariant\": \"^1.3.3\" }, \"devDependencies\": { \"wait-on\": \"^7.1.0\", \"@playwright/test\": \"^1.43.0\", \"@types/react\": \"^18.2.0\", \"@types/react-dom\": \"^18.2.0\", \"vite\": \"^7.0.6\", \"@vitejs/plugin-react-swc\": \"^3.11.0\", \"vite-tsconfig-paths\": \"^5.1.4\", \"typescript\": \"^5.7.2\" } } 证据：`ui/service/package.json`\n- **Package**（package_manifest）：{ \"name\": \"standalone\", \"version\": \"1.0.0\", \"type\": \"module\", \"scripts\": { \"dev\": \"vite\", \"type-check\": \"tsc --build\", \"build\": \"vite build\", \"code-check\": \"biome check\", \"preview\": \"vite preview\" }, \"dependencies\": { \"evidently-ui-lib\": \"workspace: \", \"react\": \"^18.2.0\", \"react-dom\": \"^18.2.0\" }, \"devDependencies\": { \"@types/react\": \"^18.2.0\", \"@types/react-dom\": \"^18.2.0\", \"@vitejs/plugin-react-swc\": \"^3.7.0\", \"typescript\": \"^5.7.2\", \"vite\": \"^5.2.12\", \"vite-tsconfig-paths\": \"^4.2.1\" } } 证据：`ui/standalone/package.json`\n- **License**（source_file）：Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ 证据：`LICENSE`\n- **Devcontainer**（structured_config）：{ \"name\": \"Default Linux Universal\", \"image\": \"mcr.microsoft.com/devcontainers/universal:2\", \"onCreateCommand\": \"./.devcontainer/on-create.sh\", \"postAttachCommand\": { \"ui\": \"source ./venv/bin/activate && evidently ui --port 8000 --demo-projects all --workspace test-workspace\" }, \"forwardPorts\": 8000 , \"portsAttributes\": { \"8000\": { \"label\": \"evidently UI\", \"onAutoForward\": \"openBrowser\" } } } 证据：`.devcontainer/devcontainer.json`\n- **Config**（structured_config）：{ \"data format\": { \"separator\": \",\", \"header\": true, \"date column\": \"dteday\" }, \"column mapping\" : {}, \"dashboard tabs\": { \"data drift\": { }, \"cat target drift\":{ 
\"verbose level\": 0 } }, \"options\": { \"data drift\": { \"confidence\": 0.95, \"drift share\": 0.5, \"nbinsx\": null, \"xbins\": null } }, \"pretty print\": true, \"sampling\": { \"reference\": { \"type\": \"none\", \"n\": 1, \"ratio\": 0.1 }, \"current\": { \"type\": \"nth\", \"n\": 2, \"ratio\": 0.1 } } } 证据：`config.json`\n- **Tsconfig.Base.Ui**（structured_config）：{ \"compilerOptions\": { \"target\": \"ESNext\", \"useDefineForClassFields\": true, \"lib\": \"ESNext\", \"DOM\", \"DOM.Iterable\" , \"module\": \"ESNext\", \"skipLibCheck\": true, 证据：`ui/tsconfig.base.ui.json`\n- **Data Drift**（structured_config）：{ \"annotations\": { \"list\": { \"builtIn\": 1, \"datasource\": { \"type\": \"grafana\", \"uid\": \"-- Grafana --\" }, \"enable\": true, \"hide\": true, \"iconColor\": \"rgba 0, 211, 255, 1 \", \"name\": \"Annotations & Alerts\", \"target\": { \"limit\": 100, \"matchAny\": false, \"tags\": , \"type\": \"dashboard\" }, \"type\": \"dashboard\" } }, \"editable\": true, \"fiscalYearStartMonth\": 0, \"graphTooltip\": 0, \"id\": 1, \"links\": , \"liveNow\": false, \"panels\": { \"datasource\": { \"type\": \"postgres\", \"uid\": \"PCC52D03280B7034C\" }, \"fieldConfig\": { \"defaults\": { \"color\": { \"mode\": \"palette-classic\" }, \"custom\": { \"axisLabel\": \"\", \"axisPlacement\": \"auto\", \"barAlignment\": 0, \"drawStyle\": \"line\", \"fillOpacity\": 0, \"gradientMode\": \"none\", \"hideF… 证据：`examples/grafana/grafana_data_drift_dashboard/dashboards/data_drift.json`\n- **Chatbot Evals**（structured_config）：{ \"annotations\": { \"list\": { \"builtIn\": 1, \"datasource\": { \"type\": \"grafana\", \"uid\": \"-- Grafana --\" }, \"enable\": true, \"hide\": true, \"iconColor\": \"rgba 0, 211, 255, 1 \", \"name\": \"Annotations & Alerts\", \"type\": \"dashboard\" } }, \"editable\": true, \"fiscalYearStartMonth\": 0, \"graphTooltip\": 0, \"id\": 2, \"links\": , \"panels\": { \"datasource\": { \"type\": \"grafana-postgresql-datasource\", \"uid\": 
\"PCC52D03280B7034C\" }, \"fieldConfig\": { \"defaults\": { \"color\": { \"mode\": \"thresholds\" }, \"mappings\": , \"thresholds\": { \"mode\": \"absolute\", \"steps\": { \"color\": \"green\" }, { \"color\": \"red\", \"value\": 80 } } }, \"overrides\": }, \"gridPos\": { \"h\": 8, \"w\": 12, \"x\": 0, \"y\": 0 }, \"id\": 1, \"options\": { \"colorMode\": \"valu… 证据：`examples/grafana/grafana_llm_evaluation_dashboard/dashboards/chatbot_evals.json`\n- **Manifest**（structured_config）：{ \"short name\": \"Evidently.AI\", \"name\": \"Evidently.AI Dashboards\", \"icons\": { \"src\": \"favicon.ico\", \"sizes\": \"64x64 32x32 24x24 16x16\", \"type\": \"image/x-icon\" }, { \"src\": \"favicon-96x96.png\", \"type\": \"image/png\", \"sizes\": \"96x96\" } , \"start url\": \".\", \"display\": \"standalone\", \"theme color\": \" 000000\", \"background color\": \" ffffff\" } 证据：`src/evidently/legacy/ui/assets/manifest.json`\n- **Manifest**（structured_config）：{ \"short name\": \"Evidently.AI\", \"name\": \"Evidently.AI Dashboards\", \"icons\": { \"src\": \"favicon.ico\", \"sizes\": \"64x64 32x32 24x24 16x16\", \"type\": \"image/x-icon\" }, { \"src\": \"favicon-96x96.png\", \"type\": \"image/png\", \"sizes\": \"96x96\" } , \"start url\": \".\", \"display\": \"standalone\", \"theme color\": \" 000000\", \"background color\": \" ffffff\" } 证据：`src/evidently/ui/service/assets/manifest.json`\n- **Tsconfig**（structured_config）：{ \"extends\": \"../../tsconfig.base.ui\", \"compilerOptions\": { \"composite\": true, \"outDir\": \".tsc-dts\", \"noEmit\": false, \"emitDeclarationOnly\": true, \"declaration\": true, \"baseUrl\": \"./src\", \"paths\": { \"~/ \": \" \" } }, \"include\": \"src\" } 证据：`ui/packages/evidently-ui-lib/tsconfig.json`\n- **Manifest**（structured_config）：{ \"short name\": \"Evidently.AI\", \"name\": \"Evidently.AI Dashboards\", \"icons\": { \"src\": \"favicon.ico\", \"sizes\": \"64x64 32x32 24x24 16x16\", \"type\": \"image/x-icon\" }, { \"src\": \"favicon-96x96.png\", 
\"type\": \"image/png\", \"sizes\": \"96x96\" } , \"start url\": \".\", \"display\": \"standalone\", \"theme color\": \" 000000\", \"background color\": \" ffffff\" } 证据：`ui/service/public/manifest.json`\n- **Tsconfig**（structured_config）：{ \"extends\": \"../tsconfig.base.ui\", \"compilerOptions\": { \"baseUrl\": \"./src\", \"paths\": { \"~/ \": \" \" } }, \"include\": \"src\" , \"references\": { \"path\": \"../packages/evidently-ui-lib/\" } } 证据：`ui/service/tsconfig.json`\n- **Tsconfig**（structured_config）：{ \"extends\": \"../tsconfig.base.ui\", \"compilerOptions\": { \"baseUrl\": \"./src\" }, \"include\": \"src\" , \"references\": { \"path\": \"../packages/evidently-ui-lib/\" } } 证据：`ui/standalone/tsconfig.json`\n- **!/usr/bin/env bash**（source_file）：virtualenv venv source ./venv/bin/activate pip install -e \". dev \" 证据：`.devcontainer/on-create.sh`\n- **.dockerignore**（source_file）：ui 证据：`.dockerignore`\n- **.dvc/.gitignore**（source_file）：/config.local /tmp /cache 证据：`.dvc/.gitignore`\n- **Config**（source_file）：core remote = gs 'remote \"gs\"' url = gs://evidently-service-visual-testing 证据：`.dvc/config`\n- **Add patterns of files dvc should ignore, which could improve**（source_file）：Add patterns of files dvc should ignore, which could improve the performance. Learn more at https://dvc.org/doc/user-guide/dvcignore 证据：`.dvcignore`\n- **.Gitbook**（source_file）：structure: readme: README.md summary: SUMMARY.md 证据：`.gitbook.yaml`\n- **see .devcontainer/devcontainer.json**（source_file）：.egg-info venv .venv .vscode .DS Store examples/.DS Store example scripts .ipynb checkpoints .idea .mypy cache .pytest cache dist build pytest-reports/ MANIFEST uv.lock 证据：`.gitignore`\n- **.Pre Commit Config**（source_file）：repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: \"v4.6.0\" hooks: - id: check-case-conflict - id: check-merge-conflict - id: check-toml - id: check-yaml exclude: ' docs examples /. ' - id: end-of-file-fixer exclude: ' docs examples /. 
' - id: trailing-whitespace exclude: ' docs examples /. ' 证据：`.pre-commit-config.yaml`\n- **Manifest**（source_file）：include setupbase.py include README.md 证据：`MANIFEST.in`\n- **!/usr/bin/env -S uv run --script**（source_file）：!/usr/bin/env -S uv run --script /// script requires-python = \" =3.11\" dependencies = \"pdoc =16.0.0\", \"tracely =0.2.12\", \"typer =0.3\", /// 证据：`api-reference/generate.py`\n- **!/bin/bash**（source_file）：SCRIPT DIR=\"$ cd \"$ dirname \"$0\" \" && pwd \" 证据：`api-reference/local-dev.sh`\n- **Biome**（source_file）：{ \"files\": { // endpoints.d.ts is autogenerated // JsonParser.ts copied from https://github.com/douglascrockford/JSON-js \"ignore\": \"ui/packages/evidently-ui-lib/src/ /JsonParser.ts\", \"ui/packages/evidently-ui-lib/src/ /endpoints.d.ts\", \"ui/packages/evidently-ui-lib/.tsc-dts/ \" , \"include\": \"ui/ /src/ \", \"ui/ /tests/ \", \"ui/packages/ /src/ \" }, \"formatter\": { \"enabled\": true, \"formatWithErrors\": false, \"indentStyle\": \"space\", \"indentWidth\": 2, \"lineEnding\": \"lf\", \"lineWidth\": 100, \"attributePosition\": \"auto\" }, \"css\": { \"linter\": { \"enabled\": true }, \"formatter\": { \"enabled\": true } }, \"javascript\": { \"formatter\": { \"arrowParentheses\": \"always\", \"bracketSameLine\": false, \"bracketSpacing\": tr… 证据：`biome.jsonc`\n- **Config**（source_file）：data format: separator: \",\" header: true date column: \"dteday\" column mapping: {} profile sections: - data drift: xbins: nbinsx: drift share: 0.5 pretty print: true sampling: reference: type: \"simple\" could be \"none\", \"simple\", \"random\" n: 5 used with simple sampling, number of rows to skip ratio: 0.1 used with random sampling, part of data to take from chunk random seed: 4 used with random sampling, used as seed for random generator current: type: \"nth\" could be \"none\", \"simple\", \"random\" n: 5 used with simple sampling, number of rows to skip ratio: 0.1 used with random sampling, part of data to take from chunk 
证据：`config.yaml`\n- **Dockerfile**（source_file）：FROM python:3.11-slim-bookworm ARG BUILD DATE ARG VCS REF ARG VERSION ARG INSTALL EXTRAS=\" sql \" LABEL org.label-schema.build-date=$BUILD DATE \\ org.label-schema.name=\"Evidently AI Service\" \\ org.label-schema.url=\"https://github.com/evidentlyai/evidently\" \\ org.label-schema.vcs-ref=$VCS REF \\ org.label-schema.vcs-url=\"https://github.com/evidentlyai/evidently\" \\ org.label-schema.version=$VERSION \\ org.label-schema.schema-version=\"1.0\" LABEL maintainer=\"mike0sv@evidentlyai.com\" 证据：`docker/Dockerfile.service`\n- **Dockerfile.Service**（source_file）：FROM python:3.11-slim-bookworm ARG BUILD DATE ARG VCS REF ARG VERSION LABEL org.label-schema.build-date=$BUILD DATE \\ org.label-schema.name=\"Evidently AI Service\" \\ org.label-schema.url=\"https://github.com/evidentlyai/evidently\" \\ org.label-schema.vcs-ref=$VCS REF \\ org.label-schema.vcs-url=\"https://github.com/evidentlyai/evidently\" \\ org.label-schema.version=$VERSION \\ org.label-schema.schema-version=\"1.0\" LABEL maintainer=\"mike0sv@evidentlyai.com\" 证据：`docker/Dockerfile.service.dev`\n- **Build Docker image**（source_file）：Build Docker image build: docker build output 证据：`docker/Makefile`\n- **Example Test**（source_file）：excludes = \"nyc taxi data drift dashboard customization.py\", \"historical drift visualization.py\", \"mlflow integration.py\", \"ibm hr attrition model validation.py\", \"bicycle demand monitoring setup.py\", \"data and ml monitoring tutorial.py\", \"how to run drift report for text encoders.py\", \"comparing custom statest with classic distributions.py\", \"how to evaluate llm with text descriptors.py\", \"how to run drift report for text data.py\", too slow & torch version conflict? 
\"llm evaluation tutorial.py\", cloud usage \"llm tracing tutorial.py\", cloud usage 证据：`example_test.py`\n- **Agentic Systems Tracing**（source_file）：{ \"cells\": { \"cell type\": \"markdown\", \"id\": \"767f3682-1a00-4ca2-92db-8235879315ee\", \"metadata\": {}, \"source\": \" Agentic Systems: Tracing and Evaluation\" }, { \"cell type\": \"code\", \"execution count\": null, \"id\": \"1fbcabcc-b6ec-4ed4-9ae5-6c7c6a692084\", \"metadata\": {}, \"outputs\": , \"source\": \"import openai\\n\", \"import tracely\\n\", \"import evidently.ui.runner as service runner\" }, { \"cell type\": \"markdown\", \"id\": \"a23f3141-1636-4c66-baa8-c85044610740\", \"metadata\": {}, \"source\": \" Let's run evidently ui service in background\" }, { \"cell type\": \"code\", \"execution count\": null, \"id\": \"440d97b8-eff8-40e2-87bc-e15b30647375\", \"metadata\": {}, \"outputs\": , \"source\": \"runner = service runner.EvidentlyUIRu… 证据：`examples/agentic_systems_tracing.ipynb`\n- **Classic Ml Validation**（source_file）：{ \"cells\": { \"cell type\": \"markdown\", \"id\": \"fef71322-f97b-416c-bbb9-bf1dd39e40ab\", \"metadata\": {}, \"source\": \" Classic ML Validation: Data Quality and Drift\" }, { \"cell type\": \"code\", \"execution count\": null, \"id\": \"e5aa23cf-2d2c-46fa-bb66-05623f81e7d8\", \"metadata\": {}, \"outputs\": , \"source\": \"import pandas as pd\\n\", \"from sklearn import datasets\\n\", \"\\n\", \"from evidently import Dataset\\n\", \"from evidently import DataDefinition\\n\", \"from evidently import Report\\n\", \"from evidently.presets import DataDriftPreset, DataSummaryPreset\" }, { \"cell type\": \"markdown\", \"id\": \"dccb8957-33fc-4a43-90b4-202f2caffeba\", \"metadata\": {}, \"source\": \" Let's load tabular dataset for binary classification prob… 证据：`examples/classic_ml_validation.ipynb`\n- **Llm Input Output Validation**（source_file）：{ \"cells\": { \"cell type\": \"markdown\", \"id\": \"e94178b9-eb37-44dc-aa84-2a4ef489b4a3\", \"metadata\": {}, \"source\": \" LLM 
Validation: Input and Output Quality\" }, { \"cell type\": \"code\", \"execution count\": null, \"id\": \"ba78f1ae-cc47-4d50-a655-37f1e63869ed\", \"metadata\": {}, \"outputs\": , \"source\": \"import pandas as pd\\n\", \"\\n\", \"from evidently import Dataset\\n\", \"from evidently import DataDefinition\\n\", \"from evidently import Report\\n\", \"from evidently.presets import TextEvals\\n\", \"from evidently.tests import lte, gte, eq\\n\", \"from evidently.descriptors import LLMEval, TestSummary, DeclineLLMEval, Sentiment, TextLength, IncludesWords\\n\", \"from evidently.llm.templates import BinaryClassificationPromp… 证据：`examples/llm_input_output_validation.ipynb`\n- **Pyproject**（source_file）：build-system requires = \"hatchling\" build-backend = \"hatchling.build\" 证据：`pyproject.toml`\n- **keep in sync with pyproject.toml project.optional-dependencies dev block.**（source_file）：keep in sync with pyproject.toml project.optional-dependencies dev block. wheel==0.38.1 setuptools==65.5.1; python version = '3.12' jupyter==1.0.0 mypy==1.1.1 pandas-stubs pytest==7.4.4 pytest-asyncio==0.23.7 pytest-mock==3.14.0 pytest-html =4.2.0 types-PyYAML==6.0.1 types-requests==2.26.0 types-dataclasses==0.6 types-python-dateutil==2.8.19 types-ujson==5.4.0 pillow==11.3.0 pip-audit pyspark ruff==0.3.7 pre-commit==3.5.0 tracely =0.2.11 证据：`requirements.dev.txt`\n- **keep in sync with pyproject.toml project dependencies but there should be set as lower bound .**（source_file）：keep in sync with pyproject.toml project dependencies but there should be set as lower bound . 
plotly==5.10.0 statsmodels==0.14.0 scikit-learn==1.1.1 pandas parquet ==1.3.5 numpy==1.23.0 nltk==3.6.7 scipy==1.10.0 requests==2.32.0 PyYAML==6.0.1 pydantic==1.10.16 litestar==2.19.0 typing-inspect==0.9.0 uvicorn==0.22.0 watchdog==3.0.0 typer==0.3 rich==13 iterative-telemetry==0.0.5 dynaconf==3.2.4 pyarrow==14.0.1 pyspark==3.4.0 fsspec==2024.9.0 certifi==2024.7.4 urllib3==1.26.19 ujson==5.4.0 deprecation==2.1.0 uuid6==2024.7.10 cryptography==43.0.1 tracely==0.2.11 opentelemetry-proto==1.25.0 证据：`requirements.min.txt`\n- **Add the line-too-long rule to the enforced rule set. By default, Ruff omits rules that**（source_file）：lint.pycodestyle max-line-length = 140 allow unforamttable lines up to 140 chars 证据：`ruff.toml`\n- **test_data/.gitignore**（source_file）：/lfw-dataset.zip /20news-bydate py3.pkz 证据：`test_data/.gitignore`\n- **20News Bydate Py3.Pkz**（source_file）：outs: - md5: 795cc33db0b25fdf479a34c9bb776b3e size: 15302062 hash: md5 path: 20news-bydate py3.pkz 证据：`test_data/20news-bydate_py3.pkz.dvc`\n- **Adults**（source_file）：Title: Adult Description: Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as \"Census Income\" dataset. 
Dataset source: https://archive.ics.uci.edu/dataset/2/adult Creators: Barry Becker, Ronny Kohavi Licence: CC BY 4.0 证据：`test_data/adults.CITATION`\n- **Bike Sharing Dataset**（source_file）：========================================= License ========================================= Use of this dataset in publications must be cited to the following publication: 证据：`test_data/bike_sharing_dataset.CITATION`\n- **Lfw Dataset.Zip**（source_file）：outs: - md5: 658ceabf5ff1545c5ee12cbcfbd8f82b size: 318271483 hash: md5 path: lfw-dataset.zip 证据：`test_data/lfw-dataset.zip.dvc`\n- 其余 4 条证据见 `AI_CONTEXT_PACK.json` 或 `EVIDENCE_INDEX.json`。\n\n## 宿主 AI 必须遵守的规则\n\n- **把本资产当作开工前上下文，而不是运行环境。**：AI Context Pack 只包含证据化项目理解，不包含目标项目的可执行状态。 证据：`README.md`, `api-reference/README.md`, `examples/README.md`\n- **回答用户时区分可预览内容与必须安装后才能验证的内容。**：安装前体验的消费者价值来自降低误装和误判，而不是伪装成真实运行。 证据：`README.md`, `api-reference/README.md`, `examples/README.md`\n\n## 用户开工前应该回答的问题\n\n- 你准备在哪个宿主 AI 或本地环境中使用它？\n- 你只是想先体验工作流，还是准备真实安装？\n- 你最在意的是安装成本、输出质量、还是和现有规则的冲突？\n\n## 验收标准\n\n- 所有能力声明都能回指到 evidence_refs 中的文件路径。\n- AI_CONTEXT_PACK.md 没有把预览包装成真实运行。\n- 用户能在 3 分钟内看懂适合谁、能做什么、如何开始和风险边界。\n\n---\n\n## Doramagic Context Augmentation\n\n下面内容用于强化 Repomix/AI Context Pack 主体。Human Manual 只提供阅读骨架；踩坑日志会被转成宿主 AI 必须遵守的工作约束。\n\n## Human Manual 骨架\n\n使用规则：这里只是项目阅读路线和显著性信号，不是事实权威。具体事实仍必须回到 repo evidence / Claim Graph。\n\n宿主 AI 硬性规则：\n- 不得把页标题、章节顺序、摘要或 importance 当作项目事实证据。\n- 解释 Human Manual 骨架时，必须明确说它只是阅读路线/显著性信号。\n- 能力、安装、兼容性、运行状态和风险判断必须引用 repo evidence、source path 或 Claim Graph。\n\n- **Architecture Overview**：importance `high`\n  - source_paths: src/evidently/__init__.py, src/evidently/_registry.py, src/evidently/sdk/__init__.py, src/evidently/legacy/__init__.py, src/evidently/future/__init__.py\n- **Core Components**：importance `high`\n  - source_paths: src/evidently/core/base_types.py, src/evidently/core/metric_types.py, src/evidently/core/report.py, src/evidently/core/tests.py, src/evidently/core/registries/components.py\n- **Data 
Management and Data Flow**：importance `high`\n  - source_paths: src/evidently/core/datasets.py, src/evidently/core/container.py, src/evidently/sdk/datasets.py, src/evidently/llm/datagen/base.py, src/evidently/llm/rag/index.py\n- **ML Model Evaluation**：importance `high`\n  - source_paths: src/evidently/metrics/classification.py, src/evidently/metrics/regression.py, src/evidently/metrics/data_quality.py, src/evidently/metrics/dataset_statistics.py, src/evidently/metrics/recsys.py\n- **LLM Evaluation and Judging**：importance `high`\n  - source_paths: src/evidently/descriptors/llm_judges.py, src/evidently/descriptors/_context_relevance.py, src/evidently/llm/prompts/__init__.py, src/evidently/llm/utils/parsing.py, src/evidently/llm/utils/prompt_render.py\n- **Descriptors and Features System**：importance `medium`\n  - source_paths: src/evidently/descriptors/__init__.py, src/evidently/descriptors/_text_length.py, src/evidently/descriptors/_custom_descriptors.py, src/evidently/descriptors/generated_descriptors.py, src/evidently/descriptors/text_match.py\n- **Reports and Test Suites**：importance `high`\n  - source_paths: src/evidently/core/report.py, src/evidently/core/tests.py, src/evidently/core/compare.py, src/evidently/core/serialization.py, src/evidently/legacy/report/report.py\n- **Presets and Metric Presets**：importance `high`\n  - source_paths: src/evidently/presets/__init__.py, src/evidently/presets/drift.py, src/evidently/presets/classification.py, src/evidently/presets/regression.py, src/evidently/presets/recsys.py\n\n## Repo Inspection Evidence / 源码检查证据\n\n- repo_clone_verified: true\n- repo_inspection_verified: true\n- repo_commit: `a4aa4c2b37fe7a4344cc5031f566deccf3d69e4f`\n- inspected_files: `pyproject.toml`, `README.md`, `examples/README.md`, `examples/cookbook/README.md`, `examples/service/postgres_config.yaml`, `examples/service/sqlite_config.yaml`, `examples/service/README.md`, `examples/service/docker-compose.yml`, 
`examples/service/remote_demo_project.py`, `examples/grafana/README.md`, `examples/future_examples/future_reviews.py`, `examples/grafana/grafana_llm_evaluation_dashboard/README.md`, `examples/grafana/grafana_llm_evaluation_dashboard/docker-compose.yml`, `examples/grafana/grafana_llm_evaluation_dashboard/evidently_metrics_calculation.py`, `examples/grafana/grafana_data_drift_dashboard/README.md`, `examples/grafana/grafana_data_drift_dashboard/docker-compose.yml`, `examples/grafana/grafana_data_drift_dashboard/evidently_metrics_calculation.py`, `examples/grafana/grafana_llm_evaluation_dashboard/config/grafana_dashboards.yaml`, `examples/grafana/grafana_llm_evaluation_dashboard/config/grafana_datasources.yaml`, `examples/grafana/grafana_llm_evaluation_dashboard/dashboards/chatbot_evals.json`\n\n宿主 AI 硬性规则：\n- 没有 repo_clone_verified=true 时，不得声称已经读过源码。\n- 没有 repo_inspection_verified=true 时，不得把 README/docs/package 文件判断写成事实。\n- 没有 quick_start_verified=true 时，不得声称 Quick Start 已跑通。\n\n## Doramagic Pitfall Constraints / 踩坑约束\n\n这些规则来自 Doramagic 发现、验证或编译过程中的项目专属坑点。宿主 AI 必须把它们当作工作约束，而不是普通说明文字。\n\n### Constraint 1: 来源证据：Update scikit-learn version requirement to support v1.6.0\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Update scikit-learn version requirement to support v1.6.0\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能影响升级、迁移或版本选择。\n- Evidence: community_evidence:github | cevd_e82e24d201134e7f8dfa0b564d73fce8 | https://github.com/evidentlyai/evidently/issues/1407 | 来源类型 github_issue 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 2: 来源证据：PromptOptimizer throws OpenAIError when using Vertex AI judge\n\n- Trigger: GitHub 社区证据显示该项目存在一个配置相关的待验证问题：PromptOptimizer throws OpenAIError when using Vertex AI judge\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_abf6f9b183ff4892be78fd198f60d8e8 | 
https://github.com/evidentlyai/evidently/issues/1856 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 3: 来源证据：IndexError in infer_column_type when column contains only null values\n\n- Trigger: GitHub 社区证据显示该项目存在一个运行相关的待验证问题：IndexError in infer_column_type when column contains only null values\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能阻塞安装或首次运行。\n- Evidence: community_evidence:github | cevd_44cdd5f2b657432eb8418fb132959ada | https://github.com/evidentlyai/evidently/issues/1764 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 4: 来源证据：Update evidently hashlib usage for FIPS-Compliant Systems and Security Best Practices\n\n- Trigger: GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Update evidently hashlib usage for FIPS-Compliant Systems and Security Best Practices\n- Host AI rule: 来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- Why it matters: 可能阻塞安装或首次运行。\n- Evidence: community_evidence:github | cevd_872f2b57cd3a4c61ad5c387c8be3208b | https://github.com/evidentlyai/evidently/issues/1410 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 5: 来源证据：Numpy 2.x support?\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Numpy 2.x support?\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能阻塞安装或首次运行。\n- Evidence: community_evidence:github | cevd_2fc84415b55a43d2be4f7f47cf46d7a1 | https://github.com/evidentlyai/evidently/issues/1557 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 6: 来源证据：v0.7.12\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.12\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_fcf994a3bcf94b0d9b2c5d6159816728 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.12 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- 
Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 7: 来源证据：v0.7.15\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.15\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_a9de117bd8e440beab9f7be4af50d710 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.15 | 来源讨论提到 linux 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 8: 来源证据：v0.7.20\n\n- Trigger: GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.20\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_f0d67058bf2a40218adfb0eea1f1665f | https://github.com/evidentlyai/evidently/releases/tag/v0.7.20 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 9: 来源证据：v0.7.19\n\n- Trigger: GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.7.19\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_52e6ab37edca4901a0aa9fe7b5ba12f1 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.19 | 来源类型 github_release 暴露的待验证使用条件。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n\n### Constraint 10: 来源证据：v0.7.21\n\n- Trigger: GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.7.21\n- Host AI rule: 来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- Why it matters: 可能增加新用户试用和生产接入成本。\n- Evidence: community_evidence:github | cevd_fe9f0e6e29564afeb5610928cc9bc834 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.21 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n- Hard boundary: 不要把这个坑点包装成已解决、已验证或可忽略，除非后续验证证据明确证明它已经关闭。\n",
      "summary": "给宿主 AI 的上下文和工作边界。",
      "title": "AI Context Pack / 带给我的 AI"
    },
    "boundary_risk_card": {
      "asset_id": "boundary_risk_card",
      "filename": "BOUNDARY_RISK_CARD.md",
      "markdown": "# Boundary & Risk Card / 安装前决策卡\n\n项目：evidentlyai/evidently\n\n## Doramagic 试用结论\n\n当前结论：可以进入发布前推荐检查；首次使用仍应从最小权限、临时目录和可回滚配置开始。\n\n## 用户现在可以做\n\n- 可以先阅读 Human Manual，理解项目目的和主要工作流。\n- 可以复制 Prompt Preview 做安装前体验；这只验证交互感，不代表真实运行。\n- 可以把官方 Quick Start 命令放到隔离环境中验证，不要直接进主力环境。\n\n## 现在不要做\n\n- 不要把 Prompt Preview 当成项目实际运行结果。\n- 不要把 metadata-only validation 当成沙箱安装验证。\n- 不要把未验证能力写成“已支持、已跑通、可放心安装”。\n- 不要在首次试用时交出生产数据、私人文件、真实密钥或主力配置目录。\n\n## 安装前检查\n\n- 宿主 AI 是否匹配：local_cli\n- 官方安装入口状态：已发现官方入口\n- 是否在临时目录、临时宿主或容器中验证：必须是\n- 是否能回滚配置改动：必须能\n- 是否需要 API Key、网络访问、读写文件或修改宿主配置：未确认前按高风险处理\n- 是否记录了安装命令、实际输出和失败日志：必须记录\n\n## 当前阻塞项\n\n- 无阻塞项。\n\n## 项目专属踩坑\n\n- 来源证据：Update scikit-learn version requirement to support v1.6.0（high）：可能影响升级、迁移或版本选择。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：PromptOptimizer throws OpenAIError when using Vertex AI judge（high）：可能增加新用户试用和生产接入成本。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：IndexError in infer_column_type when column contains only null values（high）：可能阻塞安装或首次运行。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：Update evidently hashlib usage for FIPS-Compliant Systems and Security Best Practices（high）：可能阻塞安装或首次运行。 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 来源证据：Numpy 2.x support?（medium）：可能阻塞安装或首次运行。 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n\n## 风险与权限提示\n\n- no_demo: medium\n\n## 证据缺口\n\n- 暂未发现结构化证据缺口。\n",
      "summary": "安装、权限、验证和推荐前风险。",
      "title": "Boundary & Risk Card / 边界与风险卡"
    },
    "human_manual": {
      "asset_id": "human_manual",
      "filename": "HUMAN_MANUAL.md",
      "markdown": "# https://github.com/evidentlyai/evidently 项目说明书\n\n生成时间：2026-05-16 20:53:23 UTC\n\n## 目录\n\n- [Architecture Overview](#arch-overview)\n- [Core Components](#core-components)\n- [Data Management and Data Flow](#data-flow)\n- [ML Model Evaluation](#ml-evaluation)\n- [LLM Evaluation and Judging](#llm-evals)\n- [Descriptors and Features System](#descriptors-features)\n- [Reports and Test Suites](#reports-test-suites)\n- [Presets and Metric Presets](#presets)\n- [Custom Metrics and Extensibility](#custom-metrics)\n- [UI Service Backend](#ui-service)\n\n<a id='arch-overview'></a>\n\n## Architecture Overview\n\n### 相关页面\n\n相关主题：[Core Components](#core-components), [Data Management and Data Flow](#data-flow)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/ui/workspace.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/ui/workspace.py)\n- [src/evidently/sdk/configs.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/sdk/configs.py)\n- [src/evidently/sdk/adapters.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/sdk/adapters.py)\n- [src/evidently/ui/utils.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/ui/utils.py)\n- [README.md](https://github.com/evidentlyai/evidently/blob/main/README.md)\n</details>\n\n# Architecture Overview\n\n## Introduction\n\nEvidently is an open-source Python framework designed to evaluate, test, and monitor ML and LLM-powered systems. 
The architecture follows a modular design pattern that separates concerns between core evaluation logic, SDK APIs, user interface components, and configuration management.\n\n资料来源：[README.md:1-30]()\n\n## High-Level Architecture\n\nThe Evidently platform consists of several interconnected layers:\n\n```mermaid\ngraph TB\n    subgraph \"User Interface Layer\"\n        UI[\"UI Components<br/>(React/TypeScript)\"]\n    end\n    \n    subgraph \"SDK Layer\"\n        SDK[\"Python SDK<br/>(evidently.sdk)\"]\n        API[\"Cloud Config API\"]\n    end\n    \n    subgraph \"Core Layer\"\n        CORE[\"Core Modules<br/>(metrics, descriptors, reports)\"]\n        LEGACY[\"Legacy Module<br/>(evidently.legacy)\"]\n        FUTURE[\"Future Module<br/>(evidently.future)\"]\n    end\n    \n    subgraph \"Data Layer\"\n        WS[\"Workspace Abstraction\"]\n        CFG[\"Config System\"]\n        ADP[\"Adapters\"]\n    end\n    \n    UI --> SDK\n    SDK --> API\n    SDK --> WS\n    SDK --> CFG\n    SDK --> ADP\n    ADP --> WS\n    CFG --> WS\n```\n\n## Core Module Structure\n\nThe main `evidently` package provides the primary user-facing APIs for ML evaluation:\n\n| Component | Purpose |\n|-----------|---------|\n| `Report` | Generate evaluation reports combining multiple metrics |\n| `Dataset` | Container for evaluation data with descriptors |\n| `DataDefinition` | Schema definition for dataset structure |\n| `Metrics` | Individual evaluation metrics |\n| `Descriptors` | Row-level evaluators (e.g., Sentiment, TextLength) |\n| `Presets` | Pre-configured evaluation suites (e.g., TextEvals) |\n\n资料来源：[README.md:30-55]()\n\n### Key Imports for LLM Evals\n\n```python\nfrom evidently import Report\nfrom evidently import Dataset, DataDefinition\nfrom evidently.descriptors import Sentiment, TextLength, Contains\nfrom evidently.presets import TextEvals\n```\n\n## SDK Architecture\n\nThe SDK layer provides programmatic access to remote configurations and cloud features.\n\n### 
CloudConfigAPI\n\nThe `CloudConfigAPI` class manages remote configuration operations:\n\n| Method | Purpose |\n|--------|---------|\n| `create_config()` | Create a new remote configuration |\n| `update_config()` | Update an existing configuration |\n| `get_config()` | Retrieve a configuration by ID |\n| `list_configs()` | List all configurations for a project |\n| `delete_config()` | Remove a configuration |\n| `create_version()` | Create a new version of a configuration |\n| `list_versions()` | List all versions of a configuration |\n| `get_version()` | Get a specific version |\n\n资料来源：[src/evidently/sdk/configs.py:1-50]()\n\n### Config Version Management\n\nThe configuration system uses a `ConfigVersion` model to track changes:\n\n```mermaid\nclassDiagram\n    class ConfigMetadata {\n        +str created_at\n        +str updated_at\n        +str author\n        +str description\n    }\n    \n    class ConfigVersion {\n        +str id\n        +str artifact_id\n        +int version\n        +Any content\n        +ConfigVersionMetadata metadata\n    }\n    \n    class ConfigVersionMetadata {\n        +str created_at\n        +str updated_at\n        +str author\n        +str comment\n    }\n    \n    ConfigVersion --> ConfigVersionMetadata\n    ConfigVersionMetadata --|> ConfigMetadata\n```\n\n资料来源：[src/evidently/sdk/configs.py:50-150]()\n\n### Adapter Pattern\n\nThe SDK uses adapters to convert between internal config models and domain objects:\n\n```mermaid\ngraph LR\n    A[\"ConfigVersion\"] -->|Adapter| B[\"ArtifactVersion\"]\n    A -->|Adapter| C[\"Descriptor\"]\n    A -->|Adapter| D[\"Prompt\"]\n```\n\n| Adapter Class | Source Config | Target Domain Object |\n|--------------|---------------|---------------------|\n| `ArtifactAdapter` | `ConfigVersion` | `ArtifactVersion` |\n| `DescriptorAdapter` | `ConfigVersion` | `Descriptor` |\n| `PromptAdapter` | `ConfigVersion` | `Prompt` |\n\n资料来源：[src/evidently/sdk/adapters.py:1-100]()\n\n## Workspace 
Architecture\n\nThe workspace abstraction provides a unified interface for project management:\n\n### Abstract Base Class\n\n```mermaid\nclassDiagram\n    class Workspace {\n        <<abstract>>\n        +create_project(name, description, org_id) Project\n        +add_project(project, org_id) Project\n        +get_project(project_id) Optional~Project~\n        +delete_project(project_id)\n        +list_projects(org_id) Sequence~Project~\n    }\n```\n\n### Core Workspace Methods\n\n| Method | Parameters | Returns | Description |\n|--------|------------|---------|-------------|\n| `create_project` | `name: str`, `description: str`, `org_id: Optional[OrgID]` | `Project` | Creates and adds a new project |\n| `add_project` | `project: ProjectModel`, `org_id: Optional[OrgID]` | `Project` | Adds an existing project model |\n| `get_project` | `project_id: STR_UUID` | `Optional[Project]` | Retrieves project by UUID |\n| `delete_project` | `project_id: STR_UUID` | `None` | Removes project from workspace |\n| `list_projects` | `org_id: Optional[OrgID]` | `Sequence[Project]` | Lists projects with optional filtering |\n\n资料来源：[src/evidently/ui/workspace.py:1-100]()\n\n### Project Model\n\nProjects are stored using a `ProjectModel` data structure:\n\n```python\nProjectModel(\n    name=str,\n    description=str,\n    org_id=Optional[OrgID]\n)\n```\n\n## UI Component Architecture\n\nThe frontend is built with React and TypeScript, organized as separate packages:\n\n```mermaid\ngraph TD\n    subgraph \"ui/packages/evidently-ui-lib\"\n        WC[\"Widgets\"]\n        CC[\"Components\"]\n        FP[\"Forms\"]\n        TB[\"Tables\"]\n    end\n    \n    subgraph \"Components\"\n        DT[\"Dashboard\"]\n        TR[\"Traces\"]\n        PR[\"Prompts\"]\n        DS[\"Descriptors\"]\n    end\n    \n    CC --> DT\n    CC --> TR\n    CC --> PR\n    CC --> DS\n    CC --> FP\n    CC --> TB\n```\n\n### Component Categories\n\n| Package Path | Purpose |\n|-------------|---------|\n| 
`src/widgets/` | Dashboard widgets and test suite components |\n| `src/components/Dashboard/` | Dashboard-specific UI elements |\n| `src/components/Traces/` | Trace viewing and management |\n| `src/components/Prompts/` | Prompt template management |\n| `src/components/Descriptors/` | Descriptor configuration forms |\n| `src/components/Utils/` | Shared utility components |\n\n## Template and Link Generation\n\nThe UI layer uses template functions to generate HTML for external links and reports:\n\n| Template | Purpose |\n|----------|---------|\n| `HTML_LINK_WITH_ID_TEMPLATE` | Report links with button and ID display |\n| `FILE_LINK_WITH_ID_TEMPLATE` | File links with ID metadata |\n| `RUNNING_SERVICE_LINK_TEMPLATE` | Service endpoint links |\n| `EVIDENTLY_STYLES_COMMON` | Shared CSS styles for links |\n\n资料来源：[src/evidently/ui/utils.py:1-60]()\n\n### Template Structure\n\n```html\n<div class=\"evidently-links container\">\n    <a target=\"_blank\" href=\"{button_url}\">{button_title}</a>\n    <p><b>{id_title}:</b> <span>{id}</span></p>\n</div>\n```\n\n## Data Flow Architecture\n\n```mermaid\ngraph TB\n    A[\"User Code\"] --> B[\"Dataset Creation\"]\n    B --> C[\"Descriptor Application\"]\n    C --> D[\"Report Generation\"]\n    D --> E[\"Workspace Storage\"]\n    \n    F[\"Cloud API\"] --> G[\"Config Management\"]\n    G --> E\n    \n    E --> H[\"UI Display\"]\n```\n\n## API Reference Documentation\n\nThe project includes automated API documentation generation:\n\n| Command | Description |\n|---------|-------------|\n| `./api-reference/generate.py --local-source-code` | Generate docs from local source |\n| `./api-reference/generate.py --git-revision <ref>` | Generate docs from git revision |\n| `./api-reference/generate.py --additional-modules` | Include extra modules |\n\nDocumentation is output to `api-reference/dist/` organized by revision.\n\n资料来源：[api-reference/README.md:1-50]()\n\n## Summary\n\nThe Evidently architecture is organized into three main 
layers:\n\n1. **Core Layer** - Python packages for evaluation logic (`evidently`, `evidently.core`)\n2. **SDK Layer** - Remote configuration and cloud integration (`evidently.sdk`)\n3. **UI Layer** - React/TypeScript web interface for visualization and management\n\nThe modular design allows users to:\n- Use the Python SDK directly for programmatic evaluation\n- Store and version configurations in the cloud\n- Access results through the web UI\n- Extend functionality through the descriptor and metric system\n\n---\n\n<a id='core-components'></a>\n\n## Core Components\n\n### 相关页面\n\n相关主题：[Architecture Overview](#arch-overview), [Reports and Test Suites](#reports-test-suites), [Custom Metrics and Extensibility](#custom-metrics)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/core/base_types.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/base_types.py)\n- [src/evidently/core/metric_types.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/metric_types.py)\n- [src/evidently/core/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/report.py)\n- [src/evidently/core/tests.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/tests.py)\n- [src/evidently/core/registries/components.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/registries/components.py)\n- [src/evidently/core/registries/metrics.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/registries/metrics.py)\n- [src/evidently/core/registries/metric_tests.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/registries/metric_tests.py)\n</details>\n\n# Core Components\n\n## Overview\n\nThe Core Components form the foundational architecture of the Evidently framework, providing the essential building blocks for evaluation, testing, and monitoring of ML and LLM-powered systems. 
These components establish the base type system, metric definitions, reporting mechanisms, and registry patterns that enable the framework's extensibility and modularity.\n\nThe Core Components encompass several interconnected modules that work together to provide a unified approach to ML evaluation. At the heart of this system is a registration-based architecture that allows metrics, tests, and components to be dynamically discovered, configured, and executed within the Evidently ecosystem.\n\nThis architecture follows a plugin-like pattern where individual components can be registered, versioned, and managed through centralized registries. This design enables users to extend the framework's functionality by implementing custom metrics and tests while maintaining compatibility with the existing evaluation pipeline.\n\n## Base Types Module\n\n### Purpose and Scope\n\nThe base types module defines the fundamental data structures and abstractions that underpin all Evidently components. These types establish common interfaces and data models used throughout the framework, ensuring consistency and interoperability between different modules.\n\nThe base types primarily focus on defining what data flows through the evaluation pipeline, including data definitions, dataset representations, and result containers. This module serves as the foundation upon which all higher-level abstractions are built.\n\n### Key Data Structures\n\nThe base types module defines several critical classes and interfaces that form the backbone of the Evidently type system. These structures provide a standardized way to represent input data, evaluation results, and configuration options across all components.\n\nThe module establishes the `DataDefinition` class, which serves as a schema for describing the structure of datasets being evaluated. This includes column definitions, data types, and metadata that describe the characteristics of the data being analyzed. 
Data definitions enable Evidently to understand the structure of incoming data and apply appropriate transformations and evaluations.\n\nThe module also defines base classes for `Dataset` objects, which encapsulate reference and current data batches used in evaluations. These dataset representations include functionality for data validation, transformation, and slicing based on various criteria.\n\n## Metric Types Module\n\n### Metric Architecture\n\nThe metric types module defines the core abstraction for all metrics within Evidently. Metrics are the primary mechanism for evaluating data quality, model performance, and statistical properties of datasets. The module establishes a class hierarchy that enables both simple single-value metrics and complex multi-dimensional metric calculations.\n\nMetrics in Evidently follow a consistent pattern where each metric is associated with specific columns or data subsets and produces standardized result objects. This design enables metrics to be composed, combined, and analyzed together within reports and dashboards.\n\n### Metric Execution Model\n\nThe metric execution model defines how metrics are calculated, what inputs they receive, and how results are structured. Each metric implementation receives a data context containing the relevant dataset columns and produces a `MetricResult` object that encapsulates the calculated values and metadata.\n\nThe execution model supports both eager and lazy evaluation strategies. Some metrics compute their results immediately upon execution, while others may defer computation until results are actually needed. This flexibility enables optimizations in scenarios where multiple metrics share intermediate calculations or where memory efficiency is a concern.\n\nMetrics can be configured with parameters that control their behavior, including aggregation methods, thresholds, and visualization options. 
The parameter system uses type hints and validation to ensure configuration correctness while providing clear documentation through the API reference.\n\n## Report System\n\n### Report Architecture\n\nThe Report class serves as the primary interface for executing evaluations in Evidently. Reports orchestrate the execution of metrics and tests, collect results, and generate visualizations and summaries. The Report system provides a flexible framework for combining multiple evaluation components into cohesive analysis workflows.\n\nReports maintain state throughout their lifecycle, tracking which metrics and tests have been executed, their results, and any errors or warnings encountered during execution. This state management enables features like incremental recalculation, where only changed components need to be re-evaluated.\n\n```mermaid\ngraph TD\n    A[Create Report] --> B[Add Metrics]\n    B --> C[Add Tests]\n    C --> D[Run Report]\n    D --> E[Collect Results]\n    E --> F[Generate Visualizations]\n    F --> G[Export Report]\n    \n    H[Reference Data] --> D\n    I[Current Data] --> D\n    \n    E --> J[Metric Results]\n    E --> K[Test Results]\n```\n\n### Report Configuration\n\nReports support extensive configuration options that control execution behavior, result aggregation, and output formatting. Configuration options include parallel execution settings, result caching policies, and visualization preferences.\n\nThe Report class provides methods for both synchronous and asynchronous execution, enabling integration with various application architectures. Asynchronous execution is particularly useful for long-running evaluations or when reports are generated as part of batch processing workflows.\n\n### Report Export\n\nThe Report system includes export functionality that generates output in various formats including HTML, JSON, and Python dictionary structures. 
Export options enable customization of included visualizations, result detail levels, and styling preferences.\n\n## Testing Framework\n\n### Test Definition\n\nThe testing module extends Evidently's evaluation capabilities to include pass/fail style assertions. Tests in Evidently are designed to validate specific conditions about data or model behavior, returning boolean results that indicate whether defined criteria are met.\n\nTests can be defined inline within metrics or as standalone components that check specific conditions. This dual approach provides flexibility in how validation logic is structured and reused across different evaluation scenarios.\n\n### Test Execution\n\nTests execute alongside metrics within the Report framework, allowing unified execution of both evaluation and validation logic. The test execution model mirrors the metric execution model, receiving data contexts and producing structured results that include pass/fail status and diagnostic information.\n\nTest results include detailed failure messages that explain why a test did not pass, helping users understand data quality issues or model behavior problems. These diagnostic messages reference specific rows, values, or statistical properties that contributed to the test failure.\n\n## Component Registry System\n\n### Registry Architecture\n\nThe registry system provides a centralized mechanism for managing and discovering Evidently components. Registries maintain mappings between component identifiers and their implementations, enabling dynamic resolution of components at runtime.\n\nEvidently uses registries extensively for metrics, tests, descriptors, and other extensible components. This architecture allows the framework to support third-party extensions while maintaining a consistent interface for component discovery and execution.\n\n### Component Registration\n\nComponents are registered with their respective registries using decorators or explicit registration calls. 
The registration process captures metadata about each component, including its name, version, parameters, and dependencies. This metadata enables automatic documentation generation and configuration interfaces.\n\nRegistration supports versioning, allowing multiple versions of a component to coexist and enabling rollback to previous versions when needed. Version management is particularly important for production deployments where stability and reproducibility are critical.\n\n### Metrics Registry\n\nThe metrics registry extends the component registry pattern specifically for metrics. It provides specialized functionality for metric discovery, parameter validation, and result aggregation.\n\n```mermaid\ngraph TD\n    A[Metrics Registry] --> B[Metric Base Class]\n    A --> C[Metric Parameters]\n    A --> D[Metric Results]\n    \n    B --> E[Column Metrics]\n    B --> F[Dataset Metrics]\n    B --> G[Statistical Metrics]\n    \n    D --> H[Visualization Configs]\n    D --> I[Aggregation Methods]\n```\n\n### Metric Tests Registry\n\nThe metric tests registry manages test definitions that validate metric results or data conditions. This registry enables tests to reference metrics by name and access their results for validation purposes.\n\nTests registered in this registry can be automatically discovered and included in reports based on configuration. This discovery mechanism enables declarative specification of validation requirements without requiring explicit test instantiation.\n\n## Integration Patterns\n\n### SDK Integration\n\nThe Core Components integrate with Evidently's SDK layer, which provides higher-level APIs for cloud and remote deployments. The SDK layer uses the Core Components as its foundation while adding capabilities for remote execution, result storage, and collaborative features.\n\nThe `CloudConfigAPI` class demonstrates this integration, providing methods for managing project configurations, descriptor configs, and artifact versions. 
These high-level APIs delegate core functionality to the Core Components while handling network communication and serialization.\n\n### Adapter Pattern\n\nEvidently uses adapters to bridge different execution contexts and storage backends. Adapters transform between internal data structures and external representations, enabling Evidently to work with various data sources and deployment environments.\n\nThe adapter pattern is particularly important for cloud deployments where configurations and results are stored remotely. Adapters handle serialization, deserialization, and API communication while maintaining compatibility with the Core Component interfaces.\n\n## Configuration Management\n\n### Config Versioning\n\nThe Core Components support configuration versioning through specialized config classes. Each versioned configuration maintains a history of changes, enabling audit trails and rollback capabilities.\n\nConfigurations are organized by project, with separate namespaces for different config types like metrics, tests, and descriptors. This organization enables clear separation of concerns while maintaining relationships between related configurations.\n\n### Remote Configuration\n\nFor deployments requiring centralized configuration management, Evidently supports remote configuration storage through the CloudConfigAPI. Remote configurations can be fetched, updated, and versioned through the SDK, with changes automatically synchronized to connected clients.\n\n## Summary\n\nThe Core Components provide the essential infrastructure for Evidently's evaluation capabilities. Through a combination of base types, metric abstractions, reporting mechanisms, and registry systems, these components establish a flexible and extensible framework for ML evaluation.\n\nThe modular design enables users to leverage individual components for specific use cases or combine them into comprehensive evaluation pipelines. 
The registration-based architecture ensures extensibility while maintaining consistency across the framework.\n\nUnderstanding these core concepts is essential for effectively using Evidently and for extending the framework with custom metrics, tests, and integrations.\n\n---\n\n<a id='data-flow'></a>\n\n## Data Management and Data Flow\n\n### 相关页面\n\n相关主题：[Architecture Overview](#arch-overview), [ML Model Evaluation](#ml-evaluation), [LLM Evaluation and Judging](#llm-evals)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/core/datasets.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/datasets.py)\n- [src/evidently/core/container.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/container.py)\n- [src/evidently/sdk/datasets.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/sdk/datasets.py)\n- [src/evidently/llm/datagen/base.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/datagen/base.py)\n- [src/evidently/llm/rag/index.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/rag/index.py)\n- [src/evidently/llm/rag/splitter.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/rag/splitter.py)\n</details>\n\n# Data Management and Data Flow\n\n## Overview\n\nData Management and Data Flow in Evidently encompasses the mechanisms by which data is ingested, transformed, stored, and processed throughout the evaluation lifecycle. The system provides a unified abstraction layer that handles multiple data sources (pandas DataFrames, CSV files, Parquet files) and integrates with the broader evaluation pipeline including Reports, Test Suites, and LLM-specific processing like RAG (Retrieval-Augmented Generation) systems.\n\nThe architecture separates concerns between core data structures (`Dataset`, `Container`), SDK-level abstractions for cloud/local deployment, and specialized LLM data processing components. 
This modularity allows users to work with familiar Python data structures while benefiting from Evidently's evaluation and monitoring capabilities.\n\n资料来源：[src/evidently/core/datasets.py:1-50]()\n\n## Core Data Abstraction\n\n### The Dataset Class\n\nThe `Dataset` class serves as the primary data container throughout Evidently. It wraps various data sources into a standardized interface that supports:\n\n- Direct creation from pandas DataFrames\n- Automatic type inference via `DataDefinition`\n- Descriptor-based feature computation\n- Metadata and tagging support\n\n```python\n# Basic Dataset creation from pandas\nfrom evidently import Dataset, DataDefinition\n\ndataset = Dataset.from_pandas(\n    dataframe,\n    data_definition=DataDefinition(),\n    metadata={\"source\": \"production\"},\n    tags=[\"eval\", \"2024\"]\n)\n```\n\n资料来源：[src/evidently/core/datasets.py:30-45]()\n\n### Dataset Factory Methods\n\n| Method | Purpose | Input Types |\n|--------|---------|-------------|\n| `from_pandas()` | Create Dataset from pandas DataFrame | `pd.DataFrame` |\n| `from_any()` | Convert various types to Dataset | `pd.DataFrame`, `Dataset` |\n| `as_dataframe()` | Extract underlying DataFrame | N/A (output) |\n| `column()` | Retrieve specific column as `DatasetColumn` | column name |\n| `subdataset()` | Filter dataset by column value | column name, label |\n\n资料来源：[src/evidently/core/datasets.py:50-80]()\n\nThe `from_any()` static method implements a factory pattern that handles type conversion automatically:\n\n```python\n@staticmethod\ndef from_any(dataset: PossibleDatasetTypes) -> \"Dataset\":\n    if isinstance(dataset, Dataset):\n        return dataset\n    if isinstance(dataset, pd.DataFrame):\n        return Dataset.from_pandas(dataset)\n    raise ValueError(f\"Unsupported dataset type: {type(dataset)}\")\n```\n\n资料来源：[src/evidently/core/datasets.py:60-70]()\n\n## Data Flow Architecture\n\n```mermaid\ngraph TD\n    A[Input Data: pd.DataFrame / CSV / Parquet] --> 
B[Dataset.from_any]\n    B --> C{Data Type Check}\n    C -->|pd.DataFrame| D[Dataset.from_pandas]\n    C -->|Already Dataset| E[Return as-is]\n    D --> F[DataDefinition Type Inference]\n    F --> G[Add Descriptors]\n    G --> H[Dataset Object]\n    H --> I[Report.run / TestSuite.run]\n    I --> J[Snapshot with Results]\n    J --> K[Export: JSON / HTML / Dict]\n```\n\n## Container Architecture\n\nThe `Container` class provides the underlying storage and management mechanism for datasets within the Evidently ecosystem. Containers maintain references to data assets and provide CRUD operations for dataset lifecycle management.\n\nKey container responsibilities include:\n\n- Storing dataset references and metadata\n- Managing dataset versioning\n- Providing query and filtering capabilities\n- Handling persistence to storage backends\n\n资料来源：[src/evidently/core/container.py:1-30]()\n\n### Container-Dataset Relationship\n\n```mermaid\ngraph LR\n    A[Container] -->|manages| B[Dataset Registry]\n    B -->|references| C[Dataset v1]\n    B -->|references| D[Dataset v2]\n    B -->|references| E[Dataset vN]\n    C -->|wraps| F[pd.DataFrame]\n    D -->|wraps| G[pd.DataFrame]\n    E -->|wraps| H[pd.DataFrame]\n```\n\n## SDK-Level Data Management\n\nThe SDK layer provides deployment-agnostic data handling through the `CloudConfigAPI` and local storage adapters. 
This abstraction enables consistent data operations whether running locally or connected to Evidently Cloud.\n\n资料来源：[src/evidently/sdk/datasets.py:1-50]()\n\n### SDK Dataset Operations\n\n| Operation | Method Signature | Description |\n|-----------|-----------------|-------------|\n| Create | `add_dataset(project_id, dataset, name, description, link)` | Add dataset to project |\n| Read | `load_dataset(dataset_id)` | Retrieve dataset by UUID |\n| List | `list_datasets(project, origins)` | List all datasets in project |\n| Update | `update_dataset(dataset_id, dataset)` | Modify existing dataset |\n| Delete | `delete_dataset(dataset_id)` | Remove dataset from storage |\n\n资料来源：[src/evidently/ui/workspace.py:50-100]()\n\n## LLM-Specific Data Processing\n\n### RAG Data Pipeline\n\nFor LLM applications using Retrieval-Augmented Generation, Evidently provides specialized data processing components that handle document ingestion, chunking, and indexing.\n\n资料来源：[src/evidently/llm/rag/index.py:1-50]()\n\n```mermaid\ngraph TD\n    A[Raw Documents] --> B[RAG Splitter]\n    B --> C[Document Chunks]\n    C --> D[RAG Index]\n    D --> E[Vector Store]\n    E --> F[Retrieval Query]\n    F --> G[Context + Query]\n    G --> H[LLM Response]\n```\n\n### Document Splitting\n\nThe `splitter.py` module handles text chunking with configurable parameters:\n\n- **Chunk size**: Target size for each text segment\n- **Overlap**: Amount of overlap between consecutive chunks\n- **Separators**: Text boundaries for splitting priority\n\n资料来源：[src/evidently/llm/rag/splitter.py:1-40]()\n\n### RAG Indexing\n\nThe index module manages the vector storage and retrieval:\n\n- Embedding generation for document chunks\n- Vector store integration (FAISS, ChromaDB, etc.)\n- Similarity search capabilities\n- Metadata preservation for filtering\n\n资料来源：[src/evidently/llm/rag/index.py:50-100]()\n\n## Data Generation for LLM Evals\n\nThe `datagen/base.py` module provides infrastructure for synthetic data 
generation, enabling users to create evaluation datasets programmatically.\n\n资料来源：[src/evidently/llm/datagen/base.py:1-50]()\n\n### Data Generation Workflow\n\n```mermaid\ngraph LR\n    A[Seed Data / Templates] --> B[LLM Generator]\n    B --> C[Generated Samples]\n    C --> D[Quality Validation]\n    D -->|Pass| E[Evaluation Dataset]\n    D -->|Fail| F[Regeneration Loop]\n    F --> B\n```\n\n### DataGeneratorBase Class\n\nThe base class defines the contract for data generation:\n\n| Method | Purpose |\n|--------|---------|\n| `generate()` | Create new data samples |\n| `validate()` | Check generated data quality |\n| `expand()` | Augment existing datasets |\n\n资料来源：[src/evidently/llm/datagen/base.py:30-60]()\n\n## Data Flow in Report Execution\n\nReports represent the primary consumer of dataset objects within Evidently. The `run()` method orchestrates the complete data flow from input to output.\n\n```python\ndef run(\n    self,\n    current_data,\n    reference_data=None,\n    additional_data=None,\n    timestamp=None,\n    metadata=None,\n    tags=None,\n    name=None,\n) -> Snapshot:\n    # Data validation\n    if isinstance(current_data, pd.DataFrame) and current_data.empty:\n        raise ValueError(\"current_data must contain at least one column...\")\n    \n    # Convert to Dataset objects\n    current_dataset = Dataset.from_any(current_data)\n    reference_dataset = Dataset.from_any(reference_data) if reference_data else None\n    \n    # Execute metrics and generate snapshot\n    ...\n```\n\n资料来源：[src/evidently/core/report.py:100-130]()\n\n### Data Validation Rules\n\n| Condition | Error Raised |\n|-----------|--------------|\n| Empty current_data DataFrame | `ValueError: current_data must contain at least one column; received an empty DataFrame` |\n| Empty reference_data DataFrame | `ValueError: reference_data must contain at least one column; received an empty DataFrame` |\n| Unsupported dataset type | `ValueError: Unsupported dataset type: 
{type(dataset)}` |\n\n资料来源：[src/evidently/core/datasets.py:65-70]()\n\n## Workspace Dataset Management\n\nThe Workspace class provides high-level dataset management capabilities for organizing evaluations across projects.\n\n```python\nclass Workspace:\n    def add_dataset(\n        self,\n        project_id: STR_UUID,\n        dataset: Dataset,\n        name: str,\n        description: Optional[str] = None,\n        link: Optional[SnapshotLink] = None,\n    ) -> DatasetID:\n        \"\"\"Add a dataset to a project.\"\"\"\n        return self.datasets.add(project_id=project_id, dataset=dataset, ...)\n    \n    def load_dataset(self, dataset_id: DatasetID) -> Dataset:\n        \"\"\"Load a dataset by ID.\"\"\"\n        return self.datasets.load(dataset_id)\n    \n    def list_datasets(\n        self,\n        project: STR_UUID,\n        origins: Optional[List[str]] = None,\n    ) -> DatasetList:\n        \"\"\"List all datasets in a project.\"\"\"\n        return self.datasets.list(project, origins=origins)\n```\n\n资料来源：[src/evidently/ui/workspace.py:50-80]()\n\n### Dataset Metadata Schema\n\n| Field | Type | Required | Description |\n|-------|------|----------|-------------|\n| `name` | `str` | Yes | Human-readable dataset name |\n| `description` | `str` | No | Detailed description |\n| `link` | `SnapshotLink` | No | Associated snapshot reference |\n| `created_at` | `datetime` | Auto | Creation timestamp |\n| `author` | `str` | Auto | Creator identifier |\n\n## Summary\n\nData Management in Evidently follows a consistent pattern across the platform:\n\n1. **Ingestion**: Data enters through `Dataset.from_pandas()` or `Dataset.from_any()` factory methods\n2. **Validation**: Empty DataFrames and unsupported types are rejected early\n3. **Processing**: Descriptors and metrics operate on the standardized Dataset interface\n4. **Output**: Results are packaged into Snapshots with multiple export formats\n5. 
**Persistence**: Datasets can be stored and retrieved through Workspace and SDK APIs\n\nThe separation between core data structures, SDK abstractions, and specialized LLM components enables flexible deployment while maintaining a simple user-facing API based on pandas DataFrames.\n\n资料来源：[src/evidently/core/datasets.py:1-100]()\n资料来源：[src/evidently/core/report.py:100-150]()\n资料来源：[src/evidently/sdk/datasets.py:1-50]()\n资料来源：[src/evidently/llm/rag/index.py:1-100]()\n资料来源：[src/evidently/llm/rag/splitter.py:1-50]()\n资料来源：[src/evidently/llm/datagen/base.py:1-60]()\n\n---\n\n<a id='ml-evaluation'></a>\n\n## ML Model Evaluation\n\n### 相关页面\n\n相关主题：[LLM Evaluation and Judging](#llm-evals), [Descriptors and Features System](#descriptors-features), [Presets and Metric Presets](#presets)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/metrics/classification.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/classification.py)\n- [src/evidently/metrics/regression.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/regression.py)\n- [src/evidently/metrics/data_quality.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/data_quality.py)\n- [src/evidently/metrics/dataset_statistics.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/dataset_statistics.py)\n- [src/evidently/metrics/recsys.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/recsys.py)\n- [src/evidently/metrics/embeddings.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/embeddings.py)\n- [src/evidently/legacy/metrics/classification_performance/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/metrics/classification_performance/__init__.py)\n- 
[src/evidently/legacy/metrics/regression_performance/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/metrics/regression_performance/__init__.py)\n- [src/evidently/legacy/calculations/data_drift.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/calculations/data_drift.py)\n</details>\n\n# ML Model Evaluation\n\n## Overview\n\nML Model Evaluation in Evidently is a comprehensive module that enables data scientists and ML engineers to assess, test, and monitor machine learning models throughout their lifecycle—from experiments to production. The evaluation system supports both predictive tasks (classification, regression) and recommendation systems.\n\nThe evaluation framework is built around the concept of **Metrics** and **Reports**, where metrics are individual evaluation components that can be composed into reports for comprehensive model assessment.\n\n资料来源：[src/evidently/core/report.py:1-50]()\n\n### Core Architecture\n\n```mermaid\ngraph TD\n    A[Dataset] --> B[Report]\n    A --> C[Test Suite]\n    B --> D[Interactive Report]\n    B --> E[JSON/Dict Output]\n    C --> F[Pass/Fail Results]\n    G[Metrics] --> B\n    H[Presets] --> B\n    G --> C\n```\n\n### Supported Evaluation Types\n\n| Evaluation Type | Purpose | Key Metrics |\n|----------------|---------|-------------|\n| Classification | Evaluate classifier performance | Accuracy, Precision, Recall, F1, ROC-AUC |\n| Regression | Evaluate regression models | MAE, MSE, RMSE, R2 |\n| Data Quality | Assess input data integrity | Missing values, duplicates, correlations |\n| Data Drift | Detect distribution changes | PSI, KS test, L Infinity |\n| Recommendation Systems | Evaluate recsys models | Precision@K, Recall@K, NDCG |\n| Embeddings | Evaluate embedding quality | Cosine similarity, drift detection |\n\n## Classification Metrics\n\n### Purpose and Scope\n\nClassification metrics evaluate the performance of classification models by comparing 
predicted labels against actual labels. Evidently supports both binary and multiclass classification evaluation scenarios.\n\n资料来源：[src/evidently/metrics/classification.py:1-100]()\n\n### Available Metrics\n\n| Metric | Description | Applicable Task Types |\n|--------|-------------|----------------------|\n| `Accuracy` | Proportion of correct predictions | Binary, Multiclass |\n| `Precision` | Positive predictive value | Binary, Multiclass |\n| `Recall` | Sensitivity, true positive rate | Binary, Multiclass |\n| `F1Score` | Harmonic mean of precision and recall | Binary, Multiclass |\n| `RocAuc` | Area under the ROC curve | Binary, Multiclass |\n| `ConfusionMatrix` | Cross-tabulation of predictions vs actuals | Binary, Multiclass |\n| `PrecisionRecallCurve` | Precision-Recall tradeoff visualization | Binary |\n| `ClassRepresentation` | Distribution of classes in predictions | Multiclass |\n\n### Usage Example\n\n```python\nfrom evidently import Report\nfrom evidently.metrics import Accuracy, Precision, Recall, F1Score, RocAuc\n\nreport = Report([\n    Accuracy(),\n    Precision(),\n    Recall(),\n    F1Score(),\n    RocAuc()\n])\n\nresult = report.run(current_data=current_dataset, reference_data=reference_dataset)\n```\n\n### Legacy Classification Performance\n\nThe legacy module provides comprehensive classification evaluation with detailed visualizations.\n\n资料来源：[src/evidently/legacy/metrics/classification_performance/__init__.py:1-50]()\n\nKey components include:\n- **ClassificationPerformanceMetrics**: Core metrics container\n- **Visualization widgets**: Confusion matrix plots, ROC curves, PR curves\n- **Per-class analysis**: Detailed breakdown for multiclass scenarios\n\n## Regression Metrics\n\n### Purpose and Scope\n\nRegression metrics assess the performance of regression models by computing various error measures and statistical properties of prediction residuals.\n\n资料来源：[src/evidently/metrics/regression.py:1-100]()\n\n### Available Metrics\n\n| Metric | 
Description | Unit |\n|--------|-------------|------|\n| `MeanError` | Average prediction error | Same as target |\n| `MeanAbsoluteError` | MAE - average absolute error | Same as target |\n| `MeanSquaredError` | MSE - average squared error | Squared target units |\n| `RootMeanSquaredError` | RMSE - square root of MSE | Same as target |\n| `ErrorStd` | Standard deviation of errors | Same as target |\n| `ErrorNormality` | Shapiro-Wilk test for residual normality | p-value |\n| `R2Score` | Coefficient of determination | Dimensionless |\n| `ErrorPercentile` | Percentile-based error analysis | Same as target |\n\n### Usage Example\n\n```python\nfrom evidently import Report\nfrom evidently.metrics import (\n    MeanAbsoluteError,\n    MeanSquaredError,\n    R2Score,\n    ErrorPercentile\n)\n\nreport = Report([\n    MeanAbsoluteError(),\n    MeanSquaredError(),\n    R2Score(),\n    ErrorPercentile(percentile=95)\n])\n\nresult = report.run(current_data=current_dataset, reference_data=reference_dataset)\n```\n\n### Legacy Regression Performance\n\nThe legacy regression performance module provides time-series analysis of predictions.\n\n资料来源：[src/evidently/legacy/metrics/regression_performance/__init__.py:1-50]()\n\nFeatures include:\n- **Predicted vs Actual plots**: Time-series visualization\n- **Residual distribution analysis**: Histograms and QQ plots\n- **Error distribution by feature**: Feature-level error breakdown\n\n```mermaid\ngraph LR\n    A[Current Data] --> B[RegressionReport]\n    C[Reference Data] --> B\n    B --> D[Predicted vs Actual]\n    B --> E[Error Distribution]\n    B --> F[Performance Metrics]\n```\n\n## Data Quality Metrics\n\n### Purpose and Scope\n\nData quality metrics evaluate the integrity and quality of input data. 
These metrics are essential for understanding whether model predictions are reliable and for identifying data pipeline issues.\n\n资料来源：[src/evidently/metrics/data_quality.py:1-100]()\n\n### Available Metrics\n\n| Metric | Description |\n|--------|-------------|\n| `ColumnCount` | Number of columns in dataset |\n| `RowCount` | Number of rows in dataset |\n| `ValueStats` | Statistical summary (min, max, mean, std) |\n| `MissingValuesMetric` | Count and percentage of missing values |\n| `UniqueValuesCount` | Number of unique values per column |\n| `DataInconsistencyMetric` | Detection of data inconsistencies |\n| `TextStats` | Statistics for text columns (length, word count) |\n\n### Usage Example\n\n```python\nfrom evidently import Report, Dataset\nfrom evidently.metrics import ColumnCount, ValueStats, MissingValuesMetric\n\ndataset = Dataset.from_pandas(df, data_definition=DataDefinition())\n\nreport = Report([\n    ColumnCount(),\n    ValueStats(column=\"target\"),\n    MissingValuesMetric()\n])\n\nresult = report.run(dataset, None)\n```\n\n## Dataset Statistics\n\n### Purpose and Scope\n\nDataset statistics provide comprehensive descriptive statistics about datasets, enabling quick overview and comparison between reference and current data distributions.\n\n资料来源：[src/evidently/metrics/dataset_statistics.py:1-100]()\n\n### Available Presets\n\n| Preset | Description |\n|--------|-------------|\n| `DataSummaryPreset` | Complete dataset overview |\n| `DataDriftPreset` | Distribution comparison between reference and current |\n| `TargetDriftPreset` | Target variable drift analysis |\n| `NumTargetDriftPreset` | Numeric target drift analysis |\n| `CatTargetDriftPreset` | Categorical target drift analysis |\n\n### Statistical Tests for Drift Detection\n\n资料来源：[src/evidently/legacy/calculations/data_drift.py:1-100]()\n\n| Test | Applicable Data Type | Description |\n|------|---------------------|-------------|\n| `KS` | Numerical | Kolmogorov-Smirnov test |\n| `ZScore` | 
Numerical | Z-score based drift detection |\n| `TTest` | Numerical | Two-sample t-test for mean comparison |\n| `ChiSquare` | Categorical | Chi-square test for category distribution |\n| `PSI` | Both | Population Stability Index |\n| `L Infinity` | Both | Max absolute difference |\n\n### Data Drift Workflow\n\n```mermaid\ngraph TD\n    A[Reference Dataset] --> F[Drift Detection Engine]\n    B[Current Dataset] --> F\n    F --> C[Statistical Tests]\n    F --> D[Distribution Comparison]\n    C --> E[Drift Report]\n    D --> E\n    E --> G{Drift Detected?}\n    G -->|Yes| H[Alert / Action]\n    G -->|No| I[Continue Monitoring]\n```\n\n## Recommendation System Metrics\n\n### Purpose and Scope\n\nRecommendation system (recsys) metrics evaluate the quality of recommendations generated by recommendation models.\n\n资料来源：[src/evidently/metrics/recsys.py:1-100]()\n\n### Available Metrics\n\n| Metric | Description | K-Parameter |\n|--------|-------------|-------------|\n| `PrecisionAtK` | Precision at top-K recommendations | Required |\n| `RecallAtK` | Recall at top-K recommendations | Required |\n| `NDCGAtK` | Normalized Discounted Cumulative Gain at K | Required |\n| `HitRateAtK` | Hit rate at top-K recommendations | Optional |\n| `MRRAtK` | Mean Reciprocal Rank at K | Required |\n\n### Usage Example\n\n```python\nfrom evidently import Report\nfrom evidently.metrics import PrecisionAtK, RecallAtK, NDCGAtK\n\nreport = Report([\n    PrecisionAtK(k=10),\n    RecallAtK(k=10),\n    NDCGAtK(k=10)\n])\n\nresult = report.run(current_data=recs_dataset, reference_data=ref_recs_dataset)\n```\n\n## Embeddings Metrics\n\n### Purpose and Scope\n\nEmbeddings metrics evaluate the quality and drift of vector embeddings, which are critical for LLM and semantic search applications.\n\n资料来源：[src/evidently/metrics/embeddings.py:1-100]()\n\n### Available Metrics\n\n| Metric | Description |\n|--------|-------------|\n| `Embedding Drift` | Detect drift in embedding distributions |\n| `Cosine 
Similarity` | Average cosine similarity between embeddings |\n| `Retrieval Metrics` | Evaluate RAG and retrieval system quality |\n\n### Key Features\n\n- **Semantic Drift Detection**: Identify changes in embedding space distribution\n- **Cluster Analysis**: Analyze embedding cluster stability\n- **Pairwise Comparison**: Compare individual embedding pairs\n\n## Report Configuration\n\n### Creating a Report\n\n```python\nfrom evidently import Report\nfrom evidently.presets import DataDriftPreset\n\n# Basic report with preset\nreport = Report([DataDriftPreset()])\n\n# Report with custom metadata\nreport = Report(\n    metrics=[...],\n    metadata={\"model_version\": \"v1.2.3\"},\n    tags=[\"production\", \"monthly\"]\n)\n\n# Run with reference data for comparison\nsnapshot = report.run(current_dataset, reference_dataset)\n```\n\n### Report Output Formats\n\n| Format | Method | Use Case |\n|--------|--------|----------|\n| Interactive | Direct notebook display | Exploration, debugging |\n| JSON | `.json()` | API responses, CI/CD |\n| Python Dict | `.dict()` | Programmatic access |\n| HTML | Export functionality | Shareable reports |\n\n## Test Suite Integration\n\nReports can be converted to **Test Suites** by adding pass/fail conditions, enabling automated regression testing.\n\n```python\nfrom evidently.test_suite import TestSuite\nfrom evidently.tests import TestColumnValue\n\nsuite = TestSuite([\n    TestColumnValue(column=\"accuracy\", gt=0.95),\n])\n\nsuite.run(current_data=dataset, reference_data=reference)\n```\n\n## Best Practices\n\n### 1. Always Use Reference Data for Comparison\nFor meaningful evaluation, maintain a reference dataset representing expected data distribution or model performance.\n\n### 2. 
Choose Appropriate Metrics by Task Type\n\n| Task Type | Recommended Metrics |\n|-----------|--------------------|\n| Binary Classification | Accuracy, Precision, Recall, F1, ROC-AUC |\n| Multiclass Classification | Per-class F1, Confusion Matrix, Macro/Micro Avg |\n| Regression | MAE, RMSE, R2, Error Percentiles |\n| Recommendation | Precision@K, Recall@K, NDCG@K |\n| Data Quality | Missing values, duplicates, consistency |\n\n### 3. Set Up Monitoring Schedules\nFor production models, schedule regular evaluations to detect performance degradation and data drift early.\n\n### 4. Use Auto-generated Test Conditions\nEvidently can auto-generate test thresholds from reference data:\n```python\nfrom evidently.presets import DataDriftPreset\n\nsuite = TestSuite.from_reference(reference_data, [DataDriftPreset()])\n```\n\n## Summary\n\nEvidently's ML Model Evaluation module provides a comprehensive, modular framework for evaluating machine learning models across multiple dimensions:\n\n- **Classification**: Binary and multiclass classification metrics with visualizations\n- **Regression**: Comprehensive error metrics and residual analysis\n- **Data Quality**: Input data integrity checks\n- **Data Drift**: Distribution comparison and statistical tests\n- **Recommendation Systems**: Top-K recommendation quality metrics\n- **Embeddings**: Semantic drift detection for LLM applications\n\nThe system supports both one-off evaluations and continuous monitoring, with seamless integration into CI/CD pipelines through Test Suites.\n\n---\n\n<a id='llm-evals'></a>\n\n## LLM Evaluation and Judging\n\n### 相关页面\n\n相关主题：[ML Model Evaluation](#ml-evaluation), [Descriptors and Features System](#descriptors-features)\n\n<details>\n<summary>Relevant Source Files</summary>\n\nThe following source files were used to generate this documentation:\n\n- [src/evidently/descriptors/llm_judges.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/llm_judges.py)\n- 
[src/evidently/descriptors/generated_descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/generated_descriptors.py)\n- [src/evidently/llm/prompts/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/prompts/__init__.py)\n- [src/evidently/llm/utils/parsing.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/utils/parsing.py)\n- [src/evidently/llm/utils/prompt_render.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/utils/prompt_render.py)\n- [src/evidently/llm/optimization/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/optimization/__init__.py)\n- [src/evidently/llm/optimization/scorers.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/llm/optimization/scorers.py)\n- [src/evidently/legacy/descriptors/llm_judges.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/descriptors/llm_judges.py)\n- [src/evidently/legacy/descriptors/sentiment_descriptor.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/descriptors/sentiment_descriptor.py)\n</details>\n\n# LLM Evaluation and Judging\n\n## Overview\n\nEvidently provides a comprehensive framework for evaluating Large Language Model (LLM) powered systems through specialized descriptors called **LLM Judges**. 
These judges leverage LLM APIs to automatically assess and score various quality dimensions of LLM outputs, including relevance, coherence, factual accuracy, and task completion.\n\nLLM Judges serve as evaluators that can be integrated into monitoring pipelines to continuously assess LLM performance without requiring manual human evaluation.\n\n## Architecture\n\n```mermaid\ngraph TD\n    A[Dataset with LLM Inputs/Outputs] --> B[LLM Judge Descriptors]\n    B --> C[Prompt Templates]\n    C --> D[LLM Provider API]\n    D --> E[Parsed Evaluation Results]\n    E --> F[Feature Descriptors]\n    F --> G[Metrics & Tests]\n    G --> H[Evidently Reports/Monitors]\n    \n    B1[ContextRelevanceLLMEval] --> B\n    B2[CompletenessLLMEval] --> B\n    B3[ContextQualityLLMEval] --> B\n    B4[GroundednessLLMEval] --> B\n    B5[RelevanceLLMEval] --> B\n    \n    C1[System Prompt] --> C\n    C2[User Query] --> C\n    C3[Evaluation Criteria] --> C\n```\n\n## Core Components\n\n### LLM Judge Types\n\nEvidently implements multiple specialized LLM judges for different evaluation dimensions:\n\n| Judge Type | Purpose | Output |\n|------------|---------|--------|\n| `ContextRelevanceLLMEval` | Measures relevance of retrieved context to the query | Score 0-1, reasoning |\n| `CompletenessLLMEval` | Evaluates completeness of response coverage | Score 0-1, reasoning |\n| `ContextQualityLLMEval` | Assesses quality of context for answering | Score 0-1, reasoning |\n| `GroundednessLLMEval` | Checks if response is grounded in provided context | Score 0-1, reasoning |\n| `RelevanceLLMEval` | Evaluates semantic relevance of response to query | Score 0-1, reasoning |\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py:1-200]()\n\n### Feature Descriptor Pattern\n\nLLM Judges follow the `FeatureDescriptor` pattern, wrapping LLM evaluation features with optional aliasing and test definitions:\n\n```python\nFeatureDescriptor(\n    feature=feature,\n    alias=alias,  # Custom display name\n    
tests=tests   # Optional validation tests\n)\n```\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py:30-35]()\n\n## Configuration Options\n\n### Common Parameters\n\nAll LLM Judge functions share a common parameter structure:\n\n| Parameter | Type | Required | Default | Description |\n|-----------|------|----------|---------|-------------|\n| `column_name` | `str` | Yes | - | Name of the text column to evaluate |\n| `provider` | `str` | No | `\"openai\"` | LLM provider (openai, anthropic, etc.) |\n| `model` | `str` | No | `\"gpt-4o-mini\"` | Model identifier |\n| `additional_columns` | `Dict[str, str]` | No | `None` | Extra context columns for evaluation |\n| `include_category` | `bool` | No | `None` | Include categorical classification in output |\n| `include_score` | `bool` | No | `None` | Include numeric score in output |\n| `include_reasoning` | `bool` | No | `None` | Include explanation in output |\n| `uncertainty` | `Uncertainty` | No | `None` | Uncertainty handling strategy |\n| `alias` | `str` | No | `None` | Custom name for the feature |\n| `tests` | `List` | No | `None` | Tests to apply to the feature |\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py:50-70]()\n\n### Uncertainty Handling\n\nThe uncertainty parameter enables robust evaluation by detecting when the LLM is uncertain about its assessment:\n\n```python\nuncertainty: Optional[Uncertainty] = None\n```\n\nThis allows the system to flag low-confidence evaluations rather than returning potentially incorrect scores.\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py:58]()\n\n## Prompt System\n\n### Prompt Templates\n\nEvidently uses structured prompt templates for LLM evaluation:\n\n```python\n@dataclasses.dataclass\nclass LLMJudgePrompts:\n    system: str\n    user: str\n```\n\nPrompts are rendered with dynamic variables including:\n- Query text\n- Response text\n- Reference context\n- Evaluation criteria\n\n资料来源：[src/evidently/llm/prompts/__init__.py]()\n\n### Prompt 
Rendering\n\nThe prompt rendering system processes templates with type-safe variable substitution:\n\n```python\ndef render_prompt(\n    template: PromptTemplate,\n    **kwargs: Any\n) -> str:\n```\n\n资料来源：[src/evidently/llm/utils/prompt_render.py]()\n\n## Output Parsing\n\n### Response Parsing\n\nEvaluation results are parsed using structured output extraction:\n\n| Parser | Purpose |\n|--------|---------|\n| `parse_json_response()` | Extract JSON-structured evaluations |\n| `parse_score()` | Extract numeric scores |\n| `parse_reasoning()` | Extract explanation text |\n| `parse_category()` | Extract categorical labels |\n\n资料来源：[src/evidently/llm/utils/parsing.py]()\n\n### Score Calculation\n\nScores are calculated based on the evaluation criteria defined in each judge:\n\n- **0.0**: Does not meet criteria\n- **0.5**: Partially meets criteria\n- **1.0**: Fully meets criteria\n\nThe score can be combined with reasoning and category outputs for comprehensive evaluation reports.\n\n## Integration with Evidently Reports\n\n### Usage in Reports\n\nLLM Judges can be added to Evidently reports:\n\n```python\nfrom evidently.llm import RelevanceLLMEval\n\nreport = Report(metrics=[\n    RelevanceLLMEval(\n        column_name=\"response\",\n        include_score=True,\n        include_reasoning=True\n    )\n])\n```\n\n### Usage in Monitoring\n\nFor continuous monitoring, LLM Judges are integrated into dashboard widgets:\n\n```python\nfrom evidently.dashboard import Dashboard\n\ndashboard = Dashboard([\n    RelevanceLLMEval(column_name=\"response\")\n])\n```\n\n资料来源：[ui/packages/evidently-ui-lib/src/components/Descriptors/Features/LLMJudge/template.tsx]()\n\n## UI Components\n\n### LLMJudgeTemplate Component\n\nThe frontend provides visualization for LLM judge configurations:\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `state` | `LLMJudgeState` | Current judge configuration |\n| `errors` | `FormErrors` | Validation errors |\n| `availableTags` | 
`string[]` | Available categorization tags |\n\n资料来源：[ui/packages/evidently-ui-lib/src/components/Descriptors/Features/LLMJudge/template.tsx]()\n\n### Visualization States\n\nThe UI renders different states for judge configurations:\n\n1. **Uncertainty Configuration**: Shows uncertainty handling options\n2. **Multiclass Classification**: Displays class criteria for classification modes\n3. **Criteria Preview**: Shows evaluation criteria as formatted text\n4. **Output Options**: Displays configured output fields (reasoning, category, score)\n\n## Legacy Components\n\n### Legacy LLM Judges\n\nThe legacy implementation in `evidently.legacy` provides backward compatibility:\n\n```python\nfrom evidently.legacy.descriptors.llm_judges import (\n    CompletenessLLMEval as CompletenessLLMEvalV1\n)\n```\n\n资料来源：[src/evidently/legacy/descriptors/llm_judges.py]()\n\n### Sentiment Descriptor\n\nSpecialized descriptor for sentiment analysis:\n\n```python\nfrom evidently.legacy.descriptors.sentiment_descriptor import SentimentDescriptor\n```\n\nFeatures include:\n- Sentiment polarity detection (positive, negative, neutral)\n- Confidence scoring\n- Integration with reporting pipeline\n\n资料来源：[src/evidently/legacy/descriptors/sentiment_descriptor.py]()\n\n## Scorers and Optimization\n\n### LLM Scorers\n\nScorers provide optimized evaluation functions:\n\n```python\nfrom evidently.llm.optimization.scorers import LLMScorer\n```\n\nScorers support:\n- Batch evaluation\n- Caching of results\n- Configurable thresholds\n\n资料来源：[src/evidently/llm/optimization/scorers.py]()\n\n## Best Practices\n\n### 1. Choosing Evaluation Dimensions\n\n| Use Case | Recommended Judges |\n|----------|---------------------|\n| RAG Systems | ContextRelevance, Groundedness, Relevance |\n| Summarization | Completeness, Relevance |\n| Q&A Systems | ContextQuality, Groundedness |\n| General Chat | All dimensions |\n\n### 2. 
Score Interpretation\n\n- **0.0 - 0.3**: Significant issues detected\n- **0.4 - 0.6**: Partial criteria met\n- **0.7 - 1.0**: Criteria well satisfied\n\n### 3. Uncertainty Handling\n\nEnable uncertainty handling when:\n- Operating on edge cases\n- Using smaller models\n- Evaluating ambiguous inputs\n\n## See Also\n\n- [Evidently Documentation](https://docs.evidentlyai.com)\n- [API Reference](https://evidentlyai.github.io/evidently/api-reference)\n- [Dashboard Components](../dashboard/)\n- [Prompt Templates](../llm/prompts/)\n\n---\n\n<a id='descriptors-features'></a>\n\n## Descriptors and Features System\n\n### 相关页面\n\n相关主题：[LLM Evaluation and Judging](#llm-evals), [ML Model Evaluation](#ml-evaluation)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/descriptors/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/__init__.py)\n- [src/evidently/descriptors/_text_length.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/_text_length.py)\n- [src/evidently/descriptors/_custom_descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/_custom_descriptors.py)\n- [src/evidently/descriptors/generated_descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/generated_descriptors.py)\n- [src/evidently/descriptors/text_match.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/text_match.py)\n- [src/evidently/core/registries/descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/registries/descriptors.py)\n- [src/evidently/legacy/descriptors/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/descriptors/__init__.py)\n- [src/evidently/legacy/features/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/features/__init__.py)\n</details>\n\n# Descriptors and Features System\n\n## 
Overview\n\nThe Descriptors and Features System is a core component of the Evidently framework that provides extensible ways to extract, compute, and evaluate characteristics from data columns. This system enables users to define custom descriptors that wrap features with validation tests, making it easy to assess data quality, text properties, and model behavior in a unified evaluation pipeline.\n\nDescriptors serve as wrappers around feature implementations, adding metadata, display names, and optional test configurations. The system supports both legacy v1 features and modern descriptor implementations, providing backward compatibility while enabling new functionality.\n\n## Architecture\n\nThe system follows a layered architecture with clear separation between descriptor definitions, feature implementations, and registry management.\n\n```mermaid\ngraph TD\n    subgraph \"Public API Layer\"\n        PD[Public Descriptors<br/>TextMatch, HuggingFace, etc.]\n        CD[Custom Descriptors<br/>FeatureDescriptor]\n    end\n    \n    subgraph \"Registry Layer\"\n        REG[DescriptorRegistry]\n    end\n    \n    subgraph \"Implementation Layer\"\n        LEG[Legacy Features V1<br/>hf_feature, text_length]\n        NEW[New Features<br/>_text_length, text_match]\n    end\n    \n    subgraph \"Evaluation Layer\"\n        EVAL[Evidently Metrics<br/>and Tests]\n    end\n    \n    PD --> REG\n    CD --> REG\n    REG --> LEG\n    REG --> NEW\n    LEG --> EVAL\n    NEW --> EVAL\n    \n    style PD fill:#90EE90\n    style CD fill:#87CEEB\n    style REG fill:#FFD700\n    style LEG fill:#FFA07A\n    style NEW fill:#DDA0DD\n```\n\n### Directory Structure\n\n```\nsrc/evidently/\n├── descriptors/\n│   ├── __init__.py              # Public API exports\n│   ├── _text_length.py          # Text length descriptor implementation\n│   ├── _custom_descriptors.py   # FeatureDescriptor class\n│   ├── generated_descriptors.py # Generated descriptor functions\n│   └── text_match.py            
# Text matching descriptor\n├── core/\n│   └── registries/\n│       └── descriptors.py       # Descriptor registry implementation\n└── legacy/\n    ├── descriptors/             # Legacy descriptor implementations\n    └── features/                # Legacy feature implementations\n```\n\n## Core Components\n\n### FeatureDescriptor\n\nThe `FeatureDescriptor` class is the central abstraction that wraps a feature with additional metadata and optional tests.\n\n```python\n# Source: src/evidently/descriptors/_custom_descriptors.py\nclass FeatureDescriptor(BaseDescriptor):\n    \"\"\"Descriptor that wraps a feature with tests and metadata.\"\"\"\n    \n    def __init__(\n        self,\n        feature: AnyFeatureType,\n        alias: Optional[str] = None,\n        tests: Optional[List[Union[\"DescriptorTest\", \"GenericTest\"]]] = None,\n    ):\n        self.feature = feature\n        self.alias = alias\n        self.tests = tests or []\n```\n\n**Parameters:**\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `feature` | `AnyFeatureType` | The underlying feature implementation |\n| `alias` | `Optional[str]` | Display name for the descriptor |\n| `tests` | `Optional[List]` | Optional list of tests to apply |\n\n### DescriptorRegistry\n\nThe registry manages descriptor registration and lookup, enabling dynamic descriptor discovery.\n\n```python\n# Source: src/evidently/core/registries/descriptors.py\nclass DescriptorRegistry:\n    \"\"\"Registry for managing descriptor instances.\"\"\"\n    \n    def __init__(self):\n        self._descriptors: Dict[str, Descriptor] = {}\n    \n    def register(self, name: str, descriptor: Descriptor) -> None:\n        \"\"\"Register a descriptor with a given name.\"\"\"\n        self._descriptors[name] = descriptor\n    \n    def get(self, name: str) -> Optional[Descriptor]:\n        \"\"\"Retrieve a descriptor by name.\"\"\"\n        return self._descriptors.get(name)\n```\n\n## Available Descriptors\n\n### Text 
Length Descriptor\n\nComputes text length statistics for string columns, including character count, word count, and sentence statistics.\n\n**Source:** `src/evidently/descriptors/_text_length.py`\n\n```python\ndef TextLength(\n    column_name: str,\n    alias: Optional[str] = None,\n    mode: str = \"chars\",\n    tests: Optional[List[Union[\"DescriptorTest\", \"GenericTest\"]]] = None,\n):\n    \"\"\"Compute text length metrics for a column.\n    \n    Args:\n        column_name: Name of the text column to analyze.\n        alias: Display name for the descriptor.\n        mode: Measurement mode - \"chars\" or \"words\".\n        tests: Optional list of tests to apply.\n    \"\"\"\n    from evidently.legacy.features.text_length import TextLength as TextLengthV1\n    \n    feature = TextLengthV1(column_name=column_name, mode=mode, display_name=alias)\n    return FeatureDescriptor(feature=feature, alias=alias, tests=tests)\n```\n\n**Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `column_name` | `str` | Required | Column to analyze |\n| `alias` | `Optional[str]` | `None` | Custom display name |\n| `mode` | `str` | `\"chars\"` | `\"chars\"` for characters, `\"words\"` for word count |\n| `tests` | `Optional[List]` | `None` | Tests to attach |\n\n### Text Match Descriptor\n\nMatches text content against patterns or lists, useful for validation and filtering.\n\n**Source:** `src/evidently/descriptors/text_match.py`\n\n```python\ndef TextMatch(\n    column_name: str,\n    alias: str,\n    words_list: List[str],\n    mode: str = \"match\",\n    lemmatize: bool = False,\n    tests: Optional[List[Union[\"DescriptorTest\", \"GenericTest\"]]] = None,\n):\n    \"\"\"Match text against a word list.\n    \n    Args:\n        column_name: Text column to match against.\n        alias: Name for the descriptor.\n        words_list: List of words/patterns to match.\n        mode: Matching mode - \"match\", \"any\", 
\"all\".\n        lemmatize: Whether to apply lemmatization.\n        tests: Optional test list.\n    \"\"\"\n    from evidently.legacy.features.text_features import TextMatch as TextMatchV1\n    \n    feature = TextMatchV1(\n        column_name=column_name,\n        words_list=words_list,\n        mode=mode,\n        lemmatize=lemmatize,\n        display_name=alias\n    )\n    return FeatureDescriptor(feature=feature, alias=alias, tests=tests)\n```\n\n**Parameters:**\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `column_name` | `str` | Required | Target text column |\n| `alias` | `str` | Required | Descriptor name |\n| `words_list` | `List[str]` | Required | Words to match |\n| `mode` | `str` | `\"match\"` | `\"match\"`, `\"any\"`, or `\"all\"` |\n| `lemmatize` | `bool` | `False` | Enable lemmatization |\n| `tests` | `Optional[List]` | `None` | Tests to apply |\n\n### HuggingFace Descriptor\n\nApplies HuggingFace models to text columns for various NLP tasks including toxicity detection.\n\n**Source:** `src/evidently/descriptors/generated_descriptors.py`\n\n```python\ndef HuggingFace(\n    column_name: str,\n    model: str,\n    params: dict,\n    alias: str,\n    tests: Optional[List[Union[\"DescriptorTest\", \"GenericTest\"]]] = None,\n):\n    \"\"\"Apply a HuggingFace model to text column.\n    \n    Args:\n        column_name: Name of the text column to process.\n        model: HuggingFace model name or path.\n        params: Additional parameters for the model.\n        alias: Alias for the descriptor.\n        tests: Optional list of tests to apply.\n    \"\"\"\n    from evidently.legacy.features.hf_feature import HuggingFaceFeature\n    \n    feature = HuggingFaceFeature(\n        column_name=column_name,\n        model=model,\n        params=params,\n        display_name=alias\n    )\n    return FeatureDescriptor(feature=feature, alias=alias, tests=tests)\n```\n\n### HuggingFace Toxicity 
Descriptor\n\nSpecialized descriptor for detecting toxic content using HuggingFace models.\n\n**Source:** `src/evidently/descriptors/generated_descriptors.py:39-56`\n\n```python\ndef HuggingFaceToxicity(\n    column_name: str,\n    alias: str,\n    model: Optional[str] = None,\n    toxic_label: Optional[str] = None,\n    tests: Optional[List[Union[\"DescriptorTest\", \"GenericTest\"]]] = None,\n):\n    \"\"\"Detect toxicity in text using HuggingFace models.\n    \n    Args:\n        column_name: Name of the text column to check.\n        alias: Alias for the descriptor.\n        model: HuggingFace model name or path. If None, uses default.\n        toxic_label: Label for toxic content.\n        tests: Optional list of tests to apply.\n    \"\"\"\n```\n\n## Integration with Evidently Metrics\n\nDescriptors are designed to integrate seamlessly with Evidently's metric and test evaluation system. When a descriptor is included in an evaluation, the underlying feature is computed and the results are automatically formatted for display.\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Descriptor Definition]\n    B --> C[Feature Computation]\n    C --> D[Test Evaluation]\n    D --> E[Metric Results]\n    C --> F[Visualization Data]\n    F --> G[HTML Widgets]\n    \n    style A fill:#87CEEB\n    style E fill:#90EE90\n    style G fill:#FFD700\n```\n\n## Legacy Features System\n\nThe legacy features system (`evidently.legacy.features`) provides backward compatibility for existing integrations. 
These features are wrapped by modern descriptors but can also be used directly.\n\n**Source:** `src/evidently/legacy/features/__init__.py`\n\n### Available Legacy Features\n\n| Feature | Module | Purpose |\n|---------|--------|---------|\n| `TextLength` | `text_length` | Text length statistics |\n| `TextMatch` | `text_features` | Pattern matching in text |\n| `HuggingFaceFeature` | `hf_feature` | HuggingFace model integration |\n\n## Usage Example\n\n```python\nfrom evidently.descriptors import TextLength, TextMatch, HuggingFaceToxicity\n\n# Create descriptors for evaluation\ntext_length_desc = TextLength(\n    column_name=\"review_text\",\n    alias=\"Review Length\",\n    mode=\"words\"\n)\n\ntoxicity_desc = HuggingFaceToxicity(\n    column_name=\"review_text\",\n    alias=\"Toxicity Score\",\n    toxic_label=\"toxic\"\n)\n\n# Use in Evidently Dashboard\ndashboard = Dashboard(metrics=[\n    ColumnMetrics(column_name=\"review_text\")\n        .with_descriptors([\n            text_length_desc,\n            toxicity_desc\n        ])\n])\n```\n\n## Descriptor Tests Integration\n\nDescriptors can be associated with tests that validate the computed values:\n\n```python\nfrom evidently.descriptors import TextLength\nfrom evidently.test_suits import ColumnTestSuite\n\ntest_suite = ColumnTestSuite(\n    tests=[\n        TextLength(\n            column_name=\"description\",\n            alias=\"Description Length\",\n            tests=[\n                # Test within the descriptor\n                GreaterThan(\"Description Length\", threshold=10),\n                LessThan(\"Description Length\", threshold=500)\n            ]\n        )\n    ]\n)\n```\n\n## Summary\n\nThe Descriptors and Features System provides a flexible, extensible mechanism for computing and evaluating column characteristics within Evidently. 
Key aspects include:\n\n- **Wrapper Pattern**: `FeatureDescriptor` wraps features with metadata and tests\n- **Registry Pattern**: Centralized descriptor management via `DescriptorRegistry`\n- **Backward Compatibility**: Legacy features wrapped for v1 compatibility\n- **Test Integration**: Native support for attaching tests to descriptors\n- **Visualization**: Automatic integration with Evidently's HTML rendering pipeline\n\nThis architecture enables users to define custom descriptors for any data transformation or evaluation need while maintaining consistency with the Evidently evaluation framework.\n\n---\n\n<a id='reports-test-suites'></a>\n\n## Reports and Test Suites\n\n### 相关页面\n\n相关主题：[Presets and Metric Presets](#presets), [Custom Metrics and Extensibility](#custom-metrics), [Core Components](#core-components)\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/core/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/report.py)\n- [src/evidently/core/tests.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/tests.py)\n- [src/evidently/core/compare.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/compare.py)\n- [src/evidently/core/serialization.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/serialization.py)\n- [src/evidently/legacy/report/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/report/report.py)\n- [src/evidently/legacy/test_suite/test_suite.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/test_suite/test_suite.py)\n- [src/evidently/metrics/_legacy.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/_legacy.py)\n</details>\n\n# Reports and Test Suites\n\n## Overview\n\nEvidently provides two primary evaluation constructs for ML and LLM-powered systems: **Reports** and **Test Suites**. 
Both are designed to evaluate, analyze, and monitor data and model quality, but they serve different purposes and use cases.\n\n| Component | Purpose | Use Case |\n|-----------|---------|----------|\n| **Report** | Generate comprehensive analysis with metrics, visualizations, and insights | Exploratory analysis, documentation, stakeholder communication |\n| **Test Suite** | Run structured checks with pass/fail outcomes | CI/CD pipelines, automated quality gates, regression testing |\n\n资料来源：[src/evidently/legacy/report/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/report/report.py)\n\n## Architecture\n\n```mermaid\ngraph TD\n    subgraph \"Core Evaluation Engine\"\n        A[Dataset / DataDefinition] --> B[Report / Test Suite]\n        C[Descriptors / Metrics] --> B\n        D[Test Cases] --> B\n    end\n    \n    subgraph \"Report Output\"\n        B --> E[Metric Results]\n        B --> F[Visualizations]\n        B --> G[JSON/HTML Export]\n    end\n    \n    subgraph \"Test Suite Output\"\n        B --> H[Test Results]\n        H --> I[Pass / Fail Status]\n        H --> J[Failure Details]\n    end\n```\n\n### Core Components\n\nThe evaluation system is built on several key components:\n\n| Component | File Location | Role |\n|-----------|---------------|------|\n| `Report` | `src/evidently/core/report.py` | Main class for generating analytical reports |\n| `TestSuite` | `src/evidently/core/tests.py` | Orchestrates test execution |\n| `Dataset` | Core data structure | Holds data with schema definitions |\n| `DataDefinition` | Schema definition | Defines column types and expectations |\n| `Descriptor` | Feature extraction | Row-level evaluators (e.g., Sentiment, TextLength) |\n| `Metric` | Metric calculation | Computes statistical measures |\n\n资料来源：[src/evidently/core/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/report.py)\n\n## Datasets and Data Definition\n\n### Dataset 
Structure\n\nEvidently uses a `Dataset` object to wrap pandas DataFrames with additional metadata:\n\n```python\nfrom evidently import Dataset, DataDefinition\n\neval_dataset = Dataset.from_pandas(\n    pd.DataFrame(eval_df),\n    data_definition=DataDefinition(),\n    descriptors=[\n        Sentiment(\"answer\", alias=\"Sentiment\"),\n        TextLength(\"answer\", alias=\"Length\"),\n        Contains(\"answer\", items=[\"sorry\", \"apologize\"], mode=\"any\")\n    ]\n)\n```\n\n资料来源：[README.md](https://github.com/evidentlyai/evidently/blob/main/README.md)\n\n### DataDefinition\n\nThe `DataDefinition` class defines the schema for the dataset:\n\n| Property | Type | Description |\n|----------|------|-------------|\n| `column_schema` | dict | Maps column names to data types |\n| `relationships` | list | Defines relationships between datasets |\n| `timestamp_column` | str | Column containing datetime values |\n\n## Descriptors\n\nDescriptors are row-level evaluators that extract features or apply checks to individual rows. 
They can be used within datasets or standalone.\n\n### Available Descriptors\n\n| Descriptor | Purpose | Parameters |\n|------------|---------|------------|\n| `Sentiment` | Detect sentiment in text | `column_name`, `alias` |\n| `TextLength` | Measure text length | `column_name`, `alias` |\n| `Contains` | Check for substring presence | `column_name`, `items`, `mode`, `case_sensitive` |\n| `BeginsWith` | Verify text prefix | `column_name`, `prefix`, `case_sensitive` |\n| `HuggingFace` | Apply HF models | `column_name`, `model`, `params` |\n| `HuggingFaceToxicity` | Detect toxic content | `column_name`, `model`, `toxic_label` |\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/generated_descriptors.py)\n\n### Descriptor Parameters\n\n| Parameter | Type | Required | Description |\n|-----------|------|----------|-------------|\n| `column_name` | str | Yes | Name of the column to evaluate |\n| `alias` | str | No | Display name for the result |\n| `tests` | list | No | Optional test cases to apply |\n| `mode` | str | No | Matching mode: `\"any\"` or `\"all\"` |\n| `case_sensitive` | bool | No | Whether comparison is case-sensitive |\n\n## Reports\n\n### Creating a Report\n\nReports generate comprehensive HTML or JSON output with metrics and visualizations:\n\n```python\nfrom evidently import Report\nfrom evidently.presets import TextEvals\n\nreport = Report(\n    metrics=[\n        TextEvals(),\n    ]\n)\n\nresult = report.run(reference_data=reference_df, current_data=current_df)\nreport.save_html(\"report.html\")\n```\n\n资料来源：[src/evidently/legacy/report/report.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/report/report.py)\n\n### Report Configuration\n\n| Parameter | Type | Default | Description |\n|-----------|------|---------|-------------|\n| `metrics` | list | Required | List of metrics to compute |\n| `timestamp` | datetime | Now | Report 
generation time |\n| `include_tests` | bool | False | Include test results in report |\n\n### Report Output Formats\n\n| Format | Method | Use Case |\n|--------|--------|----------|\n| HTML | `save_html(path)` | Interactive visualization |\n| JSON | `save_json(path)` or `as_dict()` | Machine parsing, APIs |\n| Python dict | `as_dict()` | Programmatic access |\n\n资料来源：[src/evidently/core/serialization.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/serialization.py)\n\n## Test Suites\n\n### Creating a Test Suite\n\nTest Suites run structured checks and return pass/fail status:\n\n```python\nfrom evidently import TestSuite\nfrom evidently.test_suite import TestSuite\n\nsuite = TestSuite(tests=[\n    # Test definitions\n])\n\nresult = suite.run(reference_data=reference_df, current_data=current_df)\nsuite.save(\"test_results.json\")\n```\n\n资料来源：[src/evidently/legacy/test_suite/test_suite.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/test_suite/test_suite.py)\n\n### Test Suite Workflow\n\n```mermaid\ngraph LR\n    A[Input Data] --> B[Test Suite]\n    B --> C{Execute Tests}\n    C --> D[Test 1]\n    C --> E[Test 2]\n    C --> F[Test N]\n    D --> G[Test Results]\n    E --> G\n    F --> G\n    G --> H{Pass All?}\n    H -->|Yes| I[Success]\n    H -->|No| J[Failure Report]\n```\n\n### Test Results Structure\n\n| Field | Type | Description |\n|-------|------|-------------|\n| `status` | str | `\"PASSED\"`, `\"FAILED\"`, `\"WARNING\"` |\n| `name` | str | Test identifier |\n| `group` | str | Test category |\n| `details` | dict | Additional context and metrics |\n| `timestamp` | datetime | When the test was run |\n\n## LLM Evaluation Features\n\n### LLM-Powered Judgments\n\nEvidently supports LLM-based evaluation through specialized descriptors:\n\n```python\nfrom evidently.descriptors import (\n    ContextQualityLLMEval,\n    CompletenessLLMEval,\n)\n```\n\n| Descriptor | Purpose | Key Parameters 
|\n|------------|---------|----------------|\n| `ContextQualityLLMEval` | Evaluate context relevance | `question`, `provider`, `model` |\n| `CompletenessLLMEval` | Check response completeness | `context`, `provider`, `model`, `include_reasoning` |\n\n资料来源：[src/evidently/descriptors/generated_descriptors.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/descriptors/generated_descriptors.py)\n\n### LLM Provider Configuration\n\n| Provider | Model Default | Configuration |\n|----------|---------------|---------------|\n| `openai` | `gpt-4o-mini` | API key required |\n| `anthropic` | Claude models | API key required |\n| `azure` | Configurable | Endpoint + key |\n\n### Evaluation Options\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `include_category` | bool | Include categorical classification |\n| `include_score` | bool | Include numerical score |\n| `include_reasoning` | bool | Include LLM reasoning |\n| `uncertainty` | Uncertainty | Strategy for handling uncertainty |\n\n## Metrics and Presets\n\n### Preset Configurations\n\nEvidently provides pre-configured metric sets:\n\n| Preset | Description | File |\n|--------|-------------|------|\n| `TextEvals` | Text quality metrics | `src/evidently/presets` |\n| `DataDrift` | Drift detection | Legacy metrics |\n| `DataQuality` | Quality checks | Legacy metrics |\n\n资料来源：[src/evidently/metrics/_legacy.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/_legacy.py)\n\n## Legacy vs. 
Current API\n\nEvidently maintains both legacy and current APIs for backward compatibility:\n\n### Legacy API Structure\n\n```\nsrc/evidently/legacy/\n├── report/\n│   └── report.py          # Legacy Report class\n└── test_suite/\n    └── test_suite.py      # Legacy TestSuite class\n```\n\n### Current API Structure\n\n```\nsrc/evidently/\n├── core/\n│   ├── report.py          # Current Report class\n│   ├── tests.py           # Current test infrastructure\n│   ├── compare.py         # Data comparison utilities\n│   └── serialization.py   # Output serialization\n```\n\n资料来源：[src/evidently/metrics/_legacy.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/metrics/_legacy.py)\n\n## Usage Example: LLM Evaluation Pipeline\n\n```python\nimport pandas as pd\nfrom evidently import Report, Dataset, DataDefinition\nfrom evidently.descriptors import Sentiment, TextLength, Contains\nfrom evidently.presets import TextEvals\n\n# 1. Prepare data\neval_df = pd.DataFrame([\n    [\"What is the capital of Japan?\", \"The capital of Japan is Tokyo.\"],\n    [\"Who painted the Mona Lisa?\", \"Leonardo da Vinci.\"],\n    [\"Can you write an essay?\", \"I'm sorry, but I can't assist with homework.\"]],\n    columns=[\"question\", \"answer\"])\n\n# 2. Create dataset with descriptors\neval_dataset = Dataset.from_pandas(\n    pd.DataFrame(eval_df),\n    data_definition=DataDefinition(),\n    descriptors=[\n        Sentiment(\"answer\", alias=\"Sentiment\"),\n        TextLength(\"answer\", alias=\"Length\"),\n        Contains(\"answer\", items=[\"sorry\", \"apologize\"], mode=\"any\")\n    ]\n)\n\n# 3. Run report\nreport = Report(metrics=[TextEvals()])\nresult = report.run(reference_data=None, current_data=eval_dataset)\nreport.save_html(\"llm_eval_report.html\")\n```\n\n资料来源：[README.md](https://github.com/evidentlyai/evidently/blob/main/README.md)\n\n## Comparison: Report vs. 
Test Suite\n\n| Aspect | Report | Test Suite |\n|--------|--------|------------|\n| **Output** | Metrics, visualizations, insights | Pass/fail status, error details |\n| **Use Case** | Exploratory analysis | Automated quality gates |\n| **Integration** | Dashboards, documentation | CI/CD, monitoring alerts |\n| **Thresholds** | Configurable, informative | Strict pass/fail boundaries |\n| **Exit Code** | Always 0 (informational) | 0 (pass) or 1 (fail) |\n\n资料来源：[src/evidently/core/tests.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/tests.py)\n\n## Serialization\n\n### Supported Formats\n\n| Format | Extension | Use Case |\n|--------|-----------|----------|\n| JSON | `.json` | APIs, automation, storage |\n| HTML | `.html` | Human review, sharing |\n| Parquet | `.parquet` | Large-scale data storage |\n\n### Serialization Options\n\n```python\n# Save as JSON\nreport.save_json(\"report.json\")\n\n# Save as HTML\nreport.save_html(\"report.html\")\n\n# Export as dictionary\ndata = report.as_dict()\n```\n\n资料来源：[src/evidently/core/serialization.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/core/serialization.py)\n\n## Summary\n\nReports and Test Suites in Evidently provide complementary approaches to ML and LLM evaluation:\n\n- **Reports** excel at providing comprehensive, visual analysis suitable for exploration and documentation\n- **Test Suites** provide structured, automated quality checks ideal for CI/CD integration\n- Both share the same underlying dataset and descriptor infrastructure\n- LLM-powered evaluations are available through dedicated descriptors like `ContextQualityLLMEval` and `CompletenessLLMEval`\n- Output can be serialized to JSON, HTML, or accessed programmatically via Python dictionaries\n\n---\n\n<a id='presets'></a>\n\n## Presets and Metric Presets\n\n### 相关页面\n\n相关主题：[Reports and Test Suites](#reports-test-suites), [Custom Metrics and Extensibility](#custom-metrics)\n\n<details>\n<summary>Relevant 
Source Files</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/presets/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/__init__.py)\n- [src/evidently/presets/drift.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/drift.py)\n- [src/evidently/presets/classification.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/classification.py)\n- [src/evidently/presets/regression.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/regression.py)\n- [src/evidently/presets/recsys.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/recsys.py)\n- [src/evidently/presets/special.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/presets/special.py)\n- [src/evidently/legacy/metric_preset/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/metric_preset/__init__.py)\n- [src/evidently/legacy/test_preset/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/test_preset/__init__.py)\n</details>\n\n# Presets and Metric Presets\n\n## Overview\n\nPresets in Evidently are pre-configured collections of metrics and evaluations designed to simplify the process of assessing machine learning models and LLM-powered systems. They provide ready-to-use evaluation templates that bundle relevant metrics, thresholds, and reporting components into cohesive units. This abstraction layer allows users to perform comprehensive model assessment without manually configuring individual metrics.\n\nThe preset system follows a modular architecture where each preset type addresses a specific use case domain. 
Users can instantiate presets and pass them directly to the `Report` or `TestSuite` objects, which execute the bundled evaluations and generate structured results.\n\n## Preset Architecture\n\n```mermaid\ngraph TD\n    A[Evidently Presets] --> B[Standard Presets]\n    A --> C[Legacy Presets]\n    \n    B --> B1[DataDriftPreset]\n    B --> B2[TargetDriftPreset]\n    B --> B3[ClassificationPreset]\n    B --> B4[RegressionPreset]\n    B --> B5[RecSysPreset]\n    B --> B6[TextEvals]\n    \n    C --> C1[MetricPreset]\n    C --> C2[TestPreset]\n    \n    D[Report / TestSuite] --> E[Integrates Presets]\n    E --> B\n    E --> C\n```\n\n## Preset Types\n\n### Standard Presets (src/evidently/presets/)\n\nThe modern preset implementation resides in the `src/evidently/presets/` directory. These presets integrate directly with the current Evidently reporting API.\n\n#### DataDriftPreset\n\nThe `DataDriftPreset` evaluates feature-level drift between reference and current datasets. It calculates drift scores for individual features and provides aggregate drift statistics. This preset is particularly useful for monitoring data pipeline changes and detecting distribution shifts that may impact model performance.\n\n```python\nfrom evidently.presets import DataDriftPreset\n\nreport = Report(metrics=[DataDriftPreset()])\nreport.run(reference_data=reference_df, current_data=current_df)\n```\n\n#### TargetDriftPreset\n\nThe `TargetDriftPreset` specifically monitors drift in the target variable distribution. It is essential for supervised learning scenarios where changes in label distribution can signal underlying data quality issues or concept drift.\n\n#### ClassificationPreset\n\nThe `ClassificationPreset` bundles metrics relevant to classification model evaluation. It includes accuracy metrics, confusion matrix analysis, class-level performance indicators, and probability calibration assessments. 
This preset supports both binary and multiclass classification scenarios.\n\n#### RegressionPreset\n\nThe `RegressionPreset` provides comprehensive regression model evaluation including error distribution analysis, quantile-based metrics, and residual diagnostics. It helps identify heteroscedasticity, non-linearity, and systematic prediction biases.\n\n#### RecSysPreset\n\nThe `RecSysPreset` is specialized for recommendation system evaluation. It includes ranking metrics, coverage indicators, and user-item interaction analysis tailored to collaborative filtering and content-based recommendation approaches.\n\n#### TextEvals\n\nThe `TextEvals` preset targets LLM and NLP model evaluation. It encompasses text-specific metrics such as sentiment analysis, text length distributions, and content quality indicators. This preset integrates with the descriptors system for row-level text evaluations.\n\n```python\nfrom evidently.presets import TextEvals\n\neval_dataset = Dataset.from_pandas(\n    pd.DataFrame(eval_df),\n    data_definition=DataDefinition(),\n    descriptors=[\n        Sentiment(\"answer\", alias=\"Sentiment\"),\n        TextLength(\"answer\", alias=\"Length\"),\n        Contains(\"answer\", words=[\"sorry\", \"cannot\"], alias=\"Denial\")\n    ]\n)\n```\n\n### Legacy Presets (src/evidently/legacy/)\n\nThe legacy preset system provided foundational abstractions for metric and test evaluation. These classes served as the original implementation pattern before the current preset architecture.\n\n#### MetricPreset\n\nThe `MetricPreset` class defines the base interface for metric collection presets. It maintains a list of metrics and provides methods for execution and result aggregation.\n\n#### TestPreset\n\nThe `TestPreset` class extends the preset concept to testing scenarios, enabling batch execution of statistical tests against model outputs. 
It provides assertions and threshold-based pass/fail criteria.\n\n## Preset Composition\n\nPresets can be combined within a single report to create comprehensive evaluation suites:\n\n```python\nfrom evidently import Report\nfrom evidently.presets import DataDriftPreset, ClassificationPreset\n\nreport = Report(\n    metrics=[\n        DataDriftPreset(),\n        ClassificationPreset()\n    ]\n)\n```\n\n## Integration with Report API\n\nPresets integrate seamlessly with the Evidently `Report` class:\n\n| Component | Role | Integration |\n|-----------|------|-------------|\n| `Report` | Execution container | Accepts presets as `metrics` parameter |\n| `Dataset` | Data wrapper | Provides reference and current data |\n| `DataDefinition` | Schema definition | Describes columns and their roles |\n| Preset | Metric bundle | Contains metric instances |\n\n## Descriptor System\n\nThe preset system works in conjunction with the descriptor system for row-level evaluations. Descriptors apply per-row transformations and evaluations:\n\n```python\nfrom evidently.descriptors import Sentiment, TextLength, Contains\n```\n\nDescriptors included in the `Dataset` configuration are evaluated alongside preset metrics, enabling both aggregate and granular assessment within a single report.\n\n## Use Cases\n\n### Model Validation\n\nBefore deploying a model, use presets to validate performance on held-out data:\n\n```python\nreport = Report(metrics=[RegressionPreset()])\nreport.run(reference_data=training_df, current_data=validation_df)\n```\n\n### Production Monitoring\n\nContinuously monitor deployed models for performance degradation:\n\n```python\nreport = Report(metrics=[DataDriftPreset(), ClassificationPreset()])\nreport.run(reference_data=baseline_df, current_data=current_df)\n```\n\n### LLM Evaluation\n\nAssess LLM responses using text-specific presets:\n\n```python\nfrom evidently.presets import TextEvals\nfrom evidently import Dataset, DataDefinition\n\neval_dataset = 
Dataset.from_pandas(response_df, data_definition=DataDefinition(), descriptors=[...])\nreport = Report(metrics=[TextEvals()])
[src/evidently/legacy/features/custom_feature.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/legacy/features/custom_feature.py)\n- [src/evidently/generators/__init__.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/generators/__init__.py)\n- [src/evidently/generators/column.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/generators/column.py)\n</details>\n\n# Custom Metrics and Extensibility\n\n## Overview\n\nEvidently provides a comprehensive extensibility system that allows users to create custom metrics, features, and test configurations. The framework follows a plugin-like architecture where custom implementations can be registered and discovered at runtime through the registry system.\n\nThe extensibility model encompasses several key areas:\n\n- **Custom Metrics**: User-defined metrics that compute specific evaluation logic\n- **Custom Features**: Row-level evaluators that extract or compute values from data\n- **Generators**: Automatic metric and feature generation from column specifications\n- **Bound Tests**: Test configurations attached to metrics with threshold definitions\n\nThis architecture enables users to extend the framework's built-in capabilities while maintaining consistency with the existing evaluation pipeline.\n\n## Registry System Architecture\n\nThe registry system serves as the central mechanism for discovering and managing extensions within Evidently. 
It provides a unified interface for registering, retrieving, and invoking custom implementations.\n\n```mermaid\ngraph TD\n    A[User Code] --> B[Registry API]\n    B --> C[MetricRegistry]\n    B --> D[FeatureRegistry]\n    B --> E[TestRegistry]\n    C --> F[Metric Implementations]\n    D --> G[Feature Implementations]\n    E --> H[Test Configurations]\n```\n\n### Core Registry Components\n\n| Component | Purpose | File Location |\n|-----------|---------|---------------|\n| MetricRegistry | Manages custom metric registrations | `src/evidently/core/registries/__init__.py` |\n| ConfigRegistry | Stores configuration metadata | `src/evidently/core/registries/configs.py` |\n| BoundTestRegistry | Handles test bindings and thresholds | `src/evidently/core/registries/bound_tests.py` |\n\nThe registry pattern follows a key-based lookup system where each extension is identified by a unique name. This allows for dynamic discovery and late binding of implementations at runtime.\n\n## Custom Metrics\n\nCustom metrics extend the base `Metric` class to implement domain-specific evaluation logic. The framework provides both legacy and modern approaches for metric creation.\n\n### Metric Structure\n\nA custom metric typically consists of:\n\n1. **Configuration**: Defines the metric's parameters and metadata\n2. **Calculation Logic**: Implements the `calculate` method to produce results\n3. **Visualization**: Provides rendering information through widget definitions\n\n```python\nclass CustomMetric(Metric):\n    def __init__(self, column: str, threshold: float = 0.5):\n        self.column = column\n        self.threshold = threshold\n    \n    def calculate(self, context: Context) -> MetricResult:\n        # Custom calculation logic\n        pass\n```\n\n资料来源：[src/evidently/legacy/metrics/custom_metric.py:1-50]()\n\n### Metric Container System\n\nThe `MetricContainer` abstract base class provides the foundation for grouping related metrics. 
It implements a caching mechanism to avoid redundant metric generation.\n\n```mermaid\ngraph LR\n    A[Context] --> B{Container Fingerprint}\n    B -->|Cache Hit| C[Return Cached Metrics]\n    B -->|Cache Miss| D[generate_metrics]\n    D --> E[Store in Context]\n    E --> C\n```\n\nThe `generate_metrics` method must be implemented by subclasses to define the metrics to be computed:\n\n```python\ndef generate_metrics(self, context: \"Context\") -> Sequence[MetricOrContainer]:\n    \"\"\"Generate metrics based on the container configuration.\n    \n    Args:\n        context: Context containing datasets and configuration.\n    \n    Returns:\n        Sequence of Metric or MetricContainer objects to compute.\n    \"\"\"\n    raise NotImplementedError()\n```\n\n资料来源：[src/evidently/core/container.py:1-80]()\n\n### Rendering and Widgets\n\nMetrics can contribute visualization widgets through the `render` method. The widget system supports hierarchical composition where parent containers aggregate widgets from child metrics:\n\n```python\ndef render(\n    self,\n    context: \"Context\",\n    child_widgets: Optional[List[Tuple[Optional[MetricId], List[BaseWidgetInfo]]]] = None,\n) -> List[BaseWidgetInfo]:\n    \"\"\"Render visualization widgets for this container.\n    \n    Combines widgets from all child metrics/containers.\n    \"\"\"\n```\n\n资料来源：[src/evidently/core/container.py:80-120]()\n\n## Custom Features\n\nCustom features extend the `Descriptor` base class to provide row-level evaluation capabilities. 
They can be applied to individual data points during dataset processing.\n\n### Feature Implementation Pattern\n\nFeatures implement the descriptor pattern, wrapping column values with evaluation logic:\n\n```python\nclass CustomFeature(Descriptor):\n    def __init__(self, column: str, alias: str = None):\n        self.column = column\n        self.alias = alias or column\n    \n    def apply(self, data: pd.Series) -> pd.Series:\n        # Custom feature calculation\n        return result\n```\n\n资料来源：[src/evidently/legacy/features/custom_feature.py:1-40]()\n\n### Integration with Descriptors\n\nThe descriptor system integrates with the data definition framework, allowing features to be declared as part of a dataset configuration:\n\n```python\neval_dataset = Dataset.from_pandas(\n    pd.DataFrame(eval_df),\n    data_definition=DataDefinition(),\n    descriptors=[\n        Sentiment(\"answer\", alias=\"Sentiment\"),\n        TextLength(\"answer\", alias=\"Length\"),\n        Contains(\"answer\", [\"denial\", \"refuse\"], alias=\"ContainsDenial\")\n    ]\n)\n```\n\n资料来源：[README.md:1-50]()\n\n## Generators System\n\nGenerators provide an automated way to create metrics and features based on column specifications. 
This reduces boilerplate when working with large numbers of similar evaluations.\n\n### Column Generator\n\nThe column generator creates standardized metrics for numeric and categorical columns:\n\n```python\ngenerator = ColumnGenerator(\n    columns=[\"feature_1\", \"feature_2\", \"feature_3\"],\n    generators=[\n        MeanGenerator(),\n        StdGenerator(),\n        NullCountGenerator()\n    ]\n)\n```\n\n资料来源：[src/evidently/generators/column.py:1-60]()\n\n### Generator Configuration\n\nGenerators can be configured with:\n\n| Parameter | Type | Description |\n|-----------|------|-------------|\n| `columns` | List[str] | Target columns for generation |\n| `include_missing` | bool | Include columns with missing values |\n| `exclude_patterns` | List[str] | Regex patterns for column exclusion |\n| `generators` | List[Generator] | Specific generators to apply |\n\n资料来源：[src/evidently/generators/__init__.py:1-50]()\n\n## Bound Tests and Thresholds\n\nBound tests attach threshold configurations to metrics, enabling automated pass/fail determinations based on metric results.\n\n### Test Binding Pattern\n\nTests are bound to metrics through the registry system:\n\n```python\nclass BoundTest:\n    def __init__(self, metric: MetricId, threshold: TestThreshold):\n        self.metric = metric\n        self.threshold = threshold\n    \n    def evaluate(self, metric_result: MetricResult) -> TestResult:\n        # Compare metric value against threshold\n        pass\n```\n\n资料来源：[src/evidently/core/registries/bound_tests.py:1-50]()\n\n### Threshold Types\n\nThe framework supports multiple threshold configuration types:\n\n| Threshold Type | Description | Use Case |\n|---------------|-------------|----------|\n| `absolute` | Fixed value comparison | Fixed acceptable ranges |\n| `relative` | Percentage-based comparison | Drift detection |\n| `sigma` | Standard deviation bounds | Anomaly detection |\n| `quantile` | Percentile-based thresholds | Distribution extremes 
|\n\n资料来源：[src/evidently/core/registries/bound_tests.py:50-100]()\n\n## Configuration and Versioning\n\nThe extensibility system includes robust configuration management with versioning support for metrics and descriptors.\n\n### Config Version Model\n\nConfigurations are versioned to track changes over time:\n\n```python\n@dataclass\nclass ConfigVersion:\n    id: STR_UUID\n    artifact_id: STR_UUID\n    version: int\n    content: Any\n    metadata: ConfigVersionMetadata\n```\n\n资料来源：[src/evidently/sdk/configs.py:1-80]()\n\n### Metadata Structure\n\n| Field | Type | Description |\n|-------|------|-------------|\n| `created_at` | datetime | Version creation timestamp |\n| `updated_at` | datetime | Last modification timestamp |\n| `author` | str | User who created/modified |\n| `comment` | str | Change description |\n\n资料来源：[src/evidently/sdk/adapters.py:1-100]()\n\n## Workflow Diagram\n\nThe complete extensibility workflow from definition to execution:\n\n```mermaid\ngraph TD\n    A[Define Custom Metric/Feature] --> B[Register in Registry]\n    B --> C[Create Dataset with Descriptors]\n    C --> D[Generate Metrics from Container]\n    D --> E[Calculate Results]\n    E --> F[Bind Tests with Thresholds]\n    F --> G[Evaluate Pass/Fail]\n    G --> H[Render Widgets]\n    H --> I[Display in Report/Dashboard]\n```\n\n## Best Practices\n\n### Performance Considerations\n\n- **Caching**: Use the container fingerprinting mechanism to cache generated metrics and avoid redundant calculations\n- **Lazy Evaluation**: Implement metrics using lazy evaluation patterns when possible to defer expensive computations\n- **Batch Processing**: Design features to operate on pandas Series rather than individual values for vectorized performance\n\n### Extensibility Guidelines\n\n1. **Consistent Naming**: Follow the established naming conventions for custom implementations\n2. **Type Hints**: Include comprehensive type hints for all public interfaces\n3. 
**Documentation**: Document parameters and return values using docstring conventions\n4. **Testing**: Create unit tests for custom metric calculation logic\n\n### Integration Points\n\nCustom extensions integrate with the framework through several standardized interfaces:\n\n- **Metric Protocol**: Implement the `calculate(self, context)` method\n- **Descriptor Protocol**: Implement the `apply(self, data)` method\n- **Widget Protocol**: Return `List[BaseWidgetInfo]` from `render()` method\n- **Test Protocol**: Implement the `evaluate(self, result)` method for bound tests\n\n## API Reference\n\n### Key Classes and Functions\n\n| Class/Function | Module | Purpose |\n|---------------|--------|---------|\n| `MetricContainer` | `evidently.core.container` | Base class for metric containers |\n| `Descriptor` | `evidently.legacy.features` | Base class for features |\n| `ColumnGenerator` | `evidently.generators.column` | Column-based metric generation |\n| `BoundTest` | `evidently.core.registries` | Threshold-bound test configuration |\n| `ConfigVersion` | `evidently.sdk.configs` | Versioned configuration storage |\n\n### Context Management\n\nThe `Context` object provides access to datasets and configuration during metric calculation:\n\n```python\nclass Context:\n    def metrics_container(self, fingerprint: str) -> Optional[List[MetricOrContainer]]:\n        \"\"\"Retrieve cached metrics for container fingerprint.\"\"\"\n    \n    def set_metric_container_data(\n        self, \n        fingerprint: str, \n        metrics: List[MetricOrContainer]\n    ) -> None:\n        \"\"\"Store generated metrics in cache.\"\"\"\n```\n\n资料来源：[src/evidently/core/container.py:50-80]()\n\n## Summary\n\nEvidently's extensibility system provides a comprehensive framework for customizing evaluation logic. The registry-based architecture enables dynamic discovery of custom implementations while maintaining consistency with built-in features. 
Custom metrics and features extend the base classes to implement domain-specific logic, and the generator system automates repetitive metric definitions. The bound test mechanism attaches configurable thresholds to metrics for automated validation, making the system suitable for both exploratory analysis and production monitoring scenarios.\n\n---\n\n<a id='ui-service'></a>\n\n## UI Service Backend\n\n<details>\n<summary>相关源码文件</summary>\n\n以下源码文件用于生成本页说明：\n\n- [src/evidently/ui/workspace.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/ui/workspace.py)\n- [src/evidently/sdk/adapters.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/sdk/adapters.py)\n- [src/evidently/sdk/configs.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/sdk/configs.py)\n- [src/evidently/ui/utils.py](https://github.com/evidentlyai/evidently/blob/main/src/evidently/ui/utils.py)\n- [ui/packages/evidently-ui-lib/src/components/Project/ProjectCard.tsx](https://github.com/evidentlyai/evidently/blob/main/ui/packages/evidently-ui-lib/src/components/Project/ProjectCard.tsx)\n- [ui/packages/evidently-ui-lib/src/components/Prompts/PromptsTable.tsx](https://github.com/evidentlyai/evidently/blob/main/ui/packages/evidently-ui-lib/src/components/Prompts/PromptsTable.tsx)\n- [ui/packages/evidently-ui-lib/src/components/Traces/TracesTable.tsx](https://github.com/evidentlyai/evidently/blob/main/ui/packages/evidently-ui-lib/src/components/Traces/TracesTable.tsx)\n- [ui/packages/evidently-ui-lib/src/components/Utils/NameAndDescriptionForm.tsx](https://github.com/evidentlyai/evidently/blob/main/ui/packages/evidently-ui-lib/src/components/Utils/NameAndDescriptionForm.tsx)\n</details>\n\n# UI Service Backend\n\nThe Evidently UI Service Backend is a FastAPI-based REST API layer that powers the Evidently web interface, providing endpoints for managing projects, datasets, prompts, and trace data. 
It serves as the bridge between the React-based frontend and the Evidently SDK's core functionality.\n\n## Architecture Overview\n\nThe UI Service Backend follows a layered architecture:\n\n```mermaid\ngraph TD\n    A[Frontend: React/TypeScript] --> B[UI Service Backend: FastAPI]\n    B --> C[Evidently SDK]\n    C --> D[Workspace Abstraction]\n    C --> E[Cloud Config API]\n    D --> F[(Local Storage)]\n    E --> G[(Remote Backend)]\n```\n\n### Component Stack\n\n| Layer | Technology | Purpose |\n|-------|------------|---------|\n| Frontend | React, TypeScript, MUI | User interface components |\n| Backend API | FastAPI/Python | REST API endpoints |\n| Business Logic | Evidently SDK | Core evaluation logic |\n| Data Layer | Local FS / Remote API | Persistence |\n\n## Core Data Models\n\n### Project Model\n\nProjects are the primary organizational unit in the UI. The `ProjectModel` represents:\n\n```python\nProjectModel:\n    - name: str\n    - description: Optional[str]\n    - org_id: Optional[OrgID]\n```\n\n资料来源：[src/evidently/ui/workspace.py:50-55]()\n\n### Config API Abstraction\n\nThe `ConfigAPI` class provides a generic interface for managing versioned configurations:\n\n| Method | Purpose |\n|--------|---------|\n| `create_config()` | Create a new configuration |\n| `get_config()` | Retrieve a configuration by ID |\n| `list_configs()` | List all configurations |\n| `update_config()` | Update an existing configuration |\n| `delete_config()` | Delete a configuration |\n| `add_version()` | Add a new version to a config |\n| `get_version()` | Get a specific version |\n\n资料来源：[src/evidently/sdk/configs.py:95-145]()\n\n### Descriptor Configuration\n\nThe `Descriptor` type is used to store and version descriptor configurations:\n\n```python\nDescriptorConfigAPI:\n    - add_descriptor() -> ConfigVersion\n    - get_descriptor() -> Descriptor\n```\n\n资料来源：[src/evidently/sdk/configs.py:165-180]()\n\n## Workspace Abstraction\n\nThe `Workspace` abstract class defines the contract 
for project management:\n\n```mermaid\ngraph TD\n    A[Workspace] --> B[LocalWorkspace]\n    A --> C[CloudWorkspace]\n    A --> D[RemoteWorkspace]\n```\n\n### Abstract Methods\n\n| Method | Parameters | Returns | Description |\n|--------|------------|---------|-------------|\n| `create_project()` | name, description, org_id | Project | Creates a new project |\n| `add_project()` | project: ProjectModel, org_id | Project | Adds project to workspace |\n| `get_project()` | project_id: STR_UUID | Optional[Project] | Retrieves project by ID |\n| `delete_project()` | project_id: STR_UUID | None | Removes project from workspace |\n| `list_projects()` | org_id: Optional | Sequence[Project] | Lists all projects |\n\n资料来源：[src/evidently/ui/workspace.py:40-80]()\n\n## Frontend Components\n\n### Project Card Component\n\nThe `ProjectCard` component handles project display and editing:\n\n```typescript\nProjectCardProps:\n    - project: Project\n    - disabled?: boolean\n    - onEditProject: (args: { name: string; description: string }) => void\n    - LinkToProject: ComponentType\n```\n\nFeatures:\n- Toggle between view and edit modes\n- Uses `EditProjectInfoForm` for inline editing\n- Displays `ProjectInfoCard` in view mode\n\n资料来源：[ui/packages/evidently-ui-lib/src/components/Project/ProjectCard.tsx:60-80]()\n\n### Prompts Table\n\nThe `PromptsTable` component renders a sortable, paginated list of prompts:\n\n| Column | Render | Features |\n|--------|--------|----------|\n| ID | ID with copy button | `TextWithCopyIcon` component |\n| Name | Truncated text (max 200px) | Typography component |\n| Created at | Date formatted | `dayjs` locale formatting |\n| Actions | Link + Delete button | Edit/delete operations |\n\n资料来源：[ui/packages/evidently-ui-lib/src/components/Prompts/PromptsTable.tsx:40-75]()\n\n### Traces Table\n\nThe `TracesTable` component displays trace data with extended metadata:\n\n| Column | Features |\n|--------|----------|\n| Tags | `HidedTags` component, 250px min width 
|\n| Metadata | `JsonViewThemed` with clipboard support |\n| Type | Chip showing trace origin |\n| Created at | Sortable date column |\n| Actions | Dataset link, edit dialog, delete |\n\n资料来源：[ui/packages/evidently-ui-lib/src/components/Traces/TracesTable.tsx:50-90]()\n\n## API Endpoint Structure\n\n### Projects API\n\n```\n/projects\n  ├── GET    /              - List all projects\n  ├── POST   /              - Create new project\n  └── GET    /{project_id}  - Get project details\n\n/projects/{project_id}/\n  ├── prompts/\n  │   ├── GET    /          - List prompts\n  │   ├── POST   /          - Create prompt\n  │   └── GET    /{id}      - Get prompt details\n  ├── datasets/\n  │   ├── GET    /          - List datasets\n  │   └── POST   /          - Create dataset\n  └── traces/\n      ├── GET    /          - List traces\n      └── POST   /          - Create trace\n```\n\n### Prompts API\n\n| Endpoint | Method | Purpose |\n|----------|--------|---------|\n| `/prompts` | GET | List all prompts for a project |\n| `/prompts` | POST | Create a new prompt |\n| `/prompts/{id}` | GET | Get prompt details |\n\n资料来源：[ui/service/src/routes/.../index-prompts-list/index-prompts-list-main.tsx:40-60]()\n\n## Configuration Management\n\n### Artifact Version Management\n\nThe `ArtifactConfigAPI` handles versioned artifact storage:\n\n```python\nArtifactConfigAPI:\n    - create_version()      # Create new artifact version\n    - list_versions()       # List all versions\n    - get_version()         # Get specific version\n    - get_version_by_id()   # Get by version ID\n```\n\n资料来源：[src/evidently/sdk/adapters.py:45-70]()\n\n### Version Conversion\n\nBidirectional conversion between SDK and API models:\n\n```mermaid\ngraph LR\n    A[ArtifactVersion] -->|convert| B[ConfigVersion]\n    B -->|convert| A\n```\n\nMethods:\n- `_artifact_version_to_config_version()` - SDK to API\n- `_config_version_to_artifact_version()` - API to SDK\n\n## UI Utilities\n\n### HTML Link Templates\n\nThe `utils.py` 
module provides HTML templates for dashboard rendering:\n\n```python\nHTML_LINK_WITH_ID_TEMPLATE   # Link with button and ID display\nFILE_LINK_WITH_ID_TEMPLATE   # File link with ID\nRUNNING_SERVICE_LINK_TEMPLATE # Service link with label\n```\n\n资料来源：[src/evidently/ui/utils.py:30-55]()\n\n## Workflow: Project Lifecycle\n\n```mermaid\ngraph TD\n    A[Create Project] --> B[Add Datasets]\n    B --> C[Create Prompts]\n    C --> D[Run Evals]\n    D --> E[Store Traces]\n    E --> F[View Results]\n    F --> G[Monitor]\n    \n    A -.->|via| H[Workspace.add_project]\n    C -.->|via| I[Prompts API]\n    E -.->|via| J[Traces API]\n```\n\n## Configuration Options\n\n| Option | Type | Default | Description |\n|--------|------|---------|-------------|\n| `org_id` | UUID | None | Organization identifier |\n| `project_id` | UUID | Auto | Project identifier |\n| `version` | int | \"latest\" | Config version selector |\n\n## Key SDK Classes\n\n| Class | File | Responsibility |\n|-------|------|-----------------|\n| `ConfigAPI` | configs.py | Generic config CRUD operations |\n| `DescriptorConfigAPI` | configs.py | Descriptor-specific operations |\n| `CloudConfigAPI` | configs.py | Remote backend communication |\n| `Workspace` | workspace.py | Abstract workspace interface |\n| `ProjectModel` | workspace.py | Project data structure |\n\n## Technology Stack\n\n| Component | Technology | Version |\n|-----------|------------|---------|\n| Backend Framework | FastAPI | - |\n| SDK Core | Python | 3.11+ |\n| Frontend Framework | React | - |\n| UI Components | MUI | - |\n| State Management | React Hooks | - |\n| Date Handling | dayjs | - |\n\n## Development Workflow\n\n### Running the Service\n\n```bash\n# In ui/service folder\npnpm dev\n```\n\n### Code Quality\n\n```bash\n# In ui folder\npnpm code-check          # Format, sort imports, lint\npnpm code-check --fix    # Apply fixes automatically\n```\n\n资料来源：[ui/README.md:15-25]()\n\n### Building\n\n```bash\npnpm build\n```\n\n## 
Summary\n\nThe Evidently UI Service Backend provides:\n\n1. **REST API layer** for frontend-backend communication\n2. **Project management** via Workspace abstraction\n3. **Versioned configuration storage** using ConfigAPI\n4. **Tracing support** for evaluation history\n5. **Prompts management** for LLM-powered evaluations\n\nThe architecture cleanly separates concerns between the FastAPI backend, Evidently SDK business logic, and React frontend components, enabling modular development and testing.\n\n---\n\n---\n\n## Doramagic 踩坑日志\n\n项目：evidentlyai/evidently\n\n摘要：发现 19 个潜在踩坑项，其中 4 个为 high/blocking；最高优先级：安装坑 - 来源证据：Update scikit-learn version requirement to support v1.6.0。\n\n## 1. 安装坑 · 来源证据：Update scikit-learn version requirement to support v1.6.0\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Update scikit-learn version requirement to support v1.6.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e82e24d201134e7f8dfa0b564d73fce8 | https://github.com/evidentlyai/evidently/issues/1407 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 2. 配置坑 · 来源证据：PromptOptimizer throws OpenAIError when using Vertex AI judge\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：PromptOptimizer throws OpenAIError when using Vertex AI judge\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_abf6f9b183ff4892be78fd198f60d8e8 | https://github.com/evidentlyai/evidently/issues/1856 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 
运行坑 · 来源证据：IndexError in infer_column_type when column contains only null values\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：IndexError in infer_column_type when column contains only null values\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_44cdd5f2b657432eb8418fb132959ada | https://github.com/evidentlyai/evidently/issues/1764 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 4. 安全/权限坑 · 来源证据：Update evidently hashlib usage for FIPS-Compliant Systems and Security Best Practices\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Update evidently hashlib usage for FIPS-Compliant Systems and Security Best Practices\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_872f2b57cd3a4c61ad5c387c8be3208b | https://github.com/evidentlyai/evidently/issues/1410 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 安装坑 · 来源证据：Numpy 2.x support?\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Numpy 2.x support?\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_2fc84415b55a43d2be4f7f47cf46d7a1 | https://github.com/evidentlyai/evidently/issues/1557 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：v0.7.12\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.12\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fcf994a3bcf94b0d9b2c5d6159816728 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.12 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 7. 
安装坑 · 来源证据：v0.7.15\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.15\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_a9de117bd8e440beab9f7be4af50d710 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.15 | 来源讨论提到 linux 相关条件，需在安装/试用前复核。\n\n## 8. 安装坑 · 来源证据：v0.7.20\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.20\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f0d67058bf2a40218adfb0eea1f1665f | https://github.com/evidentlyai/evidently/releases/tag/v0.7.20 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 9. 能力坑 · 来源证据：v0.7.19\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.7.19\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_52e6ab37edca4901a0aa9fe7b5ba12f1 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.19 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 10. 能力坑 · 来源证据：v0.7.21\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.7.21\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fe9f0e6e29564afeb5610928cc9bc834 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.21 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:315977578 | https://github.com/evidentlyai/evidently | README/documentation is current enough for a first validation pass.\n\n## 12. 
维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:315977578 | https://github.com/evidentlyai/evidently | last_activity_observed missing\n\n## 13. 安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:315977578 | https://github.com/evidentlyai/evidently | no_demo; severity=medium\n\n## 14. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:315977578 | https://github.com/evidentlyai/evidently | no_demo; severity=medium\n\n## 15. 安全/权限坑 · 来源证据：Protect this repo from AI-generated PRs\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Protect this repo from AI-generated PRs\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_cfcf13543c3941e88fd7c5f527f95dc3 | https://github.com/evidentlyai/evidently/issues/1879 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 16. 安全/权限坑 · 来源证据：v0.7.17\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.7.17\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_305e0831e389460096b4d4b908731b41 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.17 | 来源讨论提到 api key 相关条件，需在安装/试用前复核。\n\n## 17. 
安全/权限坑 · 来源证据：v0.7.18\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.7.18\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_19a463a1bb2a4fb2bd5bd894c914cc36 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.18 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 18. 维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:315977578 | https://github.com/evidentlyai/evidently | issue_or_pr_quality=unknown\n\n## 19. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:315977578 | https://github.com/evidentlyai/evidently | release_recency=unknown\n\n<!-- canonical_name: evidentlyai/evidently; human_manual_source: deepwiki_human_wiki -->\n",
      "summary": "DeepWiki/Human Wiki 完整输出，末尾追加 Discovery Agent 踩坑日志。",
      "title": "Human Manual / 人类版说明书"
    },
    "pitfall_log": {
      "asset_id": "pitfall_log",
      "filename": "PITFALL_LOG.md",
      "markdown": "# Pitfall Log / 踩坑日志\n\n项目：evidentlyai/evidently\n\n摘要：发现 19 个潜在踩坑项，其中 4 个为 high/blocking；最高优先级：安装坑 - 来源证据：Update scikit-learn version requirement to support v1.6.0。\n\n## 1. 安装坑 · 来源证据：Update scikit-learn version requirement to support v1.6.0\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Update scikit-learn version requirement to support v1.6.0\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_e82e24d201134e7f8dfa0b564d73fce8 | https://github.com/evidentlyai/evidently/issues/1407 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 2. 配置坑 · 来源证据：PromptOptimizer throws OpenAIError when using Vertex AI judge\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个配置相关的待验证问题：PromptOptimizer throws OpenAIError when using Vertex AI judge\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_abf6f9b183ff4892be78fd198f60d8e8 | https://github.com/evidentlyai/evidently/issues/1856 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 3. 运行坑 · 来源证据：IndexError in infer_column_type when column contains only null values\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个运行相关的待验证问题：IndexError in infer_column_type when column contains only null values\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_44cdd5f2b657432eb8418fb132959ada | https://github.com/evidentlyai/evidently/issues/1764 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 4. 
安全/权限坑 · 来源证据：Update evidently hashlib usage for FIPS-Compliant Systems and Security Best Practices\n\n- 严重度：high\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Update evidently hashlib usage for FIPS-Compliant Systems and Security Best Practices\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_872f2b57cd3a4c61ad5c387c8be3208b | https://github.com/evidentlyai/evidently/issues/1410 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 5. 安装坑 · 来源证据：Numpy 2.x support?\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：Numpy 2.x support?\n- 对用户的影响：可能阻塞安装或首次运行。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_2fc84415b55a43d2be4f7f47cf46d7a1 | https://github.com/evidentlyai/evidently/issues/1557 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 6. 安装坑 · 来源证据：v0.7.12\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.12\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fcf994a3bcf94b0d9b2c5d6159816728 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.12 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 7. 安装坑 · 来源证据：v0.7.15\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.15\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_a9de117bd8e440beab9f7be4af50d710 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.15 | 来源讨论提到 linux 相关条件，需在安装/试用前复核。\n\n## 8. 
安装坑 · 来源证据：v0.7.20\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安装相关的待验证问题：v0.7.20\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_f0d67058bf2a40218adfb0eea1f1665f | https://github.com/evidentlyai/evidently/releases/tag/v0.7.20 | 来源讨论提到 docker 相关条件，需在安装/试用前复核。\n\n## 9. 能力坑 · 来源证据：v0.7.19\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.7.19\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_52e6ab37edca4901a0aa9fe7b5ba12f1 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.19 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 10. 能力坑 · 来源证据：v0.7.21\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个能力理解相关的待验证问题：v0.7.21\n- 对用户的影响：可能增加新用户试用和生产接入成本。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_fe9f0e6e29564afeb5610928cc9bc834 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.21 | 来源讨论提到 python 相关条件，需在安装/试用前复核。\n\n## 11. 能力坑 · 能力判断依赖假设\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：README/documentation is current enough for a first validation pass.\n- 对用户的影响：假设不成立时，用户拿不到承诺的能力。\n- 建议检查：将假设转成下游验证清单。\n- 防护动作：假设必须转成验证项；没有验证结果前不能写成事实。\n- 证据：capability.assumptions | github_repo:315977578 | https://github.com/evidentlyai/evidently | README/documentation is current enough for a first validation pass.\n\n## 12. 维护坑 · 维护活跃度未知\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：未记录 last_activity_observed。\n- 对用户的影响：新项目、停更项目和活跃项目会被混在一起，推荐信任度下降。\n- 建议检查：补 GitHub 最近 commit、release、issue/PR 响应信号。\n- 防护动作：维护活跃度未知时，推荐强度不能标为高信任。\n- 证据：evidence.maintainer_signals | github_repo:315977578 | https://github.com/evidentlyai/evidently | last_activity_observed missing\n\n## 13. 
安全/权限坑 · 下游验证发现风险项\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：下游已经要求复核，不能在页面中弱化。\n- 建议检查：进入安全/权限治理复核队列。\n- 防护动作：下游风险存在时必须保持 review/recommendation 降级。\n- 证据：downstream_validation.risk_items | github_repo:315977578 | https://github.com/evidentlyai/evidently | no_demo; severity=medium\n\n## 14. 安全/权限坑 · 存在评分风险\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：no_demo\n- 对用户的影响：风险会影响是否适合普通用户安装。\n- 建议检查：把风险写入边界卡，并确认是否需要人工复核。\n- 防护动作：评分风险必须进入边界卡，不能只作为内部分数。\n- 证据：risks.scoring_risks | github_repo:315977578 | https://github.com/evidentlyai/evidently | no_demo; severity=medium\n\n## 15. 安全/权限坑 · 来源证据：Protect this repo from AI-generated PRs\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：Protect this repo from AI-generated PRs\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源问题仍为 open，Pack Agent 需要复核是否仍影响当前版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_cfcf13543c3941e88fd7c5f527f95dc3 | https://github.com/evidentlyai/evidently/issues/1879 | 来源类型 github_issue 暴露的待验证使用条件。\n\n## 16. 安全/权限坑 · 来源证据：v0.7.17\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.7.17\n- 对用户的影响：可能影响授权、密钥配置或安全边界。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_305e0831e389460096b4d4b908731b41 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.17 | 来源讨论提到 api key 相关条件，需在安装/试用前复核。\n\n## 17. 安全/权限坑 · 来源证据：v0.7.18\n\n- 严重度：medium\n- 证据强度：source_linked\n- 发现：GitHub 社区证据显示该项目存在一个安全/权限相关的待验证问题：v0.7.18\n- 对用户的影响：可能影响升级、迁移或版本选择。\n- 建议检查：来源显示可能已有修复、规避或版本变化，说明书中必须标注适用版本。\n- 防护动作：不得脱离来源链接放大为确定性结论；需要标注适用版本和复核状态。\n- 证据：community_evidence:github | cevd_19a463a1bb2a4fb2bd5bd894c914cc36 | https://github.com/evidentlyai/evidently/releases/tag/v0.7.18 | 来源类型 github_release 暴露的待验证使用条件。\n\n## 18. 
维护坑 · issue/PR 响应质量未知\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：issue_or_pr_quality=unknown。\n- 对用户的影响：用户无法判断遇到问题后是否有人维护。\n- 建议检查：抽样最近 issue/PR，判断是否长期无人处理。\n- 防护动作：issue/PR 响应未知时，必须提示维护风险。\n- 证据：evidence.maintainer_signals | github_repo:315977578 | https://github.com/evidentlyai/evidently | issue_or_pr_quality=unknown\n\n## 19. 维护坑 · 发布节奏不明确\n\n- 严重度：low\n- 证据强度：source_linked\n- 发现：release_recency=unknown。\n- 对用户的影响：安装命令和文档可能落后于代码，用户踩坑概率升高。\n- 建议检查：确认最近 release/tag 和 README 安装命令是否一致。\n- 防护动作：发布节奏未知或过期时，安装说明必须标注可能漂移。\n- 证据：evidence.maintainer_signals | github_repo:315977578 | https://github.com/evidentlyai/evidently | release_recency=unknown\n",
      "summary": "用户实践前最可能遇到的身份、安装、配置、运行和安全坑。",
      "title": "Pitfall Log / 踩坑日志"
    },
    "prompt_preview": {
      "asset_id": "prompt_preview",
      "filename": "PROMPT_PREVIEW.md",
      "markdown": "# evidently - Prompt Preview\n\n> Copy the prompt below into your AI host before installing anything.\n> Its purpose is to let you safely feel the project's workflow, not to claim the project has already run.\n\n## Copy this prompt\n\n```text\nYou are using an independent Doramagic capability pack for evidentlyai/evidently.\n\nProject:\n- Name: evidently\n- Repository: https://github.com/evidentlyai/evidently\n- Summary: Evidently is an open-source ML and LLM observability framework. Evaluate, test, and monitor any AI-powered system or data pipeline. From tabular data to Gen AI. 100+ metrics.\n- Host target: local_cli\n\nGoal:\nHelp me evaluate this project for the following task without installing it yet: Evidently is an open-source ML and LLM observability framework. Evaluate, test, and monitor any AI-powered system or data pipeline. From tabular data to Gen AI. 100+ metrics.\n\nBefore taking action:\n1. Restate my task, success standard, and boundary.\n2. Identify whether the next step requires tools, browser access, network access, filesystem access, credentials, package installation, or host configuration.\n3. Use only the Doramagic Project Pack, the upstream repository, and the source-linked evidence listed below.\n4. If a real command, install step, API call, file write, or host integration is required, mark it as \"requires post-install verification\" and ask for approval first.\n5. If evidence is missing, say \"evidence is missing\" instead of filling the gap.\n\nPreviewable capabilities:\n- Capability 1: Evidently is an open-source ML and LLM observability framework. Evaluate, test, and monitor any AI-powered system or data pipeline. From tabular data to Gen AI. 100+ metrics.\n\nCapabilities that require post-install verification:\n- Capability 1: Use the source-backed project context to guide one small, checkable workflow step.\n\nCore service flow:\n1. arch-overview: Architecture Overview. 
Produce one small intermediate artifact and wait for confirmation.\n2. core-components: Core Components. Produce one small intermediate artifact and wait for confirmation.\n3. data-flow: Data Management and Data Flow. Produce one small intermediate artifact and wait for confirmation.\n4. ml-evaluation: ML Model Evaluation. Produce one small intermediate artifact and wait for confirmation.\n5. llm-evals: LLM Evaluation and Judging. Produce one small intermediate artifact and wait for confirmation.\n\nSource-backed evidence to keep in mind:\n- https://github.com/evidentlyai/evidently\n- https://github.com/evidentlyai/evidently#readme\n- src/evidently/__init__.py\n- src/evidently/_registry.py\n- src/evidently/sdk/__init__.py\n- src/evidently/legacy/__init__.py\n- src/evidently/future/__init__.py\n- src/evidently/core/base_types.py\n- src/evidently/core/metric_types.py\n- src/evidently/core/report.py\n\nFirst response rules:\n1. Start Step 1 only.\n2. Explain the one service action you will perform first.\n3. Ask exactly three questions about my target workflow, success standard, and sandbox boundary.\n4. Stop and wait for my answers.\n\nStep 1 follow-up protocol:\n- After I answer the first three questions, stay in Step 1.\n- Produce six parts only: clarified task, success standard, boundary conditions, two or three options, tradeoffs for each option, and one recommendation.\n- End by asking whether I confirm the recommendation.\n- Do not move to Step 2 until I explicitly confirm.\n\nConversation rules:\n- Advance one step at a time and wait for confirmation after each small artifact.\n- Write outputs as recommendations or planned checks, not as completed execution.\n- Do not claim tests passed, files changed, commands ran, APIs were called, or the project was installed.\n- If the user asks for execution, first provide the sandbox setup, expected output, rollback, and approval checkpoint.\n```\n",
      "summary": "不安装项目也能感受能力节奏的安全试用 Prompt。",
      "title": "Prompt Preview / 安装前试用 Prompt"
    },
    "quick_start": {
      "asset_id": "quick_start",
      "filename": "QUICK_START.md",
      "markdown": "# Quick Start / 官方入口\n\n项目：evidentlyai/evidently\n\n## 官方安装入口\n\n### Python / pip · 官方安装入口\n\n```bash\npip install evidently\n```\n\n来源：https://github.com/evidentlyai/evidently#readme\n\n## 来源\n\n- repo: https://github.com/evidentlyai/evidently\n- docs: https://github.com/evidentlyai/evidently#readme\n",
      "summary": "从项目官方 README 或安装文档提取的开工入口。",
      "title": "Quick Start / 官方入口"
    }
  },
  "validation_id": "dval_fa1c64f5f5114e2b94febefe6ae8510a"
}
