#!/usr/bin/env python3
"""AI Review 准确率自动评估 (#259 D4-auto-eval) — 替代人工抽样标注。

为什么有这个：人工抽样违反"能不用人做就不用"原则（AI-first，CLAUDE.md
「持续优化 AI-first 工作方式」）。本脚本用 4 个客观 proxy 指标自动评估
review 质量，输出综合分给 #171 转正决策用。

四个指标：
  1. 采纳率（adoption_rate）：finding 在下轮 review 消失 / 总首次出现 finding
     语义：finding 是否真有价值（作者修了说明命中真问题）
     用 stable_id 优先匹配（schema 化后），无 ID 用 (category, msg 前 60 字) fingerprint

  2. 漏检率（miss_rate）：verdict=pass 后 14 天内同时间窗口出现的 hotfix/fix PR 比例
     语义：review 当时是否漏了真 bug
     简化 proxy：不细到模块级，接受 noise（团队整体趋势更重要）

  3. block 合理性（block_legit_rate）：verdict=block 后续轮 review 降级为 pass/_risk 的比例
     语义：block 是不是真问题（被修了就降级；没修说明 AI 误报或被忽略）

  4. 自洽率（consistency_rate）：verdict 与 findings.severity 是否符合 schema 约定
     - verdict=block 必有 hard_block finding，反之亦然
     - verdict=needs_fix / should_fix 必有 risk 或 hard_block finding
     语义：review 内部逻辑一致性

综合分 = 加权平均（采纳 30% + 自洽 30% + block 合理 25% + 漏检率反向 15%）
门槛 70%（继承 #171 验收清单精神）。

用法：
    python3 scripts/ops/ai-review-auto-eval.py                  # 近 30 天
    python3 scripts/ops/ai-review-auto-eval.py --since-days 14
    python3 scripts/ops/ai-review-auto-eval.py --print          # stdout
"""

from __future__ import annotations

import argparse
import importlib.util
import json
import re
import sys
from collections import defaultdict
from datetime import datetime, timedelta
from pathlib import Path

# Repository root: two directory levels above the directory holding this
# script (the usage examples place the script under scripts/ops/).
REPO_ROOT = Path(__file__).resolve().parents[2]
# Directory the Markdown report is written to when --out is not given.
REPORTS_DIR = REPO_ROOT / "testing" / "reports"

# Composite-score weights — an engineering judgement of each metric's importance.
SCORE_WEIGHTS = {
    "adoption": 0.30,       # high: findings being genuinely useful is the core value of a review
    "consistency": 0.30,    # high: internal contradictions indicate a prompt problem
    "block_legit": 0.25,    # medium: block accuracy is still a focus of #171
    "miss_inverse": 0.15,   # low: the proxy is noisy (not module-level), so it is weighted down
}
# Composite score (0..1) at or above which the report states the target is met.
SCORE_THRESHOLD = 0.70

# Matches PR titles that start with "fix(" or "hotfix(", case-insensitively.
# NOTE(review): scope-less titles such as "fix: ..." do not match — confirm
# that this is intended.
HOTFIX_TITLE_RE = re.compile(r"^(fix|hotfix)\(", re.IGNORECASE)


# ---------- importlib 复用 ai-review-stats.py（文件名带连字符，只能这样 import）----------
# 待第 4 个消费者出现时考虑抽 _ai_review_common.py module（当前 2 个消费者 ROI 不划算）

def _load_stats():
    """Load the sibling ``ai-review-stats.py`` script as a module and return it.

    The file name contains hyphens, so it cannot be named in a regular
    ``import`` statement; it is loaded by path and registered in
    ``sys.modules`` under the name ``_ai_review_stats``.

    Returns:
        The executed module object.

    Raises:
        RuntimeError: If no import spec or loader can be created for the file.
        Exception: Anything raised while executing the script is propagated
            unchanged. Before it propagates, the ``sys.modules`` entry is
            rolled back so no half-initialised module stays registered.
    """
    name = "_ai_review_stats"
    spec = importlib.util.spec_from_file_location(
        name, Path(__file__).parent / "ai-review-stats.py"
    )
    if spec is None or spec.loader is None:
        raise RuntimeError("无法加载 ai-review-stats.py")
    mod = importlib.util.module_from_spec(spec)
    # Python 3.12 pitfall: the @dataclass decorator resolves the defining
    # module through sys.modules.get(cls.__module__).__dict__ and fails with
    # "'NoneType' has no attribute '__dict__'" when the module is not
    # registered. It therefore has to be registered before exec_module runs.
    # See .learnings/ERRORS/ERR-20260511-001.md
    previous = sys.modules.get(name)
    sys.modules[name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Undo the early registration so a broken module cannot be picked up
        # later by anything looking the name up in sys.modules.
        if previous is None:
            sys.modules.pop(name, None)
        else:
            sys.modules[name] = previous
        raise
    return mod


# Executed at import time: load ai-review-stats.py once and bind the helpers
# this script reuses to module-level names.
stats = _load_stats()
Api = stats.Api
paginate = stats.paginate
get_token = stats.get_token
detect_repo = stats.detect_repo
parse_review = stats.parse_review
collect_ai_reviews_for_pr = stats.collect_ai_reviews_for_pr
collect_prs_in_window = stats.collect_prs_in_window
_safe_parse = stats._safe_parse


# ---------- 数据采集 ----------

def collect(api: "Api", days: int) -> tuple[list, list[dict], list]:
    """Gather the raw inputs for the metrics from the API and the local repo.

    Args:
        api: Client object passed through to the ``ai-review-stats`` helpers.
        days: Size of the look-back window in days, counted back from now.

    Returns:
        A ``(pr_with_reviews, hotfix_prs, err_dates)`` tuple:

        * ``pr_with_reviews`` — ``(pr, reviews)`` pairs for every PR in the
          window for which ``collect_ai_reviews_for_pr`` returned something
          truthy.
        * ``hotfix_prs`` — PRs in the window whose title matches
          ``HOTFIX_TITLE_RE`` and that have a truthy ``merged_at``.
        * ``err_dates`` — one timezone-aware datetime (local midnight) per
          ``ERR-YYYYMMDD-NNN.md`` file under ``.learnings/ERRORS/``. These are
          not restricted to the window.

    Progress counters are printed to stderr.
    """
    window_start = datetime.now().astimezone() - timedelta(days=days)
    prs = collect_prs_in_window(api, window_start)
    print(f"  窗口内活跃 PR: {len(prs)}", file=sys.stderr)

    # Merged PRs whose title looks like a fix/hotfix.
    hotfix_prs = [
        pr
        for pr in prs
        if HOTFIX_TITLE_RE.match(pr.get("title") or "") and pr.get("merged_at")
    ]

    # Keep only PRs that actually received at least one AI review.
    pr_with_reviews = []
    for pr in prs:
        reviews = collect_ai_reviews_for_pr(api, pr.get("number"))
        if reviews:
            pr_with_reviews.append((pr, reviews))
    print(f"  其中跑过 AI Review: {len(pr_with_reviews)} | hotfix 合并: {len(hotfix_prs)}", file=sys.stderr)

    # The date of an ERR note is encoded in its file name: ERR-YYYYMMDD-NNN.md
    err_dates = []
    err_dir = REPO_ROOT / ".learnings" / "ERRORS"
    name_pattern = re.compile(r"ERR-(\d{4})(\d{2})(\d{2})-\d+\.md")
    note_paths = err_dir.glob("ERR-*.md") if err_dir.exists() else ()
    for note_path in note_paths:
        matched = name_pattern.match(note_path.name)
        if matched is None:
            continue
        year, month, day = (int(part) for part in matched.groups())
        try:
            err_dates.append(datetime(year, month, day).astimezone())
        except ValueError:
            # Digits that do not form a real calendar date: ignore the file.
            continue
    print(f"  .learnings/ERRORS/ 文件: {len(err_dates)}", file=sys.stderr)
    return pr_with_reviews, hotfix_prs, err_dates


# ---------- 4 个指标 ----------

def _finding_key(f) -> tuple[str, str]:
    """Return the identity used to recognise one finding across reviews.

    A truthy ``stable_id`` attribute takes precedence and yields
    ``("sid", <lower-cased id>)``. Without one, whatever ``f.fingerprint()``
    returns is passed through unchanged.
    """
    stable_id = getattr(f, "stable_id", None)
    if not stable_id:
        return f.fingerprint()
    return ("sid", stable_id.lower())


def eval_adoption(pr_with_reviews) -> dict:
    """Adoption rate: share of findings that disappear in a later review round.

    Only PRs with at least two reviews are considered. A finding is counted
    when it first appears before the last round, so it has a chance to
    disappear. It is counted as resolved when it is missing from at least one
    round after its first appearance. Findings are matched across rounds with
    ``_finding_key``.

    Args:
        pr_with_reviews: Iterable of ``(pr, reviews)`` pairs; every review
            exposes a ``findings`` iterable.

    Returns:
        ``{"resolved": int, "total_first_seen": int, "rate": float}``, with
        ``rate`` being 0.0 when no finding qualifies.
    """
    total_first_seen = 0
    resolved = 0
    for _pr, reviews in pr_with_reviews:
        rounds = len(reviews)
        if rounds < 2:
            continue  # a single review gives nothing to compare against
        # finding key -> indices of the rounds the finding shows up in
        rounds_by_key: dict[tuple, set[int]] = defaultdict(set)
        for idx, review in enumerate(reviews):
            for finding in review.findings:
                rounds_by_key[_finding_key(finding)].add(idx)
        for present in rounds_by_key.values():
            first = min(present)
            if first >= rounds - 1:
                continue  # first seen in the last round: cannot disappear
            total_first_seen += 1
            # Every index in `present` lies in [first, rounds - 1]. The
            # finding showed up in all of those rounds only if the set has
            # `rounds - first` members; fewer means it was absent from at
            # least one later round, which is treated as "fixed".
            if len(present) < rounds - first:
                resolved += 1
    rate = resolved / total_first_seen if total_first_seen else 0.0
    return {"resolved": resolved, "total_first_seen": total_first_seen, "rate": rate}


def eval_miss(pr_with_reviews, hotfix_prs, err_dates) -> dict:
    """Miss rate: share of merged, finally-passed PRs followed by a hotfix or ERR.

    A PR enters the denominator when it has a truthy ``merged_at``, the
    verdict of its last review is ``"pass"`` and its ``merged_at`` can be
    parsed. It is a miss candidate when a hotfix merge time or an ERR date
    lies strictly inside ``(merged_at, merged_at + 14 days)``.

    This is a coarse proxy that is not narrowed down to modules: an unrelated
    hotfix still counts, so the miss rate is overestimated. It is meant as a
    team-level trend indicator.

    PRs whose ``merged_at`` cannot be parsed are left out of the denominator.
    They can never be matched against a window, so counting them would only
    dilute the rate.

    NOTE(review): assumes ``_safe_parse`` returns a falsy value for
    unparseable input and datetimes that are comparable with ``err_dates`` —
    confirm against ai-review-stats.py.

    Args:
        pr_with_reviews: Iterable of ``(pr, reviews)`` pairs with non-empty
            ``reviews``.
        hotfix_prs: PR dicts of merged fix/hotfix PRs.
        err_dates: Datetimes of recorded ERR notes.

    Returns:
        ``{"pass_total": int, "miss_candidates": int, "rate": float}``, with
        ``rate`` being 0.0 when there is no eligible PR.
    """
    window = timedelta(days=14)

    pass_merge_times = []
    for pr, reviews in pr_with_reviews:
        if not pr.get("merged_at") or reviews[-1].verdict != "pass":
            continue
        merged_at = _safe_parse(pr["merged_at"])
        if merged_at:
            pass_merge_times.append(merged_at)

    if not pass_merge_times:
        return {"pass_total": 0, "miss_candidates": 0, "rate": 0.0}

    # Everything that counts as evidence of a missed problem, parsed once.
    incident_times = [
        parsed
        for parsed in (
            _safe_parse(p["merged_at"]) for p in hotfix_prs if p.get("merged_at")
        )
        if parsed
    ]
    incident_times.extend(err_dates)

    miss = sum(
        1
        for merged_at in pass_merge_times
        if any(merged_at < t < merged_at + window for t in incident_times)
    )
    rate = miss / len(pass_merge_times)
    return {"pass_total": len(pass_merge_times), "miss_candidates": miss, "rate": rate}


def eval_block_legit(pr_with_reviews) -> dict:
    """Block legitimacy: share of ``block`` verdicts downgraded in the next round.

    During the DRY RUN period a block does not actually stop the merge, so the
    next review round is used as evidence: if the verdict right after a block
    is ``pass`` or ``pass_with_risk``, the blocking problem is taken to have
    been fixed. A block in the last review of a PR has no following round and
    is left out of the denominator.

    Args:
        pr_with_reviews: Iterable of ``(pr, reviews)`` pairs; every review
            exposes a ``verdict`` attribute.

    Returns:
        ``{"legit": int, "total": int, "rate": float, "note": str}``, with
        ``rate`` being 0.0 when no block has a following round.
    """
    downgraded_verdicts = ("pass", "pass_with_risk")
    total = 0
    legit = 0
    for _pr, reviews in pr_with_reviews:
        # Pair every round with its successor; the last round has none.
        for current, following in zip(reviews, reviews[1:]):
            if current.verdict != "block":
                continue
            total += 1
            if following.verdict in downgraded_verdicts:
                legit += 1
    return {
        "legit": legit,
        "total": total,
        "rate": legit / total if total else 0.0,
        "note": "DRY RUN 期 block 不阻断，靠下轮降级判合理",
    }


def eval_consistency(pr_with_reviews) -> dict:
    """Consistency rate: share of reviews whose verdict agrees with their findings.

    Reviews with a falsy verdict are skipped and not counted. The remaining
    reviews are inconsistent when one of these holds:

    * verdict ``block`` without any ``hard_block`` finding;
    * verdict ``needs_fix`` / ``should_fix`` without any ``hard_block`` or
      ``risk`` finding;
    * verdict ``pass`` with at least one ``hard_block`` finding.

    Args:
        pr_with_reviews: Iterable of ``(pr, reviews)`` pairs; ``pr`` has a
            ``"number"`` key and every review exposes ``verdict`` and
            ``severity_counts()``.

    Returns:
        ``{"consistent": int, "total": int, "rate": float,
        "sample_issues": list[str]}`` where ``sample_issues`` holds at most
        the first five inconsistency descriptions and ``rate`` is 0.0 when
        nothing was counted.
    """
    total = 0
    consistent = 0
    inconsistencies: list[str] = []
    for pr, reviews in pr_with_reviews:
        for review in reviews:
            verdict = review.verdict
            if not verdict:
                continue  # verdict could not be determined for this comment
            total += 1
            counts = review.severity_counts()
            hard_blocks = counts["hard_block"]
            risks = counts["risk"]

            if verdict == "block" and hard_blocks == 0:
                problem = "verdict=block 但 hard_block=0"
            elif verdict in ("needs_fix", "should_fix") and hard_blocks + risks == 0:
                problem = f"verdict={verdict} 但 hard_block+risk=0"
            elif verdict == "pass" and hard_blocks > 0:
                problem = f"verdict=pass 但 hard_block={hard_blocks}"
            else:
                consistent += 1
                continue
            inconsistencies.append(f"PR #{pr['number']}: {problem}")
    return {
        "consistent": consistent,
        "total": total,
        "rate": consistent / total if total else 0.0,
        "sample_issues": inconsistencies[:5],
    }


def overall_score(metrics: dict) -> float:
    """Combine the four metric rates into one weighted composite score.

    Adoption, consistency and block legitimacy contribute their rate times the
    matching ``SCORE_WEIGHTS`` entry. The miss rate is inverted first
    (``1.0 - rate``) so that fewer misses score higher.

    Args:
        metrics: Mapping with the keys ``adoption``, ``consistency``,
            ``block_legit`` and ``miss``, each a dict holding a ``rate``.

    Returns:
        The sum of the four weighted contributions.
    """
    adoption_part = metrics["adoption"]["rate"] * SCORE_WEIGHTS["adoption"]
    consistency_part = metrics["consistency"]["rate"] * SCORE_WEIGHTS["consistency"]
    block_part = metrics["block_legit"]["rate"] * SCORE_WEIGHTS["block_legit"]
    miss_part = (1.0 - metrics["miss"]["rate"]) * SCORE_WEIGHTS["miss_inverse"]
    return adoption_part + consistency_part + block_part + miss_part


# ---------- 报告渲染 ----------

def render(metrics: dict, ctx: dict) -> str:
    """Render the evaluation result as a Markdown report.

    Args:
        metrics: Mapping with the keys ``adoption``, ``consistency``,
            ``block_legit`` and ``miss``, each holding the dict produced by
            the matching ``eval_*`` function.
        ctx: Report context with the keys ``generated_at`` (a datetime),
            ``days``, ``repo``, ``pr_count``, ``hotfix_count`` and
            ``err_count``.

    Returns:
        The whole report as a single newline-joined string. It contains the
        header, the metric table, sample consistency issues, known data
        limitations and a decision section that depends on whether the
        composite score reaches ``SCORE_THRESHOLD``.
    """
    score = overall_score(metrics)
    passed = score >= SCORE_THRESHOLD
    generated_at = ctx["generated_at"]

    adoption = metrics["adoption"]
    consistency = metrics["consistency"]
    block_legit = metrics["block_legit"]
    miss = metrics["miss"]
    miss_inverse = 1 - miss["rate"]
    w_miss = SCORE_WEIGHTS["miss_inverse"]

    def metric_row(label, rate, hits, total, weight, meaning):
        # One table row: value (hits/total), weight and weighted contribution.
        return (
            f"| {label} | {rate*100:.1f}% ({hits}/{total}) "
            f"| {weight*100:.0f}% | {rate*weight*100:.1f} | {meaning} |"
        )

    badge = "✅ 达标" if passed else "❌ 未达标"
    lines = [
        f"# AI Review 自动准确率评估 · {generated_at:%Y-%m-%d}（近 {ctx['days']} 天）",
        "",
        f"**生成时间**: {generated_at:%Y-%m-%d %H:%M %z}",
        f"**数据源**: Gitea API · {ctx['repo']} + 本地 .learnings/ERRORS/",
        f"**采样**: {ctx['pr_count']} 个跑过 AI Review 的 PR / {ctx['hotfix_count']} 个 hotfix PR / {ctx['err_count']} 个 ERR 笔记",
        "",
        f"> **综合分**: **{score*100:.1f}%** {badge}（门槛 {SCORE_THRESHOLD*100:.0f}%）",
        "",
        "> 本脚本替代人工抽样标注（#259 D4 解锁条件）。基于客观 proxy 指标做团队趋势判断，",
        "> 不试图给单条 review 打分。详细方法见脚本头注释。",
        "",
        "---",
        "",
        "## 各项指标",
        "",
        "| 指标 | 值 | 权重 | 加权贡献 | 语义 |",
        "|---|---|---|---|---|",
        metric_row(
            "采纳率", adoption["rate"], adoption["resolved"],
            adoption["total_first_seen"], SCORE_WEIGHTS["adoption"],
            "finding 跨轮消失 = 被修了",
        ),
        metric_row(
            "自洽率", consistency["rate"], consistency["consistent"],
            consistency["total"], SCORE_WEIGHTS["consistency"],
            "verdict 与 findings.severity 一致",
        ),
        metric_row(
            "Block 合理性", block_legit["rate"], block_legit["legit"],
            block_legit["total"], SCORE_WEIGHTS["block_legit"],
            "block 下轮降级 = 真问题",
        ),
        f"| 漏检率（反向计分）| 漏检 {miss['rate']*100:.1f}% ({miss['miss_candidates']}/{miss['pass_total']}) | {w_miss*100:.0f}% | {miss_inverse*w_miss*100:.1f} | verdict=pass 后 14d 出 hotfix |",
        "",
        "## 自洽性问题样本",
        "",
    ]

    sample_issues = consistency["sample_issues"]
    if sample_issues:
        lines.extend(f"- {issue}" for issue in sample_issues)
    else:
        lines.append("_无_")

    lines.extend([
        "",
        "## 数据局限",
        "",
        "- **采纳率**：finding 在下轮没出现可能是被修、也可能是 AI 这次没指出（盘点已知问题）。stable_id 落地后追踪更稳",
        "- **漏检率**：粗 proxy，hotfix 跟 verdict=pass PR 模块无关也算了，会高估漏检",
        "- **block 合理性**：DRY RUN 期 block 不阻断，靠下轮降级判断；如果 PR 后续没再触发 review，整个 PR 不计入",
        "- **自洽率**：老评论（schema 化前）verdict 解析为空 → 跳过不计入分母",
        "",
        "## 决策（#259 D4 / #171 转正）",
        "",
    ])

    if passed:
        lines.extend([
            f"✅ **综合分 {score*100:.1f}% ≥ {SCORE_THRESHOLD*100:.0f}%，可启动 D4 退出 DRY RUN**",
            "",
            "操作：",
            "1. `.gitea/workflows/ai-review.yml` 删 `AI_REVIEW_DRY_RUN: '1'` 那行",
            "2. Gitea web → repo Settings → Branches → develop 保护规则 → 必需检查加 `ai-review`",
            "3. 同步 `docs/ops/02-gitea-config.md` + `docs/standards/05-development-workflow.md`",
            "4. 关闭 #259 + #171",
        ])
    else:
        gap = (SCORE_THRESHOLD - score) * 100
        lines.extend([
            f"❌ **综合分 {score*100:.1f}%，距门槛 {SCORE_THRESHOLD*100:.0f}% 差 {gap:.1f} 个百分点**",
            "",
            "最低拖累项：",
        ])
        # List the two metrics with the lowest rate (stable sort keeps the
        # declaration order on ties).
        by_rate = sorted(
            [
                ("采纳率", adoption["rate"], SCORE_WEIGHTS["adoption"]),
                ("自洽率", consistency["rate"], SCORE_WEIGHTS["consistency"]),
                ("block 合理性", block_legit["rate"], SCORE_WEIGHTS["block_legit"]),
                ("漏检率反向", miss_inverse, w_miss),
            ],
            key=lambda item: item[1],
        )
        for label, rate, weight in by_rate[:2]:
            lines.append(f"- **{label}**: {rate*100:.1f}%（权重 {weight*100:.0f}%）→ 优先调 prompt 改善这一项")

    lines.extend([
        "",
        "---",
        "_Generated by `scripts/ops/ai-review-auto-eval.py`. 跟踪 [#259](http://43.130.59.228/FFAIWorkspace/workspace/issues/259) / [#171](http://43.130.59.228/FFAIWorkspace/workspace/issues/171)._",
        "",
    ])
    return "\n".join(lines)


def main() -> int:
    """Command-line entry point: collect data, score it and emit the report.

    Options:
        --since-days N  size of the look-back window in days (default 30)
        --print         write the report to stdout instead of a file
        --out PATH      report file path; defaults to a dated file under
                        ``REPORTS_DIR``

    Returns:
        0 on success, 1 when no AI review was found in the window.
    """
    parser = argparse.ArgumentParser(description="AI Review 自动准确率评估")
    parser.add_argument("--since-days", type=int, default=30)
    parser.add_argument("--print", dest="print_only", action="store_true")
    parser.add_argument("--out", default=None)
    args = parser.parse_args()
    since_days = args.since_days

    token = get_token()
    repo = detect_repo()
    api = Api(token, repo)
    now = datetime.now().astimezone()
    print(f"采样：repo={repo} since={since_days}d", file=sys.stderr)

    pr_with_reviews, hotfix_prs, err_dates = collect(api, since_days)
    if not pr_with_reviews:
        print("窗口内无 AI Review 评论，无法评估", file=sys.stderr)
        return 1

    metrics = {
        "adoption": eval_adoption(pr_with_reviews),
        "consistency": eval_consistency(pr_with_reviews),
        "block_legit": eval_block_legit(pr_with_reviews),
        "miss": eval_miss(pr_with_reviews, hotfix_prs, err_dates),
    }
    report = render(
        metrics,
        {
            "generated_at": now,
            "days": since_days,
            "repo": repo,
            "pr_count": len(pr_with_reviews),
            "hotfix_count": len(hotfix_prs),
            "err_count": len(err_dates),
        },
    )

    if args.print_only:
        print(report)
        return 0

    if args.out:
        out = Path(args.out)
    else:
        out = REPORTS_DIR / f"ai-review-auto-eval-{now:%Y%m%d}.md"
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(report, encoding="utf-8")

    score = overall_score(metrics)
    status = "达标" if score >= SCORE_THRESHOLD else "未达标"
    print(f"\n✅ 报告: {out}", file=sys.stderr)
    print(f"   综合分: {score*100:.1f}% ({status})", file=sys.stderr)
    return 0


# When run as a script, execute the CLI and use main()'s return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
