#!/usr/bin/env bash
# pm2-restart-watch.sh —— PM2 进程 ↺ 阈值监控 + webhook 告警
#
# 拦元根因 5（监控告警缺失）。详见：
#   - 工单 #249【1】（issue link in docs/standards/12-five-meta-rules.md）
#   - docs/standards/12-five-meta-rules.md 规则 5
#   - docs/ops/08-pm2-restart-watch.md（部署到 4 台业务主机的步骤）
#
# 行为：
#   - 抓 `pm2 jlist` 解析每个进程当前 ↺ 计数
#   - 跟上次抓取（state file）做 diff，时间窗口默认 1h
#   - 任一进程窗口内 ↺ 增长超阈值 → POST webhook 告警
#   - state file missing, corrupt, or older than 2x the window → no alert, only refresh the baseline (treated as cold start / service reset)
#
# 不依赖 PM2 插件，纯 bash + jq + curl 自包。
#
# Env（可被命令行 flag 覆盖）：
#   OPS_WEBHOOK_URL          钉钉或飞书 incoming webhook
#   OPS_WEBHOOK_TYPE         dingtalk | feishu（缺省自适应判断 URL 域名）
#   PM2_RESTART_THRESHOLD    默认 100
#   PM2_RESTART_WINDOW_SECS  默认 3600（1 小时）
#   PM2_RESTART_STATE_DIR    默认 ~/.cache/pm2-restart-watch
#
# Exit codes:
#   0  正常 / 未触发告警
#   1  触发告警（webhook 已发或 dry-run 已 print）
#   2  配置错误（缺 webhook、jq/pm2 不存在等）
#   3  webhook POST 失败

# Abort on any unhandled command failure, and treat a pipeline as failed when
# any of its stages fails.
set -e -o pipefail

# Mode switches; all off until the matching command-line flag turns them on.
VERBOSE=0
SELF_CHECK=0
DRY_RUN=0

# Tunables: use the environment value when it is set and non-empty, otherwise
# fall back to the built-in default. Command-line flags may override these later.
WEBHOOK_TYPE="${OPS_WEBHOOK_TYPE:-}"
WEBHOOK_URL="${OPS_WEBHOOK_URL:-}"
STATE_DIR="${PM2_RESTART_STATE_DIR:-${HOME}/.cache/pm2-restart-watch}"
WINDOW_SECS="${PM2_RESTART_WINDOW_SECS:-3600}"
THRESHOLD="${PM2_RESTART_THRESHOLD:-100}"

# Print the command-line help text to stdout.
# The here-document delimiter is quoted, so the text is emitted verbatim
# (no variable or command expansion).
usage() {
  cat <<'USAGE_TEXT'
pm2-restart-watch.sh —— PM2 ↺ 阈值监控 + webhook 告警

用法:
  pm2-restart-watch.sh [选项]

选项:
  --dry-run            不发 webhook，告警内容打到 stdout
  --self-check         验证依赖、webhook 可达、state dir 可写后退出
  --verbose            打详细日志到 stderr
  --threshold N        覆盖 PM2_RESTART_THRESHOLD（默认 100）
  --window SECS        覆盖 PM2_RESTART_WINDOW_SECS（默认 3600）
  --webhook URL        覆盖 OPS_WEBHOOK_URL
  --type TYPE          覆盖 OPS_WEBHOOK_TYPE（dingtalk | feishu）
  -h, --help           本帮助

环境变量见脚本头部注释。

部署到生产/UAT 见 docs/ops/08-pm2-restart-watch.md。
USAGE_TEXT
}

# Write a prefixed diagnostic line to stderr when VERBOSE is 1.
# Always returns 0 so that a suppressed message never trips `set -e`.
log() {
  if [[ "${VERBOSE}" == 1 ]]; then
    printf '%s\n' "[pm2-restart-watch] $*" >&2
  fi
  return 0
}

# Print an error message to stderr and terminate the script.
#   $1 - message text
#   $2 - exit status (optional, defaults to 2)
die() {
  local msg="$1"
  local rc="${2:-2}"
  printf '%s\n' "❌ ${msg}" >&2
  exit "${rc}"
}

# ----- arg parse -----

# Consume command-line flags. Value-taking flags override the settings that
# were seeded from the environment.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --dry-run) DRY_RUN=1; shift ;;
    --self-check) SELF_CHECK=1; shift ;;
    --verbose) VERBOSE=1; shift ;;
    --threshold|--window|--webhook|--type)
      # A value-taking flag given as the last argument would make `shift 2`
      # fail; under `set -e` that aborts silently with status 1, which callers
      # interpret as "alert fired". Report it as a configuration error instead.
      [[ $# -ge 2 ]] || die "参数 $1 缺少取值"
      case "$1" in
        --threshold) THRESHOLD="$2" ;;
        --window) WINDOW_SECS="$2" ;;
        --webhook) WEBHOOK_URL="$2" ;;
        --type) WEBHOOK_TYPE="$2" ;;
      esac
      shift 2
      ;;
    -h|--help) usage; exit 0 ;;
    *) die "未知参数: $1" ;;
  esac
done

# Both numeric settings must be positive integers, whether they came from the
# environment or from a flag.
[[ "${THRESHOLD}" =~ ^[1-9][0-9]*$ ]] || die "--threshold 必须为正整数，得到: ${THRESHOLD}"
[[ "${WINDOW_SECS}" =~ ^[1-9][0-9]*$ ]] || die "--window 必须为正整数，得到: ${WINDOW_SECS}"

# ----- 自适应 webhook 类型 -----

# Fill in WEBHOOK_TYPE from the host part of WEBHOOK_URL when it has not been
# set explicitly. An explicit value is left untouched. URLs that match neither
# known vendor fall back to the DingTalk payload format.
detect_webhook_type() {
  if [[ -n "${WEBHOOK_TYPE}" ]]; then
    return 0
  fi

  local url="${WEBHOOK_URL}"
  if [[ "${url}" == *oapi.dingtalk.com* ]]; then
    WEBHOOK_TYPE="dingtalk"
  elif [[ "${url}" == *open.feishu.cn* || "${url}" == *open.larksuite.com* ]]; then
    WEBHOOK_TYPE="feishu"
  else
    # Default: use the DingTalk format.
    WEBHOOK_TYPE="dingtalk"
  fi
}

# ----- 依赖检查 -----

# Verify that the external tools the script relies on are installed.
# jq and curl are always required. pm2 is required only in production mode
# (neither --self-check nor --dry-run); in those two modes a missing pm2 is
# reported as a warning on stderr so the script can be exercised locally.
# Globals read: SELF_CHECK, DRY_RUN
# Exits with status 2 (via die) when a required tool is missing.
check_deps() {
  command -v jq >/dev/null 2>&1 || die "缺少依赖: jq"
  command -v curl >/dev/null 2>&1 || die "缺少依赖: curl"

  if command -v pm2 >/dev/null 2>&1; then
    return 0
  fi
  if [[ "${SELF_CHECK}" == 0 && "${DRY_RUN}" == 0 ]]; then
    die "缺少依赖: pm2"
  fi
  # The original only skipped the check here; emit the warning the design calls for.
  echo "⚠️  未找到 pm2（self-check / dry-run 模式下继续）" >&2
}

# ----- self-check -----

# Run the --self-check diagnostics and print a human-readable report to stdout:
#   1. show which dependencies were found,
#   2. prove STATE_DIR can be created and written,
#   3. if a webhook URL is configured, POST a test message and verify both the
#      HTTP status and the vendor error code in the response body.
# Globals read: STATE_DIR, WEBHOOK_URL; WEBHOOK_TYPE may be set by detect_webhook_type.
# Exits (via die) with 2 on a local configuration problem, 3 on a webhook failure.
run_self_check() {
  echo "===== self-check ====="
  echo "依赖: jq=$(command -v jq), curl=$(command -v curl), pm2=$(command -v pm2 || echo MISSING)"
  echo "STATE_DIR: ${STATE_DIR}"
  mkdir -p "${STATE_DIR}" 2>/dev/null || die "STATE_DIR 不可写: ${STATE_DIR}"
  # Check the write probe explicitly: in `touch ... && rm ...` a failing touch
  # does not trigger `set -e`, so a read-only directory used to be reported as
  # writable.
  local probe="${STATE_DIR}/.write-test"
  touch "${probe}" 2>/dev/null || die "STATE_DIR 不可写: ${STATE_DIR}"
  rm -f "${probe}"
  echo "  ✅ STATE_DIR 可写"

  if [[ -z "${WEBHOOK_URL}" ]]; then
    echo "  ⚠️  OPS_WEBHOOK_URL 未配置，跳过 webhook 可达性测试"
  else
    detect_webhook_type
    echo "  webhook type: ${WEBHOOK_TYPE}"
    local payload resp http_code body errcode err_log
    # Build the payload up front: a `die` inside a command substitution that is
    # nested in curl's argument list would only end the subshell and an empty
    # body would be POSTed.
    payload=$(build_payload "[self-check] $(hostname) at $(date -Is)" "PM2 watch self-check") \
      || die "webhook payload 构造失败（type=${WEBHOOK_TYPE}）"
    # DingTalk/Feishu quirk: HTTP status is always 200 and the real error code
    # is in the body, so the body has to be parsed as well.
    err_log=$(mktemp)
    resp=$(curl -sS -w '\n__HTTP__%{http_code}' --max-time 5 -X POST \
      -H 'Content-Type: application/json' \
      -d "${payload}" \
      "${WEBHOOK_URL}" 2>"${err_log}") || { local e; e=$(cat "${err_log}"); rm -f "${err_log}"; die "curl 失败: ${e}" 3; }
    rm -f "${err_log}"
    # The -w format appends "\n__HTTP__<code>" after the body; split on that marker.
    http_code="${resp##*__HTTP__}"
    body="${resp%__HTTP__*}"
    [[ "${http_code}" == "200" ]] || die "webhook HTTP 错误: ${http_code}" 3
    # DingTalk reports `errcode`, Feishu reports `code` (or `StatusCode`); 0 means success.
    errcode=$(echo "${body}" | jq -r '.errcode // .code // .StatusCode // 0' 2>/dev/null || echo "parse-error")
    if [[ "${errcode}" == "0" ]]; then
      echo "  ✅ webhook 可达且配置正确（body errcode=0）"
    else
      die "webhook 配置错误: body=${body}" 3
    fi
  fi
  echo "===== self-check OK ====="
}

# ----- payload 构造（钉钉 / 飞书自适应）-----

# Emit the webhook JSON body for the configured vendor on stdout.
#   $1 - message title
#   $2 - message text
# Globals read: WEBHOOK_TYPE (dingtalk | feishu)
# DingTalk gets a markdown message with separate title and text; Feishu gets a
# plain-text message with the title and text joined by a blank line.
# Exits with status 2 (via die) for any other WEBHOOK_TYPE.
build_payload() {
  local title="$1" text="$2"
  case "${WEBHOOK_TYPE}" in
    dingtalk)
      jq -n --arg title "${title}" --arg text "${text}" '{
        msgtype: "markdown",
        markdown: { title: $title, text: $text }
      }'
      ;;
    feishu)
      # Join with real newlines. "\n" inside double quotes is a literal
      # backslash + n, which jq would encode as "\\n" and the chat client would
      # display verbatim.
      jq -n --arg text "${title}"$'\n\n'"${text}" '{
        msg_type: "text",
        content: { text: $text }
      }'
      ;;
    *)
      die "未知 webhook 类型: ${WEBHOOK_TYPE}"
      ;;
  esac
}

# ----- 抓 PM2 状态 -----

# Print a compact JSON array describing every PM2 process: name, pm_id,
# restart_time (0 when absent), status ("unknown" when absent) and the stderr
# log path as err_log ("" when absent).
# Prints "[]" when pm2 is not installed or when `pm2 jlist | jq` fails.
snapshot_pm2() {
  if ! command -v pm2 >/dev/null 2>&1; then
    echo "[]"
    return 0
  fi

  # Capture first, then decide what to print. With `pipefail`, a pipeline that
  # fails after jq has already written its result would otherwise emit that
  # result followed by the "[]" fallback, i.e. two JSON documents.
  local procs
  if procs=$(pm2 jlist 2>/dev/null | jq -c '[.[] | {
    name: .name,
    pm_id: .pm_id,
    restart_time: (.pm2_env.restart_time // 0),
    status: (.pm2_env.status // "unknown"),
    err_log: (.pm2_env.pm_err_log_path // "")
  }]'); then
    printf '%s\n' "${procs}"
  else
    echo "[]"
  fi
}

# ----- 抓最近 stderr 日志摘要（actionable info）-----
# 直接 tail err_log_path，避免 spawn pm2 logs（pm2 启动 ~100-300ms × N 个进程很贵）
# Print the last five lines of a process's stderr log.
#   $1 - path to the log file (may be empty)
# Prints "(no err log)" when the path is empty or not readable, and
# "(read failed)" when tail itself fails.
tail_stderr() {
  local log_path="$1"

  if [[ -z "${log_path}" || ! -r "${log_path}" ]]; then
    echo "(no err log)"
    return 0
  fi

  tail -n 5 "${log_path}" 2>/dev/null || echo "(read failed)"
}

# ----- 主流程 -----

# Main flow: snapshot PM2, diff restart counters against the saved baseline and
# alert when any process gained at least THRESHOLD restarts since the last run.
# Globals read: SELF_CHECK, DRY_RUN, WEBHOOK_URL, WEBHOOK_TYPE, STATE_DIR,
#               WINDOW_SECS, THRESHOLD
# Never returns; exits with:
#   0  nothing to report (baseline written or refreshed)
#   1  alert sent, or printed in dry-run mode
#   2  configuration / state-directory error
#   3  webhook delivery failed or was rejected by the vendor
main() {
  detect_webhook_type
  check_deps
  if [[ "${SELF_CHECK}" == 1 ]]; then run_self_check; exit 0; fi

  if [[ "${DRY_RUN}" == 0 && -z "${WEBHOOK_URL}" ]]; then
    die "OPS_WEBHOOK_URL 未配置（生产模式必填，本地测试用 --dry-run）"
  fi

  # Fail with the documented config-error code instead of letting `set -e`
  # abort with mkdir's status (1 would read as "alert fired").
  mkdir -p "${STATE_DIR}" || die "STATE_DIR 不可写: ${STATE_DIR}"
  local state_file="${STATE_DIR}/state.json"
  local now_ts; now_ts=$(date +%s)

  # Persist a snapshot ($1 = JSON array of processes) together with the current
  # timestamp. Written to a temp file and renamed so the state file is replaced
  # atomically; the temp file is removed if anything fails.
  write_state() {
    local tmp
    tmp=$(mktemp "${state_file}.XXXXXX") || die "无法写入 state: ${state_file}"
    if printf '%s\n' "$1" | jq --arg ts "${now_ts}" '{ts: ($ts|tonumber), procs: .}' > "${tmp}" \
      && mv "${tmp}" "${state_file}"; then
      return 0
    fi
    rm -f "${tmp}"
    die "无法写入 state: ${state_file}"
  }

  log "snapshot pm2 ..."
  local current; current=$(snapshot_pm2)
  if [[ -z "${current}" || "${current}" == "[]" ]]; then
    log "无 PM2 进程，退出"
    write_state "[]"
    exit 0
  fi

  # First run: nothing to compare against, just record the baseline.
  if [[ ! -f "${state_file}" ]]; then
    log "首次运行，写入基线 ${state_file}"
    write_state "${current}"
    exit 0
  fi

  # Read the state once into a variable. Unreadable or corrupt JSON falls back
  # to ts=0, which makes the age check below reset the baseline.
  local state_blob last_ts
  state_blob=$(cat "${state_file}" 2>/dev/null || echo "{}")
  last_ts=$(echo "${state_blob}" | jq -r '.ts // 0' 2>/dev/null || echo 0)
  [[ "${last_ts}" =~ ^[0-9]+$ ]] || last_ts=0
  local age=$((now_ts - last_ts))
  # A baseline older than twice the window is treated as stale: refresh it
  # without alerting.
  if (( age > WINDOW_SECS * 2 )); then
    log "state 太老或 corrupt（age=${age}s），重置基线"
    write_state "${current}"
    exit 0
  fi

  # Compute per-process restart deltas in a single jq call. A process with no
  # entry in the previous state is compared against 0.
  local diffs
  diffs=$(jq -n \
    --argjson state "${state_blob}" \
    --argjson curr "${current}" \
    --argjson th "${THRESHOLD}" '
      ($state.procs // []) as $last
      | [
          $curr[] as $c
          | (($last[] | select(.pm_id == $c.pm_id) | .restart_time) // 0) as $last_rt
          | ($c.restart_time - $last_rt) as $delta
          | select($delta >= $th)
          | { name: $c.name, pm_id: $c.pm_id, err_log: $c.err_log,
              last: $last_rt, curr: $c.restart_time, delta: $delta, status: $c.status }
        ]
    ')

  if [[ "${diffs}" == "[]" ]]; then
    log "无超阈值进程，更新基线"
    write_state "${current}"
    exit 0
  fi

  # Build the alert text with real newline characters. Literal "\n" sequences
  # would be JSON-encoded by jq as "\\n" and shown verbatim in the chat message.
  local nl=$'\n'
  local hostname_v; hostname_v=$(hostname 2>/dev/null || echo "${HOSTNAME:-unknown}")
  local count; count=$(echo "${diffs}" | jq 'length')
  local title="🚨 PM2 ↺ 告警 [${hostname_v}] ${count} 个进程异常"
  local body="**主机**: ${hostname_v}${nl}**距上次抓取**: ${age}s（阈值: 增长≥${THRESHOLD}）${nl}${nl}"

  # One jq call flattens all fields to TSV, instead of several jq calls per row.
  local diffs_tsv
  diffs_tsv=$(echo "${diffs}" | jq -r '.[] | [.name, .pm_id, .last, .curr, .delta, .status, .err_log] | @tsv')
  local name pm_id last curr delta status err_log stderr_tail
  while IFS=$'\t' read -r name pm_id last curr delta status err_log; do
    body+="### ${name} (pm_id=${pm_id})${nl}"
    body+="- 状态: ${status}${nl}"
    body+="- ↺: ${last} → ${curr}（+${delta}）${nl}"
    stderr_tail=$(tail_stderr "${err_log}")
    body+="- 最近 stderr:${nl}\`\`\`${nl}${stderr_tail}${nl}\`\`\`${nl}${nl}"
  done <<<"${diffs_tsv}"

  if [[ "${DRY_RUN}" == 1 ]]; then
    echo "===== DRY RUN ====="
    echo "Title: ${title}"
    echo
    # printf, not `echo -e`: backslashes inside the log tail must stay as-is.
    printf '%s\n' "${body}"
    echo "==================="
    # Still refresh the baseline so repeated dry-runs do not keep firing.
    write_state "${current}"
    exit 1
  fi

  log "POST webhook（type=${WEBHOOK_TYPE}）"
  local payload; payload=$(build_payload "${title}" "${body}")
  # Save the response body to a file (curl truncates it between retries) so the
  # vendor error code can be checked: DingTalk/Feishu answer HTTP 200 even when
  # they reject the message.
  local resp_file code errcode resp_body
  resp_file=$(mktemp) || die "mktemp 失败"
  code=$(curl -sS -o "${resp_file}" -w '%{http_code}' --max-time 10 \
    --retry 2 --retry-delay 2 \
    -X POST -H 'Content-Type: application/json' \
    -d "${payload}" "${WEBHOOK_URL}" 2>/dev/null) || code="curl-error"
  if [[ "${code}" != "200" ]]; then
    rm -f "${resp_file}"
    die "webhook POST 失败: HTTP=${code}" 3
  fi
  # DingTalk reports `errcode`, Feishu reports `code` (or `StatusCode`); 0 means success.
  errcode=$(jq -r '.errcode // .code // .StatusCode // 0' "${resp_file}" 2>/dev/null || echo "parse-error")
  if [[ "${errcode}" != "0" ]]; then
    resp_body=$(cat "${resp_file}" 2>/dev/null || true)
    rm -f "${resp_file}"
    # Baseline is intentionally not refreshed, so the next run alerts again.
    die "webhook POST 失败: body=${resp_body}" 3
  fi
  rm -f "${resp_file}"

  write_state "${current}"
  echo "✅ 告警已发送（${count} 个进程超阈值）"
  exit 1
}

# Entry point. Command-line flags were already consumed by the top-level
# parser earlier in this file, so main is called without arguments.
main
