/**
 * WebFetchService —— 抓取指定 URL 的内容（区别于 web_search 的"按关键词搜"）。
 *
 * Provider 选择（auto-detect，参考 web-search.service 同模式）：
 *   1. 优先 Tavily /extract API（TAVILY_API_KEY 配置时启用，干净 markdown，对 SPA 友好）
 *   2. 兜底 native fetch + HTML 简化剥离（无 key 直接用，dev / demo 零成本；对 SPA 渲染不完整）
 *
 * 设计意图：让 LLM 拿到 web_search 结果 URL 后能"读全文"再回答；或者用户直接
 * 粘 URL 让 agent 总结。
 *
 * 大文件 cap：输出截断到 10k 字符避免 LLM context 爆。
 */

import { Injectable, Logger } from '@nestjs/common';
import { lookup as dnsLookup } from 'dns/promises';
import { stripHtmlTags, decodeHtmlEntities } from '../utils/html.util';
import { assertPublicHost } from '../utils/ssrf-guard.util';

/** Result of fetching one URL and reducing it to plain text for the LLM. */
export interface WebFetchOutput {
  /** URL the content belongs to. */
  url: string;
  /** Page title, or `null` when none is available. */
  title: string | null;
  /** Extracted body text (HTML already stripped, already truncated). */
  content: string;
  /** Which provider produced this result. */
  provider: 'tavily' | 'native';
  /** Length of the text before truncation, so the LLM knows how much it has not seen. */
  rawLength: number;
  /** Whether `content` was cut short. */
  truncated: boolean;
}

/** Maximum number of characters of extracted text handed back to the caller. */
const MAX_CONTENT_CHARS = 10_000;
/** Timeout, in milliseconds, applied to every outbound HTTP request. */
const HTTP_TIMEOUT_MS = 15_000;

/**
 * Fetches the content of one specific URL and reduces it to plain text.
 *
 * Provider selection:
 *   1. Tavily `/extract` when `TAVILY_API_KEY` is set.
 *   2. Otherwise (or when Tavily fails) a native `fetch` followed by a
 *      lightweight regex-based HTML clean-up.
 */
@Injectable()
export class WebFetchService {
  /**
   * Upper bound on the number of response-body bytes read by the native
   * provider. The extracted text is truncated afterwards anyway, so reading
   * an unbounded body would only cost memory.
   */
  private static readonly MAX_BODY_BYTES = 5 * 1024 * 1024;

  private readonly logger = new Logger(WebFetchService.name);

  /**
   * Fetches `url` and returns its readable text.
   *
   * @param url Absolute http(s) URL; surrounding whitespace is ignored.
   * @returns The extracted (and possibly truncated) content plus metadata.
   * @throws Error when the URL is empty or not http(s), or when the native
   *   fallback fails (SSRF guard rejection, redirect, non-2xx status,
   *   network error / timeout).
   */
  async fetch(url: string): Promise<WebFetchOutput> {
    const u = url.trim();
    if (!u) throw new Error('url required');
    if (!/^https?:\/\//i.test(u)) {
      throw new Error('only http(s) URLs supported');
    }

    const tavilyKey = process.env.TAVILY_API_KEY;
    if (tavilyKey) {
      try {
        return await this.fetchTavily(u, tavilyKey);
      } catch (err) {
        // Best effort: a Tavily failure is logged and the native path is tried.
        const reason = err instanceof Error ? err.message : String(err);
        this.logger.warn(`Tavily /extract failed, fallback to native: ${reason}`);
      }
    }
    return await this.fetchNative(u);
  }

  /**
   * Extracts `url` through the Tavily `/extract` endpoint.
   *
   * @throws Error on a non-2xx response or when Tavily returns no result.
   */
  private async fetchTavily(url: string, apiKey: string): Promise<WebFetchOutput> {
    const res = await fetch('https://api.tavily.com/extract', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ api_key: apiKey, urls: [url] }),
      signal: AbortSignal.timeout(HTTP_TIMEOUT_MS),
    });
    if (!res.ok) {
      throw new Error(`Tavily extract HTTP ${res.status}`);
    }
    const data = (await res.json()) as {
      results?: Array<{ url: string; raw_content?: string; title?: string }>;
      failed_results?: Array<{ url: string; error: string }>;
    };
    const r = data.results?.[0];
    if (!r) {
      const failed = data.failed_results?.[0];
      throw new Error(`Tavily extract no result: ${failed?.error ?? 'unknown'}`);
    }
    const raw = r.raw_content ?? '';
    return {
      url: r.url,
      // A missing or blank title is reported as null, not as an empty string.
      title: r.title?.trim() || null,
      content: truncate(raw),
      provider: 'tavily',
      rawLength: raw.length,
      truncated: raw.length > MAX_CONTENT_CHARS,
    };
  }

  /**
   * Fetches `url` directly and converts the HTML to plain text.
   *
   * `rawLength` reflects the text extracted from at most the first
   * `MAX_BODY_BYTES` bytes of the response body.
   *
   * @throws Error when the SSRF guard rejects the host, on any 3xx response,
   *   or on a non-2xx status.
   */
  private async fetchNative(url: string): Promise<WebFetchOutput> {
    // SSRF guard: the host is checked before any request is made.
    // NOTE(review): `URL.hostname` keeps the brackets around IPv6 literals
    // (e.g. "[::1]") — confirm assertPublicHost handles that form.
    // NOTE(review): the guard resolves DNS separately from the fetch below, so
    // a host whose DNS answer changes between the two lookups is not covered
    // here — confirm whether that matters for this deployment.
    const hostname = new URL(url).hostname;
    await assertPublicHost(hostname, dnsLookup);

    const res = await fetch(url, {
      headers: {
        'User-Agent':
          'Mozilla/5.0 (compatible; FFAI-Agent/1.0; +https://ff.com/ai)',
        Accept: 'text/html,application/xhtml+xml',
      },
      signal: AbortSignal.timeout(HTTP_TIMEOUT_MS),
      // Redirects are not followed: a 3xx could point at an internal address
      // that was never checked by the guard above.
      redirect: 'manual',
    });
    if (res.status >= 300 && res.status < 400) {
      throw new Error(`HTTP ${res.status}: redirect rejected (SSRF guard)`);
    }
    if (!res.ok) {
      throw new Error(`HTTP ${res.status}`);
    }

    const html = await this.readBodyCapped(res);
    const title = this.extractTitle(html);
    const text = this.htmlToReadableText(html);

    return {
      url,
      title,
      content: truncate(text),
      provider: 'native',
      rawLength: text.length,
      truncated: text.length > MAX_CONTENT_CHARS,
    };
  }

  /** Reads the response body as UTF-8 text, stopping after `MAX_BODY_BYTES` bytes. */
  private async readBodyCapped(res: Response): Promise<string> {
    const stream = res.body;
    if (!stream) return await res.text();

    const limit = WebFetchService.MAX_BODY_BYTES;
    const reader = stream.getReader();
    const decoder = new TextDecoder();
    let received = 0;
    let html = '';
    try {
      while (received < limit) {
        const result = await reader.read();
        if (result.done) {
          // Flush whatever the streaming decoder is still holding.
          html += decoder.decode();
          break;
        }
        const room = limit - received;
        const chunk =
          result.value.byteLength > room ? result.value.subarray(0, room) : result.value;
        received += chunk.byteLength;
        // `stream: true` keeps multi-byte characters split across chunks intact.
        html += decoder.decode(chunk, { stream: true });
      }
    } finally {
      // Releases the connection when stopping early; harmless once the stream is done.
      await reader.cancel().catch(() => undefined);
    }
    return html;
  }

  /** Returns the decoded text of the first `<title>` element, or null when absent or blank. */
  private extractTitle(html: string): string | null {
    const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
    if (!titleMatch) return null;
    return decodeHtmlEntities(stripHtmlTags(titleMatch[1])).trim() || null;
  }

  /**
   * Converts HTML to readable plain text with regexes only: drops
   * non-content blocks, keeps the `<body>` contents, turns block-level
   * boundaries into newlines, strips remaining tags and collapses whitespace.
   * jsdom / readability are deliberately not used (heavy dependencies).
   */
  private htmlToReadableText(html: string): string {
    let cleaned = html
      .replace(/<script[\s\S]*?<\/script>/gi, '')
      .replace(/<style[\s\S]*?<\/style>/gi, '')
      .replace(/<noscript[\s\S]*?<\/noscript>/gi, '')
      .replace(/<svg[\s\S]*?<\/svg>/gi, '')
      .replace(/<nav[\s\S]*?<\/nav>/gi, '')
      .replace(/<header[\s\S]*?<\/header>/gi, '')
      .replace(/<footer[\s\S]*?<\/footer>/gi, '')
      .replace(/<aside[\s\S]*?<\/aside>/gi, '')
      .replace(/<form[\s\S]*?<\/form>/gi, '');

    // Keep only what is inside <body>; <head> and the rest are discarded.
    const bodyMatch = cleaned.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
    if (bodyMatch) cleaned = bodyMatch[1];

    const withLineBreaks = cleaned
      .replace(/<\/(p|div|li|h[1-6]|tr|br)>/gi, '\n')
      .replace(/<br\s*\/?>/gi, '\n');

    return (
      decodeHtmlEntities(stripHtmlTags(withLineBreaks))
        .split('\n')
        // Collapse runs of tabs/spaces inside each line.
        .map((line) => line.replace(/[\t ]+/g, ' ').trim())
        // Drop blank and single-character lines (typical navigation residue).
        .filter((line) => line.length > 1)
        .join('\n')
        .replace(/\n{3,}/g, '\n\n')
        .trim()
    );
  }
}

/**
 * Caps `s` at `limit` UTF-16 code units and appends a notice stating the
 * original length. Strings within the limit are returned unchanged.
 *
 * @param s Text to cap.
 * @param limit Maximum number of code units kept; defaults to `MAX_CONTENT_CHARS`.
 * @returns `s` itself, or its prefix followed by the truncation notice.
 */
function truncate(s: string, limit: number = MAX_CONTENT_CHARS): string {
  if (s.length <= limit) return s;

  let end = Math.max(0, limit);
  if (end > 0) {
    // Never cut between the two halves of a surrogate pair: a lone high
    // surrogate is not valid text and gets mangled once encoded as UTF-8.
    const before = s.charCodeAt(end - 1);
    const after = s.charCodeAt(end);
    const splitsPair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) end -= 1;
  }
  return s.slice(0, end) + `\n\n…（已截断，原始 ${s.length} 字符）`;
}
