// NOTE(review): the statements below look like the tail of a method whose
// opening lines are not visible in this view; every code token is kept as-is.

// Pull the document title out of the raw HTML (null when there is no <title>).
const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
const title = titleMatch ? decodeHtmlEntities(stripHtmlTags(titleMatch[1])).trim() : null;

// Simplified "readability" pass: drop script/style/nav/header/footer/aside
// blocks, then strip whatever tags remain.
// Deliberately avoids jsdom + readability (heavy dependencies); good enough
// and dev friendly.
let cleaned = html
  .replace(/<script[\s\S]*?<\/script>/gi, '')
  .replace(/<style[\s\S]*?<\/style>/gi, '')
  .replace(/<noscript[\s\S]*?<\/noscript>/gi, '')
  .replace(/<svg[\s\S]*?<\/svg>/gi, '')
  .replace(/<nav[\s\S]*?<\/nav>/gi, '')
  .replace(/<header[\s\S]*?<\/header>/gi, '')
  .replace(/<footer[\s\S]*?<\/footer>/gi, '')
  .replace(/<aside[\s\S]*?<\/aside>/gi, '')
  .replace(/<form[\s\S]*?<\/form>/gi, '');

// Keep only what is inside <body> (head and the rest are discarded).
const bodyMatch = cleaned.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
if (bodyMatch) cleaned = bodyMatch[1];

// Turn block-level closing tags into newlines, strip tags, collapse whitespace.
const text = decodeHtmlEntities(
  stripHtmlTags(
    cleaned
      .replace(/<\/(p|div|li|h[1-6]|tr|br)>/gi, '\n')
      .replace(/<br\s*\/?>/gi, '\n'),
  ),
)
  // Collapse redundant whitespace line by line.
  .split('\n')
  .map((line) => line.replace(/[\t ]+/g, ' ').trim())
  // Drop single-character / blank lines (navigation decoration often leaves these behind).
  .filter((line) => line.length > 1)
  .join('\n')
  .replace(/\n{3,}/g, '\n\n')
  .trim();

return {
  url,
  title,
  content: truncate(text),
  provider: 'native',
  rawLength: text.length,
  truncated: text.length > MAX_CONTENT_CHARS,
};
  }
}

/**
 * Caps `s` at MAX_CONTENT_CHARS UTF-16 code units.
 *
 * Strings within the limit are returned unchanged; longer ones are cut at the
 * limit and get a trailing marker that reports the original length.
 */
function truncate(s: string): string {
  if (s.length <= MAX_CONTENT_CHARS) return s;
  return s.slice(0, MAX_CONTENT_CHARS) + `\n\n…（已截断，原始 ${s.length} 字符）`;
}

/**
 * WebFetchService — fetches the content of a specific URL (as opposed to
 * web_search, which searches by keyword).
 *
 * Provider selection (auto-detect, same pattern as web-search.service):
 *   1. Prefer the Tavily /extract API (enabled when TAVILY_API_KEY is set;
 *      clean markdown, SPA friendly).
 *   2. Fall back to native fetch + simplified HTML stripping (works without a
 *      key, zero cost for dev / demo; SPA pages are rendered incompletely).
 *
 * Design intent: after the LLM gets result URLs from web_search it can "read
 * the full text" before answering; or the user pastes a URL directly and asks
 * the agent to summarize it.
 *
 * Large-content cap: output is truncated to 10k characters so it does not blow
 * up the LLM context.
 */
import { Injectable, Logger } from '@nestjs/common';
import { lookup as dnsLookup } from 'dns/promises';
import { stripHtmlTags, decodeHtmlEntities } from '../utils/html.util';
import { assertPublicHost } from '../utils/ssrf-guard.util';

export interface WebFetchOutput {
  url: string;
  title: string | null;
  /** Fetched body text (HTML stripped, already truncated). */
  content: string;
  /** Which provider produced the content. */
  provider: 'tavily' | 'native';
  /** Length before truncation, so the LLM knows how much it has not seen. */
  rawLength: number;
  truncated: boolean;
}

/** Shape of the parts of the Tavily /extract response that this service reads. */
interface TavilyExtractResponse {
  results?: Array<{ url: string; raw_content?: string; title?: string }>;
  failed_results?: Array<{ url: string; error: string }>;
}

const MAX_CONTENT_CHARS = 10000;
const HTTP_TIMEOUT_MS = 15000;

/** Elements whose whole block (tag + content) is removed before text extraction. */
const NOISE_BLOCK_TAGS = [
  'script',
  'style',
  'noscript',
  'svg',
  'nav',
  'header',
  'footer',
  'aside',
  'form',
] as const;

/** One lazy, case-insensitive `<tag ...>...</tag>` pattern per noise tag, built once. */
const NOISE_BLOCK_PATTERNS: readonly RegExp[] = NOISE_BLOCK_TAGS.map(
  (tag) => new RegExp(`<${tag}[\\s\\S]*?<\\/${tag}>`, 'gi'),
);

@Injectable()
export class WebFetchService {
  private readonly logger = new Logger(WebFetchService.name);

  /**
   * Fetches `url` and returns its readable text.
   *
   * Uses Tavily when TAVILY_API_KEY is set; any Tavily failure is logged and
   * the native fetcher is used instead.
   *
   * @param url Absolute http(s) URL; surrounding whitespace is ignored.
   * @returns Extracted title/content plus truncation metadata.
   * @throws Error when the URL is empty or not http(s), or when the native
   *   fetch fails (non-2xx status, rejected redirect, guard rejection, timeout).
   */
  async fetch(url: string): Promise<WebFetchOutput> {
    const u = url.trim();
    if (!u) throw new Error('url required');
    if (!/^https?:\/\//i.test(u)) {
      throw new Error('only http(s) URLs supported');
    }

    const tavilyKey = process.env.TAVILY_API_KEY;
    if (tavilyKey) {
      try {
        return await this.fetchTavily(u, tavilyKey);
      } catch (err: unknown) {
        // Deliberate best-effort: a Tavily failure must not fail the request.
        const reason = err instanceof Error ? err.message : String(err);
        this.logger.warn(`Tavily /extract failed, fallback to native: ${reason}`);
      }
    }
    return this.fetchNative(u);
  }

  /**
   * Extracts `url` through the Tavily /extract API.
   *
   * @throws Error on a non-2xx response or when Tavily returns no result.
   */
  private async fetchTavily(url: string, apiKey: string): Promise<WebFetchOutput> {
    const res = await fetch('https://api.tavily.com/extract', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ api_key: apiKey, urls: [url] }),
      signal: AbortSignal.timeout(HTTP_TIMEOUT_MS),
    });
    if (!res.ok) {
      await discardBody(res);
      throw new Error(`Tavily extract HTTP ${res.status}`);
    }

    const data = (await res.json()) as TavilyExtractResponse;
    const r = data.results?.[0];
    if (!r) {
      const failed = data.failed_results?.[0];
      throw new Error(`Tavily extract no result: ${failed?.error ?? 'unknown'}`);
    }

    const raw = r.raw_content ?? '';
    return {
      url: r.url,
      title: r.title ?? null,
      content: truncate(raw),
      provider: 'tavily',
      rawLength: raw.length,
      truncated: raw.length > MAX_CONTENT_CHARS,
    };
  }

  /**
   * Fetches `url` directly and reduces the HTML to plain text.
   *
   * @throws Error on a 3xx response (redirects are never followed), on any
   *   other non-2xx status, or when the host guard / URL parsing / fetch throws.
   */
  private async fetchNative(url: string): Promise<WebFetchOutput> {
    // SSRF guard: assertPublicHost is expected to reject hosts that point at
    // private / loopback / link-local / cloud-metadata IPs.
    // redirect: 'manual' keeps a 3xx from bouncing the request to an internal
    // address after the check.
    // NOTE(review): fetch resolves the hostname again on its own, so the
    // address checked here and the address actually connected to could differ
    // (DNS rebinding window) — confirm whether that needs pinning.
    const hostname = new URL(url).hostname;
    await assertPublicHost(hostname, dnsLookup);

    const res = await fetch(url, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (compatible; FFAI-Agent/1.0; +https://ff.com/ai)',
        Accept: 'text/html,application/xhtml+xml',
      },
      signal: AbortSignal.timeout(HTTP_TIMEOUT_MS),
      redirect: 'manual',
    });
    if (res.status >= 300 && res.status < 400) {
      await discardBody(res);
      throw new Error(`HTTP ${res.status}: redirect rejected (SSRF guard)`);
    }
    if (!res.ok) {
      await discardBody(res);
      throw new Error(`HTTP ${res.status}`);
    }

    const html = await res.text();
    const title = extractTitle(html);
    const text = htmlToText(html);

    return {
      url,
      title,
      content: truncate(text),
      provider: 'native',
      rawLength: text.length,
      truncated: text.length > MAX_CONTENT_CHARS,
    };
  }
}

/**
 * Releases a response whose body will not be read, so the underlying
 * connection is not left waiting for a consumer.
 */
async function discardBody(res: Response): Promise<void> {
  try {
    await res.body?.cancel();
  } catch {
    // Best-effort cleanup only; the caller is about to throw the real error.
  }
}

/** Returns the decoded, trimmed text of the first <title> element, or null if there is none. */
function extractTitle(html: string): string | null {
  const titleMatch = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  return titleMatch ? decodeHtmlEntities(stripHtmlTags(titleMatch[1])).trim() : null;
}

/**
 * Simplified "readability" pass over an HTML document.
 *
 * Removes script/style/nav/header/footer/aside/... blocks, keeps only the
 * <body> content when present, converts block-level boundaries to newlines and
 * strips the remaining tags. Deliberately avoids jsdom + readability (heavy
 * dependencies); good enough and dev friendly.
 */
function htmlToText(html: string): string {
  let cleaned = html;
  for (const pattern of NOISE_BLOCK_PATTERNS) {
    cleaned = cleaned.replace(pattern, '');
  }

  // Keep only what is inside <body> (head and the rest are discarded).
  const bodyMatch = cleaned.match(/<body[^>]*>([\s\S]*?)<\/body>/i);
  if (bodyMatch) cleaned = bodyMatch[1];

  // Closing block-level tags and <br> become line breaks before tags are stripped.
  const withBreaks = cleaned
    .replace(/<\/(p|div|li|h[1-6]|tr|br)>/gi, '\n')
    .replace(/<br\s*\/?>/gi, '\n');

  return (
    decodeHtmlEntities(stripHtmlTags(withBreaks))
      .split('\n')
      // Collapse runs of tabs/spaces inside each line.
      .map((line) => line.replace(/[\t ]+/g, ' ').trim())
      // Drop blank and single-character lines (navigation decoration often
      // leaves these behind). Because no empty line survives this filter, the
      // joined text can never contain consecutive newlines, so no further
      // blank-line collapsing is needed.
      .filter((line) => line.length > 1)
      .join('\n')
      .trim()
  );
}

/**
 * Caps `s` at MAX_CONTENT_CHARS UTF-16 code units.
 *
 * Strings within the limit are returned unchanged; longer ones are cut and get
 * a trailing marker that reports the original length. The cut backs off by one
 * code unit when it would otherwise land between the two halves of a surrogate
 * pair, so the result never ends in a lone high surrogate.
 */
function truncate(s: string): string {
  if (s.length <= MAX_CONTENT_CHARS) return s;

  let end = MAX_CONTENT_CHARS;
  const lastUnit = s.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;

  return s.slice(0, end) + `\n\n…（已截断，原始 ${s.length} 字符）`;
}