import axios, { AxiosResponse } from 'axios';
import * as cheerio from 'cheerio';
import crypto from 'crypto';

/**
 * Outcome of a single {@link fetchPage} call. Failures are reported through
 * this shape (with `error` set) rather than by throwing.
 */
export interface FetchResult {
  /** Fetched markup: the whole body, or the inner HTML of the first selector match. Empty on failure. */
  html: string;
  /** Trimmed text content extracted from `html`. Empty on failure. */
  text: string;
  /** Hex-encoded SHA-256 digest of `html`. Empty on failure. */
  hash: string;
  /** HTTP status code, or 0 when the failure carries no HTTP response. */
  status: number;
  /** Milliseconds from the start of the call until the response arrived (or the failure was caught). */
  responseTime: number;
  /** Human-readable failure description; present only when the fetch failed. */
  error?: string;
}

/** Request timeout passed to axios, in milliseconds. */
const REQUEST_TIMEOUT_MS = 30000;

/** Maximum number of redirects axios is allowed to follow. */
const MAX_REDIRECTS = 5;

/** Browser-like request headers sent with every fetch. */
const REQUEST_HEADERS = {
  'User-Agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
  Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  'Accept-Language': 'en-US,en;q=0.5',
  'Accept-Encoding': 'gzip, deflate, br',
  Connection: 'keep-alive',
  'Upgrade-Insecure-Requests': '1',
};

/** Builds the uniform "nothing fetched" result used by every failure path. */
function failureResult(responseTime: number, status: number, error: string): FetchResult {
  return { html: '', text: '', hash: '', status, responseTime, error };
}

/** Reads `value[key]` when `value` is an object and the property is a string; otherwise `undefined`. */
function readStringProp(value: unknown, key: string): string | undefined {
  if (typeof value !== 'object' || value === null) {
    return undefined;
  }
  const prop = (value as Record<string, unknown>)[key];
  return typeof prop === 'string' ? prop : undefined;
}

/**
 * Normalizes a response payload to a string so it can be parsed and hashed.
 * Strings pass through untouched; null/undefined become ''; anything else
 * (e.g. a body the HTTP client already parsed as JSON) is re-serialized.
 */
function toHtmlString(data: unknown): string {
  if (typeof data === 'string') {
    return data;
  }
  if (data === null || data === undefined) {
    return '';
  }
  return JSON.stringify(data) ?? '';
}

/**
 * Fetches a page and returns its markup, text content and a content hash.
 *
 * Responses with a status below 500 (including 4xx) are treated as fetched
 * content; 5xx responses and transport errors are reported via `error`.
 * This function does not throw: every failure is converted to a
 * {@link FetchResult} with empty `html`/`text`/`hash`.
 *
 * @param url - Absolute URL to fetch; a malformed URL yields a failure result.
 * @param elementSelector - Optional selector. When given, only the inner HTML
 *   of the first matching element is kept; no match yields a failure result
 *   with the message `Element not found: <selector>`.
 * @returns The fetch outcome, including timing and HTTP status.
 */
export async function fetchPage(
  url: string,
  elementSelector?: string
): Promise<FetchResult> {
  const startTime = Date.now();

  try {
    // Validate URL: the constructor throws on malformed input, handled by the catch below.
    new URL(url);

    const response: AxiosResponse<unknown> = await axios.get<unknown>(url, {
      timeout: REQUEST_TIMEOUT_MS,
      maxRedirects: MAX_REDIRECTS,
      // Ask for the raw body so JSON-looking payloads are not parsed into objects.
      responseType: 'text',
      headers: REQUEST_HEADERS,
      validateStatus: (status: number) => status < 500,
    });

    // Measured before parsing so it reflects network time only.
    const responseTime = Date.now() - startTime;

    let html = toHtmlString(response.data);

    // If an element selector is provided, keep only that element's inner HTML.
    if (elementSelector) {
      const element = cheerio.load(html)(elementSelector);
      if (element.length === 0) {
        throw new Error(`Element not found: ${elementSelector}`);
      }
      html = element.html() ?? '';
    }

    // NOTE(review): unlike extractTextFromHtml, <script>/<style> elements are not
    // removed before text extraction here — confirm whether that is intended.
    const $ = cheerio.load(html);
    const text = $.text().trim();

    return {
      html,
      text,
      hash: calculateHash(html),
      status: response.status,
      responseTime,
    };
  } catch (error: unknown) {
    const responseTime = Date.now() - startTime;

    // The server answered, but with a status rejected by validateStatus (5xx).
    if (axios.isAxiosError(error) && error.response) {
      const { status, statusText } = error.response;
      return failureResult(responseTime, status, `HTTP ${status}: ${statusText}`);
    }

    const code = readStringProp(error, 'code');
    if (code === 'ENOTFOUND') {
      return failureResult(responseTime, 0, 'Domain not found');
    }
    if (code === 'ETIMEDOUT' || code === 'ECONNABORTED') {
      return failureResult(responseTime, 0, 'Request timeout');
    }

    // Covers invalid URLs, missing elements and any other unexpected failure.
    return failureResult(responseTime, 0, readStringProp(error, 'message') || 'Unknown error');
  }
}

/**
 * Returns the trimmed text content of an HTML string, with `<script>` and
 * `<style>` elements removed first.
 *
 * @param html - Markup to extract text from.
 * @returns The remaining text content, trimmed of surrounding whitespace.
 */
export function extractTextFromHtml(html: string): string {
  const $ = cheerio.load(html);

  // Remove script and style elements so their source does not leak into the text.
  $('script').remove();
  $('style').remove();

  return $.text().trim();
}

/**
 * Computes the SHA-256 digest of a string.
 *
 * @param content - Text to hash.
 * @returns The digest as a lowercase hex string.
 */
export function calculateHash(content: string): string {
  return crypto.createHash('sha256').update(content).digest('hex');
}