website-monitor/backend/src/services/fetcher.ts

129 lines
2.9 KiB
TypeScript

import axios, { AxiosResponse } from 'axios';
import * as cheerio from 'cheerio';
import crypto from 'crypto';
/**
 * Outcome of a single page fetch as produced by `fetchPage`.
 *
 * Failures are reported through this same shape rather than by throwing:
 * `html`, `text` and `hash` are then empty strings and `error` is set.
 */
export interface FetchResult {
/** Markup that was hashed: the response body, or the inner HTML of the selected element when a selector was given. Empty string on failure. */
html: string;
/** Trimmed text content extracted from `html` with cheerio. Empty string on failure. */
text: string;
/** Hex-encoded SHA-256 digest of `html`. Empty string on failure. */
hash: string;
/** HTTP status code of the response; 0 in failure results that carry no HTTP response status. */
status: number;
/** Elapsed wall-clock time in milliseconds, measured with `Date.now()` from the start of the fetch. */
responseTime: number;
/** Human-readable failure description; absent when the fetch succeeded. */
error?: string;
}
/**
 * Builds the `FetchResult` reported for a failed fetch: empty content fields
 * plus the given status, timing and error description.
 */
function buildFailureResult(
  status: number,
  responseTime: number,
  error: string
): FetchResult {
  return { html: '', text: '', hash: '', status, responseTime, error };
}

/**
 * Downloads `url` and returns its markup, extracted text and a SHA-256 hash of
 * the markup.
 *
 * This function does not throw for fetch problems; every failure (malformed
 * URL, network error, timeout, HTTP status >= 500, selector without a match)
 * is returned as a `FetchResult` with `error` set and empty content fields.
 * Responses with a status below 500 (including 4xx) are treated as successful
 * and their body is processed normally.
 *
 * @param url - Absolute URL to fetch; it is syntax-checked with `new URL`.
 * @param elementSelector - Optional cheerio selector. When given, only the
 *   inner HTML of the matching element(s) is kept, hashed and converted to text.
 * @returns The fetch outcome, including the HTTP status and elapsed time in ms.
 */
export async function fetchPage(
  url: string,
  elementSelector?: string
): Promise<FetchResult> {
  const startTime = Date.now();
  try {
    // Validate URL: `new URL` throws on malformed input, which the catch
    // below turns into a failure result carrying the parser's message.
    new URL(url);
    const response: AxiosResponse = await axios.get(url, {
      timeout: 30000,
      maxRedirects: 5,
      // Ask for the raw body text. With the default response type axios
      // parses JSON bodies into objects, which can be neither parsed as
      // markup nor hashed.
      responseType: 'text',
      headers: {
        'User-Agent':
          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        Accept:
          'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        Connection: 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
      },
      // Only 5xx responses reject; 4xx bodies are still fetched and hashed.
      validateStatus: (status: number) => status < 500,
    });
    const responseTime = Date.now() - startTime;

    // Defensive coercion: guarantee a string even if a response transform
    // still hands back a parsed or empty body.
    const body: unknown = response.data;
    let html: string;
    if (typeof body === 'string') {
      html = body;
    } else if (body == null) {
      html = '';
    } else {
      html = JSON.stringify(body);
    }

    // If element selector is provided, extract only that element
    if (elementSelector) {
      const $page = cheerio.load(html);
      const element = $page(elementSelector);
      if (element.length === 0) {
        // Reported through the generic branch of the catch below.
        throw new Error(`Element not found: ${elementSelector}`);
      }
      html = element.html() ?? '';
    }

    // Extract text content (no script/style removal is applied here)
    const $ = cheerio.load(html);
    const text = $.text().trim();

    // Generate hash of the markup that is returned
    const hash = crypto.createHash('sha256').update(html).digest('hex');

    return {
      html,
      text,
      hash,
      status: response.status,
      responseTime,
    };
  } catch (error: unknown) {
    const responseTime = Date.now() - startTime;

    // Anything can be thrown, so only look at the optional fields this
    // handler needs, and only if the thrown value is an object at all.
    const details = (
      typeof error === 'object' && error !== null ? error : {}
    ) as {
      response?: { status: number; statusText: string };
      code?: string;
      message?: string;
    };

    // The server answered, but with a status rejected by `validateStatus`.
    if (details.response) {
      const { status, statusText } = details.response;
      return buildFailureResult(
        status,
        responseTime,
        `HTTP ${status}: ${statusText}`
      );
    }

    if (details.code === 'ENOTFOUND') {
      return buildFailureResult(0, responseTime, 'Domain not found');
    }

    if (details.code === 'ETIMEDOUT' || details.code === 'ECONNABORTED') {
      return buildFailureResult(0, responseTime, 'Request timeout');
    }

    // `||` on purpose: an empty message also falls back to the generic text.
    return buildFailureResult(
      0,
      responseTime,
      details.message || 'Unknown error'
    );
  }
}
/**
 * Returns the trimmed text content of an HTML string after dropping every
 * `<script>` and `<style>` element, so their contents do not appear in the
 * extracted text.
 *
 * @param html - Markup to parse with cheerio.
 * @returns The remaining text content with surrounding whitespace removed.
 */
export function extractTextFromHtml(html: string): string {
  const parsed = cheerio.load(html);

  // One combined selector removes both kinds of non-visible elements.
  parsed('script, style').remove();

  const rawText = parsed.text();
  return rawText.trim();
}
/**
 * Computes the SHA-256 digest of a string.
 *
 * @param content - Text to hash.
 * @returns The digest as a lowercase hexadecimal string.
 */
export function calculateHash(content: string): string {
  const hasher = crypto.createHash('sha256');
  hasher.update(content);
  return hasher.digest('hex');
}