email-amazon/email-worker-nodejs/parser.ts

121 lines
3.9 KiB
TypeScript

/**
* Email parsing utilities
*
* Wraps `mailparser` for parsing raw MIME bytes and provides
* header sanitization (e.g. Microsoft's malformed Message-IDs).
*/
import { simpleParser, type ParsedMail } from 'mailparser';
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
export interface BodyParts {
text: string;
html: string | null;
}
// ---------------------------------------------------------------------------
// Parser
// ---------------------------------------------------------------------------
/**
* Parse raw email bytes into a ParsedMail object.
* Applies pre-sanitization for known malformed headers before parsing.
*/
export async function parseEmail(raw: Buffer): Promise<ParsedMail> {
// Pre-sanitize: fix Microsoft's [uuid]@domain Message-IDs
const sanitized = sanitizeRawHeaders(raw);
return simpleParser(sanitized);
}
/**
* Extract text and HTML body parts from a parsed email.
*/
export function extractBodyParts(parsed: ParsedMail): BodyParts {
const text = parsed.text?.trim() || '(No body content)';
const html = parsed.html || null;
return { text, html };
}
/**
* Check if email was already processed by our worker (loop detection).
*/
export function isProcessedByWorker(parsed: ParsedMail): boolean {
const headers = parsed.headers;
const xWorker = headers.get('x-ses-worker-processed');
const autoSubmitted = headers.get('auto-submitted');
const isProcessedByUs = !!xWorker;
const isOurAutoReply = autoSubmitted === 'auto-replied' && !!xWorker;
return isProcessedByUs || isOurAutoReply;
}
/**
* Check if email is a SES MAILER-DAEMON bounce notification.
*/
export function isSesBounceNotification(parsed: ParsedMail): boolean {
const from = (parsed.from?.text ?? '').toLowerCase();
return from.includes('mailer-daemon@') && from.includes('amazonses.com');
}
/**
* Get a header value as string. Handles mailparser's headerlines Map.
*/
export function getHeader(parsed: ParsedMail, name: string): string {
const val = parsed.headers.get(name.toLowerCase());
if (val === undefined || val === null) return '';
if (typeof val === 'string') return val;
if (typeof val === 'object' && 'text' in val) return (val as any).text ?? '';
return String(val);
}
// ---------------------------------------------------------------------------
// Raw header sanitization
// ---------------------------------------------------------------------------
/**
* Fix known problematic patterns in raw MIME headers BEFORE parsing.
*
* Specifically targets Microsoft's `Message-ID: <[uuid]@domain>` which
* causes strict parsers to crash.
*/
function sanitizeRawHeaders(raw: Buffer): Buffer {
// We only need to check/fix the header section (before first blank line).
// For efficiency we work on the first ~8KB where headers live.
const headerEnd = findDoubleNewline(raw);
const headerLen = headerEnd === -1 ? Math.min(raw.length, 8192) : headerEnd;
const headerStr = raw.subarray(0, headerLen).toString('utf-8');
// Fix: Message-ID with square brackets <[...]@...>
if (headerStr.includes('[') || headerStr.includes(']')) {
const fixed = headerStr.replace(
/^(Message-ID:\s*<?)(\[.*?\])(@[^>]*>?\s*)$/im,
(_match, prefix, bracketed, suffix) =>
prefix + bracketed.replace(/\[/g, '').replace(/\]/g, '') + suffix,
);
if (fixed !== headerStr) {
return Buffer.concat([
Buffer.from(fixed, 'utf-8'),
raw.subarray(headerLen),
]);
}
}
return raw;
}
function findDoubleNewline(buf: Buffer): number {
// Look for \r\n\r\n or \n\n
for (let i = 0; i < buf.length - 3; i++) {
if (buf[i] === 0x0d && buf[i + 1] === 0x0a && buf[i + 2] === 0x0d && buf[i + 3] === 0x0a) {
return i;
}
if (buf[i] === 0x0a && buf[i + 1] === 0x0a) {
return i;
}
}
return -1;
}