email-amazon/email-worker-nodejs/parser.ts

/**
 * Email parsing utilities
 *
 * Wraps `mailparser` for parsing raw MIME bytes and provides
 * header sanitization (e.g. Microsoft's malformed Message-IDs).
 */

import { simpleParser, type ParsedMail } from 'mailparser';

// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/**
 * Text and HTML renditions of a message body, as returned by
 * `extractBodyParts`.
 */
export interface BodyParts {
  /**
   * Plain-text body, trimmed. `extractBodyParts` substitutes the placeholder
   * '(No body content)' when the parsed text is missing or blank, so this is
   * never an empty string when produced by that function.
   */
  text: string;
  /** HTML body, or `null` when the parsed message carries no (truthy) HTML part. */
  html: string | null;
}

// ---------------------------------------------------------------------------
// Parser
// ---------------------------------------------------------------------------

/**
 * Turn a raw MIME message into mailparser's `ParsedMail` structure.
 *
 * The bytes are first run through `sanitizeRawHeaders`, which repairs header
 * patterns known to be malformed (Microsoft's `<[uuid]@domain>` Message-IDs),
 * and the result is handed to `simpleParser`.
 *
 * @param raw Raw MIME message bytes.
 * @returns The promise produced by `simpleParser` for the sanitized bytes.
 */
export async function parseEmail(raw: Buffer): Promise<ParsedMail> {
  return simpleParser(sanitizeRawHeaders(raw));
}

/**
 * Pull the plain-text and HTML bodies out of a parsed message.
 *
 * @param parsed Message returned by the parser.
 * @returns `text` is the trimmed plain-text body, or the placeholder
 *   '(No body content)' when that body is absent or blank; `html` is the HTML
 *   body, or `null` when it is absent/falsy.
 */
export function extractBodyParts(parsed: ParsedMail): BodyParts {
  const trimmedText = parsed.text?.trim();
  const markup = parsed.html;

  return {
    text: trimmedText ? trimmedText : '(No body content)',
    html: markup ? markup : null,
  };
}

/**
 * Check if email was already processed by our worker (loop detection).
 *
 * A message counts as processed when it carries a non-empty
 * `X-SES-Worker-Processed` header. The `Auto-Submitted` header is not
 * consulted: an auto-reply is only attributed to this worker when the marker
 * header is also present, so the marker check alone decides the outcome.
 *
 * @param parsed Message returned by the parser.
 * @returns `true` when the worker's marker header is present and truthy.
 */
export function isProcessedByWorker(parsed: ParsedMail): boolean {
  return Boolean(parsed.headers.get('x-ses-worker-processed'));
}

/**
 * Tell whether a message is a bounce notification sent by the SES
 * MAILER-DAEMON.
 *
 * The From header text is compared case-insensitively and must contain both
 * 'mailer-daemon@' and 'amazonses.com'. A message without a From header is
 * never treated as a bounce.
 *
 * @param parsed Message returned by the parser.
 */
export function isSesBounceNotification(parsed: ParsedMail): boolean {
  const sender = parsed.from?.text ?? '';
  const normalized = sender.toLowerCase();

  if (!normalized.includes('mailer-daemon@')) {
    return false;
  }
  return normalized.includes('amazonses.com');
}

/**
 * Get a header value as a string from mailparser's `headers` Map.
 *
 * The lookup is case-insensitive (the name is lower-cased before the Map
 * lookup). A missing header yields the empty string.
 *
 * @param parsed Message returned by the parser.
 * @param name Header name, in any letter case.
 * @returns The header rendered as a string; see `headerValueToString` for how
 *   non-string values are flattened.
 */
export function getHeader(parsed: ParsedMail, name: string): string {
  return headerValueToString(parsed.headers.get(name.toLowerCase()));
}

/**
 * Flatten one header value into a string.
 *
 * - `undefined` / `null`            -> ''
 * - string                          -> returned unchanged
 * - array                           -> each element flattened, joined with ','
 * - object with a `text` field      -> that text (address-style headers)
 * - object with a string `value`    -> that value (structured headers such as
 *                                      Content-Type, which mailparser documents
 *                                      as `{ value, params }`); without this
 *                                      branch they would stringify to
 *                                      '[object Object]'
 * - anything else (e.g. a Date)     -> `String(val)`
 */
function headerValueToString(val: unknown): string {
  if (val === undefined || val === null) return '';
  if (typeof val === 'string') return val;
  if (Array.isArray(val)) return val.map(headerValueToString).join(',');

  if (typeof val === 'object') {
    const fields = val as Record<string, unknown>;
    if ('text' in fields) {
      const text = fields.text;
      if (text === undefined || text === null) return '';
      return typeof text === 'string' ? text : String(text);
    }
    if (typeof fields.value === 'string') return fields.value;
  }

  return String(val);
}

// ---------------------------------------------------------------------------
// Raw header sanitization
// ---------------------------------------------------------------------------

/**
 * Fix known problematic patterns in raw MIME headers BEFORE parsing.
 *
 * Specifically targets Microsoft's `Message-ID: <[uuid]@domain>` which
 * causes strict parsers to crash: the square brackets around the local part
 * are removed, leaving `<uuid@domain>`.
 *
 * Only the header section is inspected — everything before the first blank
 * line, or, when no blank line is found, at most the first 8192 bytes. The
 * rest of the message is carried over untouched.
 *
 * @param raw Raw MIME message bytes.
 * @returns `raw` itself when nothing was changed; otherwise a new Buffer with
 *   the repaired header section followed by the original remainder.
 */
function sanitizeRawHeaders(raw: Buffer): Buffer {
  const headerEnd = findDoubleNewline(raw);
  const headerLen = headerEnd === -1 ? Math.min(raw.length, 8192) : headerEnd;

  // Decode as latin1, not utf-8: latin1 maps every byte to exactly one code
  // unit and back, so raw 8-bit header bytes (or a multi-byte sequence cut by
  // the 8192-byte fallback) survive the decode/encode round trip unchanged.
  // A utf-8 round trip would rewrite such bytes as U+FFFD. The pattern below
  // only involves ASCII, so matching is unaffected.
  const headerStr = raw.subarray(0, headerLen).toString('latin1');

  // Cheap pre-check: the pattern needs both brackets to match at all.
  if (!headerStr.includes('[') || !headerStr.includes(']')) {
    return raw;
  }

  // Fix: Message-ID with square brackets  <[...]@...>
  const fixed = headerStr.replace(
    /^(Message-ID:\s*<?)(\[.*?\])(@[^>]*>?\s*)$/im,
    (_match: string, prefix: string, bracketed: string, suffix: string) =>
      prefix + bracketed.replace(/[\[\]]/g, '') + suffix,
  );
  if (fixed === headerStr) {
    return raw;
  }

  return Buffer.concat([Buffer.from(fixed, 'latin1'), raw.subarray(headerLen)]);
}

/**
 * Locate the blank line that separates the header section from the body.
 *
 * Scans for the first occurrence of either `\r\n\r\n` or `\n\n`.
 *
 * @param buf Raw message bytes.
 * @returns Index of the first byte of the separator sequence (i.e. the
 *   length of the header section without its final line terminator), or -1
 *   when no blank line exists.
 */
function findDoubleNewline(buf: Buffer): number {
  const CR = 0x0d;
  const LF = 0x0a;

  // Stop one byte before the end so the 2-byte `\n\n` form is still found
  // when it sits at the very end of the buffer; the 4-byte form carries its
  // own bounds check.
  for (let i = 0; i + 1 < buf.length; i++) {
    if (buf[i] === LF && buf[i + 1] === LF) {
      return i;
    }
    if (
      i + 3 < buf.length &&
      buf[i] === CR &&
      buf[i + 1] === LF &&
      buf[i + 2] === CR &&
      buf[i + 3] === LF
    ) {
      return i;
    }
  }
  return -1;
}