hot reload & better pooling

This commit is contained in:
Andreas Knuth 2026-01-10 18:03:08 -06:00
parent 06195b9a60
commit f95461ad75
1 changed files with 207 additions and 79 deletions

View File

@ -161,7 +161,7 @@ except Exception as e:
# ============================================ # ============================================
class SMTPPool: class SMTPPool:
"""Thread-safe SMTP Connection Pool""" """Thread-safe SMTP Connection Pool with robust connection handling"""
def __init__(self, host: str, port: int, pool_size: int = 5): def __init__(self, host: str, port: int, pool_size: int = 5):
self.host = host self.host = host
@ -181,42 +181,71 @@ class SMTPPool:
conn.ehlo() conn.ehlo()
if config.smtp_user and config.smtp_pass: if config.smtp_user and config.smtp_pass:
conn.login(config.smtp_user, config.smtp_pass) conn.login(config.smtp_user, config.smtp_pass)
log(f" 📡 New SMTP connection created to {self.host}:{self.port}")
return conn return conn
except Exception as e: except Exception as e:
log(f"Failed to create SMTP connection: {e}", 'ERROR') log(f"Failed to create SMTP connection: {e}", 'ERROR')
return None return None
def _test_connection(self, conn: smtplib.SMTP) -> bool:
"""Test if connection is still alive"""
try:
status = conn.noop()[0]
return status == 250
except Exception:
return False
def initialize(self): def initialize(self):
"""Pre-create connections""" """Pre-create connections"""
if self._initialized: if self._initialized:
return return
for _ in range(self.pool_size): # Nur 1-2 Connections initial, Rest on-demand
for _ in range(min(2, self.pool_size)):
conn = self._create_connection() conn = self._create_connection()
if conn: if conn:
self._pool.put(conn) self._pool.put(conn)
self._initialized = True self._initialized = True
log(f"SMTP pool initialized with {self._pool.qsize()} connections") log(f"SMTP pool initialized with {self._pool.qsize()} connections (max: {self.pool_size})")
def get_connection(self, timeout: float = 5.0) -> Optional[smtplib.SMTP]: def get_connection(self, timeout: float = 5.0) -> Optional[smtplib.SMTP]:
"""Get connection from pool""" """Get a valid connection from pool or create new one"""
# Versuche aus Pool zu holen
try: try:
conn = self._pool.get(timeout=timeout) conn = self._pool.get(block=False)
try: # Teste ob Connection noch lebt
conn.noop() if self._test_connection(conn):
return conn return conn
else:
# Connection ist tot, schließen und neue erstellen
try:
conn.quit()
except: except:
conn = self._create_connection() pass
return conn log(f" ♻ Recycled stale SMTP connection")
return self._create_connection()
except Empty: except Empty:
# Pool leer, neue Connection erstellen
return self._create_connection() return self._create_connection()
def return_connection(self, conn: smtplib.SMTP): def return_connection(self, conn: smtplib.SMTP):
"""Return connection to pool""" """Return connection to pool if still valid"""
if conn is None: if conn is None:
return return
# Prüfen ob Connection noch gut ist
if not self._test_connection(conn):
try:
conn.quit()
except:
pass
log(f" 🗑 Discarded broken SMTP connection")
return
# Versuche zurück in Pool zu legen
try: try:
self._pool.put_nowait(conn) self._pool.put_nowait(conn)
except: except:
# Pool voll, Connection schließen
try: try:
conn.quit() conn.quit()
except: except:
@ -651,14 +680,29 @@ def is_permanent_recipient_error(error_msg: str) -> bool:
error_lower = error_msg.lower() error_lower = error_msg.lower()
return any(indicator in error_lower for indicator in permanent_indicators) return any(indicator in error_lower for indicator in permanent_indicators)
def send_email_to_recipient(from_addr: str, recipient: str, raw_message: bytes, smtp_conn: smtplib.SMTP, worker_name: str) -> Tuple[bool, Optional[str], bool]: def send_email_to_recipient(from_addr: str, recipient: str, raw_message: bytes, worker_name: str, max_retries: int = 2) -> Tuple[bool, Optional[str], bool]:
""" """
Sendet E-Mail via SMTP an EINEN Empfänger Sendet E-Mail via SMTP an EINEN Empfänger
Mit Retry-Logik bei Connection-Fehlern
Returns: (success: bool, error: str or None, is_permanent: bool) Returns: (success: bool, error: str or None, is_permanent: bool)
""" """
last_error = None
for attempt in range(max_retries + 1):
smtp_conn = smtp_pool.get_connection()
if not smtp_conn:
last_error = "Could not get SMTP connection"
log(f"{recipient}: No SMTP connection (attempt {attempt + 1}/{max_retries + 1})", 'WARNING', worker_name)
time.sleep(0.5)
continue
try: try:
result = smtp_conn.sendmail(from_addr, [recipient], raw_message) result = smtp_conn.sendmail(from_addr, [recipient], raw_message)
# Connection war erfolgreich, zurück in Pool
smtp_pool.return_connection(smtp_conn)
if isinstance(result, dict) and result: if isinstance(result, dict) and result:
error = str(result.get(recipient, 'Unknown refusal')) error = str(result.get(recipient, 'Unknown refusal'))
is_permanent = is_permanent_recipient_error(error) is_permanent = is_permanent_recipient_error(error)
@ -668,7 +712,20 @@ def send_email_to_recipient(from_addr: str, recipient: str, raw_message: bytes,
log(f"{recipient}: Delivered", 'SUCCESS', worker_name) log(f"{recipient}: Delivered", 'SUCCESS', worker_name)
return True, None, False return True, None, False
except smtplib.SMTPServerDisconnected as e:
# Connection wurde geschlossen - Retry mit neuer Connection
log(f"{recipient}: Connection lost, retrying... (attempt {attempt + 1}/{max_retries + 1})", 'WARNING', worker_name)
last_error = str(e)
# Connection nicht zurückgeben (ist kaputt)
try:
smtp_conn.quit()
except:
pass
time.sleep(0.3)
continue
except smtplib.SMTPRecipientsRefused as e: except smtplib.SMTPRecipientsRefused as e:
smtp_pool.return_connection(smtp_conn)
error_msg = str(e) error_msg = str(e)
is_permanent = is_permanent_recipient_error(error_msg) is_permanent = is_permanent_recipient_error(error_msg)
log(f"{recipient}: Recipients refused - {error_msg}", 'ERROR', worker_name) log(f"{recipient}: Recipients refused - {error_msg}", 'ERROR', worker_name)
@ -676,14 +733,35 @@ def send_email_to_recipient(from_addr: str, recipient: str, raw_message: bytes,
except smtplib.SMTPException as e: except smtplib.SMTPException as e:
error_msg = str(e) error_msg = str(e)
# Bei Connection-Fehlern: Retry
if 'disconnect' in error_msg.lower() or 'closed' in error_msg.lower() or 'connection' in error_msg.lower():
log(f"{recipient}: SMTP connection error, retrying... (attempt {attempt + 1}/{max_retries + 1})", 'WARNING', worker_name)
last_error = error_msg
try:
smtp_conn.quit()
except:
pass
time.sleep(0.3)
continue
smtp_pool.return_connection(smtp_conn)
is_permanent = is_permanent_recipient_error(error_msg) is_permanent = is_permanent_recipient_error(error_msg)
log(f"{recipient}: SMTP error - {error_msg}", 'ERROR', worker_name) log(f"{recipient}: SMTP error - {error_msg}", 'ERROR', worker_name)
return False, error_msg, is_permanent return False, error_msg, is_permanent
except Exception as e: except Exception as e:
log(f"{recipient}: Connection error - {e}", 'ERROR', worker_name) # Unbekannter Fehler - Connection verwerfen, aber nicht permanent
try:
smtp_conn.quit()
except:
pass
log(f"{recipient}: Unexpected error - {e}", 'ERROR', worker_name)
return False, str(e), False return False, str(e), False
# Alle Retries fehlgeschlagen
log(f"{recipient}: All retries failed - {last_error}", 'ERROR', worker_name)
return False, last_error or "Connection failed after retries", False
# ============================================ # ============================================
# S3 METADATA UPDATES (wie in domain-worker) # S3 METADATA UPDATES (wie in domain-worker)
# ============================================ # ============================================
@ -872,14 +950,8 @@ def process_message(domain: str, message: dict, receive_count: int) -> bool:
failed_permanent = [] failed_permanent = []
failed_temporary = [] failed_temporary = []
smtp_conn = smtp_pool.get_connection()
if not smtp_conn:
log("❌ Could not get SMTP connection", 'ERROR', worker_name)
return False
try:
for recipient in recipients: for recipient in recipients:
success, error, is_perm = send_email_to_recipient(from_addr_final, recipient, raw_bytes, smtp_conn, worker_name) success, error, is_perm = send_email_to_recipient(from_addr_final, recipient, raw_bytes, worker_name)
if success: if success:
successful.append(recipient) successful.append(recipient)
@ -893,8 +965,6 @@ def process_message(domain: str, message: dict, receive_count: int) -> bool:
failed_temporary.append(recipient) failed_temporary.append(recipient)
if PROMETHEUS_ENABLED: if PROMETHEUS_ENABLED:
emails_processed.labels(domain=domain, status='temporary_failure').inc() emails_processed.labels(domain=domain, status='temporary_failure').inc()
finally:
smtp_pool.return_connection(smtp_conn)
# 7. RESULTAT & CLEANUP # 7. RESULTAT & CLEANUP
log(f"📊 Results: {len(successful)} OK, {len(failed_temporary)} TempFail, {len(failed_permanent)} PermFail", 'INFO', worker_name) log(f"📊 Results: {len(successful)} OK, {len(failed_temporary)} TempFail, {len(failed_permanent)} PermFail", 'INFO', worker_name)
@ -1028,6 +1098,9 @@ class UnifiedWorker:
self.executor: Optional[ThreadPoolExecutor] = None self.executor: Optional[ThreadPoolExecutor] = None
self.domains: List[str] = [] self.domains: List[str] = []
self.queue_urls: Dict[str, str] = {} self.queue_urls: Dict[str, str] = {}
self.active_pollers: Dict[str, threading.Thread] = {}
self.poller_lock = threading.Lock()
self.reload_interval = int(os.environ.get('DOMAIN_RELOAD_INTERVAL', '60')) # Sekunden
def setup(self): def setup(self):
"""Initialize worker""" """Initialize worker"""
@ -1056,35 +1129,90 @@ class UnifiedWorker:
log(f"Initialized with {len(self.queue_urls)} domains") log(f"Initialized with {len(self.queue_urls)} domains")
def start(self): def start(self):
"""Start all domain pollers""" """Start all domain pollers with hot-reload support"""
self.executor = ThreadPoolExecutor( # Initial pollers starten
max_workers=len(self.queue_urls),
thread_name_prefix='poller'
)
futures = []
for domain, queue_url in self.queue_urls.items(): for domain, queue_url in self.queue_urls.items():
future = self.executor.submit(poll_domain, domain, queue_url, self.stop_event) self._start_poller(domain, queue_url)
futures.append(future)
log(f"Started {len(futures)} domain pollers") log(f"Started {len(self.active_pollers)} domain pollers")
# Domain reload thread starten
reload_thread = threading.Thread(
target=self._domain_reload_loop,
name='domain-reloader',
daemon=True
)
reload_thread.start()
log(f"Domain hot-reload enabled (checking every {self.reload_interval}s)")
# Warten bis Shutdown
try: try:
for future in as_completed(futures): while not self.stop_event.is_set():
try: self.stop_event.wait(timeout=1)
future.result()
except Exception as e:
log(f"Poller error: {e}", 'ERROR')
except KeyboardInterrupt: except KeyboardInterrupt:
pass pass
def _start_poller(self, domain: str, queue_url: str):
"""Start a single domain poller thread"""
with self.poller_lock:
if domain in self.active_pollers:
return # Already running
thread = threading.Thread(
target=poll_domain,
args=(domain, queue_url, self.stop_event),
name=f"poller-{domain}",
daemon=True
)
thread.start()
self.active_pollers[domain] = thread
log(f"🚀 Started poller for {domain}", 'INFO', f"worker-{domain}")
def _domain_reload_loop(self):
"""Periodically check for new domains and start pollers"""
while not self.stop_event.is_set():
self.stop_event.wait(timeout=self.reload_interval)
if self.stop_event.is_set():
break
try:
# Aktuelle Domains laden
current_domains = load_domains()
# Neue Domains finden
new_domains = set(current_domains) - set(self.queue_urls.keys())
if new_domains:
log(f"🔄 Hot-reload: Found {len(new_domains)} new domain(s): {', '.join(new_domains)}")
for domain in new_domains:
queue_url = get_queue_url(domain)
if queue_url:
self.queue_urls[domain] = queue_url
self._start_poller(domain, queue_url)
else:
log(f"⚠ Could not find queue for new domain: {domain}", 'WARNING')
# Cleanup: Entfernte Domains (optional - Threads laufen weiter bis restart)
removed_domains = set(self.queue_urls.keys()) - set(current_domains)
if removed_domains:
log(f" Domains removed from config (will stop on restart): {', '.join(removed_domains)}")
except Exception as e:
log(f"Error in domain reload: {e}", 'ERROR')
def stop(self): def stop(self):
"""Stop gracefully""" """Stop gracefully"""
log("⚠ Stopping worker...") log("⚠ Stopping worker...")
self.stop_event.set() self.stop_event.set()
if self.executor: # Warten auf Poller-Threads (max 10 Sekunden)
self.executor.shutdown(wait=True, cancel_futures=False) with self.poller_lock:
for domain, thread in self.active_pollers.items():
thread.join(timeout=10)
if thread.is_alive():
log(f"Warning: Poller for {domain} did not stop gracefully", 'WARNING')
smtp_pool.close_all() smtp_pool.close_all()
log("👋 Worker stopped") log("👋 Worker stopped")
@ -1163,7 +1291,7 @@ def main():
log(f"🚀 UNIFIED EMAIL WORKER (Full Featured)") log(f"🚀 UNIFIED EMAIL WORKER (Full Featured)")
log(f"{'='*70}") log(f"{'='*70}")
log(f" Domains: {len(worker.queue_urls)}") log(f" Domains: {len(worker.queue_urls)}")
log(f" Threads: {config.worker_threads}") log(f" Hot-Reload: Every {worker.reload_interval}s")
log(f" DynamoDB: {'Connected' if DYNAMODB_AVAILABLE else 'Not Available'}") log(f" DynamoDB: {'Connected' if DYNAMODB_AVAILABLE else 'Not Available'}")
log(f" SMTP Pool: {config.smtp_pool_size} connections -> {config.smtp_host}:{config.smtp_port}") log(f" SMTP Pool: {config.smtp_pool_size} connections -> {config.smtp_host}:{config.smtp_port}")
log(f" Poll Interval: {config.poll_interval}s") log(f" Poll Interval: {config.poll_interval}s")