import re

import pypdf

pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'

# Compiled once at import time so the pattern is not rebuilt on every call.
_EMAIL_PATTERN = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

# Number of characters shown on each side of an address in its context line.
_CONTEXT_CHARS = 50


def extract_emails_direct() -> None:
    """Print every email-like string found in the PDF at ``pdf_path``.

    For each page, a 200-character sample of the extracted text is printed.
    The text of all pages is then joined (one newline after each page) and
    scanned with ``_EMAIL_PATTERN``. Each match is printed together with up
    to ``_CONTEXT_CHARS`` characters of surrounding text on either side,
    with newlines in the context replaced by spaces.

    The function returns nothing; all results go to stdout. Any exception
    raised while reading or scanning the PDF is caught and reported as a
    single ``Error: ...`` line instead of propagating.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        print(f"PDF matches {len(reader.pages)} pages.")

        page_texts = []
        for i, page in enumerate(reader.pages):
            # Guard against a falsy result so slicing and joining never fail.
            text = page.extract_text() or ""
            page_texts.append(text)
            print(f"--- Page {i+1} Text Sample (First 200 chars) ---")
            print(text[:200])
            print("------------------------------------------------")

        # Build the full text in one pass instead of repeated `+=`.
        full_text = "".join(text + "\n" for text in page_texts)

        # Keep the match objects (not just the strings) so each address
        # carries its own position; looking the string up again with
        # str.find() would always land on the first occurrence and show the
        # wrong context for repeated addresses.
        matches = list(_EMAIL_PATTERN.finditer(full_text))

        print(f"Total extracted text length: {len(full_text)}")
        print(f"Found {len(matches)} emails.")

        for match in matches:
            print(f"Email: {match.group(0)}")
            # Clamp the context window to the bounds of the text.
            start = max(0, match.start() - _CONTEXT_CHARS)
            end = min(len(full_text), match.end() + _CONTEXT_CHARS)
            print(f"Context: {full_text[start:end].replace(chr(10), ' ')}")

    except Exception as e:
        # Top-level boundary of a diagnostic script: report and carry on.
        print(f"Error: {e}")


if __name__ == "__main__":
    extract_emails_direct()