"""Sanity-check text extraction from the Duesseldorf Innungen PDF.

Reads every page of the PDF at ``pdf_path``, concatenates the extracted
text (one trailing newline per page), prints the total character count and
a preview of the first 1000 characters, and reports how many email-like
substrings a simple regex finds in the text.
"""

import re

import pypdf

pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'

try:
    reader = pypdf.PdfReader(pdf_path)

    # Build the text in one join instead of repeated `+=` in a loop.
    # The `or ""` is defensive: should a page ever yield None instead of a
    # string, the run continues with an empty page rather than failing.
    text = "".join(
        (page.extract_text() or "") + "\n" for page in reader.pages
    )

    print(f"Extracted {len(text)} characters.")
    print("--- PREVIEW ---")
    print(text[:1000])
    print("--- END PREVIEW ---")

    # Simple regex check for emails. This is deliberately loose: it counts
    # every match (duplicates included) and does not validate addresses.
    emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    print(f"Found {len(emails)} potential email addresses.")
except Exception as e:
    # Top-level boundary of a one-off script: report any failure (missing
    # file, unreadable PDF, output error) instead of showing a traceback.
    print(f"Error reading PDF: {e}")