stadtwerke/scripts/preview_duesseldorf_pdf.py

23 lines
598 B
Python

import pypdf
import re
# Preview script: extract the text of the PDF at ``pdf_path``, print the
# first 1000 characters, and count substrings that look like email addresses.
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'

try:
    reader = pypdf.PdfReader(pdf_path)

    # Every page's text is followed by a newline. A single join builds the
    # same string as repeated ``+=`` without re-copying the accumulated text
    # once per page.
    text = "".join(page.extract_text() + "\n" for page in reader.pages)

    print(f"Extracted {len(text)} characters.")
    print("--- PREVIEW ---")
    print(text[:1000])
    print("--- END PREVIEW ---")

    # Deliberately loose pattern ("word chars/dots/dashes @ the same"): the
    # count is a rough sanity check, not a list of validated addresses.
    emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    print(f"Found {len(emails)} potential email addresses.")
except Exception as e:
    # Top-level boundary of a one-off script: report the problem instead of
    # showing a traceback. NOTE: this also catches errors raised by the
    # printing and regex steps above, not only failures while reading the PDF.
    print(f"Error reading PDF: {e}")