stadtwerke/scripts/extract_emails_direct.py

36 lines
1.2 KiB
Python

import pypdf
import re
# Location of the PDF that extract_emails_direct() opens. The path is
# relative, so it is resolved against the current working directory.
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
def extract_emails_direct():
    """Print every e-mail-like token found in the PDF at ``pdf_path``.

    Opens the PDF with ``pypdf``, prints the page count and a sample (first
    200 characters) of each page's extracted text, then scans the
    concatenated text of all pages for e-mail-like tokens. Each hit is
    printed together with up to 50 characters of surrounding text on either
    side, with newlines in that context flattened to spaces.

    Returns:
        None. All results are written to stdout.

    Raises:
        Nothing. Any exception raised while reading or scanning the PDF is
        caught and reported as ``Error: <message>`` on stdout.
    """
    context_chars = 50  # characters of context shown on each side of a hit
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

    try:
        reader = pypdf.PdfReader(pdf_path)
        print(f"PDF matches {len(reader.pages)} pages.")

        page_texts = []
        for i, page in enumerate(reader.pages):
            # Treat a missing extraction result as empty text so the
            # slicing and concatenation below cannot fail on None.
            text = page.extract_text() or ""
            page_texts.append(text)
            print(f"--- Page {i+1} Text Sample (First 200 chars) ---")
            print(text[:200])
            print("------------------------------------------------")

        # Every page is terminated by a newline, so tokens on adjacent pages
        # never fuse into one match. Joining once avoids repeated string
        # reallocation while accumulating.
        full_text = "".join(f"{text}\n" for text in page_texts)

        # Keep the match objects rather than bare strings: each match knows
        # its own offset, so an address that occurs several times gets the
        # context of that occurrence instead of always the first one.
        matches = list(email_pattern.finditer(full_text))
        print(f"Total extracted text length: {len(full_text)}")
        print(f"Found {len(matches)} emails.")

        for match in matches:
            print(f"Email: {match.group()}")
            start = max(0, match.start() - context_chars)
            end = min(len(full_text), match.end() + context_chars)
            context = full_text[start:end].replace("\n", " ")
            print(f"Context: {context}")
    except Exception as e:
        # Deliberately broad: this is a diagnostic script, so any failure
        # (missing file, unreadable PDF, ...) is reported instead of raised.
        print(f"Error: {e}")
if __name__ == "__main__":
    # Script entry point: run the extraction only when this file is executed
    # directly, not when it is imported as a module.
    extract_emails_direct()