# Listing metadata from the original source view: 23 lines, 598 B, Python.
"""Extract the text of a PDF with pypdf, preview it, and count e-mail-like strings."""

import re

import pypdf

# PDF to inspect; the path is relative to the current working directory.
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'

try:
    reader = pypdf.PdfReader(pdf_path)

    # Join the extracted text of every page, ending each page's text with a newline.
    text = "".join(page.extract_text() + "\n" for page in reader.pages)

    print(f"Extracted {len(text)} characters.")
    print("--- PREVIEW ---")
    print(text[:1000])  # only the first 1000 characters are shown
    print("--- END PREVIEW ---")

    # Simple regex check for emails: runs of word characters, dots, or hyphens
    # on both sides of an "@". Matches are only counted, not validated.
    email_pattern = re.compile(r'[\w\.-]+@[\w\.-]+')
    emails = email_pattern.findall(text)
    print(f"Found {len(emails)} potential email addresses.")
except Exception as e:
    # Any failure in the steps above (opening, parsing, extracting, printing)
    # is reported on stdout and the script ends normally.
    print(f"Error reading PDF: {e}")