36 lines
1.2 KiB
Python
36 lines
1.2 KiB
Python
import pypdf
|
|
import re
|
|
|
|
# Relative path of the PDF that extract_emails_direct() opens and scans.
pdf_path = "cologne_duesseldorf_data/duesseldorf_innungen.pdf"
|
|
|
|
def extract_emails_direct():
    """Print every email-like string found in the PDF at ``pdf_path``.

    Opens the PDF with ``pypdf``, prints the page count and the first 200
    characters of each page's extracted text, then scans the concatenated
    page text for email-like strings. For every match it prints the address
    and up to 50 characters of surrounding text on each side, with newlines
    replaced by spaces.

    Nothing is returned. Any exception raised while reading or scanning the
    PDF is caught here and reported on stdout as ``Error: ...`` instead of
    being propagated to the caller.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        print(f"PDF matches {len(reader.pages)} pages.")

        page_texts = []
        for i, page in enumerate(reader.pages):
            # Defensive: treat a missing extraction result as empty text so
            # slicing and joining below cannot fail on a non-string.
            text = page.extract_text() or ""
            page_texts.append(text)
            print(f"--- Page {i+1} Text Sample (First 200 chars) ---")
            print(text[:200])
            print("------------------------------------------------")

        # Every page's text is terminated by a newline; build the string in
        # one pass instead of repeated concatenation.
        full_text = "".join(f"{text}\n" for text in page_texts)

        matches = list(re.finditer(r'[\w\.-]+@[\w\.-]+\.\w+', full_text))
        print(f"Total extracted text length: {len(full_text)}")
        print(f"Found {len(matches)} emails.")

        for match in matches:
            print(f"Email: {match.group()}")
            # Use this match's own offsets. Searching the text again for the
            # address would always return its first occurrence, so repeated
            # addresses would all show the same (wrong) context.
            start = max(0, match.start() - 50)
            end = min(len(full_text), match.end() + 50)
            print(f"Context: {full_text[start:end].replace(chr(10), ' ')}")

    except Exception as e:
        # Top-level boundary of the script: report the failure and return.
        print(f"Error: {e}")
|
|
|
|
# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    extract_emails_direct()
|