# stadtwerke/scripts/extract_leads.py
#
# File-viewer metadata (copied along with the source): 86 lines, 3.1 KiB, Python

import re
import csv
from pypdf import PdfReader
# Substrings whose presence makes a line a candidate Innung heading.
# Matching is case-sensitive, hence both "Innung" and the compound forms.
_INNUNG_KEYWORDS = (
    "Innung",
    "Kreishandwerkerschaft",
    "Bäckerinnung",
    "Bauinnung",
    "Metzgerinnung",
    "Friseurinnung",
    "Maler",
    "Zimmerer",
)

_EMAIL_RE = re.compile(r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")
_OBERMEISTER_RE = re.compile(r"Obermeister(?:in)?:\s*(.*)")
_ANSPRECHPARTNER_RE = re.compile(r"Ansprechpartner(?:in)?:\s*(.*)")
_KREISHANDWERKSMEISTER_RE = re.compile(r"Kreishandwerksmeister(?:in)?:\s*(.*)")

_CSV_FIELDS = ("Firm/Innung", "Contact Person", "Email", "Region")


def _looks_like_innung_heading(line):
    """Return True if *line* passes the heuristic for an Innung heading.

    A heading is short (< 80 chars), contains one of the known keywords and
    does not read like a sentence (no comma, no " die "/" der ", and no
    " und " away from the first/last five characters).
    """
    if len(line) >= 80 or "," in line:
        return False
    if " und " in line[5:-5] or " die " in line or " der " in line:
        return False
    return any(keyword in line for keyword in _INNUNG_KEYWORDS)


def _parse_leads(lines, region):
    """Turn text lines into lead dicts, one per distinct e-mail address."""
    leads = []
    seen_emails = set()
    current_innung = "Unbekannte Innung"
    current_contact = None

    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        obermeister = _OBERMEISTER_RE.match(line)
        ansprechpartner = _ANSPRECHPARTNER_RE.match(line)
        kreishandwerksmeister = _KREISHANDWERKSMEISTER_RE.match(line)
        email_match = _EMAIL_RE.search(line)

        # A line that *starts* with a known label is data, not a heading.
        # Without this guard a contact such as "Obermeister: Hans Zimmerer"
        # would overwrite the current Innung name because it contains a keyword.
        starts_with_label = bool(
            obermeister
            or ansprechpartner
            or kreishandwerksmeister
            or _EMAIL_RE.match(line)
        )
        if not starts_with_label and _looks_like_innung_heading(line):
            current_innung = line
            current_contact = None  # contacts belong to the Innung they follow

        # Contact precedence: Obermeister and Kreishandwerksmeister always
        # replace the current contact; Ansprechpartner only fills a gap.
        if obermeister:
            current_contact = obermeister.group(1)
        if ansprechpartner and not current_contact:
            current_contact = ansprechpartner.group(1)
        if kreishandwerksmeister:
            current_contact = kreishandwerksmeister.group(1)

        if email_match is None:
            continue
        email = email_match.group(1)

        # De-duplicate case-insensitively but keep the spelling first seen.
        # No further validation is needed: the regex already guarantees an
        # "@" and a dotted domain.
        email_key = email.lower()
        if email_key in seen_emails:
            continue
        seen_emails.add(email_key)

        leads.append({
            "Firm/Innung": current_innung,
            "Contact Person": current_contact if current_contact else "N/A",
            "Email": email,
            "Region": region,
        })

    return leads


def extract_leads(pdf_path, output_csv, *, region="Unterfranken"):
    """Extract e-mail leads from a PDF and write them to a CSV file.

    The text of every page is scanned line by line. Heading-like lines set
    the current Innung, lines starting with "Obermeister:",
    "Ansprechpartner:" or "Kreishandwerksmeister:" (or their "-in" forms)
    set the current contact, and every "E-Mail: <address>" occurrence yields
    one lead unless that address was already seen (compared
    case-insensitively).

    Args:
        pdf_path: PDF to read; passed straight to ``PdfReader``.
        output_csv: Path of the CSV file to create or overwrite (UTF-8).
        region: Value written to the "Region" column of every row.

    Returns:
        None. The leads are written to *output_csv* and a one-line summary
        is printed to stdout.
    """
    reader = PdfReader(pdf_path)

    lines = []
    for page in reader.pages:
        # `or ""` keeps this safe should a page yield no text at all.
        lines.extend((page.extract_text() or "").splitlines())

    leads = _parse_leads(lines, region)

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} unique leads to {output_csv}")
if __name__ == "__main__":
    # Run as a script: read "unterfranken.pdf" and write "leads.csv",
    # both resolved relative to the current working directory.
    extract_leads(pdf_path="unterfranken.pdf", output_csv="leads.csv")