# stadtwerke/scripts/extract_leads.py


import re
import csv
from pypdf import PdfReader

# Substrings that mark a line as a candidate Innung / Kreishandwerkerschaft heading.
_INNUNG_KEYWORDS = (
    "Innung",
    "Kreishandwerkerschaft",
    "Bäckerinnung",
    "Bauinnung",
    "Metzgerinnung",
    "Friseurinnung",
    "Maler",
    "Zimmerer",
)

# An e-mail address is only accepted when introduced by the literal "E-Mail:" label.
_EMAIL_PATTERN = re.compile(r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})")
# Contact labels; each must appear at the start of the (stripped) line.
_OBERMEISTER_PATTERN = re.compile(r"Obermeister(?:in)?:\s*(.*)")
_ANSPRECHPARTNER_PATTERN = re.compile(r"Ansprechpartner(?:in)?:\s*(.*)")
_KREISHANDWERKSMEISTER_PATTERN = re.compile(r"Kreishandwerksmeister(?:in)?:\s*(.*)")

_CSV_FIELDS = ["Firm/Innung", "Contact Person", "Email", "Region"]


def _is_innung_heading(line):
    """Return True when *line* looks like a heading rather than running text.

    Heuristic: the line contains one of the known keywords, is shorter than
    80 characters, has no comma, has no " und " away from its edges, and
    contains neither " die " nor " der ".
    """
    return (
        any(keyword in line for keyword in _INNUNG_KEYWORDS)
        and len(line) < 80
        and " und " not in line[5:-5]
        and "," not in line
        and " die " not in line
        and " der " not in line
    )


def _parse_leads(lines, region):
    """Turn extracted text lines into a list of lead dicts, one per unique e-mail."""
    leads = []
    seen_emails = set()
    current_innung = "Unbekannte Innung"
    current_contact = None

    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        if _is_innung_heading(line):
            current_innung = line
            # A new heading starts a new entry; the previous contact no longer applies.
            current_contact = None

        # The three labels are anchored at the line start and begin with
        # different words, so at most one of them can match a given line.
        match_om = _OBERMEISTER_PATTERN.match(line)
        match_ap = _ANSPRECHPARTNER_PATTERN.match(line)
        match_khm = _KREISHANDWERKSMEISTER_PATTERN.match(line)
        if match_om:
            current_contact = match_om.group(1)
        elif match_ap:
            # An Ansprechpartner is only a fallback when no contact is known yet.
            if not current_contact:
                current_contact = match_ap.group(1)
        elif match_khm:
            current_contact = match_khm.group(1)

        # PDF text extraction may merge columns, so a single line can carry
        # several "E-Mail:" entries; collect every one of them.
        for match_email in _EMAIL_PATTERN.finditer(line):
            email = match_email.group(1)
            # Compare case-insensitively so "Info@x.de" and "info@x.de" count
            # as the same lead; the first spelling seen is the one written out.
            email_key = email.casefold()
            if email_key in seen_emails:
                continue
            seen_emails.add(email_key)
            leads.append({
                "Firm/Innung": current_innung,
                "Contact Person": current_contact if current_contact else "N/A",
                "Email": email,
                "Region": region,
            })

    return leads


def extract_leads(pdf_path, output_csv, region="Unterfranken"):
    """Extract contact leads from a PDF and write them to a CSV file.

    The text of every page is scanned line by line. Heading-like lines set the
    current "Firm/Innung"; "Obermeister(in):", "Ansprechpartner(in):" and
    "Kreishandwerksmeister(in):" lines set the current contact person; every
    "E-Mail: <address>" occurrence produces one lead unless that address was
    already seen (compared case-insensitively).

    Args:
        pdf_path: Path of the PDF to read; passed to ``PdfReader``.
        output_csv: Path of the CSV file to create or overwrite (UTF-8).
        region: Value written to the "Region" column of every lead.

    Side effects:
        Writes ``output_csv`` (header row plus one row per lead) and prints a
        one-line summary to stdout.
    """
    reader = PdfReader(pdf_path)

    lines = []
    for page in reader.pages:
        # Guard against a page yielding no text object at all.
        page_text = page.extract_text() or ""
        lines.extend(page_text.split('\n'))

    leads = _parse_leads(lines, region)

    # Write to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} unique leads to {output_csv}")

if __name__ == "__main__":
    # Script entry point: read the PDF from the working directory and write
    # the extracted leads next to it.
    extract_leads(pdf_path="unterfranken.pdf", output_csv="leads.csv")