# stadtwerke/scripts/extract_leads_unterfranken_...


import re
import csv
from pypdf import PdfReader

# Line-level patterns for the labelled fields in the extracted PDF text.
_EMAIL_RE = re.compile(r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})", re.IGNORECASE)
# Obermeister(in) / Ansprechpartner(in) / Kreishandwerksmeister(in) lines are
# all handled the same way, so one anchored pattern covers the three labels.
_CONTACT_RE = re.compile(
    r"(?:Obermeister|Ansprechpartner|Kreishandwerksmeister)(?:in)?:\s*(.*)",
    re.IGNORECASE,
)
_LANDKREIS_RE = re.compile(r"^Landkreis(e?):", re.IGNORECASE)
# Page footers start with a date such as "5.12.2025 8"; day and month may be
# written with one or two digits.
_DATE_PREFIX_RE = re.compile(r"\d{1,2}\.\d{1,2}\.\d{4}")

_MAX_INNUNG_NAME_LEN = 100
_DEFAULT_INNUNG = "Unbekannte Innung"
_DEFAULT_CONTACT = "N/A"
_CSV_FIELDS = ["Firm/Innung", "Contact Person", "Email", "Region"]


def _read_pdf_lines(pdf_path):
    """Return the whitespace-stripped text lines of every page in the PDF."""
    reader = PdfReader(pdf_path)
    lines = []
    for page in reader.pages:
        page_text = page.extract_text()
        # Pages without extractable text are skipped.
        if page_text:
            lines.extend(raw.strip() for raw in page_text.split('\n'))
    return lines


def _innung_name_above(lines, index):
    """Return the Innung name found directly above ``lines[index]``, or None.

    Only the nearest non-empty line above ``index`` is considered. It is
    rejected when it is 100 characters or longer, or when it starts with a
    date (which is how page footers look).
    """
    for k in range(index - 1, -1, -1):
        candidate = lines[k]
        if not candidate:
            continue
        if len(candidate) < _MAX_INNUNG_NAME_LEN and not _DATE_PREFIX_RE.match(candidate):
            return candidate
        return None
    return None


def _parse_leads(lines):
    """Turn stripped text lines into a list of lead dicts, one per unique email.

    State carried while walking the lines:
      * the current Innung, updated whenever a "Landkreis:" header line has a
        plausible name on the line above it;
      * the current contact person, updated on contact-label lines and reset
        whenever the Innung changes.

    A lead is emitted for every "E-Mail:" match, using the state at that
    point. Repeated (Innung, email) pairs are emitted only once.
    """
    leads = []
    seen_combinations = set()
    current_innung = _DEFAULT_INNUNG
    current_contact = _DEFAULT_CONTACT

    for i, line in enumerate(lines):
        if not line:
            continue

        # 1. Innung header: the name sits on the line above "Landkreis:".
        if _LANDKREIS_RE.match(line):
            name = _innung_name_above(lines, i)
            if name is not None:
                current_innung = name
                current_contact = _DEFAULT_CONTACT
            # No `continue`: the header line is still checked for an email.

        # 2. Contact person line: remember it for the emails that follow.
        contact_match = _CONTACT_RE.match(line)
        if contact_match:
            current_contact = contact_match.group(1).strip()
            continue

        # 3. Email line: this is the trigger that actually produces a lead.
        email_match = _EMAIL_RE.search(line)
        if not email_match:
            continue
        email = email_match.group(1).strip()

        combo = (current_innung, email)
        if combo in seen_combinations:
            continue
        seen_combinations.add(combo)

        leads.append({
            "Firm/Innung": current_innung,
            "Contact Person": current_contact,
            "Email": email,
            "Region": "Unterfranken"
        })

    return leads


def extract_leads_v2(pdf_path, output_csv) -> None:
    """Extract Innung contact leads from a PDF and write them to a CSV file.

    The PDF text is read page by page, split into lines and scanned for
    Innung headers, contact-person labels and "E-Mail:" entries. Each unique
    (Innung, email) pair becomes one CSV row with the columns
    "Firm/Innung", "Contact Person", "Email" and "Region".

    Args:
        pdf_path: Path of the PDF to read; passed straight to ``PdfReader``.
        output_csv: Path of the CSV file to create or overwrite (UTF-8).
    """
    print(f"Extracting from {pdf_path}...")
    leads = _parse_leads(_read_pdf_lines(pdf_path))

    # Write to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads to {output_csv}")

if __name__ == "__main__":
    # Script entry point: convert the raw Unterfranken PDF into the v2 leads CSV.
    extract_leads_v2(
        pdf_path="leads/raw/unterfranken.pdf",
        output_csv="leads/raw/leads_unterfranken_v2.csv",
    )