# stadtwerke/scripts/extract_duesseldorf.py

import pypdf
import re
import csv

# Source PDF read by extract_duesseldorf_leads(). The path is relative, so it
# resolves against the current working directory of the process.
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
# Destination CSV. It is opened in 'w' mode, so an existing file is overwritten.
output_csv = 'cologne_duesseldorf_data/duesseldorf_leads.csv'

# E-mail-like tokens. The pattern ends in ``\w+``, so a match can never end in
# a dot and needs no trailing-punctuation cleanup afterwards.
_EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

# CSV column order; also the key set of every lead row.
_LEAD_FIELDS = ['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region']


def _parse_leads(lines):
    """Turn raw text lines into a list of de-duplicated lead rows.

    Args:
        lines: Iterable of text lines (leading/trailing whitespace is ignored).

    Returns:
        A list of dicts keyed by ``_LEAD_FIELDS``, one per distinct e-mail
        address, in order of first appearance. Each row is tagged with the
        most recent heading line seen before (or on) the e-mail's line.
    """
    leads = []
    seen_emails = set()  # O(1) duplicate check instead of rescanning `leads`
    current_innung = "Unknown Innung"

    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        # Heuristic heading detection: a short-ish line that mentions
        # "Innung" or "Verband" and carries no e-mail address.
        mentions_guild = "Innung" in line or "Verband" in line
        if mentions_guild and "@" not in line and len(line) < 100:
            current_innung = line

        for email in _EMAIL_RE.findall(line):
            if email in seen_emails:
                continue
            seen_emails.add(email)
            leads.append({
                'Firm/Innung': current_innung,
                'Contact': "N/A",
                'Email': email,
                'Phone': "N/A",
                'Region': 'Düsseldorf',
            })

    return leads


def extract_duesseldorf_leads():
    """Extract e-mail leads from the Düsseldorf PDF and write them to a CSV.

    Reads ``pdf_path`` with pypdf, collects every distinct e-mail address
    together with the heading it appears under, writes the rows to
    ``output_csv`` (overwriting it), and prints a summary plus the first five
    rows for manual verification.

    Returns:
        None. Any exception raised while reading, parsing or writing is
        caught and reported on stdout rather than propagated.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        # Join once instead of growing a string with += for every page.
        text = "\n".join(page.extract_text() for page in reader.pages)
        leads = _parse_leads(text.split('\n'))

        # Write to CSV
        with open(output_csv, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=_LEAD_FIELDS)
            writer.writeheader()
            writer.writerows(leads)

        print(f"Extracted {len(leads)} leads from Düsseldorf PDF.")
        # Print first 5 for verification
        for lead in leads[:5]:
            print(f"- {lead['Firm/Innung']}: {lead['Email']}")

    except Exception as e:
        # Script-level boundary: report the failure instead of crashing.
        print(f"Error extracting Düsseldorf leads: {e}")

# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    extract_duesseldorf_leads()