# stadtwerke/scripts/parse_cologne_serp.py

import json
import csv
import re

# Parse the SERP output file. It is stored with a .txt extension, but its
# content is JSON (confirmed by inspecting the file), so it is read with json.load.

# File opened and JSON-decoded by parse_serp(). This is an absolute,
# machine-specific path; the content is JSON despite the .txt extension.
input_file = r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps\141\output.txt'
# CSV file written by parse_serp(); a relative path, so it is resolved against
# the current working directory.
output_csv = 'cologne_duesseldorf_data/cologne_leads.csv'

def parse_serp(input_path=None, output_path=None):
    """Extract e-mail leads from a SERP JSON dump and write them to a CSV file.

    Only items whose ``type`` is ``'organic'`` are scanned. E-mail addresses
    are searched for in each item's title, description and pre-snippet. Every
    distinct address (compared case-insensitively, first spelling wins)
    becomes one CSV row whose name column is the item's ``website_name``,
    ``domain`` or title, in that order of preference. A summary line and the
    extracted leads are printed to stdout.

    Args:
        input_path: Path of the JSON file to read. Defaults to the
            module-level ``input_file``.
        output_path: Path of the CSV file to write. Defaults to the
            module-level ``output_csv``. A missing parent directory is
            created.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the input file does not contain valid JSON.
    """
    import os  # local import: only needed to create the output directory

    # Resolve defaults at call time so the module globals stay the single
    # source of truth for the script's standard run.
    if input_path is None:
        input_path = input_file
    if output_path is None:
        output_path = output_csv

    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    email_regex = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
    fieldnames = ['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region']

    leads = []
    seen_emails = set()  # case-folded addresses already turned into a lead

    # `or []` also covers an explicit JSON null for "items".
    for item in data.get('items') or []:
        if item.get('type') != 'organic':
            continue

        # `or ''` maps JSON null to an empty string instead of the text "None".
        title = item.get('title') or ''
        desc = item.get('description') or ''
        snippet = item.get('pre_snippet') or ''

        # Use website name, then domain, then title as the Innung name.
        innung_name = item.get('website_name') or item.get('domain') or title

        for email in email_regex.findall(f"{title} {desc} {snippet}"):
            # The pattern allows dots in the local part, so a match can start
            # with stray dots (e.g. an ellipsis glued to the address).
            email = email.strip('.')
            if email.startswith('@'):
                # The local part consisted of dots only: not an address.
                continue

            # Mailbox names are treated as case-insensitive for de-duplication.
            key = email.casefold()
            if key in seen_emails:
                continue
            seen_emails.add(key)

            leads.append({
                'Firm/Innung': innung_name,
                'Contact': "N/A",
                'Email': email,
                'Phone': "N/A",
                'Region': 'Köln',
            })

    # open(..., 'w') does not create directories; make sure the target exists.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads from Cologne SERP.")
    for lead in leads:
        print(f"{lead['Firm/Innung']}: {lead['Email']}")

# Run the extraction only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    parse_serp()