"""Build the Düsseldorf "batch 5" lead CSV from saved JSON step outputs.

Each input file is expected to hold a JSON object with an ``items`` list.
The first usable e-mail address found in an item whose ``type`` is
``organic`` becomes the lead for the guild label paired with that file.
"""
import csv
import json
import os
import re

# Directory that holds one sub-folder per step, each with an ``output.txt``.
_STEPS_DIR = (
    r'C:\Users\a931627\.gemini\antigravity\brain'
    r'\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps'
)

# Files from step 255-268 (joined with a literal backslash so the resulting
# strings do not depend on the platform the script runs on).
files = [
    '\\'.join((_STEPS_DIR, str(step), 'output.txt'))
    for step in range(255, 269)
]

output_csv = 'cologne_duesseldorf_data/duesseldorf_batch5.csv'

# Guild labels; names[i] labels the lead extracted from files[i].
names = [
    "Stukkateur",
    "Bootsbauer",
    "Goldschmiede",
    "IT",
    "Kachel",
    "Karosserie",
    "Schneider",
    "Instrumenten",
    "Ortho-Technik",
    "Ortho-Schuh",
    "Parkett",
    "Sattler",
    "Werbe",
    "Zahn",
]

_EMAIL_RE = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')
_CSV_FIELDS = ['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region']
_TEXT_KEYS = ('title', 'description', 'pre_snippet')
# Regex hits ending like this are image file names ("icon@2x.png"), not mail.
_IMAGE_SUFFIXES = ('png', 'jpg', 'jpeg', 'gif', 'svg', 'webp')


def _is_trash(email):
    """Return True for matches that are not usable contact addresses."""
    lowered = email.lower()  # the filter must not depend on letter case
    return lowered.endswith(_IMAGE_SUFFIXES) or 'datenschutz' in lowered


def _first_email(items):
    """Return the first non-trash e-mail in the organic items, or None."""
    for item in items:
        if not isinstance(item, dict) or item.get('type') != 'organic':
            continue
        # Missing or null fields contribute an empty string.
        text = ' '.join(str(item.get(key) or '') for key in _TEXT_KEYS)
        for match in _EMAIL_RE.findall(text):
            email = match.strip('.')
            if not _is_trash(email):
                return email
    return None


def _load_items(file_path):
    """Read *file_path* and return its ``items`` list.

    Raises:
        OSError: the file cannot be opened or read.
        ValueError: the content is not valid UTF-8 JSON, or the JSON root
            is not an object / ``items`` is not a list.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError("JSON root is not an object")
    items = data.get('items') or []
    if not isinstance(items, list):
        raise ValueError("'items' is not a list")
    return items


def parse_batch5(file_paths=None, innung_names=None, csv_path=None):
    """Extract at most one lead per input file and write them to a CSV.

    Args:
        file_paths: JSON files to read; defaults to the module-level
            ``files``.
        innung_names: Guild labels paired positionally with *file_paths*;
            defaults to the module-level ``names``.
        csv_path: Destination CSV; defaults to the module-level
            ``output_csv``. Missing parent directories are created.

    Returns:
        The list of lead dicts that was written (keys as in the CSV header).

    Files that cannot be read or parsed are reported on stdout and skipped,
    so one bad input does not abort the whole batch.
    """
    paths = files if file_paths is None else list(file_paths)
    labels = names if innung_names is None else list(innung_names)
    target = output_csv if csv_path is None else csv_path

    if len(paths) != len(labels):
        # zip() below stops at the shorter sequence; make that visible.
        print(
            f"Warning: {len(paths)} files but {len(labels)} names; "
            "unpaired entries are ignored."
        )

    leads = []
    for file_path, innung_name in zip(paths, labels):
        try:
            items = _load_items(file_path)
        except (OSError, ValueError) as e:
            print(f"Error parsing {file_path}: {e}")
            continue

        email = _first_email(items)
        if email is None:
            continue
        leads.append({
            'Firm/Innung': f"{innung_name} Düsseldorf",
            'Contact': "N/A",
            'Email': email,
            'Phone': "N/A",
            'Region': 'Düsseldorf',
        })

    # Without this, open() fails on a fresh checkout where the output
    # directory does not exist yet.
    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(target, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads from Batch 5.")
    for lead in leads:
        print(f"{lead['Firm/Innung']}: {lead['Email']}")

    return leads


if __name__ == "__main__":
    parse_batch5()