78 lines
3.6 KiB
Python
78 lines
3.6 KiB
Python
import json
|
|
import csv
|
|
import re
|
|
import os
|
|
|
|
# Step output files for steps 242 through 251 (one output.txt per step directory).
_STEPS_DIR = r'C:\Users\a931627\.gemini\antigravity\brain\6060ab5d-4406-4d40-803f-c8d1df8bb430\.system_generated\steps'
files = [
    '\\'.join((_STEPS_DIR, str(step), 'output.txt'))
    for step in range(242, 252)
]

# Destination CSV for the extracted leads.
output_csv = 'cologne_duesseldorf_data/duesseldorf_batch3_4.csv'

# Trade names; entry N labels the lead extracted from files[N].
names = [
    "Zimmerer",
    "Glaser",
    "Rollladen",
    "Gebäudereiniger",
    "Augenoptiker",
    "Bäcker",
    "Konditoren",
    "Schornsteinfeger",
    "Steinmetz",
    "Straßenbauer",
]
|
|
|
|
# Compiled once at import time; matches things that look like e-mail addresses.
_EMAIL_REGEX = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

# Matches ending in these are image file names (e.g. "logo@2x.png"), not addresses.
_IMAGE_SUFFIXES = ('png', 'jpg')


def _first_valid_email(items):
    """Return the first acceptable e-mail found in the organic entries of *items*, or None."""
    for item in items:
        if not isinstance(item, dict) or item.get('type') != 'organic':
            continue

        # `or ''` keeps an explicit null from being rendered as the text "None".
        title = item.get('title') or ''
        desc = item.get('description') or ''
        snippet = item.get('pre_snippet') or ''
        full_text = f"{title} {desc} {snippet}"

        for email in _EMAIL_REGEX.findall(full_text):
            lowered = email.lower()
            # Compare case-insensitively so "LOGO@2X.PNG" or "Datenschutz@..." are
            # rejected just like their lower-case forms.
            if lowered.endswith(_IMAGE_SUFFIXES) or 'datenschutz' in lowered:
                continue
            return email
    return None


def parse_batches_3_4(file_paths=None, innung_names=None, csv_path=None):
    """Extract one e-mail lead per input file and write all leads to a CSV file.

    Each input file is read as JSON. The entries under its ``items`` key whose
    ``type`` is ``'organic'`` are scanned (title, description, pre_snippet) and
    the first e-mail address that is not filtered out becomes the lead for
    that file. Files that cannot be read or parsed are reported on stdout and
    skipped; files without any usable address simply contribute no lead.

    Args:
        file_paths: Paths of the JSON files to scan. Defaults to the
            module-level ``files``.
        innung_names: Labels paired positionally with ``file_paths``. Defaults
            to the module-level ``names``.
        csv_path: Destination CSV path; missing parent directories are
            created. Defaults to the module-level ``output_csv``.

    Returns:
        The list of lead dicts that was written to the CSV.

    Raises:
        ValueError: If ``file_paths`` and ``innung_names`` differ in length.
    """
    file_paths = files if file_paths is None else file_paths
    innung_names = names if innung_names is None else innung_names
    csv_path = output_csv if csv_path is None else csv_path

    # Fail loudly on a pairing mismatch instead of surfacing an IndexError
    # that looks like a per-file parse failure.
    if len(file_paths) != len(innung_names):
        raise ValueError(
            f"Got {len(file_paths)} files but {len(innung_names)} names; "
            "each file needs exactly one name."
        )

    leads = []
    for file_path, innung_name in zip(file_paths, innung_names):
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error parsing {file_path}: {e}")
            continue

        items = (data.get('items') or []) if isinstance(data, dict) else None
        if not isinstance(items, list):
            print(f"Error parsing {file_path}: unexpected JSON structure")
            continue

        email = _first_valid_email(items)
        if email is None:
            continue

        leads.append({
            'Firm/Innung': f"{innung_name} Düsseldorf",
            'Contact': "N/A",
            'Email': email,
            'Phone': "N/A",
            'Region': 'Düsseldorf',
        })

    # Create the output directory up front so a missing folder does not throw
    # away the work after all files have been parsed.
    out_dir = os.path.dirname(csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(csv_path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region'])
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads from Batches 3 & 4.")
    for lead in leads:
        print(f"{lead['Firm/Innung']}: {lead['Email']}")

    return leads
|
|
|
|
# Run the extraction only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    parse_batches_3_4()
|