# stadtwerke/recover_cologne_leads.py
#
# 68 lines
# 2.1 KiB
# Python

import pandas as pd
import os
def normalize(text):
    """Return *text* trimmed and lower-cased for case-insensitive comparison.

    Anything that is not a ``str`` (``None``, numbers, NaN, ...) collapses to
    the empty string.
    """
    if isinstance(text, str):
        cleaned = text.strip()
        return cleaned.lower()
    return ""
def _ends_without_newline(path):
    """Return True if the non-empty file at *path* lacks a final line break."""
    with open(path, 'rb') as fh:
        fh.seek(0, os.SEEK_END)
        if fh.tell() == 0:
            return False
        fh.seek(-1, os.SEEK_END)
        return fh.read(1) not in (b'\n', b'\r')


def recover_cologne(leads_csv_path='leads/leads.csv',
                    cologne_raw_path='leads/raw/innungen_leads_koeln_duesseldorf.csv'):
    """Append Cologne leads from the raw export that are missing in the leads CSV.

    A raw row is taken over when its ``region`` is ``koeln`` (case-insensitive,
    surrounding whitespace ignored), it has a non-empty textual ``email``, and
    neither that email nor its ``organisation`` name already occurs in the
    leads CSV or earlier in the same run. Progress is reported via ``print``.

    Args:
        leads_csv_path: CSV with at least the columns ``Email`` and
            ``Firm/Innung``; new rows are appended to this file.
        cologne_raw_path: Raw CSV read via the columns ``organisation``,
            ``region`` and ``email``.

    Returns:
        None. If either file is missing, a message is printed and nothing is
        written.

    Raises:
        KeyError: If the leads CSV lacks the ``Email`` or ``Firm/Innung`` column.
    """
    if not os.path.exists(leads_csv_path) or not os.path.exists(cologne_raw_path):
        print("Files not found.")
        return

    # Load existing leads and build the lookup sets used for duplicate detection.
    leads_df = pd.read_csv(leads_csv_path)
    done_emails = set(leads_df['Email'].apply(normalize))
    done_names = set(leads_df['Firm/Innung'].apply(normalize))
    # normalize() maps missing cells to ""; an empty key must never make an
    # unrelated raw entry look like a duplicate.
    done_emails.discard("")
    done_names.discard("")

    # Load raw Cologne data
    cologne_df = pd.read_csv(cologne_raw_path)
    new_rows = []
    print(f"Scanning {len(cologne_df)} raw entries...")

    for _, row in cologne_df.iterrows():
        # We only care about Cologne for this recovery
        if normalize(row.get('region', '')) != 'koeln':
            continue

        # Must have a usable email. normalize() yields "" for NaN, blank and
        # non-text cells, so none of those can reach .strip() below.
        raw_email = row.get('email', '')
        email_key = normalize(raw_email)
        if not email_key:
            continue

        name = row.get('organisation', '')
        name_key = normalize(name)

        # Check if already in leads.csv (or already collected in this run)
        if email_key in done_emails or name_key in done_names:
            continue

        # It's a valid new lead!
        new_rows.append({
            'Firm/Innung': name,
            'Contact Person': 'N/A',  # the raw columns read here carry no contact person
            'Email': raw_email.strip(),
            'Region': 'Köln',
        })
        # Prevent dupes within the batch, by the same two keys used above.
        done_emails.add(email_key)
        if name_key:
            done_names.add(name_key)

    print(f"Found {len(new_rows)} new Cologne leads to add.")

    if new_rows:
        # The append writes no header, so the values must follow the existing
        # file's column order; columns the file has but we do not fill stay empty.
        new_df = pd.DataFrame(new_rows).reindex(columns=leads_df.columns)
        # Without a final line break the first new row would be glued onto the
        # last existing record.
        if _ends_without_newline(leads_csv_path):
            with open(leads_csv_path, 'a', newline='') as fh:
                fh.write(os.linesep)
        # Append to CSV
        new_df.to_csv(leads_csv_path, mode='a', header=False, index=False)
        print("Successfully appended to leads.csv")
    else:
        print("No new leads found.")
# Script entry point: run the recovery only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    recover_cologne()