import os

import pandas as pd

DEFAULT_LEADS_CSV = 'leads/leads.csv'
DEFAULT_COLOGNE_RAW_CSV = 'leads/raw/innungen_leads_koeln_duesseldorf.csv'


def normalize(text):
    """Return *text* stripped and lower-cased, or "" for any non-string value.

    Non-strings include the float NaN that pandas uses for empty CSV cells,
    so missing values normalize to the empty string.
    """
    if not isinstance(text, str):
        return ""
    return text.strip().lower()


def _collect_new_leads(raw_df, done_emails, done_names):
    """Build lead rows for Cologne entries of *raw_df* that are not known yet.

    *done_emails* and *done_names* hold normalized keys of existing leads and
    are updated in place so duplicates inside the raw batch are skipped too.
    """
    new_rows = []
    for record in raw_df.to_dict('records'):
        name = record.get('organisation', '')
        email = record.get('email', '')

        # We only care about Cologne for this recovery.
        if normalize(record.get('region', '')) != 'koeln':
            continue

        # Must have a usable email; NaN and non-string cells normalize to "".
        email_key = normalize(email)
        if not email_key:
            continue

        # Already present in leads.csv or earlier in this batch.
        name_key = normalize(name)
        if email_key in done_emails or name_key in done_names:
            continue

        new_rows.append({
            'Firm/Innung': name,
            # Raw data might not have a person, or other columns need checking.
            'Contact Person': 'N/A',
            'Email': email.strip(),
            'Region': 'Köln',
        })
        done_emails.add(email_key)
        if name_key:
            # Blank names are never treated as a duplicate key.
            done_names.add(name_key)
    return new_rows


def _append_rows(csv_path, frame):
    """Append *frame* to *csv_path* without a header row.

    A line break is inserted first when the file does not already end with
    one, so the first appended row cannot merge into the last existing row.
    """
    needs_newline = False
    with open(csv_path, 'rb') as handle:
        handle.seek(0, os.SEEK_END)
        if handle.tell():
            handle.seek(-1, os.SEEK_END)
            needs_newline = handle.read(1) not in (b'\n', b'\r')

    with open(csv_path, 'a', encoding='utf-8', newline='') as handle:
        if needs_newline:
            handle.write('\n')
        frame.to_csv(handle, header=False, index=False)


def recover_cologne(leads_csv_path=DEFAULT_LEADS_CSV,
                    cologne_raw_path=DEFAULT_COLOGNE_RAW_CSV):
    """Append Cologne leads from the raw export that are missing in the leads CSV.

    A raw row is added when its region is 'koeln', it has a non-empty email,
    and neither its email nor its organisation name (compared stripped and
    case-insensitively) is already present in the leads file or earlier in
    the same batch.

    Args:
        leads_csv_path: CSV of existing leads; must contain the columns
            'Email' and 'Firm/Innung'. New rows are appended to this file.
        cologne_raw_path: Raw CSV read via the columns 'organisation',
            'region' and 'email'.

    Returns:
        None. Progress is reported on stdout.
    """
    if not os.path.exists(leads_csv_path) or not os.path.exists(cologne_raw_path):
        print("Files not found.")
        return

    # Load existing leads and index them by normalized email / firm name.
    leads_df = pd.read_csv(leads_csv_path)
    done_emails = set(leads_df['Email'].apply(normalize))
    done_names = set(leads_df['Firm/Innung'].apply(normalize))
    # A blank firm name in leads.csv must not block raw rows without a name.
    done_names.discard("")

    # Load raw Cologne data.
    cologne_df = pd.read_csv(cologne_raw_path)
    print(f"Scanning {len(cologne_df)} raw entries...")

    new_rows = _collect_new_leads(cologne_df, done_emails, done_names)
    print(f"Found {len(new_rows)} new Cologne leads to add.")

    if not new_rows:
        print("No new leads found.")
        return

    # The append writes no header, so match the existing file's column order;
    # columns the new rows do not provide are left empty.
    new_df = pd.DataFrame(new_rows).reindex(columns=leads_df.columns)
    _append_rows(leads_csv_path, new_df)
    print("Successfully appended to leads.csv")


if __name__ == "__main__":
    recover_cologne()