# File-viewer banner captured with the source (not part of the program): 68 lines, 2.1 KiB, Python
import pandas as pd
|
|
import os
|
|
|
|
def normalize(text):
    """Return a comparison key for *text*: stripped and lower-cased.

    Any value that is not a ``str`` (``None``, numbers, the float NaN that
    pandas yields for an empty CSV cell, ...) maps to the empty string, so
    the result is always a ``str`` and callers never have to special-case
    missing values.
    """
    if not isinstance(text, str):
        return ""
    return text.strip().lower()
|
|
|
|
def recover_cologne(
    leads_csv_path='leads/leads.csv',
    cologne_raw_path='leads/raw/innungen_leads_koeln_duesseldorf.csv',
):
    """Append Cologne leads from the raw export that are missing in the leads CSV.

    A raw row is appended when its ``region`` is ``koeln`` (case and
    surrounding whitespace ignored), it has a non-empty ``email``, and
    neither its e-mail nor its ``organisation`` name (both compared via
    ``normalize``) already occurs in the leads CSV or earlier in this run.

    Args:
        leads_csv_path: CSV that is read for existing leads and appended to.
            Must contain the columns ``Email`` and ``Firm/Innung``.
        cologne_raw_path: Raw CSV; the columns ``organisation``, ``region``
            and ``email`` are read from it.

    Returns:
        None. Progress is reported on stdout. If either file is missing,
        "Files not found." is printed and nothing is written.
    """
    if not os.path.exists(leads_csv_path) or not os.path.exists(cologne_raw_path):
        print("Files not found.")
        return

    # Load existing leads
    leads_df = pd.read_csv(leads_csv_path)
    done_emails = set(leads_df['Email'].apply(normalize))
    done_names = set(leads_df['Firm/Innung'].apply(normalize))
    # normalize() turns blank cells into ""; an empty key must never count as
    # "already present", otherwise one blank cell in the leads CSV would
    # suppress every raw row that has no organisation name.
    done_emails.discard("")
    done_names.discard("")

    # Load raw Cologne data
    cologne_df = pd.read_csv(cologne_raw_path)

    new_rows = []

    print(f"Scanning {len(cologne_df)} raw entries...")

    for _, row in cologne_df.iterrows():
        name = row.get('organisation', '')
        region = row.get('region', '')
        email = row.get('email', '')

        # We only care about Cologne for this recovery
        if normalize(region) != 'koeln':
            continue

        # Must have a usable email. normalize() yields "" for NaN, blank and
        # non-string cells alike, so no .strip() is ever called on a non-str.
        email_key = normalize(email)
        if not email_key:
            continue

        # Check if already in the leads CSV (or added earlier in this batch)
        name_key = normalize(name)
        if email_key in done_emails or name_key in done_names:
            continue

        # It's a valid new lead!
        new_rows.append({
            'Firm/Innung': name,
            'Contact Person': 'N/A',  # Raw data might not have person, or we need to check columns
            'Email': email.strip(),
            'Region': 'Köln',
        })

        # Prevent dupes within the batch on both keys used for matching
        done_emails.add(email_key)
        if name_key:
            done_names.add(name_key)

    print(f"Found {len(new_rows)} new Cologne leads to add.")

    if not new_rows:
        print("No new leads found.")
        return

    # Rows are appended without a header, so they must follow the existing
    # file's column order exactly; columns we have no value for stay empty.
    new_df = pd.DataFrame(new_rows).reindex(columns=leads_df.columns)

    # If the file does not end with a line break, the first appended row
    # would be glued onto the last existing record.
    with open(leads_csv_path, 'rb') as existing:
        existing.seek(0, os.SEEK_END)
        needs_newline = False
        if existing.tell() > 0:
            existing.seek(-1, os.SEEK_END)
            needs_newline = existing.read(1) not in (b'\n', b'\r')

    # newline='' leaves line endings to pandas, as when it opens the path itself.
    with open(leads_csv_path, 'a', encoding='utf-8', newline='') as out:
        if needs_newline:
            out.write(os.linesep)
        new_df.to_csv(out, header=False, index=False)

    print("Successfully appended to leads.csv")
|
|
|
|
# Run the recovery only when executed as a script, not when imported.
if __name__ == "__main__":
    recover_cologne()
|