# stadtwerke/leads/identify_missing_leads.py
# Compare raw lead sources against the final leads list and report
# any leads that are still missing (written to missing_leads.md).
import pandas as pd
import json
import os
def normalize(text):
    """Return *text* lowercased and stripped; any non-string becomes ""."""
    return text.strip().lower() if isinstance(text, str) else ""
def main():
    """Cross-check raw lead sources against the final leads CSV.

    Reads the "done" list (leads/leads.csv), then scans three raw
    sources (Düsseldorf JSON targets, Cologne/Düsseldorf raw CSV,
    Unterfranken raw CSV) for entries not yet in the done list, and
    writes a Markdown summary to missing_leads.md.

    Returns:
        None. Side effect: writes/overwrites missing_leads.md.
    """
    # 1. Load the "Done" list: normalized names and emails already handled.
    leads_csv_path = 'leads/leads.csv'
    if os.path.exists(leads_csv_path):
        leads_df = pd.read_csv(leads_csv_path)
        done_names = set(leads_df['Firm/Innung'].apply(normalize))
        done_emails = set(leads_df['Email'].apply(normalize))
    else:
        done_names = set()
        done_emails = set()

    missing_duesseldorf = []
    missing_cologne = []
    missing_unterfranken = []

    # 2. Check Düsseldorf targets (JSON list of dicts with an 'innung' key).
    duesseldorf_path = 'leads/cologne_duesseldorf_data/duesseldorf_targets.json'
    if os.path.exists(duesseldorf_path):
        with open(duesseldorf_path, 'r', encoding='utf-8') as f:
            targets = json.load(f)
        seen_d = set()
        for t in targets:
            # Hoist the normalized name: it was recomputed three times per entry.
            norm_name = normalize(t.get('innung', ''))
            if norm_name not in done_names and norm_name not in seen_d:
                missing_duesseldorf.append(t)
                seen_d.add(norm_name)

    # 3. Check Cologne/Düsseldorf raw CSV; only rows with region 'koeln' count.
    cologne_raw_path = 'leads/raw/innungen_leads_koeln_duesseldorf.csv'
    if os.path.exists(cologne_raw_path):
        cologne_df = pd.read_csv(cologne_raw_path)
        seen_c = set()
        for _, row in cologne_df.iterrows():
            name = row.get('organisation', '')
            region = row.get('region', '')
            email = row.get('email', '')
            if str(region).lower() != 'koeln':
                continue
            norm_name = normalize(name)
            if norm_name in seen_c:
                continue
            seen_c.add(norm_name)
            # normalize() tolerates non-string cells; a bare email.strip()
            # would raise AttributeError when pandas parses a cell as a
            # number instead of a string.
            norm_email = normalize(email)
            if pd.isna(email) or norm_email == '':
                missing_cologne.append({'name': name, 'reason': 'No Email'})
            elif norm_email not in done_emails and norm_name not in done_names:
                missing_cologne.append({'name': name, 'email': email, 'reason': 'Not in final list'})

    # 4. Check Unterfranken raw CSV; the name column header varies, so pick
    # the first column whose name mentions innung/firm/name.
    unterfranken_raw_path = 'leads/raw/leads_unterfranken.csv'
    if os.path.exists(unterfranken_raw_path):
        uf_df = pd.read_csv(unterfranken_raw_path)
        name_col = next(
            (col for col in uf_df.columns
             if 'innung' in col.lower() or 'firm' in col.lower() or 'name' in col.lower()),
            None,
        )
        seen_u = set()
        if name_col:
            for _, row in uf_df.iterrows():
                name = str(row[name_col]).strip()
                # Filter garbage rows: too-short fragments and header lines.
                if len(name) < 5:
                    continue
                lowered = name.lower()
                if "regierungsbezirk" in lowered and "sitz" in lowered:
                    continue  # Garbage header line
                norm_name = normalize(name)
                if norm_name not in done_names and norm_name not in seen_u:
                    missing_unterfranken.append(name)
                    seen_u.add(norm_name)

    # 5. Generate the Markdown report.
    with open('missing_leads.md', 'w', encoding='utf-8') as f:
        f.write('# Missing Leads Report\n\n')
        f.write(f'## Düsseldorf (Missing: {len(missing_duesseldorf)})\n')
        if not missing_duesseldorf:
            f.write("No missing leads identified (or source file empty).\n")
        for item in missing_duesseldorf:
            f.write(f"- {item.get('innung')} (Contact: {item.get('person', 'N/A')})\n")
        f.write(f'\n## Cologne (Missing: {len(missing_cologne)})\n')
        if not missing_cologne:
            f.write("No missing leads identified from raw source.\n")
        for item in missing_cologne:
            reason = item.get('reason', '')
            email_part = f" (Email: {item['email']})" if 'email' in item else ""
            f.write(f"- {item.get('name')}{email_part} [{reason}]\n")
        f.write(f'\n## Unterfranken (Missing: {len(missing_unterfranken)})\n')
        if not missing_unterfranken:
            f.write("All raw Unterfranken leads seem to be in the final list.\n")
        for name in missing_unterfranken:
            f.write(f"- {name}\n")


if __name__ == "__main__":
    main()