import pandas as pd
import json
import os

# Input locations, relative to the base directory handed to main().
_LEADS_CSV = 'leads/leads.csv'
_DUESSELDORF_JSON = 'leads/cologne_duesseldorf_data/duesseldorf_targets.json'
_COLOGNE_RAW_CSV = 'leads/raw/innungen_leads_koeln_duesseldorf.csv'
_UNTERFRANKEN_RAW_CSV = 'leads/raw/leads_unterfranken.csv'


def normalize(text):
    """Return *text* stripped and lower-cased, or "" for any non-string.

    pandas represents empty CSV cells as float NaN, so blank cells (and any
    other non-string value) normalize to the empty string.
    """
    if not isinstance(text, str):
        return ""
    return text.strip().lower()


def _load_done(leads_csv_path):
    """Return ``(done_names, done_emails)`` read from the final leads CSV.

    Both sets hold normalized values. A missing file yields two empty sets.
    The 'Firm/Innung' and 'Email' columns are required; pandas raises
    KeyError if either is absent.
    """
    if not os.path.exists(leads_csv_path):
        return set(), set()
    leads_df = pd.read_csv(leads_csv_path)
    done_names = set(leads_df['Firm/Innung'].apply(normalize))
    done_emails = set(leads_df['Email'].apply(normalize))
    # Blank cells normalize to ""; a blank must never count as a match for a
    # raw record whose own name or email is blank.
    done_names.discard("")
    done_emails.discard("")
    return done_names, done_emails


def _missing_duesseldorf(targets, done_names):
    """Return the target dicts whose 'innung' is not in *done_names*.

    Targets are de-duplicated on the normalized 'innung' value; the first
    occurrence wins.
    """
    missing = []
    seen = set()
    for target in targets:
        key = normalize(target.get('innung', ''))
        if key in done_names or key in seen:
            continue
        missing.append(target)
        seen.add(key)
    return missing


def _missing_cologne(cologne_df, done_names, done_emails):
    """Return report entries for 'koeln' rows that are not in the final list.

    Rows are de-duplicated on the normalized 'organisation' value. Each entry
    is a dict with 'name' and 'reason', plus 'email' when the row has one.
    """
    missing = []
    seen = set()
    for _, row in cologne_df.iterrows():
        # normalize() also strips, so a cell like " Koeln " still matches.
        if normalize(row.get('region', '')) != 'koeln':
            continue
        name = row.get('organisation', '')
        key = normalize(name)
        if key in seen:
            continue
        seen.add(key)
        email = row.get('email', '')
        # normalize() maps NaN and every other non-string to "", so a column
        # that pandas parsed as numeric cannot crash on .strip().
        email_key = normalize(email)
        if not email_key:
            # NOTE(review): rows without an email are reported even when the
            # name is already in the final list - confirm this is intended.
            missing.append({'name': name, 'reason': 'No Email'})
        elif email_key not in done_emails and key not in done_names:
            missing.append(
                {'name': name, 'email': email, 'reason': 'Not in final list'}
            )
    return missing


def _find_name_column(columns):
    """Return the first column whose name contains 'innung', 'firm' or 'name'."""
    for col in columns:
        lowered = col.lower()
        if 'innung' in lowered or 'firm' in lowered or 'name' in lowered:
            return col
    return None


def _missing_unterfranken(uf_df, done_names):
    """Return the raw Unterfranken names that are not in *done_names*.

    Names shorter than five characters and the header-like line containing
    both "regierungsbezirk" and "sitz" are treated as garbage and skipped.
    """
    name_col = _find_name_column(uf_df.columns)
    if name_col is None:
        return []
    missing = []
    seen = set()
    for _, row in uf_df.iterrows():
        name = str(row[name_col]).strip()
        # Filter garbage; this also drops empty cells, which str() renders
        # as the three-character string "nan".
        if len(name) < 5:
            continue
        key = name.lower()  # same value normalize(name) would produce
        if "regierungsbezirk" in key and "sitz" in key:
            continue  # Garbage header line
        if key in done_names or key in seen:
            continue
        missing.append(name)
        seen.add(key)
    return missing


def _render_report(missing_duesseldorf, missing_cologne, missing_unterfranken):
    """Return the Markdown report text for the three missing-lead lists."""
    parts = ['# Missing Leads Report\n\n']

    parts.append(f'## Düsseldorf (Missing: {len(missing_duesseldorf)})\n')
    if not missing_duesseldorf:
        parts.append("No missing leads identified (or source file empty).\n")
    for item in missing_duesseldorf:
        parts.append(
            f"- {item.get('innung')} (Contact: {item.get('person', 'N/A')})\n"
        )

    parts.append(f'\n## Cologne (Missing: {len(missing_cologne)})\n')
    if not missing_cologne:
        parts.append("No missing leads identified from raw source.\n")
    for item in missing_cologne:
        reason = item.get('reason', '')
        email_part = f" (Email: {item['email']})" if 'email' in item else ""
        parts.append(f"- {item.get('name')}{email_part} [{reason}]\n")

    parts.append(f'\n## Unterfranken (Missing: {len(missing_unterfranken)})\n')
    if not missing_unterfranken:
        parts.append("All raw Unterfranken leads seem to be in the final list.\n")
    for name in missing_unterfranken:
        parts.append(f"- {name}\n")

    return ''.join(parts)


def main(base_dir='.', output_path='missing_leads.md'):
    """Compare the raw lead sources with the final list and write a report.

    Args:
        base_dir: Directory that contains the ``leads/`` tree. The default
            resolves the input files relative to the working directory.
        output_path: Where the Markdown report is written (UTF-8).

    Source files that do not exist are skipped and their report section
    shows an empty result.
    """
    # 1. Load the "Done" list
    done_names, done_emails = _load_done(os.path.join(base_dir, _LEADS_CSV))

    missing_duesseldorf = []
    missing_cologne = []
    missing_unterfranken = []

    # 2. Check Düsseldorf Targets
    duesseldorf_path = os.path.join(base_dir, _DUESSELDORF_JSON)
    if os.path.exists(duesseldorf_path):
        with open(duesseldorf_path, 'r', encoding='utf-8') as f:
            targets = json.load(f)
        missing_duesseldorf = _missing_duesseldorf(targets, done_names)

    # 3. Check Cologne/Düsseldorf Raw CSV
    cologne_raw_path = os.path.join(base_dir, _COLOGNE_RAW_CSV)
    if os.path.exists(cologne_raw_path):
        missing_cologne = _missing_cologne(
            pd.read_csv(cologne_raw_path), done_names, done_emails
        )

    # 4. Check Unterfranken Raw CSV
    unterfranken_raw_path = os.path.join(base_dir, _UNTERFRANKEN_RAW_CSV)
    if os.path.exists(unterfranken_raw_path):
        missing_unterfranken = _missing_unterfranken(
            pd.read_csv(unterfranken_raw_path), done_names
        )

    # 5. Generate Markdown
    report = _render_report(
        missing_duesseldorf, missing_cologne, missing_unterfranken
    )
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(report)


if __name__ == "__main__":
    main()