# 106 lines · 4.2 KiB · Python
import json
import os

import pandas as pd
def normalize(text):
|
|
if not isinstance(text, str):
|
|
return ""
|
|
return text.strip().lower()
|
|
|
|
def main():
|
|
# 1. Load the "Done" list
|
|
leads_csv_path = 'leads/leads.csv'
|
|
if os.path.exists(leads_csv_path):
|
|
leads_df = pd.read_csv(leads_csv_path)
|
|
done_names = set(leads_df['Firm/Innung'].apply(normalize))
|
|
done_emails = set(leads_df['Email'].apply(normalize))
|
|
else:
|
|
done_names = set()
|
|
done_emails = set()
|
|
|
|
missing_duesseldorf = []
|
|
missing_cologne = []
|
|
missing_unterfranken = []
|
|
|
|
# 2. Check Düsseldorf Targets
|
|
duesseldorf_path = 'leads/cologne_duesseldorf_data/duesseldorf_targets.json'
|
|
if os.path.exists(duesseldorf_path):
|
|
with open(duesseldorf_path, 'r', encoding='utf-8') as f:
|
|
targets = json.load(f)
|
|
seen_d = set()
|
|
for t in targets:
|
|
name = t.get('innung', '')
|
|
if normalize(name) not in done_names and normalize(name) not in seen_d:
|
|
missing_duesseldorf.append(t)
|
|
seen_d.add(normalize(name))
|
|
|
|
# 3. Check Cologne/Düsseldorf Raw CSV
|
|
cologne_raw_path = 'leads/raw/innungen_leads_koeln_duesseldorf.csv'
|
|
if os.path.exists(cologne_raw_path):
|
|
cologne_df = pd.read_csv(cologne_raw_path)
|
|
seen_c = set()
|
|
for _, row in cologne_df.iterrows():
|
|
name = row.get('organisation', '')
|
|
region = row.get('region', '')
|
|
email = row.get('email', '')
|
|
|
|
if str(region).lower() == 'koeln':
|
|
if normalize(name) in seen_c:
|
|
continue
|
|
seen_c.add(normalize(name))
|
|
|
|
if pd.isna(email) or email.strip() == '':
|
|
missing_cologne.append({'name': name, 'reason': 'No Email'})
|
|
else:
|
|
if normalize(email) not in done_emails and normalize(name) not in done_names:
|
|
missing_cologne.append({'name': name, 'email': email, 'reason': 'Not in final list'})
|
|
|
|
# 4. Check Unterfranken Raw CSV
|
|
unterfranken_raw_path = 'leads/raw/leads_unterfranken.csv'
|
|
if os.path.exists(unterfranken_raw_path):
|
|
uf_df = pd.read_csv(unterfranken_raw_path)
|
|
name_col = None
|
|
for col in uf_df.columns:
|
|
if 'innung' in col.lower() or 'firm' in col.lower() or 'name' in col.lower():
|
|
name_col = col
|
|
break
|
|
|
|
seen_u = set()
|
|
if name_col:
|
|
for _, row in uf_df.iterrows():
|
|
name = str(row[name_col]).strip()
|
|
# Filter garbage
|
|
if len(name) < 5: continue
|
|
if "regierungsbezirk" in name.lower() and "sitz" in name.lower(): continue # Garbage header line
|
|
|
|
if normalize(name) not in done_names and normalize(name) not in seen_u:
|
|
missing_unterfranken.append(name)
|
|
seen_u.add(normalize(name))
|
|
|
|
# 5. Generate Markdown
|
|
with open('missing_leads.md', 'w', encoding='utf-8') as f:
|
|
f.write('# Missing Leads Report\n\n')
|
|
|
|
f.write(f'## Düsseldorf (Missing: {len(missing_duesseldorf)})\n')
|
|
if not missing_duesseldorf:
|
|
f.write("No missing leads identified (or source file empty).\n")
|
|
for item in missing_duesseldorf:
|
|
f.write(f"- {item.get('innung')} (Contact: {item.get('person', 'N/A')})\n")
|
|
|
|
f.write(f'\n## Cologne (Missing: {len(missing_cologne)})\n')
|
|
if not missing_cologne:
|
|
f.write("No missing leads identified from raw source.\n")
|
|
for item in missing_cologne:
|
|
reason = item.get('reason', '')
|
|
email_part = f" (Email: {item['email']})" if 'email' in item else ""
|
|
f.write(f"- {item.get('name')}{email_part} [{reason}]\n")
|
|
|
|
f.write(f'\n## Unterfranken (Missing: {len(missing_unterfranken)})\n')
|
|
if not missing_unterfranken:
|
|
f.write("All raw Unterfranken leads seem to be in the final list.\n")
|
|
for name in missing_unterfranken:
|
|
f.write(f"- {name}\n")
|
|
|
|
if __name__ == "__main__":
|
|
main()
|