stadtwerke/scripts/deduplicate_leads.py

23 lines
658 B
Python

import pandas as pd
def deduplicate_leads(filepath='leads/leads.csv', column='Firm/Innung'):
    """Remove duplicate leads from a CSV file and rewrite the file in place.

    Two rows are duplicates when they hold the same value in *column*; the
    first occurrence is kept and later ones are dropped. Rows whose *column*
    value is missing are never considered duplicates of one another, so
    leads without a firm name are all preserved.

    Args:
        filepath: Path of the CSV file to read and overwrite.
        column: Name of the column that identifies a lead.

    Returns:
        The number of rows that were removed.

    Raises:
        KeyError: If *column* is not present in the CSV header.
    """
    df = pd.read_csv(filepath)
    if column not in df.columns:
        raise KeyError(f"Column {column!r} not found in {filepath!r}")

    # pandas treats missing values as equal to each other when looking for
    # duplicates, which would collapse every lead without a firm name into a
    # single row. Only flag rows that actually carry a value in the key column.
    is_duplicate = df.duplicated(subset=[column], keep='first') & df[column].notna()
    removed = int(is_duplicate.sum())

    df_dedup = df[~is_duplicate]
    print(f"Removed {removed} duplicates.")
    df_dedup.to_csv(filepath, index=False)
    print("Deduplication complete.")
    return removed
# Script entry point: run the deduplication only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    deduplicate_leads()