"""Extract e-mail leads from the Düsseldorf Innungen PDF and save them as CSV."""

import csv
import re

import pypdf

pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
output_csv = 'cologne_duesseldorf_data/duesseldorf_leads.csv'

# Compiled once at import time. The pattern always ends in ``\.\w+``, so a
# match can never end with a dot and no trailing-dot cleanup is needed.
_EMAIL_REGEX = re.compile(r'[\w\.-]+@[\w\.-]+\.\w+')

# Column order of the generated CSV file.
_CSV_FIELDS = ['Firm/Innung', 'Contact', 'Email', 'Phone', 'Region']


def _read_pdf_text(path):
    """Return the extracted text of every page of the PDF at *path*, newline-joined."""
    reader = pypdf.PdfReader(path)
    return "\n".join(page.extract_text() for page in reader.pages)


def _parse_leads(text):
    """Build one lead dict per distinct e-mail address found in *text*.

    The text is scanned line by line. A line shorter than 100 characters
    that contains "Innung" or "Verband" and no "@" is treated as a heading
    (a heuristic); every address found afterwards is attributed to the most
    recent heading, or to "Unknown Innung" if none has been seen yet.

    Returns:
        A list of dicts with the keys 'Firm/Innung', 'Contact', 'Email',
        'Phone' and 'Region', in order of first appearance of each address.
    """
    leads = []
    seen_emails = set()  # O(1) duplicate check instead of rescanning ``leads``
    current_innung = "Unknown Innung"

    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        # Heuristic heading detection: short line naming an Innung/Verband
        # that does not itself carry an e-mail address.
        if ("Innung" in line or "Verband" in line) and "@" not in line and len(line) < 100:
            current_innung = line

        for email in _EMAIL_REGEX.findall(line):
            if email in seen_emails:
                continue
            seen_emails.add(email)
            leads.append({
                'Firm/Innung': current_innung,
                'Contact': "N/A",
                'Email': email,
                'Phone': "N/A",
                'Region': 'Düsseldorf',
            })

    return leads


def _write_leads_csv(leads, path):
    """Write *leads* to *path* as a UTF-8 CSV file with a header row."""
    with open(path, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(leads)


def extract_duesseldorf_leads():
    """Read ``pdf_path``, extract e-mail leads and write them to ``output_csv``.

    Prints the number of extracted leads followed by the first five entries
    for a quick visual check. Any exception raised along the way is caught
    and reported on stdout rather than propagated; nothing is returned.
    """
    try:
        text = _read_pdf_text(pdf_path)
        leads = _parse_leads(text)
        _write_leads_csv(leads, output_csv)

        print(f"Extracted {len(leads)} leads from Düsseldorf PDF.")

        # Print first 5 for verification
        for lead in leads[:5]:
            print(f"- {lead['Firm/Innung']}: {lead['Email']}")
    except Exception as e:
        # Script-level boundary: keep the best-effort behaviour of reporting
        # the failure instead of crashing with a traceback.
        print(f"Error extracting Düsseldorf leads: {e}")


if __name__ == "__main__":
    extract_duesseldorf_leads()