"""Extract Innung contact leads (name, contact person, e-mail) from a PDF into a CSV."""
import csv
import re

from pypdf import PdfReader

# Substrings that mark a line as a likely Innung / Kreishandwerkerschaft header.
_INNUNG_KEYWORDS = (
    "Innung",
    "Kreishandwerkerschaft",
    "Bäckerinnung",
    "Bauinnung",
    "Metzgerinnung",
    "Friseurinnung",
    "Maler",
    "Zimmerer",
)

_EMAIL_PATTERN = re.compile(
    r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
)
_OBERMEISTER_PATTERN = re.compile(r"Obermeister(?:in)?:\s*(.*)")
_ANSPRECHPARTNER_PATTERN = re.compile(r"Ansprechpartner(?:in)?:\s*(.*)")
_KREISHANDWERKSMEISTER_PATTERN = re.compile(r"Kreishandwerksmeister(?:in)?:\s*(.*)")

# Patterns that identify a labelled field line when they match at line start.
_FIELD_PATTERNS = (
    _OBERMEISTER_PATTERN,
    _ANSPRECHPARTNER_PATTERN,
    _KREISHANDWERKSMEISTER_PATTERN,
    _EMAIL_PATTERN,
)

_CSV_FIELDS = ["Firm/Innung", "Contact Person", "Email", "Region"]


def _read_lines(pdf_path):
    """Return the text of every page of *pdf_path*, split into lines."""
    reader = PdfReader(pdf_path)
    # `or ""` guards against a page for which no text could be extracted.
    pages = [(page.extract_text() or "") for page in reader.pages]
    return "\n".join(pages).split("\n")


def _is_innung_header(line):
    """Return True if the stripped *line* looks like an Innung header.

    Heuristic: the line contains one of the known keywords, is shorter than
    80 characters, and does not read like a sentence (no comma, no " und "
    away from the edges, no " die " / " der ").
    """
    # Labelled field lines ("Obermeister: Max Maler", "E-Mail: ...") can
    # contain a keyword by accident; they must never replace the current
    # Innung name or reset the contact person.
    if any(pattern.match(line) for pattern in _FIELD_PATTERNS):
        return False
    if len(line) >= 80 or "," in line:
        return False
    if " und " in line[5:-5]:
        return False
    if " die " in line or " der " in line:
        return False
    return any(keyword in line for keyword in _INNUNG_KEYWORDS)


def _parse_leads(lines, region):
    """Walk *lines* in order and return one lead dict per unique e-mail."""
    leads = []
    seen_emails = set()
    current_innung = "Unbekannte Innung"
    current_contact = None

    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        if _is_innung_header(line):
            current_innung = line
            # A new Innung starts: the previous contact no longer applies.
            current_contact = None

        match_om = _OBERMEISTER_PATTERN.match(line)
        if match_om:
            current_contact = match_om.group(1)

        # An Ansprechpartner only fills in when no contact is known yet.
        match_ap = _ANSPRECHPARTNER_PATTERN.match(line)
        if match_ap and not current_contact:
            current_contact = match_ap.group(1)

        match_khm = _KREISHANDWERKSMEISTER_PATTERN.match(line)
        if match_khm:
            current_contact = match_khm.group(1)

        match_email = _EMAIL_PATTERN.search(line)
        if not match_email:
            continue

        email = match_email.group(1)
        # De-duplicate case-insensitively, but keep the original spelling.
        email_key = email.lower()
        if email_key in seen_emails:
            continue
        seen_emails.add(email_key)

        leads.append({
            "Firm/Innung": current_innung,
            "Contact Person": current_contact if current_contact else "N/A",
            "Email": email,
            "Region": region,
        })

    return leads


def extract_leads(pdf_path, output_csv, region="Unterfranken"):
    """Extract e-mail leads from a PDF and write them to a CSV file.

    The PDF text is scanned line by line. Lines that look like an Innung
    header set the current organisation; "Obermeister:", "Ansprechpartner:"
    and "Kreishandwerksmeister:" lines set the current contact person; every
    "E-Mail: <address>" occurrence produces one lead, unless that address
    (compared case-insensitively) was already seen.

    Args:
        pdf_path: Path of the PDF file to read.
        output_csv: Path of the CSV file to write (overwritten if present).
        region: Value written to the "Region" column of every row.
    """
    leads = _parse_leads(_read_lines(pdf_path), region)

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} unique leads to {output_csv}")


if __name__ == "__main__":
    extract_leads("unterfranken.pdf", "leads.csv")