import csv
import re
from typing import Dict, List, Optional

from pypdf import PdfReader

# "E-Mail: <address>" anywhere in a line; group 1 is the address itself.
_EMAIL_PATTERN = re.compile(
    r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",
    re.IGNORECASE,
)

# A line *starting* with one of the contact roles (optionally with the
# feminine "-in" suffix); group 1 is whatever follows the colon.
_CONTACT_PATTERN = re.compile(
    r"(?:Obermeister|Ansprechpartner|Kreishandwerksmeister)(?:in)?:\s*(.*)",
    re.IGNORECASE,
)

# "Landkreis:" / "Landkreise:" at the start of a line marks a new Innung
# block; the Innung name is taken from the closest non-empty line above it.
_LANDKREIS_PATTERN = re.compile(r"^Landkreise?:", re.IGNORECASE)

# Page footers look like "5.12.2025 8" (date followed by a page number).
# Day and month may have one or two digits.
_PAGE_FOOTER_PATTERN = re.compile(r"^\d{1,2}\.\d{1,2}\.\d{4}")

_MAX_INNUNG_NAME_LENGTH = 100
_UNKNOWN_INNUNG = "Unbekannte Innung"
_NO_CONTACT = "N/A"
_CSV_FIELDS = ["Firm/Innung", "Contact Person", "Email", "Region"]


def _previous_non_empty(lines: List[str], index: int) -> Optional[str]:
    """Return the closest non-empty line before *index*, or None if there is none."""
    k = index - 1
    while k >= 0 and not lines[k]:
        k -= 1
    return lines[k] if k >= 0 else None


def _is_plausible_innung_name(candidate: str) -> bool:
    """Reject overly long lines and page footers as Innung names."""
    return (
        len(candidate) < _MAX_INNUNG_NAME_LENGTH
        and not _PAGE_FOOTER_PATTERN.match(candidate)
    )


def _parse_leads(lines: List[str], region: str) -> List[Dict[str, str]]:
    """Turn stripped text lines into lead rows.

    The parser is a single linear pass with two pieces of state: the
    current Innung name and the most recently seen contact person.  An
    e-mail line emits a lead using that state.  Each (Innung, e-mail)
    pair is emitted at most once.
    """
    leads: List[Dict[str, str]] = []
    seen = set()
    current_innung = _UNKNOWN_INNUNG
    current_contact = _NO_CONTACT

    for index, line in enumerate(lines):
        if not line:
            continue

        # 1. Innung header: the name sits on the non-empty line above
        #    the "Landkreis:" line.
        if _LANDKREIS_PATTERN.match(line):
            candidate = _previous_non_empty(lines, index)
            if candidate is not None and _is_plausible_innung_name(candidate):
                current_innung = candidate
                # A new Innung must not inherit the previous one's contact.
                current_contact = _NO_CONTACT
            # No `continue`: the same line is still checked below.

        # 2. Contact person: remember it for the e-mail line(s) that follow.
        contact_match = _CONTACT_PATTERN.match(line)
        if contact_match:
            current_contact = contact_match.group(1).strip()
            continue

        # 3. E-mail: this is the trigger that actually produces a lead.
        email_match = _EMAIL_PATTERN.search(line)
        if not email_match:
            continue
        email = email_match.group(1).strip()

        key = (current_innung, email)
        if key in seen:
            continue
        seen.add(key)

        leads.append({
            "Firm/Innung": current_innung,
            "Contact Person": current_contact,
            "Email": email,
            "Region": region,
        })

    return leads


def extract_leads_v2(pdf_path, output_csv, region="Unterfranken"):
    """Extract Innung contact leads from a PDF and write them to a CSV file.

    Args:
        pdf_path: Path of the PDF to read (passed to ``PdfReader``).
        output_csv: Path of the CSV file to write; it is overwritten.
        region: Value written to the "Region" column of every row.

    The CSV is UTF-8 encoded with the columns "Firm/Innung",
    "Contact Person", "Email" and "Region".  Progress is reported on stdout.
    """
    print(f"Extracting from {pdf_path}...")
    reader = PdfReader(pdf_path)

    # Flatten all pages into one list of stripped lines; pages without
    # extractable text are skipped.
    lines: List[str] = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            lines.extend(raw.strip() for raw in page_text.split('\n'))

    leads = _parse_leads(lines, region)

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads to {output_csv}")


if __name__ == "__main__":
    extract_leads_v2("leads/raw/unterfranken.pdf", "leads/raw/leads_unterfranken_v2.csv")