# stadtwerke/scripts/extract_leads_unterfranken_...
#
# 181 lines
# 7.0 KiB
# Python

import re
import csv
from pypdf import PdfReader
# Patterns applied to single, stripped lines of the PDF text dump.
_EMAIL_PATTERN = re.compile(
    r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",
    re.IGNORECASE,
)
# "Obermeister:", "Ansprechpartner:" and "Kreishandwerksmeister:" (each also
# in the "...in:" form) are handled identically, so one pattern covers them.
_CONTACT_PATTERN = re.compile(
    r"(?:Obermeister|Ansprechpartner|Kreishandwerksmeister)(?:in)?:\s*(.*)",
    re.IGNORECASE,
)
_LANDKREIS_PATTERN = re.compile(r"^Landkreis(e?):", re.IGNORECASE)
# Page footers start with a date such as "5.12.2025 8"; day and month may
# have one or two digits.
_DATE_FOOTER_PATTERN = re.compile(r"^\d{1,2}\.\d{1,2}\.\d{4}")

_UNKNOWN_INNUNG = "Unbekannte Innung"
_NO_CONTACT = "N/A"
_MAX_INNUNG_NAME_LENGTH = 100
_CSV_FIELDS = ["Firm/Innung", "Contact Person", "Email", "Region"]


def _find_innung_name(lines, landkreis_index):
    """Return the Innung name belonging to a "Landkreis:" line, or None.

    The name is taken from the nearest non-empty line above
    ``lines[landkreis_index]``. Candidates that are too long or that start
    with a date (page footer) are rejected.
    """
    k = landkreis_index - 1
    while k >= 0 and not lines[k]:
        k -= 1
    if k < 0:
        return None

    candidate = lines[k]
    if len(candidate) >= _MAX_INNUNG_NAME_LENGTH:
        return None
    if _DATE_FOOTER_PATTERN.match(candidate):
        return None
    return candidate


def _parse_leads(raw_lines, region="Unterfranken"):
    """Turn the text lines of the PDF into a list of lead dicts.

    The lines are scanned top to bottom while tracking two pieces of state:
    the current Innung (updated whenever a "Landkreis:" line is found) and
    the most recently named contact person. Every "E-Mail:" occurrence emits
    one lead using that state. Leads are de-duplicated on the
    (Innung, e-mail) pair; the first occurrence wins.
    """
    lines = [raw.strip() for raw in raw_lines]

    leads = []
    seen_combinations = set()
    current_innung = _UNKNOWN_INNUNG
    current_contact = _NO_CONTACT

    for index, line in enumerate(lines):
        if not line:
            continue

        # 1. A "Landkreis:" line marks the start of a new Innung block.
        if _LANDKREIS_PATTERN.match(line):
            innung_name = _find_innung_name(lines, index)
            if innung_name is not None:
                current_innung = innung_name
                # A new Innung must not inherit the previous one's contact.
                current_contact = _NO_CONTACT

        contact_match = _CONTACT_PATTERN.match(line)
        email_match = _EMAIL_PATTERN.search(line)

        # 2. Remember the contact person named on this line.
        if contact_match:
            contact_text = contact_match.group(1)
            if email_match:
                # The e-mail sits on the same line as the contact: keep only
                # the text before the "E-Mail:" label as the name, and still
                # record the e-mail below instead of skipping the line.
                contact_text = line[contact_match.start(1):email_match.start()]
            # An empty name ("Obermeister:" with nothing after it) is
            # reported with the same marker as a missing contact.
            current_contact = contact_text.strip() or _NO_CONTACT

        # 3. An e-mail address is the trigger for emitting a lead.
        if not email_match:
            continue

        email = email_match.group(1).strip()
        combo = (current_innung, email)
        if combo in seen_combinations:
            continue
        seen_combinations.add(combo)

        leads.append({
            "Firm/Innung": current_innung,
            "Contact Person": current_contact,
            "Email": email,
            "Region": region,
        })

    return leads


def extract_leads_v2(pdf_path, output_csv, region="Unterfranken"):
    """Extract Innung contact leads from a PDF and write them to a CSV file.

    Args:
        pdf_path: Path of the PDF to read; passed to ``PdfReader``.
        output_csv: Path of the CSV file to write (UTF-8, overwritten if it
            exists). Columns: "Firm/Innung", "Contact Person", "Email",
            "Region".
        region: Value written to the "Region" column of every lead.

    Returns:
        None. Progress is reported on stdout.
    """
    print(f"Extracting from {pdf_path}...")
    reader = PdfReader(pdf_path)

    text_lines = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:  # skip pages for which no text was extracted
            text_lines.extend(page_text.split('\n'))

    leads = _parse_leads(text_lines, region)

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads to {output_csv}")
if __name__ == "__main__":
    # Script entry point: convert the Unterfranken PDF into the v2 leads CSV.
    extract_leads_v2(
        pdf_path="leads/raw/unterfranken.pdf",
        output_csv="leads/raw/leads_unterfranken_v2.csv",
    )