181 lines
7.0 KiB
Python
181 lines
7.0 KiB
Python
|
|
import re
|
|
import csv
|
|
from pypdf import PdfReader
|
|
|
|
# Label patterns are compiled once at import time; all of them are
# case-insensitive because the labels are matched as they appear in the
# extracted PDF text.
_EMAIL_PATTERN = re.compile(
    r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})",
    re.IGNORECASE,
)
# One pattern for every supported role label (with optional feminine "-in").
_CONTACT_PATTERN = re.compile(
    r"(?:Obermeister|Ansprechpartner|Kreishandwerksmeister)(?:in)?:\s*(.*)",
    re.IGNORECASE,
)
# Used to cut an "E-Mail: ..." tail off a contact name found on the same line.
_EMAIL_LABEL_PATTERN = re.compile(r"\s*E-Mail:", re.IGNORECASE)
_LANDKREIS_PATTERN = re.compile(r"^Landkreis(e?):", re.IGNORECASE)
# Lines such as "5.12.2025 8" style footers that start with a dd.mm.yyyy date.
_DATE_PREFIX_PATTERN = re.compile(r"^\d{2}\.\d{2}\.\d{4}")

_UNKNOWN_INNUNG = "Unbekannte Innung"
_NO_CONTACT = "N/A"
_MAX_INNUNG_NAME_LENGTH = 100
_CSV_FIELDS = ["Firm/Innung", "Contact Person", "Email", "Region"]


def _innung_name_before(lines, index):
    """Return the Innung name that precedes the "Landkreis:" line at *index*.

    The name is taken to be the closest non-empty line above *index*.
    Returns None when there is no such line, or when the candidate is
    100 characters or longer or starts with a dd.mm.yyyy date.
    """
    k = index - 1
    while k >= 0 and not lines[k]:
        k -= 1
    if k < 0:
        return None

    candidate = lines[k]
    if len(candidate) >= _MAX_INNUNG_NAME_LENGTH:
        return None
    if _DATE_PREFIX_PATTERN.match(candidate):
        return None
    return candidate


def _parse_leads(lines, region):
    """Turn stripped text lines into a list of lead dicts (one per e-mail).

    State carried across lines:
      * the current Innung name, updated whenever a "Landkreis:" line is
        preceded by a plausible name line;
      * the most recently seen contact person, reset to "N/A" whenever the
        Innung changes.

    Every "E-Mail:" match produces one lead, de-duplicated on the
    (Innung, e-mail) pair.
    """
    leads = []
    seen_combinations = set()
    current_innung = _UNKNOWN_INNUNG
    current_contact = _NO_CONTACT

    for index, line in enumerate(lines):
        if not line:
            continue

        # 1. Innung header: a "Landkreis:" line marks the start of a new block.
        if _LANDKREIS_PATTERN.match(line):
            name = _innung_name_before(lines, index)
            if name is not None:
                current_innung = name
                current_contact = _NO_CONTACT

        # 2. Contact person. No `continue` here: the same line may also
        #    carry the e-mail address, which must not be lost.
        contact_match = _CONTACT_PATTERN.match(line)
        if contact_match:
            raw_contact = contact_match.group(1)
            # Keep only the name if an "E-Mail:" label follows on this line.
            name_part = _EMAIL_LABEL_PATTERN.split(raw_contact, maxsplit=1)[0]
            name_part = name_part.strip().rstrip(",;").rstrip()
            current_contact = name_part or _NO_CONTACT

        # 3. E-mail: the trigger that actually emits a lead.
        email_match = _EMAIL_PATTERN.search(line)
        if not email_match:
            continue
        email = email_match.group(1).strip()

        combo = (current_innung, email)
        if combo in seen_combinations:
            continue
        seen_combinations.add(combo)

        leads.append({
            "Firm/Innung": current_innung,
            "Contact Person": current_contact,
            "Email": email,
            "Region": region,
        })

    return leads


def extract_leads_v2(pdf_path, output_csv, region="Unterfranken"):
    """Extract Innung contact leads from a PDF and write them to a CSV file.

    The PDF is read with ``PdfReader``; the text of every page is split into
    lines, which are scanned for "Landkreis:" headers, role labels
    (Obermeister / Ansprechpartner / Kreishandwerksmeister) and "E-Mail:"
    entries. One CSV row is written per unique (Innung, e-mail) pair.

    Args:
        pdf_path: Path of the PDF passed to ``PdfReader``.
        output_csv: Path of the CSV file to create (overwritten if present,
            written as UTF-8 with a header row).
        region: Value stored in the "Region" column of every row.

    Returns:
        None. Progress is reported with ``print``.
    """
    print(f"Extracting from {pdf_path}...")
    reader = PdfReader(pdf_path)

    # Collect stripped lines from every page that yields any text.
    lines = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            lines.extend(part.strip() for part in page_text.split("\n"))

    leads = _parse_leads(lines, region)

    with open(output_csv, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=_CSV_FIELDS)
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} leads to {output_csv}")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the extraction on the fixed input/output paths.
    source_pdf = "leads/raw/unterfranken.pdf"
    target_csv = "leads/raw/leads_unterfranken_v2.csv"
    extract_leads_v2(source_pdf, target_csv)
|