86 lines
3.1 KiB
Python
86 lines
3.1 KiB
Python
|
|
import re
|
|
import csv
|
|
from pypdf import PdfReader
|
|
|
|
# Substrings that mark a line as a possible Innung (guild) heading.
# Matching is case-sensitive, which is why compound names such as
# "Bäckerinnung" are listed in addition to the bare "Innung".
_INNUNG_KEYWORDS = (
    "Innung",
    "Kreishandwerkerschaft",
    "Bäckerinnung",
    "Bauinnung",
    "Metzgerinnung",
    "Friseurinnung",
    "Maler",
    "Zimmerer",
)

_EMAIL_PATTERN = re.compile(
    r"E-Mail:\s*([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
)
_OBERMEISTER_PATTERN = re.compile(r"Obermeister(?:in)?:\s*(.*)")
_ANSPRECHPARTNER_PATTERN = re.compile(r"Ansprechpartner(?:in)?:\s*(.*)")
_KREISHANDWERKSMEISTER_PATTERN = re.compile(
    r"Kreishandwerksmeister(?:in)?:\s*(.*)"
)

_CSV_FIELDS = ("Firm/Innung", "Contact Person", "Email", "Region")


def _looks_like_innung_header(line):
    """Return True if *line* looks like an Innung heading rather than prose.

    A heading is short (under 80 characters), contains one of the known
    keywords, has no comma, and lacks the connecting words (" und " away
    from the edges, " die ", " der ") that indicate a full sentence.
    """
    if len(line) >= 80 or "," in line:
        return False
    if " und " in line[5:-5] or " die " in line or " der " in line:
        return False
    return any(keyword in line for keyword in _INNUNG_KEYWORDS)


def _parse_leads(text, region):
    """Scan *text* line by line and return one lead dict per unique e-mail.

    The most recent Innung heading and contact person seen before an
    "E-Mail:" line are attached to that address.
    """
    leads = []
    seen_emails = set()
    current_innung = "Unbekannte Innung"
    current_contact = None

    for raw_line in text.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        match_om = _OBERMEISTER_PATTERN.match(line)
        match_ap = _ANSPRECHPARTNER_PATTERN.match(line)
        match_khm = _KREISHANDWERKSMEISTER_PATTERN.match(line)
        match_email = _EMAIL_PATTERN.search(line)

        # A line that starts with a known label is a data field, never a
        # heading. Without this check a contact whose surname is "Maler" or
        # "Zimmerer" would replace the current Innung name with the whole
        # "Obermeister: ..." line.
        starts_with_label = bool(
            match_om
            or match_ap
            or match_khm
            or (match_email and match_email.start() == 0)
        )
        if not starts_with_label and _looks_like_innung_header(line):
            current_innung = line
            current_contact = None  # contacts do not carry over to a new Innung

        if match_om:
            current_contact = match_om.group(1)
        # An Ansprechpartner only fills in when no contact is known yet.
        if match_ap and not current_contact:
            current_contact = match_ap.group(1)
        if match_khm:
            current_contact = match_khm.group(1)

        if not match_email:
            continue

        # The pattern already guarantees a well-formed address, so no further
        # validation is needed. De-duplicate case-insensitively so the same
        # mailbox written with different capitalisation yields one lead.
        email = match_email.group(1)
        email_key = email.lower()
        if email_key in seen_emails:
            continue
        seen_emails.add(email_key)

        leads.append({
            "Firm/Innung": current_innung,
            "Contact Person": current_contact if current_contact else "N/A",
            "Email": email,
            "Region": region,
        })

    return leads


def extract_leads(pdf_path, output_csv, region="Unterfranken"):
    """Extract contact leads from a PDF and write them to a CSV file.

    The text of every page is read with ``PdfReader`` and scanned for Innung
    headings, contact-person lines ("Obermeister:", "Ansprechpartner:",
    "Kreishandwerksmeister:") and "E-Mail:" lines. Each distinct e-mail
    address becomes one CSV row with the columns "Firm/Innung",
    "Contact Person", "Email" and "Region".

    Args:
        pdf_path: Path of the PDF file to read.
        output_csv: Path of the CSV file to create or overwrite (UTF-8).
        region: Value written to the "Region" column of every row.

    Returns:
        None. A one-line summary is printed to standard output.
    """
    reader = PdfReader(pdf_path)
    # ``or ""`` guards against a page yielding no text object at all.
    text = "\n".join(page.extract_text() or "" for page in reader.pages)

    leads = _parse_leads(text, region)

    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=list(_CSV_FIELDS))
        writer.writeheader()
        writer.writerows(leads)

    print(f"Extracted {len(leads)} unique leads to {output_csv}")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse "unterfranken.pdf" and write "leads.csv",
    # both resolved relative to the current working directory.
    extract_leads(pdf_path="unterfranken.pdf", output_csv="leads.csv")
|