stadtwerke/leads/read_pdf.py

25 lines
907 B
Python

import pdfplumber
# Extract the text of a two-column PDF one half-page at a time, so the left
# and right columns are not interleaved line by line, then dump the result to
# a UTF-8 text file and print a preview.
all_text = []
with pdfplumber.open(r'C:\Users\a931627\Documents\stadtwerke-saas-analysis\leads\raw\unterfranken.pdf') as pdf:
    for page_number, page in enumerate(pdf.pages, start=1):
        # Build the column boxes from the page's own bounding box instead of
        # assuming the page starts at (0, 0): crop() takes absolute
        # coordinates and rejects boxes that fall outside the page.
        x0, top, x1, bottom = page.bbox
        mid_x = (x0 + x1) / 2
        columns = (
            ("LEFT", (x0, top, mid_x, bottom)),
            ("RIGHT", (mid_x, top, x1, bottom)),
        )
        for label, column_bbox in columns:
            column = page.crop(column_bbox)
            # Fall back to an empty string when extraction yields nothing.
            column_text = column.extract_text(x_tolerance=2, y_tolerance=2) or ""
            all_text.append(f"=== PAGE {page_number} {label} ===\n{column_text}")
full = "\n".join(all_text)

# Write to file for easier reading
with open(r'C:\Users\a931627\Documents\stadtwerke-saas-analysis\leads\unterfranken_pdf_raw.txt', 'w', encoding='utf-8') as f:
    f.write(full)

# Report the total size and show the first 3000 characters as a preview.
print(f"Written {len(full)} chars")
print(full[:3000])