25 lines
907 B
Python
25 lines
907 B
Python
import pdfplumber

# Source PDF and the plain-text dump generated from it.
PDF_PATH = r'C:\Users\a931627\Documents\stadtwerke-saas-analysis\leads\raw\unterfranken.pdf'
OUTPUT_PATH = r'C:\Users\a931627\Documents\stadtwerke-saas-analysis\leads\unterfranken_pdf_raw.txt'

# One entry per half-page, each starting with a "=== PAGE n LEFT/RIGHT ===" header.
all_text = []

with pdfplumber.open(PDF_PATH) as pdf:
    for page_number, page in enumerate(pdf.pages, start=1):
        # Split into left and right columns at the horizontal midpoint of the
        # page's own bounding box. crop() takes absolute coordinates by
        # default, so hard-coding a (0, 0) origin breaks on pages whose
        # bounding box is offset; for zero-origin pages this is identical to
        # cropping at (0, 0, w/2, h) and (w/2, 0, w, h).
        x0, top, x1, bottom = page.bbox
        mid_x = (x0 + x1) / 2

        left = page.crop((x0, top, mid_x, bottom))
        right = page.crop((mid_x, top, x1, bottom))

        # `or ""` keeps the output well-formed if a half yields no text.
        left_text = left.extract_text(x_tolerance=2, y_tolerance=2) or ""
        right_text = right.extract_text(x_tolerance=2, y_tolerance=2) or ""

        all_text.append(f"=== PAGE {page_number} LEFT ===\n{left_text}")
        all_text.append(f"=== PAGE {page_number} RIGHT ===\n{right_text}")

full = "\n".join(all_text)

# Write to file for easier reading
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    f.write(full)

# Report the total size and show a preview of the first 3000 characters.
print(f"Written {len(full)} chars")
print(full[:3000])