stadtwerke/scripts/dump_duesseldorf_text.py

16 lines
457 B
Python

import pypdf
# Extract the text of every page of the PDF at `pdf_path` and write it, one
# newline-terminated chunk per page, to duesseldorf_raw.txt. Both paths are
# relative to the current working directory.
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
try:
    reader = pypdf.PdfReader(pdf_path)
    # Assemble the output with one join instead of repeated `+=`; each page's
    # text is followed by "\n". The `or ""` is defensive: it keeps the
    # concatenation valid should a page yield no text value.
    text = "".join((page.extract_text() or "") + "\n" for page in reader.pages)
    with open('cologne_duesseldorf_data/duesseldorf_raw.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"Dumped {len(text)} characters to duesseldorf_raw.txt")
except Exception as e:
    # Script-level boundary: any failure while reading the PDF or writing the
    # output is reported as a one-line message rather than a traceback.
    print(f"Error: {e}")