stadtwerke/scripts/debug_pdf.py

37 lines
1.1 KiB
Python

import pypdf
# PDF inspected by debug_pdf(). The path is relative, so it is resolved
# against the current working directory at run time.
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
def debug_pdf(path: "str | None" = None, target: str = "Jens Schulz"):
    """Print diagnostic excerpts from the text extracted from a PDF.

    Extracts the text of every page with ``pypdf`` (one newline appended per
    page), then prints:

    1. the position of the first occurrence of *target* plus the text from
       200 characters before it to 500 characters after its start, and
    2. the number of '@' characters plus the 50 characters on either side
       of the first one.

    Args:
        path: PDF file to inspect. ``None`` (the default) falls back to the
            module-level ``pdf_path``.
        target: Literal string to search for in the extracted text.

    Returns:
        None. Everything is written to stdout. Any exception raised while
        reading or reporting is caught and printed as ``Error: ...`` instead
        of propagating, since this is a best-effort debugging aid.
    """
    if path is None:
        path = pdf_path
    try:
        reader = pypdf.PdfReader(path)
        # Join once instead of repeated `+=`. The `or ""` is defensive: it
        # keeps the dump going if a page yields None instead of a string.
        text = "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )

        # Search for the known name.
        idx = text.find(target)
        if idx != -1:
            print(f"Found '{target}' at index {idx}")
            print(f"--- CONTEXT AROUND {target.upper()} ---")
            # Only the lower bound needs clamping: a negative start would
            # wrap around to the end of the string, while an upper bound
            # past the end is clamped by slicing itself.
            print(text[max(0, idx - 200):idx + 500])
            print("--- END CONTEXT ---")
        else:
            print(f"'{target}' not found!")

        # Search for '@' (count and first position, no index list needed).
        at_count = text.count("@")
        print(f"Found {at_count} '@' symbols.")
        if at_count:
            first_at = text.find("@")
            print("Context around first '@':")
            print(text[max(0, first_at - 50):first_at + 50])
    except Exception as e:
        print(f"Error: {e}")
# Run the PDF dump only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    debug_pdf()