37 lines
1.1 KiB
Python
37 lines
1.1 KiB
Python
import pypdf
|
|
|
|
# PDF file opened by debug_pdf(). The path is relative, so it is resolved
# against the current working directory when the script runs.
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
|
|
|
|
def _context_window(text, idx, before, after):
    """Return *text* from *before* chars ahead of *idx* to *after* chars past it."""
    # The start must be clamped at 0: a negative start would be interpreted as
    # an offset from the end of the string. An end beyond len(text) needs no
    # clamp because slicing already stops at the end.
    return text[max(0, idx - before):idx + after]


def debug_pdf(path=None, target="Jens Schulz"):
    """Print diagnostic excerpts from the extracted text of a PDF.

    The text of every page is concatenated (each page followed by a newline).
    The function then prints:

    * the index of the first occurrence of *target* plus 200 characters of
      context before it and 500 after it, or a "not found" message;
    * the number of '@' characters in the text and, if there is at least one,
      the 50 characters on either side of the first.

    Any exception raised along the way is reported as a single ``Error: ...``
    line on stdout instead of propagating, so the script never ends in a
    traceback.

    Args:
        path: PDF file to read. ``None`` (the default) uses the module-level
            ``pdf_path``, looked up at call time.
        target: Substring to search for.
    """
    try:
        if path is None:
            path = pdf_path

        reader = pypdf.PdfReader(path)
        # Build the text in one join rather than repeated ``+=``. ``or ""``
        # keeps a page with a falsy extraction result from aborting the dump.
        text = "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )

        # Search for the known name.
        idx = text.find(target)
        if idx != -1:
            print(f"Found '{target}' at index {idx}")
            print(f"--- CONTEXT AROUND {target.upper()} ---")
            print(_context_window(text, idx, 200, 500))
            print("--- END CONTEXT ---")
        else:
            print(f"'{target}' not found!")

        # Search for '@'. count/find avoid materialising a list of every index
        # when only the total and the first position are needed.
        at_count = text.count("@")
        print(f"Found {at_count} '@' symbols.")
        if at_count:
            first_at = text.find("@")
            print("Context around first '@':")
            print(_context_window(text, first_at, 50, 50))

    except Exception as e:
        # Deliberately broad: this is a best-effort debugging aid, so every
        # failure (missing file, unreadable PDF, console encoding error while
        # printing) is reported in one line.
        print(f"Error: {e}")
|
|
|
|
# Run the diagnostic only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    debug_pdf()
|