"""Debug script: extract a PDF's text and print context around known markers."""

import pypdf

pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'


def debug_pdf():
    """Print diagnostic excerpts of the text extracted from ``pdf_path``.

    The text of every page is extracted and concatenated (one trailing
    newline per page). The function then reports:

    * whether the hard-coded name ``"Jens Schulz"`` occurs, and if so the
      text from 200 characters before to 500 characters after its start;
    * how many ``'@'`` characters the text contains, and the 50 characters
      on either side of the first one.

    Returns:
        None. All results are written to stdout.

    Any exception raised while reading or printing is caught and reported
    as ``Error: <message>`` instead of propagating, so the script never
    exits with a traceback.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        # Build the text with a single join rather than repeated ``+=``.
        text = "".join(page.extract_text() + "\n" for page in reader.pages)

        # Search for known name
        target = "Jens Schulz"
        idx = text.find(target)
        if idx != -1:
            print(f"Found '{target}' at index {idx}")
            print("--- CONTEXT AROUND JENS SCHULZ ---")
            # The lower bound must be clamped (a negative start would wrap
            # around); the upper bound is clamped by slicing itself.
            print(text[max(0, idx - 200):idx + 500])
            print("--- END CONTEXT ---")
        else:
            print(f"'{target}' not found!")

        # Search for @
        at_count = text.count('@')
        print(f"Found {at_count} '@' symbols.")
        if at_count:
            first_at = text.find('@')
            print("Context around first '@':")
            print(text[max(0, first_at - 50):first_at + 50])
    except Exception as e:
        # Deliberately broad: this is a best-effort debug helper, and the
        # handler also covers failures while printing the extracted text.
        print(f"Error: {e}")


if __name__ == "__main__":
    debug_pdf()