stadtwerke/scripts/extract_pdf_links.py

29 lines
840 B
Python

import pypdf
# Location of the PDF that extract_links() opens. The path is relative, so it
# is resolved against the working directory the script is started from.
pdf_path = "cologne_duesseldorf_data/duesseldorf_innungen.pdf"
def _iter_link_uris(pages):
    """Yield the ``/URI`` target of each URI-action annotation on *pages*.

    Args:
        pages: Iterable of page dictionaries (here ``reader.pages``).

    Yields:
        One ``str`` per annotation whose ``/A`` action dictionary carries a
        ``/URI`` entry, in page order and then annotation order.
    """
    for page in pages:
        if "/Annots" not in page:
            continue
        for annot in page["/Annots"]:
            obj = annot.get_object()
            # Skip anything that is not a dictionary instead of letting the
            # ``in`` test raise and abort the whole run.
            # NOTE(review): relies on pypdf dictionary objects subclassing
            # ``dict`` -- confirm against the installed pypdf version.
            if not isinstance(obj, dict) or "/A" not in obj:
                continue
            action = obj["/A"]
            if not isinstance(action, dict) or "/URI" not in action:
                continue
            uri = action["/URI"]
            # NOTE(review): presumably pypdf can hand back a bytes-like string
            # object when the text cannot be decoded -- normalise it so the
            # prefix check in the caller cannot raise TypeError.
            if isinstance(uri, bytes):
                uri = uri.decode("utf-8", errors="replace")
            yield str(uri)


def extract_links():
    """Print every URI link found in the PDF at the module-level ``pdf_path``.

    Prints ``Found N links.`` followed by one line per link, prefixed with
    ``Mailto:`` for e-mail links and ``Link:`` for everything else. Any
    failure while opening or walking the PDF is reported as ``Error: ...``
    on stdout rather than raised.

    Returns:
        None.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        links = list(_iter_link_uris(reader.pages))
    except Exception as e:
        # Script-level boundary: report the problem instead of a traceback.
        print(f"Error: {e}")
        return

    print(f"Found {len(links)} links.")
    for link in links:
        # Match the scheme as a case-insensitive prefix; a substring test
        # would also tag e.g. "https://host/?next=mailto:x" as a mail link.
        if link.lower().startswith("mailto:"):
            print(f"Mailto: {link}")
        else:
            print(f"Link: {link}")
# Run the extraction only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    extract_links()