29 lines
840 B
Python
29 lines
840 B
Python
import pypdf
|
|
|
|
# Relative path of the PDF that extract_links() opens and scans for links.
pdf_path = 'cologne_duesseldorf_data/duesseldorf_innungen.pdf'
|
|
|
|
def extract_links():
    """Print every URI link annotation found in the PDF at ``pdf_path``.

    Each page of the document is inspected for an ``/Annots`` entry; for
    every annotation whose ``/A`` action dictionary has a ``/URI`` key, the
    URI is collected.  The total count is printed first, followed by one
    line per link: ``Mailto: <uri>`` when the URI uses the ``mailto``
    scheme, ``Link: <uri>`` otherwise.

    Any exception raised while opening, walking, or printing is caught and
    reported on stdout as ``Error: <message>`` instead of propagating, so
    the script always exits normally.

    Returns:
        None.  All results are written to stdout.
    """
    try:
        reader = pypdf.PdfReader(pdf_path)
        links = []
        for page in reader.pages:
            if "/Annots" not in page:
                continue
            for annot in page["/Annots"]:
                obj = annot.get_object()
                if "/A" in obj and "/URI" in obj["/A"]:
                    uri = obj["/A"]["/URI"]
                    # NOTE(review): the URI may presumably arrive as a byte
                    # string when it could not be decoded as text — confirm
                    # against pypdf.  Decode it here so one odd annotation
                    # does not abort the whole listing with a TypeError.
                    if isinstance(uri, bytes):
                        uri = uri.decode("utf-8", errors="replace")
                    links.append(uri)

        print(f"Found {len(links)} links.")
        for link in links:
            # Test the scheme prefix, case-insensitively, rather than
            # searching the whole string: a web URL that merely contains
            # "mailto:" (e.g. in a query parameter) is not a mail link.
            if link.lower().startswith("mailto:"):
                print(f"Mailto: {link}")
            else:
                print(f"Link: {link}")

    except Exception as e:
        # Top-level boundary of the script: report the failure and return.
        print(f"Error: {e}")
|
|
|
|
# Run the extraction only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    extract_links()
|