import re
import json

input_file = 'cologne_duesseldorf_data/duesseldorf_raw.txt'
output_json = 'cologne_duesseldorf_data/duesseldorf_targets.json'

# Line format:  • {Innung}/OM: {Name}
# Example:      • Augenoptiker-Innung Düssel-Rhein-Ruhr/OM: Jens Schulz
# Compiled once at import time instead of on every call.
_TARGET_PATTERN = re.compile(r'•\s*(.*?)/OM:\s*(.*)')

# Guild names containing any of these substrings (case-sensitive) are
# moved to the front of the output list.
_PRIORITY_KEYWORDS = ("Bau", "Elektro", "Sanitär", "Metall", "Dach", "Tischler")


def _is_priority(target):
    """Return True if the target's guild name contains a priority keyword."""
    return any(keyword in target["innung"] for keyword in _PRIORITY_KEYWORDS)


def _parse_line(line):
    """Return a target dict for one raw line, or None if it is not a target line."""
    match = _TARGET_PATTERN.match(line.strip())
    if match is None:
        return None
    innung = match.group(1).strip()
    return {
        "query": f"{innung} Düsseldorf Kontakt Email",
        "innung": innung,
        "person": match.group(2).strip(),
    }


def parse_targets(input_path=None, output_path=None):
    """Extract guild targets from the raw text file and write them as JSON.

    Every line of the form ``• {Innung}/OM: {Name}`` becomes a dict with the
    keys ``query``, ``innung`` and ``person``; all other lines are ignored.
    Targets whose guild name contains one of the priority keywords are placed
    first; within the priority and non-priority groups the original file
    order is kept (the sort is stable).

    Args:
        input_path: Path of the raw text file. Defaults to the module-level
            ``input_file``.
        output_path: Path of the JSON file to write. Defaults to the
            module-level ``output_json``.

    Returns:
        The sorted list of target dicts that was written to ``output_path``.

    Raises:
        OSError: If the input file cannot be read or the output file cannot
            be written (e.g. ``FileNotFoundError``).
    """
    # Resolve defaults at call time so reassigning the module globals
    # before calling still takes effect, as it did originally.
    if input_path is None:
        input_path = input_file
    if output_path is None:
        output_path = output_json

    with open(input_path, 'r', encoding='utf-8') as f:
        candidates = (_parse_line(line) for line in f)
        targets = [target for target in candidates if target is not None]

    print(f"Found {len(targets)} targets.")

    # True sorts after False, so reverse=True puts priority targets first;
    # Python's sort keeps equal elements in their original order even then.
    sorted_targets = sorted(targets, key=_is_priority, reverse=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps umlauts readable in the UTF-8 output
        # instead of escaping them as \uXXXX sequences.
        json.dump(sorted_targets, f, indent=2, ensure_ascii=False)

    for t in sorted_targets[:5]:
        print(f"Target: {t['innung']} ({t['person']})")

    return sorted_targets


if __name__ == "__main__":
    parse_targets()