import json

# Name fragments used to recognise targets that count as already processed.
# A target is skipped when its 'innung' value contains any of these strings.
processed_proximates = [
    "Baugewerbe",
    "Dachdecker",
    "Elektro",
    "Sanitär",
    "Stahl",
    "Tischler",
    "Maler",
    "Kraftfahrzeug",
    "Friseur",
    "Fleischer",
    "Zimmerer",
    "Glaser",
    "Rollladen",
    "Gebäudereiniger",
    "Augenoptiker",
    "Bäcker",
    "Konditoren",
    "Schornsteinfeger",
    "Steinmetz",
    "Straßenbauer",
    "Stukkateur",
    "Boots",
    "Gold",
    "Informationstechnik",
    "Kachel",
    "Karosserie",
    "Schneider",
    "Instrumenten",
    "Orthopädie",
    "Parkett",
    "Sattler",
    "Werbe",
    "Zahn",
]


def is_processed(name: str) -> bool:
    """Return True if *name* contains any fragment in ``processed_proximates``.

    The check is a plain, case-sensitive substring test with no word-boundary
    handling, e.g. "Sanitär" matches "Innung Sanitär-Heizung...". Short
    fragments can therefore also match inside unrelated longer words.

    Args:
        name: The name to test.

    Returns:
        True when at least one fragment occurs in *name*, otherwise False.
    """
    return any(fragment in name for fragment in processed_proximates)


def prepare_batch6(
    source_path: str = 'cologne_duesseldorf_data/duesseldorf_targets.json',
    output_path: str = 'cologne_duesseldorf_data/batch6_targets.json',
    batch_size: int = 30,
) -> None:
    """Write the next batch of unprocessed targets to a JSON file.

    Loads a JSON list of target objects from *source_path*, drops every target
    whose ``'innung'`` value is matched by ``is_processed``, keeps the first
    *batch_size* of the remaining ones (in input order), writes them to
    *output_path* and prints a short summary to stdout.

    Args:
        source_path: JSON file holding the list of all targets.
        output_path: JSON file the selected batch is written to (overwritten).
        batch_size: Maximum number of targets in the batch.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If *source_path* does not contain valid JSON.
        KeyError: If a target has no ``'innung'`` key.
    """
    with open(source_path, 'r', encoding='utf-8') as f:
        targets = json.load(f)

    new_targets = [t for t in targets if not is_processed(t['innung'])]
    skipped_count = len(targets) - len(new_targets)

    print(f"Skipped {skipped_count} processed targets.")
    print(f"Found {len(new_targets)} unprocessed targets.")

    batch6 = new_targets[:batch_size]

    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps umlauts readable in the UTF-8 output file
        # instead of emitting \uXXXX escapes.
        json.dump(batch6, f, indent=2, ensure_ascii=False)

    for i, t in enumerate(batch6):
        print(f"Target {i+1}: {t['innung']}")


if __name__ == "__main__":
    prepare_batch6()