stadtwerke/scripts/prepare_batch6_v2.py

41 lines
1.6 KiB
Python

import json
# Keywords identifying guilds (Innungen) that earlier batches already handled.
# Each entry is matched as a plain substring of the guild name, so the entries
# are deliberately short stems (e.g. "Sanitär", "Kraftfahrzeug").
processed_proximates = [
    "Baugewerbe",
    "Dachdecker",
    "Elektro",
    "Sanitär",
    "Stahl",
    "Tischler",
    "Maler",
    "Kraftfahrzeug",
    "Friseur",
    "Fleischer",
    "Zimmerer",
    "Glaser",
    "Rollladen",
    "Gebäudereiniger",
    "Augenoptiker",
    "Bäcker",
    "Konditoren",
    "Schornsteinfeger",
    "Steinmetz",
    "Straßenbauer",
    "Stukkateur",
    "Boots",
    "Gold",
    "Informationstechnik",
    "Kachel",
    "Karosserie",
    "Schneider",
    "Instrumenten",
    "Orthopädie",
    "Parkett",
    "Sattler",
    "Werbe",
    "Zahn",
]
def is_processed(name, proximates=None):
    """Return True if *name* contains any already-processed keyword.

    Matching is a plain, case-sensitive substring test (no word-boundary
    check), so a keyword such as "Sanitär" also matches a compound name
    like "Innung Sanitär-Heizung...".

    Args:
        name: Guild (Innung) name to check.
        proximates: Optional iterable of keyword strings to match against.
            When omitted, the module-level ``processed_proximates`` list
            is used.

    Returns:
        True if at least one keyword occurs in *name*, otherwise False.
    """
    if proximates is None:
        proximates = processed_proximates
    return any(keyword in name for keyword in proximates)
def prepare_batch6(
    source_path='cologne_duesseldorf_data/duesseldorf_targets.json',
    output_path='cologne_duesseldorf_data/batch6_targets.json',
    batch_size=30,
):
    """Write the next batch of not-yet-processed targets to a JSON file.

    Loads the target list from *source_path*, drops every target whose
    ``'innung'`` value is reported as processed by ``is_processed``, keeps
    the first *batch_size* of the remainder, writes them to *output_path*
    and prints a short summary plus the selected guild names.

    Args:
        source_path: JSON file to read. It is iterated and each item is
            indexed with ``'innung'``, so it is expected to hold an array
            of objects carrying that key.
        output_path: JSON file the selected batch is written to
            (overwritten if it exists).
        batch_size: Maximum number of targets to put in the batch.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If *source_path* does not contain valid JSON.
        KeyError: If a target has no ``'innung'`` key.
    """
    with open(source_path, 'r', encoding='utf-8') as f:
        targets = json.load(f)

    new_targets = [t for t in targets if not is_processed(t['innung'])]
    # Everything that was filtered out counts as skipped.
    skipped_count = len(targets) - len(new_targets)

    print(f"Skipped {skipped_count} processed targets.")
    print(f"Found {len(new_targets)} unprocessed targets.")

    batch6 = new_targets[:batch_size]
    with open(output_path, 'w', encoding='utf-8') as f:
        # The file is UTF-8, so keep umlauts/ß readable instead of letting
        # json escape them as \uXXXX sequences.
        json.dump(batch6, f, indent=2, ensure_ascii=False)

    for position, target in enumerate(batch6, start=1):
        print(f"Target {position}: {target['innung']}")
# Run the batch preparation only when executed as a script, not on import.
if __name__ == "__main__":
    prepare_batch6()