41 lines
1.6 KiB
Python
41 lines
1.6 KiB
Python
import json
|
|
|
|
# Keywords marking targets that count as already processed: a target is
# skipped when its 'innung' name contains any of these substrings
# (see is_processed). Order is irrelevant to matching.
processed_proximates = [
    "Baugewerbe",
    "Dachdecker",
    "Elektro",
    "Sanitär",
    "Stahl",
    "Tischler",
    "Maler",
    "Kraftfahrzeug",
    "Friseur",
    "Fleischer",
    "Zimmerer",
    "Glaser",
    "Rollladen",
    "Gebäudereiniger",
    "Augenoptiker",
    "Bäcker",
    "Konditoren",
    "Schornsteinfeger",
    "Steinmetz",
    "Straßenbauer",
    "Stukkateur",
    "Boots",
    "Gold",
    "Informationstechnik",
    "Kachel",
    "Karosserie",
    "Schneider",
    "Instrumenten",
    "Orthopädie",
    "Parkett",
    "Sattler",
    "Werbe",
    "Zahn",
]
|
|
|
|
def is_processed(name, proximates=None):
    """Return True if *name* contains any already-processed keyword.

    Matching is a plain, case-sensitive substring test, so the keyword
    "Sanitär" matches "Innung Sanitär-Heizung...". No word-boundary check
    is done, which means a keyword can also match inside a longer word;
    that may produce occasional false positives.

    Args:
        name: The name string to check.
        proximates: Optional iterable of keyword strings to match against.
            When omitted (None), the module-level ``processed_proximates``
            list is used, which is the original behavior.

    Returns:
        True if at least one keyword occurs in *name*, otherwise False.
    """
    if proximates is None:
        # Resolved at call time so later changes to the module list are seen.
        proximates = processed_proximates
    return any(keyword in name for keyword in proximates)
|
|
|
|
def prepare_batch6(
    *,
    source_path='cologne_duesseldorf_data/duesseldorf_targets.json',
    output_path='cologne_duesseldorf_data/batch6_targets.json',
    batch_size=30,
):
    """Select the next batch of unprocessed targets and write it to disk.

    Loads the target list from *source_path*, drops every target whose
    'innung' name is recognized by ``is_processed``, keeps the first
    *batch_size* remaining targets (in file order), writes them as JSON to
    *output_path*, and prints a short summary plus the selected names.

    Called without arguments, it uses the same paths and batch size (30)
    that were previously hard-coded.

    Args:
        source_path: JSON file holding a list of target objects, each with
            an 'innung' key.
        output_path: Destination JSON file for the selected batch.
        batch_size: Maximum number of targets to put into the batch.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If *source_path* is not valid JSON.
        KeyError: If a target has no 'innung' key.
    """
    with open(source_path, 'r', encoding='utf-8') as f:
        targets = json.load(f)

    new_targets = [t for t in targets if not is_processed(t['innung'])]
    skipped_count = len(targets) - len(new_targets)

    print(f"Skipped {skipped_count} processed targets.")
    print(f"Found {len(new_targets)} unprocessed targets.")

    batch6 = new_targets[:batch_size]
    with open(output_path, 'w', encoding='utf-8') as f:
        # The file is written as UTF-8, so keep umlauts and 'ß' readable
        # instead of emitting \uXXXX escapes; the parsed data is identical.
        json.dump(batch6, f, indent=2, ensure_ascii=False)

    for i, t in enumerate(batch6, start=1):
        print(f"Target {i}: {t['innung']}")
|
|
|
|
# Script entry point: build the batch only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    prepare_batch6()
|