43 lines
1.4 KiB
Python
43 lines
1.4 KiB
Python
import re
|
|
import json
|
|
|
|
# Raw text input read by parse_targets(); it is scanned line by line for
# entries of the form "• {Innung}/OM: {Name}".
input_file = 'cologne_duesseldorf_data/duesseldorf_raw.txt'
|
|
# JSON file that parse_targets() writes the prioritized target list to.
output_json = 'cologne_duesseldorf_data/duesseldorf_targets.json'
|
|
|
|
def parse_targets(input_path=None, output_path=None):
    """Parse guild entries from the raw text file and write them as JSON.

    Every input line of the form ``• {Innung}/OM: {Name}`` (for example
    ``• Augenoptiker-Innung Düssel-Rhein-Ruhr/OM: Jens Schulz``) becomes one
    target dict with the keys ``query``, ``innung`` and ``person``. Lines
    that do not match the pattern are skipped. Targets whose guild name
    contains one of the priority keywords are moved to the front; inside
    each group the input order is preserved (``sorted`` is stable).

    A short summary (total count and the first five targets) is printed.

    Args:
        input_path: UTF-8 text file to read. Defaults to the module-level
            ``input_file``.
        output_path: UTF-8 JSON file to write. Defaults to the module-level
            ``output_json``.

    Returns:
        The prioritized list of target dicts that was written to
        ``output_path``.

    Raises:
        OSError: If the input cannot be opened or the output cannot be
            written.
    """
    # Resolve the defaults at call time so the module-level paths stay the
    # single source of truth for plain ``parse_targets()`` calls.
    if input_path is None:
        input_path = input_file
    if output_path is None:
        output_path = output_json

    # Pattern: • {Innung}/OM: {Name}
    # Example: • Augenoptiker-Innung Düssel-Rhein-Ruhr/OM: Jens Schulz
    entry_pattern = re.compile(r'•\s*(.*?)/OM:\s*(.*)')

    targets = []
    with open(input_path, 'r', encoding='utf-8') as f:
        # Iterate the file directly; there is no need to load every line
        # into a list first.
        for raw_line in f:
            match = entry_pattern.match(raw_line.strip())
            if not match:
                continue
            innung = match.group(1).strip()
            targets.append({
                "query": f"{innung} Düsseldorf Kontakt Email",
                "innung": innung,
                "person": match.group(2).strip(),
            })

    print(f"Found {len(targets)} targets.")

    # Prioritize "Bau", "Elektro", "Sanitär", "Metall", "Dach", "Tischler".
    # NOTE(review): this is a case-sensitive substring test, so compound
    # names such as "Tiefbau" (lower-case "bau") are not prioritized --
    # confirm whether that is intended.
    priority_keywords = ("Bau", "Elektro", "Sanitär", "Metall", "Dach", "Tischler")
    sorted_targets = sorted(
        targets,
        key=lambda t: any(k in t['innung'] for k in priority_keywords),
        reverse=True,
    )

    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False writes umlauts as real UTF-8 characters instead
        # of \uXXXX escapes; the parsed JSON content is identical either way.
        json.dump(sorted_targets, f, indent=2, ensure_ascii=False)

    for t in sorted_targets[:5]:
        print(f"Target: {t['innung']} ({t['person']})")

    return sorted_targets
|
|
|
|
# Run the parser only when this file is executed as a script, not on import.
if __name__ == "__main__":
    parse_targets()
|