Greenlens/scripts/generate_semantic_audit.js

315 lines
9.6 KiB
JavaScript

#!/usr/bin/env node
/* eslint-disable no-console */
const fs = require('fs');
const path = require('path');
const vm = require('vm');
const ts = require('typescript');
// Repository root: the parent of the directory that contains this script.
const ROOT_DIR = path.resolve(__dirname, '..');
// Directory where main() writes the audit artifacts (CSV, JSON and README files).
const OUTPUT_DIR = path.join(ROOT_DIR, 'audits', 'semantic-search');
// Sub-directory of OUTPUT_DIR that holds one CSV sheet per category.
const CATEGORY_DIR = path.join(OUTPUT_DIR, 'categories');
// Extra copy of the per-plant category export; main() writes the same rows here
// and to OUTPUT_DIR/all-plants-categories.csv.
const ROOT_EXPORT_PATH = path.join(ROOT_DIR, 'all-plants-categories.csv');
// TypeScript lexicon sources that loadBatchEntries() evaluates to obtain the entries.
const BATCH_1_PATH = path.join(ROOT_DIR, 'constants', 'lexiconBatch1.ts');
const BATCH_2_PATH = path.join(ROOT_DIR, 'constants', 'lexiconBatch2.ts');
// Suggested audit order. main() gives each category a priority of (index + 1)
// based on its position here; categories not listed get priority 999 and
// therefore sort after every listed category in the summary.
const AUDIT_PRIORITY = [
'pet_friendly',
'air_purifier',
'medicinal',
'low_light',
'bright_light',
'sun',
'easy',
'high_humidity',
'hanging',
'tree',
'large',
'patterned',
'flowering',
'succulent',
];
// Categories for which buildRiskFlags() always emits a
// `<category>_requires_external_evidence` flag, regardless of other tags.
const HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES = new Set([
'pet_friendly',
'air_purifier',
'medicinal',
]);
// Order used by sortCategories() when rendering an entry's category list.
// Categories missing from this list are placed after the listed ones and
// ordered among themselves with localeCompare.
const CATEGORY_DISPLAY_ORDER = [
'easy',
'pet_friendly',
'flowering',
'succulent',
'patterned',
'tree',
'large',
'medicinal',
'hanging',
'air_purifier',
'low_light',
'bright_light',
'high_humidity',
'sun',
];
/**
 * Maps a relative import specifier to an existing TypeScript file on disk.
 *
 * Candidates are probed in this order: the exact path, `<path>.ts`,
 * `<path>.tsx`, then `<path>/index.ts`. Only regular files count as a match.
 *
 * @param {string} fromFile - Absolute path of the file containing the import.
 * @param {string} specifier - Import specifier as written in the source.
 * @returns {string|null} Absolute path of the first matching file, or `null`
 *   when the specifier is not relative or nothing matches.
 */
const resolveTsFilePath = (fromFile, specifier) => {
  const isRelativeSpecifier = specifier.startsWith('.');
  if (!isRelativeSpecifier) {
    return null;
  }

  const basePath = path.resolve(path.dirname(fromFile), specifier);
  const probeOrder = [
    basePath,
    `${basePath}.ts`,
    `${basePath}.tsx`,
    path.join(basePath, 'index.ts'),
  ];

  // existsSync guards statSync, which would otherwise throw on a missing path.
  const isRegularFile = (candidatePath) => (
    fs.existsSync(candidatePath) && fs.statSync(candidatePath).isFile()
  );

  const firstMatch = probeOrder.find(isRegularFile);
  return firstMatch ?? null;
};
/**
 * Loads a TypeScript file by transpiling it to CommonJS in memory and
 * evaluating the result in a fresh `vm` context.
 *
 * Relative imports that resolve to TypeScript files are loaded recursively
 * through this same function; every other import is delegated to Node's
 * `require`. Diagnostics are not reported, so type errors do not stop loading.
 *
 * @param {string} absolutePath - Absolute path of the TypeScript file to load.
 * @param {Map<string, object>} [cache] - Exports keyed by absolute path, shared
 *   across the recursive loads of one import graph.
 * @returns {object} The evaluated module's `module.exports`.
 * @throws {Error} If the file cannot be read, an import cannot be resolved, or
 *   the module throws while being evaluated.
 */
const loadTsModule = (absolutePath, cache = new Map()) => {
  if (cache.has(absolutePath)) return cache.get(absolutePath);

  const source = fs.readFileSync(absolutePath, 'utf8');
  const transpiled = ts.transpileModule(source, {
    compilerOptions: {
      module: ts.ModuleKind.CommonJS,
      target: ts.ScriptTarget.ES2020,
      esModuleInterop: true,
      jsx: ts.JsxEmit.ReactJSX,
    },
    fileName: absolutePath,
    reportDiagnostics: false,
  }).outputText;

  const module = { exports: {} };
  // Register the exports object before evaluation so that a circular import
  // receives the partially populated exports instead of recursing forever.
  cache.set(absolutePath, module.exports);

  const moduleDirectory = path.dirname(absolutePath);
  const localRequire = (specifier) => {
    const resolvedTsPath = resolveTsFilePath(absolutePath, specifier);
    if (resolvedTsPath) return loadTsModule(resolvedTsPath, cache);
    if (specifier.startsWith('.')) {
      // This script's `require` resolves relative specifiers against the
      // script's own directory; anchor them to the importing file instead so
      // sibling .js/.json files of the TypeScript module are found.
      return require(path.resolve(moduleDirectory, specifier));
    }
    return require(specifier);
  };

  const sandbox = {
    module,
    exports: module.exports,
    require: localRequire,
    __dirname: moduleDirectory,
    __filename: absolutePath,
    console,
    process,
    Buffer,
    setTimeout,
    clearTimeout,
  };
  vm.runInNewContext(transpiled, sandbox, { filename: absolutePath });

  // The module may have replaced `module.exports` wholesale; cache the final value.
  cache.set(absolutePath, module.exports);
  return module.exports;
};
/**
 * Creates a directory, including any missing parent directories.
 * Does nothing if the directory already exists.
 *
 * @param {string} directoryPath - Directory to create.
 * @returns {void}
 */
const ensureDir = (directoryPath) => {
  const mkdirOptions = { recursive: true };
  fs.mkdirSync(directoryPath, mkdirOptions);
};
/**
 * Converts a value into a single CSV field.
 *
 * `null` and `undefined` become an empty field. A field is wrapped in double
 * quotes (with embedded quotes doubled) when it contains a double quote, a
 * comma, or a line break. Carriage returns count as line breaks: an unquoted
 * `\r` would otherwise be read as a record separator by CSV parsers.
 *
 * @param {*} value - Value to serialize.
 * @returns {string} The escaped field text.
 */
const csvEscape = (value) => {
  const stringValue = String(value ?? '');
  if (/[",\r\n]/.test(stringValue)) {
    return `"${stringValue.replace(/"/g, '""')}"`;
  }
  return stringValue;
};
/**
 * Writes an array of flat objects to a UTF-8 CSV file.
 *
 * The header row is taken from the keys of the first row, and every row is
 * serialized using those same keys. An empty `rows` array produces an empty
 * file with no header.
 *
 * @param {string} filePath - Destination file.
 * @param {Array<Object>} rows - Records to serialize.
 * @returns {void}
 */
const writeCsv = (filePath, rows) => {
  let contents = '';

  if (rows.length) {
    const columns = Object.keys(rows[0]);
    const serialized = [columns.join(',')];
    rows.forEach((row) => {
      const cells = columns.map((column) => csvEscape(row[column]));
      serialized.push(cells.join(','));
    });
    // Terminate the last record with a newline as well.
    contents = `${serialized.join('\n')}\n`;
  }

  fs.writeFileSync(filePath, contents, 'utf8');
};
/**
 * Turns a category identifier into a lowercase file-name stem.
 * Each run of characters other than ASCII letters, digits, `_` and `-`
 * collapses into a single `-`.
 *
 * @param {string} category - Category identifier.
 * @returns {string} File-name-safe, lowercase form of the category.
 */
const normalizeCategoryFilename = (category) => {
  const withDashes = category.replace(/[^a-z0-9_-]+/gi, '-');
  return withDashes.toLowerCase();
};
/**
 * Returns a sorted copy of a category list without modifying the input.
 *
 * Categories listed in CATEGORY_DISPLAY_ORDER come first, in that order.
 * All other categories follow, ordered by `localeCompare`.
 *
 * @param {string[]} [categories=[]] - Categories to sort.
 * @returns {string[]} New array in display order.
 */
const sortCategories = (categories = []) => {
  const rankOf = (category) => {
    const position = CATEGORY_DISPLAY_ORDER.indexOf(category);
    return position === -1 ? Number.MAX_SAFE_INTEGER : position;
  };

  const byDisplayOrder = (left, right) => {
    const rankDifference = rankOf(left) - rankOf(right);
    if (rankDifference !== 0) {
      return rankDifference;
    }
    return left.localeCompare(right);
  };

  return [...categories].sort(byDisplayOrder);
};
/**
 * Derives the review flags for a lexicon entry from its category tags.
 *
 * Combination flags come first, in a fixed order: low_light + sun,
 * low_light + bright_light, succulent + high_humidity. They are followed by
 * one `<category>_requires_external_evidence` flag for each tag found in
 * HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES, in the entry's own tag order.
 * Duplicate flags are removed.
 *
 * @param {{categories?: string[]}} entry - Lexicon entry.
 * @returns {string[]} De-duplicated flags.
 */
const buildRiskFlags = (entry) => {
  const assignedCategories = entry.categories || [];
  const assignedLookup = new Set(assignedCategories);

  // Each rule: the categories that must all be present, and the flag to emit.
  const combinationRules = [
    [['low_light', 'sun'], 'light_conflict_low_light_and_sun'],
    [['low_light', 'bright_light'], 'light_conflict_low_light_and_bright_light'],
    [['succulent', 'high_humidity'], 'succulent_high_humidity_combo_review'],
  ];
  const combinationFlags = combinationRules
    .filter(([required]) => required.every((category) => assignedLookup.has(category)))
    .map(([, flag]) => flag);

  const evidenceFlags = assignedCategories
    .filter((category) => HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES.has(category))
    .map((category) => `${category}_requires_external_evidence`);

  return [...new Set([...combinationFlags, ...evidenceFlags])];
};
/**
 * Builds one audit-sheet row for an entry within a given category.
 *
 * Key order matters because it determines the CSV column order. The last
 * four columns are left blank for the reviewer to fill in.
 *
 * @param {Object} entry - Lexicon entry annotated with `sourceFile` and `sourceIndex`.
 * @param {string} category - Category the row is being generated for.
 * @returns {Object} Flat row ready for CSV serialization.
 */
const toAuditRow = (entry, category) => {
  const {
    sourceFile,
    sourceIndex,
    name,
    botanicalName,
    description,
    careInfo,
    categories,
  } = entry;

  const identityColumns = {
    category,
    source_file: sourceFile,
    source_index: sourceIndex,
    name,
    botanical_name: botanicalName,
    description: description || '',
  };
  const careColumns = {
    light: careInfo?.light || '',
    temp: careInfo?.temp || '',
    // `??` keeps a legitimate interval of 0 instead of blanking it.
    water_interval_days: careInfo?.waterIntervalDays ?? '',
  };
  const classificationColumns = {
    all_categories: sortCategories(categories || []).join('|'),
    risk_flags: buildRiskFlags(entry).join('|'),
  };
  const reviewerColumns = {
    audit_status: '',
    evidence_source: '',
    evidence_url: '',
    notes: '',
  };

  return {
    ...identityColumns,
    ...careColumns,
    ...classificationColumns,
    ...reviewerColumns,
  };
};
/**
 * Builds the per-plant export row: one row per entry with its full,
 * display-ordered category list and a count of its categories.
 *
 * Properties are assigned in CSV column order.
 *
 * @param {Object} entry - Lexicon entry annotated with `sourceFile` and `sourceIndex`.
 * @returns {Object} Flat row ready for CSV serialization.
 */
const toPlantCategoryRow = (entry) => {
  const assignedCategories = entry.categories || [];
  const care = entry.careInfo;

  const row = {};
  row.source_file = entry.sourceFile;
  row.source_index = entry.sourceIndex;
  row.name = entry.name;
  row.botanical_name = entry.botanicalName;
  row.all_categories = sortCategories(assignedCategories).join('|');
  row.category_count = assignedCategories.length;
  row.description = entry.description || '';
  row.light = care?.light || '';
  row.temp = care?.temp || '';
  // `??` keeps a legitimate interval of 0 instead of blanking it.
  row.water_interval_days = care?.waterIntervalDays ?? '';
  return row;
};
/**
 * Loads both lexicon batch files and returns their entries as one list,
 * batch 1 first. Each entry is copied and annotated with the file it came
 * from (`sourceFile`) and its 1-based position in that file (`sourceIndex`).
 *
 * @returns {Object[]} Annotated entries from both batches.
 * @throws {Error} If either batch module does not export an array of entries.
 */
const loadBatchEntries = () => {
  const { LEXICON_BATCH_1_ENTRIES: firstBatch } = loadTsModule(BATCH_1_PATH);
  const { LEXICON_BATCH_2_ENTRIES: secondBatch } = loadTsModule(BATCH_2_PATH);

  const bothAreArrays = [firstBatch, secondBatch].every((batch) => Array.isArray(batch));
  if (!bothAreArrays) {
    throw new Error('Could not load lexicon batch entries.');
  }

  // Returns a mapper that stamps each entry with its origin and 1-based index.
  const annotateFrom = (sourceFile) => (entry, index) => ({
    ...entry,
    sourceFile,
    sourceIndex: index + 1,
  });

  return [
    ...firstBatch.map(annotateFrom('constants/lexiconBatch1.ts')),
    ...secondBatch.map(annotateFrom('constants/lexiconBatch2.ts')),
  ];
};
/**
 * Generates every semantic-search audit artifact from the lexicon batches.
 *
 * Written, in this order: one CSV per category under CATEGORY_DIR, the
 * per-plant category export (to OUTPUT_DIR and to ROOT_EXPORT_PATH),
 * `master.csv`, `suspicious.csv`, `summary.json`, `suspicious.json` and
 * `README.md`. A short report is then logged to the console.
 *
 * @returns {void}
 */
const main = () => {
  ensureDir(CATEGORY_DIR);

  const entries = loadBatchEntries();

  const outputPath = (fileName) => path.join(OUTPUT_DIR, fileName);
  const writeJson = (fileName, value) => {
    fs.writeFileSync(outputPath(fileName), `${JSON.stringify(value, null, 2)}\n`, 'utf8');
  };
  const belongsTo = (category) => (entry) => (entry.categories || []).includes(category);
  const byBotanicalThenCommonName = (left, right) => {
    const botanicalOrder = left.botanicalName.localeCompare(right.botanicalName);
    return botanicalOrder || left.name.localeCompare(right.name);
  };

  const everyAssignedCategory = entries.flatMap((entry) => entry.categories || []);
  const categories = [...new Set(everyAssignedCategory)].sort();

  const generatedAt = new Date().toISOString();
  const categorySummaries = categories.map((category) => {
    const priorityIndex = AUDIT_PRIORITY.indexOf(category);
    return {
      category,
      count: entries.filter(belongsTo(category)).length,
      // Categories missing from AUDIT_PRIORITY sink to the bottom.
      priority: priorityIndex >= 0 ? priorityIndex + 1 : 999,
    };
  });
  // Order: audit priority, then larger categories first, then alphabetical.
  categorySummaries.sort((left, right) => {
    const priorityDelta = left.priority - right.priority;
    if (priorityDelta) {
      return priorityDelta;
    }
    const countDelta = right.count - left.count;
    if (countDelta) {
      return countDelta;
    }
    return left.category.localeCompare(right.category);
  });
  const summary = {
    generatedAt,
    totalEntries: entries.length,
    categories: categorySummaries,
  };

  const entriesInNameOrder = [...entries];
  entriesInNameOrder.sort(byBotanicalThenCommonName);
  const plantCategoryRows = entriesInNameOrder.map((entry) => toPlantCategoryRow(entry));

  const masterRows = [];
  const suspiciousRows = [];
  for (const category of categories) {
    const categoryEntries = entries
      .filter(belongsTo(category))
      .sort(byBotanicalThenCommonName);

    const rows = [];
    for (const entry of categoryEntries) {
      const row = toAuditRow(entry, category);
      rows.push(row);
      masterRows.push(row);

      const riskFlags = row.risk_flags ? row.risk_flags.split('|').filter(Boolean) : [];
      if (riskFlags.length > 0) {
        suspiciousRows.push({
          category,
          source_file: entry.sourceFile,
          source_index: entry.sourceIndex,
          name: entry.name,
          botanical_name: entry.botanicalName,
          risk_flags: riskFlags.join('|'),
        });
      }
    }

    const categoryFileName = `${normalizeCategoryFilename(category)}.csv`;
    writeCsv(path.join(CATEGORY_DIR, categoryFileName), rows);
  }

  writeCsv(outputPath('all-plants-categories.csv'), plantCategoryRows);
  writeCsv(ROOT_EXPORT_PATH, plantCategoryRows);
  writeCsv(outputPath('master.csv'), masterRows);
  writeCsv(outputPath('suspicious.csv'), suspiciousRows);
  writeJson('summary.json', summary);
  writeJson('suspicious.json', suspiciousRows);

  const suggestedAuditOrder = summary.categories
    .map((item) => `- ${item.category} (${item.count})`)
    .join('\n');
  // The template body stays at column 0: leading whitespace would end up in README.md.
  const readme = `# Semantic Search Audit
Generated: ${generatedAt}
Files:
- \`summary.json\`: category counts and suggested audit order
- \`all-plants-categories.csv\`: one row per plant with its full category list
- \`master.csv\`: all category assignments with blank evidence columns
- \`suspicious.csv\`: entries that require elevated review based on rule flags
- \`categories/*.csv\`: per-category audit sheets
Suggested audit order:
${suggestedAuditOrder}
Workflow:
1. Review one category CSV at a time.
2. Fill \`audit_status\`, \`evidence_source\`, \`evidence_url\`, and \`notes\`.
3. Apply only high-confidence source-tag corrections to the lexicon batch files.
4. Rebuild the server catalog from batches after source edits.
`;
  fs.writeFileSync(outputPath('README.md'), readme, 'utf8');

  console.log(`Audit artifacts written to ${OUTPUT_DIR}`);
  console.log(`Categories exported: ${categories.length}`);
  console.log(`Suspicious rows flagged: ${suspiciousRows.length}`);
};
// Entry point: the export runs unconditionally as soon as this script is loaded.
main();