315 lines
9.6 KiB
JavaScript
315 lines
9.6 KiB
JavaScript
#!/usr/bin/env node
|
|
/* eslint-disable no-console */
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
const vm = require('vm');
|
|
const ts = require('typescript');
|
|
|
|
// Filesystem layout. Every path is anchored at the directory one level above
// the directory that contains this script.
const ROOT_DIR = path.resolve(__dirname, '..');

// Audit artifacts go under audits/semantic-search; the per-category CSV
// sheets go into its categories/ subdirectory.
const OUTPUT_DIR = path.join(ROOT_DIR, 'audits', 'semantic-search');
const CATEGORY_DIR = path.join(OUTPUT_DIR, 'categories');

// Second copy of the per-plant category export, written directly in ROOT_DIR.
const ROOT_EXPORT_PATH = path.join(ROOT_DIR, 'all-plants-categories.csv');

// TypeScript source files that hold the lexicon entries being audited.
const BATCH_1_PATH = path.join(ROOT_DIR, 'constants', 'lexiconBatch1.ts');
const BATCH_2_PATH = path.join(ROOT_DIR, 'constants', 'lexiconBatch2.ts');
|
|
|
|
// Suggested audit order. A category's 1-based position in this list becomes
// its `priority` in summary.json; categories that are not listed get 999.
const AUDIT_PRIORITY = [
  'pet_friendly', 'air_purifier', 'medicinal',
  'low_light', 'bright_light', 'sun',
  'easy', 'high_humidity', 'hanging',
  'tree', 'large', 'patterned',
  'flowering', 'succulent',
];
|
|
|
|
// Categories for which every assignment is flagged with
// `<category>_requires_external_evidence` when risk flags are built.
const HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES = new Set(['pet_friendly', 'air_purifier', 'medicinal']);
|
|
|
|
// Order in which categories are listed in the `all_categories` CSV column.
// Categories missing from this list are placed after the listed ones.
const CATEGORY_DISPLAY_ORDER = [
  'easy', 'pet_friendly', 'flowering', 'succulent',
  'patterned', 'tree', 'large', 'medicinal',
  'hanging', 'air_purifier', 'low_light', 'bright_light',
  'high_humidity', 'sun',
];
|
|
|
|
/**
 * Resolve a relative import specifier to an existing file on disk.
 *
 * Candidates are tried in this order: the exact path, `<path>.ts`,
 * `<path>.tsx`, `<path>/index.ts`, `<path>/index.tsx`. Only regular files
 * count as a match, so a directory named like the specifier is skipped in
 * favour of its index file.
 *
 * @param {string} fromFile - Absolute path of the file containing the import.
 * @param {string} specifier - Module specifier as written in the import.
 * @returns {string|null} Absolute path of the first matching file, or `null`
 *   when the specifier does not start with `.` or nothing matches.
 */
const resolveTsFilePath = (fromFile, specifier) => {
  // Bare specifiers (packages, Node built-ins) are not handled here.
  if (!specifier.startsWith('.')) return null;

  const absoluteBase = path.resolve(path.dirname(fromFile), specifier);
  const candidates = [
    absoluteBase,
    `${absoluteBase}.ts`,
    `${absoluteBase}.tsx`,
    path.join(absoluteBase, 'index.ts'),
    // Directory imports may be backed by a TSX index file; tried last so that
    // every previously successful resolution keeps its result.
    path.join(absoluteBase, 'index.tsx'),
  ];

  const isExistingFile = (candidate) => fs.existsSync(candidate) && fs.statSync(candidate).isFile();

  return candidates.find(isExistingFile) ?? null;
};
|
|
|
|
/**
 * Load a TypeScript file as a CommonJS module without a separate build step.
 *
 * The file is transpiled in memory with `ts.transpileModule` and executed in
 * a new `vm` context. Relative imports that resolve through
 * `resolveTsFilePath` are loaded recursively through this same function;
 * everything else is delegated to Node's `require`.
 *
 * @param {string} absolutePath - Absolute path of the file to load.
 * @param {Map<string, object>} [cache] - Exports keyed by absolute path.
 *   Shared across the recursive calls so each file is evaluated once per
 *   top-level call.
 * @returns {object} The module's `module.exports` after evaluation.
 * @throws Propagates errors from reading the file or evaluating the module.
 */
const loadTsModule = (absolutePath, cache = new Map()) => {
  if (cache.has(absolutePath)) return cache.get(absolutePath);

  const source = fs.readFileSync(absolutePath, 'utf8');
  const { outputText } = ts.transpileModule(source, {
    compilerOptions: {
      module: ts.ModuleKind.CommonJS,
      target: ts.ScriptTarget.ES2020,
      esModuleInterop: true,
      jsx: ts.JsxEmit.ReactJSX,
    },
    fileName: absolutePath,
    reportDiagnostics: false,
  });

  const module = { exports: {} };
  // Register the exports object before running the code so that a circular
  // import receives the in-progress exports instead of recursing forever.
  cache.set(absolutePath, module.exports);

  const importerDirectory = path.dirname(absolutePath);
  const localRequire = (specifier) => {
    const resolvedTsPath = resolveTsFilePath(absolutePath, specifier);
    if (resolvedTsPath) return loadTsModule(resolvedTsPath, cache);
    if (specifier.startsWith('.')) {
      // A bare `require(specifier)` would resolve relative to THIS script, not
      // to the file that contains the import, so anchor it at the importer.
      return require(path.resolve(importerDirectory, specifier));
    }
    return require(specifier);
  };

  // Names made available as globals to the evaluated module code.
  const sandbox = {
    module,
    exports: module.exports,
    require: localRequire,
    __dirname: importerDirectory,
    __filename: absolutePath,
    console,
    process,
    Buffer,
    setTimeout,
    clearTimeout,
  };

  try {
    vm.runInNewContext(outputText, sandbox, { filename: absolutePath });
  } catch (error) {
    // Do not leave half-initialised exports behind for later lookups.
    cache.delete(absolutePath);
    throw error;
  }

  // The module may have replaced `module.exports` wholesale; store the final value.
  cache.set(absolutePath, module.exports);
  return module.exports;
};
|
|
|
|
/**
 * Create a directory, including any missing parent directories.
 *
 * @param {string} directoryPath - Directory to create.
 * @returns {void}
 */
const ensureDir = (directoryPath) => {
  const mkdirOptions = { recursive: true };
  fs.mkdirSync(directoryPath, mkdirOptions);
};
|
|
|
|
/**
 * Render a value as a single CSV field.
 *
 * `null` and `undefined` become the empty string; every other value is
 * converted with `String()`. A field containing a double quote, comma,
 * carriage return or line feed is wrapped in double quotes, with embedded
 * double quotes doubled.
 *
 * @param {*} value - Cell value.
 * @returns {string} The CSV-safe field text.
 */
const csvEscape = (value) => {
  const text = String(value ?? '');
  // A bare CR also terminates a record for many CSV readers, so it needs
  // quoting just like LF.
  const needsQuoting = /[",\r\n]/.test(text);
  return needsQuoting ? `"${text.replace(/"/g, '""')}"` : text;
};
|
|
|
|
/**
 * Write an array of row objects to a CSV file.
 *
 * The column list and header line come from the keys of the first row; every
 * row is then rendered using that same column list. An empty `rows` array
 * produces an empty file with no header line.
 *
 * @param {string} filePath - Destination file.
 * @param {object[]} rows - Row objects.
 * @returns {void}
 */
const writeCsv = (filePath, rows) => {
  let contents = '';

  if (rows.length) {
    const columns = Object.keys(rows[0]);
    const records = rows.map((row) => columns.map((column) => csvEscape(row[column])).join(','));
    contents = `${[columns.join(','), ...records].join('\n')}\n`;
  }

  fs.writeFileSync(filePath, contents, 'utf8');
};
|
|
|
|
/**
 * Turn a category name into a filename stem: each run of characters other
 * than ASCII letters, digits, `_` and `-` becomes a single `-`, and the
 * result is lower-cased.
 *
 * @param {string} category - Category identifier.
 * @returns {string} Filename-safe, lower-case stem.
 */
const normalizeCategoryFilename = (category) => {
  const dashed = category.replace(/[^a-z0-9_-]+/gi, '-');
  return dashed.toLowerCase();
};
|
|
|
|
/**
 * Return a sorted copy of a category list; the input array is not modified.
 *
 * Categories listed in CATEGORY_DISPLAY_ORDER come first, in that order.
 * All others follow, ordered among themselves with `localeCompare`.
 *
 * @param {string[]} [categories=[]] - Categories to sort.
 * @returns {string[]} New sorted array.
 */
const sortCategories = (categories = []) => {
  const rankOf = (category) => {
    const position = CATEGORY_DISPLAY_ORDER.indexOf(category);
    return position === -1 ? Number.MAX_SAFE_INTEGER : position;
  };

  const byDisplayOrder = (left, right) => {
    const rankDelta = rankOf(left) - rankOf(right);
    return rankDelta !== 0 ? rankDelta : left.localeCompare(right);
  };

  return [...categories].sort(byDisplayOrder);
};
|
|
|
|
/**
 * Derive review flags for an entry from its category list.
 *
 * Combination flags come first, in the fixed order of the rule table below,
 * followed by one `<category>_requires_external_evidence` flag for each
 * assigned category that is in HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES, in
 * the order the categories appear on the entry. Duplicate flags are removed.
 *
 * @param {{categories?: string[]}} entry - Lexicon entry.
 * @returns {string[]} Unique flags.
 */
const buildRiskFlags = (entry) => {
  const assigned = entry.categories || [];
  const assignedSet = new Set(assigned);

  // [first category, second category, flag raised when both are assigned]
  const comboRules = [
    ['low_light', 'sun', 'light_conflict_low_light_and_sun'],
    ['low_light', 'bright_light', 'light_conflict_low_light_and_bright_light'],
    ['succulent', 'high_humidity', 'succulent_high_humidity_combo_review'],
  ];

  const comboFlags = comboRules
    .filter(([first, second]) => assignedSet.has(first) && assignedSet.has(second))
    .map(([, , flag]) => flag);

  const evidenceFlags = assigned
    .filter((category) => HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES.has(category))
    .map((category) => `${category}_requires_external_evidence`);

  return Array.from(new Set([...comboFlags, ...evidenceFlags]));
};
|
|
|
|
/**
 * Build one audit-sheet row for an (entry, category) pair.
 *
 * Key order matters: it becomes the CSV column order. The last four columns
 * are left blank for the reviewer to fill in.
 *
 * @param {object} entry - Lexicon entry annotated with `sourceFile` and `sourceIndex`.
 * @param {string} category - Category this row audits.
 * @returns {object} Flat row object.
 */
const toAuditRow = (entry, category) => {
  const { sourceFile, sourceIndex, name, botanicalName, description, careInfo, categories } = entry;

  return {
    category,
    source_file: sourceFile,
    source_index: sourceIndex,
    name,
    botanical_name: botanicalName,
    description: description || '',
    light: careInfo?.light || '',
    temp: careInfo?.temp || '',
    water_interval_days: careInfo?.waterIntervalDays ?? '',
    all_categories: sortCategories(categories || []).join('|'),
    risk_flags: buildRiskFlags(entry).join('|'),
    audit_status: '',
    evidence_source: '',
    evidence_url: '',
    notes: '',
  };
};
|
|
|
|
/**
 * Build the per-plant export row: one row per entry carrying its full,
 * display-ordered category list and the number of categories assigned.
 *
 * Key order matters: it becomes the CSV column order.
 *
 * @param {object} entry - Lexicon entry annotated with `sourceFile` and `sourceIndex`.
 * @returns {object} Flat row object.
 */
const toPlantCategoryRow = (entry) => {
  const { sourceFile, sourceIndex, name, botanicalName, description, careInfo } = entry;
  const assigned = entry.categories || [];

  return {
    source_file: sourceFile,
    source_index: sourceIndex,
    name,
    botanical_name: botanicalName,
    all_categories: sortCategories(assigned).join('|'),
    category_count: assigned.length,
    description: description || '',
    light: careInfo?.light || '',
    temp: careInfo?.temp || '',
    water_interval_days: careInfo?.waterIntervalDays ?? '',
  };
};
|
|
|
|
/**
 * Load both lexicon batches and return their entries as one list, batch 1
 * first. Each entry is copied and annotated with the repo-relative source
 * file it came from and its 1-based position within that batch.
 *
 * @returns {object[]} Annotated entries.
 * @throws {Error} When either batch export is not an array.
 */
const loadBatchEntries = () => {
  const batch1Entries = loadTsModule(BATCH_1_PATH).LEXICON_BATCH_1_ENTRIES;
  const batch2Entries = loadTsModule(BATCH_2_PATH).LEXICON_BATCH_2_ENTRIES;

  const bothAreArrays = Array.isArray(batch1Entries) && Array.isArray(batch2Entries);
  if (!bothAreArrays) {
    throw new Error('Could not load lexicon batch entries.');
  }

  const tagWithSource = (batchEntries, sourceFile) =>
    batchEntries.map((entry, index) => ({ ...entry, sourceFile, sourceIndex: index + 1 }));

  return [
    ...tagWithSource(batch1Entries, 'constants/lexiconBatch1.ts'),
    ...tagWithSource(batch2Entries, 'constants/lexiconBatch2.ts'),
  ];
};
|
|
|
|
/**
 * Comparator ordering entries by botanical name, then by common name.
 * A missing name is compared as the empty string instead of throwing.
 */
const compareEntriesByName = (left, right) =>
  String(left.botanicalName ?? '').localeCompare(String(right.botanicalName ?? '')) ||
  String(left.name ?? '').localeCompare(String(right.name ?? ''));

/**
 * Build the summary.json payload: generation timestamp, total entry count and
 * per-category counts ordered by audit priority, then by descending count,
 * then by category name.
 */
const buildAuditSummary = (entries, categories) => {
  // One pass over the entries; an entry counts at most once per category.
  const entryCountByCategory = new Map();
  entries.forEach((entry) => {
    new Set(entry.categories || []).forEach((category) => {
      entryCountByCategory.set(category, (entryCountByCategory.get(category) ?? 0) + 1);
    });
  });

  const rankedCategories = categories
    .map((category) => {
      const position = AUDIT_PRIORITY.indexOf(category);
      return {
        category,
        count: entryCountByCategory.get(category) ?? 0,
        // 1-based rank in AUDIT_PRIORITY; unlisted categories sort last.
        priority: position >= 0 ? position + 1 : 999,
      };
    })
    .sort((left, right) =>
      left.priority - right.priority ||
      right.count - left.count ||
      left.category.localeCompare(right.category));

  return {
    generatedAt: new Date().toISOString(),
    totalEntries: entries.length,
    categories: rankedCategories,
  };
};

/** Render the README.md text that accompanies the generated audit files. */
const buildAuditReadme = (summary) => {
  const lines = [
    '# Semantic Search Audit',
    '',
    `Generated: ${summary.generatedAt}`,
    '',
    'Files:',
    '- `summary.json`: category counts and suggested audit order',
    '- `all-plants-categories.csv`: one row per plant with its full category list',
    '- `master.csv`: all category assignments with blank evidence columns',
    '- `suspicious.csv`: entries that require elevated review based on rule flags',
    '- `suspicious.json`: the same flagged entries as JSON',
    '- `categories/*.csv`: per-category audit sheets',
    '',
    'Suggested audit order:',
    summary.categories.map((item) => `- ${item.category} (${item.count})`).join('\n'),
    '',
    'Workflow:',
    '1. Review one category CSV at a time.',
    '2. Fill `audit_status`, `evidence_source`, `evidence_url`, and `notes`.',
    '3. Apply only high-confidence source-tag corrections to the lexicon batch files.',
    '4. Rebuild the server catalog from batches after source edits.',
  ];
  return `${lines.join('\n')}\n`;
};

/**
 * Entry point: load the lexicon batches and write every audit artifact.
 *
 * Written under OUTPUT_DIR: the per-plant export, master.csv, suspicious.csv,
 * suspicious.json, summary.json, README.md, and one CSV per category under
 * CATEGORY_DIR. The per-plant export is also written to ROOT_EXPORT_PATH.
 */
const main = () => {
  ensureDir(CATEGORY_DIR);
  const entries = loadBatchEntries();
  const categories = [...new Set(entries.flatMap((entry) => entry.categories || []))].sort();

  const summary = buildAuditSummary(entries, categories);

  const plantCategoryRows = [...entries]
    .sort(compareEntriesByName)
    .map((entry) => toPlantCategoryRow(entry));

  const masterRows = [];
  const suspiciousRows = [];

  for (const category of categories) {
    const rows = entries
      .filter((entry) => (entry.categories || []).includes(category))
      .sort(compareEntriesByName)
      .map((entry) => toAuditRow(entry, category));

    for (const row of rows) {
      masterRows.push(row);

      // `risk_flags` is a '|'-joined list; an empty string means no flags.
      if (row.risk_flags) {
        suspiciousRows.push({
          category: row.category,
          source_file: row.source_file,
          source_index: row.source_index,
          name: row.name,
          botanical_name: row.botanical_name,
          risk_flags: row.risk_flags,
        });
      }
    }

    writeCsv(path.join(CATEGORY_DIR, `${normalizeCategoryFilename(category)}.csv`), rows);
  }

  writeCsv(path.join(OUTPUT_DIR, 'all-plants-categories.csv'), plantCategoryRows);
  writeCsv(ROOT_EXPORT_PATH, plantCategoryRows);
  writeCsv(path.join(OUTPUT_DIR, 'master.csv'), masterRows);
  writeCsv(path.join(OUTPUT_DIR, 'suspicious.csv'), suspiciousRows);
  fs.writeFileSync(path.join(OUTPUT_DIR, 'summary.json'), `${JSON.stringify(summary, null, 2)}\n`, 'utf8');
  fs.writeFileSync(path.join(OUTPUT_DIR, 'suspicious.json'), `${JSON.stringify(suspiciousRows, null, 2)}\n`, 'utf8');
  fs.writeFileSync(path.join(OUTPUT_DIR, 'README.md'), buildAuditReadme(summary), 'utf8');

  console.log(`Audit artifacts written to ${OUTPUT_DIR}`);
  console.log(`Categories exported: ${categories.length}`);
  console.log(`Suspicious rows flagged: ${suspiciousRows.length}`);
};

main();
|