#!/usr/bin/env node
/* eslint-disable no-console */

/**
 * Semantic-search audit exporter.
 *
 * Loads the two lexicon batch TypeScript files, then writes audit artifacts
 * (per-category CSV sheets, a master sheet, a suspicious-entries list, a JSON
 * summary and a README) under `audits/semantic-search`, plus a copy of the
 * plant/category sheet at the repository root.
 */

const fs = require('fs');
const path = require('path');
const vm = require('vm');
const ts = require('typescript');

const ROOT_DIR = path.resolve(__dirname, '..');
const OUTPUT_DIR = path.join(ROOT_DIR, 'audits', 'semantic-search');
const CATEGORY_DIR = path.join(OUTPUT_DIR, 'categories');
const ROOT_EXPORT_PATH = path.join(ROOT_DIR, 'all-plants-categories.csv');
const BATCH_1_PATH = path.join(ROOT_DIR, 'constants', 'lexiconBatch1.ts');
const BATCH_2_PATH = path.join(ROOT_DIR, 'constants', 'lexiconBatch2.ts');

// Order in which categories are suggested for auditing (position 0 = first).
const AUDIT_PRIORITY = [
  'pet_friendly',
  'air_purifier',
  'medicinal',
  'low_light',
  'bright_light',
  'sun',
  'easy',
  'high_humidity',
  'hanging',
  'tree',
  'large',
  'patterned',
  'flowering',
  'succulent',
];

// Every entry tagged with one of these gets a `<category>_requires_external_evidence` flag.
const HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES = new Set([
  'pet_friendly',
  'air_purifier',
  'medicinal',
]);

// Order used when rendering an entry's category list into a single cell.
const CATEGORY_DISPLAY_ORDER = [
  'easy',
  'pet_friendly',
  'flowering',
  'succulent',
  'patterned',
  'tree',
  'large',
  'medicinal',
  'hanging',
  'air_purifier',
  'low_light',
  'bright_light',
  'high_humidity',
  'sun',
];

/**
 * Resolves a relative import specifier to an existing file on disk.
 *
 * @param {string} fromFile - Absolute path of the importing file.
 * @param {string} specifier - Import specifier as written in the source.
 * @returns {string|null} The first existing candidate (exact path, `.ts`,
 *   `.tsx`, then `index.ts` inside the path), or null when the specifier is
 *   not relative or no candidate is a regular file.
 */
const resolveTsFilePath = (fromFile, specifier) => {
  if (!specifier.startsWith('.')) return null;

  const fromDirectory = path.dirname(fromFile);
  const absoluteBase = path.resolve(fromDirectory, specifier);
  const candidates = [
    absoluteBase,
    `${absoluteBase}.ts`,
    `${absoluteBase}.tsx`,
    path.join(absoluteBase, 'index.ts'),
  ];

  for (const candidate of candidates) {
    if (fs.existsSync(candidate) && fs.statSync(candidate).isFile()) {
      return candidate;
    }
  }
  return null;
};

/**
 * Transpiles a TypeScript file to CommonJS and evaluates it in a fresh vm
 * context, recursively loading relative imports the same way.
 *
 * @param {string} absolutePath - Absolute path of the file to load.
 * @param {Map<string, object>} [cache] - Module cache keyed by absolute path;
 *   pass the same map across calls to evaluate shared dependencies only once.
 * @returns {object} The module's exports.
 */
const loadTsModule = (absolutePath, cache = new Map()) => {
  if (cache.has(absolutePath)) return cache.get(absolutePath);

  const source = fs.readFileSync(absolutePath, 'utf8');
  const transpiled = ts.transpileModule(source, {
    compilerOptions: {
      module: ts.ModuleKind.CommonJS,
      target: ts.ScriptTarget.ES2020,
      esModuleInterop: true,
      jsx: ts.JsxEmit.ReactJSX,
    },
    fileName: absolutePath,
    reportDiagnostics: false,
  }).outputText;

  const moduleRecord = { exports: {} };
  // Register before evaluating so an import cycle gets the in-progress exports
  // object back from the cache instead of re-entering this file.
  cache.set(absolutePath, moduleRecord.exports);

  const moduleDirectory = path.dirname(absolutePath);
  const localRequire = (specifier) => {
    const resolvedTsPath = resolveTsFilePath(absolutePath, specifier);
    if (resolvedTsPath) return loadTsModule(resolvedTsPath, cache);
    if (specifier.startsWith('.')) {
      // A bare require() would resolve relative specifiers against this
      // script's directory; anchor them to the importing file instead.
      return require(path.resolve(moduleDirectory, specifier));
    }
    return require(specifier);
  };

  const sandbox = {
    module: moduleRecord,
    exports: moduleRecord.exports,
    require: localRequire,
    __dirname: moduleDirectory,
    __filename: absolutePath,
    console,
    process,
    Buffer,
    setTimeout,
    clearTimeout,
  };

  vm.runInNewContext(transpiled, sandbox, { filename: absolutePath });

  // The evaluated code may have replaced `module.exports`; cache the final value.
  cache.set(absolutePath, moduleRecord.exports);
  return moduleRecord.exports;
};

/**
 * Creates a directory, including missing parents.
 *
 * @param {string} directoryPath - Directory to create.
 */
const ensureDir = (directoryPath) => {
  fs.mkdirSync(directoryPath, { recursive: true });
};

/**
 * Renders a value as one CSV field.
 *
 * @param {*} value - Cell value; null and undefined become an empty field.
 * @returns {string} The field, double-quoted (with embedded quotes doubled)
 *   when it contains a quote, comma, line feed or carriage return.
 */
const csvEscape = (value) => {
  const stringValue = String(value ?? '');
  if (/[",\r\n]/.test(stringValue)) {
    return `"${stringValue.replace(/"/g, '""')}"`;
  }
  return stringValue;
};

/**
 * Writes rows to a CSV file, using the keys of the first row as the header.
 * An empty row list produces an empty file.
 *
 * @param {string} filePath - Destination file.
 * @param {Array<object>} rows - Row objects.
 */
const writeCsv = (filePath, rows) => {
  if (!rows.length) {
    fs.writeFileSync(filePath, '', 'utf8');
    return;
  }

  const headers = Object.keys(rows[0]);
  const lines = [headers.join(',')];
  rows.forEach((row) => {
    lines.push(headers.map((header) => csvEscape(row[header])).join(','));
  });
  fs.writeFileSync(filePath, `${lines.join('\n')}\n`, 'utf8');
};

/**
 * Turns a category name into a lowercase file-name stem, replacing each run
 * of characters other than letters, digits, `_` and `-` with a single `-`.
 *
 * @param {string} category - Category name.
 * @returns {string} File-name stem.
 */
const normalizeCategoryFilename = (category) => (
  category.replace(/[^a-z0-9_-]+/gi, '-').toLowerCase()
);

/**
 * Returns a sorted copy of a category list: categories listed in
 * CATEGORY_DISPLAY_ORDER first, in that order, then the rest alphabetically.
 *
 * @param {string[]} [categories] - Categories to sort (not mutated).
 * @returns {string[]} Sorted copy.
 */
const sortCategories = (categories = []) => (
  [...categories].sort((left, right) => {
    const leftIndex = CATEGORY_DISPLAY_ORDER.indexOf(left);
    const rightIndex = CATEGORY_DISPLAY_ORDER.indexOf(right);
    const normalizedLeft = leftIndex === -1 ? Number.MAX_SAFE_INTEGER : leftIndex;
    const normalizedRight = rightIndex === -1 ? Number.MAX_SAFE_INTEGER : rightIndex;
    return normalizedLeft - normalizedRight || left.localeCompare(right);
  })
);

/**
 * Orders entries by botanical name, then by common name. Missing names are
 * compared as empty strings so an incomplete entry cannot abort the export.
 *
 * @param {object} left - First entry.
 * @param {object} right - Second entry.
 * @returns {number} Negative, zero or positive, as for Array.prototype.sort.
 */
const compareEntries = (left, right) => (
  String(left.botanicalName ?? '').localeCompare(String(right.botanicalName ?? ''))
  || String(left.name ?? '').localeCompare(String(right.name ?? ''))
);

/**
 * Computes the rule-based review flags for an entry.
 *
 * @param {object} entry - Lexicon entry; `categories` may be absent.
 * @returns {string[]} De-duplicated flags: conflicting light categories, the
 *   succulent + high-humidity combination, and one flag per category listed
 *   in HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES.
 */
const buildRiskFlags = (entry) => {
  const categories = new Set(entry.categories || []);
  const flags = [];

  if (categories.has('low_light') && categories.has('sun')) {
    flags.push('light_conflict_low_light_and_sun');
  }
  if (categories.has('low_light') && categories.has('bright_light')) {
    flags.push('light_conflict_low_light_and_bright_light');
  }
  if (categories.has('succulent') && categories.has('high_humidity')) {
    flags.push('succulent_high_humidity_combo_review');
  }

  (entry.categories || []).forEach((category) => {
    if (HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES.has(category)) {
      flags.push(`${category}_requires_external_evidence`);
    }
  });

  return [...new Set(flags)];
};

/**
 * Builds one audit-sheet row for an (entry, category) pair. The four trailing
 * columns are left blank for the auditor to fill in.
 *
 * @param {object} entry - Lexicon entry annotated with sourceFile/sourceIndex.
 * @param {string} category - Category this row audits.
 * @returns {object} Row object; key order defines the CSV column order.
 */
const toAuditRow = (entry, category) => ({
  category,
  source_file: entry.sourceFile,
  source_index: entry.sourceIndex,
  name: entry.name,
  botanical_name: entry.botanicalName,
  description: entry.description || '',
  light: entry.careInfo?.light || '',
  temp: entry.careInfo?.temp || '',
  water_interval_days: entry.careInfo?.waterIntervalDays ?? '',
  all_categories: sortCategories(entry.categories || []).join('|'),
  risk_flags: buildRiskFlags(entry).join('|'),
  audit_status: '',
  evidence_source: '',
  evidence_url: '',
  notes: '',
});

/**
 * Builds the one-row-per-plant record listing all of an entry's categories.
 *
 * @param {object} entry - Lexicon entry annotated with sourceFile/sourceIndex.
 * @returns {object} Row object; key order defines the CSV column order.
 */
const toPlantCategoryRow = (entry) => ({
  source_file: entry.sourceFile,
  source_index: entry.sourceIndex,
  name: entry.name,
  botanical_name: entry.botanicalName,
  all_categories: sortCategories(entry.categories || []).join('|'),
  category_count: (entry.categories || []).length,
  description: entry.description || '',
  light: entry.careInfo?.light || '',
  temp: entry.careInfo?.temp || '',
  water_interval_days: entry.careInfo?.waterIntervalDays ?? '',
});

/**
 * Loads both lexicon batch files and tags each entry with the file it came
 * from and its 1-based position within that file.
 *
 * @returns {Array<object>} Batch 1 entries followed by batch 2 entries.
 * @throws {Error} When either batch does not export an array.
 */
const loadBatchEntries = () => {
  // One cache for both loads so modules imported by both batches run once.
  const cache = new Map();
  const batch1Entries = loadTsModule(BATCH_1_PATH, cache).LEXICON_BATCH_1_ENTRIES;
  const batch2Entries = loadTsModule(BATCH_2_PATH, cache).LEXICON_BATCH_2_ENTRIES;

  if (!Array.isArray(batch1Entries) || !Array.isArray(batch2Entries)) {
    throw new Error('Could not load lexicon batch entries.');
  }

  return [
    ...batch1Entries.map((entry, index) => ({
      ...entry,
      sourceFile: 'constants/lexiconBatch1.ts',
      sourceIndex: index + 1,
    })),
    ...batch2Entries.map((entry, index) => ({
      ...entry,
      sourceFile: 'constants/lexiconBatch2.ts',
      sourceIndex: index + 1,
    })),
  ];
};

/**
 * Entry point: loads the lexicon, writes every audit artifact and logs a
 * short report.
 */
const main = () => {
  ensureDir(CATEGORY_DIR);

  const entries = loadBatchEntries();
  const categories = [...new Set(entries.flatMap((entry) => entry.categories || []))].sort();

  const summary = {
    generatedAt: new Date().toISOString(),
    totalEntries: entries.length,
    categories: categories
      .map((category) => {
        const priorityIndex = AUDIT_PRIORITY.indexOf(category);
        return {
          category,
          count: entries.filter((entry) => (entry.categories || []).includes(category)).length,
          // Categories outside AUDIT_PRIORITY sort after all prioritized ones.
          priority: priorityIndex >= 0 ? priorityIndex + 1 : 999,
        };
      })
      .sort((left, right) => (
        left.priority - right.priority
        || right.count - left.count
        || left.category.localeCompare(right.category)
      )),
  };

  const plantCategoryRows = [...entries]
    .sort(compareEntries)
    .map((entry) => toPlantCategoryRow(entry));

  const masterRows = [];
  const suspiciousRows = [];

  categories.forEach((category) => {
    const categoryEntries = entries
      .filter((entry) => (entry.categories || []).includes(category))
      .sort(compareEntries);

    const rows = categoryEntries.map((entry) => {
      const row = toAuditRow(entry, category);
      masterRows.push(row);

      // `risk_flags` is the '|'-joined flag list; non-empty means flagged.
      if (row.risk_flags) {
        suspiciousRows.push({
          category,
          source_file: entry.sourceFile,
          source_index: entry.sourceIndex,
          name: entry.name,
          botanical_name: entry.botanicalName,
          risk_flags: row.risk_flags,
        });
      }
      return row;
    });

    writeCsv(path.join(CATEGORY_DIR, `${normalizeCategoryFilename(category)}.csv`), rows);
  });

  writeCsv(path.join(OUTPUT_DIR, 'all-plants-categories.csv'), plantCategoryRows);
  writeCsv(ROOT_EXPORT_PATH, plantCategoryRows);
  writeCsv(path.join(OUTPUT_DIR, 'master.csv'), masterRows);
  writeCsv(path.join(OUTPUT_DIR, 'suspicious.csv'), suspiciousRows);

  fs.writeFileSync(
    path.join(OUTPUT_DIR, 'summary.json'),
    `${JSON.stringify(summary, null, 2)}\n`,
    'utf8',
  );
  fs.writeFileSync(
    path.join(OUTPUT_DIR, 'suspicious.json'),
    `${JSON.stringify(suspiciousRows, null, 2)}\n`,
    'utf8',
  );

  const readme = `# Semantic Search Audit

Generated: ${summary.generatedAt}

Files:
- \`summary.json\`: category counts and suggested audit order
- \`all-plants-categories.csv\`: one row per plant with its full category list
- \`master.csv\`: all category assignments with blank evidence columns
- \`suspicious.csv\`: entries that require elevated review based on rule flags
- \`categories/*.csv\`: per-category audit sheets

Suggested audit order:
${summary.categories.map((item) => `- ${item.category} (${item.count})`).join('\n')}

Workflow:
1. Review one category CSV at a time.
2. Fill \`audit_status\`, \`evidence_source\`, \`evidence_url\`, and \`notes\`.
3. Apply only high-confidence source-tag corrections to the lexicon batch files.
4. Rebuild the server catalog from batches after source edits.
`;
  fs.writeFileSync(path.join(OUTPUT_DIR, 'README.md'), readme, 'utf8');

  console.log(`Audit artifacts written to ${OUTPUT_DIR}`);
  console.log(`Categories exported: ${categories.length}`);
  console.log(`Suspicious rows flagged: ${suspiciousRows.length}`);
};

main();