Greenlens/scripts/generate_semantic_audit.js

#!/usr/bin/env node
/* eslint-disable no-console */
const fs = require('fs');
const path = require('path');
const vm = require('vm');
const ts = require('typescript');

// Project root directory: one level above the directory containing this script.
const ROOT_DIR = path.resolve(__dirname, '..');
// Directory that receives the generated audit artifacts.
const OUTPUT_DIR = path.join(ROOT_DIR, 'audits', 'semantic-search');
// Sub-directory of OUTPUT_DIR holding one CSV audit sheet per category.
const CATEGORY_DIR = path.join(OUTPUT_DIR, 'categories');
// Additional copy of the per-plant category export, written directly under ROOT_DIR.
const ROOT_EXPORT_PATH = path.join(ROOT_DIR, 'all-plants-categories.csv');
// TypeScript lexicon source files the audit entries are loaded from.
const BATCH_1_PATH = path.join(ROOT_DIR, 'constants', 'lexiconBatch1.ts');
const BATCH_2_PATH = path.join(ROOT_DIR, 'constants', 'lexiconBatch2.ts');

// Suggested audit order for categories. main() converts a category's position
// in this list into a 1-based `priority` in the summary; categories that are
// not listed here receive priority 999.
const AUDIT_PRIORITY = [
  'pet_friendly',
  'air_purifier',
  'medicinal',
  'low_light',
  'bright_light',
  'sun',
  'easy',
  'high_humidity',
  'hanging',
  'tree',
  'large',
  'patterned',
  'flowering',
  'succulent',
];

// Categories for which buildRiskFlags() emits a
// `<category>_requires_external_evidence` flag on every entry that carries them.
const HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES = new Set([
  'pet_friendly',
  'air_purifier',
  'medicinal',
]);

// Ordering applied by sortCategories() when rendering an entry's category list.
// Categories missing from this list are placed after the listed ones and
// ordered among themselves with localeCompare.
const CATEGORY_DISPLAY_ORDER = [
  'easy',
  'pet_friendly',
  'flowering',
  'succulent',
  'patterned',
  'tree',
  'large',
  'medicinal',
  'hanging',
  'air_purifier',
  'low_light',
  'bright_light',
  'high_humidity',
  'sun',
];

/**
 * Maps an import specifier that starts with `.` to an existing file on disk.
 *
 * Candidates are probed in this order: the path exactly as written, the path
 * with `.ts` appended, the path with `.tsx` appended, and finally an
 * `index.ts` inside the path treated as a directory.
 *
 * @param {string} fromFile - Path of the file that contains the import.
 * @param {string} specifier - Import specifier as written in the source.
 * @returns {string|null} The first candidate that exists and is a regular
 *   file, or `null` when the specifier does not start with `.` or no
 *   candidate matches.
 */
const resolveTsFilePath = (fromFile, specifier) => {
  if (!specifier.startsWith('.')) {
    return null;
  }

  const basePath = path.resolve(path.dirname(fromFile), specifier);
  const isExistingFile = (filePath) => fs.existsSync(filePath) && fs.statSync(filePath).isFile();

  const match = [
    basePath,
    ...['.ts', '.tsx'].map((extension) => `${basePath}${extension}`),
    path.join(basePath, 'index.ts'),
  ].find((filePath) => isExistingFile(filePath));

  return match ?? null;
};

/**
 * Loads a TypeScript module by transpiling it to CommonJS with
 * `ts.transpileModule` and evaluating the output in a new vm context.
 *
 * Imports inside the module are served by a local `require`:
 *   - specifiers that resolveTsFilePath() can map to a file are loaded
 *     recursively through this function (sharing `cache`);
 *   - other specifiers starting with `.` are resolved against the importing
 *     file's directory and handed to Node's `require`;
 *   - everything else is passed to Node's `require` unchanged.
 *
 * @param {string} absolutePath - Absolute path of the TypeScript file to load.
 * @param {Map<string, object>} [cache] - Exports already loaded, keyed by
 *   absolute path; shared across the recursive loads of one call tree.
 * @returns {object} The module's `module.exports` value.
 */
const loadTsModule = (absolutePath, cache = new Map()) => {
  if (cache.has(absolutePath)) return cache.get(absolutePath);

  const source = fs.readFileSync(absolutePath, 'utf8');
  const transpiled = ts.transpileModule(source, {
    compilerOptions: {
      module: ts.ModuleKind.CommonJS,
      target: ts.ScriptTarget.ES2020,
      esModuleInterop: true,
      jsx: ts.JsxEmit.ReactJSX,
    },
    fileName: absolutePath,
    reportDiagnostics: false,
  }).outputText;

  const module = { exports: {} };
  // Register the exports object before evaluation so that a circular import
  // gets the in-progress exports instead of re-entering this file.
  cache.set(absolutePath, module.exports);

  const importerDirectory = path.dirname(absolutePath);

  const localRequire = (specifier) => {
    const resolvedTsPath = resolveTsFilePath(absolutePath, specifier);
    if (resolvedTsPath) return loadTsModule(resolvedTsPath, cache);
    if (specifier.startsWith('.')) {
      // A bare require('./x') here would resolve relative to THIS script, not
      // the file being loaded, so anchor it to the importing file's directory.
      return require(path.resolve(importerDirectory, specifier));
    }
    return require(specifier);
  };

  const sandbox = {
    module,
    exports: module.exports,
    require: localRequire,
    __dirname: importerDirectory,
    __filename: absolutePath,
    console,
    process,
    Buffer,
    setTimeout,
    clearTimeout,
  };

  vm.runInNewContext(transpiled, sandbox, { filename: absolutePath });
  // The evaluated code may have reassigned module.exports; store the final value.
  cache.set(absolutePath, module.exports);
  return module.exports;
};

/**
 * Creates `directoryPath` together with any missing parent directories.
 *
 * @param {string} directoryPath - Directory to create.
 * @returns {void}
 */
const ensureDir = (directoryPath) => {
  const mkdirOptions = { recursive: true };
  fs.mkdirSync(directoryPath, mkdirOptions);
};

/**
 * Converts a value to a CSV field.
 *
 * `null` and `undefined` become the empty string; every other value is
 * stringified with `String()`. A field is wrapped in double quotes (with
 * embedded quotes doubled) when it contains a double quote, a comma, a line
 * feed, or a carriage return.
 *
 * @param {*} value - Cell value.
 * @returns {string} The escaped field.
 */
const csvEscape = (value) => {
  const text = String(value ?? '');
  // A bare \r is a line break for CSV readers too, so it must trigger quoting
  // just like \n does.
  if (!/[",\r\n]/.test(text)) {
    return text;
  }
  return `"${text.replace(/"/g, '""')}"`;
};

/**
 * Writes `rows` to `filePath` as UTF-8 CSV.
 *
 * Column names and order come from the keys of the first row. Header cells
 * and data cells are both passed through csvEscape(). Lines are joined with
 * `\n` and the file ends with a trailing newline. An empty `rows` array
 * produces an empty file.
 *
 * @param {string} filePath - Destination file.
 * @param {Array<Object>} rows - Row objects.
 * @returns {void}
 */
const writeCsv = (filePath, rows) => {
  if (!rows.length) {
    fs.writeFileSync(filePath, '', 'utf8');
    return;
  }

  const headers = Object.keys(rows[0]);
  const toLine = (cells) => cells.map((cell) => csvEscape(cell)).join(',');
  const lines = [
    // Headers are escaped like any other cell so a key containing a comma or
    // quote cannot shift the columns.
    toLine(headers),
    ...rows.map((row) => toLine(headers.map((header) => row[header]))),
  ];
  fs.writeFileSync(filePath, `${lines.join('\n')}\n`, 'utf8');
};

/**
 * Turns a category name into a file-name stem: every run of characters other
 * than ASCII letters, digits, `_` and `-` becomes a single `-`, and the
 * result is lower-cased.
 *
 * @param {string} category - Category name.
 * @returns {string} File-name-safe form of the category.
 */
const normalizeCategoryFilename = (category) => {
  const dashed = category.replace(/[^a-z0-9_-]+/gi, '-');
  return dashed.toLowerCase();
};

/**
 * Returns a sorted copy of `categories`; the input is not modified.
 *
 * Categories listed in CATEGORY_DISPLAY_ORDER come first, in that order.
 * Categories not listed there come last and are ordered with localeCompare.
 *
 * @param {string[]} [categories=[]] - Category names.
 * @returns {string[]} New array in display order.
 */
const sortCategories = (categories = []) => {
  const rankOf = (category) => {
    const position = CATEGORY_DISPLAY_ORDER.indexOf(category);
    return position === -1 ? Number.MAX_SAFE_INTEGER : position;
  };

  const ordered = [...categories];
  ordered.sort((left, right) => {
    const rankDifference = rankOf(left) - rankOf(right);
    if (rankDifference) {
      return rankDifference;
    }
    return left.localeCompare(right);
  });
  return ordered;
};

/**
 * Derives review flags from an entry's categories.
 *
 * Combination flags are emitted first, in this fixed order:
 *   - `low_light` + `sun`
 *   - `low_light` + `bright_light`
 *   - `succulent` + `high_humidity`
 * They are followed by one `<category>_requires_external_evidence` flag for
 * each category found in HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES, in the
 * order the categories appear on the entry. Duplicate flags are removed.
 *
 * @param {{categories?: string[]}} entry - Entry to inspect.
 * @returns {string[]} Unique flags; empty when nothing applies.
 */
const buildRiskFlags = (entry) => {
  const assigned = entry.categories || [];
  const membership = new Set(assigned);
  const hasAll = (...required) => required.every((category) => membership.has(category));

  // A Set keeps first-insertion order, which gives de-duplication for free.
  const flags = new Set();

  if (hasAll('low_light', 'sun')) {
    flags.add('light_conflict_low_light_and_sun');
  }
  if (hasAll('low_light', 'bright_light')) {
    flags.add('light_conflict_low_light_and_bright_light');
  }
  if (hasAll('succulent', 'high_humidity')) {
    flags.add('succulent_high_humidity_combo_review');
  }

  for (const category of assigned) {
    if (HIGH_CONFIDENCE_MANUAL_REVIEW_CATEGORIES.has(category)) {
      flags.add(`${category}_requires_external_evidence`);
    }
  }

  return Array.from(flags);
};

/**
 * Builds one audit-sheet row for an (entry, category) pair.
 *
 * The key order of the returned object is significant: writeCsv() takes the
 * column order from it. The last four columns are left blank.
 *
 * @param {Object} entry - Lexicon entry carrying `sourceFile`/`sourceIndex`.
 * @param {string} category - Category this row is audited under.
 * @returns {Object} Row object with snake_case column keys.
 */
const toAuditRow = (entry, category) => {
  const {
    sourceFile,
    sourceIndex,
    name,
    botanicalName,
    description,
    careInfo,
  } = entry;
  const allCategories = sortCategories(entry.categories || []).join('|');
  const riskFlags = buildRiskFlags(entry).join('|');

  return {
    category,
    source_file: sourceFile,
    source_index: sourceIndex,
    name,
    botanical_name: botanicalName,
    description: description || '',
    light: careInfo?.light || '',
    temp: careInfo?.temp || '',
    // `??` keeps a legitimate 0 instead of blanking it.
    water_interval_days: careInfo?.waterIntervalDays ?? '',
    all_categories: allCategories,
    risk_flags: riskFlags,
    audit_status: '',
    evidence_source: '',
    evidence_url: '',
    notes: '',
  };
};

/**
 * Builds the one-row-per-plant export record for an entry.
 *
 * The key order of the returned object is significant: writeCsv() takes the
 * column order from it.
 *
 * @param {Object} entry - Lexicon entry carrying `sourceFile`/`sourceIndex`.
 * @returns {Object} Row object with snake_case column keys.
 */
const toPlantCategoryRow = (entry) => {
  const assignedCategories = entry.categories || [];
  const { careInfo } = entry;

  return {
    source_file: entry.sourceFile,
    source_index: entry.sourceIndex,
    name: entry.name,
    botanical_name: entry.botanicalName,
    all_categories: sortCategories(assignedCategories).join('|'),
    category_count: assignedCategories.length,
    description: entry.description || '',
    light: careInfo?.light || '',
    temp: careInfo?.temp || '',
    // `??` keeps a legitimate 0 instead of blanking it.
    water_interval_days: careInfo?.waterIntervalDays ?? '',
  };
};

/**
 * Loads both lexicon batch files and returns their entries as one list,
 * batch 1 first.
 *
 * Each returned entry is a shallow copy of the source entry with two added
 * fields: `sourceFile` (the batch file's project-relative path) and
 * `sourceIndex` (the entry's 1-based position within its batch).
 *
 * @returns {Object[]} Entries of batch 1 followed by entries of batch 2.
 * @throws {Error} When either batch module does not export an array under
 *   its expected name.
 */
const loadBatchEntries = () => {
  const batches = [
    {
      sourceFile: 'constants/lexiconBatch1.ts',
      entries: loadTsModule(BATCH_1_PATH).LEXICON_BATCH_1_ENTRIES,
    },
    {
      sourceFile: 'constants/lexiconBatch2.ts',
      entries: loadTsModule(BATCH_2_PATH).LEXICON_BATCH_2_ENTRIES,
    },
  ];

  const allLoaded = batches.every(({ entries }) => Array.isArray(entries));
  if (!allLoaded) {
    throw new Error('Could not load lexicon batch entries.');
  }

  return batches.flatMap(({ sourceFile, entries }) => (
    entries.map((entry, index) => ({ ...entry, sourceFile, sourceIndex: index + 1 }))
  ));
};

/**
 * Sort comparator for entries: botanical name first, common name as tie-breaker.
 */
const compareEntriesByBotanicalName = (left, right) =>
  left.botanicalName.localeCompare(right.botanicalName) ||
  left.name.localeCompare(right.name);

/**
 * Builds the summary payload: generation timestamp, entry total, and one
 * record per category (count plus audit priority), sorted by priority, then
 * by descending count, then by category name.
 */
const buildAuditSummary = (entries, categories) => ({
  generatedAt: new Date().toISOString(),
  totalEntries: entries.length,
  categories: categories
    .map((category) => {
      const priorityIndex = AUDIT_PRIORITY.indexOf(category);
      return {
        category,
        count: entries.filter((entry) => (entry.categories || []).includes(category)).length,
        // Categories missing from AUDIT_PRIORITY sort after every listed one.
        priority: priorityIndex >= 0 ? priorityIndex + 1 : 999,
      };
    })
    .sort((left, right) =>
      left.priority - right.priority ||
      right.count - left.count ||
      left.category.localeCompare(right.category)),
});

/**
 * Renders the README.md text that accompanies the generated artifacts.
 */
const renderAuditReadme = (summary) => `# Semantic Search Audit

Generated: ${summary.generatedAt}

Files:
- \`summary.json\`: category counts and suggested audit order
- \`all-plants-categories.csv\`: one row per plant with its full category list
- \`master.csv\`: all category assignments with blank evidence columns
- \`suspicious.csv\`: entries that require elevated review based on rule flags
- \`categories/*.csv\`: per-category audit sheets

Suggested audit order:
${summary.categories.map((item) => `- ${item.category} (${item.count})`).join('\n')}

Workflow:
1. Review one category CSV at a time.
2. Fill \`audit_status\`, \`evidence_source\`, \`evidence_url\`, and \`notes\`.
3. Apply only high-confidence source-tag corrections to the lexicon batch files.
4. Rebuild the server catalog from batches after source edits.
`;

/**
 * Script entry point: loads the lexicon entries and writes every audit
 * artifact.
 *
 * Files written, in order: one CSV per category under CATEGORY_DIR, then
 * `all-plants-categories.csv` (under OUTPUT_DIR and again at
 * ROOT_EXPORT_PATH), `master.csv`, `suspicious.csv`, `summary.json`,
 * `suspicious.json` and `README.md` under OUTPUT_DIR. A short report is
 * logged to the console at the end.
 */
const main = () => {
  // Creating CATEGORY_DIR recursively also creates OUTPUT_DIR, its parent.
  ensureDir(CATEGORY_DIR);
  const entries = loadBatchEntries();
  const categories = [...new Set(entries.flatMap((entry) => entry.categories || []))].sort();
  const summary = buildAuditSummary(entries, categories);

  const plantCategoryRows = [...entries]
    .sort(compareEntriesByBotanicalName)
    .map((entry) => toPlantCategoryRow(entry));

  const masterRows = [];
  const suspiciousRows = [];

  categories.forEach((category) => {
    const categoryEntries = entries
      .filter((entry) => (entry.categories || []).includes(category))
      .sort(compareEntriesByBotanicalName);

    const rows = [];
    for (const entry of categoryEntries) {
      const row = toAuditRow(entry, category);
      rows.push(row);
      masterRows.push(row);

      // risk_flags is the '|'-joined flag list; a non-empty string means at
      // least one flag was raised for this entry.
      if (row.risk_flags) {
        suspiciousRows.push({
          category,
          source_file: entry.sourceFile,
          source_index: entry.sourceIndex,
          name: entry.name,
          botanical_name: entry.botanicalName,
          risk_flags: row.risk_flags,
        });
      }
    }

    writeCsv(path.join(CATEGORY_DIR, `${normalizeCategoryFilename(category)}.csv`), rows);
  });

  writeCsv(path.join(OUTPUT_DIR, 'all-plants-categories.csv'), plantCategoryRows);
  writeCsv(ROOT_EXPORT_PATH, plantCategoryRows);
  writeCsv(path.join(OUTPUT_DIR, 'master.csv'), masterRows);
  writeCsv(path.join(OUTPUT_DIR, 'suspicious.csv'), suspiciousRows);
  fs.writeFileSync(path.join(OUTPUT_DIR, 'summary.json'), `${JSON.stringify(summary, null, 2)}\n`, 'utf8');
  fs.writeFileSync(path.join(OUTPUT_DIR, 'suspicious.json'), `${JSON.stringify(suspiciousRows, null, 2)}\n`, 'utf8');

  fs.writeFileSync(path.join(OUTPUT_DIR, 'README.md'), renderAuditReadme(summary), 'utf8');

  console.log(`Audit artifacts written to ${OUTPUT_DIR}`);
  console.log(`Categories exported: ${categories.length}`);
  console.log(`Suspicious rows flagged: ${suspiciousRows.length}`);
};

main();