// Greenlens/utils/hybridSearch.ts
//
// 214 lines
// 7.2 KiB
// TypeScript

import { CareInfo } from '../types';
const { SEARCH_INTENT_CONFIG } = require('../constants/searchIntentConfig');
/**
 * Shape of one intent definition as read from `SEARCH_INTENT_CONFIG.intents`.
 * Every list is optional; the code in this file falls back to an empty list.
 */
type SearchIntentConfig = {
// Phrases matched against the user's query to decide that the intent was requested.
aliases?: string[];
// Phrases looked for in an entry's description to infer the intent for that entry.
entryHints?: string[];
// Phrases looked for in an entry's `careInfo.light` text to infer the intent for that entry.
lightHints?: string[];
};
/**
 * Minimal shape an entry must have to be scored and ranked by the hybrid search.
 */
export interface HybridSearchEntryLike {
// Primary name: matched exactly / by prefix / by substring, fuzzy-matched, and used as a ranking tie-breaker.
name: string;
// Scientific name: matched like `name`, with slightly lower weights.
botanicalName?: string;
// Free text: searched for the query and scanned for intent entry hints.
description?: string;
// Category labels: compared with detected intent ids and included in fuzzy matching.
categories?: string[];
// Care details: scoring reads its `light` text to look for intent light hints.
careInfo?: Partial<CareInfo> | null;
}
/** An entry paired with the relevance score computed for it. */
interface RankedEntry<T> {
// The original entry object, passed through unchanged.
entry: T;
// Relevance score; 0 when the entry was returned for an empty query (nothing was scored).
score: number;
}
/**
 * Normalizes every string in `values`, drops the ones that normalize to an
 * empty string, and removes duplicates while keeping first-seen order.
 *
 * @param values Raw strings (for example hint phrases from the intent config).
 * @returns Unique, non-empty normalized strings.
 */
const normalizeArray = (values: string[]): string[] => {
  const unique = new Set<string>();
  for (const raw of values) {
    const normalized = normalizeSearchText(raw);
    if (normalized) {
      unique.add(normalized);
    }
  }
  return Array.from(unique);
};
/**
 * Canonicalizes free text for comparison: lower-cases it, strips combining
 * diacritics, turns every run of characters other than `a-z`, `0-9` and
 * whitespace (including `_` and `-`) into a single space, and trims the result.
 *
 * Trimming happens last. Punctuation at the edges of the input is replaced
 * by spaces, so trimming earlier would leave leading/trailing whitespace in the
 * result (e.g. `"rose!"` -> `"rose "`), which breaks exact/prefix comparisons
 * and makes the function non-idempotent.
 *
 * @param value Raw text such as a query, a name or a description.
 * @returns The normalized text; an empty string when nothing searchable remains.
 */
export const normalizeSearchText = (value: string): string => {
  return value
    .toLowerCase()
    // Decompose accented letters so the combining marks can be removed below.
    .normalize('NFD')
    .replace(/[\u0300-\u036f]/g, '')
    .replace(/[^a-z0-9\s_-]+/g, ' ')
    .replace(/[_-]+/g, ' ')
    .replace(/\s+/g, ' ')
    .trim();
};
/**
 * Splits already-normalized text on single spaces and discards empty pieces.
 *
 * @param normalizedValue Text whose words are separated by spaces.
 * @returns The non-empty words in their original order.
 */
const tokenize = (normalizedValue: string): string[] => {
  const words: string[] = [];
  for (const piece of normalizedValue.split(' ')) {
    if (piece) {
      words.push(piece);
    }
  }
  return words;
};
/**
 * Returns the distinct words of a normalized query, excluding the words listed
 * (after normalization) in `SEARCH_INTENT_CONFIG.noiseTokens`.
 *
 * @param normalizedQuery Query text that has already been normalized.
 * @returns The set of remaining query words, in first-seen order.
 */
const tokenSetFromQuery = (normalizedQuery: string): Set<string> => {
  const words = tokenize(normalizedQuery);

  const configuredNoise: string[] = SEARCH_INTENT_CONFIG.noiseTokens;
  const noiseWords = new Set<string>();
  for (const token of configuredNoise) {
    noiseWords.add(normalizeSearchText(token));
  }

  const meaningful = new Set<string>();
  for (const word of words) {
    if (!noiseWords.has(word)) {
      meaningful.add(word);
    }
  }
  return meaningful;
};
/**
 * Decides whether a normalized alias is present in the query.
 *
 * The alias matches when it occurs as a substring of the query, or, failing
 * that, when every word of the alias is one of the query's tokens (in any order).
 *
 * @param normalizedQuery Normalized query text.
 * @param normalizedAlias Normalized alias phrase; an empty alias never matches.
 * @param queryTokens Token set of the query.
 * @returns `true` when the alias is considered present in the query.
 */
const includesPhrase = (normalizedQuery: string, normalizedAlias: string, queryTokens: Set<string>): boolean => {
  if (!normalizedAlias) {
    return false;
  }
  if (normalizedQuery.includes(normalizedAlias)) {
    return true;
  }

  const aliasWords = tokenize(normalizedAlias);
  if (aliasWords.length > 1) {
    // Multi-word alias: all of its words must appear among the query tokens.
    return aliasWords.every((word) => queryTokens.has(word));
  }
  return queryTokens.has(normalizedAlias);
};
/**
 * Lists the ids of all configured intents that the query asks for.
 *
 * An intent is requested when at least one of its aliases is found in the
 * query (see `includesPhrase`). Ids are returned in config iteration order.
 *
 * @param normalizedQuery Normalized query text.
 * @returns Ids of the intents detected in the query.
 */
const detectQueryIntents = (normalizedQuery: string): string[] => {
  const queryTokens = tokenSetFromQuery(normalizedQuery);
  const definitions = Object.entries(SEARCH_INTENT_CONFIG.intents) as Array<[string, SearchIntentConfig]>;

  const detected: string[] = [];
  for (const [intentId, definition] of definitions) {
    const aliases = definition.aliases || [];
    const requested = aliases.some((alias) =>
      includesPhrase(normalizedQuery, normalizeSearchText(alias), queryTokens));
    if (requested) {
      detected.push(intentId);
    }
  }
  return detected;
};
/**
 * Computes the Levenshtein edit distance between two strings, counting
 * insertions, deletions and substitutions as one edit each. Characters are
 * compared by string index (UTF-16 code units).
 *
 * @param left First string.
 * @param right Second string.
 * @returns The minimum number of single-character edits turning `left` into `right`.
 */
const getLevenshteinDistance = (left: string, right: string): number => {
  // previousRow[j] holds the distance between the processed prefix of `left`
  // and the first j characters of `right`; it starts as the row for "".
  let previousRow: number[] = Array.from({ length: right.length + 1 }, (_, index) => index);

  for (let i = 1; i <= left.length; i += 1) {
    const currentRow: number[] = [i];
    for (let j = 1; j <= right.length; j += 1) {
      const substitution = previousRow[j - 1] + (left[i - 1] === right[j - 1] ? 0 : 1);
      const deletion = previousRow[j] + 1;
      const insertion = currentRow[j - 1] + 1;
      currentRow.push(Math.min(deletion, insertion, substitution));
    }
    previousRow = currentRow;
  }

  return previousRow[right.length];
};
/**
 * Awards a small bonus when the query is a near-miss (typo) of a candidate.
 *
 * The query is compared, by edit distance, with every non-empty candidate as a
 * whole and with each of its words. Only queries of 3 to 32 characters are
 * considered.
 *
 * @param normalizedQuery Normalized query text.
 * @param candidates Normalized candidate strings; empty ones are skipped.
 * @returns 14 when the smallest distance is 1, 8 when it is 2, otherwise 0
 *          (an exact match, distance 0, earns no fuzzy bonus).
 */
const fuzzyBonus = (normalizedQuery: string, candidates: string[]): number => {
  const queryLength = normalizedQuery.length;
  if (queryLength < 3 || queryLength > 32) {
    return 0;
  }

  let closest = Number.POSITIVE_INFINITY;
  for (const candidate of candidates) {
    if (!candidate) {
      continue;
    }
    for (const target of [...tokenize(candidate), candidate]) {
      closest = Math.min(closest, getLevenshteinDistance(normalizedQuery, target));
    }
  }

  switch (closest) {
    case 1:
      return 14;
    case 2:
      return 8;
    default:
      return 0;
  }
};
/**
 * Scores how the query relates to a target string, using caller-supplied weights.
 *
 * @param normalizedQuery Normalized query text.
 * @param normalizedTarget Normalized text to match against.
 * @param exact Score returned when the target equals the query.
 * @param prefix Score returned when the target starts with the query.
 * @param contains Score returned when the query occurs elsewhere in the target.
 * @returns The weight of the strongest match, or 0 when either string is empty
 *          or the query does not occur in the target.
 */
const scoreTextMatch = (normalizedQuery: string, normalizedTarget: string, exact: number, prefix: number, contains: number): number => {
  if (!normalizedQuery || !normalizedTarget) {
    return 0;
  }

  const position = normalizedTarget.indexOf(normalizedQuery);
  if (position < 0) {
    return 0;
  }
  if (position > 0) {
    return contains;
  }
  // The query sits at the start: equal lengths mean the strings are identical.
  return normalizedTarget.length === normalizedQuery.length ? exact : prefix;
};
/**
 * Collects the signals an entry exposes for intent matching.
 *
 * The result holds the entry's normalized categories plus the id of every
 * configured intent whose entry hints occur in the normalized description or
 * whose light hints occur in the normalized `careInfo.light` text.
 *
 * @param entry Entry to inspect.
 * @returns Distinct signals: normalized categories first, then intent ids.
 */
const buildDerivedIntentSignals = (entry: HybridSearchEntryLike): string[] => {
  const descriptionText = normalizeSearchText(entry.description || '');
  const lightText = normalizeSearchText(entry.careInfo?.light || '');

  const signals = new Set<string>();
  for (const category of entry.categories || []) {
    signals.add(normalizeSearchText(category));
  }

  const definitions = Object.entries(SEARCH_INTENT_CONFIG.intents) as Array<[string, SearchIntentConfig]>;
  for (const [intentId, intentConfig] of definitions) {
    const hintedByDescription = normalizeArray(intentConfig.entryHints || [])
      .some((hint) => descriptionText.includes(hint));
    const hintedByLight = normalizeArray(intentConfig.lightHints || [])
      .some((hint) => lightText.includes(hint));
    if (hintedByDescription || hintedByLight) {
      signals.add(intentId);
    }
  }

  return Array.from(signals);
};
/**
 * Computes a relevance score for one entry against a free-text query.
 *
 * The score is the sum of:
 * - the better of the name match (140 exact / 100 prefix / 64 contains) and the
 *   botanical-name match (130 / 96 / 58);
 * - 24 when the description contains the query;
 * - a typo bonus from `fuzzyBonus` over name, botanical name and categories;
 * - per intent detected in the query: 92 when a category equals the intent id,
 *   otherwise 56 when the intent is among the entry's derived signals;
 * - an intent combination bonus: 38 per matched intent when two or more
 *   matched, or 10 when exactly one matched;
 * - for multi-word queries: 8 per query token found in the entry's searchable
 *   text, plus 16 when every token is found.
 *
 * @param entry Entry to score; missing optional fields are treated as empty.
 * @param query Raw or normalized query text.
 * @returns A non-negative score; 0 for an empty query or no match at all.
 */
export const scoreHybridEntry = (entry: HybridSearchEntryLike, query: string): number => {
  const normalizedQuery = normalizeSearchText(query);
  if (!normalizedQuery) return 0;

  const normalizedName = normalizeSearchText(entry.name || '');
  const normalizedBotanical = normalizeSearchText(entry.botanicalName || '');
  const normalizedDescription = normalizeSearchText(entry.description || '');
  const normalizedCategories = (entry.categories || []).map((category) => normalizeSearchText(category));
  const derivedSignals = buildDerivedIntentSignals(entry);
  const requestedIntents = detectQueryIntents(normalizedQuery);

  let score = 0;

  // Direct text match: only the stronger of name / botanical name counts.
  score += Math.max(
    scoreTextMatch(normalizedQuery, normalizedName, 140, 100, 64),
    scoreTextMatch(normalizedQuery, normalizedBotanical, 130, 96, 58),
  );
  if (normalizedDescription.includes(normalizedQuery)) {
    score += 24;
  }
  score += fuzzyBonus(normalizedQuery, [normalizedName, normalizedBotanical, ...normalizedCategories]);

  let matchedIntentCount = 0;
  requestedIntents.forEach((intentId) => {
    // Categories were normalized above, so the intent id has to be normalized
    // the same way; otherwise an id containing uppercase letters, "_" or "-"
    // could never equal a category.
    const normalizedIntentId = normalizeSearchText(intentId);
    const categoryHit = normalizedIntentId !== '' && normalizedCategories.includes(normalizedIntentId);
    // Derived signals carry the raw intent id, so compare it unmodified.
    const derivedHit = derivedSignals.includes(intentId);
    if (categoryHit) {
      score += 92;
      matchedIntentCount += 1;
      return;
    }
    if (derivedHit) {
      score += 56;
      matchedIntentCount += 1;
    }
  });

  if (matchedIntentCount >= 2) {
    score += 38 * matchedIntentCount;
  } else if (matchedIntentCount === 1) {
    score += 10;
  }

  // Token coverage only applies to multi-word queries (noise words excluded).
  const queryTokens = [...tokenSetFromQuery(normalizedQuery)];
  if (queryTokens.length > 1) {
    const searchableText = [normalizedName, normalizedBotanical, normalizedDescription, ...normalizedCategories, ...derivedSignals].join(' ');
    const tokenHits = queryTokens.filter((token) => searchableText.includes(token)).length;
    score += tokenHits * 8;
    if (tokenHits === queryTokens.length) {
      score += 16;
    }
  }

  return score;
};
/**
 * Scores every entry against the query and returns the best matches.
 *
 * When the query has no searchable characters, nothing is scored and the
 * first `limit` entries are returned in their original order with score 0.
 * Otherwise entries scoring 0 are dropped and the rest are ordered by score
 * (highest first), then by shorter name, then by name via `localeCompare`.
 *
 * @param entries Entries to rank; the input array is not modified.
 * @param query Raw query text.
 * @param limit Maximum number of results to return (default 30).
 * @returns Ranked entries paired with their scores.
 */
export const rankHybridEntries = <T extends HybridSearchEntryLike>(
  entries: T[],
  query: string,
  limit = 30,
): RankedEntry<T>[] => {
  // Trim defensively so a query without any searchable characters (for
  // example only punctuation) is handled as an empty query.
  const normalizedQuery = normalizeSearchText(query).trim();
  if (!normalizedQuery) {
    return entries.slice(0, limit).map((entry) => ({ entry, score: 0 }));
  }

  return entries
    .map((entry) => ({ entry, score: scoreHybridEntry(entry, normalizedQuery) }))
    .filter((candidate) => candidate.score > 0)
    .sort((left, right) => {
      // Scoring tolerates a missing name, so the tie-breaker must as well
      // instead of throwing on `undefined.length`.
      const leftName = left.entry.name || '';
      const rightName = right.entry.name || '';
      return (
        right.score - left.score ||
        leftName.length - rightName.length ||
        leftName.localeCompare(rightName)
      );
    })
    .slice(0, limit);
};