487 lines
14 KiB
JavaScript
487 lines
14 KiB
JavaScript
#!/usr/bin/env node
|
|
/* eslint-disable no-console */
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
const crypto = require('crypto');
|
|
require('dotenv').config();
|
|
|
|
const sharp = require('sharp');
|
|
const { openDatabase, closeDatabase, all, run } = require('../lib/sqlite');
|
|
const { ensurePlantSchema } = require('../lib/plants');
|
|
|
|
// Folder that receives the converted images: <this script's directory>/../public/plants.
const OUTPUT_DIR = path.join(__dirname, '../public/plants');
// Report of the most recent import run, written next to the images.
const MANIFEST_PATH = path.join(OUTPUT_DIR, 'manifest.json');
// Two levels above this script's directory.
const ROOT_DIR = path.join(__dirname, '../..');
// JSON dump read by loadDumpFallbackMap to find fallback image URLs.
const PLANTS_DUMP_PATH = path.join(ROOT_DIR, 'plants_dump_utf8.json');
// On-disk cache of Wikimedia search results, keyed by query string.
const SEARCH_CACHE_PATH = path.join(OUTPUT_DIR, 'wikimedia-search-cache.json');
|
|
/**
 * Turn the raw PLANT_IMAGE_CONCURRENCY setting into a usable worker count.
 *
 * Missing, empty, non-numeric, zero and negative values all fall back to 1 so
 * that the import never ends up with zero workers. Fractions are rounded down.
 *
 * @param {string|number|undefined} rawValue - Value taken from the environment.
 * @returns {number} A worker count of at least 1.
 */
const parseConcurrency = (rawValue) => {
  const parsed = Math.floor(Number(rawValue));
  if (Number.isNaN(parsed) || parsed < 1) return 1;
  return parsed;
};

// Number of plants processed in parallel (default 1).
const MAX_CONCURRENCY = parseConcurrency(process.env.PLANT_IMAGE_CONCURRENCY);
// Abort an HTTP request that has not completed after this many milliseconds.
const REQUEST_TIMEOUT_MS = 20000;
// Upper bound on retries after a 429 / 5xx response.
const MAX_FETCH_RETRIES = 5;
// An imageUri starting with this prefix is a search query rather than a URL.
const WIKIMEDIA_SEARCH_PREFIX = 'wikimedia-search:';
|
|
|
|
/**
 * Promise-based delay.
 *
 * @param {number} ms - Delay handed to setTimeout, in milliseconds.
 * @returns {Promise<void>} Resolves (with no value) once the timer fires.
 */
const sleep = (ms) => new Promise((wake) => {
  setTimeout(wake, ms);
});
|
|
|
|
/**
 * Convert arbitrary text into a lowercase, hyphen-separated ASCII slug.
 *
 * Falsy input and input that contains no ASCII letters or digits both yield
 * the placeholder slug 'plant', so the result is never empty.
 *
 * @param {*} value - Text to slugify; coerced with String().
 * @returns {string} The slug, or 'plant' when nothing usable remains.
 */
const slugify = (value) => {
  const text = String(value || '');
  // NFD splits accented letters into base letter + combining mark, so the
  // marks (U+0300..U+036F) can be removed and the plain letter survives.
  const withoutMarks = text.normalize('NFD').replace(/[\u0300-\u036f]/g, '');
  const hyphenated = withoutMarks.toLowerCase().replace(/[^a-z0-9]+/g, '-');
  const slug = hyphenated.replace(/^-+|-+$/g, '');
  return slug === '' ? 'plant' : slug;
};
|
|
|
|
/**
 * Build the extension-less file name used for a plant's local image.
 *
 * The name is `<botanical slug>--<name slug>--<hash>`; the name slug is left
 * out when it equals the botanical slug. The hash is the first 8 hex digits
 * of the SHA-1 of "id|botanicalName|name".
 *
 * @param {{id: *, botanicalName: *, name: *}} plant - Plant row.
 * @returns {string} File base name without extension.
 */
const buildFileBaseName = (plant) => {
  const segments = [slugify(plant.botanicalName)];

  const nameSlug = slugify(plant.name);
  if (nameSlug && nameSlug !== segments[0]) {
    segments.push(nameSlug);
  }

  const hash = crypto.createHash('sha1');
  hash.update(`${plant.id}|${plant.botanicalName}|${plant.name}`);
  segments.push(hash.digest('hex').slice(0, 8));

  return segments.join('--');
};
|
|
|
|
/** Create OUTPUT_DIR, including missing parent directories; a no-op when it already exists. */
const ensureOutputDir = () => {
  const options = { recursive: true };
  fs.mkdirSync(OUTPUT_DIR, options);
};
|
|
|
|
/**
 * Read the comma-separated PLANT_IMAGE_REFRESH environment variable.
 *
 * @returns {Set<string>} Trimmed, lower-cased, non-empty entries; an empty
 *   set when the variable is unset or blank.
 */
const loadRefreshMatchers = () => {
  const rawList = String(process.env.PLANT_IMAGE_REFRESH || '');
  const matchers = new Set();

  for (const piece of rawList.split(',')) {
    const token = piece.trim().toLowerCase();
    if (token) matchers.add(token);
  }

  return matchers;
};
|
|
|
|
/**
 * Read and parse the manifest file at MANIFEST_PATH.
 *
 * @returns {*} The parsed JSON, or `{ generatedAt: null, items: [] }` when the
 *   file cannot be read or parsed.
 */
const loadManifest = () => {
  let manifest;
  try {
    manifest = JSON.parse(fs.readFileSync(MANIFEST_PATH, 'utf8'));
  } catch {
    // Missing or unparsable file: hand back an empty manifest instead.
    manifest = { generatedAt: null, items: [] };
  }
  return manifest;
};
|
|
|
|
/**
 * Write the manifest to MANIFEST_PATH as JSON indented by two spaces.
 *
 * @param {*} manifest - Value to serialize.
 */
const saveManifest = (manifest) => {
  const serialized = JSON.stringify(manifest, null, 2);
  fs.writeFileSync(MANIFEST_PATH, serialized);
};
|
|
|
|
/**
 * Load the Wikimedia search cache from SEARCH_CACHE_PATH.
 *
 * The cache is used as a plain object keyed by query string, so anything
 * else is rejected: a missing file, unparsable JSON, or valid JSON that is
 * not a plain object (`null`, an array, a string, a number) all produce a
 * fresh empty cache instead of a value that later property access would
 * choke on.
 *
 * @returns {Object<string, (string|null)>} The cache object (possibly empty).
 */
const loadSearchCache = () => {
  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(SEARCH_CACHE_PATH, 'utf8'));
  } catch {
    // Missing or unparsable cache file: start over with an empty cache.
    return {};
  }

  const isPlainObject = parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed);
  return isPlainObject ? parsed : {};
};
|
|
|
|
/**
 * Write the search cache to SEARCH_CACHE_PATH as JSON indented by two spaces.
 *
 * @param {*} cache - Cache object to serialize.
 */
const saveSearchCache = (cache) => {
  const serialized = JSON.stringify(cache, null, 2);
  fs.writeFileSync(SEARCH_CACHE_PATH, serialized);
};
|
|
|
|
/**
 * Decide whether a plant was explicitly selected for re-download.
 *
 * @param {{id: *, name: *, botanicalName: *}} plant - Plant row.
 * @param {Set<string>|null|undefined} refreshMatchers - Lower-cased selectors.
 * @returns {boolean} True when the plant's id, name or botanical name
 *   (stringified, trimmed, lower-cased) is one of the selectors.
 */
const shouldRefreshPlantImage = (plant, refreshMatchers) => {
  if (!refreshMatchers || refreshMatchers.size === 0) return false;

  for (const field of [plant.id, plant.name, plant.botanicalName]) {
    const key = String(field || '').trim().toLowerCase();
    if (refreshMatchers.has(key)) return true;
  }

  return false;
};
|
|
|
|
/**
 * Build a lookup of fallback image URLs from the JSON dump at PLANTS_DUMP_PATH.
 *
 * The dump is expected to be an array of objects with string `botanicalName`
 * and `imageUri` fields. Entries are keyed by the trimmed, lower-cased
 * botanical name; the first usable entry for a name wins. Only http(s) URIs
 * are kept. The URI is trimmed BEFORE it is validated, so a value with
 * stray leading whitespace is accepted rather than dropped.
 *
 * @returns {Map<string, string>} botanical name -> image URL; empty when the
 *   dump is missing, unparsable or not an array.
 */
const loadDumpFallbackMap = () => {
  const fallbackByName = new Map();

  let entries;
  try {
    entries = JSON.parse(fs.readFileSync(PLANTS_DUMP_PATH, 'utf8'));
  } catch {
    // No dump available (or it is corrupt): run without fallbacks.
    return fallbackByName;
  }
  if (!Array.isArray(entries)) return fallbackByName;

  for (const entry of entries) {
    if (!entry || typeof entry.botanicalName !== 'string' || typeof entry.imageUri !== 'string') continue;

    const key = entry.botanicalName.trim().toLowerCase();
    const imageUri = entry.imageUri.trim();
    if (!key || !/^https?:\/\//i.test(imageUri)) continue;

    if (!fallbackByName.has(key)) fallbackByName.set(key, imageUri);
  }

  return fallbackByName;
};
|
|
|
|
/**
 * Work out how long to wait before retrying a throttled or failed request.
 *
 * Preference order:
 *  1. a positive numeric `Retry-After` value (seconds),
 *  2. a `Retry-After` HTTP-date that lies in the future,
 *  3. exponential back-off: 3s * 2^attempt, capped at 30s.
 *
 * @param {number} attempt - Zero-based count of retries already made.
 * @param {string|null|undefined} retryAfterHeader - Raw `Retry-After` value.
 * @returns {number} Delay in milliseconds.
 */
const getRetryDelayMs = (attempt, retryAfterHeader) => {
  const retryAfterSeconds = Number(retryAfterHeader);
  if (Number.isFinite(retryAfterSeconds) && retryAfterSeconds > 0) {
    return retryAfterSeconds * 1000;
  }

  // Only non-numeric strings are tried as dates, so "0" or "-5" can never be
  // misread by the lenient date parser.
  if (typeof retryAfterHeader === 'string' && Number.isNaN(retryAfterSeconds)) {
    const waitMs = Date.parse(retryAfterHeader) - Date.now();
    if (Number.isFinite(waitMs) && waitMs > 0) return waitMs;
  }

  return Math.min(30000, 3000 * 2 ** attempt);
};
|
|
|
|
/**
 * Percent-decode a value without throwing.
 *
 * @param {*} value - Text to decode.
 * @returns {*} The decoded string, or the input unchanged when it contains a
 *   malformed escape sequence.
 */
const tryDecode = (value) => {
  let result = value;
  try {
    result = decodeURIComponent(value);
  } catch {
    // Malformed escape sequence: keep the input as it was.
  }
  return result;
};
|
|
|
|
/**
 * Undo several layers of percent-encoding.
 *
 * Decoding is repeated until the text stops changing or `rounds` passes have
 * been made, whichever comes first.
 *
 * @param {string} value - Possibly multiply-encoded text.
 * @param {number} [rounds=3] - Maximum number of decoding passes.
 * @returns {string} The decoded text.
 */
const decodeRepeatedly = (value, rounds = 3) => {
  let current = value;
  let remaining = rounds;

  while (remaining > 0) {
    const next = tryDecode(current);
    if (next === current) {
      // Nothing left to decode.
      return current;
    }
    current = next;
    remaining -= 1;
  }

  return current;
};
|
|
|
|
/**
 * Derive a Commons `Special:FilePath` URL from an upload.wikimedia.org URL.
 *
 * For thumbnail URLs (`.../thumb/<x>/<xy>/<File>/<size>-<File>`) the file name
 * is the second-to-last path segment; otherwise it is the last one. Query
 * string and fragment are ignored.
 *
 * @param {*} rawUrl - Candidate URL.
 * @returns {string|null} The Special:FilePath URL, or null when the input is
 *   not a string containing `upload.wikimedia.org/wikipedia/commons/` or no
 *   file name can be extracted.
 */
const toWikimediaFilePathUrl = (rawUrl) => {
  const isCommonsUpload = typeof rawUrl === 'string'
    && rawUrl.includes('upload.wikimedia.org/wikipedia/commons/');
  if (!isCommonsUpload) return null;

  const [withoutQuery] = rawUrl.split(/[?#]/);
  const segments = withoutQuery.split('/').filter((segment) => segment !== '');
  if (segments.length < 2) return null;

  // A full thumbnail path has at least four segments after "thumb".
  const thumbAt = segments.indexOf('thumb');
  const isThumbnail = thumbAt >= 0 && segments.length >= thumbAt + 5;
  const encodedName = segments[segments.length - (isThumbnail ? 2 : 1)];
  if (!encodedName) return null;

  const title = tryDecode(encodedName).trim();
  return title
    ? `https://commons.wikimedia.org/wiki/Special:FilePath/${encodeURIComponent(title)}`
    : null;
};
|
|
|
|
/**
 * Extract the search query from a `wikimedia-search:<query>` placeholder.
 *
 * The prefix is matched case-insensitively after trimming; the remainder is
 * trimmed and percent-decoded via decodeRepeatedly.
 *
 * @param {*} value - Stored imageUri value.
 * @returns {string|null} The decoded query, or null when the value is not a
 *   string, lacks the prefix, or has nothing after it.
 */
const parseWikimediaSearchQuery = (value) => {
  // Non-strings become '', which can never carry the prefix.
  const text = typeof value === 'string' ? value.trim() : '';
  if (!text.toLowerCase().startsWith(WIKIMEDIA_SEARCH_PREFIX)) return null;

  const query = text.slice(WIKIMEDIA_SEARCH_PREFIX.length).trim();
  return query ? decodeRepeatedly(query) : null;
};
|
|
|
|
/**
 * Download a URL into a Buffer, following redirects and retrying throttled
 * or server-error responses.
 *
 * - Redirects (301/302/303/307/308) are followed manually; more than six hops
 *   in a row fail with "Too many redirects".
 * - 429 and 5xx responses are retried after the delay from getRetryDelayMs,
 *   until MAX_FETCH_RETRIES retries have been used.
 * - Every individual request is aborted after REQUEST_TIMEOUT_MS. The timer
 *   is cleared as soon as that request is finished, so it is never left armed
 *   during a back-off sleep or a follow-up request.
 * - Bodies of responses that are not going to be read (redirects, retried
 *   errors) are cancelled so the underlying connection can be released.
 *
 * @param {string} url - URL to download.
 * @param {number} [attempt=0] - Retries already used (normally left at 0).
 * @param {number} [redirectCount=0] - Redirects already followed (normally 0).
 * @returns {Promise<Buffer>} The response body.
 * @throws {Error} On too many redirects, a redirect without Location, a
 *   non-OK final status (`HTTP <status>`), or a network/abort error.
 */
const fetchImageBuffer = async (url, attempt = 0, redirectCount = 0) => {
  // Best-effort release of a body we are not going to consume.
  const discardBody = async (response) => {
    try {
      await response.body?.cancel();
    } catch {
      // Nothing useful to do if the stream cannot be cancelled.
    }
  };

  let currentUrl = url;
  let attemptsUsed = attempt;
  let redirectsFollowed = redirectCount;

  for (;;) {
    if (redirectsFollowed > 5) {
      throw new Error('Too many redirects');
    }

    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
    let retryDelayMs = 0;

    try {
      const response = await fetch(currentUrl, {
        headers: {
          'User-Agent': 'GreenLens-PlantImageImporter/1.0',
          'Accept': 'image/avif,image/webp,image/apng,image/*,*/*;q=0.8',
          'Referer': 'https://commons.wikimedia.org/',
        },
        redirect: 'manual',
        signal: controller.signal,
      });

      if ([301, 302, 303, 307, 308].includes(response.status)) {
        const location = response.headers.get('location');
        await discardBody(response);
        if (!location) throw new Error(`Redirect without location for ${currentUrl}`);
        // Location may be relative; resolve it against the URL just requested.
        currentUrl = new URL(location, currentUrl).toString();
        redirectsFollowed += 1;
        continue;
      }

      const isRetryable = response.status === 429 || response.status >= 500;
      if (!isRetryable || attemptsUsed >= MAX_FETCH_RETRIES) {
        if (!response.ok) {
          throw new Error(`HTTP ${response.status}`);
        }
        return Buffer.from(await response.arrayBuffer());
      }

      retryDelayMs = getRetryDelayMs(attemptsUsed, response.headers.get('retry-after'));
      await discardBody(response);
      attemptsUsed += 1;
    } finally {
      clearTimeout(timeout);
    }

    // Reached only on the retry path; the request timer is already cleared.
    await sleep(retryDelayMs);
  }
};
|
|
|
|
/**
 * Look up an image URL for a query via the Wikimedia Commons search API,
 * using and updating the on-disk search cache.
 *
 * Caching rules:
 *  - A hit (including a cached "no result") is returned without a request.
 *  - A found URL is cached.
 *  - "No result" is cached only when the API answered successfully but
 *    offered no usable image. Transient problems (non-OK status, network
 *    error, timeout, unparsable body) are NOT cached, so a later run can
 *    try again.
 *
 * Result pages are tried in ascending `index` order when that field is
 * present (NOTE(review): per the MediaWiki API docs this is the search rank
 * for generator results - confirm); pages without it keep their relative
 * order after the ranked ones.
 *
 * @param {*} query - Search text; falsy or blank queries return null at once.
 * @param {Object<string, (string|null)>} searchCache - Mutable cache object.
 * @returns {Promise<string|null>} Thumbnail URL (or full URL) of the first
 *   usable result, or null.
 */
const searchWikimediaImage = async (query, searchCache) => {
  const normalizedQuery = String(query || '').trim();
  if (!normalizedQuery) return null;

  if (Object.prototype.hasOwnProperty.call(searchCache, normalizedQuery)) {
    return searchCache[normalizedQuery] || null;
  }

  const apiUrl = `https://commons.wikimedia.org/w/api.php?action=query&generator=search&gsrnamespace=6&gsrsearch=${encodeURIComponent(normalizedQuery)}&gsrlimit=5&prop=imageinfo&iiprop=url&iiurlwidth=1200&format=json`;

  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
  let pages;

  try {
    const response = await fetch(apiUrl, {
      headers: {
        'User-Agent': 'GreenLens-PlantImageImporter/1.0',
        'Accept': 'application/json',
      },
      signal: controller.signal,
    });

    // Throttling / server trouble is transient: report "nothing" but do not
    // poison the cache with it.
    if (!response.ok) return null;

    const data = await response.json();
    pages = data?.query?.pages ? Object.values(data.query.pages) : [];
  } catch {
    // Network failure, timeout or malformed JSON: same treatment as above.
    return null;
  } finally {
    // Runs on every path, so the timer also covers reading the body and is
    // never left pending after a failed request.
    clearTimeout(timeout);
  }

  const rankOf = (page) => (Number.isFinite(page?.index) ? page.index : Number.MAX_SAFE_INTEGER);
  const rankedPages = [...pages].sort((left, right) => rankOf(left) - rankOf(right));

  let found = null;
  for (const page of rankedPages) {
    const imageInfo = page?.imageinfo?.[0];
    const candidate = imageInfo?.thumburl || imageInfo?.url || null;
    if (candidate && /^https?:\/\//i.test(candidate)) {
      found = candidate;
      break;
    }
  }

  searchCache[normalizedQuery] = found;
  saveSearchCache(searchCache);
  return found;
};
|
|
|
|
/**
 * Convert an image buffer to WebP and store it at `outputPath`.
 *
 * The sharp pipeline applies `rotate()` with no arguments, resizes to fit
 * inside 1200x1200 without enlarging, and encodes WebP at quality 82. The
 * result is first written to a temporary file next to the target and then
 * copied over it, so a failed encode never touches an existing image. The
 * temporary file is removed on every path, including failures.
 *
 * @param {Buffer} inputBuffer - Source image bytes.
 * @param {string} outputPath - Destination .webp path.
 * @returns {Promise<void>}
 * @throws Whatever sharp or the file-system calls throw.
 */
const convertToWebp = async (inputBuffer, outputPath) => {
  const tempPath = `${outputPath}.tmp-${process.pid}-${Date.now()}.webp`;

  try {
    await sharp(inputBuffer)
      .rotate()
      .resize({
        width: 1200,
        height: 1200,
        fit: 'inside',
        withoutEnlargement: true,
      })
      .webp({ quality: 82 })
      .toFile(tempPath);

    fs.copyFileSync(tempPath, outputPath);
  } finally {
    try {
      fs.unlinkSync(tempPath);
    } catch (error) {
      // The temp file may never have been created if encoding failed early.
      if (error?.code !== 'ENOENT') throw error;
    }
  }
};
|
|
|
|
/**
 * Point a plant row at its local image: sets `imageUri`, sets `imageStatus`
 * to 'ok' and stamps `updatedAt` with the database's current time.
 *
 * @param {*} db - Database handle passed through to `run`.
 * @param {*} plantId - Value matched against `plants.id`.
 * @param {string} localImageUri - New `imageUri` value.
 * @returns {Promise<void>}
 */
const updatePlantImageUri = async (db, plantId, localImageUri) => {
  const sql = "UPDATE plants SET imageUri = ?, imageStatus = ?, updatedAt = datetime('now') WHERE id = ?";
  const params = [localImageUri, 'ok', plantId];
  await run(db, sql, params);
};
|
|
|
|
/**
 * Make sure one plant has a local WebP image and record the outcome.
 *
 * Outcomes:
 *  - 'existing'   - the target file is already on disk and no refresh was
 *                   requested; only the database row is (re)pointed at it.
 *  - 'skipped'    - the stored imageUri is neither an http(s) URL nor a
 *                   `wikimedia-search:` placeholder, so there is nothing to
 *                   download from.
 *  - 'downloaded' - an image was fetched, converted and stored.
 *
 * Download sources are tried in this order, stopping at the first success:
 * the stored URL, its Special:FilePath form, the dump fallback URL, its
 * Special:FilePath form, and finally a Wikimedia search result (placeholder
 * query, then botanical name, then common name) plus its Special:FilePath
 * form. The search API is only contacted when every direct source failed.
 *
 * One manifest entry is appended to `manifestItems` for each of the three
 * outcomes above; failures are reported by throwing.
 *
 * @param {*} db - Database handle for updatePlantImageUri.
 * @param {{id: *, name: *, botanicalName: *, imageUri: *}} plant - Plant row.
 * @param {Array<Object>} manifestItems - Mutable list of manifest entries.
 * @param {Map<string, string>} dumpFallbackMap - botanical name -> URL.
 * @param {Object} searchCache - Cache passed to searchWikimediaImage.
 * @param {Set<string>} refreshMatchers - Selectors for forced re-download.
 * @returns {Promise<{status: string, plantId: *, localImageUri: string}>}
 * @throws {Error} The last download error, or 'Image download failed' when
 *   there was no candidate to try; conversion and database errors propagate.
 */
const processPlant = async (db, plant, manifestItems, dumpFallbackMap, searchCache, refreshMatchers) => {
  const currentUri = String(plant.imageUri || '').trim();
  const currentIsRemote = /^https?:\/\//i.test(currentUri);
  const placeholderQuery = parseWikimediaSearchQuery(currentUri);
  const fileName = `${buildFileBaseName(plant)}.webp`;
  const localImageUri = `/plants/${fileName}`;
  const outputPath = path.join(OUTPUT_DIR, fileName);
  const dumpFallbackUri = dumpFallbackMap.get(String(plant.botanicalName || '').trim().toLowerCase()) || null;
  const shouldRefresh = shouldRefreshPlantImage(plant, refreshMatchers);

  // Manifest entries share one shape; only source, status and extras vary.
  const toManifestItem = (sourceUri, status, extra = {}) => ({
    id: plant.id,
    botanicalName: plant.botanicalName,
    name: plant.name,
    sourceUri,
    localImageUri,
    status,
    ...extra,
  });

  if (fs.existsSync(outputPath) && !shouldRefresh) {
    await updatePlantImageUri(db, plant.id, localImageUri);
    manifestItems.push(toManifestItem(currentUri, 'existing'));
    return { status: 'existing', plantId: plant.id, localImageUri };
  }

  // NOTE(review): a plant whose imageUri was already rewritten to a local
  // path also lands here when a refresh is requested, so it is skipped
  // rather than re-downloaded - confirm whether that is intended.
  if (!currentIsRemote && !placeholderQuery) {
    manifestItems.push(toManifestItem(currentUri, 'skipped', {
      reason: 'Current imageUri is not a remote URL and no local file exists yet.',
    }));
    return { status: 'skipped', plantId: plant.id, localImageUri };
  }

  const attempted = new Set();
  let lastError = null;

  // Try each not-yet-attempted URI in order; resolve with the first success.
  const downloadFirst = async (uris) => {
    for (const uri of uris) {
      if (!uri || attempted.has(uri)) continue;
      attempted.add(uri);
      try {
        return { buffer: await fetchImageBuffer(uri), sourceUri: uri };
      } catch (error) {
        lastError = error;
      }
    }
    return null;
  };

  let download = await downloadFirst([
    currentIsRemote ? currentUri : null,
    currentIsRemote ? toWikimediaFilePathUrl(currentUri) : null,
    dumpFallbackUri,
    toWikimediaFilePathUrl(dumpFallbackUri),
  ]);

  if (!download) {
    // Search results have the lowest priority, so the API is only queried
    // once every direct source has failed.
    const searchedUri = await searchWikimediaImage(placeholderQuery, searchCache)
      || await searchWikimediaImage(plant.botanicalName, searchCache)
      || await searchWikimediaImage(plant.name, searchCache);

    download = await downloadFirst([searchedUri, toWikimediaFilePathUrl(searchedUri)]);
  }

  if (!download) {
    throw lastError || new Error('Image download failed');
  }

  await convertToWebp(download.buffer, outputPath);
  await updatePlantImageUri(db, plant.id, localImageUri);
  manifestItems.push(toManifestItem(download.sourceUri, 'downloaded'));

  // Fixed pause after every successful download.
  await sleep(900);
  return { status: 'downloaded', plantId: plant.id, localImageUri };
};
|
|
|
|
/**
 * Run an async worker over every item with a bounded number in flight.
 *
 * - Every item is processed, including falsy ones such as 0, '' or null.
 * - `concurrency` is rounded down; NaN, zero and negative values are treated
 *   as 1 so that work is never silently skipped.
 * - The returned array is in input order regardless of completion order.
 * - If the worker rejects, the returned promise rejects with that error.
 *
 * @param {Iterable<*>} items - Items to process.
 * @param {function(*): Promise<*>} worker - Called with one item at a time.
 * @param {number} concurrency - Maximum number of concurrent worker calls.
 * @returns {Promise<Array<*>>} Worker results, index-aligned with `items`.
 */
const runWithConcurrency = async (items, worker, concurrency) => {
  const queue = [...items];
  const results = new Array(queue.length);

  const requested = Math.floor(Number(concurrency));
  const laneCount = Math.min(Number.isNaN(requested) || requested < 1 ? 1 : requested, queue.length);

  // Shared cursor: claiming an index is synchronous, so no two lanes can
  // ever pick the same item.
  let nextIndex = 0;
  const runLane = async () => {
    while (nextIndex < queue.length) {
      const index = nextIndex;
      nextIndex += 1;
      results[index] = await worker(queue[index]);
    }
  };

  await Promise.all(Array.from({ length: laneCount }, runLane));
  return results;
};
|
|
|
|
/**
 * Entry point of the import: prepares a local image for every plant row,
 * writes the manifest and prints a summary.
 *
 * Steps: create the output directory, load the dump fallbacks / search cache
 * / refresh selectors, open the database, ensure the plant schema, then run
 * processPlant over all plants with at most MAX_CONCURRENCY in flight. A
 * failing plant is logged and recorded (in `failures` and as a 'failed'
 * manifest item) without aborting the run. `process.exitCode` is set to 1
 * when at least one plant failed. The database is closed on every path.
 *
 * The previous manifest is not read: each run writes a complete new one.
 *
 * @returns {Promise<void>}
 */
const main = async () => {
  ensureOutputDir();
  const manifestItems = [];
  const dumpFallbackMap = loadDumpFallbackMap();
  const searchCache = loadSearchCache();
  const refreshMatchers = loadRefreshMatchers();
  const db = await openDatabase();

  try {
    await ensurePlantSchema(db);
    const plants = await all(
      db,
      `SELECT id, name, botanicalName, imageUri
       FROM plants
       ORDER BY name COLLATE NOCASE ASC`,
    );

    console.log(`Preparing ${plants.length} plant images...`);

    const failures = [];
    let completed = 0;

    await runWithConcurrency(
      plants,
      async (plant) => {
        try {
          const result = await processPlant(db, plant, manifestItems, dumpFallbackMap, searchCache, refreshMatchers);
          completed += 1;
          console.log(`[${completed}/${plants.length}] ${plant.botanicalName} -> ${result.status}`);
          return result;
        } catch (error) {
          completed += 1;
          const message = error instanceof Error ? error.message : String(error);
          console.error(`[${completed}/${plants.length}] ${plant.botanicalName} -> failed: ${message}`);
          failures.push({
            id: plant.id,
            name: plant.name,
            botanicalName: plant.botanicalName,
            sourceUri: plant.imageUri,
            error: message,
          });
          manifestItems.push({
            id: plant.id,
            botanicalName: plant.botanicalName,
            name: plant.name,
            sourceUri: plant.imageUri,
            status: 'failed',
            error: message,
          });
          return { status: 'failed', plantId: plant.id };
        }
      },
      MAX_CONCURRENCY,
    );

    // One pass over the manifest instead of one filter per status.
    const tally = { downloaded: 0, existing: 0, skipped: 0 };
    for (const { status } of manifestItems) {
      if (status in tally) tally[status] += 1;
    }
    const downloadedCount = tally.downloaded;
    const existingCount = tally.existing;
    const skippedCount = tally.skipped;

    saveManifest({
      generatedAt: new Date().toISOString(),
      summary: {
        totalPlants: plants.length,
        downloadedCount,
        existingCount,
        skippedCount,
        failureCount: failures.length,
      },
      failures,
      items: manifestItems,
    });

    console.log('');
    console.log(`Downloaded: ${downloadedCount}`);
    console.log(`Already present: ${existingCount}`);
    console.log(`Skipped: ${skippedCount}`);
    console.log(`Failed: ${failures.length}`);
    console.log(`Manifest: ${MANIFEST_PATH}`);

    if (failures.length > 0) {
      // Signal failure without cutting the process short.
      process.exitCode = 1;
    }
  } finally {
    await closeDatabase(db);
  }
};
|
|
|
|
// Last-resort handler: print the failure (stack trace when available) and
// terminate with exit status 1.
const reportFatalError = (error) => {
  console.error('Plant image import failed.');
  const details = error instanceof Error ? error.stack || error.message : String(error);
  console.error(details);
  process.exit(1);
};

main().catch(reportFatalError);
|