From 995468fa30a5242774fdc1955d24463558803eca Mon Sep 17 00:00:00 2001
From: Andreas Knuth
Date: Sun, 9 Nov 2025 16:17:44 -0600
Subject: [PATCH] asds

---
 crawler/changeUserId.js |  52 +++++++++
 crawler/import.js       |  36 ++++++
 crawler/index.js        | 239 ++++++++++++++++++++++++++++++++++++++++
 crawler/updateFields.js |  53 +++++++++
 4 files changed, 380 insertions(+)
 create mode 100644 crawler/changeUserId.js
 create mode 100644 crawler/import.js
 create mode 100644 crawler/index.js
 create mode 100644 crawler/updateFields.js

diff --git a/crawler/changeUserId.js b/crawler/changeUserId.js
new file mode 100644
index 0000000..d7b3fde
--- /dev/null
+++ b/crawler/changeUserId.js
@@ -0,0 +1,52 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const yargs_1 = __importDefault(require("yargs"));
+const helpers_1 = require("yargs/helpers");
+const argv = (0, yargs_1.default)((0, helpers_1.hideBin)(process.argv)).argv;
+if (!argv.userId) {
+    console.log(' --userId [any valid userId]');
+    process.exit(1);
+}
+// Assigns every listing returned by the local bizmatch API to --userId and
+// resets its created/updated timestamps to the time of this run.
+(() => __awaiter(void 0, void 0, void 0, function* () {
+    const response = yield fetch('http://localhost:3000/bizmatch/listings', {
+        method: 'GET',
+        headers: { 'Content-Type': 'application/json' },
+    });
+    if (!response.ok) {
+        throw new Error(`GET listings failed with status ${response.status}`);
+    }
+    const listings = yield response.json();
+    for (const listing of listings) {
+        listing.userId = argv.userId;
+        listing.created = new Date();
+        listing.updated = new Date();
+        const updateResponse = yield fetch(`http://localhost:3000/bizmatch/listings/${listing.id}`, {
+            method: 'PUT',
+            body: JSON.stringify(listing),
+            headers: { 'Content-Type': 'application/json' },
+        });
+        // fetch() only rejects on network errors; HTTP error statuses must be checked.
+        if (!updateResponse.ok) {
+            console.error(`PUT listing ${listing.id} failed with status ${updateResponse.status}`);
+            process.exitCode = 1;
+        }
+    }
+}))().catch((error) => {
+    console.error(error);
+    process.exitCode = 1;
+});
+//# sourceMappingURL=changeUserId.js.map
\ No newline at end of file
diff --git a/crawler/import.js b/crawler/import.js
new file mode 100644
index 0000000..33e8190
--- /dev/null
+++ b/crawler/import.js
@@ -0,0 +1,36 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const fs_extra_1 = __importDefault(require("fs-extra"));
+// Posts every listing from ./listings.json to the local bizmatch API, one
+// request at a time.
+(() => __awaiter(void 0, void 0, void 0, function* () {
+    const listings = yield fs_extra_1.default.readJson('./listings.json');
+    for (const listing of listings) {
+        const response = yield fetch('http://localhost:3000/bizmatch/listings', {
+            method: 'POST',
+            body: JSON.stringify(listing),
+            headers: { 'Content-Type': 'application/json' },
+        });
+        // fetch() only rejects on network errors; HTTP error statuses must be checked.
+        if (!response.ok) {
+            console.error(`POST of listing "${listing.title}" failed with status ${response.status}`);
+            process.exitCode = 1;
+        }
+    }
+}))().catch((error) => {
+    console.error(error);
+    process.exitCode = 1;
+});
+//# sourceMappingURL=import.js.map
\ No newline at end of file
diff --git a/crawler/index.js b/crawler/index.js
new file mode 100644
index 0000000..d48b259
--- /dev/null
+++ b/crawler/index.js
@@ -0,0 +1,239 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+const puppeteer_1 = __importDefault(require("puppeteer"));
+const currency_js_1 = __importDefault(require("currency.js"));
+const fs_extra_1 = __importDefault(require("fs-extra"));
+const typesOfBusiness = [
+    { name: 'Automotive', value: '1' },
+    { name: 'Industrial Services', value: '2' },
+    { name: 'Real Estate', value: '3' },
+    { name: 'Uncategorized', value: '4' },
+    { name: 'Retail', value: '5' },
+    { name: 'Oilfield SVE and MFG.', value: '6' },
+    { name: 'Service', value: '7' },
+    { name: 'Advertising', value: '8' },
+    { name: 'Agriculture', value: '9' },
+    { name: 'Franchise', value: '10' },
+    { name: 'Professional', value: '11' },
+    { name: 'Manufacturing', value: '12' },
+    { name: 'Food and Restaurant', value: '13' },
+];
+/**
+ * Collects the text found next to a label element.
+ *
+ * Reads the text nodes of the label's parent element, or of a PRE child of
+ * that parent when one exists, skipping nodes that are a single newline.
+ *
+ * @param elementHandle Puppeteer handle of the label element; may be null.
+ * @returns null without a handle or parent; a string when fewer than two
+ *          text nodes were found; otherwise an array of strings.
+ */
+function getParentElementText(elementHandle) {
+    return __awaiter(this, void 0, void 0, function* () {
+        if (!elementHandle) {
+            return null;
+        }
+        const textContent = yield elementHandle.evaluate((el) => {
+            const getText = (nodes) => {
+                const result = [];
+                for (const node of nodes) {
+                    if (node.nodeType === Node.TEXT_NODE && node.nodeValue !== "\n") {
+                        // A string pattern would strip only the first newline.
+                        result.push(node.nodeValue.replace(/\n/g, ''));
+                    }
+                }
+                return result;
+            };
+            const parent = el.parentElement;
+            if (!parent) {
+                return null;
+            }
+            const preResult = Array.from(parent.childNodes).find((e) => e.nodeName === 'PRE');
+            return getText(Array.from(preResult ? preResult.childNodes : parent.childNodes));
+        });
+        return textContent ? (textContent.length < 2 ? textContent.join() : textContent) : null;
+    });
+}
+/**
+ * Scrapes one listing detail page into a plain listing object.
+ *
+ * @param page Puppeteer page that already shows the detail view.
+ * @returns The listing, or null when the page's category matches no entry of
+ *          typesOfBusiness or building the listing throws.
+ */
+function extractListingData(page) {
+    return __awaiter(this, void 0, void 0, function* () {
+        const labels = {
+            summaryLabel: 'Summary',
+            descriptionLabel: 'Description',
+            categoryLabel: 'Category:',
+            locationLabel: 'Located in:',
+            askingPriceLabel: 'Asking Price:',
+            realEstateLabel: 'Real Estate Included:',
+            salesRevenueLabel: 'Sales revenue:',
+            cashflowLabel: 'Cash flow:',
+            inventoryLabel: 'Inventory:',
+            brokerLabel: 'Broker licensing:',
+            reasonLabel: 'Reason for sale:',
+            employeesLabel: 'Employees:',
+        };
+        const title = (yield page.$eval('div.title', (el) => el.textContent)).trim();
+        const content = {};
+        for (const key of Object.values(labels)) {
+            const element = yield findElementWithText(page, 'div.sub-title', key);
+            try {
+                content[key] = element ? yield getParentElementText(element) : 'N/A';
+            }
+            catch (error) {
+                console.log(`Fehler bei : ${key}`);
+            }
+        }
+        const category = content[labels.categoryLabel];
+        let categoryType;
+        // The value may be an array (several text nodes); only a string can be
+        // compared against typesOfBusiness.
+        if (!category) {
+            console.log(`---> No Category ...`);
+        }
+        else if (typeof category === 'string') {
+            categoryType = typesOfBusiness.find((t) => t.name.toLowerCase() === category.toLowerCase());
+        }
+        if (!categoryType) {
+            // Without a known category there is no `type` value; drop the listing
+            // here rather than through a TypeError on `categoryType.value`.
+            console.log(`---> ${category}`);
+            console.log(`Fehler bei ${title}`);
+            return null;
+        }
+        try {
+            const listing = {
+                id: 'NA',
+                userId: '1',
+                listingsCategory: 'business',
+                title: title,
+                summary: Array.isArray(content[labels.summaryLabel]) ? content[labels.summaryLabel] : [content[labels.summaryLabel]],
+                description: Array.isArray(content[labels.descriptionLabel]) ? content[labels.descriptionLabel] : [content[labels.descriptionLabel]],
+                type: categoryType.value,
+                location: content[labels.locationLabel],
+                price: (0, currency_js_1.default)(content[labels.askingPriceLabel]).value,
+                salesRevenue: (0, currency_js_1.default)(content[labels.salesRevenueLabel]).value,
+                cashFlow: (0, currency_js_1.default)(content[labels.cashflowLabel]).value,
+                brokerLicencing: content[labels.brokerLabel],
+                established: null,
+                realEstateIncluded: content[labels.realEstateLabel] === 'Yes',
+                inventory: content[labels.inventoryLabel],
+                employees: content[labels.employeesLabel],
+                reasonForSale: content[labels.reasonLabel],
+                internals: '',
+            };
+            return listing;
+        }
+        catch (error) {
+            console.log(`Fehler bei ${title}`);
+            return null;
+        }
+    });
+}
+/**
+ * Finds the first element matching `selector` whose trimmed text equals `text`.
+ *
+ * @returns The element handle, or null when no element matches.
+ */
+function findElementWithText(page, selector, text) {
+    return __awaiter(this, void 0, void 0, function* () {
+        const handle = yield page.evaluateHandle((selector, text) => {
+            const elements = Array.from(document.querySelectorAll(selector));
+            return elements.find((element) => { var _a; return ((_a = element.textContent) === null || _a === void 0 ? void 0 : _a.trim()) === text; });
+        }, selector, text);
+        // evaluateHandle() always resolves to a handle, even one wrapping
+        // `undefined`; asElement() returns null for such non-element handles so
+        // callers can detect a missing label with a plain truthiness check.
+        const element = handle.asElement();
+        if (!element) {
+            yield handle.dispose();
+        }
+        return element;
+    });
+}
+/**
+ * Crawls a result page and every following page reachable through its
+ * `a.next` link, appending each extracted listing to `out`.
+ *
+ * @param browser Puppeteer browser used to open pages.
+ * @param url URL of the first result page.
+ * @param out Array that receives the listing objects.
+ */
+function processPage(browser, url, out) {
+    return __awaiter(this, void 0, void 0, function* () {
+        let nextUrl = url;
+        // Looping (rather than recursing before page.close()) keeps a single
+        // result page open at a time, however many pages are followed.
+        while (nextUrl) {
+            const page = yield browser.newPage();
+            try {
+                yield page.goto(nextUrl, { waitUntil: 'domcontentloaded' });
+                const listings = yield page.$$('div.ResultsGridItem');
+                for (const listing of listings) {
+                    const detailLinkElement = yield listing.$('a.viewListing');
+                    const detailLink = detailLinkElement
+                        ? yield detailLinkElement.evaluate((el) => el.getAttribute('href'))
+                        : null;
+                    if (!detailLink) {
+                        continue;
+                    }
+                    const detailPage = yield browser.newPage();
+                    try {
+                        // Resolving against the result page also accepts relative hrefs.
+                        yield detailPage.goto(new URL(detailLink, page.url()).href, { waitUntil: 'domcontentloaded' });
+                        const listingData = yield extractListingData(detailPage);
+                        if (listingData) {
+                            console.log(JSON.stringify(listingData));
+                            out.push(listingData);
+                        }
+                    }
+                    finally {
+                        yield detailPage.close();
+                    }
+                }
+                const nextPageElement = yield page.$('a.next');
+                const nextPageLink = nextPageElement
+                    ? yield nextPageElement.evaluate((el) => el.getAttribute('href'))
+                    : null;
+                nextUrl = nextPageLink ? new URL(nextPageLink, page.url()).href : null;
+            }
+            finally {
+                yield page.close();
+            }
+        }
+    });
+}
+// Crawls all bizmatch.net result pages and writes the collected listings to
+// ./listings.json.
+(() => __awaiter(void 0, void 0, void 0, function* () {
+    const browser = yield puppeteer_1.default.launch({ headless: true, executablePath: '/snap/bin/chromium', devtools: true, slowMo: 50 });
+    const out = [];
+    try {
+        yield processPage(browser, 'https://www.bizmatch.net/results', out);
+        yield fs_extra_1.default.writeJson('./listings.json', out);
+    }
+    finally {
+        // Close the browser even when crawling fails so no Chromium process lingers.
+        yield browser.close();
+    }
+}))().catch((error) => {
+    console.error(error);
+    process.exitCode = 1;
+});
+//# sourceMappingURL=index.js.map
\ No newline at end of file
diff --git a/crawler/updateFields.js b/crawler/updateFields.js
new file mode 100644
index 0000000..fe0b13b
--- /dev/null
+++ b/crawler/updateFields.js
@@ -0,0 +1,53 @@
+"use strict";
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+// Replaces each listing's location with the `value` of the select option whose
+// `name` matches it case-insensitively, then writes every listing back.
+(() => __awaiter(void 0, void 0, void 0, function* () {
+    const selectOptionsResponse = yield fetch('http://localhost:3000/bizmatch/select-options', {
+        method: 'GET',
+        headers: { 'Content-Type': 'application/json' },
+    });
+    if (!selectOptionsResponse.ok) {
+        throw new Error(`GET select-options failed with status ${selectOptionsResponse.status}`);
+    }
+    const selectOptions = yield selectOptionsResponse.json();
+    const response = yield fetch('http://localhost:3000/bizmatch/listings', {
+        method: 'GET',
+        headers: { 'Content-Type': 'application/json' },
+    });
+    if (!response.ok) {
+        throw new Error(`GET listings failed with status ${response.status}`);
+    }
+    const listings = yield response.json();
+    for (const listing of listings) {
+        // A missing or non-string location cannot be matched and is left untouched.
+        const location = typeof listing.location === 'string' ? listing.location.toLowerCase() : null;
+        const option = location === null ? undefined : selectOptions.locations.find((l) => l.name.toLowerCase() === location);
+        if (option) {
+            listing.location = option.value;
+        }
+        const updateResponse = yield fetch(`http://localhost:3000/bizmatch/listings/${listing.id}`, {
+            method: 'PUT',
+            body: JSON.stringify(listing),
+            headers: { 'Content-Type': 'application/json' },
+        });
+        // fetch() only rejects on network errors; HTTP error statuses must be checked.
+        if (!updateResponse.ok) {
+            console.error(`PUT listing ${listing.id} failed with status ${updateResponse.status}`);
+            process.exitCode = 1;
+        }
+    }
+}))().catch((error) => {
+    console.error(error);
+    process.exitCode = 1;
+});
+//# sourceMappingURL=updateFields.js.map
\ No newline at end of file