// import puppeteer, { Browser, ElementHandle, Page } from 'puppeteer-core';
import puppeteer, { Browser, ElementHandle, Page } from 'puppeteer';
import { BusinessListing } from "../common-models/src/main.model"
import currency from 'currency.js';
import fs from 'fs-extra'

/** A display name paired with the string value stored for it. */
export interface KeyValue {
  name: string;
  value: string;
}

/**
 * Known business categories. The scraped "Category:" text is matched
 * case-insensitively against `name`; the matching `value` becomes the
 * listing's `type`.
 */
const typesOfBusiness: Array<KeyValue> = [
  { name: 'Automotive', value: '1' },
  { name: 'Industrial Services', value: '2' },
  { name: 'Real Estate', value: '3' },
  { name: 'Uncategorized', value: '4' },
  { name: 'Retail', value: '5' },
  { name: 'Oilfield SVE and MFG.', value: '6' },
  { name: 'Service', value: '7' },
  { name: 'Advertising', value: '8' },
  { name: 'Agriculture', value: '9' },
  { name: 'Franchise', value: '10' },
  { name: 'Professional', value: '11' },
  { name: 'Manufacturing', value: '12' },
  { name: 'Food and Restaurant', value: '13' },
];

/** Text scraped for one labelled field: a single string, several text nodes, or nothing. */
type FieldText = string | string[] | null;

/**
 * Collects the text nodes that sit next to a label element.
 *
 * The text is read from the label's parent element, or from the first `PRE`
 * child of that parent when one exists. Text nodes equal to "\n" are skipped
 * and the first "\n" of every other text node is removed.
 *
 * @param elementHandle Handle of the label element, or `null`.
 * @returns `null` when there is no handle or no parent element; a string when
 *   fewer than two text nodes were found (empty string for none); otherwise
 *   the array of text-node strings.
 */
async function getParentElementText(elementHandle: ElementHandle | null): Promise<FieldText> {
  if (!elementHandle) {
    return null;
  }

  // The callback is serialized and executed inside the page, so it must not
  // reference anything from the surrounding Node.js scope.
  const textContent = await elementHandle.evaluate((el) => {
    const getText = (nodes: Node[]): string[] => {
      const result: string[] = [];
      for (const node of nodes) {
        const value = node.nodeValue;
        if (node.nodeType === Node.TEXT_NODE && value !== null && value !== '\n') {
          // String pattern: only the first newline is removed.
          result.push(value.replace('\n', ''));
        }
      }
      return result;
    };

    const parent = el.parentElement;
    if (!parent) return null;

    const children = Array.from(parent.childNodes);
    const pre = children.find((child) => child.nodeName === 'PRE');
    return getText(pre ? Array.from(pre.childNodes) : children);
  });

  if (!textContent) {
    return null;
  }
  return textContent.length < 2 ? textContent.join() : textContent;
}

/**
 * Scrapes one listing detail page into a listing object.
 *
 * Every label in `labels` is looked up among the `div.sub-title` elements;
 * a label that is not present on the page yields the value 'N/A'.
 *
 * @param page An already-navigated detail page.
 * @returns The listing, or `null` when the category is missing or not one of
 *   `typesOfBusiness`, or when building the listing fails.
 * @throws When `div.title` is not present on the page (from `page.$eval`).
 */
async function extractListingData(page: Page): Promise<BusinessListing | null> {
  const labels = {
    summaryLabel: 'Summary',
    descriptionLabel: 'Description',
    categoryLabel: 'Category:',
    locationLabel: 'Located in:',
    askingPriceLabel: 'Asking Price:',
    realEstateLabel: 'Real Estate Included:',
    salesRevenueLabel: 'Sales revenue:',
    cashflowLabel: 'Cash flow:',
    inventoryLabel: 'Inventory:',
    brokerLabel: 'Broker licensing:',
    reasonLabel: 'Reason for sale:',
    employeesLabel: 'Employees:',
  };

  const rawTitle = await page.$eval('div.title', (el) => el.textContent);
  const title = (rawTitle ?? '').trim();

  // Keyed by label text; a key stays unset when reading that field failed.
  const content: Record<string, FieldText | undefined> = {};
  for (const key of Object.values(labels)) {
    const element = await findElementWithText(page, 'div.sub-title', key);
    try {
      content[key] = element ? await getParentElementText(element) : 'N/A';
    } catch (error) {
      console.log(`Fehler bei : ${key}`);
    }
  }

  const category = content[labels.categoryLabel];
  let categoryType: KeyValue | undefined;
  if (!category) {
    console.log(`---> No Category ...`);
  } else if (typeof category === 'string') {
    // Only a plain string can be matched; a multi-node (array) category is
    // treated as unknown instead of crashing on `.toLowerCase()`.
    categoryType = typesOfBusiness.find((t) => t.name.toLowerCase() === category.toLowerCase());
  }
  if (!categoryType) {
    // Listings without a known category are skipped.
    console.log(`---> ${category}`);
    console.log(`Fehler bei ${title}`);
    return null;
  }

  // currency.js yields 0 for non-string input by default, so non-string
  // fields (array, null, unset) map to 0 explicitly.
  const toAmount = (raw: FieldText | undefined): number =>
    typeof raw === 'string' ? currency(raw).value : 0;
  // Summary and description are always stored as arrays.
  const toList = (raw: FieldText | undefined) => (Array.isArray(raw) ? raw : [raw]);

  try {
    // NOTE(review): the `as any` cast is kept from the original; the literal is
    // not checked against BusinessListing, whose shape is not visible here —
    // confirm field names (e.g. `brokerLicencing`) against the model.
    const listing = {
      id: 'NA',
      userId: '1',
      listingsCategory: 'business',
      title: title,
      summary: toList(content[labels.summaryLabel]),
      description: toList(content[labels.descriptionLabel]),
      type: categoryType.value,
      location: content[labels.locationLabel],
      price: toAmount(content[labels.askingPriceLabel]),
      salesRevenue: toAmount(content[labels.salesRevenueLabel]),
      cashFlow: toAmount(content[labels.cashflowLabel]),
      brokerLicencing: content[labels.brokerLabel],
      established: null,
      realEstateIncluded: content[labels.realEstateLabel] === 'Yes',
      inventory: content[labels.inventoryLabel],
      employees: content[labels.employeesLabel],
      reasonForSale: content[labels.reasonLabel],
      internals: '',
    } as any;
    return listing;
  } catch (error) {
    console.log(`Fehler bei ${title}`);
    return null;
  }
}

/**
 * Finds the first element matching `selector` whose trimmed text content
 * equals `text`.
 *
 * @returns The element handle, or `null` when no element matches.
 */
async function findElementWithText(
  page: Page,
  selector: string,
  text: string
): Promise<ElementHandle<Element> | null> {
  const handle = await page.evaluateHandle(
    (selector, text) => {
      const elements = Array.from(document.querySelectorAll(selector));
      return elements.find((element) => element.textContent?.trim() === text);
    },
    selector,
    text
  );

  // evaluateHandle always resolves to a (truthy) JSHandle, even when the page
  // function returned undefined; asElement() is what tells the cases apart.
  const element = handle.asElement() as ElementHandle<Element> | null;
  if (!element) {
    await handle.dispose();
  }
  return element;
}

/**
 * Crawls a results page and every following page reachable through `a.next`,
 * appending each successfully extracted listing to `out`.
 *
 * Each results page and each detail page is closed as soon as it has been
 * processed, including when an error is thrown.
 *
 * @param browser Browser used to open the pages.
 * @param url URL of the first results page.
 * @param out Array that receives the extracted listings.
 */
async function processPage(browser: Browser, url: string, out: Array<BusinessListing>) {
  let nextUrl: string | null = url;

  while (nextUrl !== null) {
    const page = await browser.newPage();
    try {
      await page.goto(nextUrl, { waitUntil: 'domcontentloaded' });

      const listings = await page.$$('div.ResultsGridItem');
      for (const listing of listings) {
        const detailLinkElement = await listing.$('a.viewListing');
        if (!detailLinkElement) continue;

        const detailHref = await detailLinkElement.evaluate((el) => el.getAttribute('href'));
        if (detailHref === null) continue;

        const detailPage = await browser.newPage();
        try {
          // Resolve against the current page so relative links work too.
          await detailPage.goto(new URL(detailHref, page.url()).href, { waitUntil: 'domcontentloaded' });
          const listingData = await extractListingData(detailPage);
          if (listingData) {
            console.log(JSON.stringify(listingData));
            out.push(listingData);
          }
        } finally {
          await detailPage.close();
        }
      }

      const nextPageElement = await page.$('a.next');
      const nextHref = nextPageElement
        ? await nextPageElement.evaluate((el) => el.getAttribute('href'))
        : null;
      nextUrl = nextHref === null ? null : new URL(nextHref, page.url()).href;
    } finally {
      await page.close();
    }
  }
}

// Entry point: crawl all result pages and write the listings to ./listings.json.
(async () => {
  // NOTE(review): `devtools: true` is combined with `headless: true` — confirm
  // which of the two is intended.
  const browser = await puppeteer.launch({ headless: true, executablePath: '/snap/bin/chromium', devtools: true, slowMo: 50 });
  //const browser = await puppeteer.launch({devtools: true});
  try {
    const out: Array<BusinessListing> = [];
    await processPage(browser, 'https://www.bizmatch.net/results', out);
    await fs.writeJson('./listings.json', out);
  } finally {
    // Close the browser even when the crawl or the write fails.
    await browser.close();
  }
})();