// Source: bizmatch-project/crawler/index.ts (173 lines, 6.5 KiB, TypeScript)

// import puppeteer, { Browser, ElementHandle, Page } from 'puppeteer-core';
import puppeteer, { Browser, ElementHandle, Page } from 'puppeteer';
import currency from 'currency.js';
import fs from 'fs-extra'
/** A display name paired with the string value it maps to. */
export interface KeyValue {
  name: string;
  value: string;
}

/**
 * Known business categories.
 *
 * `extractListingData` matches a listing's category text against `name`
 * (case-insensitively) and emits the matching `value` as the listing's `type`.
 */
const typesOfBusiness: Array<KeyValue> = (
  [
    ['Automotive', '1'],
    ['Industrial Services', '2'],
    ['Real Estate', '3'],
    ['Uncategorized', '4'],
    ['Retail', '5'],
    ['Oilfield SVE and MFG.', '6'],
    ['Service', '7'],
    ['Advertising', '8'],
    ['Agriculture', '9'],
    ['Franchise', '10'],
    ['Professional', '11'],
    ['Manufacturing', '12'],
    ['Food and Restaurant', '13'],
  ] as Array<[string, string]>
).map(([name, value]) => ({ name, value }));
/**
 * Collects the text fragments that sit next to `elementHandle` in the DOM.
 *
 * The text-node children of the element's parent are read; when the parent has
 * a direct `PRE` child, the text-node children of that `PRE` are read instead.
 * Text nodes whose value is exactly `"\n"` are skipped and the first `"\n"` of
 * every remaining fragment is removed.
 *
 * @param elementHandle Handle of the label element, or `null`.
 * @returns `null` when no handle was given or the element has no parent; a
 *   single string when fewer than two fragments were found (the empty string
 *   when there were none); otherwise the array of fragments.
 */
async function getParentElementText(
  elementHandle: ElementHandle<Element> | null
): Promise<string | string[] | null> {
  if (!elementHandle) {
    return null;
  }

  // The callback is handed to `evaluate`, so it must not reference anything
  // from the enclosing scope.
  const fragments = await elementHandle.evaluate((el) => {
    const collectText = (nodes: Node[]): string[] => {
      const collected: string[] = [];
      for (const node of nodes) {
        const value = node.nodeValue;
        if (node.nodeType !== Node.TEXT_NODE || value === null || value === '\n') {
          continue;
        }
        // Only the first newline is dropped; later ones are kept as-is.
        collected.push(value.replace('\n', ''));
      }
      return collected;
    };

    const parent = el.parentElement;
    if (!parent) return null;

    const siblings = Array.from(parent.childNodes);
    const pre = siblings.find((child) => child.nodeName === 'PRE');
    return collectText(pre ? Array.from(pre.childNodes) : siblings);
  });

  if (!fragments) {
    return null;
  }
  // Zero or one fragment collapses to a plain string; several stay an array.
  return fragments.length < 2 ? fragments.join() : fragments;
}
/**
 * Scrapes one listing detail page into a plain listing object.
 *
 * For every known label the matching `div.sub-title` element is looked up and
 * the text next to it is collected via `getParentElementText`. The category
 * text is then matched (trimmed, case-insensitively) against
 * `typesOfBusiness`.
 *
 * @param page Detail page that has already been navigated to the listing.
 * @returns The listing object, or `null` when the category is missing or
 *   unknown, or when building the listing fails.
 * @throws Whatever `page.$eval('div.title', …)` or `findElementWithText`
 *   throw; those calls are not guarded here.
 */
async function extractListingData(page: Page): Promise<any | null> {
  const labels = {
    summaryLabel: 'Summary',
    descriptionLabel: 'Description',
    categoryLabel: 'Category:',
    locationLabel: 'Located in:',
    askingPriceLabel: 'Asking Price:',
    realEstateLabel: 'Real Estate Included:',
    salesRevenueLabel: 'Sales revenue:',
    cashflowLabel: 'Cash flow:',
    inventoryLabel: 'Inventory:',
    brokerLabel: 'Broker licensing:',
    reasonLabel: 'Reason for sale:',
    employeesLabel: 'Employees:',
  };

  const rawTitle = await page.$eval('div.title', (el) => el.textContent);
  const title = (rawTitle ?? '').trim();

  // Label text -> collected value. A key stays undefined when reading it failed.
  const content: Record<string, any> = {};
  for (const key of Object.values(labels)) {
    const element = await findElementWithText(page, 'div.sub-title', key);
    try {
      content[key] = element ? await getParentElementText(element) : 'N/A';
    } catch (error) {
      console.log(`Fehler bei : ${key}`);
    }
  }

  const rawCategory = content[labels.categoryLabel];
  let categoryType: KeyValue | undefined;
  if (rawCategory) {
    // The value may be an array of fragments; only a single string can name a
    // category, so anything else is treated as "no match" instead of crashing
    // on `.toLowerCase()`.
    if (typeof rawCategory === 'string') {
      const wanted = rawCategory.trim().toLowerCase();
      categoryType = typesOfBusiness.find((t) => t.name.toLowerCase() === wanted);
    }
  } else {
    console.log(`---> No Category ...`);
  }
  if (!categoryType) {
    // Listings without a known category are rejected explicitly; the same
    // log lines as before are emitted.
    console.log(`---> ${rawCategory}`);
    console.log(`Fehler bei ${title}`);
    return null;
  }

  // Summary and description are always emitted as arrays.
  const asArray = (value: unknown): unknown[] => (Array.isArray(value) ? value : [value]);

  try {
    return {
      id: 'NA',
      userId: '1',
      listingsCategory: 'business',
      title: title,
      summary: asArray(content[labels.summaryLabel]),
      description: asArray(content[labels.descriptionLabel]),
      type: categoryType.value,
      location: content[labels.locationLabel],
      price: currency(content[labels.askingPriceLabel]).value,
      salesRevenue: currency(content[labels.salesRevenueLabel]).value,
      cashFlow: currency(content[labels.cashflowLabel]).value,
      brokerLicencing: content[labels.brokerLabel],
      established: null,
      realEstateIncluded: content[labels.realEstateLabel] === 'Yes',
      inventory: content[labels.inventoryLabel],
      employees: content[labels.employeesLabel],
      reasonForSale: content[labels.reasonLabel],
      internals: '',
    };
  } catch (error) {
    console.log(`Fehler bei ${title}`);
    return null;
  }
}
/**
 * Finds the first element matching `selector` whose trimmed text content
 * equals `text`.
 *
 * `page.evaluateHandle` always resolves to a handle object, even when the
 * in-page lookup finds nothing, so the raw handle cannot be used as a
 * "found / not found" flag. The handle is therefore narrowed with
 * `asElement()`, which yields `null` for anything that is not an element.
 *
 * @param page Page to search.
 * @param selector CSS selector of the candidate elements.
 * @param text Exact (trimmed) text the element must contain.
 * @returns The element handle, or `null` when no element matched.
 */
async function findElementWithText(
  page: Page,
  selector: string,
  text: string
): Promise<ElementHandle<Element> | null> {
  const handle = await page.evaluateHandle(
    (cssSelector: string, wantedText: string) =>
      Array.from(document.querySelectorAll(cssSelector)).find(
        (candidate) => candidate.textContent?.trim() === wantedText
      ),
    selector,
    text
  );

  const element = handle.asElement();
  if (!element) {
    // Nothing matched: release the handle that wraps `undefined`.
    await handle.dispose();
    return null;
  }
  return element as ElementHandle<Element>;
}
/**
 * Crawls a results page and every page reachable through its `a.next` link.
 *
 * For each `div.ResultsGridItem` the `a.viewListing` link is opened in its own
 * tab, scraped with `extractListingData`, and the result (if any) is logged
 * and appended to `out`.
 *
 * Pagination is a loop rather than recursion, so each results page is closed
 * before the next one is opened. Pages are closed in `finally`, so an error
 * does not leave tabs behind. Link targets are resolved against the current
 * page URL, which handles relative links and absolute `http://` links alike.
 * A results URL that was already crawled is not visited again.
 *
 * @param browser Browser used to open result and detail tabs.
 * @param url URL of the first results page.
 * @param out Array that receives the scraped listings (mutated in place).
 */
async function processPage(browser: Browser, url: string, out: Array<any>): Promise<void> {
  const visited = new Set<string>();
  let nextUrl: string | null = url;

  while (nextUrl !== null && !visited.has(nextUrl)) {
    visited.add(nextUrl);
    const page = await browser.newPage();
    try {
      await page.goto(nextUrl, { waitUntil: 'domcontentloaded' });

      const listings = await page.$$('div.ResultsGridItem');
      for (const listing of listings) {
        const detailLinkElement = await listing.$('a.viewListing');
        if (!detailLinkElement) continue;

        const detailHref = await detailLinkElement.evaluate((el) => el.getAttribute('href'));
        if (!detailHref) continue; // anchor without a target: nothing to open

        const detailPage = await browser.newPage();
        try {
          await detailPage.goto(new URL(detailHref, page.url()).href, { waitUntil: 'domcontentloaded' });
          const listingData = await extractListingData(detailPage);
          if (listingData) {
            console.log(JSON.stringify(listingData));
            out.push(listingData);
          }
        } finally {
          await detailPage.close();
        }
      }

      const nextPageElement = await page.$('a.next');
      const nextHref = nextPageElement
        ? await nextPageElement.evaluate((el) => el.getAttribute('href'))
        : null;
      nextUrl = nextHref ? new URL(nextHref, page.url()).href : null;
    } finally {
      await page.close();
    }
  }
}
// Entry point: crawl all result pages starting at the bizmatch results URL and
// write the collected listings to ./listings.json. The file is written only
// when the crawl completes; the browser is closed whether or not it does.
(async () => {
  // NOTE(review): Puppeteer documents `devtools: true` as forcing non-headless
  // mode, which would conflict with `headless: true` here — confirm which one
  // is intended.
  const browser = await puppeteer.launch({ headless: true, executablePath: '/snap/bin/chromium', devtools: true, slowMo: 50 });
  //const browser = await puppeteer.launch({devtools: true});
  try {
    const out: Array<any> = [];
    await processPage(browser, 'https://www.bizmatch.net/results', out);
    await fs.writeJson('./listings.json', out);
  } finally {
    await browser.close();
  }
})().catch((error) => {
  // Report the failure and signal it through the exit code rather than
  // leaving the promise rejection unhandled.
  console.error(error);
  process.exitCode = 1;
});