// Source viewer metadata: 173 lines, 6.5 KiB, TypeScript
// import puppeteer, { Browser, ElementHandle, Page } from 'puppeteer-core';
|
|
import puppeteer, { Browser, ElementHandle, Page } from 'puppeteer';
|
|
|
|
import currency from 'currency.js';
|
|
import fs from 'fs-extra'
|
|
|
|
/**
 * A display name paired with the string value stored for it.
 */
export interface KeyValue {
  /** Human-readable label. */
  name: string;
  /** Value associated with the label, kept as a string. */
  value: string;
}
|
|
/**
 * Known business categories. `name` is compared (case-insensitively) with the
 * category text scraped from a listing; `value` is the id written to the
 * listing's `type` field.
 */
const typesOfBusiness: KeyValue[] = [
  { name: 'Automotive', value: '1' },
  { name: 'Industrial Services', value: '2' },
  { name: 'Real Estate', value: '3' },
  { name: 'Uncategorized', value: '4' },
  { name: 'Retail', value: '5' },
  { name: 'Oilfield SVE and MFG.', value: '6' },
  { name: 'Service', value: '7' },
  { name: 'Advertising', value: '8' },
  { name: 'Agriculture', value: '9' },
  { name: 'Franchise', value: '10' },
  { name: 'Professional', value: '11' },
  { name: 'Manufacturing', value: '12' },
  { name: 'Food and Restaurant', value: '13' },
];
|
|
|
|
/**
 * Collects the text nodes that sit beside a label element.
 *
 * The lookup runs inside the page: it takes the parent of the given element
 * and, if that parent has a direct `PRE` child, reads the text nodes of the
 * `PRE`; otherwise it reads the parent's own direct text nodes. Text nodes
 * consisting of a single newline are skipped, and every newline character is
 * removed from the remaining ones.
 *
 * @param elementHandle Handle of the label element, or `null`.
 * @returns `null` when no handle was given or the element has no parent;
 *   a single string when fewer than two text nodes were found (empty string
 *   for none); otherwise the array of texts.
 */
async function getParentElementText(
  elementHandle: ElementHandle<Element> | null,
): Promise<string | string[] | null> {
  if (!elementHandle) return null;

  // The callback is serialized and executed in the browser, so it must not
  // reference anything from the surrounding Node.js scope.
  const texts = await elementHandle.evaluate((el) => {
    const collectText = (nodes: Node[]): string[] => {
      const collected: string[] = [];
      for (const node of nodes) {
        const value = node.nodeValue;
        if (node.nodeType === Node.TEXT_NODE && value !== null && value !== '\n') {
          // A string pattern would only replace the first newline; the
          // global regex strips all of them.
          collected.push(value.replace(/\n/g, ''));
        }
      }
      return collected;
    };

    const parent = el.parentElement;
    if (!parent) return null;

    const pre = Array.from(parent.childNodes).find((child) => child.nodeName === 'PRE');
    return collectText(Array.from(pre ? pre.childNodes : parent.childNodes));
  });

  if (!texts) return null;
  // Zero or one entries collapse to a plain string; several stay an array.
  return texts.length < 2 ? texts.join() : texts;
}
|
|
|
|
/**
 * Scrapes a listing detail page into a plain listing record.
 *
 * For every known label the matching `div.sub-title` element is looked up and
 * the text beside it is collected. A label whose text cannot be read is
 * logged and left undefined. The category text is matched case-insensitively
 * against `typesOfBusiness`.
 *
 * @param page Detail page that has already been navigated to the listing.
 * @returns The listing record, or `null` when the listing's category is
 *   missing or not one of the known business types.
 */
async function extractListingData(page: Page): Promise<any | null> {
  type FieldValue = string | string[] | null | undefined;

  const labels = {
    summaryLabel: 'Summary',
    descriptionLabel: 'Description',
    categoryLabel: 'Category:',
    locationLabel: 'Located in:',
    askingPriceLabel: 'Asking Price:',
    realEstateLabel: 'Real Estate Included:',
    salesRevenueLabel: 'Sales revenue:',
    cashflowLabel: 'Cash flow:',
    inventoryLabel: 'Inventory:',
    brokerLabel: 'Broker licensing:',
    reasonLabel: 'Reason for sale:',
    employeesLabel: 'Employees:',
  };

  // Multi-line fields are always emitted as arrays.
  const toLines = (value: FieldValue) => (Array.isArray(value) ? value : [value]);
  // Only plain strings are parsed as money; anything else counts as 0.
  const toAmount = (value: FieldValue): number =>
    currency(typeof value === 'string' ? value : '').value;

  const rawTitle = await page.$eval('div.title', (el) => el.textContent);
  const title = (rawTitle ?? '').trim();

  const content: Record<string, FieldValue> = {};
  for (const key of Object.values(labels)) {
    const element = await findElementWithText(page, 'div.sub-title', key);
    try {
      content[key] = element ? await getParentElementText(element) : 'N/A';
    } catch (error) {
      console.log(`Fehler bei : ${key}`);
    }
  }

  const rawCategory = content[labels.categoryLabel];
  if (!rawCategory) {
    console.log(`---> No Category ...`);
  }
  // The category may arrive as several text fragments; flatten it to one
  // string before comparing so an array never reaches `toLowerCase`.
  const categoryText = Array.isArray(rawCategory) ? rawCategory.join(' ') : rawCategory ?? '';
  const categoryName = categoryText.trim().toLowerCase();
  const categoryType = categoryName
    ? typesOfBusiness.find((t) => t.name.toLowerCase() === categoryName)
    : undefined;

  if (!categoryType) {
    // Listings without a recognised category are skipped explicitly rather
    // than through an exception on `categoryType.value`.
    console.log(`---> ${rawCategory}`);
    console.log(`Fehler bei ${title}`);
    return null;
  }

  return {
    id: 'NA',
    userId: '1',
    listingsCategory: 'business',
    title,
    summary: toLines(content[labels.summaryLabel]),
    description: toLines(content[labels.descriptionLabel]),
    type: categoryType.value,
    location: content[labels.locationLabel],
    price: toAmount(content[labels.askingPriceLabel]),
    salesRevenue: toAmount(content[labels.salesRevenueLabel]),
    cashFlow: toAmount(content[labels.cashflowLabel]),
    brokerLicencing: content[labels.brokerLabel],
    established: null,
    realEstateIncluded: content[labels.realEstateLabel] === 'Yes',
    inventory: content[labels.inventoryLabel],
    employees: content[labels.employeesLabel],
    reasonForSale: content[labels.reasonLabel],
    internals: '',
  };
}
|
|
|
|
/**
 * Finds the first element matching `selector` whose trimmed text content
 * equals `text` exactly.
 *
 * @param page Page to search.
 * @param selector CSS selector of the candidate elements.
 * @param text Text the element's trimmed `textContent` must equal.
 * @returns A handle to the matching element, or `null` when none matches.
 */
async function findElementWithText(page: Page, selector: string, text: string) {
  const handle = await page.evaluateHandle(
    (sel: string, wanted: string) => {
      const candidates = Array.from(document.querySelectorAll(sel));
      return candidates.find((candidate) => candidate.textContent?.trim() === wanted) ?? null;
    },
    selector,
    text,
  );

  // `evaluateHandle` always yields a (truthy) handle, even when the callback
  // found nothing; `asElement()` is what tells an element from an empty result.
  const element = handle.asElement();
  if (!element) {
    await handle.dispose();
    return null;
  }
  return element;
}
|
|
/**
 * Walks a paginated result list and appends every extracted listing to `out`.
 *
 * Each result page is opened in its own tab; for every `div.ResultsGridItem`
 * with an `a.viewListing` link the detail page is opened, scraped with
 * `extractListingData`, logged as JSON and pushed to `out`. The `a.next` link
 * is then followed until there is none left or it points to a URL that was
 * already processed.
 *
 * @param browser Browser used to open the result and detail tabs.
 * @param url URL of the first result page.
 * @param out Array that receives the extracted listings (mutated in place).
 */
async function processPage(browser: Browser, url: string, out: Array<any>) {
  const visited = new Set<string>();
  let nextUrl: string | null = url;

  // Iterating (instead of recursing) lets every result page be closed before
  // the next one is opened.
  while (nextUrl !== null && !visited.has(nextUrl)) {
    visited.add(nextUrl);

    const page = await browser.newPage();
    try {
      await page.goto(nextUrl, { waitUntil: 'domcontentloaded' });

      const listings = await page.$$('div.ResultsGridItem');
      for (const listing of listings) {
        const detailLinkElement = await listing.$('a.viewListing');
        if (!detailLinkElement) continue;

        const detailHref = await detailLinkElement.evaluate((el) => el.getAttribute('href'));
        if (!detailHref) continue;

        const detailPage = await browser.newPage();
        try {
          // Resolve against the current page so relative links work too.
          const detailUrl = new URL(detailHref, page.url()).href;
          await detailPage.goto(detailUrl, { waitUntil: 'domcontentloaded' });

          const listingData = await extractListingData(detailPage);
          if (listingData) {
            console.log(JSON.stringify(listingData));
            out.push(listingData);
          }
        } finally {
          // Close the tab even when navigation or extraction fails.
          await detailPage.close();
        }
      }

      const nextPageElement = await page.$('a.next');
      const nextHref = nextPageElement
        ? await nextPageElement.evaluate((el) => el.getAttribute('href'))
        : null;
      nextUrl = nextHref ? new URL(nextHref, page.url()).href : null;
    } finally {
      await page.close();
    }
  }
}
|
|
|
|
/**
 * Script entry point: launches the browser, scrapes all result pages starting
 * at the bizmatch results URL and writes the collected listings to
 * `./listings.json`.
 */
(async () => {
  // NOTE(review): `devtools: true` together with `headless: true`, and the
  // hard-coded snap path, look like local debugging settings — confirm.
  const browser = await puppeteer.launch({ headless: true, executablePath: '/snap/bin/chromium', devtools: true, slowMo: 50 });
  //const browser = await puppeteer.launch({devtools: true});
  try {
    const out: any[] = [];
    await processPage(browser, 'https://www.bizmatch.net/results', out);
    await fs.writeJson('./listings.json', out);
  } finally {
    // Always shut the browser down, even when scraping or writing fails.
    await browser.close();
  }
})().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});
|