// bizmatch-project/crawler/index.js
//
// 179 lines
// 8.2 KiB
// JavaScript

"use strict";
/**
 * Async/await helper in the shape emitted by the TypeScript compiler: drives a
 * generator to completion, treating every yielded value as something to wait
 * for, and exposes the generator's outcome as a promise.
 *
 * @param {*} thisArg - `this` value the generator function is invoked with.
 * @param {ArrayLike<*>} [_arguments] - Arguments forwarded to the generator function.
 * @param {PromiseConstructor} [P] - Promise implementation; the global Promise when omitted.
 * @param {GeneratorFunction} generator - Generator function containing the async body.
 * @returns {Promise<*>} Resolves with the generator's return value, rejects with
 *   whatever it throws.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    if (!P) {
        P = Promise;
    }
    // Wrap plain values so that every yielded value can be chained with .then().
    const toPromise = (value) => {
        if (value instanceof P) {
            return value;
        }
        return new P((settle) => { settle(value); });
    };
    return new P((resolve, reject) => {
        // Either finish with the generator's return value or wait for the yielded one.
        const advance = (result) => {
            if (result.done) {
                resolve(result.value);
                return;
            }
            toPromise(result.value).then(onFulfilled, onRejected);
        };
        // Resume the generator with the awaited value.
        const onFulfilled = (value) => {
            try {
                advance(generator.next(value));
            }
            catch (err) {
                reject(err);
            }
        };
        // Re-throw a rejection inside the generator so its try/catch can see it.
        const onRejected = (reason) => {
            try {
                advance(generator["throw"](reason));
            }
            catch (err) {
                reject(err);
            }
        };
        advance((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
/**
 * CommonJS/ES-module interop helper: returns a module flagged with
 * `__esModule` unchanged and wraps anything else as `{ default: mod }`, so the
 * export can always be read through `.default`.
 *
 * @param {*} mod - Value returned by `require`.
 * @returns {*} `mod` itself, or an object exposing it as `default`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
// import puppeteer, { Browser, ElementHandle, Page } from 'puppeteer-core';
const puppeteer_1 = __importDefault(require("puppeteer"));
const currency_js_1 = __importDefault(require("currency.js"));
const fs_extra_1 = __importDefault(require("fs-extra"));
// Lookup table of business categories. `name` is compared (case-insensitively)
// with the category text scraped from a listing; `value` is the string stored
// as the listing's `type`.
const typesOfBusiness = [
    ['Automotive', '1'],
    ['Industrial Services', '2'],
    ['Real Estate', '3'],
    ['Uncategorized', '4'],
    ['Retail', '5'],
    ['Oilfield SVE and MFG.', '6'],
    ['Service', '7'],
    ['Advertising', '8'],
    ['Agriculture', '9'],
    ['Franchise', '10'],
    ['Professional', '11'],
    ['Manufacturing', '12'],
    ['Food and Restaurant', '13'],
].map(([name, value]) => ({ name, value }));
/**
 * Reads the text that sits next to a label element.
 *
 * Inside the page, the label's parent element is inspected: if the parent has
 * a direct <pre> child, the text nodes of that <pre> are used, otherwise the
 * parent's own text nodes. Text nodes whose value is exactly "\n" are skipped.
 *
 * @param {object|null|undefined} elementHandle - Handle exposing
 *   `evaluate(fn)`; `fn` receives the wrapped DOM element.
 * @returns {Promise<string|string[]|null>} `null` when there is no handle, the
 *   handle wraps no element, or the element has no parent; a single string
 *   when fewer than two text nodes were found (an empty string for none);
 *   otherwise the array of text-node values.
 */
function getParentElementText(elementHandle) {
    return __awaiter(this, void 0, void 0, function* () {
        if (!elementHandle) {
            return null;
        }
        const textContent = yield elementHandle.evaluate((el) => {
            // A handle can wrap `undefined` (label lookup without a match);
            // treat that like a missing parent instead of throwing.
            const parent = el ? el.parentElement : null;
            if (!parent) {
                return null;
            }
            const children = Array.from(parent.childNodes);
            const pre = children.find((child) => child.nodeName === 'PRE');
            const candidates = pre ? Array.from(pre.childNodes) : children;
            // NOTE(review): only the first "\n" of each text node is removed;
            // kept as is so the scraped values do not change.
            return candidates
                .filter((node) => node.nodeType === Node.TEXT_NODE && node.nodeValue !== "\n")
                .map((node) => node.nodeValue.replace('\n', ''));
        });
        if (!textContent) {
            return null;
        }
        return textContent.length < 2 ? textContent.join() : textContent;
    });
}
/**
 * Scrapes one listing detail page into a listing record.
 *
 * The title comes from `div.title`; every other field is located through its
 * `div.sub-title` label and read with getParentElementText. A label that is
 * not found yields 'N/A'; a label whose text cannot be read is logged and left
 * unset.
 *
 * @param {object} page - Page object of the listing detail page (must provide
 *   `$eval` and whatever findElementWithText needs).
 * @returns {Promise<object|null>} The listing record, or `null` when the
 *   scraped category does not match any entry of `typesOfBusiness` or the
 *   record cannot be built.
 */
function extractListingData(page) {
    return __awaiter(this, void 0, void 0, function* () {
        const labels = {
            summaryLabel: 'Summary',
            descriptionLabel: 'Description',
            categoryLabel: 'Category:',
            locationLabel: 'Located in:',
            askingPriceLabel: 'Asking Price:',
            realEstateLabel: 'Real Estate Included:',
            salesRevenueLabel: 'Sales revenue:',
            cashflowLabel: 'Cash flow:',
            inventoryLabel: 'Inventory:',
            brokerLabel: 'Broker licensing:',
            reasonLabel: 'Reason for sale:',
            employeesLabel: 'Employees:',
        };
        const currency = currency_js_1.default;
        // Null-safe amount lookup (mirrors the optional chaining of the original code).
        const amountOf = (raw) => {
            const parsed = currency(raw);
            return parsed == null ? undefined : parsed.value;
        };
        // Summary and description are always stored as arrays.
        const asList = (value) => (Array.isArray(value) ? value : [value]);
        const title = (yield page.$eval('div.title', (el) => el.textContent)).trim();
        const content = {};
        for (const key of Object.values(labels)) {
            const element = yield findElementWithText(page, 'div.sub-title', key);
            try {
                content[key] = element ? yield getParentElementText(element) : 'N/A';
            }
            catch (error) {
                console.log(`Fehler bei : ${key}`);
            }
        }
        const category = content[labels.categoryLabel];
        let categoryType;
        if (category) {
            // The scraped value is an array when the category spans several
            // text nodes; only a plain string can be matched by name.
            if (typeof category === 'string') {
                const wanted = category.toLowerCase();
                categoryType = typesOfBusiness.find((t) => t.name.toLowerCase() === wanted);
            }
        }
        else {
            console.log(`---> No Category ...`);
        }
        if (!categoryType) {
            // Without a known category the listing cannot be typed: skip it.
            console.log(`---> ${category}`);
            console.log(`Fehler bei ${title}`);
            return null;
        }
        try {
            return {
                id: 'NA',
                userId: '1',
                listingsCategory: 'business',
                title: title,
                summary: asList(content[labels.summaryLabel]),
                description: asList(content[labels.descriptionLabel]),
                type: categoryType.value,
                location: content[labels.locationLabel],
                price: currency(content[labels.askingPriceLabel]).value,
                salesRevenue: amountOf(content[labels.salesRevenueLabel]),
                cashFlow: amountOf(content[labels.cashflowLabel]),
                brokerLicencing: content[labels.brokerLabel],
                established: null,
                realEstateIncluded: content[labels.realEstateLabel] === 'Yes',
                inventory: content[labels.inventoryLabel],
                employees: content[labels.employeesLabel],
                reasonForSale: content[labels.reasonLabel],
                internals: '',
            };
        }
        catch (error) {
            console.log(`Fehler bei ${title}`);
            return null;
        }
    });
}
/**
 * Finds the first element matching `selector` whose trimmed text content
 * equals `text`.
 *
 * `page.evaluateHandle` always resolves to a handle object, even when the
 * in-page lookup yields `undefined`, so the handle itself cannot be used as a
 * "found" flag. The handle is therefore narrowed with `asElement()`, and an
 * empty handle is disposed instead of being handed to the caller.
 *
 * @param {object} page - Page object providing `evaluateHandle`.
 * @param {string} selector - CSS selector of the candidate elements.
 * @param {string} text - Exact (trimmed) text the element must contain.
 * @returns {Promise<object|null>} Handle of the matching element, or `null`
 *   when no element matches.
 */
function findElementWithText(page, selector, text) {
    return __awaiter(this, void 0, void 0, function* () {
        const handle = yield page.evaluateHandle((cssSelector, wanted) => {
            const candidates = Array.from(document.querySelectorAll(cssSelector));
            return candidates.find((candidate) => {
                const content = candidate.textContent;
                return (content === null || content === void 0 ? void 0 : content.trim()) === wanted;
            });
        }, selector, text);
        const element = handle.asElement();
        if (!element) {
            yield handle.dispose();
            return null;
        }
        return element;
    });
}
/**
 * Crawls a results page and every following results page, appending the
 * listing record of each linked detail page to `out`.
 *
 * Result pages are walked in a loop and each one is closed before the next is
 * opened, so at most one results tab and one detail tab are open at a time.
 * Tabs are closed in `finally` blocks; errors still propagate to the caller.
 *
 * @param {object} browser - Browser object providing `newPage()`.
 * @param {string} url - Absolute URL of the first results page.
 * @param {object[]} out - Array that receives the extracted listings (mutated).
 * @returns {Promise<void>}
 */
function processPage(browser, url, out) {
    return __awaiter(this, void 0, void 0, function* () {
        // Guards against pagination that links back to an already crawled page.
        const visited = new Set();
        let currentUrl = url;
        while (currentUrl && !visited.has(currentUrl)) {
            visited.add(currentUrl);
            const page = yield browser.newPage();
            try {
                yield page.goto(currentUrl, { waitUntil: 'domcontentloaded' });
                // Base for resolving relative hrefs found on this page.
                const baseUrl = page.url();
                const listings = yield page.$$('div.ResultsGridItem');
                for (const listing of listings) {
                    const detailLinkElement = yield listing.$('a.viewListing');
                    if (!detailLinkElement) {
                        continue;
                    }
                    const detailLink = yield detailLinkElement.evaluate((el) => el.getAttribute('href'));
                    if (!detailLink) {
                        continue;
                    }
                    const detailPage = yield browser.newPage();
                    try {
                        yield detailPage.goto(new URL(detailLink, baseUrl).href, { waitUntil: 'domcontentloaded' });
                        const listingData = yield extractListingData(detailPage);
                        if (listingData) {
                            console.log(JSON.stringify(listingData));
                            out.push(listingData);
                        }
                    }
                    finally {
                        yield detailPage.close();
                    }
                }
                const nextPageElement = yield page.$('a.next');
                const nextPageLink = nextPageElement
                    ? yield nextPageElement.evaluate((el) => el.getAttribute('href'))
                    : null;
                currentUrl = nextPageLink ? new URL(nextPageLink, baseUrl).href : null;
            }
            finally {
                yield page.close();
            }
        }
    });
}
/**
 * Entry point: launches the browser, crawls all result pages starting at
 * https://www.bizmatch.net/results and writes the collected listings to
 * ./listings.json. The browser is closed even when the crawl fails; a failure
 * is printed and reported through a non-zero exit code.
 */
(() => __awaiter(void 0, void 0, void 0, function* () {
    // NOTE(review): `devtools: true` and `slowMo: 50` look like debugging
    // leftovers, and Puppeteer's launch documentation describes `devtools` as
    // overriding `headless` - confirm which mode is intended.
    const browser = yield puppeteer_1.default.launch({ headless: true, executablePath: '/snap/bin/chromium', devtools: true, slowMo: 50 });
    try {
        const out = [];
        yield processPage(browser, 'https://www.bizmatch.net/results', out);
        yield fs_extra_1.default.writeJson('./listings.json', out);
    }
    finally {
        yield browser.close();
    }
}))().catch((error) => {
    console.error(error);
    process.exitCode = 1;
});
//# sourceMappingURL=index.js.map