https://crawlee.dev/
#nodejs
Web Scraping and Generating PDFs Using C# and NET (mirror) -- use of AngleSharp
#nodejs
Web Scraping and Generating PDFs Using C# and NET (mirror) -- use of AngleSharp
#set this variable as is.. - https://www3.ntu.edu.sg/home/ehchua/programming/howto/Environment_Variables.html
set PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
#will create the package.json
npm init -y
#because PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD was set at the beginning, this installs bare-bones playwright (without browsers)
npm i -D playwright
#install chromium + ffmpeg only (if they do not already exist in C:\Users\%username%\AppData\Local\ms-playwright\)
#this way we exclude firefox (78mb) + webkit (73mb)
npx playwright install chromium
#you can even skip downloading chromium and use an existing OS browser - https://playwright.dev/docs/browsers#google-chrome--microsoft-edge
#init a project - this will ask you to 'Install Playwright browsers', choose NO!
npm init playwright
#now you can use the famous 'codegen': an empty browser will open, browse to the target page and do what is needed
#everything you do in this Chromium instance is recorded to a script; the script can afterwards be exported in any of the supported languages
#at the end go to the 'inspector window' and save the script (ex hi.js) as /library/
npx playwright codegen
#executing it as follows will do the job
node hi.js
// Launch Chromium with a visible window; slowMo: 100 slows Playwright operations down by 100 ms.
const browser = await chromium.launch({ headless: false, slowMo: 100 });
// A page has to be opened before it can be navigated.
const page = await browser.newPage();
await page.goto('https://test.com');

// Insert a value into the input field with a 50 ms delay per character.
// The handle is stored under the same name it is used with, and type() is
// awaited so the click below does not start before typing has finished.
const email = await page.$('#email');
await email.type('test@test.com', { delay: 50 });

// Click an element.
await page.click('#table > tr > td > a');
// Select a value from a <select> element.
const dropdown = await page.$('#dropdown');
// By the option's value attribute.
await dropdown.selectOption({ value: '1' });
// By the option's visible text.
await dropdown.selectOption({ label: '1' });
// By position; index has to be a number, not a string.
await dropdown.selectOption({ index: 1 });
// Loop through the <option> items and print the text of each one.
const dropdownItems = await dropdown.$$('option');
for (const dropdownItem of dropdownItems) {
  console.log(await dropdownItem.innerText());
}
//more at - https://testautomationu.applitools.com/js-playwright-tutorial/
--
#You can also create tests
#https://playwright.dev/docs/intro
#https://playwright.dev/docs/running-tests
#will create the playwright.config.js > edit it and leave only chromium.
#will create the project and lastly will output these :
#Runs the end-to-end tests
npx playwright test
#Runs the tests only on Desktop Chrome.
npx playwright test --project=chromium
#will start the test.specs.js with 'inspector' and will run the script line by line
npx playwright test --debug
#or, without --debug, you can pause from code with
#await page.pause();
#will start the browser so you can see what it is doing
npx playwright test --headed
npm i puppeteer
this will also download a standalone chromium. By running:
console.log(puppeteer.executablePath()); return;
we can discover where it lives, in my case C:\Users\%username%\.cache\puppeteer\chrome\
in case of a firewall, only chrome.exe needs to be added to the whitelist.
const puppeteer = require('puppeteer');

// Opens https://pipiscrew.com with a default-configured browser and saves a
// full-page screenshot to example.png.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://pipiscrew.com');
    await page.screenshot({ path: 'example.png', fullPage: true });
  } finally {
    // Close the browser even when navigation or the screenshot fails,
    // so a failed run does not leave it open.
    await browser.close();
  }
})().catch((err) => {
  // Surface the failure instead of leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
this will produce example.png next to test.js; run it with:
node test.js
// Launch with a visible window and devtools enabled.
const launchOptions = { headless: false, devtools: true };
const browser = await puppeteer.launch(launchOptions);

// Wait for the selector before querying it.
const resultsSelector = '#example';
await page.waitForSelector(resultsSelector);
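Spread (...) operator = expands an iterable (e.g. the NodeList returned by querySelectorAll) into individual items, so [...nodeList] gives a real array that supports map/find.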
// $eval: run the callback against the element matched by the selector.
const inner_html = await page.$eval('#example', (node) => {
  return node.innerHTML;
});
console.log(inner_html);

// $$eval: the callback receives every match; collect the href of each link.
const list = await page.$$eval('li.example>a', (anchors) => {
  const hrefs = [];
  for (const anchor of anchors) {
    hrefs.push(anchor.href);
  }
  return hrefs;
});
console.log(list);
*simple*
// Read the markup of #example by running querySelector inside the page.
const inner_html = await page.evaluate(() => {
  const target = document.querySelector('#example');
  return target.innerHTML;
});
*advanced*
// Return the innerHTML of every matching link as an array.
const data = await page.evaluate(() => {
  const anchors = document.querySelectorAll('li.title>a');
  // Array.from with a mapper converts the NodeList and transforms it in one step.
  return Array.from(anchors, (anchor) => anchor.innerHTML);
});
//1 - chained: fetch the property handle and unwrap its value in one expression
const element = await page.$('div#example');
const inner_html = await (await element.getProperty('innerHTML')).jsonValue();
//2 - stepwise: keep the property handle in its own variable first
const item = await page.$(resultsSelector);
const textHandle = await item.getProperty('textContent');
const data = await textHandle.jsonValue();
console.log(data);
//3 - ref https://qiita.com/go_sagawa/items/85f97deab7ccfdce53ea
// Read several properties of one element into a plain object.
const item = await page.$(resultsSelector);
const readProperty = async (name) => {
  const handle = await item.getProperty(name);
  return handle.jsonValue();
};
const data = {
  href: await readProperty('href'),
  textContent: await readProperty('textContent'),
  innerHTML: await readProperty('innerHTML'),
};
console.log(data);
//4
// Collect the textContent of every match, one element at a time.
const list = await page.$$(resultsSelector);
const datas = [];
for (const handle of list) {
  const textHandle = await handle.getProperty('textContent');
  datas.push(await textHandle.jsonValue());
}
console.log(datas);
//1 - click the match at index 3 from inside the page
await page.$$eval('._qv64e', (matches) => matches[3].click());
//2 - pick one of the matches and click it
//ways to pick a specific item instead of the first one:
//  matches.find(el => el.firstElementChild)
//  matches.find(el => el.innerHTML === '<h1>Hello, world!</h1>')
await page.$$eval(resultsSelector, (matches) => {
  const [first] = matches;
  first.click();
});
//1 - click the first matching button through the page's own DOM API
await page.evaluate(() => {
  const button = document.querySelector('div.example>button');
  button.click();
});
//2 - click the match at index 80
await page.evaluate(() => {
  const matches = document.querySelectorAll('._qv64e');
  matches[80].click();
});
//3 - click the button whose text is exactly 'Press for more'
// dd is true when such a button was found and clicked, false otherwise.
const dd = await page.evaluate(() => {
  const buttons = [...document.querySelectorAll('div#example>button')];
  // alternative match: buttons.find(button => button.firstChild)
  const target = buttons.find((button) => button.textContent === 'Press for more');
  // find() yields undefined when nothing matches; calling click() on it would throw.
  if (!target) {
    return false;
  }
  target.click();
  return true;
});
//1 - page.$ gives a single handle, which is falsy when nothing matched
const x = await page.$('div.example>button');
if (!x) {
  console.log("nofound");
} else {
  // A single handle has no length (only the array from page.$$ does),
  // so there is nothing useful to log here before clicking.
  await x.click();
}
//2 - page.$$ gives an array of handles
const selectAll = await page.$$('div.example>button');
console.log(selectAll.length);
// Guard the index access: selectAll[0] is undefined when the array is empty.
if (selectAll.length > 0) {
  await selectAll[0].click();
}
// Find the button by its text inside the page and get a handle to it.
const handle = await page.evaluateHandle(() => {
  const buttons = [...document.querySelectorAll('div#example>button')];
  // alternative match: buttons.find(button => button.firstChild)
  return buttons.find((button) => button.textContent === 'Press for more');
});
// asElement() is null when the handle does not wrap a DOM element,
// which is the case when no button had the wanted text.
const element = handle.asElement();
if (element) {
  await element.click();
}
#2
//ref - https://github.com/osfunapps/os-puppeteer-helper-npm/blob/master/index.js
// Start from the last .item and step to the sibling just before it.
const last = await page.$('.item:last-child');
const prev = await page.evaluateHandle((node) => {
  return node.previousElementSibling;
}, last);
// Unwrap the sibling's innerHTML and print it.
const prevHtmlHandle = await prev.getProperty('innerHTML');
const prevHtml = await prevHtmlHandle.jsonValue();
console.log(prevHtml);
//https://docs.apify.com/tutorials/scraping-dynamic-content#timeout-and-errors
let exitSwitch = 0;
await page.waitForSelector('my-selector', { timeout: 10000 })
.catch(() => {console.log('Wait for my-selector timed out'); exitSwitch=1;}
);
if (exitSwitch==1)
{
await browser.close();
return;
}
// Gather the text of every <td>; runs wherever `document` is available.
let all = Array.from(document.querySelectorAll('td'), (cell) => cell.innerText);
console.log(all);
//puppeteer START
// Template: open a page, install helpers usable from page context, then turn
// every '.table tr' row into { col1, cells }.
(async () => {
  const browser = await puppeteer.launch({ headless: false, devtools: true });
  const page = await browser.newPage();

  // https://stackoverflow.com/a/69127872
  // Define parseFields on window for every new document so that callbacks
  // running inside the page (e.g. page.$$eval) can call it.
  await page.evaluateOnNewDocument(() => {
    window.parseFields = function parseFields(el) {
      //your function logic to be used inside page.$$eval
    };
  });

  // UPDATE - expose the function "logInNodeJs" for use from inside page context - https://stackoverflow.com/a/73964712
  await page.exposeFunction('logInNodeJs', (value) => console.log(value));

  //browser to page
  await page.goto('https://yoursite.here');

  const rows = await page.$$eval('.table tr', (list) => {
    logInNodeJs("check");
    // The accumulator has to be declared inside the callback: this function
    // runs in the page, where no outer `datas` exists.
    const datas = [];
    for (const row of list) {
      const TDs = row.querySelectorAll('td');
      // NOTE(review): assumes every row has at least six <td> cells; TDs[5] is
      // undefined otherwise (e.g. a header row) - confirm against the target table.
      const col1 = parseFields(TDs[5].querySelectorAll('.inline-block'));
      datas.push({
        col1,
        cells: [...TDs].map((elem) => elem.innerText),
      });
    }
    return datas;
  });
})().catch((err) => {
  // The async function is now actually invoked; report failures instead of
  // leaving an unhandled rejection.
  console.error(err);
});
//https://stackoverflow.com/a/65283313
//before
const rows = await page.$$eval('.table tr', list => {
//you code here
});
//after
const rows = await page.$$eval('.table tr', (list, currentURL) => {
console.log(currentURL);
}, theURL);