https://crawlee.dev/
#nodejs
Web Scraping and Generating PDFs Using C# and NET (mirror) -- use of AngleSharp
			
			#nodejs
Web Scraping and Generating PDFs Using C# and NET (mirror) -- use of AngleSharp
 
	#set this variable as is.. - https://www3.ntu.edu.sg/home/ehchua/programming/howto/Environment_Variables.html
set PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
#will create the package.json
npm init -y
#because PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD was set at the beginning, this installs bare-bones playwright (without browsers)
npm i -D playwright
#install only chromium + ffmpeg (skipped if they already exist in C:\Users\%username%\AppData\Local\ms-playwright\)
#this way we exclude firefox (78mb) + webkit (73mb)
npx playwright install chromium
#you can even skip downloading chromium and use an existing OS browser https://playwright.dev/docs/browsers#google-chrome--microsoft-edge
#init a project - this will ask you to 'Install Playwright browsers', choose NO!
npm init playwright
#now you can use the famous 'codegen': an empty browser will open, browse to the target page and do what is needed
#everything you do in this Chromium instance is recorded to a script; the script can afterwards be exported in any of the supported languages
#in the end go to the 'inspector window' and save the script (ex hi.js) as /library/
npx playwright codegen
#executing it as follows will do the job
node hi.js 
	//show browser + slow mode 100ms
const browser = await chromium.launch({ headless: false, slowMo: 100 }); // `chromium` is assumed to come from require('playwright')
//the snippet used `page` without ever creating it
const page = await browser.newPage();
await page.goto('https://test.com');
//insert value to input field with delay 50ms per char
const emailInput = await page.$('#email');
await emailInput.type('test@test.com', { delay: 50 });
//click an element
await page.click('#table > tr > td > a');
//select value from /select/
const dropdown = await page.$('#dropdown');
//by value
await dropdown.selectOption({ value: '1' });
//by text
await dropdown.selectOption({ label: '1' });
//by index (a number, not a string)
await dropdown.selectOption({ index: 1 });
//loop through /select/ items
const dropdownItems = await dropdown.$$('option');
for (const option of dropdownItems) {
  console.log(await option.innerText());
}
//more at - https://testautomationu.applitools.com/js-playwright-tutorial/
--
#You can create also tests
#https://playwright.dev/docs/intro
#https://playwright.dev/docs/running-tests
#will create the playwright.config.js > edit it and leave only chromium.
#will create the project and lastly will output these :
#Runs the end-to-end tests
npx playwright test
#Runs the tests only on Desktop Chrome.
npx playwright test --project=chromium
#will start the test.specs.js with 'inspector' and will run the script line by line
npx playwright test --debug
#or without debug, you can use in code
#await page.pause();
#will start the browser so you can see what it is doing
npx playwright test --headed
npm i puppeteer
this will also download a standalone chromium. by running :
console.log(puppeteer.executablePath());return;
we can discover where it exists, in my case C:\Users\%username%\.cache\puppeteer\chrome\
in case of a firewall, only this chrome.exe needs to be added to the whitelist.
// Minimal Puppeteer script: open a page and save a full-page screenshot.
// Saved as test.js and run with `node test.js`, it produces example.png
// (the relative path is resolved against the current working directory).
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://pipiscrew.com');
    await page.screenshot({ path: 'example.png', fullPage: true });
  } finally {
    // close the browser even when navigation or the screenshot fails
    await browser.close();
  }
})().catch((err) => {
  // surface the failure instead of leaving an unhandled rejection
  console.error(err);
  process.exitCode = 1;
});
// launch with a visible window and DevTools opened
const browser = await puppeteer.launch({ headless: false, devtools: true });

// wait for a selector before reading from the page
const resultsSelector = '#example';
await page.waitForSelector(resultsSelector);

// Spread (...) Operator = make the items iterable.

// $eval: run the callback on the first element matching the selector
const inner_html = await page.$eval('#example', (element) => element.innerHTML);
console.log(inner_html);

// $$eval: run the callback on the array of all matching elements
const list = await page.$$eval('li.example>a', (anchors) => anchors.map((anchor) => anchor.href));
console.log(list);

// *simple*
// read one element's innerHTML with a single expression evaluated in the page
const inner_html = await page.evaluate(() => document.querySelector('#example').innerHTML);
// *advanced*
// collect the innerHTML of every matching anchor and return it as an array
const data = await page.evaluate(() => {
  const anchors = Array.from(document.querySelectorAll('li.title>a'));
  return anchors.map((anchor) => anchor.innerHTML);
});
//1
  // Variants of reading properties through element handles. Each variant is
  // wrapped in its own block so the repeated `const` names (item, data) do not
  // collide when the snippets are kept in one file.
  {
    const element = await page.$('div#example');
    const element_property = await element.getProperty('innerHTML');
    const inner_html = await element_property.jsonValue();
    console.log(inner_html);
  }
//2
  {
    const item = await page.$(resultsSelector);
    const data = await (await item.getProperty('textContent')).jsonValue();
    console.log(data);
  }
//3 - ref https://qiita.com/go_sagawa/items/85f97deab7ccfdce53ea
  {
    // several properties of the same handle gathered into one object
    const item = await page.$(resultsSelector);
    const data = {
      href: await (await item.getProperty('href')).jsonValue(),
      textContent: await (await item.getProperty('textContent')).jsonValue(),
      innerHTML: await (await item.getProperty('innerHTML')).jsonValue(),
    };
    console.log(data);
  }
//4
  {
    // textContent of every handle matching the selector, in order
    const list = await page.$$(resultsSelector);
    const datas = [];
    for (const handle of list) {
      datas.push(await (await handle.getProperty('textContent')).jsonValue());
    }
    console.log(datas);
  }
//1
//click the element at index 3 of the matches
await page.$$eval('._qv64e', (nodes) => {
  nodes[3].click();
});
//2
//loop through items, find the one with specific text and click it
await page.$$eval(resultsSelector, (nodes) => {
  // alternatives for picking the element:
  // const element = nodes.find(element => element.firstElementChild);
  // const element = nodes.find(element => element.innerHTML === '<h1>Hello, world!</h1>');
  // console.log(element);
  // element.click();
  const [first] = nodes;
  first.click();
});
//1
//click the first button matching the selector
await page.evaluate(() => {
  document.querySelector('div.example>button').click();
});
//2
//click the match at index 80
await page.evaluate(() => {
  document.querySelectorAll('._qv64e')[80].click();
});
//3
//click the button with a specific text; reports whether a button was found
//instead of throwing on `undefined.click()` when nothing matches
const clicked = await page.evaluate(() => {
  const buttons = [...document.querySelectorAll('div#example>button')];
  // const target = buttons.find(element => element.firstChild);
  const target = buttons.find((button) => button.textContent === 'Press for more');
  if (!target) {
    return false;
  }
  target.click();
  return true;
});
if (!clicked) {
  console.log('nofound');
}
//1
  //click through an element handle; the falsy check covers "nothing matched"
  const x = await page.$('div.example>button');
  if (!x) {
    console.log("nofound");
  } else {
    await x.click();
  }
//2
  //page.$$ gives an array of handles, so length is meaningful here
  const selectAll = await page.$$('div.example>button');
  console.log(selectAll.length);
  if (selectAll.length > 0) {
    await selectAll[0].click();
  }

  //pick the button by its text inside the page and get a handle back
  const handle = await page.evaluateHandle(() => {
    const elements = [...document.querySelectorAll('div#example>button')];
    //const element = elements.find(element => element.firstChild);
    return elements.find((candidate) => candidate.textContent === 'Press for more');
  });

  //asElement() yields null when the handle does not wrap an element
  //(e.g. find() returned undefined), so guard before clicking
  const element = handle.asElement();
  if (element) {
    await element.click();
  } else {
    console.log("nofound");
  }
//2
//ref - https://github.com/osfunapps/os-puppeteer-helper-npm/blob/master/index.js
//print the innerHTML of the element right before the last '.item'
const last = await page.$('.item:last-child');
//the handle `last` is passed as an argument and received as `el` in the page function
const prev = await page.evaluateHandle((el) => el.previousElementSibling, last);
console.log(await (await prev.getProperty('innerHTML')).jsonValue());
//https://docs.apify.com/tutorials/scraping-dynamic-content#timeout-and-errors
  let exitSwitch = 0;
  await page.waitForSelector('my-selector', { timeout: 10000 })
    .catch(() => {console.log('Wait for my-selector timed out');  exitSwitch=1;}
  );
  if (exitSwitch==1)
  {
    await browser.close();
    return;
  }  let all = [...document.querySelectorAll('td')].map(elem => elem.innerText);
  console.log(all);//puppeteer START
// Scrape the rows of '.table' using a helper installed in the page
// (parseFields) and a Node.js function exposed to the page (logInNodeJs).
(async () => {
    const browser = await puppeteer.launch({ headless: false, devtools: true });
    const page = await browser.newPage();
    // https://stackoverflow.com/a/69127872
    // registered before navigation so parseFields is available to page.$$eval below
    await page.evaluateOnNewDocument(() => {
        window.parseFields = function parseFields(el) {
            //your function logic to be used inside page.$$eval
        };
    });
    // UPDATE - expose the function "logInNodeJs" for use from inside page context  - https://stackoverflow.com/a/73964712
    await page.exposeFunction('logInNodeJs', (value) => console.log(value));
    //browser to page
    await page.goto('https://yoursite.here');
    const rows = await page.$$eval('.table tr', (list) => {
        logInNodeJs('check');
        // must be declared here: the callback cannot see variables of the Node.js script
        const datas = [];
        for (const row of list) {
            const TDs = row.querySelectorAll('td');
            // rows with fewer than 6 cells (e.g. header rows) have no TDs[5]
            const col1 = TDs[5] ? parseFields(TDs[5].querySelectorAll('.inline-block')) : null;
            datas.push({
                col1,
                cells: [...TDs].map((elem) => elem.innerText),
            });
        }
        return datas;
    });
    console.log(rows);
})().catch((err) => {
    // the function is now actually invoked; report failures instead of dropping them
    console.error(err);
});
//https://stackoverflow.com/a/65283313
//before
//the callback receives only the list of elements matching '.table tr'
const rows = await page.$$eval('.table tr', list => {
//your code here
});
//after
//a value from the script (theURL) is appended after the callback and shows up
//as the callback's second parameter (currentURL), per the answer linked above
//NOTE(review): theURL is not defined in this snippet - it must come from the surrounding script
const rows = await page.$$eval('.table tr', (list, currentURL) => {
    console.log(currentURL);
}, theURL);