使用 Puppeteer 点击页面内的 JavaScript 链接并返回 URL



我的目标是点击这个页面上的每个链接(称为脚注),然后返回脚注的链接和文本,以及点击后侧边栏中出现的所有 URL。我一直卡在侧边栏内容出现后如何访问它们这一步;在失败了几周之后,我想请大家指出我哪里做错了(我对 JavaScript 和 Puppeteer 都是新手)。

// Asker's original attempt: collect every footnote link on the page, then try to
// click each one and read the links that appear afterwards.
// The NOTE(review) comments below mark the places where this version goes wrong.
const puppeteer = require('puppeteer');
const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
// Selector handed to page.$$eval below to find the footnote links.
const selector = '.study-note-ref';
(async function () {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto(url);
// Runs in the page: maps each matched node to { ref, txt }, where ref is the node's
// href with the site origin removed and txt is the node's `text` property.
const footnotes = await page.$$eval(selector, nodes => {
return nodes.map(node => {
const ref = node.href.replace('https://www.churchofjesuschrist.org', '');
const txt = node.text;
return {
ref,
txt
};
});
});
for (const a of footnotes) {
// NOTE(review): a.ref is the stripped href string built above, but page.click takes a
// selector (per the Puppeteer API docs), so this presumably matches nothing. The returned
// promise is also not awaited, so the loop moves on before any click could complete and
// a rejection here would go unhandled.
page.click(a.ref);
// NOTE(review): `links` is declared with const inside this loop body, so it is
// re-created on every iteration and is not visible after the loop ends.
const links = await page.$$eval('.scripture-ref', nodes => {
return nodes.map(node => {
return node.href
})
})
}
console.log(footnotes);
// NOTE(review): `links` is out of scope here (see the loop above), so this line throws a
// ReferenceError and the browser.close() call below is never reached.
console.log(links);
// const fs = require('fs');
// fs.writeFile('./footnotes.json', JSON.stringify(footnotes), err => err ? console.log(err) : null);
await browser.close();
})();

也许是这样的:

const puppeteer = require('puppeteer');

const url = 'https://www.churchofjesuschrist.org/study/scriptures/bofm/1-ne/11?lang=eng';
// Selector for the footnote links that are clicked one by one below.
const selector = '.study-note-ref';

/**
 * Quote a string as an XPath 1.0 string literal.
 *
 * XPath 1.0 has no escape character, so a value containing both quote kinds
 * has to be assembled with concat(). Values without a double quote are wrapped
 * in double quotes, which is exactly what plain interpolation would produce.
 *
 * @param {string} value - Raw text to embed in an XPath expression.
 * @returns {string} An XPath expression that evaluates to `value`.
 */
function toXPathLiteral(value) {
  if (!value.includes('"')) {
    return `"${value}"`;
  }
  if (!value.includes("'")) {
    return `'${value}'`;
  }
  const quotedParts = value.split('"').map((part) => `"${part}"`);
  return `concat(${quotedParts.join(`, '"', `)})`;
}

/**
 * Click every footnote link on the page, wait for the matching sidebar panel,
 * and collect the links shown in that panel.
 *
 * Prints a progress line per footnote and finally the whole result as JSON,
 * keyed by footnote id: { [id]: { text, links: [{ [linkText]: url }, ...] } }.
 * The browser is always closed, even when a step fails; failures are logged
 * and reported through a non-zero process exit code.
 */
(async function main() {
  const browser = await puppeteer.launch({ headless: true });
  try {
    // Reuse the tab that the browser opens on launch instead of creating a new one.
    const [page] = await browser.pages();
    await page.goto(url);

    const data = {};
    for (const footnote of await page.$$(selector)) {
      // Click inside the page context and read the footnote id and text in the same
      // round trip. Judging by the sample output ("1a" / "pondering"), the href
      // attribute looks like "/#note1a" and the first character of innerText is the
      // superscript marker, which slice(1) drops (assumption, confirm on the page).
      const [href, text] = await page.evaluate((a) => {
        a.click();
        return [a.getAttribute('href').replace('/#note', ''), a.innerText.slice(1)];
      }, footnote);
      data[href] = { text };

      // The sidebar panel for this footnote is identified by a header span whose text
      // is "<id> <text>"; waiting for it ensures the panel content has been rendered.
      // NOTE(review): page.waitForXPath is deprecated in recent Puppeteer releases;
      // there, use page.waitForSelector with an `xpath/` prefix. Verify against the
      // installed version.
      const headerText = toXPathLiteral(`${href} ${text}`);
      const header = await page.waitForXPath(
        `//aside/div/header/span[text()=${headerText}]`
      );

      data[href].links = await page.evaluate((span) => {
        const aside = span.closest('aside');
        return [...aside.querySelectorAll('a[href]')].map((a) => ({
          [a.innerText]: a.href,
        }));
      }, header);

      console.log(`Done: ${href} ${text}`);
    }

    console.log(JSON.stringify(data, null, 2));
  } finally {
    // Always release the Chromium process, including after a timeout or navigation error.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});

部分输出:

{
"1a": {
"text": "pondering",
"links": [
{
"D&C 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19"
},
{
"TG Meditation": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
},
{
"Doctrine and Covenants 76:19": "https://www.churchofjesuschrist.org/study/scriptures/dc-testament/dc/76.19?lang=eng#p19#19"
},
{
"Meditation, Meditate": "https://www.churchofjesuschrist.org/study/scriptures/tg/meditation?lang=eng"
}
]
},
}

最新更新