使用Apify和Puppeteer抓取URL



我正在尝试使用一个名为"web scraper"的Apify actor(https://apify.com/apify/web-scraper)从https://en.wikipedia.org/wiki/List_of_hedge_funds抓取URL。

具体来说,我尝试使用下面的Apify pageFunction来抓取目标页面,并从HTML中的锚标记返回URL列表。

pageFunction
/**
 * Page function for the Apify Web Scraper.
 *
 * Reads the text of the page's first <title> element and selects every
 * anchor matching 'tr > td > a' through the jQuery instance on the context.
 *
 * Note: the `anchorTag` property holds the jQuery selection object itself,
 * not the href strings of the matched anchors.
 *
 * @param {object} context - Object exposing `jQuery` and `request.url`.
 * @returns {Promise<{url: *, pageTitle: *, anchorTag: *}>} The request URL,
 *   the title text, and the raw jQuery selection.
 */
async function pageFunction(context) {
  const query = context.jQuery;

  // Title lookup happens first, then the anchor selection.
  const titleText = query('title').first().text();
  const matchedAnchors = query('tr > td > a');

  // Build the record in the same key order: url, pageTitle, anchorTag.
  const record = { url: context.request.url };
  record.pageTitle = titleText;
  record.anchorTag = matchedAnchors;
  return record;
}

在控制台中,我希望在名为anchorTag的属性中看到目标页面上存在的一个或多个锚标记的href属性的值。我还希望在名为pageTitle的属性中看到页面标题,在名为url的属性中看到页面URL。如下:

我期望看到的内容:
{
"url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
"pageTitle": "List of hedge funds - Wikipedia",
"anchorTag": {
"0": "http://example0.com", // each instance of "http://example.com" represents a unique url on the target page to be scraped
"1": "http://example1.com",
"2": "http://example2.com",
"3": "http://example3.com",
...
"39": "http://example39.com",
}}

但是,actor返回的不是URL列表,而是以下数据集:

我实际看到的:
[{
"url": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
"pageTitle": "List of hedge funds - Wikipedia",
"anchorTag": {
"0": {},
"1": {},
"2": {},
"3": {},
"4": {},
"5": {},
"6": {},
"7": {},
"8": {},
"9": {},
"10": {},
"11": {},
"12": {},
"13": {},
"14": {},
"15": {},
"16": {},
"17": {},
"18": {},
"19": {},
"20": {},
"21": {},
"22": {},
"23": {},
"24": {},
"25": {},
"26": {},
"27": {},
"28": {},
"29": {},
"30": {},
"31": {},
"32": {},
"33": {},
"34": {},
"35": {},
"36": {},
"37": {},
"38": {},
"39": {},
"length": 40,
"prevObject": {
"0": {
"location": {
"href": "https://en.wikipedia.org/wiki/List_of_hedge_funds",
"ancestorOrigins": {},
"origin": "https://en.wikipedia.org",
"protocol": "https:",
"host": "en.wikipedia.org",
"hostname": "en.wikipedia.org",
"port": "",
"pathname": "/wiki/List_of_hedge_funds",
"search": "",
"hash": "",
"assign": {},
"reload": {},
"toString": {},
"replace": {}
},
"write": {},
"writeln": {},
"jQuery3410461525655351679551": {
"events": {
"mmv-setup-overlay": [
{
"type": "mmv-setup-overlay",
"origType": "mmv-setup-overlay",
"handler": {
"guid": 21
},
"guid": 21,
"namespace": ""
}
],
"mmv-cleanup-overlay": [
{
"type": "mmv-cleanup-overlay",
"origType": "mmv-cleanup-overlay",
"handler": {
"guid": 22
},
"guid": 22,
"namespace": ""
}
],
"keyup": [
{
"type": "keyup",
"origType": "keyup",
"handler": {
"guid": 24
},
"guid": 24,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"mouseover": [
{
"type": "mouseover",
"origType": "mouseover",
"handler": {
"guid": 24
},
"guid": 24,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"focusout": [
{
"type": "focusout",
"origType": "blur",
"handler": {
"guid": 25
},
"guid": 25,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"mouseout": [
{
"type": "mouseout",
"origType": "mouseout",
"handler": {
"guid": 25
},
"guid": 25,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
],
"click": [
{
"type": "click",
"origType": "click",
"handler": {
"guid": 26
},
"guid": 26,
"selector": "#mw-content-text a[href][title]:not(.extiw, .image, .new, .internal, .external, .mw-cite-backlink a, .oo-ui-buttonedElement-button, .cancelLink a)",
"needsContext": false,
"namespace": ""
}
]
},
"handle": {},
"focusin": 1,
"focusout": 1
}
},
"length": 1
}
}
}]

我做错了什么?

您必须访问a标记的href属性才能获得URL。此外,还需要循环遍历所有a标记,将它们放入一个数组中。

// ... (earlier part of the page function is elided here)
// Select every element matched by cssSelector; the result is a jQuery collection.
const anchorTag = $( cssSelector );
// Array that collects the extracted href strings.
const links = [];
// anchorTag is a jQuery handle, not a plain JavaScript value, so it is iterated with jQuery's .each()
anchorTag.each((index, el) => {
// Read the href attribute of the current element.
const link = $(el).attr('href');
// Only keep truthy values, i.e. skip anchors whose href is missing or empty.
if (link) {
links.push(link);
}
})
// Return the collected href strings together with the request URL and page title.
return {
url: context.request.url,
pageTitle,
links,
};

相关内容

  • 没有找到相关文章

最新更新