从HTML代码中抓取URL和描述



我是编程新手,很难从一段HTML代码中提取出多个URL和描述。我知道必须以某种方式对这些元素进行循环,但不知道该怎么做。希望有人能帮帮我。我目前的代码是:

/**
 * Extracts a single venue record from the page.
 *
 * Only one record is produced: the name comes from the first anchor matched
 * by `div.collist a`, and the URL comes from calling `.attr('href')` on the
 * set matched by `div.collist a[href]` (jQuery's `.attr()` getter reads from
 * the first matched element only).
 *
 * @param {object} context - Object exposing the page's jQuery as `context.jQuery`.
 * @returns {{venueName: string, venueURL: (string|undefined)}} The anchor text
 *   followed by ' /' and the `href` attribute value.
 */
function pageFunction(context) {
    const { jQuery: $ } = context;

    const firstAnchorText = $('div.collist a').first().text();
    const firstHref = $('div.collist a[href]').attr('href');

    return {
        venueName: `${firstAnchorText} /`,
        venueURL: firstHref,
    };
}

我试图抓取的HTML代码是

</div>
<div class="collist">
<h3>Properties</h3>x
<ul class="list-unstyled">
<li>
<a href="search?query=500" rel="nofollow" title="Venue1"><span>London, UK</span></a> (<span>810</span>)
</li><li>
<a href="search?query=600" rel="nofollow" title="Venue2"><span>Pretoria, South Africa</span></a> (<span>820</span>)
</li><li>
<a href="search?query=700" rel="nofollow" title="Venue3"><span>New York, USA</span></a> (<span>830</span>)
</li><li>
<a href="search?query=800" rel="nofollow" title="Venue4"><span>Paris, France</span></a> (<span>840</span>)
</li><li>
<a href="search?query=900" rel="nofollow" title="Venue5"><span>Denver, USA</span></a> (<span>850</span>)
</li><li>
<a href="search?query=1000" rel="nofollow" title="Venue6"><span>Deli, India</span></a> (<span>860</span>)
</li><li>
<a href="search?query=1100" rel="nofollow" title="Venue7"><span>Lisbon, Protugal</span></a> (<span>870</span>)
</li><li>
<a href="search?query=1200" rel="nofollow" title="Venue8"><span>Madrid, Spain</span></a> (<span>880</span>)
</li><li>
<a href="search?query=1300" rel="nofollow" title="Venue9"><span>Berlin, Germany</span></a> (<span>890</span>)
</li><li>
<a href="search?query=1400" rel="nofollow" title="Venue10"><span>Stockholm, Sweden</span></a> (<span>900</span>)
</li>
</ul>

我目前的结果是

[{
"venueName": "London, UK /",
"venueURL": "search?query=500"
}]

但是我想看

[{
"venueName": "London, UK /","venueURL": "search?query=500",
},{
"venueName": "Pretoria, South Africa /","venueURL": "search?query=600",
},{
"venueName": "New York, USA /","venueURL": "search?query=700",
},{
"venueName": "Paris, France /","venueURL": "search?query=800",
},{
"venueName": "Denver, USA / ","venueURL": "search?query=900",
}]

我试着加入.each()

/**
 * Logs the `href` of every anchor matched by `div.collist a[href]` and
 * returns the value produced by `.each()` under the `venueURL` key.
 *
 * NOTE: `.each()` returns the jQuery collection itself, not the hrefs, so
 * `venueURL` holds a jQuery object rather than plain data.
 *
 * @param {object} context - Object exposing the page's jQuery as `context.jQuery`.
 * @returns {{venueURL: object}} Wrapper around the matched jQuery collection.
 */
function pageFunction(context) {
    const { jQuery: $ } = context;
    const $links = $('div.collist a[href]');

    // `this` is bound by jQuery to the current DOM element, so a regular
    // function (not an arrow function) is required here.
    const venueURL = $links.each(function logHref() {
        console.log(this.href);
    });

    return { venueURL };
}

当我在浏览器控制台中运行它时,它似乎可以工作,但一旦我在应用程序中运行它,我就会收到以下错误

错误 PuppeteerCrawler:handleRequestFunction 失败,正在将失败的请求回收到列表或队列中

错误:评估失败:RangeError:超出最大调用堆栈大小(Maximum call stack size exceeded)

当您试图从 pageFunction 返回一个无法被 JSON 序列化的对象时,就会出现这个错误。也就是说,不能把 each() 的返回值直接放进数据集,因为它是一个 jQuery 对象,JSON.stringify 会失败。您需要返回一个对象数组,其中每个结果对象都会成为一条独立的数据集记录,如下所示:

/**
 * Builds one plain record per `<li>` found under `.collist`.
 *
 * For each list item, the first `<a>` inside it supplies the data: the text
 * of the anchor's first `<span>` (plus a trailing ' /') becomes `venueName`
 * and the anchor's `href` attribute becomes `venueURL`.
 *
 * @param {object} context - Object exposing the page's jQuery as `context.jQuery`.
 * @returns {Array<{venueName: string, venueURL: (string|undefined)}>} One
 *   plain object per list item, in document order; empty when nothing matches.
 */
function pageFunction(context) {
    const { jQuery: $ } = context;

    // Each LI is the container for exactly one record, so convert the matched
    // set to a real array and project every element to a plain object.
    return $('.collist li')
        .toArray()
        .map((item) => {
            const $anchor = $(item).find('a').first();
            // Read the label from the inner SPAN rather than the whole anchor
            // to avoid picking up surrounding whitespace.
            const label = $anchor.find('span').first().text();

            return {
                venueName: `${label} /`,
                venueURL: $anchor.attr('href'),
            };
        });
}

最新更新