无法在函数中应用抓取下一页的逻辑



我使用 axios 和 cheerio 创建了一个脚本,从黄页(Yellow Pages)获取不同的商店名称及其相关链接,然后使用这些链接从各自的内页抓取电话和电子邮件。脚本运行正常。

我现在想做的是使用下一页链接来继续从下一页抓取内容。我只是不知道如何在 getLinks() 函数中应用解析并使用下一页链接的逻辑。

目前我正在尝试:

const axios = require('axios');
const cheerio = require('cheerio');
// First search-results page to scrape; scrapeData() passes it to getLinks().
const startUrl = 'https://www.yellowpages.com/search?search_terms=Pizza&geo_location_terms=San+Francisco%2C+CA';
// Base URL that getLinks() prepends to each listing's href to build the shop link.
const host = 'https://www.yellowpages.com';
/**
 * Downloads a search-results page and reports every business listing on it.
 *
 * Each `a.business-name` anchor inside a `[class="result"]` element produces
 * one `callback(name, link)` call, where `name` is the text of the anchor's
 * `span` descendants and `link` is the anchor's `href` resolved against `host`.
 * Anchors without an `href` are skipped because there is nothing to follow.
 *
 * @param {string} url - Address of the results page to download.
 * @param {string} host - Base URL used to resolve relative listing links.
 * @param {Function} callback - Called as `callback(name, link)` for each
 *   listing, in document order; it may return a promise.
 * @returns {Promise<void>} Resolves after every value returned by `callback`
 *   has resolved; rejects if the download fails or any callback fails.
 */
const getLinks = async (url, host, callback) => {
  const { data } = await axios.get(url);
  const $ = cheerio.load(data);
  const pending = [];

  $('[class="result"] a.business-name').each((index, element) => {
    const anchor = $(element);
    const href = anchor.attr('href');
    if (href === undefined) {
      return;
    }
    const name = anchor.find('span').text();
    // new URL() resolves relative hrefs and leaves absolute ones intact,
    // which plain `host + href` concatenation does not.
    const link = new URL(href, host).href;
    // Collect the result instead of returning it: returning `false` from an
    // .each() callback would stop the iteration early.
    pending.push(callback(name, link));
  });

  // Wait for asynchronous callbacks so their failures reach the caller
  // instead of becoming unhandled rejections.
  await Promise.all(pending);
};
/**
 * Downloads a shop's detail page and hands its contact details to `callback`.
 *
 * @param {string} shopName - Shop name, passed through to `callback` unchanged.
 * @param {string} shopLink - URL of the detail page to download.
 * @param {Function} callback - Called as
 *   `callback(shopName, shopLink, phone, email)`, where `phone` is the text of
 *   the first `.contact > p.phone` match and `email` is the `href` attribute
 *   of the first `.business-card-footer > a.email-business` match.
 * @returns {Promise<*>} Whatever `callback` returns.
 */
const fetchContent = async (shopName, shopLink, callback) => {
  const response = await axios.get(shopLink);
  const page = cheerio.load(response.data);

  const phoneNode = page('.contact > p.phone').eq(0);
  const emailNode = page('.business-card-footer > a.email-business').eq(0);

  const phone = phoneNode.text();
  const email = emailNode.attr("href");

  return callback(shopName, shopLink, phone, email);
};
/**
 * Scrapes the listings on `startUrl` and logs the contact details of each shop.
 *
 * For every listing reported by `getLinks`, the shop page is fetched with
 * `fetchContent` and `{shopName, shopLink, phone, email}` is logged. A shop
 * whose page cannot be fetched is reported on stderr and does not stop the
 * remaining shops from being processed.
 *
 * @returns {Promise<void>} Resolves once every shop has been processed;
 *   rejects if `getLinks` itself fails.
 */
async function scrapeData() {
  // Track the per-shop work here so completion does not depend on whether
  // getLinks waits for its callbacks.
  const pending = [];

  await getLinks(startUrl, host, (itemName, link) => {
    const task = fetchContent(itemName, link, (shopName, shopLink, phone, email) => {
      console.log({ shopName, shopLink, phone, email });
    }).catch((error) => {
      // Handle the failure per shop so it never becomes an unhandled rejection.
      console.error(`Failed to scrape ${link}: ${error.message}`);
    });
    pending.push(task);
  });

  await Promise.all(pending);
}
// Start the scraper; report a failure and flag a non-zero exit status instead
// of leaving the returned promise's rejection unhandled.
scrapeData().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

下一页链接通常带有 rel=next 属性(可以用选择器 [rel=next] 匹配),所以翻页逻辑通常是这样的:

/**
 * Downloads `url` with axios and loads the response body into cheerio.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<*>} The value returned by `cheerio.load` for the body.
 */
async function get(url) {
  const response = await axios.get(url);
  return cheerio.load(response.data);
}
/**
 * Walks the paginated search results, starting from the first results page
 * and following each page's `[rel=next]` link until a page has none.
 *
 * @returns {Promise<void>} Resolves after the last page has been fetched.
 */
async function run() {
  let url = 'https://www.yellowpages.com/search?search_terms=Pizza&geo_location_terms=San+Francisco%2C+CA';

  for (;;) {
    const $ = await get(url);
    // doSomething($)

    const href = $('[rel=next]').attr('href');
    if (!href) {
      break;
    }
    // The next link may be relative, so resolve it against the current page.
    url = new URL(href, url).href;
  }
}

最新更新