在一系列链接中循环会导致导航超时错误



我有一个按钮元素数组,我想一个接一个地单击它,并为打开的每个新选项卡执行此操作:

  1. 抓取一些信息并存储在名为"providers"的数组中
  2. 关闭该选项卡

虽然我可以做到这一点,但由于在browser.pages((之前使用的导航组件,我一直收到超时错误。如果我删除该组件,我会收到另一个超时错误。此外,每次我运行程序时,在按钮数组的不同迭代次数后,它都会遇到超时错误。这是我的代码:

const puppeteer = require("puppeteer");
(async () => {
try {
const browser = await puppeteer.launch({
headless: false,
});
const page = await browser.newPage();
//google.com
await page.setExtraHTTPHeaders({ "Accept-Language": "en-US" });
await page.goto("https://google.com");
await page.type("input.gLFyf.gsfi", "hotels in london");
await page.keyboard.press("Enter");
//search results
await page.waitForXPath('//span[contains(text(),"View ")]');
const btn1 = await page.$x('//span[contains(text(),"View ")]');
await btn1[0].click();
//list of hotels
await page.waitForXPath('//span[contains(text(),"Learn more")]');
let hotels = [];

//buttons array that contains a list of buttons
let buttons = await page.$x("//button[contains(., 'View prices')]");

//prints a different value each time the program is run
console.log(buttons.length);

//looping through buttons array
for (var i = 0; i < buttons.length; i++) {
//i = 1 or 0 when program hangs 
console.log("got here " + I);
//*******************************cause of timeout error******************************************
await page.setDefaultNavigationTimeout(0);
await Promise.all([
page.waitForNavigation({ waitUntil: "load", timeout: 0 }),
buttons[i].click(),
]);
//***********************************************************************************************
//getting all open tabs in an array
const pages = await browser.pages();
const page2 = pages[pages.length - 1];
console.log(pages.length);
//newly opened tab, sometimes program hangs before opening a new tab
await page2
.waitForSelector(
"#prices > c-wiz > div > div.G86l0b > div > div > div > div > div > section > div.Hkwcrd.q9W60.A5WLXb.fLClSe > c-wiz > div > div > span > div > div > div > div > div > a > div > div.cFdfnb > div > span.mK0tQb > span",
{ timeout: 30000 }
)
.catch(() => console.log("Class doesn't exist!"));
/*-----------------scraping information on new tab ----------------------------------*/
console.log("going to start collecting providers");
let providers = await page2.evaluate(() => {
let data = [];
let elements = document.querySelectorAll(
"#prices > c-wiz > div > div.G86l0b > div > div > div > div > div > section > div.Hkwcrd.q9W60.A5WLXb.fLClSe > c-wiz > div > div > span > div > div > div > div > div > a > div > div.cFdfnb > div > span.mK0tQb > span"
);
for (var element of elements) data.push(element.textContent);
return data;
});
console.log(providers.length);
console.log("all done");
console.log(providers);
hotels.push(providers);
//closing the new tab
page2.close();
}

await browser.close();
return hotels;
} catch (err) {
console.error(err);
}
})()
.then((resolvedValue) => {
console.log(resolvedValue);
})
.catch((rejectedValue) => {
console.log(rejectedValue);
});

为了消除这个错误,我使用了timeout:0和setDefaultNavigationTimeout(0(,但现在程序冻结了。这是我在禁用超时获取之前得到的错误:

TimeoutError: Navigation timeout of 30000 ms exceeded
at C:UsersMeDesktopweb_scraping_practicenode_modulespuppeteerlibLifecycleWatcher.js:100:111
at async FrameManager.waitForFrameNavigation (C:UsersMeDesktopweb_scraping_practicenode_modulespuppeteerlibFrameManager.js:107:23)
at async Frame.waitForNavigation (C:UsersMeDesktopweb_scraping_practicenode_modulespuppeteerlibFrameManager.js:298:16)
at async Page.waitForNavigation (C:UsersMeDesktopweb_scraping_practicenode_modulespuppeteerlibPage.js:560:16)
at async Promise.all (index 0)
at async C:UsersMeDesktopweb_scraping_practicebackend.js:41:7
-- ASYNC --
at Frame.<anonymous> (C:UsersMeDesktopweb_scraping_practicenode_modulespuppeteerlibhelper.js:116:19)
at Page.waitForNavigation (C:UsersMeDesktopweb_scraping_practicenode_modulespuppeteerlibPage.js:560:53)
at Page.<anonymous> (C:UsersMeDesktopweb_scraping_practicenode_modulespuppeteerlibhelper.js:117:27)
at C:UsersMeDesktopweb_scraping_practicebackend.js:42:14
at processTicksAndRejections (internal/process/task_queues.js:97:5) {
name: 'TimeoutError'
}
undefined

谢谢

尝试运行您的代码时,如果您根据跨度的内容搜索跨度,那么对Chromium区域进行硬编码似乎是明智的,因为在我的浏览器中,它们不是英文的。但我稍微调整了一下,打开了一个包含酒店详细信息的标签。问题是这个选择器:

$("#prices > c-wiz > div > div.G86l0b > div > div > div > div > div > section > div.Hkwcrd.q9W60.A5WLXb.fLClSe > c-wiz > div > div > span > div > div > div > div > div > a > div > div.cFdfnb > div > span.mK0tQb > span");

不幸的是,这件事渲染了null。我相信这组类div.Hkwcrd.q9W60.A5WLXb.fLClSe是动态生成的。不确定您实际想要提取什么信息,但我会尝试通过这个data-click-type属性来查找DOM元素。在我的情况下,它会产生:

document.querySelectorAll("div[data-click-type='283']");
NodeList(18) [div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd, div.YPrvOd]

这似乎是房间的类型(高级双人间等(。"268"点击类型似乎是酒店的网站(预订、hotels.com等(

以下代码:

const puppeteer = require("puppeteer");
(async () => {
try {
const browser = await puppeteer.launch({
headless: false,
});
const page = await browser.newPage();
//google.com
await page.setExtraHTTPHeaders({ "Accept-Language": "en-US" });
await page.goto("https://google.com");
await page.type("input.gLFyf.gsfi", "hotels in london");
await page.keyboard.press("Enter");
//search results
await page.waitForXPath('//span[contains(text(),"View ")]');
const btn1 = await page.$x('//span[contains(text(),"View ")]');
await btn1[0].click();
//list of hotels
await page.waitForXPath('//span[contains(text(),"Learn more")]');
let hotels = [];
//buttons array that contains a list of buttons
let buttons = await page.$x("//button[contains(., 'View prices')]");
//prints a different value each time the program is run
console.log(buttons.length);
//looping through buttons array
for (var i = 0; i < buttons.length; i++) {
//i = 1 or 0 when program hangs
console.log("got here " + i);
//*******************************cause of timeout error******************************************
await page.setDefaultNavigationTimeout(0);
await Promise.all([
page.waitForNavigation({ waitUntil: "load", timeout: 0 }),
buttons[i].click(),
]);
//***********************************************************************************************
//getting all open tabs in an array
const pages = await browser.pages();
const page2 = pages[pages.length - 1];
console.log(pages.length);
//newly opened tab, sometimes program hangs before opening a new tab
await page2
.waitForSelector(
"span[data-click-type='268']",
{ timeout: 30000 }
)
.catch(() => console.log("Class doesn't exist!"));
/*-----------------scraping information on new tab ----------------------------------*/
console.log("going to start collecting providers");
let providers = await page2.evaluate(() => {
let data = [];
let elements = document.querySelectorAll(
"span[data-click-type='268']"
);
for (var element of elements) data.push(element.textContent);
return data;
});
console.log(providers.length);
console.log("all done");
console.log(providers);
hotels.push(providers);
//closing the new tab
page2.close();
}
await browser.close();
return hotels;
} catch (err) {
console.error(err);
}
})()
.then((resolvedValue) => {
console.log(resolvedValue);
})
.catch((rejectedValue) => {
console.log(rejectedValue);
});

在我的情况下呈现以下内容:

(node:16816) ExperimentalWarning: The fs.promises API is experimental
12
got here 0
3
going to start collecting providers
16
all done
[ 'Booking.com',
'Tripadvisor.com',
'Agoda',
'Hotels.com',
'Booking.com',
'Tripadvisor.com',
'Agoda',
'Hotels.com',
'Expedia.com',
'Destinia',
'Stayforlong.com',
'Trip.com',
'ebookers.ie',
'Etrip',
'ZenHotels.com',
'Nustay.com' ]
got here 1

我相信这就是providers的列表请注意使用的选择器:span[data-click-type='268']

相关内容

  • 没有找到相关文章

最新更新