我需要你的帮助。我在做一个刮削项目。
这是我试图删除此地址的输出选项卡的web链接:https://www.napier.ac.uk/research-and-innovation/research-search?fv=BE7C946393454607AD2CC3D91E65303F%7eBusiness+School&dtFrom=2021-01&dtTo=2022-12&t1sz=100&tab=1&tabpg1=3#rms
我可以抓取第一页,但无法迭代到后续页面。也许是因为 HTML 标签上没有"下一页"按钮。
这就是我所做的
# Scrape the "Outputs" tab: open it, then collect title/subtitle/abstract/intro/link
# for each post block across 4 pages, writing the result to CSV.
output_tab = wait.until(ec.element_to_be_clickable((By.XPATH, "(//a[@class='r-tabs-anchor'][normalize-space()='Outputs'])[1]")))
output_tab.click()
time.sleep(2)  # let the tab content render before querying it

# Collect rows in a plain list and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and per-row append is quadratic.
# This also avoids the spurious empty first row the old seed DataFrame produced.
rows = []
counter = 0
while counter < 4:
    driver.refresh()
    post_blocks = driver.find_elements(By.XPATH, "(//div[@class='output bgGrey'])")
    for post_block in post_blocks:
        title = post_block.find_element(By.XPATH, "./div/h3").text  # (//div[@class='output bgGrey'])/div/h3
        sub_title = post_block.find_element(By.XPATH, "./div[3]").text  # (//div[@class='output bgGrey'])/div[3]
        # Abstract and intro divs are not present on every post; skip posts
        # that lack them (NoSuchElementException), matching the original logic.
        try:
            post_abstract = post_block.find_element(By.XPATH, "./div[4]").text  # (//div[@class='output bgGrey'])/div[4]
        except Exception:
            continue
        try:
            post_intro = post_block.find_element(By.XPATH, "./div[5]").text  # (//div[@class='output bgGrey'])/div[5]
        except Exception:
            continue
        post_link = post_block.find_element(By.XPATH, "./parent::a").get_attribute('href')  # (//div[@class='output bgGrey'])/parent::a
        rows.append({'Titles': title, 'SubTitle': sub_title, 'Abstract': post_abstract, 'Intro': post_intro, 'Links': post_link})
    # NOTE(review): this XPath targets the literal page label '3' on every
    # iteration, so it cannot advance past that page — the cause of the
    # TimeoutException described below.
    wait.until(ec.element_to_be_clickable((By.XPATH, "(//a[contains(@class,'')][normalize-space()='3'])[2]"))).click()
    counter += 1
df = pd.DataFrame(rows, columns=['Titles', 'SubTitle', 'Abstract', 'Intro', 'Links'])
df.to_csv('C:/Users/testuser/napier_outputs.csv')
我注意到错误 TimeoutException: TimedPromise timed out after 300000 ms。
如果把 XPATH `(//a[contains(@class,'')][normalize-space()='3'])[2]` 中的值从 3 手动改成 4、5 等等,我就可以逐页抓取。
总之,我该如何像抓取第一页那样自动迭代所有页面并收集数据?
您可以通过分析"最后一页"链接来获取总页数:这个 <a> 元素的 href 属性包含查询参数 tabpg1,其值等于总页数。有了这个信息,您就可以在基础 URL 后面附加一个递增的计数器来依次访问所有页面。请看下面的代码:
# Scrape all "Outputs" pages by reading the total page count from the
# "last page" link's tabpg1 query parameter, then visiting each page via URL.
import urllib.parse

wait = WebDriverWait(driver, 3)
counter = 1
base_url = "https://www.napier.ac.uk/research-and-innovation/research-search?fv=BE7C946393454607AD2CC3D91E65303F%7eBusiness+School&dtFrom=2021-01&dtTo=2022-12&t1sz=100&tab=1&tabpg1="
driver.get(f"{base_url}{counter}")

output_tab = wait.until(ec.element_to_be_clickable((By.XPATH, "(//a[@class='r-tabs-anchor'][normalize-space()='Outputs'])[1]")))
output_tab.click()
time.sleep(2)  # let the tab content render before querying it

# Determine the number of pages from the "last" pagination link:
# its href carries tabpg1=<last page number>.
last_link = driver.find_element(By.ID, "bodycontent_1_ctl07_lnkLast")
url_parts = urllib.parse.urlparse(last_link.get_attribute("href"))
last_page = int(urllib.parse.parse_qs(url_parts.query)["tabpg1"][0])

# Collect rows in a plain list and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and per-row append is quadratic.
rows = []
while counter <= last_page:
    driver.refresh()
    post_blocks = driver.find_elements(By.XPATH, "(//div[@class='output bgGrey'])")
    for post_block in post_blocks:
        title = post_block.find_element(By.XPATH, "./div/h3").text  # (//div[@class='output bgGrey'])/div/h3
        sub_title = post_block.find_element(By.XPATH, "./div[3]").text  # (//div[@class='output bgGrey'])/div[3]
        # Abstract and intro divs are not present on every post; skip posts
        # that lack them, matching the original logic.
        try:
            post_abstract = post_block.find_element(By.XPATH, "./div[4]").text  # (//div[@class='output bgGrey'])/div[4]
        except Exception:
            continue
        try:
            post_intro = post_block.find_element(By.XPATH, "./div[5]").text  # (//div[@class='output bgGrey'])/div[5]
        except Exception:
            continue
        post_link = post_block.find_element(By.XPATH, "./parent::a").get_attribute('href')  # (//div[@class='output bgGrey'])/parent::a
        rows.append({'Titles': title, 'SubTitle': sub_title, 'Abstract': post_abstract, 'Intro': post_intro, 'Links': post_link})
    counter += 1
    driver.get(f"{base_url}{counter}")  # navigate straight to the next page by URL

df = pd.DataFrame(rows, columns=['Titles', 'SubTitle', 'Abstract', 'Intro', 'Links'])
df.to_csv('napier_outputs.csv')