Web scraping with Selenium: moving to the next page



How can I get the following information from this website, also looking at the next pages, which contain more reviews? I would like to use Selenium and a webdriver.

  • <span class="a-profile-name">NAME</span>

  • <i data-hook="review-star-rating" class="a-icon a-icon-star a-star-2 review-rating"><span class="a-icon-alt">2.0 out of 5 stars</span></i>

  • Fell apart after a few months

  • <span data-hook="review-date" class="a-size-base a-color-secondary review-date">Reviewed in the United States on January 23, 2019</span>

  • Review body:

The soles came completely unglued after about 4 months of wear in an office environment. I can't imagine a legitimate pair of Converse sneakers being of such poor quality. I'm not an expert, but I suspect they are fake.

Either way, these shoes are not worth the money.

I would prefer to use Selenium because I can easily move to the next page and store the collected data.

For each of these fields I should have a separate list, containing: author, dates, stars, review's title and review's body. For example:

https://www.amazon.com/Converse-Chuck-Taylor-Star-Core/dp/B07KLM7JRL/ref=sr_1_1?dchild=1&keywords=converse&qid=1596469913&sr=8-1&th=1

has 2,226 reviews with ratings.

Do you think this is feasible with Selenium?

Code (the code has missing parts, and the search section is probably wrong as well):

from bs4 import BeautifulSoup
import time
import re
from selenium import webdriver
# imports required by the explicit wait used below
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

def spider():
    driver = webdriver.Chrome('path/chromedriver')
    driver.get('https://www.amazon.com/Converse-Chuck-Taylor-Star-Core/dp/B07KLM7JRL/ref=sr_1_1?dchild=1&keywords=converse&qid=1596469913&sr=8-1&th=1')  # in th I should add page number info
    time.sleep(1)
    search = driver.find_element_by_name('q')
    time.sleep(2)
    search.submit()

    author = []
    dates = []
    score = []
    review_min = []
    review = []

    while True:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        result_div = soup.find_all('div', attrs={'class': 'g'})
        time.sleep(2)
        for r in result_div:
            # here there should be the part to get info about author, dates, scores, ...
            time.sleep(1)
            # part where I append the scraped results
        next_page_btn = driver.find_elements_by_xpath("//a[@id='pnnext']")
        if len(next_page_btn) < 1:
            print("no more pages left")
            break
        element = WebDriverWait(driver, 100).until(expected_conditions.element_to_be_clickable((By.ID, 'pnnext')))
        driver.execute_script("return arguments[0].scrollIntoView();", element)
        element.click()
        time.sleep(2)

    driver.quit()
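
For reference, the extraction step left open in the comment above could look roughly like the sketch below, run once a review page is loaded. The author, star and date hooks come from the HTML snippets quoted earlier; treating each review as a div with data-hook="review" and the review-title / review-body hooks are assumptions about Amazon's markup and may need adjusting (there is also no None-handling here):

# minimal sketch: fill the separate lists from the loaded review page (some hooks are assumed)
soup = BeautifulSoup(driver.page_source, 'lxml')
for block in soup.find_all('div', attrs={'data-hook': 'review'}):       # per-review container (assumed hook)
    author.append(block.find('span', class_='a-profile-name').get_text(strip=True))
    score.append(block.find('i', attrs={'data-hook': 'review-star-rating'}).get_text(strip=True))  # e.g. "2.0 out of 5 stars"
    dates.append(block.find('span', attrs={'data-hook': 'review-date'}).get_text(strip=True))
    review_min.append(block.find('a', attrs={'data-hook': 'review-title'}).get_text(strip=True))   # title hook is an assumption
    review.append(block.find('span', attrs={'data-hook': 'review-body'}).get_text(strip=True))     # body hook is an assumption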

Your solution needs to be composed of several layers. Each layer is responsible for different actions and behaviors.

First layer

Responsible for navigation and page iteration - repeated for each page.

Second layer

Responsible for items - extracts the review information of a single item, repeated for each item on the page.

This is the trickiest part, because each item has to be opened in a different page (if you use "back", the page refreshes and the data is lost): navigate to the new page, switch to it, extract, close it and switch back - so we are back at point 0, ready for the next item.
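
Roughly, that open / switch / extract / close / switch-back flow looks like the sketch below; item_link is a placeholder for whatever element on the products page links to the item's reviews, not a real locator:

# sketch of the open / switch / extract / close / switch-back flow (element lookup omitted)
link = item_link.get_attribute('href')             # item_link: placeholder element from the products page
driver.execute_script("window.open('about:blank', '_blank');")
driver.switch_to.window(driver.window_handles[1])  # work in the freshly opened tab
driver.get(link)
# ... extract the item's reviews here ...
driver.close()                                     # close the item tab
driver.switch_to.window(driver.window_handles[0])  # back to point 0: the products page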

Third layer

Responsible for reviews - extracts all the reviews of a single item, repeated for every review on the item's page.

Summary

For each page > extract the items; for each item > extract its reviews.

The result will be an array of reviewed items in the following format:

{
    "product": "My Product",
    "link": "https://products/my_product",
    "reviews": [
        { "author": "foo", "date": "0000-000"... },
        { "author": "bar", "date": "0000-000"... },
        ...
    ]
}

Code example

This will be your starting point, and you can implement the missing parts from it. It extracts the reviews of all the items on a single page.

The example runs as is; you only need to change the driver path.

import re
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.wait import WebDriverWait

def spider(page_number: int):
    # setup: web driver > wait object > url format > page number
    driver = webdriver.Chrome(r'D:\automation-env\web-drivers\chromedriver.exe')
    wait = WebDriverWait(driver, 15)
    url_format = \
        "https://www.amazon.com/Converse-Chuck-Taylor-Star-Core/dp/B07KLM7JRL/" \
        "ref=sr_1_1?" \
        "dchild=1&" \
        "keywords=converse&" \
        "qid=1596469913&" \
        "sr=8-1&" \
        "th={page_number}"
    try:
        # navigate
        driver.get(url_format.format(page_number=page_number))
        driver.maximize_window()
        # search your product
        __search(driver_wait=wait, search_for='converse')
        # cache items
        rate_locator = (By.XPATH, "//i[contains(@class,'a-star-small-')]")
        items = wait.until(expected_conditions.visibility_of_all_elements_located(rate_locator))
        # product cycle
        reviews = []
        for i in range(len(items)):
            reviews.append(__product_cycle(on_driver=driver, on_element=items[i], on_element_index=i + 1))
        # output
        print(reviews)
    except Exception as e:
        print(e)
    finally:
        if driver is not None:
            driver.quit()

# execute search for a product
def __search(driver_wait: WebDriverWait, search_for: str):
    # search
    search = driver_wait.until(expected_conditions.element_to_be_clickable((By.ID, 'twotabsearchtextbox')))
    search.clear()
    search.send_keys(search_for)
    search.submit()

# execute an extraction on a single item in the products list
# you can add more logic to extract the rest of the review
def __product_cycle(on_driver, on_element, on_element_index):
    # hover the review element
    ActionChains(driver=on_driver).move_to_element(on_element).perform()
    # open reviews in a new page (the index is here to handle amazon keeping in the DOM all reviews already inspected)
    wait = WebDriverWait(on_driver, 15)
    link_element_locator = (By.XPATH, "(//a[.='See all customer reviews'])[" + f'{on_element_index}' + "]")
    link_element = wait.until(expected_conditions.element_to_be_clickable(link_element_locator))
    link = link_element.get_attribute(name='href')
    on_driver.execute_script(script="window.open('about:blank', '_blank');")
    on_driver.switch_to_window(on_driver.window_handles[1])
    on_driver.get(link)
    # cache review elements
    review_locator = (By.XPATH, "//div[contains(@id,'customer_review-')]")
    review_elements = wait.until(expected_conditions.visibility_of_all_elements_located(review_locator))
    # extract reviews for the page
    # if you want to iterate pages, put this inside a page iteration loop
    reviews = {
        "product": on_driver.title,
        "link": on_driver.current_url,
        "data": []
    }
    for e in review_elements:
        reviews["data"].append(__get_item_review(on_driver, e))
    # return to point 0
    on_driver.close()
    on_driver.switch_to_window(on_driver.window_handles[0])
    # results
    return reviews

# extracts a single review into a dictionary
def __get_item_review(on_driver, on_element) -> dict:
    # locators
    author_locator = ".//span[@class='a-profile-name']"
    date_locator = ".//span[@data-hook='review-date']"
    score_locator = ".//a[.//i[@data-hook='review-star-rating']]"
    review_locator = ".//div[@data-hook='review-collapsed']/span"
    # data
    review_data = {
        'author': on_element.find_element_by_xpath(author_locator).text.strip(),
        'date': re.findall('(?<=on ).*', on_element.find_element_by_xpath(date_locator).text.strip())[0],
        'score': re.findall(r'\d+\.\d+', on_element.find_element_by_xpath(score_locator).get_attribute("title"))[0],
        'review': on_element.find_element_by_xpath(review_locator).text.strip(),
    }
    # TODO: add more logic to get also the hidden reviews for this item.
    # results data
    return review_data

spider(page_number=1)
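
The example above extracts only the reviews on the first review page of each item (see the comment inside __product_cycle). A minimal sketch of the page-iteration loop, assuming Amazon's review paginator keeps its "Next page" link inside li.a-last (a selector you should verify against the live page), would replace the single extraction pass inside __product_cycle:

# sketch: iterate review pages until no "Next page" link is left (li.a-last is an assumed selector)
while True:
    review_elements = wait.until(expected_conditions.visibility_of_all_elements_located(review_locator))
    for e in review_elements:
        reviews["data"].append(__get_item_review(on_driver, e))
    next_links = on_driver.find_elements_by_css_selector("li.a-last a")
    if not next_links:
        break  # last review page reached
    next_links[0].click()
    wait.until(expected_conditions.staleness_of(review_elements[0]))  # wait for the old page to go stale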
