I'm trying to fix a data scraper that worked perfectly until a few weeks ago. The script consists of two parts: one retrieves the links to the articles, and the other scrapes the specifications of those articles. The site is https://www.fravega.com/. The problem I'm having is in the first part, where the script gets the total number of pages on the site and uses it as the iterator for the link-retrieval loop. Here is the code:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm


def get_links(postal_code, section):
    '''FUNCTION THAT RETURNS A LIST OF ARTICLE LINKS ACCORDING TO THE POSTAL CODE, THE SECTION,
    THE NUMBER OF PAGES AND THE WEBSITE THAT CONTAINS THEM'''
    chrome_options = webdriver.ChromeOptions()
    # chrome_options.add_argument('--headless')
    # chrome_options.add_argument('--no-sandbox')
    # chrome_options.add_argument('--disable-dev-shm-usage')
    # chrome_options.add_argument("start-maximized")
    # chrome_options.add_argument("enable-automation")
    # chrome_options.add_argument("--disable-infobars")
    # chrome_options.add_argument("--disable-browser-side-navigation")
    # chrome_options.add_argument("--disable-gpu")
    driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)
    wait = WebDriverWait(driver, 60)
    driver.get("https://www.fravega.com/")
    element = wait.until(EC.visibility_of_element_located((By.ID, "header-geo-location-form-postal-number")))  # Get the postal code field
    element.send_keys(postal_code)  # Type the postal code
    wait.until(lambda driver: element.get_attribute('value') == postal_code)
    element.submit()  # Submit the form
    wait.until(EC.presence_of_element_located((By.LINK_TEXT, section))).click()  # Find the section text and click it
    wait.until(EC.visibility_of_element_located((By.XPATH, "//li[contains(@class,'sc-efd39989-1 laPjUm')]")))
    # wait.until(EC.visibility_of_element_located((By.ID,'pagination-next-button')))
    # wait.until(EC.presence_of_element_located((By.CLASS_NAME,'sc-efd39989-0 gUwmHE')))
    pags = int(driver.find_elements_by_xpath("//li[contains(@class,'sc-efd39989-1 laPjUm')]/a")[-1].text)
    # pags = int(wait.until(EC.visibility_of_element_located((By.XPATH, "//li[contains(@class,'sc-efd39989-1 laPjUm')]"))).text)
    print(pags)
    links = []
    for n in tqdm(range(1, pags + 1), initial=1):
        wait.until(EC.visibility_of_element_located((By.XPATH, "//ul[contains(@class,'sc-e1732e90-0 fJzBdi')]")))
        events = driver.find_elements_by_xpath("//ul[contains(@class,'sc-e1732e90-0 fJzBdi')]/li")  # Find the elements in the article grid
        for event in events:
            link = event.find_element_by_tag_name('a').get_attribute('href')  # Get the link of each article
            links.append(link)  # Append it to the list
        try:
            driver.find_element_by_link_text('Siguiente >').click()  # Click on "Siguiente" (next)
        except NoSuchElementException:  # If there is no such element, break out of the loop
            break
    driver.close()
    driver.quit()  # Close the browser
    return links
The line where Selenium times out is:
wait.until(EC.visibility_of_element_located((By.XPATH, "//li[contains(@class,'sc-efd39989-1 laPjUm')]")))
As you can see from the commented-out lines, I've already tried other approaches, but so far everything I've tried makes Selenium time out. I'd like to know whether there is some way to make this work, or whether there is another way to get the total number of pages. Thanks in advance.
The reason the script keeps timing out is simple: the links to the next pages only load once you scroll down to the bottom of the page. That is probably also why the script stopped working.
So you basically have to wait first until the page containing the items has loaded. I wait for the filter form via its style attribute, but you can use any other element (or even a simple sleep):
# wait until page loads. You can wait for a specific element here
filter_form_selector = '[style="grid-area:filters-form"]'
wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, filter_form_selector)))
Then, scroll to the very bottom of the page. You could also scroll more naturally by pressing Page Down several times in a loop; a sketch of that follows the snippet below.
# scroll to the bottom of the page
driver.execute_script('window.scrollTo(0, 1000000000);')
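For the more gradual scrolling mentioned above, a minimal sketch could send Page Down keystrokes to the page body; the press count and delay are arbitrary placeholders, not values tuned for this site:

import time
from selenium.webdriver.common.keys import Keys

# Alternative: scroll step by step by sending PAGE_DOWN to the body element.
# 15 presses and a 0.3 s pause are placeholder values.
body = driver.find_element(By.TAG_NAME, 'body')
for _ in range(15):
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.3)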
Then wait for the page links to appear:
page_count_css_selector = '[data-type=page]'
wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, page_count_css_selector)))
Finally, retrieve the last page button and read its text to get the total number of pages:
pages_elements = driver.find_elements(By.CSS_SELECTOR, page_count_css_selector)
last_page = pages_elements[len(pages_elements) - 1]
pages = int(last_page.text)
print(pages)
Here is the full working code:
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def get_links(postal_code, section):
    '''FUNCTION THAT RETURNS A LIST OF ARTICLE LINKS ACCORDING TO THE POSTAL CODE, THE SECTION,
    THE NUMBER OF PAGES AND THE WEBSITE THAT CONTAINS THEM'''
    chromedriver_autoinstaller.install()
    chrome_options = Options()
    chrome_options.headless = False
    driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)
    wait = WebDriverWait(driver, 60)
    driver.get("https://www.fravega.com/")
    element = wait.until(
        EC.visibility_of_element_located((By.ID, "header-geo-location-form-postal-number")))  # Get the postal code field
    element.send_keys(postal_code)  # Type the postal code
    wait.until(lambda driver: element.get_attribute('value') == postal_code)
    element.submit()  # Submit the form
    wait.until(
        EC.presence_of_element_located((By.LINK_TEXT, section))).click()  # Find the section text and click it
    # wait until page loads. You can wait for a specific element here
    filter_form_selector = '[style="grid-area:filters-form"]'
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, filter_form_selector)))
    # scroll to the bottom of the page
    driver.execute_script('window.scrollTo(0, 1000000000);')
    page_count_css_selector = '[data-type=page]'
    wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, page_count_css_selector)))
    pages_elements = driver.find_elements(By.CSS_SELECTOR, page_count_css_selector)
    last_page = pages_elements[len(pages_elements) - 1]
    pages = int(last_page.text)
    print(pages)
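From here the page count can drive the same pagination loop as in your original function. A minimal sketch of that continuation, reusing the question's grid XPath and the 'Siguiente >' link text (both may be stale if the site's class names changed again; NoSuchElementException comes from selenium.common.exceptions):

    # Sketch only: continues inside get_links() after print(pages).
    links = []
    for n in range(1, pages + 1):
        wait.until(EC.visibility_of_element_located(
            (By.XPATH, "//ul[contains(@class,'sc-e1732e90-0 fJzBdi')]")))
        items = driver.find_elements(By.XPATH, "//ul[contains(@class,'sc-e1732e90-0 fJzBdi')]/li")
        for item in items:
            links.append(item.find_element(By.TAG_NAME, 'a').get_attribute('href'))
        try:
            driver.find_element(By.LINK_TEXT, 'Siguiente >').click()  # go to the next page
        except NoSuchElementException:  # no "next" link on the last page
            break
    driver.quit()
    return links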
I would also recommend setting up a proxy for Selenium to hide your IP, since the store you're trying to scrape may eventually block your IP from accessing its data once your scraping attempts become obvious to them.
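As a minimal sketch, Chrome can be routed through a proxy with the --proxy-server argument; the address below is a documentation placeholder, not a real proxy:

# Hypothetical example: route Chrome traffic through an HTTP proxy.
# 203.0.113.10:8080 is a placeholder address; substitute a proxy you control.
chrome_options = Options()
chrome_options.add_argument('--proxy-server=http://203.0.113.10:8080')
driver = webdriver.Chrome('chromedriver', chrome_options=chrome_options)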
Hope this helps!