如何从包含分页的网站中提取链接?(使用 Selenium)



我想从以下站点提取链接,但该站点的结果是分页显示的。我想提取每个"MORE INFO"(更多信息)按钮下的链接。

我正在使用以下代码片段:

import time
import requests
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import re

# Facility-listing search page for Palo Alto, CA that the rest of the script
# scrapes.
SEARCH_URL = 'https://www.usta.com/en/home/play/facility-listing.html?searchTerm=&distance=5000000000&address=Palo%20Alto,%20%20CA'

# Start a Chrome session, pause for five seconds, then load the search page.
browser = webdriver.Chrome()
time.sleep(5)
browser.get(SEARCH_URL)

# Explicit-wait helper bound to this session with a 15-second timeout.
wait = WebDriverWait(browser, 15)
def extract_data(browser):
    """Return the ``href`` of every link located under a ``seeMoreBtn`` div.

    Args:
        browser: object exposing ``find_elements_by_xpath`` (a Selenium
            WebDriver in this script), positioned on the page to read.

    Returns:
        A list holding the ``href`` attribute value of each matched ``<a>``
        element, in the order the lookup returned them; empty when nothing
        matched.
    """
    # NOTE(review): the find_elements_by_* helpers were removed in
    # Selenium 4.3 -- on newer versions use
    # browser.find_elements(By.XPATH, ...) instead.
    links = browser.find_elements_by_xpath("//div[@class='seeMoreBtn']/a")
    return [link.get_attribute('href') for link in links]

# Wait (up to 10 s) for the "next page" chevron link to be present in the DOM.
element = WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, "//a[@class='glyphicon glyphicon-chevron-right']")))
# NOTE: this is the statement that fails with the AttributeError quoted in the
# traceback: re.search() returns None when the pattern does not match
# element.text, and None has no .group().  Two things to review here:
#   * the pattern has no backslashes, so 'd+' matches literal "d" characters
#     rather than digits (r'\d+ de (\d+)' would match digits);
#   * re.UNICODE is passed as the second argument of int() (the base), not as
#     the flags argument of re.search().
# The statement is left exactly as posted so it still reproduces that error.
max_pages = int(re.search(r'd+ de (d+)', element.text).group(1), re.UNICODE)
# extract from the current (1) page
print("Page 1")
print(extract_data(browser))
# Pages 2..max_pages: click the chevron, then print the links of that page.
for page in range(2, max_pages + 1):
    print("Page %d" % page)
    # click() returns None, so its result is not kept.
    browser.find_element_by_xpath("//a[@class='glyphicon glyphicon-chevron-right']").click()
    print(extract_data(browser))
    print("-----")

当我运行上面的脚本时,我收到以下错误(我不太熟悉正则表达式,只是在摸索这个概念):

Traceback (most recent call last):
File "E:/Python/CSV/testingtesting.py", line 29, in <module>
max_pages = int(re.search(r'd+ de (d+)', element.text).group(1), re.UNICODE)
AttributeError: 'NoneType' object has no attribute 'group'

如果可能的话,请给我一个解决方案。后来我通过加入等待并逐页点击分页链接,勉强实现了链接的提取,但这样会增加近 13 秒的等待时间。可以运行的代码如下:

import time
import requests
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import re

# ----------------------------------------------HANDLING-SELENIUM-STUFF-------------------------------------------------
# Walks the paginated listing with fixed sleeps and appends every
# "see more" link to test.csv, one href per row.
linkList = []  # never populated: links go straight to the CSV file
driver = webdriver.Chrome()
time.sleep(5)
driver.get('https://www.usta.com/en/home/play/facility-listing.html?searchTerm=&distance=5000000000&address=Palo%20Alto,%20%20CA')
wait = WebDriverWait(driver,8)
time.sleep(7)
# The page count is hard-coded: 2924 iterations, each followed by a click on
# the "next page" chevron.  The fixed sleeps add up to 13 s per iteration
# (3 + 3 + 1 + 6).
for i in range(1,2925):
    time.sleep(3)
    links = driver.find_elements_by_xpath("//div[@class='seeMoreBtn']/a")
    time.sleep(3)
    # Append this page's links to the CSV; the file is opened once per page
    # rather than once per link.
    with open('test.csv','a',encoding='utf-8',newline='') as fp:
        writer = csv.writer(fp, delimiter=',')
        for link in links:
            writer.writerow([link.get_attribute("href")])
    time.sleep(1)
    # Move on to the next page of results.
    driver.find_element_by_xpath("//a[@class='glyphicon glyphicon-chevron-right']").click()
    time.sleep(6)

请尝试以下代码,无需额外的 sleep(休眠)调用即可获取所需的数据:

import requests
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# ----------------------------------------------HANDLING-SELENIUM-STUFF-------------------------------------------------
# Collects the href of every "MORE INFO" link across all result pages using
# explicit waits only (no fixed sleeps).
driver = webdriver.Chrome()
links = []
try:
    driver.get('https://www.usta.com/en/home/play/facility-listing.html?searchTerm=&distance=5000000000&address=Palo%20Alto,%20%20CA')
    wait = WebDriverWait(driver, 8)
    while True:
        # Block until the current page's "MORE INFO" links are visible.
        new_links = wait.until(EC.visibility_of_all_elements_located((By.LINK_TEXT, "MORE INFO")))
        links.extend([link.get_attribute("href") for link in new_links])
        try:
            next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "li[title='Next page']>a")))
            next_button.click()
        except TimeoutException:
            # No clickable "Next page" button within the timeout: stop paging.
            break
        # Wait for the old links to go stale so the next iteration reads the
        # new page instead of re-reading the same elements.
        wait.until(EC.staleness_of(new_links[-1]))
finally:
    # Close the browser even if a wait above times out; `links` holds plain
    # href values and stays usable afterwards.
    driver.quit()
#  Do whatever you need with links

相关内容

  • 没有找到相关文章

最新更新