如何在 Selenium Python 中动态地逐个获取 URL?



我是 Selenium Python 的新手。我想在谷歌上搜索一个关键字并打开它,在结果部分,我想单击第一个网址并获取数据,然后返回单击第二个链接并获取数据……依此类推,直到 10 个 URL。我已经在下面的代码中使用 XPath 完成了它,但我想动态地完成它,而不为每个链接编写特定的 XPath。 附言 - 我尝试过使用 for 循环,但我无法做到。 总而言之,我想在不指定特定 XPath 的情况下获取以下代码的结果,而是动态获取任何关键字的 URL。

# Search Google for "selenium", then visit each of the first 10 result links
# in turn, printing the text of every <p> element on each page before
# navigating back to the results.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

# NOTE(review): the original path "E:Sahilselenium..." lost its backslashes
# when pasted; a raw string keeps the separators intact -- confirm the real path.
driver = webdriver.Chrome(executable_path=r"E:\Sahil\selenium\chromedriver\chromedriver.exe")
driver.get("https://www.google.com/")
print(driver.title)
driver.maximize_window()
time.sleep(2)

# Type the query into the search box and submit via the search button.
driver.find_element(By.XPATH, "//input[@name='q']").send_keys('selenium')
driver.find_element(By.XPATH, "//div[@class='FPdoLc tfB0Bf']//input[@name='btnK']").send_keys(Keys.ENTER)


def _print_paragraphs():
    # Print the text of every <p> element on the current page.
    for paragraph in driver.find_elements(By.TAG_NAME, "p"):
        print(paragraph.text)


# The nine result links on page 1, identified by their (partial) anchor text.
# One loop replaces the nine hand-copied click/scrape/back sequences.
PAGE_ONE_LINK_TEXTS = [
    "Selenium Web Driver",
    "The Selenium Browser Automation Project :: Documentation ...",
    "Selenium Tutorial for Beginners: Learn WebDriver in 7 Days",
    "Selenium with Python — Selenium Python Bindings 2 ...",
    "Selenium: Definition, How it works and Why you need it ...",
    "selenium · PyPI",
    "Selenium (software) - Wikipedia",
    "Selenium: Health benefits, sources, and potential risks",
    "SeleniumHQ/selenium: A browser automation ... - GitHub",
]

for link_text in PAGE_ONE_LINK_TEXTS:
    driver.find_element(By.PARTIAL_LINK_TEXT, link_text).click()
    _print_paragraphs()
    driver.back()

# Tenth link lives on the second page of results.
driver.find_element(By.LINK_TEXT, "2").click()
driver.find_element(By.PARTIAL_LINK_TEXT, "Selenium - Testing Framework | Sauce Labs").click()
_print_paragraphs()
driver.back()
driver.close()

试试下面的代码:

# Search Google for "selenium", dynamically collect every result URL on the
# page, then visit each URL and keep its full page source in link_data.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome("chromedriver.exe")
driver.get("https://www.google.com/")
print(driver.title)
driver.maximize_window()
time.sleep(2)

driver.find_element(By.XPATH, "//input[@name='q']").send_keys('selenium')
driver.find_element(By.XPATH, "//div[@class='FPdoLc tfB0Bf']//input[@name='btnK']").send_keys(Keys.ENTER)

# Each organic result is an <a> inside <div class='r'>; grab every href.
# Use find_elements(By.XPATH, ...) for consistency with the calls above:
# the find_elements_by_xpath shortcut is deprecated (removed in Selenium 4).
anchors = driver.find_elements(By.XPATH, "//div[@class='r']/a")
links = [anchor.get_attribute('href') for anchor in anchors]

link_data = []  # page source of every visited result, in result order
for new_url in links:
    print('new url : ', new_url)
    driver.get(new_url)
    link_data.append(driver.page_source)
    # NOTE(review): no driver.back() needed -- the next iteration navigates
    # directly with driver.get(), so going back is a wasted page load.

# e.g. len(link_data) is the number of results; link_data[0] is the first
# page's HTML source.

此代码从所有链接中获取所有数据并保存在link_data列表中。

对于 p 标签,您可以使用以下代码:

from bs4 import BeautifulSoup

# Parse the first stored page source and list every <p> tag found in it.
soup = BeautifulSoup(link_data[0], 'html.parser')
paragraphs = soup.find_all('p')
print(paragraphs)

最新更新