Python Selenium - Using a loop to get an element attribute from each website in a list file



How can I use Python Selenium to loop through a list of websites (from an Excel file) and get a value from each one?

For example, a column in the Excel file contains:

https://www.inc.com/profile/dom-&-tom
https://www.inc.com/profile/decksouth
https://www.inc.com/profile/shp-financial
and many more.....

I want to get a specific href attribute from each link.

My code so far:

from selenium import webdriver

browser = webdriver.Chrome()

# First profile page
browser.get("https://www.inc.com/profile/bluestone-staffing")
website_link_anchor = browser.find_element_by_xpath("//dd[@class='website']/a")
actual_website_link = website_link_anchor.get_attribute("href")
print(actual_website_link)

# Second profile page
browser.get("https://www.inc.com/profile/homecity-real-estate")
website_link_anchor = browser.find_element_by_xpath("//dd[@class='website']/a")
actual_website_link = website_link_anchor.get_attribute("href")
print(actual_website_link)

browser.close()

Any advice would be much appreciated.

To loop through the list of websites (from the Excel file) and get a value from each, you need to:

  • Create the list of websites to browse.
  • Then invoke each website and find the required element.
  • Print actual_website_link and loop through again.
  • Always invoke driver.quit() within a tearDown(){} method to close and destroy the WebDriver and web client instances gracefully (a minimal unittest sketch follows the code below).
  • Your sample code would be:

    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    myLinks = ['https://www.inc.com/profile/dom-&-tom', 'https://www.inc.com/profile/decksouth', 'https://www.inc.com/profile/shp-financial']
    options = Options()
    options.add_argument("start-maximized")
    options.add_argument("disable-infobars")
    options.add_argument("--disable-extensions")
    browser = webdriver.Chrome(chrome_options=options, executable_path=r'C:\path\to\chromedriver.exe')
    for link in myLinks:
        browser.get(link)
        website_link_anchor = browser.find_element_by_xpath("//dd[@class='website']/a")
        actual_website_link = website_link_anchor.get_attribute("href")
        print(actual_website_link)
    browser.quit()
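
As mentioned in the bullet point above, a minimal sketch of the tearDown() pattern with Python's unittest module could look like this (the class and test names here are hypothetical, and it keeps the same Selenium 3 API as the code above):

    import unittest
    from selenium import webdriver

    class ProfileScrapeTest(unittest.TestCase):  # hypothetical test class
        def setUp(self):
            # One WebDriver instance per test
            self.browser = webdriver.Chrome()

        def test_website_link(self):
            self.browser.get("https://www.inc.com/profile/decksouth")
            anchor = self.browser.find_element_by_xpath("//dd[@class='website']/a")
            self.assertTrue(anchor.get_attribute("href"))

        def tearDown(self):
            # Always quit here so the WebDriver and browser are destroyed gracefully,
            # even when a test fails
            self.browser.quit()

    if __name__ == '__main__':
        unittest.main()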
    

To read the Excel file, use the xlrd library. In sheet.cell_value(i, 0), i is the row index and 0 is the column index. Change the column index to match your Excel data.

Define a function that does the scraping and returns the value, or appends it to another list if necessary. In your case you are only printing, so I return None.

import xlrd
from selenium import webdriver

def scrapping(browser, links):
    browser.get(links)
    website_link_anchor = browser.find_element_by_xpath("//dd[@class='website']/a")
    actual_website_link = website_link_anchor.get_attribute("href")
    print(actual_website_link)
    return None

driver = webdriver.Chrome()
# Give the location of the file
loc = ("path of file")
# To open Workbook
wb = xlrd.open_workbook(loc)
sheet = wb.sheet_by_index(0)
# links = []

for i in range(1, sheet.nrows):
    scrapping(driver, sheet.cell_value(i, 0))
    # links.append(sheet.cell_value(i, 0))
driver.close()
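
If you prefer the variant hinted at by the commented-out lines, where the links are collected into a list first and the scraped values are kept rather than printed, a sketch under the same assumptions (placeholder file path as above) could look like this:

    import xlrd
    from selenium import webdriver

    def scrapping(browser, link):
        browser.get(link)
        anchor = browser.find_element_by_xpath("//dd[@class='website']/a")
        return anchor.get_attribute("href")

    wb = xlrd.open_workbook("path of file")  # placeholder path, as above
    sheet = wb.sheet_by_index(0)

    # Collect the links first, then scrape, keeping the results in a list
    links = [sheet.cell_value(i, 0) for i in range(1, sheet.nrows)]
    driver = webdriver.Chrome()
    results = [scrapping(driver, link) for link in links]
    driver.close()
    print(results)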

Any suggestions to improve the code?

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.firefox.options import Options
import xlrd
import xlwt
from xlutils.copy import copy

def scraping(browser, link):
    returnValue = ""
    browser.get(link)
    try:
        website_link_anchor = browser.find_element_by_xpath("//dd[@class='website']/a")
        actual_website_link = website_link_anchor.get_attribute("href")
        returnValue = actual_website_link
    except NoSuchElementException:
        returnValue = "Element not found for: " + link
    return returnValue

options = Options()
options.add_argument("--headless")
browser = webdriver.Firefox(firefox_options=options, executable_path=r'C:\WebDrivers\geckodriver.exe')
file_to_read = (r"C:\INC5000list.xlsx")
# read
file_to_read_wb = xlrd.open_workbook(file_to_read)
file_to_read_wb_sheet = file_to_read_wb.sheet_by_index(0)
# copy and write
file_to_write_to_wb = copy(file_to_read_wb)
file_to_write_to_wb_sheet = file_to_write_to_wb.get_sheet(0)

for i in range(1, file_to_read_wb_sheet.nrows):
    result = scraping(browser, file_to_read_wb_sheet.cell_value(i, 0))
    file_to_write_to_wb_sheet.write(i, 1, result)

file_to_write_to_wb.save(r"C:\INC5000list2.xls")
browser.close()
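
Note that the code above targets Selenium 3; in Selenium 4 the find_element_by_* helpers and the firefox_options/executable_path keyword arguments were removed. A minimal sketch of the same element lookup with the Selenium 4 API (driver path unchanged from above) would be:

    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.firefox.options import Options
    from selenium.webdriver.firefox.service import Service

    options = Options()
    options.add_argument("--headless")
    # Selenium 4 takes the driver path via a Service object instead of executable_path
    service = Service(r'C:\WebDrivers\geckodriver.exe')
    browser = webdriver.Firefox(options=options, service=service)

    browser.get("https://www.inc.com/profile/decksouth")
    # find_element(By.XPATH, ...) replaces find_element_by_xpath(...)
    anchor = browser.find_element(By.XPATH, "//dd[@class='website']/a")
    print(anchor.get_attribute("href"))
    browser.quit()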
