Iterating a search through many search terms using Selenium Python



I'm trying to run a search on LexisNexis and scrape the results. I need the results from every page, so I want Selenium to perform the search, scrape the data, click Next, and repeat until there are no pages left. On top of that, I want it to run the whole procedure for several terms: for example, search for the term Law, do everything I just described, then search for the term Medicaid, do it all again, and so on.
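To make the flow concrete, here is a rough outline of what I want in Python (run_search, scrape_current_page, and click_next are placeholder names, not functions I actually have):

for term in ['Law', 'Medicaid']:
    run_search(term)            #fill in the advanced-search form and click Search
    while True:
        scrape_current_page()   #collect headlines, dates, and sources
        if not click_next():    #stop once there is no Next link
            break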

Here is my code:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
#from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
#import requests
#import re
import csv
import numpy as np
#import pandas as pd
###############################################################################
#CLICKING AND SEARCH
###############################################################################
browser = webdriver.Firefox(executable_path='/usr/local/bin/geckodriver')
browser.implicitly_wait(5)
#Goes to library website and finds database
browser.get('https://sfx.carli.illinois.edu/sfxuiu?url_ver=Z39.88-2004&url_ctx_fmt=infofi/fmt:kev:mtx:ctx&ctx_enc=info:ofi/enc:UTF-8&ctx_ver=Z39.88-2004&rfr_id=info:sid/sfxit.com:azlist&sfx.ignore_date_threshold=1&rft.object_id=63750000000001351&svc.fulltext=yes')
browser.find_element_by_link_text('LEXIS NEXIS DATABASES').click()
alert = browser.switch_to.alert
alert.accept()
browser.close()
browser.switch_to.window(browser.window_handles[0])
#Login to NexisUni through university library ONLY WHEN NOT ON CAMPUS
browser.find_element_by_id('j_username').send_keys('USERNAME')
browser.find_element_by_id('j_password').send_keys('PASS')
browser.find_element_by_name('_eventId_proceed').click()
#click on advanced search on NexisUni homepage
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[1]/header/div[3]/ul/li[1]/button')))
advancedSearch = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[1]/header/div[3]/ul/li[1]/button')
advancedSearch.click()
#Selecting Specific Content Type
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/button')))
Select_Content = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/button')
Select_Content.click()
#Choose News
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/aside/div[2]/ul[2]/li[2]/button')))
Choose_News = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/header/h2/ul/li/div/aside/div[2]/ul[2]/li[2]/button')
Choose_News.click()
#Type in Search Term
browser.find_element_by_xpath('//*[@id="headline"]').send_keys('Law')
#Type in Publication
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="publication"]')))
Pub = browser.find_element_by_xpath('//*[@id="publication"]')
Pub.send_keys('The Associated Press')
#input date range
select = Select(browser.find_element_by_id('date'))
select.select_by_visible_text('Date is after')
browser.find_element_by_id('dateFrom').send_keys('01/01/1980')
#click on Search
WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.XPATH, '/html/body/main/div[13]/div[2]/div[2]/div/div[1]/footer/span/button[1]')))
Search = browser.find_element_by_xpath('/html/body/main/div[13]/div[2]/div[2]/div/div[1]/footer/span/button[1]')
Search.click()      
###############################################################################
#SCRAPING
###############################################################################
scd = browser.page_source
soup = BeautifulSoup(scd, "lxml")
HEADLINES = soup.findAll('a', attrs={"data-action":"title"})
headlines=[]
for H in HEADLINES:
    headlines.append(H.text.strip())
DETAILS = soup.findAll('div', attrs={"class":"dataInfo translate"})
details = []
for D in DETAILS:
    details.append(D.text.strip())

#Split each details blob on its literal tab/newline runs to pull out date and source
Dates1 = [i.split('\t\t\t\t\t\n\n', 2)[1] for i in details]
Dates = [i.split('\n', 1)[0] for i in Dates1]
Source1 = [i.split('\t\t\t\t\t\n\n', 1)[1] for i in details]
Source = [i.split('\n', 1)[1] for i in Source1]

News = zip(headlines,Dates,Source)
result = "/Users/danashaat/Desktop/data.csv"
with open(result, 'a') as result:
    newswriter = csv.writer(result)
    for row in News:
        newswriter.writerow(row)
#Next Page:
while True:
    #find_elements (plural) returns an empty list instead of raising when the Next link is gone
    Next = browser.find_elements_by_xpath('/html/body/main/main/div[2]/div/div[2]/div[2]/form/div[2]/nav/ol/li[7]/a')
    if len(Next) < 1:
        print("No more pages left")
        break
    else:
        Next[0].click()

It isn't working the way I want, and I'm not sure why :/

The while loop at the end of your code keeps clicking the "Next" button until the "No more pages left" message shows up, but after clicking "Next" it never calls the "SCRAPING" part of the code again. So your code scrapes only the first page, writes that one zipped batch of rows to the CSV, and then just keeps clicking "Next" until it exits.

Maybe you should put the scraping into a function and call it inside the while loop after every click on the "Next" button; a rough sketch of that structure follows.
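This is an untested sketch that reuses the XPaths and parsing from your question; scrape_page and go_to_next_page are names I made up, and your existing login and search code would still run first:

from bs4 import BeautifulSoup
import csv

NEXT_XPATH = '/html/body/main/main/div[2]/div/div[2]/div[2]/form/div[2]/nav/ol/li[7]/a'

def scrape_page(browser, writer):
    #parse the currently loaded results page and write one CSV row per headline
    soup = BeautifulSoup(browser.page_source, "lxml")
    headlines = [h.text.strip() for h in soup.findAll('a', attrs={"data-action": "title"})]
    details = [d.text.strip() for d in soup.findAll('div', attrs={"class": "dataInfo translate"})]
    dates = [d.split('\t\t\t\t\t\n\n', 2)[1].split('\n', 1)[0] for d in details]
    sources = [d.split('\t\t\t\t\t\n\n', 1)[1].split('\n', 1)[1] for d in details]
    for row in zip(headlines, dates, sources):
        writer.writerow(row)

def go_to_next_page(browser):
    #returns True if a Next link existed and was clicked, False otherwise
    links = browser.find_elements_by_xpath(NEXT_XPATH)
    if not links:
        return False
    links[0].click()
    return True

#browser is the logged-in webdriver from your code above
with open('/Users/danashaat/Desktop/data.csv', 'a') as f:
    writer = csv.writer(f)
    while True:
        scrape_page(browser, writer)
        if not go_to_next_page(browser):
            print("No more pages left")
            break

For several search terms, you could also pull the form-filling into its own function and wrap this whole block in a loop over ['Law', 'Medicaid', ...], running one search-and-paginate pass per term.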
