Selenium Webdriver没有滚动以单击"加载更多"按钮



我正在尝试为新闻标题构建一个边缘档案抓取器,我的主要目标是从给定的月份和年份抓取数据。几天前代码还在工作,滚动正常,但现在无法滚动,每次都会卡住。我正在尝试按CTRL+END的操作链滚动,但它不起作用。我也尝试了其他方法,但没有运气

def scrolling_func(wait,driver):
print("It is trying to scroll")
SCROLL_PAUSE_TIME = 5
last_height = driver.execute_script("return document.body.scrollHeight")
while True:
ActionChains(driver).key_down(Keys.CONTROL).send_keys('END').key_up(Keys.CONTROL).perform()
load_button = driver.find_element_by_css_selector('.p-button')
# driver.execute_script("arguments[0].scrollIntoView();", load_button)
element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.p-button')))
# ActionChains(driver).move_to_element(load_button).click().perform()  
load_button.click()   
time.sleep(SCROLL_PAUSE_TIME)
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
# driver.delete_all_cookies()
time.sleep(1)
ActionChains(driver).key_down(Keys.CONTROL).send_keys('HOME').key_up(Keys.CONTROL).perform()

刮刀是

def scraper(years,months):
PATH = r"C:UsersastarStock market tutorialschromedriver_win64chromedriver.exe"
options = webdriver.ChromeOptions()
options.use_chromium = True

options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
driver = webdriver.Chrome(executable_path=PATH,options=options)
driver.maximize_window()

urls = parse_dates(years,months)
final_headlines = []
final_dates = []
final_links = []
for url in urls:
driver.get(url)
done=True
while done:
try:
wait = WebDriverWait(driver,10)
scrolling_func(wait,driver)
except:
done=False
ActionChains(driver).key_down(Keys.CONTROL).send_keys('HOME').key_up(Keys.CONTROL).perform()
soup = BeautifulSoup(driver.page_source,'lxml')
#https://stackoverflow.com/questions/5041008/how-to-find-elements-by-class
#https://stackoverflow.com/questions/42732958/python-parallel-execution-with-selenium
#https://stackoverflow.com/questions/44245451/how-to-scrape-multiple-html-page-in-parallel-with-beautifulsoup-in-python
#https://stackoverflow.com/questions/45816619/selenium-firefox-webdriver-for-python-keyerror-value
num_articles = soup.find("h1",class_="p-page-title").text
current = num_articles[num_articles.find("for")+4:num_articles.find("(")]
articles_num = num_articles[num_articles.find("(")+1:-1]
titles = soup.find_all("h2",class_="c-entry-box--compact__title")
dates = soup.find_all("time",class_="c-byline__item")
if articles_num != len(titles):
logger.warning("Actual #articles {} and #scraped articles {} for {}".format(articles_num,len(titles),current))
print(len(titles),len(dates))
headlines_results = map(title_extractor,titles)
dates_results = map(date_extractor,dates)
links_results = map(link_extractor,titles)

def list_process(gens):
return [gen for gen in gens]

headlines = list_process(headlines_results)
dates = list_process(dates_results)
links = list_process(links_results)
final_headlines.extend(headlines) 
final_dates.extend(dates) 
final_links.extend(links)
time.sleep(15)
print(len(final_headlines),len(final_dates),len(final_links))    
assert len(final_headlines)==len(final_dates)==len(final_links), f'Different lengths of headlines {len(headlines)} and date {len(dates)}'
data = {"Headlines":final_headlines,"Dates":final_dates,"Links":final_links}
df = pd.DataFrame(data) 
df.to_csv('file1.csv') 
return df 

如果name==">main":刮刀(["2021"]、["3"](

正如我所说,它无法滚动,几天前它工作得很好,但现在它坏了。同样在早些时候,我遇到了无法加载页面的整个列表的问题,因为它被卡住了。有人能帮我吗?提前谢谢。

如果您更喜欢使用css选择器,请尝试以下滚动:

driver.execute_script("arguments[0].scrollIntoView();",driver.find_element_by_css_selector(.your_css_selector))

或者,如果使用xpath:

driver.execute_script("arguments[0].scrollIntoView();",driver.find_element_by_xpath(.your_xpath_selector))

加载更多应该是您需要的定位器元素。

查看此处了解scrollIntoViewmoveToElement之间的区别它们比您现在使用的更可靠。scrollIntoView与moveToElement

最新更新