在 div 内滚动时跳出 while 循环



抓取谷歌地图评论。当 while 循环到达评论末尾时,我该如何跳出循环?我哪里做错了?

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv

# Scrape the Google Maps reviews of every URL listed in urls.txt and append
# them to one CSV file per place.
driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
header_added = False  # flips to True after the first header and is never reset
with open('urls.txt') as f:
    for url in f:
        driver.get(url)
        driver.maximize_window()
        # Page title: keep only the part before the first "-"
        title = driver.title.split("-")[0]
        old_reviews = set()
        time.sleep(3)
        last_count = 0
        while True:
            # Locate the scrollable reviews panel and jump to its bottom.
            scrollbox_locator = (
                By.XPATH,
                '//div[@class="section-layout section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc"]')
            scroll_div = WebDriverWait(driver, 8).until(
                EC.presence_of_element_located(scrollbox_locator))
            driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scroll_div)

            # Collect every review element currently present in the page.
            review_locator = (By.XPATH, '//div[@class="section-layout"]//div[@aria-label]')
            reviews = WebDriverWait(driver, 5).until(
                EC.presence_of_all_elements_located(review_locator))
            ans = set(reviews) - set(old_reviews)  # only the reviews not seen before
            new_count = len(reviews)

            # Reviews Div
            for div in ans:
                driver.execute_script("arguments[0].scrollIntoView();", div)
                name = div.get_attribute('aria-label')
                photo_url = div.find_element_by_tag_name('a').get_attribute('href')
                rating_span = div.find_element_by_xpath('.//span[@class="ODSEW-ShBeI-H1e3jb"]')
                rating = rating_span.get_attribute('aria-label')

                # Photos attached to the review; any failure yields ''.
                try:
                    gallery = div.find_element_by_xpath('.//div[@class="ODSEW-ShBeI-Jz7rA"]')
                    # Drop the first 5 and last 2 characters of each CSS value.
                    image_links = [
                        button.value_of_css_property('background-image')[5:-2]
                        for button in gallery.find_elements_by_xpath('button')
                    ]
                except:
                    image_links = ''

                # Best effort: click the button inside the review before reading its text.
                try:
                    div.find_element_by_xpath('.//jsl//button').click()
                    time.sleep(1)
                except:
                    pass
                comment = div.find_element_by_xpath('.//span[@class="ODSEW-ShBeI-text"]').text

                row = {
                    'Title': title,
                    "Name": name,
                    "Profile": photo_url,
                    "Rating": rating,
                    "Comment": comment,
                    "Images Posted": image_links,
                }
                csv_path = f'Google_reviews_for_{title}.csv'
                with open(csv_path, 'a+', encoding='utf-8-sig', newline='') as csv_file:
                    writer = csv.DictWriter(csv_file, row.keys())
                    if not header_added:
                        writer.writeheader()
                        header_added = True
                    writer.writerow(row)

                # NOTE: this bookkeeping runs once per review, and the `break`
                # below leaves only this `for` loop - `while True` keeps going.
                old_reviews = reviews
                if last_count == new_count:
                    break
                last_count = new_count

URL:-https://www.google.com/maps/place/El+TabanKo/@42.848117,-2.6741402,19z/data=!4m15!1m7!3m6!1s0xd4fc26e313bc85:0xb10d327c782f87fa!2sCorrer%C3%ADa+Kalea,+45,+001001+Gasteiz,+Araba!3b1!8m2!3d42.8480012!4d-2.6737255!3m6!1s0xd4fc26e26c5be1:0x5f5e0ee05fe08041!8m2!3d42.8481171!4d-2.6735931!9m1!1b1

尝试使用 return 而不是 break。
由于您的 break 位于 for 循环中,它只会让您跳出 for 循环,但您仍然处在 while True: 的无限循环中。
要使用 return,您的代码必须放在函数(方法)中。
更新(UPD):
您没有在内层 for 循环中更新 new_count!!!

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import csv

# Scrape the Google Maps reviews of every URL listed in urls.txt and append
# them to one CSV file per place. Scrolling stops for a place as soon as a
# scroll pass no longer increases the number of review elements.
from selenium.common.exceptions import WebDriverException

SCROLLBOX_XPATH = '//div[@class="section-layout section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc"]'
REVIEW_XPATH = '//div[@class="section-layout"]//div[@aria-label]'
CSV_FIELDS = ['Title', 'Name', 'Profile', 'Rating', 'Comment', 'Images Posted']
# Pause after each scroll so the count taken afterwards is meaningful.
# Presumably the panel loads further reviews lazily - tune or confirm.
SCROLL_PAUSE_SECONDS = 2


def _image_links(div):
    """Return the background-image links of the photo buttons in a review.

    Returns '' when the photo container cannot be read (best effort).
    """
    try:
        image_div = div.find_element(By.XPATH, './/div[@class="ODSEW-ShBeI-Jz7rA"]')
        # Drop the first 5 and last 2 characters of the CSS value
        # (presumably the url("...") wrapper).
        return [button.value_of_css_property('background-image')[5:-2]
                for button in image_div.find_elements(By.XPATH, 'button')]
    except WebDriverException:
        return ''


def _review_row(driver, div, title):
    """Build the CSV row (a dict keyed by CSV_FIELDS) for one review element."""
    driver.execute_script("arguments[0].scrollIntoView();", div)
    name = div.get_attribute('aria-label')
    photo_url = div.find_element(By.TAG_NAME, 'a').get_attribute('href')
    rating = div.find_element(
        By.XPATH, './/span[@class="ODSEW-ShBeI-H1e3jb"]').get_attribute('aria-label')
    image_links = _image_links(div)
    # Best effort: click the button inside the review before reading its text;
    # a review without such a button is simply read as-is.
    try:
        div.find_element(By.XPATH, './/jsl//button').click()
        time.sleep(1)
    except WebDriverException:
        pass
    comment = div.find_element(By.XPATH, './/span[@class="ODSEW-ShBeI-text"]').text
    return {'Title': title, "Name": name, "Profile": photo_url, "Rating": rating,
            "Comment": comment, "Images Posted": image_links}


driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
with open('urls.txt') as f:
    for url in f:
        url = url.strip()  # lines read from the file end with a newline
        if not url:
            continue  # skip blank lines
        driver.get(url)
        driver.maximize_window()
        # Page title: keep only the part before the first "-"
        title = driver.title.split("-")[0]
        time.sleep(3)

        # Open the CSV once per place instead of once per review.
        with open(f'Google_reviews_for_{title}.csv', 'a+', encoding='utf-8-sig',
                  newline='') as csv_file:
            writer = csv.DictWriter(csv_file, CSV_FIELDS)
            if csv_file.tell() == 0:
                # Empty file: every place's CSV gets its own header.
                writer.writeheader()

            seen = set()
            last_count = 0
            while True:
                try:
                    scroll_div = WebDriverWait(driver, 8).until(
                        EC.presence_of_element_located((By.XPATH, SCROLLBOX_XPATH)))
                    driver.execute_script(
                        'arguments[0].scrollTop = arguments[0].scrollHeight', scroll_div)  # Scroll
                    time.sleep(SCROLL_PAUSE_SECONDS)
                    reviews = WebDriverWait(driver, 5).until(
                        EC.presence_of_all_elements_located((By.XPATH, REVIEW_XPATH)))
                except TimeoutException:
                    break  # panel or reviews never appeared: nothing more to read here

                # Only the reviews not handled in an earlier pass, in page order.
                new_reviews = [review for review in reviews if review not in seen]
                for div in new_reviews:
                    writer.writerow(_review_row(driver, div, title))
                seen.update(new_reviews)

                # The end-of-list test belongs to the while loop, not the for
                # loop: compare once per scroll pass so `break` really leaves
                # `while True`.
                new_count = len(reviews)
                if new_count == last_count:
                    break
                last_count = new_count

我猜您想退出while循环:

if last_count == new_count:
break

您现在正在做的是退出for循环,而不是while循环。为了退出while循环,您需要设置一个条件,如下所示:

# Scroll the reviews panel until a pass finds no additional review elements.
# The loop is controlled by a flag, so no `break` is needed to leave it.
exit_condition = False
last_count = 0
old_reviews = set()
while not exit_condition:
    scroll_div = WebDriverWait(driver, 8).until(EC.presence_of_element_located(
        (By.XPATH, '//div[@class="section-layout section-scrollbox cYB2Ge-oHo7ed cYB2Ge-ti6hGc"]')))
    driver.execute_script('arguments[0].scrollTop = arguments[0].scrollHeight', scroll_div)  # Scroll
    reviews = WebDriverWait(driver, 5).until(
        EC.presence_of_all_elements_located((By.XPATH, '//div[@class="section-layout"]//div[@aria-label]')))
    ans = set(reviews) - set(old_reviews)  # Remove duplicate reviews
    new_count = len(reviews)
    # Reviews Div
    for div in ans:
        driver.execute_script("arguments[0].scrollIntoView();", div)
        name = div.get_attribute('aria-label')
        # ... extract and save the remaining fields of the review here ...
    # Bookkeeping happens once per scroll pass, at the while level. Inside the
    # for loop the comparison would run against a last_count that was already
    # updated during the same pass.
    old_reviews = reviews
    if last_count == new_count:
        exit_condition = True  # no new reviews appeared: the while loop ends
    last_count = new_count

最新更新