Python Selenium Crawler进入元素并获取细节



我试图从以下网站获得所有属性的详细信息,该网站将属性列为元素:

https://www.altamirarealestate.com.cy/results/for-sale/flats/cyprus/35p009679979327046l33p17435142059772z9

我在Python中使用Selenium来抓取元素的详细信息,但是一旦我转到元素,我就无法单击其链接将其打开到新页面并获得必要的信息。下列代码:

from selenium.webdriver.common.keys import Keys
import webbrowser
import random
import time
import selenium.webdriver.support.ui as ui
from selenium.webdriver.support.wait import WebDriverWait 
from selenium.webdriver.support.select import Select
import csv
from csv import writer
from selenium.common.exceptions import ElementNotVisibleException, WebDriverException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
Link = 'https://www.altamirarealestate.com.cy/results/for-sale/flats/cyprus/35p009679979327046l33p17435142059772z9'
# MAIN
driver = webdriver.Chrome()
driver.maximize_window()

#Go to link
driver.get(Link)
#Accept cookies
time.sleep(2)
driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]').click()
time.sleep(2)

#Load everything
while True:
try:
driver.find_element_by_xpath("//*[contains(@value,'View more')]").click()
time.sleep(3)
except Exception as no_more_properties:
print('all properties expanded: ', no_more_properties)
break
#Get properties
properties_list=driver.find_elements_by_xpath('//*[@class="minificha   "]')
print (len(properties_list))#25
time.sleep(2)
#Get each property link
property_url=set()
properties_details=[]
main_window_handle = driver.current_window_handle
for i in range(0,len(properties_list)):
driver.switch_to_window(main_window_handle)
property = properties_list[i]
property_link = property.find_element_by_xpath('//a[@href="'+url+'"]')
property_link.click()
time.sleep(2)
#Switch to property window
window_after = driver.window_handles[1]
driver.switch_to.window(window_after)
#Get number of properties
number_of_flats=driver.find_elements_by_xpath('//[@class="lineainmu "]')
print(len(number_of_flats))
time.sleep(2)
currentWindow = driver.current_window_handle
for j in range(0,len(number_of_flats)):
driver.switch_to_window(currentWindow)
flat= number_of_flats[j]
flat.click()
time.sleep(2)

#Switch to flat window
window_after = driver.window_handles[1]
driver.switch_to.window(window_after)

当我们点击第一页的链接时,它将打开一个新标签页。在selenium中,在这种情况下,我们应该将焦点切换到新窗口,然后我们可以在新打开的页面上与web元素进行交互。

任务完成后,关闭标签页,然后切换回原始内容是很重要的。

如果不在循环中重新定义web元素,可能会导致元素引用失效。

代码:

driver = webdriver.Chrome(driver_path)
driver.maximize_window()
driver.implicitly_wait(30)
wait = WebDriverWait(driver, 30)
driver.get("https://www.altamirarealestate.com.cy/results/for-sale/flats/cyprus/35p009679979327046l33p17435142059772z9")
try:
wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "button#onetrust-accept-btn-handler"))).click()
except:
pass
size = driver.find_elements(By.XPATH, "//div[@class='slick-list draggable']")
j = 1
org_windows_handle = driver.current_window_handle
for i in range(len(size)):
ele = driver.find_element(By.XPATH, f"(//div[@class='slick-list draggable'])[{j}]")
driver.execute_script("arguments[0].scrollIntoView(true);", ele)
ele.click()
all_handles = driver.window_handles
driver.switch_to.window(all_handles[1])
try:
name = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "p#tituloFiltroTipo"))).text
print(name)
except:
pass
try:
price = wait.until(EC.visibility_of_element_located((By.ID, "soloPrecio"))).text
print(price)
except:
pass
driver.close()
driver.switch_to.window(org_windows_handle)
j = j + 1

进口:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

输出:

Flats - Egkomi, Nicosia
310,000
Flat - Strovolos, Nicosia
115,000
Flat - Agios Dometios, Nicosia
185,000
Flats - Aglantzia, Nicosia
765,000
Flat - Kaimakli, Nicosia
170,000
Flat - Kaimakli, Nicosia
280,000
Flat - Kaimakli, Nicosia
130,000
Flat - Germasogia, Limassol
410,000
Flat - Germasogeia, Limassol
285,000
Flat - Petrou & Pavlou, Limassol
230,000

不建议将隐式与显式混合使用。但是在像这样的少数情况下,我们使用find_element和显式等待,不会造成任何伤害。请注释隐含的等待行,并运行代码。如果失败,请取消注释,然后再试一次。

最新更新