Question: Python scraping with requests + Selenium




Hi guys,

I'm having trouble with this dynamic website (https://kvartiry-bolgarii.ru/). I need to collect all the links to the apartment-sale ads.

I use Selenium to load the page and grab the ad links, then I scroll the page down to load new ads. After the new ads load, I parse all the links on the page again and append them to the list. But the data in the list never updates: the script keeps processing the links that were on the page before scrolling down (a note on the likely cause follows my code below).

By the way, I added a check so that the script keeps running until the last ad on the site, whose link I found in advance, shows up in the list.

How can I fix this?

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

options = webdriver.ChromeOptions()

def get_link_info():
    try:
        url = "https://kvartiry-bolgarii.ru/"
        driver = webdriver.Chrome(
            executable_path=r'C:\Users\kk\Desktop\scrape_house\drivers\chromedriver.exe',
            options=options
        )
        driver.get(url)
        # This fetches a separate, static snapshot of the page; it is not
        # connected to the Selenium session above and never changes
        req = requests.get(url)
        req.encoding = 'utf8'
        soup = BeautifulSoup(req.text, "lxml")
        articles = soup.find_all("div", class_="content")
        links_urls = []
        for article in articles:
            house_url = article.find("a").get("href")
            links_urls.append(house_url)
        #print(links_urls)
        first_link_number = links_urls[-2].split("-")[-1]
        first_link_number = first_link_number[1:]
        #print(first_link_number)
        last_link_number = links_urls[-1].split("-")[-1]
        last_link_number = last_link_number[1:]
        #print(last_link_number)
        html = driver.find_element_by_tag_name('html')
        html.send_keys(Keys.END)
        check = "https://kvartiry-bolgarii.ru/kvartira-v-elitnom-komplekse-s-unikalynym-sadom-o21751"
        for a in links_urls:
            if a != check:
                # Re-iterates the same static soup, so no new links are found
                for article in articles:
                    house_url = article.find("a").get("href")
                    links_urls.append(house_url)
                html = driver.find_element_by_tag_name('html')
                html.send_keys(Keys.END)
                print(links_urls[-1])
            else:
                print(links_urls[0], links_urls[-1])
                print("all links are ready")
    except Exception as ex:
        print(ex)

A few pointers. You don't need to mix Selenium, requests, and BeautifulSoup; Selenium alone is enough. With infinite scroll, you need to filter out the elements you have already seen before adding them to the list. You can try this; it should work.

from selenium import webdriver
import time

def get_link_info():
    all_links = []
    try:
        driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
        driver.get('https://kvartiry-bolgarii.ru/')
        time.sleep(3)
        old_links = set()  # elements already processed
        while True:
            # Scroll to trigger loading of more ads
            driver.execute_script("window.scrollBy(0, 3825);")
            # Wait for the new ads to load
            time.sleep(8)
            links_divs = driver.find_elements_by_xpath('//div[@class="content"]//a')
            # Keep only the elements we have not seen yet
            ans = set(links_divs) - old_links
            if not ans:
                break  # nothing new appeared, we reached the end
            for link in ans:
                # Scroll the link into view so its href is rendered
                driver.execute_script("arguments[0].scrollIntoView();", link)
                fir = link.get_attribute('href')
                all_links.append(fir)
            # Remember everything seen so far to skip duplicates next round
            old_links = set(links_divs)
    except Exception as e:
        raise e
    return all_links

get_link_info()
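One caveat: set(links_divs) - old_links compares WebElement objects, which can raise StaleElementReferenceException once the page re-renders, and on Selenium 4+ find_elements_by_xpath is gone in favour of driver.find_elements(By.XPATH, ...). A variant of the same idea that deduplicates on the href strings instead (a sketch, not the answerer's code; the function name and pause parameter are made up for illustration):

from selenium import webdriver
import time

def get_links_by_href(driver, pause=8):
    """Collect unique ad hrefs while infinite-scrolling to the bottom."""
    seen = set()      # href strings already collected
    ordered = []      # preserves first-seen order
    while True:
        # Jump to the current bottom of the page to trigger the next batch
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(pause)  # crude wait for new ads to render
        hrefs = [a.get_attribute('href')
                 for a in driver.find_elements_by_xpath('//div[@class="content"]//a')]
        new = [h for h in hrefs if h and h not in seen]
        if not new:
            return ordered  # nothing new loaded: we reached the end
        seen.update(new)
        ordered.extend(new)

# Usage:
# driver = webdriver.Chrome(executable_path='C:/chromedriver.exe')
# driver.get('https://kvartiry-bolgarii.ru/')
# print(get_links_by_href(driver))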
