BeautifulSoup没有返回网站的完整HTML



我正试图刮一个名为autocheck的网站为他们列出的所有汽车出售。制作完汤之后,我创建了汤的两个子列表来迭代获取我需要放入DataFrame的信息,但由于某种原因,汤只返回了前8辆车。我认为这与必须滚动页面加载数据有关,但我不确定。你能帮忙吗?

from bs4 import BeautifulSoup
from requests import get
import pandas as pd
import selenium
from selenium import webdriver # conda install selenium
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Open the listing page in Chrome and click through the paging control
# so the browser navigates to the desired results page.
url = 'https://autochek.africa/ng/cars-for-sale'
driver = webdriver.Chrome()
driver.get(url)
driver.maximize_window()
time.sleep(2)

# Selenium 4 removed the find_element_by_xpath helpers;
# use find_element(By.XPATH, ...) instead (By is imported above).
xpath_search = r'//*[@id="__next"]/div/div[2]/div/div/div[2]/div[3]/div[2]/div[3]/div/div'
element = driver.find_element(By.XPATH, xpath_search)
element.click()
element_2 = driver.find_element(By.XPATH, r'//*[@id="__next"]/div/div[2]/div/div/div[2]/div[3]/div[2]/div[3]/div/ul/li[4]')
element_2.click()
time.sleep(10)
new_url = driver.current_url
print(new_url)

# NOTE(review): the header value must be ONE string — the original split it
# across raw lines, which is a SyntaxError.  Adjacent literals concatenate.
# Also note that re-requesting the URL with `requests` only returns the
# server-rendered HTML (the first batch of cars); anything the page loads
# while scrolling is not included — that is why only 8 cars come back.
headers = {'User-Agent':
           'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit'
           '/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
url = new_url
response = get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

# Each car card sits in a 'car-grid-container' div; collect the link
# anchors and the detail blocks for every container found.
tag = 'div'
attributes = {'class': 'car-grid-container'}
content_list = soup.find_all(tag, attributes)
basic_info = []
for item in content_list:
    basic_info.append(item.find_all('a', {'class': 'hover:tw-shadow-md'}))
text_info = []
for item in content_list:
    text_info.append(item.find_all('div', {'class': 'other-details tw-flex tw-flex-row tw-justify-between tw-items-stretch'}))

您必须向下滚动到页面底部,才能加载并访问全部容器(共 100 个)。每当有新内容加载完成后,可以使用下面的函数继续向下滚动:

def scroll_down(driver, pause=4.0):
    """Scroll a lazily-loading page to the bottom until its height stops growing.

    Parameters
    ----------
    driver : selenium webdriver (any object exposing ``execute_script``)
        The browser session to scroll.
    pause : float, optional
        Seconds to wait after each scroll so newly loaded content can
        extend the page before the height is re-measured (default 4.0,
        matching the original hard-coded wait).
    """
    get_scroll_height_command = ("return (document.documentElement || document.body).scrollHeight;")
    scroll_to_command = "scrollTo(0, {});"

    # Set y origin and grab the initial scroll height
    y_position = 0
    scroll_height = driver.execute_script(get_scroll_height_command)

    while y_position != scroll_height:
        y_position = scroll_height
        driver.execute_script(scroll_to_command.format(scroll_height))

        # Page needs to load yet again otherwise the scroll height matches
        # the y position and it breaks out of the loop
        time.sleep(pause)
        scroll_height = driver.execute_script(get_scroll_height_command)

使用位置:

# Give the page time to settle, scroll until all listings have loaded,
# then capture whatever URL the browser ended up on.
time.sleep(10)
scroll_down(driver)
new_url = driver.current_url

这里不需要Selenium。直接从数据源获取数据:

import requests
import pandas as pd

# Pull the listings straight from the site's Next.js data endpoint instead
# of scraping rendered HTML.  NOTE(review): the long hex segment is a
# Next.js build id that changes on every deploy, so this URL can go stale.
url = 'https://autochek.africa/_next/data/78f0e3518ea1a34c27af75e4181bc80d2c112ebf/ng/cars-for-sale.json'

# Start with an oversized sentinel; the first response overwrites it with
# the value the API reports, which then bounds the paging loop.
pageSize = 9999
rows = []
for page in range(1, 100):
    if page > pageSize:
        break
    print(page)
    payload = {
        'country': 'ng',
        'page_number': '%s' % page,
    }

    jsonData = requests.get(url, params=payload).json()
    pageSize = jsonData['pageProps']['pagination']['pageSize']

    # Accumulate this page's car records; one dict per car.
    rows += jsonData['pageProps']['cars']

df = pd.DataFrame(rows)

输出:

print(df)
id                 title  ... hasCleanTitle  soldDate
0    -e4h58sQe          Toyota Camry  ...          True       NaN
1    SWbyrtv5P          Toyota Venza  ...         False       NaN
2    R9_sIKnJy     Toyota Highlander  ...           NaN       NaN
3    4DAiUe1oC          Toyota Camry  ...          True       NaN
4    ECIkVk1aJ          Toyota Camry  ...          True       NaN
..         ...                   ...  ...           ...       ...
571  l7fFI5Q3w          Toyota Camry  ...          True       NaN
572  GquAWYVWu          Toyota Camry  ...          True       NaN
573  SVkuXCLiV         Toyota Matrix  ...           NaN       NaN
574  2O0OKpL_d        Toyota Corolla  ...          True       NaN
575  lp2PtU1oO  Mercedes-Benz ML 350  ...           NaN       NaN
[576 rows x 24 columns]

最新更新