我正试图爬取一个名为 autochek 的网站,获取他们列出的所有在售汽车。创建 BeautifulSoup 对象(soup)之后,我从 soup 中提取了两个子列表,用来迭代获取需要放入 DataFrame 的信息,但不知为何 soup 只返回了前 8 辆车。我猜这与必须滚动页面才能加载更多数据有关,但我不确定。你能帮忙吗?
from bs4 import BeautifulSoup
from requests import get
import pandas as pd
import selenium
from selenium import webdriver # conda install selenium
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
# Open the listings page, use Selenium to switch the page-size dropdown to a
# larger value, then fetch and parse the resulting page with BeautifulSoup.
url = 'https://autochek.africa/ng/cars-for-sale'
driver = webdriver.Chrome()
driver.get(url)
driver.maximize_window()
time.sleep(2)

# Open the "results per page" dropdown and pick its 4th option.
# NOTE(review): absolute XPaths like these are brittle and break whenever the
# page layout changes — confirm they still match the live page.
# Selenium 4 removed the find_element_by_* helpers; use find_element(By.XPATH, ...).
xpath_search = r'//*[@id="__next"]/div/div[2]/div/div/div[2]/div[3]/div[2]/div[3]/div/div'
element = driver.find_element(By.XPATH, xpath_search)
element.click()
element_2 = driver.find_element(
    By.XPATH,
    r'//*[@id="__next"]/div/div[2]/div/div/div[2]/div[3]/div[2]/div[3]/div/ul/li[4]',
)
element_2.click()
time.sleep(10)

# The dropdown selection changes the URL; re-fetch that URL with requests.
new_url = driver.current_url
print(new_url)

# The original User-Agent value was split across physical source lines, which
# is a SyntaxError; it must be a single logical string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/56.0.2924.87 Safari/537.36'
    )
}
url = new_url
response = get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')

# Each car card lives in a div with class 'car-grid-container'.
tag = 'div'
attributes = {'class': 'car-grid-container'}
content_list = soup.find_all(tag, attributes)

# Link elements (car title / detail-page URL) per container.
basic_info = []
for item in content_list:
    basic_info.append(item.find_all('a', {'class': 'hover:tw-shadow-md'}))

# Secondary details (mileage, location, ...) per container.
text_info = []
for item in content_list:
    text_info.append(
        item.find_all(
            'div',
            {'class': 'other-details tw-flex tw-flex-row tw-justify-between tw-items-stretch'},
        )
    )
您必须向下滚动到页面底部,才能让全部(约 100 个)容器加载出来。可以用下面的函数反复向下滚动,每次等新内容加载完成后再继续,直到页面高度不再增长:
def scroll_down(driver, pause=4):
    """Scroll an infinite-scroll page to the bottom until its height stops growing.

    Parameters
    ----------
    driver :
        A Selenium WebDriver (or any object exposing ``execute_script``).
    pause : float, optional
        Seconds to wait after each scroll so newly loaded content can render
        (default 4, matching the original hard-coded delay).
    """
    get_scroll_height_command = ("return (document.documentElement || document.body).scrollHeight;")
    scroll_to_command = "scrollTo(0, {});"
    # Set y origin and grab the initial scroll height.
    y_position = 0
    scroll_height = driver.execute_script(get_scroll_height_command)
    while y_position != scroll_height:
        y_position = scroll_height
        driver.execute_script(scroll_to_command.format(scroll_height))
        # Page needs to load yet again, otherwise the scroll height matches
        # the y position and the loop breaks out too early.
        time.sleep(pause)
        scroll_height = driver.execute_script(get_scroll_height_command)
用法示例:
# Let the page settle, then scroll to the bottom so every listing container
# is loaded before the page source is parsed.
time.sleep(10)
scroll_down(driver)
# Capture the (possibly updated) URL after scrolling.
new_url = driver.current_url
这里不需要Selenium。直接从数据源获取数据:
import requests
import pandas as pd

# Skip HTML scraping entirely and call the site's Next.js data endpoint.
# NOTE(review): the build hash embedded in this URL changes with every site
# deploy — confirm it is current before running.
url = 'https://autochek.africa/_next/data/78f0e3518ea1a34c27af75e4181bc80d2c112ebf/ng/cars-for-sale.json'

# Sentinel page count; replaced by the real value from the first response.
pageSize = 9999
rows = []
for page in range(1, 100):
    # Stop once we have walked past the last page the API reports.
    if page > pageSize:
        break
    print(page)
    payload = {
        'country': 'ng',
        'page_number': str(page),
    }
    jsonData = requests.get(url, params=payload).json()
    pageSize = jsonData['pageProps']['pagination']['pageSize']
    rows += jsonData['pageProps']['cars']

df = pd.DataFrame(rows)
输出:
# Show the assembled DataFrame (576 rows x 24 columns in the sample run below).
print(df)
id title ... hasCleanTitle soldDate
0 -e4h58sQe Toyota Camry ... True NaN
1 SWbyrtv5P Toyota Venza ... False NaN
2 R9_sIKnJy Toyota Highlander ... NaN NaN
3 4DAiUe1oC Toyota Camry ... True NaN
4 ECIkVk1aJ Toyota Camry ... True NaN
.. ... ... ... ... ...
571 l7fFI5Q3w Toyota Camry ... True NaN
572 GquAWYVWu Toyota Camry ... True NaN
573 SVkuXCLiV Toyota Matrix ... NaN NaN
574 2O0OKpL_d Toyota Corolla ... True NaN
575 lp2PtU1oO Mercedes-Benz ML 350 ... NaN NaN
[576 rows x 24 columns]