我正在尝试使用 Python 抓取一个网页,下面是我的代码:
import selenium
from selenium import webdriver
import time
from bs4 import BeautifulSoup
import sys # using for command Line argument
from selenium.webdriver.chrome.options import Options
# generating url for product
def get_url(product):
    """Build the Carousell search URL for *product*.

    Spaces in the product name are percent-encoded (as ``%20``) so the
    result is a valid URL path segment.

    Parameters
    ----------
    product : str
        Free-text product name, e.g. ``"waist bag"``.

    Returns
    -------
    str
        Full search URL on carousell.com.my.
    """
    product = product.replace(' ', '%20')
    template = 'https://www.carousell.com.my/search/{}'
    # Bug fix: the original formatted with the undefined name
    # `product_name`, which raises NameError; use `product` instead.
    return template.format(product)
def get_all_products(card):
    """Extract the details of a single product from a Carousell result card.

    Parameters
    ----------
    card : bs4.element.Tag
        One search-result ``<div>`` as returned by ``soup.find_all``.

    Returns
    -------
    tuple
        ``(image_url, name, price, summary, link)``.

    NOTE(review): the class strings below are minified/obfuscated CSS
    classes generated by Carousell's front-end build. They change between
    site deployments, so any of these ``find`` calls may return ``None``
    (and then raise) after a redeploy — confirm against the live page.
    """
    product_image = card.find('img', 'D_iN D_iK D_uf')['src']
    product_name = card.find(
        'p',
        'D_bU M_ch D_aQ M_aL D_bV M_ci D_bY M_cl D_ca M_cn D_ce M_cq D_ch M_ct D_bR',
    ).text.strip()
    product_price = card.find(
        'p',
        'D_bU M_ch D_aQ M_aL D_bV M_ci D_bY M_cl D_ca M_cn D_ce M_cq D_cg M_cs D_bQ',
    ).text.strip()
    product_summary = card.find(
        'p',
        'D_bU M_ch D_aQ M_aL D_bV M_ci D_bY M_cl D_ca M_cn D_ce M_cq D_cg M_cs D_bR',
    ).text.strip()
    anchor = card.find('a', 'D_gk M_bj')
    product_link = 'https://www.carousell.com.my' + anchor.get('href')
    return (product_image, product_name, product_price,
            product_summary, product_link)
def main(product):
    """Scrape the first product returned by a Carousell search.

    Parameters
    ----------
    product : str
        Product name to search for.

    Returns
    -------
    tuple
        ``(image_url, name, price, summary, link)`` of the first result.

    Raises
    ------
    IndexError
        If no product cards are found (typically because the minified
        CSS-class locator has changed on the site).
    """
    url = get_url(product)
    # Original assigned Options() and immediately overwrote it with
    # ChromeOptions(); keep only the one that is used.
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--log-level=3')
    # Raw string for the Windows path so backslashes are taken literally.
    driver = webdriver.Chrome(
        executable_path=r'C:\webDrivers\chromedriver.exe',
        options=options,
    )
    try:
        driver.get(url)
        driver.maximize_window()
        # Crude wait for the JS-rendered results; WebDriverWait with an
        # expected condition would be more robust.
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        product_card = soup.find_all('div', 'D_jb D_ph D_pm M_np')
        if not product_card:
            # Same exception type as the bare product_card[0] would raise,
            # but with an actionable message.
            raise IndexError(
                "no product cards found — the CSS-class locator "
                "'D_jb D_ph D_pm M_np' is probably stale"
            )
        # fetching single product from Carousell
        return get_all_products(product_card[0])
    finally:
        # Always release the browser process (original leaked it).
        driver.quit()
if __name__ == '__main__':
    # Guarded entry point: fail with a usage message instead of an
    # IndexError when the product-name argument is missing.
    if len(sys.argv) < 2:
        sys.exit('Usage: python Carousell_Scrap.py "<product name>"')
    pname = sys.argv[1]
    scrape_data = main(pname)
    print(scrape_data)
当我尝试在 cmd 中运行时,得到了如下错误:
File "C:\wamp\www\project\Carousell_Scrap_PHP.py", line 63, in main
singleCard = product_card[0]
IndexError: list index out of range
我用同样的代码抓取 Shopee 时运行良好,但换成 Carousell 就出现这个错误。我也搜索过这个错误的解决办法,但没有找到。如果有人能帮我,我会非常感激。我知道索引是从 0 开始的,而我在代码中写的正是 0,却仍然报 IndexError。
你得到这个错误是因为 product_card 是空的。这显然是因为你用来查找这些元素的定位器 ('div', 'D_jb D_ph D_pm M_np') 不正确。
我找不到该元素的稳定定位器,但下面这行
product_card = soup.find_all('div','.D_ow.D_qW.D_rb')
对我有效。
可能是因为类名 D_jb D_ph D_pm M_np 中带有空格。可以只使用类名的第一部分:结果卡片是 class 为 D_ow D_qW D_rb 的 div 标签,我只用 D_ow 试了一下,就返回了那 20 个商品。
# Answerer's demonstration: load a concrete search-results URL, parse the
# rendered page, and count the result cards found with the single class
# 'D_ow' (instead of the multi-class string used in the question).
# NOTE(review): assumes `driver` was already created elsewhere — this
# snippet is not self-contained.
driver.get("https://www.carousell.com.my/search/waist%20bag?addRecent=true&canChangeKeyword=true&includeSuggestions=true&searchId=F7LtXr")
soup = BeautifulSoup(driver.page_source,'html.parser')
product_card = soup.find_all('div','D_ow')
# Prints how many result cards the locator matched (answerer reports 20).
print(len(product_card))