如何重写部分代码(spare_parts)来抓取所有数据



谁能帮我的部分代码- spare_parts?当我创建字典并导出到excel时,我没有得到所有的数据。但是当我对Spare_part_number、Spare_part_name和Price使用print(注释)时,终端会把它们全部写出来。问题在哪里?.................................

import requests
from bs4 import BeautifulSoup
import pandas as pd
import xlsxwriter
baseurl = 'https://www.roco.cc/'
headers = {
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'en-US,en;q=0.8',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
}
productlinks = []
for x in range(1,2):
r = requests.get(
f'https://www.roco.cc/ren/products/locomotives/steam-locomotives.html?p={x}&verfuegbarkeit_status=41%2C42%2C43%2C45%2C44')
soup = BeautifulSoup(r.content, 'lxml')
productlist = soup.find_all('li', class_='item product product-item')

for item in productlist:
for link in item.find_all('a', class_='product-item-link', href=True):
productlinks.append(link['href'])


Loco_list = []
Spare_parts_list = []
for link in productlinks:
r = requests.get(link, allow_redirects=False)
soup = BeautifulSoup(r.content, 'lxml')
Manufacturer_name = 'Roco'

try:
Reference = soup.find('span', class_='product-head-artNr').text.strip()
except:
Reference = ''

try:  
Price = soup.find('div', class_='product-head-price').text.strip()
except:
Price = ''

try:    
Type = soup.find(
'div', class_='product-head-name').h1.text.strip()
except:
Type = ''
try:
Scale = soup.find('td', {'data-th': 'Scale'}).text.strip()
except:
Scale = ''
try:  
Current = soup.find('td', {'data-th': 'Control'}).text.split(' ')[0]
except:
Current = ''

try:  
Control = soup.find('td', {'data-th': 'Control'}).text.strip()
except:
Control = ''
try:
Interface = soup.find('td', {'data-th': 'Interface'}).text.strip()
except:
Interface = ''
try:
Digital_decoder = soup.find(
'td', {'data-th': 'Digital decoder'}).text.strip()
except:
Digital_decoder = ''
try:
Decoder_Type = soup.find(
'td', {'data-th': 'Decoder-Type'}).text.strip()
except:
Decoder_Type = ''
try:
Motor = soup.find('td', {'data-th': 'Motor'}).text.strip()
except:
Motor = ''
try:
Flywheel = soup.find('td', {'data-th': 'Flywheel'}).text.strip()
except:
Flywheel = ''
try:
Minimum_radius = soup.find(
'td', {'data-th': 'Minimum radius'}).text.strip()
except:
Minimum_radius = ''
try:
Length_over_buffer = soup.find(
'td', {'data-th': 'Length over buffer'}).text.strip()
except:
Length_over_buffer = ''
try:
Number_of_driven_axles = soup.find(
'td', {'data-th': 'Number of  driven axles'}).text.strip()
except:
Number_of_driven_axles = ''
try:
Number_of_axles_with_traction_tyres = soup.find(
'td', {'data-th': 'Number of  axles with traction tyres'}).text.strip()
except:
Number_of_axles_with_traction_tyres = ''
try:
Coupling = soup.find('td', {'data-th': 'Coupling'}).text.strip()
except:
Coupling = ''
try:
LED_lighting = soup.find(
'td', {'data-th': 'LED lighting'}).text.strip()
except:
LED_lighting = ''
try:
Head_light = soup.find('td', {'data-th': 'Head light'}).text.strip()
except:
Head_light = ''
try:
LED_head_light = soup.find(
'td', {'data-th': 'LED head light'}).text.strip()
except:
LED_head_light = ''
try:
Country = soup.find(
'td', {'data-th': 'Original (country)'}).text.strip()
except:
Country = ''
try:
Railway_company = soup.find(
'td', {'data-th': 'Railway Company'}).text.strip()
except:
Railway_company = ''
try:
Epoch = soup.find('td', {'data-th': 'Epoch'}).text.strip()
except:
Epoch = ''
try:
Description = soup.find(
'div', class_='product-add-form-text').text.strip()
except:
Description = ''

Locomotives = {
'Manufacturer_name': Manufacturer_name,
'Reference': Reference,
'Price': Price,
'Type': Type,
'Scale': Scale,
'Current': Current,
'Control': Control,
'Interface': Interface,
'Digital_decoder': Digital_decoder,
'Decoder_Type': Decoder_Type,
'Motor': Motor,
'Flywheel': Flywheel,
'Minimum_radius': Minimum_radius,
'Length_over_buffer': Length_over_buffer,
'Number_of_driven_axles': Number_of_driven_axles,
'Number_of_axles_with_traction_tyres': Number_of_axles_with_traction_tyres,
'Coupling': Coupling,
'LED_lighting': LED_lighting,
'Head_light': Head_light,
'LED_head_light': LED_head_light,
'Country': Country,
'Railway_company': Railway_company,
'Epoch': Epoch,
'Description': Description,
} 
Loco_list.append(Locomotives)
# print(Locomotives)

Spare_part_list = soup.find_all(
'table', class_='data table additional-attributes')
# print(Spare_part_number_list)
# Spare_part_number_links = []
for item in Spare_part_list:
for Spare_part_number in item.find_all('td', {'data-th': 'Art. No.:'}):
Spare_part_number = Spare_part_number.text.strip()
# print(Spare_part_number)

for Spare_part_name in item.find_all('td', {'data-th': 'Description'}):
Spare_part_name = Spare_part_name.text.strip()
# print(Spare_part_name)

for Spare_part_price in item.find_all('td', {'data-th': 'Price:'}):
Spare_part_price = Spare_part_price.text.strip()
# print(Spare_part_price)

Spare_parts = {
# 'Manufacturer_name': Manufacturer_name,
# 'Reference': Reference,
'Spare part number': Spare_part_number,
'Spare part name': Spare_part_name,
'Spare part price': Spare_part_price
}

Spare_parts_list.append(Spare_parts)
# print(Spare_parts)
#             Spare_parts.append(Spare_part_number_links)


df1 = pd.DataFrame(Loco_list)
df2 = pd.DataFrame(Spare_parts_list)
# # # df3 = pd.DataFrame()
# # # df4 = pd.DataFrame()
writer = pd.ExcelWriter('Roco - locomotives.xlsx', engine='xlsxwriter')
df1.to_excel(writer, sheet_name='Model')
df2.to_excel(writer, sheet_name='Spare parts')
# # # df3.to_excel(writer, sheet_name='Documents')
# # # df4.to_excel(writer, sheet_name='Photos')
writer.save()
print('Saved to file')


尽量简化和集中您的示例-您的备用部件的问题是,您只是打印了很多,但覆盖了。当使用pandas时,使用它的pd.read_html来实现您的目标:

#read et table, stripping not needed columns
spare_parts = pd.read_html(str(soup.select('#product-attribute-et-table')))[0].iloc[:,:3]
#add column with reference to product
spare_parts['Reference'] = soup.select_one('.product-head-artNr').text.strip()
#append dataframe to a list, that later could be concatinated
spare_part_list.append(spare_parts)

或在for-loop中迭代行而不是列。

import requests
import pandas as pd
from bs4 import BeautifulSoup
loco_list = []
spare_part_list = []
url = 'https://www.roco.cc/ren/products/locomotives/steam-locomotives.html?verfuegbarkeit_status=42%2C45%2C41%2C43%2C44'
for link in BeautifulSoup(requests.get(url).text).select('[id^="product-item-info_"]>a')[:3]:
soup = BeautifulSoup(requests.get(link.get('href')).text)
d = dict(e.stripped_strings for e in soup.select('#product-attribute-specs-table tr'))
d.update({'Reference': soup.select_one('.product-head-artNr').text.strip()})
loco_list.append(d)

spare_parts = pd.read_html(str(soup.select('#product-attribute-et-table')))[0].iloc[:,:3]
spare_parts['Reference'] = soup.select_one('.product-head-artNr').text.strip()
spare_part_list.append(spare_parts)
df1 = pd.DataFrame(loco_list)
df2 = pd.concat(spare_part_list, ignore_index=True)

最新更新