我是python和编程的新手,但我的项目有点问题。我正试图在一个网站上抓取数据,并将其保存在csv中。我工作,但当我写";lst";列表到";图像URL";以及";"图像特色";括号";["one_answers"]";和"也被写入csv文件。有办法去掉这个吗?我知道这是因为;lst";list用url共同包含其他列表。
import csv
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import date
today = date.today()
source = requests.get('https://www.meklarin.fo/').text
soup = BeautifulSoup(source, 'lxml')
df = pd.read_csv(r'C:UsersusernameDesktopKassin.fokassinblogmanagementcommandstest.csv')
print(df.to_string())
original_house_title_list = []
original_house_link_list = []
house_titles_list = []
house_asking_price_list = []
house_current_bid_price_list = []
house_link_list = []
product = 'product'
current_date = today.strftime("%m.%d.%y")
house_image_list = []
house_location_list = []
lst = []
lst1 = []
house_info_list = []
house_final_info = []
list_convert = []
for house_link in soup.find_all('a', class_='house-air-content'):
house_link = house_link.get('href')
house_link_list.append(house_link.strip())
print(house_link.strip())
for house_link in house_link_list:
if house_link in original_house_link_list:
continue
else:
source = requests.get(house_link).text
soup = BeautifulSoup(source, 'lxml')
for house_titles in soup.find_all('div', class_='ogn-base-info'):
house_title = house_titles.h1.text
house_titles_list.append(house_title)
#print(house_title)
for house__asking_price in soup.find_all('div', class_='col-xs-12 col-sm-12 col-md-6 house-ask-price house-price-column'):
house_asking_price = house__asking_price.text
house_asking_price = str(house_asking_price)
house_asking_price = house_asking_price.removeprefix('Prísuppskotkr.')
house_asking_price = house_asking_price.replace('.','')
house_asking_price_list.append(house_asking_price.strip())
#print(house_asking_price.strip())
for house__current_bid_price in soup.find_all('div', class_='col-xs-12 col-sm-12 col-md-6 house-bid-price house-price-column'):
house_current_bid_price = house__current_bid_price.h3.text
house_current_bid_price = str(house_current_bid_price)
house_current_bid_price = house_current_bid_price.replace('.','')
house_current_bid_price = house_current_bid_price.replace('kr','')
house_current_bid_price_list.append(house_current_bid_price.strip())
print(house_current_bid_price.strip())
for house_all_images in soup.find_all('a'):
if 'https://www.meklarin.fo/wp-content/uploads' in str(house_all_images):
house_all_images = house_all_images.get('href')
house_image_list.append(house_all_images)
#print(house_all_images)
else:
continue
lst.append(house_image_list)
lst1.append(lst)
house_image_list=[]
for house_build_year in soup.find_all('div', class_='house-info-box-value'):
if 'Trýst her' in str(house_build_year):
continue
else:
print(house_build_year.text)
for house_info in soup.find_all('div', class_='house-desc-comp'):
house_info = house_info.text
house_info = str(house_info)
house_info = house_info.replace('Upplýsingar um bústaðin','')
house_info_list.append(house_info)
#print(house_info)
house_final_info.append(house_info)
house_info_list = []
dict = {'Title': house_titles_list, 'Content': house_final_info, 'Date':current_date, 'Post Type': product, 'Price': house_asking_price_list, 'Regular Price': house_asking_price_list, 'Sale Price':house_asking_price_list, 'Stock Status': 'instock', 'Image URL': lst, 'Image Title': house_titles_list, 'Image Featured': lst}
df = pd.DataFrame(dict)
df.to_csv('test.csv')
print(len(house_titles_list))
print(len(house_asking_price_list))
print(len(lst))
print(len(house_final_info))
要删除(例如(图像URL单元格中的列表,在写入文件之前,请尝试:
df['Image URL'] = [','.join(map(str, i)) for i in df['Image URL']]
可以复制上面的行,并且可以将上面的Image URL
更改为Image Featured
以清理另一列中的列表。