我正试图从bbc食谱中获取一个url,然后获取信息,然后将其放入数据帧中。当我试图运行我所做的功能时,我得到的结果是"无",我不确定为什么,因为在我试图将它们组织成一个功能之前,它就起了作用
columns_name=['title', 'total_time', 'image', 'ingredients', 'rating_val',
'rating_count',
'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url']
url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'
def print_dataframe(df):
return df
def insert_df(name,totalTime,image,rating_count,rating_value,Category,Ingredients,diet,vegan,vegetarian,url,df):
new_row = {'name':name,'totalTime':totalTime,'image':image,'rating_count':rating_count,'rating_value':rating_value,'Category':Category,'Ingredients':Ingredients,'diet':diet,'vegan':vegan,'vegetarian':vegetarian,'url':url}
df = df.append(new_row, ignore_index=True)
def collect_page_data(url,columns_name):
df = pd.DataFrame(columns = columns_name)
page = requests.get(url)
page_soup = BeautifulSoup(page.text,'html.parser')
res = page_soup.find("script", {"type":
"application/ld+json"})
data = json.loads(res.text)
name = data['author']['name']
image= data['image']
rating_count = data['aggregateRating']['ratingCount']
rating_value = data['aggregateRating']['ratingValue']
Category = data['recipeCategory']
Ingredients = data['recipeIngredient']
diet = data['suitableForDiet'][1]
vegan = data['suitableForDiet'][2]
vegetarian = data['suitableForDiet'][3]
prepTime = data['prepTime']
cookTime = data['cookTime']
l = ['P','T','M']
for i in l:
prepTime = prepTime.replace(i,"")
cookTime = cookTime.replace(i,"")
totalTime = int(prepTime) + int(cookTime)
insert_df(name,totalTime,image,rating_count,rating_value,Category,Ingredients,diet,vegan,vegetarian,url,df)
print_dataframe(df)
print(collect_page_data(url,columns_name))
您有两个return
的问题。
第一个:
在insert_df()
中使用
df = df.append(...)
在insert_df()
内部创建本地df
-它不会更改外部df
您应该使用return
并使用
return df.append(...)
并执行功能
df = insert_df()
秒((:
在collect_page_data()
结束时,运行
print_dataframe(df)
其得到CCD_ 8并且仅将其返回。无法从`collect_page_data((取回
在collect_page_data()
结束时,您应该运行
return df
这是完整的代码:
在我的BeautifulSoup
版本中,我不得不使用res.string
而不是res.text
来获取文本。
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
# --- functions ---
def insert_df(name, totalTime, image, rating_count, rating_value, Category, Ingredients, diet, vegan, vegetarian, url, df):
new_row = {'name':name,'totalTime':totalTime,'image':image,'rating_count':rating_count,'rating_value':rating_value,'Category':Category,'Ingredients':Ingredients,'diet':diet,'vegan':vegan,'vegetarian':vegetarian,'url':url}
return df.append(new_row, ignore_index=True)
def collect_page_data(url, columns_name):
df = pd.DataFrame(columns=columns_name)
page = requests.get(url)
page_soup = BeautifulSoup(page.text, 'html.parser')
res = page_soup.find("script", {"type": "application/ld+json"})
#data = json.loads(res.text)
data = json.loads(res.string)
name = data['author']['name']
image= data['image']
rating_count = data['aggregateRating']['ratingCount']
rating_value = data['aggregateRating']['ratingValue']
Category = data['recipeCategory']
Ingredients = data['recipeIngredient']
diet = data['suitableForDiet'][1]
vegan = data['suitableForDiet'][2]
vegetarian = data['suitableForDiet'][3]
prepTime = data['prepTime']
cookTime = data['cookTime']
l = ['P','T','M']
for i in l:
prepTime = prepTime.replace(i, "")
cookTime = cookTime.replace(i, "")
totalTime = int(prepTime) + int(cookTime)
df = insert_df(name, totalTime, image, rating_count, rating_value, Category, Ingredients, diet, vegan, vegetarian, url, df)
return df
# --- main ---
columns_name = [
'title', 'total_time', 'image', 'ingredients', 'rating_val',
'rating_count', 'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url'
]
url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'
df = collect_page_data(url, columns_name)
print(df.iloc[0])
结果:
(我只得到第一行,所以我有Series
,它将数据显示为列(
title NaN
total_time NaN
image [https://food-images.files.bbci.co.uk/food/rec...
ingredients NaN
rating_val NaN
rating_count 22
category NaN
cuisine NaN
diet http://schema.org/LowCalorieDiet
vegan http://schema.org/VeganDiet
vegetarian http://schema.org/VegetarianDiet
url https://www.bbc.co.uk/food/recipes/avocado_pas...
Category Main course
Ingredients [375g/13oz pasta, such as penne or fusilli, 1 ...
name Nadiya Hussain
rating_value 4.363636
totalTime 40.0
Name: 0, dtype: object
编辑:
对我来说,insert_df()
是完全无用的,你可以直接在collect_page_data()
中运行它的代码。它可以创建可读性更强的代码。
import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
# --- functions ---
def collect_page_data(url, columns_name):
# --- scraping ---
page = requests.get(url)
page_soup = BeautifulSoup(page.text, 'html.parser')
res = page_soup.find("script", {"type": "application/ld+json"})
#data = json.loads(res.text)
data = json.loads(res.string) #
prep_time = data['prepTime']
cook_time = data['cookTime']
for char in ['P', 'T', 'M']:
prep_time = prep_time.replace(char, "")
cook_time = cook_time.replace(char, "")
total_time = int(prep_time) + int(cook_time)
# --- dataframe ---
df = pd.DataFrame(columns=columns_name)
df = df.append({
'name': data['author']['name'],
'total_time': total_time,
'image': data['image'],
'rating_count': data['aggregateRating']['ratingCount'],
'rating_value': data['aggregateRating']['ratingValue'],
'category': data['recipeCategory'],
'ingredients': data['recipeIngredient'],
'diet': data['suitableForDiet'][1],
'vegan': data['suitableForDiet'][2],
'vegetarian': data['suitableForDiet'][3],
'url': url
}, ignore_index=True)
return df
# --- main ---
columns_name = [
'title', 'name', 'total_time', 'image',
'ingredients', 'rating_value', 'rating_count',
'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url'
]
url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'
df = collect_page_data(url, columns_name)
print(df.iloc[0])