Python函数在尝试显示数据帧时返回None



我正试图从bbc食谱中获取一个url,然后获取信息,然后将其放入数据帧中。当我试图运行我所做的功能时,我得到的结果是"无",我不确定为什么,因为在我试图将它们组织成一个功能之前,它就起了作用

columns_name=['title', 'total_time', 'image', 'ingredients', 'rating_val', 
'rating_count',
'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url']
url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'
def print_dataframe(df):
return df
def insert_df(name,totalTime,image,rating_count,rating_value,Category,Ingredients,diet,vegan,vegetarian,url,df):
new_row = {'name':name,'totalTime':totalTime,'image':image,'rating_count':rating_count,'rating_value':rating_value,'Category':Category,'Ingredients':Ingredients,'diet':diet,'vegan':vegan,'vegetarian':vegetarian,'url':url}
df = df.append(new_row, ignore_index=True)
def collect_page_data(url,columns_name):
df = pd.DataFrame(columns = columns_name)
page = requests.get(url)
page_soup = BeautifulSoup(page.text,'html.parser')

res = page_soup.find("script", {"type":
"application/ld+json"})
data = json.loads(res.text)
name = data['author']['name']
image= data['image']
rating_count = data['aggregateRating']['ratingCount'] 
rating_value = data['aggregateRating']['ratingValue']
Category = data['recipeCategory']
Ingredients = data['recipeIngredient']
diet = data['suitableForDiet'][1]
vegan = data['suitableForDiet'][2]
vegetarian = data['suitableForDiet'][3]
prepTime = data['prepTime']
cookTime = data['cookTime']
l = ['P','T','M']
for i in l:
prepTime = prepTime.replace(i,"")
cookTime = cookTime.replace(i,"")
totalTime = int(prepTime) + int(cookTime)
insert_df(name,totalTime,image,rating_count,rating_value,Category,Ingredients,diet,vegan,vegetarian,url,df)

print_dataframe(df)
print(collect_page_data(url,columns_name))

您有两个return的问题。

第一个:

insert_df()中使用

df = df.append(...)

insert_df()内部创建本地df-它不会更改外部df

您应该使用return并使用

return df.append(...)

并执行功能

df = insert_df()

秒((:

collect_page_data()结束时,运行

print_dataframe(df)

其得到CCD_ 8并且仅将其返回。无法从`collect_page_data((取回

collect_page_data()结束时,您应该运行

return df

这是完整的代码:

在我的BeautifulSoup版本中,我不得不使用res.string而不是res.text来获取文本。

import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
# --- functions ---
def insert_df(name, totalTime, image, rating_count, rating_value, Category, Ingredients, diet, vegan, vegetarian, url, df):
new_row = {'name':name,'totalTime':totalTime,'image':image,'rating_count':rating_count,'rating_value':rating_value,'Category':Category,'Ingredients':Ingredients,'diet':diet,'vegan':vegan,'vegetarian':vegetarian,'url':url}
return df.append(new_row, ignore_index=True)
def collect_page_data(url, columns_name):
df = pd.DataFrame(columns=columns_name)
page = requests.get(url)
page_soup = BeautifulSoup(page.text, 'html.parser')

res = page_soup.find("script", {"type": "application/ld+json"})

#data = json.loads(res.text)
data = json.loads(res.string)

name = data['author']['name']

image= data['image']
rating_count = data['aggregateRating']['ratingCount'] 
rating_value = data['aggregateRating']['ratingValue']
Category = data['recipeCategory']
Ingredients = data['recipeIngredient']
diet = data['suitableForDiet'][1]
vegan = data['suitableForDiet'][2]
vegetarian = data['suitableForDiet'][3]
prepTime = data['prepTime']
cookTime = data['cookTime']

l = ['P','T','M']
for i in l:
prepTime = prepTime.replace(i, "")
cookTime = cookTime.replace(i, "")

totalTime = int(prepTime) + int(cookTime)

df = insert_df(name, totalTime, image, rating_count, rating_value, Category, Ingredients, diet, vegan, vegetarian, url, df)

return df
# --- main ---
columns_name = [
'title', 'total_time', 'image', 'ingredients', 'rating_val', 
'rating_count', 'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url'
]
url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'
df = collect_page_data(url, columns_name)
print(df.iloc[0])

结果:

(我只得到第一行,所以我有Series,它将数据显示为列(

title                                                         NaN
total_time                                                    NaN
image           [https://food-images.files.bbci.co.uk/food/rec...
ingredients                                                   NaN
rating_val                                                    NaN
rating_count                                                   22
category                                                      NaN
cuisine                                                       NaN
diet                             http://schema.org/LowCalorieDiet
vegan                                 http://schema.org/VeganDiet
vegetarian                       http://schema.org/VegetarianDiet
url             https://www.bbc.co.uk/food/recipes/avocado_pas...
Category                                              Main course
Ingredients     [375g/13oz pasta, such as penne or fusilli, 1 ...
name                                               Nadiya Hussain
rating_value                                             4.363636
totalTime                                                    40.0
Name: 0, dtype: object

编辑:

对我来说,insert_df()是完全无用的,你可以直接在collect_page_data()中运行它的代码。它可以创建可读性更强的代码。

import pandas as pd
import requests
from bs4 import BeautifulSoup
import json
# --- functions ---
def collect_page_data(url, columns_name):
# --- scraping ---

page = requests.get(url)
page_soup = BeautifulSoup(page.text, 'html.parser')

res = page_soup.find("script", {"type": "application/ld+json"})
#data = json.loads(res.text)
data = json.loads(res.string)  # 

prep_time = data['prepTime']
cook_time = data['cookTime']

for char in ['P', 'T', 'M']:
prep_time = prep_time.replace(char, "")
cook_time = cook_time.replace(char, "")

total_time = int(prep_time) + int(cook_time)
# --- dataframe ---

df = pd.DataFrame(columns=columns_name)

df = df.append({
'name': data['author']['name'],
'total_time': total_time,
'image': data['image'],
'rating_count': data['aggregateRating']['ratingCount'],
'rating_value': data['aggregateRating']['ratingValue'],
'category': data['recipeCategory'],
'ingredients': data['recipeIngredient'],
'diet': data['suitableForDiet'][1],
'vegan': data['suitableForDiet'][2],
'vegetarian': data['suitableForDiet'][3],
'url': url
}, ignore_index=True)

return df
# --- main ---
columns_name = [
'title', 'name', 'total_time', 'image',
'ingredients', 'rating_value', 'rating_count',
'category', 'cuisine', 'diet', 'vegan', 'vegetarian', 'url'
]
url = 'https://www.bbc.co.uk/food/recipes/avocado_pasta_with_peas_31700'
df = collect_page_data(url, columns_name)
print(df.iloc[0])

最新更新