我只想从 JSON 文件中提取文本内容。我尝试实现这个目标,但做到这一步就卡住了:我不知道如何从 "text" 键中提取内容,并把提取出的数据(text: content)保存到 CSV 文件中。
import pandas as pd
import json

# Load the scraped Instagram export and flatten the 'GraphImages' records
# into one DataFrame row per post.
with open('shahd_yahia33.json', 'r', encoding='utf-8') as f:
    data = json.load(f)  # json.load reads the file object directly

df_nested_list = pd.json_normalize(data, record_path=['GraphImages'])

# Select the caption column by name instead of the brittle positional
# iloc[:, 16] — the column order depends on which keys each file contains.
df = df_nested_list['edge_media_to_caption.edges']
print(df)
的结果将是这样的:
78 [{'node': {'text': '🍑🍑'}}]
79 [{'node': {'text': '🤍🤍🤍'}}]
80 [{'node': {'text': '🍇🍑🍉'}}]
81 [{'node': {'text': '🤍🤍'}}]
82 [{'node': {'text': '🤍🤍'}}]
这是我使用的数据的一个示例部分:
{
"GraphImages": [
{
"__typename": "GraphImage",
"comments_disabled": false,
"dimensions": {
"height": 800,
"width": 640
},
"display_url": "https://instagram.fnjf19-1.fna.fbcdn.net/v/t51.2885-15/289451079_3011485445757272_2206549761362656362_n.webp?stp=dst-jpg_e35&_nc_ht=instagram.fnjf19-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=TjN-cA8G8NcAX9qBH1b&edm=APU89FABAAAA&ccb=7-5&ig_cache_key=Mjg2Njg4OTQ2NDQzMDkxNjI4NQ%3D%3D.2-ccb7-5&oh=00_AT_6mkMBSUhOLrS4KFzFa3J1UyTyr7HL6DiWVzcJ2roaXg&oe=62BBE498&_nc_sid=86f79a",
"edge_media_preview_like": {
"count": 1
},
"edge_media_to_caption": {
"edges": [
{
"node": {
"text": "Good morning"
}
}
]
},
"edge_media_to_comment": {
"count": 0
},
"gating_info": null,
"id": "2866889464430916285",
"is_video": false,
"media_preview": "ACIqxt359e/X8ak2557VBEN/J6L+pPT/AOvU5NSMNopcCkXGRu+7nmrhQZyq5B7Dp+dAFOirxtl9cUUAZ0Qwn4/04qcBVDA/e798eg/qaZbON20DjqM9c/y9almGwMw6nFAxYF3/ACn5gaqhdpIBIGexNOEzkY4APXHH+c+1JxTJHbR6UUu8UUWGR2qkv7CpZyCdmcd8np9D/niordiFbBIqtISTzRYLlnaV4YY/z+tPGMcVBEx2kZOBRuPqaAJciiodx9TRTEf/2Q==",
"owner": {
"id": "27600881499"
},
"shortcode": "CfJPMdJtvK9",
"taken_at_timestamp": 1655979877,
"thumbnail_resources": [
{
"config_height": 150,
"config_width": 150,
"src": "https://instagram.fnjf19-1.fna.fbcdn.net/v/t51.2885-15/289451079_3011485445757272_2206549761362656362_n.webp?stp=c0.67.540.540a_dst-jpg_e35_s150x150&_nc_ht=instagram.fnjf19-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=TjN-cA8G8NcAX9qBH1b&edm=APU89FABAAAA&ccb=7-5&oh=00_AT-2oabihtpiNO5OZJHvUQnmGEiGquiytXOu4uouKZNBew&oe=62BBE498&_nc_sid=86f79a"
},
{
"config_height": 240,
"config_width": 240,
"src": "https://instagram.fnjf19-1.fna.fbcdn.net/v/t51.2885-15/289451079_3011485445757272_2206549761362656362_n.webp?stp=c0.67.540.540a_dst-jpg_e35_s240x240&_nc_ht=instagram.fnjf19-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=TjN-cA8G8NcAX9qBH1b&edm=APU89FABAAAA&ccb=7-5&oh=00_AT8VQCOOH0lEIpPQrdd8LwfVzXN09XgNpSv9AG7Ko-91_g&oe=62BBE498&_nc_sid=86f79a"
},
{
"config_height": 320,
"config_width": 320,
"src": "https://instagram.fnjf19-1.fna.fbcdn.net/v/t51.2885-15/289451079_3011485445757272_2206549761362656362_n.webp?stp=c0.67.540.540a_dst-jpg_e35_s320x320&_nc_ht=instagram.fnjf19-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=TjN-cA8G8NcAX9qBH1b&edm=APU89FABAAAA&ccb=7-5&oh=00_AT8eoYkONHDiSOiGOpi6TxMDW17pxuUWok3GPOHp81UKGA&oe=62BBE498&_nc_sid=86f79a"
},
{
"config_height": 480,
"config_width": 480,
"src": "https://instagram.fnjf19-1.fna.fbcdn.net/v/t51.2885-15/289451079_3011485445757272_2206549761362656362_n.webp?stp=c0.67.540.540a_dst-jpg_e35_s480x480&_nc_ht=instagram.fnjf19-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=TjN-cA8G8NcAX9qBH1b&edm=APU89FABAAAA&ccb=7-5&oh=00_AT_fSMkXGf02tc7PyR4WytizjITO2n-qOoy_0ysk_KrppA&oe=62BBE498&_nc_sid=86f79a"
},
{
"config_height": 640,
"config_width": 640,
"src": "https://instagram.fnjf19-1.fna.fbcdn.net/v/t51.2885-15/289451079_3011485445757272_2206549761362656362_n.webp?stp=c0.67.540.540a_dst-jpg_e35&_nc_ht=instagram.fnjf19-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=TjN-cA8G8NcAX9qBH1b&edm=APU89FABAAAA&ccb=7-5&oh=00_AT9I51ZVqmfC2wvCUTn_LffpnoDv6Ayvj4RR_KU8G_eXzQ&oe=62BBE498&_nc_sid=86f79a"
}
],
"thumbnail_src": "https://instagram.fnjf19-1.fna.fbcdn.net/v/t51.2885-15/289451079_3011485445757272_2206549761362656362_n.webp?stp=c0.67.540.540a_dst-jpg_e35&_nc_ht=instagram.fnjf19-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=TjN-cA8G8NcAX9qBH1b&edm=APU89FABAAAA&ccb=7-5&ig_cache_key=Mjg2Njg4OTQ2NDQzMDkxNjI4NQ%3D%3D.2.c-ccb7-5&oh=00_AT9I51ZVqmfC2wvCUTn_LffpnoDv6Ayvj4RR_KU8G_eXzQ&oe=62BBE498&_nc_sid=86f79a",
"urls": [
"https://instagram.fnjf19-1.fna.fbcdn.net/v/t51.2885-15/289451079_3011485445757272_2206549761362656362_n.webp?stp=dst-jpg_e35&_nc_ht=instagram.fnjf19-1.fna.fbcdn.net&_nc_cat=108&_nc_ohc=TjN-cA8G8NcAX9qBH1b&edm=APU89FABAAAA&ccb=7-5&ig_cache_key=Mjg2Njg4OTQ2NDQzMDkxNjI4NQ%3D%3D.2-ccb7-5&oh=00_AT_6mkMBSUhOLrS4KFzFa3J1UyTyr7HL6DiWVzcJ2roaXg&oe=62BBE498&_nc_sid=86f79a"
],
"username": "shahd_yahia33"
},
如果您只想从'text'键中提取内容到数据框架中,则可以这样做:
def extract_values(entry):
    """Return the caption text from one 'edge_media_to_caption.edges' cell.

    Each cell looks like [{'node': {'text': '...'}}]. Posts without a
    caption have an empty ``edges`` list (and pandas may hold NaN for a
    missing cell), which the original code crashed on with IndexError /
    TypeError — return an empty string in those cases instead.
    """
    if not isinstance(entry, list) or not entry:
        return ''
    return entry[0]['node']['text']
# Load the export, flatten the 'GraphImages' records, then reduce each
# caption cell to its plain text.
with open('shahd_yahia33.json', 'r', encoding='utf-8') as f:
    data = json.load(f)  # json.load reads the file object directly

df_nested_list = pd.json_normalize(data, record_path=['GraphImages'])
# Select the caption column by name rather than the brittle iloc[:, 16] —
# the positional index depends on which keys each file contains.
df = df_nested_list['edge_media_to_caption.edges']
# Pull the text out of each [{'node': {'text': ...}}] cell.
df = df.apply(extract_values)
print(df)
在完成上面代码中的全部预处理之后,我们用 df.apply 把提取函数应用到每个 entry 上。对示例文件,这会得到如下输出:
0 Good morning
Name: edge_media_to_caption.edges, dtype: object
最后,我们可以用 to_csv 把结果保存为 CSV 文件:
# Write the extracted caption series to a CSV file in the current
# directory; the row index is written as the first column by default.
df.to_csv("./extracted.csv")