使用Python从xml url获取数据



我想用 Python 从这个 URL(https://www.bbc.co.uk/food/sitemap.xml)获取食谱 URL 的列表。我尝试过 xmltodict,但从结果来看,它并不能很好地转换其中的文本。我的代码:

import urllib.request
import xmltodict
# Download the sitemap and convert the whole XML document into nested dicts.
with urllib.request.urlopen('https://www.bbc.co.uk/food/sitemap.xml') as url:
    # The read must happen inside the ``with`` block, while the response is open.
    data = url.read()
data = xmltodict.parse(data)
print(data)

部分结果:

[…]OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yoghurtspicedchicken_74830’),('lastmod','2012-06-07'),('image:image',OrderedDict([('image:loc','https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/yoghurtspicedchicken_74830_16x9.jpg')])]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yoghurt_and_muesli_61842’),('lastmod','2018-04-18')]),有序字典([('loc','https://www.bbc.co.uk/food/recipes/yoghurt_cake_87253'),('lastmod','2020-03-31'),('image:image',OrderedDict([('image:loc','https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/yoghurt_cake_87253_16x9.jpg')])]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yorkshirecurdpie_86473’),('lastmod','2019-05-23')]),有序字典([('loc','https://www.bbc.co.uk/food/recipes/yorkshireparkin_83745’),('lastmod','2019-01-02')]),有序字典([('loc','https://www.bbc.co.uk/food/recipes/yorkshirepotwithchri_87677’),('lastmod','2018-12-03')]),有序字典([('loc','https://www.bbc.co.uk/food/recipes/yorkshirepuddingswit_92145’),('lastmod','2016-09-13')]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yorkshirepuddings_86010’),('lastmod','2018-08-08'),('image:image',OrderedDict([('image.loc','https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/yorkshirepuddings_86010_16x9.jpg')])]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yorkshirepuddingviap_9974’),('lastmod','2015-12-07')]),有序字典([('loc','https://www.bbc.co.uk/food/recipes/yorkshirepuddingwith_83703’),('lastmod','2018-10-30')]),有序字典([('loc','https://www.bbc.co.uk/food/recipes/yorkshirepudding_81824’),('lastmod','2019-01-21'),('image:image',OrderedDict([('image:loc','https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/yorkshirepudding_81824_16x9.jpg')])]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yorkshirepudding_93848’),('lastmod','2018-08-08'),('image:image',OrderedDict([('image.loc','https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/yorkshirepudding_93848_16x9.jpg')])]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/york
shire_curd_tart_20002’),('lastmod','2019-01-03'),('image:image',OrderedDict([('image:loc','https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/yorkshire_curd_tart_20002_16x9.jpg')])]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yorkshire_curd_tart_23874’),('lastmod','2019-01-03')]),有序字典([('loc','https://www.bbc.co.uk/food/recipes/yorkshire_curd_tart_63644’),('lastmod','2016-09-19')]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yorkshire_oatmeal_parkin_13911’),('lastmod','2016-09-19')]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yorkshire_puddings_61798’),('lastmod','2018-11-28'),('image:image',OrderedDict([('image.loc','https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/yorkshire_puddings_61798_16x9.jpg')])]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yorkshire_puddings_and_40867’),('lastmod','2018-12-04')]),有序字典([('loc','https://www.bbc.co.uk/food/recipes/yorkshire_puddings_with_15870’),('lastmod','2018-04-30')]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yorkshire_puddings_with_50889’),('lastmod','2019-02-11')]),有序字典([('loc','https://www.bbc.co.uk/food/recipes/yorkshire_pudding_69240’),('lastmod','2019-12-10'),('image:image',OrderedDict([('image.loc','https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/yorkshire_pudding_69240_16x9.jpg')])]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yorkshire_pudding_wraps_73052’),(‘lastmod’,‘2019-09-30’),(‘image:image’,OrderedDict([(‘image:loc’,'https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/yorkshire_pudding_wraps_73052_16x9.jpg')])]),OrderedDict([('loc','https://www.bbc.co.uk/food/recipes/yorkshire_tapas_puddings_93245’),('lastmod','2016-09-14'),('image:image',OrderedDict([('image:loc','https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/yorkshire_tapas_puddings_93245_16x9.jpg')])]),[…]

我只想获得包含在 XML 标记 <loc> 中的 URL,并根据模式 "https://www.bbc.co.uk/food/recipes/" 对其进行过滤。

xmltodict 这种便捷方法更适合结构简单、扁平的 XML 文档;这里不建议使用它,可以考虑改用 Python 内置的 xml.etree 模块解析 XML,并将结果映射为字典。

请确保分配名称空间并有条件地检索图像,因为它并不总是出现在<url>节点下。

import urllib.request
import xml.etree.ElementTree as et
# Prefix that identifies individual recipe pages among all sitemap entries.
RECIPE_PREFIX = 'https://www.bbc.co.uk/food/recipes/'

# Download the raw sitemap bytes; the response is closed when the block exits.
with urllib.request.urlopen('https://www.bbc.co.uk/food/sitemap.xml') as response:
    data = response.read()
xml = et.fromstring(data)

# Both namespaces must be mapped to prefixes so the find* calls below can
# resolve the qualified tag names.
nsmp = {"doc": "http://www.sitemaps.org/schemas/sitemap/0.9",
        "image": "http://www.google.com/schemas/sitemap-image/1.1"}

# One {'loc': ..., 'img': ...} dict per <url> entry of the sitemap.
recipes_dict = []
for url_node in xml.findall('doc:url', namespaces=nsmp):
    loc = url_node.findtext('doc:loc', namespaces=nsmp)
    if not loc:
        # Nothing useful to record for an entry without a location.
        continue
    # <image:image> is optional under <url>; fall back to None when absent.
    img = url_node.findtext('image:image/image:loc', namespaces=nsmp) or None
    recipes_dict.append({'loc': loc, 'img': img})

# Only the URLs that point at individual recipes.
recipe_urls = [entry['loc'] for entry in recipes_dict
               if entry['loc'].startswith(RECIPE_PREFIX)]

输出

len(recipes_dict)
# 20084
# First 20 entries (the listing below has 20 rows, so the slice starts at 0).
recipes_dict[:20]
# {'loc': 'https://www.bbc.co.uk/food/', 'img': None}
# {'loc': 'https://www.bbc.co.uk/food/recipes', 'img': None}
# {'loc': 'https://www.bbc.co.uk/food/chefs', 'img': None}
# {'loc': 'https://www.bbc.co.uk/food/programmes', 'img': None}
# {'loc': 'https://www.bbc.co.uk/food/ingredients', 'img': None}
# {'loc': 'https://www.bbc.co.uk/food/seasons', 'img': None}
# {'loc': 'https://www.bbc.co.uk/food/occasions', 'img': None}
# {'loc': 'https://www.bbc.co.uk/food/cuisines', 'img': None}
# {'loc': 'https://www.bbc.co.uk/food/techniques', 'img': None}
# {'loc': 'https://www.bbc.co.uk/food/recipes/10minutepizza_87314', 'img': None}
# {'loc': 'https://www.bbc.co.uk/food/recipes/15_minute_pasta_33407', 'img': 'https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/15_minute_pasta_33407_16x9.jpg'}
# {'loc': 'https://www.bbc.co.uk/food/recipes/1_creamy_chicken_pasta_24218', 'img': 'https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/1_creamy_chicken_pasta_24218_16x9.jpg'}
# {'loc': 'https://www.bbc.co.uk/food/recipes/1_hoisin_spinach_and_egg_86057', 'img': 'https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/1_hoisin_spinach_and_egg_86057_16x9.jpg'}
# {'loc': 'https://www.bbc.co.uk/food/recipes/1_mixed_vegetable_and_84703', 'img': 'https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/1_mixed_vegetable_and_84703_16x9.jpg'}
# {'loc': 'https://www.bbc.co.uk/food/recipes/2_hour_christmas_dinner_79341', 'img': 'https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/2_hour_christmas_dinner_79341_16x9.jpg'}
# {'loc': 'https://www.bbc.co.uk/food/recipes/3d_biscuits_29555', 'img': 'https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/3d_biscuits_29555_16x9.jpg'}
# {'loc': 'https://www.bbc.co.uk/food/recipes/3wayswithlemoncurd_67266', 'img': None}
# {'loc': 'https://www.bbc.co.uk/food/recipes/3_stir-fry_sauces_52376', 'img': 'https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/3_stir-fry_sauces_52376_16x9.jpg'}
# {'loc': 'https://www.bbc.co.uk/food/recipes/5-ingredient_33925', 'img': 'https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/5-ingredient_33925_16x9.jpg'}
# {'loc': 'https://www.bbc.co.uk/food/recipes/5-minute_chicken_noodle_78996', 'img': 'https://ichef.bbci.co.uk/food/ic/food_16x9_320/recipes/5-minute_chicken_noodle_78996_16x9.jpg'}

最新更新