使用 Selenium 和 BeautifulSoup 抓取父子关系数据

希望大家一切都好！我正在尝试抓取这个谱系列表(https://cov-lineages.org/lineage_list.html)，各谱系之间是父子关系。我想做的是：

遍历该列表(https://cov-lineages.org/lineage_list.html)，点击每个元素并抓取其数据
然后转到同一页面中的一个链接，该链接包含每个谱系的突变表
向下滚动到列出该谱系子级的表，遍历这些子级，逐个点击并抓取其数据；对于每个子级，如果它还有子级，就重复同样的过程并抓取它们。我在这里附上了一个 PDF 文件，其中有截图说明。请看一看，看看你是否能想出如何用树或嵌套字典来实现。

您不需要Selenium来执行此任务，requests将完成此任务。

此代码将获取列表中的所有行：

import requests
from bs4 import BeautifulSoup
# Download the lineage list page; fail loudly on a hung connection or an
# HTTP error instead of silently parsing an error page.
res = requests.get('https://cov-lineages.org/lineage_list.html', timeout=30)
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

# Collect every <tr> element found in the document and print each one.
rows = soup.find_all('tr')
for row in rows:
    print(row)

在此基础上，你可以用 row.find_all('td') 获取每一行中的各个单元格。使用浏览器检查器(CTRL+SHIFT+I)来找出所需的 HTML 元素。

数据都在站点用来渲染页面的 JSON 源文件中。直接获取这些数据效率更高，只需很短的时间就能得到你用 Selenium 收集到的所有数据：只需几秒钟，而不是花几个小时让 Selenium 逐个点击 1907 个父级链接，再点击所有子级链接(我甚至不知道有多少……但看起来 Selenium 总共点击了 2181 个左右的链接)。

至于如何把它转换成所需的输出，要理清逻辑、找出哪些谱系是哪些父级的子级，再从叶节点向上构建，这有点棘手。我相信有更好的写法，但我认为下面的代码可以做到：

import requests
import pandas as pd
import re

# Source data
# Download the per-lineage JSON and reshape every record into the node
# layout used for the output: one dict per lineage, keyed by lineage id.
_url = 'https://raw.githubusercontent.com/cov-lineages/lineages-website/master/_data/lineage_data.json'
_json_response = requests.get(_url, timeout=30)
_json_response.raise_for_status()

# The top-level JSON object maps a key to a record; only the records are used.
jsonData = list(_json_response.json().values())

sourceData = {}
for _each in jsonData:
    _lineage = _each['Lineage']
    sourceData[_lineage] = {
        'id': _lineage,
        'description': _each['Description'],
        'most_common_countries': _each['Countries'],
        'earliest_date': _each['Earliest date'],
        'number_designated': _each['Number designated'],
        'number_assigned': _each['Number assigned'],
        # Starts empty; child nodes are appended further down the script.
        'children': [],
    }

# This parses the yml file to work out which child belongs to which parent.
_url = 'https://cov-lineages.org/data/lineages.yml'
_yml_response = requests.get(_url, timeout=30)
_yml_response.raise_for_status()
_response = _yml_response.text

# Each match is a (key, value) tuple, in file order, where key is either
# 'name: ' or 'parent: ' and value is the remainder of that line.
_lineages = re.findall(r'(name: |parent: )(.*)', _response)

parent_children = {}
# Create dictionary of all parent lineages
for _idx, _lineage in enumerate(_lineages):
    # A non-empty 'parent: ' value names a lineage that has children.
    if _lineage[0] == 'parent: ' and _lineage[1] != '' and _lineage[1] not in parent_children:
        parent_children[_lineage[-1]] = {'children': []}

    # An entry with an empty value registers the entry just before it
    # (presumably a lineage without a parent, i.e. a root -- TODO confirm
    # against the yml layout).
    if _lineage[1] == '' and _lineages[_idx - 1][1] not in parent_children:
        parent_children[_lineages[_idx - 1][1]] = {'children': []}

# Match parent with appropriate children: an entry directly followed by a
# non-empty 'parent: ' entry is recorded as a child of that parent.
for _idx, _lineage in enumerate(_lineages):
    if _idx + 1 == len(_lineages):
        continue  # the last entry has nothing following it
    _next_key, _next_value = _lineages[_idx + 1]
    if _lineage[0] == 'name: ' and _next_key == 'name: ':
        continue  # two names in a row: no parent line for this one
    if _next_value == '':
        continue  # following entry carries no value
    if _next_key == 'parent: ':
        parent_children[_next_value]['children'].append(_lineage[-1])

# Creates a list and dictionary so that I can call out the parent
# given a child by its key/lineage id:
#   parent_child_relations -> [[parent, child], ...]
#   child_parent_relations -> {child: parent}
parent_child_relations = []
child_parent_relations = {}
for parent, children in parent_children.items():
    for child in children['children']:
        parent_child_relations.append([parent, child])
        child_parent_relations[child] = parent

# Creates the "family tree" of each child to then iterate through:
# nested_child_parent maps a child to its list of ancestors, ordered from
# the direct parent up to the topmost ancestor.
nested_child_parent = {}
for each in child_parent_relations:
    familyOrder = []
    # Guard against cyclic parent links so malformed data cannot loop forever.
    seen = {each}
    current = each
    # Keep climbing while the current lineage is itself recorded as a child.
    while current in child_parent_relations:
        current = child_parent_relations[current]
        if current in seen:
            break
        seen.add(current)
        familyOrder.append(current)
    nested_child_parent[each] = familyOrder
# Sorts that list from the "deepest" branches so that I can
# reconstruct from bottom leaf.
# Groups the (child, ancestors) pairs by how many ancestors the child has.
sorted_nested_child_parent = {}
for each in nested_child_parent.items():
    length_of_branches = len(each[-1])
    sorted_nested_child_parent.setdefault(length_of_branches, []).append(each)

# Branch lengths in descending order, i.e. deepest children first.
lengthKeys = sorted(sorted_nested_child_parent, reverse=True)

# Starts to add the children lineage data into appropriate parent's children list
# in the source data
def _placeholder_node(lineage_id):
    """Return an 'NA'-filled node for a lineage that has no source data."""
    return {
        'id': lineage_id,
        'description': 'NA',
        'most_common_countries': 'NA',
        'earliest_date': 'NA',
        'number_designated': 'NA',
        'number_assigned': 'NA',
        'children': [],
    }


for x in lengthKeys:
    listToAggregate = sorted_nested_child_parent[x]
    for each in listToAggregate:
        # each is a (child, ancestors) pair; walk up the ancestor chain,
        # attaching every node to the one directly above it.
        current = each[0]

        for parent in each[1]:
            # A lineage may be missing from the source data on either side
            # of the link; give it a placeholder instead of raising KeyError.
            for _lineage_id in (current, parent):
                if _lineage_id not in sourceData:
                    sourceData[_lineage_id] = _placeholder_node(_lineage_id)

            lineageData = sourceData[current]

            # if lineageData not already in children, add it
            if lineageData not in sourceData[parent]['children']:
                sourceData[parent]['children'].append(lineageData)
            current = parent

# Gets the list of the main/top lineages: every lineage in the first table
# of the lineage list page that is not recorded as a child of another one.
mainNodes = []
parent_list = list(pd.read_html('https://cov-lineages.org/lineage_list.html')[0]['Lineage'])
for each in parent_list:
    if each not in child_parent_relations:
        print(f'{each} is not a child.')
        mainNodes.append(each)
# Gets the main/top lineages from the source data
# and puts into the output list
output = [sourceData[each] for each in mainNodes]

样本输出：

[
{
"id": "A",
"description": "Root of the pandemic lies within lineage A. Many sequences originating from China and many global exports; including to South East Asia Japan South Korea Australia the USA and Europe represented in this lineage",
"most_common_countries": "United States of America 27.0%, United_Arab_Emirates 12.0%, China 9.0%, Germany 8.0%, Canada 5.0%",
"earliest_date": "2019-12-30",
"number_designated": 1698,
"number_assigned": 2317,
"children": [
{
"id": "B",
"description": "Second major haplotype (and first to be discovered)",
"most_common_countries": "United States of America 37.0%, United Kingdom 20.0%, China 7.0%, Mexico 6.0%, Germany 3.0%",
"earliest_date": "2019-12-24",
"number_designated": 4009,
"number_assigned": 9162,
"children": [
{
"id": "B.1",
"description": "A large European lineage the origin of which roughly corresponds to the Northern Italian outbreak early in 2020.",
"most_common_countries": "United States of America 46.0%, United Kingdom 8.0%, Turkey 8.0%, Canada 4.0%, France 4.0%",
"earliest_date": "2020-01-03",
"number_designated": 46252,
"number_assigned": 95711,
"children": [
{
"id": "B.1.1",
"description": "European lineage with 3 clear SNPs `28881GA`,`28882GA`,`28883GC`",
"most_common_countries": "United Kingdom 27.0%, United States of America 14.0%, Japan 7.0%, Russia 5.0%, Turkey 4.0%",
"earliest_date": "2020-01-08",
"number_designated": 22834,
"number_assigned": 49224,
"children": [
{
"id": "B.1.1.1",
"description": "England",
"most_common_countries": "United Kingdom 53.0%, Peru 10.0%, Belgium 4.0%, United States of America 3.0%, Italy 2.0%",
"earliest_date": "2020-03-02",
"number_designated": 1745,
"number_assigned": 2913,
"children": [
{
"id": "C.36",
"description": "Alias of B.1.1.1.36, Egypt mainly and other countries",
"most_common_countries": "Egypt 33.0%, Germany 11.0%, United Kingdom 10.0%, United States of America 7.0%, Denmark 6.0%",
"earliest_date": "2020-03-13",
"number_designated": 220,
"number_assigned": 1042,
"children": [
{
"id": "C.36.3",
"description": "Alias of B.1.1.1.36.3, Europe and USA lineage, from pango-designation issue #80",
"most_common_countries": "Germany 18.0%, United States of America 18.0%, Switzerland 9.0%, Italy 8.0%, United Kingdom 7.0%",
"earliest_date": "2021-01-04",
"number_designated": 493,
"number_assigned": 1681,
"children": [
{
"id": "C.36.3.1",
"description": "Alias of B.1.1.1.36.3.1, Europe and USA lineage, from pango-designation issue #80",
"most_common_countries": "Germany 64.0%, United States of America 18.0%, Belgium 9.0%, Bulgaria 3.0%, Netherlands 3.0%",
"earliest_date": "2021-03-29",
"number_designated": 54,
"number_assigned": 324,
"children": []
}
]
},
{
"id": "C.36.1",
"description": "Alias of B.1.1.1.36.1, Canada",
"most_common_countries": "Canada 97.0%, United States of America 2.0%, Burkina_Faso 1.0%, Egypt 1.0%",
"earliest_date": "2020-06-24",
"number_designated": 21,
"number_assigned": 199,
"children": []
},
{
"id": "C.36.2",
"description": "Alias of B.1.1.1.36.2, Switzerland",
"most_common_countries": "Switzerland 80.0%, Norway 7.0%, Germany 3.0%, United States of America 3.0%, Sweden 3.0%",
"earliest_date": "2020-10-16",
"number_designated": 18,
"number_assigned": 30,
"children": []
}
]
},
{
"id": "C.1",
"description": "Alias of B.1.1.1.1, South Africa",
"most_common_countries": "South_Africa 91.0%, Zambia 4.0%, United States of America 3.0%, Mozambique 1.0%, Zimbabwe 0.0%",
"earliest_date": "2020-01-03",
"number_designated": 242,
"number_assigned": 351,
"children": [
{
"id": "C.1.1",
"description": "Alias of B.1.1.1.1.1, Mozambique",
"most_common_countries": "Mozambique 100.0%",
"earliest_date": "2020-11-25",
"number_designated": 12,
"number_assigned": 13,
"children": []
},
{
"id": "C.1.2",
"description": "Alias of B.1.1.1.1.2, mostly South Africa, from pango-designation issue #139",
"most_common_countries": "South_Africa 88.0%, Eswatini 4.0%, Russia 2.0%, United Kingdom 1.0%, Botswana 1.0%",
"earliest_date": "2021-04-07",
"number_designated": 15,
"number_assigned": 281,
"children": []
}
]
},
{
"id": "C.2",
"description": "Alias of B.1.1.1.2, South Africa and some European",
"most_common_countries": "South_Africa 44.0%, Zimbabwe 32.0%, Denmark 8.0%, United Kingdom 8.0%, Australia 6.0%",
"earliest_date": "2020-06-09",
"number_designated": 25,
"number_assigned": 50,
"children": [
{
"id": "C.2.1",
"description": "Alias of B.1.1.1.2.1, Aruba and Curacao",
"most_common_countries": "Aruba 60.0%, United States of America 28.0%, Cura\u00e7ao 9.0%, Netherlands 3.0%, Finland 1.0%",
"earliest_date": "2020-12-18",
"number_designated": 58,
"number_assigned": 150,
"children": []
}
]
}

相关内容

最新更新

热门标签：