我一直在尝试下面这段代码,并取得了一些进展,但不确定下一步该怎么做。
import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pprint
import json
# Scrape the rankings table and one rider page from procyclingstats.com.
# Step 1 collects the ranking table headers and loads the first HTML table
# into a DataFrame; step 2 does the same for a rider page and prints the
# children of the first <h3> on that page.
from io import StringIO  # local import: wraps HTML text for pd.read_html

url2 = "https://www.procyclingstats.com/rankings.php"
print(colored('#Step1', 'green'))
# Download the page once and reuse the response for both the BeautifulSoup
# parse and the pandas parse (the page was previously fetched twice).
response = requests.get(url2)
# Fail with a clear HTTP error instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', {'class': 'basic'})
# Filtering <th> on class "cu600" only returned the header cells carrying
# that class (reported as just prev and team). Take every <th> of the header
# row instead; fall back to the whole table when there is no <thead>.
header_scope = table.find('thead')
if header_scope is None:
    header_scope = table
headers = [heading.text for heading in header_scope.find_all('th')]
# Passing literal HTML to pd.read_html is deprecated; wrap it in StringIO.
dfs = pd.read_html(StringIO(response.text))[0]
print(colored('#Step1', 'red'))

print(colored('#Step2', 'green'))
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
# NOTE: pd.read_html(url) was reported to return Forbidden for this site,
# so the page is downloaded with requests and the text is parsed instead.
response = requests.get(url3)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
# The original call passed a dict as the tag name, which cannot match the
# intended element. Search by the exact class attribute value instead.
# NOTE(review): assumes the target element has class="mt10 pps" - confirm
# against the live page markup.
table2 = soup.find(class_='mt10 pps')
dfs2 = pd.read_html(StringIO(response.text))[0]
child_soup = soup.find('h3')
# Guard against pages without any <h3> (find returns None in that case).
if child_soup is not None:
    for i in child_soup.children:
        print("child : ", i)
        # Three blank lines as a visual separator; the backslash of the
        # newline escape had been lost, which printed "nnn" instead.
        print('\n' * 3)
最终我得到的 child 是 "Rider"(输出文本如下):
我一直在尝试下面这段代码,并取得了一些进展,但不确定下一步该怎么做。
import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pprint
import json
# Scrape the rankings table and one rider page from procyclingstats.com.
# Step 1 collects the ranking table headers and loads the first HTML table
# into a DataFrame; step 2 does the same for a rider page and prints the
# children of the first <h3> on that page.
from io import StringIO  # local import: wraps HTML text for pd.read_html

url2 = "https://www.procyclingstats.com/rankings.php"
print(colored('#Step1', 'green'))
# Download the page once and reuse the response for both the BeautifulSoup
# parse and the pandas parse (the page was previously fetched twice).
response = requests.get(url2)
# Fail with a clear HTTP error instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', {'class': 'basic'})
# Filtering <th> on class "cu600" only returned the header cells carrying
# that class (reported as just prev and team). Take every <th> of the header
# row instead; fall back to the whole table when there is no <thead>.
header_scope = table.find('thead')
if header_scope is None:
    header_scope = table
headers = [heading.text for heading in header_scope.find_all('th')]
# Passing literal HTML to pd.read_html is deprecated; wrap it in StringIO.
dfs = pd.read_html(StringIO(response.text))[0]
print(colored('#Step1', 'red'))

print(colored('#Step2', 'green'))
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
# NOTE: pd.read_html(url) was reported to return Forbidden for this site,
# so the page is downloaded with requests and the text is parsed instead.
response = requests.get(url3)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
# The original call passed a dict as the tag name, which cannot match the
# intended element. Search by the exact class attribute value instead.
# NOTE(review): assumes the target element has class="mt10 pps" - confirm
# against the live page markup.
table2 = soup.find(class_='mt10 pps')
dfs2 = pd.read_html(StringIO(response.text))[0]
child_soup = soup.find('h3')
# Guard against pages without any <h3> (find returns None in that case).
if child_soup is not None:
    for i in child_soup.children:
        print("child : ", i)
        # Three blank lines as a visual separator; the backslash of the
        # newline escape had been lost, which printed "nnn" instead.
        print('\n' * 3)
最终我得到的 child 是 "Rider"(输出文本如下):
#Step2
child : Rider
我想抓取的是"每个专项的积分"及其对应的数值。
还有第二个问题:为什么我只得到两个表头,而所有表头看起来都使用了相同的名字?
显示所需结果的照片和箭头
import re
# Collect the column headers of the rankings table and the per-specialty
# points of one rider, then print both.
url2 = "https://www.procyclingstats.com/rankings.php"
response = requests.get(url2)
soup = BeautifulSoup(response.text, "lxml")
table = soup.find("table", {"class": "basic"})
thead = table.find("thead")
# Take every <th> in the header row rather than filtering on a CSS class.
headers = [heading.text for heading in thead.find_all("th")]

url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, "lxml")
ul = soup.find("ul", {"class": "basic"})
li = ul.find_all("li")

# Each list item is expected to hold a number followed by its label.
# The pattern must be a raw string with \d: without the backslash it matches
# the literal letter "d" and fails on labels that contain none (e.g. "GC").
points_pattern = re.compile(r"(\d+)(.*)")
d = {}
for item in li:
    m = points_pattern.search(item.text)
    if m is None:
        # Skip list items that carry no number instead of crashing on None.
        continue
    # Map label -> points, keeping the points as the matched digit string.
    d[m.group(2)] = m.group(1)

print(headers)
print(d)
# ['#', 'Prev.', 'Diff.', 'Rider', 'Team', 'Points']
# {'One day races': '1641', 'GC': '3444', 'Time trial': '1147', 'Sprint': '302', 'Climber': '3816'}