网页抓取-html问题



我一直在调试下面这段代码,已经取得了一些进展,但不知道下一步该怎么做:

import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pprint
import json

# Step 1: read the rider ranking table from the rankings page.
url2 = "https://www.procyclingstats.com/rankings.php"

print(colored('#Step1','green'))
response = requests.get(url2)
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', {'class':'basic'})
# NOTE(review): only <th> cells carrying the "cu600" class are selected here;
# presumably that is why just two headers (prev, team) come back -- confirm
# against the page markup.
headers = [heading.text for heading in table.find_all('th',{"class":"cu600"})]
#print(headers)
#why do I only get two headers here (prev, team)?


# Reuse the HTML that was already downloaded instead of requesting the same
# URL a second time.
dfs = pd.read_html(response.text)[0]
#print(list(dfs))

#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#  print(dfs)

print(colored('#Step1','red'))

# Step 2: inspect a single rider page.
print(colored('#Step2','green'))
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, 'lxml')
# find() takes the tag name as its first positional argument, so the class
# filter has to be passed through attrs, and the value is the bare class
# string rather than 'class="..."'.
table2 = soup.find(attrs={'class': 'mt10 pps'})
#headers = [heading.text for heading in table1.find_all('th',{"class":"cu600"})]
#print(headers)

# Usually the line below is enough
# But for some reason returning Forbidden
#dfs = pd.read_html(url)[0]
# ... so the HTML fetched with requests above is parsed instead.
dfs2 = pd.read_html(response.text)[0]

#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#print(dfs2)

# Print every direct child node of the first <h3> on the rider page.
child_soup = soup.find('h3')

for i in child_soup.children:
    print("child :  ", i)

# Three blank lines as a visual separator.
print('\n'*3)

最终我得到的子节点(child)只是 "Rider"(输出文本如下):

我一直在调试下面这段代码,已经取得了一些进展,但不知道下一步该怎么做:

import pandas as pd
import requests
from termcolor import colored
from bs4 import BeautifulSoup
import requests
import lxml.html as lh
import pprint
import json

# Step 1: read the rider ranking table from the rankings page.
url2 = "https://www.procyclingstats.com/rankings.php"

print(colored('#Step1','green'))
response = requests.get(url2)
soup = BeautifulSoup(response.text, 'lxml')
table = soup.find('table', {'class':'basic'})
# NOTE(review): only <th> cells carrying the "cu600" class are selected here;
# presumably that is why just two headers (prev, team) come back -- confirm
# against the page markup.
headers = [heading.text for heading in table.find_all('th',{"class":"cu600"})]
#print(headers)
#why do I only get two headers here (prev, team)?


# Reuse the HTML that was already downloaded instead of requesting the same
# URL a second time.
dfs = pd.read_html(response.text)[0]
#print(list(dfs))

#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#  print(dfs)

print(colored('#Step1','red'))

# Step 2: inspect a single rider page.
print(colored('#Step2','green'))
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, 'lxml')
# find() takes the tag name as its first positional argument, so the class
# filter has to be passed through attrs, and the value is the bare class
# string rather than 'class="..."'.
table2 = soup.find(attrs={'class': 'mt10 pps'})
#headers = [heading.text for heading in table1.find_all('th',{"class":"cu600"})]
#print(headers)

# Usually the line below is enough
# But for some reason returning Forbidden
#dfs = pd.read_html(url)[0]
# ... so the HTML fetched with requests above is parsed instead.
dfs2 = pd.read_html(response.text)[0]

#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#print(dfs2)

# Print every direct child node of the first <h3> on the rider page.
child_soup = soup.find('h3')

for i in child_soup.children:
    print("child :  ", i)

# Three blank lines as a visual separator.
print('\n'*3)

最终我得到的子节点(child)只是 "Rider"(输出文本如下):

#Step2
child :   Rider

我想抓取的是 "Points per specialty"(各专项积分)这一栏及其对应的数值。

还有第二个问题:为什么我只得到两个表头(prev、team),而所有表头看起来都使用相同的名称?

(图片:用箭头标出了希望抓取到的结果)

import re

# Part 1: collect every column header of the ranking table by reading all
# <th> cells inside its <thead>, without filtering on a CSS class.
url2 = "https://www.procyclingstats.com/rankings.php"
response = requests.get(url2)
soup = BeautifulSoup(response.text, "lxml")
table = soup.find("table", {"class": "basic"})
thead = table.find("thead")
headers = [heading.text for heading in thead.find_all("th")]

# Part 2: read the <li> items of the first <ul class="basic"> on the rider
# page and split each item's text into a leading number and a trailing label.
url3 = "https://www.procyclingstats.com/rider/tadej-pogacar"
response = requests.get(url3)
soup = BeautifulSoup(response.text, "lxml")
ul = soup.find("ul", {"class": "basic"})
li = ul.find_all("li")
d = {}
for item in li:
    # Group 1 is the first run of digits, group 2 is everything after it.
    # The raw string keeps the backslash in \d intact.
    m = re.search(r"(\d+)(.*)", item.text)
    if m is None:
        # Skip list items that contain no number instead of crashing.
        continue
    d[m.group(2)] = m.group(1)
print(headers)
print(d)
# ['#', 'Prev.', 'Diff.', 'Rider', 'Team', 'Points']
# {'One day races': '1641', 'GC': '3444', 'Time trial': '1147', 'Sprint': '302', 'Climber': '3816'}

最新更新