上面的代码工作得很好,但似乎有一个问题,找到所有的td元素,因为我得到的数据帧只有奇数页。这个问题我解决不了,如果有人能帮我,我将不胜感激。
import requests
from bs4 import BeautifulSoup
import pandas as pd
# URL to scrape
url = 'https://bases.athle.fr/asp.net/liste.aspx?frmbase=resultats&frmmode=1&frmespace=0&frmcompetition=268139&frmposition={page}'
# Create an empty list to store the data
data = []
# Loop through all pages (assuming there are less than range)
for page in range(4):
# Send a GET request to the URL
response = requests.get(url.format(page=page))
# Create a BeautifulSoup object with the content of the response
soup = BeautifulSoup(response.content, 'html.parser')
# Find all the td elements with class datas0
td_datas0 = soup.find_all('td', {'class': 'datas0'})
# Check if there are any results
if len(td_datas0) == 0:
break
# Loop through each td element and extract the text
for td in td_datas0:
data.append(td.text.strip())
# Divide the data into columns
cols = ['Rank', 'Mark', 'Name', 'Club', 'Department', 'Frm_Ligue','Cat_Sex','Col8','Col9']
# Convert the data into a Pandas dataframe
df = pd.DataFrame([data[i:i+len(cols)] for i in range(0, len(data), len(cols))], columns=cols)
# Save the dataframe as a CSV file
df.to_csv('athle.csv', index=False)
print('Data saved to CSV!')
您应该像这样更改类名,如datas0, datas1, datas2
td_datas0 = soup。Find_all ('td', {'class': 'data '+str(page)})
import requests
from bs4 import BeautifulSoup
import pandas as pd
# URL to scrape
url = 'https://bases.athle.fr/asp.net/liste.aspx?frmbase=resultats&frmmode=1&frmespace=0&frmcompetition=268139&frmposition={page}'
# Create an empty list to store the data
data = []
# Loop through all pages (assuming there are less than range)
for page in range(4):
# Send a GET request to the URL
response = requests.get(url.format(page=page))
# Create a BeautifulSoup object with the content of the response
soup = BeautifulSoup(response.content, 'html.parser')
# Find all the td elements with class datas0
td_datas0 = soup.find_all('td', {'class': 'datas'+str(page)})
# Check if there are any results
if len(td_datas0) == 0:
break
# Loop through each td element and extract the text
for td in td_datas0:
data.append(td.text.strip())
# Divide the data into columns
cols = ['Rank', 'Mark', 'Name', 'Club', 'Department', 'Frm_Ligue','Cat_Sex','Col8','Col9']
# Convert the data into a Pandas dataframe
df = pd.DataFrame([data[i:i+len(cols)] for i in range(0, len(data), len(cols))], columns=cols)
# Save the dataframe as a CSV file
df.to_csv('athle.csv', index=False)
print('Data saved to CSV!')