使用 Selenium 和 pandas 进行网页抓取时,数据帧(DataFrame)的值没有写入 CSV



下面的代码是原始代码的简化示例,但同样能说明问题。在我的项目中,我使用 Selenium 收集数据,然后点击"统计"(İstatistik)按钮,再让 pandas 库读取页面源码。几行代码之后,我就得到了想要的数据帧。问题出在导出过程中:尽管通过选择器抓取的数据被完整地写入了 CSV,数据帧中的值却全部显示为零。

from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from time import sleep
from datetime import datetime, timezone
import pandas as pd
import re
# Accumulators: `season` collects one row per scraped match, `errors` is
# reported at the end of the run.
season = []
errors = []

# Match page to scrape.
my_url = 'https://www.mackolik.com/mac/bayern-m%C3%BCnchen-vs-augsburg/2mck8cqve7nadhtfff65a1mhg'

# Launch Firefox with a visible window (headless explicitly disabled).
option = Options()
option.headless = False
driver = webdriver.Firefox(options=option)

# Open the page, maximise the window, then pause for a fixed five seconds
# before any element is looked up.
driver.get(my_url)
driver.maximize_window()
sleep(5)

#scraping
def _pick_rows(table, rows):
    """Return the chosen rows of a statistics table as team1/team2 columns.

    Strips '%' from every cell, drops the table's second column, keeps only
    the rows at the given positions and renames the two remaining columns
    to 'team1' and 'team2'.
    """
    cleaned = table.replace('%', '', regex=True)
    cleaned = cleaned.drop(cleaned.columns[1], axis=1)
    picked = cleaned.iloc[rows]
    picked.columns = ['team1', 'team2']
    return picked


try:
    # Kick-off time, read from the date span's `data-utc` attribute.
    date_elm = driver.find_element(
        By.XPATH,
        "//p[@class='p0c-soccer-match-details-header__info']//span[@class='p0c-soccer-match-details-header__info-date']",
    ).get_attribute('data-utc')
    # presumably epoch milliseconds, hence the division by 1000 -- TODO confirm
    ts = int(date_elm)
    ts /= 1000
    # fromtimestamp() without a tz argument renders the machine's local time.
    date = datetime.fromtimestamp(ts).strftime('%d/%m/%Y %H:%M')

    # Header info bar: a '|'-separated line. Field 2 feeds the matchday and
    # field 3 the attendance.
    info_bar = driver.find_element(
        By.CSS_SELECTOR,
        "p[class='p0c-soccer-match-details-header__info']",
    ).text
    info = info_bar.split('|')
    day = info[2]
    # First run of digits in the field; the raw string keeps `\d` a digit class.
    matchday = re.findall(r"\d+", day)[0]
    crowd = info[3]
    # Attendance is the text between the parentheses of the crowd field.
    attedance = crowd[crowd.find('(')+1:crowd.find(')')]

    home_team = driver.find_element(
        By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__team-name.p0c-soccer-match-details-header__team-name--home",
    ).text
    away_team = driver.find_element(
        By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__team-name.p0c-soccer-match-details-header__team-name--away",
    ).text

    home_score = driver.find_element(
        By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__score-home",
    ).text
    away_score = driver.find_element(
        By.CSS_SELECTOR,
        ".p0c-soccer-match-details-header__score-away",
    ).text

    # First detailed-score line; split on '(', ')' and spaces, then take
    # fields 2 and 4 as the home and away half-time scores.
    ht_scoreA = driver.find_element(
        By.XPATH,
        "(//div[@class='p0c-soccer-match-details-header__detailed-score'])[1]",
    ).text
    ht_scoreB = re.split(r'[(-) ]', ht_scoreA)
    home_htscore = ht_scoreB[2]
    away_htscore = ht_scoreB[4]

    referee = driver.find_element(
        By.CSS_SELECTOR,
        "li[class='p0c-match-officials__official-list-item p0c-match-officials__official-list-item--main '] span[class='p0c-match-officials__official-name']",
    ).text

    # Open the statistics tab with a JavaScript click, then pause so the
    # tables are in the page source before it is parsed.
    elem = WebDriverWait(driver, 5).until(EC.element_to_be_clickable((
        By.LINK_TEXT, "İstatistik")))
    driver.execute_script("arguments[0].click();", elem)
    sleep(10)

    # Parse every HTML table on the page and keep the rows of interest from
    # tables 0, 1, 2 and 4.
    dfs = pd.read_html(driver.page_source)
    general = _pick_rows(dfs[0], [0, 10])
    passes = _pick_rows(dfs[1], [6, 8])
    attack = _pick_rows(dfs[2], [10, 8, 4])
    fouls = _pick_rows(dfs[4], [0, 2, 4])

    # ignore_index=True already yields a fresh 0..9 index for the ten rows.
    stats = pd.concat([general, passes, attack, fouls], ignore_index=True)
    # Label the rows in the order they were concatenated above.
    stats = stats.assign(sts=['Possesion', 'Corners', 'Attack_Passes', 'Centres',
                              'Short_Shots', 'Long_Shots', 'Target_Shots', 'Fouls',
                              'Yellows', 'Reds'])
    stats.columns = [home_team, away_team, 'sts']
    stats = stats.reindex(columns=[home_team, 'sts', away_team])
except Exception as exc:
    # Record the failure and re-raise it: the code after this block cannot
    # run without the scraped values, so swallowing the error would only
    # surface later as a confusing NameError.
    errors.append(exc)
    raise
finally:
    # Close the browser exactly once, whether scraping succeeded or failed.
    driver.quit()

# Handling the stats
home_stats = {}
away_stats = {}

home_series = stats[home_team]
away_series = stats[away_team]
stats_series = stats['sts']

# Walk the three columns in lockstep and key each value by its label,
# with spaces turned into underscores and the result lowercased.
for home_value, label, away_value in zip(home_series, stats_series, away_series):
    stat = label.replace(' ', '_').lower()
    home_stats[stat] = home_value
    away_stats[stat] = away_value

stats_check = ['Yellows', 'Reds', 'Short_Shots', 'Long_Shots', 'Target_Shots',
               'Corners', 'Possesion', 'Centres', 'Attack_Passes', 'Fouls']

# NOTE: the keys stored above are lowercase ('yellows', ...) while the names
# checked here are capitalised ('Yellows', ...). None of them is ever found,
# so every capitalised key is created with the value 0.
for stat in stats_check:
    if stat in home_stats:
        continue
    home_stats[stat] = 0
    away_stats[stat] = 0

# Storing the data
# One flat row per match: ten descriptive fields, then a home/away pair for
# each statistic, looked up under its capitalised key.
match = [
    date, matchday, home_team, away_team,
    home_score, away_score, home_htscore, away_htscore,
    referee, attedance,
    home_stats['Yellows'], away_stats['Yellows'],
    home_stats['Reds'], away_stats['Reds'],
    home_stats['Short_Shots'], away_stats['Short_Shots'],
    home_stats['Long_Shots'], away_stats['Long_Shots'],
    home_stats['Target_Shots'], away_stats['Target_Shots'],
    home_stats['Corners'], away_stats['Corners'],
    home_stats['Possesion'], away_stats['Possesion'],
    home_stats['Centres'], away_stats['Centres'],
    home_stats['Attack_Passes'], away_stats['Attack_Passes'],
    home_stats['Fouls'], away_stats['Fouls'],
]

season.append(match)


# Exporting the data
columns = ['date', 'matchday', 'home_team', 'away_team', 'home_score', 'away_score',
           'home_htscore', 'away_htscore', 'referee', 'attedance']

# Two columns per statistic, home first, following the order of stats_check.
columns.extend(
    f'{side}_{stat}' for stat in stats_check for side in ('home', 'away')
)

# Write every collected match row to disk without the DataFrame index.
dataset = pd.DataFrame(season, columns=columns)
dataset.to_csv('Bundesliga_test.csv', index=False)
print('.csv file exported.')
print(f'Number of errors: {len(errors)}')
# '\n' leaves a blank line between the heading and the list of errors.
print('Errors:\n')
print(errors)

根据 chitown88 的建议,我进行了调试,最终找到了问题所在:统计项的键在存入字典时被转换成了小写,因此必须把下面列表中的键名全部改为小写。修复之后,程序运行正常。

# Corrected match row: every statistic is looked up under its lowercase key,
# matching the keys produced by `.lower()` when home_stats / away_stats are
# filled from the stats DataFrame.
match = [date, matchday, home_team, away_team, home_score, away_score, home_htscore, away_htscore, referee, attedance,
home_stats['possesion'], away_stats['possesion'], home_stats['corners'], away_stats['corners'],
home_stats['attack_passes'], away_stats['attack_passes'], home_stats['centres'], away_stats['centres'],
home_stats['short_shots'], away_stats['short_shots'], home_stats['long_shots'], away_stats['long_shots'],
home_stats['target_shots'], away_stats['target_shots'], home_stats['fouls'], away_stats['fouls'],
home_stats['yellows'], away_stats['yellows'], home_stats['reds'], away_stats['reds']]

# Expected statistic keys, lowercase, listed in the same order as the
# home/away pairs in `match` above.
stats_check = ['possesion', 'corners', 'attack_passes', 'centres',
'short_shots', 'long_shots', 'target_shots', 'fouls', 'yellows', 'reds']

最新更新