Beautiful Soup scraping data tables to CSV problem



I currently have one script that generates a list of URLs and names from a base URL, and another script that uses a link from that URL list to give me the data I need. What I'm struggling with is how to create a loop that grabs the URL from the first row, runs the second script, and then saves that file named with columns 2+3 from the first file containing the URLs.

Here are my two scripts and the data they output.


import csv
from bs4 import BeautifulSoup
from requests import get

url = 'http://arizonascaleracers.liverc.com/results'
response = get(url)
print(response.text[:500])

html_soup = BeautifulSoup(response.text, 'html.parser')
race_tables = html_soup.find_all('table', {'class': 'table table-hover entry_list_data'})[2]

output_rows = []
for row in race_tables.find('tbody').find_all("tr")[1:]:
    col = row.find_all("td")
    output_row = []
    for td in col:
        if td.find(racename=''):
            racename = ''.join('blank')
        if td.find(suburl=''):
            suburl = ''.join('blank')
        if td.find(time=''):
            time = ''.join('blank')
        else:
            suburl = row.find("a")['href']
            racename = col[0].text
            time = col[1].text
        output_row.append(td)
    output_rows.append(output_row)

with open('output.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    writer.writerows(output_rows)

The output of this script is...

"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227665""><i class=""fa fa-trophy""></i> Race 6: Modified Touring Car (Modified Touring Car A-Main)</a></td>","<td>Nov 23, 2019 at 2:39pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227664""><i class=""fa fa-trophy""></i> Race 5: 17.5 Super Stock Touring Car (17.5 Super Stock Touring Car A-Main)</a></td>","<td>Nov 23, 2019 at 2:31pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227666""><i class=""fa fa-trophy""></i> Race 4: Pro GT (Pro GT A-Main)</a></td>","<td>Nov 23, 2019 at 2:17pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227662""><i class=""fa fa-trophy""></i> Race 3: USGT 21.5 (USGT 21.5 A-Main)</a></td>","<td>Nov 23, 2019 at 2:10pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227663""><i class=""fa fa-trophy""></i> Race 2: 1-12 Open Modified (1-12 Open Modified A-Main)</a></td>","<td>Nov 23, 2019 at 2:03pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227661""><i class=""fa fa-trophy""></i> Race 1: VTA (VTA A-Main)</a></td>","<td>Nov 23, 2019 at 1:52pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227120""><i class=""fa fa-trophy""></i> Race 6: Modified Touring Car (Modified Touring Car  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 1:22pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227119""><i class=""fa fa-trophy""></i> Race 5: 17.5 Super Stock Touring Car (17.5 Super Stock Touring Car  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 1:14pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227118""><i class=""fa fa-trophy""></i> Race 4: Pro GT (Pro GT  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 1:06pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227117""><i class=""fa fa-trophy""></i> Race 3: USGT 21.5 (USGT 21.5  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:58pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227116""><i class=""fa fa-trophy""></i> Race 2: 1-12 Open Modified (1-12 Open Modified  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:51pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227115""><i class=""fa fa-trophy""></i> Race 1: VTA (VTA  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:40pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226732""><i class=""fa fa-trophy""></i> Race 6: Modified Touring Car (Modified Touring Car  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:18pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226731""><i class=""fa fa-trophy""></i> Race 5: 17.5 Super Stock Touring Car (17.5 Super Stock Touring Car  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:11pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226730""><i class=""fa fa-trophy""></i> Race 4: Pro GT (Pro GT  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 12:03pm</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226729""><i class=""fa fa-trophy""></i> Race 3: USGT 21.5 (USGT 21.5  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:55am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226728""><i class=""fa fa-trophy""></i> Race 2: 1-12 Open Modified (1-12 Open Modified  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:47am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226727""><i class=""fa fa-trophy""></i> Race 1: VTA (VTA  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:37am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226250""><i class=""fa fa-trophy""></i> Race 6: Modified Touring Car (Modified Touring Car  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:16am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226249""><i class=""fa fa-trophy""></i> Race 5: 17.5 Super Stock Touring Car (17.5 Super Stock Touring Car  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:08am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226251""><i class=""fa fa-trophy""></i> Race 4: Pro GT (Pro GT  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 11:00am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226247""><i class=""fa fa-trophy""></i> Race 3: USGT 21.5 (USGT 21.5  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 10:52am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226248""><i class=""fa fa-trophy""></i> Race 2: 1-12 Open Modified (1-12 Open Modified  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 10:42am</td>"
"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2226246""><i class=""fa fa-trophy""></i> Race 1: VTA (VTA  (Heat 1/1))</a></td>","<td>Nov 23, 2019 at 10:31am</td>"

And then my second script...

from bs4 import BeautifulSoup
import urllib.request as urllib2

html = urllib2.urlopen('http://arizonascaleracers.liverc.com/results/?p=view_race_result&id=2227665')
soup = BeautifulSoup(html, 'html.parser')
table = soup.find('table', {"class": "table table-striped race_result"})
for row in table.find('tbody').find_all("tr", recursive=False):
    col = row.find_all("td")
    FinishPos = col[0].text.strip()
    DriverInfo = col[1].text.strip()
    DI = row.find('span', class_='driver_name')
    Qual = col[2].text.strip()
    test = col[3].text
    LapsTime = col[4].text
    Behind = col[5].text
    Fastest = col[6].text
    Avg_Lap = col[7].text
    Avg_Top_5 = col[8].text
    Avg_Top_10 = col[9].text
    Avg_Top_15 = col[10].text
    Top_3_Consecutive = col[11].text
    DI = DI.text
    print(FinishPos, Qual, DI, test, LapsTime, Behind, Fastest, Avg_Lap)

The output here is...

1 1 GABE HARVELL 24/5:04.408 --- 12.481 12.83912.839 12.59312.593
2 3 JOHNATHAN LEE 24/5:09.287 4.879 12.583 13.01113.011 12.65512.655
3 4 BRAD TOFFELMIRE 24/5:12.110 2.823 12.520 13.11813.118 12.71012.710
4 2 JACK KLOEBER 23/5:09.212 1 Lap 13.028 13.61013.610 13.09713.097
5 5 BILL CLINE 22/5:02.867 1 Lap 13.177 13.89813.898 13.28713.287
6 6 TIMOTHY SCHMUCK 22/5:03.815 0.948 12.919 13.92713.927 13.13913.139
7 7 CRAIG NELSON 21/5:08.304 1 Lap 13.713 14.82414.824 13.95713.957

Now I think I need to create a loop that runs through these, pulling the input and output columns as it goes, something like...

import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

contents = []
with open('output.csv', 'r') as csvf:
    urls = csv.reader(csvf)[:0]
for url in urls:
    html = urllib2.urlopen('http://arizonascaleracers.liverc.com' + (url))
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {"class": "table table-striped race_result"})
    for row in table.find('tbody').find_all("tr", recursive=False):
        col = row.find_all("td")
        FinishPos = col[0].text.strip()
        DriverInfo = col[1].text.strip()
        DI = row.find('span', class_='driver_name')
        Qual = col[2].text.strip()
        test = col[3].text
        LapsTime = col[4].text
        Behind = col[5].text
        Fastest = col[6].text
        Avg_Lap = col[7].text
        Avg_Top_5 = col[8].text
        Avg_Top_10 = col[9].text
        Avg_Top_15 = col[10].text
        Top_3_Consecutive = col[11].text
        DI = DI.text

    output_rows.append(output_row)
    with open('col[1]+[2].csv', 'w') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerows(output_rows)

Obviously, that last chunk doesn't work, and I can't seem to find any answers on how to accomplish this.
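My best guess at the shape of the loop is the untested sketch below: since output.csv stores the raw <td> HTML, I assume the href, race name, and timestamp have to be re-parsed out of each cell with BeautifulSoup before the sub-URL can be fetched and the filename built.

# Untested sketch: re-parse the stored <td> fragments to recover the link,
# race name and timestamp, then fetch and save each race individually.
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

with open('output.csv', 'r') as csvf:
    rows = list(csv.reader(csvf))

for row_cells in rows:
    if len(row_cells) != 2:  # skip any blank lines the first script wrote
        continue
    race_cell, time_cell = row_cells

    link = BeautifulSoup(race_cell, 'html.parser').find('a')
    suburl = link['href']                 # e.g. /results/?p=view_race_result&id=...
    racename = link.text.strip()          # e.g. Race 6: Modified Touring Car (...)
    racetime = BeautifulSoup(time_cell, 'html.parser').get_text().strip()

    html = urlopen('http://arizonascaleracers.liverc.com' + suburl)
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.find('table', {"class": "table table-striped race_result"})

    output_rows = []
    for row in table.find('tbody').find_all("tr", recursive=False):
        output_rows.append([td.get_text(strip=True) for td in row.find_all("td")])

    # ':' and '/' aren't safe in filenames on most systems, so drop them.
    fname = '{} {}.csv'.format(racename, racetime).replace(':', '').replace('/', '-')
    with open(fname, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(output_rows)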

What I'm trying to end up with is this data...

1 1 GABE HARVELL 24/5:04.408 --- 12.481 12.83912.839 12.59312.593
2 3 JOHNATHAN LEE 24/5:09.287 4.879 12.583 13.01113.011 12.65512.655
3 4 BRAD TOFFELMIRE 24/5:12.110 2.823 12.520 13.11813.118 12.71012.710
4 2 JACK KLOEBER 23/5:09.212 1 Lap 13.028 13.61013.610 13.09713.097
5 5 BILL CLINE 22/5:02.867 1 Lap 13.177 13.89813.898 13.28713.287
6 6 TIMOTHY SCHMUCK 22/5:03.815 0.948 12.919 13.92713.927 13.13913.139
7 7 CRAIG NELSON 21/5:08.304 1 Lap 13.713 14.82414.824 13.95713.957

...but for every URL. The data above is for this single URL:

"<td><a class=""block"" href=""/results/?p=view_race_result&amp;id=2227665""><i class=""fa fa-trophy""></i> Race 6: Modified Touring Car (Modified Touring Car A-Main)</a></td>","<td>Nov 23, 2019 at 2:39pm</td>"

My end goal is to run this on Sundays (we race on Saturdays), scrape the base URL for the sub-URLs, then scrape each individual sub-URL for data similar to the above and save it under the race name and time. Something like

Race 6: Modified Touring Car (Modified Touring Car A-Main)Nov 23, 2019.csv

Thanks for the help!

Here's a refactoring that should help you get the scraped data into CSV form. It eliminates the intermediate CSV file and simply iterates over each race entry URL, extracts the text, and puts the resulting data into a namedtuple data structure for easier manipulation.

#!/usr/bin/env python
# functionality of first script
import sys
import requests
import re
from bs4 import BeautifulSoup
from collections import namedtuple

base_url = 'http://arizonascaleracers.liverc.com'
response = requests.get('{}/results/?p=view_event&id={}'.format(base_url, sys.argv[1]))
#response = requests.get('{}/results'.format(base_url))
html_soup = BeautifulSoup(response.text, 'html.parser')
race_tables = html_soup.find_all('table', {'class': 'table table-hover entry_list_data'})[2]

REGEX_RACENAME = re.compile(r' ?Race \d: [\w .-]+(\([\w .\-(/)]+\))')

Car = namedtuple('Car', ['FinishPos', 'DriverInfo', 'Qual', 'LapsTime', 'Fastest', 'AvgLap', 'AvgTop5', 'AvgTop10', 'AvgTop15', 'Top3Consecutive'])

races = []

class Race:
    def __init__(self, name, time, race_id):
        self.name = name
        self.time = time
        self.id = race_id
        self.positions = []

    def __str__(self):
        retval = ''
        for p in self.positions:
            line = '{}|{}|{}|{}|{}|{}|{}|{}|{}|{}|{}\n'.format(p.FinishPos, p.Qual, p.DriverInfo, p.LapsTime, p.Fastest, p.AvgLap, p.AvgTop10, p.AvgTop15, p.Top3Consecutive, self.name, self.time)
            retval += line
        return retval

for row in race_tables.find('tbody').find_all("tr")[1:]:
    col = row.find("a", href=True)
    try:
        racename = col.text
        # group(1) includes the surrounding parentheses; strip them off
        newname = REGEX_RACENAME.match(racename).group(1)[1:][:-1]
    except AttributeError:
        continue
    else:
        href = col.get('href')
        timestamp = row.find_all("td")[-1].text
        r = Race(newname, timestamp, href)
        response = requests.get('{}{}'.format(base_url, href))
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('table', {"class": "table table-striped race_result"})
        # second script
        for entry in table.find('tbody').find_all("tr", recursive=False):
            col = entry.find_all("td")
            FinishPos = col[0].text.strip()
            DriverInfo = entry.find('span', class_='driver_name').text
            Qual = col[2].text.strip()
            LapsTime = col[4].text
            Fastest = col[6].text
            Avg_Lap = col[7].text
            Avg_Top_5 = col[8].text
            Avg_Top_10 = col[9].text
            Avg_Top_15 = col[10].text
            Top_3_Consecutive = col[11].text
            c = Car(FinishPos, DriverInfo, Qual, LapsTime, Fastest, Avg_Lap, Avg_Top_5, Avg_Top_10, Avg_Top_15, Top_3_Consecutive)
            r.positions.append(c)
        races.append(r)

with open('output', 'a') as outputfile:
    print(len(races))
    for r in races:
        #print(r.name)
        #print(str(r))
        outputfile.write(str(r))  # add in different fields to be outputted to CSV as desired
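Run the script with an event id as its first argument (it is interpolated into the view_event&id= query parameter). And if you want one file per race, named with the race name and timestamp as described in the question, the final write-out could be swapped for something like this sketch (':' and '/' are removed because they aren't legal in filenames on most systems):

# Sketch of an alternative write-out: one CSV per race, reusing the
# `races` list built above.
import csv

for r in races:
    fname = '{} {}.csv'.format(r.name, r.time).replace(':', '').replace('/', '-')
    with open(fname, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for p in r.positions:
            writer.writerow([p.FinishPos, p.Qual, p.DriverInfo, p.LapsTime,
                             p.Fastest, p.AvgLap, p.AvgTop5, p.AvgTop10,
                             p.AvgTop15, p.Top3Consecutive])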

Good luck with your project!
