Scraper: scraping a list from pages

Goal: scrape this page

https://www.cardplayer.com/poker-tournaments/monthly/2021/06

and then, on each page, get a list of all the tournaments. Here is my code:

from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import pandas as pd

mydf = pd.DataFrame()

class TournamentsSpider(CrawlSpider):
    name = 'tournaments'
    allowed_domains = ['www.cardplayer.com']
    start_urls = ['https://www.cardplayer.com/poker-tournaments/monthly/2021/06']
    rules = (
        Rule(LinkExtractor(restrict_xpaths='/html/body/div[5]/div/div[2]/div[2]/div[3]/div/table/tbody/tr/td[2]/a'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # I'm aware that some of the pages have two tables (I was thinking an
        # if statement on the length of the response, then running on table 1
        # for 1-table pages and table 2 for 2-table pages)
        for series in response.xpath('/html/body/div[5]/div/div[2]/div[3]/table/tbody'):
            mydf["Event"] = series.xpath('/html/body/div[5]/div/div[2]/div[3]/table/tbody/tr/td[1]/a/text()')
            mydf["start"] = series.xpath('.//tr/td[2]/text()')
            mydf["days"] = series.xpath('.//tr/td[3]/text()')
            mydf["buyin"] = series.xpath('.//tr/td[4]/text()')

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(TournamentsSpider)
process.start()
print(mydf)

I can see that the crawler is finding all of the URLs, but the output only returns one page, so I must be doing something wrong.
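Before the bs4 version, a note on the Scrapy code itself: the crawl is finding every page, but parse_item reassigns the same four columns of the module-level mydf on each call, so every page overwrites the previous one, and xpath() returns selector objects rather than extracted text. A minimal sketch of one way to fix it, yielding one item per table row and letting a Scrapy feed collect everything; the generic //table XPaths are a simplifying assumption on my part, not verified against the live pages:

from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class TournamentsSpider(CrawlSpider):
    name = 'tournaments'
    allowed_domains = ['www.cardplayer.com']
    start_urls = ['https://www.cardplayer.com/poker-tournaments/monthly/2021/06']
    rules = (
        # Follow each series link in the monthly listing table (assumed
        # generic XPath; the absolute one from the question should also work).
        Rule(LinkExtractor(restrict_xpaths='//table/tbody/tr/td[2]/a'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        # Iterate over the rows of every table on the page, so one- and
        # two-table pages are both handled without a special case.
        for row in response.xpath('//table/tbody/tr'):
            yield {
                'Event': row.xpath('./td[1]/a/text()').get(),
                'start': row.xpath('./td[2]/text()').get(),
                'days': row.xpath('./td[3]/text()').get(),
                'buyin': row.xpath('./td[4]/text()').get(),
            }

process = CrawlerProcess({
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
    # Collect every yielded item into one CSV instead of a shared DataFrame;
    # it can be loaded with pandas.read_csv() after the crawl finishes.
    'FEEDS': {'tournaments.csv': {'format': 'csv'}},
})
process.crawl(TournamentsSpider)
process.start()

Yielding items also avoids relying on crawl order: Scrapy fetches pages concurrently, so there is no guarantee about which response would touch a shared global last.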

Here's how I would do this with bs4; just enter the number of years you want to collect.

# Get product page links
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

baseurl = 'https://www.cardplayer.com/poker-tournaments/monthly/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
Tournaments = []

def GetPageData(url):
    # Get a single page's info
    r = requests.get(url, headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    # Get all tr elements with an empty class
    productlist = soup.find_all('tr', class_='')
    for i, item in enumerate(productlist):
        # Skip the first row of table titles
        if i != 0:
            # Remove surrounding whitespace
            RawTournamentInfo = str(item.text).strip()
            # Split into a list by new lines
            RawTournamentInfo = RawTournamentInfo.splitlines()
            # Create empty strings
            Date = ''
            Name = ''
            Location = ''
            # Had to loop over the list; for some reason direct indexing wasn't working
            for j, line in enumerate(RawTournamentInfo):
                if j == 0: Date = line
                if j == 1: Name = line
                if j == 2: Location = line
            # Create the object and save it to the list
            if (Date != "Dates") and (Date != 'No tournament series found.'):
                print('Added: ', Name)
                tournament = {
                    'date': Date,
                    'name': Name,
                    'location': Location
                }
                Tournaments.append(tournament)
    r.close()

def GetTournaments(yearsToCollect):
    # Get the current year/month
    today = datetime.today()
    currentMonth = today.month
    currentYear = today.year
    for year in range(yearsToCollect):
        if year == 0:
            # Finish out the current year (URL months run 1-12)
            for month in range(currentMonth, 13):
                GetPageData(baseurl + str(currentYear) + '/' + str(month))
        else:
            # All other years
            for month in range(1, 13):
                GetPageData(baseurl + str(currentYear + year) + '/' + str(month))
    # Save to .xlsx
    Tournamentsdf = pd.DataFrame(Tournaments)
    Tournamentsdf.to_excel('Tournaments.xlsx', index=False)

if __name__ == "__main__":
    yearsToCollect = 2
    GetTournaments(yearsToCollect)
