如何抓取国际足联网站的JavaScript表格



为了一个研究项目,我想从国际足联(FIFA)网站上抓取国际足球(soccer/football)比赛的所有结果。我正在使用 R 来完成这项工作。但是,包含比赛数据的表格似乎是用 JavaScript 生成的。这是我想抓取的网址:

http://www.fifa.com/live-scores/international-tournaments/fixtures-results/index.html#month5-2018

我尝试使用 PhantomJS 渲染页面,希望在 JavaScript 表格渲染完成后获取页面内容,但在生成的 HTML 中仍然没有包含比赛结果的表格。这是我的代码:

# Target page. The URL must be one unbroken string literal: a line break inside
# the literal would embed a space and a newline into the address, and would
# also break the single-quoted JavaScript string it is substituted into below.
url <- "http://www.fifa.com/live-scores/international-tournaments/fixtures-results/index.html#month5-2018"

# Generate a PhantomJS script (scrape.js) that opens the page and saves its
# HTML to scrape.html. sprintf() substitutes the URL for the single placeholder.
writeLines(sprintf("
var page = require('webpage').create();
var fs = require('fs');
var path = 'scrape.html';
page.open('%s', function (status) {
  if (status !== 'success') {
    // Fail loudly instead of writing an empty or partial page.
    console.log('Unable to load the page: ' + status);
    phantom.exit(1);
    return;
  }
  // The content is captured as soon as the load callback fires.
  var content = page.content;
  fs.write(path, content, 'w');
  phantom.exit();
});", url), con = "scrape.js")

# Run the generated script with the PhantomJS binary in the working directory.
system("./phantomjs.exe scrape.js")

您不需要等表格构造完成后再去抓取它:该网站会向类似下面这样的端点(endpoint)发起请求来获取数据。

http://data.fifa.com/livescores/en/internationaltournaments/matches/m/byyearandmonth/2018/5

http://data.fifa.com/livescores/live/matches

要找到这些端点,请使用浏览器的网络检查器(按 F12)。更简单的做法是直接获取用于构造这些表格的 JSON,而不是在表格构造完成之后再去抓取表格。

编辑:构造表格所需的全部数据都在这些 JSON 中。要获取数据,首先执行 GET 请求,下载这些链接返回的内容。检查返回的内容时,您会发现它们是 JSON,但被包裹在一个函数调用里(即 JSONP 形式),只需把这层函数调用去掉即可。

例如,对于第一个链接,需要去掉包裹 JSON 的开头部分 `_matchesByYearAndMonthCallback(` 以及末尾的 `)`。

去掉这层包裹之后,您将得到一个有效的 JSON,可以在 R 中使用 jsonlite 或 rjson 等包来解析它,具体用法请查看相应文档。使用其中一个包解析后,您应该会得到一个数据框(data frame),可以从中选取所需的信息。

你将获得的 json 开头的示例。

{
"competitionslist": {
"0": {
"name": "Friendlies",
"idCup": 506,
"edition": 1872,
"idCupSeason": 2000010101,
"isFifaCompetition": true,
"countryCode": "",
"cupKindID": 105,
"competitionSeoName": "friendly-506",
"hasStanding": false,
"linkMatches": "",
"linkStanding": "",
"link": "",
"hasMatchLive": false,
"isActiveSeason": true,
"matchlist": [{
"idCup": 506,
"idCupSeason": 2000010101,
"edition": 1872,
"isLive": false,
"isActiveSeason": true,
"isFifaCompetition": true,
"isClubCompetition": false,
"competitionName": "Friendlies",
"providerCompetitionID": 0,
"providerEditionID": 0,
"idMatch": 300438343,
"internalMatchID": 0,
"idRound": 281863,
"idHomeTeam": 43818,
"homeCountryCode": "IRQ",
"homeTeamName": "Iraq",
"idAwayTeam": 43989,
"awayCountryCode": "PLE",
"awayTeamName": "Palestine",
"matchDate": "2018-05-08T16:00:00Z",
"matchDateUTC": "2018-05-08T16:00:00Z",
"kickOffTime": "16:00",
"minute": 0,
"status": 0,
"cupKindID": 105,
"cupKindName": "Friendly",
"hasLineup": false,
"scoreHome": 0,
"scoreAway": 0,
"venueName": "Basra ",
"competitionSeoName": "friendly-506",
"matchSeoName": "Iraq-Palestine-300438343",
"homeTeamSeoName": "iraq-43818",
"awayTeamSeoName": "palestine-43989",
"hasStanding": false,
"winTeamName": "",
"winTeamShortName": "",
"isStarted": true,
"isFinished": true,
"isAwarded": false,
"isPostponed": false,
"isSuspended": false,
"isAbandoned": false,
"link": "",
"isNextDay": false
}, {
"idCup": 506,
"idCupSeason": 2000010101,
"edition": 1872,
"isLive": false,
"isActiveSeason": true,
"isFifaCompetition": true,
"isClubCompetition": false,
"competitionName": "Friendlies",
"providerCompetitionID": 0,
"providerEditionID": 0,
"idMatch": 300439349,
"internalMatchID": 0,
"idRound": 281863,
"idHomeTeam": 43843,
"homeCountryCode": "ALG",
"homeTeamName": "Algeria",
"idAwayTeam": 43835,
"awayCountryCode": "KSA",
"awayTeamName": "Saudi Arabia",
"matchDate": "2018-05-09T19:30:00Z",
"minute": 0,
"status": 0,
"cupKindID": 105,
"cupKindName": "Friendly",
"hasLineup": false,
"scoreHome": 0,
"scoreAway": 2,
"venueName": "Cadiz ",
"idWinTeam": 43835,
"competitionSeoName": "friendly-506",
"matchSeoName": "Algeria-Saudi Arabia-300439349",
"homeTeamSeoName": "algeria-43843",
"awayTeamSeoName": "saudi-arabia-43835",
"hasStanding": false,
"winTeamName": "Saudi Arabia",
"winTeamShortName": "Saudi Arabia",
"isStarted": true,
"isFinished": true,
"isAwarded": false,
"isPostponed": false,
"isSuspended": false,
"isAbandoned": false,
"link": "",
"isNextDay": false
},

最新更新