r语言 - 无法使用 rvest 抓取带有表单的网站



我正在尝试抓取下面代码中列出的网站。 我尝试使用 rvest 和下面的代码来实现。

我的思路是复制我在谷歌浏览器中为下载按钮找到的 PUT 请求。 我不确定我做错了什么。 我收到了 reprex 中列出的错误。

library(httr)
library(rvest)
library(dplyr)
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union


# Open a browsing session on the ADP page
url <- "https://nfc.shgn.com/adp/baseball"
pgsession <- session(url)

# Take the second form found on the page and fill in its fields.
# Every field set here is a hidden input, hence the warnings printed below.
pgform <- html_form(pgsession)[[2]]
filled_form <- html_form_set(pgform,
team_id = "0", from_date = "2020-10-01", to_date = "2021-02-19", num_teams = "0",
draft_type = "0", sport = "baseball", position = "",
league_teams = "0" )
#> Warning: Setting value of hidden field 'team_id'.
#> Warning: Setting value of hidden field 'from_date'.
#> Warning: Setting value of hidden field 'to_date'.
#> Warning: Setting value of hidden field 'num_teams'.
#> Warning: Setting value of hidden field 'draft_type'.
#> Warning: Setting value of hidden field 'sport'.
#> Warning: Setting value of hidden field 'position'.
#> Warning: Setting value of hidden field 'league_teams'.

# Submit the filled form through the session; this is the call that fails,
# because the form object carries no `action` (target URL) to submit to.
session_submit(x = pgsession, form = filled_form)
#> Error: `form` doesn't contain a `action` attribute

如果您只想抓取该表,则可以使用"打印"按钮带您到的URL,借助 rvest 和 purrr 轻松完成。

虽然你不能使用html_table,但使用purrr::map_df将单元格提取为数据帧很简单:

library(rvest)
library(dplyr)
library(purrr)
library(stringr)
# Labels applied, in order, to the cells of each table row
adp_columns <- c(
  "Rank", "Player", "Team", "Position", "ADP", "MinPick",
  "MaxPick", "Diff", "Picks", "Team2", "PickBid"
)

# Page that the site's Print button leads to
adp_page <- read_html("https://nfc.shgn.com/adp.data.php")

# One node per table row
adp_rows <- html_nodes(adp_page, "tr")

# For every row: collect its <td> cells, take their text, strip surrounding
# whitespace and name the values; map_df() stacks the rows into one data frame
pgtab <- map_df(adp_rows, function(row) {
  cells <- html_nodes(row, "td")
  set_names(str_trim(html_text(cells)), adp_columns)
})
head(pgtab)
# A tibble: 6 x 11
Rank  Player             Team  Position ADP   MinPick MaxPick Diff  Picks Team2 PickBid
<chr> <chr>              <chr> <chr>    <chr> <chr>   <chr>   <chr> <chr> <chr> <chr>  
1 1     Ronald Acuna Jr.   ATL   OF       1.69  1       6       ""    332   ""    ""     
2 2     Fernando Tatis Jr. SD    SS       2.57  1       7       ""    332   ""    ""     
3 3     Mookie Betts       LAD   OF       3.53  1       9       ""    332   ""    ""     
4 4     Juan Soto          WAS   OF       3.98  1       10      ""    332   ""    ""     
5 5     Mike Trout         LAA   OF       6.08  1       11      ""    332   ""    ""     
6 6     Gerrit Cole        NYY   P        6.50  1       15      ""    332   ""    ""     

您也可以设置表单参数并执行此操作,尽管您必须检查它是否有区别。 这是一种方式...

# Alternative: fill in and submit the filter form, then parse the response.
# Uses the rvest >= 1.0 API (session / html_form_set / session_submit); the
# older html_session / set_values / submit_form names are deprecated.
url <- "https://nfc.shgn.com/adp/baseball"
pgsession <- session(url)

# Second form on the page; all fields set below are hidden inputs, so
# html_form_set() emits a "Setting value of hidden field" warning for each.
pgform <- html_form(pgsession)[[2]]
filled_form <- html_form_set(
  pgform,
  team_id = "0", from_date = "2020-10-01", to_date = "2021-02-19",
  num_teams = "0", draft_type = "0", sport = "baseball", position = "",
  league_teams = "0"
)

# The form carries no target of its own, and session_submit() stops with
# "`form` doesn't contain a `action` attribute" if this is left blank.
# In rvest >= 1.0 the target lives in `$action` (it was `$url` before 1.0).
filled_form$action <- "https://nfc.shgn.com/adp.data.php"

# Submit as if the "printerFriendly" button had been pressed
pgsession <- session_submit(pgsession, filled_form, submit = "printerFriendly")

# Parse the returned page: one row node per <tr>, then the trimmed text of
# each row's <td> cells, named and stacked into a data frame by map_df()
pgtab <- pgsession %>%
  read_html() %>%
  html_nodes("tr") %>%
  map_df(
    ~ html_nodes(., "td") %>%
      html_text() %>%
      str_trim() %>%
      set_names(
        "Rank", "Player", "Team", "Position", "ADP", "MinPick",
        "MaxPick", "Diff", "Picks", "Team2", "PickBid"
      )
  )

这是一个可能的解决方案,使用 RSelenium 将 .tsv 文件下载到给定的文件夹。 之后,即可轻松读取...

library( RSelenium )
library( rvest )
library( xml2 )
library( data.table )
# Set up the download file name and location.
# NOTE(review): USERPROFILE is a Windows environment variable; on other
# platforms point download_location at the desired folder instead.
filename <- "ADP.tsv"
download_location <- file.path(Sys.getenv("USERPROFILE"), "Downloads")
download_path <- file.path(download_location, filename)

# Upper bound (seconds) for each wait below, and the pause between polls
wait_timeout <- 30
poll_interval <- 0.5

# Create extra capabilities so the browser (Firefox) does not display a
# save-as dialog when downloading the tsv file
eCaps <- makeFirefoxProfile(list(
  "browser.download.dir" = download_location,
  "browser.download.folderList" = 2,
  "browser.helperApps.neverAsk.saveToDisk" = "text/tab-separated-values",
  "browser.download.manager.showWhenStarting" = FALSE
))

# Set up driver (using the Firefox profile created above), client and server
driver <- rsDriver(
  browser = "firefox", port = 4545L,
  extraCapabilities = eCaps, verbose = FALSE
)
server <- driver$server
browser <- driver$client

# Everything that talks to the browser runs inside tryCatch() so that the
# browser and the Selenium server are shut down even when a step fails.
DT <- tryCatch(
  {
    # Go to the url in the browser
    browser$navigate("https://nfc.shgn.com/adp/baseball")

    # Poll for the download button until it is found (page loaded) or the
    # timeout expires; pausing between polls avoids a busy loop.
    button_dl <- list()
    deadline <- Sys.time() + wait_timeout
    repeat {
      button_dl <- browser$findElements(using = "name", value = "download")
      if (length(button_dl) > 0) {
        break
      }
      if (Sys.time() > deadline) {
        stop("Download button not found within ", wait_timeout, " seconds",
          call. = FALSE
        )
      }
      Sys.sleep(poll_interval)
    }

    # Click the button, then wait for the file to show up in
    # download_location.
    # NOTE(review): a file of the same name left over from an earlier run
    # satisfies this check immediately - remove or rename it beforehand.
    button_dl[[1]]$clickElement()

    # Presumably Firefox keeps a "<name>.part" file next to the target while
    # the download is in progress - verify for the Firefox version in use.
    part_path <- paste0(download_path, ".part")
    deadline <- Sys.time() + wait_timeout
    repeat {
      finished <- file.exists(download_path) &&
        !file.exists(part_path) &&
        file.size(download_path) > 0
      if (finished) {
        break
      }
      if (Sys.time() > deadline) {
        stop("File not downloaded within ", wait_timeout, " seconds: ",
          download_path,
          call. = FALSE
        )
      }
      Sys.sleep(poll_interval)
    }

    # Load the file
    data.table::fread(download_path)
  },
  finally = {
    # Close everything down properly
    browser$close()
    server$stop()
  }
)
head(DT)
#    Rank              Player Team Position(s)  ADP Min Pick Max Pick Difference # Picks Team Team Pick
# 1:    1   Acuna Jr., Ronald  ATL          OF 1.68        1        6         NA     323   NA        NA
# 2:    2 Tatis Jr., Fernando   SD          SS 2.58        1        7         NA     323   NA        NA
# 3:    3       Betts, Mookie  LAD          OF 3.50        1        9         NA     323   NA        NA
# 4:    4          Soto, Juan  WAS          OF 3.98        1       10         NA     323   NA        NA
# 5:    5         Trout, Mike  LAA          OF 6.06        1       11         NA     323   NA        NA
# 6:    6        Cole, Gerrit  NYY           P 6.52        1       15         NA     323   NA        NA

最新更新