我从谷歌Chrome控制台复制了一条cURL命令,并把它粘贴到R脚本中的一对双引号(")之间。当我试图把它保存为一个对象时,R报错说其中有一个意外的符号(unexpected symbol)。
有人能解释为什么会发生这种情况,以及该如何解决吗?
httpbinrhcurl <- "curl 'http://www.domainia.nl/quarantaine/2018/12/15' -H 'Connection: keep-alive' -H 'Cache-Control: max-age=0' -H 'Upgrade-Insecure-Requests: 1' -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36' -H 'Origin: http://www.domainia.nl' -H 'Content-Type: application/x-www-form-urlencoded' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8' -H 'Referer: http://www.domainia.nl/quarantaine/2018/12/15' -H 'Accept-Encoding: gzip, deflate' -H 'Accept-Language: en-US,en;q=0.9,nl;q=0.8' -H 'Cookie: ASP.NET_SessionId=1rq1dcm2rxrxhejcv2apj1nz; _ga=GA1.2.1720697664.1544465383; cookieconsent_dismissed=yes; _gid=GA1.2.1174161929.1544888026; _gat=1' --data '__EVENTTARGET=ctl00%24ContentPlaceHolder1%24gvDomain&__EVENTARGUMENT=Page%246&__VIEWSTATE=%2BfCpnCxB3CBeL3p0AJRxX709sVZd46FhL5m2WythvucpdaUpCnCyUxSa%2BEidpSc9wEzRF6wNxg8LpEQV8%2BzUZpzAPL8nX4hXXd282D77F%2BphaIBhktpf4j8Wj31S3LIC5QRM2V6lHEWuJEbAJbuk68wwOBwa2zOVseYwNs%2FQywJT2H5wWBPzxALrQo5Dui76GArI3RRCSyKUgrq97euwYy4zq5b1Y3NgFWi8nOyu4VWuih%2BRANxczyqirApMmInEpCVFHqbNYulM8iKpi5Ay1jU3k2fC8I87LhC8oFWOPglZXhMuMSxR66FM%2BMOk9T%2BUbSrESAFA9AIywFhk7gRy%2FGMTgmDA%2BdyCbmnnIzJbGy6sr02V0I08l8UxzxfiaRCTx1paJqN3Co7tgq4D%2FqPmkvzlU8w1sMEqpdDWrXnX2cDswYbcEmg01w3PfgD66f4yezNJb4PQdjURDLwYsb6BkPum9iwXzRMi9OHQoULpN%2BUGFZh2BqgJMcI6R9OLbTZXs%2B%2FiiIVwyO8zeYjw8gMIjLZeu2TTdduhgiVBLUbBRZOQ%2BZwAIL6rZNoKF7Vop0BeSXTuA%2F6bCf1Z5g9Yss4WC%2Bre81YbVKImtrnUoqBsRQLvt6EtDUYxxJ1NDfdWYC8HivFw1FHvzwf0EiH2wtKFQ0w7eBNa5Hhahs05UOaqo%2FLSNxZJCsbcJfSwAQlo2nIRAVLSEEibbqYfgFtsgxtvv7oRl0a6eci2OU2skCfXHsLCTBJhzGD3RfUALn9o9bwTx%2FYbYYrGtbCuty%2FFmFVVFQvNN9UGJntsNaxxB0SIjIwamTKL7H%2Bi1npGQbV%2B%2F%2BnPJbzHX8UPWgeT0qB1xBIOc59H7TCE%2BEcFyTDdCprQd7XYmeZ1yKySaDDCtrUc3sP38YUzgzP4R8ktyJg52ZVS7tN4o3lfLEmPb2gV29xxZlLs%2Ft7XuA7xeJei9AXlhZn8rZEsu0qvZwnmk2%2F7ArQYXv%2B2kk58H6223NwCwtoS53emfJuuOgn6x1AsWcIc4xavzYu4bhE5GBIe378PQMvav0GcXghx0Y
AmYnsQw5x%2BYkcU66e81XKGgDS0nXOOWabpnJwdG1%2B43wziuDsx9Yx%2Fy0FznCAWjdPbMf1uzFPk8yxErqzShuz2W3mX%2FeXmkcnP%2Fr41GUiYW4gfCfYYNSPDORY0gITK%2FZR9yq9eex2dJLanL0g9m7g2aaGrhCG%2FwDM72YP1HVkoOguF%2BFcNUoye%2BeVhi5cKavn9mt%2FjSrvctYV6exY0VqNhqJZBR9H0VaqZDO8X7EvMAeCBs822VOUgDHsKLWmINEDhx%2ByK0NpbHMv60U6ZDIUFMU7VQP%2BtK1d5XXzgPu6McG2pU3GuV%2FVbCowFjOX06cSSJy5PCfzvvW6t5nIDtz37p3pMKMZ3Bkbitmo9O88RAHtk5m1XvFG3U3XBb1wFDKEoB6g5P%2BfeLrVt26DP3XprYW4uvsPlWg3%2FlgiJAieljwQteNB6YP3dFIe6Jtpn0KCvOZvkD1TiHVxht9GBvKRPsQncAUAdk%2Bh0Es2U63UOl9BSiNvJHRXIeJtdwLCiB%2F09ymmGig%2BVzXkOs%2FJJA4toKUljFDNZA2c0eJO675v5flLrb9F53Hch%2B%2FJ8gEGWLCPJRj5UzKHnPV2Ln80lFQFmiI%2BKgvnGlDNXs2KzcbzL468kSziFOATaxHg2NRlTxcRi7eHW3WbWp3s7l804cLMutZP0S7jualsRRNCdwL0hnvAP3eYBBUiNGg%3D%3D&__VIEWSTATEGENERATOR=39107D5A&__VIEWSTATEENCRYPTED=&__EVENTVALIDATION=FVV2FXaDZFD8X3%2Faf1efOYoZkPsG5aGc%2FETy%2BPpSaJViSFwo08G2Z%2FD3oNgkB4gPg3vavcSK2z1GyI%2BWo9Nu12mjPGuSzGefVyonOnrV1Tx9nsqyOEhGinftQPWBa29BXIAnAhb3XxtTj7SEAeR0KYR%2FsPYpD3tPoUjMrLhmHmj4RM%2BOZGYZDj4B2LdpVg%2B1RIYBhs3rdEVxzuhlhAPlgiKvhO8v46wlweLVy47Y4ZOrzks2z2fm%2BHXNDVD8RXTPKl%2FnTUobsv9iZ9imVAs8DS7I8WkrPtH3TJS7jeFS1NGd3eJIXwCZmpsplMe5tY3wHjYLBPijk0dsH7%2B8DvuQ3byNcAz4H%2B92IH%2Fw%2BInvxjc52xdSrOCqKUFWUOIAGYsrG%2FtTphEc0XYuqujHxZAXFcP%2FhPDMSDUzTzoQc2VsHqqI9UthFCX9z9TpVX8Th4puc3pIiRS9%2BQHDlp%2FDiq81AI39IThV6W824NnY9uudiuizQmer8CA7DTts%2BVE5qYDsa%2BJKgyMmBP3YHOhsfPssFLhZ7sFhC%2FQwmf9s6X0rizu1aI%2FzRacJGyjm2C4PMtuT0ylmUvMu%2Bx%2FEOUrMNzKO2UXX07jTxwRByiJ13ud7JEW2U6s1Y7Dnvf3%2F0klDACBXPrTUQy0bnLgGypiv2grtCEyXHePltZkUddwM0eEbS3Kcl3fbDEgq97RaomGxCDpA2VRlOLHZZkLYOasMXCNtC9yo1gaXJcNE1ONwyTVn1%2Fi8gIDAo%2BPqNYLRS4fdEQC%2BDvrGoCvcfr5PlIwxyHn6zHAJGJJWsxunbWjhYX9yFDd8nFEVcIJEHiYHCvQBG%2Bxo6gUSwfza0L6v4k1wTa2SJhLoGQS4Slbl6hVaiawb8M8iOgLwnIYyBgRGoIOFsBhcauh0UO8dIMTNi6LRXiFpIK8VXEa4kG8enfIoJWlj2Yp06FL8mxpQXpQJkZrajseuQ5gtxk6vh3ZanVn3XDg5BrBxB2zUZNDmcVlSWOrw%2Fza6g9mm52q%2Ff%2FtdxrS4xYrTcJcavDkt%2Bhh4Y7Brhf3x0oDtyzk6WeLj5l13ZNjEcnqmhRlluY2Y1VCr9fQzrJFH8NZBiKa4pNVv2lfTKXqje1AogYX9L
qUC4JoVfkXT1ip9%2BATWAApQpW7Z%2FjxXcUzN53xZopDP3UtpWQ8uPhBwaz2FRlNpQV7cv1QyoawOZDqIck88J9yEUiDKYDHczSkgK5AAeTENbJZSsxtMUJxIA97XyRWunk5WqmixcAQW8GV5QFT8c3yS35TafJ0bNW5U5vN6BByZiGAJg96r8sJrwZ6RkkFXHgPDGlivdxc594qqEDQTAaBo%2Fj2AJjk%2Frs793XMP61%2B6ahQIF9iAqoA8Iq6RaJSgs%2FOP%2BVrsONlNv%2BXCtndmE97M602%2F%2BVJmrQle%2F8ySQLlrmBdafdJGBSZBNrDeqhS4%2Bs9dbTarV7AvUwVjZUgTIhJ1JTgGGu09kgVOe5FDd89KJ6D9xFNUzAMJiyK7H%2BbX3Mf5KdZgmVnyehE%3D&ctl00%24ddlState=quarantaine&ctl00%24tbSearch=' --compressed"
好吧,真没想到。他们没有使用ASP.NET中那些比较棘手的功能,所以这其实相当简单。正如我在本网站一个类似问题的回答中指出的,该网站并没有robots.txt,因此对抓取似乎没有任何限制,我也没有找到任何条款/条件。
library(httr)
library(rvest)
library(docxtractr) # for data frame cleaning helper utilities
library(tidyverse)
让我们进入第一页:
# Fetch the first results page.
res <- httr::GET(
  url = "http://www.domainia.nl/quarantaine/2018/12/15"
)

# Fail fast on an HTTP error (4xx/5xx) instead of parsing an error page and
# hitting an obscure failure later when the results table cannot be found.
httr::stop_for_status(res)

# Parse the response body; later steps query `pg` with rvest.
pg <- httr::content(res)
现在,我们将提取表格:
# Extract the results table from the first page: select the <table> whose
# text contains "Domein", convert it to a data frame and tidy it up.
pg_one <- html_node(pg, xpath = ".//table[contains(., 'Domein')]") %>%
  html_table(fill = TRUE, trim = TRUE) %>%
  select(2:6) %>% # the table is full of junk, so keep only columns 2-6
  docxtractr::assign_colnames(3) %>% # the column headers are in row 3
  docxtractr::mcga() %>% # clean up the column names
  as_tibble() # tbl_df() is deprecated; as_tibble() is its replacement
把它放入一个列表中,后续各页的结果也会追加到这个列表里:
# Start the per-page accumulator: a named list whose first entry, "pg01",
# holds the table scraped from the first page.
pgs <- setNames(list(pg_one), "pg01")
现在,遍历其余的分页(如果超过10页,可以提取分页行并取其中最大/最后一个td的值,以确定总共需要抓取多少页)。
我们提取视图状态(view state)字段,设置其他POST请求体参数,并在循环中递增要获取的页码。然后发出POST请求,把新表格提取到列表中,并对剩余页面重复同样的步骤:
# Fetch result pages 2-10 by replaying the form postback that the grid's
# pager triggers, and store each page's table in `pgs`.

# The hidden form inputs (the view-state fields among them) are read from
# `pg`, which is never reassigned below, so they are collected once here
# rather than on every iteration.
hidden_nodes <- html_nodes(pg, "input[type='hidden']")
hinputs <- as.list(
  setNames(html_attr(hidden_nodes, "value"), html_attr(hidden_nodes, "name"))
)

# Remaining POST body parameters that do not change between pages.
hinputs$`ctl00$tbSearch` <- ""
hinputs$`ctl00$ddlState` <- "quarantaine"
hinputs$`__EVENTTARGET` <- "ctl00$ContentPlaceHolder1$gvDomain"

for (pg_num in 2:10) {
  Sys.sleep(5) # be kind since you don't own the server or pay for the bandwidth

  # Ask the grid for page `pg_num`.
  hinputs$`__EVENTARGUMENT` <- sprintf("Page$%s", pg_num)

  res <- httr::POST(
    url = "http://www.domainia.nl/quarantaine/2018/12/15",
    encode = "form",
    body = hinputs
  )

  # Stop on an HTTP error rather than trying to parse an error page.
  httr::stop_for_status(res)

  # Same clean-up as for the first page. The entry name uses "%02d" so it is
  # zero-padded on every platform and matches the existing "pg01" entry
  # (the "0" flag combined with "%s" is platform-dependent).
  pgs[[sprintf("pg%02d", pg_num)]] <- httr::content(res) %>%
    html_node(xpath = ".//table[contains(., 'Domein')]") %>%
    html_table(fill = TRUE, trim = TRUE) %>%
    select(2:6) %>% # keep only columns 2-6
    docxtractr::assign_colnames(3) %>% # the column headers are in row 3
    docxtractr::mcga() %>% # clean up the column names
    as_tibble() # tbl_df() is deprecated; as_tibble() is its replacement
}
最后,合并所有这些行:
# Stack the per-page tables into a single data frame; evaluated at top level,
# so the combined result is printed.
pgs %>%
  bind_rows()
## # A tibble: 954 x 5
## domein status archive geregistreerd_op uit_quarantaine
## <chr> <chr> <chr> <chr> <chr>
## 1 0172design.nl quarantaine 0 "" 15-12-2018
## 2 0172designs.nl quarantaine 0 "" 15-12-2018
## 3 0172kleding.nl quarantaine 0 "" 15-12-2018
## 4 0172online.nl quarantaine 0 "" 15-12-2018
## 5 123shows.nl quarantaine 0 "" 15-12-2018
## 6 123story.nl quarantaine 0 "" 15-12-2018
## 7 21018dagen.nl quarantaine 0 "" 15-12-2018
## 8 22academy.nl quarantaine 0 "" 15-12-2018
## 9 22aviationcampus.nl quarantaine 0 "" 15-12-2018
## 10 22campus.nl quarantaine 0 "" 15-12-2018
## # ... with 944 more rows