r语言 - 使用"rvest"进行网络抓取,代码不断停止



我正试图从以下网站上获取一些信息:

https://www.hcdn.gob.ar/proyectos/textoCompleto.jsp?exp=0001-D-2014.

我想用以下代码遍历各个法案编号。我用这段代码抓取过前几年的数据,运行得很好。然而,对于这一年,链接似乎总是中断。我在下面列出代码:

# Script from the question. read_html() is called unguarded here, so a single
# failed request (such as the HTTP 500 reported below) stops the whole loop.
summary2 <- data.frame(matrix(nrow = 2, ncol = 4))
colnames(summary2) <- c("billnum", "sum", "type", "name_dis_part")
# Zero-padded bill numbers: "0001", "0002", ...
k <- sprintf("%04d", 1:10048)

for (i in k) {
  # Full-text page: bill number heading and summary text.
  webpage <- read_html(
    paste0("https://www.hcdn.gob.ar/proyectos/textoCompleto.jsp?exp=", i, "-D-2014")
  )
  billno <- html_nodes(webpage, "h1")
  billno_text <- html_text(billno)

  billsum <- html_nodes(webpage, ".interno")
  billsum_text <- html_text(billsum)

  # Remove line breaks and tabs (the "\n" / "\t" escapes, not the letters
  # n and t), then runs of four spaces left over from the page layout.
  billsum_text <- gsub("\n", "", billsum_text)
  billsum_text <- gsub("\t", "", billsum_text)
  billsum_text <- gsub("    ", "", billsum_text)

  # Procedure page: bill type heading and the signatories table.
  link <- read_html(
    paste0("https://www.hcdn.gob.ar/proyectos/proyectoTP.jsp?exp=", i, "-D-2014")
  )
  type <- html_nodes(link, "h3")
  type_text <- html_text(type)

  table <- html_node(link, "table.table.table-bordered tbody")
  table_text <- html_text(table)

  # NOTE(review): the third pattern was empty in the post (a no-op); a
  # carriage return is assumed to be what was intended -- confirm.
  table_text <- gsub("\n", "", table_text)
  table_text <- gsub("\t", "", table_text)
  table_text <- gsub("\r", "", table_text)

  # `i` is a character string, so rows are addressed (and created) by name.
  summary2[i, 1] <- billno_text
  summary2[i, 2] <- billsum_text
  summary2[i, 3] <- type_text
  summary2[i, 4] <- table_text
}

我得到的错误如下:

Error in open.connection(x, "rb") : HTTP error 500.
In addition: Warning message:
In for (i in seq_along(cenv$extra)) { :
closing unused connection 3 (https://www.hcdn.gob.ar/proyectos/proyectoTP.jsp?exp=0279-D-2014)

代码会在某些法案链接上停止运行,尽管我把这些链接单独放进浏览器时,它们似乎可以正常打开。我不知道为什么会出错。

我试过把循环拆开,跳过出错的法案链接,但这并不是理想的解决方案,因为:a) 这样会漏掉那些在代码中出错、但实际上包含我想收集的数据的法案链接;b) 这样做效率似乎很低。

如下所示,可以使用 tryCatch 捕获错误,并在表中对应位置填入 NA:

library(rvest)

# Download and parse one page. A failed request (e.g. the HTTP 500 from the
# question) is reported as a warning and yields NULL instead of stopping the
# loop, so the caller can test the result with is.null().
read_html_safely <- function(url) {
  tryCatch(
    read_html(url),
    error = function(e) {
      warning("Failed to read ", url, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
}

# Drop line breaks and tabs from scraped text (escape sequences, not the
# letters n and t).
strip_breaks <- function(x) {
  gsub("[\r\n\t]", "", x)
}

# Reduce a character vector to exactly one value so that assigning it into a
# single data-frame cell cannot fail: NA when nothing matched, the value itself
# when one element matched, the elements joined by a space otherwise.
as_scalar_text <- function(x) {
  if (length(x) == 0L) {
    NA_character_
  } else if (length(x) == 1L) {
    x
  } else {
    paste(x, collapse = " ")
  }
}

k <- c("0278", "0279", "0280")

# Preallocate one NA row per bill, named by bill number, rather than growing
# the data frame inside the loop. Rows whose page fails simply stay NA.
summary2 <- data.frame(
  billnum = rep(NA_character_, length(k)),
  sum = NA_character_,
  type = NA_character_,
  name_dis_part = NA_character_,
  row.names = k
)

for (i in k) {
  ## First scrape ##
  # Sys.sleep(1) # Uncomment to pause between requests if necessary.
  webpage <- read_html_safely(
    paste0("https://www.hcdn.gob.ar/proyectos/textoCompleto.jsp?exp=", i, "-D-2014")
  )

  if (!is.null(webpage)) {
    billno_text <- html_text(html_nodes(webpage, "h1"))

    billsum_text <- strip_breaks(html_text(html_nodes(webpage, ".interno")))
    billsum_text <- gsub("    ", "", billsum_text, fixed = TRUE)

    summary2[i, "billnum"] <- as_scalar_text(billno_text)
    summary2[i, "sum"] <- as_scalar_text(billsum_text)
  }

  ## Second scrape ##
  # Sys.sleep(1) # Uncomment to pause between requests if necessary.
  link <- read_html_safely(
    paste0("https://www.hcdn.gob.ar/proyectos/proyectoTP.jsp?exp=", i, "-D-2014")
  )

  # is.null() is a scalar test. is.na() on a successfully parsed document is
  # not: the document is a list, so the condition would have length > 1, which
  # is an error in `if` since R 4.2.
  if (!is.null(link)) {
    type_text <- html_text(html_nodes(link, "h3"))

    # html_node() (singular) gives a single node, so html_text() gives one
    # value, NA when the table is absent.
    table <- html_node(link, "table.table.table-bordered tbody")
    table_text <- strip_breaks(html_text(table))

    summary2[i, "type"] <- as_scalar_text(type_text)
    summary2[i, "name_dis_part"] <- as_scalar_text(table_text)
  }
}

输出:

tibble::as_tibble(summary2)
# A tibble: 3 × 4
billnum     sum                                                                                                           type  name_…¹
<chr>       <chr>                                                                                                         <chr> <chr>  
1 0278-D-2014 "0278-D-2014  ProyectoSu beneplácito por el reconocimiento que la revista científica Nature realizara a un g… " PR… "ASSEF…
2 0279-D-2014 "0279-D-2014  ProyectoSu Benplacito al conmemorarase  el  natalicio de el Dr.  Joaquin V.  Gonzalezel 6 de m…  NA    NA    
3 0280-D-2014 "0280-D-2014  ProyectoLA HONORABLE CAMARA DE DIPUTADOS EXPRESA SU ADHESIÓN AL CONMEMORARSE EL 07 DE MARZO "… " PR… "GRANA…
# … with abbreviated variable name ¹​name_dis_part

最新更新