我正试图从以下网站上获取一些信息:
https://www.hcdn.gob.ar/proyectos/textoCompleto.jsp?exp=0001-D-2014.
我想用以下代码遍历法案编号。我曾对之前的年份运行过这段代码,运行良好。然而,对于这一年,链接似乎总是失效。代码如下:
# Collect the bill number, summary, type and procedure table for each 2014
# bill by visiting two pages per bill on hcdn.gob.ar.
# Start with zero rows: rows are added by name inside the loop, so any
# placeholder rows created here would remain in the result as all-NA rows.
summary2 <- data.frame(matrix(nrow = 0, ncol = 4))
colnames(summary2) <- c("billnum", "sum", "type", "name_dis_part")
# Bill numbers as strings, zero-padded to at least four digits.
k <- sprintf("%0.4d", 1:10048)
for (i in k) {
  # NOTE: an HTTP error raised by either read_html() call stops the whole loop.
  webpage <- read_html(paste0(
    "https://www.hcdn.gob.ar/proyectos/textoCompleto.jsp?exp=", i, "-D-2014"
  ))
  billno <- html_nodes(webpage, "h1")
  billno_text <- html_text(billno)
  billsum <- html_nodes(webpage, ".interno")
  billsum_text <- html_text(billsum)
  # Strip line breaks and tabs. The patterns are the escapes "\n" and "\t";
  # the bare letters "n" and "t" would delete those letters from the text.
  billsum_text <- gsub("\n", "", billsum_text)
  billsum_text <- gsub("\t", "", billsum_text)
  # NOTE(review): the transcribed pattern was a single space, which would
  # delete every space; assumed to have been a double space - confirm.
  billsum_text <- gsub(" {2}", "", billsum_text)
  link <- read_html(paste0(
    "https://www.hcdn.gob.ar/proyectos/proyectoTP.jsp?exp=", i, "-D-2014"
  ))
  type <- html_nodes(link, "h3")
  type_text <- html_text(type)
  table <- html_node(link, "table.table.table-bordered tbody")
  table_text <- html_text(table)
  table_text <- gsub("\n", "", table_text)
  table_text <- gsub("\t", "", table_text)
  # NOTE(review): empty pattern with empty replacement changes nothing; the
  # intended pattern was probably lost in transcription - confirm.
  table_text <- gsub("", "", table_text)
  # `i` is a character value, so each bill becomes a row named after it.
  summary2[i, 1] <- billno_text
  summary2[i, 2] <- billsum_text
  summary2[i, 3] <- type_text
  summary2[i, 4] <- table_text
}
我得到的错误如下:
Error in open.connection(x, "rb") : HTTP error 500.
In addition: Warning message:
In for (i in seq_along(cenv$extra)) { :
closing unused connection 3 (https://www.hcdn.gob.ar/proyectos/proyectoTP.jsp?exp=0279-D-2014)
代码会在某些法案链接上停止运行,尽管我把这些链接单独放进浏览器打开时,它们看起来是可以正常访问的。我不知道为什么会出错。
我试过中断循环并跳过无法访问的法案链接,但这不是理想的解决方案,因为:a)这会漏掉那些在代码中无法访问、但实际上包含我想收集的数据的法案链接;b)这样做似乎效率很低。
在这种情况下,可以使用 tryCatch 捕获错误,并将 NA 添加到表中:
library(rvest)

# Scrape three sample 2014 bills from hcdn.gob.ar, storing NA for any page
# that cannot be downloaded instead of aborting the loop.
summary2 <- data.frame(matrix(nrow = 0, ncol = 4))
colnames(summary2) <- c("billnum", "sum", "type", "name_dis_part")
k <- c("0278", "0279", "0280")

# Download and parse `url`. On any error (such as HTTP 500) emit a warning
# and return NULL so the caller can fall back to NA.
read_html_or_null <- function(url) {
  tryCatch(
    read_html(url),
    error = function(e) {
      warning(
        "Failed to read ", url, ": ", conditionMessage(e),
        call. = FALSE
      )
      NULL
    }
  )
}

for (i in k) {
  ## First scrape ##
  # Sys.sleep(1) # Uncomment if necessary.
  webpage <- read_html_or_null(paste0(
    "https://www.hcdn.gob.ar/proyectos/textoCompleto.jsp?exp=", i, "-D-2014"
  ))
  if (is.null(webpage)) {
    billno_text <- NA_character_
    billsum_text <- NA_character_
  } else {
    billno <- html_nodes(webpage, "h1")
    billno_text <- html_text(billno)
    billsum <- html_nodes(webpage, ".interno")
    billsum_text <- html_text(billsum)
    # Strip line breaks and tabs. The patterns are the escapes "\n" and "\t";
    # the bare letters "n" and "t" would delete those letters from the text.
    billsum_text <- gsub("\n", "", billsum_text)
    billsum_text <- gsub("\t", "", billsum_text)
    # NOTE(review): the transcribed pattern was a single space, which would
    # delete every space; assumed to have been a double space - confirm.
    billsum_text <- gsub(" {2}", "", billsum_text)
  }
  ## Second scrape ##
  # Sys.sleep(1) # Uncomment if necessary.
  link <- read_html_or_null(paste0(
    "https://www.hcdn.gob.ar/proyectos/proyectoTP.jsp?exp=", i, "-D-2014"
  ))
  # Test the NULL sentinel with is.null(). Calling is.na() on a successfully
  # parsed document gives a condition longer than one element, which `if`
  # rejects with an error in R >= 4.2.
  if (is.null(link)) {
    type_text <- NA_character_
    table_text <- NA_character_
  } else {
    type <- html_nodes(link, "h3")
    type_text <- html_text(type)
    table <- html_node(link, "table.table.table-bordered tbody")
    table_text <- html_text(table)
    table_text <- gsub("\n", "", table_text)
    table_text <- gsub("\t", "", table_text)
    # NOTE(review): empty pattern with empty replacement changes nothing; the
    # intended pattern was probably lost in transcription - confirm.
    table_text <- gsub("", "", table_text)
  }
  ## Output ##
  # `i` is a character value, so each bill becomes a row named after it.
  summary2[i, 1] <- billno_text
  summary2[i, 2] <- billsum_text
  summary2[i, 3] <- type_text
  summary2[i, 4] <- table_text
}
输出:
tibble::as_tibble(summary2)
# A tibble: 3 × 4
billnum sum type name_…¹
<chr> <chr> <chr> <chr>
1 0278-D-2014 "0278-D-2014 ProyectoSu beneplácito por el reconocimiento que la revista científica Nature realizara a un g… " PR… "ASSEF…
2 0279-D-2014 "0279-D-2014 ProyectoSu Benplacito al conmemorarase el natalicio de el Dr. Joaquin V. Gonzalezel 6 de m… NA NA
3 0280-D-2014 "0280-D-2014 ProyectoLA HONORABLE CAMARA DE DIPUTADOS EXPRESA SU ADHESIÓN AL CONMEMORARSE EL 07 DE MARZO "… " PR… "GRANA…
# … with abbreviated variable name ¹name_dis_part