我需要从 USEPA(美国环保署)的饮用水标准页面抓取一张表格,其中列出了 122 种化学品。该表格和数据可在此处公开获取:http://www.epa.gov/wqc/national-recommended-water-quality-criteria-human-health-criteria-table
我正在尝试使用 XML 包。
# Attempt to read the first HTML table on the EPA criteria page with the
# XML package.
# NOTE(review): the request goes over plain http; the site appears to be
# served over https only, which is the likely cause of a load failure here.
library(XML)
url <- "http://www.epa.gov/wqc/national-recommended-water-quality-criteria-human-health-criteria-table"
# One conversion class per table column. Class names are case-sensitive,
# so "integer"/"character" must be lower-case.
classes <- c(
  "character", "integer", "FormattedNumber", "FormattedNumber",
  "integer", "character"
)
USEPA <- readHTMLTable(
  url,
  which = 1,
  colClasses = classes,
  stringsAsFactors = FALSE
)
不幸的是,我只得到了这条错误消息:"错误:无法加载 HTTP 资源"。
如果我点击您上面提供的链接,浏览器会自动跳转到 https 站点。我的猜测是该页面可能没有 http 版本……只有 https 版本,而这可能会给 XML 库带来问题。
以下是根据此处的博客文章读取数据的一种方法:使用 rvest 抓取 HTML 表
library("rvest")

url <- "https://www.epa.gov/wqc/national-recommended-water-quality-criteria-human-health-criteria-table"

# Download the page and parse every table matched by the XPath.
# The XPath was copied from the browser as described in the linked blog post.
table_list <- url %>%
  read_html() %>%
  html_nodes(xpath = "/html/body/section/div[2]/div[1]/div/div/table") %>%
  html_table()

# The result is a list, so take the first item (the table). It is stored
# under its own name so that rvest::html_table() is not masked.
epa_table <- table_list[[1]]
head(epa_table[, 1:2]) # show only first two columns
输出:
Pollutant CAS Number
1 Acenaphthene (P) 83329
2 Acrolein (P) 107028
3 Acrylonitrile (P) 107131
4 Aldrin (P) 309002
5 alpha-Hexachlorocyclohexane (HCH) (P) 319846
6 alpha-Endosulfan (P) 959988
我使用 rvest 包给出了一个整洁(tidy)的解决方案。关键在于可以使用 HTML 表格的 XPath 来定位它,如此处所述。您可能还希望使用 dplyr 的 rename() 函数进一步重命名各列。此外,正如警告消息所示,您可能需要先将某些列保留为字符类型,手动更正有问题的值(例如 '—' 和 'Total'),然后再进行类型转换。
# One-time setup:
# install.packages("rvest")
# install.packages("tidyverse")
library(rvest)
#> Loading required package: xml2
library(tidyverse)

# Use the https address, which is where the browser ends up anyway.
url <- "https://www.epa.gov/wqc/national-recommended-water-quality-criteria-human-health-criteria-table"

df <- url %>%
  read_html() %>%
  # XPath approach from
  # https://www.r-bloggers.com/using-rvest-to-scrape-an-html-table/
  html_nodes(xpath = '//*[@id="main-content"]/div[2]/div[1]/div/div/table') %>%
  html_table() %>%
  purrr::pluck(1) %>% # pluck out first item in the list, which is a df
  # Column spec, one letter per column: c = character, i = integer,
  # n = number. Cells that do not parse as the requested type (e.g. "—")
  # are reported as warnings.
  readr::type_convert(col_types = "cinnic")
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [70, 2]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [77, 2]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [80, 2]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [85, 2]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [17, 3]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [26, 3]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [34, 3]: expected a number, but got 'Total'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [35, 3]: expected a number, but got 'Total'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [63, 3]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [77, 3]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [9, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [10, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [17, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [26, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [34, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [35, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [37, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [68, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [77, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [80, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [85, 4]: expected a number, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [17, 5]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [26, 5]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [34, 5]: expected an integer, but got '—'
#> Warning in type_convert_col(char_cols[[i]], specs$cols[[i]],
#> which(is_character)[i], : [35, 5]: expected an integer, but got '—'
# Print a compact overview of the converted table: one line per column
# showing its name and type.
glimpse(df)
#> Observations: 122
#> Variables: 6
#> $ Pollutant <chr> …
#> $ `CAS Number` <int> …
#> $ `Human Health for the consumption of Water + Organism\n\t\t\t\t(µg/L)` <dbl> …
#> $ `Human Health for the consumption of Organism Only\n\t\t\t\t(µg/L)` <dbl> …
#> $ `Publication Year` <int> …
#> $ Notes <chr> …
由 reprex 软件包 (v0.3.0) 创建于 2019-10-18