R语言 - 使用jsonlite对多个URL进行网页抓取 - 错误:词法错误:JSON文本中存在无效字符



我在一个向量(vector)中收集了以下URL:
# Staff / contact / researcher listing pages to scrape, one URL per entry.
# Later code addresses these entries by position, so the order matters.
departments <- c(
  "https://www.jurinst.su.se/english/about-us/contact/researchers-teachers",
  "https://www.jurinst.su.se/english/about-us/contact/doctoral-students",
  "https://www.buv.su.se/english/research/our-researchers/researchers-child-and-youth-studies",
  "https://www.buv.su.se/english/research/our-researchers/researchers-children-s-culture",
  "https://www.buv.su.se/english/research/our-researchers/researchers-early-childhood-education",
  "https://www.buv.su.se/english/research/our-researchers/researchers-schoolage-educare",
  "https://www.edu.su.se/english/about-us/organisation/researchers-faculty-members",
  "https://www.edu.su.se/english/about-us/organisation/phd-students",
  "https://www.psychology.su.se/english/about-us/contact/staff-a-z",
  "https://www.su.se/publichealth/english/about-us/our-staff",
  "https://www.sbs.su.se/english/research/research-sections/accounting/faculty",
  "https://www.sbs.su.se/english/research/research-sections/finance/people",
  "https://www.sbs.su.se/english/research/research-sections/management/faculty",
  "https://www.sbs.su.se/english/research/research-sections/marketing/faculty",
  "https://www.sofi.su.se/english/staff/all-staff",
  "https://www.astro.su.se/english/about-us/contact/2.16629",
  "https://www.mnd.su.se/english/research/mathematics-education/researchers",
  "https://www.mnd.su.se/english/research/science-education/researchers",
  "https://www.mnd.su.se/english/research/mathematics-education/graduate-students",
  "https://www.mnd.su.se/english/research/science-education/graduate-students",
  "https://www.fysik.su.se/english/about-us/contact/contact-list-alphabetical",
  "https://www.dbb.su.se/about-us/contact",
  "https://www.mmk.su.se/about-us/units-and-staff/people-at-mmk",
  "https://www.su.se/mbw/about-us/staff/all-contacts",
  "https://www.aces.su.se/staff/",
  "https://www.su.se/geo/english/about-us/contact/staff",
  "http://www.bergianska.se/english/about-us/contact-us/staff",
  "https://www.nordita.org/people/zebra/index.php"
)

这些URL对应页面的XPath结构相似但不完全相同。我试图用jsonlite写一个循环,下载所有人的姓名和电子邮件地址。然而,即使只处理单个URL(如下面的例子),我也会得到一个错误。你有更好的实现思路吗?谢谢!

# Single-URL attempt: download the first page, take the text of its <body>
# node and hand that text straight to jsonlite::fromJSON().
# This is the call the question reports as failing with a JSON lexical error.
url.1 <- departments[1]
json.content <- jsonlite::fromJSON(
  html_text(html_node(read_html(url.1), "body")),
  simplifyVector = FALSE
)

一种蛮力的解决方法可能如下:

# Brute-force scraper for the staff pages listed in `departments`.
#
# Result: `df`, a data frame with one row per person and the columns
# `people_name`, `emails` and `university`.
#
# Needs rvest (read_html, html_nodes, html_text, html_attr) and the `%>%`
# pipe to be attached already.
#
# Two page layouts are handled:
#   * "profile list" pages - names sit in <h3> elements and the e-mail link
#     carries the class "profiles-mail";
#   * "staff table" pages  - names are links in the first table cell of a row
#     and the e-mail is a link in a following cell.

# NOTE: this overwrites `departments` in place, so run it only once per
# session; running it again would drop three further entries.
departments <- departments[-c(18, 24, 25)] # eliminate departments who don't have emails

# Positions (in the filtered vector) of the pages that use each layout.
profile_list_idx <- c(1:16, 20:22)
staff_table_idx <- 17:19

# Combine the names and e-mails scraped from one page into a data frame.
# Stops with a message naming the page when the two vectors differ in length,
# because pairing them row by row would then be wrong.
build_people_frame <- function(people_name, email, url) {
  if (length(people_name) != length(email)) {
    stop(
      "Found ", length(people_name), " names but ", length(email),
      " e-mail links on ", url,
      call. = FALSE
    )
  }
  data.frame(
    people_name = people_name,
    emails = email,
    # rep() keeps a page with zero matches valid (a zero-row data frame).
    university = rep("Stockholm University", length(people_name)),
    stringsAsFactors = FALSE
  )
}

# Scrape one "profile list" page.
scrape_profile_list <- function(url) {
  page <- read_html(url)

  people_name <- page %>%
    html_nodes(xpath = "//h3") %>%
    html_text()

  # Read the href attribute directly instead of deparsing every attribute
  # and splitting on quotes; NA is returned for a node without an href.
  href <- page %>%
    html_nodes(xpath = '//*[contains(concat( " ", @class, " " ), concat( " ", "profiles-mail", " " ))]') %>%
    html_attr("href")

  email <- gsub("mailto:", "", href, fixed = TRUE)
  build_people_frame(people_name, email, url)
}

# Scrape one "staff table" page.
scrape_staff_table <- function(url) {
  page <- read_html(url)

  people_name <- page %>%
    html_nodes(xpath = "//td[(((count(preceding-sibling::*) + 1) = 1) and parent::*)]//a") %>%
    html_text()

  href <- page %>%
    html_nodes(css = "td+ td a") %>%
    html_attr("href")

  email <- gsub("mailto:", "", href, fixed = TRUE)
  email <- gsub("http://", "", email, fixed = TRUE)
  # Keep only the part before the first comma, as the original clean-up did.
  email <- sub(",.*$", "", email)
  build_people_frame(people_name, email, url)
}

# Scrape the pages in positional order so the rows come out in the same order
# as before (1:16, then 17:19, then 20:22), then bind everything once instead
# of growing `df` inside the loops.
scrape_order <- sort(c(profile_list_idx, staff_table_idx))
scraped_pages <- lapply(scrape_order, function(i) {
  if (i %in% staff_table_idx) {
    scrape_staff_table(departments[i])
  } else {
    scrape_profile_list(departments[i])
  }
})
df <- do.call(rbind, scraped_pages)
# Row counts reported by the original author: 1168 people for pages 1:16,
# 36 for pages 17:19 and 1147 for pages 20:22.
#save(df, file="Sweden4_2.RData")

最新更新