r语言 - 使用rvest抓取网页上未显示的数据 - r - Web scraping data that is not displayed on a webpage using rvest 小贝子编程网

这是上一个问题的后续:使用R抓取数据并将结果放入数据帧

我试图从Glassdoor上抓取评论，包括子评级(工作与生活的平衡，文化和价值观等)。子评级在一个下拉菜单中，并显示为一些星星(1-5)。对于我之前的问题，Dave2e发布了一个非常有用的解决方案，但我发现一些公司的评论页面格式不同，所以这个解决方案不起作用。下面是一个行不通的公司的例子。

library(stringr)
library(httr)  
library(xml2)  
library(rvest) 
library(purrr) 
library(tidyverse)
library(lubridate)
# Scrape the per-review sub-ratings (work/life balance, culture and values,
# ...) from the first three review pages of one employer and stack them into
# `Subratings`: one row per extracted review, one column per sub-rating.
Subratings <- data.frame()
url <- "https://www.glassdoor.com/Reviews/Fresenius-Medical-Care-North-America-Reviews-"
settings_url <- ".htm?filter.iso3Language=eng"

# Pull every single-digit value that directly follows `"<key>":` in `text`.
extract_rating <- function(text, key) {
  # Backslashes are doubled so the regex engine receives `\d`.
  pattern <- paste0('(?<="', key, '":)\\d')
  as.integer(unlist(str_extract_all(text, pattern)))
}

pages <- 1:3
# Collect one data frame per page and bind once at the end instead of
# growing `Subratings` inside the loop.
page_ratings <- vector("list", length(pages))

for (i in seq_along(pages)) {
  x <- pages[i]
  pg_reviews <- read_html(GET(paste0(url, "E10445", "_P", x, settings_url)))

  # The ratings are stored in a data structure inside a <script> element:
  # collect all scripts, then search them.
  scripts <- pg_reviews %>% html_elements(xpath = "//script")

  # Index of the script(s) that mention one of the rating keys
  ratingsScript <- which(grepl("ratingCareerOpportunities", scripts))

  # Narrow the script text down to the JSON-like data. The literal is
  # single-quoted because it contains double quotes; each closing brace is
  # escaped for the regex engine.
  # NOTE(review): when this yields NA the page's data may end with fewer
  # closing braces; relaxing the tail to three braces is a candidate fix --
  # verify against the live page.
  data1 <- scripts[ratingsScript] %>%
    html_text2() %>%
    str_extract('"urlParams":.+\\}\\}\\}\\}')

  # Extract the ratings
  WorkLifeBalance <- extract_rating(data1, "ratingWorkLifeBalance")
  CultureAndValues <- extract_rating(data1, "ratingCultureAndValues")
  DiversityAndInclusion <- extract_rating(data1, "ratingDiversityAndInclusion")
  SeniorLeadership <- extract_rating(data1, "ratingSeniorLeadership")
  CareerOpportunities <- extract_rating(data1, "ratingCareerOpportunities")
  CompensationAndBenefits <- extract_rating(data1, "ratingCompensationAndBenefits")

  # Combine columns
  combine <- cbind(
    WorkLifeBalance, CultureAndValues, DiversityAndInclusion,
    SeniorLeadership, CareerOpportunities, CompensationAndBenefits
  )

  page_ratings[[i]] <- as.data.frame(combine)
}

Subratings <- do.call(rbind, page_ratings)

看起来这个页面的数据末尾少了一个右花括号，试试：str_extract('"urlParams":.+\\}\\}\\}')。
这应该也适用于前面的页面。

经过多次搜索后，员工评论被存储在以"reviews":开头，以}]}结尾的字符串中。
通过添加前导{将评论转换为有效的JSON，从而进行简单的转换。

library(stringr) 
library(httr)
library(xml2)
library(rvest) 
library(dplyr)
# Download the first three review pages of one employer, pull the embedded
# "reviews" data out of the page scripts, parse it as JSON, and row-bind the
# per-page results.
Subratings <- data.frame()
url <- "https://www.glassdoor.com/Reviews/Fresenius-Medical-Care-North-America-Reviews-"
settings_url <- ".htm?filter.iso3Language=eng"

dfs <- lapply(1:3, function(x) {
  pg_reviews <- read_html(GET(paste0(url, "E10445", "_P", x, settings_url)))

  # The ratings are stored in a data structure inside a <script> element:
  # collect all scripts, then search them.
  scripts <- pg_reviews %>% html_elements(xpath = "//script")

  # Index of the script(s) that mention one of the rating keys
  ratingsScript <- which(grepl("ratingCareerOpportunities", scripts))

  # Extract the review text from the script; it is almost valid JSON.
  # The literal is single-quoted because it contains double quotes; `.+?` is
  # lazy so the match stops at the first `}]}`.
  reviews <- scripts[ratingsScript] %>%
    html_text2() %>%
    str_extract('"reviews":.+?\\}\\]\\}')

  # Fail with a clear message instead of an opaque JSON parse error when the
  # payload was not found (or more than one script matched).
  if (length(reviews) != 1L || is.na(reviews)) {
    stop("Could not locate the reviews data on page ", x, call. = FALSE)
  }

  # Add a leading { to make valid JSON and convert
  answer <- jsonlite::fromJSON(paste("{", reviews))
  answer
})

bind_rows(dfs)

r语言 - 使用rvest抓取网页上未显示的数据

相关内容

最新更新

热门标签：