r语言 - 使用 rvest 抓取"Artsy"



我正试图使用R的rvest包从Artsy获取信息。我想获得有关绘画名称、年份、价格、地点(画廊名称、拍卖会等)、艺术家名称和使用的材料的信息。有关材料的信息位于每幅画的详情页中。我尝试使用的代码如下所示:

library(rvest)
library(dplyr)
library(tidyverse)
# Download one painting page and return the text of every node matching the
# material selector, joined into a single comma-separated string.
get_material <- function(painting_link) {
  detail_page <- read_html(painting_link)
  material_nodes <- html_nodes(detail_page, 'h2+ .kPqROo')
  material_text <- html_text(material_nodes)
  paste(material_text, collapse = ",")
}
# Scrape listing pages 2 and 3 and assemble the results into a data frame.
# NOTE(review): every variable assigned inside the loop is overwritten on each
# iteration, so only the values from the last page are still present when the
# data frame is built after the loop.
for(page_result in 2:3) {

link = paste0 ("https://www.artsy.net/collect?page=", page_result, "&additional_gene_ids%5B0%5D=painting") 
page = read_html(link)

painting_name_year = page %>%  html_nodes("#main .kjRHrZ") %>% html_text()
# NOTE(review): html_attr() is given an HTML fragment instead of an attribute
# name, and the unescaped double quotes inside that fragment end the string
# literal early, so this line does not parse. Presumably the "href" attribute
# of a link element was intended -- confirm against the page markup.
painting_link = page %>% html_nodes('#main .kjRHrZ') %>% html_attr("<div color="black60" font-family="sans" class="Box-sc-15se88d-0 Text-sc-18gcpao-0 kjRHrZ">n<i>") %>% paste("https://www.artsy.net", ., sep="/")
price = page %>%  html_nodes('.ibabyz') %>%  html_text()
# NOTE(review): 'hWKLzd' has no leading ".", unlike the other class selectors
# in this block, so it is read as an element name -- presumably '.hWKLzd' was
# meant.
place = page %>% html_nodes('hWKLzd') %>% html_text()
# NOTE(review): this selector only matches a .bQOCym element nested inside
# another .bQOCym element -- verify that this is the intended target.
artist = page %>% html_nodes('.bQOCym .bQOCym') %>% html_text()
# Calls get_material() once per element of painting_link (one page download
# per link).
material = sapply(painting_link, FUN=get_material, USE.NAMES = FALSE)
}
# NOTE(review): `material` is computed above but is not included as a column
# here.
artsy <- data.frame(painting_name_year, price, place, artist)
view(artsy)

获取 painting_link、地点(place)和材料(material)的代码不起作用。此外,每条观察记录都重复出现了3次。如何解决此问题?

您可以移除循环。首先生成起始URL列表。然后,与其从列表页(landing page)上抓取部分信息,不如先收集所有作品详情页的URL,再逐个访问这些详情页。

然后,您可以利用多个CPU内核并行工作,对每个详情页URL调用一次函数来收集所需的数据,从而提高一些效率。

N.B. 由于此操作受I/O限制,因此使用异步方法可能会提高效率。如果我能找到一个不错的教程/参考资料,我可能会更新这个答案。

如果您通过函数从每个列表url返回所需信息的tibble,则可以通过在列表链接和用户定义函数上调用future_map_dfr来生成最终的dataframe

library(purrr)
library(rvest)
#> Loading required package: xml2
#> Warning: package 'xml2' was built under R version 4.0.3
#> 
#> Attaching package: 'rvest'
#> The following object is masked from 'package:purrr':
#> 
#>     pluck
library(tidyverse)
#> Warning: package 'tibble' was built under R version 4.0.3
#> Warning: package 'forcats' was built under R version 4.0.3
library(jsonlite)
#> Warning: package 'jsonlite' was built under R version 4.0.3
#> 
#> Attaching package: 'jsonlite'
#> The following object is masked from 'package:purrr':
#> 
#>     flatten
library(furrr)
#> Warning: package 'furrr' was built under R version 4.0.3
#> Loading required package: future
#> Warning: package 'future' was built under R version 4.0.3
library(stringr)

# Collect artwork links from one listing page.
#
# link: URL of the listing page to download.
#
# Returns a character vector holding the href of every element that has a
# class attribute and whose href contains "artwork", each prefixed with the
# site root. Returns character(0) when the page has no such element.
# NOTE(review): assumes the hrefs are site-relative paths starting with "/" --
# confirm against the page markup.
get_art_links <- function(link) {
  hrefs <- read_html(link) %>%
    html_nodes("[href*=artwork][class]") %>%
    html_attr("href")
  # paste0() treats a zero-length argument as "", so without this guard a page
  # with no matches would yield the bare site root as a bogus artwork link.
  if (length(hrefs) == 0) {
    return(character(0))
  }
  paste0("https://www.artsy.net", hrefs)
}

# Parse the JSON-LD metadata of an already-downloaded page.
#
# page: a parsed HTML document (as returned by read_html()).
#
# Takes the first element whose type attribute is "application/ld+json",
# reads its text and returns the result of parsing that text as JSON.
get_listing_json <- function(page) {
  ld_node <- html_node(page, '[type="application/ld+json"]')
  ld_text <- html_text(ld_node)
  jsonlite::parse_json(ld_text)
}

# Scrape a single artwork page into a one-row tibble.
#
# link: URL of the artwork page to download.
#
# Returns a tibble with the columns artist, title, production_date, material,
# width, height, place, price, currency and availability. Most values come
# from the page's JSON-LD metadata; title and material are read from the
# artwork sidebar markup. A value that is absent from the metadata becomes NA.
get_listing_info <- function(link) {
  # tibble() silently drops NULL inputs, and a zero-length column makes it
  # recycle the whole row down to zero rows. Mapping "missing" to NA keeps
  # exactly one row with the full set of columns for every listing.
  or_na <- function(x) {
    if (length(x) == 0) NA else x
  }
  # Text of the first node matching `css`; NA when nothing matches.
  node_text <- function(css) {
    page %>%
      html_node(css) %>%
      html_text()
  }

  page <- read_html(link)
  json <- get_listing_json(page)

  # Coerced to character so the string helpers below always receive a
  # length-one character vector (NA_character_ when the field is missing).
  description <- as.character(or_na(json$description))
  availability <- as.character(or_na(json$offers$availability))

  tibble(
    artist = or_na(json$brand$name),
    title = node_text('[data-test="artworkSidebar"] h2 > i'),
    production_date = or_na(json$productionDate),
    material = node_text('[data-test="artworkSidebar"] h2 + div'),
    width = or_na(json$width),
    height = or_na(json$height),
    # Text between the first "from " and the next comma in the description.
    place = stringr::str_match(description, "from (.*?),")[, 2],
    price = or_na(json$offers$price),
    currency = or_na(json$offers$priceCurrency),
    availability = str_replace(availability, "https://schema.org/", "")
  )
}

# Build the listing-page URLs for pages 2 and 3.
pages <- 2:3 %>% as.character()
urls <- sprintf("https://www.artsy.net/collect?page=%s&additional_gene_ids[0]=painting", pages)
# Gather the artwork links of every listing page into one flat character
# vector.
links <- purrr::map(urls, get_art_links) %>%
  unlist()
# Leave one core free, but never ask for fewer than one worker: on a
# single-core machine availableCores() - 1 would be 0, which is not a valid
# worker count.
no_cores <- max(1L, future::availableCores() - 1L)
future::plan(future::multisession, workers = no_cores)
# Scrape each artwork page in parallel and row-bind the per-page tibbles.
results <- future_map_dfr(links, .f = get_listing_info)

由reprex包(v0.3.0)于2021-05-16创建

最新更新