使用 R 进行网页抓取(抓取隐藏号码 "Click here to show number" )



作为数据科学团队的实习生,我的任务是找到一种方法,使用 R 自动收集房地产广告网站上的特定数据。

多亏了这篇帖子(《在房地产广告上使用 R 进行网络抓取》)中给出的答案,再加上对代码做的一些修改,我成功完成了想要的任务。但我的问题是无法抓取电话号码。我尝试了几种方法,都没有成功。

我想做与上一篇文章完全相同的事情,但将电话号码作为新变量。

以下是一则广告的详细信息: https://www.leboncoin.fr/ventes_immobilieres/1074663461.htm?ca=13_s 我的变量是:价格("Prix")、城市("Ville")、面积("Surface")、"GES"、"Classe énergie"、房间数量("Pièces")和电话号码,以及广告中显示的图片数量。

我注意到那个答案中给出的代码已经不再有效,因为提出问题时该网站使用的是不安全的协议(http),而现在它的网址以 "https" 开头。这就是我对代码进行一些更改的原因。

我是 R 的初学者,任何帮助我都将不胜感激(抱歉我的英语不好)。

#' Collect the ad links listed on one search-result page
#'
#' Downloads one leboncoin real-estate result page and reads the `href`
#' attribute of the anchor inside each of the first `n_links` list items.
#'
#' @param page Result page number; it is inserted into the `o` query
#'   parameter of the search URL.
#' @param n_links Number of list items to inspect. Defaults to 30, the value
#'   that used to be hard-coded.
#' @return Character vector of length `n_links` holding the `href` values;
#'   `NA` for list items that do not exist on the page.
get_ad_links <- function(page, n_links = 30L) {
  # The base URL has to be one single string: a line break inside the
  # literal would end up inside the requested URL.
  url_base <- paste0(
    "https://www.leboncoin.fr/ventes_immobilieres/offres/",
    "languedoc_roussillon/pyrenees_orientales"
  )
  url <- paste0(url_base, "?o=", page, "&ret=1&ret=2&f=p")
  doc <- rvest::read_html(url)

  # One XPath per list item, each selecting the href of the item's anchor.
  xpaths <- paste0(
    "//*/section/section/ul/li[", seq_len(n_links), "]/a/@href"
  )
  vapply(
    xpaths,
    function(xp) rvest::html_text(rvest::html_node(doc, xpath = xp)),
    character(1),
    USE.NAMES = FALSE
  )
}
# Function to Get Ad Details by Ad URL
#' Extract the main attributes of a single ad
#'
#' Parses the ad page and reads six fields from fixed positions in the
#' page layout, strips any HTML tag left in the text and re-wraps it with
#' `stringr::str_wrap()`.
#'
#' @param ad_url Ad URL without the scheme; "https:" is prepended before
#'   the page is downloaded.
#' @return One-row data frame with the columns PRIX, VILLE, SURFACE, PIECES,
#'   TYPE_BIEN and GES.
get_ad_details <- function(ad_url) {
  doc <- rvest::read_html(paste0("https:", ad_url))

  # Matches an opening, closing or self-closing HTML tag, including any
  # quoted or unquoted attributes. Backslashes and double quotes have to be
  # escaped inside the R string literal.
  tag_pattern <- paste0(
    "</?\\w+((\\s+\\w+(\\s*=\\s*(?:\".*?\"|'.*?'|",
    "[^'\">\\s]+))?)+\\s*|\\s*)/?>"
  )

  # Read the value <span> of the div at position `div_index` and clean it.
  read_field <- function(div_index) {
    xpath <- sprintf(
      "//section/section/section[2]/div[%d]/h2/span[2]",
      div_index
    )
    raw <- rvest::html_text(rvest::html_node(doc, xpath = xpath))
    stringr::str_wrap(stringr::str_replace_all(raw, tag_pattern, ""))
  }

  # Position of each field's div inside the second nested <section>.
  field_divs <- c(
    PRIX = 4L, VILLE = 5L, SURFACE = 8L,
    PIECES = 7L, TYPE_BIEN = 6L, GES = 9L
  )
  values <- vapply(field_divs, read_field, character(1), USE.NAMES = FALSE)

  # Turn the six values into a single labelled row.
  mydf <- as.data.frame(t(values))
  names(mydf) <- names(field_divs)
  mydf
}

# plyr provides ldply(). library() stops with an error when the package is
# missing, whereas require() would only return FALSE and let the script
# fail later with a less helpful message.
library(plyr)

# Links of the ads listed on the first result page.
ad_links <- get_ad_links(page = 1)

# Scrape the details of the first 30 ads and stack the one-row data frames
# into a single data frame, showing a text progress bar meanwhile.
ad_details <- ldply(ad_links[1:30], get_ad_details, .progress = "text")

这里的问题是电话号码位于一个按钮后面,在显示此号码之前必须单击该按钮。这样做是为了防止网络抓取工具获取这些电话号码。

使用 rvest 无法在网页上执行点击操作。不过,您可以研究另一种方法:使用 RSelenium。此方法使用运行在 Docker 容器中的 Web 浏览器,它的工作方式与普通浏览器一样,但可以通过 R 命令来操控。

最后,我使用 RSelenium 找到了解决方案。下面是函数 get_ad_phoneNumber,它返回一个数据框,其中以广告链接作为 ID 并包含电话号码,以便与之前创建的数据框进行匹配。

但是我遇到了一个新问题:实际上,在我检索了 4 到 5 个电话号码之后,我的 IP 地址就被封锁了。另外,当我使用不在法国的 VPN 时,单击按钮后电话号码不会显示。

那么,如何在每次点击后动态更改我的 IP 地址(仅限法国境内)?或者有什么其他办法吗?

# Attach the packages used below. library() stops right away when one of
# them is not installed; require() would only return FALSE.
x <- c("RSelenium", "rvest", "plyr")
invisible(lapply(x, library, character.only = TRUE))

# Start a local Selenium server and keep its handle so that the process can
# be shut down later with selenium_server$stop().
selenium_server <- wdman::selenium(verbose = FALSE)

# Connect a PhantomJS-backed remote driver to that server and open the
# browser session used by the scraping functions.
remDr <- remoteDriver(port = 4567L, browserName = "phantomjs")
remDr$open()
# Function to Get the phone number by Ad URL
#' Get the phone number displayed on an ad page
#'
#' Drives the global Selenium session `remDr`: opens the ad, clicks the
#' button that reveals the phone number and reads the revealed text.
#'
#' @param ad_url Full URL of the ad page. It is also returned as the ID so
#'   the result can be matched with other data frames.
#' @param wait Seconds to pause after each browser action so the page can
#'   finish loading. Defaults to 5, the delay that used to be hard-coded.
#' @return One-row data frame with the character columns ID and
#'   PhoneNumber. PhoneNumber is `NA` (with a warning) when the button or
#'   the number cannot be found, so one failing ad does not abort a batch.
get_ad_phoneNumber <- function(ad_url, wait = 5) {
  remDr$navigate(ad_url)
  Sys.sleep(wait)

  phone_number <- tryCatch(
    {
      # Locate and click the button hiding the phone number.
      button <- remDr$findElement(
        using = "css selector",
        value = paste(
          "aside > div >",
          "div.box-grey-light.mbs.align-center > div > button"
        )
      )
      Sys.sleep(wait)
      button$clickElement()
      Sys.sleep(wait)

      # The number appears in a link once the button has been clicked.
      number_link <- remDr$findElement(
        using = "xpath",
        value = "//aside/div/div[1]/div/span/a"
      )
      # getElementText() returns a list; keep its single string so the
      # data frame gets a plain character column instead of a list column.
      number_link$getElementText()[[1]]
    },
    error = function(e) {
      warning(
        "Could not read the phone number for ", ad_url, ": ",
        conditionMessage(e),
        call. = FALSE
      )
      NA_character_
    }
  )

  data.frame(
    ID = ad_url,
    PhoneNumber = phone_number,
    stringsAsFactors = FALSE
  )
}

这就是我在早期编程时想到的。

现在回头看,代码的流程和缺少注释让我感到难为情。但它确实奏效了。请告诉我它对您是否有效。我是 StackOverflow 的新手。

#' Fetch one yellowpages.co.za result page and read its ".idShowNumber" nodes
#'
#' @param k Page number; it is appended to the `pg=` query parameter.
#' @return The URL of page `k` as a character scalar. The cleaned texts of
#'   the ".idShowNumber" nodes are attached as the attribute "numbers"
#'   instead of being thrown away.
Shownumbers <- function(k) {
  # Include the page number in the URL that is actually downloaded.
  url <- paste0("https://www.yellowpages.co.za/search?what=travel&pg=", k)
  webpage_new <- read_html(url)

  show <- html_text(html_nodes(webpage_new, ".idShowNumber"))
  # Drop the first newline-plus-ten-tabs run in each text.
  show <- sub("\n\t\t\t\t\t\t\t\t\t\t", "", show, fixed = TRUE)
  # Remove literal backslashes. replace() indexes by name and would append
  # an element instead of substituting text, so gsub() is used.
  show <- gsub("\\", "", show, fixed = TRUE)

  attr(url, "numbers") <- show
  url
}
# Scrape the ".yext-phone" texts from result pages 1 to 135 and keep the
# distinct values in `no_final` (a data frame with the single column `no`).
Shownumbers(1)

url <- "https://www.yellowpages.co.za/search?what=travel&pg="
n_pages <- 135L

# One data frame per page, collected in a preallocated list and bound once
# at the end instead of growing a data frame with rbind() in every pass.
page_numbers <- vector("list", n_pages)
for (d in seq_len(n_pages)) {
  Shownumbers(d)
  no_read <- read_html(paste0(url, d))
  no <- html_text(html_nodes(no_read, ".yext-phone"))
  # Drop the first newline-plus-ten-tabs run in each text.
  no <- sub("\n\t\t\t\t\t\t\t\t\t\t", "", no, fixed = TRUE)
  # Remove literal backslashes. replace() indexes by name and would append
  # an element instead of substituting text, so gsub() is used.
  no <- gsub("\\", "", no, fixed = TRUE)
  page_numbers[[d]] <- as.data.frame(no)
}

no_final <- unique(do.call(rbind, page_numbers))

最新更新