在R中使用含有NULL值的嵌套列表中的值作为if条件



我有一个数据框,其中的文本列(text)包含一些短链接URL。需要过滤掉的URL由另一列中的值来标识:该列(urls)是一个列表列,位于数据框列entities之中,而entities本身又嵌套在数据框data里。这个嵌套列表中大部分是NULL值,其余是三种不同类型的数据框,它们确实共享一些列。我想使用这些共享列的值作为过滤条件。

dput(head(data$entities, 10))输出如下:

structure(list(mentions = list(structure(list(start = 0L, end = 5L, 
username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L), 
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L), 
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L), 
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L), 
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L), 
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L), 
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L), 
structure(list(start = c(0L, 17L), end = c(16L, 22L), username = c("Sebasti52801956", 
"BILD"), id = c("1506995029205729292", "9204502")), class = "data.frame", row.names = 1:2), 
structure(list(start = c(0L, 11L), end = c(10L, 16L), username = c("Nico_FHKo", 
"BILD"), id = c("3166027097", "9204502")), class = "data.frame", row.names = 1:2), 
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L)), 
hashtags = list(NULL, NULL, structure(list(start = c(82L, 
106L, 133L), end = c(89L, 116L, 145L), tag = c("grünen", 
"Bundestag", "Solidarität")), class = "data.frame", row.names = c(NA, 
3L)), NULL, NULL, NULL, NULL, NULL, NULL, NULL), urls = list(
NULL, NULL, NULL, NULL, NULL, NULL, structure(list(start = 6L, 
end = 29L, url = "t.co/OC9Vo93CAp", expanded_url = "https://twitter.com/ThomasB59914994/status/1510264984034492417/photo/1", 
display_url = "pic.twitter.com/OC9Vo93CAp", media_key = "16_1510264976740691973"), class = "data.frame", row.names = 1L), 
structure(list(start = 141L, end = 164L, url = "t.co/fSa4TZmzFt", 
expanded_url = "https://www.stern.de/digital/technik/sicher--klein-und-billig---china-baut-den-ersten-thorium-reaktor--30632008.html", 
display_url = "stern.de/digital/techni…", images = list(
structure(list(url = c("https://pbs.twimg.com/news_img/1557443927816404992/VfYVrEGt?format=jpg&name=orig", 
"https://pbs.twimg.com/news_img/1557443927816404992/VfYVrEGt?format=jpg&name=150x150"
), width = c(1440L, 150L), height = c(810L, 150L
)), class = "data.frame", row.names = 1:2)), 
status = 200L, title = "Sicher, klein und billig – China baut den ersten Thorium-Reaktor", 
description = "Er ist nicht größer als ein Badezimmer. In China wird ein Thorium-Reaktor in Betrieb genommen. 2030 soll es zur Serienproduktion kommen – die Mini-Reaktoren versprechen CO2-freien Strom ohne die Gefahr eines Gaus.", 
unwound_url = "https://www.stern.de/digital/technik/sicher--klein-und-billig---china-baut-den-ersten-thorium-reaktor--30632008.html"), class = "data.frame", row.names = 1L), 
NULL, NULL), annotations = list(NULL, NULL, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL), cashtags = list(
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 
NULL)), row.names = c(NA, 10L), class = "data.frame")

我现在想做的是:当entities$urls列表中display_url列的值以"pic"开头时,过滤掉text列中对应的URL。

然而我尝试了这个:

# NOTE: this is the failing attempt from the question; the comments explain
# why it breaks.
# `x != NULL` always evaluates to logical(0), so if() stops with
# "argument is of length zero". Test for NULL with is.null() instead, and do
# it per element, because data$entities$urls is a list with one entry per row.
if (data$entities$urls != NULL) {
# `$display_url` on the list itself is NULL: display_url is a column of the
# data frames stored inside the list, not a name of the list. `==` also
# compares literally and does not treat "pic.*" as a regular expression
# (use grepl() for that). "\s" is not a valid escape in an R string literal;
# it has to be written "\\s".
if (data$entities$urls$display_url == "pic.*\s*"){ 
# As written, this rewrites every row of data$text, not only the rows whose
# URL entity matched.
data$text <- gsub("http.*\s*", "", data$text)
}
} 

但我总是收到错误

错误于 if (replys_bu$entities$urls != NULL) { : 参数长度为零

如能提供任何帮助,我将不胜感激。

下面是一种矢量化的方法:只在urls的值不是NULL(并且display_url以"pic"开头)的那些行上,用新值替换data$text中的字符串。

注意entities$urls必须更改为data$entities$urls

# Strip URLs from data$text on the rows whose URL entity is a picture link,
# i.e. whose display_url starts with "pic" (e.g. "pic.twitter.com/...").
# `entities` stands for data$entities here.

# Positions of the rows that actually carry a urls data frame (non-NULL).
# vapply() pins the result type to logical, even for an empty list.
i_not_null <- which(!vapply(entities$urls, is.null, logical(1)))

# One flag per non-NULL row: TRUE when any of its display_url values starts
# with "pic". any() keeps the flag scalar when a single row holds several
# URLs, so the flags stay aligned with i_not_null. grepl() returns FALSE for
# NA and logical(0) for a missing column, so neither case selects a row.
is_pic <- vapply(
  entities$urls[i_not_null],
  function(u) any(grepl("^pic", u[["display_url"]])),
  logical(1)
)

# Row positions in data$text that need the URL removed.
i_text <- i_not_null[is_pic]

# Backslashes must be doubled inside an R string literal ("\\s", not "\s").
# NOTE: "http.*" is greedy, so everything from the first "http" to the end of
# the text is removed, not just the URL token.
data$text[i_text] <- gsub("http.*\\s*", "", data$text[i_text])

最新更新