我有一个数据帧,其中的文本列包含一些短链接URL。需要过滤掉的URL由另一列中的值来标识;该列本质上是一个嵌套列表,位于某个数据帧列中,而这个数据帧列本身又嵌套在另一列里。嵌套列表中大部分是NULL值,其余是三种不同类型的数据帧,它们共享其中一些列。我想使用某个共享列的值作为过滤条件。
# Dump the first ten rows of the nested `entities` column as reproducible R code.
dput(utils::head(x = data$entities, n = 10L))
输出如下:
# Reproducible dump of a 10-row data.frame whose columns (mentions, hashtags,
# urls, annotations, cashtags) are all list-columns. Each list element is
# either NULL or a small data.frame describing the entities of one row.
# In `urls`, element 7 has a display_url starting with "pic.twitter.com" and
# element 8 one starting with "stern.de"; all other `urls` elements are NULL.
structure(list(mentions = list(structure(list(start = 0L, end = 5L,
username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L),
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L),
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L),
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L),
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L),
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L),
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L),
structure(list(start = c(0L, 17L), end = c(16L, 22L), username = c("Sebasti52801956",
"BILD"), id = c("1506995029205729292", "9204502")), class = "data.frame", row.names = 1:2),
structure(list(start = c(0L, 11L), end = c(10L, 16L), username = c("Nico_FHKo",
"BILD"), id = c("3166027097", "9204502")), class = "data.frame", row.names = 1:2),
structure(list(start = 0L, end = 5L, username = "BILD", id = "9204502"), class = "data.frame", row.names = 1L)),
hashtags = list(NULL, NULL, structure(list(start = c(82L,
106L, 133L), end = c(89L, 116L, 145L), tag = c("grünen",
"Bundestag", "Solidarität")), class = "data.frame", row.names = c(NA,
3L)), NULL, NULL, NULL, NULL, NULL, NULL, NULL), urls = list(
NULL, NULL, NULL, NULL, NULL, NULL, structure(list(start = 6L,
end = 29L, url = "t.co/OC9Vo93CAp", expanded_url = "https://twitter.com/ThomasB59914994/status/1510264984034492417/photo/1",
display_url = "pic.twitter.com/OC9Vo93CAp", media_key = "16_1510264976740691973"), class = "data.frame", row.names = 1L),
structure(list(start = 141L, end = 164L, url = "t.co/fSa4TZmzFt",
expanded_url = "https://www.stern.de/digital/technik/sicher--klein-und-billig---china-baut-den-ersten-thorium-reaktor--30632008.html",
display_url = "stern.de/digital/techni…", images = list(
structure(list(url = c("https://pbs.twimg.com/news_img/1557443927816404992/VfYVrEGt?format=jpg&name=orig",
"https://pbs.twimg.com/news_img/1557443927816404992/VfYVrEGt?format=jpg&name=150x150"
), width = c(1440L, 150L), height = c(810L, 150L
)), class = "data.frame", row.names = 1:2)),
status = 200L, title = "Sicher, klein und billig – China baut den ersten Thorium-Reaktor",
description = "Er ist nicht größer als ein Badezimmer. In China wird ein Thorium-Reaktor in Betrieb genommen. 2030 soll es zur Serienproduktion kommen – die Mini-Reaktoren versprechen CO2-freien Strom ohne die Gefahr eines Gaus.",
unwound_url = "https://www.stern.de/digital/technik/sicher--klein-und-billig---china-baut-den-ersten-thorium-reaktor--30632008.html"), class = "data.frame", row.names = 1L),
NULL, NULL), annotations = list(NULL, NULL, NULL, NULL,
NULL, NULL, NULL, NULL, NULL, NULL), cashtags = list(
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
NULL)), row.names = c(NA, 10L), class = "data.frame")
我现在想做的是:当entities$urls列表中display_url列的值以"pic."开头时,把text列中对应的URL过滤掉。
我尝试了如下代码:
# Asker's attempt, kept as-is because it is the code that produces the error.
# Why it fails:
# - Comparing anything with NULL via `!=` yields logical(0), so `if` stops with
#   "argument is of length zero"; NULL must be tested with is.null().
# - `data$entities$urls` is a list with one element per row, so the test would
#   have to be applied element-wise rather than in a single scalar `if`.
if (data$entities$urls != NULL) {
# `==` compares literal strings and does not do regex matching (grepl() does);
# `$display_url` on the outer list also does not reach into its data frames.
if (data$entities$urls$display_url == "pic.*\s*"){
# In an R string literal the regex class \s has to be written as "\\s".
data$text <- gsub("http.*\s*", "", data$text)
}
}
但我总是收到如下错误:
if (replys_bu$entities$urls != NULL) { 中的错误:参数的长度为零
如能提供任何帮助,我将不胜感激。
下面是一种矢量化的方法:仅对url值不为NULL的行,用新值替换data$text中的字符串。
注意:在你的数据中,entities$urls必须更改为data$entities$urls。
# Remove URLs from `data$text` for every row whose `entities$urls` entry has a
# display_url beginning with "pic".
#
# `entities$urls` is a list with one element per row of `data`; each element
# is either NULL or a data.frame with a `display_url` column.

# Row positions that actually carry a urls data.frame. vapply() always returns
# a logical vector, so this also works when `entities$urls` is empty.
i_not_null <- which(!vapply(entities$urls, is.null, logical(1)))

# One TRUE/FALSE per non-NULL row: does any of its display URLs start with
# "pic"? any() collapses data frames that hold several URLs to one flag.
has_pic_url <- vapply(
  entities$urls[i_not_null],
  function(u) any(grepl("^pic", u$display_url)),
  logical(1)
)

# Row positions in `data` whose text should be cleaned.
i_text <- i_not_null[has_pic_url]

# The regex class \s must be escaped as "\\s" inside an R string literal.
# NOTE: `.*` is greedy, so everything from the first "http" to the end of the
# text is removed; use "http\\S+\\s*" instead to drop only the URL itself.
data$text[i_text] <- gsub("http.*\\s*", "", data$text[i_text])