i有一个数据,具有超过1000万个观测值,带有字符串变量dat_text
。我正在尝试为每个观察结果提供一个指标变量(IV)。
当字符串 dat_text
include pat_text
和不包括不包括包含ex_text
中的任何字符串(请参考编辑以获取含义)。我正在尝试在r。
dat_text <- c("dbhgfadgdfgc", "sdfdsfsdgdfxgfydz", "fqdfsbfdjhdhts","dbhgfghfadgdfgc", "sdfdghsfsdgdfxgfydz", "fqdfsbfdjhfghdhts", "fdsafgdjfx", "dfdoslfspd")
ex_text <- c("fgh", "opl")
pat_text <- c("abc", "xyz", "jbq")
我创建了一个函数,其中包括:
myfunction <- function(pat_text, ex_text, dat_text){
# =========PART 1: pat_text========================
logic_tem <- list()
for(i in 1:length(pat_text)){# for each phrase in "pat_text"
temp <- list()
for(t in 1:nchar(pat_text[i])){# for each character in the phrase
temp[[t]] <- grepl(substring(pat_text[i], t, t), dat_text)
}
# Use "AND" to connect multiple logic vectors
temp <- do.call(cbind, temp)
logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}
logic_tem <- do.call(cbind, logic_tem)
logic_pattext <- rowSums(logic_tem) > 0
# =========PART 2: ex_text========================
logic_tem <- list()
for(i in 1:length(ex_text)){# for each phrase in "ex_text"
temp <- list()
for(t in 1:nchar(ex_text[i])){# for each character in the phrase
temp[[t]] <- grepl(substring(ex_text[i], t, t), dat_text)
}
temp <- do.call(cbind, temp)
logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}
# Use "OR" to connect multiple logic vectors
logic_tem <- do.call(cbind, logic_tem)
logic_extext <- rowSums(logic_tem) > 0
# =========PART 3: combine the two parts=========
return(logic_pattext & !logic_extext)
}
此功能运行良好:
> myfunction(pat_text, ex_text, dat_text)
[1] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
,当我在原始数据上实现此功能时,事实证明相当缓慢且效率低下。我问是否有人可以提供任何提高R中此功能性能的线索?谢谢
edit 我没有清除"任何字符串的置换"是我的不好。用这句话,我实际上是指该观察值包含该字符串中的所有字母。如果pat_text
是fgh
,则以下短语都符合要求:
"fgh", "00000f00000g00000h", "00000g00000h00000f", "000000h00000f00000g"
实际上上述模式中的0
S可以是任何字符或数字。我意识到,这个版本后这个问题实际上成为一个完全不同的问题。我真的没有意识到这是一个完全不同的问题。我很抱歉。
这是您功能的矢量化方法:
myfunction <- function(pat_text, ex_text, dat_text){
sep_pat_text = strsplit(pat_text,"")
result = lapply(sep_pat_text, FUN = function(k){
testLetter = lapply(k, grepl, x = dat_text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
include = apply(do.call(cbind, result), 1, any)
sep_ex_text = strsplit(ex_text,"")
result = lapply(sep_ex_text, FUN = function(k){
testLetter = lapply(k, grepl, x = dat_text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
dontInclude = apply(do.call(cbind, result), 1, any)
return(!dontInclude & include)
}
将此功能应用于输入样本结果:
> myfunction(pat_text, ex_text, dat_text)
[1] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
我不能保证这会表现更好,但请尝试评论您的结果。
编辑:可以通过将重复的代码组合到内部功能中来简化此功能。
myfunction <- function(pat_text, ex_text, dat_text){
testLetters = function(text, pattern){
sep_pat = strsplit(pattern, "")
result = lapply(sep_pat, FUN = function(k){
testLetter = lapply(k, grepl, x = text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
return(apply(do.call(cbind, result), 1, any))
}
include = testLetters(dat_text, pat_text)
dontInclude = testLetters(dat_text, ex_text)
return(!dontInclude & include)
}
@r。Schifini我运行了一个基准测试代码来测试哪个功能运行速度更快,这是我的结果。您可以尝试运行相同的代码以确认代码的可复制性。
生成数据
dat_text <- stringi::stri_rand_strings(10^6, 5)
ex_text <- c("fgh", "opl")
pat_text <- c("abc", "xyz", "jbq")
功能1:原始海报提供的功能
myfunction1 <- function(pat_text, ex_text, dat_text){
# =========PART 1: pat_text========================
logic_tem <- list()
for(i in 1:length(pat_text)){# for each phrase in "pat_text"
temp <- list()
for(t in 1:nchar(pat_text[i])){# for each character in the phrase
temp[[t]] <- grepl(substring(pat_text[i], t, t), dat_text)
}
# Use "AND" to connect multiple logic vectors
temp <- do.call(cbind, temp)
logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}
logic_tem <- do.call(cbind, logic_tem)
logic_pattext <- rowSums(logic_tem) > 0
# =========PART 2: ex_text========================
logic_tem <- list()
for(i in 1:length(ex_text)){# for each phrase in "ex_text"
temp <- list()
for(t in 1:nchar(ex_text[i])){# for each character in the phrase
temp[[t]] <- grepl(substring(ex_text[i], t, t), dat_text)
}
temp <- do.call(cbind, temp)
logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}
# Use "OR" to connect multiple logic vectors
logic_tem <- do.call(cbind, logic_tem)
logic_extext <- rowSums(logic_tem) > 0
# =========PART 3: combine the two parts=========
return(logic_pattext & !logic_extext)
}
函数2:@R的第一个函数。chifini
myfunction2 <- function(pat_text, ex_text, dat_text){
sep_pat_text = strsplit(pat_text,"")
result = lapply(sep_pat_text, FUN = function(k){
testLetter = lapply(k, grepl, x = dat_text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
include = apply(do.call(cbind, result), 1, any)
sep_ex_text = strsplit(ex_text,"")
result = lapply(sep_ex_text, FUN = function(k){
testLetter = lapply(k, grepl, x = dat_text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
dontInclude = apply(do.call(cbind, result), 1, any)
return(!dontInclude & include)
}
函数3:@R的第二个函数。chifini
myfunction3 <- function(pat_text, ex_text, dat_text){
testLetters = function(text, pattern){
sep_pat = strsplit(pattern, "")
result = lapply(sep_pat, FUN = function(k){
testLetter = lapply(k, grepl, x = text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
return(apply(do.call(cbind, result), 1, any))
}
include = testLetters(dat_text, pat_text)
dontInclude = testLetters(dat_text, ex_text)
return(!dontInclude & include)
}
基准测试
microbenchmark::microbenchmark(
myfunction1(pat_text, ex_text, dat_text),
myfunction2(pat_text, ex_text, dat_text),
myfunction3(pat_text, ex_text, dat_text))
>## Unit: seconds
>## expr min lq mean median uq max neval
>## myfunction1(pat_text, ex_text, dat_text) 3.284922 3.443022 3.605378 3.594186 3.698748 4.041584 100
>## myfunction2(pat_text, ex_text, dat_text) 12.134576 13.457712 13.802636 13.710624 14.765376 16.084844 100
>## myfunction3(pat_text, ex_text, dat_text) 12.136296 13.522227 13.812180 13.719780 14.662117 17.126667 100
我建议您使用一些tidyverse
工具,特别是stringr::str_detect
,正则表达式和dplyr
。没有原始数据,我不知道它会快多少,但是我认为最有可能。这将是以下内容。您可以将零件组合起来使其短,但我认为这是最可读的。我也在解释例如,"任何字符串的任何置换"表示,例如,dat_text
中的单个字符串不能包括ex_text
中的"fgh"
,但可以包括"hgf"
,并且"fgh"
必须完全在一个字符串中进行计数。
library(tidyverse)
# Make dat_text a column in a data frame
dat_txt_tbl <- tibble(dat_text)
# Make regular expressions
ex_rgx <- str_c(ex_text, sep = "|")
pat_rgx <- str_c(pat_text, sep = "|")
dat_txt_tbl %>%
mutate(inc_pat_txt = str_detect(dat_text, pat_rgx)) %>%
mutate(inc_ex_txt = str_detect(ex_text, ex_rgx)) %>%
mutate(IV = inc_pat_txt & (!inc_ex_txt))
iv希望是您想要的。没有示例,很难调试。
根据问题的新定义更新。不确定这是否会更快,但是您可以从这种方法中选择并选择一些技术:
library(stringr)
pat_text <- c("abc", "xyz", "jbq")
ex_text <- c("fgh", "opl")
dat_text <- c("dbhgfadgdfgc", "sdfdsfsdgdfxgfydz", "fqdfsbfdjhdhts","dbhgfghfadgdfgc", "sdfdghsfsdgdfxgfydz", "fqdfsbfdjhfghdhts", "fdsafgdjfx", "dfdoslfspd")
pat_chars <- strsplit(pat_text, "")
ex_chars <- strsplit(ex_text, "")
mat_pat <- lapply(dat_text, function(x) sapply(pat_chars, str_detect, string = x))
mat_ex <- lapply(dat_text, function(x) sapply(ex_chars, str_detect, string = x))
match_pat <- apply(sapply(mat_pat, function(x) apply(x, 2, all)), 2, any)
match_ex <- apply(sapply(mat_ex, function(x) apply(x, 2, all)), 2, any)
result <- match_pat & !match_ex
result
# [1] FALSE TRUE TRUE FALSE FALSE FALSE FALSE FALSE
回答原始(未编辑)问题的旧代码
注意:保留以下代码,因为它说明了某些概念,以防有人偶然发现这个问题。
所以我不确定您当前的方法是否正常工作,因为我找不到pat_text
的任何匹配?和FALSE
和 anything
始终为FALSE
。无论如何,我认为这就是您想要的。可能会优化和清理 - 关键正在使所有排列。
library(tidyverse)
library(combinat)
library(stringr)
get_permutations <- function(text) {
strsplit(text, "") %>%
map(permn) %>%
data.frame(check.names = FALSE) %>%
map(paste0, collapse = "") %>%
unname %>%
unlist
}
pat_perm <- get_permutations(pat_text)
pat_perm
# [1] "abc" "acb" "cab" "cba" "bca" "bac" "xyz" "xzy" "zxy" "zyx" "yzx" "yxz" "jbq" "jqb" "qjb" "qbj"
# [17] "bqj" "bjq"
ex_perm <- get_permutations(ex_text)
ex_perm
# [1] "fgh" "fhg" "hfg" "hgf" "ghf" "gfh" "opl" "olp" "lop" "lpo" "plo" "pol"
match_pat_perm <- str_detect(dat_text, paste0(pat_perm, collapse = "|"))
match_ex_perm <- str_detect(dat_text, paste0(ex_perm, collapse = "|"))
result <- match_pat_perm & !match_ex_perm
result
# Included to show how you might get the locations.
str_locate_all(dat_text, paste0(pat_perm, collapse = "|"))
str_locate_all(dat_text, paste0(ex_perm, collapse = "|"))
注意:您的问题指出结果应该是整数,但您的示例建议其他情况。无论如何,您总是可以做as.integer(result)
。