倍数在r中的正则表达式中包括和排除条件



i有一个数据,具有超过1000万个观测值,带有字符串变量dat_text。我正在尝试为每个观察结果提供一个指标变量(IV)。

当字符串 dat_text include pat_text 不包括不包括包含ex_text中的任何字符串(请参考编辑以获取含义)。我正在尝试在r。

中实现此目标
dat_text <- c("dbhgfadgdfgc", "sdfdsfsdgdfxgfydz", "fqdfsbfdjhdhts","dbhgfghfadgdfgc", "sdfdghsfsdgdfxgfydz", "fqdfsbfdjhfghdhts", "fdsafgdjfx", "dfdoslfspd")
ex_text <- c("fgh", "opl")
pat_text <- c("abc", "xyz", "jbq")

我创建了一个函数,其中包括:

myfunction <- function(pat_text, ex_text, dat_text){
# =========PART 1: pat_text========================
logic_tem <- list()
for(i in 1:length(pat_text)){# for each phrase in "pat_text"
  temp <- list()
  for(t in 1:nchar(pat_text[i])){# for each character in the phrase
    temp[[t]] <- grepl(substring(pat_text[i], t, t), dat_text) 
  }
  # Use "AND" to connect multiple logic vectors
  temp <- do.call(cbind, temp)
  logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}
logic_tem <- do.call(cbind, logic_tem)
logic_pattext <- rowSums(logic_tem) > 0
# =========PART 2: ex_text========================
logic_tem <- list()
for(i in 1:length(ex_text)){# for each phrase in "ex_text"
  temp <- list()
  for(t in 1:nchar(ex_text[i])){# for each character in the phrase
    temp[[t]] <- grepl(substring(ex_text[i], t, t), dat_text) 
  }
  temp <- do.call(cbind, temp)
  logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}
# Use "OR" to connect multiple logic vectors
logic_tem <- do.call(cbind, logic_tem)
logic_extext <- rowSums(logic_tem) > 0
# =========PART 3: combine the two parts=========
return(logic_pattext & !logic_extext)
}

此功能运行良好:

> myfunction(pat_text, ex_text, dat_text)
[1] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE

,当我在原始数据上实现此功能时,事实证明相当缓慢且效率低下。我问是否有人可以提供任何提高R中此功能性能的线索?谢谢

edit 我没有清除"任何字符串的置换"是我的不好。用这句话,我实际上是指该观察值包含该字符串中的所有字母。如果pat_textfgh,则以下短语都符合要求:

"fgh", "00000f00000g00000h", "00000g00000h00000f", "000000h00000f00000g"

实际上上述模式中的0 S可以是任何字符或数字。我意识到,这个版本后这个问题实际上成为一个完全不同的问题。我真的没有意识到这是一个完全不同的问题。我很抱歉。

这是您功能的矢量化方法:

myfunction <- function(pat_text, ex_text, dat_text){
    sep_pat_text = strsplit(pat_text,"")
    result = lapply(sep_pat_text, FUN  = function(k){
        testLetter = lapply(k, grepl, x = dat_text)
        resultLetter = do.call(cbind, testLetter)
        apply(resultLetter, 1, all)
    })
    include = apply(do.call(cbind, result), 1, any)

    sep_ex_text = strsplit(ex_text,"")
    result = lapply(sep_ex_text, FUN  = function(k){
        testLetter = lapply(k, grepl, x = dat_text)
        resultLetter = do.call(cbind, testLetter)
        apply(resultLetter, 1, all)
    })
    dontInclude = apply(do.call(cbind, result), 1, any)
    return(!dontInclude & include)
}

将此功能应用于输入样本结果:

> myfunction(pat_text, ex_text, dat_text)
[1] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE

我不能保证这会表现更好,但请尝试评论您的结果。

编辑:可以通过将重复的代码组合到内部功能中来简化此功能。

myfunction <- function(pat_text, ex_text, dat_text){
    testLetters = function(text, pattern){
        sep_pat = strsplit(pattern, "")
        result = lapply(sep_pat, FUN  = function(k){
            testLetter = lapply(k, grepl, x = text)
            resultLetter = do.call(cbind, testLetter)
            apply(resultLetter, 1, all)
        })
        return(apply(do.call(cbind, result), 1, any))
    }    
    include = testLetters(dat_text, pat_text)
    dontInclude = testLetters(dat_text, ex_text)
    return(!dontInclude & include)
}

@r。Schifini我运行了一个基准测试代码来测试哪个功能运行速度更快,这是我的结果。您可以尝试运行相同的代码以确认代码的可复制性。

生成数据

dat_text <- stringi::stri_rand_strings(10^6, 5)
ex_text <- c("fgh", "opl")
pat_text <- c("abc", "xyz", "jbq")

功能1:原始海报提供的功能

myfunction1 <- function(pat_text, ex_text, dat_text){
# =========PART 1: pat_text========================
logic_tem <- list()
for(i in 1:length(pat_text)){# for each phrase in "pat_text"
  temp <- list()
  for(t in 1:nchar(pat_text[i])){# for each character in the phrase
    temp[[t]] <- grepl(substring(pat_text[i], t, t), dat_text) 
  }
  # Use "AND" to connect multiple logic vectors
  temp <- do.call(cbind, temp)
  logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}
logic_tem <- do.call(cbind, logic_tem)
logic_pattext <- rowSums(logic_tem) > 0
# =========PART 2: ex_text========================
logic_tem <- list()
for(i in 1:length(ex_text)){# for each phrase in "ex_text"
  temp <- list()
  for(t in 1:nchar(ex_text[i])){# for each character in the phrase
    temp[[t]] <- grepl(substring(ex_text[i], t, t), dat_text) 
  }
  temp <- do.call(cbind, temp)
  logic_tem[[i]] <- (rowSums(temp) == dim(temp)[2L])
}
# Use "OR" to connect multiple logic vectors
logic_tem <- do.call(cbind, logic_tem)
logic_extext <- rowSums(logic_tem) > 0
# =========PART 3: combine the two parts=========
return(logic_pattext & !logic_extext)
}

函数2:@R的第一个函数。chifini

myfunction2 <- function(pat_text, ex_text, dat_text){
sep_pat_text = strsplit(pat_text,"")
result = lapply(sep_pat_text, FUN = function(k){
testLetter = lapply(k, grepl, x = dat_text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
include = apply(do.call(cbind, result), 1, any)
sep_ex_text = strsplit(ex_text,"")
result = lapply(sep_ex_text, FUN = function(k){
testLetter = lapply(k, grepl, x = dat_text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
dontInclude = apply(do.call(cbind, result), 1, any)
return(!dontInclude & include)
}

函数3:@R的第二个函数。chifini

myfunction3 <- function(pat_text, ex_text, dat_text){
testLetters = function(text, pattern){
sep_pat = strsplit(pattern, "")
result = lapply(sep_pat, FUN = function(k){
testLetter = lapply(k, grepl, x = text)
resultLetter = do.call(cbind, testLetter)
apply(resultLetter, 1, all)
})
return(apply(do.call(cbind, result), 1, any))
}
include = testLetters(dat_text, pat_text)
dontInclude = testLetters(dat_text, ex_text)
return(!dontInclude & include)
}

基准测试

microbenchmark::microbenchmark(
myfunction1(pat_text, ex_text, dat_text),
myfunction2(pat_text, ex_text, dat_text),
myfunction3(pat_text, ex_text, dat_text))
>## Unit: seconds
>## expr min lq mean median uq max neval
>## myfunction1(pat_text, ex_text, dat_text) 3.284922 3.443022 3.605378 3.594186 3.698748 4.041584 100
>## myfunction2(pat_text, ex_text, dat_text) 12.134576 13.457712 13.802636 13.710624 14.765376 16.084844 100
>## myfunction3(pat_text, ex_text, dat_text) 12.136296 13.522227 13.812180 13.719780 14.662117 17.126667 100

我建议您使用一些tidyverse工具,特别是stringr::str_detect,正则表达式和dplyr。没有原始数据,我不知道它会快多少,但是我认为最有可能。这将是以下内容。您可以将零件组合起来使其短,但我认为这是最可读的。我也在解释例如,"任何字符串的任何置换"表示,例如,dat_text中的单个字符串不能包括ex_text中的"fgh",但可以包括"hgf",并且"fgh"必须完全在一个字符串中进行计数。

>
library(tidyverse)
# Make dat_text a column in a data frame
dat_txt_tbl <- tibble(dat_text)
# Make regular expressions
ex_rgx <- str_c(ex_text, sep = "|")
pat_rgx <- str_c(pat_text, sep = "|")
dat_txt_tbl %>%
    mutate(inc_pat_txt = str_detect(dat_text, pat_rgx)) %>%
    mutate(inc_ex_txt = str_detect(ex_text, ex_rgx)) %>%
    mutate(IV = inc_pat_txt & (!inc_ex_txt))

iv希望是您想要的。没有示例,很难调试。

根据问题的新定义更新。不确定这是否会更快,但是您可以从这种方法中选择并选择一些技术:

library(stringr)
pat_text <- c("abc", "xyz", "jbq")
ex_text <- c("fgh", "opl")
dat_text <- c("dbhgfadgdfgc", "sdfdsfsdgdfxgfydz", "fqdfsbfdjhdhts","dbhgfghfadgdfgc", "sdfdghsfsdgdfxgfydz", "fqdfsbfdjhfghdhts", "fdsafgdjfx", "dfdoslfspd")
pat_chars <- strsplit(pat_text, "")
ex_chars <- strsplit(ex_text, "")
mat_pat <- lapply(dat_text, function(x) sapply(pat_chars, str_detect, string = x))
mat_ex <- lapply(dat_text, function(x) sapply(ex_chars, str_detect, string = x))
match_pat <- apply(sapply(mat_pat, function(x) apply(x, 2, all)), 2, any)
match_ex <- apply(sapply(mat_ex, function(x) apply(x, 2, all)), 2, any)
result <- match_pat & !match_ex
result
# [1] FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE


回答原始(未编辑)问题的旧代码

注意:保留以下代码,因为它说明了某些概念,以防有人偶然发现这个问题。

所以我不确定您当前的方法是否正常工作,因为我找不到pat_text的任何匹配?和FALSE anything始终为FALSE。无论如何,我认为这就是您想要的。可能会优化和清理 - 关键正在使所有排列。

library(tidyverse)
library(combinat)
library(stringr)
get_permutations <- function(text) {
  strsplit(text, "") %>%
    map(permn) %>%
    data.frame(check.names = FALSE) %>%
    map(paste0, collapse = "") %>%
    unname %>% 
    unlist
}
pat_perm <- get_permutations(pat_text)
pat_perm
# [1] "abc" "acb" "cab" "cba" "bca" "bac" "xyz" "xzy" "zxy" "zyx" "yzx" "yxz" "jbq" "jqb" "qjb" "qbj"
# [17] "bqj" "bjq"
ex_perm <- get_permutations(ex_text)
ex_perm
# [1] "fgh" "fhg" "hfg" "hgf" "ghf" "gfh" "opl" "olp" "lop" "lpo" "plo" "pol"
match_pat_perm <- str_detect(dat_text, paste0(pat_perm, collapse = "|"))
match_ex_perm <- str_detect(dat_text, paste0(ex_perm, collapse = "|"))
result <- match_pat_perm & !match_ex_perm
result
# Included to show how you might get the locations.
str_locate_all(dat_text, paste0(pat_perm, collapse = "|")) 
str_locate_all(dat_text, paste0(ex_perm, collapse = "|"))

注意:您的问题指出结果应该是整数,但您的示例建议其他情况。无论如何,您总是可以做as.integer(result)

最新更新