我想在数据框(data frame)的序列中搜索一组基序(motif)。下面附上示例数据集和代码。
# Example data: one row per gene, with its promoter sequence stored as a plain
# character string.
# NOTE(review): the Gene1 sequence ends in "H", which is not one of A/C/G/T --
# confirm whether this is an intended ambiguity code or a typo.
Gene_and_Promoter <- tibble::tibble(
  Gene = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5", "Gene6"),
  Promoter = c(
    "AGTCACGTGCGTGCATACGTGCAAATTGGGCGTACGTGGCTATCTCAACTATCH",
    "AACGTGGCGTGGCAGTGCACGTGCCAGTTGTCCCGCAGTGTGCATACTACTCT",
    "ACTGGCTACGTGCTGCAATGCGTGCGTAGTGCGTACCAAAGTTAAACCGGCG",
    "GCAATACGTGCAAGTGCGTGTACGTGCGTGATGTCGTACGTAACCGGCCGGT",
    "ATACGTGCGTCGTACGTGCGTACTAATACATACATCATAATTTAAACCCG",
    "GGGGGAATCTCGTTCCTACGTCAAGGATAGATGCTGATAGTCGTA"
  )
)
# Motifs to search for, held in a single-column tibble.
Motifs <- tibble::tibble(
  MOTIF = c("CGTGC", "GGAATA", "CCAG", "CGTA")
)
# Add one column per motif holding the number of times that motif occurs in
# each promoter. Requires dplyr (`%>%`, `mutate()`) and Biostrings
# (`DNAStringSet()`, `vcountPattern()`) to be attached.
# `Promoter` is resolved through mutate()'s data mask, so the counts are always
# computed from the rows flowing through the pipe rather than from the global
# `Gene_and_Promoter` object (which would go out of sync after any upstream
# filter or reorder).
Gene_and_Promoter %>%
  mutate(CGTGC = vcountPattern("CGTGC", DNAStringSet(Promoter))) %>%
  mutate(GGAATA = vcountPattern("GGAATA", DNAStringSet(Promoter))) %>%
  mutate(CCAG = vcountPattern("CCAG", DNAStringSet(Promoter))) %>%
  mutate(CGTA = vcountPattern("CGTA", DNAStringSet(Promoter)))
上述代码可以得到所需的输出(启动子中存在的 Motif)。
我可以通过减少 mutate 的使用次数来优化上面的代码吗?(也许可以通过迭代实现)
这里有一种与 @det 的答案类似的做法,不过用的是 tidyverse……
library(tidyverse)
# Motifs to count in each promoter.
pat <- c("CGTGC", "GGAATA", "CCAG", "CGTA")
# Name every element after its motif so the count columns inherit those names.
lpat <- set_names(as.list(pat), pat)

# Requires Biostrings (`DNAStringSet()`, `vcountPattern()`) to be attached in
# addition to the tidyverse.
# Convert the promoters once, count each motif across all of them, and append
# the counts as new columns. Binding column-wise keeps the original row order
# and row count; a join on `Gene` would duplicate rows if a gene name were
# ever repeated.
promoter_set <- DNAStringSet(Gene_and_Promoter$Promoter)
motif_counts <- map(lpat, function(motif) vcountPattern(motif, promoter_set))
dd <- bind_cols(Gene_and_Promoter, as_tibble(motif_counts))
如果不深入了解函数 DNAStringSet,很难下结论。也许可以试试下面这样的写法:
library(data.table)
library(purrr)
# Motifs to count; each string is both the pattern and the new column's name.
Motifs <- c("CGTGC", "GGAATA", "CCAG", "CGTA")
# Build the DNAStringSet once so every motif is counted against the same object.
vec <- DNAStringSet(Gene_and_Promoter$Promoter)
# Convert to a data.table by reference so `:=` can add the columns in place.
setDT(Gene_and_Promoter)
Gene_and_Promoter[
  , (Motifs) := lapply(Motifs, function(motif) vcountPattern(motif, vec))
]