基于函数在数据帧列表中创建一列



下面是我的可重现示例:

# Example input: a named list of three data frames (hepg2, k562, hoel),
# written as nested structure() calls (looks like dput() output).
# Each data frame has three rows and the same fifteen columns:
# seqnames, start, end, width, strand, name, score, annotation, geneChr,
# geneStart, geneEnd, geneLength, geneStrand, geneId, distanceToTSS.
# `annotation` is a character column with labels such as "Intron (...)" or
# "Distal Intergenic".
grange_list <- list(hepg2 = structure(list(seqnames = structure(c(7L, 15L, 1L
), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", 
"chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", 
"chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", 
"chr22", "chrX"), class = "factor"), start = c(158126281L, 69110138L, 
2205071L), end = c(158126380L, 69110237L, 2205170L), width = c(100L, 
100L, 100L), strand = structure(c(2L, 2L, 1L), .Label = c("+", 
"-", "*"), class = "factor"), name = c("FUS_HepG2_IDR", "FUS_HepG2_IDR", 
"FUS_HepG2_IDR"), score = c(1000L, 1000L, 1000L), annotation = c("Intron (uc011kwa.2/5799, intron 2 of 22)", 
"Intron (uc002arl.3/8125, intron 1 of 6)", "Intron (uc001aja.4/6497, intron 1 of 6)"
), geneChr = c(7L, 15L, 1L), geneStart = c(157331750L, 69070875L, 
2160134L), geneEnd = c(158380482L, 69113261L, 2241652L), geneLength = c(1048733L, 
42387L, 81519L), geneStrand = c(2L, 2L, 1L), geneId = c("5799", 
"8125", "6497"), distanceToTSS = c(254102, 3024, 44937)), row.names = c(NA, 
3L), class = "data.frame"), k562 = structure(list(seqnames = structure(c(10L, 
22L, 11L), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5", 
"chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", 
"chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", 
"chr21", "chr22", "chrX"), class = "factor"), start = c(72508428L, 
49992192L, 3072043L), end = c(72508527L, 49992291L, 3072142L), 
width = c(100L, 100L, 100L), strand = structure(c(1L, 2L, 
2L), .Label = c("+", "-", "*"), class = "factor"), name = c("FUS_K562_IDR", 
"FUS_K562_IDR", "FUS_K562_IDR"), score = c(1000L, 1000L, 
1000L), annotation = c("Intron (uc001jrg.3/140766, intron 15 of 21)", 
"Intron (uc003biq.3/uc003biq.3, intron 1 of 4)", "Intron (uc001lxe.3/833, intron 1 of 22)"
), geneChr = c(10L, 22L, 11L), geneStart = c(72432559L, 50013290L, 
3022152L), geneEnd = c(72522195L, 50051190L, 3078681L), geneLength = c(89637L, 
37901L, 56530L), geneStrand = c(1L, 2L, 2L), geneId = c("140766", 
"348645", "833"), distanceToTSS = c(75869, 58998, 6539)), row.names = c(NA, 
3L), class = "data.frame"), hoel = structure(list(seqnames = structure(c(1L, 
1L, 1L), .Label = c("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", 
"chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", 
"chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", 
"chr22", "chrX", "chrY"), class = "factor"), start = c(557045L, 
870107L, 936673L), end = c(557144L, 870206L, 936772L), width = c(100L, 
100L, 100L), strand = structure(c(1L, 1L, 1L), .Label = c("+", 
"-", "*"), class = "factor"), name = c("FUS", "FUS", "FUS"), 
score = c(1000L, 1000L, 1000L), annotation = c("Distal Intergenic", 
"Intron (uc001abv.1/148398, intron 4 of 4)", "Distal Intergenic"
), geneChr = c(1L, 1L, 1L), geneStart = c(762971L, 860530L, 
948847L), geneEnd = c(794826L, 879961L, 949919L), geneLength = c(31856L, 
19432L, 1073L), geneStrand = c(1L, 1L, 1L), geneId = c("643837", 
"148398", "9636"), distanceToTSS = c(-205827, 9577, -12075
)), row.names = c(NA, 3L), class = "data.frame"))

这是一个数据帧列表,看起来像:

$hepg2
seqnames     start       end width strand          name score                               annotation geneChr geneStart   geneEnd geneLength geneStrand geneId distanceToTSS
1     chr7 158126281 158126380   100      - FUS_HepG2_IDR  1000 Intron (uc011kwa.2/5799, intron 2 of 22)       7 157331750 158380482    1048733          2   5799        254102
2    chr15  69110138  69110237   100      - FUS_HepG2_IDR  1000  Intron (uc002arl.3/8125, intron 1 of 6)      15  69070875  69113261      42387          2   8125          3024
3     chr1   2205071   2205170   100      + FUS_HepG2_IDR  1000  Intron (uc001aja.4/6497, intron 1 of 6)       1   2160134   2241652      81519          1   6497         44937
$k562
seqnames    start      end width strand         name score                                    annotation geneChr geneStart  geneEnd geneLength geneStrand geneId distanceToTSS
1    chr10 72508428 72508527   100      + FUS_K562_IDR  1000   Intron (uc001jrg.3/140766, intron 15 of 21)      10  72432559 72522195      89637          1 140766         75869
2    chr22 49992192 49992291   100      - FUS_K562_IDR  1000 Intron (uc003biq.3/uc003biq.3, intron 1 of 4)      22  50013290 50051190      37901          2 348645         58998
3    chr11  3072043  3072142   100      - FUS_K562_IDR  1000       Intron (uc001lxe.3/833, intron 1 of 22)      11   3022152  3078681      56530          2    833          6539
$hoel
seqnames  start    end width strand name score                                annotation geneChr geneStart geneEnd geneLength geneStrand geneId distanceToTSS
1     chr1 557045 557144   100      +  FUS  1000                         Distal Intergenic       1    762971  794826      31856          1 643837       -205827
2     chr1 870107 870206   100      +  FUS  1000 Intron (uc001abv.1/148398, intron 4 of 4)       1    860530  879961      19432          1 148398          9577
3     chr1 936673 936772   100      +  FUS  1000                         Distal Intergenic       1    948847  949919       1073          1   9636        -12075

我创建了一个在annotation列中查找特定模式的函数:

# Report whether ANY element of `annotation` mentions UTR, Intron or Exon
# (case-insensitive).
#
# annotation: a vector of annotation labels.
# Returns a single number for the whole input: 1 if at least one element
# matches, 0 otherwise. It does not return one flag per element.
flag_annot <- function(annotation) {
  # One alternation pattern stands in for three separate keyword searches.
  hit <- any(grepl("UTR|Intron|Exon", annotation, ignore.case = TRUE))
  if (hit) 1 else 0
}

目标是根据对annotation列求得的值,新建一个名为intragenic的列,其取值为1或0。

我知道可以像下面这样从每个数据帧中提取annotation列:

# Pull the `annotation` column out of every data frame in the list.
lapply(grange_list, function(df) df[, "annotation"])

我想找一种简洁的单行写法(也许使用mapply),把flag_annot函数与上面的取子集操作结合起来。谢谢!

为了简化,我会把函数改为使用grepl,并用|把多个模式合并成一个正则表达式,而不是分别写多次判断。grepl按元素返回逻辑值,因此每一行都会得到各自的标记。

# Flag each element of `annotation` that mentions UTR, Intron or Exon
# (case-insensitive).
#
# annotation: a vector of annotation labels.
# Returns an integer vector of the same length: 1L where the element
# matches, 0L where it does not.
flag_annot <- function(annotation) {
  matched <- grepl("UTR|Intron|Exon", annotation, ignore.case = TRUE)
  as.integer(matched)
}

然后像下面这样使用lapply:

# Append an `intragenic` column to every data frame in the list.
# transform() evaluates `annotation` inside each data frame.
lapply(
  grange_list,
  function(peaks) {
    transform(peaks, intragenic = flag_annot(annotation))
  }
)

tidyverse方法

library(tidyverse)
# Add an `intragenic` column to a single data frame.
#
# x: a data frame with an `annotation` column of text labels.
# Returns `x` with an extra numeric `intragenic` column: 1 where the
# annotation mentions UTR, Intron or Exon, 0 where it does not.
flag_annot <- function(x) {
  # str_detect() is case-sensitive by default; regex(ignore_case = TRUE)
  # makes it match the same way as grepl(..., ignore.case = TRUE).
  pattern <- regex("UTR|Intron|Exon", ignore_case = TRUE)
  x$intragenic <- if_else(str_detect(x$annotation, pattern), 1, 0)
  x
}

# Apply the function to every data frame in the list.
map(grange_list, flag_annot)

最新更新