R - How to make a loop and append more efficient



I have this code:

library(dplyr)
library(splitstackshape)

datalist <- list()
files <- list.files("/WEIGHTS1/Retina", pattern = ".RDat", ignore.case = TRUE)
for (i in files) {
  a <- get(load(i))                      # load the object stored in the .RDat file
  names <- rownames(a)
  data <- as.data.frame(cbind(names, a)) # move the rownames into a "names" column
  rownames(data) <- NULL
  # split "names" on ":" into names_1..names_4, dropping incomplete rows
  dd <- na.omit(concat.split.multiple(data = data, split.cols = c("names"), seps = ":"))
  dd <- select(dd, names_1, blup, names_3, names_4)
  colnames(dd) <- c("rsid", "weight", "ref_allele", "eff_allele")
  dd$WGT <- i
  datalist[[i]] <- dd                    # add it to the list
}
big_data <- do.call(rbind, datalist)

This loop has to iterate over 17,345 RDat files, each with roughly 10,000 rows. All the RDat files can be downloaded from here; they are compressed in the archive GSE115828_retina_TWAS_wgts.tar.gz. A subset of 3 of the .RDat files is here.

For i = "retina.ENSG00000135776.wgt.RDat":

> head(dd)
rsid        weight ref_allele eff_allele
1:  rs72763981  9.376766e-09          C          G
2: rs144383755 -2.093346e-09          A          G
3:   rs1925717  1.511376e-08          T          C
4:  rs61827307 -1.625302e-08          C          A
5:  rs61827308 -1.625302e-08          G          C
6: rs199623136 -9.128354e-10         GC          G
WGT
1: retina.ENSG00000135776.wgt.RDat
2: retina.ENSG00000135776.wgt.RDat
3: retina.ENSG00000135776.wgt.RDat
4: retina.ENSG00000135776.wgt.RDat
5: retina.ENSG00000135776.wgt.RDat
6: retina.ENSG00000135776.wgt.RDat

> head(data)
names               top1          blup lasso enet
1  rs72763981:228705421:C:G  0.972975476445267  9.376766e-09     0    0
2 rs144383755:228705758:A:G -0.274895726835564 -2.093346e-09     0    0
3                 rs1925716 -0.739883956565433 -1.993259e-08     0    0
4   rs1925717:228707734:T:C  0.725883147262975  1.511376e-08     0    0
5  rs61827307:228708434:C:A -0.783489562399769 -1.625302e-08     0    0
6  rs61827308:228708526:G:C -0.783489562399769 -1.625302e-08     0    0

If I load just a single .RDat file:

i="retina.ENSG00000135776.wgt.RDat"
a<-get(load(i))
> head(a)
top1          blup lasso enet
rs72763981:228705421:C:G   0.9729755  9.376766e-09     0    0
rs144383755:228705758:A:G -0.2748957 -2.093346e-09     0    0
rs1925716                 -0.7398840 -1.993259e-08     0    0
rs1925717:228707734:T:C    0.7258831  1.511376e-08     0    0
rs61827307:228708434:C:A  -0.7834896 -1.625302e-08     0    0
rs61827308:228708526:G:C  -0.7834896 -1.625302e-08     0    0

As you can see, from the "names" column I am creating 3 separate columns: "rsid", "ref_allele", and "eff_allele". This loop takes a very long time to execute. Is there a way to make it faster?
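For concreteness, this is the transformation being asked for, shown on one sample entry (a minimal base-R sketch; field 2, the chromosomal position, is the part that gets dropped):

x <- "rs72763981:228705421:C:G"
strsplit(x, ":")[[1]][-2]  # drop field 2 (the position)
# [1] "rs72763981" "C"          "G"
#      rsid         ref_allele   eff_allele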

I am trying @akrun's code below:

library(parallel)
library(data.table)
library(foreach)
library(doSNOW)

n <- parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")
doSNOW::registerDoSNOW(cl)
files <- list.files("/WEIGHTS1/Retina", pattern = ".RDat", ignore.case = TRUE)
lst_out <- foreach::foreach(i = seq_along(files),
                            .packages = c("data.table")) %dopar% {
  a <- get(load(files[i]))
  names <- rownames(a)
  if ("blup" %in% colnames(a)) {
    data <- data.table(names, a["blup"])
    nm1 <- c("rsid", "ref_allele", "eff_allele")
    data[, (nm1) := tstrsplit(names, ":")[-2]]
    out <- data[, .(rsid, weight = blup, ref_allele, eff_allele)][,
      WGT := files[i]][]
  } else {
    data <- data.table(names)
    nm1 <- c("rsid", "ref_allele", "eff_allele")
    data[, (nm1) := tstrsplit(names, ":")[-2]]
    out <- data[, .(rsid, ref_allele, eff_allele)][,
      WGT := files[i]][]
  }
  return(out)
  rm(data, a)  # note: never reached, return() exits the block first
  gc()
}
Error in { : task 12 failed - "object 'blup' not found"

big_data <- rbindlist(lst_out)      
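A likely cause of that error (an assumption, since it depends on the class of the loaded object): when get(load(...)) returns a matrix rather than a data frame, a["blup"] indexes the matrix linearly instead of selecting a column, yielding NA, so the data.table never gets a blup column. A minimal demonstration:

m <- matrix(1:4, nrow = 2, dimnames = list(NULL, c("top1", "blup")))
m["blup"]                 # NA: single-bracket [ on a matrix ignores column names
m[, "blup"]               # 3 4: column selection needs the comma
as.data.frame(m)["blup"]  # works: data.frame [ selects the column by name

That is why the accepted version below coerces with as.data.frame() before indexing.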

There are a couple of ways to make this faster: 1) one option is tstrsplit from data.table, and 2) running the loop in parallel:

library(parallel)
library(foreach)
library(doSNOW)
library(data.table)

n <- parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")
doSNOW::registerDoSNOW(cl)
files <- list.files("/WEIGHTS1/Retina", pattern = ".RDat", ignore.case = TRUE)
lst_out <- foreach::foreach(i = seq_along(files),
                            .packages = c("data.table")) %dopar% {
  # coerce to data.frame so that a["blup"] selects the column by name
  tmp <- as.data.frame(get(load(files[i])))
  a <- data.table::copy(tmp)
  rm(tmp)
  gc()

  names <- rownames(a)
  if ("blup" %in% colnames(a)) {
    data <- data.table(names, a["blup"])
    nm1 <- c("rsid", "ref_allele", "eff_allele")
    data[, (nm1) := tstrsplit(names, ":")[-2]]
    out <- data[, .(rsid, weight = blup, ref_allele, eff_allele)][,
      WGT := files[i]][]
  } else {
    # this file has no blup column: return only the split columns
    data <- data.table(names)
    nm1 <- c("rsid", "ref_allele", "eff_allele")
    data[, (nm1) := tstrsplit(names, ":")[-2]]
    out <- data[, .(rsid, ref_allele, eff_allele)][,
      WGT := files[i]][]
  }
  rm(data)
  gc()
  out  # the last value of the block is what foreach collects
}
parallel::stopCluster(cl)
big_data <- rbindlist(lst_out, fill = TRUE)  # fill = TRUE pads the missing weight column with NA
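If spinning up a cluster is not worthwhile (few cores, or disk reads dominate), the same logic also runs sequentially. A minimal sketch, assuming the files follow the naming above and each file has at least one fully annotated rsid:pos:ref:eff entry:

library(data.table)

files <- list.files("/WEIGHTS1/Retina", pattern = "\\.RDat$", full.names = TRUE)
lst_out <- lapply(files, function(f) {
  a <- as.data.frame(get(load(f)))
  dt <- data.table(names  = rownames(a),
                   weight = if ("blup" %in% names(a)) a$blup else NA_real_)
  dt[, c("rsid", "ref_allele", "eff_allele") := tstrsplit(names, ":")[-2]]
  dt[, .(rsid, weight, ref_allele, eff_allele, WGT = basename(f))]
})
big_data <- rbindlist(lst_out)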

-testing

a1 <- structure(list(top1 = c(0.9729755, -0.2748957, -0.739884, 0.7258831, 
-0.7834896, -0.7834896), blup = c(9.376766e-09, -2.093346e-09, 
-1.993259e-08, 1.511376e-08, -1.625302e-08, -1.625302e-08), lasso = c(0L, 
0L, 0L, 0L, 0L, 0L), enet = c(0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c("rs72763981:228705421:C:G", 
"rs144383755:228705758:A:G", "rs1925716", "rs1925717:228707734:T:C", 
"rs61827307:228708434:C:A", "rs61827308:228708526:G:C"))

lst1 <- replicate(16, a1, simplify = FALSE)
file_nm <- sprintf("retina.ENSG00000%d.wgt.RDat", 135776:135791)
library(foreach)
library(parallel)
library(data.table)
n <-  parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")   
doSNOW::registerDoSNOW(cl)

lst_out <- foreach::foreach(i = seq_along(lst1),
                            .packages = c("data.table")) %dopar% {
  a <- lst1[[i]]
  names <- rownames(a)
  data <- data.table(names, a["blup"])  # a is a data.frame here, so this works
  nm1 <- c("rsid", "ref_allele", "eff_allele")
  data[, (nm1) := tstrsplit(names, ":")[-2]]
  data[, .(rsid, weight = blup, ref_allele, eff_allele)][, WGT := file_nm[i]][]
}
parallel::stopCluster(cl)
big_data <- rbindlist(lst_out) 
dim(big_data)
#[1] 96  5
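The 96 rows are the 16 replicated test files × 6 rows each, and the 5 columns are rsid, weight, ref_allele, eff_allele, and WGT.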
