我有这个代码:
# Read every .RDat weight file, split the row names ("rsid:pos:ref:eff")
# into separate columns, and stack everything into one data frame.
library(dplyr)
library(splitstackshape)

# FIX 1: full.names = TRUE — otherwise list.files() returns bare file
#        names and load(i) looks in the current working directory, not
#        in /WEIGHTS1/Retina.
# FIX 2: pattern is a regex; "\\.RDat$" anchors the extension (an
#        unescaped "." matches any character).
files <- list.files("/WEIGHTS1/Retina", pattern = "\\.RDat$",
                    full.names = TRUE, ignore.case = TRUE)

# Preallocate the result list instead of growing it by name each pass.
datalist <- vector("list", length(files))

for (i in seq_along(files)) {
  a <- get(load(files[i]))
  # "snp" avoids shadowing base::names(); the split columns below are
  # therefore snp_1 .. snp_4 instead of names_1 .. names_4.
  snp <- rownames(a)
  data <- as.data.frame(cbind(snp, a))
  rownames(data) <- NULL
  # Split on ":"; na.omit() drops rows whose row name lacks the full
  # 4-part form (e.g. the bare "rs1925716" seen in the sample data).
  dd <- na.omit(concat.split.multiple(data = data, split.cols = "snp",
                                      seps = ":"))
  dd <- select(dd, snp_1, blup, snp_3, snp_4)
  colnames(dd) <- c("rsid", "weight", "ref_allele", "eff_allele")
  # basename() keeps the WGT column identical to the original output
  # (file name only, no directory) now that full paths are used.
  dd$WGT <- basename(files[i])
  datalist[[i]] <- dd
}

big_data <- do.call(rbind, datalist)
这个循环必须遍历 17345 个 RDat 文件,每个文件大约有 10000 行。所有 RDat 文件都可以从这里下载,它们被压缩在 GSE115828_retina_TWAS_wgts.tar.gz 这个文件中;其中 3 个 .RDat 文件的子集在这里。
对于 i="retina.ENSG00000135776.wgt.RDat":
> head(dd)
rsid weight ref_allele eff_allele
1: rs72763981 9.376766e-09 C G
2: rs144383755 -2.093346e-09 A G
3: rs1925717 1.511376e-08 T C
4: rs61827307 -1.625302e-08 C A
5: rs61827308 -1.625302e-08 G C
6: rs199623136 -9.128354e-10 GC G
WGT
1: retina.ENSG00000135776.wgt.RDat
2: retina.ENSG00000135776.wgt.RDat
3: retina.ENSG00000135776.wgt.RDat
4: retina.ENSG00000135776.wgt.RDat
5: retina.ENSG00000135776.wgt.RDat
6: retina.ENSG00000135776.wgt.RDat
和
> head(data)
names top1 blup lasso enet
1 rs72763981:228705421:C:G 0.972975476445267 9.376766e-09 0 0
2 rs144383755:228705758:A:G -0.274895726835564 -2.093346e-09 0 0
3 rs1925716 -0.739883956565433 -1.993259e-08 0 0
4 rs1925717:228707734:T:C 0.725883147262975 1.511376e-08 0 0
5 rs61827307:228708434:C:A -0.783489562399769 -1.625302e-08 0 0
6 rs61827308:228708526:G:C -0.783489562399769 -1.625302e-08 0 0
如果我只加载一个.RDat文件:
i="retina.ENSG00000135776.wgt.RDat"
a<-get(load(i))
> head(a)
top1 blup lasso enet
rs72763981:228705421:C:G 0.9729755 9.376766e-09 0 0
rs144383755:228705758:A:G -0.2748957 -2.093346e-09 0 0
rs1925716 -0.7398840 -1.993259e-08 0 0
rs1925717:228707734:T:C 0.7258831 1.511376e-08 0 0
rs61827307:228708434:C:A -0.7834896 -1.625302e-08 0 0
rs61827308:228708526:G:C -0.7834896 -1.625302e-08 0 0
我正在从 "names" 列创建 3 个独立的列:"rsid"、"ref_allele" 以及 "eff_allele"。这个循环需要很长时间才能执行。有没有办法让它更快?
我正在尝试@akrun代码如下:
# Parallel version: one task per file on a SOCK cluster, combined with
# rbindlist() at the end.
library(parallel)
library(data.table)
library(foreach)
library(doSNOW)

n <- parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")
doSNOW::registerDoSNOW(cl)

# "\\.RDat$" anchors the extension; full.names = TRUE lets load() find
# the files regardless of the current working directory.
files <- list.files("/WEIGHTS1/Retina", pattern = "\\.RDat$",
                    full.names = TRUE, ignore.case = TRUE)

lst_out <- foreach::foreach(i = seq_along(files),
                            .packages = c("data.table")) %dopar% {
  # FIX: the objects stored in the .RDat files are matrices, and
  # a["blup"] on a matrix is ROW-name indexing — that is what produced
  # the "task 12 failed - object 'blup' not found" error.  Coercing to
  # data.frame first makes a["blup"] select the column.
  a <- as.data.frame(get(load(files[i])))
  snp <- rownames(a)
  nm1 <- c("rsid", "ref_allele", "eff_allele")
  if ("blup" %in% colnames(a)) {
    dt <- data.table(snp, a["blup"])
    dt[, (nm1) := tstrsplit(snp, ":")[-2]]   # drop the position field
    out <- dt[, .(rsid, weight = blup, ref_allele, eff_allele)][
      , WGT := basename(files[i])][]
  } else {
    dt <- data.table(snp)
    dt[, (nm1) := tstrsplit(snp, ":")[-2]]
    out <- dt[, .(rsid, ref_allele, eff_allele)][
      , WGT := basename(files[i])][]
  }
  # FIX: in the original, rm()/gc() were placed AFTER return(out) and
  # never ran.  Clean up first, then make `out` the last expression —
  # foreach uses the block's value, no return() needed.
  rm(a)
  gc()
  out
}
big_data <- rbindlist(lst_out, fill = TRUE)
有几种方法可以更快地实现这一点:1) 一个选项是使用 data.table 中的 tstrsplit;2) 使用 parallel 并行执行。
# Faster pipeline: data.table::tstrsplit for the column split plus a
# SOCK cluster so the 17 345 files are processed in parallel.
library(parallel)
library(data.table)
# FIX: %dopar% is an infix operator from foreach; it must be attached
# (doSNOW:: calls are namespaced, but the operator is not).
library(foreach)
library(doSNOW)

n <- parallel::detectCores()
cl <- parallel::makeCluster(n, type = "SOCK")
doSNOW::registerDoSNOW(cl)

# Anchored regex + full paths so load() works from any working directory.
files <- list.files("/WEIGHTS1/Retina", pattern = "\\.RDat$",
                    full.names = TRUE, ignore.case = TRUE)

lst_out <- foreach::foreach(i = seq_along(files),
                            .packages = c("data.table")) %dopar% {
  # as.data.frame() so a["blup"] is column (not matrix row-name) indexing.
  a <- as.data.frame(get(load(files[i])))
  snp <- rownames(a)
  nm1 <- c("rsid", "ref_allele", "eff_allele")
  if ("blup" %in% colnames(a)) {
    dt <- data.table(snp, a["blup"])
    dt[, (nm1) := tstrsplit(snp, ":")[-2]]   # [-2] drops the position field
    out <- dt[, .(rsid, weight = blup, ref_allele, eff_allele)][
      , WGT := basename(files[i])][]
  } else {
    dt <- data.table(snp)
    dt[, (nm1) := tstrsplit(snp, ":")[-2]]
    out <- dt[, .(rsid, ref_allele, eff_allele)][
      , WGT := basename(files[i])][]
  }
  # FIX: the original put rm()/gc() after return(), so they never ran.
  rm(a)
  gc()
  out            # last expression is the value collected by foreach
}

parallel::stopCluster(cl)
# fill = TRUE: files without a "blup" column contribute no weight column.
big_data <- rbindlist(lst_out, fill = TRUE)
-测试
# --- Reproducible test with a mock weights object -----------------------
# One 6-row data frame whose row names carry the "rsid:pos:ref:eff" key
# (one row, "rs1925716", deliberately lacks the colon-separated parts).
a1 <- structure(list(top1 = c(0.9729755, -0.2748957, -0.739884, 0.7258831,
-0.7834896, -0.7834896), blup = c(9.376766e-09, -2.093346e-09,
-1.993259e-08, 1.511376e-08, -1.625302e-08, -1.625302e-08), lasso = c(0L,
0L, 0L, 0L, 0L, 0L), enet = c(0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c("rs72763981:228705421:C:G",
"rs144383755:228705758:A:G", "rs1925716", "rs1925717:228707734:T:C",
"rs61827307:228708434:C:A", "rs61827308:228708526:G:C"))

# Sixteen identical copies stand in for sixteen .RDat files.
lst1 <- replicate(16, a1, simplify = FALSE)
file_nm <- sprintf("retina.ENSG00000%d.wgt.RDat", 135776:135791)

library(foreach)
library(parallel)
library(data.table)

workers <- parallel::detectCores()
cl <- parallel::makeCluster(workers, type = "SOCK")
doSNOW::registerDoSNOW(cl)

lst_out <- foreach::foreach(k = seq_along(lst1),
                            .packages = c("data.table")) %dopar% {
  wgt_df <- lst1[[k]]
  snp_id <- rownames(wgt_df)
  # Keep the column literally named "names" so the split below can
  # reference it, exactly as in the real pipeline.
  dt <- data.table(names = snp_id, wgt_df["blup"])
  split_cols <- c("rsid", "ref_allele", "eff_allele")
  dt[, (split_cols) := tstrsplit(names, ":")[-2]]
  res <- dt[, .(rsid, weight = blup, ref_allele, eff_allele)]
  res[, WGT := file_nm[k]][]
}
parallel::stopCluster(cl)

big_data <- rbindlist(lst_out)
dim(big_data)
#[1] 96 5