我有数百个(*.txt 格式的)数据文件,其中 A 列为“Gene IDs”(基因 ID),B 列为“counts”(计数)。我想按“Gene IDs”把所有文件合并成一个 *.csv 文件,并在该 *.csv 中用相应的 *.txt 文件名来命名后续的计数列(B 列、C 列、D 列等)。请帮我做这件事。
*.txt格式的输入文件示例:
# Example input 1: head() of Sample_File_1 as emitted by dput() -- a
# `Gene IDs` character column plus one integer counts column for this sample.
Sample_File_1
dput(head(Sample_File_1))
structure(list(`Gene IDs` = c("ENSG00000000003", "ENSG00000000005",
"ENSG00000000419", "ENSG00000000457", "ENSG00000000460", "ENSG00000000938"
), Sample_File_1.counts = c(0L, 0L, 8L, 10L, 1L, 242L)), row.names = c(NA,
6L), class = "data.frame")
# Example input 2: same `Gene IDs` values, counts column named for sample 2.
Sample_File_2
dput(head(Sample_File_2))
structure(list(`Gene IDs` = c("ENSG00000000003", "ENSG00000000005",
"ENSG00000000419", "ENSG00000000457", "ENSG00000000460", "ENSG00000000938"
), Sample_File_2.counts = c(0L, 0L, 18L, 21L, 3L, 413L)), row.names = c(NA,
6L), class = "data.frame")
# Example input 3: same `Gene IDs` values, counts column named for sample 3.
Sample_File_3
dput(head(Sample_File_3))
structure(list(`Gene IDs` = c("ENSG00000000003", "ENSG00000000005",
"ENSG00000000419", "ENSG00000000457", "ENSG00000000460", "ENSG00000000938"
), Sample_File_3.counts = c(0L, 0L, 24L, 13L, 2L, 400L)), row.names = c(NA,
6L), class = "data.frame")
# Example input 4: same `Gene IDs` values, counts column named for sample 4.
Sample_File_4
dput(head(Sample_File_4))
structure(list(`Gene IDs` = c("ENSG00000000003", "ENSG00000000005",
"ENSG00000000419", "ENSG00000000457", "ENSG00000000460", "ENSG00000000938"
), Sample_File_4.counts = c(0L, 0L, 7L, 7L, 0L, 403L)), row.names = c(NA,
6L), class = "data.frame")
输出文件示例:
library(tidyverse)
# Fold the four sample tables together with successive inner joins on the
# shared "Gene IDs" key, so each sample contributes one counts column.
Combined_inner_join <- reduce(
  list(Sample_File_1, Sample_File_2, Sample_File_3, Sample_File_4),
  inner_join,
  by = "Gene IDs"
)
# Show the first rows of the combined table in dput() form; the printed
# result follows.
dput(head(Combined_inner_join))
structure(list(`Gene IDs` = c("ENSG00000000003", "ENSG00000000005",
"ENSG00000000419", "ENSG00000000457", "ENSG00000000460", "ENSG00000000938"
), Sample_File_1.counts = c(0L, 0L, 8L, 10L, 1L, 242L), Sample_File_2.counts = c(0L,
0L, 18L, 21L, 3L, 413L), Sample_File_3.counts = c(0L, 0L, 24L,
13L, 2L, 400L), Sample_File_4.counts = c(0L, 0L, 7L, 7L, 0L,
403L)), row.names = c(NA, 6L), class = "data.frame")
谢谢你,
Toufiq
所以这比我最初想象的要简单得多。如果你先把所有文件读入,就可以使用 mget 从全局环境中取回这些对象,并把它们放进一个列表。然后可以使用 reduce 和 inner_join 得到所需的合并结果。我想列名已经是你想要的形式了;如果你需要不同的列名方式,请告诉我。
好吧,我在下面的编辑应该可以做到。这绝对不是最高效的方法,只是我找到的一种方法。让我知道它是否适合你。根据文本文件的保存方式,在读入所有文件时,您可能需要更改 read_delim 中的 delim 选项。
这种方法的好处是,您不需要从环境中调用文件,因为您只需将它们读取到列表中即可。
library(tidyverse)

# Merge every per-sample count file in one directory into a single CSV.
# Each input is expected to be a header-less, tab-separated file whose first
# column is the gene ID and whose second column is that sample's count.

# Full paths are requested so the files can be read from any working directory.
all_files <- list.files("~/Documents/Research/test_dir", full.names = TRUE)
if (length(all_files) == 0) {
  stop("No input files found in ~/Documents/Research/test_dir", call. = FALSE)
}

# Read every file. The delimiter must be the tab escape "\t"; a bare "t"
# would split each line on the letter t. With col_names = FALSE readr names
# the columns X1, X2, ...
file_list <- map(all_files, read_delim, delim = "\t", col_names = FALSE)

# Counts-column label for each file: the file name without directory or
# extension. basename() makes this independent of how deeply the directory
# is nested and of dots appearing elsewhere in the path.
col_names <- tools::file_path_sans_ext(basename(all_files))

# Rename the key column to `gene_ids` and the second column to the file label.
file_list_named <- map2(file_list, col_names, function(tbl, label) {
  tbl <- rename(tbl, gene_ids = X1)
  colnames(tbl)[2] <- label
  tbl
})

# Successive inner joins keep only the genes present in every file.
final_df <- file_list_named %>% reduce(inner_join, by = "gene_ids")

# "pat_to_file/file.csv" is a placeholder; replace it with the real output path.
write_csv(final_df, "pat_to_file/file.csv", col_names = TRUE)
由 reprex 包(v0.3.0)创建于 2020-11-25
试试这个自定义函数,看看是否有效,
#' Merge all count files in a directory into one table keyed by gene ID.
#'
#' Every file is read as a header-less, tab-separated table with exactly two
#' columns (gene ID, count). The columns are renamed to `GeneIDs` and
#' `Sample_File_<i> Counts`, where `<i>` is the file's position in the
#' listing, and the tables are merged on `GeneIDs`, keeping the genes that
#' appear in every file.
#'
#' @param directory Directory holding the input files. Defaults to the
#'   current working directory.
#' @param pattern Optional regular expression forwarded to `list.files()` to
#'   restrict which files are read (for example `"\\.txt$"`). `NULL`, the
#'   default, reads every file in `directory`.
#' @return The merged data frame. It is printed and returned invisibly, so
#'   `compiled <- readblk()` captures it.
readblk <- function(directory = getwd(), pattern = NULL) {
  # full.names = TRUE so the files can be opened when `directory` is not the
  # working directory; sub-directories are skipped because they are not tables.
  lst <- list.files(directory, pattern = pattern, full.names = TRUE)
  lst <- lst[!dir.exists(lst)]
  if (length(lst) == 0) {
    stop("No input files found in ", directory, call. = FALSE)
  }

  tables <- lapply(seq_along(lst), function(i) {
    # The separator is the tab escape "\t"; a bare "t" would split on the
    # letter t instead.
    tm <- read.csv(lst[i], sep = "\t", header = FALSE)
    colnames(tm) <- c("GeneIDs", paste0("Sample_File_", i, " Counts"))
    tm
  })

  # Fold locally rather than probing for an `out` variable with exists(),
  # which would also pick up an unrelated `out` from the global environment.
  out <- Reduce(function(acc, tm) merge(acc, tm, by = "GeneIDs"), tables)
  print(out)
}
# Merge the count files found in the current working directory (readblk()'s
# default when no directory is given).
compiled <- readblk()
# row.names = FALSE keeps the automatic 1..n row index out of the CSV, so the
# file holds only the gene-ID column followed by the per-sample count columns.
write.csv(compiled, 'compiled.csv', row.names = FALSE)
如果保存文件的目录是您的工作目录,则无需输入任何内容。否则,请添加保存文件的目录的路径。我也重命名了标题。