R:如何根据数字向量对数据帧进行排序



我想使用TPM值计算thyroidtestes数据帧之间的倍数变化,并提供睾丸组织中过表达的前10个基因(testes数据帧中的testes$gene_id(。

在下面的代码中,我首先计算了折叠变化并将其存储为数字向量tpm.foldchange,但随后我不知道如何根据排序的折叠变化值tpm.foldchangetestes数据帧的gene_id列进行排序。

# Parse the gene results file from the testes and thyroid output
thyroid <- read.table("thyroid.genes.results", header=T, sep="t")
testes <- read.table("testes.genes.results", header=T, sep="t")
# Extract the TPM values
# Add one to each value and log them (base 2)
library(tidyverse)
thyroid.tpm <- log(thyroid %>% pull(TPM) + 1)
testes.tpm <- log(testes %>% pull(TPM) + 1)
# Pearson's correlation coefficient between thyroid and testes using TPM
cor(thyroid.tpm, testes.tpm, method="pearson")
# Calculate fold change between the testes and thyroid tissue TPM values and provide top 10 genes that are overexpressed in testes
library(gtools)
tpm.foldchange <- foldchange(testes.tpm, thyroid.tpm)
#tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.foldchange, decreasing=T)
tpm.sortedgenes <- testes[order(factor(testes$TPM, levels=tpm.sorted)),]
tpm.top10genes <- head(tpm.sortedgenes, 10)
testes[order(factor(testes$TPM, levels=tpm.sorted)),]

我最初想像这样合并后进行排序:

tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.df$tpm.foldchange, decreasing=T)

但它引发了一个错误:

Error: cannot allocate vector of size 8.0 Gb

thyroid数据帧:

# Show only the first 20 rows, first column, and 6th column of thyroid dataframe
dput(thyroid[1:20, c(1,6)])
structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233", 
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA", 
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582", 
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L", 
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1", 
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(0, 45.96, 
2.72, 2.4, 1.67, 5.14, 4.33, 47.68, 81.1, 10.12, 0.96, 45.21, 
0, 19.63, 15.06, 0.49, 21.76, 12.16, 19.37, 5.3)), row.names = c(NA, 
20L), class = "data.frame")

testes数据帧:

# Show only the first 20 rows, first column, and 6th column of testes dataframe
dput(testes[1:20, c(1,6)])
structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233", 
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA", 
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582", 
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L", 
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1", 
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(2.33, 47.56, 
9.45, 2.03, 3.09, 0.11, 3.73, 28.52, 120.65, 6.89, 1.38, 30.89, 
0, 20.39, 13.66, 0.59, 9.62, 22.04, 7.42, 2.53)), row.names = c(NA, 
20L), class = "data.frame")

根据Akrun的评论,我尝试了:

library(gtools)
tpm.foldchange <- foldchange(thyroid.tpm, testes.tpm)
testes.sorted <- testes %>% 
left_join(thyroid, by="gene_id") %>%
mutate(TPM=testes.tpm, tpm.foldchange, .keep="unused") %>%
slice_max(n=10, order_by=tpm.foldchange)

输出:

> dim(testes.sorted)
[1] 304  15
> dput(testes.sorted[1:10,])
structure(list(gene_id = c("gene10075_LOC101927056", "gene10311_A4GNT", 
"gene10394_SLC9A9-AS1", "gene10504_SUCNR1", "gene10511_TMEM14E", 
"gene10798_LOC102724550", "gene10990_FLJ42393", "gene11054_DPPA2P3", 
"gene11065_GP5", "gene11400_USP17L12"), transcript_id.s..x = c("rna28860_NR_125396.1,rna28861_NR_125395.1", 
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4", 
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1", 
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.x = c(659, 1771, 518, 1650, 1293, 2957, 2266, 1146, 
3493, 1593), effective_length.x = c(413.57, 1525.5, 272.62, 1404.5, 
1047.5, 2711.5, 2020.5, 900.5, 3247.5, 1347.5), expected_count.x = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0.12), TPM.x = c(0, 0, 0, 0, 0, 0, 0, 
0, 0, 0), FPKM.x = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), transcript_id.s..y = c("rna28860_NR_125396.1,rna28861_NR_125395.1", 
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4", 
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1", 
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.y = c(796, 1771, 518, 1650, 1293, 2957, 2266, 1146, 
3493, 1593), effective_length.y = c(535.05, 1510.04, 257.15, 
1389.04, 1032.04, 2696.04, 2005.04, 885.04, 3232.04, 1332.04), 
expected_count.y = c(9, 3, 2, 233, 2, 2, 36, 2, 35, 1.91), 
TPM.y = c(0.58, 0.07, 0.27, 5.8, 0.07, 0.03, 0.62, 0.08, 
0.37, 0.05), FPKM.y = c(0.29, 0.03, 0.14, 2.94, 0.03, 0.01, 
0.31, 0.04, 0.19, 0.03), TPM = c(0, 0, 0, 0, 0, 0, 0, 0, 
0, 0), tpm.foldchange = c(Inf, Inf, Inf, Inf, Inf, Inf, Inf, 
Inf, Inf, Inf)), row.names = c(NA, 10L), class = "data.frame")

此代码返回一个具有(30415(维度的数据帧。但我只是在寻找前十个基因。此外,请注意,thyroid.tpm是log2转换的TPM值。

如果我们想按foldchange订购,请先进行连接,然后根据"TPM"列之间的折叠变化进行arrange

library(dplyr)
library(gtools)
testes2 <- testes %>% 
left_join(thyroid,  by = 'gene_id') %>% 
mutate(across(starts_with("TPM"), ~ log(.x + 1),
.names = "tpm_{.col}")) %>% 
mutate(foldchange = foldchange(tpm_TPM.x, tpm_TPM.y)) %>% 
filter(is.finite(foldchange)) %>%
arrange(tpm_TPM.x) %>%
dplyr::select(gene_id, TPM = TPM.x,  foldchange) %>%
slice_head(n = 10)

如果我们想选择前10个折页更改行,请使用slice_max

testes %>%
left_join(thyroid,  by = 'gene_id') %>% 
mutate(TPM = TPM.x, foldchange = foldchange(log(TPM.x + 1), log(TPM.y + 1)),
.keep = "unused") %>% 
filter(is.finite(foldchange)) %>%
slice_max(n = 10, order_by = foldchange, with_ties = FALSE)

-输出

gene_id    TPM foldchange
1        gene100_C1orf233   9.45   1.786222
2          gene10000_CD86   3.09   1.434249
3  gene10007_LOC102723582   1.38   1.288517
4       gene10016_HSPBAP1  22.04   1.217311
5        gene10012_PARP15   0.59   1.162893
6       gene10005_FAM162A 120.65   1.089205
7         gene10010_PARP9  20.39   1.011953
8            gene1_WASH7P  47.56   1.008704
9         gene10011_DTX3L  13.66  -1.033968
10         gene10003_CSTA   3.73  -1.076854

merge导致内存错误,因为它是在两个向量上完成的,创建了笛卡尔连接

相关内容

  • 没有找到相关文章

最新更新