我有一个看起来像这样的数据帧:
PC1 PC2 PC3 PC4 PC5
TGFBI 0.05105828 0.04259313 0.001931321 -0.014739818 -0.023630707
OLFM4 -0.01093027 -0.07768484 -0.056874220 -0.011045101 -0.009762945
CD177 -0.03543697 -0.05385507 -0.073857334 0.024278031 0.009186970
LCN2 -0.01539785 -0.07883489 -0.036711308 0.014314281 -0.010042372
CEACAM8 -0.01130204 -0.08092082 0.004004276 -0.003607281 -0.008600472
这是 myPCAdf$rotation 的输出。我想按绝对值(abs)对每个主成分(PC)的载荷进行排序,然后取出排在前 n 位的基因。如果我执行:
# Attempt: take the absolute values of the PC4 column, sort them in decreasing
# order, keep the first 50 entries and wrap the result in a data.frame.
# abs() is applied before sorting, so the original signs are not retained.
data.frame(sort(abs(myPCAdf$rotation[,"PC4"]), decreasing=TRUE)[1:50])
我得到的只是一列丢失了基因名称的数值。有人能帮我得到一个 data.frame 吗?其中每个 PC 对应两列:第一列是基因名称,第二列是相应的数值,并且各行按第二列的数值排序。
提前谢谢你。
期望的输出(简短示例):
PC1
TGFBI 0.05105828
HLA-DPB1 0.04843209
OLFM4 -0.01093027
CEACAM8 -0.01130204
LCN2 -0.01539785
CD177 -0.03543697
.............
你也可以尝试一个经典的 for 循环:
# Build one sorted (gene, loading) table per PC column, then bind the tables
# side by side. `df` holds the loadings: one row per gene (gene names as row
# names) and one column per PC.
listed <- list() # one data.frame per PC, keyed by the PC's column name
for (i in colnames(df)) {
  # Select the column by name with df[, i]: this works for a matrix as well as
  # for a data.frame, whereas names(df) is NULL for a matrix.
  # row.names = NULL keeps plain integer row names whatever the input type.
  dats <- data.frame(gene = rownames(df), pc = df[, i], row.names = NULL)
  # Largest absolute value first; the signed value itself is kept.
  listed[[i]] <- dats[order(abs(dats$pc), decreasing = TRUE), ]
}
# Resulting columns are named <PC>.gene and <PC>.pc.
do.call(cbind, listed)
PC1.gene PC1.pc PC2.gene PC2.pc PC3.gene PC3.pc PC4.gene
1 TGFBI 0.05105828 CEACAM8 -0.08092082 CD177 -0.073857334 CD177
3 CD177 -0.03543697 LCN2 -0.07883489 OLFM4 -0.056874220 TGFBI
4 LCN2 -0.01539785 OLFM4 -0.07768484 LCN2 -0.036711308 LCN2
5 CEACAM8 -0.01130204 CD177 -0.05385507 CEACAM8 0.004004276 OLFM4
2 OLFM4 -0.01093027 TGFBI 0.04259313 TGFBI 0.001931321 CEACAM8
PC4.pc PC5.gene PC5.pc
1 0.024278031 TGFBI -0.023630707
3 -0.014739818 LCN2 -0.010042372
4 0.014314281 OLFM4 -0.009762945
5 -0.011045101 CD177 0.009186970
2 -0.003607281 CEACAM8 -0.008600472
通过基本 R 的一个想法是拆分每一列,将行名转换为列,排序并将它们重新绑定在一起,即
# Split df into one single-column data.frame per PC, move the gene names from
# the row names into an `rn` column, sort each piece by absolute value
# (largest first), and bind the pieces back together side by side.
do.call(
  cbind,
  lapply(split.default(df, seq_len(ncol(df))), function(i) {
    i$rn <- rownames(i)
    rownames(i) <- NULL
    # i[[1]] extracts the numeric column as a plain vector. Ordering the
    # one-column data.frame i[1] instead goes through xtfrm() on a data
    # frame, which newer R versions warn about.
    i[order(abs(i[[1]]), decreasing = TRUE), ]
  })
)
结果如下:
        1.PC1    1.rn       2.PC2    2.rn        3.PC3    3.rn        4.PC4    4.rn        5.PC5    5.rn
1  0.05105828   TGFBI -0.08092082 CEACAM8 -0.073857334   CD177  0.024278031   CD177 -0.023630707   TGFBI
3 -0.03543697   CD177 -0.07883489    LCN2 -0.056874220   OLFM4 -0.014739818   TGFBI -0.010042372    LCN2
4 -0.01539785    LCN2 -0.07768484   OLFM4 -0.036711308    LCN2  0.014314281    LCN2 -0.009762945   OLFM4
5 -0.01130204 CEACAM8 -0.05385507   CD177  0.004004276 CEACAM8 -0.011045101   OLFM4  0.009186970   CD177
2 -0.01093027   OLFM4  0.04259313   TGFBI  0.001931321   TGFBI -0.003607281 CEACAM8 -0.008600472 CEACAM8
使用 tidyverse 的解决方案可能如下所示:
# Example data: five genes (row names) with their values in columns PC1-PC5.
f <- data.frame(
  PC1 = c(0.05105828, -0.01093027, -0.03543697, -0.01539785, -0.01130204),
  PC2 = c(0.04259313, -0.07768484, -0.05385507, -0.07883489, -0.08092082),
  PC3 = c(0.001931321, -0.05687422, -0.073857334, -0.036711308, 0.004004276),
  PC4 = c(-0.014739818, -0.011045101, 0.024278031, 0.014314281, -0.003607281),
  PC5 = c(-0.023630707, -0.009762945, 0.00918697, -0.010042372, -0.008600472),
  row.names = c("TGFBI", "OLFM4", "CD177", "LCN2", "CEACAM8")
)
library(tibble) # rownames_to_column()
library(dplyr)  # %>%, arrange(), desc(), select(), all_of()
library(purrr)  # map()

## first transform row names to a column; called without a `var` argument the
## new column gets the default name "rowname" (passing `f` again, as in
## `rownames_to_column(f)`, would hand the data frame to `var` and fail)
ft <- f %>% rownames_to_column()
## then loop through all PC columns and arrange by abs(.);
## the result is a list holding one two-column data frame per PC
map(names(ft)[-1], function(pc) {
  ft %>%
    arrange(desc(abs(.data[[pc]]))) %>%
    select(rowname, all_of(pc))
})
如果您只想保留前几行(例如前 10 行),请在管道末尾添加 %>% slice(1:10)
输出
[[1]]
rowname PC1
1 TGFBI 0.05105828
2 CD177 -0.03543697
3 LCN2 -0.01539785
4 CEACAM8 -0.01130204
5 OLFM4 -0.01093027
[[2]]
rowname PC2
1 CEACAM8 -0.08092082
2 LCN2 -0.07883489
3 OLFM4 -0.07768484
4 CD177 -0.05385507
5 TGFBI 0.04259313
[[3]]
rowname PC3
1 CD177 -0.073857334
2 OLFM4 -0.056874220
3 LCN2 -0.036711308
4 CEACAM8 0.004004276
5 TGFBI 0.001931321
[[4]]
rowname PC4
1 CD177 0.024278031
2 TGFBI -0.014739818
3 LCN2 0.014314281
4 OLFM4 -0.011045101
5 CEACAM8 -0.003607281
[[5]]
rowname PC5
1 TGFBI -0.023630707
2 LCN2 -0.010042372
3 OLFM4 -0.009762945
4 CD177 0.009186970
5 CEACAM8 -0.008600472
考虑将数据重塑为长格式,这是大多数数据科学操作(聚合、数据合并、统计检验、绘图、建模等)的常用结构。无需循环:
# RESHAPE WIDE TO LONG
# RESHAPE WIDE TO LONG
# NOTE(review): every column except the last one is treated as a PC column, so
# df presumably carries the gene names in its last column -- confirm against
# the code that builds df.
pc_cols <- names(df)[-ncol(df)]
# Supply exactly one row name per output row; a fixed 1:1E5 pool is too short
# once nrow(df) * length(pc_cols) exceeds 1e5.
rdf <- reshape(df, varying = list(pc_cols), times = pc_cols,
               v.names = "value", timevar = "PC",
               new.row.names = seq_len(nrow(df) * length(pc_cols)),
               direction = "long")

# ORDER BY INDICATOR AND VALUE COLUMNS
# PCs keep the order of the original columns (so "PC10" comes after "PC9"
# rather than being sorted as text); within each PC the largest absolute
# value comes first.
rdf <- rdf[order(match(rdf$PC, pc_cols), -abs(rdf$value)), ]

# RE-ASSIGN id AS SEQUENCE COLUMN
# id becomes the rank (1 = largest absolute value) within each PC.
rdf$id <- with(rdf, ave(value, PC, FUN = seq_along))
row.names(rdf) <- NULL
输出
rdf
# gene PC value id
# 1 TGFBI PC1 0.051058280 1
# 2 CD177 PC1 -0.035436970 2
# 3 LCN2 PC1 -0.015397850 3
# 4 CEACAM8 PC1 -0.011302040 4
# 5 OLFM4 PC1 -0.010930270 5
# 6 CEACAM8 PC2 -0.080920820 1
# 7 LCN2 PC2 -0.078834890 2
# 8 OLFM4 PC2 -0.077684840 3
# 9 CD177 PC2 -0.053855070 4
# 10 TGFBI PC2 0.042593130 5
# 11 CD177 PC3 -0.073857334 1
# 12 OLFM4 PC3 -0.056874220 2
# 13 LCN2 PC3 -0.036711308 3
# 14 CEACAM8 PC3 0.004004276 4
# 15 TGFBI PC3 0.001931321 5
# 16 CD177 PC4 0.024278031 1
# 17 TGFBI PC4 -0.014739818 2
# 18 LCN2 PC4 0.014314281 3
# 19 OLFM4 PC4 -0.011045101 4
# 20 CEACAM8 PC4 -0.003607281 5
# 21 TGFBI PC5 -0.023630707 1
# 22 LCN2 PC5 -0.010042372 2
# 23 OLFM4 PC5 -0.009762945 3
# 24 CD177 PC5 0.009186970 4
# 25 CEACAM8 PC5 -0.008600472 5