我有两个数据帧:一个包含SNP及其位置的列表,另一个包含基因及其开始和结束坐标的列表。使用 dplyr,我想在 SNP 数据帧中添加一列,其中包含每个 SNP 所属的基因的名称(即 SNP 的位置在同一条染色体上,并且落在基因的开始/结束坐标之间,包括基因(。
如果SNP不属于任何基因坐标,它应该在基因列中得到"NA"。SNP和基因之间的染色体数必须匹配。例如,即使第二个SNP的位置落在Gene4的开始/结束坐标内,那也不是匹配的,因为它们在不同的染色体上。
单核苷酸单核苷单核苷酸 数据帧:
CHR POS REF ALT
01 5 C T
01 10 G A
02 5 G T
02 15 C A
02 20 T C
03 10 A G
03 20 C T
基因数据帧:
CHR START END GENE_NAME
01 2 8 Gene1
01 12 20 Gene2
01 25 30 Gene3
02 10 18 Gene4
02 25 35 Gene5
03 5 15 Gene6
期望输出:
CHR POS REF ALT GENE_NAME
01 5 C T Gene1
01 10 G A NA
02 5 G T NA
02 15 C A Gene4
02 20 T C NA
03 10 A G Gene6
03 20 C T NA
同样,我想使用 dplyr 完成此操作。提前感谢任何帮助!
使用 map2_chr
from purrr
的一个选项是根据POS
筛选GENE
数据帧,并从SNP
CHR
并选择相应的GENE_NAME
。
library(dplyr)
library(purrr)
SNP %>%
mutate(GENE_NAME = map2_chr(POS, CHR, function(x, y) {
inds = x >= GENE$START & x <= GENE$END & y == GENE$CHR
if (any(inds)) GENE$GENE_NAME[which.max(inds)] else NA
}))
# CHR POS REF ALT GENE_NAME
#1 1 5 C T Gene1
#2 1 10 G A <NA>
#3 2 5 G T <NA>
#4 2 15 C A Gene4
#5 2 20 T C <NA>
#6 3 10 A G Gene6
#7 3 20 C T <NA>
在基础 R 中,可以使用 mapply
mapply(function(x, y) {
inds = x >= GENE$START & x <= GENE$END & y == GENE$CHR
if (any(inds)) GENE$GENE_NAME[which.max(inds)] else NA
}, SNP$POS, SNP$CHR)
#[1] "Gene1" NA NA "Gene4" NA "Gene6" NA
数据
SNP <- structure(list(CHR = c(1L, 1L, 2L, 2L, 2L, 3L, 3L), POS = c(5L,
10L, 5L, 15L, 20L, 10L, 20L), REF = c("C", "G", "G", "C", "T",
"A", "C"), ALT = c("T", "A", "T", "A", "C", "G", "T")), class =
"data.frame", row.names = c(NA, -7L))
GENE <- structure(list(CHR = c(1L, 1L, 1L, 2L, 2L, 3L), START = c(2L,
12L, 25L, 10L, 25L, 5L), END = c(8L, 20L, 30L, 18L, 35L, 15L),
GENE_NAME = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5",
"Gene6")), class = "data.frame", row.names = c(NA, -6L))
这是dplyr
的一种方式。您只需根据START
和END
展开gene
数据帧,然后使用snp
left_join
-
snp %>%
left_join(
gene %>%
group_by(CHR, START, END) %>%
mutate(
POS = list(seq(START, END))
) %>%
unnest(),
by = c("CHR", "POS")
) %>%
select(CHR, POS, REF, ALT, GENE_NAME)
CHR POS REF ALT GENE_NAME
1 1 5 C T Gene1
2 1 10 G A <NA>
3 2 5 G T <NA>
4 2 15 C A Gene4
5 2 20 T C <NA>
6 3 10 A G Gene6
7 3 20 C T <NA>
这是一个使用non-equi
连接的选项data.table
library(data.table)
setDT(snp)[gene, GENE_NAME := GENE_NAME, on = .(CHR, POS >= START, POS <= END)]
snp
# CHR POS REF ALT GENE_NAME
#1: 1 5 C T Gene1
#2: 1 10 G A <NA>
#3: 2 5 G T <NA>
#4: 2 15 C A Gene4
#5: 2 20 T C <NA>
#6: 3 10 A G Gene6
#7: 3 20 C T <NA>
或与fuzzyjoin
library(fuzzyjoin)
library(dplyr)
fuzzy_left_join(snp, gene, by = c("CHR", "POS" = "START",
"POS" = "END"), match_fun = list(`==`, `>=`, `<=`)) %>%
select(CHR = CHR.x, POS, REF, ALT, GENE_NAME)
# CHR POS REF ALT GENE_NAME
#1 1 5 C T Gene1
#2 1 10 G A <NA>
#3 2 5 G T <NA>
#4 2 15 C A Gene4
#5 2 20 T C <NA>
#6 3 10 A G Gene6
#7 3 20 C T <NA>
或带有rap
的选项
library(rap)
snp %>%
rap(GENE_NAME = ~ filter(gene, CHR == !!CHR, START <= POS, END >= POS) %>%
pull(GENE_NAME)) %>%
mutate(GENE_NAME = replace(GENE_NAME, !lengths(GENE_NAME), NA)) %>%
unnest
# CHR POS REF ALT GENE_NAME
#1 1 5 C T Gene1
#2 1 10 G A <NA>
#3 2 5 G T <NA>
#4 2 15 C A Gene4
#5 2 20 T C <NA>
#6 3 10 A G Gene6
#7 3 20 C T <NA>
数据
snp <- structure(list(CHR = c(1L, 1L, 2L, 2L, 2L, 3L, 3L), POS = c(5L,
10L, 5L, 15L, 20L, 10L, 20L), REF = c("C", "G", "G", "C", "T",
"A", "C"), ALT = c("T", "A", "T", "A", "C", "G", "T")),
class = "data.frame", row.names = c(NA,
-7L))
gene <- structure(list(CHR = c(1L, 1L, 1L, 2L, 2L, 3L), START = c(2L,
12L, 25L, 10L, 25L, 5L), END = c(8L, 20L, 30L, 18L, 35L, 15L),
GENE_NAME = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5",
"Gene6")), class = "data.frame", row.names = c(NA, -6L))