r语言 - 根据 SNP 位置和基因开始/结束坐标从另一个数据帧分配基因名称



我有两个数据帧:一个包含SNP及其位置的列表,另一个包含基因及其开始和结束坐标的列表。使用 dplyr,我想在 SNP 数据帧中添加一列,其中包含每个 SNP 所属的基因的名称(即 SNP 的位置在同一条染色体上,并且落在基因的开始/结束坐标之间,包括基因(。

如果SNP不属于任何基因坐标,它应该在基因列中得到"NA"。SNP和基因之间的染色体数必须匹配。例如,即使第二个SNP的位置落在Gene4的开始/结束坐标内,那也不是匹配的,因为它们在不同的染色体上。

单核苷酸单核苷

单核苷酸 数据帧:

CHR  POS  REF  ALT
01   5    C    T
01   10   G    A
02   5    G    T
02   15   C    A
02   20   T    C
03   10   A    G
03   20   C    T

基因数据帧:

CHR  START  END  GENE_NAME
01   2      8    Gene1
01   12     20   Gene2
01   25     30   Gene3
02   10     18   Gene4
02   25     35   Gene5
03   5      15   Gene6

期望输出:

CHR  POS  REF  ALT  GENE_NAME
01   5    C    T    Gene1
01   10   G    A    NA
02   5    G    T    NA
02   15   C    A    Gene4
02   20   T    C    NA
03   10   A    G    Gene6
03   20   C    T    NA

同样,我想使用 dplyr 完成此操作。提前感谢任何帮助!

使用 map2_chr from purrr的一个选项是根据POS筛选GENE数据帧,并从SNP CHR并选择相应的GENE_NAME

library(dplyr)
library(purrr)
SNP %>%
   mutate(GENE_NAME = map2_chr(POS, CHR, function(x, y) {
       inds = x >= GENE$START & x <= GENE$END & y == GENE$CHR
      if (any(inds)) GENE$GENE_NAME[which.max(inds)] else NA
 }))
#  CHR POS REF ALT GENE_NAME
#1   1   5   C   T     Gene1
#2   1  10   G   A      <NA>
#3   2   5   G   T      <NA>
#4   2  15   C   A     Gene4
#5   2  20   T   C      <NA>
#6   3  10   A   G     Gene6
#7   3  20   C   T      <NA>

在基础 R 中,可以使用 mapply

编写
mapply(function(x, y) {
   inds = x >= GENE$START & x <= GENE$END & y == GENE$CHR
   if (any(inds)) GENE$GENE_NAME[which.max(inds)] else NA
}, SNP$POS, SNP$CHR)
#[1] "Gene1" NA      NA      "Gene4" NA      "Gene6" NA    

数据

SNP <- structure(list(CHR = c(1L, 1L, 2L, 2L, 2L, 3L, 3L), POS = c(5L, 
10L, 5L, 15L, 20L, 10L, 20L), REF = c("C", "G", "G", "C", "T", 
"A", "C"), ALT = c("T", "A", "T", "A", "C", "G", "T")), class = 
"data.frame", row.names = c(NA, -7L))
GENE <- structure(list(CHR = c(1L, 1L, 1L, 2L, 2L, 3L), START = c(2L, 
12L, 25L, 10L, 25L, 5L), END = c(8L, 20L, 30L, 18L, 35L, 15L), 
GENE_NAME = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5", 
"Gene6")), class = "data.frame", row.names = c(NA, -6L))

这是dplyr的一种方式。您只需根据STARTEND展开gene数据帧,然后使用snp left_join -

snp %>% 
  left_join(
    gene %>% 
      group_by(CHR, START, END) %>% 
      mutate(
        POS = list(seq(START, END))
      ) %>% 
      unnest(),
    by = c("CHR", "POS")
  ) %>% 
  select(CHR, POS, REF, ALT, GENE_NAME)
  CHR POS REF ALT GENE_NAME
1   1   5   C   T     Gene1
2   1  10   G   A      <NA>
3   2   5   G   T      <NA>
4   2  15   C   A     Gene4
5   2  20   T   C      <NA>
6   3  10   A   G     Gene6
7   3  20   C   T      <NA>

这是一个使用non-equi连接的选项data.table

library(data.table)
setDT(snp)[gene, GENE_NAME := GENE_NAME, on = .(CHR, POS >= START, POS <= END)]
snp
#   CHR POS REF ALT GENE_NAME
#1:   1   5   C   T     Gene1
#2:   1  10   G   A      <NA>
#3:   2   5   G   T      <NA>
#4:   2  15   C   A     Gene4
#5:   2  20   T   C      <NA>
#6:   3  10   A   G     Gene6
#7:   3  20   C   T      <NA>

或与fuzzyjoin

library(fuzzyjoin)
library(dplyr)
fuzzy_left_join(snp, gene, by = c("CHR", "POS" = "START",
   "POS" = "END"), match_fun = list(`==`, `>=`, `<=`)) %>% 
   select(CHR = CHR.x, POS, REF, ALT, GENE_NAME)
#  CHR POS REF ALT GENE_NAME
#1   1   5   C   T     Gene1
#2   1  10   G   A      <NA>
#3   2   5   G   T      <NA>
#4   2  15   C   A     Gene4
#5   2  20   T   C      <NA>
#6   3  10   A   G     Gene6
#7   3  20   C   T      <NA>

或带有rap的选项

library(rap)
snp %>% 
   rap(GENE_NAME = ~ filter(gene, CHR ==  !!CHR, START <= POS, END >= POS) %>% 
          pull(GENE_NAME)) %>% 
   mutate(GENE_NAME = replace(GENE_NAME, !lengths(GENE_NAME), NA)) %>% 
   unnest
#  CHR POS REF ALT GENE_NAME
#1   1   5   C   T     Gene1
#2   1  10   G   A      <NA>
#3   2   5   G   T      <NA>
#4   2  15   C   A     Gene4
#5   2  20   T   C      <NA>
#6   3  10   A   G     Gene6
#7   3  20   C   T      <NA>

数据

snp <- structure(list(CHR = c(1L, 1L, 2L, 2L, 2L, 3L, 3L), POS = c(5L, 
10L, 5L, 15L, 20L, 10L, 20L), REF = c("C", "G", "G", "C", "T", 
"A", "C"), ALT = c("T", "A", "T", "A", "C", "G", "T")),
  class = "data.frame", row.names = c(NA, 
-7L))
gene <- structure(list(CHR = c(1L, 1L, 1L, 2L, 2L, 3L), START = c(2L, 
12L, 25L, 10L, 25L, 5L), END = c(8L, 20L, 30L, 18L, 35L, 15L), 
    GENE_NAME = c("Gene1", "Gene2", "Gene3", "Gene4", "Gene5", 
    "Gene6")), class = "data.frame", row.names = c(NA, -6L))

相关内容

最新更新