在rdplyr中使用多个参数内部连接数据帧



我有两个像这样的数据帧

library(dplyr)
#> Attaching package: 'dplyr'

df1 <- tibble(chrom=c(1,1,1,2,2,2),
start=c(100,200,300,100,200,300),
end=c(150,250,350,120,220,320))
df2 <- tibble(chrom=c(1,1,1,2,2,2),
start2=c(100,50,280,100,10,200),
end2=c(125,100,320,115,15,350))
df1 
#> # A tibble: 6 × 3
#>   chrom start   end
#>   <dbl> <dbl> <dbl>
#> 1     1   100   150
#> 2     1   200   250
#> 3     1   300   350
#> 4     2   100   120
#> 5     2   200   220
#> 6     2   300   320
df2
#> # A tibble: 6 × 3
#>   chrom start2  end2
#>   <dbl>  <dbl> <dbl>
#> 1     1    100   125
#> 2     1     50   100
#> 3     1    280   320
#> 4     2    100   115
#> 5     2     10    15
#> 6     2    200   350

创建于2023-01-09与reprex v2.0.2

当我试图根据以下条件加入时,我犯了一个巨大的错误知道为什么

inner_join(df2, df1, by = join_by(chrom, start< end2, end > start2))

误差

Error in `inner_join()`:
! Join columns in `x` must be present in the data.
✖ Problem with `start` and `end`.
Run `rlang::last_error()` to see where the error occurred.

这是预期的结果

chrom start end start2 end2
1:     1   100 150    100  125
2:     1    NA  NA     50  100
3:     1   300 350    280  320
4:     2   100 120    100  115
5:     2    NA  NA     10   15
6:     2   200 220    200  350
7:     2   300 320    200  350

您在inner_join中首先列出df2,其变量需要在比较的LHS中列出。

您可以交换df1/df2或交换比较变量的顺序(在给定内连接的情况下实际上是相同的):

inner_join(df2, df1, by = join_by(chrom, end2 > start, start2 < end))
# # A tibble: 5 × 5
#   chrom start2  end2 start   end
#   <dbl>  <dbl> <dbl> <dbl> <dbl>
# 1     1    100   125   100   150
# 2     1    280   320   300   350
# 3     2    100   115   100   120
# 4     2    200   350   200   220
# 5     2    200   350   300   320
inner_join(df1, df2, by = join_by(chrom, start < end2, end > start2))
# # A tibble: 5 × 5
#   chrom start   end start2  end2
#   <dbl> <dbl> <dbl>  <dbl> <dbl>
# 1     1   100   150    100   125
# 2     1   300   350    280   320
# 3     2   100   120    100   115
# 4     2   200   220    200   350
# 5     2   300   320    200   350

您期望的输出建议使用侧连接,其中这两者实际上是等效的:

left_join(df2, df1, by = join_by(chrom, end2 > start, start2 < end))
# # A tibble: 7 × 5
#   chrom start2  end2 start   end
#   <dbl>  <dbl> <dbl> <dbl> <dbl>
# 1     1    100   125   100   150
# 2     1     50   100    NA    NA
# 3     1    280   320   300   350
# 4     2    100   115   100   120
# 5     2     10    15    NA    NA
# 6     2    200   350   200   220
# 7     2    200   350   300   320
right_join(df1, df2, by = join_by(chrom, start < end2, end > start2))

这是你要找的吗?

df3 <- inner_join(df2, df1, by = "chrom") %>%
filter(start< end2, end > start2)

最新更新