我有一个数据帧("引用"(,包含3条人类染色体的基因组位置("起始"one_answers"结束"列(("eqnames"列(。我需要基于第二个数据帧("黑名单"(从每个染色体中删除特定的范围,并重塑原始数据帧以从每个染色体排除这些范围。我试图将这些数据帧作为一个GRanges对象(GenomicRanges库(读取,但我没有找到任何函数来执行此特定操作,所以作为一个数据帧可能会更容易?
reference <- data.frame(seqnames = c('chr1','chr2','chr3'),
start = c(1,1,1),
end = c(248956422, 242193529, 198295559)
)
reference
seqnames start end
<chr> <dbl> <dbl>
chr1 1 248956422
chr2 1 242193529
chr3 1 198295559
blacklist <- data.frame(seqnames = c('chr1','chr1','chr1','chr2','chr2','chr3','chr3'),
start = c(628903, 5850087, 8909610, 10, 23123, 163123, 3163123),
end = c(635104, 5850571, 8910014, 9312, 27120, 193120, 3963122)
)
blacklist
seqnames start end
<chr> <dbl> <dbl>
chr1 628903 635104
chr1 5850087 5850571
chr1 8909610 8910014
chr2 10 9312
chr2 23123 27120
chr3 163123 193120
chr3 3163123 3963122
所需输出(新数据帧"reference_new"(:
reference_new <- data.frame(seqnames = c('chr1','chr1','chr1','chr1','chr2', 'chr2', 'chr2', 'chr3', 'chr3', 'chr3' ),
start = c(1, 635105, 5850572,8910015, 1, 9313,27121, 1,193121,3963123 ),
end = c(628902, 5850086,8909609, 248956422, 9, 23122,242193529, 163122,3163122, 198295559)
)
reference_new
seqnames start end
<chr> <dbl> <dbl>
chr1 1 628902
chr1 635105 5850086
chr1 5850572 8909609
chr1 8910015 248956422
chr2 1 9
chr2 9313 23122
chr2 27121 242193529
chr3 1 163122
chr3 193121 3163122
chr3 3963123 198295559
这感觉很笨拙,但适用于示例数据。我没有检查参考范围之外的黑名单值,所以我怀疑这会造成问题,可能值得更多的检查/不同的方法。
library(tidyverse)
conv_shape <- function(df, rev = 1) {
df%>%
pivot_longer(-seqnames) %>%
mutate(value = value + case_when(rev == -1 & name == "start" ~ -1,
rev == -1 & name == "end" ~ 1,
TRUE ~ 0),
active = if_else(name == "start", 1, -1) * rev)
}
bind_rows(.id = "src",
ref = reference %>% conv_shape,
blacklist = blacklist %>% conv_shape(-1)
) %>%
arrange(seqnames, value) %>%
group_by(seqnames) %>%
mutate(inrng = cumsum(active),
status = if_else(inrng == 1, "start", "end"),
grp = cumsum(inrng - lag(inrng, default = 0) == 1)) %>%
ungroup() %>%
select(seqnames, value, status, grp) %>%
pivot_wider(names_from = status, values_from = value)
结果
# A tibble: 10 × 4
seqnames grp start end
<chr> <int> <dbl> <dbl>
1 chr1 1 1 628902
2 chr1 2 635105 5850086
3 chr1 3 5850572 8909609
4 chr1 4 8910015 248956422
5 chr2 1 1 9
6 chr2 2 9313 23122
7 chr2 3 27121 242193529
8 chr3 1 1 163122
9 chr3 2 193121 3163122
10 chr3 3 3963123 198295559