这里的第一篇文章,所以我希望我能最好地解释自己。
我需要通过查找两个数据帧中一个数据帧提供的一个特定染色体位置是否在另一个数据框提供的范围内来交叉引用两个数据框,因此我希望有一个新的列,其中基因存在于该范围内。
"基因"是坐标(开始/结束(被视为范围的数据帧
head(genes)
# A tibble: 6 x 9
chr source type start end strand gene_id symbol gene_biotype
<chr> <chr> <chr> <int> <int> <chr> <chr> <chr> <chr>
1 2 pseudogene gene 143300987 143301544 + ENSG00000228134 AC092578.1 pseudogene
2 2 pseudogene gene 143611664 143613567 + ENSG00000229781 AC013444.1 pseudogene
3 2 protein_coding gene 143635067 143799890 + ENSG00000115919 KYNU protein_coding
4 2 pseudogene gene 143704869 143705655 - ENSG00000270390 RP11-470B22.1 pseudogene
5 2 miRNA gene 143763269 143763360 - ENSG00000221169 AC013444.2 miRNA
6 2 protein_coding gene 143848931 144525921 + ENSG00000075884 ARHGAP15 protein_coding
另一个数据帧(x(是:
chr_a point A
1 2 143301002
2 2 143625061
3 2 143700941
4 2 143811317
5 2 144127323
6 2 144224689
我基本上必须找出"A点"是否介于(基因(中的"开始"/"结束"范围之间,以及与哪个基因符号相关。
我尝试了以下方法:
x$geneA <- ifelse(sapply(x$`point A`, function(g)
any(genes$start >= g & genes$end <=g)), genes$symbol, NA)
但我得到的结果与基因组坐标不一致。
希望有人能帮助我!Thx!
这行得通吗?
我假设每个点只匹配一个基因符号。
x$geneA <- sapply(x$`point A`,
function(g) filter(genes, g >= start & g <= end)$symbol[1])
结果:
x
# A tibble: 6 x 3
chr_a `point A` geneA
<int> <int> <chr>
1 2 143301002 AC092578.1
2 2 143625061 NA
3 2 143700941 KYNU
4 2 143811317 NA
5 2 144127323 ARHGAP15
6 2 144224689 ARHGAP15
欢迎使用Stackoverflow!在未来,请张贴一个最小的,可行的例子(MWE(。
genes <- tribble(~chr, ~source, ~type, ~start, ~end, ~strand, ~gene_id, ~symbol, ~gene_biotype,
2, "pseudogene", "gene", 143300987, 143301544, "+", "ENSG00000228134", "AC092578.1", "pseudogene",
2, "pseudogene", "gene", 143611664, 143613567, "+", "ENSG00000229781", "AC013444.1", "pseudogene",
2, "protein_coding", "gene", 143635067, 143799890, "+", "ENSG00000115919", "KYNU", "protein_coding",
2, "pseudogene", "gene", 143704869, 143705655, "-", "ENSG00000270390", "RP11-470B22.1", "pseudogene",
2, "miRNA", "gene", 143763269, 143763360, "-", "ENSG00000221169", "AC013444.2", "miRNA",
2, "protein_coding", "gene", 143848931, 144525921, "+", "ENSG00000075884", "ARHGAP15", "protein_coding")
x <- tribble(~chr_a, ~`point A`,
2, 143301002,
2, 143625061,
2, 143700941,
2, 143811317,
2, 144127323,
2, 144224689,
)
我给你一个tidyverse
方法:
x %>%
nest_join(genes, by = c("chr_a" = "chr")) %>%
group_by(`point A`) %>%
mutate(genes = map(genes, ~filter(., `point A` >= start & `point A` <= end))) %>%
unnest(genes, keep_empty = TRUE)
用于获得非匹配行为CCD_ 2的合并表。或者只需在不使用嵌套tibbles 的情况下找到匹配的
x %>%
left_join(genes, by = c("chr_a" = "chr")) %>%
filter(`point A` >= start & `point A` <= end)
您可以尝试下面的基本R代码
df2out <- within(df2,symbol <- sapply(A, function(x) df1$symbol[which(x>=df1$start & x<=df1$end)]))
使得
> df2out
chr_a point A symbol
1 1 2 143301002 AC092578.1
2 2 2 143625061
3 3 2 143700941 KYNU
4 4 2 143811317
5 5 2 144127323 ARHGAP15
6 6 2 144224689 ARHGAP15
数据
df1 <- structure(list(chr = c(2L, 2L, 2L, 2L, 2L, 2L), source = c("pseudogene",
"pseudogene", "protein_coding", "pseudogene", "miRNA", "protein_coding"
), type = c("gene", "gene", "gene", "gene", "gene", "gene"),
start = c(143300987L, 143611664L, 143635067L, 143704869L,
143763269L, 143848931L), end = c(143301544L, 143613567L,
143799890L, 143705655L, 143763360L, 144525921L), strand = c("+",
"+", "+", "-", "-", "+"), gene_id = c("ENSG00000228134",
"ENSG00000229781", "ENSG00000115919", "ENSG00000270390",
"ENSG00000221169", "ENSG00000075884"), symbol = c("AC092578.1",
"AC013444.1", "KYNU", "RP11-470B22.1", "AC013444.2", "ARHGAP15"
), gene_biotype = c("pseudogene", "pseudogene", "protein_coding",
"pseudogene", "miRNA", "protein_coding")), class = "data.frame", row.names = c(NA,
-6L))
df2 <- structure(list(chr_a = 1:6, point = c(2L, 2L, 2L, 2L, 2L, 2L),
A = c(143301002L, 143625061L, 143700941L, 143811317L, 144127323L,
144224689L)), class = "data.frame", row.names = c(NA, -6L
))
这个答案很可能永远不会被看到=p
有这样的程序包。请注意,您的代码将无法使用额外的染色体或链。
使用@koenniem的数据,
library(GenomicRanges)
gr1 = makeGRangesFromDataFrame(genes,keep.extra.columns=TRUE)
x = data.frame(x,check.names=FALSE)
gr2 = GRanges(seqnames=x$chr_a,IRanges(start=x[,"point A"],width=1))
x$gene = NA
ovlp = findOverlaps(gr2,gr1)
x$gene[queryHits(ovlp)] = gr1$symbol[subjectHits(ovlp)]
chr_a point A gene
1 2 143301002 AC092578.1
2 2 143625061 <NA>
3 2 143700941 KYNU
4 2 143811317 <NA>
5 2 144127323 ARHGAP15
6 2 144224689 ARHGAP15
基于循环的解决方案。(当然,这将比使用apply
慢得多。(
#A mock-up of your data
symbol <- c("AC092578.1", "AC013444.1", "KYNU", "RP11-470B22.1", "AC013444.2", "ARHGAP15", "Newadditionalsymbol")
start <- c(143300987, 143611664, 143635067, 143704869, 143763269, 143848931, 143300987)
end <- c(143301544, 143613567, 143799890, 143705655, 143763360, 144525921, 143301044)
genes <- data.frame(start, end, symbol, stringsAsFactors = F)
point_A <- start[1:6]+1
chr_1 <- rep_len(2, length.out = length(point_A))
x <- data.frame(chr_1, point_A, stringsAsFactors = F)
x$symbol <- NA #Create a new column to store the symbols, populate it with NA
x
# chr_1 point_A symbol
# 1 2 143300988 NA
# 2 2 143611665 NA
# 3 2 143635068 NA
# 4 2 143704870 NA
# 5 2 143763270 NA
# 6 2 143848932 NA
#Solution using a for loop
for(i in 1:nrow(x)){ #Iterate through every row of x
for(j in 1:nrow(genes)){ #Iterate through every row of genes
if(x$point_A[i] >= genes$start[j] & x$point_A[i] < genes$end[j]){ #If the ith point_A falls within the jth start & end
if(is.na(x$symbol[i])){ #If there is no symbol assigned to the ith row of x
x$symbol[i] <- genes$symbol[j] #Assign the symbol from the jth row
} else{ #If there is a symbol assigned to the ith row of x already, and it matches (now, another) jth row of genes
x$symbol[i] <- paste(x$symbol[i], genes$symbol[j]) #Concatenate the new symbol from the jth row of genes to the ith row of x
}
}
}
}
x
# chr_1 point_A symbol
# 1 2 143300988 AC092578.1 Newadditionalsymbol
# 2 2 143611665 AC013444.1
# 3 2 143635068 KYNU
# 4 2 143704870 KYNU RP11-470B22.1
# 5 2 143763270 KYNU AC013444.2
# 6 2 143848932 ARHGAP15