r语言 - 将基因型格式'12'转换为 ACTG 格式



你好,

请帮助我根据snp_info文件将"12"基因型格式转换为ACGT格式

df(第1行)的示例

sample1 1/2 1/2 2/1 2/1

样本1的第一个SNP是杂合的(1/2)。1是ref等位基因(A),2是alt等位基因(G)。

我想自动转换我的真实数据中的所有基因型(900个样本/30个基因型)。

# df: genotype matrix after transposing -- one row per sample, one column
# per SNP. Each cell is an "allele/allele" string coded with 1 and 2.
df <- t(data.frame(
  sample1 = c("1/2", "1/2", "2/1", "2/1"),
  sample2 = c("2/1", "1/1", "1/2", "2/1"),
  sample3 = c("2/1", "2/1", "1/1", "1/2"),
  sample4 = c("1/1", "2/2", "2/2", "2/2")
))

# snp_info: one row per SNP, giving its id, position, and the letters for
# the ref and alt alleles.
snp_info <- data.frame(
  snp = c("11_524568", "12_542656", "12_558659", "13_8457658"),
  position = c("524568", "542656", "558659", "8457658"),
  ref = c("A", "T", "T", "G"),
  alt = c("G", "C", "C", "A")
)

所需输出

# Expected result of the conversion: one row per sample, one column per SNP,
# with every 1 replaced by that SNP's ref allele and every 2 by its alt allele.
# sample4 is 2/2 at the second SNP (ref T, alt C), so that cell is "C/C".
desired_output <- t(data.frame(
  sample1 = c("A/G", "T/C", "C/T", "A/G"),
  sample2 = c("G/A", "T/T", "T/C", "A/G"),
  sample3 = c("G/A", "C/T", "T/T", "G/A"),
  sample4 = c("A/A", "C/C", "C/C", "A/A")
))

我尝试了第一个SNP的循环,但我没有得到想要的输出

# Asker's attempt (reported as not producing the desired output).
# Start from an empty 4 x 4 character template, one row per sample.
desired_output = t(data.frame(
sample1 =c('','','',''), 
sample2 =c('','','',''), 
sample3 =c('','','',''), 
sample4 =c('','','','')))
# NOTE(review): `tp` is not defined anywhere in the visible code, so this
# line cannot run as written -- confirm what object was intended.
tp = as.data.frame(tp)
geno = list()
for (i in 1:nrow(df)) {
# NOTE(review): `file_info` is not defined in the visible code; `snp_info`
# was presumably intended -- confirm.
# paste() separates its arguments with a space by default, so this builds
# e.g. "A / G" rather than "A/G". It also never reads the 1/2 codes in df:
# it only joins the two allele letters looked up for index i.
geno[i] = paste(snp_info[i,3],'/',file_info[i,4])
# Writes into row i of the first column only.
desired_output[i,1] = geno[i]
}

谢谢你的帮助。

这里有一个非优雅的解决方案:

# EXAMPLE
# Genotypes coded 1/2: one row per sample, one column per SNP.
df <- t(data.frame(
  sample1 = c("1/2", "1/2", "2/1", "2/1"),
  sample2 = c("2/1", "1/1", "1/2", "2/1"),
  sample3 = c("2/1", "2/1", "1/1", "1/2"),
  sample4 = c("1/1", "2/2", "2/2", "2/2")
))

# Per-SNP metadata; column 3 holds the ref allele, column 4 the alt allele.
snp_info <- data.frame(
  snp = c("11_524568", "12_542656", "12_558659", "13_8457658"),
  position = c("524568", "542656", "558659", "8457658"),
  ref = c("A", "T", "T", "G"),
  alt = c("G", "C", "C", "A")
)

# Blank character matrix with the same shape and row names as df.
desired_output <- df
desired_output[] <- ""

# Translate one SNP (column) at a time: "1" becomes that SNP's ref allele,
# then "2" becomes its alt allele.
for (snp_idx in seq_len(ncol(df))) {
  ref_allele <- snp_info[snp_idx, 3L]
  alt_allele <- snp_info[snp_idx, 4L]
  with_ref <- gsub("1", ref_allele, df[, snp_idx])
  desired_output[, snp_idx] <- gsub("2", alt_allele, with_ref)
}

不过,以您的数据量来看,运行速度似乎并不是什么重要问题。

@tacoman的回答是可以接受的。我会稍微改进一下:

# Genotypes coded 1/2: one row per sample, one column per SNP.
df <- t(data.frame(
  sample1 = c("1/2", "1/2", "2/1", "2/1"),
  sample2 = c("2/1", "1/1", "1/2", "2/1"),
  sample3 = c("2/1", "2/1", "1/1", "1/2"),
  sample4 = c("1/1", "2/2", "2/2", "2/2")
))

# Per-SNP metadata: one row per SNP with its ref and alt allele letters.
snp_info <- data.frame(
  snp = c("11_524568", "12_542656", "12_558659", "13_8457658"),
  position = c("524568", "542656", "558659", "8457658"),
  ref = c("A", "T", "T", "G"),
  alt = c("G", "C", "C", "A")
)

# Column i of df is translated with row i of snp_info, so df needs exactly
# one snp_info *row* per column. Comparing against ncol(snp_info) would only
# pass by coincidence when there happen to be 4 SNPs (snp_info has 4 columns).
stopifnot(ncol(df) == nrow(snp_info))

# Preallocate the result: same shape as df, filled with empty strings.
output <- matrix("", nrow = nrow(df), ncol = ncol(df))
rownames(output) <- paste0("sample", seq_len(nrow(df)))

# seq_len() is safe when df has zero columns, where 1:ncol(df) would
# iterate over c(1, 0).
for (i in seq_len(ncol(df))) {
  # Replace "1" with the ref allele first, then "2" with the alt allele.
  output[, i] <- gsub("1", snp_info$ref[i], df[, i])
  output[, i] <- gsub("2", snp_info$alt[i], output[, i])
}
output
#>         [,1]  [,2]  [,3]  [,4] 
#> sample1 "A/G" "T/C" "C/T" "A/G"
#> sample2 "G/A" "T/T" "T/C" "A/G"
#> sample3 "G/A" "C/T" "T/T" "G/A"
#> sample4 "A/A" "C/C" "C/C" "A/A"

最新更新