我有两个数据框(基因与蛋白质),我想以图形方式(散点图)显示这两个数据框对应行之间的相关性,以查看每一行是否相关。我认为可以采用两种策略:1. 在两个数据框之间做线性回归(但我不知道如何实现);2. 使用列的平均值(和标准差)来表示两者之间的 Pearson 相关性。
有人能帮我设计这些图表吗?下面是我的数据的一个例子:
# Example mRNA expression table: one row per gene, an identifier column
# (`gene`) followed by one numeric column per sample.
# The text is parsed directly instead of first being stored in `genes` and
# then overwritten. `header = TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned, whereas `TRUE` is a reserved word.
genes <- read.table(
  text = "gene sample1 sample2 sample3 sample4
gene1 1863.4 1972.94 1603.96 1185.6
gene2 213.88 247.14 189.02 208.793
gene3 8.06 9.25 9.59 7.33
gene4 22.36 3.76 10.64 19.17",
  header = TRUE
)
# Example protein abundance table: one row per protein, an identifier column
# (`protein`) followed by one numeric column per sample.
# The fourth identifier was misspelled "pretein4"; it is corrected to
# "protein4" so that labels derived from this column are consistent.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
protein <- read.table(
  text = "protein sample1 sample2 sample3 sample4
protein1 314.2871797 426.8856595 405.7971059 334.1369651
protein2 4747.866647 3070.916824 2780.352062 2990.085431
protein3 1621.566329 1290.470104 1554.27426 1601.357345
protein4 8832.210499 7796.675008 8461.733171 9500.429355",
  header = TRUE
)
谢谢
对于其中一对列(以 sample1 为例):
# Scatter plot, fitted line and correlation for a single pair of columns:
# column 2 (sample1) of the gene table against column 2 of the protein table.
data <- data.frame(
  genes_sample1 = genes[, 2],
  protein_sample1 = protein[, 2]
)
# plot() on a two-column data frame draws the first column (genes_sample1) on
# the x axis and the second column (protein_sample1) on the y axis.
plot(data)
# abline() reads the fitted intercept and slope as y = a + b * x in the plot's
# coordinates, so the model must regress the y variable (protein_sample1) on
# the x variable (genes_sample1). The previous formula had the two swapped,
# which drew a line that is not the regression line of the plotted data.
abline(lm(protein_sample1 ~ genes_sample1, data = data))
# 2 x 2 correlation matrix of the two columns.
cor(data)
genes_sample1 protein_sample1
genes_sample1 1.0000000 -0.6247528
protein_sample1 -0.6247528 1.0000000
# Label every row with its identifier so the labels survive the transpose and
# become the column names of the transposed tables.
row.names(protein) <- protein[["protein"]]
row.names(genes) <- genes[["gene"]]

# Drop the identifier column (column 1) and flip the table so that samples are
# rows and features (proteins / genes) are columns.
samples_in_rows <- function(tbl) {
  as.data.frame(t(as.matrix(tbl[, -1])))
}
protein_t <- samples_in_rows(protein)
genes_t <- samples_in_rows(genes)

# Pearson correlation of every protein column with every gene column, computed
# across samples.
cor(x = protein_t, y = genes_t, method = "pearson")

# Plot all protein and gene columns together from one combined data frame.
plot(cbind(protein_t, genes_t))
至于回归,我猜你想要的是为每个蛋白质分别建立一个回归模型,把该蛋白质的表达量与所有基因联系起来,所以代码大致如下:
# For each column of protein_t (one protein), fit a linear model of that
# column on every column of genes_t, and return the fits as a list with one
# lm object per protein. Inside the anonymous function the argument `protein`
# is the current column; it shadows the global `protein` data frame.
# NOTE(review): with the example data genes_t has 4 sample rows and 4 gene
# columns, so each model has more coefficients (4 slopes + intercept) than
# observations and lm() will report NA for at least one coefficient. More
# samples than genes are needed for meaningful estimates.
lapply(protein_t, function(protein) lm(protein ~ ., data = cbind(genes_t, protein = protein)))
感谢大家的点赞,也感谢大家帮我解决了问题。下面是我最终使用的代码:
# Worked example on the sample data.
# Step 1: average every feature over its sample columns (column 1 holds the
# identifier, the remaining columns hold the measurements).
mRNA_expression <- data.frame(
  genes = genes[[1]],
  Means = rowMeans(genes[-1])
)
Protein_abundance <- data.frame(
  protein = protein[[1]],
  Means = rowMeans(protein[-1])
)

# Step 2: put the two mean vectors side by side for the correlation plot.
# Rows are paired by position; presumably row i of `genes` corresponds to
# row i of `protein` (verify against the real data).
mean_corr <- data.frame(
  mRNA_expression = mRNA_expression$Means,
  Protein_abundance = Protein_abundance$Means
)

# Step 3: discard rows containing NA, then convert both columns to log10.
mean_corr <- log10(mean_corr[complete.cases(mean_corr), ])
library(ggplot2)
# Scaffold shared by all four figures: protein abundance on x, mRNA expression
# on y, axis titles padded away from the axes, points drawn as hollow circles.
scatter_base <- ggplot(
  mean_corr,
  aes(x = Protein_abundance, y = mRNA_expression)
) +
  labs(x = "Protein abundance (log10)", y = "mRNA expression (log10)") +
  theme(
    axis.title.y = element_text(margin = margin(0, 10, 0, 0)),
    axis.title.x = element_text(margin = margin(10, 0, 0, 0))
  ) +
  geom_point(shape = 1)

# Figure 1: points only, to check the distribution.
scatter_base

# Figure 2: points plus a linear regression line
# (by default includes the 95% confidence region).
scatter_base + geom_smooth(method = lm)

# Figure 3: points plus a linear regression line, without the shaded
# confidence region.
scatter_base + geom_smooth(method = lm, se = FALSE)

# Figure 4: points plus the default geom_smooth() fit (a loess curve for small
# data sets) with its confidence region.
scatter_base + geom_smooth()
# Statistics
# Correlation matrix of the two columns of mean_corr (cor() uses Pearson by
# default).
cor(mean_corr)
# Linear regression
# Earlier draft of the model, kept for reference and not run:
#lm(genes_mean ~ protein$mean, data=mean_corr)
# NOTE(review): this fits Protein_abundance as a function of mRNA_expression,
# while the ggplot figures in this script map Protein_abundance to x and
# mRNA_expression to y, so geom_smooth(method=lm) there fits the opposite
# direction. Confirm which direction is intended.
lm(Protein_abundance ~ mRNA_expression, data=mean_corr)