ballgown包进行基因差异表达分析
ballgown包可以读入Stringtie 的转录组组装及定量数据,进行基因差异表达分析。
1. 数据读入
# if (!requireNamespace("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
#
# BiocManager::install("ballgown")
require(ballgown)
library(dplyr)
library(genefilter)
project_dir <- '/home/test/rna_seq_data' # 项目文件夹,每个样品的数据放在不同的子文件夹里。
# 含有.ctab结尾的五个文件
# 注意samplePattern,文件名的前缀
m_bg <- ballgown(dataDir = project_dir,samplePattern = "SRR")
# length(structure(m_bg)$exon)
# length(structure(m_bg)$intron)
# length(structure(m_bg)$trans)
# 样品分组
pheno_data <- data.frame(id=sampleNames(m_bg), group=c(rep("1",3),rep("2",2)))
pData(m_bg) = pheno_data
2. 数据筛选
texpr(m_bg)[1:3,1:5] # 转录本表达矩阵
# 筛选FPKM在个样本中方差大于1的转录本
# subset {ballgown} 需要加载genefilter包
m_bg_filt = subset(m_bg,"rowVars(texpr(m_bg)) >1",genomesubset=TRUE)
3. 分析差异表达基因
# Identify genes that show statistically significant differences between groups
results_transcripts = stattest(m_bg_filt,feature="transcript",
covariate="group",
getFC=TRUE, meas="FPKM")
results_genes = stattest(m_bg_filt, feature="gene",covariate="group",
getFC=TRUE,meas="FPKM")
colnames(results_genes)
# Add gene names and gene IDs
results_transcripts =
data.frame(geneNames=ballgown::geneNames(m_bg_filt),
geneIDs=ballgown::geneIDs(m_bg_filt), results_transcripts)
head(results_transcripts)
colnames(results_transcripts)
# sort
results_transcripts = arrange(results_transcripts,pval) # arrange {dplyr}
results_genes = arrange(results_genes,pval)
head(results_transcripts)
head(results_genes)
# write to files
write.csv(results_transcripts, "/home/test/rna_seq_data/diff_transcript_results.csv",
row.names=FALSE)
write.csv(results_genes, "/home/test/rna_seq_data/diff_gene_results.csv",
row.names=FALSE)
4. 筛选差异表达基因
# choose
diff_trans_filt <- subset(results_transcripts,results_transcripts$qval<0.05)
diff_gene_filt <- subset(results_genes,results_genes$qval<0.05)
diff_trans_filt[1:3,1:5]
dim(diff_trans_filt)
dim(diff_gene_filt)
# p-value和q-value是统计学检验变量,衡量“假阳性概率”,应用到基因检测结果中,可衡量“某个基因差异 # 表达的假阳性概率”,代表差异显著性,小于0.05代表结果有差异。
# 如果p-value或q-value/越低,那么“该基因差异结果”是假阳性的概率就越低,可靠性就越高。
# q-value相比于p-value更加严格,当差异基因结果较少时,可退而求其次根据p-value筛选。
# 当然,用q值筛选可能会过滤掉少部分真的有差异的基因,所以,q值是个双刃剑。但,相比绝大部分基因的假 # 阳性,以及真阳性被滤掉的小概率,这部分的真阳性的丢失也不是很重要了。
5. 作图查看
tropical= c('darkorange', 'dodgerblue','hotpink',
'limegreen', 'yellow')
palette(tropical)
fpkm = texpr(m_bg_filt,meas="FPKM")
fpkm = log2(fpkm+1)
# 不同组之间的fpkm差异
boxplot(fpkm,col=as.numeric(pData(m_bg)$group),
las=2,ylab='log2(FPKM+1)')
# 转录本作图
plotTranscripts(ballgown::geneIDs(m_bg_filt)[200], m_bg_filt,
main=c('Gene PARK7 in sample SRR12663677'), sample=c('SRR12663677'))
plotTranscripts("merge.3987", m_bg_filt,
main='merge.3987 transcripts',
sample=c('SRR12663677','SRR12663681'))
plotTranscripts(ballgown::geneIDs(m_bg_filt)[200], m_bg_filt,
main= paste(as.character(ballgown::geneIDs(m_bg_filt)[200]), 'transcripts'),
sample=c('SRR12663677','SRR12663681'))
sampleNames(m_bg_filt)
plotMeans("merge.24191",
m_bg_filt,groupvar="group",legend=FALSE)
plotMeans(ballgown::geneIDs(m_bg_filt)[200],
m_bg_filt,groupvar="group",legend=FALSE)
版权声明:本文为qq_27390023原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。