1.INTRODUCTION(介绍)
1.数据来源
- GDC Legacy Archive
- GDC Harmonized database
2.barcode
2.Install.packages(包安装)
3.数据下载
我们以胆管癌数据为例进行展示
-
下载表达数据
#数据查询(就像你在页面网站上点来点去)
query <- GDCquery(project = "TCGA-CHOL",
+ data.category = "Transcriptome Profiling",
+ data.type = "Gene Expression Quantification",
+ workflow.type = "STAR - Counts",
+ legacy = FALSE,
+ experimental.strategy = 'RNA-Seq')#数据下载
GDCdownload(query = query)
-
数据整理
#数据准备
dataPrep <- GDCprepare(query = query)#数据整理
library(SummarizedExperiment)
expdat <- assay(dataPrep)
dim(expat)
head(expdat)#数据调整
library(stringr)
group_list<-ifelse(str_sub(colnames(expdat),14,15) == "01","tumor","normal")
group_list
coldata<-data.frame(row.names = colnames(expdat),condition=group_list)
coldata
-
DESeq2差异分析
#差异表达分析
library(DESeq2)
expdat<-round(expdat,0)
dds<-DESeqDataSetFromMatrix(countData = expdat,colData = coldata,design = ~ condition)
keep<-rowSums(counts(dds))>=10
dds<-dds[keep,]
dds<-DESeq(dds)
res<-results(dds,contrast = c("condition","tumor","normal"))
resOrdered <- res[order(res$pvalue),]
DEG <- as.data.frame(resOrdered)
DEG <- na.omit(DEG)
logFC_cutoff<-with(DEG,mean(abs(log2FoldChange))+2*sd(abs(log2FoldChange)))
logFC_cutoff
DEG$change<-as.factor(ifelse(DEG$pvalue<0.05&abs(DEG$log2FoldChange)>logFC_cutoff,ifelse(DEG$log2FoldChange>logFC_cutoff,"UP","DOWN"),"NOT"))
-
差异表达基因文件输出
#文件会默认储存在你所创建的根目录里,你也可以根据自己的需要进行调整
write.csv(DEG,file = "TCGA_CHOL_DEG.csv")
-
差异表达基因可视化(火山图)
#火山图可视化
library(ggplot2)
this_title <- paste0('Cutoff for logFC is ',round(logFC_cutoff,3),'\nThe number of up gene is ',nrow(DEG[DEG$change =='UP',]) ,'\nThe number of down gene is ',nrow(DEG[DEG$change =='DOWN',]))
ggplot(data=DEG,aes(x=log2FoldChange,y=-log10(pvalue),color=change))+geom_point(alpha=0.4,size=1.75)+labs(x="log2 fold change")+ ylab("-log10 pvalue")+ggtitle(this_title)+theme_bw(base_size = 10)+theme(plot.title = element_text(size=15,hjust=0.5))+scale_color_manual(values=c('blue','black','red'))
过程参考了一下链接进行整理R语言TCGA分析https://zhuanlan.zhihu.com/p/107714916