数据来源:https://ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE226291
避雷指南(希望我真的记住:)
0.很多命令用的不熟练,但是熟能生巧,还是希望你能多多多多练习——算我求你了。1. 数据下载:确定好control和treat分别对应的SRR号,千万别搞反了;2. 基因注释文件下载:NCBI或ENDNOTE或gencote下载的注释文件里id不一样,需要提前查看文件确定是ensembl id还是gene id,这个决定了下游分析是是否要id转换,最好是不要这一步,费时费力还容易报错;3. 比对软件的选择:hisat2最好不用,一方面他的结果是sam文件,后面还需要转bam文件,这个耕费时间,同时他自身比对也比较慢;STAR虽然需要自己建索引,但是人和小鼠的建好了以后也可以用,整体耗费的时间比hisat2少太多;4. 计数软件:htseq软件也是很慢,可以不用;featureCounts非常迅速,同时可以将几个文件的计数结果放在一个文件里,最后就省去了合并,好用;RSEM一般,用起来也麻烦,建议还是featureCounts;5. 下游分析如果不需要id转换,就方便一些,按步骤整理好count矩阵,用DESeq2分析就好,分析过程中切记看好control组和treat组,不能写反;6. 最好时刻查看数据框是否是自己想要的样子,有问题及时解决;7. 报错后如果gpt无法解决,就直接放进浏览器,在CSDN、简书、gitub上搜寻;8. 火山图绘制时可以添加的东西很多,可以结合实际需要做选择;9. GO分析只需要找出差异基因,可以选择网站也可以直接r语言。
下载数据(3小时)
#!/bin/bash
for i in 1 2 3 4 5 6
do
prefetch SRR2364187${i}
done#下载注释文件的时候最好查看一下 选择gene id 要不然后面还需要id转换
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/GRCm38.p6.genome.fa.gz
wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_mouse/release_M25/gencode.vM25.annotation.gtf.gz
格式转换(5-6小时)
#!/bin/bash
for i in 1 2 3 4 5 6
do
echo SRR2364187${i}
fastq-dump --gzip --split-files SRR2364187${i}
done
数据质控(0.5小时)
#!/bin/bash
for i in 1 2 3 4 5 6
do
fastp -i /home/hemiaomiao/NDNFrnaseq/data/SRR2364187${i}_1.fastq.gz -I /home/hemiaomiao/NDNFrnaseq/data/SRR2364187${i}_2.fastq.gz -o SRR2364187${i}_1_clean.fastq.gz -O SRR2364187${i}_2_clean.fastq.gz -h report.html -j report.json
done
hisat2序列比对(5.5小时)
索引文件可以自己建,也可以下载,STAR需要自己建索引,hisat2可以直接用官网的索引。
#!/bin/bash
for i in 1 2 3 4 5 6
do
hisat2 -t -x /home/hemiaomiao/NDNFrnaseq/genome/mm10/genome -1 /home/hemiaomiao/NDNFrnaseq/data/SRR2364187${i}_1_clean.fastq.gz -2 /home/hemiaomiao/NDNFrnaseq/data/SRR2364187${i}_2_clean.fastq.gz -S /home/hemiaomiao/NDNFrnaseq/data/SRR2364187${i}.sam
done
STAR比对(1小时 +1小时)
#建索引
STAR --runMode genomeGenerate --runThreadN 16 --genomeDir /home/hemiaomiao/NDNFrnaseq/genome/mm10_index --genomeFastaFiles /home/hemiaomiao/NDNFrnaseq/genome/GRCm38.p6.genome.fa --sjdbGTFfile /home/hemiaomiao/NDNFrnaseq/genome/gencode.vM25.annotation.gtf --sjdbOverhang 100#比对
#!/bin/bash
for i in 1 2 3 4 5 6
doSTAR --genomeDir /home/hemiaomiao/NDNFrnaseq/genome/mm10_index \--readFilesIn /home/hemiaomiao/NDNFrnaseq/data/SRR2364187${i}_1_clean.fastq.gz /home/hemiaomiao/NDNFrnaseq/data/SRR2364187${i}_2_clean.fastq.gz \--runThreadN 16 \--runMode alignReads \--readFilesCommand zcat \--quantMode TranscriptomeSAM GeneCounts \--outFileNamePrefix SRR2364187${i}_ \--outSAMtype BAM SortedByCoordinate
done
chmod 777 star.sh
./star.sh
samtools文件格式转换(1.5-2小时)(针对hisat2的结果)
#!/bin/bash
for i in 1 2 3 4 5 6
do
samtools view -S SRR2364187${i}.sam -b > SRR2364187${i}.bam
samtools sort SRR2364187${i}.bam -o SRR2364187${i}_sorted.bam #将所有的bam文件按默认的染色体位置进行排序
samtools index SRR2364187${i}_sorted.bam
done
htseq计数(9小时)
#!/bin/bash
for i in 1 2 3 4 5 6
do
samtools sort -n SRR2364187${i}.bam -o SRR2364187${i}_nsorted.bam #上一步是按照染色体位置排序的 这里需要按照reads数重新排序(read name排序)
htseq-count -r name -f bam /home/hemiaomiao/NDNFrnaseq/data/SRR2364187${i}_nsorted.bam /home/hemiaomiao/NDNFrnaseq/genome/gencode.vM25.annotation.gtf > /home/hemiaomiao/NDNFrnaseq/matrix/SRR2364187${i}.count
done
featureCounts计数(10分钟左右)
这一步的结果第一列是ensembl_gene_id而不是gene_symbol,但上次用STAR比对后统计是gene,这个不确定原因。
#计数hisat2比对结果
featureCounts -g gene_id -a /home/hemiaomiao/NDNFrnaseq/genome/gencode.vM25.annotation.gtf -o /home/hemiaomiao/NDNFrnaseq/matrix/gene_exp.txt -p /home/hemiaomiao/NDNFrnaseq/data/SRR23641871_nsorted.bam /home/hemiaomiao/NDNFrnaseq/data/SRR23641872_nsorted.bam /home/hemiaomiao/NDNFrnaseq/data/SRR23641873_nsorted.bam /home/hemiaomiao/NDNFrnaseq/data/SRR23641874_nsorted.bam /home/hemiaomiao/NDNFrnaseq/data/SRR23641875_nsorted.bam /home/hemiaomiao/NDNFrnaseq/data/SRR23641876_nsorted.bam#计数STAR比对结果
featureCounts -g gene_id -a /home/hemiaomiao/NDNFrnaseq/genome/gencode.vM25.annotation.gtf -o /home/hemiaomiao/NDNFrnaseq/matrix/gene_count.txt -p /home/hemiaomiao/NDNFrnaseq/data/SRR23641871_Aligned.sortedByCoord.out.bam /home/hemiaomiao/NDNFrnaseq/data/SRR23641872_Aligned.sortedByCoord.out.bam /home/hemiaomiao/NDNFrnaseq/data/SRR23641873_Aligned.sortedByCoord.out.bam /home/hemiaomiao/NDNFrnaseq/data/SRR23641874_Aligned.sortedByCoord.out.bam /home/hemiaomiao/NDNFrnaseq/data/SRR23641875_Aligned.sortedByCoord.out.bam /home/hemiaomiao/NDNFrnaseq/data/SRR23641876_Aligned.sortedByCoord.out.bam
下游分析
#featureCounts计数
raw_gene_exp <- read.table("D:/R/NDNF/NDNF-RNAseq/gene_count.txt",header = T,row.names = 1)
gene_exp <- raw_gene_exp[,6:11]
colnames(gene_exp) <- c("treat1","treat2","treat3","control1","control2","control3")library(DESeq2)
library(ggrepel)
library(ggplot2)#行名去除小数点
ENSEMBL <- gsub("\\.\\d*", "", row.names(gene_exp))
row.names(gene_exp) <- ENSEMBL #去除表达量为0的行
gene_exp_filt <- gene_exp[rowSums(gene_exp == 0) != ncol(gene_exp), ]#gene名转换
ensembl <- useMart("ensembl", dataset = "mmusculus_gene_ensembl")
my_ensembl_gene_id<-row.names(gene_exp_filt)
options(timeout=4000000)
devtools::install_version("dbplyr", version = "2.3.4")
mms_symbols<-biomaRt::getBM(attributes=c('ensembl_gene_id','external_gene_name'),filters='ensembl_gene_id',values=list(ensembl_gene_id =my_ensembl_gene_id),mart=ensembl)#合并表达矩阵
gene_exp_filt$gene_id <- rownames(gene_exp_filt)
colnames(gene_exp_filt) <- c("treat1","treat2","treat3","control1","control2","control3","ensembl_gene_id")
readcount<-merge(gene_exp_filt,mms_symbols,by="ensembl_gene_id")
readcount <- readcount[,-1]#合并重复行
library(dplyr)
merged_readcount <- readcount %>%group_by(external_gene_name) %>%summarise(treat1 = sum(treat1, na.rm = TRUE),treat2 = sum(treat2, na.rm = TRUE),treat3 = sum(treat3, na.rm = TRUE),control1 = sum(control1, na.rm = TRUE), # 对control1求和control2 = sum(control2, na.rm = TRUE),control3 = sum(control3, na.rm = TRUE),)
merged_readcount <- merged_readcount[-1,]
row.names(merged_readcount) <- merged_readcount$external_gene_name
rowname <- row.names(merged_readcount)
merged_readcount <- merged_readcount[,-1]
row.names(merged_readcount) <- rowname#构建DDS对象 开始DES分析
#countData差异基因列表 colData
condition <- factor(c(rep("treat",times=3),rep("control",times=3)))
colData = data.frame(row.names = colnames(merged_readcount), condition)
dds <- DESeqDataSetFromMatrix(merged_readcount,colData,design = ~condition)
dds <- DESeq(dds)#提取结果(dds和dds_resuil都是package result从dds分析结果里提取出对比结果)
dds_results <- results(dds, contrast = c("condition","treat","control"))
dds_results <- as.data.frame(dds_results[order(dds_results$pvalue),])
table(dds_results$pvalue<0.05)#绘图进阶
library(RColorBrewer)
library(ggrepel)
dds_results$name <- rownames(dds_results)
dds_results <- dds_results[,c(1,2,3,4,5,7)]
#画的火山图标签有NA 所以需要去除 这一步之前dds_results是29808行 去除后是29806行
dds_results <- na.omit(dds_results)
dds_results$change <- factor(ifelse(dds_results$pvalue < 0.05 & abs(dds_results$log2FoldChange) >= 1,ifelse(dds_results$log2FoldChange >= 1, 'Up', 'Down'),'Stable'),levels = c('Up', 'Down', 'Stable')
)
table(dds_results$change)#上一步的另一种写法
dds_results$change <- "Stable"
dds_results[dds_results$pvalue<0.05 & dds_results$log2FoldChange >= 1,]$change <- "Up"
dds_results[dds_results$pvalue<0.05 & dds_results$log2FoldChange <= -1,]$change <- "Down"ggplot(dds_results,aes(x=log2FoldChange,y= -log10(pvalue),color=change))+geom_point(data = dds_results[dds_results$pvalue<0.05&abs(dds_results$log2FoldChange)>=1,],size = 5)+ #上调或下调的基因圆点大小为5geom_point(data = dds_results[dds_results$pvalue>0.05|abs(dds_results$log2FoldChange)<1,],size = 4)+ #没有变化的大小为4scale_color_manual(values=c("#f38db0","#7eadc9","#d9d9d9"))+ #确定点的颜色geom_text_repel(data = dds_results[dds_results$pvalue<0.05&abs(dds_results$log2FoldChange)>1,],aes(label = name),size = 4.5,color = "black",segment.color = "black", show.legend = FALSE )+ #添加关注的点的基因名ylab('-log10 (pvalue)')+ #修改y轴名称xlab('log2 (FoldChange)')+ #修改x轴名称geom_vline(xintercept=c(-1,1),lty=3,col="black",lwd=0.5) + #添加横线|logFoldChange|>1geom_hline(yintercept = -log10(0.05),lty=3,col="black",lwd=0.5) + #添加竖线padj<0.05theme_bw(base_line_size = 1 )+ #主题设置和坐标轴的粗细geom_label(label = "Down 207 genes", hjust = 0, size = 4, color = "#7eadc9")+theme(axis.title.x = element_text(size = 15, color = "black",face = "bold"),axis.title.y = element_text(size = 15,color = "black",face = "bold", vjust = 1.9, hjust = 0.5, angle = 90),legend.title = element_blank(),legend.text = element_text(color="black", # 设置图例标签文字size = 10, face = "bold"),axis.text.x = element_text(size = 13, # 修改X轴上字体大小,color = "black", # 颜色face = "bold", # face取值:plain普通,bold加粗,italic斜体,bold.italic斜体加粗vjust = 0.5, # 位置hjust = 0.5, angle = 0), #角度axis.text.y = element_text(size = 13, color = "black",face = "bold", vjust = 0.5, hjust = 0.5, angle = 0) )
table(dds_results$change)#普通火山图
p <- ggplot(data = dds_results, aes(x=log2FoldChange,y= -log10(pvalue))) + geom_point(alpha = 0.4, size = 3.5, aes(color = change)) + # 添加散点,根据change列着色ylab("-log10(Pvalue)") + # 设置y轴标签scale_color_manual(values = c("#f38db0","#7eadc9","#d9d9d9")) + geom_vline(xintercept=c(-1,1),lty=3,col="black",lwd=0.5) + #添加横线|logFoldChange|>1geom_hline(yintercept = -log10(0.05),lty=3,col="black",lwd=0.5) + #添加竖线padj<0.05theme_bw()+ # 使用网格白底主题geom_text_repel(data = dds_results[dds_results$pvalue<0.05&abs(dds_results$log2FoldChange)>1,],aes(label = name),size = 4.5,color = "black",segment.color = "black", show.legend = FALSE )+ #添加关注的点的基因名theme(axis.title.x = element_text(size = 15, color = "black",face = "bold"),axis.title.y = element_text(size = 15,color = "black",face = "bold", vjust = 1.9, hjust = 0.5, angle = 90),legend.title = element_blank(),legend.text = element_text(color="black", # 设置图例标签文字size = 10, face = "bold"),axis.text.x = element_text(size = 13, # 修改X轴上字体大小,color = "black", # 颜色face = "bold", # face取值:plain普通,bold加粗,italic斜体,bold.italic斜体加粗vjust = 0.5, # 位置hjust = 0.5, angle = 0), #角度axis.text.y = element_text(size = 13, color = "black",face = "bold", vjust = 0.5, hjust = 0.5, angle = 0) )
table(dds_results$change)