0. 新建文件夹
mkdir {rawdata, align, macs3}
1. 数据质控+过滤同WES & RNA-seq
fastqc ./*.gz
multiqc ./*.zip
# 必要时使用Trimmomatic/trim_galore去接头,质控
2. BWA序列比对 + MarkDuplicates + 去除低质量序列 + 去除线粒体基因
# bwa mem比对+samtools排序+MarkDuplicates去除pcr重复
# ATAC需额外使用samtools view -h -f 2 -q 30来只保留两条reads要比对到同一条染色体(Proper paired) ,还有高质量的比对结果(Mapping quality>=30)
# 线粒体基因组由于没有染色质组装,处于开放状态,更容易被Tn5酶切割,所以线粒体序列需要去除
cd align
ls ../rawdata/*.Cleandata.R1.fq.gz|perl -ne 'chomp;my $name=$1 if($_=~/\/([^\/]+)\.Cleandata/);print "bwa mem -t 10 -R \"\@RG\\tID:$name\\tSM:$name\\tLB:ATAC\\tPL:Illumina\" /data02/zhangmengmeng/database/hg38/hg38.fa ../rawdata/$name\.Cleandata.R1.fq.gz ../rawdata/$name\.Cleandata.R2.fq.gz | samtools view -bS - >$name.bam && samtools sort -@ 5 -o $name.sort.bam $name.bam && gatk --java-options \"-Xmx20G -Djava.io.tmpdir=./\" MarkDuplicates --INPUT $name.sort.bam --OUTPUT $name.sort.markdup.bam -M $name.metrics --VALIDATION_STRINGENCY LENIENT && samtools index $name.sort.markdup.bam && samtools view -h -f 2 -q 30 $name.sort.markdup.bam | grep -v chrM | samtools view -bS -o $name.sort.markdup.final.bam && samtools index $name.sort.markdup.final.bam && echo $name ok\n"' >aln.sh
peak 可视化
# 先对fasta.fai文件进行处理
cut -f1,2 gencode_GRCh38.p14.genome.fa.fai > gencode_GRCh38.p14.genome.chrom.sizes
# 通过igvtools将bam处理成tdf文件,导入本地IGV可视化
igvtools count BC_1225T.sort.markdup.final.bam BC_1225T.sort.markdup.final.tdf /data02/zhangmengmeng/database/hg38/hg38.chrom.sizes
3. macs3 callpeak
# 软件使用说明:
## 寻找ChIP-seq的转录因子(TF)的命令(生成的文件是narrowPeak):
macs3 callpeak -t ChIP.bam -c Control.bam -f BAM -g hs -n test -B -q 0.01
## 寻找ChIP-seq的组蛋白(Histone )Mark的命令(生成的文件是broadPeak):
macs3 callpeak -t ChIP.bam -c Control.bam --broad -g hs --broad-cutoff 0.1
## 对于ATAC-seq双端数据的操作:
macs3 callpeak -f BAMPE -t ATAC.bam -g hs -n test -B -q 0.01
-f是输入文件的格式,可以是BAM、BED,软件可以自动识别,但是需要注意自动识别是无法区分是PE还是SE,所以建议手动指定;
-g是有效基因组大小,软件预设了模式物种的有效因组大小,如果hs人,mm小鼠;
-n是样本名字;
-B是以bedGraph格式存放fragment pileup, control lambda, -log10pvalue 和log10qvale,使用会增长耗时;
-q就是用q值(FDR)进行筛选,输入得到值为阈值;
broad-cutoff用于过滤 broad得到的peak。
--nolambda: 设置这个参数就意味着不用MACS推荐的动态lambda,而是使用背景lambda作为local lambda,也就是不考虑染色质结构等造成的局部偏误。因为ATAC没有control,因此需要用背景lambda作为local lambda,否则会丢失许多开放的染色体区域
--keep-dup: 保留重复。默认MACS(auto)会使用二项分布估计每个位置上是否存在重复(默认是1,也就是每个位置上出现一个read的概率最大)。如果你前期已经去重,那就使用all省了这一步.
# Peak calling 最后输出文件需要去除 ENCODE 的 blacklist(由于NGS测序读长的限制,目前的参考基因组有一些测不准的区域,即黑名单区域(Blacklist regions)。这些区域的有时候会具有高信号的富集,影响我们peak calling的结果。 )
bedtools intersect -v -a NAME_peaks.narrowPeak -b /data02/zhangmengmeng/database/ENCODE_BlackList/ENCFF356LFX.bed > FILTERED.narrowPeak
流程化代码:
cd align
conda activate macs3
ls *.sort.markdup.final.bam|perl -ne 'chomp;if($_=~/(\S+)\.sort/){print "macs3 callpeak -f BAMPE -t $1.sort.markdup.final.bam -g hs -n $1 -q 0.05 -B --keep-dup all --nolambda --nomodel --shift -100 --extsize 200 && bedtools intersect -v -a $1_peaks.narrowPeak -b /data02/zhangmengmeng/database/ENCODE_BlackList/ENCFF356LFX.bed > $1_filtered.narrowPeak && echo $1 ok\n"}' > macs3.sh
?VarCA 检测变异?
4. 基于homer的motif分析+注释(或者用MEME )
基础命令:
# findMotifsGenome.pl <peak/BED file> <genome> <output directory> -size # [options]
findMotifsGenome.pl /data02/zhangmengmeng/HR_pos/ATAC/align/peaks.bed hg38 /data02/zhangmengmeng/HR_pos/ATAC/homer -p 8 -size 200
annotatePeaks.pl /data02/zhangmengmeng/HR_pos/ATAC/align/peaks.bed hg38 > peak.annotation.xls
流程化处理:
ls ../align/*_summits.bed | perl -ne 'chomp; if ($_ =~ m|../align/(\S+)_summits.bed|) { print "annotatePeaks.pl /data02/zhangmengmeng/HR_pos/ATAC/align/$1_summits.bed hg38 > /data02/zhangmengmeng/HR_pos/ATAC/homer/$1_peak.annotation.xls && echo $1 ok\n"; }' > homer_annotatePeaks.sh
ls ../align/*_summits.bed | perl -ne 'chomp; if ($_ =~ m|../align/(\S+)_summits.bed|) { print "findMotifsGenome.pl /data02/zhangmengmeng/HR_pos/ATAC/align/$1_summits.bed hg38 /data02/zhangmengmeng/HR_pos/ATAC/homer/$1 -p 8 -size 200 && echo $1 ok\n"; }' > homer_findMotifsGenome.sh
5.1 下游分析1-差异分析(edgeR/DESeq2)
5.2 下游分析1-差异分析(DiffBind)
5.3 下游分析1-差异分析(macs3)
# 先根据分组合并bam文件跑macs3 callpeak
# ATAC需要设置--nolambda,否则会因为自己的"local background" 清除大多数染色体可及区域: https://github.com/macs3-project/MACS/issues/412
# 参数推荐: https://github.com/macs3-project/MACS/discussions/435
macs3 callpeak -f BAMPE -t ../../align/BC_1265T.sort.markdup.final.bam ../../align/BC_1652T.sort.markdup.final.bam ../../align/BC_874T.sort.markdup.final.bam -g hs -n Responsed -B -q 0.05 --keep-dup all --nolambda --nomodel --shift -100 --extsize 200 --outdir ./
macs3 callpeak -f BAMPE -t ../../align/BC_1225T.sort.markdup.final.bam ../../align/BC_850T.sort.markdup.final.bam ../../align/BC_1321T.sort.markdup.final.bam ../../align/BC_1604T.sort.markdup.final.bam -g hs -n UnResponsed -B -q 0.05 --keep-dup all --nolambda --nomodel --shift -100 --extsize 200 --outdir ./
# 输出结果文件:
# The NAME_treat_pileup.bdg and NAME_control_lambda.bdg files are in bedGraph format which can be imported to the UCSC genome browser or be converted into even smaller bigWig files.
# The NAME_treat_pielup.bdg contains the pileup signals (normalized according to --scale-to option) from ChIP/treatment sample.
# The NAME_control_lambda.bdg contains local biases estimated for each genomic location from the control sample, or from treatment sample when the control sample is absent.
# The subcommand bdgcmp can be used to compare these two files and make a bedGraph file of scores such as p-value, q-value, log-likelihood, and log fold changes.
$ macs3callpeakMerged.log Responsed_control_lambda.bdg Responsed_peaks.xls Responsed_treat_pileup.bdg UnResponsed_peaks.narrowPeak UnResponsed_summits.bed
$ macs3callpeakMerged.sh Responsed_peaks.narrowPeak Responsed_summits.bed UnResponsed_control_lambda.bdg UnResponsed_peaks.xls UnResponsed_treat_pileup.bdg
# 从输出结果文件中导出有效测序深度(effective sequencing depths)
egrep "fragments after filtering in treatment" Responsed_peaks.xls # 131052489
egrep "fragments after filtering in treatment" UnResponsed_peaks.xls # 159367706
egrep "total fragments in treatment" Responsed_peaks.xls # 158401364
egrep "total fragments in treatment" UnResponsed_peaks.xls # 186738400
# 运行macs3 bdgdiff
macs3 bdgdiff --t1 ../macs3bdg_callpeak_merged/Responsed_treat_pileup.bdg --c1 ../macs3bdg_callpeak_merged/Responsed_control_lambda.bdg --t2 ../macs3bdg_callpeak_merged/UnResponsed_treat_pileup.bdg --c2 ../macs3bdg_callpeak_merged/UnResponsed_control_lambda.bdg --d1 131052489 --d2 159367706 --outdir ./ --o-prefix diff_R_vs_UR
# 输出结果
# con1.bed保存了在condition1中(即bdgdiff中的t1和c1)上调的peak
# con2.bed保存了在condition2中(即bdgdiff中的t2和c2)上调的peak
# common.bed文件中保存的是没有达到阈值的,非显著差异peak
# 最后一列的内容为log10 likehood ratio值,用来衡量两个条件之间的差异,默认阈值为3,大于阈值的peak为组间差异显著的peak, 这个阈值可以通过-c参数进行调整
$ diff_R_vs_UR_c3.0_common.bed diff_R_vs_UR_c3.0_cond1.bed diff_R_vs_UR_c3.0_cond2.bed
# homer注释结果:
ls *.bed | perl -ne 'chomp; if ($_ =~ m|(\S+).bed|) { print "annotatePeaks.pl $1_summits.bed hg38 > $1_peak.annotation.xls && echo $1 ok\n"; }' > homer_annotatePeaks.sh
6. 下游分析2-峰注释
library(ChIPseeker)
library(GenomicFeatures)
#生成txdb对象,如果研究物种没有已知的TxDb,可以用GenomicFeatures中的函数生成
txdb<- makeTxDbFromGFF(‘gene.gtf’)
#导入需要注释的peak文件
peakfile <-readPeakFile(‘A1_peaks.narrowPeak’)
peakAnno <- annotatePeak(peakfile,tssRegion=c(-2000, 2000), TxDb=txdb)
# 用peak文件和txdb进行peak注释,这里可以通过tssRegion定义TSS区域的区间
#对于peak注释的结果,也可以进行可视化展示,如:
p <- plotAnnoPie(peakAnno)
7. 富集分析(goseq、topGO等R包)