manta+strelka2 call somatic mutation
bgzip -f AgilentV6_GRCh38_ex_region.sort.filtered.bed
tabix -p bed AgilentV6_GRCh38_ex_region.sort.filtered.bed.gz
mkdir {manta,strelka2}
conda activate strelka2
cd manta
perl -ne 'chomp; next if /^$/; @a = split /\t/; print "configManta.py --tumorBam ../align/$a[0]_bqsr.bam --normalBam ../align/$a[1]_bqsr.bam --referenceFasta /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --callRegions /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed.gz --generateEvidenceBam --runDir /data02/zhangmengmeng/BPT/LCM-WES/manta/$a[0] --exome --outputContig \n"' sample_pair.txt > CreateRunManta.sh
./CreateRunManta.sh |grep "runWorkflow.py" > RunManta.sh
nohup ./RunManta.sh > RunManta.log 2>&1 &
cat RunManta.log |grep "Manta workflow successfully completed"
# 注意strelka2的conda版本似乎有问题,需要从github下载后解压使用
cd ../strelka2
perl -ne 'chomp; next if /^$/; @a = split /\t/; print "/data02/zhangmengmeng/software/strelka-2.9.10.centos6_x86_64/bin/configureStrelkaSomaticWorkflow.py --tumorBam ../align/$a[0]_bqsr.bam --normalBam ../align/$a[1]_bqsr.bam --referenceFasta /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --callRegions /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed.gz --indelCandidates /data02/zhangmengmeng/BPT/LCM-WES/manta/$a[0]/results/variants/candidateSmallIndels.vcf.gz --runDir /data02/zhangmengmeng/BPT/LCM-WES/strelka2/$a[0] --exome \n"' sample_pair.txt > CreateRunStrelka2.sh
./CreateRunStrelka2.sh |grep "runWorkflow.py" > RunStrelka2.sh
sed -i 's#$# -m local -j 4#' RunStrelka2.sh
cat RunStrelka2.log |grep "Strelka somatic workflow successfully completed"
运行完后处理:
1.将所有vcf文件copy到新的文件夹
# cp_results.sh
mkdir -p snp indel
# Move and rename somatic.snvs.vcf.gz
find . -path "*/results/variants/somatic.snvs.vcf.gz" | while read file; do
new_name=$(echo "$file" | awk -F'/' '{print $(NF-3) ".somatic.snvs.vcf.gz"}')
cp "$file" "snp/$new_name"
done
# Move and rename somatic.indels.vcf.gz
find . -path "*/results/variants/somatic.indels.vcf.gz" | while read file; do
new_name=$(echo "$file" | awk -F'/' '{print $(NF-3) ".somatic.indels.vcf.gz"}')
cp "$file" "indel/$new_name"
done
2.解压并筛选pass
cd snp
gunzip ./*.gz
cd indel
gunzip ./*.gz
cd ../
# filter_pass.sh:
# Process VCF files in the snp directory
for file in snp/*.vcf; do
grep '^#' "$file" > "${file%.vcf}_filtered.vcf" # Extract headers
awk '$7 == "PASS"' "$file" >> "${file%.vcf}_filtered.vcf" # Append PASS rows
done
# Process VCF files in the indel directory
for file in indel/*.vcf; do
grep '^#' "$file" > "${file%.vcf}_filtered.vcf" # Extract headers
awk '$7 == "PASS"' "$file" >> "${file%.vcf}_filtered.vcf" # Append PASS rows
done
varscan2 call somatic mutation
cd align
ls *_bqsr.bam|perl -ne 'chomp;if($_=~/(\S+)\_bqsr/){print "samtools view -b -u -q 20 $1_bqsr.bam | samtools mpileup -f /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa - \> $1.mpileup && echo $1 ok\n"}' > samtools_mpileup.sh
cd varscan2
# 同时运行varscan somatic + varscan processSomatic:
perl -ne 'chomp; next if /^$/; @a = split /\t/; print "varscan somatic ../../align/$a[1].mpileup ../../align/$a[0].mpileup $a[0] --strand-filter 1 --min-coverage 100 --min-reads 3 --min-var-freq 0.05 --p-value 0.05 --output-vcf 1 && varscan processSomatic $a[0].snp.vcf --min-tumor-freq 0.05 --max-normal-freq 0.05 --p-value 0.05 && varscan processSomatic $a[0].indel.vcf --min-tumor-freq 0.05 --max-normal-freq 0.05 --p-value 0.05 && echo $a[0] varscan2 ok\n"' ../../somatic/sample_pair.txt > varscan2.sh
# 单独运行varscan2后processSomatic:
perl -ne 'chomp; next if /^$/; @a = split /\t/; print "varscan somatic ../../align/$a[1].mpileup ../../align/$a[0].mpileup $a[0] --strand-filter 1 --min-coverage 250 --min-reads 3 --min-var-freq 0.05 --p-value 0.05 --output-vcf 1 && echo $a[0] varscan2 ok\n"' ../../somatic/sample_pair.txt > varscan2.sh
perl -ne 'chomp; next if /^$/; @a = split /\t/; print "varscan processSomatic $a[0].snp.vcf --min-tumor-freq 0.05 --max-normal-freq 0.05 --p-value 0.05 && varscan processSomatic $a[0].indel.vcf --min-tumor-freq 0.05 --max-normal-freq 0.05 --p-value 0.05 && echo $a[0] varscan2 processSomatic ok\n"' ../../somatic/sample_pair.txt > varscan2_processSomatic.sh
# varscan结束后运行vcf2maf:
conda activate vcf2maf
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "vcf2maf.pl --input-vcf $a[0].snp.Somatic.hc.vcf --output-maf $a[0].snp.Somatic.hc.maf --normal-id $a[1] --tumor-id $a[0] --vcf-tumor-id TUMOR --vcf-normal-id NORMAL --vep-path /home/zhoukaiwen/software/anaconda3/envs/vcf2maf/bin --vep-data /home/zhoukaiwen/database/vep/ --ref-fasta /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa --verbose --ncbi-build GRCh38 --cache-version 114 && vcf2maf.pl --input-vcf $a[0].snp.Germline.hc.vcf --output-maf $a[0].snp.Germline.hc.maf --normal-id $a[1] --tumor-id $a[0] --vcf-tumor-id TUMOR --vcf-normal-id NORMAL --vep-path /home/zhoukaiwen/software/anaconda3/envs/vcf2maf/bin --vep-data /home/zhoukaiwen/database/vep/ --ref-fasta /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa --verbose --ncbi-build GRCh38 --cache-version 114 && vcf2maf.pl --input-vcf $a[0].snp.LOH.hc.vcf --output-maf $a[0].snp.LOH.hc.maf --normal-id $a[1] --tumor-id $a[0] --vcf-tumor-id TUMOR --vcf-normal-id NORMAL --vep-path /home/zhoukaiwen/software/anaconda3/envs/vcf2maf/bin --vep-data /home/zhoukaiwen/database/vep/ --ref-fasta /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa --verbose --ncbi-build GRCh38 --cache-version 114 && vcf2maf.pl --input-vcf $a[0].indel.Somatic.hc.vcf --output-maf $a[0].indel.Somatic.hc.maf --normal-id $a[1] --tumor-id $a[0] --vcf-tumor-id TUMOR --vcf-normal-id NORMAL --vep-path /home/zhoukaiwen/software/anaconda3/envs/vcf2maf/bin --vep-data /home/zhoukaiwen/database/vep/ --ref-fasta /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa --verbose --ncbi-build GRCh38 --cache-version 114 && vcf2maf.pl --input-vcf $a[0].indel.Germline.hc.vcf --output-maf $a[0].indel.Germline.hc.maf --normal-id $a[1] --tumor-id $a[0] --vcf-tumor-id TUMOR --vcf-normal-id NORMAL --vep-path /home/zhoukaiwen/software/anaconda3/envs/vcf2maf/bin --vep-data /home/zhoukaiwen/database/vep/ --ref-fasta /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa --verbose --ncbi-build GRCh38 --cache-version 114 && vcf2maf.pl --input-vcf $a[0].indel.LOH.hc.vcf --output-maf $a[0].indel.LOH.hc.maf --normal-id $a[1] --tumor-id $a[0] --vcf-tumor-id TUMOR --vcf-normal-id NORMAL --vep-path /home/zhoukaiwen/software/anaconda3/envs/vcf2maf/bin --vep-data /home/zhoukaiwen/database/vep/ --ref-fasta /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa --verbose --ncbi-build GRCh38 --cache-version 114 && echo $a[0] vcf2maf ok\n"' ../../somatic/sample_pair.txt >vcf2maf.sh
# 批量运行Annovar
ls *.Somatic.hc.vcf | perl -ne 'chomp; my $name = $1 if ($_ =~ /([^\/]+)\.Somatic\.hc\.vcf/); print "table_annovar.pl $name.Somatic.hc.vcf /home/zhoukaiwen/database/Annovar/humandb_hg38 -buildver hg38 -out $name.Somatic.hc -remove -protocol refGene,cytoBand,avsnp151,exac03,gnomad41_exome,EAS.sites.2015_08,cosmic102_unique -operation g,r,f,f,f,f,f -nastring . -vcfinput \n"'>annovar.sh
# 把Annovar注释后vcf转换成maf
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "vcf2maf.pl --input-vcf $a[0].snp.Somatic.hc.hg38_multianno.vcf --output-maf $a[0].snp.Somatic.hc.hg38_multianno.maf --normal-id $a[1] --tumor-id $a[0] --vcf-tumor-id TUMOR --vcf-normal-id NORMAL --vep-path /home/zhoukaiwen/software/anaconda3/envs/vcf2maf/bin --vep-data /home/zhoukaiwen/database/vep/ --ref-fasta /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa --verbose --ncbi-build GRCh38 --cache-version 114 && cat $a[0].snp.Somatic.hc.hg38_multianno.maf|grep -E \"version|FILTER|PASS\" > $a[0].snp_pass.Somatic.hc.hg38_multianno.maf && vcf2maf.pl --input-vcf $a[0].indel.Somatic.hc.hg38_multianno.vcf --output-maf $a[0].indel.Somatic.hc.hg38_multianno.maf --normal-id $a[1] --tumor-id $a[0] --vcf-tumor-id TUMOR --vcf-normal-id NORMAL --vep-path /home/zhoukaiwen/software/anaconda3/envs/vcf2maf/bin --vep-data /home/zhoukaiwen/database/vep/ --ref-fasta /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa --verbose --ncbi-build GRCh38 --cache-version 114 && cat $a[0].indel.Somatic.hc.hg38_multianno.maf|grep -E \"version|FILTER|PASS\" > $a[0].indel_pass.Somatic.hc.hg38_multianno.maf && echo $a[0] vcf2maf ok\n"' ../../somatic/sample_pair.txt >vcf2maf2.sh