1. 下载Annovar软件本体
https://annovar.openbioinformatics.org/en/latest/
2. 下在Annovar所需的注释文件
annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avdblist humandb_hg38
annotate_variation.pl -buildver hg38 -downdb -webfrom annovar refGene humandb_hg38
annotate_variation.pl -buildver hg38 -downdb cytoBand humandb_hg38
annotate_variation.pl -buildver hg38 -downdb -webfrom annovar avsnp151 humandb_hg38
annotate_variation.pl -buildver hg38 -downdb -webfrom annovar genomad41_exome humandb_hg38
# https://annovar.openbioinformatics.org/en/latest/user-guide/filter/#cosmic-annotations
# Prepare COSMIC database (COSMIC > 100)
tar xvf Cosmic_GenomeScreensMutant_Vcf_v100_GRCh38.tar
tar xvf Cosmic_GenomeScreensMutant_Tsv_v100_GRCh38.tar
gunzip Cosmic_GenomeScreensMutant_v100_GRCh38.vcf.gz
gunzip Cosmic_GenomeScreensMutant_v100_GRCh38.tsv.gz
tar xvf Cosmic_NonCodingVariants_Tsv_v100_GRCh38.tar
tar xvf Cosmic_NonCodingVariants_Vcf_v100_GRCh38.tar
gunzip Cosmic_NonCodingVariants_v100_GRCh38.vcf.gz
gunzip Cosmic_NonCodingVariants_v100_GRCh38.tsv.gz
# Run in Linux
echo -e '#Chr\tStart\tEnd\tRef\tAlt\tCOSMIC100' > hg38_cosmic100_raw.txt
prepare_annovar_user.pl -dbtype cosmic Cosmic_GenomeScreensMutant_v100_GRCh38.tsv -vcf Cosmic_GenomeScreensMutant_v100_GRCh38.vcf >> hg38_cosmic100_raw.txt
prepare_annovar_user.pl -dbtype cosmic Cosmic_NonCodingVariants_v100_GRCh38.tsv -vcf Cosmic_NonCodingVariants_v100_GRCh38.vcf >> hg38_cosmic100_raw.txt
index_annovar.pl hg38_cosmic100_raw.txt -outfile hg38_cosmic100.txt
# 注意手动构建的COSMIC注释文件会有重复行,运行annovar会报错
# 参考:https://github.com/WGLab/doc-ANNOVAR/issues/121
# 首先检查是否有重复行
awk '{count[$0]++} END {for (line in count) if (count[line] > 1) print count[line], line}' hg38_EAS.sites.2015_08.txt
# 以下脚本去重(未实际运行):
input_file="hg38_EAS.sites.2015_08.txt"
output_file="hg38_EAS.sites.2015_08.unique.txt"
temp_file=$(mktemp)
awk '
BEGIN {
FS = OFS = "\t";
}
{
key = $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5;
match($0, /OCCURENCE=([0-9]+)/, arr);
occ = arr[1];
if (seen[key] == "") {
seen[key] = occ;
max_line[key] = $0;
} else if (occ > seen[key]) {
seen[key] = occ;
max_line[key] = $0;
}
}
END {
for (k in max_line) {
print max_line[k];
}
}
' "$input_file" > "$temp_file"
mv "$temp_file" "$output_file"
echo "De-duplication completed, results are saved in $output_file"
3. 对vcf文件运行Annovar
table_annovar.pl 952T_wes/952T.purple.somatic.vcf.gz /home/zhoukaiwen/database/annovar/humandb_hg38 -buildver hg38 -out 952T -remove -protocol refGene,cytoBand,avsnp151,cosmic70,exac03,gnomad41_exome -operation g,r,f,f,f,f -nastring . -vcfinput
注意-operation的参数设置:
- g: Gene-based annotation (e.g., functional annotation), used in refGene, ensGene
- r: Region-based annotation (annotate specific regions), used in cytoBand, genomicSuperDups
- f: Filter-based annotation (compare against databases), used in 1000g, avsnp, clinvar
批量运行
cd somatic
ls *somatic.passed.vcf | perl -ne 'chomp; my $name = $1 if ($_ =~ /([^\/]+)\.somatic\.passed\.vcf/); print "table_annovar.pl $name.somatic.passed.vcf /data02/zhangmengmeng/database/annovar_db/humandb_hg38 -buildver hg38 -out $name -remove -protocol refGene,cytoBand,avsnp151,cosmic70,exac03,gnomad41_exome -operation g,r,f,f,f,f -nastring . -vcfinput \n"'>annovar.sh
4. 合并多个样本的Annovar结果
for i in *.hg38_multianno.txt
do
sample=`echo $i | awk -F '.' '{print $1}'`
cut -f '1-10' $i | sed '1d' | sed "s/$/\t${sample}/" >> all_sample.txt
done
sed -i '1s/^/Chr\tStart\tEnd\tRef\tAlt\tFunc.refGene\tGene.refGene\tGeneDetail.refGene\tExonicFunc.refGene\tAAChange.refGene\tTumor_Sample_Barcode\n/' all_sample.txt
5. 在R中使用maftools绘制合并的结果
library(maftools)
var_maf= annovarToMaf(annovar = "all_sample.txt",
Center = 'NA',
refBuild = 'hg38',
tsbCol = 'Tumor_Sample_Barcode',
table = 'refGene',MAFobj =T,
sep = "\t")
plotmafSummary(maf = var_maf, rmOutlier = TRUE, addStat = 'median')
oncoplot(var_maf)