1.使用mutect按病人同时处理样本
gatk Mutect2 -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa -I ../align/FETB01-BNPT-E_bqsr.bam -I ../align/FETB01-BNPT-M_bqsr.bam -I ../align/FETB01-FA-E_bqsr.bam -I ../align/FETB01-FA-M_bqsr.bam -I ../align/FETB01-B_bqsr.bam -normal FETB01-B -L /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed --germline-resource /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_af-only-gnomad.hg38.vcf.gz -pon /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_1000g_pon.hg38.vcf.gz --f1r2-tar-gz FETB01.f1r2.tar.gz -O FETB01.raw.vcf && gatk LearnReadOrientationModel -I FETB01.f1r2.tar.gz -O FETB01.read-orientation-model.tar.gz && gatk GetPileupSummaries -I ../align/FETB01-BNPT-E_bqsr.bam -I ../align/FETB01-BNPT-M_bqsr.bam -I ../align/FETB01-FA-E_bqsr.bam -I ../align/FETB01-FA-M_bqsr.bam -L /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -V /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_af-only-gnomad.hg38.vcf.gz -O FETB01.pileups.table && gatk GetPileupSummaries -I ../align/FETB01-B_bqsr.bam -L /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -V /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_af-only-gnomad.hg38.vcf.gz -O FETB01-B.pileups.table && gatk CalculateContamination -I FETB01.pileups.table -matched FETB01-B.pileups.table -O FETB01.CalculateContamination.table && gatk FilterMutectCalls -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa -V FETB01.raw.vcf --contamination-table FETB01.CalculateContamination.table --ob-priors FETB01.read-orientation-model.tar.gz -O FETB01.somatic.vcf && gatk FilterAlignmentArtifacts -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --bwa-mem-index-image /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa.img -V FETB01.somatic.vcf -O FETB01.somatic.FAA.vcf -I ../align/FETB01-BNPT-E_bqsr.bam -I ../align/FETB01-BNPT-M_bqsr.bam -I ../align/FETB01-FA-E_bqsr.bam -I ../align/FETB01-FA-M_bqsr.bam && awk '/^#/ {print $0; next} $7=="PASS" {print $0}' FETB01.somatic.FAA.vcf > ./FETB01.somatic.passed.vcf && echo FETB01 somatic ok
gatk Mutect2 -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa -I ../align/FETB02-BNPT-E_bqsr.bam -I ../align/FETB02-BNPT-M_bqsr.bam -I ../align/FETB02-FA-E_bqsr.bam -I ../align/FETB02-FA-M_bqsr.bam -I ../align/FETB02-B_bqsr.bam -normal FETB02-B -L /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed --germline-resource /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_af-only-gnomad.hg38.vcf.gz -pon /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_1000g_pon.hg38.vcf.gz --f1r2-tar-gz FETB02.f1r2.tar.gz -O FETB02.raw.vcf && gatk LearnReadOrientationModel -I FETB02.f1r2.tar.gz -O FETB02.read-orientation-model.tar.gz && gatk GetPileupSummaries -I ../align/FETB02-BNPT-E_bqsr.bam -I ../align/FETB02-BNPT-M_bqsr.bam -I ../align/FETB02-FA-E_bqsr.bam -I ../align/FETB02-FA-M_bqsr.bam -L /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -V /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_af-only-gnomad.hg38.vcf.gz -O FETB02.pileups.table && gatk GetPileupSummaries -I ../align/FETB02-B_bqsr.bam -L /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -V /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_af-only-gnomad.hg38.vcf.gz -O FETB02-B.pileups.table && gatk CalculateContamination -I FETB02.pileups.table -matched FETB02-B.pileups.table -O FETB02.CalculateContamination.table && gatk FilterMutectCalls -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa -V FETB02.raw.vcf --contamination-table FETB02.CalculateContamination.table --ob-priors FETB02.read-orientation-model.tar.gz -O FETB02.somatic.vcf && gatk FilterAlignmentArtifacts -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --bwa-mem-index-image /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa.img -V FETB02.somatic.vcf -O FETB02.somatic.FAA.vcf -I ../align/FETB02-BNPT-E_bqsr.bam -I ../align/FETB02-BNPT-M_bqsr.bam -I ../align/FETB02-FA-E_bqsr.bam -I ../align/FETB02-FA-M_bqsr.bam && awk '/^#/ {print $0; next} $7=="PASS" {print $0}' FETB02.somatic.FAA.vcf > ./FETB02.somatic.passed.vcf && echo FETB02 somatic ok
gatk Mutect2 -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa -I ../align/FETB03-BNPT-E_bqsr.bam -I ../align/FETB03-BNPT-M_bqsr.bam -I ../align/FETB03-FA-E_bqsr.bam -I ../align/FETB03-FA-M_bqsr.bam -I ../align/FETB03-B_bqsr.bam -normal FETB03-B -L /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed --germline-resource /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_af-only-gnomad.hg38.vcf.gz -pon /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_1000g_pon.hg38.vcf.gz --f1r2-tar-gz FETB03.f1r2.tar.gz -O FETB03.raw.vcf && gatk LearnReadOrientationModel -I FETB03.f1r2.tar.gz -O FETB03.read-orientation-model.tar.gz && gatk GetPileupSummaries -I ../align/FETB03-BNPT-E_bqsr.bam -I ../align/FETB03-BNPT-M_bqsr.bam -I ../align/FETB03-FA-E_bqsr.bam -I ../align/FETB03-FA-M_bqsr.bam -L /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -V /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_af-only-gnomad.hg38.vcf.gz -O FETB03.pileups.table && gatk GetPileupSummaries -I ../align/FETB03-B_bqsr.bam -L /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.filtered.bed -V /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/somatic-hg38_af-only-gnomad.hg38.vcf.gz -O FETB03-B.pileups.table && gatk CalculateContamination -I FETB03.pileups.table -matched FETB03-B.pileups.table -O FETB03.CalculateContamination.table && gatk FilterMutectCalls -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa -V FETB03.raw.vcf --contamination-table FETB03.CalculateContamination.table --ob-priors FETB03.read-orientation-model.tar.gz -O FETB03.somatic.vcf && gatk FilterAlignmentArtifacts -R /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa --bwa-mem-index-image /data02/zhangmengmeng/database/hg38/gencode_GRCh38.p14.genome.fa.img -V FETB03.somatic.vcf -O FETB03.somatic.FAA.vcf -I ../align/FETB03-BNPT-E_bqsr.bam -I ../align/FETB03-BNPT-M_bqsr.bam -I ../align/FETB03-FA-E_bqsr.bam -I ../align/FETB03-FA-M_bqsr.bam && awk '/^#/ {print $0; next} $7=="PASS" {print $0}' FETB03.somatic.FAA.vcf > ./FETB03.somatic.passed.vcf && echo FETB03 somatic ok
2.使用 Annovar 对每个病人的多样本vcf进行注释
ls *somatic.passed.vcf | perl -ne 'chomp; my $name = $1 if ($_ =~ /([^\/]+)\.somatic\.passed\.vcf/); print "table_annovar.pl $name.somatic.passed.vcf /data02/zhangmengmeng/database/annovar_db/humandb_hg38 -buildver hg38 -out $name -remove -protocol refGene,cytoBand,avsnp151,cosmic70,exac03 -operation g,r,f,f,f -nastring . -vcfinput \n"'>annovar.sh
3.对注释后的vcf进行提取,获取基因名,突变点位,样本名,t_ref_count及t_alt_count
(echo -e "CHROM\tPOS\tREF\tALT\tGENE\t$(bcftools query -l FETB01.hg38_multianno.vcf | tr '\n' '\t')" && bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%INFO/Gene.refGene[\t%AD]\n' FETB01.hg38_multianno.vcf) > ../pyclonevi/FETB01_mut.tsv
(echo -e "CHROM\tPOS\tREF\tALT\tGENE\t$(bcftools query -l FETB02.hg38_multianno.vcf | tr '\n' '\t')" && bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%INFO/Gene.refGene[\t%AD]\n' FETB02.hg38_multianno.vcf) > ../pyclonevi/FETB02_mut.tsv
(echo -e "CHROM\tPOS\tREF\tALT\tGENE\t$(bcftools query -l FETB03.hg38_multianno.vcf | tr '\n' '\t')" && bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%INFO/Gene.refGene[\t%AD]\n' FETB03.hg38_multianno.vcf) > ../pyclonevi/FETB03_mut.tsv
4.运行脚本转换为pyclonevi的输入格式
ls *_mut.tsv | perl -ne 'chomp;if($_=~/(\S+)\_mut\.tsv/){print "PreparePycloneVIInput -sampleid $1 -mut $1_mut.tsv -facets_cncf ../facets/$1-BNPT-E_cncf.xls,../facets/$1-BNPT-M_cncf.xls,../facets/$1-FA-E_cncf.xls,../facets/$1-FA-M_cncf.xls && echo $1 ok\n"}' > RunPycloneVIPrepare.sh
5.运行pyclonevi
ls *_pyclonevi_input.tsv | perl -ne 'chomp;if($_=~/(\S+)\_pyclonevi\_input\.tsv/){print "pyclone-vi fit -i $1_pyclonevi_input\.tsv -o $1.h5 -c 40 -d beta-binomial -r 10 && pyclone-vi write-results-file -i $1.h5 -o $1.pyclonevi_results.tsv && echo $1 ok\n"}' > RunPycloneVI.sh
PreparePycloneVIInput 脚本
#!/usr/bin/env python
"""
This is an open source Script written by K.Z. to prepare Pyclone-VI input files
from Merged filtered PASS and Annovar annotated vcf file and extracted counts
and facets called cncf file
Input example:
(echo -e "CHROM\tPOS\tREF\tALT\tGENE\t$(bcftools query -l FETB01.hg38_multianno.vcf | tr '\n' '\t')" && bcftools query -f '%CHROM\t%POS\t%REF\t%ALT\t%INFO/Gene.refGene[\t%AD]\n' FETB01.hg38_multianno.vcf) > ../pyclonevi/FETB01_mut.tsv
"""
import argparse
import pandas as pd
import os
import numpy as np
def main():
# Create a Custom ArgumentParser
parser = argparse.ArgumentParser(description='Create Pyclone-VI input file from MAF file and facets called cncf file')
# Set Input Argument
parser.add_argument('-sampleid', dest='sampleid', required=True, help="Sample name: EXAMPLE")
parser.add_argument('-mut', dest='mutfile', required=True, help="Merged filtered PASS and Annovar annotated vcf file and extracted counts: EXAMPLE_filtered.maf")
parser.add_argument('-facets_cncf', dest='facetsfile', required=True, help="Merged cncf file of facets R output, with header and rownames(can have multiple inputs): EXAMPLE_1_cncf.xls,EXAMPLE_2_cncf.xls,EXAMPLE_3_cncf.xls,EXAMPLE_4_cncf.xls")
parser.add_argument("-V", "--version", action="version", version="Pyclone-VI Input Prepare Version 1.0")
# Parse Arguments
args = parser.parse_args()
# Obtain input and output file names
sample_id = args.sampleid
mut_filename = args.mutfile
facets_filename = args.facetsfile
output_filename = str(sample_id)+"_pyclonevi_input.tsv"
# Load all files first
mut_file = pd.read_csv(mut_filename, sep='\t', header=0,comment='#')
mut_file = mut_file.loc[:, ~mut_file.columns.str.contains("^Unnamed")]
mut_dict = mut_file.to_dict(orient='records')
facets_filenames = facets_filename.split(",")
facets_dfs = []
for filename in facets_filenames:
df = pd.read_csv(filename, sep='\t', header=0, index_col=0, comment='#')
# Extract sample name (assuming it's before '_cncf.xls')
basename = os.path.basename(filename)
sample_name = basename.split("_cncf.xls")[0]
# Add a new column for the sample name
df["Sample"] = sample_name
# Append the DataFrame to the list
facets_dfs.append(df)
facets_file = pd.concat(facets_dfs, ignore_index=True)
facets_file['lcn.em'].fillna(1, inplace=True)
facets_file['chrom'] = 'chr' + facets_file['chrom'].astype(str)
facets_file['chrom'] = facets_file['chrom'].replace('chr23', 'chrX')
facets_dict = facets_file[['Sample','chrom','start','end','tcn.em','lcn.em']].to_dict(orient='records')
facets_lookup = {}
for entry in facets_dict:
key = (entry['chrom'], entry['Sample']) # Key by chromosome and sample name
if key not in facets_lookup:
facets_lookup[key] = []
facets_lookup[key].append(entry) # Store all matching segments
# Convert mut_dict into the new format
final_list = []
for mut in mut_dict:
chrom = mut['CHROM']
# Extract sample names and remove the first sample
sample_names = [s for s in mut.keys() if s not in ['CHROM', 'POS', 'REF', 'ALT', 'GENE']]
if sample_names:
sample_names.pop(0) # Remove first sample
for sample in sample_names:
ref_alt = mut[sample].split(',')
ref_counts, alt_counts = ref_alt[0], ref_alt[1] # Extract ref and alt counts
new_entry = {
'CHROM': chrom,
'POS': mut['POS'],
'REF': mut['REF'],
'ALT': mut['ALT'],
'GENE': mut['GENE'],
'sample_id': sample,
'ref_counts': ref_counts,
'alt_counts': alt_counts
}
# Merge with facets_dict if available
if (chrom, sample) in facets_lookup:
for segment in facets_lookup[(chrom, sample)]:
if segment['start'] <= mut['POS'] <= segment['end']: # Check POS in range
new_entry['tcn.em'] = segment['tcn.em']
new_entry['lcn.em'] = segment['lcn.em']
final_list.append(new_entry)
# 去除没有'tcn.em'的
final_list = [item for item in final_list if 'tcn.em' in item]
for item in final_list:
# 添加 mutation id
item['mutation_id'] = item['GENE'] + ":" + str(item['POS']) + ":" + str(item['ALT'])
# 计算 mcn.em
item['mcn.em'] = item['tcn.em'] - item['lcn.em']
# 添加 ncn
item['ncn'] = 2
output_data = []
for item in final_list:
output_data.append({
"mutation_id": item['mutation_id'],
"sample_id": item['sample_id'],
"ref_counts": item['ref_counts'],
"alt_counts": item['alt_counts'],
"normal_cn": item['ncn'],
"major_cn": int(item['mcn.em']),
"minor_cn": int(item['lcn.em'])
})
output_df = pd.DataFrame(output_data)
print("Writing OutPut Files...")
output_df.to_csv(output_filename, sep='\t', index=False)
if __name__ == "__main__":
main()
6.处理pyclone 结果获取Citup输入文件
find . -maxdepth 1 -name "*pyclonevi_results.tsv" -print0 | xargs -0 -I {} sh -c '
base=$(basename "{}" .pyclonevi_results.tsv);
echo "cut -f 6 {} | sed '1d' | paste - - - - > ${base}.freq.txt"
echo "cut -f 3 {} | sed '1d' | paste - - - - | cut -f 1 > ${base}.cluster.txt"
echo "cut -f 2 {} | sed '1d' | head -4 > ${base}.sampleid"
echo "echo ${base} ok"
' > PrepareCitupInput.sh
7.运行Citup
ls *.freq.txt | perl -ne 'chomp;if($_=~/(\S+)\.freq\.txt/){print "run_citup_qip.py $1.freq.txt $1.cluster.txt $1.citup.results.h5 && echo $1 ok\n"}' > RunCitup.sh
8.Timescape 可视化