TMB的定义
TMB的定义如下:TMB was defined as the number of somatic, coding, base substitution, and indel mutations per megabase of genome examined.
计算TMB的流程:
- 统计外显子区域长度(bed文件)
- 使用ANNOVAR注释vcf文件(基因名库:refGene,germline库:ExAC,genomad_exome,肿瘤Driver基因库:cosmic70)
- 纳入的突变类型:除了non-coding和unknown的
- 过滤germline突变:population frequency 需要< 0.05
- 过滤COSMIC driver mutation:population allele count 需要<50
- 过滤Tumor suppressor gene里面的stopgain突变
- 统计剩下的所有突变数并除以外显子长度从而获得TMB
可运行脚本
#!/usr/bin/env python
"""
This is an open source Script written by K.Z. to calculate TMB from ANNOVAR annotated files
This scipt filters variants annotated by exac03,genomad41_exome using the user-defined Population Frequency cutoff and filters COSMIC driver mutations according to the variant population allele count
"""
import argparse
import re
#import vcf # pip install open-cravat -i https://pypi.tuna.tsinghua.edu.cn/simple/
import csv
#from collections import defaultdict
import pandas as pd
from io import StringIO
def main():
# Create a Custom ArgumentParser
parser = argparse.ArgumentParser(description='Calculate TMB from ANNOVAR annotated files')
# Set Input Argument
parser.add_argument('-sampleid', dest='sampleid', required=True, help="Sample name: EXAMPLE, if in multisample mode, this could be set as tumor name")
parser.add_argument('-input', dest='inputfile', required=True, help="TXT output of ANNOVAR annotated VCF file: EXAMPLE.hg38_multianno.txt")
parser.add_argument('-bed', dest='bedfile', required=True, help="BED file used in exome sequencing or BED files targeting exon region that want to be used to calculate TMB")
parser.add_argument('-annovar_db', dest='annovar_database', required=True, help="Databases used in ANNOVAR annotation, seperate using ',' and the following can be included: cosmic70,exac03,genomad41_exome")
parser.add_argument('-tsg', dest='tsgfile', required=True, help="Tumor suppressor genes with stopgain to exlude in TMB calculation, can be downloaded from https://bioinfo.uth.edu/TSGene/Human_TSGs.txt")
parser.add_argument('-populationfreqcutoff', dest='Population_Freq_cutoff', required=False, default=0.05, help="Population Frequency cutoff to filter in exac03,genomad41_exome, default is 0.05(5%%)")
parser.add_argument('-cosmiccutoff', dest='COSMIC_cutoff', required=False, default=50, help="Cutoff for COSMIC driver gene count to filter in cosmic70, default is 50")
parser.add_argument('-multisample', dest='multi_sample', required=False, default="FALSE", help="Whether the input file is merged with multiple samples, if set to TRUE, the input file must contain a column named SampleID, default is FALSE")
parser.add_argument("-V", "--version", action="version", version="TMB calculation Version 1.1")
# Parse Arguments
args = parser.parse_args()
# Obtain input and output file names
sample_id = args.sampleid
input_filename = args.inputfile
bed_file_name = args.bedfile
annovar_db = args.annovar_database
tsg_filename = args.tsgfile
Population_Freq_cutoff = args.Population_Freq_cutoff
Population_Freq_cutoff = float(Population_Freq_cutoff)
COSMIC_cutoff = args.COSMIC_cutoff
COSMIC_cutoff = int(COSMIC_cutoff)
multi_sample = args.multi_sample
# Load all files first
with open(input_filename, 'r') as f:
lines = [line for line in f if not line.startswith('##')]
input_file = pd.read_csv(StringIO(''.join(lines)), sep='\t', header=0)
bed_file = pd.read_csv(bed_file_name, sep='\t', header=None,comment='#')
# Calculate Total Chromosome length in the bed file
bed_file = bed_file.rename(columns={0: "Chr", 1: "Start", 2: "End"})
Exon_length = (bed_file['End']-bed_file['Start']).sum()/1000000
# print("The Total length of exon regions is: ",Exon_length)
annovar_db_list = annovar_db.split(",")
tsg_file = pd.read_csv(tsg_filename, sep='\t', header=0,comment='#')
if multi_sample == "FALSE":
print("Running TMB Calculation in ",sample_id)
# filter input file to exclude non-exon and exon functions unknown
input_file = input_file[~input_file['ExonicFunc.refGene'].isin(['.', 'unknown'])]
# print("non-exon and gene-exon with unknown functions have been filtered")
# Now Filter vcf files according to the criterias
for annovar_db_name in annovar_db_list:
if annovar_db_name=="cosmic70":
# print("Processing cosmic70 annotation...")
# filter cosmic genes with score over 50
cosmic70_sum = []
for value in input_file['cosmic70']: # Replace `input_file['cosmic70']` with the correct reference
# Extract all the numbers after "=" using regular expressions
if value != ".":
value = value.split("OCCURENCE=")[1]
numbers = re.findall(r"\d+\.?\d*", value)
else:
numbers = 0
# Convert the extracted numbers to integers and calculate their sum
if numbers != 0:
sum_numbers = sum(int(num) for num in numbers)
else:
sum_numbers = 0 # In case no numbers are found
# Append the sum to the cosmic70_sum list
cosmic70_sum.append(sum_numbers)
# Add the new column to the DataFrame
input_file['cosmic70_sum'] = cosmic70_sum
input_file = input_file[input_file['cosmic70_sum'] < COSMIC_cutoff]
# print("There're ",len(input_file)," mutations left")
elif annovar_db_name=="exac03":
# print("Processing exac03 annotation...")
input_file['ExAC_ALL'] = input_file['ExAC_ALL'].replace('.', '0')
input_file['ExAC_ALL'] = input_file['ExAC_ALL'].astype(float)
input_file = input_file[input_file['ExAC_ALL'] < Population_Freq_cutoff]
# print("There're ",len(input_file)," mutations left")
elif annovar_db_name=="genomad41_exome":
# print("Processing genomad41_exome annotation...")
input_file['gnomAD_exome_ALL'] = input_file['gnomAD_exome_ALL'].replace('.', '0')
input_file['gnomAD_exome_ALL'] = input_file['gnomAD_exome_ALL'].astype(float)
input_file = input_file[input_file['gnomAD_exome_ALL'] < Population_Freq_cutoff]
# print("There're ",len(input_file)," mutations left")
# Filter TSGs with stopgain
tsg_gene_symbols = tsg_file["GeneSymbol"]
input_file = input_file[~((input_file["Gene.refGene"].isin(tsg_gene_symbols)) & (input_file["ExonicFunc.refGene"] == "stopgain"))]
# print("There're ",len(input_file)," mutations left")
# Calculate final mutation count
mut_count = len(input_file)
print("The total filtered mutation count in ", sample_id, " is: ", mut_count)
TMB_value = mut_count/Exon_length
output_filename = str(sample_id)+"_TMB.txt"
print("Writing OutPut Files...")
with open(output_filename, 'w', newline='', encoding='utf-8') as outputfile:
writer = csv.writer(outputfile, delimiter='\t') # Use tab as delimiter
writer.writerow(["SampleID",
"ExonLength",
"MutCount",
"TMB"])
writer.writerow([sample_id,
Exon_length,
mut_count,
TMB_value])
elif multi_sample == "TRUE":
TMB_allsamples = {}
for tmp_sampleid in input_file['SampleID'].unique():
tmp_inputfile = input_file[input_file['SampleID'] == tmp_sampleid]
# print("Running TMB Calculation in ",tmp_sampleid)
# filter input file to exclude non-exon and exon functions unknown
tmp_inputfile = tmp_inputfile[~tmp_inputfile['ExonicFunc.refGene'].isin(['.', 'unknown'])]
# print("non-exon and gene-exon with unknown functions have been filtered")
# Now Filter vcf files according to the criterias
for annovar_db_name in annovar_db_list:
if annovar_db_name=="cosmic70":
# print("Processing cosmic70 annotation...")
# filter cosmic genes with score over 50
cosmic70_sum = []
for value in tmp_inputfile['cosmic70']: # Replace `tmp_inputfile['cosmic70']` with the correct reference
# Extract all the numbers after "=" using regular expressions
if value != ".":
value = value.split("OCCURENCE=")[1]
numbers = re.findall(r"\d+\.?\d*", value)
else:
numbers = 0
# Convert the extracted numbers to integers and calculate their sum
if numbers != 0:
sum_numbers = sum(int(num) for num in numbers)
else:
sum_numbers = 0 # In case no numbers are found
# Append the sum to the cosmic70_sum list
cosmic70_sum.append(sum_numbers)
# Add the new column to the DataFrame
tmp_inputfile['cosmic70_sum'] = cosmic70_sum
tmp_inputfile = tmp_inputfile[tmp_inputfile['cosmic70_sum'] < COSMIC_cutoff]
# print("There're ",len(tmp_inputfile)," mutations left")
elif annovar_db_name=="exac03":
# print("Processing exac03 annotation...")
tmp_inputfile['ExAC_ALL'] = tmp_inputfile['ExAC_ALL'].replace('.', '0')
tmp_inputfile['ExAC_ALL'] = tmp_inputfile['ExAC_ALL'].astype(float)
tmp_inputfile = tmp_inputfile[tmp_inputfile['ExAC_ALL'] < Population_Freq_cutoff]
# print("There're ",len(tmp_inputfile)," mutations left")
elif annovar_db_name=="genomad41_exome":
# print("Processing genomad41_exome annotation...")
tmp_inputfile['gnomAD_exome_ALL'] = tmp_inputfile['gnomAD_exome_ALL'].replace('.', '0')
tmp_inputfile['gnomAD_exome_ALL'] = tmp_inputfile['gnomAD_exome_ALL'].astype(float)
tmp_inputfile = tmp_inputfile[tmp_inputfile['gnomAD_exome_ALL'] < Population_Freq_cutoff]
# print("There're ",len(tmp_inputfile)," mutations left")
# Filter TSGs with stopgain
# print("Removing TSGs with stopgain...")
tsg_gene_symbols = tsg_file["GeneSymbol"]
tmp_inputfile = tmp_inputfile[~((tmp_inputfile["Gene.refGene"].isin(tsg_gene_symbols)) & (tmp_inputfile["ExonicFunc.refGene"] == "stopgain"))]
# print("There're ",len(tmp_inputfile)," mutations left")
# Calculate final mutation count
mut_count = len(tmp_inputfile)
print("The total filtered mutation count in ", tmp_sampleid, " is: ", mut_count)
TMB_value = mut_count/Exon_length
#output_filename = str(tmp_sampleid)+"_TMB.txt"
#print("Writing OutPut Files of ",tmp_sampleid)
#with open(output_filename, 'w', newline='', encoding='utf-8') as outputfile:
# writer = csv.writer(outputfile, delimiter='\t') # Use tab as delimiter
# writer.writerow(["SampleID",
# "ExonLength",
# "MutCount",
# "TMB"])
# writer.writerow([tmp_sampleid,
# Exon_length,
# mut_count,
# TMB_value])
TMB_allsamples[tmp_sampleid] = {
"ExonLength": Exon_length,
"MutCount": mut_count,
"TMB": TMB_value}
output_filename = str(sample_id)+"Samples_TMB.txt"
print("Writing OutPut Files of All Samples")
with open(output_filename, 'w', newline='', encoding='utf-8') as outputfile:
writer = csv.writer(outputfile, delimiter='\t') # Use tab as delimiter
writer.writerow(["SampleID",
"ExonLength",
"MutCount",
"TMB"])
for sample_id, values in TMB_allsamples.items():
writer.writerow([sample_id, values['ExonLength'], values['MutCount'], values['TMB']])
if __name__ == "__main__":
main()
构建批量运行脚本
ls *.hg38_multianno.vcf | perl -ne 'chomp; my $name = $1 if ($_ =~ /([^\/]+)\.hg38\_multianno\.vcf/); print "/data02/zhangmengmeng/software/kztools/TMB/TMBCalculation -sampleid $name -input $name.hg38_multianno.txt -bed /data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.bed -annovar_db cosmic70,exac03 -tsg /data02/zhangmengmeng/software/kztools/TMB/Human_TSGs.txt -populationfreqcutoff 0.05 -cosmiccutoff 50 && echo $name TMB ok\n"'>TMBCalculation.sh
# 将结果文件输出整合到一起
awk 'FNR==1 && NR!=1 {next} {print}' *_TMB.txt > All_TMB.txt