{"id":411,"date":"2025-01-20T15:16:08","date_gmt":"2025-01-20T07:16:08","guid":{"rendered":"https:\/\/www.kz-hub.tech\/?p=411"},"modified":"2025-02-26T09:41:33","modified_gmt":"2025-02-26T01:41:33","slug":"tmb%e7%9a%84%e5%ae%9a%e4%b9%89%e4%b8%8e%e8%ae%a1%e7%ae%97","status":"publish","type":"post","link":"https:\/\/www.kz-hub.tech\/index.php\/2025\/01\/20\/tmb%e7%9a%84%e5%ae%9a%e4%b9%89%e4%b8%8e%e8%ae%a1%e7%ae%97\/","title":{"rendered":"TMB\u7684\u5b9a\u4e49\u4e0e\u8ba1\u7b97"},"content":{"rendered":"<p><a href=\"https:\/\/github.com\/fanyucai1\/TMB\">\u53c2\u8003\u94fe\u63a51<\/a><\/p>\n<h2>TMB\u7684\u5b9a\u4e49<\/h2>\n<p>TMB\u7684\u5b9a\u4e49\u5982\u4e0b\uff1aTMB was defined as the number of <strong><em>somatic<\/em><\/strong>, <strong><em>coding<\/em><\/strong>, <strong><em>base substitution<\/em><\/strong>, and <strong><em>indel mutations<\/em><\/strong> per megabase of genome examined.<\/p>\n<p>\u8ba1\u7b97TMB\u7684\u6d41\u7a0b\uff1a<\/p>\n<ol>\n<li>\u7edf\u8ba1\u5916\u663e\u5b50\u533a\u57df\u957f\u5ea6\uff08bed\u6587\u4ef6\uff09<\/li>\n<li>\u4f7f\u7528ANNOVAR\u6ce8\u91cavcf\u6587\u4ef6\uff08\u57fa\u56e0\u540d\u5e93:refGene\uff0cgermline\u5e93\uff1aExAC\uff0cgenomad_exome\uff0c\u80bf\u7624Driver\u57fa\u56e0\u5e93\uff1acosmic70\uff09<\/li>\n<li>\u7eb3\u5165\u7684\u7a81\u53d8\u7c7b\u578b\uff1a\u9664\u4e86non-coding\u548cunknown\u7684<\/li>\n<li>\u8fc7\u6ee4germline\u7a81\u53d8\uff1apopulation frequency \u9700\u8981&lt; 0.05<\/li>\n<li>\u8fc7\u6ee4COSMIC driver mutation\uff1apopulation allele count \u9700\u8981&lt;50<\/li>\n<li>\u8fc7\u6ee4Tumor suppressor gene\u91cc\u9762\u7684stopgain\u7a81\u53d8<\/li>\n<li>\u7edf\u8ba1\u5269\u4e0b\u7684\u6240\u6709\u7a81\u53d8\u6570\u5e76\u9664\u4ee5\u5916\u663e\u5b50\u957f\u5ea6\u4ece\u800c\u83b7\u5f97TMB<\/li>\n<\/ol>\n<h4>\u53ef\u8fd0\u884c\u811a\u672c<\/h4>\n<pre><code class=\"language-python\">#!\/usr\/bin\/env python\n&quot;&quot;&quot;\nThis is an open source Script written by K.Z. 
to calculate TMB from ANNOVAR annotated files\nThis script filters variants annotated by exac03,genomad41_exome using the user-defined Population Frequency cutoff and filters COSMIC driver mutations according to the variant population allele count\n&quot;&quot;&quot;\nimport argparse\nimport re\n#import vcf # pip install open-cravat -i https:\/\/pypi.tuna.tsinghua.edu.cn\/simple\/\nimport csv\n#from collections import defaultdict\nimport pandas as pd\nfrom io import StringIO\n\ndef main():\n    # Create a Custom ArgumentParser\n    parser = argparse.ArgumentParser(description=&#039;Calculate TMB from ANNOVAR annotated files&#039;)\n\n    # Set Input Argument\n    parser.add_argument(&#039;-sampleid&#039;, dest=&#039;sampleid&#039;, required=True, help=&quot;Sample name: EXAMPLE, if in multisample mode, this could be set as tumor name&quot;)\n    parser.add_argument(&#039;-input&#039;, dest=&#039;inputfile&#039;, required=True, help=&quot;TXT output of ANNOVAR annotated VCF file: EXAMPLE.hg38_multianno.txt&quot;)\n    parser.add_argument(&#039;-bed&#039;, dest=&#039;bedfile&#039;, required=True, help=&quot;BED file used in exome sequencing or BED files targeting exon region that want to be used to calculate TMB&quot;)\n    parser.add_argument(&#039;-annovar_db&#039;, dest=&#039;annovar_database&#039;, required=True, help=&quot;Databases used in ANNOVAR annotation, separate using &#039;,&#039; and the following can be included: cosmic70,exac03,genomad41_exome&quot;)\n    parser.add_argument(&#039;-tsg&#039;, dest=&#039;tsgfile&#039;, required=True, help=&quot;Tumor suppressor genes with stopgain to exclude in TMB calculation, can be downloaded from https:\/\/bioinfo.uth.edu\/TSGene\/Human_TSGs.txt&quot;)\n    parser.add_argument(&#039;-populationfreqcutoff&#039;, dest=&#039;Population_Freq_cutoff&#039;, required=False, default=0.05, help=&quot;Population Frequency cutoff to filter in exac03,genomad41_exome, default is 0.05(5%%)&quot;)\n    
parser.add_argument(&#039;-cosmiccutoff&#039;, dest=&#039;COSMIC_cutoff&#039;, required=False, default=50, help=&quot;Cutoff for COSMIC driver gene count to filter in cosmic70, default is 50&quot;)\n    parser.add_argument(&#039;-multisample&#039;, dest=&#039;multi_sample&#039;, required=False, default=&quot;FALSE&quot;, help=&quot;Whether the input file is merged with multiple samples, if set to TRUE, the input file must contain a column named SampleID, default is FALSE&quot;)\n    parser.add_argument(&quot;-V&quot;, &quot;--version&quot;, action=&quot;version&quot;, version=&quot;TMB calculation Version 1.1&quot;)\n\n    # Parse Arguments\n    args = parser.parse_args()\n\n    # Obtain input and output file names\n    sample_id = args.sampleid\n    input_filename = args.inputfile\n    bed_file_name = args.bedfile\n    annovar_db = args.annovar_database\n    tsg_filename = args.tsgfile\n    Population_Freq_cutoff = args.Population_Freq_cutoff\n    Population_Freq_cutoff = float(Population_Freq_cutoff)\n    COSMIC_cutoff = args.COSMIC_cutoff\n    COSMIC_cutoff = int(COSMIC_cutoff)\n    multi_sample = args.multi_sample\n\n    # Load all files first\n    with open(input_filename, &#039;r&#039;) as f:\n        lines = [line for line in f if not line.startswith(&#039;##&#039;)]\n    input_file = pd.read_csv(StringIO(&#039;&#039;.join(lines)), sep=&#039;\\t&#039;, header=0)\n    bed_file = pd.read_csv(bed_file_name, sep=&#039;\\t&#039;, header=None,comment=&#039;#&#039;)\n    # Calculate Total Chromosome length in the bed file\n    bed_file = bed_file.rename(columns={0: &quot;Chr&quot;, 1: &quot;Start&quot;, 2: &quot;End&quot;})\n    Exon_length = (bed_file[&#039;End&#039;]-bed_file[&#039;Start&#039;]).sum()\/1000000\n    # print(&quot;The Total length of exon regions is: &quot;,Exon_length)\n    annovar_db_list = annovar_db.split(&quot;,&quot;)\n    tsg_file = pd.read_csv(tsg_filename, sep=&#039;\\t&#039;, header=0,comment=&#039;#&#039;)\n\n    if multi_sample == 
&quot;FALSE&quot;:\n        print(&quot;Running TMB Calculation in &quot;,sample_id)\n        # filter input file to exclude non-exon and exon functions unknown\n        input_file = input_file[~input_file[&#039;ExonicFunc.refGene&#039;].isin([&#039;.&#039;, &#039;unknown&#039;])]\n        # print(&quot;non-exon and gene-exon with unknown functions have been filtered&quot;)\n\n        # Now Filter vcf files according to the criterias\n        for annovar_db_name in annovar_db_list:\n            if annovar_db_name==&quot;cosmic70&quot;:\n                # print(&quot;Processing cosmic70 annotation...&quot;)\n                # filter cosmic genes with score over 50\n                cosmic70_sum = []\n                for value in input_file[&#039;cosmic70&#039;]:  # Replace `input_file[&#039;cosmic70&#039;]` with the correct reference\n                    # Extract all the numbers after &quot;=&quot; using regular expressions\n                    if value !=  &quot;.&quot;:\n                        value = value.split(&quot;OCCURENCE=&quot;)[1]\n                        numbers = re.findall(r&quot;\\d+\\.?\\d*&quot;, value)\n                    else:\n                        numbers = 0\n\n                    # Convert the extracted numbers to integers and calculate their sum\n                    if numbers != 0:\n                        sum_numbers = sum(int(num) for num in numbers)\n                    else:\n                        sum_numbers = 0  # In case no numbers are found\n\n                    # Append the sum to the cosmic70_sum list\n                    cosmic70_sum.append(sum_numbers)\n\n                # Add the new column to the DataFrame\n                input_file[&#039;cosmic70_sum&#039;] = cosmic70_sum\n                input_file = input_file[input_file[&#039;cosmic70_sum&#039;] &lt; COSMIC_cutoff]\n                # print(&quot;There&#039;re &quot;,len(input_file),&quot; mutations left&quot;)\n            elif annovar_db_name==&quot;exac03&quot;:\n 
               # print(&quot;Processing exac03 annotation...&quot;)\n                input_file[&#039;ExAC_ALL&#039;] = input_file[&#039;ExAC_ALL&#039;].replace(&#039;.&#039;, &#039;0&#039;)\n                input_file[&#039;ExAC_ALL&#039;] = input_file[&#039;ExAC_ALL&#039;].astype(float)\n                input_file = input_file[input_file[&#039;ExAC_ALL&#039;] &lt; Population_Freq_cutoff]\n                # print(&quot;There&#039;re &quot;,len(input_file),&quot; mutations left&quot;)\n            elif annovar_db_name==&quot;genomad41_exome&quot;:\n                # print(&quot;Processing genomad41_exome annotation...&quot;)\n                input_file[&#039;gnomAD_exome_ALL&#039;] = input_file[&#039;gnomAD_exome_ALL&#039;].replace(&#039;.&#039;, &#039;0&#039;)\n                input_file[&#039;gnomAD_exome_ALL&#039;] = input_file[&#039;gnomAD_exome_ALL&#039;].astype(float)\n                input_file = input_file[input_file[&#039;gnomAD_exome_ALL&#039;] &lt; Population_Freq_cutoff]\n                # print(&quot;There&#039;re &quot;,len(input_file),&quot; mutations left&quot;)\n\n        # Filter TSGs with stopgain\n        tsg_gene_symbols = tsg_file[&quot;GeneSymbol&quot;]\n\n        input_file = input_file[~((input_file[&quot;Gene.refGene&quot;].isin(tsg_gene_symbols)) &amp; (input_file[&quot;ExonicFunc.refGene&quot;] == &quot;stopgain&quot;))]\n        # print(&quot;There&#039;re &quot;,len(input_file),&quot; mutations left&quot;)\n\n        # Calculate final mutation count\n        mut_count = len(input_file)\n        print(&quot;The total filtered mutation count in &quot;, sample_id, &quot; is: &quot;, mut_count)\n        TMB_value = mut_count\/Exon_length\n\n        output_filename = str(sample_id)+&quot;_TMB.txt&quot;\n        print(&quot;Writing OutPut Files...&quot;)\n        with open(output_filename, &#039;w&#039;, newline=&#039;&#039;, encoding=&#039;utf-8&#039;) as outputfile:\n            writer = csv.writer(outputfile, delimiter=&#039;\\t&#039;)  # 
Use tab as delimiter\n            writer.writerow([&quot;SampleID&quot;,\n                            &quot;ExonLength&quot;,\n                            &quot;MutCount&quot;,\n                            &quot;TMB&quot;])\n            writer.writerow([sample_id,\n                            Exon_length,\n                            mut_count,\n                            TMB_value])\n\n    elif multi_sample == &quot;TRUE&quot;:\n        TMB_allsamples = {}\n        for tmp_sampleid in input_file[&#039;SampleID&#039;].unique():\n            tmp_inputfile = input_file[input_file[&#039;SampleID&#039;] == tmp_sampleid]\n            # print(&quot;Running TMB Calculation in &quot;,tmp_sampleid)\n            # filter input file to exclude non-exon and exon functions unknown\n            tmp_inputfile = tmp_inputfile[~tmp_inputfile[&#039;ExonicFunc.refGene&#039;].isin([&#039;.&#039;, &#039;unknown&#039;])]\n            # print(&quot;non-exon and gene-exon with unknown functions have been filtered&quot;)\n\n            # Now Filter vcf files according to the criterias\n            for annovar_db_name in annovar_db_list:\n                if annovar_db_name==&quot;cosmic70&quot;:\n                    # print(&quot;Processing cosmic70 annotation...&quot;)\n                    # filter cosmic genes with score over 50\n                    cosmic70_sum = []\n                    for value in tmp_inputfile[&#039;cosmic70&#039;]:  # Replace `tmp_inputfile[&#039;cosmic70&#039;]` with the correct reference\n                        # Extract all the numbers after &quot;=&quot; using regular expressions\n                        if value !=  &quot;.&quot;:\n                            value = value.split(&quot;OCCURENCE=&quot;)[1]\n                            numbers = re.findall(r&quot;\\d+\\.?\\d*&quot;, value)\n                        else:\n                            numbers = 0\n\n                        # Convert the extracted numbers to integers and calculate their sum\n        
                if numbers != 0:\n                            sum_numbers = sum(int(num) for num in numbers)\n                        else:\n                            sum_numbers = 0  # In case no numbers are found\n\n                        # Append the sum to the cosmic70_sum list\n                        cosmic70_sum.append(sum_numbers)\n\n                    # Add the new column to the DataFrame\n                    tmp_inputfile[&#039;cosmic70_sum&#039;] = cosmic70_sum\n                    tmp_inputfile = tmp_inputfile[tmp_inputfile[&#039;cosmic70_sum&#039;] &lt; COSMIC_cutoff]\n                    # print(&quot;There&#039;re &quot;,len(tmp_inputfile),&quot; mutations left&quot;)\n\n                elif annovar_db_name==&quot;exac03&quot;:\n                    # print(&quot;Processing exac03 annotation...&quot;)\n                    tmp_inputfile[&#039;ExAC_ALL&#039;] = tmp_inputfile[&#039;ExAC_ALL&#039;].replace(&#039;.&#039;, &#039;0&#039;)\n                    tmp_inputfile[&#039;ExAC_ALL&#039;] = tmp_inputfile[&#039;ExAC_ALL&#039;].astype(float)\n                    tmp_inputfile = tmp_inputfile[tmp_inputfile[&#039;ExAC_ALL&#039;] &lt; Population_Freq_cutoff]\n                    # print(&quot;There&#039;re &quot;,len(tmp_inputfile),&quot; mutations left&quot;)\n\n                elif annovar_db_name==&quot;genomad41_exome&quot;:\n                    # print(&quot;Processing genomad41_exome annotation...&quot;)\n                    tmp_inputfile[&#039;gnomAD_exome_ALL&#039;] = tmp_inputfile[&#039;gnomAD_exome_ALL&#039;].replace(&#039;.&#039;, &#039;0&#039;)\n                    tmp_inputfile[&#039;gnomAD_exome_ALL&#039;] = tmp_inputfile[&#039;gnomAD_exome_ALL&#039;].astype(float)\n                    tmp_inputfile = tmp_inputfile[tmp_inputfile[&#039;gnomAD_exome_ALL&#039;] &lt; Population_Freq_cutoff]\n                    # print(&quot;There&#039;re &quot;,len(tmp_inputfile),&quot; mutations left&quot;)\n\n            # Filter TSGs with stopgain\n        
    # print(&quot;Removing  TSGs with stopgain...&quot;)\n            tsg_gene_symbols = tsg_file[&quot;GeneSymbol&quot;]\n            tmp_inputfile = tmp_inputfile[~((tmp_inputfile[&quot;Gene.refGene&quot;].isin(tsg_gene_symbols)) &amp; (tmp_inputfile[&quot;ExonicFunc.refGene&quot;] == &quot;stopgain&quot;))]\n            # print(&quot;There&#039;re &quot;,len(tmp_inputfile),&quot; mutations left&quot;)\n\n            # Calculate final mutation count\n            mut_count = len(tmp_inputfile)\n            print(&quot;The total filtered mutation count in &quot;, tmp_sampleid, &quot; is: &quot;, mut_count)\n            TMB_value = mut_count\/Exon_length\n            #output_filename = str(tmp_sampleid)+&quot;_TMB.txt&quot;\n            #print(&quot;Writing OutPut Files of &quot;,tmp_sampleid)\n            #with open(output_filename, &#039;w&#039;, newline=&#039;&#039;, encoding=&#039;utf-8&#039;) as outputfile:\n            #    writer = csv.writer(outputfile, delimiter=&#039;\\t&#039;)  # Use tab as delimiter\n            #    writer.writerow([&quot;SampleID&quot;,\n            #                    &quot;ExonLength&quot;,\n            #                    &quot;MutCount&quot;,\n            #                    &quot;TMB&quot;])\n            #    writer.writerow([tmp_sampleid,\n            #                    Exon_length,\n            #                    mut_count,\n            #                    TMB_value])\n            TMB_allsamples[tmp_sampleid] = {\n                &quot;ExonLength&quot;: Exon_length,\n                &quot;MutCount&quot;: mut_count,\n                &quot;TMB&quot;: TMB_value}\n\n        output_filename = str(sample_id)+&quot;Samples_TMB.txt&quot;\n        print(&quot;Writing OutPut Files of All Samples&quot;)\n        with open(output_filename, &#039;w&#039;, newline=&#039;&#039;, encoding=&#039;utf-8&#039;) as outputfile:\n            writer = csv.writer(outputfile, delimiter=&#039;\\t&#039;)  # Use tab as delimiter\n            
writer.writerow([&quot;SampleID&quot;,\n                            &quot;ExonLength&quot;,\n                            &quot;MutCount&quot;,\n                            &quot;TMB&quot;])\n            for sample_id, values in TMB_allsamples.items():\n                writer.writerow([sample_id, values[&#039;ExonLength&#039;], values[&#039;MutCount&#039;], values[&#039;TMB&#039;]])\n\nif __name__ == &quot;__main__&quot;:\n    main()<\/code><\/pre>\n<h3>\u6784\u5efa\u6279\u91cf\u8fd0\u884c\u811a\u672c<\/h3>\n<pre><code class=\"language-bash\">ls *.hg38_multianno.vcf | perl -ne &#039;chomp; my $name = $1 if ($_ =~ \/([^\\\/]+)\\.hg38\\_multianno\\.vcf\/); print &quot;\/data02\/zhangmengmeng\/software\/kztools\/TMB\/TMBCalculation -sampleid $name -input $name.hg38_multianno.txt -bed \/data02\/zhangmengmeng\/database\/gatk_resource_bundle\/hg38\/AgilentV6_GRCh38_ex_region.sort.bed  -annovar_db cosmic70,exac03 -tsg \/data02\/zhangmengmeng\/software\/kztools\/TMB\/Human_TSGs.txt -populationfreqcutoff 0.05 -cosmiccutoff 50 &amp;&amp; echo $name TMB ok\\n&quot;&#039;&gt;TMBCalculation.sh\n\n# \u5c06\u7ed3\u679c\u6587\u4ef6\u8f93\u51fa\u6574\u5408\u5230\u4e00\u8d77\nawk &#039;FNR==1 &amp;&amp; NR!=1 {next} {print}&#039; *_TMB.txt &gt; All_TMB.txt\n<\/code><\/pre>\n","protected":false},"excerpt":{"rendered":"<p>\u53c2\u8003\u94fe\u63a51 TMB\u7684\u5b9a\u4e49 TMB\u7684\u5b9a\u4e49\u5982\u4e0b\uff1aTMB was defined as the number of 
&#8230;<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"closed","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[],"class_list":["post-411","post","type-post","status-publish","format-standard","hentry","category-uncategorized"],"_links":{"self":[{"href":"https:\/\/www.kz-hub.tech\/index.php\/wp-json\/wp\/v2\/posts\/411","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.kz-hub.tech\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.kz-hub.tech\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.kz-hub.tech\/index.php\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.kz-hub.tech\/index.php\/wp-json\/wp\/v2\/comments?post=411"}],"version-history":[{"count":9,"href":"https:\/\/www.kz-hub.tech\/index.php\/wp-json\/wp\/v2\/posts\/411\/revisions"}],"predecessor-version":[{"id":471,"href":"https:\/\/www.kz-hub.tech\/index.php\/wp-json\/wp\/v2\/posts\/411\/revisions\/471"}],"wp:attachment":[{"href":"https:\/\/www.kz-hub.tech\/index.php\/wp-json\/wp\/v2\/media?parent=411"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.kz-hub.tech\/index.php\/wp-json\/wp\/v2\/categories?post=411"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.kz-hub.tech\/index.php\/wp-json\/wp\/v2\/tags?post=411"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}