仅能统计SNP,无法统计INDEL,此软件会把INDEL打碎成单个碱基进行统计
1. 首先对bqsr处理好的bam文件进行去除重复
# 注意加上 “--REMOVE_DUPLICATE true” 后,输出的bam文件就是已经剔除了pcr duplicate的
ls ../rawdata/*_R1.fastq.gz|perl -ne 'chomp;my $name=$1 if($_=~/\/([^\/]+)\_R1/);print "gatk --java-options \"-Xmx20G -Djava.io.tmpdir=./\" MarkDuplicates --INPUT ../align/$name\_bqsr.bam --OUTPUT $name\_bqsr.removedup.bam -M $name.metrics --VALIDATION_STRINGENCY LENIENT --REMOVE_DUPLICATE true && samtools index $name\_bqsr.removedup.bam && echo $name ok\n"' > remove_bam_dup.sh
2. 在R中对同一病人的不同样本的突变maf文件进行合并,并提取为bam-readcount的输入格式
# 示例:
## X. 提取配对上皮、基质样本中的PASS位点 ####
# 读取的是过滤后,所有病人的所有样本的所有突变的maf文件
Merged_maf_df <- read.table("/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/Mutation/6patients_mutect2_varscan2_intersect_pass.Somatic.hc.hg38_multianno_filtered.maf",
header = T,sep = '\t',quote = "")
# 构建区分样本的信息
Merged_maf_df$PatientID <- Merged_maf_df$Tumor_Sample_Barcode
Merged_maf_df <- Merged_maf_df %>%
separate(PatientID, into = c("PatientID", "TumorType", "TissueType"), sep = "-")
unique_combinations <- unique(Merged_maf_df[, c("PatientID", "TissueType")])
# Loop through each unique combination
for (i in 1:nrow(unique_combinations)) {
patient_id <- unique_combinations$PatientID[i]
tissue_type <- unique_combinations$TissueType[i]
# Filter rows based on PatientID and TissueType
filtered_df <- Merged_maf_df[Merged_maf_df$PatientID == patient_id & Merged_maf_df$TissueType == tissue_type, ]
filtered_df <- filtered_df[,c("Chromosome", "Start_Position", "End_Position")]
# Create a dynamic name for the new dataframe
df_name <- paste(patient_id, tissue_type, sep = "_")
df_name <- paste(df_name, "df", sep = "_")
# Assign the filtered dataframe to a new variable
assign(df_name, filtered_df)
write.table(filtered_df,paste0("/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/unique_mut_sites/",patient_id,"-",tissue_type,"_site_list.txt"),
col.names = F,row.names = F,sep = '\t',quote = F)
}
最后输出的范例:
# FETB01-E_site_list.txt (tab分隔):
chr1 10008116 10008116
chr1 52495456 52495456
chr2 54649645 54649645
chr3 138084166 138084166
chr5 50667572 50667572
chr6 1611682 1611682
chr7 4221252 4221252
chr7 94420457 94420457
chr8 3387676 3387676
chr8 115515228 115515228
chr11 434096 434096
chr11 119313110 119313110
chr12 242757 242757
chr12 51806999 51806999
chr13 99866404 99866404
chr14 19760391 19760391
chr16 1314177 1314177
chr16 2183864 2183864
chr16 29861014 29861014
chr19 35835836 35835836
chr19 38119394 38119394
chr19 47353134 47353134
3. 运行bam-readcount
github:https://github.com/genome/bam-readcount
参考:https://www.jianshu.com/p/0697dbfe316c
# 使用说明:
Usage: bam-readcount [OPTIONS] bam_file|cram_file [region]
Generate metrics for bam_file at single nucleotide positions.
Example: bam-readcount -f ref.fa some.bam|some.cram
Available options:
-h [ --help ] produce this message
-v [ --version ] output the version number
-q [ --min-mapping-quality ] arg (=0) minimum mapping quality of reads used
for counting.
-b [ --min-base-quality ] arg (=0) minimum base quality at a position to
use the read for counting.
-d [ --max-count ] arg (=10000000) max depth to avoid excessive memory
usage.
-l [ --site-list ] arg file containing a list of regions to
report readcounts within.
-f [ --reference-fasta ] arg reference sequence in the fasta format.
-D [ --print-individual-mapq ] arg report the mapping qualities as a comma
separated list.
-p [ --per-library ] report results by library.
-w [ --max-warnings ] arg maximum number of warnings of each type
to emit. -1 gives an unlimited number.
-i [ --insertion-centric ] generate indel centric readcounts.
Reads containing insertions will not be
included in per-base counts
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB01-M_site_list.txt ../align/FETB01-BNPT-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB01-BNPT-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB01-M_site_list.txt ../align/FETB01-FA-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB01-FA-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB01-E_site_list.txt ../align/FETB01-BNPT-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB01-BNPT-E.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB01-E_site_list.txt ../align/FETB01-FA-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB01-FA-E.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB02-M_site_list.txt ../align/FETB02-BNPT-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB02-BNPT-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB02-M_site_list.txt ../align/FETB02-FA-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB02-FA-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB02-E_site_list.txt ../align/FETB02-BNPT-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB02-BNPT-E.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB02-E_site_list.txt ../align/FETB02-FA-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB02-FA-E.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB03-M_site_list.txt ../align/FETB03-BNPT-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB03-BNPT-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB03-M_site_list.txt ../align/FETB03-FA-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB03-FA-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB03-E_site_list.txt ../align/FETB03-BNPT-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB03-BNPT-E.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB03-E_site_list.txt ../align/FETB03-FA-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB03-FA-E.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB04-M_site_list.txt ../align/FETB04-BNPT-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB04-BNPT-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB04-M_site_list.txt ../align/FETB04-FA-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB04-FA-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB04-E_site_list.txt ../align/FETB04-BNPT-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB04-BNPT-E.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB04-E_site_list.txt ../align/FETB04-FA-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB04-FA-E.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB05-M_site_list.txt ../align/FETB05-BNPT-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB05-BNPT-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB05-M_site_list.txt ../align/FETB05-FA-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB05-FA-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB05-E_site_list.txt ../align/FETB05-BNPT-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB05-BNPT-E.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB05-E_site_list.txt ../align/FETB05-FA-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB05-FA-E.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB06-M_site_list.txt ../align/FETB06-BNPT-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB06-BNPT-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB06-M_site_list.txt ../align/FETB06-FA-M_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB06-FA-M.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB06-E_site_list.txt ../align/FETB06-BNPT-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB06-BNPT-E.matched.sites.txt
bam-readcount -q 20 -b 10 -f /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa -l FETB06-E_site_list.txt ../align/FETB06-FA-E_bqsr.bam | grep -v WARNING |awk 'BEGIN{FS=OFS="\t"}{split($6,A,":");split($7,C,":");split($8,G,":");split($9,T,":");print $1,$2,$3,$4,"A:"A[2],"C:"C[2],"G:"G[2],"T:"T[2]}' > FETB06-FA-E.matched.sites.txt