0. 官网:
CONIPHER: https://github.com/McGranahanLab/CONIPHER
ALPACA: https://github.com/McGranahanLab/ALPACA-model
ASCAT: https://github.com/VanLoo-lab/ascat
refphase: https://bitbucket.org/schwarzlab/refphase/src/master/
1. Dependency软件安装
conipher:
conda create -n conipher -c conda-forge -c bioconda conipher
alpaca: 需gurobi证书,本地安装
git clone https://github.com/McGranahanLab/ALPACA-model.git
cd ALPACA-model
conda env create --name alpaca --file environment.yml
conda run -n alpaca pip install dist/*.whl
ASCAT: 运行alpaca需要refphase,refphase需要ASCAT
# R
BiocManager::install(c('GenomicRanges','IRanges')
devtools::install_bitbucket("schwarzlab/ascat_v3_fork/ASCAT") # 需要安装他们修改过的ASCAT V3才行
refphase:
BiocManager::install("GenomicRanges")
devtools::install_bitbucket("schwarzlab/refphase")
2. CONIPHER
2.1 准备输入文件
对多灶样本运行Mutect2,varscan2后,筛选PASS,使用Annovar、vcf2maf注释,之后分别合并样本:
# mutect2
cat *.hg38_multianno.maf | awk 'NR<=3 || (!/^#/ && $1!="Hugo_Symbol")' > 6patients_mutect2_pass.Somatic.hc.hg38_multianno.maf
# varscan2
cat *.hc.hg38_multianno.maf | awk 'NR<=3 || (!/^#/ && $1!="Hugo_Symbol")' > 6patients_varscan2_pass.Somatic.hc.hg38_multianno.maf
在R中筛选过滤,只留下两个软件共同鉴定到的、gnomAD EAS (east asian population) AF<= 0.05、1000GenomesProject EAS <= 0.05, Tumor and normal total coverage >=20 & <=1000,Tumor ALT read count >= 3,Normal ALT read count <= 1,Tumor VAF >=0.05,Normal VAF <=0.05 的突变
mutect2_maf_df <- read.table("/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/Mutation/mutect2_TNpaired/6patients_mutect2_pass.Somatic.hc.hg38_multianno.maf", header = T, sep = '\t', quote = "")
varscan2_maf_df <- read.table("/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/Mutation/varscan2/6patients_varscan2_pass.Somatic.hc.hg38_multianno.maf", header = T, sep = '\t', quote = "")
table(mutect2_maf_df$Tumor_Sample_Barcode)
table(varscan2_maf_df$Tumor_Sample_Barcode)
filter_maf_df <- function(maf_df){
# reformat gnomAD EAS (east asian population) AF and only keep AF <= 0.05
maf_df$gnomADe_EAS_AF[is.na(maf_df$gnomADe_EAS_AF)] <- 0
maf_df <- subset(maf_df,gnomADe_EAS_AF <= 0.05)
# reformat 1000GenomesProject EAS (east asian population) AF and only keep AF <= 0.05
maf_df$EAS_AF[is.na(maf_df$EAS_AF)] <- 0
maf_df <- subset(maf_df,EAS_AF <= 0.05)
# subset Tumor and normal total coverage >=20 & <=1000
maf_df <- subset(maf_df,t_depth >= 20 & t_depth <= 1000)
maf_df <- subset(maf_df,n_depth >= 20 & n_depth <= 1000)
# subset Tumor ALT read count >= 3
maf_df <- subset(maf_df,t_alt_count >= 3)
# subset Normal ALT read count <= 1
maf_df <- subset(maf_df,n_alt_count <= 1)
# subset Tumor VAF >=0.05
maf_df$tVAF <- maf_df$t_alt_count/maf_df$t_depth
maf_df <- subset(maf_df,tVAF>=0.05)
# subset Normal VAF <=0.05
maf_df$nVAF <- maf_df$n_alt_count/maf_df$n_depth
maf_df <- subset(maf_df,nVAF<=0.05)
maf_df$mut_id <- paste0(maf_df$Tumor_Sample_Barcode,":",
maf_df$Chromosome,":",
maf_df$Start_Position,":",
maf_df$End_Position,":",
maf_df$Reference_Allele,":",
maf_df$Tumor_Seq_Allele2)
return(maf_df)
}
mutect2_filtered <- filter_maf_df(mutect2_maf_df)
varscan2_filtered <- filter_maf_df(varscan2_maf_df)
mutect2_varscan2_filtered_intersect <- subset(mutect2_filtered,mut_id %in% varscan2_filtered$mut_id)
table(mutect2_varscan2_filtered_intersect$Tumor_Sample_Barcode)
write.table(mutect2_filtered,"/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/Mutation/mutect2_TNpaired/6patients_mutect2_pass.Somatic.hc.hg38_multianno_filtered.maf",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(varscan2_filtered,"/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/Mutation/varscan2/6patients_varscan2_pass.Somatic.hc.hg38_multianno_filtered.maf",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(mutect2_varscan2_filtered_intersect,"/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/Mutation/6patients_mutect2_varscan2_intersect_pass.Somatic.hc.hg38_multianno_filtered.maf",col.names = T,row.names = F,sep = '\t',quote = F)
直接删除-BNPT和-FA,获取单独以-M和-E区分的Tumor_Sample_Barcode,留下如FETB01-M这样的编号,这样可以提取配对上皮、基质样本中的PASS位点
Merged_maf_df$Tumor_Sample_Barcode <- gsub("-BNPT","",Merged_maf_df$Tumor_Sample_Barcode)
Merged_maf_df$Tumor_Sample_Barcode <- gsub("-FA","",Merged_maf_df$Tumor_Sample_Barcode)
table(Merged_maf_df$Tumor_Sample_Barcode)
write.table(Merged_maf_df,"/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/Mutation/6patients_mutect2_varscan2_intersect_pass_Tumor_Sample_Barcode_merged.Somatic.hc.hg38_multianno_filtered.maf",col.names = T,row.names = F,sep = '\t',quote = F)
使用maf2vcf把提取完的位点转换为vcf文件,其中--per-tn-vcfs会根据Tumor_Sample_Barcode,vcf拆分为每个配对样本一个(如FETB01-M的突变位点会合在一起成一个vcf),生成的vcf文件的名字会是“FETB01-M_vs_FETB01-B.vcf”的格式
maf2vcf.pl --input-maf 6patients_mutect2_varscan2_intersect_pass_Tumor_Sample_Barcode_merged.Somatic.hc.hg38_multianno_filtered.maf --output-dir /groups/phyllodes/home/share/Results/WES/maf2vcf --output-vcf 6patients_mutect2_varscan2_intersect_pass_Tumor_Sample_Barcode_merged --ref-fasta /home/zhoukaiwen/database/GRCh38/genecode_GRCh38.p14.genome.fa --per-tn-vcfs
为call Somatic Mutation 中的 sample_pair添加一列(最右边),按组织类型区分(FETB01-M)
FETB01-BNPT-E FETB01-B FETB01 FETB01-E
FETB01-BNPT-M FETB01-B FETB01 FETB01-M
FETB01-FA-E FETB01-B FETB01 FETB01-E
FETB01-FA-M FETB01-B FETB01 FETB01-M
FETB02-BNPT-E FETB02-B FETB02 FETB02-E
FETB02-BNPT-M FETB02-B FETB02 FETB02-M
FETB02-FA-E FETB02-B FETB02 FETB02-E
FETB02-FA-M FETB02-B FETB02 FETB02-M
FETB03-BNPT-E FETB03-B FETB03 FETB03-E
FETB03-BNPT-M FETB03-B FETB03 FETB03-M
FETB03-FA-E FETB03-B FETB03 FETB03-E
FETB03-FA-M FETB03-B FETB03 FETB03-M
FETB04-BNPT-E FETB04-B FETB04 FETB04-E
FETB04-BNPT-M FETB04-B FETB04 FETB04-M
FETB04-FA-E FETB04-B FETB04 FETB04-E
FETB04-FA-M FETB04-B FETB04 FETB04-M
FETB05-BNPT-E FETB05-B FETB05 FETB05-E
FETB05-BNPT-M FETB05-B FETB05 FETB05-M
FETB05-FA-E FETB05-B FETB05 FETB05-E
FETB05-FA-M FETB05-B FETB05 FETB05-M
FETB06-BNPT-E FETB06-B FETB06 FETB06-E
FETB06-BNPT-M FETB06-B FETB06 FETB06-M
FETB06-FA-E FETB06-B FETB06 FETB06-E
FETB06-FA-M FETB06-B FETB06 FETB06-M
使用mutect2 force call 模式,以上一步生成的vcf文件为target region,call Somatic mutation,这一步call完突变后一样用Annovar注释、并通过vcf2maf再次转换为maf文件,
# Call 突变
perl -ne 'chomp; next if /^$/; @a = split /\t/; print "bcftools sort $a[3]\_vs\_$a[1].vcf -o $a[3]\_vs\_$a[1].sorted.vcf && gatk IndexFeatureFile -I $a[3]\_vs\_$a[1].sorted.vcf && gatk Mutect2 -R ~/database/GRCh38/genecode_GRCh38.p14.genome.fa -I ../align/$a[0]_bqsr.bam -I ../align/$a[1]_bqsr.bam -tumor $a[0] -normal $a[1] --alleles $a[3]\_vs\_$a[1].sorted.vcf -L $a[3]\_vs\_$a[1].sorted.vcf --germline-resource ~/database/gatk_bundle/hg38/somatic-hg38_af-only-gnomad.hg38.vcf.gz -pon ~/database/gatk_bundle/hg38/somatic-hg38_1000g_pon.hg38.vcf.gz --f1r2-tar-gz $a[0].shared.sites.f1r2.tar.gz -O $a[0].shared.sites.raw.vcf && echo $a[0] Mutect2 force call ok\n"' sample_pair.txt > Mutect2_force_call_short.sh
# Annovar、vcf2maf省略
# 合并maf文件
cat *.shared.sites.maf | awk 'NR<=3 || (!/^#/ && $1!="Hugo_Symbol")' > 6patients.shared.sites.maf
至此,制作好了经过质量过滤、样本配对的突变的maf文件
接下来,制作CONIPHER输入文件
# R
mutation_maf <- read.table("6patients.shared.sites.maf",header = T, sep = '\t',quote = "")
mutation_maf <- as_tibble(mutation_maf) %>%
separate(Tumor_Sample_Barcode, into = c("SampleID", "middle", "Tissue"), sep = "-", fill = "right", remove = FALSE) %>%
mutate(Tissue = coalesce(Tissue, middle)) %>%
dplyr::select(-middle)
Filtered_mutationTable <- mutation_maf[,c("Tumor_Sample_Barcode","SampleID","Tissue","Chromosome","Start_Position","Reference_Allele","Tumor_Seq_Allele2","t_ref_count","t_alt_count","t_depth")]
colnames(Filtered_mutationTable) <- c("SAMPLE","CASE_ID","Tissue","CHR","POS","REF","ALT","REF_COUNT","VAR_COUNT","DEPTH")
Filtered_mutationTable$CHR <- gsub("chr","",Filtered_mutationTable$CHR)
# 此处需要运行sequenza之后合并结果,详见sequenza流程文件
merged_pp <- read.table("/data02/zhangmengmeng/BPT/LCM-WES/sequenza/merged_solutions.txt",header = T,sep = '\t')
merged_seg <- read.table("/data02/zhangmengmeng/BPT/LCM-WES/sequenza/merged_segments.txt",header = T,sep = '\t')
merged_pp_seg <- merge(merged_seg, merged_pp, by = "sample_id", all.x = TRUE)
merged_pp_seg <- merged_pp_seg[c("sample_id","chromosome","start.pos","end.pos","A","B","cellularity","ploidy")]
merged_pp_seg$chromosome <- gsub("chr","",merged_pp_seg$chromosome)
matched <- Filtered_mutationTable %>%
inner_join(merged_pp_seg, by = c("SAMPLE" = "sample_id")) %>%
filter(CHR == chromosome & POS >= start.pos & POS <= end.pos)
matched <- matched[,-(11:13)]
colnames(matched)[which(colnames(matched)=="A")] <- "COPY_NUMBER_A"
colnames(matched)[which(colnames(matched)=="B")] <- "COPY_NUMBER_B"
colnames(matched)[which(colnames(matched)=="cellularity")] <- "ACF"
colnames(matched)[which(colnames(matched)=="ploidy")] <- "PLOIDY"
# Then, do an anti_join to get rows from mutationTable that did NOT match
unmatched <- anti_join(Filtered_mutationTable, matched, by = colnames(Filtered_mutationTable)[1:10])
unmatched$COPY_NUMBER_A <- 1
unmatched$COPY_NUMBER_B <- 1
unmatched$ACF <- "UnKnown"
unmatched$PLOIDY <- "UnKnown"
# Merge Final input fo CONIPHER #
table(colnames(matched) %in% colnames(unmatched))
Conipher_Input <- rbind(matched,unmatched)
Conipher_Input <- Conipher_Input[order(Conipher_Input$SAMPLE),]
Conipher_Input <- Conipher_Input %>%
group_by(SAMPLE) %>%
mutate(
ACF = if_else(ACF == "UnKnown", ACF[ACF != "UnKnown"][1], ACF),
PLOIDY = if_else(PLOIDY == "UnKnown", PLOIDY[PLOIDY != "UnKnown"][1], PLOIDY)
) %>%
ungroup()
Conipher_Input <- Conipher_Input[, c(2, 1, 3:ncol(Conipher_Input))]
Conipher_Input <- subset(Conipher_Input,!(SAMPLE %in% c("FETB01-BPT-M-Unstained","FETB01-BPT-E-Unstained")))
Conipher_Input$SAMPLE <- gsub("-","_",Conipher_Input$SAMPLE)
Conipher_Input$CHR <- factor(Conipher_Input$CHR,levels=c(1:22,"X"))
Conipher_Input <- Conipher_Input[order(Conipher_Input$SAMPLE,Conipher_Input$CHR),]
Conipher_Input <- unique(Conipher_Input)
Conipher_Input_FETB01_Epi <- subset(Conipher_Input,CASE_ID == "FETB01" & Tissue =="E")
Conipher_Input_FETB01_Stroma <- subset(Conipher_Input,CASE_ID == "FETB01" & Tissue =="M")
Conipher_Input_FETB02_Epi <- subset(Conipher_Input,CASE_ID == "FETB02" & Tissue =="E")
Conipher_Input_FETB02_Stroma <- subset(Conipher_Input,CASE_ID == "FETB02" & Tissue =="M")
Conipher_Input_FETB03_Epi <- subset(Conipher_Input,CASE_ID == "FETB03" & Tissue =="E")
Conipher_Input_FETB03_Stroma <- subset(Conipher_Input,CASE_ID == "FETB03" & Tissue =="M")
Conipher_Input_FETB04_Epi <- subset(Conipher_Input,CASE_ID == "FETB04" & Tissue =="E")
Conipher_Input_FETB04_Stroma <- subset(Conipher_Input,CASE_ID == "FETB04" & Tissue =="M")
Conipher_Input_FETB05_Epi <- subset(Conipher_Input,CASE_ID == "FETB05" & Tissue =="E")
Conipher_Input_FETB05_Stroma <- subset(Conipher_Input,CASE_ID == "FETB05" & Tissue =="M")
Conipher_Input_FETB06_Epi <- subset(Conipher_Input,CASE_ID == "FETB06" & Tissue =="E")
Conipher_Input_FETB06_Stroma <- subset(Conipher_Input,CASE_ID == "FETB06" & Tissue =="M")
write.table(Conipher_Input_FETB01_Epi,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB01_Epi.txt",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(Conipher_Input_FETB01_Stroma,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB01_Stroma.txt",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(Conipher_Input_FETB02_Epi,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB02_Epi.txt",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(Conipher_Input_FETB02_Stroma,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB02_Stroma.txt",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(Conipher_Input_FETB03_Epi,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB03_Epi.txt",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(Conipher_Input_FETB03_Stroma,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB03_Stroma.txt",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(Conipher_Input_FETB04_Epi,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB04_Epi.txt",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(Conipher_Input_FETB04_Stroma,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB04_Stroma.txt",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(Conipher_Input_FETB05_Epi,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB05_Epi.txt",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(Conipher_Input_FETB05_Stroma,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB05_Stroma.txt",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(Conipher_Input_FETB06_Epi,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB06_Epi.txt",col.names = T,row.names = F,sep = '\t',quote = F)
write.table(Conipher_Input_FETB06_Stroma,"/data02/zhangmengmeng/BPT/LCM-WES/conipher/Conipher_Input_FETB06_Stroma.txt",col.names = T,row.names = F,sep = '\t',quote = F)
2.2 正式运行
# 模板如下
library(CONIPHER)
library(tidyverse)
# cluster+tree-building
conipher_run(case_id = "FETB01",
prefix = "FETB01",
out_dir = "FETB01",
input_tsv_loc = "Conipher_Input_FETB01.txt")
3. 运行 ASCAT
# EXAMPLE_ASCAT.R
library(ASCAT)
options(bitmapType='cairo')
Sys.setenv(LD_LIBRARY_PATH = paste("/data02/zhangmengmeng/software/alleleCount-4.2.1/lib", Sys.getenv("LD_LIBRARY_PATH"), sep=":"))
Sys.getenv("LD_LIBRARY_PATH")
dyn.load("/data02/zhangmengmeng/software/alleleCount-4.2.1/lib/libhts.so.3")
ascat.prepareHTS(
tumourseqfile = "/data02/zhangmengmeng/BPT/LCM-WES/align/TUMOR_bqsr.bam",
normalseqfile = "/data02/zhangmengmeng/BPT/LCM-WES/align/NORMAL_bqsr.bam",
tumourname = "TUMOR",
normalname = "NORMAL",
allelecounter_exe = "/data02/zhangmengmeng/software/alleleCount-4.2.1/bin/alleleCounter",
alleles.prefix = "/data02/zhangmengmeng/database/ASCAT_ref/G1000_allelesAll_hg38/G1000_alleles_hg38_chr",
loci.prefix = "/data02/zhangmengmeng/database/ASCAT_ref/G1000_lociAll_hg38/G1000_loci_hg38_chr",
BED_file = "/data02/zhangmengmeng/database/gatk_resource_bundle/hg38/AgilentV6_GRCh38_ex_region.sort.bed",
gender = "XX",
genomeVersion = "hg38",
nthreads = 8,
tumourLogR_file = "TUMOR_LogR.txt",
tumourBAF_file = "TUMOR_BAF.txt",
normalLogR_file = "NORMAL_LogR.txt",
normalBAF_file = "NORMAL_BAF.txt")
ascat.bc = ascat.loadData(Tumor_LogR_file = "TUMOR_LogR.txt", Tumor_BAF_file = "TUMOR_BAF.txt", Germline_LogR_file = "NORMAL_LogR.txt", Germline_BAF_file = "NORMAL_BAF.txt", gender = "XX", genomeVersion = "hg38")
ascat.plotRawData(ascat.bc, img.prefix = "TUMOR_Before_correction_")
ascat.bc = ascat.correctLogR(ascat.bc, GCcontentfile = "/data02/zhangmengmeng/database/ASCAT_ref/GC_G1000_hg38.txt", replictimingfile = "/data02/zhangmengmeng/database/ASCAT_ref/RT_G1000_hg38.txt")
ascat.plotRawData(ascat.bc, img.prefix = "TUMOR_After_correction_")
ascat.bc = ascat.aspcf(ascat.bc)
ascat.plotSegmentedData(ascat.bc)
ascat.output = ascat.runAscat(ascat.bc, gamma=1, write_segments = TRUE)
# Function to count SNPs within segment boundaries
count_snps_in_segment <- function(chr, start, end, snp_df) {
sum(snp_df$Chromosome == chr & snp_df$Position >= start & snp_df$Position <= end)
}
# Apply function to each row in segments dataframe
ascat.output$segments$nProbes <- mapply(
count_snps_in_segment,
ascat.output$segments$chr,
ascat.output$segments$startpos,
ascat.output$segments$endpos,
MoreArgs = list(snp_df = ascat.bc$SNPpo)
)
final_segment <- ascat.output$segments
colnames(final_segment) <- c("SampleID","Chr","Start","End","nMajor","nMinor","nProbes")
write.table(final_segment,"TUMOR_final_segment.txt",col.names = T,row.names = F,sep = '\t',quote = F)
QC = ascat.metrics(ascat.bc,ascat.output)
save(ascat.bc, ascat.output, QC, file = 'TUMOR_ASCAT_objects.Rdata')
# Create_Run_ASCAT.sh
#!/bin/bash
# Path to the example R script
EXAMPLE_SCRIPT="EXAMPLE_ASCAT.R"
OUTPUT_SCRIPT="run_ascat.sh"
# Start the script file
echo "#!/bin/bash" > "$OUTPUT_SCRIPT"
# Debugging: Check if sample pairs are correctly read
echo "Processing sample pairs..."
# Ensure strict tab-separated parsing
while IFS=$'\t' read -r TUMOR NORMAL PATIENT_ID; do
# Trim spaces in TUMOR and NORMAL
TUMOR=$(echo "$TUMOR" | xargs)
NORMAL=$(echo "$NORMAL" | xargs)
PATIENT_ID=$(echo "$PATIENT_ID" | xargs)
# Debug: Show extracted values
echo "TUMOR: [$TUMOR], NORMAL: [$NORMAL], PATIENT_ID: [$PATIENT_ID]"
# Skip invalid lines
if [[ -z "$TUMOR" || -z "$NORMAL" ]]; then
echo "Skipping invalid line: [$TUMOR] [$NORMAL]"
continue
fi
# Define output R script name
R_SCRIPT="ASCAT_${TUMOR}.R"
# Swap TUMOR and NORMAL in the script and create a new file
awk -v t="$TUMOR" -v n="$NORMAL" '{gsub("TUMOR", t); gsub("NORMAL", n)} 1' "$EXAMPLE_SCRIPT" > "$R_SCRIPT"
# Append the command to run the script (but don't execute)
echo "/opt/R/4.3.0/lib64/R/bin/Rscript $R_SCRIPT" >> "$OUTPUT_SCRIPT"
done < sample_pair.txt
# Make the script executable
chmod +x "$OUTPUT_SCRIPT"
echo "Commands saved in run_ascat.sh. You can run it manually with: bash run_ascat.sh"
4. 运行refphase
# R
library(ASCAT)
library(refphase)
options(bitmapType='cairo')
patient <- "FETB01-M"
normal_sample <- "FETB01-B"
tumor_samples <- c("FETB01-BNPT-M", "FETB01-FA-M")
# Do some extra work to create a table that will be needed when we run refphase in the next step. This is not needed by ASCAT itself.
refphase_sample_data <- data.frame(sample_id = tumor_samples,
segmentation = paste0(tumor_samples, "_refphase_segs.tsv"),
snps = paste0(tumor_samples, "_refphase_snps.tsv"),
purity = NA, ploidy = NA, row.names = tumor_samples)
# ascat_input and ascat_output are required for refphase later on
ascat_input <- list()
ascat_output <- list()
for (tumor_sample in tumor_samples) {
cur_ascat_input <- ascat.loadData(Tumor_LogR_file = paste0("/data02/zhangmengmeng/BPT/LCM-WES/ASCAT/",tumor_sample, "_LogR.txt"),
Tumor_BAF_file = paste0("/data02/zhangmengmeng/BPT/LCM-WES/ASCAT/",tumor_sample, "_BAF.txt"),
Germline_LogR_file = paste0("/data02/zhangmengmeng/BPT/LCM-WES/ASCAT/",tumor_sample, "_LogR.txt"),
Germline_BAF_file = paste0("/data02/zhangmengmeng/BPT/LCM-WES/ASCAT/",tumor_sample, "_BAF.txt"),
gender = "XX", genomeVersion = "hg38")
ascat.plotRawData(cur_ascat_input,img.prefix =paste0(tumor_sample,"_Before_correction_"))
cur_ascat_input = ascat.correctLogR(cur_ascat_input,
GCcontentfile = "/data02/zhangmengmeng/database/ASCAT_ref/GC_G1000_hg38.txt",
replictimingfile = "/data02/zhangmengmeng/database/ASCAT_ref/RT_G1000_hg38.txt")
ascat.plotRawData(cur_ascat_input,img.prefix =paste0(tumor_sample,"_After_correction_"))
cur_ascat_input <- ascat.aspcf(cur_ascat_input)
ascat.plotSegmentedData(cur_ascat_input)
cur_ascat_output <- ascat.runAscat(cur_ascat_input, gamma = 1.0)
ascat_input[[tumor_sample]] <- cur_ascat_input
ascat_output[[tumor_sample]] <- cur_ascat_output
# Save the segmentation in the default ASCAT format
write.table(cur_ascat_output$segments, file = paste0(tumor_sample, "_segments.txt"), sep = "\t", quote = FALSE, row.names = FALSE)
refphase_sample_data[tumor_sample, "purity"] <- cur_ascat_output$aberrantcellfraction[[1]]
refphase_sample_data[tumor_sample, "ploidy"] <- cur_ascat_output$ploidy[[1]]
}
write.table(refphase_sample_data, file = paste0(patient,"_refphase-sample-data.tsv"), sep = "\t", quote = FALSE, row.names = FALSE)
# Load the ascat input and output
refphase_input <- refphase_load(
data_format = "ascat3", samples = tumor_samples,
ascat_input = ascat_input, ascat_output = ascat_output
)
# (Optional) If your data shows reference bias, which presents itself as BAFs
# in regions with a balanced copy number that are systematically shifted away
# from 0.5 (usually something like 0.47), this can try to correct for that.
refphase_input <- center_baf(refphase_input)
# (Optional) Fit SNP logr data to improve copy number re-estimation in refphase,
# when using the default ASCAT formula-based method fo re-estimating copy
# numbers
refphase_input <- fit_logr_to_ascat(refphase_input)
# Run refphase on the experimental data
results <- refphase(refphase_input)
save(results, file = paste0(patient, "-refphase-results.RData"))
write_segs(results$phased_segs, file = paste0(patient, "-refphase-segmentation.tsv"))
# (optional) output the SNPs, including phasing information
write_snps(results$phased_snps, file = paste0(patient, "-refphase-phased-snps.tsv.gz"))
# Ploidy might have changed, if we have updated any copy numbers
write.table(results$sample_data, file = paste0(patient, "-refphase-sample-data-updated.tsv"), sep = "\t", row.names = FALSE)
# Plot the results as PDFs (these files can be huge, over 100 MB in some cases)
pdf(paste0(patient, "-refphase-genome.pdf"), width = 10, height = 4 * nrow(results$sample_data), family = "sans")
plot(results, what = "genome")
dev.off()
pdf(paste0(patient, "-refphase-chromosomes.pdf"), width = 10, height = 2 + 4 * nrow(results$sample_data), family = "sans")
plot_all_chromosomes(results$sample_data, results$phased_snps, list(refphase = results$phased_segs, orig = refphase_input$segs), cn_event_calls = results$cn_event_calls)
dev.off()
# Plot the results as individual PNG files
png(paste0(patient, "-refphase-genome.png"), width = 600, height = 200 + 300 * nrow(results$sample_data), family = "sans")
plot(results, what = "genome")
dev.off()
png(paste0(patient, "-refphase-chromosomes-%02d.png"), width = 700, height = 300 * nrow(results$sample_data), family = "sans")
plot_all_chromosomes(results$sample_data, results$phased_snps, list(refphase = results$phased_segs, orig = refphase_input$segs), cn_event_calls = results$cn_event_calls)
dev.off()
5. 运行alpaca
由于conipher和rephase中,样本名字格式不同(CONIPHER不允许“-”,只能用“_”,因此需要在R中进行修改)
# R 参考模板
load("/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/refphase/FETB04-M-refphase-results.RData")
tree_rds <- readRDS("/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/CONIPHER/FETB04_S/Trees/FETB04-M.tree.RDS")
rename_conipher_output <- function(rds){
rownames(rds$ccf_table_pyclone) <- sub("^([^:]+):", "\\1-M:", rownames(rds$ccf_table_pyclone))
colnames(rds$ccf_table_pyclone) <- sub("_", "-", colnames(rds$ccf_table_pyclone))
colnames(rds$ccf_table_pyclone) <- sub("_", "-", colnames(rds$ccf_table_pyclone))
rownames(rds$ccf_table_absolute) <- sub("^([^:]+):", "\\1-M:", rownames(rds$ccf_table_absolute))
colnames(rds$ccf_table_absolute) <- sub("_", "-", colnames(rds$ccf_table_absolute))
colnames(rds$ccf_table_absolute) <- sub("_", "-", colnames(rds$ccf_table_absolute))
rownames(rds$ccf_table_pyclone_clean) <- sub("^([^:]+):", "\\1-M:", rownames(rds$ccf_table_pyclone_clean))
colnames(rds$ccf_table_pyclone_clean) <- sub("_", "-", colnames(rds$ccf_table_pyclone_clean))
colnames(rds$ccf_table_pyclone_clean) <- sub("_", "-", colnames(rds$ccf_table_pyclone_clean))
rownames(rds$ccf_table_absolute_clean) <- sub("^([^:]+):", "\\1-M:", rownames(rds$ccf_table_absolute_clean))
colnames(rds$ccf_table_absolute_clean) <- sub("_", "-", colnames(rds$ccf_table_absolute_clean))
colnames(rds$ccf_table_absolute_clean) <- sub("_", "-", colnames(rds$ccf_table_absolute_clean))
colnames(rds$clonality_table) <- gsub("_", "-", colnames(rds$clonality_table))
rds$graph_pyclone$sampleID <- paste0(rds$graph_pyclone$sampleID,"-M")
rds$parameters$sampleID <- paste0(rds$parameters$sampleID,"-M")
rds$parameters$prefix <- paste0(rds$parameters$prefix,"-M")
colnames(rds$clonality_out$clonality_table_corrected) <- gsub("_", "-", colnames(rds$clonality_out$clonality_table_corrected))
colnames(rds$clonality_out$clonality_table_original) <- gsub("_", "-", colnames(rds$clonality_out$clonality_table_original))
colnames(rds$clone_proportion_out$clone_proportion_table) <- gsub("_", "-", colnames(rds$clone_proportion_out$clone_proportion_table))
rds$subclonal_expansion_score_out$subclonal_exp_score$sample <- gsub("_", "-", rds$subclonal_expansion_score_out$subclonal_exp_score$sample)
rownames(rds$subclonal_expansion_score_out$subclonal_exp_score) <- rds$subclonal_expansion_score_out$subclonal_exp_score$sample
rds$subclonal_expansion_score_out$subclonal_exp_score_min_sce_trees$`1`$sample <- gsub("_", "-", rds$subclonal_expansion_score_out$subclonal_exp_score_min_sce_trees$`1`$sample)
rownames(rds$subclonal_expansion_score_out$subclonal_exp_score_min_sce_trees$`1`) <- rds$subclonal_expansion_score_out$subclonal_exp_score_min_sce_trees$`1`$sample
return(rds)
}
tree_rds_renamed <- rename_conipher_output(tree_rds)
saveRDS(tree_rds_renamed,"/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/ALPACA/FETB04-M.renamed.tree.RDS")
运行ALPACA
tumour_id="FETB04-M"
refphase_rData="/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/refphase/${tumour_id}-refphase-results.RData"
CONIPHER_tree_object="/Users/zhoukaiwen/Desktop/Breast_Phyllodes_Tumor/Bioinfo/WES/ALPACA/${tumour_id}.renamed.tree.RDS"
prep_input_dir="./${tumour_id}/input"
run_input_dir="${tumour_id}/input"
run_output_dir="${tumour_id}/output"
alpaca input-conversion \
--tumour_id $tumour_id \
--refphase_rData $refphase_rData \
--CONIPHER_tree_object $CONIPHER_tree_object \
--CONIPHER_tree_index 1 \
--output_dir $prep_input_dir
# 重新调整样本命名,R输出时自动会把列名中的-修改为_,这里改回来
sed -i '' 's/_/-/g' "$(pwd)/${run_input_dir}/cp_table.csv"
# 会有不重复的CN segments,这里只挑选重复的CNV区段
mv "$(pwd)/${run_input_dir}/ALPACA_input_table.csv" "$(pwd)/${run_input_dir}/ALPACA_input_table.csv.backup"
awk -F, 'NR==1{print; next} {count[$3]++; lines[NR]=$0; seg[NR]=$3} END{for(i=2;i<=NR;i++) if(count[seg[i]]>1) print lines[i]}' "$(pwd)/${run_input_dir}/ALPACA_input_table.csv.backup" > "$(pwd)/${run_input_dir}/ALPACA_input_table.csv"
# 软件bug:alpaca run设置了run_input_dir的情况下,会默认输入路径为${tumour_id}/${tumour_id},故运行前创建软链接
ln -s "$(pwd)/${tumour_id}/input" "${tumour_id}/${tumour_id}"
alpaca run \
--input_tumour_directory "${run_input_dir}" \
--output_directory "${run_output_dir}"