Skip to content

KevinZ的小窝

Menu
  • Home
  • Categories
Menu

从零开始的BISCUT+推断体系CNA的fitness(适应度)

Posted on 2025年 1月 8日2025年 1月 17日 by KevinZhou
  • Github网址

安装

conda create -n biscut
conda activate biscut

# BISCUT 必须要用特定版本的R和Python
conda install conda-forge::r-base=4.1.2
conda install conda-forge::python=3.9.7

# 安装所需python包
pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install multiprocessing -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install numpy -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install operator -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install itertools -i https://pypi.tuna.tsinghua.edu.cn/simple/
pip install fnmatch -i https://pypi.tuna.tsinghua.edu.cn/simple/

# 安装所需R包
install.packages(c("ismev","extRemes","fitdistrplus","truncdist","segmented","dplyr","reticulate","foreach","doParallel","pastecs","ismev","ggplot2","fitdistrplus","gridExtra","stringr","gtable","cowplot","BiocManager"))
# 使用西湖大学镜像加速
options(BioC_mirror="https://mirrors.westlake.edu.cn/bioconductor")
BiocManager::install("GenomicRanges")

先运行BISCUT_preprocessing.py

# python内设置部分如下
# hg19和hg38都可以用SNP6_hg19_chromosome_locs_200605.txt作为位置文件来定义端粒和着丝粒的位置
# 注意seg文件需要放在python脚本下一个叫doc的文件夹内,其文件名格式为tumor_type+seg_file_suffix,如以下文件名为LiverOld_LIHC_merge_old_552.seg,注意tumor_type不能有下划线"_",否则后续会报错
amplitude_threshold = 0.2
chromosome_coordinates = '~/software/BISCUT-py3-main/docs/SNP6_hg19_chromosome_locs_200605.txt'
tumor_type = 'LiverOld'
seg_file_suffix = '_LIHC_merge_old_552.seg'
n_proc = 8 
date_suffix = '2025_01_09'

构建hg38版本的基因名-cytoband对应文件

# 下载ucsc的hg38版本的cytoband文件:
# https://hgdownload.soe.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz
# 下载ucsc的hg38版本的gtf文件:
# http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/genes/hg38.refGene.gtf.gz

通过以下R脚本进行合并:

library(dplyr)
library(tidyr,lib.loc = "/usr/lib/R/site-library")
library(stringr,lib.loc = "/usr/lib/R/site-library")
library(IRanges)

UCSC_cytoband_file <- "~/database/GRCh38/ucsc/UCSC_cytoBand.txt"
UCSC_refgene_gtf <- "~/database/GRCh38/ucsc/UCSC_hg38.refGene.gtf"

UCSC_cytoband <- read.table(UCSC_cytoband_file, header = F, sep = '\t',quote = "")
UCSC_refgene <- read.table(UCSC_refgene_gtf, header = F, sep = '\t',quote = "",comment.char = "#")

# process refgene gtf
UCSC_refgene_split <- UCSC_refgene %>%
  separate(V9, into = paste0("V9_", 1:5), sep = ";", fill = "right", extra = "drop") 

UCSC_refgene_split <- subset(UCSC_refgene_split,V3=="transcript")[,c(1,4,5,9,10)]
colnames(UCSC_refgene_split) <- c("Chr","Start","End","Gene","RefSeqName")

UCSC_refgene_split <- UCSC_refgene_split %>%
  mutate(Gene = gsub('.*"([^"]+)".*', '\\1', Gene))

UCSC_refgene_split <- UCSC_refgene_split %>%
  mutate(RefSeqName = gsub('.*"([^"]+)".*', '\\1', RefSeqName))

UCSC_refgene_split <- subset(UCSC_refgene_split,Chr %in% c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"))

# process cytoband
UCSC_cytoband <- subset(UCSC_cytoband,V1 %in% c("chr1","chr2","chr3","chr4","chr5","chr6","chr7","chr8","chr9","chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22"))
UCSC_cytoband <- UCSC_cytoband[,1:4]
colnames(UCSC_cytoband) <- c("Chr","Start","End","Cytoband")

# Add the Cytoband annotation to UCSC_refgene_split
UCSC_refgene_split$Cytoband <- NA

# Loop through each chromosome to match positions
for (chr in unique(UCSC_refgene_split$Chr)) {
  # Filter rows for the current chromosome
  refgene_chr <- UCSC_refgene_split %>% filter(Chr == chr)
  cytoband_chr <- UCSC_cytoband %>% filter(Chr == chr)

  # Create IRanges objects for the current chromosome
  refgene_ranges <- IRanges(start = refgene_chr$Start, end = refgene_chr$End)
  cytoband_ranges <- IRanges(start = cytoband_chr$Start, end = cytoband_chr$End)

  # Find overlaps
  overlaps <- findOverlaps(refgene_ranges, cytoband_ranges)

  # Map Cytoband values back to UCSC_refgene_split
  UCSC_refgene_split$Cytoband[UCSC_refgene_split$Chr == chr][queryHits(overlaps)] <- 
    cytoband_chr$Cytoband[subjectHits(overlaps)]
}

UCSC_refgene_split <- UCSC_refgene_split[,c("Chr","Start","End","Cytoband","Gene","RefSeqName")]
write.table(UCSC_refgene_split,"/home/zhoukaiwen/software/BISCUT-py3-main/docs/UCSC_hg38_geneloc.txt",col.names = T,row.names = F,sep = '\t',quote = F)
  • 此后按需要可能要删除文件中Chr列的“chr”前缀,并提取unique的基因转录本

设定默认shell

在home目录首先构建tmp文件夹
mkdir /home/zhoukaiwen/tmp
在tmp文件夹中构建软链接,否则后续reticulate包可能会报错,因默认的/usr/bin/sh默认指向dash,与bash的语言格式不同
ln -s /bin/bash /home/zhoukaiwen/tmp/sh

修改R脚本

# 修改最后的files = list.files(path = paste(resultsfolder,'/stats',sep=''))为以下,否则会读取result文件夹下的summary文件夹名从而报错
files = list.files(path = paste(resultsfolder,'/stats',sep=''),pattern = "\\.rds$")

# 在BISCUT_peaks_finding.R 最后的source_python前添加以下代码来绕开R包reticulate中的Sys.which("sh")链接到“/usr/bin/sh”
Sys.setenv(PATH = paste("/home/zhoukaiwen/tmp", Sys.getenv("PATH"), sep = ":"))
Sys.which("sh")

BISCUT结果文件

BISCUT 结果文件有:

  1. 在设定的results文件夹下的all_BISCUT_results.txt,如果有多个肿瘤类型,这里是整合了所有类型的
  2. 在results文件夹下有以设定的肿瘤类型命名的文件夹,如上面设置的LiverOld,这个文件夹下又有一个summary文件夹,里面的LiverOld_BISCUT_results.txt和LiverOld_BISCUT_results_cols_0.95.txt为这个肿瘤类型单独的结果文件

通过BISCUT的结果文件计算peak和gene-level的Relative fitness

gc()
rm(list=ls())

# Settings ####
n_cores <- 8 # numbers of CPU cores for parallelization over chromosome arms
set.seed(123456789) #random seed

# Settings for Liver Old —— Self background
tumor_type <- "LiverOld"
biscut_peak_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95/LiverOld/summary/LiverOld_BISCUT_results_cols_0.95.txt"
biscut_gene_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95/LiverOld/summary/LiverOld_BISCUT_results.txt"
breakpoint_file_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/breakpoint_files_2025_01_09/LiverOld"
abslocs_file <- '~/software/BISCUT-py3-main/docs/SNP6_hg19_chromosome_locs_200605.txt'
output_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95"

# Settings for Liver Young —— Self background
tumor_type <- "LiverYoung"
biscut_peak_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95/LiverYoung/summary/LiverYoung_BISCUT_results_cols_0.95.txt"
biscut_gene_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95/LiverYoung/summary/LiverYoung_BISCUT_results.txt"
breakpoint_file_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/breakpoint_files_2025_01_09/LiverYoung"
abslocs_file <- '~/software/BISCUT-py3-main/docs/SNP6_hg19_chromosome_locs_200605.txt'
output_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_09_0.95"

# Settings for Liver Old —— PANCAN background
tumor_type <- "LiverOld"
biscut_peak_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95/LiverOld/summary/LiverOld_BISCUT_results_cols_0.95.txt"
biscut_gene_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95/LiverOld/summary/LiverOld_BISCUT_results.txt"
breakpoint_file_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/breakpoint_files_2025_01_14/LiverOld"
abslocs_file <- '~/software/BISCUT-py3-main/docs/SNP6_hg19_chromosome_locs_200605.txt'
output_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95"

# Settings for Liver Young —— PANCAN background
tumor_type <- "LiverYoung"
biscut_peak_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95/LiverYoung/summary/LiverYoung_BISCUT_results_cols_0.95.txt"
biscut_gene_file <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95/LiverYoung/summary/LiverYoung_BISCUT_results.txt"
breakpoint_file_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/breakpoint_files_2025_01_14/LiverYoung"
abslocs_file <- '~/software/BISCUT-py3-main/docs/SNP6_hg19_chromosome_locs_200605.txt'
output_path <- "/home/zhoukaiwen/algorithm_learn/output/biscut/results_2025_01_14_0.95"

# Step0: Load Library ####
library(ismev)
library(extRemes)
library(fitdistrplus)
library(truncdist)
library(segmented)
library(dplyr)
library(reticulate)
library(foreach)
library(doParallel)
library(tidyr,lib.loc = "/usr/lib/R/site-library")

registerDoParallel(n_cores)
getDoParWorkers()

# Step0: Load telomere/centromere location files ####
message("Step0: Loading telomere/centromere location file...")
abslocs <- read.table(abslocs_file,sep='\t',header=T)
message("Step0: Load Done!")

# Step1: Process BISCUT identified peak file and obtain peak regions ####
message("Step1: Loading BISCUT identified peak files...")
biscut_peak <- read.csv(biscut_peak_file,sep='\t',header=T)
biscut_gene <- read.csv(biscut_gene_file,sep='\t',header=T)

message("Step1: Extracting BISCUT identified peaks...")
biscut_peak <- as.data.frame(t(biscut_peak))
colnames(biscut_peak) <- biscut_peak[1,]
biscut_peak <- biscut_peak[-1,]
biscut_peak_loc <- biscut_peak$peak_location
biscut_peak_loc <- data.frame(biscut_peak_loc) %>%
  separate(biscut_peak_loc, into = c("Chr", "Positions"), sep = ":") %>%
  separate(Positions, into = c("Start", "End"), sep = "-")
biscut_peak_loc$Chr <- gsub("chr","",biscut_peak_loc$Chr)
biscut_peak_loc$Chr <- as.integer(biscut_peak_loc$Chr)
biscut_peak_loc$direction <- biscut_peak$direction
biscut_peak_loc$telcent <- biscut_peak$`telomeric or centromeric`

biscut_peak_loc_abslocs <- biscut_peak_loc %>%
  left_join(abslocs, by = c("Chr" = "chromosome_info"))
biscut_peak_loc_abslocs$Chr <- as.character(biscut_peak_loc_abslocs$Chr)

biscut_peak_loc_abslocs$ArmType <- "NA"
biscut_peak_loc_abslocs$Start <- as.integer(biscut_peak_loc_abslocs$Start)
biscut_peak_loc_abslocs$End <- as.integer(biscut_peak_loc_abslocs$End)

biscut_peak_loc_abslocs$ArmType[which(biscut_peak_loc_abslocs$Start > biscut_peak_loc_abslocs$p_start)] <- "p"
biscut_peak_loc_abslocs$ArmType[which(biscut_peak_loc_abslocs$Start > biscut_peak_loc_abslocs$q_start)] <- "q"

biscut_peak_loc_abslocs$ArmLength <- "NA"
biscut_peak_loc_abslocs$ArmLength[which(biscut_peak_loc_abslocs$ArmType=="p")] <- biscut_peak_loc_abslocs$p_end[which(biscut_peak_loc_abslocs$ArmType=="p")]-biscut_peak_loc_abslocs$p_start[which(biscut_peak_loc_abslocs$ArmType=="p")]
biscut_peak_loc_abslocs$ArmLength[which(biscut_peak_loc_abslocs$ArmType=="q")] <- biscut_peak_loc_abslocs$q_end[which(biscut_peak_loc_abslocs$ArmType=="q")]-biscut_peak_loc_abslocs$q_start[which(biscut_peak_loc_abslocs$ArmType=="q")]
biscut_peak_loc_abslocs$ArmLength <- as.integer(biscut_peak_loc_abslocs$ArmLength)

biscut_peak_loc_abslocs$ArmStart<- "NA"
biscut_peak_loc_abslocs$ArmStart[which(biscut_peak_loc_abslocs$ArmType=="p")] <- biscut_peak_loc_abslocs$p_start[which(biscut_peak_loc_abslocs$ArmType=="p")]
biscut_peak_loc_abslocs$ArmStart[which(biscut_peak_loc_abslocs$ArmType=="q")] <- biscut_peak_loc_abslocs$q_start[which(biscut_peak_loc_abslocs$ArmType=="q")]
biscut_peak_loc_abslocs$ArmStart <- as.integer(biscut_peak_loc_abslocs$ArmStart)

biscut_peak_loc_abslocs$Start_percent <- "NA"
biscut_peak_loc_abslocs$Start_percent <- (biscut_peak_loc_abslocs$Start-biscut_peak_loc_abslocs$ArmStart)/biscut_peak_loc_abslocs$ArmLength
biscut_peak_loc_abslocs$Start_percent[which(biscut_peak_loc_abslocs$Start_percent<0)] <- 0
biscut_peak_loc_abslocs$Start_percent[which(biscut_peak_loc_abslocs$Start_percent>1)] <- 1

biscut_peak_loc_abslocs$End_percent <- "NA"
biscut_peak_loc_abslocs$End_percent <- (biscut_peak_loc_abslocs$End-biscut_peak_loc_abslocs$ArmStart)/biscut_peak_loc_abslocs$ArmLength
biscut_peak_loc_abslocs$End_percent[which(biscut_peak_loc_abslocs$End_percent<0)] <- 0
biscut_peak_loc_abslocs$End_percent[which(biscut_peak_loc_abslocs$End_percent>1)] <- 1

biscut_peak_loc_abslocs$Peak_Length <- biscut_peak_loc_abslocs$End-biscut_peak_loc_abslocs$Start
biscut_peak_loc_abslocs$Peak_Length_percent <- biscut_peak_loc_abslocs$Peak_Length/biscut_peak_loc_abslocs$ArmLength

# Reverse the direction peaks of cent-bounded on p and tel-bounded on q to make them start on 0
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$Start_percent <- 1-(biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$Start_percent)
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$End_percent <- 1-(biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$End_percent)
tmp_peak_start <- biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$End_percent
tmp_peak_end <- biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$Start_percent
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$Start_percent <- tmp_peak_start
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="cent" & biscut_peak_loc_abslocs$ArmType=="p")),]$End_percent <- tmp_peak_end

biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$Start_percent <- 1-(biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$Start_percent)
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$End_percent <- 1-(biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$End_percent)
tmp_peak_start <- biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$End_percent
tmp_peak_end <- biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$Start_percent
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$Start_percent <- tmp_peak_start
biscut_peak_loc_abslocs[(which(biscut_peak_loc_abslocs$telcent=="tel" & biscut_peak_loc_abslocs$ArmType=="q")),]$End_percent <- tmp_peak_end

biscut_peak_loc_abslocs$Chr <- as.integer(biscut_peak_loc_abslocs$Chr)
biscut_peak_loc_abslocs <- biscut_peak_loc_abslocs[order(biscut_peak_loc_abslocs$Chr),]

message("Step1: Extracting BISCUT identified genes on peaks...")
biscut_gene <- biscut_gene[,c("Chr","Gene","Start","End","Peak.Start","Peak.End","arm","direction","telcent")]
biscut_gene_loc_abslocs <- biscut_gene %>%
  left_join(abslocs, by = c("Chr" = "chromosome_info"))

biscut_gene_loc_abslocs$ArmType <- "NA"
biscut_gene_loc_abslocs$Start <- as.integer(biscut_gene_loc_abslocs$Start)
biscut_gene_loc_abslocs$End <- as.integer(biscut_gene_loc_abslocs$End)

biscut_gene_loc_abslocs$ArmType[which(biscut_gene_loc_abslocs$Start > biscut_gene_loc_abslocs$p_start)] <- "p"
biscut_gene_loc_abslocs$ArmType[which(biscut_gene_loc_abslocs$Start > biscut_gene_loc_abslocs$q_start)] <- "q"

biscut_gene_loc_abslocs$ArmLength <- "NA"
biscut_gene_loc_abslocs$ArmLength[which(biscut_gene_loc_abslocs$ArmType=="p")] <- biscut_gene_loc_abslocs$p_end[which(biscut_gene_loc_abslocs$ArmType=="p")]-biscut_gene_loc_abslocs$p_start[which(biscut_gene_loc_abslocs$ArmType=="p")]
biscut_gene_loc_abslocs$ArmLength[which(biscut_gene_loc_abslocs$ArmType=="q")] <- biscut_gene_loc_abslocs$q_end[which(biscut_gene_loc_abslocs$ArmType=="q")]-biscut_gene_loc_abslocs$q_start[which(biscut_gene_loc_abslocs$ArmType=="q")]
biscut_gene_loc_abslocs$ArmLength <- as.integer(biscut_gene_loc_abslocs$ArmLength)

biscut_gene_loc_abslocs$ArmStart<- "NA"
biscut_gene_loc_abslocs$ArmStart[which(biscut_gene_loc_abslocs$ArmType=="p")] <- biscut_gene_loc_abslocs$p_start[which(biscut_gene_loc_abslocs$ArmType=="p")]
biscut_gene_loc_abslocs$ArmStart[which(biscut_gene_loc_abslocs$ArmType=="q")] <- biscut_gene_loc_abslocs$q_start[which(biscut_gene_loc_abslocs$ArmType=="q")]
biscut_gene_loc_abslocs$ArmStart <- as.integer(biscut_gene_loc_abslocs$ArmStart)

biscut_gene_loc_abslocs$Start_percent <- "NA"
biscut_gene_loc_abslocs$Start_percent <- (biscut_gene_loc_abslocs$Start-biscut_gene_loc_abslocs$ArmStart)/biscut_gene_loc_abslocs$ArmLength
biscut_gene_loc_abslocs$Start_percent[which(biscut_gene_loc_abslocs$Start_percent<0)] <- 0
biscut_gene_loc_abslocs$Start_percent[which(biscut_gene_loc_abslocs$Start_percent>1)] <- 1

biscut_gene_loc_abslocs$End_percent <- "NA"
biscut_gene_loc_abslocs$End_percent <- (biscut_gene_loc_abslocs$End-biscut_gene_loc_abslocs$ArmStart)/biscut_gene_loc_abslocs$ArmLength
biscut_gene_loc_abslocs$End_percent[which(biscut_gene_loc_abslocs$End_percent<0)] <- 0
biscut_gene_loc_abslocs$End_percent[which(biscut_gene_loc_abslocs$End_percent>1)] <- 1

biscut_gene_loc_abslocs$Gene_Length <- biscut_gene_loc_abslocs$End-biscut_gene_loc_abslocs$Start
biscut_gene_loc_abslocs$Gene_Length_percent <- biscut_gene_loc_abslocs$Gene_Length/biscut_gene_loc_abslocs$ArmLength

# Reverse the direction genes of cent-bounded on p and tel-bounded on q to make them start on 0
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$Start_percent <- 1-(biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$Start_percent)
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$End_percent <- 1-(biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$End_percent)
tmp_gene_start <- biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$End_percent
tmp_gene_end <-biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$Start_percent
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$Start_percent <- tmp_gene_start
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="cent" & biscut_gene_loc_abslocs$ArmType=="p")),]$End_percent <- tmp_gene_end

biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$Start_percent <- 1-(biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$Start_percent)
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$End_percent <- 1-(biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$End_percent)
tmp_gene_start <- biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$End_percent
tmp_gene_end <-biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$Start_percent
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$Start_percent <- tmp_gene_start
biscut_gene_loc_abslocs[(which(biscut_gene_loc_abslocs$telcent=="tel" & biscut_gene_loc_abslocs$ArmType=="q")),]$End_percent <- tmp_gene_end

biscut_gene_loc_abslocs$Chr <- as.integer(biscut_gene_loc_abslocs$Chr)
biscut_gene_loc_abslocs <- biscut_gene_loc_abslocs[order(biscut_gene_loc_abslocs$Chr),]

message("Step1 All Done!")

# Step2: Process breakpoint file and count the number of true telomere/centromere-bounded SCNA breakpoints between peaks ####
message("Step2: Extracting breakpoint peak file...")

# Extract the 4 rows necessary for later calculation
biscut_peak_loc_abslocs_unique <- unique(biscut_peak_loc_abslocs[, c("Chr", "direction", "telcent", "ArmType")])

# Calculate the max number of peaks within one chr arm
dup_count <- biscut_peak_loc_abslocs %>%
  group_by(Chr, direction, telcent, ArmType) %>%
  summarise(count = n(), .groups = "drop") %>%
  as.data.frame(.)

# Create breakpoint_filenames to store paths of telomere/centromere-bounded SCNA breakpoint files
breakpoint_filenames <- unique(apply(biscut_peak_loc_abslocs_unique, 1, function(row) {
  paste(breakpoint_file_path,"/",tumor_type,"_",row["Chr"], row["ArmType"], "_", row["direction"], "_",row["telcent"], ".txt",sep = "")
}))

# Change the name for chr 13,14,15,21,22 to match the names of breakpoint filenames
breakpoint_filenames <- gsub("_13p","_13",breakpoint_filenames)
breakpoint_filenames <- gsub("_13q","_13",breakpoint_filenames)
breakpoint_filenames <- gsub("_14p","_14",breakpoint_filenames)
breakpoint_filenames <- gsub("_14q","_14",breakpoint_filenames)
breakpoint_filenames <- gsub("_15p","_15",breakpoint_filenames)
breakpoint_filenames <- gsub("_15q","_15",breakpoint_filenames)
breakpoint_filenames <- gsub("_21p","_21",breakpoint_filenames)
breakpoint_filenames <- gsub("_21q","_21",breakpoint_filenames)
breakpoint_filenames <- gsub("_22p","_22",breakpoint_filenames)
breakpoint_filenames <- gsub("_22q","_22",breakpoint_filenames)
breakpoint_filenames <- gsub(" ","",breakpoint_filenames)

# Calculate the max number of segments(between BISCUT peaks) in one arm
n = max(dup_count$count)+1

# Create column names for Expected and True breakpoint count within one segment
col_names <- paste0("E_", 0:(n-1), "_", 1:n)
new_rows_Expected <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_Expected) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_Expected)

col_names <- paste0("T_", 0:(n-1), "_", 1:n)
new_rows_True <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_True) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_True)

# Set the max number of peak in one chr arm
biscut_peak_n <- max(dup_count$count)

# Set names for peak start
col_names <- paste0("biscut_peak_start_", 1:biscut_peak_n)
new_rows_biscut_ps <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_biscut_ps) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_biscut_ps)

# Set names for peak end
col_names <- paste0("biscut_peak_end_", 1:biscut_peak_n)
new_rows_biscut_pe <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_biscut_pe) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_biscut_pe)

# Set names for peak length as percentage
col_names <- paste0("biscut_peak_length_percent_", 1:biscut_peak_n)
new_rows_biscut_plp <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_biscut_plp) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_biscut_plp)

# Read breakpoint_filenames
message("Matching breakpoint files with BISCUT peak files...")
count_n = 1
for (filenames in breakpoint_filenames){
  tmp_file <-  read.csv(filenames,sep='\t',header=T)
  tmp_file <- subset(tmp_file,percent>0) # Extract samples with SCNA length in the chr arm
  tmp_count_df <- as.data.frame(table(tmp_file$end)) # Extract SCNA breakpoint end as percentage and count its number in the chr arm
  tmp_count_df$Var1 <- as.numeric(as.character(tmp_count_df$Var1)) # Var1 correspond to the breakpoint end location in the chr arm, shown as percentage
  tmp_count_df$Freq <- as.numeric(as.character(tmp_count_df$Freq)) # Freq correspond to the count of the same breakpoints in the chr arm, shown as integer
  message("Processing ",biscut_peak_loc_abslocs_unique[count_n,]$Chr,biscut_peak_loc_abslocs_unique[count_n,]$ArmType," ", biscut_peak_loc_abslocs_unique[count_n,]$telcent, " ",biscut_peak_loc_abslocs_unique[count_n,]$direction)
  # Extract BISCUT indentified peak(peaks that located at the chr arm in breakpoint_filenames) locations(as percentage)
  tmp_biscut_peak_start <- subset(biscut_peak_loc_abslocs, Chr==biscut_peak_loc_abslocs_unique[count_n,]$Chr & direction==biscut_peak_loc_abslocs_unique[count_n,]$direction & telcent==biscut_peak_loc_abslocs_unique[count_n,]$telcent & ArmType==biscut_peak_loc_abslocs_unique[count_n,]$ArmType)$Start_percent
  tmp_biscut_peak_end <- subset(biscut_peak_loc_abslocs, Chr==biscut_peak_loc_abslocs_unique[count_n,]$Chr & direction==biscut_peak_loc_abslocs_unique[count_n,]$direction & telcent==biscut_peak_loc_abslocs_unique[count_n,]$telcent & ArmType==biscut_peak_loc_abslocs_unique[count_n,]$ArmType)$End_percent
  tmp_biscut_peak_length_percent <- subset(biscut_peak_loc_abslocs, Chr==biscut_peak_loc_abslocs_unique[count_n,]$Chr & direction==biscut_peak_loc_abslocs_unique[count_n,]$direction & telcent==biscut_peak_loc_abslocs_unique[count_n,]$telcent & ArmType==biscut_peak_loc_abslocs_unique[count_n,]$ArmType)$Peak_Length_percent
  message("Filtering telomere/centromere-bounded SCNA breakpoints before BISCUT peak1 start: ",min(tmp_biscut_peak_start))
  tmp_count_df_subset1 <- subset(tmp_count_df, Var1 < min(tmp_biscut_peak_start)) # SCNA breakpoint end < BISCUT peak start are identified as E_0_1
  tmp_count_df_subset2 <- subset(tmp_count_df, Var1 < 1 & Var1 > max(tmp_biscut_peak_end)) # Extract the SCNA breakpoint end in the last segment(located before arm end(1) and after the last BISCUT end(max(tmp_biscut_peak_end))) 
  tmp_E_0_1 <- sum(tmp_count_df_subset1$Freq) # Calculate the total count of Expected breakpoints in the first segment, which equals True total count of breakpoints
  tmp_T_0_1 <- sum(tmp_count_df_subset1$Freq) # Calculate the total count of True breakpoints in the first segment
  tmp_T_1_2 <- sum(tmp_count_df_subset2$Freq) # Calculate the total count of True breakpoints in the second segment, will change later if the number of BISCUT peak is larger than 1
  biscut_peak_loc_abslocs_unique[count_n,]$E_0_1 <- tmp_E_0_1
  biscut_peak_loc_abslocs_unique[count_n,]$T_0_1 <- tmp_T_0_1
  if (length(tmp_biscut_peak_start)==1){
    message("There's only 1 peak on ", biscut_peak_loc_abslocs_unique[count_n,]$Chr,biscut_peak_loc_abslocs_unique[count_n,]$ArmType," ", biscut_peak_loc_abslocs_unique[count_n,]$telcent, " ",biscut_peak_loc_abslocs_unique[count_n,]$direction)
    message("Filtering telomere/centromere-bounded SCNA breakpoints before arm end")
    biscut_peak_loc_abslocs_unique[count_n,]$biscut_peak_start_1 <- tmp_biscut_peak_start
    biscut_peak_loc_abslocs_unique[count_n,]$biscut_peak_end_1 <- tmp_biscut_peak_end
    biscut_peak_loc_abslocs_unique[count_n,]$biscut_peak_length_percent_1 <- tmp_biscut_peak_length_percent
    tmp_count_df_subset2 <- subset(tmp_count_df, Var1 < 1 & Var1 > tmp_biscut_peak_end) # SCNA breakpoint end < BISCUT peak start are identified as E_0_1
    biscut_peak_loc_abslocs_unique[count_n,]$T_1_2 <- tmp_T_1_2
  }else if(length(tmp_biscut_peak_start)>1){
    message("There're ",length(tmp_biscut_peak_start)," peaks on ", biscut_peak_loc_abslocs_unique[count_n,]$Chr,biscut_peak_loc_abslocs_unique[count_n,]$ArmType," ", biscut_peak_loc_abslocs_unique[count_n,]$telcent, " ",biscut_peak_loc_abslocs_unique[count_n,]$direction)
    for(peak_n in 1:length(tmp_biscut_peak_start)){
      message("Processing peak ", peak_n, "...")
      bps_col_name <- paste0("biscut_peak_start_", peak_n)
      bpe_col_name <- paste0("biscut_peak_end_", peak_n)
      bplp_col_name <- paste0("biscut_peak_length_percent_", peak_n)
      true_col_name <- paste0("T_", peak_n-1, "_", peak_n)
      biscut_peak_loc_abslocs_unique[count_n,bps_col_name] <- tmp_biscut_peak_start[peak_n]
      biscut_peak_loc_abslocs_unique[count_n,bpe_col_name] <- tmp_biscut_peak_end[peak_n]
      biscut_peak_loc_abslocs_unique[count_n,bplp_col_name] <- tmp_biscut_peak_length_percent[peak_n]
      if (1 < peak_n & peak_n < length(tmp_biscut_peak_start)){
        tmp_count_df_subset <- subset(tmp_count_df, Var1 < tmp_biscut_peak_start[peak_n] & Var1 > tmp_biscut_peak_end[peak_n-1])
        biscut_peak_loc_abslocs_unique[count_n,true_col_name] <- sum(tmp_count_df_subset$Freq)
      }else if(peak_n==length(tmp_biscut_peak_start)){
        tmp_count_df_subset1 <- subset(tmp_count_df, Var1 < tmp_biscut_peak_start[peak_n] & Var1 > tmp_biscut_peak_end[peak_n-1])
        tmp_count_df_subset2 <- subset(tmp_count_df, Var1 < 1 & Var1 > tmp_biscut_peak_end[peak_n])
        true_col_name2 <- paste0("T_", peak_n, "_", peak_n+1)
        biscut_peak_loc_abslocs_unique[count_n,true_col_name] <- sum(tmp_count_df_subset1$Freq)
        biscut_peak_loc_abslocs_unique[count_n,true_col_name2] <- sum(tmp_count_df_subset2$Freq)
      }
    }
  }
  count_n = count_n+1
}
message("Matching BISCUT peak files with BISCUT gene files...")
dup_count$MultiPeak <- "TRUE"
dup_count$MultiPeak[which(dup_count$count == 1)] <- "FALSE"

dup_peak <- unique(biscut_peak_loc_abslocs[, c("Chr", "direction", "telcent", "ArmType","Start","End","Start_percent","End_percent")])
dup_peak <- dup_peak[order(dup_peak$Start),]
dup_peak <- dup_peak %>%
  group_by(Chr, direction, telcent, ArmType) %>%
  mutate(
    PeakNum = paste0("Peak", row_number()),        # Assign peak numbers
    PeakNum = ifelse(row_number() == n(),         # Check if it's the last row in the group
                     paste0(PeakNum, "(Last)"), 
                     PeakNum),
    NextPeakStart = ifelse(grepl("\\(Last\\)", PeakNum), 1, lead(Start_percent)),  # Use lead(Start_percent) for next peak, 1 if Last
    PriorPeakEnd = ifelse(row_number() == 1, 0, lag(End_percent))
  ) %>%
  ungroup()

colnames(dup_peak)[which(colnames(dup_peak)=="Start")] <- "Peak.Start"
colnames(dup_peak)[which(colnames(dup_peak)=="End")] <- "Peak.End"
colnames(dup_peak)[which(colnames(dup_peak)=="Start_percent")] <- "Peak.Start_percent"
colnames(dup_peak)[which(colnames(dup_peak)=="End_percent")] <- "Peak.End_percent"

biscut_gene_loc_abslocs_peak <- biscut_gene_loc_abslocs %>%
  left_join(dup_peak, by = c("Chr" = "Chr","direction" = "direction", "telcent" = "telcent", "ArmType" = "ArmType","Peak.Start"="Peak.Start","Peak.End"="Peak.End"))

message("Matching breakpoint files with BISCUT gene files")
biscut_gene_loc_abslocs_peak$E_0_1 <- NA
biscut_gene_loc_abslocs_peak$E_1_2 <- NA
biscut_gene_loc_abslocs_peak$T_0_1 <- NA
biscut_gene_loc_abslocs_peak$T_1_2 <- NA

count_n=1
for (filenames in breakpoint_filenames){
  tmp_file <-  read.csv(filenames,sep='\t',header=T)
  tmp_file <- subset(tmp_file,percent>0) # Extract samples with SCNA length in the chr arm
  tmp_count_df <- as.data.frame(table(tmp_file$end)) # Extract SCNA breakpoint end as percentage and count its number in the chr arm
  tmp_count_df$Var1 <- as.numeric(as.character(tmp_count_df$Var1)) # Var1 correspond to the breakpoint end location in the chr arm, shown as percentage
  tmp_count_df$Freq <- as.numeric(as.character(tmp_count_df$Freq)) # Freq correspond to the count of the same breakpoints in the chr arm, shown as integer
  tmp_Chr <- biscut_peak_loc_abslocs_unique[count_n,]$Chr
  tmp_ArmType <- biscut_peak_loc_abslocs_unique[count_n,]$ArmType
  tmp_telcent <- biscut_peak_loc_abslocs_unique[count_n,]$telcent
  tmp_direction <- biscut_peak_loc_abslocs_unique[count_n,]$direction
  message("Processing ",biscut_peak_loc_abslocs_unique[count_n,]$Chr,biscut_peak_loc_abslocs_unique[count_n,]$ArmType," ", biscut_peak_loc_abslocs_unique[count_n,]$telcent, " ",biscut_peak_loc_abslocs_unique[count_n,]$direction)
  # Matching BISCUT indentified genes
  for (gn in 1:nrow(biscut_gene_loc_abslocs_peak)){
    if(biscut_gene_loc_abslocs_peak[gn,]$Chr==tmp_Chr & biscut_gene_loc_abslocs_peak[gn,]$ArmType==tmp_ArmType & biscut_gene_loc_abslocs_peak[gn,]$telcent==tmp_telcent & biscut_gene_loc_abslocs_peak[gn,]$direction==tmp_direction){
      message(filenames)
      tmp_biscut_gene_start <- biscut_gene_loc_abslocs_peak[gn,]$Start_percent
      tmp_biscut_gene_end <- biscut_gene_loc_abslocs_peak[gn,]$End_percent
      tmp_biscut_gene_length_percent <- biscut_gene_loc_abslocs_peak[gn,]$Gene_Length_percent
      tmp_prior_peak_end_percent <- biscut_gene_loc_abslocs_peak[gn,]$PriorPeakEnd
      tmp_next_peak_start_percent <- biscut_gene_loc_abslocs_peak[gn,]$NextPeakStart
      tmp_gene_name <- biscut_gene_loc_abslocs_peak[gn,]$Gene
      message("Filtering telomere/centromere-bounded SCNA breakpoints before ", tmp_gene_name," start: ",tmp_biscut_gene_start)
      tmp_count_df_subset1 <- subset(tmp_count_df, Var1 < tmp_biscut_gene_start & Var1 > tmp_prior_peak_end_percent) # SCNA breakpoint end < BISCUT gene start are identified as E_0_1
      tmp_count_df_subset2 <- subset(tmp_count_df, Var1 < tmp_next_peak_start_percent & Var1 > tmp_biscut_gene_end) # Extract the SCNA breakpoint end in the segment after the BISCUT gene end
      tmp_E_0_1 <- sum(tmp_count_df_subset1$Freq) # Calculate the total count of Expected breakpoints in the first segment, which equals True total count of breakpoints
      tmp_T_0_1 <- sum(tmp_count_df_subset1$Freq) # Calculate the total count of True breakpoints in the first segment
      tmp_T_1_2 <- sum(tmp_count_df_subset2$Freq) # Calculate the total count of True breakpoints in the second segment
      biscut_gene_loc_abslocs_peak[gn,]$E_0_1 <- tmp_E_0_1
      biscut_gene_loc_abslocs_peak[gn,]$T_0_1 <- tmp_T_0_1
      biscut_gene_loc_abslocs_peak[gn,]$T_1_2 <- tmp_T_1_2
      message("Gene ",gn," ",tmp_gene_name,": ",tmp_E_0_1,"|",tmp_T_0_1,"|",tmp_T_1_2)
    }
  }
  count_n = count_n+1
}

message("Step2 All Done!")

# Step3: Fit breakpoints to beta distribution to obtain alpha and beta value ####
biscut_peak_loc_abslocs_unique$alpha <- NA
biscut_peak_loc_abslocs_unique$beta <- NA

count_n = 1
for (filenames in breakpoint_filenames){
  tmp_file <-  read.csv(filenames,sep='\t',header=T)
  betafit <- fitdist(tmp_file$percent,'beta')
  alpha = summary(betafit)$estimate[1]
  beta = summary(betafit)$estimate[2]
  biscut_peak_loc_abslocs_unique[count_n,"alpha"] <- alpha
  biscut_peak_loc_abslocs_unique[count_n,"beta"] <- beta
  count_n = count_n+1
}

arm_alpha_beta <- biscut_peak_loc_abslocs_unique[,c("Chr","direction","telcent","ArmType","alpha","beta")]

biscut_gene_loc_abslocs_peak <- biscut_gene_loc_abslocs_peak %>%
  left_join(arm_alpha_beta, by = c("Chr" = "Chr","direction" = "direction", "telcent" = "telcent", "ArmType" = "ArmType"))

message("Step3 All Done!")

# Step4: Use BISCUT peak to estimate expected count of telomere/centromere-bounded SCNA breakpoints between peaks ####
message("Calculating peak-level expected count of telomere/centromere-bounded SCNA breakpoints")
for(n in 1:nrow(biscut_peak_loc_abslocs_unique)){
  for(peak_n in 1:max(dup_count$count)){
    expected_n_1_col_name <- paste0("E_", peak_n-1, "_", peak_n)
    expected_n_col_name <- paste0("E_", peak_n, "_", peak_n+1)

    bps_n_col_name <- paste0("biscut_peak_start_", peak_n)
    bpe_n_col_name <- paste0("biscut_peak_end_", peak_n)
    bplp_n_col_name <- paste0("biscut_peak_length_percent_", peak_n)

    bps_n1_col_name <- paste0("biscut_peak_start_", peak_n+1)
    bpe_n1_col_name <- paste0("biscut_peak_end_", peak_n+1)
    bplp_n1_col_name <- paste0("biscut_peak_length_percent_", peak_n+1)

    bps_n_1_col_name <- paste0("biscut_peak_start_", peak_n-1)
    bpe_n_1_col_name <- paste0("biscut_peak_end_", peak_n-1)
    bplp_n_1_col_name <- paste0("biscut_peak_length_percent_", peak_n-1)

    if (peak_n == 1){
      if(bps_n1_col_name %in% colnames(biscut_peak_loc_abslocs_unique)){
        if(is.na(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name])==TRUE){
          biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(1,biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))
        }else if(is.na(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name])==FALSE){
          biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))
        }
      }else if(((bps_n1_col_name %in% colnames(biscut_peak_loc_abslocs_unique)==FALSE))){
        biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(1,biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))
      }
    }else if(1<peak_n & peak_n<max(dup_count$count)){
      if(is.na(biscut_peak_loc_abslocs_unique[n,bps_n_col_name])==FALSE & is.na(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name])==FALSE){
        biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_1_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])) 
      }else if(is.na(biscut_peak_loc_abslocs_unique[n,bps_n_col_name])==FALSE & is.na(biscut_peak_loc_abslocs_unique[n,bps_n1_col_name])==TRUE){
        biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(1,biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_1_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))
      }
    }else if(peak_n==max(dup_count$count)){
      if(is.na(biscut_peak_loc_abslocs_unique[n,bps_n_col_name])==FALSE){
        biscut_peak_loc_abslocs_unique[n,expected_n_col_name] <- (pbeta(1,biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))*biscut_peak_loc_abslocs_unique[n,expected_n_1_col_name]/(pbeta(biscut_peak_loc_abslocs_unique[n,bps_n_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"])-pbeta(biscut_peak_loc_abslocs_unique[n,bpe_n_1_col_name],biscut_peak_loc_abslocs_unique[n,"alpha"],biscut_peak_loc_abslocs_unique[n,"beta"]))
      }
    } 
  }
}

message("Calculating gene-level expected count of telomere/centromere-bounded SCNA breakpoints")

for (n in 1:nrow(biscut_gene_loc_abslocs_peak)){
  biscut_gene_loc_abslocs_peak[n,"E_1_2"] <- (pbeta(biscut_gene_loc_abslocs_peak[n,"NextPeakStart"],biscut_gene_loc_abslocs_peak[n,"alpha"],biscut_gene_loc_abslocs_peak[n,"beta"])-pbeta(biscut_gene_loc_abslocs_peak[n,"End_percent"],biscut_gene_loc_abslocs_peak[n,"alpha"],biscut_gene_loc_abslocs_peak[n,"beta"]))*biscut_gene_loc_abslocs_peak[n,"E_0_1"]/(pbeta(biscut_gene_loc_abslocs_peak[n,"Start_percent"],biscut_gene_loc_abslocs_peak[n,"alpha"],biscut_gene_loc_abslocs_peak[n,"beta"])-pbeta(biscut_gene_loc_abslocs_peak[n,"PriorPeakEnd"],biscut_gene_loc_abslocs_peak[n,"alpha"],biscut_gene_loc_abslocs_peak[n,"beta"])) 
}

message("Step4 All Done!")

# Step5: Divide Expected count of telomere/centromere-bounded SCNA breakpoints with True telomere/centromere-bounded SCNA breakpoints to infer fitness ####
col_names <- paste0("biscut_peak", 1:biscut_peak_n,"_RF")
new_rows_biscut_prf <- as.data.frame(matrix(nrow = nrow(biscut_peak_loc_abslocs_unique), ncol = length(col_names)))
colnames(new_rows_biscut_prf) <- col_names
biscut_peak_loc_abslocs_unique <- cbind(biscut_peak_loc_abslocs_unique, new_rows_biscut_prf)

# Calculate Peak-level Relative Fitness as (True breakpoint count)/(Expected breakpoint count)
for(n in 1:nrow(biscut_peak_loc_abslocs_unique)){
  for(peak_n in 1:max(dup_count$count)){
    expected_n_col_name <- paste0("E_", peak_n, "_", peak_n+1)
    true_n_col_name <- paste0("T_", peak_n, "_", peak_n+1)
    RF_n_col_name <- paste0("biscut_peak", peak_n, "_RF")

    biscut_peak_loc_abslocs_unique[n,RF_n_col_name] <- (biscut_peak_loc_abslocs_unique[n,true_n_col_name]+0.000000001)/(biscut_peak_loc_abslocs_unique[n,expected_n_col_name]+0.000000001)
  }
}

# Calculate Gene-level Relative Fitness as (True breakpoint count)/(Expected breakpoint count)
biscut_gene_loc_abslocs_peak$RF <- (biscut_gene_loc_abslocs_peak$T_1_2+0.000000001)/(biscut_gene_loc_abslocs_peak$E_1_2+0.000000001)

message("Step5 All Done!")
message("Writting Files to ",output_path,"/",tumor_type)

write.table(biscut_peak_loc_abslocs_unique,paste0(output_path,"/",tumor_type,"_biscut_PANCAN_peak_RF.txt"),col.names = T, row.names = F, sep = '\t', quote = F)
write.table(biscut_gene_loc_abslocs_peak,paste0(output_path,"/",tumor_type,"_biscut_PANCAN_gene_RF.txt"),col.names = T, row.names = F, sep = '\t', quote = F)
2025 年 1 月
一 二 三 四 五 六 日
 12345
6789101112
13141516171819
20212223242526
2728293031  
« 12 月   2 月 »

俺家的猫~

胖达~

© 2025 KevinZ的小窝 |

粤ICP备2023017690号

|

粤公网安备 44010402003004号