使用 Numbat 进行单细胞恶性细胞鉴定

Numbat官网： https://kharchenkolab.github.io/numbat/articles/numbat.html

其它依赖包：
1. cellsnp-lite：https://github.com/single-cell-genetics/cellsnp-lite
2. Eagle2：https://alkesgroup.broadinstitute.org/Eagle/
3. samtools

1. 安装及参考文件下载

Numbat 安装：

install.packages('numbat', dependencies = TRUE)

devtools::install_github("https://github.com/kharchenkolab/numbat")

# 注意还要下载github官网的包，里面有个单独的脚本会用来提取snp

Eagle v2.4.1：

http://data.broadinstitute.org/alkesgroup/Eagle/downloads/

参考文件：

# 1000G SNP VCF
# hg38
wget https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf.gz
# hg19
wget https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e2.chr1toX.hg19.vcf.gz

# 1000G Reference Panel (paste link in browser to download if wget isn’t working)
# hg38
wget http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip
# hg19
wget http://pklab.med.harvard.edu/teng/data/1000G_hg19.zip

2. Preparing data

对每个单细胞样本进行一次pileup

模板sh文件：
EXAMPLE_Run_pileup_and_phase.sh

Rscript /home/zhoukaiwen/software/numbat-1.4.2/inst/bin/pileup_and_phase.R \
--label EXAMPLE \
--samples EXAMPLE \
--bams /groups/phyllodes/home/share/Results/scRNA/cellranger_count_new/EXAMPLE/outs/possorted_genome_bam.bam \
--barcodes /groups/phyllodes/home/share/Results/scRNA/cellranger_count_new/EXAMPLE/outs/filtered_feature_bc_matrix/barcodes.tsv \
--outdir EXAMPLE \
--gmap /home/zhoukaiwen/software/Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
--snpvcf /home/zhoukaiwen/database/numbat_reference/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
--paneldir /home/zhoukaiwen/database/numbat_reference/1000G_hg38 \
--ncores 4

根据模板文件为每个样本生成一个脚本（需要有一个叫sample_id.txt的文件，里面包含了所有的样本名）：
Create_Run_pileup_and_phase_Script.sh

#!/bin/bash

# 读取sample_id.txt中的每个样本ID
while IFS= read -r sample_id; do
    if [[ -n "$sample_id" ]]; then
        # 生成新的sh文件名
        new_file="${sample_id}_Run_pileup_and_phase.sh"

        # 复制并替换内容
        sed "s/EXAMPLE/${sample_id}/g" EXAMPLE_Run_pileup_and_phase.sh > "$new_file"

        echo "Generated: $new_file"
    fi
done < sample_id.txt

echo "All scripts generated successfully!"

生成批量运行上述生成的sh的脚本：
Run_All_Run_pileup_and_phase.sh

echo '#!/bin/bash' > Run_All_Run_pileup_and_phase.sh
echo '# 批量运行所有pileup和phase脚本' >> Run_All_Run_pileup_and_phase.sh
echo '' >> Run_All_Run_pileup_and_phase.sh

# 添加每个脚本的运行命令
for script in Run_pileup_and_phase_FETB*.sh; do
    echo "echo \"开始运行: $script\"" >> Run_All_Run_pileup_and_phase.sh
    echo "bash $script" >> Run_All_Run_pileup_and_phase.sh
    echo "echo \"完成: $script\"" >> Run_All_Run_pileup_and_phase.sh
    echo "echo \"----------------------------------------\"" >> Run_All_Run_pileup_and_phase.sh
    echo "" >> Run_All_Run_pileup_and_phase.sh
done

echo 'echo "所有脚本运行完成!"' >> Run_All_Run_pileup_and_phase.sh

3. Running Numbat

模板文件：
EXAMPLE_Run_Numbat.main.R

library(Seurat)
library(numbat)
library(qs)
library(data.table)

message("Loading File...")
seu_obj <- qread("/groups/phyllodes/home/share/Results/transfer/16samples_Merged_AllCells_Annotated_Final.qs")
seu_obj <- subset(seu_obj,orig.ident=="EXAMPLE")

#All_matrix <- GetAssayData(seu_obj,slot = "counts")
#All_matrix <- as.data.frame(All_matrix)
#All_matrix <- round(All_matrix,digits = 0)
#qsave(All_matrix,"All_matrix.qs")
All_matrix <- qread("All_matrix.qs")

#All_data <- GetAssayData(seu_obj,slot = "data")
#All_data <- as.data.frame(All_data)
#qsave(All_data,"All_data.qs")
All_data <- qread("All_data.qs")

reference_cells <- c("SELL+_CD4+_Tn","FOXP3+_Treg","CD4+_Tem","ANXA1+_CD4+_Tcm",
                     "CXCL13+_Tfh","CD8+_MAIT","GZMK+_CD8+_Tem","ZNF683+CXCR6-_CD8+_Trm",
                     "ZNF683+CXCR6+_CD8+_Trm","KLRG1+_CD8+_Temra/Teff","SELL+_CD8+_Tn/Tcm","RSAD2+GZMK+_CD8+_Tem",
                     "GZMK+PDCD1+_CD8+_Tex","Proliferating_CD8+_T", "CD56dimCD16hi_NKT","CD56brightCD16lo_NK",
                     "CD56dimCD16hi_NK","IL41+KIT+_NK","Bn","Bm",
                     "ASCs","Bgc","ASC-like","Unknown_B",
                     "Macrophages","CD1C+_cDC2","MKI67+_Proliferating_mono/macro","Monocytes","MastCells")

reference_cell_id <- rownames(subset(seu_obj@meta.data,CellType_Minor %in% reference_cells))
tumor_cell_id <- rownames(subset(seu_obj@meta.data,!(CellType_Minor %in% reference_cells)))

tumor_matrix <- All_matrix[,tumor_cell_id]
reference_matrix <- All_data[,reference_cell_id]

df_allele <- fread("gunzip -c EXAMPLE/EXAMPLE_allele_counts.tsv.gz")

suffix_num <- strsplit(colnames(tumor_matrix),"_")[[1]][2]
df_allele$cell <- paste0(df_allele$cell,"_",suffix_num)

rm(list=c("All_data","All_matrix","seu_obj"))

message("Start run_numbat")
out = run_numbat(
  as.matrix(tumor_matrix), # gene x cell integer UMI count matrix 
  as.matrix(reference_matrix), # reference expression profile, a gene x cell type normalized expression level matrix
  df_allele, # allele dataframe generated by pileup_and_phase script
  genome = "hg38",
  t = 1e-5,
  ncores = 4,
  plot = TRUE,
  out_dir = './EXAMPLE_numbat'
)

为每个样本生成一个R脚本
Create_Run_numbat.sh

#!/bin/bash

# 读取sample_id.txt中的每个样本ID
while IFS= read -r sample_id; do
    if [[ -n "$sample_id" ]]; then
        # 生成新的sh文件名
        new_file="${sample_id}_Run_Numbat.main.R"

        # 复制并替换内容
        sed "s/EXAMPLE/${sample_id}/g" EXAMPLE_Run_Numbat.main.R > "$new_file"

        echo "Generated: $new_file"
    fi
done < sample_id.txt

echo "All scripts generated successfully!"

批量运行R的脚本

echo '#!/bin/bash' > Run_All_Run_numbat.sh
echo '# 批量运行所有pileup和phase脚本' >> Run_All_Run_numbat.sh
echo '' >> Run_All_Run_numbat.sh
for script in FETB*_Run_Numbat.main.R; do
    echo "echo \"开始运行: $script\"" >> Run_All_Run_numbat.sh
    echo "Rscript ./$script" >> Run_All_Run_numbat.sh
    echo "echo \"完成: $script\"" >> Run_All_Run_numbat.sh
    echo "echo \"----------------------------------------\"" >> Run_All_Run_numbat.sh
    echo "" >> Run_All_Run_numbat.sh
done

一	二	三	四	五	六	日
		1	2	3	4	5
6	7	8	9	10	11	12
13	14	15	16	17	18	19
20	21	22	23	24	25	26
27	28	29	30	31