Numbat官网: https://kharchenkolab.github.io/numbat/articles/numbat.html
- 其它依赖包:
- cellsnp-lite:https://github.com/single-cell-genetics/cellsnp-lite
- Eagle2:https://alkesgroup.broadinstitute.org/Eagle/
- samtools
1. 安装及参考文件下载
Numbat 安装:
install.packages('numbat', dependencies = TRUE)
devtools::install_github("https://github.com/kharchenkolab/numbat")
# 注意还要下载github官网的包,里面有个单独的脚本会用来提取snp
Eagle v2.4.1:
http://data.broadinstitute.org/alkesgroup/Eagle/downloads/
参考文件:
# 1000G SNP VCF
# hg38
wget https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf.gz
# hg19
wget https://sourceforge.net/projects/cellsnp/files/SNPlist/genome1K.phase3.SNP_AF5e2.chr1toX.hg19.vcf.gz
# 1000G Reference Panel (paste link in browser to download if wget isn’t working)
# hg38
wget http://pklab.med.harvard.edu/teng/data/1000G_hg38.zip
# hg19
wget http://pklab.med.harvard.edu/teng/data/1000G_hg19.zip
2. Preparing data
对每个单细胞样本进行一次pileup
- 模板sh文件:
EXAMPLE_Run_pileup_and_phase.sh
Rscript /home/zhoukaiwen/software/numbat-1.4.2/inst/bin/pileup_and_phase.R \
--label EXAMPLE \
--samples EXAMPLE \
--bams /groups/phyllodes/home/share/Results/scRNA/cellranger_count_new/EXAMPLE/outs/possorted_genome_bam.bam \
--barcodes /groups/phyllodes/home/share/Results/scRNA/cellranger_count_new/EXAMPLE/outs/filtered_feature_bc_matrix/barcodes.tsv \
--outdir EXAMPLE \
--gmap /home/zhoukaiwen/software/Eagle_v2.4.1/tables/genetic_map_hg38_withX.txt.gz \
--snpvcf /home/zhoukaiwen/database/numbat_reference/genome1K.phase3.SNP_AF5e2.chr1toX.hg38.vcf \
--paneldir /home/zhoukaiwen/database/numbat_reference/1000G_hg38 \
--ncores 4
- 根据模板文件为每个样本生成一个脚本(需要有一个叫sample_id.txt的文件,里面包含了所有的样本名):
Create_Run_pileup_and_phase_Script.sh
#!/bin/bash
# 读取sample_id.txt中的每个样本ID
while IFS= read -r sample_id; do
if [[ -n "$sample_id" ]]; then
# 生成新的sh文件名
new_file="${sample_id}_Run_pileup_and_phase.sh"
# 复制并替换内容
sed "s/EXAMPLE/${sample_id}/g" EXAMPLE_Run_pileup_and_phase.sh > "$new_file"
echo "Generated: $new_file"
fi
done < sample_id.txt
echo "All scripts generated successfully!"
- 生成批量运行上述生成的sh的脚本:
Run_All_Run_pileup_and_phase.sh
echo '#!/bin/bash' > Run_All_Run_pileup_and_phase.sh
echo '# 批量运行所有pileup和phase脚本' >> Run_All_Run_pileup_and_phase.sh
echo '' >> Run_All_Run_pileup_and_phase.sh
# 添加每个脚本的运行命令
for script in Run_pileup_and_phase_FETB*.sh; do
echo "echo \"开始运行: $script\"" >> Run_All_Run_pileup_and_phase.sh
echo "bash $script" >> Run_All_Run_pileup_and_phase.sh
echo "echo \"完成: $script\"" >> Run_All_Run_pileup_and_phase.sh
echo "echo \"----------------------------------------\"" >> Run_All_Run_pileup_and_phase.sh
echo "" >> Run_All_Run_pileup_and_phase.sh
done
echo 'echo "所有脚本运行完成!"' >> Run_All_Run_pileup_and_phase.sh
3. Running Numbat
- 模板文件:
EXAMPLE_Run_Numbat.main.R
library(Seurat)
library(numbat)
library(qs)
library(data.table)
message("Loading File...")
seu_obj <- qread("/groups/phyllodes/home/share/Results/transfer/16samples_Merged_AllCells_Annotated_Final.qs")
seu_obj <- subset(seu_obj,orig.ident=="EXAMPLE")
#All_matrix <- GetAssayData(seu_obj,slot = "counts")
#All_matrix <- as.data.frame(All_matrix)
#All_matrix <- round(All_matrix,digits = 0)
#qsave(All_matrix,"All_matrix.qs")
All_matrix <- qread("All_matrix.qs")
#All_data <- GetAssayData(seu_obj,slot = "data")
#All_data <- as.data.frame(All_data)
#qsave(All_data,"All_data.qs")
All_data <- qread("All_data.qs")
reference_cells <- c("SELL+_CD4+_Tn","FOXP3+_Treg","CD4+_Tem","ANXA1+_CD4+_Tcm",
"CXCL13+_Tfh","CD8+_MAIT","GZMK+_CD8+_Tem","ZNF683+CXCR6-_CD8+_Trm",
"ZNF683+CXCR6+_CD8+_Trm","KLRG1+_CD8+_Temra/Teff","SELL+_CD8+_Tn/Tcm","RSAD2+GZMK+_CD8+_Tem",
"GZMK+PDCD1+_CD8+_Tex","Proliferating_CD8+_T", "CD56dimCD16hi_NKT","CD56brightCD16lo_NK",
"CD56dimCD16hi_NK","IL41+KIT+_NK","Bn","Bm",
"ASCs","Bgc","ASC-like","Unknown_B",
"Macrophages","CD1C+_cDC2","MKI67+_Proliferating_mono/macro","Monocytes","MastCells")
reference_cell_id <- rownames(subset(seu_obj@meta.data,CellType_Minor %in% reference_cells))
tumor_cell_id <- rownames(subset(seu_obj@meta.data,!(CellType_Minor %in% reference_cells)))
tumor_matrix <- All_matrix[,tumor_cell_id]
reference_matrix <- All_data[,reference_cell_id]
df_allele <- fread("gunzip -c EXAMPLE/EXAMPLE_allele_counts.tsv.gz")
suffix_num <- strsplit(colnames(tumor_matrix),"_")[[1]][2]
df_allele$cell <- paste0(df_allele$cell,"_",suffix_num)
rm(list=c("All_data","All_matrix","seu_obj"))
message("Start run_numbat")
out = run_numbat(
as.matrix(tumor_matrix), # gene x cell integer UMI count matrix
as.matrix(reference_matrix), # reference expression profile, a gene x cell type normalized expression level matrix
df_allele, # allele dataframe generated by pileup_and_phase script
genome = "hg38",
t = 1e-5,
ncores = 4,
plot = TRUE,
out_dir = './EXAMPLE_numbat'
)
- 为每个样本生成一个R脚本
Create_Run_numbat.sh
#!/bin/bash
# 读取sample_id.txt中的每个样本ID
while IFS= read -r sample_id; do
if [[ -n "$sample_id" ]]; then
# 生成新的sh文件名
new_file="${sample_id}_Run_Numbat.main.R"
# 复制并替换内容
sed "s/EXAMPLE/${sample_id}/g" EXAMPLE_Run_Numbat.main.R > "$new_file"
echo "Generated: $new_file"
fi
done < sample_id.txt
echo "All scripts generated successfully!"
- 批量运行R的脚本
echo '#!/bin/bash' > Run_All_Run_numbat.sh
echo '# 批量运行所有pileup和phase脚本' >> Run_All_Run_numbat.sh
echo '' >> Run_All_Run_numbat.sh
for script in FETB*_Run_Numbat.main.R; do
echo "echo \"开始运行: $script\"" >> Run_All_Run_numbat.sh
echo "Rscript ./$script" >> Run_All_Run_numbat.sh
echo "echo \"完成: $script\"" >> Run_All_Run_numbat.sh
echo "echo \"----------------------------------------\"" >> Run_All_Run_numbat.sh
echo "" >> Run_All_Run_numbat.sh
done