1. 下载并安装scAllele
官网:https://github.com/gxiaolab/scAllele
conda create -n scallele
mamba install -c bioconda -c conda-forge python=3.10 pysam samtools bamtools
pip install scallele
2. 按照Barcode分割比对后单细胞Bam文件(Cellranger out)
2.1 方法一
参考:https://github.com/pezmaster31/bamtools/issues/135
ulimit -n 65535
samtools sort -t CB /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/possorted_genome_bam.bam > ./CBsorted_tags.bam
split_script.py:
##### Code has not been tested on unsorted bam files, sort on barcode (CB):
##### samtools sort -t CB unsorted.bam > sorted_tags.bam
###
##### INPUT: .bam file to be sorted and output directory to place split BC
##### OUTPUT: .bam file for each unique barcode, best to make a new directory
### Python 3.6.8
import pysam
### Input varibles to set
# file to split on
unsplit_file = "/path/to/dir/of/data/sorted_tags.bam"
# where to place output files
out_dir = "/path/to/dir/of/out_data/"
# variable to hold barcode index
CB_hold = 'unset'
itr = 0
# read in upsplit file and loop reads by line
samfile = pysam.AlignmentFile( unsplit_file, "rb")
for read in samfile.fetch( until_eof=True):
# barcode itr for current read
CB_itr = read.get_tag( 'CB')
# if change in barcode or first line; open new file
if( CB_itr!=CB_hold or itr==0):
# close previous split file, only if not first read in file
if( itr!=0):
split_file.close()
CB_hold = CB_itr
itr+=1
split_file = pysam.AlignmentFile( out_dir + "CB_{}.bam".format( itr), "wb", template=samfile)
# write read with same barcode to file
split_file.write( read)
split_file.close()
samfile.close()
python split_script.py
2.2 方法二
也可尝试直接用bamtools:
cp /groups/phyllodes/home/share/Results/scRNA/starsolo/FETB01-MPT-01Aligned.sortedByCoord.out.bam ./
bamtools split -tag CB -in FETB01-MPT-01Aligned.sortedByCoord.out.bam
2.3 方法三
https://github.com/10XGenomics/subset-bam
conda activate subset-bam
gunzip -c /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/filtered_feature_bc_matrix/barcodes.tsv.gz > /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/filtered_feature_bc_matrix/barcodes.tsv
subset-bam -b /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/possorted_genome_bam.bam -c /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/filtered_feature_bc_matrix/barcodes.tsv.gz -o /groups/phyllodes/home/share/Results/scRNA/scallele/FETB01-MPT-01_bam
2.4 方法四 (这个才能用)
https://github.com/10XGenomics/subset-bam/issues/17
https://github.com/aertslab/single_cell_toolkit/blob/master/subset_bam_per_cb.sh
subset_bam_per_cb.sh:
#!/bin/bash
#
# Copyright (C) 2024 - Gert Hulselmans
#
# Purpose:
# Subset BAM file per cell barcode.
set -e
set -o pipefail
subset_bam_file_per_cb_chunk () {
local bam_file="${1}";
local barcodes_file="${2}";
local bam_output_prefix="${3%.}";
if [ ${#@} -ne 3 ] ; then
printf 'Usage:\n';
printf ' subset_bam_file_per_cb_chunk bam_file barcodes_file bam_output_prefix\n\n';
printf 'Arguments:\n';
printf ' bam_file: BAM file to subset per provided cell barcode.\n';
printf ' barcodes_file: File with cell barcodes to subset input BAM file.\n';
printf ' bam_output_prefix: Prefix used for output per CB BAM files.\n';
return 1;
fi
if [ "${bam_file##*.}" != "bam" ] ; then
printf 'Error: BAM file should have ".bam" extension.\n';
return 1;
fi
if [ ! -f "${bam_file}" ] ; then
printf 'Error: BAM file "%s" does not exist.\n' "${bam_file}";
return 1;
fi
if [ ! -f "${barcodes_file}" ] ; then
printf 'Error: Barcodes file "%s" does not exist.\n' "${barcodes_file}";
return 1;
fi
if [ ! -d $(dirname "${bam_output_prefix}") ] ; then
printf 'Error: Directory "%s" does not exist.\n' "$(dirname "${bam_output_prefix}")";
return 1;
fi
# First extract all requested cell barcodes from BAM file
# before writing individual per CB BAM files.
samtools view -@ 4 -h -D CB:"${barcodes_file}" "${bam_file}" \
| mawk \
-v bam_output_prefix="${bam_output_prefix}" \
-F '\t' \
'
BEGIN {
header = "";
CB_tag_found = 0;
}
{
if ( $0 ~ /^@/ ) {
# Collect SAM header.
if ( header == "" ) {
header = $0;
} else {
header = header "\n" $0;
}
} else {
CB_tag_found = 0;
# Search for CB tag.
for ( i = 12; i <= NF; i++ ) {
if ( substr($i, 1, 5) == "CB:Z:" ) {
CB = substr($i, 6);
CB_tag_found = 1;
break;
}
}
if ( CB_tag_found == 1 ) {
if (! (CB in CBs_found) ) {
# Write BAM header if this is the first time this CB was seen.
print header | "samtools view -O bam -o " bam_output_prefix "." CB ".bam -";
CBs_found[CB] = 1;
}
# Write read to CB specific BAM file.
print $0 | "samtools view -O bam -o " bam_output_prefix "." CB ".bam -";
}
}
}
END {
# Close all file handles.
for ( CB in CBs_found ) {
close("samtools view -O bam -o " bam_output_prefix "." CB ".bam -");
}
}
'
}
subset_bam_file_per_cb () {
local bam_file="${1}";
local barcodes_file="${2}";
local bam_output_prefix="${3%.}";
# Number of barcodes to process in one invocation of subset_bam_file_per_cb_chunk.
# This is used to limit the number of open file handles and the number of parallel
# spawned samtools processes.
local chunk_size="${4:-1000}";
if [ ${#@} -lt 3 ] ; then
printf 'Usage:\n';
printf ' subset_bam_file_per_cb bam_file barcodes_file bam_output_prefix [chunk_size]\n\n';
printf 'Arguments:\n';
printf ' bam_file: BAM file to subset per provided cell barcode.\n';
printf ' barcodes_file: File with cell barcodes to subset input BAM file.\n';
printf ' bam_output_prefix: Prefix used for output per CB BAM files.\n';
printf ' chunk_size: Number of cell barcodes to process simultaniously.\n'
printf ' If more cell barcodes are provided, the input BAM file\n';
printf ' will be read multiple times. It is recommended to not\n';
printf ' set this value too high as the same number of parallel\n';
printf ' spawned samtools processes will be created for writing\n';
printf ' the per CB BAM files.\n';
printf ' Default: 1000.\n';
return 1;
fi
if [ "${bam_file##*.}" != "bam" ] ; then
printf 'Error: BAM file should have ".bam" extension.\n';
return 1;
fi
if [ ! -f "${bam_file}" ] ; then
printf 'Error: BAM file "%s" does not exist.\n' "${bam_file}";
return 1;
fi
if [ ! -f "${barcodes_file}" ] ; then
printf 'Error: Barcodes file "%s" does not exist.\n' "${barcodes_file}";
return 1;
fi
if [ ! -d $(dirname "${bam_output_prefix}") ] ; then
printf 'Error: Directory "%s" does not exist.\n' "$(dirname "${bam_output_prefix}")";
return 1;
fi
if [ $(cat ${barcodes_file} | wc -l) -le ${chunk_size} ] ; then
# If barcodes file contains less than ${chunk_size} lines, process all barcodes at once.
subset_bam_file_per_cb_chunk "${bam_file}" "${barcodes_file}" "${bam_output_prefix}";
return $?;
else
# Split barcodes file in chunks of ${chunk_size} lines.
local subset_bam_file_per_cb_tmp_dir=$(mktemp -t -d subset_bam_file_per_cb_XXXXXX);
split -l ${chunk_size} -d "${barcodes_file}" ${subset_bam_file_per_cb_tmp_dir}/barcodes_part
# Subset BAM file in per chunk of barcodes.
for barcodes_part_file in ${subset_bam_file_per_cb_tmp_dir}/barcodes_part* ; do
subset_bam_file_per_cb_chunk "${bam_file}" "${barcodes_part_file}" "${bam_output_prefix}";
done
# Remove temporary files.
rm ${subset_bam_file_per_cb_tmp_dir}/barcodes_part*;
rmdir "${subset_bam_file_per_cb_tmp_dir}";
fi
}
subset_bam_file_per_cb "${@}";
conda create -n subset-bam
mamba install -c bioconda -c conda-forge mawk samtools
./subset_bam_per_cb.sh /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/possorted_genome_bam.bam /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/filtered_feature_bc_matrix/barcodes.tsv /groups/phyllodes/home/share/Results/scRNA/scallele/FETB01-MPT-01
# 构建批量运行脚本:
for d in /groups/phyllodes/home/share/Results/scRNA/cellranger_count/*/; do s=$(basename "$d"); echo "mkdir $s && ./subset_bam_per_cb.sh $d/outs/possorted_genome_bam.bam $d/outs/filtered_feature_bc_matrix/barcodes.tsv /groups/phyllodes/home/share/Results/scRNA/scallele/$s/$s"; done > SubSetBam.sh
3. 对每个bam文件运行scAllele:
此处注意 -o 的 prefix 不能为bam文件的名字,否则会删除bam文件(bug?)
for f in *.bam; do base=${f%.bam}; echo "samtools index $f && scAllele -n 4 -b $f -g /groups/phyllodes/home/share/Reference/GRCh38_Ensembl_Ensembl104/fasta/genome.fa -o $base-scallele && echo $base ok"; done > run_scAllele.sh
4. 对每个样本文件夹下的vcf文件筛选PASS突变并添加header
#!/bin/bash
# Headers to add (with placeholder for SAMPLE)
HEADER="##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description=\"Variant passes all filters\">
##FILTER=<ID=LowQual,Description=\"Variant does not pass one or more filtering criteria\">
##INFO=<ID=varType,Number=1,Type=String,Description=\"Variant type\">
##INFO=<ID=varGroup,Number=1,Type=String,Description=\"Variant Group\">
##INFO=<ID=varLength,Number=1,Type=Integer,Description=\"Variant length\">
##INFO=<ID=varEnd,Number=1,Type=Integer,Description=\"End position of the variant\">
##INFO=<ID=TandemRep,Number=1,Type=Integer,Description=\"Count of repeat sequence adjacent to the variant\">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of reads overlapping the variant position\">
##FORMAT=<ID=AC,Number=R,Type=Integer,Description=\"Number of reads containing the variant allele\">
##FORMAT=<ID=RC,Number=R,Type=Integer,Description=\"Number of reads containing the reference allele\">
##FORMAT=<ID=AB,Number=1,Type=Float,Description=\"Allelic balance of the variant\">
##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Consensus genotype\">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype quality\">
##FORMAT=<ID=RCL,Number=1,Type=Integer,Description=\"Read Cluster id\">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE"
# Loop through all VCF files
for file in *scallele.vcf; do
echo "Processing: $file"
# 1. 提取cell barcode
cell_barcode=$(echo "$file" | sed -r 's/^.*\.([A-Z0-9\-]+\-[0-9]+)\-scallele\.vcf$/\1/')
echo "Cell Barcode: $cell_barcode"
# 2. 创建新文件名(添加 .pass 后缀)
new_file="${file%.vcf}.pass.vcf"
# 3. 替换header中的SAMPLE
header_with_cell_barcode=$(echo "$HEADER" | sed "s/SAMPLE/$cell_barcode/")
# 4. 创建新文件(包含新header和筛选的PASS行)
# 先写入新header
echo "$header_with_cell_barcode" > "$new_file"
# 追加筛选的PASS行(保留原header会干扰,所以跳过所有注释行)
awk '!/^#/ && $7 == "PASS"' "$file" >> "$new_file"
echo "Created new file: $new_file (contains only PASS variants)"
done
5. 对过滤后的vcf进行Annovar+vcf2maf
# annovar
ls *.pass.vcf | perl -ne 'chomp; my $name = $1 if ($_ =~ /([^\/]+)\.pass\.vcf/); print "~/software/annovar/table_annovar.pl $name.pass.vcf ~/database/Annovar/humandb_hg38 -buildver hg38 -out $name.pass -remove -protocol refGene,cytoBand,avsnp151,exac03,gnomad41_exome,EAS.sites.2015_08,cosmic102_unique -operation g,r,f,f,f,f,f -nastring . -vcfinput \n"'>annovar.sh
# vcf2maf
6. 对同一个样本的不同细胞的maf文件进行合并