使用scAllele计算cell-level SNV

1. 下载并安装scAllele

conda create -n scallele
mamba install -c bioconda -c conda-forge python=3.10 pysam samtools bamtools

pip install scallele

2. 按照Barcode分割比对后单细胞Bam文件(Cellranger out)

2.1 方法一

参考：https://github.com/pezmaster31/bamtools/issues/135

ulimit -n 65535
samtools sort -t CB /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/possorted_genome_bam.bam  > ./CBsorted_tags.bam

split_script.py:

##### Code has not been tested on unsorted bam files, sort on barcode (CB):
##### samtools sort -t CB unsorted.bam > sorted_tags.bam
###
##### INPUT: .bam file to be sorted and output directory to place split BC
##### OUTPUT: .bam file for each unique barcode, best to make a new directory

### Python 3.6.8
import pysam

### Input varibles to set
# file to split on
unsplit_file = "/path/to/dir/of/data/sorted_tags.bam"
# where to place output files
out_dir = "/path/to/dir/of/out_data/"

# variable to hold barcode index
CB_hold = 'unset'
itr = 0
# read in upsplit file and loop reads by line
samfile = pysam.AlignmentFile( unsplit_file, "rb")
for read in samfile.fetch( until_eof=True):
    # barcode itr for current read
    CB_itr = read.get_tag( 'CB')
    # if change in barcode or first line; open new file  
    if( CB_itr!=CB_hold or itr==0):
        # close previous split file, only if not first read in file
        if( itr!=0):
            split_file.close()
        CB_hold = CB_itr
        itr+=1
        split_file = pysam.AlignmentFile( out_dir + "CB_{}.bam".format( itr), "wb", template=samfile)

    # write read with same barcode to file
    split_file.write( read)
split_file.close()
samfile.close()

python split_script.py

2.2 方法二

也可尝试直接用bamtools：

cp /groups/phyllodes/home/share/Results/scRNA/starsolo/FETB01-MPT-01Aligned.sortedByCoord.out.bam ./
bamtools split -tag CB -in FETB01-MPT-01Aligned.sortedByCoord.out.bam

2.3 方法三

https://github.com/10XGenomics/subset-bam

conda activate subset-bam

gunzip -c /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/filtered_feature_bc_matrix/barcodes.tsv.gz > /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/filtered_feature_bc_matrix/barcodes.tsv
subset-bam -b /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/possorted_genome_bam.bam -c /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/filtered_feature_bc_matrix/barcodes.tsv.gz -o /groups/phyllodes/home/share/Results/scRNA/scallele/FETB01-MPT-01_bam

2.4 方法四 (这个才能用)

https://github.com/10XGenomics/subset-bam/issues/17
https://github.com/aertslab/single_cell_toolkit/blob/master/subset_bam_per_cb.sh
subset_bam_per_cb.sh:

#!/bin/bash
#
# Copyright (C) 2024 - Gert Hulselmans
#
# Purpose:
#   Subset BAM file per cell barcode.

set -e
set -o pipefail

subset_bam_file_per_cb_chunk () {
    local bam_file="${1}";
    local barcodes_file="${2}";
    local bam_output_prefix="${3%.}";

    if [ ${#@} -ne 3 ] ; then
        printf 'Usage:\n';
        printf '  subset_bam_file_per_cb_chunk bam_file barcodes_file bam_output_prefix\n\n';
        printf 'Arguments:\n';
        printf '  bam_file:          BAM file to subset per provided cell barcode.\n';
        printf '  barcodes_file:     File with cell barcodes to subset input BAM file.\n';
        printf '  bam_output_prefix: Prefix used for output per CB BAM files.\n';
        return 1;
    fi

    if [ "${bam_file##*.}" != "bam" ] ; then
        printf 'Error: BAM file should have ".bam" extension.\n';
        return 1;
    fi

    if [ ! -f "${bam_file}" ] ; then
        printf 'Error: BAM file "%s" does not exist.\n' "${bam_file}";
        return 1;
    fi

    if [ ! -f "${barcodes_file}" ] ; then
        printf 'Error: Barcodes file "%s" does not exist.\n' "${barcodes_file}";
        return 1;
    fi

    if [ ! -d $(dirname "${bam_output_prefix}") ] ; then
        printf 'Error: Directory "%s" does not exist.\n' "$(dirname "${bam_output_prefix}")";
        return 1;
    fi

    # First extract all requested cell barcodes from BAM file
    # before writing individual per CB BAM files.
    samtools view -@ 4 -h -D CB:"${barcodes_file}" "${bam_file}" \
      | mawk \
        -v bam_output_prefix="${bam_output_prefix}" \
        -F '\t' \
        '
        BEGIN {
            header = "";
            CB_tag_found = 0;
        }
        {
            if ( $0 ~ /^@/ ) {
                # Collect SAM header.
                if ( header == "" ) {
                    header = $0;
                } else {
                    header = header "\n" $0;
                }
            } else {
                CB_tag_found = 0;

                # Search for CB tag.
                for ( i = 12; i <= NF; i++ ) {
                    if ( substr($i, 1, 5) == "CB:Z:" ) {
                        CB = substr($i, 6);
                        CB_tag_found = 1;
                        break;
                    }
                }

                if ( CB_tag_found == 1 ) {
                    if (! (CB in CBs_found) ) {
                        # Write BAM header if this is the first time this CB was seen.
                        print header | "samtools view -O bam -o " bam_output_prefix "." CB ".bam -";
                        CBs_found[CB] = 1;
                    }

                    # Write read to CB specific BAM file.
                    print $0 | "samtools view -O bam -o " bam_output_prefix "." CB ".bam -";
                }
            }
        }
        END {
            # Close all file handles.
            for ( CB in CBs_found ) {
                close("samtools view -O bam -o " bam_output_prefix "." CB ".bam -");
            }
        }
        '
}

subset_bam_file_per_cb () {
    local bam_file="${1}";
    local barcodes_file="${2}";
    local bam_output_prefix="${3%.}";

    # Number of barcodes to process in one invocation of subset_bam_file_per_cb_chunk.
    # This is used to limit the number of open file handles and the number of parallel
    # spawned samtools processes.
    local chunk_size="${4:-1000}";

    if [ ${#@} -lt 3 ] ; then
        printf 'Usage:\n';
        printf '  subset_bam_file_per_cb bam_file barcodes_file bam_output_prefix [chunk_size]\n\n';
        printf 'Arguments:\n';
        printf '  bam_file:          BAM file to subset per provided cell barcode.\n';
        printf '  barcodes_file:     File with cell barcodes to subset input BAM file.\n';
        printf '  bam_output_prefix: Prefix used for output per CB BAM files.\n';
        printf '  chunk_size:        Number of cell barcodes to process simultaniously.\n'
        printf '                     If more cell barcodes are provided, the input BAM file\n';
        printf '                     will be read multiple times. It is recommended to not\n';
        printf '                     set this value too high as the same number of parallel\n';
        printf '                     spawned samtools processes will be created for writing\n';
        printf '                     the per CB BAM files.\n';
        printf '                     Default: 1000.\n';
        return 1;
    fi

    if [ "${bam_file##*.}" != "bam" ] ; then
        printf 'Error: BAM file should have ".bam" extension.\n';
        return 1;
    fi

    if [ ! -f "${bam_file}" ] ; then
        printf 'Error: BAM file "%s" does not exist.\n' "${bam_file}";
        return 1;
    fi

    if [ ! -f "${barcodes_file}" ] ; then
        printf 'Error: Barcodes file "%s" does not exist.\n' "${barcodes_file}";
        return 1;
    fi

    if [ ! -d $(dirname "${bam_output_prefix}") ] ; then
        printf 'Error: Directory "%s" does not exist.\n' "$(dirname "${bam_output_prefix}")";
        return 1;
    fi

    if [ $(cat ${barcodes_file} | wc -l) -le ${chunk_size} ] ; then
        # If barcodes file contains less than ${chunk_size} lines, process all barcodes at once.
        subset_bam_file_per_cb_chunk "${bam_file}" "${barcodes_file}" "${bam_output_prefix}";
        return $?;
    else
        # Split barcodes file in chunks of ${chunk_size} lines.
        local subset_bam_file_per_cb_tmp_dir=$(mktemp -t -d subset_bam_file_per_cb_XXXXXX);
        split -l ${chunk_size} -d "${barcodes_file}" ${subset_bam_file_per_cb_tmp_dir}/barcodes_part

        # Subset BAM file in per chunk of barcodes.
        for barcodes_part_file in ${subset_bam_file_per_cb_tmp_dir}/barcodes_part* ; do
            subset_bam_file_per_cb_chunk "${bam_file}" "${barcodes_part_file}" "${bam_output_prefix}";
        done

        # Remove temporary files.
        rm ${subset_bam_file_per_cb_tmp_dir}/barcodes_part*;
        rmdir "${subset_bam_file_per_cb_tmp_dir}";
    fi
}

subset_bam_file_per_cb "${@}";

conda create -n subset-bam
mamba install -c bioconda -c conda-forge mawk samtools

./subset_bam_per_cb.sh /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/possorted_genome_bam.bam /groups/phyllodes/home/share/Results/scRNA/cellranger_count/FETB01-MPT-01/outs/filtered_feature_bc_matrix/barcodes.tsv /groups/phyllodes/home/share/Results/scRNA/scallele/FETB01-MPT-01

# 构建批量运行脚本：
for d in /groups/phyllodes/home/share/Results/scRNA/cellranger_count/*/; do s=$(basename "$d"); echo "mkdir $s && ./subset_bam_per_cb.sh $d/outs/possorted_genome_bam.bam $d/outs/filtered_feature_bc_matrix/barcodes.tsv /groups/phyllodes/home/share/Results/scRNA/scallele/$s/$s"; done > SubSetBam.sh

3. 对每个bam文件运行scAllele：

此处注意 -o 的 prefix 不能为bam文件的名字，否则会删除bam文件（bug?）

for f in *.bam; do base=${f%.bam}; echo "samtools index $f && scAllele -n 4 -b $f -g /groups/phyllodes/home/share/Reference/GRCh38_Ensembl_Ensembl104/fasta/genome.fa -o $base-scallele && echo $base ok"; done > run_scAllele.sh

4. 对每个样本文件夹下的vcf文件筛选PASS突变并添加header

#!/bin/bash

# Headers to add (with placeholder for SAMPLE)
HEADER="##fileformat=VCFv4.2
##FILTER=<ID=PASS,Description=\"Variant passes all filters\">
##FILTER=<ID=LowQual,Description=\"Variant does not pass one or more filtering criteria\">
##INFO=<ID=varType,Number=1,Type=String,Description=\"Variant type\">
##INFO=<ID=varGroup,Number=1,Type=String,Description=\"Variant Group\">
##INFO=<ID=varLength,Number=1,Type=Integer,Description=\"Variant length\">
##INFO=<ID=varEnd,Number=1,Type=Integer,Description=\"End position of the variant\">
##INFO=<ID=TandemRep,Number=1,Type=Integer,Description=\"Count of repeat sequence adjacent to the variant\">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of reads overlapping the variant position\">
##FORMAT=<ID=AC,Number=R,Type=Integer,Description=\"Number of reads containing the variant allele\">
##FORMAT=<ID=RC,Number=R,Type=Integer,Description=\"Number of reads containing the reference allele\">
##FORMAT=<ID=AB,Number=1,Type=Float,Description=\"Allelic balance of the variant\">
##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Consensus genotype\">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype quality\">
##FORMAT=<ID=RCL,Number=1,Type=Integer,Description=\"Read Cluster id\">
#CHROM  POS ID  REF ALT QUAL    FILTER  INFO    FORMAT  SAMPLE"

# Loop through all VCF files
for file in *scallele.vcf; do
    echo "Processing: $file"

    # 1. 提取cell barcode
    cell_barcode=$(echo "$file" | sed -r 's/^.*\.([A-Z0-9\-]+\-[0-9]+)\-scallele\.vcf$/\1/')
    echo "Cell Barcode: $cell_barcode"

    # 2. 创建新文件名（添加 .pass 后缀）
    new_file="${file%.vcf}.pass.vcf"

    # 3. 替换header中的SAMPLE
    header_with_cell_barcode=$(echo "$HEADER" | sed "s/SAMPLE/$cell_barcode/")

    # 4. 创建新文件（包含新header和筛选的PASS行）
    # 先写入新header
    echo "$header_with_cell_barcode" > "$new_file"

    # 追加筛选的PASS行（保留原header会干扰，所以跳过所有注释行）
    awk '!/^#/ && $7 == "PASS"' "$file" >> "$new_file"

    echo "Created new file: $new_file (contains only PASS variants)"
done

5. 对过滤后的vcf进行Annovar+vcf2maf

# annovar
ls *.pass.vcf | perl -ne 'chomp; my $name = $1 if ($_ =~ /([^\/]+)\.pass\.vcf/); print "~/software/annovar/table_annovar.pl $name.pass.vcf ~/database/Annovar/humandb_hg38 -buildver hg38 -out $name.pass -remove -protocol refGene,cytoBand,avsnp151,exac03,gnomad41_exome,EAS.sites.2015_08,cosmic102_unique -operation g,r,f,f,f,f,f -nastring . -vcfinput \n"'>annovar.sh

# vcf2maf

一	二	三	四	五	六	日
						1
2	3	4	5	6	7	8
9	10	11	12	13	14	15
16	17	18	19	20	21	22
23	24	25	26	27	28	29
30

1. 下载并安装scAllele

2. 按照Barcode分割比对后单细胞Bam文件(Cellranger out)

2.1 方法一

2.2 方法二

2.3 方法三

2.4 方法四 (这个才能用)

3. 对每个bam文件运行scAllele：

4. 对每个样本文件夹下的vcf文件筛选PASS突变并添加header

5. 对过滤后的vcf进行Annovar+vcf2maf

6. 对同一个样本的不同细胞的maf文件进行合并