从Cellranger Count开始的velocyto、scvelo速率分析

1. 首先使用 cellranger count 处理原始fastq文件

# 注意所有fastq文件必须为SampleName_S1_L001_R1_001.fastq.gz的格式
cellranger count --create-bam true --id SAMPLE --transcriptome=/path/to/cellranger/ref/folder/refdata-gex-GRCh38-2020-A --fastqs=/path/to/fastq/folder/SAMPLE --sample=SAMPLE

# 运行后得到SAMPLE文件夹

构建批量运行CellRanger脚本: CreateCellRangerScript.sh

#!/bin/bash

# 设置变量
BASE_DIR="/data02/zhangmengmeng/BPT/scRNA/RawData/GEX"
CR_Ref="/data02/zhangmengmeng/database/cellranger/refdata-gex-GRCh38-2024-A"
OUTPUT_SCRIPT="RunCellRanger.sh"

# 清空输出文件
> "$OUTPUT_SCRIPT"

# 遍历每个子目录并生成命令
for SAMPLE_DIR in "$BASE_DIR"/*/; do
    SAMPLE=$(basename "$SAMPLE_DIR")
    echo "cellranger count --create-bam true --id $SAMPLE --transcriptome $CR_Ref --fastqs $BASE_DIR/$SAMPLE --sample $SAMPLE" >> "$OUTPUT_SCRIPT"
done

# 赋予输出脚本可执行权限
chmod +x "$OUTPUT_SCRIPT"

2. 使用 conda 安装 velocyto 环境

conda create -n velocyto # python版本应该大于3.6
conda activate velocyto

# 安装依赖包
conda install numpy scipy cython numba matplotlib scikit-learn h5py click

# 安装软件
pip install pysam
pip install velocyto

# 尝试运行velocyto
velocyto --help

3. 运行velocyto

注意此处有bug，cellranger的文件夹下，必须有outs/filtered_feature_bc_matrix/ 文件夹，且filtered_feature_bc_matrix里的barcodes.tsv必须同时有barcodes.tsv.gz，否则velocyto无法识别barcodes.tsv，会报"ERROR - Can not locate the barcodes.tsv file! for Cellranger Multiplexing outputs"

从scAllele分开的bam文件重新提取barcode：
for f in FETB*.bam; do basename "$f" .bam | awk -F'.' '{print $2}'; done > barcode.tsv

velocyto run10x -v ./SAMPLE /path/to/cellranger/ref/folder/refdata-gex-GRCh38-2020-A/genes/genes.gtf

# 待程序运行完即可在SAMPLE文件夹下的velocyto文件夹中找到SAMPLE.loom

# 可选：从UCSC下载表达重复注释文件（expressed repeats annotation）
https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=611454127_NtvlaW6xBSIRYJEBI0iRDEWisITa&clade=mammal&org=Human&db=0&hgta_group=allTracks&hgta_track=rmsk&hgta_table=rmsk&hgta_regionType=genome&position=&hgta_outputType=gff&hgta_outFileName=UCSC_hg38_rmsk.gtf

# 运行流程添加-m
velocyto run10x -v -m UCSC_hg38_rmsk.gtf ./SAMPLE /path/to/cellranger/ref/folder/refdata-gex-GRCh38-2020-A/genes/genes.gtf

构建批量运行velocyto脚本: CreateVelocytoScript.sh

#!/bin/bash

# 设置变量
BASE_DIR="/data02/zhangmengmeng/BPT/scRNA/RawData/GEX"
MASK_GTF="/data02/zhangmengmeng/database/cellranger/UCSC_hg38_rmsk.gtf"
GENES_GTF="/data02/zhangmengmeng/database/cellranger/genes.gtf"
OUTPUT_SCRIPT="RunVelocyto.sh"

# 清空输出文件
> "$OUTPUT_SCRIPT"

# 遍历每个子目录并生成命令
for SAMPLE_DIR in "$BASE_DIR"/*/; do
    SAMPLE=$(basename "$SAMPLE_DIR")
    echo "velocyto run10x -v -m $MASK_GTF $BASE_DIR/$SAMPLE $GENES_GTF >> "$OUTPUT_SCRIPT"
done

# 赋予输出脚本可执行权限
chmod +x "$OUTPUT_SCRIPT"

4. 将loom文件导入R中进行后续分析

devtools::install_github("rrydbirk/velocyto.R")
remotes::install_github('satijalab/seurat-wrappers')

library(Seurat)
library(velocyto.R)
library(SeuratDisk)
library(SeuratWrappers)

# 读取速率文件并转换为seurat
IBC01_T <- ReadVelocity("/home/zhoukaiwen/IBC/scRNA/cellranger_count/velocyto/loom/IBC01-T.loom")
IBC01_T <- as.Seurat(IBC01_T)

# 单细胞数据常规处理
IBC01_T@meta.data$orig.ident <- "IBC01_T"
IBC01_T[["RNA"]] <- IBC01_T[["spliced"]]
IBC01_T <- NormalizeData(IBC01_T, normalization.method = "LogNormalize", scale.factor = 10000)
IBC01_T <- FindVariableFeatures(IBC01_T, selection.method = "vst", nfeatures = 2000)
IBC01_T <- ScaleData(IBC01_T, features = rownames(IBC01_T))
IBC01_T <- RunPCA(IBC01_T, features = VariableFeatures(object = IBC01_T), nps=50, verbose=TRUE)
IBC01_T <- FindNeighbors(IBC01_T,reduction = "pca",dims = 1:20)
IBC01_T <- FindClusters(IBC01_T, resolution = 0.1)
IBC01_T <- RunUMAP(IBC01_T,dims = 1:20,reduction = "pca")

# 单独提取spliced和unspliced文件
emat <- IBC01_T$spliced
nmat <- IBC01_T$unspliced

# 此处还需导入注释好的文件，用来提取umap embeddings
IBC01-T_annotated <- readRDS("/path/to/annotated/rds/IBC01-T_annotated.rds")
emb <- IBC01-T_annotated@reductions$umap@cell.embeddings

# Estimate the cell-cell distances 
cell.dist <- as.dist(1-armaCor(t(IBC01-T_annotated@reductions$umap@cell.embeddings)))

colnames(emat) <- paste(substring(colnames(emat),6,21),"_1",sep="")
colnames(nmat) <- paste(substring(colnames(nmat),6,21),"_1",sep="")

DefaultAssay(IBC01_T) <- "RNA"

# 保存为h5Seurat和h5ad
SaveH5Seurat(IBC01_T,filename = "/home/zhoukaiwen/IBC/scRNA/cellranger_count/velocyto/loom/IBC01_T.h5Seurat")
Convert("/home/zhoukaiwen/IBC/scRNA/cellranger_count/velocyto/loom/IBC01_T.h5Seurat",dest = "h5ad")

# velocyto.R进行速率分析及可视化
IBC01_T <- RunVelocity(object = IBC01_T, deltaT = 1, kCells = 25, fit.quantile = 0.02)
saveRDS(IBC01_T,"/home/zhoukaiwen/IBC/scRNA/cellranger_count/velocyto/loom/IBC01_T_after_velocyto.rds")

p<-DimPlot(IBC01_T,reduction = "umap", seed = 3, label = T, group.by = c("orig.ident"))

IBC01_T <- RunVelocity(object = IBC01_T, deltaT = 1, kCells = 25, fit.quantile = 0.02)
ident.colors <- (scales::hue_pal())(n = length(x = levels(x = IBC01_T)))
names(x = ident.colors) <- levels(x = IBC01_T)
cell.colors <- ident.colors[Idents(object = IBC01_T)]
names(x = cell.colors) <- colnames(x = IBC01_T)
p<-show.velocity.on.embedding.cor(emb = Embeddings(object = IBC01_T, reduction = "umap"), vel = Tool(object = IBC1234,  slot = "RunVelocity"), n = 200, scale = "sqrt", cell.colors = ac(x = cell.colors, alpha = 0.5), cex = 0.8, arrow.scale = 3, show.grid.flow = TRUE, min.grid.cell.mass = 0.5, grid.n = 40, arrow.lwd = 1, do.par = FALSE, cell.border.alpha = 0.1)

ggsave("/home/zhoukaiwen/IBC/scRNA/cellranger_count/velocyto/loom/IBC1234TS_AllCells_velocyto.png",p)

一	二	三	四	五	六	日
				1	2	3
4	5	6	7	8	9	10
11	12	13	14	15	16	17
18	19	20	21	22	23	24
25	26	27	28	29	30