1. 首先使用 cellranger count 处理原始fastq文件
# 注意所有fastq文件必须为SampleName_S1_L001_R1_001.fastq.gz的格式
cellranger count --create-bam true --id SAMPLE --transcriptome=/path/to/cellranger/ref/folder/refdata-gex-GRCh38-2020-A --fastqs=/path/to/fastq/folder/SAMPLE --sample=SAMPLE
# 运行后得到SAMPLE文件夹
构建批量运行CellRanger脚本: CreateCellRangerScript.sh
#!/bin/bash
# 设置变量
BASE_DIR="/data02/zhangmengmeng/BPT/scRNA/RawData/GEX"
CR_Ref="/data02/zhangmengmeng/database/cellranger/refdata-gex-GRCh38-2024-A"
OUTPUT_SCRIPT="RunCellRanger.sh"
# 清空输出文件
> "$OUTPUT_SCRIPT"
# 遍历每个子目录并生成命令
for SAMPLE_DIR in "$BASE_DIR"/*/; do
SAMPLE=$(basename "$SAMPLE_DIR")
echo "cellranger count --create-bam true --id $SAMPLE --transcriptome $CR_Ref --fastqs $BASE_DIR/$SAMPLE --sample $SAMPLE" >> "$OUTPUT_SCRIPT"
done
# 赋予输出脚本可执行权限
chmod +x "$OUTPUT_SCRIPT"
2. 使用 conda 安装 velocyto 环境
conda create -n velocyto # python版本应该大于3.6
conda activate velocyto
# 安装依赖包
conda install numpy scipy cython numba matplotlib scikit-learn h5py click
# 安装软件
pip install pysam
pip install velocyto
# 尝试运行velocyto
velocyto --help
3. 运行velocyto
注意此处有bug,cellranger的文件夹下,必须有outs/filtered_feature_bc_matrix/ 文件夹,且filtered_feature_bc_matrix里的barcodes.tsv必须同时有barcodes.tsv.gz,否则velocyto无法识别barcodes.tsv,会报"ERROR - Can not locate the barcodes.tsv file! for Cellranger Multiplexing outputs"
从scAllele分开的bam文件重新提取barcode:
for f in FETB*.bam; do basename "$f" .bam | awk -F'.' '{print $2}'; done > barcode.tsv
velocyto run10x -v ./SAMPLE /path/to/cellranger/ref/folder/refdata-gex-GRCh38-2020-A/genes/genes.gtf
# 待程序运行完即可在SAMPLE文件夹下的velocyto文件夹中找到SAMPLE.loom
# 可选:从UCSC下载表达重复注释文件(expressed repeats annotation)
https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=611454127_NtvlaW6xBSIRYJEBI0iRDEWisITa&clade=mammal&org=Human&db=0&hgta_group=allTracks&hgta_track=rmsk&hgta_table=rmsk&hgta_regionType=genome&position=&hgta_outputType=gff&hgta_outFileName=UCSC_hg38_rmsk.gtf
# 运行流程添加-m
velocyto run10x -v -m UCSC_hg38_rmsk.gtf ./SAMPLE /path/to/cellranger/ref/folder/refdata-gex-GRCh38-2020-A/genes/genes.gtf
构建批量运行velocyto脚本: CreateVelocytoScript.sh
#!/bin/bash
# 设置变量
BASE_DIR="/data02/zhangmengmeng/BPT/scRNA/RawData/GEX"
MASK_GTF="/data02/zhangmengmeng/database/cellranger/UCSC_hg38_rmsk.gtf"
GENES_GTF="/data02/zhangmengmeng/database/cellranger/genes.gtf"
OUTPUT_SCRIPT="RunVelocyto.sh"
# 清空输出文件
> "$OUTPUT_SCRIPT"
# 遍历每个子目录并生成命令
for SAMPLE_DIR in "$BASE_DIR"/*/; do
SAMPLE=$(basename "$SAMPLE_DIR")
echo "velocyto run10x -v -m $MASK_GTF $BASE_DIR/$SAMPLE $GENES_GTF >> "$OUTPUT_SCRIPT"
done
# 赋予输出脚本可执行权限
chmod +x "$OUTPUT_SCRIPT"
4. 将loom文件导入R中进行后续分析
devtools::install_github("rrydbirk/velocyto.R")
remotes::install_github('satijalab/seurat-wrappers')
library(Seurat)
library(velocyto.R)
library(SeuratDisk)
library(SeuratWrappers)
# 读取速率文件并转换为seurat
IBC01_T <- ReadVelocity("/home/zhoukaiwen/IBC/scRNA/cellranger_count/velocyto/loom/IBC01-T.loom")
IBC01_T <- as.Seurat(IBC01_T)
# 单细胞数据常规处理
IBC01_T@meta.data$orig.ident <- "IBC01_T"
IBC01_T[["RNA"]] <- IBC01_T[["spliced"]]
IBC01_T <- NormalizeData(IBC01_T, normalization.method = "LogNormalize", scale.factor = 10000)
IBC01_T <- FindVariableFeatures(IBC01_T, selection.method = "vst", nfeatures = 2000)
IBC01_T <- ScaleData(IBC01_T, features = rownames(IBC01_T))
IBC01_T <- RunPCA(IBC01_T, features = VariableFeatures(object = IBC01_T), nps=50, verbose=TRUE)
IBC01_T <- FindNeighbors(IBC01_T,reduction = "pca",dims = 1:20)
IBC01_T <- FindClusters(IBC01_T, resolution = 0.1)
IBC01_T <- RunUMAP(IBC01_T,dims = 1:20,reduction = "pca")
# 单独提取spliced和unspliced文件
emat <- IBC01_T$spliced
nmat <- IBC01_T$unspliced
# 此处还需导入注释好的文件,用来提取umap embeddings
IBC01-T_annotated <- readRDS("/path/to/annotated/rds/IBC01-T_annotated.rds")
emb <- IBC01-T_annotated@reductions$umap@cell.embeddings
# Estimate the cell-cell distances
cell.dist <- as.dist(1-armaCor(t(IBC01-T_annotated@reductions$umap@cell.embeddings)))
colnames(emat) <- paste(substring(colnames(emat),6,21),"_1",sep="")
colnames(nmat) <- paste(substring(colnames(nmat),6,21),"_1",sep="")
DefaultAssay(IBC01_T) <- "RNA"
# 保存为h5Seurat和h5ad
SaveH5Seurat(IBC01_T,filename = "/home/zhoukaiwen/IBC/scRNA/cellranger_count/velocyto/loom/IBC01_T.h5Seurat")
Convert("/home/zhoukaiwen/IBC/scRNA/cellranger_count/velocyto/loom/IBC01_T.h5Seurat",dest = "h5ad")
# velocyto.R进行速率分析及可视化
IBC01_T <- RunVelocity(object = IBC01_T, deltaT = 1, kCells = 25, fit.quantile = 0.02)
saveRDS(IBC01_T,"/home/zhoukaiwen/IBC/scRNA/cellranger_count/velocyto/loom/IBC01_T_after_velocyto.rds")
p<-DimPlot(IBC01_T,reduction = "umap", seed = 3, label = T, group.by = c("orig.ident"))
IBC01_T <- RunVelocity(object = IBC01_T, deltaT = 1, kCells = 25, fit.quantile = 0.02)
ident.colors <- (scales::hue_pal())(n = length(x = levels(x = IBC01_T)))
names(x = ident.colors) <- levels(x = IBC01_T)
cell.colors <- ident.colors[Idents(object = IBC01_T)]
names(x = cell.colors) <- colnames(x = IBC01_T)
p<-show.velocity.on.embedding.cor(emb = Embeddings(object = IBC01_T, reduction = "umap"), vel = Tool(object = IBC1234, slot = "RunVelocity"), n = 200, scale = "sqrt", cell.colors = ac(x = cell.colors, alpha = 0.5), cex = 0.8, arrow.scale = 3, show.grid.flow = TRUE, min.grid.cell.mass = 0.5, grid.n = 40, arrow.lwd = 1, do.par = FALSE, cell.border.alpha = 0.1)
ggsave("/home/zhoukaiwen/IBC/scRNA/cellranger_count/velocyto/loom/IBC1234TS_AllCells_velocyto.png",p)