内容链接
构建 SBATCH shell 脚本的开头
#!/bin/bash
# SLURM job header for submitting pipeline steps with sbatch.
# NOTE(review): "#SBATCH -J" is missing the job-name argument — sbatch will
# complain or silently use a default; supply one, e.g. "#SBATCH -J wes_pipeline".
#SBATCH -J
# Pin the job to node7; 16 tasks per node.
#SBATCH -w node7
#SBATCH --ntasks-per-node=16
Environment Settings 环境设置
# Put the locally installed R 4.0.3 binaries and shared libraries first on the
# search paths so this R version is picked up by downstream tools (FACETS etc.).
export PATH=/home/software/R-4.0.3/bin:$PATH
export LD_LIBRARY_PATH=/home/software/R-4.0.3/lib64:$LD_LIBRARY_PATH
scRNA-seq 单细胞分析流程
1. 激活conda环境(可以不用 conda 跑)
conda activate cellranger
2. 数据解压(bam2fastq)
perl 脚本创建sh文件用于后续运行
# Build one "bamtofastq <sample>.bam <sample>" command per non-empty line of
# sample_pair.txt (sample id = first dot-separated field).
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\./;print "bamtofastq $a[0]\.bam $a[0] && echo $a[0] ok\n"' sample_pair.txt >bamtofastq.sh
# FIX(review): the generated script is not executable by default; without this
# chmod the "./bamtofastq.sh" below fails with "Permission denied".
chmod u+x bamtofastq.sh
nohup ./bamtofastq.sh &
3. cellranger count(生成文件夹包括outs文件夹,内含用于R的Seurat的features, barcode, matrix)
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\./;print "cellranger count --id=$a[0] --transcriptome=/home/zhoukaiwen/database/cellranger/refdata-gex-GRCh38-2020-A --fastqs=/home/zhoukaiwen/IBC/scRNA/rawdata/$a[0] --sample=$a[0] && echo $a[0] ok\n"' sample_pair.txt >cellranger_count.sh
4. cellranger vdj(TCR、BCR-seq,生成表格,用于在Seurat中导入到metadata)
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\./;print "cellranger vdj --id=$a[0] --reference=/home/zhoukaiwen/database/cellranger/refdata-cellranger-vdj-GRCh38-alts-ensembl-5.0.0 --fastqs=/home/zhoukaiwen/IBC/scTCR/rawdata/$a[0] --sample=$a[0] --localcores=8 --localmem=64 && echo $a[0] ok"' sample_pair.txt >cellranger_vdj.sh
5. cellranger aggr 将所有样本合并为一份,便于排除跨样本的重复克隆
5.1 创建 aggr.csv 文件, 格式如下
sample_id,vdj_contig_info,donor,origin
IBC01-S,/home/zhoukaiwen/IBC/scTCR/cellranger_vdj/IBC01-S/outs/vdj_contig_info.pb,IBC01,01Skin
IBC01-T,/home/zhoukaiwen/IBC/scTCR/cellranger_vdj/IBC01-T/outs/vdj_contig_info.pb,IBC01,01Tumor
IBC02-S,/home/zhoukaiwen/IBC/scTCR/cellranger_vdj/IBC02-S/outs/vdj_contig_info.pb,IBC02,02Skin
IBC02-T,/home/zhoukaiwen/IBC/scTCR/cellranger_vdj/IBC02-T/outs/vdj_contig_info.pb,IBC02,02Tumor
IBC03-S,/home/zhoukaiwen/IBC/scTCR/cellranger_vdj/IBC03-S/outs/vdj_contig_info.pb,IBC03,03Skin
IBC03-T,/home/zhoukaiwen/IBC/scTCR/cellranger_vdj/IBC03-T/outs/vdj_contig_info.pb,IBC03,03Tumor
IBC04-S,/home/zhoukaiwen/IBC/scTCR/cellranger_vdj/IBC04-S/outs/vdj_contig_info.pb,IBC04,04Skin
IBC04-T,/home/zhoukaiwen/IBC/scTCR/cellranger_vdj/IBC04-T/outs/vdj_contig_info.pb,IBC04,04Tumor
IBC05-S,/home/zhoukaiwen/IBC/scTCR/cellranger_vdj/IBC05-S/outs/vdj_contig_info.pb,IBC05,05Skin
IBC05-T,/home/zhoukaiwen/IBC/scTCR/cellranger_vdj/IBC05-T/outs/vdj_contig_info.pb,IBC05,05Tumor
5.2 运行 cellranger aggr
cellranger aggr --id=IBC1234TS_TCR --csv=aggr.csv
三个可用于Seurat的结果文件在 IBC1234TS_TCR/outs/vdj_t 里
Exome-seq 外显子测序分析流程
1. 建立文件夹
mkdir {rawdata,align,somatic,facets,msi,polysolver,lohhla,netmhcpan,sequenza}
2. 数据解压
cd rawdata
#!/bin/bash
# Convert every downloaded SRA run (*.1) in the GEO series directory to FASTQ.
# Glob directly instead of parsing `ls`, and quote "$file", so paths containing
# spaces or glob characters cannot break the loop.
for file in /Bulk/Long/GSE_data/GSE50760/*.1; do
  fasterq-dump --split-3 "$file" -O /Bulk/metastasis/GSE50760/rawdata
done
3. 数据质控
fastqc+multiqc查看测序数据质量
cd /Bulk/metastasis/SRP034161/rawdata
fastqc *fastq.gz
multiqc *.zip
trimmomatic单端测序去接头
ls ../rawdata/*.fq.gz|perl -ne 'chomp;my $name=$1 if($_=~/\/([^\/]+)\.fq.gz/);print "trimmomatic SE -phred33 -threads 4 $name\.fq.gz $name\.trimmed.fq.gz ILLUMINACLIP:TruSeq3-SE:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36\n"' >trimmomatic.sh
trimmomatic双端测序去接头
ls ../rawdata/*_R1.fastq.gz|perl -ne 'chomp;my $name=$1 if($_=~/\/([^\/]+)\_R1/);print "trimmomatic PE -phred33 $name\_R1.fq.gz $name\_R2.fq.gz $name\_R1_paired.fq.gz $name\_R1_unpaired.fq.gz $name\_R2_paired.fq.gz $name\_R2_unpaired.fq.gz ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 \n"' >trimmomatic.sh
4. 序列比对 & 标记PCR重复
联川测序
cd ../align
ls ../rawdata/*_R1.fastq.gz|perl -ne 'chomp;my $name=$1 if($_=~/\/([^\/]+)\_R1/);print "/home/zhoukaiwen/software/bwa-0.7.17/bwa mem -t 10 -R \"\@RG\\tID:$name\\tSM:$name\\tLB:WES\\tPL:Illumina\" /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa ../rawdata/$name\_R1.fastq.gz ../rawdata/$name\_R2.fastq.gz | samtools view -bS - >$name.bam && samtools sort -@ 5 -o $name.sort.bam $name.bam && /home/zhoukaiwen/software/gatk-4.1.2.0/gatk --java-options \"-Xmx20G -Djava.io.tmpdir=./\" MarkDuplicates --INPUT $name.sort.bam --OUTPUT $name.sort.markdup.bam -M $name.metrics --VALIDATION_STRINGENCY=LENIENT && samtools index $name.sort.markdup.bam &&echo $name ok\n"' >aln.sh
ls ../rawdata/*_val_1.fq.gz|perl -ne 'chomp;my $name=$1 if($_=~/\/([^\/]+)\_val\_1/);print "/home/zhoukaiwen/software/bwa-0.7.17/bwa mem -t 10 -R \"\@RG\\tID:$name\\tSM:$name\\tLB:WES\\tPL:Illumina\" /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa ../rawdata/$name\_val\_1.fq.gz ../rawdata/$name\_val\_2.fq.gz | samtools view -bS - >$name.bam && samtools sort -@ 5 -o $name.sort.bam $name.bam && /home/zhoukaiwen/software/gatk-4.1.2.0/gatk --java-options \"-Xmx20G -Djava.io.tmpdir=./\" MarkDuplicates --INPUT $name.sort.bam --OUTPUT $name.sort.markdup.bam -M $name.metrics --VALIDATION_STRINGENCY=LENIENT && samtools index $name.sort.markdup.bam &&echo $name ok\n"' >aln_val.sh
chmod u+x aln.sh
nohup ./aln.sh>aln.log 2>&1 &
明码生物测序
# Build per-sample bwa-mem → sort → MarkDuplicates → index commands for
# trim_galore paired output (*_val_1 / *_val_2).
# FIX(review): the original passed $name_val_1.fq.gz TWICE to bwa mem, aligning
# read 1 against itself; the second mate should be $name_val_2.fq.gz — confirm
# the _val_2 files exist for these samples.
ls ../rawdata/*_val_1.fq.gz|perl -ne 'chomp;my $name=$1 if($_=~/\/([^\/]+)\_val/);print "/home/zhoukaiwen/software/bwa-0.7.17/bwa mem -t 10 -R \"\@RG\\tID:$name\\tSM:$name\\tLB:WES\\tPL:Illumina\" /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa ../rawdata/$name\_val\_1.fq.gz ../rawdata/$name\_val\_2.fq.gz | samtools view -bS - >$name.bam && samtools sort -@ 5 -o $name.sort.bam $name.bam && /home/zhoukaiwen/software/gatk-4.1.2.0/gatk --java-options \"-Xmx20G -Djava.io.tmpdir=./\" MarkDuplicates --INPUT $name.sort.bam --OUTPUT $name.sort.markdup.bam -M $name.metrics --VALIDATION_STRINGENCY=LENIENT && samtools index $name.sort.markdup.bam &&echo $name ok\n"' >aln.sh
chmod u+x aln.sh
nohup ./aln.sh>aln.log 2>&1 &
5. 查看比对结果
cd ../align
ls *.sort.markdup.bam|while read id;do samtools flagstat $id;done
6. 碱基矫正 BaseRecalibrator + ApplyBQSR
碱基矫正(对base的quality score进行校正)碱基质量分数重校准(Base quality score recalibration,BQSR),就是利用机器学习的方式调整原始碱基的质量分数。它分为两个步骤:1、利用已有的snp数据库,建立相关性模型,产生重校准表( recalibration table)。2、根据这个模型对原始碱基进行调整,只会调整非已知SNP区域
cd ../align
gunzip dbsnp_138.hg19.vcf.gz
bgzip dbsnp_138.hg19.vcf
tabix -p vcf dbsnp_138.hg19.vcf.gz
gunzip Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz
bgzip Mills_and_1000G_gold_standard.indels.hg19.sites.vcf
tabix -p vcf Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz
ref=/home/zhoukaiwen/database/GRCh37_hg19/hg19.fa
snp=/home/zhoukaiwen/database/gatk_bundle/hg19/dbsnp_138.hg19.vcf.gz
indel=/home/zhoukaiwen/database/gatk_bundle/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz
ls *.sort.markdup.bam|while read sample
do nohup /home/zhoukaiwen/software/gatk-4.1.2.0/gatk \
--java-options "-Xmx20G -Djava.io.tmpdir=./" \
BaseRecalibrator \
-R $ref \
-I $sample \
--known-sites $snp \
--known-sites $indel \
-O ${sample%%.*}_recal.table \
1>${sample%%.*}_log.recal 2>&1 &
done
#BWA比对时设置@RG的重要性在这一步体现出来了,算法通过@RG中的ID来识别各个独立的测序过程
ls *.sort.markdup.bam|while read sample
do nohup /home/zhoukaiwen/software/gatk-4.1.2.0/gatk \
--java-options "-Xmx20G -Djava.io.tmpdir=./" ApplyBQSR \
-R $ref \
-I $sample \
-bqsr ${sample%%.*}_recal.table \
-O ${sample%%.*}_bqsr.bam \
1>${sample%%.*}_log.ApplyBQSR 2>&1 &
done
使用 perl 脚本构建 shell 脚本,按每个样本输出一份代码运行 BaseRecalibrator & ApplyBQSR
# 自己的数据用hg19.fa
ls *.sort.markdup.bam|perl -ne 'chomp;if($_=~/(\S+)\.sort/){print "/home/zhoukaiwen/software/gatk-4.1.2.0/gatk --java-options \"-Xmx20G -Djava.io.tmpdir=./\" BaseRecalibrator -R /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa -I $1.sort.markdup.bam --known-sites /home/zhoukaiwen/database/gatk_bundle/hg19/dbsnp_138.hg19.vcf.gz --known-sites /home/zhoukaiwen/database/gatk_bundle/hg19/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf.gz --O $1_recal.table && /home/zhoukaiwen/software/gatk-4.1.2.0/gatk --java-options \"-Xmx20G -Djava.io.tmpdir=./\" ApplyBQSR -R /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa -I $1.sort.markdup.bam -bqsr $1_recal.table -O $1_bqsr.bam && echo $1 ok\n"}' >BQSR.sh
# TCGA-BRCA数据用GRCh37-lite.fa
ls *.sort.markdup.bam|perl -ne 'chomp;if($_=~/(\S+)\.sort/){print "/home/zhoukaiwen/software/gatk-4.1.2.0/gatk --java-options \"-Xmx20G -Djava.io.tmpdir=./\" BaseRecalibrator -R /home/zhoukaiwen/database/GRCh37_hg19/GRCh37-lite.fa -I $1.sort.markdup.bam --known-sites /home/zhoukaiwen/database/gatk_bundle/hg19_without_chr/dbsnp_138.hg19.vcf --known-sites /home/zhoukaiwen/database/gatk_bundle/hg19_without_chr/Mills_and_1000G_gold_standard.indels.hg19.sites.vcf --known-sites /home/zhoukaiwen/database/gatk_bundle/hg19_without_chr/1000G_phase1.indels.hg19.sites.vcf --O $1_recal.table && /home/zhoukaiwen/software/gatk-4.1.2.0/gatk --java-options \"-Xmx20G -Djava.io.tmpdir=./\" ApplyBQSR -R /home/zhoukaiwen/database/GRCh37_hg19/GRCh37-lite.fa -I $1.sort.markdup.bam -bqsr $1_recal.table -O $1_bqsr.bam && echo $1 ok\n"}' >BQSR.sh
nohup ./BQSR.sh >bqsr.log 2>&1 &
查看比对结果
ls *bqsr.bam|while read id
do nohup samtools flagstat $id &
done
/home/zhoukaiwen/software/gatk-4.1.2.0/gatk --java-options "-Xmx20G" AnalyzeCovariates \
-before recal_data.table1 \
-after recal_data.table2 \
-plots AnalyzeCovariates.pdf
# Index every BQSR BAM in the background.
# FIX(review): the original redirected to $sample_bqsr_index.log, which expands
# the UNDEFINED variable "sample_bqsr_index" (empty) — braces are required.
# Also glob directly instead of parsing `ls`, and quote expansions.
for sample in *_bqsr.bam; do
  nohup samtools index "$sample" > "${sample}_bqsr_index.log" 2>&1 &
done
7. Mutect2 寻找somatic mutation
下载已知的germline mutation用于过滤
# Decompress the gnomAD germline resource, prefix contig names with "chr" to
# match hg19, then bgzip-compress and tabix-index for use with Mutect2.
gunzip -c af-only-gnomad.raw.sites.b37.vcf.gz >af-only-gnomad.raw.sites.b37.vcf
# NOTE(review): grep -v "#" drops any line CONTAINING '#', not just headers;
# grep -v '^#' would be strictly safer if data lines can ever contain '#'.
cat af-only-gnomad.raw.sites.b37.vcf|grep -v "#"|sed 's/^/chr&/g'>af-only-gnomad.raw.sites.b37.tmp.vcf && cat af-only-gnomad.raw.sites.b37.vcf|grep "#">header.vcf && cat header.vcf af-only-gnomad.raw.sites.b37.tmp.vcf >af-only-gnomad.raw.sites.b37.chr.vcf
# FIX: bgzip takes only the input file and writes <input>.gz itself; the
# original passed the output name as a second (nonexistent) input argument.
bgzip af-only-gnomad.raw.sites.b37.chr.vcf
tabix -p vcf af-only-gnomad.raw.sites.b37.chr.vcf.gz
nohup java -Xmx4G -jar /home/zhoukaiwen/software/gatk-4.1.2.0/gatk-package-4.1.2.0-local.jar SelectVariants -R /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa -V /home/zhoukaiwen/database/gatk_bundle/hg19/af-only-gnomad.raw.sites.b37.chr.vcf.gz --select-type-to-include SNP --restrict-alleles-to BIALLELIC -O /home/zhoukaiwen/database/gatk_bundle/hg19/af-only-gnomad.raw.sites.b37.chr.SNP_biallelic.vcf.gz &
使用perl脚本创建shell脚本用于运行mutect2
cd ../somatic
ref=/home/zhoukaiwen/database/GRCh37_hg19/hg19.fa
germine_vcf=/home/zhoukaiwen/database/gatk_bundle/hg19/af-only-gnomad.raw.sites.b37.chr.vcf.gz
germine_biallelic_vcf=/home/zhoukaiwen/database/gatk_bundle/hg19/af-only-gnomad.raw.sites.b37.chr.SNP_biallelic.vcf.gz
genomic_intervals=/home/zhoukaiwen/database/database/Agilent_V6.interval_list
create.sh:
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "java -Xmx4G -jar /home/zhoukaiwen/software/gatk-4.1.2.0/gatk-package-4.1.2.0-local.jar Mutect2 -R /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa -I ../align/$a[0]_bqsr.bam -I ../align/$a[1]_bqsr.bam -normal $a[0] -tumor $a[1] -L /home/zhoukaiwen/database/database/Agilent_V6.interval_list --germline-resource /home/zhoukaiwen/database/gatk_bundle/hg19/af-only-gnomad.raw.sites.b37.chr.vcf.gz -O $a[1].raw.vcf && java -Xmx4G -jar /home/zhoukaiwen/software/gatk-4.1.2.0/gatk-package-4.1.2.0-local.jar GetPileupSummaries -I ../align/$a[0]_bqsr.bam -L /home/zhoukaiwen/database/database/Agilent_V6.interval_list -V /home/zhoukaiwen/database/gatk_bundle/hg19/af-only-gnomad.raw.sites.b37.chr.SNP_biallelic.vcf.gz -O $a[0].pileups.table && java -Xmx4G -jar /home/zhoukaiwen/software/gatk-4.1.2.0/gatk-package-4.1.2.0-local.jar GetPileupSummaries -I ../align/$a[1]_bqsr.bam -L /home/zhoukaiwen/database/database/Agilent_V6.interval_list -V /home/zhoukaiwen/database/gatk_bundle/hg19/af-only-gnomad.raw.sites.b37.chr.SNP_biallelic.vcf.gz -O $a[1].pileups.table && java -Xmx4G -jar /home/zhoukaiwen/software/gatk-4.1.2.0/gatk-package-4.1.2.0-local.jar CalculateContamination -I $a[1].pileups.table -matched $a[0].pileups.table -O $a[1].CalculateContamination.table && java -Xmx4G -jar /home/zhoukaiwen/software/gatk-4.1.2.0/gatk-package-4.1.2.0-local.jar FilterMutectCalls -R /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa -V $a[1].raw.vcf --contamination-table $a[1].CalculateContamination.table -O $a[1].somatic.vcf && perl /home/zhoukaiwen/software/annovar/convert2annovar.pl -format vcf4old -filter pass $a[1].somatic.vcf >$a[1].somatic.annovar && perl /home/zhoukaiwen/software/annovar/table_annovar.pl $a[1].somatic.annovar /home/zhoukaiwen/software/annovar/database/humandb_hg19 --protocol refGene,gnomad_exome,avsnp150,icgc21,cosmic70,clinvar_20190305,dbnsfp35c,dbscsnv11,intervar_20180118 --operation g,f,f,f,f,f,f,f,f --buildver hg19 --remove 
--nastring - --thread 4 --otherinfo && echo $a[1] somatic ok\n"' sample_pair.txt >mutect2.sh
# sample_pair.txt 为 tab 分隔的 txt 文件,第一列为 normal,第二列为 tumor sample id
nohup ./mutect2.sh > mutect2.log 2>&1&
8. 将 vcf 转为 maf 后过滤,筛选 filter 为 PASS
conda activate vep
export PERL5LIB="/home/zhoukaiwen/software/anaconda3/envs/vep/lib/site_perl/5.26.2/x86_64-linux-thread-multi"
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "vcf2maf.pl --input-vcf $a[1].somatic.vcf --output-maf $a[1].maf --normal-id $a[0] --tumor-id $a[1] --vcf-tumor-id $a[1] --vcf-normal-id $a[0] --vep-path /home/zhoukaiwen/software/anaconda3/envs/vep/bin --vep-data /home/zhoukaiwen/database/vep --ref-fasta /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa && cat $a[1].maf|grep -E \"version|FILTER|PASS\" > $a[1]_filtered.maf && echo $a[1] vcf2maf ok\n"' sample_pair.txt >vcf2maf.sh
nohup ./vcf2maf.sh > vcf2maf.log 2>&1 &
9. 将所有 maf 合并为一个文件,并用maf 计算 VAF (python)
import os

# Merge all per-sample "*filtered.maf" files under /IBC/WES/somatic into one
# all.maf, and write a slimmed-down VAF.maf with per-variant allele depths and
# variant allele frequency (VAF).
# NOTE(review): output files are opened in append mode, so rerunning the script
# duplicates content — delete all.maf / VAF.maf before a rerun.
# (Indentation restored — the pasted original had lost it and would not run.)
all_maf = open("/IBC/WES/somatic/all.maf", "a")
VAF_maf = open("/IBC/WES/somatic/VAF.maf", "a")
path = "/IBC/WES/somatic/"
file_list = os.listdir(path)
maf_list = []
for file in os.listdir(path):
    if file.endswith("filtered.maf"):
        maf_list.append(file)
maf_list.sort()
print(maf_list)

# n / x make sure the "#version" line and the column header from the first MAF
# are written to all.maf only once.
n = 0
x = 0
# NOTE(review): this header is space-separated while the data rows below are
# tab-separated — confirm downstream consumers tolerate that.
VAF_maf.write("Hugo_Symbol Chromosome Start_Position End_Position Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele2 Ref_allele_depth Alt_allele_depth Tumor_Sample_Barcode VAF" + '\n')
for file in maf_list:
    if file == "all.maf":
        pass
    elif file == "VAF.maf":
        pass
    else:
        f = open(path + "/" + file, "r")
        for lines in f:
            cutlines = lines.strip().split('\t')
            if "#version" in lines:
                header1 = lines
                if n == 0:
                    all_maf.write(lines)
                n += 1
            elif "Hugo_Symbol" in lines:
                header2 = lines
                if x == 0:
                    all_maf.write(lines)
                x += 1
            else:
                all_maf.write(lines)
                # vcf2maf columns (1-based): $41 = t_ref_count, $42 = t_alt_count,
                # i.e. 0-based indices 40 and 41 — the same columns reported below.
                # FIX(review): the original computed VAF from indices 41/42
                # (columns $42/$43), an off-by-one relative to the depths it
                # writes out; corrected here — please confirm against one MAF.
                ref_depth = int(cutlines[40])
                alt_depth = int(cutlines[41])
                total_depth = ref_depth + alt_depth
                # Guard zero total depth instead of raising ZeroDivisionError.
                VAF = alt_depth / total_depth if total_depth else 0.0
                # $1,$5,$6,$7,$9,$10,$11,$13,$41,$42,$16
                VAF_maf.write(cutlines[0] + '\t' +
                              cutlines[4] + '\t' +
                              cutlines[5] + '\t' +
                              cutlines[6] + '\t' +
                              cutlines[8] + '\t' +
                              cutlines[9] + '\t' +
                              cutlines[10] + '\t' +
                              cutlines[12] + '\t' +
                              cutlines[40] + '\t' +
                              cutlines[41] + '\t' +
                              cutlines[15] + '\t' +
                              str(VAF) + '\n')
10. 计算微卫星不稳定
cd ../msi
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "/home/zhoukaiwen/software/msisensor/binary/msisensor_Linux_x86_64 msi -d /home/zhoukaiwen/database/database/microsatellites.list -n ../align/$a[0]_bqsr.bam -t ../align/$a[1]_bqsr.bam -e /home/zhoukaiwen/database/database/Agilent_V6.bed -o $a[1].msi && echo $a[1] ok\n"' ../somatic/sample_pair.txt >msi.sh
nohup ./msi.sh > msi.log 2>&1 &
11. Facets 计算CNV
# Using R4.0.3 in ~/.bashrc
cd ../facets
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "perl /home/zhoukaiwen/software/FACETS/pileup/snp-pileup.pl ../align/$a[0]_bqsr.bam ../align/$a[1]_bqsr.bam $a[1].txt && cp /home/zhoukaiwen/software/FACETS/demo.R $a[1].R && sed -i \"s/example/$a[1]/\" $a[1].R && Rscript $a[1].R && echo $a[1] facets ok\n"' ../somatic/sample_pair.txt >facets.sh
nohup ./facets.sh > facets.log 2>&1 &
# II期服务器上跑
conda activate facets
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "snp-pileup -A -p -v /home/zhoukaiwen/database/dbSNP/human_9606_b151_GRCh37p13/00-common_all.vcf.gz $a[1].txt ../align/$a[0]_bqsr.bam ../align/$a[1]_bqsr.bam && cp /home/zhoukaiwen/software/FACETS/demo.R $a[1].R && sed -i \"s/example/$a[1]/\" $a[1].R && Rscript $a[1].R && echo $a[1] facets ok\n"' ../somatic/sample_pair.txt >facets.sh
ls *_cncf.xls|perl -ne 'chomp;if($_=~/(\S+)\_cncf.xls/){print "perl /home/zhoukaiwen/software/facets_annovar.pl $_ >$1.cnv.annovar && perl /home/zhoukaiwen/software/annovar/annotate_variation.pl -buildver hg19 $1.cnv.annovar /home/zhoukaiwen/software/annovar/database/humandb_hg19 && perl /home/zhoukaiwen/software/annovar/annotate_variation.pl -regionanno -build hg19 -out $1.cnv.annovar -dbtype cytoBand $1.cnv.annovar /home/zhoukaiwen/software/annovar/database/humandb_hg19 &\n"}' > facets.annovar.sh
nohup ./facets.annovar.sh > facets.annovar.log 2>&1 &
12. Sequenza 计算CNV
# 运行一次即可, 用于构建 hg19.gc50Base.wig.gz 或 GRCh37-lite.gc50Base.wig.gz
cd /home/zhoukaiwen/database/GRCh37_hg19
sequenza-utils gc_wiggle -w 50 --fasta GRCh37-lite.fa -o GRCh37-lite.gc50Base.wig.gz
sequenza-utils gc_wiggle -w 50 --fasta hg19.fa -o hg19.gc50Base.wig.gz
cd ../sequenza
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "sequenza-utils bam2seqz -n ../align/$a[0]_bqsr.bam -t ../align/$a[1]_bqsr.bam -F /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa -gc /home/zhoukaiwen/database/database/hg19.gc50Base.wig.gz -o $a[1].seqz.gz && sequenza-utils seqz_binning -s $a[1].seqz.gz -w 50 -o $a[1].small.seqz.gz && cp /home/zhoukaiwen/software/sequenza/demo.R $a[1].R && sed -i \"s/example/$a[1]/g\" $a[1].R && Rscript $a[1].R && echo $a[1] sequenza ok\n"' ../somatic/sample_pair.txt >sequenza.sh
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "sequenza-utils bam2seqz -n ../align/$a[1]_bqsr.bam -t ../align/$a[0]_bqsr.bam -F /home/zhoukaiwen/database/GRCh37_hg19/GRCh37-lite.fa -gc /home/zhoukaiwen/database/GRCh37_hg19/GRCh37-lite.gc50Base.wig.gz -o $a[0].seqz.gz && sequenza-utils seqz_binning -s $a[0].seqz.gz -w 50 -o $a[0].small.seqz.gz && cp /home/zhoukaiwen/software/sequenza/demo.R $a[0].R && sed -i \"s/example/$a[0]/g\" $a[0].R && Rscript $a[0].R && echo $a[0] sequenza ok\n"' ../somatic/sample_pair.txt >sequenza.sh
nohup ./sequenza.sh>sequenza.log 2>&1 &
13. Manta call SV 结构变异
ls *-T_bqsr.bam|perl -ne 'chomp;if($_=~/(\S+)\-T\_bqsr.bam/){print "/home/zhoukaiwen/software/anaconda3/envs/manta/bin/python2 /home/zhoukaiwen/software/anaconda3/envs/manta/bin/configManta.py --normalBam ../align/$1-B_bqsr.bam --tumorBam ../align/$1-T_bqsr.bam --referenceFasta /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa --runDir ./$1 --exome\n"}' > manta.sh
ls *-T_bqsr.bam | perl -ne 'chomp;if($_=~/(\S+)\-T\_bqsr.bam/){print "/home/zhoukaiwen/software/anaconda3/envs/manta/bin/python2 $1/runWorkflow.py -m local -j 4 1\>$1/$1\_manta.log 2\>\&1\n"}' > ../manta/RunManta.sh
# TCGA-BRCA
ls *_T_bqsr.bam|perl -ne 'chomp;if($_=~/(\S+)\_T\_bqsr.bam/){print "/home/zhoukaiwen/software/anaconda3/envs/manta/bin/python2 /home/zhoukaiwen/software/anaconda3/envs/manta/bin/configManta.py --normalBam ../align/$1_N_bqsr.bam --tumorBam ../align/$1_T_bqsr.bam --referenceFasta /home/zhoukaiwen/database/GRCh37_hg19/GRCh37-lite.fa --runDir ./$1 --exome\n"}' > manta.sh
ls *_T_bqsr.bam | perl -ne 'chomp;if($_=~/(\S+)\_T\_bqsr.bam/){print "/home/zhoukaiwen/software/anaconda3/envs/manta/bin/python2 $1/runWorkflow.py -m local -j 4 1\>$1/$1\_manta.log 2\>\&1\n"}' > ../manta/RunManta.sh
/home/zhoukaiwen/software/anaconda3/envs/manta/bin/python2 ${tumor}/runWorkflow.py -m local -j 4 1>${tumor}/${tumor}_manta.log 2>&1
nohup /home/zhoukaiwen/software/anaconda3/envs/manta/bin/python2 ./*/runWorkflow.py &
gunzip ./*/results/variants/*.gz
for i in `ls ./|grep "TCGA"`; do /home/zhoukaiwen/software/anaconda3/envs/manta/bin/python2 /home/zhoukaiwen/software/anaconda3/envs/manta/bin/convertInversion.py /home/zhoukaiwen/software/anaconda3/envs/samtools1.13/bin/samtools /home/zhoukaiwen/database/GRCh37_hg19/GRCh37-lite.fa ./$i/results/variants/somaticSV.vcf\>./somaticSV/${i}_somatic_SV_convertedInversion.vcf; done
for i in `ls ./|grep "IBC"`; do echo /home/zhoukaiwen/software/anaconda3/envs/manta/bin/python2 /home/zhoukaiwen/software/anaconda3/envs/manta/bin/convertInversion.py /home/zhoukaiwen/software/anaconda3/envs/samtools1.13/bin/samtools /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa ./$i/results/variants/somaticSV.vcf\>./somaticSV/${i}_somatic_SV_convertedInversion.vcf>>convertINV.sh; done
# 复制以上内容并运行
for i in `ls *_SV_convertedInversion.vcf`; do echo cat $i\|grep -E \"#\|PASS\"\>${i%_SV_convertedInversion.vcf}_SV_convertedInversion_filtered.vcf; done
14. polysolver HLA基因型预测
bam 文件染色体前删除 chr, 1.13版本samtools才可以用samtools reheader
cd ../align
ls *_bqsr.bam > sample_list.txt
python
# Generate reheader.sh: one samtools-reheader command per BAM in
# sample_list.txt, stripping the "chr" prefix from @SQ contig names
# (polysolver requires chr-less names; samtools >= 1.13 for `reheader -c`).
# (Indentation restored — the pasted original had lost it and would not run.)
input_path = "/home/zhoukaiwen/IBC/exome/align/"
output_path = "/home/zhoukaiwen/IBC/exome/align/"
sample_list = open("sample_list.txt", "r")
# NOTE(review): append mode — rerunning duplicates commands in reheader.sh.
out_file = open("reheader.sh", "a")
for lines in sample_list:
    cutlines = lines.strip().split('.')
    out_file.write("/home/zhoukaiwen/software/samtools-1.13/samtools reheader -c \'perl -pe \"s/^(@SQ.*)(\\tSN:)chr/\$1\$2/\"\' " + input_path + cutlines[0] + ".bam > " + output_path + cutlines[0] +"_reheader.bam && /home/zhoukaiwen/software/samtools-1.13/samtools index " + output_path + cutlines[0] +"_reheader.bam && echo " + cutlines[0] + " ok\n")
chmod u+x reheader.sh
nohup ./reheader.sh &
ls *_bqsr_reheader.bam|perl -ne 'chomp;if($_=~/(\S+)\_bqsr/){print "mkdir $1 && bash /home/polysolver/scripts/shell_call_hla_type ../align/$1_bqsr_reheader.bam Asian 1 hg19 STDFQ 0 ./$1 && echo $1 ok\n"}' > ../polysolver/polysolver.sh
docker exec -it 5e81af527b75 bash
cd /Desktop/home/zhoukaiwen/IBC/exome/polysolver/
nohup ./polysolver.sh &
15. LOHHLA
15.1 创建文件夹
cd lohhla
mkdir {lohhla_bam,hla,ploidy}
15.2 在lohhla文件夹内手动创建 list.txt,第一列为病人号,第二列为病人正常号,如
IBC01 IBC01-B
IBC02 IBC02-B
IBC03 IBC03-B
15.3 更换文件名,LOHHLA要求文件名为sample id + .bam,将更换了文件名的文件存入align/lohhla_bam文件夹
cd ../align
# Copy each BQSR BAM into lohhla_bam/ renamed to <sample>.bam (LOHHLA requires
# "<sample id>.bam" naming).
# FIX(review): the original wrote "&;done" — the semicolon after "&" is a bash
# syntax error; "done" may follow "&" directly. Also glob instead of `ls`.
for name in *_bqsr.bam; do
  nohup cp "$name" "../lohhla/lohhla_bam/${name%_bqsr.bam}.bam" &
done
cd ../lohhla/lohhla_bam
for name in IBC0???.bam; do
  nohup samtools index "$name" > "$name.log" 2>&1 &
done
# Group each patient's BAMs into a directory named after the patient
# (prefix before the first "-", taken from the normal "B" sample).
for name in IBC0??B.bam; do
  nohup mkdir "${name%%-*}" && mv "${name%%-*}"* "${name%%-*}" &
done
15.4 更改 polysolver 里 winners.hla.txt 文件的格式
cd ../polysolver
ls |grep "IBC">id.txt
# python
# Reformat each patient's polysolver winners.hla.txt (one line per HLA gene:
# gene, allele1, allele2) into winners2.hla.txt with one allele per line — the
# layout LOHHLA's --hlaPath file expects.
# (Indentation restored — the pasted original had lost it and would not run.)
pwd = "/home/zhoukaiwen/IBC/exome/polysolver/"
id_list = open(pwd+"id.txt","r")
for lines in id_list:
    lines=lines.strip()
    input_file = open(pwd+lines+"/"+"winners.hla.txt","r")
    # NOTE(review): append mode — rerunning duplicates alleles in winners2.hla.txt.
    output_file = open(pwd+lines+"/"+"winners2.hla.txt","a")
    for lines2 in input_file:
        cutlines2 = lines2.strip().split('\t')
        output_file.write(cutlines2[1] + '\n' + cutlines2[2] + '\n')
15.5 取正常组织的 winners2.hla.txt作为该病人的hla(待确认),拷贝至 ../lohhla/hla/ 文件夹下
for name in `ls -d IBC0??B`; do cp $name/winners2.hla.txt ../lohhla/hla/${name%%-B}.hla.txt; done
15.6 从 facets 结果中提取肿瘤纯度和倍性信息
cd ../facets
ls *_info.xls> info_list.txt
# python
# Extract tumor purity and ploidy from each FACETS *_info.xls listed in
# info_list.txt and write a per-sample <sample>_ploidy.txt for LOHHLA's
# --CopyNumLoc input.
# (Indentation restored — the pasted original had lost it and would not run;
# the final write is placed once per input file, after its rows are scanned.)
pwd = "/home/zhoukaiwen/IBC/exome/facets/"
info_list = open("/home/zhoukaiwen/IBC/exome/facets/info_list.txt","r")
for lines in info_list:
    lines=lines.strip()
    input_file = open(pwd+lines,"r")
    cutlines = lines.strip().split('_')
    output_file = open(pwd + cutlines[0] + "_ploidy.txt", "a")
    # NOTE(review): the header names 3 columns but the data row below has 4
    # fields (sample, literal "2", purity, ploidy) — confirm this is the exact
    # layout LOHHLA parses.
    output_file.write("Ploidy" + '\t' + "tumorPurity"+'\t'+"tumorPloidy\n")
    for lines2 in input_file:
        cutlines2 = lines2.strip().split('\t')
        if "x" in lines2:
            # skip the R data.frame header row
            pass
        elif "\"1\"" in lines2 :
            purity = cutlines2[1]    # row "1" of the info file = tumor purity
        elif "\"2\"" in lines2 :
            ploidy = cutlines2[1]    # row "2" of the info file = tumor ploidy
    # NOTE(review): if a file lacks the "1"/"2" rows, purity/ploidy carry over
    # from the previous file (or raise NameError on the first) — verify inputs.
    output_file.write(cutlines[0] + '\t2\t' + purity + '\t' + ploidy + '\n')
15.7 将相同病人的纯度与倍性信息存入一个txt,放在 ../lohhla/ploidy 文件夹下
for name in `ls IBC0??S.R`;do cat ${name%%-S.R}-S_ploidy.txt|grep "Ploidy">../lohhla/ploidy/${name%%-S.R}_ploidy.txt && cat ${name%%-S.R}*_ploidy.txt|grep -v "Ploidy">>../lohhla/ploidy/${name%%-S.R}_ploidy.txt;done
注意若此处肿瘤纯度为NA 则无法算出HLA_type1copyNum_withBAFBin 和 HLA_type2copyNum_withBAFBin,若无法算出肿瘤纯度信息(为NA),则用0.2代替,也可看 sequenza 中的 _alternative_solutions.txt 中的 cellularity
15.8 文件准备完成,进行LOHHLA
cd ../lohhla
conda activate lohhla
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "lohhla --patientId $a[0] --outputDir /home/zhoukaiwen/IBC/exome/lohhla/$a[0]/ --BAMDir /home/zhoukaiwen/IBC/exome/lohhla/lohhla_bam/$a[0]/ --normalBAMfile /home/zhoukaiwen/IBC/exome/lohhla/lohhla_bam/$a[0]/$a[1].bam --hlaPath /home/zhoukaiwen/IBC/exome/lohhla/hla/$a[0].hla.txt --HLAfastaLoc /home/zhoukaiwen/database/LOHHLA/data/abc_complete.fasta --CopyNumLoc /home/zhoukaiwen/IBC/exome/lohhla/ploidy/$a[0]_ploidy.txt --mappingStep TRUE --minCoverageFilter 10 --fishingStep TRUE --cleanUp FALSE --gatkDir /home/zhoukaiwen/software/picard-tools-1.119/ --novoDir /home/zhoukaiwen/software/novocraftV3.08.01 && echo $a[0] lohhla ok\n"' list.txt >lohhla.sh
HLA_type1copyNum_withBAFBin 或 HLA_type2copyNum_withBAFBin<0.5
PVal_unique < 0.05
才能确认为LossAllele
16. 通过 docker 使用 pTuneos 新抗原预测 (有bug,bwa路径无法设置,跳过)
docker exec -it 57e0b0fe505c bash
17. netMHCpan 新抗原预测(需学习如何用 HGVSc 提取氨基酸序列)
将 maf 文件中的 RefSeq 导入 https://www.uniprot.org/uploadlists/
select options 里 设置RefSeq Nucleotide To UniProtKB,下载输出文件(fasta格式),在此文件中抓取HGVSc的突变,之后将这些抓取出来的氨基酸导入网页版 netMHCpan(http://www.cbs.dtu.dk/services/NetMHCpan/) 或 本地 netMHCpan
18. NeoPredPipe预测新抗原
18.1 更改 polysolver 里 winners.hla.txt 文件的格式
cd ../polysolver
ls |grep "IBC">id.txt
python
# Reformat each patient's polysolver winners.hla.txt into a single line:
# "<patient>\tallele1\tallele2\t..." — the HLA file layout NeoPredPipe expects.
# (Indentation restored — the pasted original had lost it and would not run.)
pwd = "/home/zhoukaiwen/IBC/exome/polysolver/"
id_list = open(pwd+"id.txt","r")
for lines in id_list:
    lines=lines.strip()
    input_file = open(pwd+lines+"/"+"winners.hla.txt","r")
    # NOTE(review): append mode — rerunning duplicates rows in winners3.hla.txt.
    output_file = open(pwd+lines+"/"+"winners3.hla.txt","a")
    output_file.write(lines + '\t')
    for lines2 in input_file:
        cutlines2 = lines2.strip().split('\t')
        output_file.write(cutlines2[1] + '\t' + cutlines2[2] + '\t')
    output_file.write('\n')
winners3.hla.txt作为该病人的hla(待确认),拷贝至 ../lohhla/hla/ 文件夹下
for name in `ls -d IBC0???`;do cp $name/winners3.hla.txt ../neopredpipe/hla/$name.hla.txt; done
18.2 将somatic/vcf文件拷贝至neopredpipe/vcf目录下,注意命名需要和上一步hla文件中的病人名一致
cd neopredpipe && mkdir {vcf,hla,out}
cd vcf
for name in `ls *.somatic.vcf`;do mkdir ../neopredpipe/vcf/$name && cp $name ../neopredpipe/vcf/$name/${name%%.somatic.vcf}.vcf; done
ls -d *>list.txt
ls -d *|perl -ne 'chomp;print "python NeoPredPipe.py -I /Desktop/home/zhoukaiwen/IBC/exome/neopredpipe/vcf/$1/ -H /Desktop/home/zhoukaiwen/IBC/exome/neopredpipe/hla/$1.hla.txt -o out/$1 -n $1 && echo $1 ok\n"' >../NeoPredPipe.sh
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\n/;print "python NeoPredPipe.py -I /Desktop/home/zhoukaiwen/IBC/exome/neopredpipe/vcf/$0/ -H /Desktop/home/zhoukaiwen/IBC/exome/neopredpipe/hla/$0.hla.txt -o out/$0 -n $0 && echo $0 ok\n"' list.txt > ../NeoPredPipe.sh
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "$0"' list.txt
python NeoPredPipe.py -I /Desktop/NeoPredPipe_test/ -H /Desktop/polysolver_test_out/Neopredpipe_HLAfile.txt -o /Neopredpipe_out/ -n test
19. HAPSEG + ABSOLUTE 预测肿瘤纯度&倍性
# R
library(ABSOLUTE)
WES call germ line mutation
接上第6步
7. 在 GVCF 模式下对每个样本 call 突变:HaplotypeCaller
ls *_bqsr.bam|perl -ne 'chomp;if($_=~/(\S+)\_bqsr/){print "java -Xmx20G -jar /home/zhoukaiwen/software/gatk-4.1.2.0/gatk-package-4.1.2.0-local.jar HaplotypeCaller -R /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa -ERC GVCF -I ../align/$1_bqsr.bam -O ../germline/$1_HaplotypeCaller.vcf --intervals /home/zhoukaiwen/database/database/Agilent_V6.interval_list && echo $1 germline ok\n"}' > ../germline/HaplotypeCaller.sh
nohup java -Xmx20G -jar /home/zhoukaiwen/software/gatk-4.1.2.0/gatk-package-4.1.2.0-local.jar GenomicsDBImport -R /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa $(ls *.vcf | awk '{print "-V "$0" "}') -L /home/zhoukaiwen/database/database/Agilent_V6.interval_list --merge-input-intervals TRUE --genomicsdb-workspace-path ./gvcfs_db > GenomicsDBImport.log 2>&1 &
#所有样本合为1个,应分病人合成
#手动修改为病人编号
ls *_HaplotypeCaller.vcf|perl -ne 'chomp;if($_=~/(\S+)\_HaplotypeCaller.vcf/){print "$1\n"}'>sample_list.txt
ls *_HaplotypeCaller.vcf|perl -ne 'chomp;if($_=~/(\S+)\_HaplotypeCaller.vcf/){print "java -Xmx20G -jar /home/zhoukaiwen/software/gatk-4.1.2.0/gatk-package-4.1.2.0-local.jar GenomicsDBImport -R /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa ../align/$0.vcf -L /home/zhoukaiwen/database/database/Agilent_V6.interval_list --merge-input-intervals TRUE --genomicsdb-workspace-path ./gvcfs_db&& echo $1 germline ok\n"}' > ../germline/GenomicsDBImport.sh
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "java -Xmx20G -jar /home/zhoukaiwen/software/gatk-4.1.2.0/gatk-package-4.1.2.0-local.jar GenomicsDBImport -R /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa -V ../germline/$a[0]-B_HaplotypeCaller.vcf -V ../germline/$a[0]-S_HaplotypeCaller.vcf -V ../germline/$a[0]-T_HaplotypeCaller.vcf -L /home/zhoukaiwen/database/database/Agilent_V6.interval_list --merge-input-intervals TRUE --genomicsdb-workspace-path ./$a[0]_gvcfs_db && echo $a[0] GenomicsDBImport ok\n"' sample_list.txt >GenomicsDBImport.sh
nohup ./GenomicsDBImport.sh > GenomicsDBImport.log 2>&1 &
perl -ne 'chomp;next if($_=~/^$/);my @a=split /\t/;print "java -Xmx20G -jar /home/zhoukaiwen/software/gatk-4.1.2.0/gatk-package-4.1.2.0-local.jar GenotypeGVCFs -R /home/zhoukaiwen/database/GRCh37_hg19/hg19.fa -V gendb://$a[0]_gvcfs_db -O $a[0]_germline_merged.vcf && echo $a[0] GenotypeGVCFs ok\n"' sample_list.txt >GenotypeGVCFs.sh
nohup ./GenotypeGVCFs.sh > GenotypeGVCFs.log 2>&1 &
RNA-seq 转录组测序分析流程
1. 创建文件夹
# FIX: brace expansion must contain no spaces — the original
# "mkdir {rawdata, align, rnaseqc, fusion}" creates literal directories such
# as "{rawdata," instead of the four intended ones (compare line 39 above).
mkdir {rawdata,align,rnaseqc,fusion}
2. 数据解压
#!/bin/bash
# Convert every downloaded SRA run (*.1) in the GEO series directory to FASTQ.
# Glob directly instead of parsing `ls`, and quote "$file", so paths containing
# spaces or glob characters cannot break the loop.
for file in /Bulk/Long/GSE_data/GSE50760/*.1; do
  fasterq-dump --split-3 "$file" -O /Bulk/metastasis/GSE50760/rawdata
done
3. 数据质控
cd /Bulk/metastasis/SRP034161/rawdata
ls *.fq.gz|xargs fastqc -t 4 -o ./
multiqc *.zip
4. 序列比对
# hisat2 构建参考基因组索引
hisat2-build ./hg38.fa hg38
# hisat2 剪接位点信息索引添加
hisat2_extract_splice_sites.py ./gencode.v46.annotation.gtf > gencode.v46.annotation.splicesites
cd /data/user/heminghui/project/hepatoblastoma/multifocal/mRNA/align
# 双端:
ls ../rawdata/*_R1.fq.gz|perl -ne 'chomp;my $a=$_;$a=~s/\_R1/\_R2/;if($_=~/\/([^\/]+)\_R1/){print "/home/zhoukaiwen/software/hisat2-2.1.0/hisat2 -p 10 -x /home/zhoukaiwen/database/database/hg19/hg19 --known-splicesite-infile /home/zhoukaiwen/database/database/gencode.v31lift37.annotation.splicesites -1 $_ -2 $a --rg-id $1 --rg SM:$1 | samtools view -bS - >$1.bam && samtools sort -@ 5 -o $1.sort.bam $1.bam && samtools index $1.sort.bam && echo $1 align ok\n"}' >aln.sh
chmod u+x aln.sh
nohup /data/user/heminghui/bin/parallel_run.pl aln.sh &
5. rnaseqc 生成表达矩阵
cd /data/user/heminghui/project/hepatoblastoma/multifocal/mRNA/rnaseqc
# 生成rpkm表达矩阵
ls ../align/*.sort.bam|perl -ne 'chomp;if($_=~/\/([^\/]+)\.sort/){print "/home/zhoukaiwen/software/rnaseqc /home/zhoukaiwen/database/database/gencode.v31lift37.gene.gtf $_ ./ -s $1 -q 1 -u --rpkm --coverage && echo $1 ok\n"}' >rnaseqc_rpkm.sh
# 生成tpm表达矩阵
ls ../align/*.sort.bam|perl -ne 'chomp;if($_=~/\/([^\/]+)\.sort/){print "/home/zhoukaiwen/software/rnaseqc /home/zhoukaiwen/database/database/gencode.v31lift37.gene.gtf $_ ./ -s $1 -q 1 -u --coverage && echo $1 ok\n"}' >rnaseqc_tpm.sh
nohup ./rnaseqc_tpm.sh>rnaseqc_tpm.log 2>&1 &
# lncRNA (不确定是不是这么跑)
ls ../lnc_align/*.sort.bam|perl -ne 'chomp;if($_=~/\/([^\/]+)\.sort/){print "/home/zhoukaiwen/software/rnaseqc /home/zhoukaiwen/database/GRCh37_hg19/gencode.v39lift37.long_noncoding_RNAs.gtf $_ ./ -s $1 -q 1 -u --rpkm --coverage && echo $1 ok\n"}' >rnaseqc_rpkm.sh
rpkm文件合并(python),合并后可以用于跑CIBERSORT
import glob

# Merge per-sample RNA-SeQC *.gene_rpkm.gct files into one gene x sample RPKM
# matrix (rpkm_merged.txt), usable as CIBERSORT input.
# NOTE(review): append mode — delete rpkm_merged.txt before rerunning.
# (Indentation restored — the pasted original had lost it and would not run.)
file_list = glob.glob("/home/zhoukaiwen/IBC/RNA/rnaseqc/*.gene_rpkm.gct")
f_out_combined = open("/home/zhoukaiwen/IBC/RNA/rnaseqc/rpkm_merged.txt", "a")
sample_list = []
gene_list = []   # preserves first-seen gene order for the output rows
all_dict = {}    # gene symbol -> {sample id -> rpkm value}
for file in file_list:
    f = open(file.strip())
    for lines in f:
        cutlines = lines.strip().split("\t")
        if "#1.2" in lines:  # GCT format version line
            pass
        elif "59953" in cutlines[0]:  # dimensions line (gene count, sample count)
            # NOTE(review): hard-coded gene count — breaks if the GTF changes.
            pass
        elif "Name" in lines:  # column header: Name / Description / <sample>
            sample = cutlines[2]
            sample_list.append(sample)
        else:
            # Dict membership is O(1) vs the original O(n) scan of gene_list.
            if cutlines[1] in all_dict:
                all_dict[cutlines[1]][sample] = cutlines[2]
            else:
                gene_list.append(cutlines[1])
                all_dict[cutlines[1]] = {}
                all_dict[cutlines[1]][sample] = cutlines[2]
sample_list.sort()
heading_sample = "\t".join(sample_list)
f_out_combined.write("Gene" + "\t" + heading_sample + "\n")
for n in gene_list:
    f_out_combined.write(n + "\t")
    for i in range(len(sample_list)):
        f_out_combined.write(str(all_dict[n][sample_list[i]]) + "\t")
    f_out_combined.write("\n")
reads文件合并,用于下游分析
import glob

# Merge per-sample RNA-SeQC *.gene_reads.gct files into one gene x sample raw
# read-count matrix (reads_merged.txt) for downstream analysis (e.g. DESeq2).
# NOTE(review): append mode — delete reads_merged.txt before rerunning.
# (Indentation restored — the pasted original had lost it and would not run.)
file_list = glob.glob("/IBC/RNA/rnaseqc/*.gene_reads.gct")
f_out_combined = open("/IBC/RNA/rnaseqc/reads_merged.txt", "a")
sample_list = []
gene_list = []   # preserves first-seen gene order for the output rows
all_dict = {}    # gene symbol -> {sample id -> read count}
for file in file_list:
    f = open(file.strip())
    for lines in f:
        cutlines = lines.strip().split("\t")
        if "#1.2" in lines:  # GCT format version line
            pass
        elif "59953" in cutlines[0]:  # dimensions line (gene count, sample count)
            # NOTE(review): hard-coded gene count — breaks if the GTF changes.
            pass
        elif "Name" in lines:  # column header: Name / Description / <sample>
            sample = cutlines[2]
            sample_list.append(sample)
        else:
            # Dict membership is O(1) vs the original O(n) scan of gene_list.
            if cutlines[1] in all_dict:
                all_dict[cutlines[1]][sample] = cutlines[2]
            else:
                gene_list.append(cutlines[1])
                all_dict[cutlines[1]] = {}
                all_dict[cutlines[1]][sample] = cutlines[2]
sample_list.sort()
heading_sample = "\t".join(sample_list)
f_out_combined.write("Gene" + "\t" + heading_sample + "\n")
for n in gene_list:
    f_out_combined.write(n + "\t")
    for i in range(len(sample_list)):
        f_out_combined.write(str(all_dict[n][sample_list[i]]) + "\t")
    f_out_combined.write("\n")
count、rpkm/fpkm、tpm 转换(R语言代码)
# Convert raw read counts to TPM given per-gene effective lengths.
# Computed in log space so zero counts fall out cleanly (exp(-Inf) == 0).
countToTpm <- function(counts, effLen) {
  log_rate <- log(counts) - log(effLen)   # length-normalized rate, log scale
  log_total <- log(sum(exp(log_rate)))    # normalizing constant
  exp(log_rate - log_total + log(1e6))    # scale so the column sums to 1e6
}
# Convert raw read counts to FPKM: counts * 1e9 / (effLen * total library size),
# evaluated in log space to avoid intermediate overflow.
countToFpkm <- function(counts, effLen) {
  total_reads <- sum(counts)  # library size
  exp(log(counts) + log(1e9) - log(effLen) - log(total_reads))
}
# Rescale an FPKM vector to TPM (same values renormalized to sum to 1e6).
fpkmToTpm <- function(fpkm) {
  log_tpm <- log(fpkm) - log(sum(fpkm)) + log(1e6)
  exp(log_tpm)
}
# 示例:用 fpkmToTpm 将 FPKM 表达矩阵转换为 TPM
expMatrix<-read.table("fpkm_expr.txt",header = T,row.names = 1)
tpms <- apply(expMatrix,2,fpkmToTpm)
tpms[1:3,]
#最后可以根据TPM的特征进行检查,看每列加和是否一致
colSums(tpms)
6. starfusion 融合基因分析
cd /data/user/heminghui/project/hepatoblastoma/multifocal/mRNA/fusion
export PERL5LIB="/usr/local/lib64/perl5/"
ls ../rawdata/*_R1.fq.gz|perl -ne 'chomp;my $a=$_;$a=~s/\_R1/\_R2/;if($_=~/\/([^\/]+)\_R1/){print "/home/zhoukaiwen/software/STAR-Fusion.v1.10.1/STAR-Fusion --genome_lib_dir /home/zhoukaiwen/database/star-fusion-index/GRCh37_gencode_v19_CTAT_lib_Mar012021.plug-n-play/ctat_genome_lib_build_dir --left_fq $_ --right_fq $a --CPU 10 --output_dir $1 && echo $1 fusion ok\n"}' >fusion.sh
nohup /data/user/heminghui/bin/parallel_run.pl fusion.sh &
conda activate starfusion
ls ../rawdata/*_R1.fq.gz|perl -ne 'chomp;my $a=$_;$a=~s/\_R1/\_R2/;if($_=~/\/([^\/]+)\_R1/){print "STAR-Fusion --genome_lib_dir /home/zhoukaiwen/database/star-fusion-index/GRCh37_gencode_v19_CTAT_lib_Mar012021.plug-n-play/ctat_genome_lib_build_dir --left_fq $_ --right_fq $a --CPU 10 --output_dir $1 && echo $1 fusion ok\n"}' >fusion.sh
批量更改结果文件名
for n in `ls ./*/*.tsv`;do cp $n ${n%/*}_${n##*/};done
bam 文件添加/删除 Chr/chr 1.13版本samtools才可以用samtools reheader
# Remove comment lines
samtools reheader -c 'grep -v ^@CO' in.bam
# Add “Chr” prefix to chromosome names. Note extra backslashes before dollar signs to prevent unwanted shell variable expansion.
samtools reheader -c 'perl -pe "s/^(@SQ.*)(\tSN:)(\d+|X|Y|MT)(\s|\$)/\$1Chr\$2\$3/"' in.bam
# Remove “Chr” prefix
samtools reheader -c 'perl -pe "s/^(@SQ.*)(\tSN:)Chr/\$1\$2/"' in.bam
for i in `ls *.sort.markdup.bam`;do samtools reheader -c 'perl -pe "s/^(@SQ.*)(\tSN:)(\d+|X|Y|MT)(\s|\$)/\$1chr\$2\$3/"' $i;done
samtools index *.sort.markdup.bam