四、Post alignment QC (你以为壳子好的快递,里面也是好的嘛?)
除了早期的测序质控外,比对后质控也是重要的环节之一,这一部分主要来看看比对后的情况,判断样本是否正常
1、运行 picard
work_dir=/home4/sjshen/project/m6a/GSE29714_Cell
hisat2_bam_dir=$work_dir/1.1_hisat2/mouse/position_sorted_bam
echo $hisat2_bam_dir
cd $hisat2_bam_dir
mkdir picard
species=mm10
#mkdir picard
for i in *.bam
do
echo $i
(nohup picard CollectRnaSeqMetrics \
REF_FLAT=/home/share/ann/${species}.refFlat \
STRAND_SPECIFICITY=NONE \
INPUT=$i \
OUTPUT= ./picard/${i%%.bam}.picard_out \
CHART_OUTPUT=./picard/${i%%.bam}.picard_out_chart.pdf \
REFERENCE_SEQUENCE=/home/share/bowtie2_index/${species}.fa \
> ./picard/${i%%.bam}.picard.log) &
done
Tips1:refFlat文件可以从UCSC下载,没有的话可以使用如下生成:
gtfToGenePred -genePredExt -geneNameAsName2 XXX.gtf XXX.refFlat
Tips2:picard是一个基于Java的软件,有的时候对线程和内存不友好,可以按如下调整:
java -jar -Xmx10000m -XX:ParallelGCThreads=10 $picard_path/picard.jar CollectRnaSeqMetrics
最后生成文件如下:
2、summary
picard_dir=$work_dir/1.1_hisat2/position_sorted_bam/picard
echo $picard_dir
cd $picard_dir
# picard_distribution
echo -e 'Sample\tPCT_CODING_BASES\tPCT_UTR_BASES\tPCT_INTRONIC_BASES\tPCT_INTERGENIC_BASES' > picard_distribution.tab
for i in *_out
do
echo $i
awk -v sample=${i} -v t="" 'BEGIN{FS="[:\t ]+" } NR==8{t=t"\t"$15"\t"$16"\t"$17"\t"$18} END{print sample""t}' $i >> picard_distribution.tab
done
#picard_coverage_100
file=picard_coverage_100.tab
for i in *_out
do
echo $i
if [ ! -f "$file" ]; then
tail -103 $i|cut -f 1,2|sed '1c All_Reads.normalized_coverage\t'${i} > $file
else
tail -103 $i|cut -f2|sed '1c'${i} >temp
paste $file temp > temp2
mv temp2 $file
rm temp
fi
done