-
Notifications
You must be signed in to change notification settings - Fork 1
/
STRT2-UPPMAX.sh
442 lines (410 loc) · 16.6 KB
/
STRT2-UPPMAX.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
#!/bin/bash
PROGNAME="$( basename $0 )"
# Usage
function usage() {
cat << EOS >&2
Usage: ${PROGNAME} [-o <output>] [-g <genome (required)>] [-a <annotation>] [-b <path (required)>] [-i <path (required)>]
Options:
-o, --out Output file name. (default: OUTPUT)
-g, --genome Genome (hg19/hg38/mm9/mm10/canFam3). Required!
-a, --annotation Gene annotation (ref{RefSeq}/ens{Ensembl}/kg{UCSC KnownGenes}/wgEncodeGencodeBasic*{Gencode}) for QC and counting. Default : ref. NOTE: no Ensembl for hg38&mm10, no KnownGenes for canFam3, no Gencode for mm9&CanFam3.
-b, --basecalls /PATH/to/the Illumina basecalls directory. Required!
-i, --index /PATH/to/the directory and basename of the HISAT2 index. Fasta file has to be 'basename.fasta'. Required!
-c, --center The name of the sequencing center that produced the reads. (default: CENTER)
-r, --run The barcode of the run. Prefixed to read names. (default: RUNBARCODE)
-s, --structure Read structure (default: 8M3S74T6B)
-d, --dta Downstream-transcriptome-assembly for HISAT2, which is useful for TFE-based analysis but leads to fewer alignments with short-anchors.
-h, --help Show usage.
-v, --version Show version.
EOS
exit 1
}
function version() {
cat << EOS >&2
STRT2-NextSeq-automated-pipeline ver2020.6.30
EOS
exit 1
}
# Default parameters
OUTPUT_NAME=OUTPUT
run_VALUE=RUNBARCODE
center_VALUE=CENTER
READ_STRUCTURE=8M3S74T6B
IF_DTA=false
# Parameter settings
PARAM=()
for opt in "$@"; do
case "${opt}" in
'-o' | '--out' )
if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]]; then
echo "${PROGNAME}: option requires an argument -- $( echo $1 | sed 's/^-*//' )" 1>&2
exit 1
fi
OUTNAME=true
OUTPUT_NAME="$2"
shift 2
;;
'-g' | '--genome' )
if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]]; then
echo "${PROGNAME}: option requires an argument -- $( echo $1 | sed 's/^-*//' )" 1>&2
exit 1
elif [[ "$2" = "hg19" ]]; then
GENOME=true
GENOME_VALUE="hg19"
shift 2
elif [[ "$2" = "hg38" ]]; then
GENOME=true
GENOME_VALUE="hg38"
shift 2
elif [[ "$2" = "mm9" ]]; then
GENOME=true
GENOME_VALUE="mm9"
shift 2
elif [[ "$2" = "mm10" ]]; then
GENOME=true
GENOME_VALUE="mm10"
shift 2
elif [[ "$2" = "canFam3" ]]; then
GENOME=true
GENOME_VALUE="canFam3"
shift 2
else
usage
exit 1
fi
;;
'-a' | '--annotation' )
if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]]; then
echo "${PROGNAME}: option requires an argument -- $( echo $1 | sed 's/^-*//' )" 1>&2
exit 1
elif [[ "$2" = "ref" ]]; then
ANNO=true
ANNO_VALUE="ref"
shift 2
elif [[ "$2" = "kg" ]]; then
ANNO=true
ANNO_VALUE="kg"
shift 2
elif [[ "$2" = "ens" ]]; then
ANNO=true
ANNO_VALUE="ens"
shift 2
elif [[ "$2" = wgEncodeGencodeBasic* ]]; then
ANNO=true
ANNO_VALUE=$2
shift 2
else
usage
exit 1
fi
;;
'-b' | '--basecalls' )
if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]]; then
echo "${PROGNAME}: option requires an argument -- $( echo $1 | sed 's/^-*//' )" 1>&2
exit 1
fi
BaseCallsDir=true
BaseCallsDir_PATH="$2"
shift 2
;;
'-i' | '--index' )
if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]]; then
echo "${PROGNAME}: option requires an argument -- $( echo $1 | sed 's/^-*//' )" 1>&2
exit 1
fi
Index=true
Index_PATH="$2"
shift 2
;;
'-c' | '--center' )
if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]]; then
echo "${PROGNAME}: option requires an argument -- $( echo $1 | sed 's/^-*//' )" 1>&2
exit 1
fi
center_VALUE="$2"
shift 2
;;
'-r' | '--run' )
if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]]; then
echo "${PROGNAME}: option requires an argument -- $( echo $1 | sed 's/^-*//' )" 1>&2
exit 1
fi
run_VALUE="$2"
shift 2
;;
'-s' | '--structure' )
if [[ -z "$2" ]] || [[ "$2" =~ ^-+ ]]; then
echo "${PROGNAME}: option requires an argument -- $( echo $1 | sed 's/^-*//' )" 1>&2
exit 1
fi
READ_STRUCTURE="$2"
shift 2
;;
'-d' | '--dta' )
IF_DTA=true; shift
;;
'-h' | '--help' )
usage
;;
'-v' | '--version' )
version
;;
'--' | '-' )
shift
PARAM+=( "$@" )
break
;;
-* )
echo "${PROGNAME}: illegal option -- '$( echo $1 | sed 's/^-*//' )'" 1>&2
exit 1
;;
esac
done
if [[ -n "${PARAM[@]}" ]]; then
usage
fi
[ "${GENOME}" != "true" ] && usage
[ "${BaseCallsDir}" != "true" ] && usage
[ "${Index}" != "true" ] && usage
[ "${ANNO}" != "true" ] && ANNO_VALUE=ref
# Loading required tools
module load bioinfo-tools
module load picard/2.23.4
module load HISAT2/2.2.1
module load samtools/1.10
module load BEDTools/2.29.2
module load subread/2.0.0
module load ruby/2.6.2
# Make temporary and output directory
mkdir tmp
mkdir out
# Preparation for barcodes
ALL_LINES=`cat src/barcode.txt | wc -l`
NLINES=`expr $ALL_LINES \- 1`
for i in `seq 1 $NLINES`
do
echo -e ${OUTPUT_NAME}_${i}_Lane1.bam"\t"${OUTPUT_NAME}_${i}_Lane1"\t"${OUTPUT_NAME}_${i}_Lane1 >> tmp/out
done
paste tmp/out <(awk 'NR>1{print $1}' src/barcode.txt) | cut -f 1-4 > tmp/out2 && rm tmp/out
echo -e ${OUTPUT_NAME}_non-indexed_Lane1.bam"\t"${OUTPUT_NAME}_non-indexed_Lane1"\t" ${OUTPUT_NAME}_non-indexed_Lane1"\t"N >> tmp/out2
echo -e OUTPUT"\t"SAMPLE_ALIAS"\t"LIBRARY_NAME"\t"BARCODE_1 | cat - tmp/out2 > library.param.lane1 && rm tmp/out2
# Number of lanes
nlanes=`ls -l ${BaseCallsDir_PATH} | grep ^d | wc -l`
for i in `seq 2 $nlanes`
do
sed -e "s/Lane1/Lane${i}/g" library.param.lane1 > library.param.lane${i}
done
# Convert BCL files to BAM files
for i in `seq 1 $nlanes`
do
java -jar $PICARD_HOME/picard.jar ExtractIlluminaBarcodes \
--BASECALLS_DIR ${BaseCallsDir_PATH}/ \
--LANE ${i} \
--READ_STRUCTURE ${READ_STRUCTURE} \
--BARCODE_FILE src/barcode.txt \
--METRICS_FILE metrics_output_lane${i}.txt ;
java -jar $PICARD_HOME/picard.jar IlluminaBasecallsToSam \
--BASECALLS_DIR ${BaseCallsDir_PATH}/ \
--LANE ${i} \
--READ_STRUCTURE ${READ_STRUCTURE} \
--RUN_BARCODE ${run_VALUE} \
--IGNORE_UNEXPECTED_BARCODES true \
--LIBRARY_PARAMS library.param.lane${i} \
--SEQUENCING_CENTER ${center_VALUE} \
--INCLUDE_NON_PF_READS false
done
rm library.param.lane*
mkdir out/ExtractIlluminaBarcodes_Metrics && mv metrics_output_lane*.txt out/ExtractIlluminaBarcodes_Metrics
# Make the fasta reference / sequence dictionary if they do not exist.
if [[ ! -e ${Index_PATH}.fasta ]]; then
hisat2-inspect ${Index_PATH} > ${Index_PATH}.fasta
java -jar $PICARD_HOME/picard.jar CreateSequenceDictionary --REFERENCE ${Index_PATH}.fasta --OUTPUT ${Index_PATH}.dict
fi
if [[ ! -e ${Index_PATH}.dict ]]; then
java -jar $PICARD_HOME/picard.jar CreateSequenceDictionary --REFERENCE ${Index_PATH}.fasta --OUTPUT ${Index_PATH}.dict
fi
# Mapping by HISAT2 and merging with the original unaligned BAM files to generate UMI-annotated BAM files
mkdir tmp/UMI
mkdir out/HISAT2_Metrics
if [ $IF_DTA = true ]; then
for file in *.bam
do
name=$(basename $file .bam)
echo $name >> out/HISAT2_Metrics/Alignment-summary.txt
java -jar $PICARD_HOME/picard.jar SortSam \
--INPUT $file \
--OUTPUT tmp/.unmapped.sorted.bam \
--SORT_ORDER queryname;
java -jar $PICARD_HOME/picard.jar SamToFastq \
--INPUT tmp/.unmapped.sorted.bam \
--FASTQ tmp/.tmp.fastq ;
hisat2 -p 8 --dta -x ${Index_PATH} \
-U tmp/.tmp.fastq -S /dev/stdout \
2>> out/HISAT2_Metrics/Alignment-summary.txt \
| java -jar $PICARD_HOME/picard.jar SortSam \
--INPUT /dev/stdin \
--OUTPUT tmp/.mapped.sorted.sam \
--SORT_ORDER queryname;
java -jar $PICARD_HOME/picard.jar MergeBamAlignment \
--ATTRIBUTES_TO_RETAIN XS \
--UNMAPPED_BAM tmp/.unmapped.sorted.bam \
--ALIGNED_BAM tmp/.mapped.sorted.sam \
--OUTPUT tmp/UMI/$name.umi.bam \
--REFERENCE_SEQUENCE ${Index_PATH}.fasta
done
else
for file in *.bam
do
name=$(basename $file .bam)
echo $name >> out/HISAT2_Metrics/Alignment-summary.txt
java -jar $PICARD_HOME/picard.jar SortSam \
--INPUT $file \
--OUTPUT tmp/.unmapped.sorted.bam \
--SORT_ORDER queryname;
java -jar $PICARD_HOME/picard.jar SamToFastq \
--INPUT tmp/.unmapped.sorted.bam \
--FASTQ tmp/.tmp.fastq \
| hisat2 -p 8 -x ${Index_PATH} \
-U tmp/.tmp.fastq -S /dev/stdout \
2>> out/HISAT2_Metrics/Alignment-summary.txt \
| java -jar $PICARD_HOME/picard.jar SortSam \
--INPUT /dev/stdin \
--OUTPUT tmp/.mapped.sorted.sam \
--SORT_ORDER queryname;
java -jar $PICARD_HOME/picard.jar MergeBamAlignment \
--ATTRIBUTES_TO_RETAIN XS \
--UNMAPPED_BAM tmp/.unmapped.sorted.bam \
--ALIGNED_BAM tmp/.mapped.sorted.sam \
--OUTPUT tmp/UMI/$name.umi.bam \
--REFERENCE_SEQUENCE ${Index_PATH}.fasta
done
fi
rm tmp/.unmapped.sorted.bam
rm tmp/.mapped.sorted.sam
rm tmp/.tmp.fastq
mkdir tmp/merged
mkdir tmp/Unaligned_bam
mv *.bam tmp/Unaligned_bam
# Merging all lanes
for i in `seq 1 $NLINES`
do
ls tmp/UMI/${OUTPUT_NAME}_${i}_Lane*.umi.bam > tmp/.list
sed -e "s/tmp/--INPUT tmp/" tmp/.list > tmp/.bam.list
rm tmp/.list
java -jar $PICARD_HOME/picard.jar MergeSamFiles \
$(cat tmp/.bam.list) \
--OUTPUT /dev/stdout |
java -jar $PICARD_HOME/picard.jar AddOrReplaceReadGroups \
--INPUT /dev/stdin \
--OUTPUT tmp/merged/${OUTPUT_NAME}_${i}.merged.bam \
--RGLB ${OUTPUT_NAME}_${i} --RGPL NextSeq --RGPU ${i} --RGSM ${i}
rm tmp/.bam.list
done
rm -rf tmp/UMI
# Mark potential PCR duplicates
mkdir out/MarkDuplicates_Metrics
for i in `seq 1 $NLINES`
do
java -jar $PICARD_HOME/picard.jar MarkDuplicates \
--INPUT tmp/merged/${OUTPUT_NAME}_${i}.merged.bam \
--OUTPUT out/${OUTPUT_NAME}_${i}.output.bam \
--METRICS_FILE out/MarkDuplicates_Metrics/${OUTPUT_NAME}_${i}.metrics.txt \
--BARCODE_TAG RX
done
rm -rf tmp/merged
# Preparation for annotation and QC
if [[ ${GENOME_VALUE} = "hg38" ]] && [[ ${ANNO_VALUE} = "ens" ]]; then
echo "No Ensembl gene annotations!! Please use RefSeq, KnownGenes, or Gencode for hg38"
exit 1
elif [[ ${GENOME_VALUE} = "mm10" ]] && [[ ${ANNO_VALUE} = "ens" ]]; then
echo "No Ensembl gene annotations!! Please use RefSeq or KnownGenes, or Gencode for mm10"
exit 1
elif [[ ${GENOME_VALUE} = "canFam3" ]] && [[ ${ANNO_VALUE} = "kg" ]]; then
echo "No KnownGenes annotations!! Please use RefSeq or Ensembl for canFam3"
exit 1
elif [[ ${GENOME_VALUE} = "canFam3" ]] && [[ ${ANNO_VALUE} = wgEncodeGencodeBasic* ]]; then
echo "No Gencode annotations!! Please use RefSeq or Ensembl for canFam3"
exit 1
elif [[ ${GENOME_VALUE} = "mm9" ]] && [[ ${ANNO_VALUE} = wgEncodeGencodeBasic* ]]; then
echo "No Gencode annotations!! Please use RefSeq, KnownGenes, or Ensembl for mm9"
exit 1
elif [[ ${ANNO_VALUE} = "ens" ]]; then
echo "Downloading the Ensembl annotation data..."
curl -o src/ensGene.txt.gz http://hgdownload.cse.ucsc.edu/goldenPath/${GENOME_VALUE}/database/ensGene.txt.gz
curl -o src/ensemblToGeneName.txt.gz http://hgdownload.cse.ucsc.edu/goldenPath/${GENOME_VALUE}/database/ensemblToGeneName.txt.gz
gunzip src/ensGene.txt.gz
gunzip src/ensemblToGeneName.txt.gz
join -1 1 -2 2 -t $'\t' <(sort -k 1,1 src/ensemblToGeneName.txt) <(sort -k 2,2 src/ensGene.txt) > src/common.txt
join -1 1 -2 2 -t $'\t' -v 2 <(sort -k 1,1 src/ensemblToGeneName.txt) <(sort -k 2,2 src/ensGene.txt) | awk 'BEGIN{OFS="\t"}{print $2,$13,$1,$1=$2="",$0}' | cut -f 1-3,7- > src/no-genename.txt
rm src/ensGene.txt && rm src/ensemblToGeneName.txt
cat src/common.txt src/no-genename.txt > src/ens-genes.txt
rm src/common.txt && rm src/no-genename.txt
ruby bin/ENSEMBL-extract.rb
shift 2
elif [[ ${ANNO_VALUE} = "kg" ]]; then
echo "Downloading the UCSC KnownGenes annotation data..."
curl -o src/knownGene.txt.gz http://hgdownload.cse.ucsc.edu/goldenPath/${GENOME_VALUE}/database/knownGene.txt.gz
curl -o src/kgXref.txt.gz http://hgdownload.cse.ucsc.edu/goldenPath/${GENOME_VALUE}/database/kgXref.txt.gz
gunzip src/knownGene.txt.gz
gunzip src/kgXref.txt.gz
join -1 1 -2 1 -t $'\t' <(sort -k 1,1 src/kgXref.txt | cut -f 1-5) <(sort -k 1,1 src/knownGene.txt) > src/knowngene-names.txt
rm src/knownGene.txt && rm src/kgXref.txt
ruby bin/KnownGenes-extract.rb
shift 2
elif [[ ${ANNO_VALUE} = "ref" ]]; then
echo "Downloading the NCBI RefSeq annotation data..."
curl -o src/refGene.txt.gz http://hgdownload.cse.ucsc.edu/goldenPath/${GENOME_VALUE}/database/refGene.txt.gz
gunzip src/refGene.txt.gz
ruby bin/RefSeq-extract.rb
shift 2
elif [[ ${ANNO_VALUE} = wgEncodeGencodeBasic* ]]; then
echo "Downloading the Gencode annotation data..."
curl -o src/Gencode.txt.gz http://hgdownload.cse.ucsc.edu/goldenPath/${GENOME_VALUE}/database/${ANNO_VALUE}.txt.gz
gunzip src/Gencode.txt.gz
ruby bin/GENCODE-extract.rb
shift 2
else
echo "Something is wrong with the annotation data file."
exit 1
fi
echo "Downloading the chromosome size data..."
curl -o src/${GENOME_VALUE}.chrom.sizes http://hgdownload.soe.ucsc.edu/goldenPath/${GENOME_VALUE}/bigZips/${GENOME_VALUE}.chrom.sizes
cat src/${GENOME_VALUE}.chrom.sizes | awk '{print $1"\t"1"\t"$2}' | sortBed -i > src/chrom.size.bed
cat src/proxup.bed | grep -v _alt | grep -v _hap | grep -v _fix | grep -v _random | grep -v ^chrUn | sortBed -i stdin | intersectBed -a stdin -b src/chrom.size.bed > src/proxup_trimmed.bed
cat src/5utr.bed src/proxup_trimmed.bed | grep -v _alt | grep -v _hap | grep -v _fix | grep -v _random | grep -v ^chrUn | sortBed -i stdin | mergeBed -s -o distinct,distinct,distinct -c 4,5,6 -i - | grep -v , > src/coding_5end.bed
cat src/exon.bed src/proxup_trimmed.bed | grep -v _alt | grep -v _hap | grep -v _fix | grep -v _random | grep -v ^chrUn | sortBed -i stdin | mergeBed -s -o distinct,distinct,distinct -c 4,5,6 -i - > src/coding.bed
cat src/ERCC.bed src/coding_5end.bed | awk '{print $4 "\t" $1 "\t" $2+1 "\t" $3 "\t" $6}' > src/5end-regions.saf
rm src/${GENOME_VALUE}.chrom.sizes
rm src/5utr.bed
rm src/exon.bed
rm src/proxup.bed
rm src/proxup_trimmed.bed
# Quality check
cd out
echo -e Barcode"\t"Qualified_reads"\t"Total_reads"\t"Redundancy"\t"Mapped_reads"\t"Mapped_rate\
"\t"Spikein_reads"\t"Spikein-5end_reads"\t"Spikein-5end_rate"\t"Coding_reads"\t"Coding-5end_reads"\t"Coding-5end_rate > ${OUTPUT_NAME}-QC.txt
for file in *.output.bam
do
name=$(basename $file .output.bam)
samtools index $file
QR=$(samtools view -F 256 $file | wc -l)
Total=$(samtools view -F 256 -F 1024 $file | wc -l)
Redundancy=$(echo "scale=2;$QR/$Total" | bc)
Map=$(samtools view -F 256 -F 1024 -F 4 $file | wc -l)
Rate=$(echo "scale=1;$Map*100/$Total" | bc)
Spike=$(samtools view -F 256 -F 1024 -F 4 $file |grep -e ERCC -e NIST| wc -l)
spikein_5end_reads=$(samtools view -u -F 256 -F 1024 -F 4 $file | intersectBed -abam stdin -wa -bed -b ../src/ERCC.bed | cut -f 4 | sort -u | wc -l)
spikein_5end_rate=$(echo "scale=1;$spikein_5end_reads*100/$Spike" | bc)
coding_reads=$(samtools view -u -F 256 -F 1024 -F 4 $file | intersectBed -abam stdin -wa -bed -b ../src/coding.bed | cut -f 4 | sort -u | wc -l)
coding_5end_reads=$(samtools view -u -F 256 -F 1024 -F 4 $file | intersectBed -abam stdin -wa -bed -b ../src/coding_5end.bed | cut -f 4 | sort -u | wc -l)
coding_5end_rate=$(echo "scale=1;$coding_5end_reads*100/$coding_reads" | bc)
echo -e $name"\t"$QR"\t"$Total"\t"$Redundancy"\t"$Map"\t"$Rate"\t"$Spike"\t"$spikein_5end_reads"\t"$spikein_5end_rate"\t"$coding_reads"\t"$coding_5end_reads"\t"$coding_5end_rate >> ${OUTPUT_NAME}-QC.txt
done
# Counting by featureCounts
featureCounts -T 8 -s 1 --largestOverlap --ignoreDup --primary -a ../src/5end-regions.saf -F SAF -o ${OUTPUT_NAME}_byGene-counts.txt *.bam
mkdir Output_bai && mv *.bam.bai Output_bai
mkdir Output_bam && mv *.bam Output_bam
# Plotting
module load R/4.0.0
module load R_packages/4.0.0
R CMD BATCH --slave --vanilla ../bin/QC-plot.R QC-plot.R.log