Added fastqc

Sydney-Informatics-Hub · Dec 18, 2020 · 29ab874 · 29ab874
1 parent 6fe4672
commit 29ab874
Show file tree

Hide file tree

Showing 3 changed files with 143 additions and 0 deletions.
diff --git a/fastqc.sh b/fastqc.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+#########################################################
+# 
+# Platform: NCI Gadi HPC
+# Description: run fastQC and unpack output
+# Usage: this script is executed by fastqc_run_parallel.pbs
+# Author: Cali Willet
+# cali.willet@sydney.edu.au
+# Date last modified: 24/07/2020
+#
+# If you use this script towards a publication, please acknowledge the
+# Sydney Informatics Hub (or co-authorship, where appropriate).
+#
+# Suggested acknowledgement:
+# The authors acknowledge the scientific and technical assistance 
+# <or e.g. bioinformatics assistance of <PERSON>> of Sydney Informatics
+# Hub and resources and services from the National Computational 
+# Infrastructure (NCI), which is supported by the Australian Government
+# with access facilitated by the University of Sydney.
+# 
+#########################################################
+
+fastq=$1
+
+fastqc --extract $fastq -o FastQC
+
+# Optional: 
+#    -a              Specifies a non-default file which contains the list of
+#    --adapters      adapter sequences which will be explicity searched against
+#                    the library. The file must contain sets of named adapters
+#                    in the form name[tab]sequence.  Lines prefixed with a hash
+#                    will be ignored.
diff --git a/fastqc_make_input.sh b/fastqc_make_input.sh
@@ -0,0 +1,39 @@
+#! /bin/bash
+
+#########################################################
+# 
+# Platform: NCI Gadi HPC
+# Description: make input for fastQC
+# Usage: bash fastqc_make_inputs.sh
+# Details:
+# 	Create input file for fastqc_run_parallel.pbs
+# 	Sort by input file size largest to smallest 
+# 	VIP !!! Check that the find pattern matches, eg fastq/fq, gz vs not gz
+# 	VIP !!! Manually verify that the inputs list is the same line length
+# 	as the number of files in the fastq directory
+
+# Author: Cali Willet
+# cali.willet@sydney.edu.au
+# Date last modified: 24/07/2020
+#
+# If you use this script towards a publication, please acknowledge the
+# Sydney Informatics Hub (or co-authorship, where appropriate).
+#
+# Suggested acknowledgement:
+# The authors acknowledge the scientific and technical assistance 
+# <or e.g. bioinformatics assistance of <PERSON>> of Sydney Informatics
+# Hub and resources and services from the National Computational 
+# Infrastructure (NCI), which is supported by the Australian Government
+# with access facilitated by the University of Sydney.
+# 
+#########################################################
+
+
+inputs=./Inputs/fastqc.inputs
+
+find ./Fastq -name "*fastq*" -print | xargs ls -l | sort -rnk 5 | awk '{print $9}'> $inputs
+
+tasks=`wc -l < $inputs`
+
+printf "Number of fastQC tasks to run: ${tasks}\n"
+
diff --git a/fastqc_run_parallel.pbs b/fastqc_run_parallel.pbs
@@ -0,0 +1,71 @@
+#!/bin/bash
+
+#########################################################
+# 
+# Platform: NCI Gadi HPC
+# Description: run fastQC in parallel over all fastq in Fastq
+# Usage: first, run 'bash fastqc_make_input.sh, 
+# then execute this script with 'qsub fastqc_run_parallel.pbs'
+# Details:
+#       Run fastqc on all fastq, and unzip the fastqc dir for 
+# 	text read of a subsequent step. FastQC can use some RAM
+#	so its best to run this on hugemem. Inputs are sorted by size
+#	so you can juggle node v walltime requests if there is a large 
+# 	size discrepancy among your fastq. 
+#
+# Author: Cali Willet
+# cali.willet@sydney.edu.au
+# Date last modified: 18/12/2020
+#
+# If you use this script towards a publication, please acknowledge the
+# Sydney Informatics Hub (or co-authorship, where appropriate).
+#
+# Suggested acknowledgement:
+# The authors acknowledge the scientific and technical assistance 
+# <or e.g. bioinformatics assistance of <PERSON>> of Sydney Informatics
+# Hub and resources and services from the National Computational 
+# Infrastructure (NCI), which is supported by the Australian Government
+# with access facilitated by the University of Sydney.
+# 
+#########################################################
+
+
+#PBS -P <project>
+#PBS -N fqc
+#PBS -l walltime=02:00:00
+#PBS -l ncpus=48
+#PBS -l mem=1500GB
+#PBS -q hugemem
+#PBS -W umask=022
+#PBS -l wd
+#PBS -o ./Logs/fastqc-3286.o
+#PBS -e ./Logs/fastqc-3286.e
+#PBS -lstorage=scratch/<project>
+
+module load openmpi/4.0.2
+module load nci-parallel/1.0.0
+module load fastqc/0.11.7
+
+set -e
+
+SCRIPT=./Scripts/fastqc.sh
+INPUTS=./Inputs/fastqc.inputs 
+
+mkdir -p FastQC
+
+NCPUS=1
+
+#########################################################
+# Do not edit below this line  
+#########################################################
+
+if [[ $PBS_QUEUE =~ bw-exec ]]; then CPN=28; else CPN=48; fi
+M=$(( CPN / NCPUS )) #tasks per node
+
+sed "s|^|${SCRIPT} |" ${INPUTS} > ${PBS_JOBFS}/input-file
+
+mpirun --np $((M * PBS_NCPUS / CPN)) \
+        --map-by node:PE=${NCPUS} \
+        nci-parallel \
+        --verbose \
+        --input-file ${PBS_JOBFS}/input-file