-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpipeline.xml
executable file
·135 lines (128 loc) · 10.4 KB
/
pipeline.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
<?xml version="1.0" encoding="UTF-8"?>
<pipeline xmlns="http://www.sing-group.org/compi/pipeline-1.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<version>2.0.0-SNAPSHOT</version>
<params>
  <!-- Base directory: the tasks prefix it to every input and output path they use. -->
  <param name="working_dir" shortName="working_dir" global="true">The working directory of the pipeline; input and output paths are resolved relative to it.</param>

  <!-- Executables and container images -->
  <param name="hisat2_index" shortName="hisat2_index" defaultValue="/opt/hisat2-2.0.5/hisat2-build" global="true">The executable of the HISAT2 index command.</param>
  <param name="genome_index_image" shortName="genome_index_image" defaultValue="singgroup/dewe:1.3.0" global="true">The Docker image for the genome-index task.</param>
  <param name="hisat2" shortName="hisat2" defaultValue="/opt/hisat2-2.0.5/hisat2" global="true">The executable of the HISAT2 command.</param>
  <param name="alignment_image" shortName="alignment_image" defaultValue="singgroup/dewe:1.3.0" global="true">The Docker image for the alignment task.</param>
  <param name="samtools" shortName="samtools" defaultValue="/opt/samtools-1.3.1/samtools" global="true">The executable of the samtools command.</param>
  <param name="samtools_image" shortName="samtools_image" defaultValue="singgroup/dewe:1.3.0" global="true">The Docker image for the samtools task.</param>
  <param name="htseq_count" shortName="htseq_count" defaultValue="htseq-count" global="true">The executable of the htseq-count command.</param>
  <param name="htseq_count_image" shortName="htseq_count_image" defaultValue="pegi3s/htseq:2.0.9" global="true">The HTSeq Docker image</param>
  <param name="feature_counts" shortName="feature_counts" defaultValue="featureCounts" global="true">The executable of the featureCounts command.</param>
  <!-- The shortName below used to be a corrupted paste ("htfeature_counts_imageseq_count_image"); it now matches the parameter name like the other image parameters. -->
  <param name="feature_counts_image" shortName="feature_counts_image" defaultValue="pegi3s/feature-counts:2.0.0" global="true">The featureCounts Docker image</param>
  <param name="delite_image" shortName="delite_image" defaultValue="bioinfolabcro/delite:v.1.2.2" global="true">The DElite Docker image</param>
  <param name="sva_image" shortName="sva_image" defaultValue="pegi3s/r_sva:3.54.0" global="true">The R SVA Docker image</param>
  <param name="pca" shortName="pca" defaultValue="pca" global="true">The executable of the pca command.</param>
  <param name="pca_image" shortName="pca_image" defaultValue="singgroup/sklearn_pca" global="true">The scikit-learn PCA Docker image</param>
  <param name="pca_batch_correction_image" shortName="pca_batch_correction_image" defaultValue="singgroup/sklearn_pca" global="true">The scikit-learn PCA Docker image for the pca-batch-correction task.</param>

  <!-- User input files and directories -->
  <param name="genome_fasta" shortName="genome_fasta" global="true">The reference genome.</param>
  <param name="reference_annotation" shortName="reference_annotation" global="true">The path to the reference GTF file for the analysis.</param>
  <param name="genome_index_dir" shortName="genome_index_dir" defaultValue="compi/hisat2-indexes" global="true">The directory where genome indexes should be created.</param>
  <param name="genome_index" shortName="genome_index" defaultValue="genome_index" global="true">The name for the genome index.</param>
  <param name="user_contrasts_file" shortName="user_contrasts_file" defaultValue="config/contrasts.tsv" global="true">The path to the contrasts file provided by the user (relative to the working directory).</param>
  <param name="batch_correction" shortName="batch_correction" defaultValue="none" global="true">The factor (column in metadata) for batch correction (or "interaction" to use a combination of all of them). If "none", no batch correction is applied.</param>
  <param name="counts_method" shortName="counts_method" defaultValue="featurecounts" global="true">The software for counting reads, either featurecounts or htseq.</param>
  <param name="samples_dir" shortName="samples_dir" defaultValue="samples" global="true">The directory containing the samples reads (relative to the working directory).</param>
  <param name="metadata_file" shortName="metadata_file" defaultValue="metadata.tsv" global="true">The metadata file name (relative to the samples directory).</param>

  <!-- Internal files and directories -->
  <param name="samples_files_list" shortName="samples_files_list" defaultValue="compi/samples.txt" global="true">The file where the list of samples is stored (relative to the working directory).</param>
  <param name="contrasts_file" shortName="contrasts_file" defaultValue="compi/contrasts.tsv" global="true">The path to the contrasts file used internally by the pipeline (relative to the working directory).</param>
  <param name="samples_alignment_dir" shortName="samples_alignment_dir" defaultValue="compi/aligned-reads" global="true">The directory where aligned samples reads should be placed.</param>
  <param name="samples_htseqcount_dir" shortName="samples_htseqcount_dir" defaultValue="compi/htseq-count" global="true">The directory where htseq-count results should be placed.</param>
  <param name="samples_feature_counts_dir" shortName="samples_feature_counts_dir" defaultValue="compi/feature-counts" global="true">The directory where feature-counts results should be placed.</param>
  <!-- NOTE(review): shortName "deas_dir" differs from the parameter name "dea_dir"; left unchanged because it is part of the command-line interface. Confirm whether it is intentional. -->
  <param name="dea_dir" shortName="deas_dir" defaultValue="compi/dea" global="true">The directory where DEA results should be placed.</param>
  <param name="all_counts_dir" shortName="all_counts_dir" defaultValue="compi/all_counts" global="true">The directory where the joined counts table and the PCA results should be placed.</param>
  <param name="scripts_dir" shortName="sd" defaultValue="/scripts" global="true">Path of the directory containing the pipeline scripts.</param>
</params>
<tasks>
<!-- First task of the pipeline; its code lives in the external script referenced by src. -->
<task src="tasks/initialization.sh" id="initialization"/>
<!-- Runs the HISAT2 index builder on the reference genome, writing the index files under genome_index_dir with the genome_index base name. -->
<task
  id="genome-index"
  after="initialization"
>
  ${hisat2_index} ${working_dir}/${genome_fasta} ${working_dir}/${genome_index_dir}/${genome_index}
</task>
<!-- One iteration per sample name printed by list_samples.sh: aligns the sample's _1/_2 FASTQ pair with HISAT2 and writes one SAM file per sample. -->
<foreach
  id="alignment"
  of="command"
  in="${scripts_dir}/list_samples.sh ${working_dir}/${samples_dir}/${metadata_file} ${working_dir}/${samples_files_list}"
  as="sample_name"
  after="genome-index"
>
  ${hisat2} --dta -x ${working_dir}/${genome_index_dir}/${genome_index} -1 ${working_dir}/${samples_dir}/${sample_name}_1.fastq.gz -2 ${working_dir}/${samples_dir}/${sample_name}_2.fastq.gz -S ${working_dir}/${samples_alignment_dir}/${sample_name}.sam
</foreach>
<!-- One iteration per sample: sorts the sample's SAM alignment with samtools and writes it as a BAM file next to it.
     The asterisk in the after attribute binds each iteration to the matching iteration of the alignment loop. -->
<foreach
  id="samtools"
  of="command"
  in="${scripts_dir}/list_samples.sh ${working_dir}/${samples_dir}/${metadata_file} ${working_dir}/${samples_files_list}"
  as="sample_name"
  after="*alignment"
>
  ${samtools} sort -o ${working_dir}/${samples_alignment_dir}/${sample_name}.bam ${working_dir}/${samples_alignment_dir}/${sample_name}.sam
</foreach>
<!-- One iteration per sample, executed only when counts_method is 'htseq': runs htseq-count on the sample's BAM file against the reference annotation and redirects the output to a per-sample TSV file. -->
<foreach
  id="htseq-count"
  of="command"
  in="${scripts_dir}/list_samples.sh ${working_dir}/${samples_dir}/${metadata_file} ${working_dir}/${samples_files_list}"
  as="sample_name"
  after="*samtools"
  if="[[ ${counts_method} == 'htseq' ]]"
>
  ${htseq_count} --format bam --order pos --mode intersection-strict --stranded reverse --minaqual 1 --type exon --idattr gene_id ${working_dir}/${samples_alignment_dir}/${sample_name}.bam ${working_dir}/${reference_annotation} > ${working_dir}/${samples_htseqcount_dir}/${sample_name}.tsv
</foreach>
<!-- One iteration per sample, executed only when counts_method is 'featurecounts': runs featureCounts on the sample's BAM file against the reference annotation and writes a per-sample .txt file. -->
<foreach
  id="feature-counts"
  of="command"
  in="${scripts_dir}/list_samples.sh ${working_dir}/${samples_dir}/${metadata_file} ${working_dir}/${samples_files_list}"
  as="sample_name"
  after="*samtools"
  if="[[ ${counts_method} == 'featurecounts' ]]"
>
  ${feature_counts} -F GTF -a ${working_dir}/${reference_annotation} -o ${working_dir}/${samples_feature_counts_dir}/${sample_name}.txt -t exon -g gene_id -s 2 -Q 1 -B -p --primary ${working_dir}/${samples_alignment_dir}/${sample_name}.bam
</foreach>
<!-- One iteration per sample, executed only when counts_method is 'featurecounts'.
     Step 1 (awk): skips the first line of the featureCounts output and keeps the first and last fields of every remaining line, tab-separated, in a *_unsorted.tsv file.
     Step 2 (tail and sort): drops the first remaining row and sorts the rest.
     NOTE(review): the second command was corrupted in the source (it broke off inside the working_dir variable reference, leaving an unterminated variable expansion). The input file, the sort and the output file name below are a reconstruction; confirm the expected output name against tasks/join_all.sh. -->
<foreach
  id="feature-counts-postprocessing"
  of="command"
  in="${scripts_dir}/list_samples.sh ${working_dir}/${samples_dir}/${metadata_file} ${working_dir}/${samples_files_list}"
  as="sample_name"
  after="*feature-counts"
  if="[[ ${counts_method} == 'featurecounts' ]]"
>
  awk 'BEGIN{OFS="\t"} NR>1 {print $1, $NF}' ${working_dir}/${samples_feature_counts_dir}/${sample_name}.txt > ${working_dir}/${samples_feature_counts_dir}/${sample_name}_unsorted.tsv
  tail -n +2 ${working_dir}/${samples_feature_counts_dir}/${sample_name}_unsorted.tsv | sort > ${working_dir}/${samples_feature_counts_dir}/${sample_name}.tsv
</foreach>
<!-- Runs after both counting branches (htseq-count and the featureCounts post-processing); its code lives in the external script referenced by src. -->
<task src="tasks/join_all.sh" id="join-all" after="htseq-count, feature-counts-postprocessing"/>
<!-- Runs the pca command on counts.tsv from all_counts_dir, grouping by the "class" metadata column and writing to the pca subdirectory.
     The trailing command substitution appends the shape option only when batch_correction is not "none".
     The body is wrapped in CDATA because the shell code contains ampersands. -->
<task
  id="pca"
  after="join-all"
><![CDATA[
  ${pca} ${working_dir}/${all_counts_dir}/counts.tsv --transpose --metadata ${working_dir}/${samples_dir}/${metadata_file} --group class --output_dir=${working_dir}/${all_counts_dir}/pca $([ "${batch_correction}" != "none" ] && echo "--shape ${batch_correction}")
]]></task>
<!-- Executed only when batch_correction is not 'none'; its code lives in the external script referenced by src. -->
<task id="batch-correction-all" after="pca" if="[[ ${batch_correction} != 'none' ]]" src="tasks/batch_correction_all.sh"/>
<!-- Executed only when batch_correction is not 'none': runs the pca command again, shaping points by the batch_correction column and writing to the pca_batch_corrected subdirectory.
     NOTE(review): this reads the same counts.tsv path as the pca task; presumably tasks/batch_correction_all.sh rewrites that file in place. Confirm against the script. -->
<task id="pca-batch-correction" after="batch-correction-all" if="[[ ${batch_correction} != 'none' ]]">
  ${pca} ${working_dir}/${all_counts_dir}/counts.tsv --transpose --metadata ${working_dir}/${samples_dir}/${metadata_file} --group class --shape ${batch_correction} --output_dir=${working_dir}/${all_counts_dir}/pca_batch_corrected
</task>
<!-- Runs after pca-batch-correction; its code lives in the external script referenced by src. -->
<task src="tasks/prepare_deas.sh" id="prepare-deas" after="pca-batch-correction"/>
<!-- One iteration per entry listed by ls in the DEA directory.
     Each iteration reads the reference condition from the entry's reference.txt, then runs the DElite image with the entry mounted at /home/DElite.
     "reference" is a shell variable set inside the task body, not a pipeline parameter. -->
<foreach
  id="delite"
  of="command"
  in="ls ${working_dir}/${dea_dir}"
  as="contrast_dir"
  after="prepare-deas"
>
  reference=$(cat ${working_dir}/${dea_dir}/${contrast_dir}/reference.txt)
  docker run --rm -v "${working_dir}/${dea_dir}/${contrast_dir}:/home/DElite" ${delite_image} \
  counts_file=counts.tsv \
  metadata_file=metadata.tsv \
  condition=class \
  reference_condition=${reference}
</foreach>
</tasks>
<!-- Human-readable descriptions of the pipeline tasks, one per task id, listed in execution order. -->
<metadata>
  <task-description id="initialization">Initializes the directories for the pipeline outputs and pulls the Docker images.</task-description>
  <task-description id="genome-index">Creates the genome index for the reference genome.</task-description>
  <task-description id="alignment">Aligns each pair of FASTQ files using HISAT2.</task-description>
  <task-description id="samtools">Sorts the HISAT2 alignments and converts them into BAM format using samtools.</task-description>
  <task-description id="htseq-count">Runs htseq-count to obtain the raw counts for subsequent analyses.</task-description>
  <task-description id="feature-counts">Runs featureCounts to obtain the raw counts for subsequent analyses.</task-description>
  <task-description id="feature-counts-postprocessing">Extracts the gene identifier and count columns from each featureCounts output.</task-description>
  <task-description id="join-all">Joins the per-sample counts files.</task-description>
  <task-description id="pca">Runs a PCA on the joined counts.</task-description>
  <task-description id="batch-correction-all">Applies the batch correction (only when batch_correction is not "none").</task-description>
  <task-description id="pca-batch-correction">Runs a PCA after the batch correction (only when batch_correction is not "none").</task-description>
  <task-description id="prepare-deas">Creates one DEA folder for each specified contrast.</task-description>
  <task-description id="delite">Runs DElite in each DEA folder.</task-description>
</metadata>
</pipeline>