-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstart.sh
374 lines (314 loc) · 9.43 KB
/
start.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#!/bin/bash
# Export all
set -a
# What date and time is it?
DATESTAMP=$(date)
# Current work dir
CWD=$(pwd)
NAME="AutoCoEv" # script name
VER="0.021beta" # version
# Load settings
. $CWD/settings.conf
# Load functions
. $CWD/functions/databases.sh
. $CWD/functions/retrieval.sh
. $CWD/functions/blast.sh
. $CWD/functions/msa.sh
. $CWD/functions/trees.sh
. $CWD/functions/pairing.sh
. $CWD/functions/caps.sh
. $CWD/functions/results.sh
. $CWD/functions/xml.sh
. $CWD/functions/check.sh
# User provided input
echo -e "\n\e[46mWelcome to $NAME version $VER!\e[49m\n"
# Check for binaries
echo -e "Checking for executables in: \e[96m$EXEPATH\e[39m"
check_bin
echo ""
read -p "Change working dir or press ENTER to accept: " -e -i $TMP TMP
echo -e "\n\e[96m$TMP\e[39m\n"
echo -e "Do initial preparations:"
# Help from:
# https://unix.stackexchange.com/questions/293340/bash-how-can-i-re-display-selection-menu-after-a-selection-is-chosen-and-perfo
PS3="Your choice: "
PREPOPT=( "Download databases"
"Extract databases"
"Index FASTA database"
"Trim gene_xrefs database ($ORGANISM)"
"Trim OG2genes database ($LEVEL)"
"[DONE AND CONTINUE]" )
select prep in "${PREPOPT[@]}" ; do
case $prep in
"Download databases")
download_db
echo -e "\nDone with 1)"
;;
"Extract databases")
extract_db
echo -e "\nDone with 2)"
;;
"Index FASTA database")
index_fa
echo -e "\nDone with 3)"
;;
"Trim gene_xrefs database ($ORGANISM)")
trim_gene_xrefs
echo -e "\nDone with 4)"
;;
"Trim OG2genes database ($LEVEL)")
trim_OG2genes
echo -e "\nDone with 5)"
;;
"[DONE AND CONTINUE]")
echo -e "Continue...\n"
break 2
;;
*)
echo "Invalid option"
;;
esac
REPLY=
done
cd $CWD
read -p "Gene XRefs database: " -e -i $GENEXREF GENEXREF
echo -e "\n\e[96m$GENEXREF\e[39m"
head $DTB/$GENEXREF
echo ""
read -p "OrthoGroup 2 genes database: " -e -i $OG2GENES G2GENES
echo -e "\n\e[96m$OG2GENES\e[39m"
head $DTB/$OG2GENES
echo ""
read -p "All FASTA sequences database: " -e -i $ALLFASTA ALLFASTA
echo -e "\n\e[96m$ALLFASTA\e[39m"
seqkit head $DTB/$ALLFASTA
echo -e "\n\e[96m$ALLFASTA.index\e[39m"
head $DTB/$ALLFASTA.index
echo ""
read -p "Change proteins list or press ENTER to accept: " -e -i $PROTEIN PROTEIN
echo -e "\n\e[96m$PROTEIN\e[39m"
head $PROTEIN
echo ""
read -p "Change species list or press ENTER to continue: " -e -i $SPECIES SPECIES
echo -e "\n\e[96m$SPECIES\e[39m"
head $SPECIES
echo ""
read -p "Use maximum available CPU threads?: " -e -i $THREADS THREADS
echo -e "CPU threads to be used are: \e[92m$THREADS\e[39m\n"
echo -e "We achieve parallelization via \e[92mGNU/Parallel\e[39m"
echo -e "Run 'parallel --citation' once to silence its citation notice.\n\n"
# Output a summary of settings
echo -e "Work directory: \e[92m${TMP}\e[39m\n"
echo -e "UniProt reference sequences are from:.........\e[92m$ORGANISM\e[39m"
echo -e "Reverse BLAST identity (%) cutoff:............\e[92m$PIDENT\e[39m"
echo -e "Reverse BLAST gaps (%) cutoff:................\e[92m$PGAPS\e[39m"
echo -e "MSAs for PhyML and/or CAPS created by:........\e[92m$MSAMETHOD\e[39m"
echo -e "Process MSAs before CAPS run (Gblocks)?.......\e[92m$GBLOCKS\e[39m"
echo -e "Run PhyML on processed MSAs (Gblocks)?........\e[92m$PHYMLGBLOCKS\e[39m"
echo -e "PhyML user-specified options:.................\e[92m$PHYMLOPTIONS\e[39m"
echo -e "Root the PhyML produced trees?................\e[92m$TREESROOT\e[39m"
echo -e "Do we use external guide tree for PhyML?......\e[92m$PHYMLGUIDE\e[39m"
echo -e "Trees to be used with CAPS:...................\e[92m$TREESCAPS\e[39m"
echo -e "Minimum number of common species in a pair:...\e[92m$MINCOMMONSPCS\e[39m"
echo -e "CAPS alpha-value cutoff at runtime:...........\e[92m$ALPHA\e[39m"
echo -e "CAPS bootstrap value at runtime:..............\e[92m$BOOT\e[39m"
echo -e "Postrun bootstrap threshold cutoff:...........\e[92m$RESBOOT\e[39m"
echo -e "Alignment gaps postrun cutoff:................\e[92m$RESGAPS\e[39m"
echo -e "Column identity postrun cutoff:...............\e[92m$RESIDEN\e[39m"
echo -e "Postrun P-value correlation cutoff:...........\e[92m$PVALUE\e[39m"
echo -e "\n"
echo -e "Select a step:"
mkdir -p $TMP/tsv
cd $TMP
# Menu. Great help from:
# https://askubuntu.com/questions/1705/how-can-i-create-a-select-menu-in-a-shell-script
PS3="Your choice: "
SEQOPT=( "Pair UniProt <-> OrthoDB <-> OGuniqueID"
"Prepare orthologues list (level: $LEVEL)"
"Get FASTA sequences of all orthologues"
"Download sequences from UniProt (organism: $ORGANISM)"
"BLAST orthologues against UniProt sequence ($ORGANISM, detailed: $DETBLAST)"
"Get FASTA sequences of the best hits (identity: $PIDENT; gaps: $PGAPS)"
"[MSA] Create MSA with selected method ($MSAMETHOD)"
"[TRE] Prepare trees ($TREESCAPS, $MSAMETHOD, $PHYMLGBLOCKS, $PHYMLGUIDE, $TREESROOT)"
"[RUN] Create pairs ($PAIRINGMANNER)"
"[RUN] CAPS run (alpha: $ALPHA, $MSAMETHOD, $GBLOCKS, $TREESCAPS)"
"[RES] Inspect CAPS results"
"[RES] Generate columns stats"
"[XML] Process CAPS results"
"[Exit script]" )
select opt in "${SEQOPT[@]}" ; do
case $opt in
"Pair UniProt <-> OrthoDB <-> OGuniqueID")
create_tsv_files
pair_uniprot_vs_orthodb
cd $TMP/$ORTHO/
FOUND=$( ls ./ )
parallel $CORESCAPS extract_oguniqueid ::: $FOUND
parallel $CORESCAPS report_gen ::: $FOUND
cd ..
summary_clarify
report_duplicates
headers_insert
echo -e "\nDone with 1)"
;;
"Prepare orthologues list (level: $LEVEL)")
sed "/$ORGANISM/d" $CWD/$SPECIES | \
while read -r taxidNCBI latinName; do
cd $TMP/$ORTHO/
FOUND=$( ls ./ )
parallel $CORESCAPS prepare_orthologues_list ::: $FOUND
cd ..
done
cd $TMP/$ORTHO/
FOUND=$( ls ./ )
parallel $CORESCAPS no_homologues_check ::: $FOUND
cd ..
echo -e "\nDone with 2)"
;;
"Get FASTA sequences of all orthologues")
cd $TMP/$ORTHO/
FOUND=$( ls ./ )
parallel $CORESCAPS get_ortho_fasta ::: $FOUND
cd ..
echo -e "\nDone with 3)"
;;
"Download sequences from UniProt (organism: $ORGANISM)")
uniprot_download
echo -e "\nDone with 4)"
;;
"BLAST orthologues against UniProt sequence ($ORGANISM, detailed: $DETBLAST)")
cd $TMP/$ORTHO/
FOUND=$( ls ./ )
parallel $CORESCAPS blast_db_prep ::: $FOUND
parallel $CORESCAPS reciprocal_blast ::: $FOUND
cd ..
echo -e "\nDone with 5)"
;;
"Get FASTA sequences of the best hits (identity: $PIDENT; gaps: $PGAPS)")
cd $TMP/$ORTHO/
FOUND=$( ls ./ )
parallel $CORESCAPS best_hits ::: $FOUND
cd ..
cd $TMP/$GETFA/
FOUND=$( ls ./ )
parallel $CORESCAPS species_names ::: $FOUND
cd ..
headers_blast
echo -e "\nDone with 6)"
;;
"[MSA] Create MSA with selected method ($MSAMETHOD)")
mkdir -p $TMP/$MSA/$MSAMETHOD/
cd $TMP/$GETFA
ORTHMSA=$( ls ./ )
if [ "$MSAMETHOD" = "muscle" ]; then
parallel $CORESMUSCLE musclefn ::: "$ORTHMSA"
elif [ "$MSAMETHOD" = "prank" ]; then
parallel $CORESPRANK prankfn ::: "$ORTHMSA"
elif [ "$MSAMETHOD" = "mafft" ] || [ "$MSAMETHOD" = "mafft-linsi" ] || [ "$MSAMETHOD" = "mafft-ginsi" ] || [ "$MSAMETHOD" = "mafft-einsi" ] || [ "$MSAMETHOD" = "mafft-fftns" ] || [ "$MSAMETHOD" = "mafft-fftnsi" ]; then
mafftfn
else
echo "[ERROR] Specify MSA method properly!"
fi
cd ..
msa_process
echo -e "\nDone with 7)"
;;
"[TRE] Prepare trees ($TREESCAPS, $MSAMETHOD, $PHYMLGBLOCKS, $PHYMLGUIDE, $TREESROOT)")
if [ "$TREESCAPS" = "auto" ]; then
echo -e "CAPS will generate its own trees. Skipping..."
elif [ "$TREESCAPS" = "external" ]; then
ext_trees
elif [ "$TREESCAPS" = "phyml" ] && [ "$PHYMLGUIDE" = "exguide" ]; then
phyml_guide
phyml_prep
parallel $CORESPHYML phymlfn ::: "$ORTHPHY"
phyml_process
elif [ "$TREESCAPS" = "phyml" ] && [ "$PHYMLGUIDE" = "noguide" ]; then
phyml_prep
parallel $CORESPHYML phymlfn ::: "$ORTHPHY"
phyml_process
else
echo -e "Check your tree settings!"
fi
echo -e "\nDone with 8)"
;;
"[RUN] Create pairs ($PAIRINGMANNER)")
if [ "$PAIRINGMANNER" = "all" ]; then
pair_msa
pair_tree
elif [ "$PAIRINGMANNER" = "defined" ]; then
pair_defined_msa
pair_tree
else
echo -e "Check your pairing settings!"
fi
echo -e "\nDone with 9"
;;
"[RUN] CAPS run (alpha: $ALPHA, $MSAMETHOD, $GBLOCKS, $TREESCAPS)")
split_dirs
caps_set
for folder in $TMP/$CAPSM/* ; do
cd $folder
echo -e "Processing \e[92m${folder}\e[39m"
echo "$DATESTAMP ${folder}" >> $TMP/progress-$ALPHA-$MSAMETHOD-$GBLOCKS-$TREESCAPS.txt
PAIRLIST=$(ls)
parallel $CORESCAPS --progress capsfn ::: "$PAIRLIST"
echo -e "Done in \e[92m${folder}\e[39m"
echo "" >> $TMP/progress-$ALPHA-$MSAMETHOD-$GBLOCKS-$TREESCAPS.txt
cd ..
done
echo -e "\nDone with 10"
;;
"[RES] Inspect CAPS results")
cd $TMP/$CAPSM/
for folder in * ; do
cd $folder
SUBPART=$( ls ./ )
parallel $CORESCAPS caps_inspect ::: "$SUBPART"
cd ..
done
echo -e "\n\e[92mResults inspections done!\e[39m\n"
cd $TMP/$RESULTS/coev/
for folder in * ; do
cd $folder
SUBPART=$( ls ./ )
parallel $CORESCAPS results_cleanup ::: "$SUBPART"
cd ..
done
echo -e "\nDone with 11"
;;
"[RES] Generate columns stats")
cd $TMP/$RESULTS/coev/
for folder in * ; do
cd $folder
SUBPART=$( ls ./ )
parallel $CORESCAPS columns_stats ::: "$SUBPART"
cd ..
done
cd $TMP/$RESULTS/coev/
for folder in * ; do
cd $folder
SUBPART=$( ls ./ )
parallel $CORESCAPS mean_pval ::: "$SUBPART"
cd ..
done
summary_cleanup
echo -e "\nDone with 12"
;;
"[XML] Process CAPS results")
node_gen_list
make_coev_inter_xml
make_pos_corr_xml
echo -e "\nDone with 13"
;;
"[Exit script]")
echo "Quitting..."
exit
;;
*)
echo "Invalid option"
;;
esac
REPLY=
done