-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrefinement.sh
100 lines (75 loc) · 5.62 KB
/
refinement.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/bin/bash
# Step performed after the cluster validation, to remove:
# 1. bad clusters (≥ 10% bad-aligned ORFs)
# 2. shadow clusters (≥ 30% shadow ORFs)
# 3. single rejected ORFs (shadow, spurious and bad-aligned)
# From the validation results we already have a table with the good clusters:
# good_cl.tsv (in the directory with the validation results)
MD=data
GOOD="${MD}"/cluster_validation/good_cl.tsv
OUTDIR="${MD}"/cluster_refinement
if [[ ! -d "${OUTDIR}" ]]; then
mkdir -p "${OUTDIR}"
fi
#Cluster db
CLSTR="${MD}"/mmseqs_clustering/marine_hmp_db_03112017_clu_fa
# Table with info about shadow and spurious ORFs
# Column: orf - length - cl_name - cl_size - prop_shadow - is.shadow - is.spurious
SHSP="${MD}"/spurious_and_shadows/marine_hmp_info_shadow_spurious.tsv.gz
# Remove the clusters with ≥ 30% shadows
join -11 -21 -v2 <(awk '$5>=0.3 && !seen[$2]++{print $2}' <(zcat ${SHSP}) | sort -k1,1) \
<(awk '{print $1}' ${GOOD} | sort -k1,1) > ${OUTDIR}/marine_hmp_good_less30_cl.txt
# retrieve the subdb with the ORFs opf these clusters
~/opt/MMseqs2/bin/mmseqs createsubdb ${OUTDIR}/marine_hmp_good_less30_cl.txt ${CLSTR} ${OUTDIR}/marine_hmp_kept_less30_clu_fa
# retrieve the bad-aligned sequences in these clusters
#awk '$1!="cl_name"' ${OUTDIR}/marine_hmp_good_less30_cl.txt | parallel --progress -j 20 cat "${MD}"/cluster_validation/compositional/results/{}_rejected.txt > ${OUTDIR}/marine_hmp_good_less30_cl_rejected.txt
# Or if we have instaed the file with all the rejected sequences
join -11 -21 <(sort -k1,1 ${OUTDIR}/marine_hmp_good_less30_cl.txt) \
<(sort -k1,1 ${MD}/cluster_validation/rejected_seqs_cl.tsv) > ${OUTDIR}/marine_hmp_good_less30_cl_rejected.txt
# Add the bad-aligned sequences to the spurious and shadows
cat ${OUTDIR}/marine_hmp_good_less30_cl_rejected.txt <(awk '$5<0.3 && $6!="FALSE" || $5<0.3 && $7!="FALSE"{print $1}' ${SHSP}) > ${OUTDIR}/marine_hmp_orfs_to_remove.txt
# add cluster membership
join -11 -21 <(sort ${OUTDIR}/${SMPL}_orfs_to_remove.txt) \
<(awk '{print $2,$1}' <(zcat ${MD}/mmseqs_clustering/marine_hmp_db_03112017_clu_info.tsv.gz) | sort -k1,1) > ${OUTDIR}/marine_hmp_orfs_to_remove_cl.tsv
# remove the single orfs from the clusters with less than 10% bad-aligned ORFs and less than 30% shadows.
mpirun -np 12 ~/opt/ffindex_mg_updt/bin/ffindex_apply_mpi \
${OUTDIR}/marine_hmp_kept_less30_clu_fa ${OUTDIR}/marine_hmp_kept_less30_clu_fa.index \
-i ${OUTDIR}/ffindex_files/marine_hmp_refined_cl_fa.ffindex -d ${OUTDIR}/ffindex_files/marine_hmp_refined_cl_fa.ffdata -- ${PWD}/scripts/C_refinement/remove_orfs.sh ${OUTDIR}/marine_hmp_orfs_to_remove_cl.tsv
# From the refined clusters select the annotated and the not annotated for the following classification steps
# Create tables with new seqs and new clusters for some stats and checking the numbers
join -11 -22 <(awk '{print $1}' ${OUTDIR}/ffindex_files/marine_hmp_refined_cl_fa.ffindex | sort -k1,1) \
<(awk '{print $1,$2}' ${SHSP} | sort -k2,2) > ${OUTDIR}/tmp
join -12 -21 -v1 <(sort -k2,2 ${OUTDIR}/tmp) \
<(sort -k1,1 ${OUTDIR}/marine_hmp_orfs_to_remove.txt) > ${OUTDIR}/marine_hmp_refined_cl.tsv
rm ${OUTDIR}/tmp
# annotated (check those left with no-annotated sequences) --> join with file with all annotated clusters..for annotations
join -11 -21 <(sort -k1,1 ${OUTDIR}/marine_hmp_refined_cl.tsv) \
<(awk '{print $2,$1}' ${MD}/annot_and_clust/marine_hmp_clu_ge10_annot.tsv \
| sort -k1,1) > ${OUTDIR}/marine_hmp_refined_annot_cl.tsv
join -11 -21 <( awk '{print $3,$2}' ${OUTDIR}/marine_hmp_refined_annot_cl.tsv | sort -k1,1) \
<(awk '{print $1,$2,$3}' ${MD}/annot_and_clust/marine_hmp_clu_ge10_annot.tsv \
| sort -k1,1) > ${OUTDIR}/marine_hmp_refined_all_annot_cl.tsv
#find clusters with no annotated members
sort -k1,1 ${OUTDIR}/marine_hmp_refined_all_annot_cl.tsv \
| awk '!seen[$2,$4]++{print $2,$4}' \
| awk 'BEGIN{ getline; id=$1;l1=$1;l2=$2;} { if ($1 != id) { print l1,l2; l1=$1;l2=$2;} else { l2=l2"|"$2;} id=$1;} END { print l1,l2;}' \
| grep -v '|' | awk '$2=="NA"{print $1}' > ${OUTDIR}/marine_hmp_new_unkn_cl.txt
if [[ ! -s ${OUTDIR}/marine_hmp_new_unkn_cl.txt ]]; then
#move the clusters left with no annotated member to the not annotated
join -12 -21 -v1 <(awk '!seen[$1,$2,$3]++' ${OUTDIR}/marine_hmp_refined_all_annot_cl.tsv | sort -k2,2) \
<(sort ${OUTDIR}/marine_hmp_new_unkn_cl.txt) > ${OUTDIR}/marine_hmp_refined_annot_cl.tsv
join -12 -21 <(awk '!seen[$1,$2,$3]++' ${OUTDIR}/marine_hmp_refined_annot_cl.tsv | sort -k2,2) \
<(sort ${OUTDIR}/marine_hmp_new_unkn_cl.txt) > ${OUTDIR}/marine_hmp_refined_noannot_cl.tsv
# not annotated
join -12 -21 <(sort -k2,2 ${OUTDIR}/marine_hmp_refined_cl.tsv) \
<(awk '$4=="noannot"{print $1,$2}' ${GOOD} | sort -k1,1) >> ${OUTDIR}/marine_hmp_refined_noannot_cl.tsv
else
# not annotated
join -12 -21 <(sort -k2,2 ${OUTDIR}/marine_hmp_refined_cl.tsv) \
<(awk '$4=="noannot"{print $1,$2}' ${GOOD} | sort -k1,1) > ${OUTDIR}/marine_hmp_refined_noannot_cl.tsv
fi
# Uisng the cluster ids retrieve the two sub database for annotated clusters and not
ln -s ${OUTDIR}/ffindex_files/marine_hmp_refined_cl_fa.ffindex ${OUTDIR}/ffindex_files/marine_hmp_refined_cl_fa.index
ln -s ${OUTDIR}/ffindex_files/marine_hmp_refined_cl_fa.ffdata ${OUTDIR}/ffindex_files/marine_hmp_refined_cl_fa
~/MMseqs2/bin/mmseqs createsubdb <(awk '!seen[$1]++{print $1}' ${OUTDIR}/marine_hmp_refined_annot_cl.tsv) ${OUTDIR}/ffindex_files/marine_hmp_refined_cl_fa ${OUTDIR}/ffindex_files/marine_hmp_refined_cl_annot_fa
~/MMseqs2/bin/mmseqs createsubdb <(awk '!seen[$1]++{print $1}' ${OUTDIR}/marine_hmp_refined_noannot_cl.tsv) ${OUTDIR}/ffindex_files/marine_hmp_refined_cl_fa ${OUTDIR}/ffindex_files/marine_hmp_refined_cl_noannot_fa