-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclassification.sh
111 lines (77 loc) · 4.79 KB
/
classification.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/bin/bash
set -e
set -x
DIR=data/cluster_refinement/refined
SMPL="marine_hmp"
U="${DIR}"/"${SMPL}"_refined_noannot_cl.tsv
K="${DIR}"/"${SMPL}"_refined_annot_cl.tsv
OUTDIR=data/cluster_classification
OUTDIRU="${OUTDIR}"/unannotated_cl
OUTDIRK="${OUTDIR}"/annotated_cl
NSLOTS="${1}"
#Extract not-annotated clusters from the good refined sbset
awk '!seen[$1]++{print $1}' "${U}" > "${OUTDIRU}"/"${SMPL}"_refined_noannot_cl_ids.txt
#Extract consensus sequnces
"${PWD}"/scripts/Cluster_classification/cluster_ffindex_files.sh "${SMPL}" "${OUTDIRU}"/"${SMPL}"_refined_noannot_cl_ids.txt "${OUTDIRU}"/"${SMPL}"_refined_noannot_cons.fasta
#Search against UniRef90
OUTUR="${OUTDIRU}"/"${SMPL}"_uniref90
mkdir -p "${OUTUR}"
UR=data/DBs/uniref90_db
UR_PROT=data/DBs/uniref90_prot_ids.tsv.gz
"${PWD}"/scripts/Cluster_classification/double_search.sh "${OUTUR}" "${OUTDIRU}"/"${SMPL}"_refined_noannot_cons.fasta "${UR}" "${NSLOTS}" "${OUTUR}"/"${SMPL}"_cons_uniref90.m8
rm "${OUTUR}"/tmp -rf
"${PWD}"/scripts/Cluster_classification/hypo_parser.sh "${OUTUR}"/"${SMPL}"_cons_uniref90.m8 "${UR_PROT}" "${PWD}"/scripts/unknown_grep.tsv 1.0 "${OUTUR}"/"${SMPL}"_cons_uniref_e60_hypo_1.txt
#Parse results and search nohits against NCBI nr
awk '!seen[$1]++{print $1}' "${OUTUR}"/"${SMPL}"_cons_uniref90.m8 > "${OUTUR}"/"${SMPL}"_cons_uniref90_hits.txt
filterbyname.sh in="${OUTDIRU}"/"${SMPL}"_refined_noannot_cons.fasta out="${OUTUR}"/"${SMPL}"_cons_uniref90_nohits.fasta \
names="${OUTUR}"/"${SMPL}"_cons_uniref90_hits.txt include=f ignorejunk
#Search against NR
OUTNR="${OUTDIRU}"/"${SMPL}"_NR
mkdir -p "${OUTNR}"
NR=data/DBs/nr_db
NR_PROT=data/DBs/nr.proteins.tsv.gz
"${PWD}"/scripts/Cluster_classification/double_search.sh "${OUTNR}" "${OUTUR}"/"${SMPL}"_cons_uniref90_nohits.fasta "${NR}" "${NSLOTS}" "${OUTNR}"/"${SMPL}"_cons_nr.m8
rm "${OUTNR}"/tmp -rf
"${PWD}"/scripts/Cluster_classification/hypo_parser.sh "${OUTNR}"/"${SMPL}"_cons_nr.m8 "${NR_PROT}" "${PWD}"/scripts/unknown_grep.tsv 1.0 "${OUTNR}"/"${SMPL}"_cons_nr_e60_hypo_1.txt
#Parse results and define the first categories
awk '!seen[$1]++{print $1}' "${OUTNR}"/"${SMPL}"_cons_nr.m8 > "${OUTNR}"/"${SMPL}"_cons_nr_hits.txt
filterbyname.sh in="${OUTUR}"/"${SMPL}"_cons_uniref90_nohits.fasta out="${OUTNR}"/"${SMPL}"_cons_nr_nohits.fasta \
names="${OUTNR}"/"${SMPL}"_cons_nr_hits.txt include=f ignorejunk
#Environmental unknowns (EUs)
grep '^>' "${OUTNR}"/"${SMPL}"_cons_nr_nohits.fasta | sed 's/^>//' > "${OUTDIR}"/"${SMPL}"_eu_ids.txt
# Knowns without Pfam (KWPs)
#Not-hypo hits from UniRef90
join -11 -21 -v1 <(sort "${OUTUR}"/"${SMPL}"_cons_uniref90_hits.txt) \
<(sort "${OUTUR}"/"${SMPL}"_cons_uniref_e60_hypo_1.txt) > "${OUTUR}"/"${SMPL}"_cons_uniref_not_hypo.txt
#Not-hypo hits from NCBI nr
join -11 -21 -v1 <(sort "${OUTNR}"/"${SMPL}"_cons_nr_hits.txt) \
<(sort -k1,1 <(awk '{print $1}' "${OUTNR}"/"${SMPL}"_cons_nr_e60_hypo_1.txt)) > "${OUTNR}"/"${SMPL}"_cons_nr_not_hypo.txt
cat "${OUTUR}"/"${SMPL}"_cons_uniref_not_hypo.txt "${OUTNR}"/"${SMPL}"_cons_nr_not_hypo.txt > "${OUTDIR}"/"${SMPL}"_kwp_ids.txt
#Knowns and Genomic unknowns
# Retreive the info about the annotations
join -13 -22 <(sort -k3,3 "${K}") <(sort -k2,2 "${PWD}"/"${SMPL}"/annot_and_clust/"${SMPL}"_clu_ge_avg_size_annot.tsv) > "${OUTDIRK}"/tmp
# Parse columns
awk '{print $2"\t"$3"\t"$4"\t"$1"\t"$6"\t"$8"\t"$9}' "${OUTDIRK}"/tmp | gzip > "${OUTDIRK}"/"${SMPL}"_refined_annot_cl_pfam.tsv.gz
rm "${OUTDIRK}"/tmp
#Load in R and retrieve general table with architectures plus DUFs and PFs
KANNOT="${OUTDIRK}"/"${SMPL}"_cluster_pfam_domain_architect.tsv
R CMD BATCH "--args ${KANNOT} ${OUTDIRK}" "${PWD}"/scripts/Cluster_classification/pfam_domain_architect_ref.r "${OUTDIRK}"/pfam_domain_architecture.Rout
# Add the DUFs annotated clusters to the GU
cat <(awk 'NR>1{print $1}' "${OUTDIRK}"/kept_DUFs.tsv) \
"${OUTUR}"/"${SMPL}"_cons_uniref_e60_hypo_1.txt \
"${OUTNR}"/"${SMPL}"_cons_nr_e60_hypo_1.txt > "${OUTDIR}"/"${SMPL}"_gu_ids.txt
# Retrieve the knowns
awk 'NR>1{print $1}' "${OUTDIR}"/kept_PF.tsv > "${OUTDIR}"/"${SMPL}"_k_ids.txt
### Save files with annotations
# Add annotation info:
cat "${OUTUR}"/"${SMPL}"_cons_uniref90_hypo_char \
"${OUTUR}"/"${SMPL}"_cons_nr_hypo_char > "${OUTUR}"/"${SMPL}"_uniref90_nr_annotations.tsv
## KWP annotations
join -11 -21 <(sort -k1,1 "${OUTDIR}"/"${SMPL}"_kwp_ids.txt ) \
<(sort -k1,1 "${OUTUR}"/"${SMPL}"_uniref90_nr_annotations.tsv) > "${OUTDIR}"/"${SMPL}"_kwp_annotations.tsv
## GU annotations
join -11 -21 <(sort -k1,1 "${OUTDIR}"/"${SMPL}"_gu_ids.txt ) \
<(sort -k1,1 ${DIR}/noannot_uniref_nr_annotations.tsv) > ${DIR}/gu_annotations.tsv
awk 'NR>1 && $9=="DUF"{print $1,"PFAM","0.0",$6}' "${KANNOT}" >> "${OUTDIR}"/"${SMPL}"_gu_annotations.tsv
# Retrieve the Knowns cluster set
awk 'NR>1 && $9=="PF"{print $1,"PFAM","0.0",$6}' "${KANNOT}" >> "${OUTDIR}"/"${SMPL}"_k_annotations.tsv