Skip to content

Commit

Permalink
placement
Browse files Browse the repository at this point in the history
  • Loading branch information
balabanmetin committed Dec 7, 2021
1 parent 77c1a98 commit 13937c1
Show file tree
Hide file tree
Showing 4 changed files with 71 additions and 7 deletions.
5 changes: 2 additions & 3 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
input_dir: "data/alignments"
output_dir: "data/output"
input_jplace: "data/apples.jplace"
workdir: "data"

cores: 4

apples_config:
Expand Down
2 changes: 1 addition & 1 deletion install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ if conda info --envs | grep "udance" > /dev/null; then
echo "conda environment udance exists"
else
conda create -y -c bioconda -c conda-forge --channel smirarab --name udance python=3.8 \
pip newick_utils=1.6 setuptools seqkit==0.16.1 scipy dendropy==4.5.2 pandas==1.3.0 snakemake \
pip newick_utils=1.6 setuptools seqkit=2.1.0 scipy dendropy=4.5.2 pandas=1.3.0 snakemake \
raxml=8.2.12 tqdist
source activate udance
conda activate udance
Expand Down
47 changes: 47 additions & 0 deletions uDance/create_concat_alignment.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env bash

# Yueyu Jiang and Metin Balaban

# $1 alignment dir
# $2 bbone
ALNDIR=$1
BBONE=$2
OUTDIR=$3


TMPDIR=`mktemp -d placementruleXXXXXX`

cat $ALNDIR/* | grep ">" | sed "s/>//g" | sort -u > $TMPDIR/alltaxa.txt
touch $TMPDIR/alnpaths.txt
ls $ALNDIR | while read aln; do
# awk instead of wc -L because OSX doesn't have wc -L
sze=`sed -e "s/>\(.*\)/@>\1@/g" $ALNDIR/$aln|tr -d "\n"|tr "@" "\n"|tail -n+2 | head -n 2 | tail -n 1 | awk '{print length}'`
grep ">" $ALNDIR/$aln | sed "s/>//g" > $TMPDIR/thistaxa.txt
comm -23 $TMPDIR/alltaxa.txt $TMPDIR/thistaxa.txt > $TMPDIR/missing.txt
(cat $ALNDIR/$aln; while read tx; do printf ">$tx\n"; dd if=/dev/zero bs=$sze count=1 2>/dev/null | tr '\0' "-" | tr '\0' '-'; printf "\n"; done < $TMPDIR/missing.txt) | gzip -c - > $TMPDIR/$aln.gz
echo $TMPDIR/$aln.gz >> $TMPDIR/alnpaths.txt
done
# concatenate all alignments
seqkit concat --quiet -w 0 $(cat $TMPDIR/alnpaths.txt) | gzip -c - > $TMPDIR/concat.fa.gz
# order it so that bacbone sequences are the first N records
(seqkit grep -f <(nw_labels -I $BBONE) -w 0 --quiet $TMPDIR/concat.fa.gz; seqkit grep -v -f <(nw_labels -I $BBONE) -w 0 --quiet $TMPDIR/concat.fa.gz) | gzip -c - > $TMPDIR/concat_ordered.fa.gz
# Duplicate map is written to OUTDIR.
# TODO should we replace N's with dashes?
gzip -cd $TMPDIR/concat_ordered.fa.gz | seqkit rmdup --quiet -s -i -w 0 -D $OUTDIR/rm_map.txt | gzip -c - > $TMPDIR/concat_dedupe.fa.gz
rm $TMPDIR/concat_ordered.fa.gz
# extract backbone and query alignments from deduplicated alignment
seqkit grep -f <(nw_labels -I $BBONE) $TMPDIR/concat_dedupe.fa.gz -w 0 --quiet -o $OUTDIR/placement/backbone.fa
seqkit grep -v -f <(nw_labels -I $BBONE) $TMPDIR/concat_dedupe.fa.gz -w 0 --quiet -o $OUTDIR/placement/query.fa
rm $TMPDIR/concat_dedupe.fa.gz

grep ">" $OUTDIR/placement/backbone.fa | sed "s/>//g" > $TMPDIR/backbone_id_dedup.txt
mapfile -t < $TMPDIR/backbone_id_dedup.txt

# backbone tree with duplicates removed. This tree might potentially have polytomies.
# Polytomies need to be resolved as its required for APPLES-2 (FastTree).
# TODO resolution using raxml
nw_prune -v $BBONE "${MAPFILE[@]}" > $TMPDIR/backbone.tree
python -c "import treeswift as ts; t=ts.read_tree_newick(\"$TMPDIR/backbone.tree\"); \
[c.resolve_polytomies() for c in t.root.children]; print(t)" > $OUTDIR/placement/backbone.tree
rm -r $TMPDIR
# run_apples.py -s $OUTDIR/placement/backbone.fa -q $OUTDIR/query.fa -t $TMPDIR/backbone_resolved.tree
24 changes: 21 additions & 3 deletions udance.smk
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@

import os

configfile: "config.yaml"
# configfile: "config.yaml"

#include: "workflows/decompose.smk"

outdir=config["output_dir"]
wdr = config["workdir"]
outdir = os.path.join(wdr, "output")
alndir = os.path.join(wdr, "alignments")
bbone = os.path.join(wdr, "backbone.nwk")

localrules: all, clean

Expand All @@ -19,8 +22,23 @@ rule clean:
rm -r {params} data2/count.txt
"""

rule placement:
input: b = bbone, ind = alndir
output: j=os.path.join(outdir,"placement.jplace"), d=directory(os.path.join(outdir,"placement"))
params: o=outdir,
f=config["apples_config"]["filter"],
m=config["apples_config"]["method"], b=config["apples_config"]["base"]
log: out=os.path.join(outdir,"placement/apples2.out"), err=os.path.join(outdir,"placement/apples2.err")
shell:
"""
bash uDance/create_concat_alignment.sh {input.ind} {input.b} {params.o}
run_apples.py -s {output.d}/backbone.fa -q {output.d}/query.fa \
-t {output.d}/backbone.tree -f {params.f} -m {params.m} -b {params.b} -o {output.j} > {log.out} 2> {log.err}
"""


checkpoint decompose:
input: j=config["input_jplace"], ind=config["input_dir"]
input: j=os.path.join(outdir,"placement.jplace"), ind=alndir
output: cst=os.path.join(outdir,"color_spanning_tree.nwk")
params:
size=config["decompose_config"]["cluster_size"],
Expand Down

0 comments on commit 13937c1

Please sign in to comment.