Skip to content

Commit

Permalink
initial snakemake pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
balabanmetin committed Dec 1, 2021
1 parent 311d972 commit 63e671f
Show file tree
Hide file tree
Showing 9 changed files with 251 additions and 136 deletions.
23 changes: 23 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
input_dir: "data2/alignments"
output_dir: "data2/output"
input_jplace: "data2/apples.jplace"
cores: 4

apples_config:
# [FM, OLS]
method: FM
# float. 0-infinity
filter: 0.25
# integer. 5-infinity
base: 25

decompose_config:
# cluster must have a minimum diameter
edge_thr: 0.02
# approximate cluster size (no guarantees)
cluster_size: 100
# [raxml-ng,iqtree,raxml-8,copy]
infer: "raxml-8"
cores: 1


2 changes: 1 addition & 1 deletion install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
if conda info --envs | grep "udance" > /dev/null; then
echo "conda environment udance exists"
else
mamba create -y -c bioconda -c conda-forge --channel smirarab --name udance python=3.8 pip newick_utils=1.6 setuptools seqkit==0.16.1 scipy dendropy==4.5.2 pandas==1.3.0 snakemake
mamba create -y -c bioconda -c conda-forge --channel smirarab --name udance python=3.8 pip newick_utils=1.6 setuptools seqkit==0.16.1 scipy dendropy==4.5.2 pandas==1.3.0 snakemake raxml=8.2.12
source activate udance
conda activate udance
pip install apples==2.0.5
Expand Down
195 changes: 100 additions & 95 deletions uDance/PoolAlignmentWorker.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def worker(cls, i, _):
f.write("\n".join(duplist))
f.write("\n")

# create the raxml constraint
constraint_outgroup_tree = join(partition_output_dir, "raxml_constraint.nwk")
if isfile(constraint_outgroup_tree) and cls.options.constrain_outgroups:
t = ts.read_tree_newick(constraint_outgroup_tree)
Expand All @@ -89,112 +90,116 @@ def worker(cls, i, _):
with open(script, "w") as f:
f.write("#!/usr/bin/env bash\n\n")
f.write("export OMP_NUM_THREADS=1\n\n")
bipartition_path = join(aln_outdir, "bipartition.fasta")
fasttree_log = join(aln_outdir, "fasttree.log")
fasttree_err = join(aln_outdir, "fasttree.err")
fasttree_nwk = join(aln_outdir, "fasttree.nwk")
if isfile(bipartition_path) and cls.options.constrain_outgroups and cls.options.method == 'raxml-ng':
f.write("FastTreeMP -constraints %s -log %s < %s > %s 2> %s \n"
% (bipartition_path, fasttree_log, aln_output_path, fasttree_nwk, fasttree_err))
elif cls.options.method == 'raxml-ng':
f.write("FastTreeMP -log %s < %s > %s 2> %s \n"
% (fasttree_log, aln_output_path, fasttree_nwk, fasttree_err))

fasttree_resolved_nwk = join(aln_outdir, "fasttree_resolved.nwk")
raxml_constraint_path = join(partition_output_dir, "raxml_constraint.nwk")
astral_constraint_path = join(partition_output_dir, "astral_constraint.nwk")
if cls.options.method == 'raxml-ng':
# raxml-ng method first estimates a starting tree using fasttree2
bipartition_path = join(aln_outdir, "bipartition.fasta")
fasttree_log = join(aln_outdir, "fasttree.log")
fasttree_err = join(aln_outdir, "fasttree.err")
fasttree_nwk = join(aln_outdir, "fasttree.nwk")
if isfile(bipartition_path) and cls.options.constrain_outgroups:
f.write("FastTreeMP -constraints %s -log %s < %s > %s 2> %s \n"
% (bipartition_path, fasttree_log, aln_output_path, fasttree_nwk, fasttree_err))
else:
f.write("FastTreeMP -log %s < %s > %s 2> %s \n"
% (fasttree_log, aln_output_path, fasttree_nwk, fasttree_err))

fasttree_resolved_nwk = join(aln_outdir, "fasttree_resolved.nwk")
# random resolution. THIS MAY PREVENT REPRODUCIBILITY!!! Change in the future
f.write("python3 -c \"import sys, treeswift; "
"t=treeswift.read_tree_newick(input()); "
"t.resolve_polytomies(); print(t)\" < %s > %s \n" % (fasttree_nwk, fasttree_resolved_nwk))

induced_raxml_constraint_path = join(aln_outdir, "raxml_constraint.nwk")
raxml_err = join(aln_outdir, "raxml.err")
raxml_out = join(aln_outdir, "raxml.out")
raxml_run = join(aln_outdir, "RUN")
raxml_constraint_path = join(partition_output_dir, "raxml_constraint.nwk")
astral_constraint_path = join(partition_output_dir, "astral_constraint.nwk")

iqtree_err = join(aln_outdir, "iqtree.err")
iqtree_out = join(aln_outdir, "iqtree.out")
iqtree_run = join(aln_outdir, "RUN")
iqtree_constraint_path = join(partition_output_dir, "iqtree_constraint.nwk")
raxml8_constraint_path = join(partition_output_dir, "raxml8_constraint.nwk")
raxml8_run = "file"
raxml8_err = join(aln_outdir, "raxml8.err")
raxml8_out = join(aln_outdir, "raxml8.out")
if isfile(induced_raxml_constraint_path) and cls.options.constrain_outgroups:
f.write("raxml-ng --force perf_threads --tree %s --tree-constraint %s "
"--msa %s --model LG+G --prefix %s --seed 12345 "
"--threads 4 > %s 2> %s \n"
% (fasttree_resolved_nwk, induced_raxml_constraint_path, aln_output_path,
raxml_run, raxml_out, raxml_err))
raxml_err = join(aln_outdir, "raxml.err")
raxml_out = join(aln_outdir, "raxml.out")
raxml_run = join(aln_outdir, "RUN")

else:
if cls.options.method == 'iqtree':
if cls.options.protein_seqs:
# f.write("iqtree -t %s -s %s --prefix %s "
# "--seed 12345 -T 4 --model LG+G > %s 2> %s \n"
# % (fasttree_resolved_nwk, aln_output_path,
# iqtree_run, iqtree_out, iqtree_err))
if os.path.isfile(astral_constraint_path):
shutil.copy(astral_constraint_path, iqtree_constraint_path)
f.write("iqtree2 -s %s --prefix %s "
"--seed 12345 -T AUTO --model LG+G -g %s > %s 2> %s \n"
% (aln_output_path, iqtree_run,
iqtree_constraint_path,
iqtree_out, iqtree_err))
else:
f.write("iqtree2 -s %s --prefix %s "
"--seed 12345 -T AUTO --model LG+G -g %s > %s 2> %s \n"
% (aln_output_path, iqtree_run,
iqtree_out, iqtree_err))
else:
if os.path.isfile(astral_constraint_path):
shutil.copy(astral_constraint_path, iqtree_constraint_path)
f.write("iqtree2 -s %s --prefix %s "
"--seed 12345 -T AUTO --model GTR+F+G4 -g %s > %s 2> %s \n"
% (aln_output_path, iqtree_run,
iqtree_constraint_path,
iqtree_out, iqtree_err))
else:
f.write("iqtree2 -s %s --prefix %s "
"--seed 12345 -T AUTO --model GTR+F+G4 > %s 2> %s \n"
% (aln_output_path, iqtree_run,
iqtree_out, iqtree_err))
elif cls.options.method == 'raxml-ng':
f.write("raxml-ng --force perf_threads --tree %s "
if isfile(raxml_constraint_path) and cls.options.constrain_outgroups:
f.write("raxml-ng --tree %s --tree-constraint %s "
"--msa %s --model LG+G --prefix %s --seed 12345 "
"--threads 4 > %s 2> %s \n"
% (fasttree_resolved_nwk, raxml_constraint_path, aln_output_path,
raxml_run, raxml_out, raxml_err))
else:
f.write("raxml-ng --tree %s "
"--msa %s --model LG+G --prefix %s --seed 12345 "
"--threads 4 > %s 2> %s \n"
% (fasttree_resolved_nwk, aln_output_path,
raxml_run, raxml_out, raxml_err))
elif cls.options.method == 'raxml-8':
if cls.options.protein_seqs:
# f.write("iqtree -t %s -s %s --prefix %s "
# "--seed 12345 -T 4 --model LG+G > %s 2> %s \n"
# % (fasttree_resolved_nwk, aln_output_path,
# iqtree_run, iqtree_out, iqtree_err))
if os.path.isfile(astral_constraint_path):
shutil.copy(astral_constraint_path, raxml8_constraint_path)
f.write("raxmlHPC-PTHREADS -s %s -w %s -n %s "
"-p 12345 -T %s -m LG+G -g %s > %s 2> %s \n"
% (aln_output_path, aln_outdir, raxml8_run, min(cls.options.num_thread, 16),
raxml8_constraint_path, raxml8_out, raxml8_err))
else:
f.write("raxmlHPC-PTHREADS -s %s -w %s -n %s "
"-p 12345 -T %s -m LG+G > %s 2> %s \n"
% (aln_output_path, aln_outdir, raxml8_run, min(cls.options.num_thread, 16),
raxml8_out, raxml8_err))

elif cls.options.method == 'iqtree':
iqtree_err = join(aln_outdir, "iqtree.err")
iqtree_out = join(aln_outdir, "iqtree.out")
iqtree_run = join(aln_outdir, "RUN")
iqtree_constraint_path = join(partition_output_dir, "iqtree_constraint.nwk")
if cls.options.protein_seqs:
# f.write("iqtree -t %s -s %s --prefix %s "
# "--seed 12345 -T 4 --model LG+G > %s 2> %s \n"
# % (fasttree_resolved_nwk, aln_output_path,
# iqtree_run, iqtree_out, iqtree_err))
if isfile(raxml_constraint_path) and cls.options.constrain_outgroups:
shutil.copy(raxml_constraint_path, iqtree_constraint_path)
f.write("iqtree2 -s %s --prefix %s "
"--seed 12345 -T AUTO --model LG+G -g %s > %s 2> %s \n"
% (aln_output_path, iqtree_run,
iqtree_constraint_path,
iqtree_out, iqtree_err))
else:
f.write("iqtree2 -s %s --prefix %s "
"--seed 12345 -T AUTO --model LG+G > %s 2> %s \n"
% (aln_output_path, iqtree_run,
iqtree_out, iqtree_err))
else:
if isfile(raxml_constraint_path) and cls.options.constrain_outgroups:
shutil.copy(raxml_constraint_path, iqtree_constraint_path)
f.write("iqtree2 -s %s --prefix %s "
"--seed 12345 -T AUTO --model GTR+F+G4 -g %s > %s 2> %s \n"
% (aln_output_path, iqtree_run,
iqtree_constraint_path,
iqtree_out, iqtree_err))
else:
f.write("iqtree2 -s %s --prefix %s "
"--seed 12345 -T AUTO --model GTR+F+G4 > %s 2> %s \n"
% (aln_output_path, iqtree_run,
iqtree_out, iqtree_err))
elif cls.options.method == 'raxml-8':
raxml8_constraint_path = join(partition_output_dir, "raxml8_constraint.nwk")
raxml8_run = "file"
raxml8_err = join(aln_outdir, "raxml8.err")
raxml8_out = join(aln_outdir, "raxml8.out")
if cls.options.num_thread > 1:
executable = "raxmlHPC-PTHREADS"
else:
executable = "raxmlHPC"
if cls.options.protein_seqs:
# f.write("iqtree -t %s -s %s --prefix %s "
# "--seed 12345 -T 4 --model LG+G > %s 2> %s \n"
# % (fasttree_resolved_nwk, aln_output_path,
# iqtree_run, iqtree_out, iqtree_err))
if isfile(raxml_constraint_path) and cls.options.constrain_outgroups:
shutil.copy(raxml_constraint_path, raxml8_constraint_path)
f.write("%s -s %s -w %s -n %s "
"-p 12345 -T %s -m LG+G -g %s > %s 2> %s \n"
% (executable, aln_output_path, aln_outdir, raxml8_run, min(cls.options.num_thread, 16),
raxml8_constraint_path, raxml8_out, raxml8_err))
else:
f.write("%s -s %s -w %s -n %s "
"-p 12345 -T %s -m LG+G > %s 2> %s \n"
% (executable, aln_output_path, aln_outdir, raxml8_run, min(cls.options.num_thread, 16),
raxml8_out, raxml8_err))
else:
if isfile(raxml_constraint_path) and cls.options.constrain_outgroups:
shutil.copy(raxml_constraint_path, raxml8_constraint_path)
f.write("%s -s %s -w %s -n %s "
"-p 12345 -T %s -m GTRCAT -g %s > %s 2> %s \n"
% (executable, aln_output_path, aln_outdir, raxml8_run, min(cls.options.num_thread, 16),
raxml8_constraint_path, raxml8_out, raxml8_err))
else:
if os.path.isfile(astral_constraint_path):
shutil.copy(astral_constraint_path, raxml8_constraint_path)
f.write("raxmlHPC-PTHREADS -s %s -w %s -n %s "
"-p 12345 -T %s -m GTRCAT -g %s > %s 2> %s \n"
% (aln_output_path, aln_outdir, raxml8_run, min(cls.options.num_thread, 16),
raxml8_constraint_path, raxml8_out, raxml8_err))
else:
f.write("raxmlHPC-PTHREADS -s %s -w %s -n %s "
"-p 12345 -T %s -m GTRCAT > %s 2> %s \n"
% (aln_output_path, aln_outdir, raxml8_run, min(cls.options.num_thread, 16),
raxml8_out, raxml8_err))
f.write("%s -s %s -w %s -n %s "
"-p 12345 -T %s -m GTRCAT > %s 2> %s \n"
% (executable, aln_output_path, aln_outdir, raxml8_run, min(cls.options.num_thread, 16),
raxml8_out, raxml8_err))
st = os.stat(script)
os.chmod(script, st.st_mode | stat.S_IEXEC)
return trimmed_aln_length*len(partition_aln), script
Expand Down
31 changes: 21 additions & 10 deletions uDance/PoolAstralWorker.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,7 @@ def set_class_attributes(cls, options, astral_exec):
cls.astral_exec = astral_exec

@classmethod
def worker(cls, i):
partition_output_dir = join(cls.options.output_fp, str(i))
def worker(cls, partition_output_dir):
genes = glob(join(partition_output_dir, "*", ""))
for gene in genes:
if cls.options.method == 'iqtree':
Expand Down Expand Up @@ -56,12 +55,24 @@ def worker(cls, i):
astral_output_file = join(partition_output_dir, "astral_output.nwk")
astral_log_file = join(partition_output_dir, "astral.log")
astral_const_file = join(partition_output_dir, "astral_constraint.nwk")
s = f'cp {astral_input_file} {astral_output_file}\n'
# s = ["java", "-jar", cls.astral_exec, "-i", astral_input_file,
# "-o", astral_output_file, "-j", astral_const_file, "2>", astral_log_file]
return s
# if cls.options.use_gpu:
# gpu_opt = ""
# else:
# gpu_opt = "-C"

# with open(astral_log_file, "w") as lg:
# with Popen(s, stdout=PIPE, stdin=PIPE, stderr=lg) as p:
# astral_stdout = p.stdout.read().decode('utf-8')
# #print(astral_stdout)
# s = f'cp {astral_const_file} {astral_output_file}\n'
# s = ["cp", astral_const_file, astral_output_file]
s = ["java", "-Xmx%sG" % cls.options.memory, "-jar", cls.astral_exec, "-i", astral_input_file,
"-o", astral_output_file, "-j", astral_const_file] # , "2>", astral_log_file]

# astral_jobs = join(options.output_fp, "run_astral.sh")
# with open(astral_jobs, "w") as f:
# for ln in results:
# f.write(ln)
#
# return s

with open(astral_log_file, "w") as lg:
with Popen(s, stdout=PIPE, stdin=PIPE, stderr=lg) as p:
astral_stdout = p.stdout.read().decode('utf-8')
print(astral_stdout)
Loading

0 comments on commit 63e671f

Please sign in to comment.