Skip to content

Commit

Permalink
removing pruning threshold
Browse files Browse the repository at this point in the history
  • Loading branch information
balabanmetin committed Mar 11, 2022
1 parent 1f75797 commit 1a22447
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 18 deletions.
4 changes: 3 additions & 1 deletion README
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
uDance version 1.3.0
uDance version 1.3.1

AUTHORS:
Metin Balaban
Expand All @@ -9,6 +9,8 @@ Unix/Linux
Anaconda

Changelog:
1.3.1:
-Removed the manual pruning threshold; the cutoff is now determined automatically

1.3.0:
-Pruning Large partitions
Expand Down
2 changes: 0 additions & 2 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,6 @@ prep_config:
fraglength: 75
# partitions with at least this many species are pruned. ASTRAL's limit is around 9000 for 400+ genes.
pruneafter: 9000
# [0-1]. lower threshold = stricter pruning
prune_thr: 0.96

infer_config:
# [raxml-ng,iqtree,raxml-8]
Expand Down
9 changes: 2 additions & 7 deletions prune_similar.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
import json
from multiprocessing import cpu_count
from optparse import OptionParser
from os.path import join, isfile
import shutil
from os.path import join

from uDance.subsample_partition import subsample_partition

Expand All @@ -15,8 +14,6 @@
parser.add_option("-T", "--threads", type=int, dest="num_thread", default=0,
help="number of cores used in placement. "
"0 to use all cores in the running machine", metavar="NUMBER")
parser.add_option("-c", "--cutoff", type=float, dest="cutoff_threshold", metavar='NUMBER', default=0.96,
help="threshold number of dissimilar genes.")
parser.add_option("-S", "--size", type=int, dest="minimum_size", metavar='NUMBER', default=9000,
help="partition size requirement for pruning.")

Expand All @@ -37,12 +34,10 @@
if numspecies < options.minimum_size:
continue
print(numspecies)
res = subsample_partition(partition_output_dir, options.cutoff_threshold)
res = subsample_partition(partition_output_dir, options.minimum_size)
if res:
dupmapstrs.append(res)

# if isfile(join(options.output_fp, "rm_map.txt")):
# shutil.copyfile(join(options.output_fp, "rm_map.txt"), join(options.output_fp, "dedupe_map.txt"))
if len(dupmapstrs) > 0:
with open(join(options.output_fp, "dedupe_map.txt"), "a") as f:
for st in dupmapstrs:
Expand Down
26 changes: 20 additions & 6 deletions uDance/subsample_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import treeswift as ts


def subsample_partition(partition_output_dir, cutoff):
def subsample_partition(partition_output_dir, limit):
with open(join(partition_output_dir, "species.txt")) as f:
species = set(map(lambda x: x.strip(), f.readlines()))
t = ts.read_tree_newick(join(partition_output_dir, "astral_constraint.nwk"))
Expand Down Expand Up @@ -65,11 +65,25 @@ def subsample_partition(partition_output_dir, cutoff):
x = counts_ij / np.minimum(counts_i[..., np.newaxis], counts_i[np.newaxis, ...])
x.dump(join(partition_output_dir, "adj_mat.pkl"), protocol=4)
#print(x)
y = (x >= cutoff)
print("redo %.3f." % (time.time() - start))
start = time.time()
n, components = connected_components(y)
print(n, components)

clow = 1
chigh = 100

while chigh > clow or chigh == 2:
cutoff = (chigh + clow + 1)//2
print(clow, cutoff, chigh)
y = (x >= (cutoff/100))
n, components = connected_components(y)
if limit < n:
chigh = cutoff-1
elif limit > n:
clow = cutoff
else:
clow = cutoff
chigh = cutoff # making sure clow=chigh always holds at the exit. we will print that.
break

print("Partition " + partition_output_dir + " is pruned to %d taxa at the automatic cutoff %f." % (n, clow/100))

print("components %.3f." % (time.time() - start))

Expand Down
3 changes: 1 addition & 2 deletions udance.smk
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,6 @@ checkpoint decompose:
sub=config["prep_config"]["sublength"],
frag=config["prep_config"]["fraglength"],
pra=config["prep_config"]["pruneafter"],
prt=config["prep_config"]["prune_thr"],
char=config["chartype"]

resources: cpus=config["resources"]["cores"],
Expand All @@ -212,7 +211,7 @@ checkpoint decompose:
else
python run_udance.py decompose -p -s {input.ind} -o {outdir}/udance -t {params.size} -j {input.j} -m {params.method} -T {resources.cpus} -l {params.sub} -f {params.frag}
fi
python prune_similar.py -T {resources.cpus} -o {outdir}/udance -S {params.pra} -c {params.prt}
python prune_similar.py -T {resources.cpus} -o {outdir}/udance -S {params.pra}
if [ -f {outdir}/udance/dedupe_map.txt ]; then
cat {outdir}/udance/dedupe_map.txt > {outdir}/dedupe_map.txt
fi
Expand Down

0 comments on commit 1a22447

Please sign in to comment.