Skip to content

Commit

Permalink
pruning large
Browse files Browse the repository at this point in the history
  • Loading branch information
balabanmetin committed Mar 10, 2022
1 parent 76460bd commit 2d7edfa
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 8 deletions.
6 changes: 5 additions & 1 deletion README
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
uDance version 1.2.1
uDance version 1.3.0

AUTHORS:
Metin Balaban
Expand All @@ -10,6 +10,10 @@ Anaconda

Changelog:

1.3.0:
-Pruning large partitions
-Changes in TreeCluster logic disallowing formation of very small partitions

1.2.1:
-APPLES2 excludes sequences that are placed on internal nodes.
-Filtered backbone sequences are no longer added to the query set.
8 changes: 6 additions & 2 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ mainlines_config:
apples_config:
# [FM, OLS]
method: FM
# float. (0-infinity)
# float. [0-infinity)
filter: 0.2
# integer. (5-infinity)
# integer. [5-infinity)
base: 25
# minimum overlap fraction. Default is 0.001
overlap: 0.05
Expand All @@ -37,6 +37,10 @@ prep_config:
sublength: 100
# minimum fragment length
fraglength: 75
# integer. partition size threshold for pruning: large clusters are pruned. ASTRAL's limit is around 9000 for 400+ genes.
pruneafter: 9000
# float. [0-1]. lower threshold = stricter pruning
prune_thr: 0.96

infer_config:
# [raxml-ng,iqtree,raxml-8]
Expand Down
4 changes: 2 additions & 2 deletions prune_similar.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
parser.add_option("-T", "--threads", type=int, dest="num_thread", default=0,
help="number of cores used in placement. "
"0 to use all cores in the running machine", metavar="NUMBER")
parser.add_option("-c", "--cutoff", type=float, dest="cutoff_threshold", metavar='NUMBER', default=0.95,
parser.add_option("-c", "--cutoff", type=float, dest="cutoff_threshold", metavar='NUMBER', default=0.96,
help="threshold number of dissimilar genes.")
parser.add_option("-S", "--size", type=int, dest="minimum_size", metavar='NUMBER', default=8000,
parser.add_option("-S", "--size", type=int, dest="minimum_size", metavar='NUMBER', default=9000,
help="partition size requirement for pruning.")

(options, args) = parser.parse_args()
Expand Down
11 changes: 9 additions & 2 deletions uDance/subsample_partition.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,13 @@ def subsample_partition(partition_output_dir, cutoff):
return ""
pruned_species = set(pruned_species)

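# remove pruned from species.txt
# NOTE(review): newspecieslst (previous species minus pruned) is computed below, but the
# write emits pruned_species instead -- confirm which set is meant to end up in species.txt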
with open(join(partition_output_dir, "species.txt")) as f:
prevspecies = set(map(lambda x: x.strip(), f.readlines()))
newspecieslst = list(prevspecies.difference(pruned_species))
with open(join(partition_output_dir, "species.txt"), "w") as f:
f.write("\n".join(pruned_species) + "\n")

for g in genes:
aln_dict = dict()
with open(join(g, "aln.fa")) as af:
Expand Down Expand Up @@ -131,12 +138,12 @@ def subsample_partition(partition_output_dir, cutoff):
if len(v) > 1:
duplist.append("\t".join(v))

aln_output_path = join(g, "aln_pruned.fa")
aln_output_path = join(g, "aln.fa")
with open(aln_output_path, "w", buffering=100000000) as f:
f.write("\n".join(res))
f.write("\n")
if duplist:
dupmap_output_path = join(g, "dupmap_pruned.txt")
dupmap_output_path = join(g, "dupmap.txt")
with open(dupmap_output_path, "w", buffering=100000000) as f:
f.write("\n".join(duplist))
f.write("\n")
Expand Down
1 change: 1 addition & 0 deletions uDance/treecluster_sum.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def min_tree_coloring_sum_max(tree, thr, max_thr):
else:
left, right = current.children
if left.weight + right.weight + current.weight <= thr or \
(left.weight + right.weight + current.weight > thr and left.weight + right.weight <= max(3, thr / 10)) or \
left.edge_length + left.farthest + right.edge_length + right.farthest < max_thr or \
(left.edge_length <= ZERO_LEN and len(left.placements) > 0) or \
(right.edge_length <= ZERO_LEN and len(right.placements) > 0) or \
Expand Down
4 changes: 3 additions & 1 deletion udance.smk
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,8 @@ checkpoint decompose:
method=config["infer_config"]["method"],
sub=config["prep_config"]["sublength"],
frag=config["prep_config"]["fraglength"],
pra=config["prep_config"]["pruneafter"],
prt=config["prep_config"]["prune_thr"],
char=config["chartype"]

resources: cpus=config["resources"]["cores"],
Expand All @@ -210,7 +212,7 @@ checkpoint decompose:
else
python run_udance.py decompose -p -s {input.ind} -o {outdir}/udance -t {params.size} -j {input.j} -m {params.method} -T {resources.cpus} -l {params.sub} -f {params.frag}
fi
python prune_similar.py -T {resources.cpus} -o {outdir}/udance
python prune_similar.py -T {resources.cpus} -o {outdir}/udance -S {params.pra} -c {params.prt}
if [ -f {outdir}/udance/dedupe_map.txt ]; then
cat {outdir}/udance/dedupe_map.txt > {outdir}/dedupe_map.txt
fi
Expand Down

0 comments on commit 2d7edfa

Please sign in to comment.