Skip to content

Commit

Permalink
physlr-make: update rules to reflect new features (#141)
Browse files Browse the repository at this point in the history
* physlr-make: update rules to reflect new features

* physlr-make: update map-paf rule prerequisite

* physlr-make: update map-paf comment

* physlr-make: update rule

* add extend mol rules

* unittest: update split-minimizers unit test

* unittest: remove old  split-minimizers unit test files

* physlr-make: remove temp rules

* physlr-make: use mol rules

* physlr-make: update help message

* physlr-make: update help message
  • Loading branch information
jwcodee authored Jan 28, 2020
1 parent 3326156 commit d07a1d6
Show file tree
Hide file tree
Showing 7 changed files with 56 additions and 55 deletions.
23 changes: 14 additions & 9 deletions bin/physlr-make
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ t=16
bloom_filter_size=10000000000 #10GB

# Molecule separation strategy
mol_strategy=distributed
mol_strategy=distributed+sqcosbin

# Path to the Physlr project.
physlr_path=$(shell dirname $$(dirname $(realpath $(MAKEFILE_LIST))))
Expand Down Expand Up @@ -103,13 +103,13 @@ arcs=false
.PHONY: f1chr4 f1chr2R f1 fishchr25 fish physical-map scaffolds
all: f1chr4 f1chr2R f1 fishchr25 fish

$(lr).physlr.physical-map.path: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.backbone.path
$(lr).physlr.physical-map.path: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.backbone.path
ln -sf $< $@

$(lr).physlr.physical-map.$(ref).n10.paf.gz: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.backbone.map-split.$(ref).n10.paf.gz
$(lr).physlr.physical-map.$(ref).n10.paf.gz: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.backbone.map-split.$(ref).n10.paf.gz
ln -sf $< $@

$(draft).physlr.fa: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.backbone.map-split.$(draft).n10.sort.best.bed.path.fa
$(draft).physlr.fa: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.backbone.map-split.$(draft).n10.sort.best.bed.path.fa
ln -sf $< $@

scaffolds:
Expand Down Expand Up @@ -151,6 +151,7 @@ physical-map: \
$(lr).physlr.physical-map.$(ref).n10.qpos.chain.metrics.tsv
endif
endif

# Help
help:
@echo "Usage: ./physlr-make [COMMAND] [OPTION=VALUE]..."
Expand Down Expand Up @@ -184,11 +185,11 @@ help:
@echo " min_component_size minimum number of barcodes in a backbone [50]."
@echo " minimum_barcode_multiplicity minimum number of minimizers per barcode [10]."
@echo " maximum_barcode_multiplicity maximum number of minimizers per barcode [5000]."
@echo " mol_strategy molecule separation strategy [distributed]. Available options are bc, bc+k3, distributed, ext."
@echo " mol_strategy molecule separation strategy [distributed+sqcosbin]. Available options are bc, bc+k3, distributed, distributed+sqcosbin."
@echo " bc (biconnected componenets) is the least conservative and is only suitable for datasets with low barcode multiplicity."
@echo " bc+k3 (biconnected componenets + k-3 cliques) is more conservative than bc and requires more time."
@echo " distributed is a modified version of bc+k3 that is faster than bc+k3 but may be more (or even less) conservative."
@echo " ext (extensive) mixes distributed with a modified version of sqcos (cosine similarity of squared adjacency matrix) which makes it more conservative."
@echo " distributed+sqcosbin mixes distributed with a modified version of sqcos (cosine similarity of squared adjacency matrix) which makes it more conservative."
@echo " bloom_filter_size size of bloom filter [10000000000] (10G)."
@echo " arcs Use ARCS to augment scaffolds (only compatible with ARCS v1.1.1) [false]."
@echo ""
Expand Down Expand Up @@ -898,7 +899,7 @@ endif

# Determine overlaps and output the graph in TSV.
%.physlr.overlap.tsv: %.physlr.tsv
$(time) $(physlr_path)/src/physlr-overlap -t1 -n10 $< >$@
$(time) $(physlr_path)/src/physlr-overlap -t$t -m10 $< >$@

# Determine the degree of each vertex.
%.deg.tsv: %.tsv
Expand Down Expand Up @@ -958,7 +959,7 @@ min_path_size=200
$(python) $(bin)/physlr flesh-backbone --min-component-size=$(min_component_size) -V$V $< $*.backbone.path >$@

# Split the minimizers to molecules
%.overlap.m$m.mol.mol2-bcs.split.tsv: %.overlap.m$m.mol.mol2-bcs.tsv %.tsv
%.overlap.m$m.mol.split.tsv: %.overlap.m$m.mol.tsv %.tsv
$(time) $(physlr_path)/src/physlr-split-minimizers -t$t $< $*.tsv >$@

# Split the reads into molecules
Expand All @@ -979,7 +980,7 @@ min_path_size=200
$(time) $(python) $(bin)/physlr map -V$V -n10 $^ >$@

# Map the draft assembly to the backbone graph and output BED.
%.backbone.map-split.$(draft).n10.bed: %.backbone.path $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.split.tsv $(draft).k$k-w$w.physlr.tsv
%.backbone.map-split.$(draft).n10.bed: %.backbone.path %.split.tsv $(draft).k$k-w$w.physlr.tsv
$(time) $(python) $(bin)/physlr map --mx-type split --map-pos 10 -V$V -n10 $^ >$@

# Map the draft assembly to the backbone graph and output BED.
Expand Down Expand Up @@ -1034,6 +1035,10 @@ min_path_size=200
# Run `physlr map-paf` on a path and write gzip-compressed PAF.
# All prerequisites are passed via $^ in this order: the path (%.path), the
# $(lr) physlr TSV, and the $(ref) physlr TSV under $(name)/.
%.map.$(ref).n10.paf.gz: %.path $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.tsv $(name)/$(ref).k$k-w$w.physlr.tsv
$(time) $(python) $(bin)/physlr map-paf -V$V -n10 $^ | $(gzip) >$@

# Map the reference to the backbone graph with split minimizers and output PAF.
# All prerequisites are passed to `physlr map-paf --mx-type split` via $^ in
# this order: the backbone path, the split TSV that shares the same stem
# (%.split.tsv), and the $(ref) physlr TSV under $(name)/.
# The PAF output is piped through $(gzip) before being written to the target.
%.backbone.map-split.$(ref).n10.paf.gz: %.backbone.path %.split.tsv $(name)/$(ref).k$k-w$w.physlr.tsv
$(time) $(python) $(bin)/physlr map-paf --mx-type split -V$V -n10 $^ | $(gzip) >$@

# Lift over query coordinates of a PAF file from minimizer index to nucleotide coordinates.
# The $(ref) physlr TSV is the first prerequisite ($<) and is passed to
# `physlr liftover-paf` as a file argument. The input PAF ($*.paf.gz) is
# decompressed with $(zcat) and piped in, with `-` given as the second
# argument (presumably meaning "read from stdin" -- confirm against physlr).
# The result is piped through $(gzip) before being written to the target.
%.qpos.paf.gz: $(name)/$(ref).k$k-w$w.physlr.tsv %.paf.gz
$(zcat) $*.paf.gz | $(time) $(python) $(bin)/physlr liftover-paf -V$V $< - | $(gzip) >$@
Expand Down
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ check-physlr-molecules: all
./physlr-molecules -s bc data/tiny.mol.input.tsv | diff -q - data/tiny.mol.tsv.good

check-physlr-split-minimizers: all
./physlr-split-minimizers -t4 data/tiny.split-minimizers.mol.mol2-bcs.tsv data/tiny.split-minimizers.physlr.tsv | sort |diff -q - data/tiny.split-minimizers.mol.mol2-bcs.split.tsv.good
./physlr-split-minimizers -t4 data/tiny.split-minimizers.ext.mol.tsv data/tiny.split-minimizers.physlr.tsv | sort |diff -q - data/tiny.split-minimizers.ext.mol.split.tsv.good

install: physlr-indexlr physlr-filter-barcodes physlr-overlap physlr-filter-bxmx physlr-makebf physlr-molecules physlr-split-minimizers
install -d $(DESTDIR)$(PREFIX)/bin
Expand Down
14 changes: 14 additions & 0 deletions src/data/tiny.split-minimizers.ext.mol.split.tsv.good
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
100_55_25_0
100_55_25_90 1 3 5 7
100_55_25_91 2 6 8 15
22_1_9_0 2 6 8
543_288_92_0 1 3 5 7
75_288_50_0 1 3 7
92_300_57_0 2 6 15
AAACACCAGAAACCTA-1_0
AAACACCAGAAACCTA-1_90 1 3 5 7
AAACACCAGAAACCTA-1_91 2 6 8 15
AAACACCAGAAAGCTT-1_0 1 3 5 7
AAACACCAGAACGACC-1_0 1 3 7
AAACACCAGAACGACT-1_0 2 6 15
AAACACCAGAACGCCA-1_0 2 6 8
25 changes: 25 additions & 0 deletions src/data/tiny.split-minimizers.ext.mol.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
U m
AAACACCAGAAACCTA-1_0 1254
AAACACCAGAAACCTA-1_91 1254
AAACACCAGAAACCTA-1_90 1254
AAACACCAGAAAGCTT-1_0 1313
AAACACCAGAACGACC-1_0 1819
AAACACCAGAACGACT-1_0 1819
AAACACCAGAACGCCA-1_0 4173
100_55_25_0 1254
100_55_25_91 1254
100_55_25_90 1254
543_288_92_0 1313
75_288_50_0 1819
92_300_57_0 1819
22_1_9_0 4173

U V m
100_55_25_90 543_288_92_0 1
100_55_25_90 75_288_50_0 1
100_55_25_91 92_300_57_0 1
100_55_25_91 22_1_9_0 1
AAACACCAGAAACCTA-1_90 AAACACCAGAAAGCTT-1_0 1
AAACACCAGAAACCTA-1_90 AAACACCAGAACGACC-1_0 1
AAACACCAGAAACCTA-1_91 AAACACCAGAACGACT-1_0 1
AAACACCAGAAACCTA-1_91 AAACACCAGAACGCCA-1_0 1
16 changes: 0 additions & 16 deletions src/data/tiny.split-minimizers.mol.mol2-bcs.split.tsv.good

This file was deleted.

27 changes: 0 additions & 27 deletions src/data/tiny.split-minimizers.mol.mol2-bcs.tsv

This file was deleted.

4 changes: 2 additions & 2 deletions src/physlr-split-minimizers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ findMoleculesPerBarcode(bxToMolIdx_t& bxToMolIdx, const graph_t& g)
{
auto vertexItRange = boost::vertices(g);
for (auto vertexIt = vertexItRange.first; vertexIt != vertexItRange.second; ++vertexIt) {
std::string pattern = R"((\S+)_\d+_\d+$)";
std::string pattern = R"((\S+)_\d+$)";
std::regex rgx(pattern);
std::smatch matches;

Expand Down Expand Up @@ -228,7 +228,7 @@ splitMinimizers(
tsl::robin_set<Minimizer> neighbourMxsUnion;
for (auto neighbourItr = neighbours.first; neighbourItr != neighbours.second;
++neighbourItr) {
std::string pattern = R"((\S+)_\d+_\d+$)";
std::string pattern = R"((\S+)_\d+$)";
std::regex rgx(pattern);
std::smatch matches;
if (std::regex_search(g[*neighbourItr].name, matches, rgx)) {
Expand Down

0 comments on commit d07a1d6

Please sign in to comment.