From d07a1d67666d7fd32f7b8061212f85323bd1cb75 Mon Sep 17 00:00:00 2001 From: Johnathan Wong <34543031+jowong4@users.noreply.github.com> Date: Tue, 28 Jan 2020 08:44:20 -0800 Subject: [PATCH] physlr-make: update rules to reflect new features (#141) * physlr-make: update rules to reflect new features * physlr-make: update map-paf rule prerequisite * physlr-make: update map-paf comment * physlr-make: update rule * add extend mol rules * unittest: update split-minimizers unit test * unittest: remove old split-minimizers unit test files * physlr-make: remove temp rules * physlr-make: use mol rules * physlr-make: update help message * physlr-make: update help message --- bin/physlr-make | 23 +++++++++------- src/Makefile | 2 +- ...ny.split-minimizers.ext.mol.split.tsv.good | 14 ++++++++++ src/data/tiny.split-minimizers.ext.mol.tsv | 25 +++++++++++++++++ ...lit-minimizers.mol.mol2-bcs.split.tsv.good | 16 ----------- .../tiny.split-minimizers.mol.mol2-bcs.tsv | 27 ------------------- src/physlr-split-minimizers.cc | 4 +-- 7 files changed, 56 insertions(+), 55 deletions(-) create mode 100644 src/data/tiny.split-minimizers.ext.mol.split.tsv.good create mode 100644 src/data/tiny.split-minimizers.ext.mol.tsv delete mode 100644 src/data/tiny.split-minimizers.mol.mol2-bcs.split.tsv.good delete mode 100644 src/data/tiny.split-minimizers.mol.mol2-bcs.tsv diff --git a/bin/physlr-make b/bin/physlr-make index 1a46255c..6cb2a423 100755 --- a/bin/physlr-make +++ b/bin/physlr-make @@ -15,7 +15,7 @@ t=16 bloom_filter_size=10000000000 #10GB # Molecule separation stratergy -mol_strategy=distributed +mol_strategy=distributed+sqcosbin # Path to the Physlr project. physlr_path=$(shell dirname $$(dirname $(realpath $(MAKEFILE_LIST)))) @@ -103,13 +103,13 @@ arcs=false .PHONY: f1chr4 f1chr2R f1 fishchr25 fish physical-map scaffolds all: f1chr4 f1chr2R f1 fishchr25 fish -$(lr).physlr.physical-map.path: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.backbone.path +$(lr).physlr.physical-map.path: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.backbone.path ln -sf $< $@ -$(lr).physlr.physical-map.$(ref).n10.paf.gz: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.backbone.map-split.$(ref).n10.paf.gz +$(lr).physlr.physical-map.$(ref).n10.paf.gz: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.backbone.map-split.$(ref).n10.paf.gz ln -sf $< $@ -$(draft).physlr.fa: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.backbone.map-split.$(draft).n10.sort.best.bed.path.fa +$(draft).physlr.fa: $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.backbone.map-split.$(draft).n10.sort.best.bed.path.fa ln -sf $< $@ scaffolds: @@ -151,6 +151,7 @@ physical-map: \ $(lr).physlr.physical-map.$(ref).n10.qpos.chain.metrics.tsv endif endif + # Help help: @echo "Usage: ./physlr-make [COMMAND] [OPTION=VALUE]..." @@ -184,11 +185,11 @@ help: @echo " min_component_size minimum number of barcodes in a backbone [50]." @echo " minimum_barcode_multiplicity minimum number of minimizers per barcode [10]." @echo " maximum_barcode_multiplicity maximum number of minimizers per barcode [5000]." - @echo " mol_strategy molecule separation strategy [distributed]. Available options are bc, bc+k3, distributed, ext." + @echo " mol_strategy molecule separation strategy [distributed+sqcosbin]. Available options are bc, bc+k3, distributed, distributed+sqcosbin." @echo " bc (biconnected componenets) is the least conservative and is only suitable for datasets with low barcode multiplicity." @echo " bc+k3 (biconnected componenets + k-3 cliques) is more conservative than bc and requires more time." @echo " distributed is a modified version of bc+k3 that is faster than bc+k3 but may be more (or even less) conservative." - @echo " ext (extensive) mixes distributed with a modified version of sqcos (cosine similarity of squared adjacency matrix) which makes it more conservative." + @echo " distributed+sqcosbin mixes distributed with a modified version of sqcos (cosine similarity of squared adjacency matrix) which makes it more conservative." @echo " bloom_filter_size size of bloom filter [10000000000] (10G)." @echo " arcs Use ARCS to augment scaffolds (only compatible with ARCS v1.1.1) [false]." @echo "" @@ -898,7 +899,7 @@ endif # Determine overlaps and output the graph in TSV. %.physlr.overlap.tsv: %.physlr.tsv - $(time) $(physlr_path)/src/physlr-overlap -t1 -n10 $< >$@ + $(time) $(physlr_path)/src/physlr-overlap -t$t -m10 $< >$@ # Determine the degree of each vertex. %.deg.tsv: %.tsv @@ -958,7 +959,7 @@ min_path_size=200 $(python) $(bin)/physlr flesh-backbone --min-component-size=$(min_component_size) -V$V $< $*.backbone.path >$@ # Split the minimizers to molecules -%.overlap.m$m.mol.mol2-bcs.split.tsv: %.overlap.m$m.mol.mol2-bcs.tsv %.tsv +%.overlap.m$m.mol.split.tsv: %.overlap.m$m.mol.tsv %.tsv $(time) $(physlr_path)/src/physlr-split-minimizers -t$t $< $*.tsv >$@ # Split the reads into molecules @@ -979,7 +980,7 @@ min_path_size=200 $(time) $(python) $(bin)/physlr map -V$V -n10 $^ >$@ # Map the draft assembly to the backbone graph and output BED. -%.backbone.map-split.$(draft).n10.bed: %.backbone.path $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.overlap.m$m.mol.mol2-bcs.split.tsv $(draft).k$k-w$w.physlr.tsv +%.backbone.map-split.$(draft).n10.bed: %.backbone.path %.split.tsv $(draft).k$k-w$w.physlr.tsv $(time) $(python) $(bin)/physlr map --mx-type split --map-pos 10 -V$V -n10 $^ >$@ # Map the draft assembly to the backbone graph and output BED. @@ -1034,6 +1035,10 @@ min_path_size=200 %.map.$(ref).n10.paf.gz: %.path $(lr).k$k-w$w.n$(minimum_barcode_multiplicity)-$(maximum_barcode_multiplicity).c2-x.physlr.tsv $(name)/$(ref).k$k-w$w.physlr.tsv $(time) $(python) $(bin)/physlr map-paf -V$V -n10 $^ | $(gzip) >$@ +# Map the reference to the backbone graph with split minimizers and output PAF. +%.backbone.map-split.$(ref).n10.paf.gz: %.backbone.path %.split.tsv $(name)/$(ref).k$k-w$w.physlr.tsv + $(time) $(python) $(bin)/physlr map-paf --mx-type split -V$V -n10 $^ | $(gzip) >$@ + # Lift over query coordinates of a PAF file from minimzer index to nucleotide coordinate. %.qpos.paf.gz: $(name)/$(ref).k$k-w$w.physlr.tsv %.paf.gz $(zcat) $*.paf.gz | $(time) $(python) $(bin)/physlr liftover-paf -V$V $< - | $(gzip) >$@ diff --git a/src/Makefile b/src/Makefile index d831c784..52563b17 100644 --- a/src/Makefile +++ b/src/Makefile @@ -67,7 +67,7 @@ check-physlr-molecules: all ./physlr-molecules -s bc data/tiny.mol.input.tsv | diff -q - data/tiny.mol.tsv.good check-physlr-split-minimizers: all - ./physlr-split-minimizers -t4 data/tiny.split-minimizers.mol.mol2-bcs.tsv data/tiny.split-minimizers.physlr.tsv | sort |diff -q - data/tiny.split-minimizers.mol.mol2-bcs.split.tsv.good + ./physlr-split-minimizers -t4 data/tiny.split-minimizers.ext.mol.tsv data/tiny.split-minimizers.physlr.tsv | sort |diff -q - data/tiny.split-minimizers.ext.mol.split.tsv.good install: physlr-indexlr physlr-filter-barcodes physlr-overlap physlr-filter-bxmx physlr-makebf physlr-molecules physlr-split-minimizers install -d $(DESTDIR)$(PREFIX)/bin diff --git a/src/data/tiny.split-minimizers.ext.mol.split.tsv.good b/src/data/tiny.split-minimizers.ext.mol.split.tsv.good new file mode 100644 index 00000000..0042671f --- /dev/null +++ b/src/data/tiny.split-minimizers.ext.mol.split.tsv.good @@ -0,0 +1,14 @@ +100_55_25_0 +100_55_25_90 1 3 5 7 +100_55_25_91 2 6 8 15 +22_1_9_0 2 6 8 +543_288_92_0 1 3 5 7 +75_288_50_0 1 3 7 +92_300_57_0 2 6 15 +AAACACCAGAAACCTA-1_0 +AAACACCAGAAACCTA-1_90 1 3 5 7 +AAACACCAGAAACCTA-1_91 2 6 8 15 +AAACACCAGAAAGCTT-1_0 1 3 5 7 +AAACACCAGAACGACC-1_0 1 3 7 +AAACACCAGAACGACT-1_0 2 6 15 +AAACACCAGAACGCCA-1_0 2 6 8 diff --git a/src/data/tiny.split-minimizers.ext.mol.tsv b/src/data/tiny.split-minimizers.ext.mol.tsv new file mode 100644 index 00000000..26ee0cb8 --- /dev/null +++ b/src/data/tiny.split-minimizers.ext.mol.tsv @@ -0,0 +1,25 @@ +U m +AAACACCAGAAACCTA-1_0 1254 +AAACACCAGAAACCTA-1_91 1254 +AAACACCAGAAACCTA-1_90 1254 +AAACACCAGAAAGCTT-1_0 1313 +AAACACCAGAACGACC-1_0 1819 +AAACACCAGAACGACT-1_0 1819 +AAACACCAGAACGCCA-1_0 4173 +100_55_25_0 1254 +100_55_25_91 1254 +100_55_25_90 1254 +543_288_92_0 1313 +75_288_50_0 1819 +92_300_57_0 1819 +22_1_9_0 4173 + +U V m +100_55_25_90 543_288_92_0 1 +100_55_25_90 75_288_50_0 1 +100_55_25_91 92_300_57_0 1 +100_55_25_91 22_1_9_0 1 +AAACACCAGAAACCTA-1_90 AAACACCAGAAAGCTT-1_0 1 +AAACACCAGAAACCTA-1_90 AAACACCAGAACGACC-1_0 1 +AAACACCAGAAACCTA-1_91 AAACACCAGAACGACT-1_0 1 +AAACACCAGAAACCTA-1_91 AAACACCAGAACGCCA-1_0 1 diff --git a/src/data/tiny.split-minimizers.mol.mol2-bcs.split.tsv.good b/src/data/tiny.split-minimizers.mol.mol2-bcs.split.tsv.good deleted file mode 100644 index 5f6c0790..00000000 --- a/src/data/tiny.split-minimizers.mol.mol2-bcs.split.tsv.good +++ /dev/null @@ -1,16 +0,0 @@ -100_55_25_0_0 -100_55_25_15_0 -100_55_25_15_90 1 3 5 7 -100_55_25_15_91 2 6 8 15 -22_1_9_0_0 2 6 8 -543_288_92_0_0 1 3 5 7 -75_288_50_0_0 1 3 7 -92_300_57_0_0 2 6 15 -AAACACCAGAAACCTA-1_0_0 -AAACACCAGAAACCTA-1_15_0 -AAACACCAGAAACCTA-1_15_90 1 3 5 7 -AAACACCAGAAACCTA-1_15_91 2 6 8 15 -AAACACCAGAAAGCTT-1_0_0 1 3 5 7 -AAACACCAGAACGACC-1_0_0 1 3 7 -AAACACCAGAACGACT-1_0_0 2 6 15 -AAACACCAGAACGCCA-1_0_0 2 6 8 diff --git a/src/data/tiny.split-minimizers.mol.mol2-bcs.tsv b/src/data/tiny.split-minimizers.mol.mol2-bcs.tsv deleted file mode 100644 index cfa52da4..00000000 --- a/src/data/tiny.split-minimizers.mol.mol2-bcs.tsv +++ /dev/null @@ -1,27 +0,0 @@ -U m -AAACACCAGAAACCTA-1_0_0 1254 -AAACACCAGAAACCTA-1_15_0 1254 -AAACACCAGAAACCTA-1_15_91 1254 -AAACACCAGAAACCTA-1_15_90 1254 -AAACACCAGAAAGCTT-1_0_0 1313 -AAACACCAGAACGACC-1_0_0 1819 -AAACACCAGAACGACT-1_0_0 1819 -AAACACCAGAACGCCA-1_0_0 4173 -100_55_25_0_0 1254 -100_55_25_15_0 1254 -100_55_25_15_91 1254 -100_55_25_15_90 1254 -543_288_92_0_0 1313 -75_288_50_0_0 1819 -92_300_57_0_0 1819 -22_1_9_0_0 4173 - -U V m -100_55_25_15_90 543_288_92_0_0 1 -100_55_25_15_90 75_288_50_0_0 1 -100_55_25_15_91 92_300_57_0_0 1 -100_55_25_15_91 22_1_9_0_0 1 -AAACACCAGAAACCTA-1_15_90 AAACACCAGAAAGCTT-1_0_0 1 -AAACACCAGAAACCTA-1_15_90 AAACACCAGAACGACC-1_0_0 1 -AAACACCAGAAACCTA-1_15_91 AAACACCAGAACGACT-1_0_0 1 -AAACACCAGAAACCTA-1_15_91 AAACACCAGAACGCCA-1_0_0 1 diff --git a/src/physlr-split-minimizers.cc b/src/physlr-split-minimizers.cc index 41f58f7d..013950f6 100644 --- a/src/physlr-split-minimizers.cc +++ b/src/physlr-split-minimizers.cc @@ -185,7 +185,7 @@ findMoleculesPerBarcode(bxToMolIdx_t& bxToMolIdx, const graph_t& g) { auto vertexItRange = boost::vertices(g); for (auto vertexIt = vertexItRange.first; vertexIt != vertexItRange.second; ++vertexIt) { - std::string pattern = R"((\S+)_\d+_\d+$)"; + std::string pattern = R"((\S+)_\d+$)"; std::regex rgx(pattern); std::smatch matches; @@ -228,7 +228,7 @@ splitMinimizers( tsl::robin_set neighbourMxsUnion; for (auto neighbourItr = neighbours.first; neighbourItr != neighbours.second; ++neighbourItr) { - std::string pattern = R"((\S+)_\d+_\d+$)"; + std::string pattern = R"((\S+)_\d+$)"; std::regex rgx(pattern); std::smatch matches; if (std::regex_search(g[*neighbourItr].name, matches, rgx)) {