From 13038a5cc02e9e45a38db4667505ce78be498b9c Mon Sep 17 00:00:00 2001 From: Baraa Orabi Date: Thu, 15 Dec 2022 14:50:24 -0800 Subject: [PATCH] Option to generate cell bc whitelist from LRs --- README.md | 2 +- Snakefile | 41 +++++++++++++++++++------- config.yaml | 2 ++ env.yaml | 1 + scTagger.py | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 115 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 190e1c0..5e64652 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ To run this step, you can use the following command. The second step is to extract the top short-reads barcodes that cover most of the reads. ``` -./scTagger.py extract_top_sr_bc -i "path/to/bam/file" -o "path/to/output/file" -p "path/to/output/plot" +./scTagger.py extract_sr_bc -i "path/to/bam/file" -o "path/to/output/file" -p "path/to/output/plot" ``` **Arguments** diff --git a/Snakefile b/Snakefile index eec6e7c..c3e43c2 100644 --- a/Snakefile +++ b/Snakefile @@ -13,19 +13,11 @@ clrg_d = f"{os.path.abspath(outpath)}/cellranger-out" rule all: input: expand( - f"{clrg_d}/{{sample}}/{{sample}}/outs/possorted_genome_bam.bam", + f"{outpath}/{{sample}}/{{sample}}.lr_bc_matches.tsv.gz", sample=config["samples"], ), expand( - f"{outpath}/{{sample}}/{{sample}}.sr_bc.tsv.gz", - sample=config["samples"] - ), - expand( - f"{outpath}/{{sample}}/{{sample}}.lr_bc.tsv.gz", - sample=config["samples"] - ), - expand( - f"{outpath}/{{sample}}/{{sample}}.lr_bc_matches.tsv.gz", + f"{outpath}/{{sample}}/{{sample}}.lr_bc_from_lr_matches.tsv.gz", sample=config["samples"], ), @@ -96,6 +88,35 @@ rule extract_lr_bc: shell: "{params.script} extract_lr_bc -r {input.reads} -o {output.tsv} -t {threads}" +rule extract_sr_bc_from_lr: + input: + tsv=f"{outpath}/{{sample}}/{{sample}}.lr_bc.tsv.gz", + wl=lambda wildcards: config["samples"][wildcards.sample]["whiltlist"], + output: + tsv=f"{outpath}/{{sample}}/{{sample}}.sr_bc_from_lr.tsv.gz", + params: + script=config["exec"]["scTagger"], + resources: + mem="16G", + time=59, + shell: + "{params.script} extract_sr_bc_from_lr -i {input.tsv} -wl {input.wl} -o {output.tsv}" + + +rule match_trie_from_lr: + input: + lr_tsv=f"{outpath}/{{sample}}/{{sample}}.lr_bc.tsv.gz", + sr_tsv=f"{outpath}/{{sample}}/{{sample}}.sr_bc_from_lr.tsv.gz", + output: + lr_tsv=f"{outpath}/{{sample}}/{{sample}}.lr_bc_from_lr_matches.tsv.gz", + params: + script=config["exec"]["scTagger"], + threads: 32 + resources: + mem="64G", + time=60 * 5 - 1, + shell: + "{params.script} match_trie -lr {input.lr_tsv} -sr {input.sr_tsv} -o {output.lr_tsv} -t {threads}" rule match_trie: input: diff --git a/config.yaml b/config.yaml index ff4f90a..8e9c1b4 100644 --- a/config.yaml +++ b/config.yaml @@ -7,6 +7,8 @@ exec: samples: s1: ref: homo_sapiens + whiltlist: + test/refs/3M-february-2018.txt.gz lr_fastqs: - test/data/lr/TestSample_pass_fastq_concat.fastq sr: diff --git a/env.yaml b/env.yaml index 28cf092..273a90a 100644 --- a/env.yaml +++ b/env.yaml @@ -10,3 +10,4 @@ dependencies: - tqdm - pandas - pysam + - pyahocorasick diff --git a/scTagger.py b/scTagger.py index a2cb375..13f1c54 100755 --- a/scTagger.py +++ b/scTagger.py @@ -14,6 +14,7 @@ import edlib import pysam +import ahocorasick def parse_args(): @@ -49,7 +50,7 @@ def parse_args(): parser_top_sr = subparsers.add_parser('extract_sr_bc') parser_top_sr.add_argument("-i", "--input", type=str, required=True, - help="Input file") + help="Input BAM file") parser_top_sr.add_argument("-o", "--outfile", type=str, default=None, help="Path to output file. Default: STDOUT") parser_top_sr.add_argument("-p", "--plotfile", type=str, default=None, @@ -63,6 +64,20 @@ def parse_args(): parser_top_sr.add_argument("--max-barcode-cnt", type=int, default=25_000, help="Max number of barcodes to keep. Default: 25000") + parser_sr_bc_from_lr = subparsers.add_parser('extract_sr_bc_from_lr') + parser_sr_bc_from_lr.add_argument("-i", "--input", type=str, required=True, + help="Input TSV file generated from extract_lr_bc step") + parser_sr_bc_from_lr.add_argument("-o", "--outfile", type=str, default=None, + help="Path to output file. Default: STDOUT") + parser_sr_bc_from_lr.add_argument("-wl", "--barcode-whitelist", type=str, required=True, + help="Path to TXT barcode whiltelist such as 10x Chromimum v3 3M-february-2018.txt.gz barcode file") + parser_sr_bc_from_lr.add_argument("--thresh", type=float, default=0.005, + help="Percentage theshold required per step to continue adding read barcodes. Default: 0.005") + parser_sr_bc_from_lr.add_argument("--step-size", type=int, default=1000, + help="Number of barcodes processed at a time and whose sum is used to check against the theshold. Default: 1000") + parser_sr_bc_from_lr.add_argument("--max-barcode-cnt", type=int, default=25_000, + help="Max number of barcodes to keep. Default: 25000") + parser_match_trie = subparsers.add_parser('match_trie') parser_match_trie.add_argument("-lr", "--long-read-segments", type=str, required=True, help="Long-read segments TSV file") @@ -117,6 +132,11 @@ def parse_args(): assert 0 < args.step_size assert 0 < args.max_barcode_cnt + if args.subcommand == 'extract_sr_bc_from_lr': + assert 0 <= args.thresh <= 1 + assert 0 < args.step_size + assert 0 < args.max_barcode_cnt + if args.subcommand == 'match_trie': assert args.mem > 0 assert args.barcode_length > 0 @@ -733,7 +753,7 @@ def show_plot_match_trie(full_data, plotfile, max_error): plt.savefig(plotfile) -def match_tire(args): +def match_trie(args): read_sr_barcodes(args.short_read_barcodes) barcode_lens = {len(b) for b in barcodes} assert barcode_lens == {args.barcode_length}, barcode_lens @@ -767,6 +787,60 @@ def match_tire(args): f'{",".join([barcodes[b] if s else rev_compl(barcodes[b]) for b, s in sorted(bids)])}\n') outfile.close() +def extract_sr_bc_from_lr(args): + if args.barcode_whitelist.endswith('.gz'): + infile = gzip.open(args.barcode_whitelist, 'rt') + else: + infile = open(args.barcode_whitelist) + print(f'Reading whiltelist barcodes from: {args.barcode_whitelist}') + barcodes = [l[:-1] for l in tqdm(infile)] + infile.close() + + A = ahocorasick.Automaton() + print(f'\n=====\nAdding forward barcodes...') + for idx,bc in tqdm(enumerate(barcodes), total=len(barcodes)): + A.add_word(bc,idx) + print(f'\n=====\nAdding reverse compliment barcodes...') + for idx,bc in tqdm(enumerate(barcodes), total=len(barcodes)): + A.add_word(rev_compl(bc),-idx) + print(f'\n=====\nBuilding Aho-Corasick automaton...') + A.make_automaton() + + + if args.input.endswith('.gz'): + infile = gzip.open(args.input, 'rt') + else: + infile = open(args.input) + print(f'\n=====\nMatching exact barcodes on long-reads: {args.input}') + C = Counter() + for l in tqdm(infile): + _,_,p,seg = l.rstrip('\n').split('\t') + if p=='NA': + continue + hits = tuple(A.iter(seg)) + if len(hits) > 1: + continue + for _,bc in hits: + C[abs(bc)]+=1 + print(f'\n=====\nFound {len(C):,} unique barcodes on long-reads') + sorted_bc = sorted(C.items(), reverse=True, key=lambda x: x[1])[:args.max_barcode_cnt] + + total = sum(c for _,c in sorted_bc) + for last_idx in range(0, len(sorted_bc), args.step_size): + percent = sum(c for _,c in sorted_bc[last_idx:last_idx+args.step_size])/total + if percent < args.thresh: + break + sorted_bc = sorted_bc[:last_idx+args.step_size] + + print(f"\n=====\nWriting the top {len(sorted_bc)} barcodes") + if args.outfile: + outfile = gzip.open(args.outfile, 'wt+') + else: + outfile = sys.stdout + for bc,c in tqdm(sorted_bc): + outfile.write(f'{barcodes[bc]}\t{c}\n') + outfile.close() + def main(): args = parse_args() @@ -778,8 +852,11 @@ def main(): if args.subcommand == 'extract_sr_bc': extract_sr_bc(args) + if args.subcommand == 'extract_sr_bc_from_lr': + extract_sr_bc_from_lr(args) + if args.subcommand == 'match_trie': - match_tire(args) + match_trie(args) if __name__ == "__main__":