From 13038a5cc02e9e45a38db4667505ce78be498b9c Mon Sep 17 00:00:00 2001
From: Baraa Orabi <baraaorabi@gmail.com>
Date: Thu, 15 Dec 2022 14:50:24 -0800
Subject: [PATCH] Option to generate cell bc whitelist from LRs

---
 README.md   |  2 +-
 Snakefile   | 41 +++++++++++++++++++-------
 config.yaml |  2 ++
 env.yaml    |  1 +
 scTagger.py | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 115 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 190e1c0..5e64652 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ To run this step, you can use the following command.
 The second step is to extract the top short-reads barcodes that cover most of the reads.
 
 ```
-./scTagger.py extract_top_sr_bc -i "path/to/bam/file" -o "path/to/output/file" -p "path/to/output/plot"
+./scTagger.py extract_sr_bc -i "path/to/bam/file" -o "path/to/output/file" -p "path/to/output/plot"
 ```
 
 **Arguments**
diff --git a/Snakefile b/Snakefile
index eec6e7c..c3e43c2 100644
--- a/Snakefile
+++ b/Snakefile
@@ -13,19 +13,11 @@ clrg_d = f"{os.path.abspath(outpath)}/cellranger-out"
 rule all:
     input:
         expand(
-            f"{clrg_d}/{{sample}}/{{sample}}/outs/possorted_genome_bam.bam",
+            f"{outpath}/{{sample}}/{{sample}}.lr_bc_matches.tsv.gz",
             sample=config["samples"],
         ),
         expand(
-            f"{outpath}/{{sample}}/{{sample}}.sr_bc.tsv.gz",
-            sample=config["samples"]
-        ),
-        expand(
-            f"{outpath}/{{sample}}/{{sample}}.lr_bc.tsv.gz",
-            sample=config["samples"]
-        ),
-        expand(
-            f"{outpath}/{{sample}}/{{sample}}.lr_bc_matches.tsv.gz",
+            f"{outpath}/{{sample}}/{{sample}}.lr_bc_from_lr_matches.tsv.gz",
             sample=config["samples"],
         ),
 
@@ -96,6 +88,35 @@ rule extract_lr_bc:
     shell:
         "{params.script} extract_lr_bc -r {input.reads} -o {output.tsv} -t {threads}"
 
+rule extract_sr_bc_from_lr:
+    input:
+        tsv=f"{outpath}/{{sample}}/{{sample}}.lr_bc.tsv.gz",
+        wl=lambda wildcards: config["samples"][wildcards.sample]["whiltlist"],
+    output:
+        tsv=f"{outpath}/{{sample}}/{{sample}}.sr_bc_from_lr.tsv.gz",
+    params:
+        script=config["exec"]["scTagger"],
+    resources:
+        mem="16G",
+        time=59,
+    shell:
+        "{params.script} extract_sr_bc_from_lr -i {input.tsv} -wl {input.wl} -o {output.tsv}"
+
+
+rule match_trie_from_lr:
+    input:
+        lr_tsv=f"{outpath}/{{sample}}/{{sample}}.lr_bc.tsv.gz",
+        sr_tsv=f"{outpath}/{{sample}}/{{sample}}.sr_bc_from_lr.tsv.gz",
+    output:
+        lr_tsv=f"{outpath}/{{sample}}/{{sample}}.lr_bc_from_lr_matches.tsv.gz",
+    params:
+        script=config["exec"]["scTagger"],
+    threads: 32
+    resources:
+        mem="64G",
+        time=60 * 5 - 1,
+    shell:
+        "{params.script} match_trie -lr {input.lr_tsv} -sr {input.sr_tsv} -o {output.lr_tsv} -t {threads}"
 
 rule match_trie:
     input:
diff --git a/config.yaml b/config.yaml
index ff4f90a..8e9c1b4 100644
--- a/config.yaml
+++ b/config.yaml
@@ -7,6 +7,8 @@ exec:
 samples:
     s1:
         ref: homo_sapiens
+        whiltlist:
+            test/refs/3M-february-2018.txt.gz
         lr_fastqs:
             - test/data/lr/TestSample_pass_fastq_concat.fastq
         sr:
diff --git a/env.yaml b/env.yaml
index 28cf092..273a90a 100644
--- a/env.yaml
+++ b/env.yaml
@@ -10,3 +10,4 @@ dependencies:
   - tqdm
   - pandas
   - pysam
+  - pyahocorasick
diff --git a/scTagger.py b/scTagger.py
index a2cb375..13f1c54 100755
--- a/scTagger.py
+++ b/scTagger.py
@@ -14,6 +14,7 @@
 
 import edlib
 import pysam
+import ahocorasick
 
 
 def parse_args():
@@ -49,7 +50,7 @@ def parse_args():
 
     parser_top_sr = subparsers.add_parser('extract_sr_bc')
     parser_top_sr.add_argument("-i", "--input", type=str, required=True,
-                               help="Input file")
+                               help="Input BAM file")
     parser_top_sr.add_argument("-o", "--outfile", type=str, default=None,
                                help="Path to output file. Default: STDOUT")
     parser_top_sr.add_argument("-p", "--plotfile", type=str, default=None,
@@ -63,6 +64,20 @@ def parse_args():
     parser_top_sr.add_argument("--max-barcode-cnt", type=int, default=25_000,
                                help="Max number of barcodes to keep. Default: 25000")
 
+    parser_sr_bc_from_lr = subparsers.add_parser('extract_sr_bc_from_lr')
+    parser_sr_bc_from_lr.add_argument("-i", "--input", type=str, required=True,
+                               help="Input TSV file generated from extract_lr_bc step")
+    parser_sr_bc_from_lr.add_argument("-o", "--outfile", type=str, default=None,
+                               help="Path to output file. Default: STDOUT")
+    parser_sr_bc_from_lr.add_argument("-wl", "--barcode-whitelist", type=str, required=True,
+                               help="Path to TXT barcode whiltelist such as 10x Chromimum v3 3M-february-2018.txt.gz barcode file")
+    parser_sr_bc_from_lr.add_argument("--thresh", type=float, default=0.005,
+                               help="Percentage theshold required per step to continue adding read barcodes. Default: 0.005")
+    parser_sr_bc_from_lr.add_argument("--step-size", type=int, default=1000,
+                               help="Number of barcodes processed at a time and whose sum is used to check against the theshold. Default: 1000")
+    parser_sr_bc_from_lr.add_argument("--max-barcode-cnt", type=int, default=25_000,
+                               help="Max number of barcodes to keep. Default: 25000")
+
     parser_match_trie = subparsers.add_parser('match_trie')
     parser_match_trie.add_argument("-lr", "--long-read-segments", type=str,
                                    required=True, help="Long-read segments TSV file")
@@ -117,6 +132,11 @@ def parse_args():
         assert 0 < args.step_size
         assert 0 < args.max_barcode_cnt
 
+    if args.subcommand == 'extract_sr_bc_from_lr':
+        assert 0 <= args.thresh <= 1
+        assert 0 < args.step_size
+        assert 0 < args.max_barcode_cnt
+
     if args.subcommand == 'match_trie':
         assert args.mem > 0
         assert args.barcode_length > 0
@@ -733,7 +753,7 @@ def show_plot_match_trie(full_data, plotfile, max_error):
     plt.savefig(plotfile)
 
 
-def match_tire(args):
+def match_trie(args):
     read_sr_barcodes(args.short_read_barcodes)
     barcode_lens = {len(b) for b in barcodes}
     assert barcode_lens == {args.barcode_length}, barcode_lens
@@ -767,6 +787,60 @@ def match_tire(args):
             f'{",".join([barcodes[b] if s else rev_compl(barcodes[b]) for b, s in sorted(bids)])}\n')
     outfile.close()
 
+def extract_sr_bc_from_lr(args):
+    if args.barcode_whitelist.endswith('.gz'):
+        infile = gzip.open(args.barcode_whitelist, 'rt')
+    else:
+        infile = open(args.barcode_whitelist)
+    print(f'Reading whiltelist barcodes from: {args.barcode_whitelist}')
+    barcodes = [l[:-1] for l in tqdm(infile)]
+    infile.close()
+
+    A = ahocorasick.Automaton()
+    print(f'\n=====\nAdding forward barcodes...')
+    for idx,bc in tqdm(enumerate(barcodes), total=len(barcodes)):
+        A.add_word(bc,idx)
+    print(f'\n=====\nAdding reverse compliment barcodes...')
+    for idx,bc in tqdm(enumerate(barcodes), total=len(barcodes)):
+        A.add_word(rev_compl(bc),-idx)
+    print(f'\n=====\nBuilding Aho-Corasick automaton...')
+    A.make_automaton()    
+
+
+    if args.input.endswith('.gz'):
+        infile = gzip.open(args.input, 'rt')
+    else:
+        infile = open(args.input)
+    print(f'\n=====\nMatching exact barcodes on long-reads: {args.input}')
+    C = Counter()
+    for l in tqdm(infile):
+        _,_,p,seg = l.rstrip('\n').split('\t')
+        if p=='NA':
+            continue
+        hits = tuple(A.iter(seg))
+        if len(hits) > 1:
+            continue
+        for _,bc in hits:
+            C[abs(bc)]+=1
+    print(f'\n=====\nFound {len(C):,} unique barcodes on long-reads') 
+    sorted_bc = sorted(C.items(), reverse=True, key=lambda x: x[1])[:args.max_barcode_cnt]
+    
+    total = sum(c for _,c in sorted_bc)
+    for last_idx in range(0, len(sorted_bc), args.step_size):
+        percent = sum(c for _,c in sorted_bc[last_idx:last_idx+args.step_size])/total
+        if percent < args.thresh:
+            break
+    sorted_bc = sorted_bc[:last_idx+args.step_size]
+
+    print(f"\n=====\nWriting the top {len(sorted_bc)} barcodes")
+    if args.outfile:
+        outfile = gzip.open(args.outfile, 'wt+')
+    else:
+        outfile = sys.stdout
+    for bc,c in tqdm(sorted_bc):
+        outfile.write(f'{barcodes[bc]}\t{c}\n')
+    outfile.close()
+
 
 def main():
     args = parse_args()
@@ -778,8 +852,11 @@ def main():
     if args.subcommand == 'extract_sr_bc':
         extract_sr_bc(args)
 
+    if args.subcommand == 'extract_sr_bc_from_lr':
+        extract_sr_bc_from_lr(args)
+
     if args.subcommand == 'match_trie':
-        match_tire(args)
+        match_trie(args)
 
 
 if __name__ == "__main__":