Skip to content

Commit

Permalink
Option to generate cell bc whitelist from LRs
Browse files Browse the repository at this point in the history
  • Loading branch information
baraaorabi committed Dec 15, 2022
1 parent afd7f74 commit 13038a5
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 14 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ To run this step, you can use the following command.
The second step is to extract the top short-reads barcodes that cover most of the reads.

```
./scTagger.py extract_top_sr_bc -i "path/to/bam/file" -o "path/to/output/file" -p "path/to/output/plot"
./scTagger.py extract_sr_bc -i "path/to/bam/file" -o "path/to/output/file" -p "path/to/output/plot"
```

**Arguments**
Expand Down
41 changes: 31 additions & 10 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,11 @@ clrg_d = f"{os.path.abspath(outpath)}/cellranger-out"
rule all:
input:
expand(
f"{clrg_d}/{{sample}}/{{sample}}/outs/possorted_genome_bam.bam",
f"{outpath}/{{sample}}/{{sample}}.lr_bc_matches.tsv.gz",
sample=config["samples"],
),
expand(
f"{outpath}/{{sample}}/{{sample}}.sr_bc.tsv.gz",
sample=config["samples"]
),
expand(
f"{outpath}/{{sample}}/{{sample}}.lr_bc.tsv.gz",
sample=config["samples"]
),
expand(
f"{outpath}/{{sample}}/{{sample}}.lr_bc_matches.tsv.gz",
f"{outpath}/{{sample}}/{{sample}}.lr_bc_from_lr_matches.tsv.gz",
sample=config["samples"],
),

Expand Down Expand Up @@ -96,6 +88,35 @@ rule extract_lr_bc:
shell:
"{params.script} extract_lr_bc -r {input.reads} -o {output.tsv} -t {threads}"

rule extract_sr_bc_from_lr:
input:
tsv=f"{outpath}/{{sample}}/{{sample}}.lr_bc.tsv.gz",
wl=lambda wildcards: config["samples"][wildcards.sample]["whiltlist"],
output:
tsv=f"{outpath}/{{sample}}/{{sample}}.sr_bc_from_lr.tsv.gz",
params:
script=config["exec"]["scTagger"],
resources:
mem="16G",
time=59,
shell:
"{params.script} extract_sr_bc_from_lr -i {input.tsv} -wl {input.wl} -o {output.tsv}"


rule match_trie_from_lr:
input:
lr_tsv=f"{outpath}/{{sample}}/{{sample}}.lr_bc.tsv.gz",
sr_tsv=f"{outpath}/{{sample}}/{{sample}}.sr_bc_from_lr.tsv.gz",
output:
lr_tsv=f"{outpath}/{{sample}}/{{sample}}.lr_bc_from_lr_matches.tsv.gz",
params:
script=config["exec"]["scTagger"],
threads: 32
resources:
mem="64G",
time=60 * 5 - 1,
shell:
"{params.script} match_trie -lr {input.lr_tsv} -sr {input.sr_tsv} -o {output.lr_tsv} -t {threads}"

rule match_trie:
input:
Expand Down
2 changes: 2 additions & 0 deletions config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ exec:
samples:
s1:
ref: homo_sapiens
whiltlist:
test/refs/3M-february-2018.txt.gz
lr_fastqs:
- test/data/lr/TestSample_pass_fastq_concat.fastq
sr:
Expand Down
1 change: 1 addition & 0 deletions env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ dependencies:
- tqdm
- pandas
- pysam
- pyahocorasick
83 changes: 80 additions & 3 deletions scTagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

import edlib
import pysam
import ahocorasick


def parse_args():
Expand Down Expand Up @@ -49,7 +50,7 @@ def parse_args():

parser_top_sr = subparsers.add_parser('extract_sr_bc')
parser_top_sr.add_argument("-i", "--input", type=str, required=True,
help="Input file")
help="Input BAM file")
parser_top_sr.add_argument("-o", "--outfile", type=str, default=None,
help="Path to output file. Default: STDOUT")
parser_top_sr.add_argument("-p", "--plotfile", type=str, default=None,
Expand All @@ -63,6 +64,20 @@ def parse_args():
parser_top_sr.add_argument("--max-barcode-cnt", type=int, default=25_000,
help="Max number of barcodes to keep. Default: 25000")

parser_sr_bc_from_lr = subparsers.add_parser('extract_sr_bc_from_lr')
parser_sr_bc_from_lr.add_argument("-i", "--input", type=str, required=True,
help="Input TSV file generated from extract_lr_bc step")
parser_sr_bc_from_lr.add_argument("-o", "--outfile", type=str, default=None,
help="Path to output file. Default: STDOUT")
parser_sr_bc_from_lr.add_argument("-wl", "--barcode-whitelist", type=str, required=True,
help="Path to TXT barcode whiltelist such as 10x Chromimum v3 3M-february-2018.txt.gz barcode file")
parser_sr_bc_from_lr.add_argument("--thresh", type=float, default=0.005,
help="Percentage theshold required per step to continue adding read barcodes. Default: 0.005")
parser_sr_bc_from_lr.add_argument("--step-size", type=int, default=1000,
help="Number of barcodes processed at a time and whose sum is used to check against the theshold. Default: 1000")
parser_sr_bc_from_lr.add_argument("--max-barcode-cnt", type=int, default=25_000,
help="Max number of barcodes to keep. Default: 25000")

parser_match_trie = subparsers.add_parser('match_trie')
parser_match_trie.add_argument("-lr", "--long-read-segments", type=str,
required=True, help="Long-read segments TSV file")
Expand Down Expand Up @@ -117,6 +132,11 @@ def parse_args():
assert 0 < args.step_size
assert 0 < args.max_barcode_cnt

if args.subcommand == 'extract_sr_bc_from_lr':
assert 0 <= args.thresh <= 1
assert 0 < args.step_size
assert 0 < args.max_barcode_cnt

if args.subcommand == 'match_trie':
assert args.mem > 0
assert args.barcode_length > 0
Expand Down Expand Up @@ -733,7 +753,7 @@ def show_plot_match_trie(full_data, plotfile, max_error):
plt.savefig(plotfile)


def match_tire(args):
def match_trie(args):
read_sr_barcodes(args.short_read_barcodes)
barcode_lens = {len(b) for b in barcodes}
assert barcode_lens == {args.barcode_length}, barcode_lens
Expand Down Expand Up @@ -767,6 +787,60 @@ def match_tire(args):
f'{",".join([barcodes[b] if s else rev_compl(barcodes[b]) for b, s in sorted(bids)])}\n')
outfile.close()

def extract_sr_bc_from_lr(args):
if args.barcode_whitelist.endswith('.gz'):
infile = gzip.open(args.barcode_whitelist, 'rt')
else:
infile = open(args.barcode_whitelist)
print(f'Reading whiltelist barcodes from: {args.barcode_whitelist}')
barcodes = [l[:-1] for l in tqdm(infile)]
infile.close()

A = ahocorasick.Automaton()
print(f'\n=====\nAdding forward barcodes...')
for idx,bc in tqdm(enumerate(barcodes), total=len(barcodes)):
A.add_word(bc,idx)
print(f'\n=====\nAdding reverse compliment barcodes...')
for idx,bc in tqdm(enumerate(barcodes), total=len(barcodes)):
A.add_word(rev_compl(bc),-idx)
print(f'\n=====\nBuilding Aho-Corasick automaton...')
A.make_automaton()


if args.input.endswith('.gz'):
infile = gzip.open(args.input, 'rt')
else:
infile = open(args.input)
print(f'\n=====\nMatching exact barcodes on long-reads: {args.input}')
C = Counter()
for l in tqdm(infile):
_,_,p,seg = l.rstrip('\n').split('\t')
if p=='NA':
continue
hits = tuple(A.iter(seg))
if len(hits) > 1:
continue
for _,bc in hits:
C[abs(bc)]+=1
print(f'\n=====\nFound {len(C):,} unique barcodes on long-reads')
sorted_bc = sorted(C.items(), reverse=True, key=lambda x: x[1])[:args.max_barcode_cnt]

total = sum(c for _,c in sorted_bc)
for last_idx in range(0, len(sorted_bc), args.step_size):
percent = sum(c for _,c in sorted_bc[last_idx:last_idx+args.step_size])/total
if percent < args.thresh:
break
sorted_bc = sorted_bc[:last_idx+args.step_size]

print(f"\n=====\nWriting the top {len(sorted_bc)} barcodes")
if args.outfile:
outfile = gzip.open(args.outfile, 'wt+')
else:
outfile = sys.stdout
for bc,c in tqdm(sorted_bc):
outfile.write(f'{barcodes[bc]}\t{c}\n')
outfile.close()


def main():
args = parse_args()
Expand All @@ -778,8 +852,11 @@ def main():
if args.subcommand == 'extract_sr_bc':
extract_sr_bc(args)

if args.subcommand == 'extract_sr_bc_from_lr':
extract_sr_bc_from_lr(args)

if args.subcommand == 'match_trie':
match_tire(args)
match_trie(args)


if __name__ == "__main__":
Expand Down

0 comments on commit 13038a5

Please sign in to comment.