From 7def8cae7cab1a9dcaabfc43f209b14edcc14f80 Mon Sep 17 00:00:00 2001
From: Jesse Connell <ancon@upenn.edu>
Date: Sat, 6 Jan 2024 17:33:47 -0500
Subject: [PATCH] For #69: draft cut feature

---
 igseq/__main__.py |  47 +++++++++++++++++++++
 igseq/cut.py      | 102 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 149 insertions(+)
 create mode 100644 igseq/cut.py

diff --git a/igseq/__main__.py b/igseq/__main__.py
index 78885b1..dd4fbc5 100644
--- a/igseq/__main__.py
+++ b/igseq/__main__.py
@@ -22,6 +22,8 @@
 from . import msa
 from . import tree
 from . import show
+from . import cut
+from .cut import AIRR_REGIONS
 from .util import IgSeqError
 from .version import __version__
 
@@ -266,6 +268,20 @@ def _main_tree(args):
         colmap=colmap,
         dry_run=args.dry_run)
 
+def _main_cut(args):
+    colmap = args_to_colmap(args)
+    cut.cut(
+        ref_paths=args.reference,
+        query=args.input,
+        output=args.output,
+        regions=args.regions,
+        species=args.species,
+        fmt_in=args.input_format,
+        fmt_out=args.output_format,
+        colmap=colmap,
+        dry_run=args.dry_run,
+        threads=args.threads)
+
 def _setup_log(verbose, quiet, prefix):
     # Handle warnings via logging
     logging.captureWarnings(True)
@@ -348,6 +364,11 @@ def __setup_arg_parser():
         help="list builtin reference data files",
         description=rewrap(show.__doc__),
         formatter_class=argparse.RawDescriptionHelpFormatter)
+    cut_extra_help = "\nAntibody regions recognized: " + ", ".join(AIRR_REGIONS)
+    p_cut = subps.add_parser("cut",
+        help="cut sequences to specific region(s)",
+        description=rewrap(cut.__doc__ + cut_extra_help),
+        formatter_class=argparse.RawDescriptionHelpFormatter)
 
     __add_common_args(p_get)
     p_get.add_argument("input", help="one Illumina run directory")
@@ -601,6 +622,32 @@ def __setup_arg_parser():
         "This can be given multiple times for multiple set/color pairs.")
     p_tree.set_defaults(func=_main_tree)
 
+    __add_common_args(p_cut)
+    p_cut.add_argument("input",
+        help="input file path, or a literal '-' for standard input")
+    p_cut.add_argument("output",
+        help="output file path, or a literal '-' for standard output")
+    p_cut.add_argument("--input-format",
+        help="format of input "
+        "(default: detect from input filename if possible)")
+    p_cut.add_argument("--output-format",
+        help="format of output "
+        "(default: detect from output filename if possible)")
+    p_cut.add_argument("--col-seq-id",
+        help="Name of column containing sequence IDs (for tabular input/output)")
+    p_cut.add_argument("--col-seq",
+        help="Name of column containing sequences (for tabular input/output)")
+    p_cut.add_argument("-R", "--regions", required=True, help=("Antibody region(s) to include. "
+        "To span two regions, separate by - or :"))
+    p_cut.add_argument("-r", "--reference", nargs="+",
+            help="one or more FASTA/directory/builtin names pointing to V/D/J FASTA files")
+    p_cut.add_argument("-S", "--species",
+            help="species to use (human or rhesus).  Default: infer from database if possible")
+    p_cut.add_argument("-t", "--threads", type=int, default=1,
+        help="number of threads for parallel processing (default: 1)")
+    p_cut.set_defaults(func=_main_cut)
+
+
     return parser
 
 def __add_common_args(obj):
diff --git a/igseq/cut.py b/igseq/cut.py
new file mode 100644
index 0000000..dd15b0f
--- /dev/null
+++ b/igseq/cut.py
@@ -0,0 +1,102 @@
+"""
+Extract subsequences based on AIRR region via IgBLAST.
+
+Input and output can be in any file format supported by the convert command and
+can be given as "-" for standard input/output.
+
+    igseq cut -S rhesus -R cdr3 seqs.fa cdr3.fa
+    igseq cut -S rhesus -R fwr1-fwr3 V.fa cysTruncated.fa
+"""
+
+import re
+import logging
+from csv import DictReader
+from . import util
+from . import igblast
+from . import vdj
+from .record import RecordWriter
+
+LOGGER = logging.getLogger(__name__)
+
+# These things have _start and _end columns defined that specify start and end
+# positions relative to a query sequence
+# https://docs.airr-community.org/en/stable/datarep/rearrangements.html
+AIRR_REGIONS = [
+    "v_sequence",
+    "d_sequence",
+    "j_sequence",
+    "c_sequence",
+    "cdr1",
+    "cdr2",
+    "cdr3",
+    "fwr1",
+    "fwr2",
+    "fwr3",
+    "fwr4",
+    ]
+
+def cut(
+        ref_paths, query, output, regions,
+        species=None, fmt_in=None, fmt_out=None, colmap=None, dry_run=False, threads=1):
+    """Extract subsequences based on AIRR region via IgBLAST"""
+    LOGGER.info("given ref path(s): %s", ref_paths)
+    LOGGER.info("given query path: %s", query)
+    LOGGER.info("given output: %s", output)
+    LOGGER.info("given regions: %s", regions)
+    LOGGER.info("given species: %s", species)
+    LOGGER.info("given input format: %s", fmt_in)
+    LOGGER.info("given colmap: %s", colmap)
+    LOGGER.info("given threads: %s", threads)
+    if species and not ref_paths:
+        # If only species is given, default to using all available reference
+        # sets for that species
+        ref_paths = [igblast.fuzzy_species_match(species)]
+        LOGGER.info("inferred ref path: %s", ref_paths[0])
+    attrs_list = vdj.parse_vdj_paths(ref_paths)
+    species_det = {attrs.get("species") for attrs in attrs_list}
+    species_det = {s for s in species_det if s}
+    organism = igblast.detect_organism(species_det, species)
+    attrs_list_grouped = vdj.group(attrs_list)
+    for key, attrs_group in attrs_list_grouped.items():
+        LOGGER.info("detected %s references: %d", key, len(attrs_group))
+        if len(attrs_group) == 0:
+            raise util.IgSeqError(f"No references for segment {key}")
+
+    parts = parse_regions(regions)
+    LOGGER.info("parsed regions: {parts}")
+
+    if not dry_run:
+        with igblast.setup_db_dir(
+            [attrs["path"] for attrs in attrs_list]) as (db_dir, _):
+            with igblast.run_igblast(
+                    db_dir, organism, query, threads,
+                    fmt_in, colmap, extra_args=["-outfmt", "19"]) as proc:
+                reader = DictReader(proc.stdout, delimiter="\t")
+                with RecordWriter(output, fmt_out, colmap, dry_run=dry_run) as writer:
+                    for row in reader:
+                        # TODO what should happen if a region isn't present at
+                        # all?  Fall back on nearest position that does exist?
+                        # TODO should this also handle the non-positioned based
+                        # entries like np1 and junction?  (But then, what to do
+                        # with something like "np1-J"?)
+                        pos_start = int(row[parts[0] + "_start"])
+                        pos_end = int(row[parts[1] + "_end"])
+                        seq = row["sequence"][pos_start-1:pos_end]
+                        record = {"sequence_id": row["sequence_id"], "sequence": seq}
+                        writer.write(record)
+
+def parse_regions(regions):
+    parts = [p.lower() for p in re.split(r"[-:]", regions, 1)]
+    # support implicit ..._sequence
+    for idx, part in enumerate(parts):
+        if f"{part}_sequence" in AIRR_REGIONS:
+            parts[idx] = f"{part}_sequence"
+    if len(parts) == 1:
+        parts *= 2
+    for part in parts:
+        if part not in AIRR_REGIONS:
+            raise util.IgSeqError(f"region \"{part}\" not recognized")
+    if AIRR_REGIONS.index(parts[0]) > AIRR_REGIONS.index(parts[1]):
+        LOGGER.warning("regions out of order; will reverse automatically: %s", parts)
+        parts.reverse()
+    return parts