From 7def8cae7cab1a9dcaabfc43f209b14edcc14f80 Mon Sep 17 00:00:00 2001 From: Jesse Connell Date: Sat, 6 Jan 2024 17:33:47 -0500 Subject: [PATCH] For #69: draft cut feature --- igseq/__main__.py | 47 +++++++++++++++++++++ igseq/cut.py | 102 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 igseq/cut.py diff --git a/igseq/__main__.py b/igseq/__main__.py index 78885b1..dd4fbc5 100644 --- a/igseq/__main__.py +++ b/igseq/__main__.py @@ -22,6 +22,8 @@ from . import msa from . import tree from . import show +from . import cut +from .cut import AIRR_REGIONS from .util import IgSeqError from .version import __version__ @@ -266,6 +268,20 @@ def _main_tree(args): colmap=colmap, dry_run=args.dry_run) +def _main_cut(args): + colmap = args_to_colmap(args) + cut.cut( + ref_paths=args.reference, + query=args.input, + output=args.output, + regions=args.regions, + species=args.species, + fmt_in=args.input_format, + fmt_out=args.output_format, + colmap=colmap, + dry_run=args.dry_run, + threads=args.threads) + def _setup_log(verbose, quiet, prefix): # Handle warnings via logging logging.captureWarnings(True) @@ -348,6 +364,11 @@ def __setup_arg_parser(): help="list builtin reference data files", description=rewrap(show.__doc__), formatter_class=argparse.RawDescriptionHelpFormatter) + cut_extra_help = "\nAntibody regions recognized: " + ", ".join(AIRR_REGIONS) + p_cut = subps.add_parser("cut", + help="cut sequences to specific region(s)", + description=rewrap(cut.__doc__ + cut_extra_help), + formatter_class=argparse.RawDescriptionHelpFormatter) __add_common_args(p_get) p_get.add_argument("input", help="one Illumina run directory") @@ -601,6 +622,32 @@ def __setup_arg_parser(): "This can be given multiple times for multiple set/color pairs.") p_tree.set_defaults(func=_main_tree) + __add_common_args(p_cut) + p_cut.add_argument("input", + help="input file path, or a literal '-' for standard input") + p_cut.add_argument("output", + help="output file path, or a literal '-' for standard output") + p_cut.add_argument("--input-format", + help="format of input " + "(default: detect from input filename if possible)") + p_cut.add_argument("--output-format", + help="format of output " + "(default: detect from output filename if possible)") + p_cut.add_argument("--col-seq-id", + help="Name of column containing sequence IDs (for tabular input/output)") + p_cut.add_argument("--col-seq", + help="Name of column containing sequences (for tabular input/output)") + p_cut.add_argument("-R", "--regions", required=True, help=("Antibody region(s) to include. " + "To span two regions, separate by - or :")) + p_cut.add_argument("-r", "--reference", nargs="+", + help="one or more FASTA/directory/builtin names pointing to V/D/J FASTA files") + p_cut.add_argument("-S", "--species", + help="species to use (human or rhesus). Default: infer from database if possible") + p_cut.add_argument("-t", "--threads", type=int, default=1, + help="number of threads for parallel processing (default: 1)") + p_cut.set_defaults(func=_main_cut) + + return parser def __add_common_args(obj): diff --git a/igseq/cut.py b/igseq/cut.py new file mode 100644 index 0000000..dd15b0f --- /dev/null +++ b/igseq/cut.py @@ -0,0 +1,102 @@ +""" +Extract subsequences based on AIRR region via IgBLAST. + +Input and output can be in any file format supported by the convert command and +can be given as "-" for standard input/output. + + igseq cut -S rhesus -R cdr3 seqs.fa cdr3.fa + igseq cut -S rhesus -R fwr1-fwr3 V.fa cysTruncated.fa +""" + +import re +import logging +from csv import DictReader +from . import util +from . import igblast +from . import vdj +from .record import RecordWriter + +LOGGER = logging.getLogger(__name__) + +# These things have _start and _end columns defined that specify start and end +# positions relative to a query sequence +# https://docs.airr-community.org/en/stable/datarep/rearrangements.html +AIRR_REGIONS = [ + "v_sequence", + "d_sequence", + "j_sequence", + "c_sequence", + "cdr1", + "cdr2", + "cdr3", + "fwr1", + "fwr2", + "fwr3", + "fwr4", + ] + +def cut( + ref_paths, query, output, regions, + species=None, fmt_in=None, fmt_out=None, colmap=None, dry_run=False, threads=1): + """Extract subsequences based on AIRR region via IgBLAST""" + LOGGER.info("given ref path(s): %s", ref_paths) + LOGGER.info("given query path: %s", query) + LOGGER.info("given output: %s", output) + LOGGER.info("given regions: %s", regions) + LOGGER.info("given species: %s", species) + LOGGER.info("given input format: %s", fmt_in) + LOGGER.info("given colmap: %s", colmap) + LOGGER.info("given threads: %s", threads) + if species and not ref_paths: + # If only species is given, default to using all available reference + # sets for that species + ref_paths = [igblast.fuzzy_species_match(species)] + LOGGER.info("inferred ref path: %s", ref_paths[0]) + attrs_list = vdj.parse_vdj_paths(ref_paths) + species_det = {attrs.get("species") for attrs in attrs_list} + species_det = {s for s in species_det if s} + organism = igblast.detect_organism(species_det, species) + attrs_list_grouped = vdj.group(attrs_list) + for key, attrs_group in attrs_list_grouped.items(): + LOGGER.info("detected %s references: %d", key, len(attrs_group)) + if len(attrs_group) == 0: + raise util.IgSeqError(f"No references for segment {key}") + + parts = parse_regions(regions) + LOGGER.info("parsed regions: {parts}") + + if not dry_run: + with igblast.setup_db_dir( + [attrs["path"] for attrs in attrs_list]) as (db_dir, _): + with igblast.run_igblast( + db_dir, organism, query, threads, + fmt_in, colmap, extra_args=["-outfmt", "19"]) as proc: + reader = DictReader(proc.stdout, delimiter="\t") + with RecordWriter(output, fmt_out, colmap, dry_run=dry_run) as writer: + for row in reader: + # TODO what should happen if a region isn't present at + # all? Fall back on nearest position that does exist? + # TODO should this also handle the non-positioned based + # entries like np1 and junction? (But then, what to do + # with something like "np1-J"?) + pos_start = int(row[parts[0] + "_start"]) + pos_end = int(row[parts[1] + "_end"]) + seq = row["sequence"][pos_start-1:pos_end] + record = {"sequence_id": row["sequence_id"], "sequence": seq} + writer.write(record) + +def parse_regions(regions): + parts = [p.lower() for p in re.split(r"[-:]", regions, 1)] + # support implicit ..._sequence + for idx, part in enumerate(parts): + if f"{part}_sequence" in AIRR_REGIONS: + parts[idx] = f"{part}_sequence" + if len(parts) == 1: + parts *= 2 + for part in parts: + if part not in AIRR_REGIONS: + raise util.IgSeqError(f"region \"{part}\" not recognized") + if AIRR_REGIONS.index(parts[0]) > AIRR_REGIONS.index(parts[1]): + LOGGER.warning("regions out of order; will reverse automatically: %s", parts) + parts.reverse() + return parts