From 8de77bb2a17f0b995b43301a40a4bd8fa3aa5599 Mon Sep 17 00:00:00 2001 From: Xiao Chen Date: Thu, 18 Apr 2024 11:42:55 -0700 Subject: [PATCH] Version 3.1.1 (#20) Fix program error in low-depth or no-data regions. Completes analysis even when the input is a small bamlet (result is still a no-call). --- README.md | 2 +- paraphase/__init__.py | 2 +- paraphase/genes/cfhclust.py | 16 ++++++++++++---- paraphase/genome_depth.py | 38 ++++++++++++++++++++++++++----------- paraphase/paraphase.py | 4 ++-- 5 files changed, 43 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index c4c3cd9..43a521a 100644 --- a/README.md +++ b/README.md @@ -82,7 +82,7 @@ Please note that the input BAM should be one that's aligned to the ENTIRE refere Optional parameters: - `-g`: Region(s) to analyze, separated by comma. All supported [regions](docs/regions.md) will be analyzed if not specified. Please use region name, i.e. first column in the doc. - `-t`: Number of threads. -- `-p`: Prefix of output files when the input is a single sample, i.e. use with `-b`. If not provided, prefix will be extracted from the name of the input BAM. +- `-p`: Prefix of output files when the input is a single sample, i.e. use with `-b`. If not provided, prefix will be extracted from the header of the input BAM. - `--genome`: Genome reference build. Default is `38`. If `37` or `19` is specified, Paraphase will run the analysis for GRCh37 or hg19, respectively (note that only 11 medically relevant [regions](docs/regions.md) are supported now for GRCh37/hg19). - `--gene1only`: If specified, variants calls will be made against the main gene only for SMN1, PMS2, STRC, NCF1 and IKBKG, see more information [here](docs/vcf.md). - `--novcf`: If specified, no VCF files will be produced. diff --git a/paraphase/__init__.py b/paraphase/__init__.py index f5f41e5..d539d50 100755 --- a/paraphase/__init__.py +++ b/paraphase/__init__.py @@ -1 +1 @@ -__version__ = "3.1.0" +__version__ = "3.1.1" diff --git a/paraphase/genes/cfhclust.py b/paraphase/genes/cfhclust.py index 76e4bf2..895ab46 100644 --- a/paraphase/genes/cfhclust.py +++ b/paraphase/genes/cfhclust.py @@ -19,12 +19,20 @@ def __init__( def call(self): haps = {} - haps.update(self.cfh["final_haplotypes"]) - haps.update(self.cfhr3["final_haplotypes"]) fusions = {} - fusions.update(self.cfh["fusions_called"]) - fusions.update(self.cfhr3["fusions_called"]) total_cn = None + if ( + self.cfh["final_haplotypes"] is not None + and self.cfhr3["final_haplotypes"] is not None + ): + haps.update(self.cfh["final_haplotypes"]) + haps.update(self.cfhr3["final_haplotypes"]) + if ( + self.cfh["fusions_called"] is not None + and self.cfhr3["fusions_called"] is not None + ): + fusions.update(self.cfh["fusions_called"]) + fusions.update(self.cfhr3["fusions_called"]) if self.cfh["total_cn"] is not None and self.cfhr3["total_cn"] is not None: total_cn = min(self.cfh["total_cn"], self.cfhr3["total_cn"]) if ( diff --git a/paraphase/genome_depth.py b/paraphase/genome_depth.py index 1fe6333..7269a85 100755 --- a/paraphase/genome_depth.py +++ b/paraphase/genome_depth.py @@ -4,6 +4,9 @@ import numpy as np import pysam +import logging + +logging.basicConfig(level=logging.INFO) class GenomeDepth: @@ -42,10 +45,15 @@ def get_genome_depth(self): nchr = nchr.replace("chr", "") pos1 = int(at[1]) for pos in [pos1, pos1 + 1600]: - site_depth = self._bamh.count( - nchr, pos - 1, pos, read_callback="all" - ) - depth.append(site_depth) + try: + site_depth = self._bamh.count( + nchr, pos - 1, pos, read_callback="all" + ) + depth.append(site_depth) + except Exception: + logging.info( + f"This BAM does not have data at {nchr}:{pos}, which is used for determining sample coverage." + ) self.mdepth = np.median(depth) self.mad = np.median([abs(a - self.mdepth) for a in depth]) / self.mdepth @@ -64,15 +72,23 @@ def check_sex(self): if self.chr_in_name is False: nchr = nchr.replace("chr", "") pos1 = int(at[1]) - site_depth = self._bamh.count( - nchr, pos1 - 1, pos1, read_callback="all" - ) - if "X" in nchr: - self.x.append(site_depth) - elif "Y" in nchr: - self.y.append(site_depth) + try: + site_depth = self._bamh.count( + nchr, pos1 - 1, pos1, read_callback="all" + ) + if "X" in nchr: + self.x.append(site_depth) + elif "Y" in nchr: + self.y.append(site_depth) + except Exception: + logging.info( + f"This BAM does not have data at {nchr}:{pos1}, which is used for determining sample sex." + ) cov_x, cov_y = np.median(self.x), np.median(self.y) self._bamh.close() + + if np.isnan(cov_x) or np.isnan(cov_y) or cov_x == 0: + return None if cov_y / cov_x < 0.05: return "female" elif cov_y / cov_x > 0.1: diff --git a/paraphase/paraphase.py b/paraphase/paraphase.py index c156dc1..35bc566 100755 --- a/paraphase/paraphase.py +++ b/paraphase/paraphase.py @@ -28,7 +28,7 @@ from paraphase import genes from .phaser import Phaser -logging.basicConfig(level=logging.DEBUG) +logging.basicConfig(level=logging.INFO) class Paraphase: @@ -590,7 +590,7 @@ def load_parameters(self): "-p", "--prefix", help="Prefix of output files for a single sample. Used with -b.\n" - + "If not provided, prefix will be extracted from the name of the input BAM.\n", + + "If not provided, prefix will be extracted from the header of the input BAM.\n", required=False, ) parser.add_argument(