Robaina · Robaina · Jan 25, 2023 · Jan 23, 2023 · Jan 24, 2023 · Jan 24, 2023
diff --git a/pynteny/__init__.py b/pynteny/__init__.py
@@ -1,2 +1,2 @@
-from pynteny.api import *
+from pynteny.api import Command, Search, Build, Download
 from pynteny.cli import main
diff --git a/pynteny/api.py b/pynteny/api.py
@@ -24,10 +24,11 @@
 
 
 class Command:
-    """Parent class for Pynteny command"""
+    """
+    Parent class for Pynteny commands.
 
-    def __init__(self):
-        """Parent class for Pynteny command"""
+    args: CommandArgs
+    """
 
     def _repr_html_(self):
         """Executed by Jupyter to print Author and version in html"""

diff --git a/pynteny/app/helpers.py b/pynteny/app/helpers.py
@@ -12,7 +12,7 @@
 from pynteny.utils import ConfigParser
 
 
-parent_dir = Path(Path(__file__).parent)
+parent_dir = Path(__file__).parent
 
 
 class FileManager:

diff --git a/pynteny/app/main_page.py b/pynteny/app/main_page.py
@@ -9,7 +9,7 @@
 from pynteny.app.components import Sidebar, Mainpage
 
 
-parent_dir = Path(Path(__file__).parent)
+parent_dir = Path(__file__).parent
 meta = metadata.metadata("pynteny")
 __version__ = meta["Version"]
 __author__ = meta["Author"]

diff --git a/pynteny/cli.py b/pynteny/cli.py
@@ -89,7 +89,7 @@ def _generate_cool_quotes(self):
             "One does not simply walk into Mordor (J.R.R. Tolkien)",
             "Damn, looks like a rainy day, let's do bioiformatics! (SR)",
         ]
-        return f"{random.choice(quotes)}\n" " "
+        return f"{random.choice(quotes)}\n"
 
     def _call_subcommand(self, subcommand_name: str) -> None:
         subcommand = getattr(self, subcommand_name)
@@ -182,16 +182,16 @@ def search() -> argparse.ArgumentParser:
             type=str,
             required=True,
             help=(
-                f"string displaying hmm structure to search for, such as: \n"
-                f" \n"
-                f"'>hmm_a n_ab <hmm_b n_bc hmm_c'\n"
-                f" \n"
-                f"where '>' indicates a hmm target located on the positive strand, \n"
-                f"'<' a target located on the negative strand, and n_ab cooresponds \n"
-                f"to the maximum number of genes separating matched genes a and b. \n"
-                f"Multiple hmms may be employed. \n"
-                f"No order symbol in a hmm indicates that results should be independent \n"
-                f"of strand location. "
+                "string displaying hmm structure to search for, such as: \n"
+                " \n"
+                "'>hmm_a n_ab <hmm_b n_bc hmm_c'\n"
+                " \n"
+                "where '>' indicates a hmm target located on the positive strand, \n"
+                "'<' a target located on the negative strand, and n_ab cooresponds \n"
+                "to the maximum number of genes separating matched genes a and b. \n"
+                "Multiple hmms may be employed. \n"
+                "No order symbol in a hmm indicates that results should be independent \n"
+                "of strand location. "
             ),
         )
         required.add_argument(
@@ -458,7 +458,7 @@ def download() -> argparse.ArgumentParser:
         )
 
         optional = parser._action_groups.pop()
-        required = parser.add_argument_group("required arguments")
+        # No "required arguments" group is created for the download parser.
         parser._action_groups.append(optional)
 
         optional.add_argument(

diff --git a/pynteny/filter.py b/pynteny/filter.py
@@ -38,20 +38,14 @@ def __init__(self, synteny_structure: str, unordered: bool = False) -> None:
                 gene found by hmm_a and gene found by hmm_b, and hmm_ corresponds
                 to the name of the hmm as provided in the keys of hmm_hits.
                 More than two hmms can be concatenated. Strand location may be
-                specificed by using '>' for sense and '<' for antisense.
+                specified by using '>' for sense and '<' for antisense.
             unordered (bool, optional): whether the HMMs should be arranged in the
                 exact same order displayed in the synteny_structure or in
-                any order If ordered, the filters would filter collinear rather
+                any order. If ordered, the filters would filter collinear rather
                 than syntenic structures. Defaults to False.
         """
         parsed_structure = SyntenyParser.parse_synteny_structure(synteny_structure)
-        hmm_order_dict = dict(
-            zip(
-                parsed_structure["hmm_groups"],
-                range(len(parsed_structure["hmm_groups"])),
-            )
-        )
-        hmm_codes = list(hmm_order_dict.values())
+        hmm_codes = list(range(len(parsed_structure["hmm_groups"])))
         self.hmm_code_order_pattern = hmm_codes
 
         if unordered:
@@ -128,7 +122,7 @@ def contains_strand_pattern(self, data: pd.Series) -> int:
                 strand_comparisons.append(data_strand == pattern_strand)
             else:
                 strand_comparisons.append(True)
-        return 1 if all(strand_comparisons) == True else 0
+        return 1 if all(strand_comparisons) else 0
 
 
 class SyntenyHMMfilter:
@@ -257,10 +251,10 @@ def get_all_HMM_hits(self) -> pd.DataFrame:
             .filter(lambda x: len(x) >= self._n_hmm_groups)
             .sort_values(["contig", "gene_pos"], ascending=True)
         )
-        all_hit_labels.reset_index(drop=True, inplace=True)
+        all_hit_labels = all_hit_labels.reset_index(drop=True)
         if self._contains_hmm_groups:
             all_hit_labels = self._merge_hits_by_HMM_group(all_hit_labels)
-        all_hit_labels.reset_index(drop=True, inplace=True)
+        all_hit_labels = all_hit_labels.reset_index(drop=True)
         return self._add_meta_codes_to_HMM_hits(all_hit_labels)
 
     def filter_hits_by_synteny_structure(self) -> dict:
@@ -296,7 +290,7 @@ def filter_hits_by_synteny_structure(self) -> dict:
                 hmm_group: [] for hmm_group in contig_hits.hmm.unique()
             }
 
-            if len(contig_hits.hmm.unique()) >= self._n_hmm_groups:
+            if contig_hits.hmm.nunique() >= self._n_hmm_groups:
 
                 hmm_match = contig_hits.hmm_code.rolling(
                     window=self._n_hmm_groups
@@ -313,7 +307,7 @@ def filter_hits_by_synteny_structure(self) -> dict:
                     ]
                 else:
                     matched_rows = contig_hits[(hmm_match == 1) & (strand_match == 1)]
-                for i, _ in matched_rows.iterrows():
+                for i in matched_rows.index:
                     matched_hits = contig_hits.iloc[
                         i - (self._n_hmm_groups - 1) : i + 1, :
                     ]
@@ -379,7 +373,8 @@ def from_hits_dict(cls, hits_by_contig: dict) -> SyntenyHits:
         """
         return cls(cls._hits_to_dataframe(hits_by_contig))
 
-    def get_synteny_hits(self) -> pd.DataFrame:
+    @property
+    def hits(self) -> pd.DataFrame:
         """Return synteny hits.
 
         Returns:
@@ -401,14 +396,17 @@ def add_HMM_meta_info_to_hits(self, hmm_meta: Path) -> SyntenyHits:
             return self._synteny_hits
         pgap = PGAP(hmm_meta)
         self._synteny_hits[fields] = ""
-        for i, row in self._synteny_hits.iterrows():
+        # itertuples() rather than iterrows(): no per-row Series is built.
+        for row in self._synteny_hits.itertuples():
+            i = getattr(row, "Index")
+            hmm_group = getattr(row, "hmm")
             meta_values = [
                 [
                     str(v).replace("nan", "")
                     for k, v in pgap.get_meta_info_for_HMM(hmm).items()
                     if k != "#ncbi_accession"
                 ]
-                for hmm in row.hmm.split("|")
+                for hmm in hmm_group.split("|")
             ]
             self._synteny_hits.loc[i, fields] = ["|".join(v) for v in zip(*meta_values)]
         return SyntenyHits(self._synteny_hits)
@@ -532,11 +530,13 @@ def filter_FASTA_by_synteny_structure(
     if additional_args is None:
         additional_args = [None for _ in input_hmms]
 
-    if type(additional_args) == str:
+    # A single string is reused as the hmmsearch argument for every input HMM.
+    if isinstance(additional_args, str):
         logger.warning(f"Repeating hmmsearch arg: '{additional_args}' for all HMMs")
         additional_args = [additional_args for _ in input_hmms]
 
-    elif type(additional_args) == list:
+    # Lists are checked by length; a one-element list logs the same repetition warning.
+    elif isinstance(additional_args, list):
         if len(additional_args) == 1:
             logger.warning(
                 f"Repeating hmmsearch arg: '{additional_args[0]}' for all HMMs"

diff --git a/pynteny/hmm.py b/pynteny/hmm.py
@@ -103,9 +103,8 @@ def get_HMMER_tables(
         hmm_hits = {}
         for hmm_model, add_args in zip(self._input_hmms, self._additional_args):
             hmm_name = hmm_model.stem
-            hmmer_output = Path(
-                os.path.join(self._hmmer_output_dir, f"hmmer_output_{hmm_name}.txt")
-            )
+            hmmer_output = Path(self._hmmer_output_dir) / f"hmmer_output_{hmm_name}.txt"
+
             if not (reuse_hmmer_results and os.path.isfile(hmmer_output)):
                 wrappers.run_HMM_search(
                     hmm_model=hmm_model,
@@ -130,10 +129,20 @@ def __init__(self, meta_file: Path):
         Args:
             meta_file (Path): path to PGAP's metadata file.
         """
-        meta = pd.read_csv(str(meta_file), sep="\t")
-        meta = meta[
-            ["#ncbi_accession", "gene_symbol", "label", "product_name", "ec_numbers"]
-        ]
+        meta = pd.read_csv(
+            str(meta_file),
+            sep="\t",
+            usecols=[
+                "#ncbi_accession",
+                "gene_symbol",
+                "label",
+                "product_name",
+                "ec_numbers",
+            ],
+        )
+        # NOTE(review): usecols ignores list order and keeps the file's column
+        # order, unlike the former meta[[...]] selection; confirm that no
+        # caller depends on the column order of self._meta.
         self._meta = meta
         self._meta_file = meta_file
 
@@ -189,17 +198,14 @@ def get_HMM_names_by_gene_symbol(self, gene_symbol: str) -> list[str]:
             list[str]: list of HMM names matching gene symbol.
         """
         meta = self._meta  # .dropna(subset=["gene_symbol", "label"], axis=0)
-        try:
-            return meta[
-                (
-                    (meta.gene_symbol == gene_symbol)
-                    |
-                    # (meta.label.str.contains(gene_id))
-                    (meta.label == gene_symbol)
-                )
-            ]["#ncbi_accession"].values.tolist()
-        except:
-            return list()
+        return meta[
+            (
+                (meta.gene_symbol == gene_symbol)
+                |
+                # (meta.label.str.contains(gene_id))
+                (meta.label == gene_symbol)
+            )
+        ]["#ncbi_accession"].values.tolist()
 
     def get_HMM_group_for_gene_symbol(self, gene_symbol: str) -> str:
         """Get HMMs corresponding to gene symbol in PGAP metadata.
@@ -230,12 +236,7 @@ def get_HMM_gene_ID(self, hmm_name: str) -> list[str]:
             list[str]: list of gene symbols matching given HMM.
         """
         meta = self._meta.dropna(subset=["#ncbi_accession"], axis=0)
-        try:
-            return meta[meta["#ncbi_accession"] == hmm_name][
-                "gene_symbol"
-            ].values.tolist()
-        except:
-            return None
+        return meta[meta["#ncbi_accession"] == hmm_name]["gene_symbol"].values.tolist()
 
     def get_meta_info_for_HMM(self, hmm_name: str) -> dict:
         """Get meta info for given hmm.
@@ -249,11 +250,10 @@ def get_meta_info_for_HMM(self, hmm_name: str) -> dict:
         meta = self._meta.dropna(subset=["#ncbi_accession"], axis=0).applymap(
             lambda x: x if not pd.isna(x) else ""
         )
-        try:
-            return {
-                k: list(v.values())[0] if list(v.values())[0] else "undef"
-                for k, v in meta[meta["#ncbi_accession"] == hmm_name].to_dict().items()
-            }
-        except:
+        metadata = {
+            k: list(v.values())[0] if list(v.values())[0] else "undef"
+            for k, v in meta[meta["#ncbi_accession"] == hmm_name].to_dict().items()
+        }
+        if not metadata:
             logger.warning(f"No metadata for HMM: {hmm_name}")
-            return dict()
+        return metadata
diff --git a/pynteny/parser.py b/pynteny/parser.py
@@ -39,21 +39,26 @@ def parse(label: str) -> dict:
             "locus_pos": None,
             "strand": "",
         }
-        try:
-            entry = label.split("__")[0]
-            meta = label.split("__")[1]
-            strand = meta.split("_")[-1]
-            locus_pos = tuple([int(pos) for pos in meta.split("_")[-3:-1]])
-            gene_pos = int(meta.split("_")[-4])
-            contig = "_".join(meta.split("_")[:-4])
-
-            parsed_dict["gene_id"] = entry
-            parsed_dict["contig"] = contig
-            parsed_dict["gene_pos"] = gene_pos
-            parsed_dict["locus_pos"] = locus_pos
-            parsed_dict["strand"] = strand
-        except Exception:
-            pass
+
+        if label.count("__") > 1:
+            logger.error("Invalid format of record label string")
+            sys.exit(1)
+
+        entry = label.split("__")[0]
+        meta = label.split("__")[1]
+        meta_items = meta.split("_")
+
+        strand = meta_items[-1]
+        locus_pos = tuple([int(pos) for pos in meta_items[-3:-1]])
+        gene_pos = int(meta_items[-4])
+        contig = "_".join(meta_items[:-4])
+
+        parsed_dict["gene_id"] = entry
+        parsed_dict["contig"] = contig
+        parsed_dict["gene_pos"] = gene_pos
+        parsed_dict["locus_pos"] = locus_pos
+        parsed_dict["strand"] = strand
+
         return parsed_dict
 
     @staticmethod
@@ -107,7 +112,7 @@ def is_valid_structure(synteny_structure: str) -> bool:
     @staticmethod
     def split_strand_from_locus(
         locus_str: str, parsed_symbol: bool = True
-    ) -> tuple[str]:
+    ) -> tuple[str, ...]:
         """Split strand info from locus tag / HMM model.
 
         Args:
@@ -117,7 +122,7 @@ def split_strand_from_locus(
                 as 'pos' and '<' as 'neg'. Defaults to True.
 
         Returns:
-            tuple[str]: tuple with parsed strand info and gene symbol / HMM name.
+            tuple[str, ...]: tuple with parsed strand info and gene symbol / HMM name.
         """
         locus_str = locus_str.strip()
         if locus_str[0] == "<" or locus_str[0] == ">":