Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PyOpenSci REVIEW - minor updates #43

Merged
merged 10 commits into from
Jan 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pynteny/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from pynteny.api import *
from pynteny.api import Command, Search, Build, Download
from pynteny.cli import main
7 changes: 4 additions & 3 deletions pynteny/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,11 @@


class Command:
"""Parent class for Pynteny command"""
"""
Parent class for Pynteny command

def __init__(self):
"""Parent class for Pynteny command"""
args: CommandArgs
"""

def _repr_html_(self):
"""Executed by Jupyter to print Author and version in html"""
Expand Down
2 changes: 1 addition & 1 deletion pynteny/app/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from pynteny.utils import ConfigParser


parent_dir = Path(Path(__file__).parent)
parent_dir = Path(__file__).parent


class FileManager:
Expand Down
2 changes: 1 addition & 1 deletion pynteny/app/main_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pynteny.app.components import Sidebar, Mainpage


parent_dir = Path(Path(__file__).parent)
parent_dir = Path(__file__).parent
meta = metadata.metadata("pynteny")
__version__ = meta["Version"]
__author__ = meta["Author"]
Expand Down
24 changes: 12 additions & 12 deletions pynteny/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def _generate_cool_quotes(self):
"One does not simply walk into Mordor (J.R.R. Tolkien)",
"Damn, looks like a rainy day, let's do bioiformatics! (SR)",
]
return f"{random.choice(quotes)}\n" " "
return f"{random.choice(quotes)}\n"

def _call_subcommand(self, subcommand_name: str) -> None:
subcommand = getattr(self, subcommand_name)
Expand Down Expand Up @@ -182,16 +182,16 @@ def search() -> argparse.ArgumentParser:
type=str,
required=True,
help=(
f"string displaying hmm structure to search for, such as: \n"
f" \n"
f"'>hmm_a n_ab <hmm_b n_bc hmm_c'\n"
f" \n"
f"where '>' indicates a hmm target located on the positive strand, \n"
f"'<' a target located on the negative strand, and n_ab cooresponds \n"
f"to the maximum number of genes separating matched genes a and b. \n"
f"Multiple hmms may be employed. \n"
f"No order symbol in a hmm indicates that results should be independent \n"
f"of strand location. "
"string displaying hmm structure to search for, such as: \n"
" \n"
"'>hmm_a n_ab <hmm_b n_bc hmm_c'\n"
" \n"
"where '>' indicates a hmm target located on the positive strand, \n"
"'<' a target located on the negative strand, and n_ab cooresponds \n"
"to the maximum number of genes separating matched genes a and b. \n"
"Multiple hmms may be employed. \n"
"No order symbol in a hmm indicates that results should be independent \n"
"of strand location. "
),
)
required.add_argument(
Expand Down Expand Up @@ -458,7 +458,7 @@ def download() -> argparse.ArgumentParser:
)

optional = parser._action_groups.pop()
required = parser.add_argument_group("required arguments")
# required = parser.add_argument_group("required arguments")
parser._action_groups.append(optional)

optional.add_argument(
Expand Down
38 changes: 19 additions & 19 deletions pynteny/filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,14 @@ def __init__(self, synteny_structure: str, unordered: bool = False) -> None:
gene found by hmm_a and gene found by hmm_b, and hmm_ corresponds
to the name of the hmm as provided in the keys of hmm_hits.
More than two hmms can be concatenated. Strand location may be
specificed by using '>' for sense and '<' for antisense.
specified by using '>' for sense and '<' for antisense.
unordered (bool, optional): whether the HMMs should be arranged in the
exact same order displayed in the synteny_structure or in
any order If ordered, the filters would filter collinear rather
any order. If ordered, the filters would filter collinear rather
than syntenic structures. Defaults to False.
"""
parsed_structure = SyntenyParser.parse_synteny_structure(synteny_structure)
hmm_order_dict = dict(
zip(
parsed_structure["hmm_groups"],
range(len(parsed_structure["hmm_groups"])),
)
)
hmm_codes = list(hmm_order_dict.values())
hmm_codes = list(range(len(parsed_structure["hmm_groups"])))
self.hmm_code_order_pattern = hmm_codes

if unordered:
Expand Down Expand Up @@ -128,7 +122,7 @@ def contains_strand_pattern(self, data: pd.Series) -> int:
strand_comparisons.append(data_strand == pattern_strand)
else:
strand_comparisons.append(True)
return 1 if all(strand_comparisons) == True else 0
return 1 if all(strand_comparisons) else 0


class SyntenyHMMfilter:
Expand Down Expand Up @@ -257,10 +251,10 @@ def get_all_HMM_hits(self) -> pd.DataFrame:
.filter(lambda x: len(x) >= self._n_hmm_groups)
.sort_values(["contig", "gene_pos"], ascending=True)
)
all_hit_labels.reset_index(drop=True, inplace=True)
all_hit_labels = all_hit_labels.reset_index(drop=True)
if self._contains_hmm_groups:
all_hit_labels = self._merge_hits_by_HMM_group(all_hit_labels)
all_hit_labels.reset_index(drop=True, inplace=True)
all_hit_labels = all_hit_labels.reset_index(drop=True)
return self._add_meta_codes_to_HMM_hits(all_hit_labels)

def filter_hits_by_synteny_structure(self) -> dict:
Expand Down Expand Up @@ -296,7 +290,7 @@ def filter_hits_by_synteny_structure(self) -> dict:
hmm_group: [] for hmm_group in contig_hits.hmm.unique()
}

if len(contig_hits.hmm.unique()) >= self._n_hmm_groups:
if contig_hits.hmm.nunique() >= self._n_hmm_groups:

hmm_match = contig_hits.hmm_code.rolling(
window=self._n_hmm_groups
Expand All @@ -313,7 +307,7 @@ def filter_hits_by_synteny_structure(self) -> dict:
]
else:
matched_rows = contig_hits[(hmm_match == 1) & (strand_match == 1)]
for i, _ in matched_rows.iterrows():
for i in matched_rows.index:
matched_hits = contig_hits.iloc[
i - (self._n_hmm_groups - 1) : i + 1, :
]
Expand Down Expand Up @@ -379,7 +373,8 @@ def from_hits_dict(cls, hits_by_contig: dict) -> SyntenyHits:
"""
return cls(cls._hits_to_dataframe(hits_by_contig))

def get_synteny_hits(self) -> pd.DataFrame:
@property
def hits(self) -> pd.DataFrame:
"""Return synteny hits.

Returns:
Expand All @@ -401,14 +396,17 @@ def add_HMM_meta_info_to_hits(self, hmm_meta: Path) -> SyntenyHits:
return self._synteny_hits
pgap = PGAP(hmm_meta)
self._synteny_hits[fields] = ""
for i, row in self._synteny_hits.iterrows():
# for i, row in self._synteny_hits.iterrows():
for row in self._synteny_hits.itertuples():
i = getattr(row, "Index")
hmm_group = getattr(row, "hmm")
Comment on lines +401 to +402
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be changed to:

row.Index
row.hmm

Not so ugly anymore, right? 😸

Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see! That pleases me greatly 😄 I really did not want to use `getattr`.

meta_values = [
[
str(v).replace("nan", "")
for k, v in pgap.get_meta_info_for_HMM(hmm).items()
if k != "#ncbi_accession"
]
for hmm in row.hmm.split("|")
for hmm in hmm_group.split("|") # row.hmm.split("|")
]
self._synteny_hits.loc[i, fields] = ["|".join(v) for v in zip(*meta_values)]
return SyntenyHits(self._synteny_hits)
Expand Down Expand Up @@ -532,11 +530,13 @@ def filter_FASTA_by_synteny_structure(
if additional_args is None:
additional_args = [None for _ in input_hmms]

if type(additional_args) == str:
# if type(additional_args) == str:
if isinstance(additional_args, str):
logger.warning(f"Repeating hmmsearch arg: '{additional_args}' for all HMMs")
additional_args = [additional_args for _ in input_hmms]

elif type(additional_args) == list:
# elif type(additional_args) == list:
elif isinstance(additional_args, list):
if len(additional_args) == 1:
logger.warning(
f"Repeating hmmsearch arg: '{additional_args[0]}' for all HMMs"
Expand Down
62 changes: 31 additions & 31 deletions pynteny/hmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,8 @@ def get_HMMER_tables(
hmm_hits = {}
for hmm_model, add_args in zip(self._input_hmms, self._additional_args):
hmm_name = hmm_model.stem
hmmer_output = Path(
os.path.join(self._hmmer_output_dir, f"hmmer_output_{hmm_name}.txt")
)
hmmer_output = Path(self._hmmer_output_dir) / f"hmmer_output_{hmm_name}.txt"

if not (reuse_hmmer_results and os.path.isfile(hmmer_output)):
wrappers.run_HMM_search(
hmm_model=hmm_model,
Expand All @@ -130,10 +129,20 @@ def __init__(self, meta_file: Path):
Args:
meta_file (Path): path to PGAP's metadata file.
"""
meta = pd.read_csv(str(meta_file), sep="\t")
meta = meta[
["#ncbi_accession", "gene_symbol", "label", "product_name", "ec_numbers"]
]
meta = pd.read_csv(
str(meta_file),
sep="\t",
usecols=[
"#ncbi_accession",
"gene_symbol",
"label",
"product_name",
"ec_numbers",
],
)
# meta = meta[
# ["#ncbi_accession", "gene_symbol", "label", "product_name", "ec_numbers"]
# ]
self._meta = meta
self._meta_file = meta_file

Expand Down Expand Up @@ -189,17 +198,14 @@ def get_HMM_names_by_gene_symbol(self, gene_symbol: str) -> list[str]:
list[str]: list of HMM names matching gene symbol.
"""
meta = self._meta # .dropna(subset=["gene_symbol", "label"], axis=0)
try:
return meta[
(
(meta.gene_symbol == gene_symbol)
|
# (meta.label.str.contains(gene_id))
(meta.label == gene_symbol)
)
]["#ncbi_accession"].values.tolist()
except:
return list()
return meta[
(
(meta.gene_symbol == gene_symbol)
|
# (meta.label.str.contains(gene_id))
(meta.label == gene_symbol)
)
]["#ncbi_accession"].values.tolist()

def get_HMM_group_for_gene_symbol(self, gene_symbol: str) -> str:
"""Get HMMs corresponding to gene symbol in PGAP metadata.
Expand Down Expand Up @@ -230,12 +236,7 @@ def get_HMM_gene_ID(self, hmm_name: str) -> list[str]:
list[str]: list of gene symbols matching given HMM.
"""
meta = self._meta.dropna(subset=["#ncbi_accession"], axis=0)
try:
return meta[meta["#ncbi_accession"] == hmm_name][
"gene_symbol"
].values.tolist()
except:
return None
return meta[meta["#ncbi_accession"] == hmm_name]["gene_symbol"].values.tolist()

def get_meta_info_for_HMM(self, hmm_name: str) -> dict:
"""Get meta info for given hmm.
Expand All @@ -249,11 +250,10 @@ def get_meta_info_for_HMM(self, hmm_name: str) -> dict:
meta = self._meta.dropna(subset=["#ncbi_accession"], axis=0).applymap(
lambda x: x if not pd.isna(x) else ""
)
try:
return {
k: list(v.values())[0] if list(v.values())[0] else "undef"
for k, v in meta[meta["#ncbi_accession"] == hmm_name].to_dict().items()
}
except:
metadata = {
k: list(v.values())[0] if list(v.values())[0] else "undef"
for k, v in meta[meta["#ncbi_accession"] == hmm_name].to_dict().items()
}
if not metadata:
logger.warning(f"No metadata for HMM: {hmm_name}")
return dict()
return metadata
39 changes: 22 additions & 17 deletions pynteny/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,26 @@ def parse(label: str) -> dict:
"locus_pos": None,
"strand": "",
}
try:
entry = label.split("__")[0]
meta = label.split("__")[1]
strand = meta.split("_")[-1]
locus_pos = tuple([int(pos) for pos in meta.split("_")[-3:-1]])
gene_pos = int(meta.split("_")[-4])
contig = "_".join(meta.split("_")[:-4])

parsed_dict["gene_id"] = entry
parsed_dict["contig"] = contig
parsed_dict["gene_pos"] = gene_pos
parsed_dict["locus_pos"] = locus_pos
parsed_dict["strand"] = strand
except Exception:
pass

if label.count("__") > 1:
logger.error("Invalid format of record label string")
sys.exit(1)

entry = label.split("__")[0]
meta = label.split("__")[1]
meta_items = meta.split("_")

strand = meta_items[-1]
locus_pos = tuple([int(pos) for pos in meta_items[-3:-1]])
gene_pos = int(meta_items[-4])
contig = "_".join(meta_items[:-4])

parsed_dict["gene_id"] = entry
parsed_dict["contig"] = contig
parsed_dict["gene_pos"] = gene_pos
parsed_dict["locus_pos"] = locus_pos
parsed_dict["strand"] = strand

return parsed_dict

@staticmethod
Expand Down Expand Up @@ -107,7 +112,7 @@ def is_valid_structure(synteny_structure: str) -> bool:
@staticmethod
def split_strand_from_locus(
locus_str: str, parsed_symbol: bool = True
) -> tuple[str]:
) -> tuple[str, ...]:
"""Split strand info from locus tag / HMM model.

Args:
Expand All @@ -117,7 +122,7 @@ def split_strand_from_locus(
as 'pos' and '<' as 'neg'. Defaults to True.

Returns:
tuple[str]: tuple with parsed strand info and gene symbol / HMM name.
tuple[str, ...]: tuple with parsed strand info and gene symbol / HMM name.
"""
locus_str = locus_str.strip()
if locus_str[0] == "<" or locus_str[0] == ">":
Expand Down
Loading