Skip to content

Commit

Permalink
fix: two-line dot-bracket for large parallel quadruplexes (#148)
Browse files Browse the repository at this point in the history
* feat: use improved nucleotide detection from RNApolis 0.4.7

* feat: add a new test case

* chore: set version to 1.5.18
  • Loading branch information
tzok authored Nov 21, 2024
1 parent 6f306e4 commit 28644c6
Show file tree
Hide file tree
Showing 7 changed files with 34 additions and 72 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
mmcif==0.90.0
numpy==1.26.4
orjson==3.10.11
rnapolis==0.3.17
rnapolis==0.4.7
2 changes: 1 addition & 1 deletion src/eltetrado/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.5.17
1.5.18
72 changes: 4 additions & 68 deletions src/eltetrado/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import tempfile
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from functools import lru_cache
from typing import IO, Dict, FrozenSet, Iterable, List, Optional, Set, Tuple

import numpy
Expand All @@ -30,69 +29,6 @@
logging.basicConfig(level=os.environ.get("LOGLEVEL", "INFO"))


@lru_cache(maxsize=None)
def is_nucleotide(nt):
phosphate_atoms = {"P", "OP1", "OP2", "O3'", "O5'"}
sugar_atoms = {"C1'", "C2'", "C3'", "C4'", "C5'", "O4'"}
adenine_atoms = {"N1", "C2", "N3", "C4", "C5", "C6", "N6", "N7", "C8", "N9"}
guanine_atoms = {"N1", "C2", "N2", "N3", "C4", "C5", "C6", "O6", "N7", "C8", "N9"}
cytosine_atoms = {"N1", "C2", "O2", "N3", "C4", "N4", "C5", "C6"}
thymine_atoms = {"N1", "C2", "O2", "N3", "C4", "O4", "C5", "C5M", "C6"}
uracil_atoms = {"N1", "C2", "O2", "N3", "C4", "O4", "C5", "C6"}
scores = {"phosphate": 0.0, "sugar": 0.0, "base": 0.0, "connections": 0.0}
weights = {"phosphate": 0.25, "sugar": 0.25, "base": 0.25, "connections": 0.25}

residue_atoms = {atom.name for atom in nt.atoms}

phosphate_match = len(residue_atoms.intersection(phosphate_atoms))
scores["phosphate"] = phosphate_match / len(phosphate_atoms)

sugar_match = len(residue_atoms.intersection(sugar_atoms))
scores["sugar"] = sugar_match / len(sugar_atoms)

matches = {
"A": len(residue_atoms.intersection(adenine_atoms)) / len(adenine_atoms),
"G": len(residue_atoms.intersection(guanine_atoms)) / len(guanine_atoms),
"C": len(residue_atoms.intersection(cytosine_atoms)) / len(cytosine_atoms),
"T": len(residue_atoms.intersection(thymine_atoms)) / len(thymine_atoms),
"U": len(residue_atoms.intersection(uracil_atoms)) / len(uracil_atoms),
}
best_match = max(matches.items(), key=lambda x: x[1])
scores["base"] = best_match[1]

connection_score = 0.0
distance_threshold = 2.0

if "P" in residue_atoms and "O5'" in residue_atoms:
p_atom = next(atom for atom in nt.atoms if atom.name == "P")
o5_atom = next(atom for atom in nt.atoms if atom.name == "O5'")
if (
numpy.linalg.norm(p_atom.coordinates - o5_atom.coordinates)
<= distance_threshold
):
connection_score += 0.5
if "C1'" in residue_atoms:
c1_atom = next(atom for atom in nt.atoms if atom.name == "C1'")
for base_connection in ["N9", "N1"]:
if base_connection in residue_atoms:
base_atom = next(
atom for atom in nt.atoms if atom.name == base_connection
)
if (
numpy.linalg.norm(c1_atom.coordinates - base_atom.coordinates)
<= distance_threshold
):
connection_score += 0.5
break

scores["connections"] = connection_score

probability = sum(
scores[component] * weights[component] for component in scores.keys()
)
return probability > 0.5


@dataclass(order=True)
class Tetrad:
@staticmethod
Expand Down Expand Up @@ -590,7 +526,7 @@ def __find_loops(self) -> List[Loop]:
else:
nts = list(
filter(
lambda nt: is_nucleotide(nt)
lambda nt: nt.is_nucleotide
and self.global_index[nprev]
< self.global_index[nt]
< self.global_index[ncur],
Expand Down Expand Up @@ -1286,7 +1222,7 @@ def __dot_bracket(self, pairs: List[BasePair3D]) -> Tuple[str, str]:
i = 1

for nt in sorted(
filter(lambda nt: is_nucleotide(nt), self.structure3d.residues),
filter(lambda nt: nt.is_nucleotide, self.structure3d.residues),
key=lambda nt: self.global_index[nt],
):
if chain and chain != nt.chain:
Expand Down Expand Up @@ -1330,7 +1266,7 @@ def __str__(self):

def __chain_order(self) -> List[str]:
only_nucleic_acids = filter(
lambda nt: is_nucleotide(nt), self.structure3d.residues
lambda nt: nt.is_nucleotide, self.structure3d.residues
)
return list(
{
Expand Down Expand Up @@ -1451,7 +1387,7 @@ def __to_helix(
ONZ.Z_MINUS: 6,
}
nucleotides = [
nt for nt in self.analysis.structure3d.residues if is_nucleotide(nt)
nt for nt in self.analysis.structure3d.residues if nt.is_nucleotide
]
chain = None
shifts = dict()
Expand Down
4 changes: 2 additions & 2 deletions src/eltetrado/dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from dataclasses import dataclass
from typing import List, Optional

from eltetrado.analysis import Analysis, Quadruplex, TetradPair, is_nucleotide
from eltetrado.analysis import Analysis, Quadruplex, TetradPair
from eltetrado.model import Ion


Expand Down Expand Up @@ -127,7 +127,7 @@ def convert_nucleotides(analysis: Analysis) -> List[NucleotideDTO]:
nt.chi_class.value if nt.chi_class else None,
)
for nt in analysis.structure3d.residues
if is_nucleotide(nt)
if nt.is_nucleotide
]


Expand Down
Binary file added tests/files/6a85-assembly1.cif.gz
Binary file not shown.
1 change: 1 addition & 0 deletions tests/files/6a85-assembly1.json

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions tests/test_dto.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,3 +109,28 @@ def test_5v3f():
)
assert dto.dotBracket.line1 == ".......([{..)].(.[{.).]}.}....-.([{..)].(.[{.).]}.}"
assert dto.dotBracket.line2 == ".......([(..[{.).]}.{.)].}....-.([(..[{.).]}.{.)].}"


def test_6a85():
"""
In 6A85 the two-line dot-bracket notation requires more than 10 different bracket levels
"""
cif = handle_input_file("tests/files/6a85-assembly1.cif.gz")
structure3d = rnapolis.parser.read_3d_structure(cif, 1)
structure2d = read_secondary_structure_from_dssr(
structure3d, 1, "tests/files/6a85-assembly1.json"
)
analysis = eltetrado(structure2d, structure3d, False, False, 2)
dto = generate_dto(analysis)
assert (
dto.dotBracket.sequence
== "AGAGAGATGGGTGCGT-TAGAGAGATGGGTGCGT-TAGAGAGATGGGTGCGT-TAGAGAGATGGGTGCGTT"
)
assert (
dto.dotBracket.line1
== ".([[{{.<ABCDE.F.-..)(][}.>abcde.f.-..{)(]<.ABCDEF.G.-..}])}>.abcdef.g.."
)
assert (
dto.dotBracket.line2
== ".(([[{.<ABCDE.F.-..{)(][.<ABCDE.F.-..}{)G].>abcde.f.-..)}]g}.>abcde.f.."
)

0 comments on commit 28644c6

Please sign in to comment.