Merge pull request #25 from alstonlo/master

v1.0.1
aspuru-guzik-group · Aug 25, 2020 · e00f931 · e00f931
2 parents e5206be + 1774cb2
commit e00f931
Show file tree

Hide file tree

Showing 14 changed files with 108 additions and 76 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,27 +1,38 @@
 # Changelog
 
+## v1.0.1 - 25.08.2020
+### Changed: 
+ *  Updated the code so that it is compatible with Python >= 3.5.
+ *  More descriptive error messages.
+
+### Bug Fixes: 
+ *  Minor bug fixes in the encoder for SMILES ending in branches (e.g. `C(Cl)(F)`),
+    and SMILES with ring numbers between branches (e.g. `C(Cl)1(Br)CCCC1`)
+ *  Minor bug fix with ring ordering in decoder (e.g. `C1CC2CCC12` vs `C1CC2CCC21`).  
+
+---
 
 ## v1.0.0 - 17.08.2020:
 ### Added:
  *  Added semantic handling of aromaticity / delocalization (by kekulizing SMILES with aromatic symbols before
-    they are translated into SELFIES by `selfies.encoder`)
- *  Added semantic handling of charged species (e.g. `[CH+]1CCC1`)
- *  Added semantic handling of radical species (`[CH]1CCC1`) or any species with explicit hydrogens (e.g. `CC[CH2]`)
- *  Added semantic handling of isotopes (e.g. `[14CH2]=C` or `[235U]`)
- *  Improved semantic handling of explicit atom symbols in square brackets, e.g. Carbene (`[C]=C`)
- *  Improved semantic handling of chirality (e.g. `O=C[Co@@](F)(Cl)(Br)(I)S`)
- *  Improved semantic handling of double-bond configuration (e.g. `F/C=C/C=C/C`) 
+    they are translated into SELFIES by `selfies.encoder`).
+ *  Added semantic handling of charged species (e.g. `[CH+]1CCC1`).
+ *  Added semantic handling of radical species (`[CH]1CCC1`) or any species with explicit hydrogens (e.g. `CC[CH2]`).
+ *  Added semantic handling of isotopes (e.g. `[14CH2]=C` or `[235U]`).
+ *  Improved semantic handling of explicit atom symbols in square brackets, e.g. Carbene (`[C]=C`).
+ *  Improved semantic handling of chirality (e.g. `O=C[Co@@](F)(Cl)(Br)(I)S`).
+ *  Improved semantic handling of double-bond configuration (e.g. `F/C=C/C=C/C`). 
  *  Added new functions to the library, such as `selfies.len_selfies` and 
     `selfies.split_selfies`.
  *  Added advanced-user functions to the library to customize the SELFIES semantic constraints, e.g. 
     `selfies.set_semantic_constraints`. This makes it possible to encode, for instance, diborane (`[BH2]1[H][BH2][H]1`).
  *  Introduced new padding `[nop]` (no operation) symbol.
 
 ### Changed: 
- *  Optimized the indexing alphabet (it is base-16 now)
+ *  Optimized the indexing alphabet (it is base-16 now).
  *  Optimized the behaviours of rings and branches to fix an issue with specific non-standard molecules that could not be translated.
- *  Changed behaviour of Ring/Branch, such that states `X9991-X9993` are not necessary anymore
- *  Significantly improved encoding and decoding algorithms, it is much faster now
+ *  Changed behaviour of Ring/Branch, such that states `X9991-X9993` are not necessary anymore.
+ *  Significantly improved the encoding and decoding algorithms; they are much faster now.
 
 ---
 

diff --git a/README.md b/README.md
@@ -1,7 +1,12 @@
 # SELFIES
 
+[![GitHub release](https://img.shields.io/github/release/aspuru-guzik-group/selfies.svg)](https://github.com/aspuru-guzik-group/selfies/releases/)
 ![versions](https://img.shields.io/pypi/pyversions/selfies.svg)
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![Maintenance](https://img.shields.io/badge/Maintained%3F-yes-blue.svg)](https://github.com/aspuru-guzik-group/selfies/graphs/commit-activity)
+[![GitHub issues](https://img.shields.io/github/issues/aspuru-guzik-group/selfies.svg)](https://github.com/aspuru-guzik-group/selfies/issues/)
+[![Documentation Status](https://readthedocs.org/projects/selfies/badge/?version=latest)](http://selfies.readthedocs.io/?badge=latest)
+[![GitHub contributors](https://img.shields.io/github/contributors/aspuru-guzik-group/selfies.svg)](https://github.com/aspuru-guzik-group/selfies/graphs/contributors/)
 
 
 SELFIES (SELF-referencIng Embedded Strings) is a 100% robust molecular
@@ -23,13 +28,16 @@ Use pip to install ``selfies``.
 pip install selfies
 ```
 
-To check if the correct version of SELFIES is installed (see [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md) to verify latest version), use the following pip command:
+To check if the correct version of ``selfies`` is installed 
+(see [CHANGELOG](https://github.com/aspuru-guzik-group/selfies/blob/master/CHANGELOG.md) 
+to verify the latest version), use the following pip command:
 
 ```bash
 pip show selfies
 ```
 
-To upgrade to the latest release of SELFIES if you are using an older version, use the following pip command:
+To upgrade to the latest release of ``selfies`` if you are using an 
+older version, use the following pip command:
 
 ```bash
 pip install selfies --upgrade 

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -22,7 +22,7 @@
 author = 'Mario Krenn'
 
 # The full version, including alpha/beta/rc tags
-release = '1.0.0'
+release = '1.0.1'
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/selfies/__init__.py b/selfies/__init__.py
@@ -25,7 +25,7 @@
 mario.krenn@utoronto.ca and alan@aspuru.com.
 """
 
-__version__ = "1.0.0"
+__version__ = "1.0.1"
 
 __all__ = ['encoder', 'decoder',
            'get_semantic_robust_alphabet', 'get_semantic_constraints',

diff --git a/selfies/decoder.py b/selfies/decoder.py
@@ -1,3 +1,4 @@
+from collections import OrderedDict
 from typing import Dict, Iterable, List, Optional, Tuple, Union
 
 from selfies.grammar_rules import get_bond_from_num, get_n_from_symbols, \
@@ -41,8 +42,7 @@ def decoder(selfies: str, print_error: bool = False) -> Optional[str]:
 
     except ValueError as err:
         if print_error:
-            print(err)
-            print("Could not decode SELFIES. Please contact authors.")
+            print("Decoding error '{}': {}.".format(selfies, err))
         return None
 
 
@@ -257,7 +257,7 @@ def _form_rings_bilocally(derived: List[List[Union[str, int]]],
     # due to the behaviour of allowing multiple rings between the same atom
     # pair, or rings between already bonded atoms, we first resolve all rings
     # so that only valid rings are left and placed into <ring_locs>.
-    ring_locs = {}
+    ring_locs = OrderedDict()
 
     for left_idx, right_idx, bond_symbol in rings:
 

diff --git a/selfies/encoder.py b/selfies/encoder.py
@@ -58,15 +58,17 @@ def encoder(smiles: str, print_error: bool = False) -> Optional[str]:
     """
 
     try:
+        if '*' in smiles:
+            raise ValueError("wildcard atom '*' not supported")
+
         all_selfies = []  # process dot-separated fragments separately
         for s in smiles.split("."):
             all_selfies.append(_translate_smiles(s))
         return '.'.join(all_selfies)
 
     except ValueError as err:
         if print_error:
-            print(err)
-            print('Could not encode SMILES. Please contact authors.')
+            print("Encoding error '{}': {}.".format(smiles, err))
         return None
 
 
@@ -113,7 +115,7 @@ def _parse_smiles(smiles: str) -> Iterable[Tuple[str, str, int]]:
                 i += 1
 
         elif smiles[i] in ('(', ')'):  # open and closed branch brackets
-            bond = smiles[i + 1]
+            bond = smiles[i + 1: i + 2]
             symbol = smiles[i]
             symbol_type = BRANCH_TYPE
             i += 1
@@ -124,6 +126,12 @@ def _parse_smiles(smiles: str) -> Iterable[Tuple[str, str, int]]:
             symbol_type = ATOM_TYPE
             i = r_idx + 1
 
+            # quick chirality specification check
+            chiral_i = symbol.find('@')
+            if symbol[chiral_i + 1].isalpha() and symbol[chiral_i + 1] != 'H':
+                raise ValueError("chiral specification '{}' not supported"
+                                 .format(symbol))
+
         elif smiles[i].isdigit():  # one-digit ring number
             symbol = smiles[i]
             symbol_type = RING_TYPE
@@ -135,7 +143,7 @@ def _parse_smiles(smiles: str) -> Iterable[Tuple[str, str, int]]:
             i += 3
 
         else:
-            raise ValueError(f"Unknown symbol '{smiles[i]}' in SMILES.")
+            raise ValueError("unrecognized symbol '{}'".format(smiles[i]))
 
         yield bond, symbol, symbol_type
 
@@ -166,6 +174,10 @@ def _translate_smiles(smiles: str) -> str:
 
     selfies, _ = _translate_smiles_derive(smiles_gen, rings, derive_counter)
 
+    if rings:
+        raise ValueError("malformed ring numbering or ring numbering "
+                         "across a dot symbol")
+
     return selfies
 
 
@@ -196,12 +208,12 @@ def _translate_smiles_derive(smiles_gen: Iterable[Tuple[str, str, int]],
 
         if symbol_type == ATOM_TYPE:
             if symbol[0] == '[':
-                selfies += f"[{bond}{symbol[1:-1]}expl]"
+                selfies += "[{}{}expl]".format(bond, symbol[1:-1])
             else:
-                selfies += f"[{bond}{symbol}]"
+                selfies += "[{}{}]".format(bond, symbol)
+            prev_idx = counter[0]
             counter[0] += 1
             selfies_len += 1
-            prev_idx = counter[0]
 
         elif symbol_type == BRANCH_TYPE:
             if symbol == '(':
@@ -215,7 +227,7 @@ def _translate_smiles_derive(smiles_gen: Iterable[Tuple[str, str, int]],
                 N_as_symbols = get_symbols_from_n(branch_len - 1)
                 bond_num = get_num_from_bond(bond)
 
-                selfies += f"[Branch{len(N_as_symbols)}_{bond_num}]"
+                selfies += "[Branch{}_{}]".format(len(N_as_symbols), bond_num)
                 selfies += ''.join(N_as_symbols) + branch
                 selfies_len += 1 + len(N_as_symbols) + branch_len
 
@@ -233,16 +245,18 @@ def _translate_smiles_derive(smiles_gen: Iterable[Tuple[str, str, int]],
                 N_as_symbols = get_symbols_from_n(ring_len - 1)
 
                 if left_bond != '':
-                    selfies += f"[Expl{left_bond}Ring{len(N_as_symbols)}]"
+                    selfies += "[Expl{}Ring{}]".format(left_bond,
+                                                       len(N_as_symbols))
                 elif right_bond != '':
-                    selfies += f"[Expl{right_bond}Ring{len(N_as_symbols)}]"
+                    selfies += "[Expl{}Ring{}]".format(right_bond,
+                                                       len(N_as_symbols))
                 else:
-                    selfies += f"[Ring{len(N_as_symbols)}]"
+                    selfies += "[Ring{}]".format(len(N_as_symbols))
 
                 selfies += ''.join(N_as_symbols)
                 selfies_len += 1 + len(N_as_symbols)
 
             else:
-                rings[ring_id] = (bond, counter[0])
+                rings[ring_id] = (bond, prev_idx)
 
     return selfies, selfies_len
diff --git a/selfies/grammar_rules.py b/selfies/grammar_rules.py
@@ -36,19 +36,19 @@ def get_semantic_robust_alphabet() -> Set[str]:
             continue
 
         if a in organic_subset:
-            symbol = f"[{b}{a}]"
+            symbol = "[{}{}]".format(b, a)
         else:
-            symbol = f"[{b}{a}expl]"
+            symbol = "[{}{}expl]".format(b, a)
 
         alphabet_subset.add(symbol)
 
     # add branch and ring symbols
     for i in range(1, 4):
-        alphabet_subset.add(f"[Ring{i}]")
-        alphabet_subset.add(f"[Expl=Ring{i}]")
+        alphabet_subset.add("[Ring{}]".format(i))
+        alphabet_subset.add("[Expl=Ring{}]".format(i))
 
         for j in range(1, 4):
-            alphabet_subset.add(f"[Branch{i}_{j}]")
+            alphabet_subset.add("[Branch{}_{}]".format(i, j))
 
     return alphabet_subset
 
@@ -110,11 +110,12 @@ def set_semantic_constraints(
 
         # error checking
         if '?' not in bond_constraints:
-            raise ValueError("'?' not a key in bond_constraints")
+            raise ValueError("bond_constraints missing '?' as a key.")
 
         for key, value in bond_constraints.items():
             if not (1 <= value <= 8):
-                raise ValueError("Value in bond_constraints not in [1, 8]")
+                raise ValueError("bond_constraints['{}'] not between "
+                                 "1 and 8 inclusive.".format(key))
 
         _bond_constraints = dict(bond_constraints)
 
@@ -145,7 +146,7 @@ def get_next_state(symbol: str, state: int) -> Tuple[str, int]:
     bond_num = get_num_from_bond(bond)
 
     if symbol[-5:] == 'expl]':  # e.g. [C@@Hexpl]
-        smiles_symbol = f"[{symbol[1 + len(bond):-5]}]"
+        smiles_symbol = "[{}]".format(symbol[1 + len(bond):-5])
     else:
         smiles_symbol = symbol[1 + len(bond):-1]
 
@@ -155,14 +156,14 @@ def get_next_state(symbol: str, state: int) -> Tuple[str, int]:
     if charge == 0:
         atom_or_ion = element
     else:
-        atom_or_ion = f"{element}{charge:+}"
+        atom_or_ion = "{}{:+}".format(element, charge)
 
     max_bonds = _bond_constraints.get(atom_or_ion,
                                       _bond_constraints['?'])
 
     if h_count >= max_bonds:
-        raise ValueError(f"Too many Hs in SELFIES Symbol '{symbol}'. "
-                         f"Consider adjusting bond constraints.")
+        raise ValueError("too many Hs in symbol '{}'; consider "
+                         "adjusting bond constraints".format(symbol))
     max_bonds -= h_count  # hydrogens consume 1 bond
 
     # calculate next state
@@ -201,7 +202,7 @@ def get_next_branch_state(branch_symbol: str, state: int) -> Tuple[int, int]:
     branch_type = int(branch_symbol[-2])  # branches of the form [BranchL_X]
 
     if not (1 <= branch_type <= 3):
-        raise ValueError(f"Unknown branch symbol: {branch_symbol}")
+        raise ValueError("unknown branch symbol '{}'".format(branch_symbol))
 
     if 2 <= state <= 8:
         branch_init_state = min(state - 1, branch_type)

diff --git a/selfies/kekulize.py b/selfies/kekulize.py
@@ -1,5 +1,3 @@
-from __future__ import annotations
-
 from typing import Dict, Iterable, List, Set, Tuple, Union
 
 from selfies.grammar_rules import find_element, get_num_from_bond, \
@@ -43,7 +41,7 @@ def kekulize_parser(smiles_gen: Iterable[Tuple[str, str, int]]) \
         yield tuple(x)
 
 
-def _build_molecular_graph(graph: MolecularGraph,
+def _build_molecular_graph(graph,
                            smiles_symbols: List[List[Union[str, int]]],
                            rings: Dict[int, Tuple[int, int]],
                            prev_idx: int = -1,
@@ -105,7 +103,7 @@ def _build_molecular_graph(graph: MolecularGraph,
     return curr_idx
 
 
-def _kekulize(mol_graph: MolecularGraph) -> None:
+def _kekulize(mol_graph) -> None:
     """Kekulizes the molecular graph.
 
     :param mol_graph: a molecular graph to be kekulized.
@@ -118,7 +116,7 @@ def _kekulize(mol_graph: MolecularGraph) -> None:
     for i in mol_graph.get_nodes_by_num_edges():
         success = mol_graph.dfs_assign_bonds(i, visited, set(), set())
         if not success:
-            raise ValueError("Kekulization Failed.")
+            raise ValueError("kekulization algorithm failed")
 
     mol_graph.write_to_smiles_symbols()
 
@@ -167,8 +165,8 @@ def _is_aromatic(atom_symbol: str) -> bool:
         return False
 
     if element not in _aromatic_valences:
-        raise ValueError(f"Kekulization Failed: aromatic symbol {atom_symbol} "
-                         f"not recognized.")
+        raise ValueError("unrecognized aromatic symbol '{}'"
+                         .format(atom_symbol))
     return True
 
 
@@ -199,7 +197,8 @@ def _in_pi_subgraph(atom_symbol: str, bonds: Tuple[str]) -> bool:
         h_count += 1  # implied bonded hydrogen
 
     if h_count > 1:
-        raise ValueError(f"Kekulization Failed: {atom_symbol} not supported.")
+        raise ValueError("unrecognized aromatic symbol '{}'"
+                         .format(atom_symbol))
 
     elif h_count == 1:  # e.g. [nH]
         used_electrons += 1
@@ -226,9 +225,6 @@ class MolecularGraph:
     :ivar aro_indices: a set of indices of atom(s) from ``smiles_symbols``
         that are aromatic in the molecular graph.
     """
-    smiles_symbols: List[List[Union[str, int]]]
-    graph: Dict[int, List[Bond]]
-    aro_indices: Set[int]
 
     def __init__(self, smiles_symbols: List[List[Union[str, int]]]):
         self.smiles_symbols = smiles_symbols
@@ -377,7 +373,7 @@ def prune_to_pi_subgraph(self) -> None:
     def dfs_assign_bonds(self, idx: int,
                          visited: Set[int],
                          matched_nodes: Set[int],
-                         matched_edges: Set[Bond]) -> bool:
+                         matched_edges) -> bool:
         """After calling ``prune_to_pi_subgraph``, this method assigns
         double bonds between pairs of nodes such that every node is
         paired or matched.
@@ -490,10 +486,6 @@ class Bond:
     :ivar bond_symbol: the SMILES symbol representing this bond (e.g. '#').
     :ivar bond_idx: the index of this bond or edge.
     """
-    idx_a: int
-    idx_b: int
-    bond_symbol: str
-    bond_idx: int
 
     def __init__(self, idx_a, idx_b, bond_symbol, bond_idx):
         self.idx_a = idx_a