Ensure thermo dataset SMILES are mapped (#62)

* Ensure thermo dataset SMILES are mapped * lint codebase
SimonBoothroyd · Feb 15, 2024 · cd94f82 · cd94f82
1 parent 9f3a23e
commit cd94f82
Show file tree

Hide file tree

Showing 8 changed files with 53 additions and 11 deletions.
diff --git a/descent/optim/_lm.py b/descent/optim/_lm.py
@@ -8,6 +8,7 @@
 References:
     [1]: https://github.com/leeping/forcebalance/blob/b395fd4b/src/optimizer.py
 """
+
 import logging
 import math
 import typing

diff --git a/descent/targets/dimers.py b/descent/targets/dimers.py
@@ -1,4 +1,5 @@
 """Train against dimer energies."""
+
 import pathlib
 import typing
 

diff --git a/descent/targets/energy.py b/descent/targets/energy.py
@@ -1,4 +1,5 @@
 """Train against relative energies and forces."""
+
 import typing
 
 import datasets

diff --git a/descent/targets/thermo.py b/descent/targets/thermo.py
@@ -1,4 +1,5 @@
 """Train against thermodynamic properties."""
+
 import contextlib
 import hashlib
 import logging
@@ -113,9 +114,9 @@ class SimulationConfig(pydantic.BaseModel):
         False, description="Whether to apply hydrogen mass repartitioning."
     )
 
-    equilibrate: list[
-        smee.mm.MinimizationConfig | smee.mm.SimulationConfig
-    ] = pydantic.Field(..., description="Configuration for equilibration simulations.")
+    equilibrate: list[smee.mm.MinimizationConfig | smee.mm.SimulationConfig] = (
+        pydantic.Field(..., description="Configuration for equilibration simulations.")
+    )
 
     production: smee.mm.SimulationConfig = pydantic.Field(
         ..., description="Configuration for the production simulation."
@@ -137,6 +138,24 @@ class _Observables(typing.NamedTuple):
 _SystemDict = dict[SimulationKey, smee.TensorSystem]
 
 
+def _map_smiles(smiles: str) -> str:
+    """Add atom mapping to a SMILES string if it is not already present."""
+    params = Chem.SmilesParserParams()
+    params.removeHs = False
+
+    mol = Chem.AddHs(Chem.MolFromSmiles(smiles, params))
+
+    map_idxs = sorted(atom.GetAtomMapNum() for atom in mol.GetAtoms())
+
+    if map_idxs == list(range(1, len(map_idxs) + 1)):
+        return smiles
+
+    for i, atom in enumerate(mol.GetAtoms()):
+        atom.SetAtomMapNum(i + 1)
+
+    return Chem.MolToSmiles(mol)
+
+
 def create_dataset(*rows: DataEntry) -> datasets.Dataset:
     """Create a dataset from a list of existing data points.
 
@@ -148,12 +167,12 @@ def create_dataset(*rows: DataEntry) -> datasets.Dataset:
     """
 
     for row in rows:
-        row["smiles_a"] = Chem.MolToSmiles(Chem.MolFromSmiles(row["smiles_a"]))
+        row["smiles_a"] = _map_smiles(row["smiles_a"])
 
         if row["smiles_b"] is None:
             continue
 
-        row["smiles_b"] = Chem.MolToSmiles(Chem.MolFromSmiles(row["smiles_b"]))
+        row["smiles_b"] = _map_smiles(row["smiles_b"])
 
     # TODO: validate rows
     table = pyarrow.Table.from_pylist([*rows], schema=DATA_SCHEMA)
@@ -519,9 +538,7 @@ def _predict_hmix(
 
     value = enthalpy_mix - x_0 * enthalpy_0 - x_1 * enthalpy_1
     std = torch.sqrt(
-        enthalpy_mix_std**2
-        + x_0**2 * enthalpy_0_std**2
-        + x_1**2 * enthalpy_1_std**2
+        enthalpy_mix_std**2 + x_0**2 * enthalpy_0_std**2 + x_1**2 * enthalpy_1_std**2
     )
 
     return value, std

diff --git a/descent/tests/targets/test_thermo.py b/descent/tests/targets/test_thermo.py
@@ -11,6 +11,7 @@
     SimulationKey,
     _compute_observables,
     _convert_entry_to_system,
+    _map_smiles,
     _Observables,
     _plan_simulations,
     _predict,
@@ -90,6 +91,21 @@ def mock_hmix() -> DataEntry:
     }
 
 
+@pytest.mark.parametrize(
+    "smiles, expected",
+    [
+        ("C", "[C:1]([H:2])([H:3])([H:4])[H:5]"),
+        ("[CH4:1]", "[C:1]([H:2])([H:3])([H:4])[H:5]"),
+        ("[Cl:1][H:2]", "[Cl:1][H:2]"),
+        ("[Cl:2][H:1]", "[Cl:2][H:1]"),
+        ("[Cl:2][H:2]", "[Cl:1][H:2]"),
+        ("[Cl:1][H]", "[Cl:1][H:2]"),
+    ],
+)
+def test_map_smiles(smiles, expected):
+    assert _map_smiles(smiles) == expected
+
+
 def test_create_dataset(mock_density_pure, mock_density_binary):
     expected_entries = [mock_density_pure, mock_density_binary]
 
@@ -105,7 +121,10 @@ def test_extract_smiles(mock_density_pure, mock_density_binary):
     dataset = create_dataset(mock_density_pure, mock_density_binary)
     smiles = extract_smiles(dataset)
 
-    expected_smiles = ["CCO", "CO"]
+    expected_smiles = [
+        "[C:1]([C:2]([O:3][H:9])([H:7])[H:8])([H:4])([H:5])[H:6]",
+        "[C:1]([O:2][H:6])([H:3])([H:4])[H:5]",
+    ]
     assert smiles == expected_smiles
 
 
@@ -497,7 +516,7 @@ def test_predict_hmix(mock_hmix, mocker):
 def test_predict(tmp_cwd, mock_density_pure, mocker):
     dataset = create_dataset(mock_density_pure)
 
-    mock_topologies = {"CO": mocker.Mock()}
+    mock_topologies = {"[C:1]([O:2][H:6])([H:3])([H:4])[H:5]": mocker.Mock()}
     mock_ff = mocker.Mock()
 
     mock_density = torch.tensor(123.0)
@@ -520,7 +539,7 @@ def test_predict(tmp_cwd, mock_density_pure, mocker):
     mock_compute.assert_called_once_with(
         "bulk",
         SimulationKey(
-            ("CO",),
+            ("[C:1]([O:2][H:6])([H:3])([H:4])[H:5]",),
             (256,),
             mock_density_pure["temperature"],
             mock_density_pure["pressure"],

diff --git a/descent/utils/dataset.py b/descent/utils/dataset.py
@@ -1,4 +1,5 @@
 """Utilities for working with datasets."""
+
 import typing
 
 import datasets

diff --git a/descent/utils/loss.py b/descent/utils/loss.py
@@ -1,4 +1,5 @@
 """Utilities for defining loss functions."""
+
 import functools
 import typing
 

diff --git a/descent/utils/reporting.py b/descent/utils/reporting.py
@@ -1,4 +1,5 @@
 """Utilities for reporting results."""
+
 import base64
 import io
 import itertools