diff --git a/descent/optim/_lm.py b/descent/optim/_lm.py index d639b38..ed5c6f0 100644 --- a/descent/optim/_lm.py +++ b/descent/optim/_lm.py @@ -8,6 +8,7 @@ References: [1]: https://github.com/leeping/forcebalance/blob/b395fd4b/src/optimizer.py """ + import logging import math import typing diff --git a/descent/targets/dimers.py b/descent/targets/dimers.py index a9bf688..0a20639 100644 --- a/descent/targets/dimers.py +++ b/descent/targets/dimers.py @@ -1,4 +1,5 @@ """Train against dimer energies.""" + import pathlib import typing diff --git a/descent/targets/energy.py b/descent/targets/energy.py index d00ddbe..fe6c6b8 100644 --- a/descent/targets/energy.py +++ b/descent/targets/energy.py @@ -1,4 +1,5 @@ """Train against relative energies and forces.""" + import typing import datasets diff --git a/descent/targets/thermo.py b/descent/targets/thermo.py index 4c87d2b..57f93d0 100644 --- a/descent/targets/thermo.py +++ b/descent/targets/thermo.py @@ -1,4 +1,5 @@ """Train against thermodynamic properties.""" + import contextlib import hashlib import logging @@ -113,9 +114,9 @@ class SimulationConfig(pydantic.BaseModel): False, description="Whether to apply hydrogen mass repartitioning." ) - equilibrate: list[ - smee.mm.MinimizationConfig | smee.mm.SimulationConfig - ] = pydantic.Field(..., description="Configuration for equilibration simulations.") + equilibrate: list[smee.mm.MinimizationConfig | smee.mm.SimulationConfig] = ( + pydantic.Field(..., description="Configuration for equilibration simulations.") + ) production: smee.mm.SimulationConfig = pydantic.Field( ..., description="Configuration for the production simulation." @@ -137,6 +138,24 @@ class _Observables(typing.NamedTuple): _SystemDict = dict[SimulationKey, smee.TensorSystem] +def _map_smiles(smiles: str) -> str: + """Add atom mapping to a SMILES string if it is not already present.""" + params = Chem.SmilesParserParams() + params.removeHs = False + + mol = Chem.AddHs(Chem.MolFromSmiles(smiles, params)) + + map_idxs = sorted(atom.GetAtomMapNum() for atom in mol.GetAtoms()) + + if map_idxs == list(range(1, len(map_idxs) + 1)): + return smiles + + for i, atom in enumerate(mol.GetAtoms()): + atom.SetAtomMapNum(i + 1) + + return Chem.MolToSmiles(mol) + + def create_dataset(*rows: DataEntry) -> datasets.Dataset: """Create a dataset from a list of existing data points. @@ -148,12 +167,12 @@ def create_dataset(*rows: DataEntry) -> datasets.Dataset: """ for row in rows: - row["smiles_a"] = Chem.MolToSmiles(Chem.MolFromSmiles(row["smiles_a"])) + row["smiles_a"] = _map_smiles(row["smiles_a"]) if row["smiles_b"] is None: continue - row["smiles_b"] = Chem.MolToSmiles(Chem.MolFromSmiles(row["smiles_b"])) + row["smiles_b"] = _map_smiles(row["smiles_b"]) # TODO: validate rows table = pyarrow.Table.from_pylist([*rows], schema=DATA_SCHEMA) @@ -519,9 +538,7 @@ def _predict_hmix( value = enthalpy_mix - x_0 * enthalpy_0 - x_1 * enthalpy_1 std = torch.sqrt( - enthalpy_mix_std**2 - + x_0**2 * enthalpy_0_std**2 - + x_1**2 * enthalpy_1_std**2 + enthalpy_mix_std**2 + x_0**2 * enthalpy_0_std**2 + x_1**2 * enthalpy_1_std**2 ) return value, std diff --git a/descent/tests/targets/test_thermo.py b/descent/tests/targets/test_thermo.py index 1cebce0..89473b9 100644 --- a/descent/tests/targets/test_thermo.py +++ b/descent/tests/targets/test_thermo.py @@ -11,6 +11,7 @@ SimulationKey, _compute_observables, _convert_entry_to_system, + _map_smiles, _Observables, _plan_simulations, _predict, @@ -90,6 +91,21 @@ def mock_hmix() -> DataEntry: } +@pytest.mark.parametrize( + "smiles, expected", + [ + ("C", "[C:1]([H:2])([H:3])([H:4])[H:5]"), + ("[CH4:1]", "[C:1]([H:2])([H:3])([H:4])[H:5]"), + ("[Cl:1][H:2]", "[Cl:1][H:2]"), + ("[Cl:2][H:1]", "[Cl:2][H:1]"), + ("[Cl:2][H:2]", "[Cl:1][H:2]"), + ("[Cl:1][H]", "[Cl:1][H:2]"), + ], +) +def test_map_smiles(smiles, expected): + assert _map_smiles(smiles) == expected + + def test_create_dataset(mock_density_pure, mock_density_binary): expected_entries = [mock_density_pure, mock_density_binary] @@ -105,7 +121,10 @@ def test_extract_smiles(mock_density_pure, mock_density_binary): dataset = create_dataset(mock_density_pure, mock_density_binary) smiles = extract_smiles(dataset) - expected_smiles = ["CCO", "CO"] + expected_smiles = [ + "[C:1]([C:2]([O:3][H:9])([H:7])[H:8])([H:4])([H:5])[H:6]", + "[C:1]([O:2][H:6])([H:3])([H:4])[H:5]", + ] assert smiles == expected_smiles @@ -497,7 +516,7 @@ def test_predict_hmix(mock_hmix, mocker): def test_predict(tmp_cwd, mock_density_pure, mocker): dataset = create_dataset(mock_density_pure) - mock_topologies = {"CO": mocker.Mock()} + mock_topologies = {"[C:1]([O:2][H:6])([H:3])([H:4])[H:5]": mocker.Mock()} mock_ff = mocker.Mock() mock_density = torch.tensor(123.0) @@ -520,7 +539,7 @@ def test_predict(tmp_cwd, mock_density_pure, mocker): mock_compute.assert_called_once_with( "bulk", SimulationKey( - ("CO",), + ("[C:1]([O:2][H:6])([H:3])([H:4])[H:5]",), (256,), mock_density_pure["temperature"], mock_density_pure["pressure"], diff --git a/descent/utils/dataset.py b/descent/utils/dataset.py index 792c2c1..81e6f12 100644 --- a/descent/utils/dataset.py +++ b/descent/utils/dataset.py @@ -1,4 +1,5 @@ """Utilities for working with datasets.""" + import typing import datasets diff --git a/descent/utils/loss.py b/descent/utils/loss.py index b652d59..240037d 100644 --- a/descent/utils/loss.py +++ b/descent/utils/loss.py @@ -1,4 +1,5 @@ """Utilities for defining loss functions.""" + import functools import typing diff --git a/descent/utils/reporting.py b/descent/utils/reporting.py index e27e294..5d31db5 100644 --- a/descent/utils/reporting.py +++ b/descent/utils/reporting.py @@ -1,4 +1,5 @@ """Utilities for reporting results.""" + import base64 import io import itertools