Commit

Merge branch 'main' into report-multi-tops

jthorton authored Jun 3, 2024
2 parents d78f39a + feea215 commit bdea278
Showing 18 changed files with 417 additions and 127 deletions.
9 changes: 4 additions & 5 deletions .github/workflows/docs.yaml
@@ -2,10 +2,8 @@ name: Publish Documentation

on:
push:
branches:
- main
tags:
- '*'
branches: ["main"]
tags: ["*"]

jobs:
deploy-docs:
@@ -43,11 +41,12 @@ jobs:
git config --global --add safe.directory "$PWD"
git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}
git pull origin gh-pages --allow-unrelated-histories
git fetch --all --prune
make env
sed -i 's/# extensions/extensions/' mkdocs.yml
make docs-insiders INSIDER_DOCS_TOKEN="${INSIDER_DOCS_TOKEN}"
make docs-insiders INSIDER_DOCS_TOKEN="${INSIDER_DOCS_TOKEN}"
make docs-deploy VERSION="$VERSION"
37 changes: 4 additions & 33 deletions .pre-commit-config.yaml
@@ -1,38 +1,9 @@
repos:
- repo: local
hooks:
- id: isort
name: "[Package] Import formatting"
- id: ruff
name: "[Package] Formatting"
language: system
entry: isort
entry: make
args: [ lint ]
files: \.py$

- id: black
name: "[Package] Code formatting"
language: system
entry: black
files: \.py$

- id: flake8
name: "[Package] Linting"
language: system
entry: flake8
files: \.py$

- id: isort-examples
name: "[Examples] Import formatting"
language: system
entry: nbqa isort
files: examples/.+\.ipynb$

- id: black-examples
name: "[Examples] Code formatting"
language: system
entry: nbqa black
files: examples/.+\.ipynb$

- id: flake8-examples
name: "[Examples] Linting"
language: system
entry: nbqa flake8 --ignore=E402
files: examples/.+\.ipynb$
23 changes: 9 additions & 14 deletions Makefile
@@ -1,4 +1,6 @@
PACKAGE_NAME := descent
PACKAGE_NAME := descent
PACKAGE_DIR := $(PACKAGE_NAME)

CONDA_ENV_RUN := conda run --no-capture-output --name $(PACKAGE_NAME)

.PHONY: pip-install env lint format test test-examples
@@ -13,23 +15,16 @@ env:
$(CONDA_ENV_RUN) pre-commit install || true

lint:
$(CONDA_ENV_RUN) isort --check-only $(PACKAGE_NAME)
$(CONDA_ENV_RUN) black --check $(PACKAGE_NAME)
$(CONDA_ENV_RUN) flake8 $(PACKAGE_NAME)
$(CONDA_ENV_RUN) nbqa isort --check-only examples
$(CONDA_ENV_RUN) nbqa black --check examples
$(CONDA_ENV_RUN) nbqa flake8 --ignore=E402 examples
$(CONDA_ENV_RUN) ruff check $(PACKAGE_DIR)

format:
$(CONDA_ENV_RUN) isort $(PACKAGE_NAME)
$(CONDA_ENV_RUN) black $(PACKAGE_NAME)
$(CONDA_ENV_RUN) flake8 $(PACKAGE_NAME)
$(CONDA_ENV_RUN) nbqa isort examples
$(CONDA_ENV_RUN) nbqa black examples
$(CONDA_ENV_RUN) nbqa flake8 --ignore=E402 examples
$(CONDA_ENV_RUN) ruff format $(PACKAGE_DIR)
$(CONDA_ENV_RUN) ruff check --fix --select I $(PACKAGE_DIR)
$(CONDA_ENV_RUN) nbqa 'ruff format' examples
$(CONDA_ENV_RUN) nbqa 'ruff check' --fix --select=I examples

test:
$(CONDA_ENV_RUN) pytest -v --cov=$(PACKAGE_NAME) --cov-report=xml --color=yes $(PACKAGE_NAME)/tests/
$(CONDA_ENV_RUN) pytest -v --cov=$(PACKAGE_NAME) --cov-report=xml --color=yes $(PACKAGE_DIR)/tests/

docs-build:
$(CONDA_ENV_RUN) mkdocs build
4 changes: 2 additions & 2 deletions descent/optim/__init__.py
@@ -1,5 +1,5 @@
"""Custom parameter optimizers."""

from descent.optim._lm import LevenbergMarquardtConfig, levenberg_marquardt
from descent.optim._lm import ClosureFn, LevenbergMarquardtConfig, levenberg_marquardt

__all__ = ["LevenbergMarquardtConfig", "levenberg_marquardt"]
__all__ = ["ClosureFn", "LevenbergMarquardtConfig", "levenberg_marquardt"]
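Note (not part of the diff): ClosureFn, newly re-exported here, is the closure signature consumed by descent.optim.levenberg_marquardt. The alias itself is defined in descent/optim/_lm.py and is not shown in this commit, so the sketch below infers its shape from the default_closure helpers added further down; treat the signature as an assumption.

    import torch

    def my_closure(
        x: torch.Tensor,
        compute_gradient: bool,
        compute_hessian: bool,
    ) -> tuple[torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
        # toy quadratic loss so the example stands alone
        loss = (x**2).sum()

        gradient = 2.0 * x if compute_gradient else None  # d(loss)/dx
        hessian = 2.0 * torch.eye(len(x)) if compute_hessian else None  # d2(loss)/dx2

        return loss.detach(), gradient, hessian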
32 changes: 31 additions & 1 deletion descent/targets/dimers.py
@@ -12,12 +12,15 @@
import tqdm

import descent.utils.dataset
import descent.utils.loss
import descent.utils.molecule
import descent.utils.reporting

if typing.TYPE_CHECKING:
import pandas

import descent.train


EnergyFn = typing.Callable[
["pandas.DataFrame", tuple[str, ...], torch.Tensor], torch.Tensor
@@ -272,11 +275,38 @@ def predict(
*[
_predict(dimer, force_field, topologies)
for dimer in descent.utils.dataset.iter_dataset(dataset)
]
],
strict=True,
)
return torch.cat(reference), torch.cat(predicted)


def default_closure(
trainable: "descent.train.Trainable",
topologies: dict[str, smee.TensorTopology],
dataset: datasets.Dataset,
):
"""Return a default closure function for training against dimer energies.
Args:
trainable: The wrapper around trainable parameters.
topologies: The topologies of the molecules present in the dataset, with keys
of mapped SMILES patterns.
dataset: The dataset to train against.
Returns:
The default closure function.
"""

def loss_fn(_x: torch.Tensor) -> torch.Tensor:
y_ref, y_pred = descent.targets.dimers.predict(
dataset, trainable.to_force_field(_x), topologies
)
return ((y_pred - y_ref) ** 2).sum()

return descent.utils.loss.to_closure(loss_fn)


def _plot_energies(energies: dict[str, torch.Tensor]) -> str:
from matplotlib import pyplot

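Note (not part of the diff): a minimal usage sketch for the new dimers.default_closure. It assumes a descent.train.Trainable, a dict of smee.TensorTopology objects keyed by mapped SMILES, a dimer dataset built with create_dataset, and that the closure produced by descent.utils.loss.to_closure follows the (x, compute_gradient, compute_hessian) form expected by descent.optim.levenberg_marquardt.

    import datasets
    import smee
    import torch

    import descent.targets.dimers
    import descent.train


    def evaluate_dimer_closure(
        trainable: descent.train.Trainable,
        topologies: dict[str, smee.TensorTopology],
        dataset: datasets.Dataset,
        x0: torch.Tensor,  # flat tensor of trainable values, requires_grad=True
    ) -> torch.Tensor:
        """Illustrative only: build the default dimer closure and evaluate it once."""
        closure_fn = descent.targets.dimers.default_closure(trainable, topologies, dataset)

        # positional args: x, compute_gradient, compute_hessian
        loss, gradient, hessian = closure_fn(x0, True, True)
        return loss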
109 changes: 87 additions & 22 deletions descent/targets/thermo.py
@@ -16,9 +16,15 @@
import smee.mm
import smee.utils
import torch
from rdkit import Chem

import descent.optim
import descent.utils.dataset
import descent.utils.loss
import descent.utils.molecule

if typing.TYPE_CHECKING:
import descent.train


_LOGGER = logging.getLogger(__name__)

@@ -138,24 +144,6 @@ class _Observables(typing.NamedTuple):
_SystemDict = dict[SimulationKey, smee.TensorSystem]


def _map_smiles(smiles: str) -> str:
"""Add atom mapping to a SMILES string if it is not already present."""
params = Chem.SmilesParserParams()
params.removeHs = False

mol = Chem.AddHs(Chem.MolFromSmiles(smiles, params))

map_idxs = sorted(atom.GetAtomMapNum() for atom in mol.GetAtoms())

if map_idxs == list(range(1, len(map_idxs) + 1)):
return smiles

for i, atom in enumerate(mol.GetAtoms()):
atom.SetAtomMapNum(i + 1)

return Chem.MolToSmiles(mol)


def create_dataset(*rows: DataEntry) -> datasets.Dataset:
"""Create a dataset from a list of existing data points.
@@ -167,12 +155,12 @@ def create_dataset(*rows: DataEntry) -> datasets.Dataset:
"""

for row in rows:
row["smiles_a"] = _map_smiles(row["smiles_a"])
row["smiles_a"] = descent.utils.molecule.map_smiles(row["smiles_a"])

if row["smiles_b"] is None:
continue

row["smiles_b"] = _map_smiles(row["smiles_b"])
row["smiles_b"] = descent.utils.molecule.map_smiles(row["smiles_b"])

# TODO: validate rows
table = pyarrow.Table.from_pylist([*rows], schema=DATA_SCHEMA)
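Note (not part of the diff): create_dataset now defers to descent.utils.molecule.map_smiles rather than the local _map_smiles helper removed above. Assuming the relocated function keeps the same behaviour (hydrogens made explicit and atoms numbered 1..N when no complete mapping is present), a quick sanity check could look like this:

    from rdkit import Chem

    import descent.utils.molecule

    mapped = descent.utils.molecule.map_smiles("CO")  # methanol, no mapping yet

    params = Chem.SmilesParserParams()
    params.removeHs = False
    mol = Chem.MolFromSmiles(mapped, params)

    # every atom, including the explicit hydrogens, should carry a map index 1..N
    assert sorted(a.GetAtomMapNum() for a in mol.GetAtoms()) == list(
        range(1, mol.GetNumAtoms() + 1)
    )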
@@ -582,6 +570,7 @@ def predict(
output_dir: pathlib.Path,
cached_dir: pathlib.Path | None = None,
per_type_scales: dict[DataType, float] | None = None,
verbose: bool = False,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
"""Predict the properties in a dataset using molecular simulation, or by reweighting
previous simulation data.
@@ -596,6 +585,7 @@
from.
per_type_scales: The scale factor to apply to each data type. A default of 1.0
will be used for any data type not specified.
verbose: Whether to log additional information.
"""

entries: list[DataEntry] = [*descent.utils.dataset.iter_dataset(dataset)]
@@ -616,9 +606,11 @@
reference = []
reference_std = []

verbose_rows = []

per_type_scales = per_type_scales if per_type_scales is not None else {}

for entry, keys in zip(entries, entry_to_simulation):
for entry, keys in zip(entries, entry_to_simulation, strict=True):
value, std = _predict(entry, keys, observables, required_simulations)

type_scale = per_type_scales.get(entry["type"], 1.0)
@@ -631,10 +623,83 @@
torch.nan if entry["std"] is None else entry["std"] * abs(type_scale)
)

if verbose:
std_ref = "" if entry["std"] is None else f" ± {float(entry['std']):.3f}"

verbose_rows.append(
{
"type": f'{entry["type"]} [{entry["units"]}]',
"smiles_a": descent.utils.molecule.unmap_smiles(entry["smiles_a"]),
"smiles_b": (
""
if entry["smiles_b"] is None
else descent.utils.molecule.unmap_smiles(entry["smiles_b"])
),
"pred": f"{float(value):.3f} ± {float(std):.3f}",
"ref": f"{float(entry['value']):.3f}{std_ref}",
}
)

if verbose:
import pandas

_LOGGER.info(f"predicted {len(entries)} properties")
_LOGGER.info("\n" + pandas.DataFrame(verbose_rows).to_string(index=False))

predicted = torch.stack(predicted)
predicted_std = torch.stack(predicted_std)

reference = smee.utils.tensor_like(reference, predicted)
reference_std = smee.utils.tensor_like(reference_std, predicted_std)

return reference, reference_std, predicted, predicted_std
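Note (not part of the diff): a sketch of the new verbose reporting path in predict. The objects passed in are assumed to exist already, "density" is assumed to be a registered DataType, and the output directory and scale factor are purely illustrative.

    import pathlib

    import datasets
    import smee

    import descent.targets.thermo


    def predict_with_report(
        dataset: datasets.Dataset,
        force_field: smee.TensorForceField,
        topologies: dict[str, smee.TensorTopology],
    ):
        # with verbose=True the new code logs a per-entry table of predictions
        return descent.targets.thermo.predict(
            dataset,
            force_field,
            topologies,
            output_dir=pathlib.Path("thermo-outputs"),  # hypothetical output directory
            cached_dir=None,
            per_type_scales={"density": 100.0},  # hypothetical per-type scale
            verbose=True,
        )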


def default_closure(
trainable: "descent.train.Trainable",
topologies: dict[str, smee.TensorTopology],
dataset: datasets.Dataset,
per_type_scales: dict[DataType, float] | None = None,
verbose: bool = False,
) -> descent.optim.ClosureFn:
"""Return a default closure function for training against thermodynamic
properties.
Args:
trainable: The wrapper around trainable parameters.
topologies: The topologies of the molecules present in the dataset, with keys
of mapped SMILES patterns.
dataset: The dataset to train against.
per_type_scales: The scale factor to apply to each data type.
verbose: Whether to log additional information about predictions.
Returns:
The default closure function.
"""

def closure_fn(
x: torch.Tensor,
compute_gradient: bool,
compute_hessian: bool,
):
force_field = trainable.to_force_field(x)

y_ref, _, y_pred, _ = descent.targets.thermo.predict(
dataset,
force_field,
topologies,
pathlib.Path.cwd(),
None,
per_type_scales,
verbose,
)
loss, gradient, hessian = ((y_pred - y_ref) ** 2).sum(), None, None

if compute_hessian:
hessian = descent.utils.loss.approximate_hessian(x, y_pred)
if compute_gradient:
gradient = torch.autograd.grad(loss, x, retain_graph=True)[0].detach()

return loss.detach(), gradient, hessian

return closure_fn
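Note (not part of the diff): the thermo default_closure mirrors the dimer one but assembles the gradient and an approximate Hessian itself. A hedged sketch of constructing and evaluating it once, assuming the surrounding objects exist and that "density" is a valid DataType:

    import datasets
    import smee
    import torch

    import descent.targets.thermo
    import descent.train


    def evaluate_thermo_closure(
        trainable: descent.train.Trainable,
        topologies: dict[str, smee.TensorTopology],
        dataset: datasets.Dataset,
        x0: torch.Tensor,  # flat trainable values with requires_grad=True
    ) -> torch.Tensor:
        """Illustrative only: the returned closure is a ClosureFn and can be handed
        to descent.optim.levenberg_marquardt."""
        closure_fn = descent.targets.thermo.default_closure(
            trainable,
            topologies,
            dataset,
            per_type_scales={"density": 100.0},  # hypothetical scale factor
            verbose=False,
        )
        loss, gradient, hessian = closure_fn(x0, True, True)  # compute both grad and hess
        return loss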
5 changes: 3 additions & 2 deletions descent/tests/optim/test_lm.py
@@ -143,7 +143,8 @@ def test_damping_factor_loss_fn(mocker):


@pytest.mark.parametrize(
"n_convergence_criteria, n_convergence_steps, step_quality, expected_converged, expected_logs",
"n_convergence_criteria, n_convergence_steps, step_quality, expected_converged, "
"expected_logs",
[
(0, 2, 1.0, False, []),
(1, 2, 0.0, False, []),
@@ -283,7 +284,7 @@ def mock_loss_fn(_x, *_):
]
assert len(trust_radius_messages) == len(expected_messages)

for message, expected in zip(trust_radius_messages, expected_messages):
for message, expected in zip(trust_radius_messages, expected_messages, strict=True):
assert message.startswith(expected)

# mock_step_fn.assert_has_calls(expected_loss_traj, any_order=False)