def create_from_evaluator(dataset_file: pathlib.Path) -> datasets.Dataset:
    """Create a dataset from a JSON-serialised evaluator PhysicalPropertyDataSet.

    Each entry under the top-level ``"properties"`` key is turned into one
    ``DataEntry``. Temperatures are converted to kelvin, pressures to
    atmospheres, and the measured value and its uncertainty to the default
    units of the property type (``g/mL`` for densities, ``kcal/mol`` for
    enthalpies of mixing and vaporization).

    Args:
        dataset_file: The path to the evaluator dataset

    Returns:
        The created dataset

    Raises:
        KeyError: If a property has an ``"@type"`` that is not one of the
            supported evaluator property types.
        NotImplementedError: If a property's substance has more than two
            components.
    """
    import json

    from openff.units import unit

    evaluator_to_prop = {
        "openff.evaluator.properties.density.Density": "density",
        "openff.evaluator.properties.enthalpy.EnthalpyOfMixing": "hmix",
        "openff.evaluator.properties.enthalpy.EnthalpyOfVaporization": "hvap",
    }
    prop_units = {"density": "g/mL", "hmix": "kcal/mol", "hvap": "kcal/mol"}

    def _to_quantity(serialized):
        """Combine a serialised ``{"value": ..., "unit": ...}`` pair into a quantity."""
        return serialized["value"] * getattr(unit, serialized["unit"])

    # Read inside a context manager so the file handle is always closed,
    # rather than relying on garbage collection of an anonymous handle.
    with dataset_file.open(encoding="utf-8") as file:
        property_data = json.load(file)

    properties: list[DataEntry] = []

    for phys_prop in property_data["properties"]:
        try:
            prop_type = evaluator_to_prop[phys_prop["@type"]]
        except KeyError:
            raise KeyError(f"{phys_prop['@type']} not currently supported.") from None

        substance = phys_prop["substance"]
        amounts = substance["amounts"]

        # Amounts are keyed by "<smiles>{<role>}", so rebuild that identifier
        # for every component to look its amount up.
        smiles_and_role = [
            (comp["smiles"], comp["smiles"] + "{" + comp["role"]["value"] + "}")
            for comp in substance["components"]
        ]

        # NOTE(review): only the first amount of each component is read and it
        # is stored as a fraction (x_a / x_b) - confirm that no other amount
        # types (e.g. exact amounts) need to be supported here.
        smiles_a, role_a = smiles_and_role[0]
        x_a = amounts[role_a][0]["value"]

        if len(smiles_and_role) == 1:
            smiles_b, x_b = None, None
        elif len(smiles_and_role) == 2:
            smiles_b, role_b = smiles_and_role[1]
            x_b = amounts[role_b][0]["value"]
        else:
            raise NotImplementedError("up to binary mixtures are currently supported")

        state = phys_prop["thermodynamic_state"]
        temperature = _to_quantity(state["temperature"])
        pressure = _to_quantity(state["pressure"])

        value = _to_quantity(phys_prop["value"])
        std = _to_quantity(phys_prop["uncertainty"])

        default_units = getattr(unit, prop_units[prop_type])

        properties.append(
            {
                "type": prop_type,
                "smiles_a": smiles_a,
                "x_a": x_a,
                "smiles_b": smiles_b,
                "x_b": x_b,
                "temperature": temperature.to(unit.kelvin).m,
                "pressure": pressure.to(unit.atm).m,
                "value": value.to(default_units).m,
                "units": prop_units[prop_type],
                "std": std.to(default_units).m,
                "source": phys_prop["source"]["doi"],
            }
        )

    return create_dataset(*properties)
diff --git a/descent/tests/data/evaluator_mock.json b/descent/tests/data/evaluator_mock.json new file mode 100644 index 0000000..27addf8 --- /dev/null +++ b/descent/tests/data/evaluator_mock.json @@ -0,0 +1 @@ +{"properties": [{"id": "1", "substance": {"components": [{"smiles": "CCO", "role": {"value": "solv", "@type": "openff.evaluator.substances.components.Component.Role"}, "@type": "openff.evaluator.substances.components.Component"}, {"smiles": "O", "role": {"value": "solv", "@type": "openff.evaluator.substances.components.Component.Role"}, "@type": "openff.evaluator.substances.components.Component"}], "amounts": {"CCO{solv}": [{"value": 0.48268, "@type": "openff.evaluator.substances.amounts.MoleFraction"}], "O{solv}": [{"value": 0.51732, "@type": "openff.evaluator.substances.amounts.MoleFraction"}]}, "@type": "openff.evaluator.substances.substances.Substance"}, "phase": 2, "thermodynamic_state": {"temperature": {"value": 298.15, "unit": "kelvin", "@type": "openff.evaluator.unit.Quantity"}, "pressure": {"value": 101.3, "unit": "kilopascal", "@type": "openff.evaluator.unit.Quantity"}, "@type": "openff.evaluator.thermodynamics.ThermodynamicState"}, "value": {"value": 0.99, "unit": "gram / milliliter", "@type": "openff.evaluator.unit.Quantity"}, "uncertainty": {"value": 0.000505, "unit": "gram / milliliter", "@type": "openff.evaluator.unit.Quantity"}, "source": {"doi": "mock", "reference": "", "@type": "openff.evaluator.datasets.provenance.MeasurementSource"}, "gradients": [], "@type": "openff.evaluator.properties.density.Density"}]} \ No newline at end of file diff --git a/descent/tests/data/missing_property_evaluator.json b/descent/tests/data/missing_property_evaluator.json new file mode 100644 index 0000000..23efae1 --- /dev/null +++ b/descent/tests/data/missing_property_evaluator.json @@ -0,0 +1 @@ +{"properties": [{"id": "1", "substance": {"components": [{"smiles": "CC(C)(C)O", "role": {"value": "solv", "@type": 
def test_create_from_evaluator(data_dir):
    """A binary-mixture density entry is converted into the expected row."""
    dataset = create_from_evaluator(dataset_file=data_dir / "evaluator_mock.json")

    entries = list(descent.utils.dataset.iter_dataset(dataset))
    # The mock data set contains exactly one property.
    assert len(entries) == 1

    expected = {
        "smiles_a": "[C:1]([C:2]([O:3][H:9])([H:7])[H:8])([H:4])([H:5])[H:6]",
        "x_a": 0.48268,
        "smiles_b": "[O:1]([H:2])[H:3]",
        "x_b": 0.51732,
        "temperature": 298.15,
        "pressure": 0.999753269183321,
        "value": 0.99,
        "std": 0.000505,
        "units": "g/mL",
        "source": "mock",
        "type": "density",
    }
    numeric_keys = {"x_a", "x_b", "temperature", "pressure", "value", "std"}

    entry = entries[0]
    assert set(entry) == set(expected)

    for key, expected_value in expected.items():
        if key in numeric_keys:
            # Several of these values pass through a unit conversion (e.g.
            # kPa -> atm), so compare with a tolerance instead of bit-exact
            # float equality.
            assert entry[key] == pytest.approx(expected_value)
        else:
            assert entry[key] == expected_value


def test_unsupported_property(data_dir):
    """An unsupported evaluator property type is rejected with a KeyError."""
    # Match on the message so an unrelated KeyError (e.g. from a malformed
    # fixture) cannot make this test pass by accident.
    with pytest.raises(KeyError, match="not currently supported"):
        _ = create_from_evaluator(
            dataset_file=data_dir / "missing_property_evaluator.json"
        )