Merge branch 'master' into tjlane/diffmap_script-refactor

rs-station · Oct 24, 2024 · 25d96e9 · 25d96e9
2 parents ecdf676 + d71405a
commit 25d96e9
Show file tree

Hide file tree

Showing 27 changed files with 3,214 additions and 98 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -0,0 +1 @@
+test/data/scaled-test-data.mtz filter=lfs diff=lfs merge=lfs -text
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -10,7 +10,9 @@ jobs:
         python-version: ["3.11"]
 
     steps:
-    - uses: actions/checkout@v4
+    - name: Checkout
+      uses: actions/checkout@v4
+
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:

diff --git a/.github/workflows/mypy.yml b/.github/workflows/mypy.yml
@@ -10,7 +10,9 @@ jobs:
         python-version: ["3.11"]
 
     steps:
-    - uses: actions/checkout@v4
+    - name: Checkout
+      uses: actions/checkout@v4
+
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -10,12 +10,16 @@ jobs:
         python-version: ["3.11"]
 
     steps:
-    - uses: actions/checkout@v4
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        lfs: true
+
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
-
+        
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip

diff --git a/.gitignore b/.gitignore
@@ -2,7 +2,13 @@
 __pycache__/
 *.py[cod]
 *$py.class
+
+# large crystallography files -- some of which are used in tests
+# large files are accessed via github LFS
 *.ccp4
+*.map
+*.mtz
+*.pdb
 
 # C extensions
 *.so

diff --git a/README.md b/README.md
@@ -5,6 +5,5 @@ Map Enhancement Tools for Ephemeral Occupancy Refinement
 [![Pytest](https://github.com/alisiafadini/meteor/actions/workflows/tests.yml/badge.svg)](https://github.com/your_username/your_repo/actions/workflows/tests.yml)
 [![Mypy](https://github.com/alisiafadini/meteor/actions/workflows/mypy.yml/badge.svg)](https://github.com/your_username/your_repo/actions/workflows/mypy.yml)
 [![Ruff](https://github.com/alisiafadini/meteor/actions/workflows/lint.yml/badge.svg)](https://github.com/your_username/your_repo/actions/workflows/lint.yml)
-[![codecov](https://codecov.io/gh/your_username/your_repo/branch/main/graph/badge.svg)](https://codecov.io/gh/your_username/your_repo)
-
+[![codecov](https://codecov.io/gh/alisiafadini/meteor/graph/badge.svg?token=NCYMP06MNS)](https://codecov.io/gh/alisiafadini/meteor)
 Set of tools for crystallographic electron density maps containing low ligand/species populations.
diff --git a/meteor/io.py b/meteor/io.py
@@ -0,0 +1,76 @@
+"""https://www.ccp4.ac.uk/html/mtzformat.html
+https://www.globalphasing.com/buster/wiki/index.cgi?MTZcolumns
+"""
+
+from __future__ import annotations
+
+import re
+from typing import Final
+
+OBSERVED_INTENSITY_LABELS: Final[list[str]] = [
+    "I",  # generic
+    "IMEAN",  # CCP4
+    "I-obs",  # phenix
+]
+
+OBSERVED_AMPLITUDE_LABELS: Final[list[str]] = [
+    "F",  # generic
+    "FP",  # CCP4 & GLPh native
+    r"FPH\d",  # CCP4 derivative
+    "F-obs",  # phenix
+]
+
+OBSERVED_UNCERTAINTY_LABELS: Final[list[str]] = [
+    "SIGF",  # generic
+    "SIGFP",  # CCP4 & GLPh native
+    r"SIGFPH\d",  # CCP4
+]
+
+COMPUTED_AMPLITUDE_LABELS: Final[list[str]] = ["FC"]
+
+COMPUTED_PHASE_LABELS: Final[list[str]] = ["PHIC"]
+
+
+class AmbiguousMtzLabelError(ValueError): ...
+
+
+def _infer_mtz_label(labels_to_search: list[str], labels_to_look_for: list[str]) -> str:
+    # the next line consumes ["FOO", "BAR", "BAZ"] and produces regex strings like "^(FOO|BAR|BAZ)$"
+    regex = re.compile(f"^({'|'.join(labels_to_look_for)})$")
+    matches = [regex.match(label) for label in labels_to_search if regex.match(label) is not None]
+
+    if len(matches) == 0:
+        msg = "cannot infer MTZ column name; "
+        msg += f"cannot find any of {labels_to_look_for} in {labels_to_search}"
+        raise AmbiguousMtzLabelError(msg)
+    if len(matches) > 1:
+        msg = "cannot infer MTZ column name; "
+        msg += f">1 instance of {labels_to_look_for} in {labels_to_search}"
+        raise AmbiguousMtzLabelError(msg)
+
+    [match] = matches
+    if match is None:
+        msg = "`None` not filtered during regex matching"
+        raise RuntimeError(msg)
+
+    return match.group(0)
+
+
+def find_observed_intensity_label(mtz_column_labels: list[str]) -> str:
+    return _infer_mtz_label(mtz_column_labels, OBSERVED_INTENSITY_LABELS)
+
+
+def find_observed_amplitude_label(mtz_column_labels: list[str]) -> str:
+    return _infer_mtz_label(mtz_column_labels, OBSERVED_AMPLITUDE_LABELS)
+
+
+def find_observed_uncertainty_label(mtz_column_labels: list[str]) -> str:
+    return _infer_mtz_label(mtz_column_labels, OBSERVED_UNCERTAINTY_LABELS)
+
+
+def find_computed_amplitude_label(mtz_column_labels: list[str]) -> str:
+    return _infer_mtz_label(mtz_column_labels, COMPUTED_AMPLITUDE_LABELS)
+
+
+def find_computed_phase_label(mtz_column_labels: list[str]) -> str:
+    return _infer_mtz_label(mtz_column_labels, COMPUTED_PHASE_LABELS)
diff --git a/meteor/rsmap.py b/meteor/rsmap.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Any, ClassVar
+from typing import Any, Callable, ClassVar, Literal, overload
 
 import gemmi
 import numpy as np
@@ -69,7 +69,7 @@ def __init__(
         amplitude_column: str = "F",
         phase_column: str = "PHI",
         uncertainty_column: str | None = "SIGF",
-        **kwargs,
+        **kwargs: Any,
     ) -> None:
         super().__init__(data=data, **kwargs)
 
@@ -99,8 +99,8 @@ def __init__(
             self.uncertainties = self._verify_uncertainty_type(self.uncertainties, fix=True)
 
     @property
-    def _constructor(self):
-        def constructor_fxn(*args, **kwargs):
+    def _constructor(self) -> Callable[[Any], Map]:
+        def constructor_fxn(*args: Any, **kwargs: Any) -> Map:
             return Map(
                 *args,
                 amplitude_column=self._amplitude_column,
@@ -112,7 +112,7 @@ def constructor_fxn(*args, **kwargs):
         return constructor_fxn
 
     @property
-    def _constructor_sliced(self):
+    def _constructor_sliced(self) -> Callable[[Any], rs.DataSeries]:
         return rs.DataSeries
 
     def _verify_type(
@@ -175,7 +175,7 @@ def _verify_uncertainty_type(
             name, uncertainty_dtypes, dataseries, fix=fix, cast_fix_to=rs.StandardDeviationDtype()
         )
 
-    def __setitem__(self, key: str, value) -> None:
+    def __setitem__(self, key: str, value: Any) -> None:
         allowed = list(self.columns) + self._allowed_columns
         if key not in allowed:
             msg = "column assignment not allowed for Map objects"
@@ -191,11 +191,14 @@ def insert(self, loc: int, column: str, value: Any, *, allow_duplicates: bool =
             msg += "see also Map.set_uncertainties(...)"
             raise MapMutabilityError(msg)
 
-    def drop(self, labels=None, *, axis=0, columns=None, inplace=False, **kwargs):
-        if (axis == 1) or (columns is not None):
-            msg = "columns are fixed for Map objects"
-            raise MapMutabilityError(msg)
-        return super().drop(labels=labels, axis=axis, columns=columns, inplace=inplace, **kwargs)
+    @overload
+    def drop(self, labels: Any = None, *, inplace: Literal[True]) -> None: ...
+
+    @overload
+    def drop(self, labels: Any = None, *, inplace: Literal[False]) -> Map: ...
+
+    def drop(self, labels: Any = None, *, inplace: bool = False) -> None | Map:
+        return super().drop(labels=labels, axis="index", columns=None, inplace=inplace)
 
     def get_hkls(self) -> np.ndarray:
         # overwrite rs implt'n, return w/o modifying self -> same behavior, under testing - @tjlane
@@ -272,7 +275,7 @@ def set_uncertainties(self, values: rs.DataSeries, column_name: str = "SIGF") ->
     def complex_amplitudes(self) -> np.ndarray:
         return self.amplitudes.to_numpy() * np.exp(1j * np.deg2rad(self.phases.to_numpy()))
 
-    def canonicalize_amplitudes(self):
+    def canonicalize_amplitudes(self) -> None:
         canonicalize_amplitudes(
             self,
             amplitude_label=self._amplitude_column,

diff --git a/meteor/scale.py b/meteor/scale.py
@@ -41,6 +41,12 @@ def _compute_anisotropic_scale_factors(
     return anisotropic_scale_parameters[0] * np.exp(exponential_argument)
 
 
+def _check_index_consistency(series1: rs.DataSeries, series2: rs.DataSeries) -> None:
+    if not series1.index.equals(series2.index):
+        msg = f"indices of `{series1.name}`, `{series2.name}` differ"
+        raise IndexError(msg)
+
+
 def compute_scale_factors(
     *,
     reference_values: rs.DataSeries,
@@ -87,37 +93,38 @@ def compute_scale_factors(
     ----------
     [1] SCALEIT https://www.ccp4.ac.uk/html/scaleit.html
     """
-    common_miller_indices = reference_values.index.intersection(values_to_scale.index)
+    common_miller_indices: pd.Index = reference_values.index.intersection(values_to_scale.index)
+
+    common_reference_values: np.ndarray = reference_values.loc[common_miller_indices].to_numpy()
+    common_values_to_scale: np.ndarray = values_to_scale.loc[common_miller_indices].to_numpy()
 
     # if we are going to weight the scaling using the uncertainty values, then the weights will be
     #    inverse_sigma = 1 / sqrt{ sigmaA ** 2 + sigmaB ** 2 }
-    if reference_uncertainties is not None and to_scale_uncertainties is not None:
-        if not reference_uncertainties.index.equals(reference_values.index):
-            msg = "indices of `reference_uncertainties`, `reference_values` differ, cannot combine"
-            raise IndexError(msg)
-        if not to_scale_uncertainties.index.equals(values_to_scale.index):
-            msg = "indices of `to_scale_uncertainties`, `values_to_scale` differ, cannot combine"
-            raise IndexError(msg)
-
+    uncertainty_weights: np.ndarray
+    if (reference_uncertainties is not None) and (to_scale_uncertainties is not None):
+        _check_index_consistency(reference_values, reference_uncertainties)
+        _check_index_consistency(values_to_scale, to_scale_uncertainties)
         uncertainty_weights = np.sqrt(
-            np.square(reference_uncertainties.loc[common_miller_indices])
-            + np.square(to_scale_uncertainties.loc[common_miller_indices]),
+            np.square(reference_uncertainties.loc[common_miller_indices].to_numpy())
+            + np.square(to_scale_uncertainties.loc[common_miller_indices].to_numpy()),
         )
-
     else:
-        uncertainty_weights = 1.0
-
-    common_reference_values = reference_values.loc[common_miller_indices]
-    common_values_to_scale = values_to_scale.loc[common_miller_indices]
+        uncertainty_weights = np.array(1.0)
 
     def compute_residuals(scaling_parameters: ScaleParameters) -> np.ndarray:
         scale_factors = _compute_anisotropic_scale_factors(
             common_miller_indices,
             scaling_parameters,
         )
-        return uncertainty_weights * (
-            scale_factors * common_values_to_scale - common_reference_values
-        )
+
+        difference_after_scaling = scale_factors * common_values_to_scale - common_reference_values
+        residuals = uncertainty_weights * difference_after_scaling
+
+        if not isinstance(residuals, np.ndarray):
+            msg = "scipy optimizers' behavior is unstable unless `np.ndarray`s are used"
+            raise TypeError(msg)
+
+        return residuals
 
     initial_scaling_parameters: ScaleParameters = (1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)
     optimization_result = opt.least_squares(compute_residuals, initial_scaling_parameters)

diff --git a/meteor/sfcalc.py b/meteor/sfcalc.py
@@ -0,0 +1,27 @@
+from pathlib import Path
+
+import gemmi
+
+from .rsmap import Map
+
+
+def structure_to_calculated_map(structure: gemmi.Structure, *, high_resolution_limit: float) -> Map:
+    density_map = gemmi.DensityCalculatorX()
+    density_map.d_min = high_resolution_limit
+    density_map.grid.setup_from(structure)
+    for i, _ in enumerate(structure):
+        density_map.put_model_density_on_grid(structure[i])
+
+    ccp4_map = gemmi.Ccp4Map()
+    ccp4_map.grid = density_map.grid
+    ccp4_map.update_ccp4_header()
+
+    return Map.from_ccp4_map(ccp4_map, high_resolution_limit=high_resolution_limit)
+
+
+def pdb_to_calculated_map(pdb_file: Path, *, high_resolution_limit: float) -> Map:
+    if not pdb_file.exists():
+        msg = f"could not find file: {pdb_file}"
+        raise OSError(msg)
+    structure = gemmi.read_structure(str(pdb_file))
+    return structure_to_calculated_map(structure, high_resolution_limit=high_resolution_limit)
diff --git a/meteor/testing.py b/meteor/testing.py
@@ -2,6 +2,7 @@
 
 from dataclasses import dataclass
 
+import gemmi
 import numpy as np
 
 
@@ -12,10 +13,59 @@ class MapColumns:
     uncertainty: str | None = None
 
 
-def assert_phases_allclose(array1: np.ndarray, array2: np.ndarray, atol=1e-3):
+def assert_phases_allclose(array1: np.ndarray, array2: np.ndarray, atol: float = 1e-3) -> None:
     diff = array2 - array1
     diff = (diff + 180) % 360 - 180
     absolute_difference = np.sum(np.abs(diff)) / float(np.prod(diff.shape))
     if not absolute_difference < atol:
         msg = f"per element diff {absolute_difference} > tolerance {atol}"
         raise AssertionError(msg)
+
+
+def single_carbon_structure(
+    carbon_position: tuple[float, float, float],
+    space_group: gemmi.SpaceGroup,
+    unit_cell: gemmi.UnitCell,
+) -> gemmi.Structure:
+    model = gemmi.Model("single_atom")
+    chain = gemmi.Chain("A")
+
+    residue = gemmi.Residue()
+    residue.name = "X"
+    residue.seqid = gemmi.SeqId("1")
+
+    atom = gemmi.Atom()
+    atom.name = "C"
+    atom.element = gemmi.Element("C")
+    atom.pos = gemmi.Position(*carbon_position)
+
+    residue.add_atom(atom)
+    chain.add_residue(residue)
+    model.add_chain(chain)
+
+    structure = gemmi.Structure()
+    structure.add_model(model)
+    structure.cell = unit_cell
+    structure.spacegroup_hm = space_group.hm
+
+    return structure
+
+
+def single_carbon_density(
+    carbon_position: tuple[float, float, float],
+    space_group: gemmi.SpaceGroup,
+    unit_cell: gemmi.UnitCell,
+    high_resolution_limit: float,
+) -> gemmi.Ccp4Map:
+    structure = single_carbon_structure(carbon_position, space_group, unit_cell)
+
+    density_map = gemmi.DensityCalculatorX()
+    density_map.d_min = high_resolution_limit
+    density_map.grid.setup_from(structure)
+    density_map.put_model_density_on_grid(structure[0])
+
+    ccp4_map = gemmi.Ccp4Map()
+    ccp4_map.grid = density_map.grid
+    ccp4_map.update_ccp4_header()
+
+    return ccp4_map
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		test/data/scaled-test-data.mtz filter=lfs diff=lfs merge=lfs -text