diff --git a/package/AUTHORS b/package/AUTHORS index c125912f9b3..3f63933159a 100644 --- a/package/AUTHORS +++ b/package/AUTHORS @@ -185,6 +185,7 @@ Chronological list of authors - Mingyi Xue - Meghan Osato - Anirvinya G + - Rishabh Shukla External code ------------- diff --git a/package/CHANGELOG b/package/CHANGELOG index 1a60b1b1e53..f6db2084b28 100644 --- a/package/CHANGELOG +++ b/package/CHANGELOG @@ -14,7 +14,7 @@ The rules for this file: ------------------------------------------------------------------------------ ??/??/?? IAlibay, BFedder, inomag, Agorfa, aya9aladdin, shudipto-amin, cbouy, - HenokB, umak1106, tamandeeps, Mrqeoqqt, megosato, AnirG + HenokB, umak1106, tamandeeps, Mrqeoqqt, megosato, AnirG, rishu235 * 2.2.0 @@ -36,6 +36,8 @@ Fixes * Fixed FrameIteratorIndices doesn't rewind. (Issue #3416) Enhancements + * PCA class now has an inverse-PCA function. The inverse transform can + also be extrapolated to certain non-PCA atoms. (PR #3596, Issue #2703) * `DistanceMatrix` class in MDAnalysis.analysis.diffusionmap now also accepts `AtomGroup` as a valid argument (Issue #3486, PR #3541) * Improves the RDKitConverter's accuracy (PR #3044): AtomGroups containing diff --git a/package/MDAnalysis/analysis/pca.py b/package/MDAnalysis/analysis/pca.py index a4066ed78be..780c36e41f8 100644 --- a/package/MDAnalysis/analysis/pca.py +++ b/package/MDAnalysis/analysis/pca.py @@ -414,6 +414,174 @@ def transform(self, atomgroup, n_components=None, start=None, stop=None, return dot + def project_single_frame(self, components=None, group=None, anchor=None): + r"""Computes a function to project structures onto selected PCs + + Applies Inverse-PCA transform to the PCA atomgroup. + Optionally, calculates one displacement vector per residue + to extrapolate the transform to atoms not in the PCA atomgroup. + + Parameters + ---------- + components : int, array, optional + Components to be projected onto. + The default ``None`` maps onto all components. + + group : AtomGroup, optional + The AtomGroup containing atoms to be projected. + The projection applies to whole residues in ``group``. + The atoms in the PCA class are not affected by this argument. + The default ``None`` does not extrapolate the projection + to non-PCA atoms. + + anchor : string, optional + The string to select the PCA atom whose displacement vector + is applied to non-PCA atoms in a residue. The ``anchor`` selection + is applied to ``group``.The resulting atomselection must have + exactly one PCA atom in each residue of ``group``. + The default ``None`` does not extrapolate the projection + to non-PCA atoms. + + Returns + ------- + function + The resulting function f(ts) takes as input a + :class:`~MDAnalysis.coordinates.base.Timestep` ts, + and returns ts with the projected structure + + .. warning:: + The transformation function takes a :class:``Timestep`` as input + because this is required for :ref:``transformations``. + However, the inverse-PCA transformation is applied on the atoms + of the Universe that was used for the PCA. It is *expected* + that the `ts` is from the same Universe but this is + currently not checked. + + Notes + ----- + When the PCA class is run for an atomgroup, the principal components + are cached. The trajectory can then be projected onto one or more of + these principal components. Since the principal components are sorted + in the order of decreasing explained variance, the first few components + capture the essential molecular motion. + + If N is the number of atoms in the PCA group, each component has the + length 3N. A PCA score :math:`w\_i`, along component :math:`u\_i`, is + calculated for a set of coordinates :math:`(r(t))` of the same atoms. + The PCA scores are then used to transform the structure, :math:`(r(t))` + at a timestep, back to the original space. + + .. math:: + + w_{i}(t) = ({\textbf r}(t) - \bar{{\textbf r}}) \cdot + {\textbf u}_i \\ + {\textbf r'}(t) = (w_{i}(t) \cdot {\textbf u}_i^T) + + \bar{{\textbf r}} + + For each residue, the projection can be extended to atoms that were + not part of PCA by applying the displacement vector of a PCA atom to + all the atoms in the residue. This could be useful to preserve the bond + distance between a PCA atom and other non-PCA atoms in a residue. + + If there are r residues and n non-PCA atoms in total, the displacement + vector has the size 3r. This needs to be broadcasted to a size 3n. An + extrapolation trick is used to shape the array, since going over each + residue for each frame can be expensive. Non-PCA atoms' displacement + vector is calculated with fancy indexing on the anchors' displacement + vector. `index_extrapolate` saves which atoms belong to which anchors. + If there are two non-PCA atoms in the first anchor's residue and three + in the second anchor's residue, `index_extrapolate` is [0, 0, 1, 1, 1] + + Examples + -------- + Run PCA class before using this function. For backbone PCA, run:: + + pca = PCA(universe, select='backbone').run() + + Obtain a transformation function to project the + backbone trajectory onto the first principal component:: + + project = pca.project_single_frame(components=0) + + To project onto the first two components, run:: + + project = pca.project_single_frame(components=[0,1]) + + Alternatively, the transformation can be applied to PCA atoms and + extrapolated to other atoms according to the CA atom's translation + in each residue:: + + all = u.select_atoms('all') + project = pca.project_single_frame(components=0, + group=all, anchor='name CA') + + Finally, apply the transformation function to a timestep:: + + project(u.trajectory.ts) + + or apply the projection to the universe:: + + u.trajectory.add_transformations(project) + + + .. versionadded:: 2.2.0 + """ + if not self._calculated: + raise ValueError('Call run() on the PCA before projecting') + + if group is not None: + if anchor is None: + raise ValueError("'anchor' cannot be 'None'" + + " if 'group' is not 'None'") + + anchors = group.select_atoms(anchor) + anchors_res_ids = anchors.resindices + if np.unique(anchors_res_ids).size != anchors_res_ids.size: + raise ValueError("More than one 'anchor' found in residues") + if not np.isin(group.resindices, anchors_res_ids).all(): + raise ValueError("Some residues in 'group'" + + " do not have an 'anchor'") + if not anchors.issubset(self._atoms): + raise ValueError("Some 'anchors' are not part of PCA class") + + # non_pca has "all" the atoms in residues of `group`. This makes + # sure that extrapolation works on residues, not random atoms. + non_pca = group.residues.atoms - self._atoms + pca_res_indices, pca_res_counts = np.unique( + self._atoms.resindices, return_counts=True) + + non_pca_atoms = np.array([], dtype=int) + for res in group.residues: + # n_common is the number of pca atoms in a residue + n_common = pca_res_counts[np.where( + pca_res_indices == res.resindex)][0] + non_pca_atoms = np.append(non_pca_atoms, + res.atoms.n_atoms - n_common) + # index_extrapolate records the anchor number for each non-PCA atom + index_extrapolate = np.repeat(np.arange(anchors.atoms.n_atoms), + non_pca_atoms) + + if components is None: + components = np.arange(self.results.p_components.shape[1]) + + def wrapped(ts): + """Projects a timestep""" + if group is not None: + anchors_coords_old = anchors.positions + + xyz = self._atoms.positions.ravel() - self._xmean + self._atoms.positions = np.reshape( + (np.dot(np.dot(xyz, self._p_components[:, components]), + self._p_components[:, components].T) + + self._xmean), (-1, 3) + ) + + if group is not None: + non_pca.positions += (anchors.positions - + anchors_coords_old)[index_extrapolate] + return ts + return wrapped + @due.dcite( Doi('10.1002/(SICI)1097-0134(19990901)36:4<419::AID-PROT5>3.0.CO;2-U'), Doi('10.1529/biophysj.104.052449'), diff --git a/testsuite/MDAnalysisTests/analysis/test_pca.py b/testsuite/MDAnalysisTests/analysis/test_pca.py index 931a04673f8..c77129ac64d 100644 --- a/testsuite/MDAnalysisTests/analysis/test_pca.py +++ b/testsuite/MDAnalysisTests/analysis/test_pca.py @@ -27,7 +27,7 @@ rmsip, cumulative_overlap) from numpy.testing import (assert_almost_equal, assert_equal, - assert_array_almost_equal) + assert_array_almost_equal, assert_allclose) from MDAnalysisTests.datafiles import (PSF, DCD, RANDOM_WALK, RANDOM_WALK_TOPO, waterPSF, waterDCD) @@ -41,6 +41,12 @@ def u(): return mda.Universe(PSF, DCD) +@pytest.fixture(scope='function') +def u_fresh(): + # each test gets a fresh universe + return mda.Universe(PSF, DCD) + + @pytest.fixture(scope='module') def u_aligned(): u = mda.Universe(PSF, DCD, in_memory=True) @@ -145,6 +151,110 @@ def test_transform_universe(): pca_test.transform(u2) +def test_project_no_pca_run(u, pca): + pca_class = PCA(u, select=SELECTION) + with pytest.raises(ValueError) as exc: + pca_class.project_single_frame() + assert 'Call run() on the PCA before projecting' in str(exc.value) + + +def test_project_none_anchor(u, pca): + group = u.select_atoms('resnum 1') + with pytest.raises(ValueError) as exc: + func = pca.project_single_frame(0, group=group, anchor=None) + assert ("'anchor' cannot be 'None'" + + " if 'group' is not 'None'") in str(exc.value) + + +def test_project_more_anchor(u, pca): + group = u.select_atoms('resnum 1') + with pytest.raises(ValueError) as exc: + project = pca.project_single_frame(0, group=group, anchor='backbone') + assert "More than one 'anchor' found in residues" in str(exc.value) + + +def test_project_less_anchor(u, pca): + group = u.select_atoms('all') + with pytest.raises(ValueError) as exc: + project = pca.project_single_frame(0, group=group, anchor='name CB') + assert ("Some residues in 'group'" + + " do not have an 'anchor'") in str(exc.value) + + +def test_project_invalid_anchor(u): + pca = PCA(u, select='name CA').run() + group = u.select_atoms('all') + with pytest.raises(ValueError) as exc: + project = pca.project_single_frame(0, group=group, anchor='name N') + assert "Some 'anchors' are not part of PCA class" in str(exc.value) + + +def test_project_compare_projections(u_fresh): + # projections along different PCs should be different + pca = PCA(u_fresh, select=SELECTION).run() + project0 = pca.project_single_frame(0) + project1 = pca.project_single_frame(1) + + u_fresh.trajectory[0] + coord0 = project0(u_fresh.trajectory.ts).positions + u_fresh.trajectory[0] + coord1 = project1(u_fresh.trajectory.ts).positions + assert not np.allclose(coord0, coord1, rtol=1e-05) + + +def test_project_reconstruct_whole(u, u_fresh): + # structure projected along all PCs + # should be same as the original structure + pca = PCA(u_fresh, select=SELECTION).run() + project = pca.project_single_frame() + + coord_original = u.trajectory.ts.positions + coord_reconstructed = project(u_fresh.trajectory.ts).positions + assert_allclose(coord_original, coord_reconstructed, rtol=1e-5) + + +@pytest.mark.parametrize( + ("n1", "n2"), + [(0, 0), (0, [0]), ([0, 1], [0, 1]), (0, 1), (1, 0)] +) +def test_project_twice_projection(u_fresh, n1, n2): + # Two succesive projections are applied. The second projection does nothing + # if both projections are along the same PC(s). + # Single PC input as an array should be equivalent to a scalar + pca = PCA(u_fresh, select=SELECTION).run() + + project_first = pca.project_single_frame(n1) + project_second = pca.project_single_frame(n2) + + u_fresh.trajectory[0] + coord1 = project_first(u_fresh.trajectory.ts).positions.copy() + coord2 = project_second(u_fresh.trajectory.ts).positions + + if np.array_equiv(n1, n2): + assert np.allclose(coord1, coord2, rtol=1e-5) + else: + assert not np.allclose(coord1, coord2, rtol=1e-05) + + +def test_project_extrapolate_translation(u_fresh): + # when the projection is extended to non-PCA atoms, + # non-PCA atoms' coordinates will be conserved relative to the anchor atom + pca = PCA(u_fresh, select='resnum 1 and backbone').run() + sel = 'resnum 1 and name CA CB CG' + group = u_fresh.select_atoms(sel) + project = pca.project_single_frame(0, group=group, + anchor='name CA') + + distances_original = ( + mda.lib.distances.self_distance_array(group.positions) + ) + distances_new = ( + mda.lib.distances.self_distance_array(project(group).positions) + ) + + assert_allclose(distances_original, distances_new, rtol=1e-05) + + def test_cosine_content(): rand = mda.Universe(RANDOM_WALK_TOPO, RANDOM_WALK) pca_random = PCA(rand).run()