Commit
Add plain Gradient Descent optimizer (Qiskit/qiskit#6459)
* implement plain Gradient Descent optimizer
* replace leftover "SPSA" -> "GradientDescent"
* update docstring in init
* lint
* add a proper docstring with examples
* add callback test, mark slow test as slow_test
* add reno
* add test for learning rate as iterator
* add stepsize to callback
* Update qiskit/algorithms/optimizers/gradient_descent.py: Add copyright
* fix SPSA -> Gradient descent in docstrings

Co-authored-by: Steve Wood <40241007+woodsp-ibm@users.noreply.github.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
1 parent e2b7c86 · commit bd3697f
Showing 4 changed files with 335 additions and 4 deletions.
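For context, here is a minimal sketch of the callback hook this commit adds ("add stepsize to callback"). The recording logic and the toy objective are illustrative, not part of the commit; the callback signature follows the docstring in the new module below.

import numpy as np
from qiskit.algorithms.optimizers import GradientDescent

history = []  # one (nfev, parameters, value, gradient norm) tuple per iteration

def record(nfev, params, value, stepsize):
    history.append((nfev, params, value, stepsize))

def f(x):
    return (np.linalg.norm(x) - 1) ** 2

optimizer = GradientDescent(maxiter=20, callback=record)
optimizer.optimize(3, f, initial_point=np.array([1.0, 0.5, -0.2]))

# Expect up to ``maxiter`` entries, one per iteration.
print(len(history))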
@@ -0,0 +1,201 @@
# This code is part of Qiskit.
#
# (C) Copyright IBM 2021.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.

"""A standard gradient descent optimizer."""

from typing import Iterator, Optional, Union, Callable
from functools import partial

import numpy as np

from .optimizer import Optimizer, OptimizerSupportLevel

CALLBACK = Callable[[int, np.ndarray, float, float], None]


class GradientDescent(Optimizer):
    r"""The gradient descent minimization routine.

    For a function :math:`f` and an initial point :math:`\vec\theta_0`, the standard (or "vanilla")
    gradient descent method is an iterative scheme to find the minimum :math:`\vec\theta^*` of
    :math:`f` by updating the parameters in the direction of the negative gradient of :math:`f`,

    .. math::

        \vec\theta_{n+1} = \vec\theta_{n} - \eta \vec\nabla f(\vec\theta_{n}),

    for a small learning rate :math:`\eta > 0`.

    You can either provide the analytic gradient :math:`\vec\nabla f` as ``gradient_function``
    in the ``optimize`` method, or, if you do not provide it, a finite difference approximation
    of the gradient is used. To adapt the size of the perturbation in the finite difference
    gradients, set the ``perturbation`` property in the initializer.

    This optimizer supports a callback function. If provided in the initializer, the optimizer
    will call the callback in each iteration with the following information in this order:
    current number of function evaluations, current parameters, current function value, norm of
    the current gradient.

    Examples:

        A minimal example that will use finite difference gradients with a default perturbation
        of 0.01 and a default learning rate of 0.01.

        .. code-block:: python

            import numpy as np
            from qiskit.algorithms.optimizers import GradientDescent

            def f(x):
                return (np.linalg.norm(x) - 1) ** 2

            initial_point = np.array([1, 0.5, -0.2])

            optimizer = GradientDescent(maxiter=100)
            x_opt, fx_opt, nfevs = optimizer.optimize(initial_point.size,
                                                      f,
                                                      initial_point=initial_point)

            print(f"Found minimum {x_opt} at a value of {fx_opt} using {nfevs} evaluations.")

        An example where the learning rate is an iterator and we supply the analytic gradient.
        Note how much faster this converges (i.e. fewer ``nfevs``) compared to the previous
        example.

        .. code-block:: python

            import numpy as np
            from qiskit.algorithms.optimizers import GradientDescent

            def learning_rate():
                power = 0.6
                constant_coeff = 0.1

                def powerlaw():
                    n = 0
                    while True:
                        yield constant_coeff * (n ** power)
                        n += 1

                return powerlaw()

            def f(x):
                return (np.linalg.norm(x) - 1) ** 2

            def grad_f(x):
                return 2 * (np.linalg.norm(x) - 1) * x / np.linalg.norm(x)

            initial_point = np.array([1, 0.5, -0.2])

            optimizer = GradientDescent(maxiter=100, learning_rate=learning_rate)
            x_opt, fx_opt, nfevs = optimizer.optimize(initial_point.size,
                                                      f,
                                                      gradient_function=grad_f,
                                                      initial_point=initial_point)

            print(f"Found minimum {x_opt} at a value of {fx_opt} using {nfevs} evaluations.")
    """

    def __init__(
        self,
        maxiter: int = 100,
        learning_rate: Union[float, Callable[[], Iterator]] = 0.01,
        tol: float = 1e-7,
        callback: Optional[CALLBACK] = None,
        perturbation: Optional[float] = None,
    ) -> None:
        r"""
        Args:
            maxiter: The maximum number of iterations.
            learning_rate: A constant or generator yielding learning rates for the parameter
                updates. See the docstring for an example.
            tol: If the norm of the parameter update is smaller than this threshold, the
                optimizer has converged.
            callback: A callback function called in each iteration with, in this order: the
                current number of function evaluations, current parameters, current function
                value, and the norm of the current gradient.
            perturbation: If no gradient is passed to ``GradientDescent.optimize`` the gradient is
                approximated with a symmetric finite difference scheme with ``perturbation``
                perturbation in both directions (defaults to 1e-2 if required).
                Ignored if a gradient callable is passed to ``GradientDescent.optimize``.
        """
        super().__init__()

        self.maxiter = maxiter
        self.learning_rate = learning_rate
        self.perturbation = perturbation
        self.tol = tol
        self.callback = callback

    def _minimize(self, loss, grad, initial_point):
        # set learning rate
        if isinstance(self.learning_rate, float):
            eta = constant(self.learning_rate)
        else:
            eta = self.learning_rate()

        if grad is None:
            eps = 0.01 if self.perturbation is None else self.perturbation
            grad = partial(
                Optimizer.gradient_num_diff,
                f=loss,
                epsilon=eps,
                max_evals_grouped=self._max_evals_grouped,
            )

        # prepare some initials
        x = np.asarray(initial_point)
        nfevs = 0

        for _ in range(1, self.maxiter + 1):
            # compute update -- gradient evaluation counts as one function evaluation
            update = grad(x)
            nfevs += 1

            # compute next parameter value
            x_next = x - next(eta) * update

            # send information to callback
            stepsize = np.linalg.norm(update)
            if self.callback is not None:
                self.callback(nfevs, x_next, loss(x_next), stepsize)

            # update parameters
            x = x_next

            # check termination
            if stepsize < self.tol:
                break

        return x, loss(x), nfevs

    def get_support_level(self):
        """Get the support level dictionary."""
        return {
            "gradient": OptimizerSupportLevel.supported,
            "bounds": OptimizerSupportLevel.ignored,
            "initial_point": OptimizerSupportLevel.required,
        }

    # pylint: disable=unused-argument
    def optimize(
        self,
        num_vars,
        objective_function,
        gradient_function=None,
        variable_bounds=None,
        initial_point=None,
    ):
        return self._minimize(objective_function, gradient_function, initial_point)


def constant(eta=0.01):
    """Yield a constant."""

    while True:
        yield eta
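Because ``learning_rate`` accepts any zero-argument callable returning an iterator (``eta = self.learning_rate()`` above), schedules other than the power law in the docstring can be plugged in. A sketch with a hypothetical exponentially decaying schedule (the constants 0.2 and 0.9 are made up for illustration):

import numpy as np
from qiskit.algorithms.optimizers import GradientDescent

def decaying_rate():
    """Hypothetical schedule eta_n = 0.2 * 0.9**n."""
    n = 0
    while True:
        yield 0.2 * 0.9 ** n
        n += 1

def f(x):
    return (np.linalg.norm(x) - 1) ** 2

def grad_f(x):
    return 2 * (np.linalg.norm(x) - 1) * x / np.linalg.norm(x)

initial_point = np.array([1.0, 0.5, -0.2])

# Pass the generator function itself; the optimizer calls it once to obtain the iterator.
optimizer = GradientDescent(maxiter=50, learning_rate=decaying_rate)
x_opt, fx_opt, nfevs = optimizer.optimize(
    initial_point.size, f, gradient_function=grad_f, initial_point=initial_point
)
print(f"Found minimum {x_opt} at a value of {fx_opt} using {nfevs} evaluations.")

The second new file in the commit, shown below, adds the tests.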
@@ -0,0 +1,116 @@
# This code is part of Qiskit.
#
# (C) Copyright IBM 2021.
#
# This code is licensed under the Apache License, Version 2.0. You may
# obtain a copy of this license in the LICENSE.txt file in the root directory
# of this source tree or at http://www.apache.org/licenses/LICENSE-2.0.
#
# Any modifications or derivative works of this code must retain this
# copyright notice, and modified files need to carry a notice indicating
# that they have been altered from the originals.

"""Tests for the Gradient Descent optimizer."""

from test.python.algorithms import QiskitAlgorithmsTestCase

import numpy as np

from qiskit.algorithms.optimizers import GradientDescent
from qiskit.circuit.library import PauliTwoDesign
from qiskit.opflow import I, Z, StateFn
from qiskit.test.decorators import slow_test


class TestGradientDescent(QiskitAlgorithmsTestCase):
    """Tests for the gradient descent optimizer."""

    def setUp(self):
        super().setUp()
        np.random.seed(12)

    @slow_test
    def test_pauli_two_design(self):
        """Test standard gradient descent on the Pauli two-design example."""
        circuit = PauliTwoDesign(3, reps=3, seed=2)
        parameters = list(circuit.parameters)
        obs = Z ^ Z ^ I
        expr = ~StateFn(obs) @ StateFn(circuit)

        initial_point = np.array(
            [
                0.1822308,
                -0.27254251,
                0.83684425,
                0.86153976,
                -0.7111668,
                0.82766631,
                0.97867993,
                0.46136964,
                2.27079901,
                0.13382699,
                0.29589915,
                0.64883193,
            ]
        )

        def objective(x):
            return expr.bind_parameters(dict(zip(parameters, x))).eval().real

        optimizer = GradientDescent(maxiter=100, learning_rate=0.1, perturbation=0.1)

        result = optimizer.optimize(circuit.num_parameters, objective, initial_point=initial_point)

        self.assertLess(result[1], -0.95)  # final loss
        self.assertEqual(result[2], 100)  # function evaluations

    def test_callback(self):
        """Test the callback."""

        history = []

        def callback(*args):
            history.append(args)

        optimizer = GradientDescent(maxiter=1, callback=callback)

        def objective(x):
            return np.linalg.norm(x)

        _ = optimizer.optimize(2, objective, initial_point=np.array([1, -1]))

        self.assertEqual(len(history), 1)
        self.assertIsInstance(history[0][0], int)  # nfevs
        self.assertIsInstance(history[0][1], np.ndarray)  # parameters
        self.assertIsInstance(history[0][2], float)  # function value
        self.assertIsInstance(history[0][3], float)  # norm of the gradient

    def test_iterator_learning_rate(self):
        """Test setting the learning rate as iterator."""

        def learning_rate():
            power = 0.6
            constant_coeff = 0.1

            def powerlaw():
                n = 0
                while True:
                    yield constant_coeff * (n ** power)
                    n += 1

            return powerlaw()

        def objective(x):
            return (np.linalg.norm(x) - 1) ** 2

        def grad(x):
            return 2 * (np.linalg.norm(x) - 1) * x / np.linalg.norm(x)

        initial_point = np.array([1, 0.5, -2])

        optimizer = GradientDescent(maxiter=20, learning_rate=learning_rate)
        _, fx_opt, _ = optimizer.optimize(
            initial_point.size, objective, gradient_function=grad, initial_point=initial_point
        )

        self.assertLess(fx_opt, 1e-5)
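One behaviour worth noting from ``_minimize``: the loop breaks as soon as the gradient norm falls below ``tol``, so on a simple quadratic with an analytic gradient the run stops well before ``maxiter``. A sketch (the learning rate and objective below are illustrative, not from the commit):

import numpy as np
from qiskit.algorithms.optimizers import GradientDescent

def objective(x):
    return float(np.dot(x, x))  # quadratic bowl with minimum at the origin

def grad(x):
    return 2 * x  # analytic gradient

optimizer = GradientDescent(maxiter=100, learning_rate=0.4)
x_opt, fx_opt, nfevs = optimizer.optimize(
    2, objective, gradient_function=grad, initial_point=np.array([1.0, 1.0])
)

# Each step scales x by 0.2, so the gradient norm drops below the default
# tol of 1e-7 after roughly a dozen iterations and nfevs stays far below 100.
print(x_opt, fx_opt, nfevs)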