diff --git a/keras/api/api_init_files.bzl b/keras/api/api_init_files.bzl index 73ad12c98ded..e0ef35c38b03 100644 --- a/keras/api/api_init_files.bzl +++ b/keras/api/api_init_files.bzl @@ -53,6 +53,7 @@ KERAS_API_INIT_FILES = [ "keras/preprocessing/text/__init__.py", "keras/regularizers/__init__.py", "keras/utils/__init__.py", + "keras/utils/experimental/__init__.py", "keras/wrappers/__init__.py", "keras/wrappers/scikit_learn/__init__.py", ] diff --git a/keras/applications/efficientnet.py b/keras/applications/efficientnet.py index a65183c9cff0..853cc3255436 100644 --- a/keras/applications/efficientnet.py +++ b/keras/applications/efficientnet.py @@ -739,6 +739,23 @@ def EfficientNetB7(include_top=True, @keras_export('keras.applications.efficientnet.preprocess_input') def preprocess_input(x, data_format=None): # pylint: disable=unused-argument + """A placeholder method for backward compatibility. + + The preprocessing logic has been included in the efficientnet model + implementation. Users are no longer required to call this method to normalize + the input data. This method does nothing and only kept as a placeholder to + align the API surface between old and new version of model. + + Args: + x: A floating point `numpy.array` or a `tf.Tensor`. + data_format: Optional data format of the image tensor/array. Defaults to + None, in which case the global setting + `tf.keras.backend.image_data_format()` is used (unless you changed it, + it defaults to "channels_last").{mode} + + Returns: + Unchanged `numpy.array` or `tf.Tensor`. + """ return x diff --git a/keras/callbacks.py b/keras/callbacks.py index eda539ae701f..eb4f40188101 100644 --- a/keras/callbacks.py +++ b/keras/callbacks.py @@ -592,6 +592,27 @@ def __iter__(self): class Callback(object): """Abstract base class used to build new callbacks. + Callbacks can be passed to keras methods such as `fit`, `evaluate`, and + `predict` in order to hook into the various stages of the model training and + inference lifecycle. + + To create a custom callback, subclass `keras.callbacks.Callback` and override + the method associated with the stage of interest. See + https://www.tensorflow.org/guide/keras/custom_callback for more information. + + Example: + + >>> training_finished = False + >>> class MyCallback(tf.keras.callbacks.Callback): + ... def on_train_end(self, logs=None): + ... global training_finished + ... training_finished = True + >>> model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))]) + >>> model.compile(loss='mean_squared_error') + >>> model.fit(tf.constant([[1.0]]), tf.constant([[1.0]]), + ... callbacks=[MyCallback()]) + >>> assert training_finished == True + Attributes: params: Dict. Training parameters (eg. verbosity, batch size, number of epochs...). diff --git a/keras/constraints.py b/keras/constraints.py index adcc7e8ad7e5..4c758e952a21 100644 --- a/keras/constraints.py +++ b/keras/constraints.py @@ -31,11 +31,53 @@ @keras_export('keras.constraints.Constraint') class Constraint(object): + """Base class for weight constraints. + + A `Constraint` instance works like a stateless function. + Users who subclass this + class should override the `__call__` method, which takes a single + weight parameter and return a projected version of that parameter + (e.g. normalized or clipped). Constraints can be used with various Keras + layers via the `kernel_constraint` or `bias_constraint` arguments. + + Here's a simple example of a non-negative weight constraint: + + >>> class NonNegative(tf.keras.constraints.Constraint): + ... 
+ ... def __call__(self, w): + ... return w * tf.cast(tf.math.greater_equal(w, 0.), w.dtype) + + >>> weight = tf.constant((-1.0, 1.0)) + >>> NonNegative()(weight) + + + >>> tf.keras.layers.Dense(4, kernel_constraint=NonNegative()) + """ def __call__(self, w): + """Applies the constraint to the input weight variable. + + By default, the inputs weight variable is not modified. + Users should override this method to implement their own projection + function. + + Args: + w: Input weight variable. + + Returns: + Projected variable (by default, returns unmodified inputs). + """ return w def get_config(self): + """Returns a Python dict of the object config. + + A constraint config is a Python dictionary (JSON-serializable) that can + be used to reinstantiate the same object. + + Returns: + Python dict containing the configuration of the constraint object. + """ return {} diff --git a/keras/distribute/mirrored_strategy_test.py b/keras/distribute/mirrored_strategy_test.py index 8fe43f7d5432..e904b4595790 100644 --- a/keras/distribute/mirrored_strategy_test.py +++ b/keras/distribute/mirrored_strategy_test.py @@ -120,7 +120,7 @@ def step_fn(inputs): num_epochs = 4 num_steps = 7 for _ in range(num_epochs): - accuracy.reset_states() + accuracy.reset_state() for _ in range(num_steps): train_step(distributed_iterator) diff --git a/keras/distribute/multi_worker_test.py b/keras/distribute/multi_worker_test.py index fa2d375fe6a8..05d514b04152 100644 --- a/keras/distribute/multi_worker_test.py +++ b/keras/distribute/multi_worker_test.py @@ -292,7 +292,7 @@ def step_fn(inputs): num_epochs = 4 num_steps = 7 for _ in range(num_epochs): - accuracy.reset_states() + accuracy.reset_state() for _ in range(num_steps): train_step(distributed_iterator) diff --git a/keras/distribute/parameter_server_training_test.py b/keras/distribute/parameter_server_training_test.py index 3cdef3a0b2ba..48babb6ea044 100644 --- a/keras/distribute/parameter_server_training_test.py +++ b/keras/distribute/parameter_server_training_test.py @@ -167,7 +167,7 @@ def replica_fn(iterator): distributed_dataset = self.coordinator.create_per_worker_dataset(dataset_fn) distributed_iterator = iter(distributed_dataset) for _ in range(4): - accuracy.reset_states() + accuracy.reset_state() for _ in range(7): self.coordinator.schedule(worker_fn, args=(distributed_iterator,)) self.coordinator.join() diff --git a/keras/engine/base_layer.py b/keras/engine/base_layer.py index 528055223fa6..9639b702470b 100644 --- a/keras/engine/base_layer.py +++ b/keras/engine/base_layer.py @@ -1683,48 +1683,48 @@ def add_update(self, updates, inputs=None): update() # pylint: disable=not-callable def set_weights(self, weights): - """Sets the weights of the layer, from Numpy arrays. + """Sets the weights of the layer, from NumPy arrays. The weights of a layer represent the state of the layer. This function sets the weight values from numpy arrays. The weight values should be passed in the order they are created by the layer. Note that the layer's - weights must be instantiated before calling this function by calling + weights must be instantiated before calling this function, by calling the layer. - For example, a Dense layer returns a list of two values-- per-output - weights and the bias value. These can be used to set the weights of another - Dense layer: + For example, a `Dense` layer returns a list of two values: the kernel matrix + and the bias vector. 
These can be used to set the weights of another + `Dense` layer: - >>> a = tf.keras.layers.Dense(1, + >>> layer_a = tf.keras.layers.Dense(1, ... kernel_initializer=tf.constant_initializer(1.)) - >>> a_out = a(tf.convert_to_tensor([[1., 2., 3.]])) - >>> a.get_weights() + >>> a_out = layer_a(tf.convert_to_tensor([[1., 2., 3.]])) + >>> layer_a.get_weights() [array([[1.], [1.], [1.]], dtype=float32), array([0.], dtype=float32)] - >>> b = tf.keras.layers.Dense(1, + >>> layer_b = tf.keras.layers.Dense(1, ... kernel_initializer=tf.constant_initializer(2.)) - >>> b_out = b(tf.convert_to_tensor([[10., 20., 30.]])) - >>> b.get_weights() + >>> b_out = layer_b(tf.convert_to_tensor([[10., 20., 30.]])) + >>> layer_b.get_weights() [array([[2.], [2.], [2.]], dtype=float32), array([0.], dtype=float32)] - >>> b.set_weights(a.get_weights()) - >>> b.get_weights() + >>> layer_b.set_weights(layer_a.get_weights()) + >>> layer_b.get_weights() [array([[1.], [1.], [1.]], dtype=float32), array([0.], dtype=float32)] Args: - weights: a list of Numpy arrays. The number - of arrays and their shape must match - number of the dimensions of the weights - of the layer (i.e. it should match the - output of `get_weights`). + weights: a list of NumPy arrays. The number + of arrays and their shape must match + number of the dimensions of the weights + of the layer (i.e. it should match the + output of `get_weights`). Raises: - ValueError: If the provided weights list does not match the - layer's specifications. + ValueError: If the provided weights list does not match the + layer's specifications. """ params = self.weights @@ -1763,39 +1763,39 @@ def set_weights(self, weights): backend.batch_set_value(weight_value_tuples) def get_weights(self): - """Returns the current weights of the layer. + """Returns the current weights of the layer, as NumPy arrays. The weights of a layer represent the state of the layer. This function returns both trainable and non-trainable weight values associated with this - layer as a list of Numpy arrays, which can in turn be used to load state + layer as a list of NumPy arrays, which can in turn be used to load state into similarly parameterized layers. - For example, a Dense layer returns a list of two values-- per-output - weights and the bias value. These can be used to set the weights of another - Dense layer: + For example, a `Dense` layer returns a list of two values: the kernel matrix + and the bias vector. These can be used to set the weights of another + `Dense` layer: - >>> a = tf.keras.layers.Dense(1, + >>> layer_a = tf.keras.layers.Dense(1, ... kernel_initializer=tf.constant_initializer(1.)) - >>> a_out = a(tf.convert_to_tensor([[1., 2., 3.]])) - >>> a.get_weights() + >>> a_out = layer_a(tf.convert_to_tensor([[1., 2., 3.]])) + >>> layer_a.get_weights() [array([[1.], [1.], [1.]], dtype=float32), array([0.], dtype=float32)] - >>> b = tf.keras.layers.Dense(1, + >>> layer_b = tf.keras.layers.Dense(1, ... kernel_initializer=tf.constant_initializer(2.)) - >>> b_out = b(tf.convert_to_tensor([[10., 20., 30.]])) - >>> b.get_weights() + >>> b_out = layer_b(tf.convert_to_tensor([[10., 20., 30.]])) + >>> layer_b.get_weights() [array([[2.], [2.], [2.]], dtype=float32), array([0.], dtype=float32)] - >>> b.set_weights(a.get_weights()) - >>> b.get_weights() + >>> layer_b.set_weights(layer_a.get_weights()) + >>> layer_b.get_weights() [array([[1.], [1.], [1.]], dtype=float32), array([0.], dtype=float32)] Returns: - Weights values as a list of numpy arrays. + Weights values as a list of NumPy arrays. 
""" weights = self.weights output_weights = [] diff --git a/keras/engine/base_preprocessing_layer.py b/keras/engine/base_preprocessing_layer.py index 9be8b1683d66..9b70f0b04fc6 100644 --- a/keras/engine/base_preprocessing_layer.py +++ b/keras/engine/base_preprocessing_layer.py @@ -43,7 +43,19 @@ @keras_export('keras.layers.experimental.preprocessing.PreprocessingLayer') @six.add_metaclass(abc.ABCMeta) class PreprocessingLayer(Layer): - """Base class for PreprocessingLayers. + """Base class for Preprocessing Layers. + + **Don't use this class directly: it's an abstract base class!** You may + be looking for one of the many built-in + [preprocessing layers](https://keras.io/guides/preprocessing_layers/) + instead. + + Preprocessing layers are layers whose state gets computed before model + training starts. They do not get updated during training. + Most preprocessing layers implement an `adapt()` method for state computation. + + The `PreprocessingLayer` class is the base class you would subclass to + implement your own preprocessing layers. Attributes: stateful: Whether the layer contains state that needs to be adapted via diff --git a/keras/engine/compile_utils.py b/keras/engine/compile_utils.py index 22ace69fa150..0ebd49c7ac2b 100644 --- a/keras/engine/compile_utils.py +++ b/keras/engine/compile_utils.py @@ -251,14 +251,14 @@ def __call__(self, # Ok for a model to have no compiled loss. return tf.zeros(shape=()) - def reset_states(self): + def reset_state(self): """Resets the state of loss metrics.""" if not self._built: return metrics = [self._loss_metric] + tf.nest.flatten(self._per_output_metrics) for metric_obj in metrics: if metric_obj is not None: - metric_obj.reset_states() + metric_obj.reset_state() def _get_loss_object(self, loss): """Returns a `Loss` object. @@ -466,7 +466,7 @@ def update_state(self, y_true, y_pred, sample_weight=None): continue weighted_metric_obj.update_state(y_t, y_p, sample_weight=sw) - def reset_states(self): + def reset_state(self): """Resets the state of all `Metric`s in this container.""" if self._built: metrics = self._metrics_in_order @@ -479,7 +479,7 @@ def reset_states(self): for metric_obj in metrics: if isinstance(metric_obj, metrics_mod.Metric): - metric_obj.reset_states() + metric_obj.reset_state() def _get_metric_objects(self, metrics, y_t, y_p): """Convert user-supplied metrics to `Metric` objects.""" diff --git a/keras/engine/compile_utils_test.py b/keras/engine/compile_utils_test.py index 656057b8fa32..ec5487f2afa4 100644 --- a/keras/engine/compile_utils_test.py +++ b/keras/engine/compile_utils_test.py @@ -42,7 +42,7 @@ def test_single_loss(self): self.assertEqual(loss_metric.name, 'loss') self.assertEqual(loss_metric.result().numpy(), 1.) - loss_container.reset_states() + loss_container.reset_state() self.assertEqual(loss_metric.result().numpy(), 0.) 
def test_loss_list(self): @@ -71,7 +71,7 @@ def test_loss_list(self): self.assertEqual(output_2_metric.name, 'output_2_loss') self.assertEqual(output_2_metric.result().numpy(), 0.5) - loss_container.reset_states() + loss_container.reset_state() self.assertEqual(loss_metric.result().numpy(), 0) self.assertEqual(output_1_metric.result().numpy(), 0) self.assertEqual(output_2_metric.result().numpy(), 0) @@ -108,7 +108,7 @@ def test_loss_dict(self): self.assertEqual(out2_metric.name, 'out2_loss') self.assertEqual(out2_metric.result().numpy(), 0.5) - loss_container.reset_states() + loss_container.reset_state() self.assertEqual(loss_metric.result().numpy(), 0) self.assertEqual(out1_metric.result().numpy(), 0) self.assertEqual(out2_metric.result().numpy(), 0) @@ -405,7 +405,7 @@ def test_single_metric(self): self.assertEqual(metric.name, 'mse') self.assertEqual(metric.result().numpy(), 1.) - metric_container.reset_states() + metric_container.reset_state() self.assertEqual(metric.result().numpy(), 0.) def test_list_of_metrics_one_output(self): @@ -422,7 +422,7 @@ def test_list_of_metrics_one_output(self): self.assertEqual(mae_metric.name, 'mae') self.assertEqual(mae_metric.result().numpy(), 2.) - metric_container.reset_states() + metric_container.reset_state() self.assertEqual(mse_metric.result().numpy(), 0.) self.assertEqual(mae_metric.result().numpy(), 0.) @@ -507,7 +507,7 @@ def test_metric_dict(self): self.assertEqual(weighted_mae_metric.name, 'out2_weighted_mae') self.assertEqual(weighted_mae_metric.result().numpy(), 2.) - metric_container.reset_states() + metric_container.reset_state() self.assertEqual(mse_metric.result().numpy(), 0.) self.assertEqual(weighted_mse_metric.result().numpy(), 0.) self.assertEqual(mae_metric.result().numpy(), 0.) @@ -782,13 +782,13 @@ def __call__(self, y_true, y_pred): self.assertEqual(metric_container.metrics[0].name, 'custom_metric_fn') self.assertEqual(metric_container.metrics[1].name, 'custom_metric_class') - def test_reset_states_existing_metric_before_built(self): + def test_reset_state_existing_metric_before_built(self): metric = metrics_mod.Mean() metric.update_state([2.0, 4.0]) self.assertEqual(metric.result().numpy(), 3.0) metric_container = compile_utils.MetricsContainer(metric) - metric_container.reset_states() + metric_container.reset_state() self.assertEqual(metric.result().numpy(), 0.0) diff --git a/keras/engine/training.py b/keras/engine/training.py index 290a6674bd8c..a900a607a03e 100644 --- a/keras/engine/training.py +++ b/keras/engine/training.py @@ -1704,7 +1704,7 @@ def reset_metrics(self): """ for m in self.metrics: - m.reset_states() + m.reset_state() def train_on_batch(self, x, diff --git a/keras/engine/training_v1.py b/keras/engine/training_v1.py index 390243cd072e..820fbb42871a 100644 --- a/keras/engine/training_v1.py +++ b/keras/engine/training_v1.py @@ -985,7 +985,7 @@ def reset_metrics(self): """Resets the state of metrics.""" metrics = self._get_training_eval_metrics() for m in metrics: - m.reset_states() + m.reset_state() # Reset metrics on all the distributed (cloned) models. 
if self._distribution_strategy: diff --git a/keras/feature_column/dense_features.py b/keras/feature_column/dense_features.py index 2fc1f6564abb..090a14bd44c7 100644 --- a/keras/feature_column/dense_features.py +++ b/keras/feature_column/dense_features.py @@ -21,8 +21,6 @@ import tensorflow.compat.v2 as tf import json - -from tensorflow.python.feature_column import feature_column_v2 as fc from keras import backend from keras.feature_column import base_feature_layer as kfc from keras.saving.saved_model import json_utils @@ -159,7 +157,7 @@ def call(self, features, cols_to_output_tensors=None, training=None): if not isinstance(features, dict): raise ValueError('We expected a dictionary here. Instead we got: ', features) - transformation_cache = fc.FeatureTransformationCache(features) + transformation_cache = tf.__internal__.feature_column.FeatureTransformationCache(features) output_tensors = [] for column in self._feature_columns: with backend.name_scope(column.name): diff --git a/keras/feature_column/sequence_feature_column.py b/keras/feature_column/sequence_feature_column.py index adc8fd18398b..028fc9449d4e 100644 --- a/keras/feature_column/sequence_feature_column.py +++ b/keras/feature_column/sequence_feature_column.py @@ -22,8 +22,6 @@ from __future__ import print_function import tensorflow.compat.v2 as tf - -from tensorflow.python.feature_column import feature_column_v2 as fc from keras import backend from keras.feature_column import base_feature_layer as kfc from tensorflow.python.util.tf_export import keras_export @@ -145,7 +143,7 @@ def call(self, features, training=None): features) if training is None: training = backend.learning_phase() - transformation_cache = fc.FeatureTransformationCache(features) + transformation_cache = tf.__internal__.feature_column.FeatureTransformationCache(features) output_tensors = [] sequence_lengths = [] diff --git a/keras/initializers/__init__.py b/keras/initializers/__init__.py index 08e790097075..24c2bd780845 100644 --- a/keras/initializers/__init__.py +++ b/keras/initializers/__init__.py @@ -150,6 +150,38 @@ def deserialize(config, custom_objects=None): @keras_export('keras.initializers.get') def get(identifier): + """Retrieve a Keras initializer by the identifier. + + The `identifier` may be the string name of a initializers function or class ( + case-sensitively). + + >>> identifier = 'Ones' + >>> tf.keras.initializers.deserialize(identifier) + <...tensorflow.python.keras.initializers.initializers_v2.Ones...> + + You can also specify `config` of the initializer to this function by passing + dict containing `class_name` and `config` as an identifier. Also note that the + `class_name` must map to a `Initializer` class. + + >>> cfg = {'class_name': 'Ones', 'config': {}} + >>> tf.keras.initializers.deserialize(cfg) + <...tensorflow.python.keras.initializers.initializers_v2.Ones...> + + In the case that the `identifier` is a class, this method will return a new + instance of the class by its constructor. + + Args: + identifier: String or dict that contains the initializer name or + configurations. + + Returns: + Initializer instance base on the input identifier. + + Raises: + ValueError: If the input identifier is not a supported type or in a bad + format. 
+ """ + if identifier is None: return None if isinstance(identifier, dict): diff --git a/keras/integration_test/BUILD b/keras/integration_test/BUILD index 076f079a3719..3621844097ad 100644 --- a/keras/integration_test/BUILD +++ b/keras/integration_test/BUILD @@ -131,6 +131,7 @@ tf_py_test( shard_count = 3, tags = [ "no_tfrt", # TODO(b/171765113) + "no_windows", # TODO(b/183102726) "noasan", # TODO(b/156029134) "nomac", # TODO(b/182567880) "nomsan", # TODO(b/156029134) diff --git a/keras/integration_test/central_storage_strategy_test.py b/keras/integration_test/central_storage_strategy_test.py index 5888341c2d3a..a137c83f1ead 100644 --- a/keras/integration_test/central_storage_strategy_test.py +++ b/keras/integration_test/central_storage_strategy_test.py @@ -74,7 +74,7 @@ def step_fn(inputs): num_epochs = 4 num_steps = 7 for _ in range(num_epochs): - accuracy.reset_states() + accuracy.reset_state() for _ in range(num_steps): train_step(distributed_iterator) diff --git a/keras/integration_test/multi_worker_tutorial_test.py b/keras/integration_test/multi_worker_tutorial_test.py index cfffa7333cf4..9789365a34b8 100644 --- a/keras/integration_test/multi_worker_tutorial_test.py +++ b/keras/integration_test/multi_worker_tutorial_test.py @@ -317,7 +317,7 @@ def step_fn(inputs): logging.info('Epoch: %d, accuracy: %f, train_loss: %f.', epoch.numpy(), train_accuracy.result(), train_loss) - train_accuracy.reset_states() + train_accuracy.reset_state() checkpoint_manager.save() if not _is_chief(task_type, task_id): diff --git a/keras/integration_test/tpu_strategy_test.py b/keras/integration_test/tpu_strategy_test.py index edc4cef202c2..e366ed4b00c7 100644 --- a/keras/integration_test/tpu_strategy_test.py +++ b/keras/integration_test/tpu_strategy_test.py @@ -186,7 +186,7 @@ def step_fn(inputs): num_epochs = 4 num_steps = 7 for _ in range(num_epochs): - accuracy.reset_states() + accuracy.reset_state() for _ in range(num_steps): train_step(distributed_iterator) diff --git a/keras/layers/convolutional.py b/keras/layers/convolutional.py index 57b3bb243a0b..9bef7b8d0a09 100644 --- a/keras/layers/convolutional.py +++ b/keras/layers/convolutional.py @@ -2257,11 +2257,22 @@ def call(self, inputs): @keras_export('keras.layers.DepthwiseConv2D') class DepthwiseConv2D(Conv2D): - """Depthwise separable 2D convolution. + """Depthwise 2D convolution. + + Depthwise convolution is a type of convolution in which a single convolutional + filter is apply to each input channel (i.e. in a depthwise way). + You can understand depthwise convolution as being + the first step in a depthwise separable convolution. + + It is implemented via the following steps: + + - Split the input into individual channels. + - Convolve each input with the layer's kernel (called a depthwise kernel). + - Stack the convolved outputs together (along the channels axis). + + Unlike a regular 2D convolution, depthwise convolution does not mix + information across different input channels. - Depthwise Separable convolutions consist of performing - just the first step in a depthwise spatial convolution - (which acts on each input channel separately). The `depth_multiplier` argument controls how many output channels are generated per input channel in the depthwise step. @@ -2329,10 +2340,11 @@ class DepthwiseConv2D(Conv2D): Output shape: 4D tensor with shape: - `[batch_size, filters, new_rows, new_cols]` if data_format='channels_first' - or 4D tensor with shape: - `[batch_size, new_rows, new_cols, filters]` if data_format='channels_last'. 
- `rows` and `cols` values might have changed due to padding. + `[batch_size, channels * depth_multiplier, new_rows, new_cols]` if + data_format='channels_first' or 4D tensor with shape: + `[batch_size, new_rows, new_cols, channels * depth_multiplier]` if + data_format='channels_last'. `rows` and `cols` values might have + changed due to padding. Returns: A tensor of rank 4 representing diff --git a/keras/layers/convolutional_recurrent.py b/keras/layers/convolutional_recurrent.py index a5e2fd53503e..bd162bed9d9e 100644 --- a/keras/layers/convolutional_recurrent.py +++ b/keras/layers/convolutional_recurrent.py @@ -699,10 +699,16 @@ def get_config(self): @keras_export('keras.layers.ConvLSTM2D') class ConvLSTM2D(ConvRNN2D): - """Convolutional LSTM. + """2D Convolutional LSTM layer. - It is similar to an LSTM layer, but the input transformations - and recurrent transformations are both convolutional. + A convolutional LSTM is similar to an LSTM, but the input transformations + and recurrent transformations are both convolutional. This layer is typically + used to process timeseries of images (i.e. video-like data). + + It is known to perform well for weather data forecasting, + using inputs that are timeseries of 2D grids of sensor values. + It isn't usually applied to regular video data, due to its high computational + cost. Args: filters: Integer, the dimensionality of the output space @@ -714,8 +720,8 @@ class ConvLSTM2D(ConvRNN2D): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. @@ -776,7 +782,7 @@ class ConvLSTM2D(ConvRNN2D): the linear transformation of the recurrent state. Call arguments: - inputs: A 5D tensor. + inputs: A 5D float tensor (see input shape description below). mask: Binary tensor of shape `(samples, timesteps)` indicating whether a given timestep should be masked. training: Python boolean indicating whether the layer should behave in @@ -824,6 +830,20 @@ class ConvLSTM2D(ConvRNN2D): - [Shi et al., 2015](http://arxiv.org/abs/1506.04214v1) (the current implementation does not include the feedback loop on the cells output). 
+ + Example: + + ```python + steps = 10 + height = 32 + width = 32 + input_channels = 3 + output_channels = 6 + + inputs = tf.keras.Input(shape=(steps, height, width, input_channels)) + layer = tf.keras.layers.ConvLSTM2D(filters=output_channels, kernel_size=3) + outputs = layer(inputs) + ``` """ def __init__(self, diff --git a/keras/layers/convolutional_recurrent_test.py b/keras/layers/convolutional_recurrent_test.py index 3708250c180b..147b1caccb02 100644 --- a/keras/layers/convolutional_recurrent_test.py +++ b/keras/layers/convolutional_recurrent_test.py @@ -202,6 +202,9 @@ def test_conv_lstm_cloning(self): outputs = clone.predict(test_inputs) self.assertAllClose(reference_outputs, outputs, atol=1e-5) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping the test as OOM occurred with 1 GB budget.') def test_conv_lstm_with_initial_state(self): num_samples = 32 sequence_len = 5 diff --git a/keras/layers/gru_test.py b/keras/layers/gru_test.py index a222aedbeca8..956f641b2798 100644 --- a/keras/layers/gru_test.py +++ b/keras/layers/gru_test.py @@ -46,10 +46,11 @@ def test_return_sequences_GRU(self): 'return_sequences': True}, input_shape=(num_samples, timesteps, embedding_dim)) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Double type is not yet supported in ROCm') @testing_utils.run_v2_only def test_float64_GRU(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Double type is yet not supported in ROCm') num_samples = 2 timesteps = 3 embedding_dim = 4 @@ -132,9 +133,10 @@ def test_reset_after_GRU(self): gru_model.fit(x_train, y_train) gru_model.predict(x_train) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='MIOpen only supports packed input output') def test_with_masking_layer_GRU(self): - if tf.test.is_built_with_rocm(): - self.skipTest('MIOpen only supports packed input output') layer_class = keras.layers.GRU inputs = np.random.random((2, 3, 4)) targets = np.abs(np.random.random((2, 3, 5))) @@ -148,9 +150,10 @@ def test_with_masking_layer_GRU(self): run_eagerly=testing_utils.should_run_eagerly()) model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='MIOpen only supports packed input output') def test_statefulness_GRU(self): - if tf.test.is_built_with_rocm(): - self.skipTest('MIOpen only supports packed input output') num_samples = 2 timesteps = 3 embedding_dim = 4 diff --git a/keras/layers/gru_v2_test.py b/keras/layers/gru_v2_test.py index 9978cd6b3519..e718cea5b73e 100644 --- a/keras/layers/gru_v2_test.py +++ b/keras/layers/gru_v2_test.py @@ -136,12 +136,11 @@ def test_from_config_GRU(self): l2 = layer_class.from_config(l1.get_config()) assert l1.get_config() == l2.get_config() + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') @testing_utils.run_v2_only def test_gru_v2_feature_parity_with_canonical_gru(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') - input_shape = 10 rnn_state_size = 8 timestep = 4 @@ -309,11 +308,10 @@ def build_model(layer_cls): self.assertAllClose(y, y_ref) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') def test_with_masking_layer_GRU(self): - if 
tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') - layer_class = rnn.GRU inputs = np.random.random((2, 3, 4)) targets = np.abs(np.random.random((2, 3, 5))) @@ -325,11 +323,10 @@ def test_with_masking_layer_GRU(self): optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.001)) model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') def test_masking_with_stacking_GRU(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') - inputs = np.random.random((2, 3, 4)) targets = np.abs(np.random.random((2, 3, 5))) targets /= targets.sum(axis=-1, keepdims=True) @@ -353,11 +350,11 @@ def test_return_sequences_GRU(self): 'return_sequences': True}, input_shape=(num_samples, timesteps, embedding_dim)) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Double type is not yet supported in ROCm') @testing_utils.run_v2_only def test_float64_GRU(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Double type is yet not supported in ROCm') - num_samples = 2 timesteps = 3 embedding_dim = 4 @@ -370,11 +367,10 @@ def test_float64_GRU(self): input_shape=(num_samples, timesteps, embedding_dim), input_dtype='float64') + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') def test_return_states_GRU(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') - layer_class = rnn.GRU x = np.random.random((2, 3, 4)) y = np.abs(np.random.random((2, 5))) @@ -454,11 +450,10 @@ def test_regularizers_GRU(self): else: self.assertEqual(len(layer.get_losses_for(x)), 1) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') def test_statefulness_GRU(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') - num_samples = 2 timesteps = 3 embedding_dim = 4 @@ -553,12 +548,11 @@ def test_stateful_GRU_training(self): run_eagerly=testing_utils.should_run_eagerly()) model.fit(x, y, epochs=1, shuffle=False) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') @testing_utils.run_v2_only def test_explicit_device_with_go_backward_and_mask(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') - batch_size = 8 timestep = 7 masksteps = 5 @@ -735,12 +729,11 @@ def test_GRU_runtime(self): model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) self._test_runtime_with_model(model) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') @testing_utils.run_v2_only def test_GRU_runtime_with_mask(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') - # Masking will affect which backend is selected based on whether the mask # is strictly right padded. 
layer = rnn.GRU(self.rnn_state_size, return_runtime=True) diff --git a/keras/layers/lstm_test.py b/keras/layers/lstm_test.py index d6c129d3213f..f6556e0bdfa3 100644 --- a/keras/layers/lstm_test.py +++ b/keras/layers/lstm_test.py @@ -44,10 +44,11 @@ def test_return_sequences_LSTM(self): 'return_sequences': True}, input_shape=(num_samples, timesteps, embedding_dim)) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Double type is yet not supported in ROCm') @testing_utils.run_v2_only def test_float64_LSTM(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Double type is yet not supported in ROCm') num_samples = 2 timesteps = 3 embedding_dim = 4 @@ -141,10 +142,10 @@ def test_constraints_LSTM(self): self.assertEqual(layer.cell.bias.constraint, b_constraint) @parameterized.parameters([True, False]) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input.') def test_with_masking_layer_LSTM(self, unroll): - if tf.test.is_built_with_rocm(): - self.skipTest( - 'Skipping the test as ROCm MIOpen does not support padded input.') layer_class = keras.layers.LSTM inputs = np.random.random((2, 3, 4)) targets = np.abs(np.random.random((2, 3, 5))) @@ -383,10 +384,10 @@ def test_regularizers_LSTM(self): else: self.assertEqual(len(layer.get_losses_for(x)), 1) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input.') def test_statefulness_LSTM(self): - if tf.test.is_built_with_rocm(): - self.skipTest( - 'Skipping the test as ROCm MIOpen does not support padded input.') num_samples = 2 timesteps = 3 embedding_dim = 4 diff --git a/keras/layers/lstm_v2_test.py b/keras/layers/lstm_v2_test.py index cb900a18416e..54b35a52be74 100644 --- a/keras/layers/lstm_v2_test.py +++ b/keras/layers/lstm_v2_test.py @@ -240,10 +240,10 @@ def test_specify_state_with_masking(self): targets = np.random.random((num_samples, units)) model.train_on_batch([inputs] + initial_state, targets) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') def test_return_state(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') num_states = 2 timesteps = 3 embedding_dim = 4 @@ -310,11 +310,11 @@ def test_initial_states_as_other_inputs(self): targets = np.random.random((num_samples, units)) model.train_on_batch([main_inputs] + initial_state, targets) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') @testing_utils.run_v2_only def test_lstm_v2_feature_parity_with_canonical_lstm(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') input_shape = 10 rnn_state_size = 8 timestep = 4 @@ -358,10 +358,10 @@ def test_lstm_v2_feature_parity_with_canonical_lstm(self): self.assertAllClose(y_2, y_4, rtol=1e-5, atol=2e-5) @parameterized.named_parameters(('v0', 0), ('v1', 1), ('v2', 2)) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') def test_implementation_mode_LSTM(self, implementation_mode): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input 
yet.') num_samples = 2 timesteps = 3 embedding_dim = 4 @@ -403,10 +403,10 @@ def test_implementation_mode_LSTM(self, implementation_mode): optimizer=tf.compat.v1.train.GradientDescentOptimizer(0.01)) model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') def test_masking_with_stacking_LSTM(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') inputs = np.random.random((2, 3, 4)) targets = np.abs(np.random.random((2, 3, 5))) targets /= targets.sum(axis=-1, keepdims=True) @@ -578,11 +578,11 @@ def test_return_sequences_LSTM(self): }, input_shape=(num_samples, timesteps, embedding_dim)) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support float64 yet.') @testing_utils.run_v2_only def test_float64_LSTM(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support float64 yet.') num_samples = 2 timesteps = 3 embedding_dim = 4 @@ -618,10 +618,10 @@ def test_regularizers_LSTM(self): else: self.assertEqual(len(layer.get_losses_for(x)), 1) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') def test_statefulness_LSTM(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') num_samples = 2 timesteps = 3 embedding_dim = 4 @@ -753,12 +753,11 @@ def test_bidirectional(self): model.evaluate(x, y) model.predict(x) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') @testing_utils.run_v2_only def test_explicit_device_with_go_backward_and_mask(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') - batch_size = 8 timestep = 7 masksteps = 5 @@ -908,12 +907,11 @@ def test_LSTM_runtime(self): model = keras.models.Model(inputs=inputs, outputs=[outputs, runtime]) self._test_runtime_with_model(model) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') @testing_utils.run_v2_only def test_LSTM_runtime_with_mask(self): - if tf.test.is_built_with_rocm(): - self.skipTest('Skipping the test as ROCm MIOpen does not ' - 'support padded input yet.') - # Masking will affect which backend is selected based on whether the mask # is strictly right padded. layer = rnn.LSTM(self.rnn_state_size, return_runtime=True) diff --git a/keras/layers/merge.py b/keras/layers/merge.py index d2a5179bfa93..39d82af9fe60 100644 --- a/keras/layers/merge.py +++ b/keras/layers/merge.py @@ -803,6 +803,23 @@ def subtract(inputs, **kwargs): def multiply(inputs, **kwargs): """Functional interface to the `Multiply` layer. 
+ Example: + + >>> x1 = np.arange(3.0) + >>> x2 = np.arange(3.0) + >>> tf.keras.layers.multiply([x1, x2]) + + + Usage in a functional model: + + >>> input1 = tf.keras.layers.Input(shape=(16,)) + >>> x1 = tf.keras.layers.Dense(8, activation='relu')(input1) #shape=(None, 8) + >>> input2 = tf.keras.layers.Input(shape=(32,)) + >>> x2 = tf.keras.layers.Dense(8, activation='relu')(input2) #shape=(None, 8) + >>> out = tf.keras.layers.multiply([x1,x2]) #shape=(None, 8) + >>> out = tf.keras.layers.Dense(4)(out) + >>> model = tf.keras.models.Model(inputs=[input1, input2], outputs=out) + Args: inputs: A list of input tensors (at least 2). **kwargs: Standard layer keyword arguments. diff --git a/keras/layers/normalization.py b/keras/layers/normalization.py index 35ecef56ac1c..3948d84a0dcf 100644 --- a/keras/layers/normalization.py +++ b/keras/layers/normalization.py @@ -1261,15 +1261,12 @@ def _broadcast(v): inputs = tf.reshape(inputs, squeezed_shape) - def _set_const_tensor(val, dtype, shape): - return tf.fill(shape, tf.constant(val, dtype=dtype)) - # self.gamma and self.beta have the wrong shape for fused_batch_norm, so # we cannot pass them as the scale and offset parameters. Therefore, we # create two constant tensors in correct shapes for fused_batch_norm and # later construct a separate calculation on the scale and offset. - scale = _set_const_tensor(1.0, self.dtype, [pre_dim]) - offset = _set_const_tensor(0.0, self.dtype, [pre_dim]) + scale = tf.ones([pre_dim], dtype=self.dtype) + offset = tf.zeros([pre_dim], dtype=self.dtype) # Compute layer normalization using the fused_batch_norm function. outputs, _, _ = tf.compat.v1.nn.fused_batch_norm( diff --git a/keras/layers/pooling.py b/keras/layers/pooling.py index 3f3b913c52a2..49bc30685064 100644 --- a/keras/layers/pooling.py +++ b/keras/layers/pooling.py @@ -201,14 +201,81 @@ def __init__(self, pool_size=2, strides=None, class AveragePooling1D(Pooling1D): """Average pooling for temporal data. + Downsamples the input representation by taking the average value over the + window defined by `pool_size`. The window is shifted by `strides`. The + resulting output when using "valid" padding option has a shape of: + `output_shape = (input_shape - pool_size + 1) / strides)` + + The resulting output shape when using the "same" padding option is: + `output_shape = input_shape / strides` + + For example, for strides=1 and padding="valid": + + >>> x = tf.constant([1., 2., 3., 4., 5.]) + >>> x = tf.reshape(x, [1, 5, 1]) + >>> x + + >>> avg_pool_1d = tf.keras.layers.AveragePooling1D(pool_size=2, + ... strides=1, padding='valid') + >>> avg_pool_1d(x) + + + For example, for strides=2 and padding="valid": + + >>> x = tf.constant([1., 2., 3., 4., 5.]) + >>> x = tf.reshape(x, [1, 5, 1]) + >>> x + + >>> avg_pool_1d = tf.keras.layers.AveragePooling1D(pool_size=2, + ... strides=2, padding='valid') + >>> avg_pool_1d(x) + + + For example, for strides=1 and padding="same": + + >>> x = tf.constant([1., 2., 3., 4., 5.]) + >>> x = tf.reshape(x, [1, 5, 1]) + >>> x + + >>> avg_pool_1d = tf.keras.layers.AveragePooling1D(pool_size=2, + ... strides=1, padding='same') + >>> avg_pool_1d(x) + + Args: pool_size: Integer, size of the average pooling windows. strides: Integer, or None. Factor by which to downscale. E.g. 2 will halve the input. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. 
`"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. @@ -464,6 +531,64 @@ def __init__(self, class AveragePooling2D(Pooling2D): """Average pooling operation for spatial data. + Downsamples the input representation by taking the average value over the + window defined by `pool_size` for each dimension along the features axis. + The window is shifted by `strides` in each dimension. The resulting output + when using "valid" padding option has a shape(number of rows or columns) of: + `output_shape = math.floor((input_shape - pool_size) / strides) + 1` + (when input_shape >= pool_size) + + The resulting output shape when using the "same" padding option is: + `output_shape = math.floor((input_shape - 1) / strides) + 1` + + For example, for stride=(1,1) and padding="valid": + + >>> x = tf.constant([[1., 2., 3.], + ... [4., 5., 6.], + ... [7., 8., 9.]]) + >>> x = tf.reshape(x, [1, 3, 3, 1]) + >>> avg_pool_2d = tf.keras.layers.AveragePooling2D(pool_size=(2, 2), + ... strides=(1, 1), padding='valid') + >>> avg_pool_2d(x) + + + For example, for stride=(2,2) and padding="valid": + + >>> x = tf.constant([[1., 2., 3., 4.], + ... [5., 6., 7., 8.], + ... [9., 10., 11., 12.]]) + >>> x = tf.reshape(x, [1, 3, 4, 1]) + >>> avg_pool_2d = tf.keras.layers.AveragePooling2D(pool_size=(2, 2), + ... strides=(2, 2), padding='valid') + >>> avg_pool_2d(x) + + + For example, for stride=(1,1) and padding="same": + + >>> x = tf.constant([[1., 2., 3.], + ... [4., 5., 6.], + ... [7., 8., 9.]]) + >>> x = tf.reshape(x, [1, 3, 3, 1]) + >>> avg_pool_2d = tf.keras.layers.AveragePooling2D(pool_size=(2, 2), + ... strides=(1, 1), padding='same') + >>> avg_pool_2d(x) + + Args: pool_size: integer or tuple of 2 integers, factors by which to downscale (vertical, horizontal). @@ -474,8 +599,8 @@ class AveragePooling2D(Pooling2D): Strides values. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). - `"valid"` means no padding. `"same"` results in padding evenly to - the left/right or up/down of the input such that output has the same + `"valid"` means no padding. `"same"` results in padding evenly to + the left/right or up/down of the input such that output has the same height/width dimension as the input. data_format: A string, one of `channels_last` (default) or `channels_first`. 
diff --git a/keras/layers/preprocessing/benchmarks/bucketized_column_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/bucketized_column_dense_benchmark.py index 221c583c4129..bbd7db6ae7b5 100644 --- a/keras/layers/preprocessing/benchmarks/bucketized_column_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/bucketized_column_dense_benchmark.py @@ -20,7 +20,6 @@ import keras from tensorflow.python.eager.def_function import function as tf_function -from tensorflow.python.feature_column import feature_column_v2 as fcv2 from keras.layers.preprocessing import discretization from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm @@ -52,7 +51,7 @@ def embedding_varlen(batch_size, max_length): # Wrap the FC implementation in a tf.function for a fair comparison @tf_function() def fc_fn(tensors): - fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None) + fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) # Benchmark runs keras_data = {"data": data.to_tensor(default_value=0.0)} diff --git a/keras/layers/preprocessing/benchmarks/category_cross_hash_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/category_cross_hash_dense_benchmark.py index e35c38bd78e5..4a2191bda0b8 100644 --- a/keras/layers/preprocessing/benchmarks/category_cross_hash_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_cross_hash_dense_benchmark.py @@ -18,7 +18,6 @@ import keras from tensorflow.python.eager.def_function import function as tf_function -from tensorflow.python.feature_column import feature_column_v2 as fcv2 from keras.layers.preprocessing import category_crossing from keras.layers.preprocessing import hashing from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm @@ -54,7 +53,7 @@ def embedding_varlen(batch_size, max_length): # Wrap the FC implementation in a tf.function for a fair comparison @tf_function() def fc_fn(tensors): - fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None) + fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) # Benchmark runs keras_data = { diff --git a/keras/layers/preprocessing/benchmarks/category_hash_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/category_hash_dense_benchmark.py index b2ff5655833f..79eb4090e881 100644 --- a/keras/layers/preprocessing/benchmarks/category_hash_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_hash_dense_benchmark.py @@ -18,7 +18,6 @@ import keras from tensorflow.python.eager.def_function import function as tf_function -from tensorflow.python.feature_column import feature_column_v2 as fcv2 from keras.layers.preprocessing import hashing from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm @@ -49,7 +48,7 @@ def embedding_varlen(batch_size, max_length): # Wrap the FC implementation in a tf.function for a fair comparison @tf_function() def fc_fn(tensors): - fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None) + fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) # Benchmark runs keras_data = { diff --git a/keras/layers/preprocessing/benchmarks/category_hash_varlen_benchmark.py b/keras/layers/preprocessing/benchmarks/category_hash_varlen_benchmark.py index 6ffe8ef0b7e2..e6faff30a970 100644 --- a/keras/layers/preprocessing/benchmarks/category_hash_varlen_benchmark.py +++ 
b/keras/layers/preprocessing/benchmarks/category_hash_varlen_benchmark.py @@ -18,7 +18,6 @@ import keras from tensorflow.python.eager.def_function import function as tf_function -from tensorflow.python.feature_column import feature_column_v2 as fcv2 from keras.layers.preprocessing import hashing from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm @@ -51,7 +50,7 @@ def embedding_varlen(batch_size, max_length): # Wrap the FC implementation in a tf.function for a fair comparison @tf_function() def fc_fn(tensors): - fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None) + fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) # Benchmark runs keras_data = {"data": data} diff --git a/keras/layers/preprocessing/benchmarks/category_vocab_list_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/category_vocab_list_dense_benchmark.py index 7d96e8440ea0..afb8f9367ed1 100644 --- a/keras/layers/preprocessing/benchmarks/category_vocab_list_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_vocab_list_dense_benchmark.py @@ -18,7 +18,6 @@ import keras from tensorflow.python.eager.def_function import function as tf_function -from tensorflow.python.feature_column import feature_column_v2 as fcv2 from keras.layers.preprocessing import string_lookup from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm @@ -48,7 +47,7 @@ def embedding_varlen(batch_size, max_length): # Wrap the FC implementation in a tf.function for a fair comparison @tf_function() def fc_fn(tensors): - fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None) + fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) # Benchmark runs keras_data = { diff --git a/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_dense_benchmark.py index 9db08fab8862..a6a4e8de7eb4 100644 --- a/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_dense_benchmark.py @@ -18,7 +18,6 @@ import keras from tensorflow.python.eager.def_function import function as tf_function -from tensorflow.python.feature_column import feature_column_v2 as fcv2 from keras.layers.preprocessing import category_encoding from keras.layers.preprocessing import string_lookup from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm @@ -54,7 +53,7 @@ def embedding_varlen(batch_size, max_length): # Wrap the FC implementation in a tf.function for a fair comparison @tf_function() def fc_fn(tensors): - fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None) + fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) # Benchmark runs keras_data = { diff --git a/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_varlen_benchmark.py b/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_varlen_benchmark.py index 7e229de2af07..c52a00094ad0 100644 --- a/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_varlen_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_vocab_list_indicator_varlen_benchmark.py @@ -18,7 +18,6 @@ import keras from tensorflow.python.eager.def_function import function as tf_function -from tensorflow.python.feature_column import feature_column_v2 
as fcv2 from keras.layers.preprocessing import category_encoding from keras.layers.preprocessing import string_lookup from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm @@ -56,7 +55,7 @@ def embedding_varlen(batch_size, max_length): # Wrap the FC implementation in a tf.function for a fair comparison @tf_function() def fc_fn(tensors): - fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None) + fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) # Benchmark runs keras_data = {"data": data} diff --git a/keras/layers/preprocessing/benchmarks/category_vocab_list_varlen_benchmark.py b/keras/layers/preprocessing/benchmarks/category_vocab_list_varlen_benchmark.py index 9a0138601320..a1b60ae0ca61 100644 --- a/keras/layers/preprocessing/benchmarks/category_vocab_list_varlen_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/category_vocab_list_varlen_benchmark.py @@ -18,7 +18,6 @@ import keras from tensorflow.python.eager.def_function import function as tf_function -from tensorflow.python.feature_column import feature_column_v2 as fcv2 from keras.layers.preprocessing import string_lookup from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm @@ -50,7 +49,7 @@ def embedding_varlen(batch_size, max_length): # Wrap the FC implementation in a tf.function for a fair comparison @tf_function() def fc_fn(tensors): - fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None) + fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) # Benchmark runs keras_data = {"data": data} diff --git a/keras/layers/preprocessing/benchmarks/embedding_dense_benchmark.py b/keras/layers/preprocessing/benchmarks/embedding_dense_benchmark.py index 9676694c999b..b3ff77994125 100644 --- a/keras/layers/preprocessing/benchmarks/embedding_dense_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/embedding_dense_benchmark.py @@ -18,7 +18,6 @@ import keras from tensorflow.python.eager.def_function import function as tf_function -from tensorflow.python.feature_column import feature_column_v2 as fcv2 from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm # This is required as of 3/2021 because otherwise we drop into graph mode. @@ -51,7 +50,7 @@ def embedding_varlen(batch_size, max_length): # Wrap the FC implementation in a tf.function for a fair comparison @tf_function() def fc_fn(tensors): - fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None) + fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) # Benchmark runs keras_data = {"data": data.to_tensor(default_value=0)} diff --git a/keras/layers/preprocessing/benchmarks/embedding_varlen_benchmark.py b/keras/layers/preprocessing/benchmarks/embedding_varlen_benchmark.py index 7d5ca189c5af..285adbe27e7b 100644 --- a/keras/layers/preprocessing/benchmarks/embedding_varlen_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/embedding_varlen_benchmark.py @@ -18,7 +18,6 @@ import keras from tensorflow.python.eager.def_function import function as tf_function -from tensorflow.python.feature_column import feature_column_v2 as fcv2 from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm # This is required as of 3/2021 because otherwise we drop into graph mode. 
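The same one-line change recurs in each benchmark above: the private `feature_column_v2` import goes away and the transformation cache is built through the `tf.__internal__` namespace instead. A minimal sketch of the before/after call, with a made-up `features` dict purely for illustration:

```python
import tensorflow.compat.v2 as tf

features = {"data": tf.constant([[1.0], [2.0]])}

# Before (removed in this diff): private TensorFlow module.
#   from tensorflow.python.feature_column import feature_column_v2 as fcv2
#   cache = fcv2.FeatureTransformationCache(features)

# After: the publicly exposed internal namespace used throughout this diff.
cache = tf.__internal__.feature_column.FeatureTransformationCache(features)
```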
@@ -52,7 +51,7 @@ def embedding_varlen(batch_size, max_length): # Wrap the FC implementation in a tf.function for a fair comparison @tf_function() def fc_fn(tensors): - fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None) + fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) # Benchmark runs keras_data = {"data": data} diff --git a/keras/layers/preprocessing/benchmarks/weighted_embedding_varlen_benchmark.py b/keras/layers/preprocessing/benchmarks/weighted_embedding_varlen_benchmark.py index e9c3f1f2cfaf..9e1257be1526 100644 --- a/keras/layers/preprocessing/benchmarks/weighted_embedding_varlen_benchmark.py +++ b/keras/layers/preprocessing/benchmarks/weighted_embedding_varlen_benchmark.py @@ -18,7 +18,6 @@ import keras from tensorflow.python.eager.def_function import function as tf_function -from tensorflow.python.feature_column import feature_column_v2 as fcv2 from keras.layers.preprocessing.benchmarks import feature_column_benchmark as fc_bm # This is required as of 3/2021 because otherwise we drop into graph mode. @@ -59,7 +58,7 @@ def embedding_varlen(batch_size, max_length): # Wrap the FC implementation in a tf.function for a fair comparison @tf_function() def fc_fn(tensors): - fc.transform_feature(fcv2.FeatureTransformationCache(tensors), None) + fc.transform_feature(tf.__internal__.feature_column.FeatureTransformationCache(tensors), None) # Benchmark runs keras_data = {"data": data, "weight": weight} diff --git a/keras/layers/preprocessing/discretization.py b/keras/layers/preprocessing/discretization.py index 4330d68d72cf..8e623bbca949 100644 --- a/keras/layers/preprocessing/discretization.py +++ b/keras/layers/preprocessing/discretization.py @@ -238,44 +238,39 @@ def compute_output_signature(self, input_spec): return tf.TensorSpec(shape=output_shape, dtype=output_dtype) def call(self, inputs): - def _bucketize_op(bins): - bins = [tf.cast(bins, tf.float32)] - return lambda inputs: tf.raw_ops.BoostedTreesBucketize( # pylint: disable=g-long-lambda + bins = [tf.cast(tf.compat.v1.squeeze(self.bins), tf.float32)] + + def _bucketize_fn(inputs): + return tf.raw_ops.BoostedTreesBucketize( float_values=[tf.cast(inputs, tf.float32)], bucket_boundaries=bins)[0] if tf_utils.is_ragged(inputs): integer_buckets = tf.ragged.map_flat_values( - _bucketize_op(tf.compat.v1.squeeze(self.bins)), - inputs) + _bucketize_fn, inputs) # Ragged map_flat_values doesn't touch the non-values tensors in the # ragged composite tensor. If this op is the only op a Keras model, # this can cause errors in Graph mode, so wrap the tensor in an identity. 
return tf.identity(integer_buckets) elif isinstance(inputs, tf.SparseTensor): - integer_buckets = tf.raw_ops.BoostedTreesBucketize( - float_values=[tf.cast(inputs.values, tf.float32)], - bucket_boundaries=[tf.cast(tf.compat.v1.squeeze(self.bins), - tf.float32)])[0] return tf.SparseTensor( indices=tf.identity(inputs.indices), - values=integer_buckets, + values=_bucketize_fn(inputs.values), dense_shape=tf.identity(inputs.dense_shape)) else: - input_shape = inputs.get_shape() - if any(dim is None for dim in input_shape.as_list()[1:]): + static_shape = inputs.get_shape() + if any(dim is None for dim in static_shape.as_list()[1:]): raise NotImplementedError( "Discretization Layer requires known non-batch shape," - "found {}".format(input_shape)) - - reshaped = tf.reshape( - inputs, - [-1, tf.raw_ops.Prod(input=input_shape.as_list()[1:], axis=0)]) + "found {}".format(static_shape)) + dynamic_shape = tf.shape(inputs) + # BoostedTreesBucketize only handles rank 1 inputs. We need to flatten our + # inputs after batch size and vectorized_map over each sample. + reshaped = tf.reshape(inputs, [dynamic_shape[0], -1]) return tf.reshape( - tf.vectorized_map( - _bucketize_op(tf.compat.v1.squeeze(self.bins)), reshaped), - tf.constant([-1] + input_shape.as_list()[1:])) + tf.vectorized_map(_bucketize_fn, reshaped), + dynamic_shape) class DiscretizingCombiner(Combiner): """Combiner for the Discretization preprocessing layer. diff --git a/keras/layers/preprocessing/discretization_distribution_test.py b/keras/layers/preprocessing/discretization_distribution_test.py index b2d8f98299fe..ab71473c6e7a 100644 --- a/keras/layers/preprocessing/discretization_distribution_test.py +++ b/keras/layers/preprocessing/discretization_distribution_test.py @@ -31,7 +31,7 @@ @tf.__internal__.distribute.combinations.generate( tf.__internal__.test.combinations.combine( - distribution=strategy_combinations.strategies_minus_tpu, + distribution=strategy_combinations.all_strategies, mode=["eager", "graph"])) class DiscretizationDistributionTest( keras_parameterized.TestCase, diff --git a/keras/layers/preprocessing/discretization_test.py b/keras/layers/preprocessing/discretization_test.py index fc5764a53539..4d53e5f2e5be 100644 --- a/keras/layers/preprocessing/discretization_test.py +++ b/keras/layers/preprocessing/discretization_test.py @@ -124,6 +124,12 @@ def test_bucketize_with_explicit_buckets_sparse_int_input(self): self.assertAllEqual(indices, output_dataset.indices) self.assertAllEqual(expected_output, output_dataset.values) + def test_output_shape(self): + input_data = keras.Input(batch_size=16, shape=(4,), dtype=tf.string) + layer = discretization.Discretization(bin_boundaries=[-.5, 0.5, 1.5]) + output = layer(input_data) + self.assertAllEqual(output.shape.as_list(), [16, 4]) + def test_num_bins_negative_fails(self): with self.assertRaisesRegex(ValueError, "`num_bins` must be.*num_bins=-7"): _ = discretization.Discretization(num_bins=-7) diff --git a/keras/layers/preprocessing/integer_lookup.py b/keras/layers/preprocessing/integer_lookup.py index 18cf9a8e0ed1..9d1b1f0495f5 100644 --- a/keras/layers/preprocessing/integer_lookup.py +++ b/keras/layers/preprocessing/integer_lookup.py @@ -22,76 +22,80 @@ from keras.engine import base_preprocessing_layer from keras.layers.preprocessing import index_lookup from keras.layers.preprocessing import table_utils +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util.tf_export import keras_export 
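A quick illustration of what the rewritten `call()` buys: because the bucketized values are reshaped back through the dynamic shape, the layer now keeps the static non-batch shape of its input, which is what the new `test_output_shape` asserts. A minimal sketch, using the same internal import as the tests (the input values here are arbitrary):

import tensorflow as tf
from keras.layers.preprocessing import discretization

# With explicit bin boundaries, the layer maps each value to an integer
# bucket index and preserves the (batch, features) shape of the input.
layer = discretization.Discretization(bin_boundaries=[-0.5, 0.5, 1.5])
x = tf.constant([[-1.0, 0.0, 1.0, 2.0],
                 [0.3, -0.7, 1.2, 1.8]])
buckets = layer(x)
print(buckets.shape)  # (2, 4), same as the input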
@keras_export("keras.layers.experimental.preprocessing.IntegerLookup", v1=[]) class IntegerLookup(index_lookup.IndexLookup): - """Maps integers from a vocabulary to integer indices. + """Reindex integer inputs to be in a contiguous range, via a dict lookup. - This layer translates a set of arbitrary integers into an integer output via a - table-based vocabulary lookup. + This layer maps a set of arbitrary integer input tokens into indexed + integer output via a table-based vocabulary lookup. The layer's output indices + will be contiguously arranged up to the maximum vocab size, even if the input + tokens are non-continguous or unbounded. The layer supports multiple options + for encoding the output via `output_mode`, and has optional support for + out-of-vocabulary (OOV) tokens and masking. The vocabulary for the layer can be supplied on construction or learned via `adapt()`. During `adapt()`, the layer will analyze a data set, determine the - frequency of individual integer values, and create a vocabulary from them. If - the vocabulary is capped in size, the most frequent values will be used to - create the vocabulary and all others will be treated as out-of-vocabulary - (OOV). + frequency of individual integer tokens, and create a vocabulary from them. If + the vocabulary is capped in size, the most frequent tokens will be used to + create the vocabulary and all others will be treated as OOV. There are two possible output modes for the layer. - When `output_mode` is "int", input values are converted to their index in the - vocabulary (an integer). - When `output_mode` is "binary", "count", or "tf-idf", input strings + When `output_mode` is "int", + input integers are converted to their index in the vocabulary (an integer). + When `output_mode` is "binary", "count", or "tf-idf", input integers are encoded into an array where each dimension corresponds to an element in the vocabulary. - The vocabulary can optionally contain a mask value as well as an OOV value + The vocabulary can optionally contain a mask token as well as an OOV token (which can optionally occupy multiple indices in the vocabulary, as set by `num_oov_indices`). - The position of these values in the vocabulary is fixed. When `output_mode` is - "int", the vocabulary will begin with the mask value at index 0, followed by + The position of these tokens in the vocabulary is fixed. When `output_mode` is + "int", the vocabulary will begin with the mask token at index 0, followed by OOV indices, followed by the rest of the vocabulary. When `output_mode` is "binary", "count", or "tf-idf" the vocabulary will begin with OOV indices and - instances of the mask value will be dropped. + instances of the mask token will be dropped. Args: - max_values: The maximum size of the vocabulary for this layer. If None, + max_tokens: The maximum size of the vocabulary for this layer. If None, there is no cap on the size of the vocabulary. Note that this size - includes the OOV and mask values. Default to None. - num_oov_indices: The number of out-of-vocabulary values to use. If this + includes the OOV and mask tokens. Default to None. + num_oov_indices: The number of out-of-vocabulary tokens to use. If this value is more than 1, OOV inputs are modulated to determine their OOV value. If this value is 0, OOV inputs will map to -1 when `output_mode` is "int" and are dropped otherwise. Defaults to 1. - mask_value: A value that represents masked inputs. When `output_mode` is - "int", the value is included in vocabulary and mapped to index 0. 
In other - output modes, the value will not appear in the vocabulary and instances - of the mask value in the input will be dropped. If set to None, no mask - term will be added. Defaults to 0. - oov_value: Only used when `invert` is True. The value to return for OOV + mask_token: An integer token that represents masked inputs. When + `output_mode` is "int", the token is included in vocabulary and mapped to + index 0. In other output modes, the token will not appear in the + vocabulary and instances of the mask token in the input will be dropped. + If set to None, no mask term will be added. Defaults to 0. + oov_token: Only used when `invert` is True. The token to return for OOV indices. Defaults to -1. - vocabulary: An optional list of values, or a path to a text file containing - a vocabulary to load into this layer. The file should contain one value - per line. If the list or file contains the same value multiple times, an - error will be thrown. + vocabulary: An optional list of integer tokens, or a path to a text file + containing a vocabulary to load into this layer. The file should contain + one integer token per line. If the list or file contains the same token + multiple times, an error will be thrown. invert: Only valid when `output_mode` is "int". If True, this layer will map indices to vocabulary items instead of mapping vocabulary items to indices. Default to False. output_mode: Specification for the output of the layer. Defaults to "int". Values can be "int", "binary", "count", or "tf-idf" configuring the layer as follows: - "int": Return the raw integer indices of the input values. - "binary": Outputs a single int array per sample, of either vocab_size or - max_values size, containing 1s in all elements where the value mapped - to that index exists at least once in the sample. + "int": Return the vocabulary indices of the input tokens. + "binary": Outputs a single int array per sample, of either vocabulary + size or `max_tokens` size, containing 1s in all elements where the + token mapped to that index exists at least once in the sample. "count": Like "binary", but the int array contains a count of the number - of times the value at that index appeared in the sample. + of times the token at that index appeared in the sample. "tf-idf": As "binary", but the TF-IDF algorithm is applied to find the - value in each value slot. - pad_to_max_values: Only applicable when `output_mode` is "binary", "count", + value in each token slot. + pad_to_max_tokens: Only applicable when `output_mode` is "binary", "count", or "tf-idf". If True, the output will have its feature axis padded to - `max_values` even if the number of unique values in the vocabulary is less - than max_values, resulting in a tensor of shape [batch_size, max_values] + `max_tokens` even if the number of unique tokens in the vocabulary is less + than max_tokens, resulting in a tensor of shape [batch_size, max_tokens] regardless of vocabulary size. Defaults to False. sparse: Boolean. Only applicable when `output_mode` is "binary", "count", or "tf-idf". If True, returns a `SparseTensor` instead of a dense @@ -104,7 +108,7 @@ class IntegerLookup(index_lookup.IndexLookup): This example creates a lookup layer with a pre-existing vocabulary. 
>>> vocab = [12, 36, 1138, 42] - >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) # Note OOV values + >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) # Note OOV tokens >>> layer = IntegerLookup(vocabulary=vocab) >>> layer(data) >> layer.get_vocabulary() [0, -1, 42, 1138, 1000, 36, 12] - Note how the mask value 0 and the OOV value -1 have been added to the - vocabulary. The remaining values are sorted by frequency (1138, which has + Note how the mask token 0 and the OOV token -1 have been added to the + vocabulary. The remaining tokens are sorted by frequency (1138, which has 2 occurrences, is first) then by inverse sort order. >>> data = tf.constant([[12, 1138, 42], [42, 1000, 36]]) @@ -138,8 +142,8 @@ class IntegerLookup(index_lookup.IndexLookup): **Lookups with multiple OOV indices** This example demonstrates how to use a lookup layer with multiple OOV indices. - When a layer is created with more than one OOV index, any OOV values are - hashed into the number of OOV buckets, distributing OOV values in a + When a layer is created with more than one OOV index, any OOV tokens are + hashed into the number of OOV buckets, distributing OOV tokens in a deterministic fashion across the set. >>> vocab = [12, 36, 1138, 42] @@ -150,31 +154,31 @@ class IntegerLookup(index_lookup.IndexLookup): array([[3, 5, 6], [2, 1, 4]])> - Note that the output for OOV value 37 is 2, while the output for OOV value + Note that the output for OOV token 37 is 2, while the output for OOV token 1000 is 1. The in-vocab terms have their output index increased by 1 from earlier examples (12 maps to 3, etc) in order to make space for the extra OOV - value. + token. **Multi-hot output** Configure the layer with `output_mode='binary'`. Note that the first - `num_oov_indices` dimensions in the binary encoding represent OOV values + `num_oov_indices` dimensions in the binary encoding represent OOV tokens >>> vocab = [12, 36, 1138, 42] - >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV values + >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens >>> layer = IntegerLookup(vocabulary=vocab, output_mode='binary') >>> layer(data) - **Value count output** + **Token count output** Configure the layer with `output_mode='count'`. As with binary output, the - first `num_oov_indices` dimensions in the output represent OOV values. + first `num_oov_indices` dimensions in the output represent OOV tokens. >>> vocab = [12, 36, 1138, 42] - >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV values + >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens >>> layer = IntegerLookup(vocabulary=vocab, output_mode='count') >>> layer(data) >> vocab = [12, 36, 1138, 42] >>> idf_weights = [0.25, 0.75, 0.6, 0.4] - >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV values + >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens >>> layer = IntegerLookup(output_mode='tf-idf') >>> layer.set_vocabulary(vocab, idf_weights=idf_weights) >>> layer(data) @@ -201,12 +205,12 @@ class IntegerLookup(index_lookup.IndexLookup): array([[0. , 0.25, 0. , 0.6 , 0.8 ], [1.0 , 0. , 0.75, 0. , 0.4 ]], dtype=float32)> - To specify the idf weights for oov values, you will need to pass the entire - vocabularly including the leading oov value. + To specify the idf weights for oov tokens, you will need to pass the entire + vocabularly including the leading oov token. 
>>> vocab = [-1, 12, 36, 1138, 42] >>> idf_weights = [0.9, 0.25, 0.75, 0.6, 0.4] - >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV values + >>> data = tf.constant([[12, 1138, 42, 42], [42, 7, 36, 7]]) # Note OOV tokens >>> layer = IntegerLookup(output_mode='tf-idf') >>> layer.set_vocabulary(vocab, idf_weights=idf_weights) >>> layer(data) @@ -215,12 +219,12 @@ class IntegerLookup(index_lookup.IndexLookup): [1.8 , 0. , 0.75, 0. , 0.4 ]], dtype=float32)> When adapting the layer in tf-idf mode, each input sample will be considered a - document, and idf weight per value will be calculated as - `log(1 + num_documents / (1 + value_document_count))`. + document, and idf weight per token will be calculated as + `log(1 + num_documents / (1 + token_document_count))`. **Inverse lookup** - This example demonstrates how to map indices to values using this layer. (You + This example demonstrates how to map indices to tokens using this layer. (You can also use adapt() with inverse=True, but for simplicity we'll pass the vocab in this example.) @@ -232,8 +236,8 @@ class IntegerLookup(index_lookup.IndexLookup): array([[ 12, 1138, 42], [ 42, -1, 36]])> - Note that the first two indices correspond to the mask and oov value by - default. This behavior can be disabled by setting `mask_value=None` and + Note that the first two indices correspond to the mask and oov token by + default. This behavior can be disabled by setting `mask_token=None` and `num_oov_indices=0`. @@ -252,26 +256,40 @@ class IntegerLookup(index_lookup.IndexLookup): array([[ 12, 1138, 42], [ 42, -1, 36]])> - In this example, the input value 1000 resulted in an output of -1, since + In this example, the input token 1000 resulted in an output of -1, since 1000 was not in the vocabulary - it got represented as an OOV, and all OOV - values are returned as -1 in the inverse layer. Also, note that for the + tokens are returned as -1 in the inverse layer. Also, note that for the inverse to work, you must have already set the forward layer vocabulary either directly or via `fit()` before calling `get_vocabulary()`. """ def __init__(self, - max_values=None, + max_tokens=None, num_oov_indices=1, - mask_value=0, - oov_value=-1, + mask_token=0, + oov_token=-1, vocabulary=None, invert=False, output_mode=index_lookup.INT, sparse=False, - pad_to_max_values=False, + pad_to_max_tokens=False, **kwargs): allowed_dtypes = [tf.int64] + # Support deprecated args for this layer. + if "max_values" in kwargs: + logging.warning("max_values is deprecated, use max_tokens instead.") + max_tokens = kwargs["max_values"] + del kwargs["max_values"] + if "mask_value" in kwargs: + logging.warning("mask_value is deprecated, use mask_token instead.") + mask_token = kwargs["mask_value"] + del kwargs["mask_value"] + if "oov_value" in kwargs: + logging.warning("oov_value is deprecated, use oov_token instead.") + oov_token = kwargs["oov_value"] + del kwargs["oov_value"] + if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: raise ValueError("The value of the dtype argument for IntegerLookup may " "only be one of %s." % (allowed_dtypes,)) @@ -279,11 +297,11 @@ def __init__(self, if "dtype" not in kwargs: kwargs["dtype"] = tf.int64 - # If max_values is set, the value must be greater than 1 - otherwise we + # If max_tokens is set, the token must be greater than 1 - otherwise we # are creating a 0-element vocab, which doesn't make sense. - if max_values is not None and max_values <= 1: - raise ValueError("If set, max_values must be greater than 1. 
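In practice, the shim above means existing code that still passes the old value-based keyword arguments keeps working but logs a deprecation warning, while new code should use the token-based names. A small sketch of both spellings, reusing the toy vocabulary from the docstring:

import tensorflow as tf
from keras.layers.preprocessing import integer_lookup

vocab = [12, 36, 1138, 42]

# Preferred: the new token-based argument names.
layer = integer_lookup.IntegerLookup(
    max_tokens=None, mask_token=0, oov_token=-1, vocabulary=vocab)

# Still accepted: the old value-based names are remapped to the new ones
# and a deprecation warning is logged.
legacy_layer = integer_lookup.IntegerLookup(
    max_values=None, mask_value=0, oov_value=-1, vocabulary=vocab)

print(layer(tf.constant([[12, 1138, 42], [42, 1000, 36]])))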
" - "You passed %s" % (max_values,)) + if max_tokens is not None and max_tokens <= 1: + raise ValueError("If set, max_tokens must be greater than 1. " + "You passed %s" % (max_tokens,)) if num_oov_indices < 0: raise ValueError( @@ -296,36 +314,18 @@ def __init__(self, vocabulary = [int(v) for v in vocabulary] super(IntegerLookup, self).__init__( - max_tokens=max_values, + max_tokens=max_tokens, num_oov_indices=num_oov_indices, - mask_token=mask_value, - oov_token=oov_value, + mask_token=mask_token, + oov_token=oov_token, vocabulary=vocabulary, invert=invert, output_mode=output_mode, sparse=sparse, - pad_to_max_tokens=pad_to_max_values, + pad_to_max_tokens=pad_to_max_tokens, **kwargs) base_preprocessing_layer.keras_kpl_gauge.get_cell("IntegerLookup").set(True) - def get_config(self): - base_config = super(IntegerLookup, self).get_config() - # Because the super config has a bunch of args we're also passing, - # we need to rename and remove them from the config dict. - base_config["max_values"] = base_config["max_tokens"] - del base_config["max_tokens"] - - base_config["mask_value"] = base_config["mask_token"] - del base_config["mask_token"] - - base_config["oov_value"] = base_config["oov_token"] - del base_config["oov_token"] - - base_config["pad_to_max_values"] = base_config["pad_to_max_tokens"] - del base_config["pad_to_max_tokens"] - - return base_config - def set_vocabulary(self, vocabulary, idf_weights=None): if isinstance(vocabulary, str): if self.output_mode == index_lookup.TFIDF: diff --git a/keras/layers/preprocessing/integer_lookup_test.py b/keras/layers/preprocessing/integer_lookup_test.py index a79d625e784b..f92f774a788a 100644 --- a/keras/layers/preprocessing/integer_lookup_test.py +++ b/keras/layers/preprocessing/integer_lookup_test.py @@ -52,7 +52,7 @@ def _get_end_to_end_test_cases(): np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]], dtype=np.int64), "kwargs": { - "max_values": None, + "max_tokens": None, "dtype": tf.int64, }, "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], @@ -133,7 +133,7 @@ def test_sparse_int_input(self): expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) - layer = integer_lookup.IntegerLookup(max_values=None) + layer = integer_lookup.IntegerLookup(max_tokens=None) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -149,7 +149,7 @@ def test_ragged_int_input(self): expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True) - layer = integer_lookup.IntegerLookup(max_values=None) + layer = integer_lookup.IntegerLookup(max_tokens=None) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -175,11 +175,11 @@ def test_sparse_int_input_multi_bucket(self): input_data = keras.Input(shape=(None,), dtype=tf.int64, sparse=True) layer = integer_lookup.IntegerLookup( - max_values=None, + max_tokens=None, dtype=tf.int64, num_oov_indices=2, - mask_value=0, - oov_value=-1) + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -195,7 +195,7 @@ def test_ragged_int_input_multi_bucket(self): expected_output = [[3, 4, 6], [6, 5, 3, 2]] input_data = keras.Input(shape=(None,), dtype=tf.int64, ragged=True) - layer = integer_lookup.IntegerLookup(max_values=None, num_oov_indices=2) + layer = 
integer_lookup.IntegerLookup(max_tokens=None, num_oov_indices=2) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -240,7 +240,7 @@ def word_gen(): batched_ds = ds.take(2) input_t = keras.Input(shape=(), dtype=tf.int64) layer = integer_lookup.IntegerLookup( - max_values=10, num_oov_indices=0, mask_value=None, oov_value=None) + max_tokens=10, num_oov_indices=0, mask_token=None, oov_token=None) _ = layer(input_t) layer.adapt(batched_ds) @@ -264,7 +264,7 @@ def test_int_output(self): def test_output_shape(self): input_data = keras.Input(shape=(4,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup(max_values=2, num_oov_indices=1) + layer = integer_lookup.IntegerLookup(max_tokens=2, num_oov_indices=1) int_data = layer(input_data) self.assertAllEqual(int_data.shape[1:], input_data.shape[1:]) @@ -274,7 +274,7 @@ def test_int_output_no_reserved_zero(self): expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup(max_values=None, mask_value=None) + layer = integer_lookup.IntegerLookup(max_tokens=None, mask_token=None) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -289,7 +289,7 @@ def test_int_output_explicit_vocab(self): input_data = keras.Input(shape=(None,), dtype=tf.int64) layer = integer_lookup.IntegerLookup( vocabulary=vocab_data, - max_values=None, + max_tokens=None, ) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -304,7 +304,7 @@ def test_int_output_explicit_vocab_with_special_tokens(self): input_data = keras.Input(shape=(None,), dtype=tf.int64) layer = integer_lookup.IntegerLookup( vocabulary=vocab_data, - max_values=None, + max_tokens=None, ) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -479,14 +479,14 @@ class IntegerLookupErrorTest(keras_parameterized.TestCase, def test_too_long_vocab_fails_in_single_setting(self): vocab_data = [42, 1138, 725, 1729] - layer = integer_lookup.IntegerLookup(max_values=4, num_oov_indices=1) + layer = integer_lookup.IntegerLookup(max_tokens=4, num_oov_indices=1) with self.assertRaisesRegex(ValueError, "vocabulary larger than the maximum vocab.*"): layer.set_vocabulary(vocab_data) - def test_zero_max_values_fails(self): - with self.assertRaisesRegex(ValueError, ".*max_values.*"): - _ = integer_lookup.IntegerLookup(max_values=0, num_oov_indices=1) + def test_zero_max_tokens_fails(self): + with self.assertRaisesRegex(ValueError, ".*max_tokens.*"): + _ = integer_lookup.IntegerLookup(max_tokens=0, num_oov_indices=1) @keras_parameterized.run_all_keras_modes(always_skip_v1=True) @@ -505,7 +505,7 @@ def test_vocabulary_persistence_across_saving(self): # Build and validate a golden model. 
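Because the constructor arguments now share their names with the base `IndexLookup` class, the `get_config` renaming shim deleted above is no longer needed and serialization round-trips directly through the base-class keys. A hedged sketch of what that looks like (the exact set of config keys comes from the base class):

from keras.layers.preprocessing import integer_lookup

layer = integer_lookup.IntegerLookup(vocabulary=[12, 36, 1138, 42])
config = layer.get_config()
# The config now carries the base-class names directly; previously these were
# renamed to max_values/mask_value/oov_value/pad_to_max_values.
assert "max_tokens" in config and "mask_token" in config
restored = integer_lookup.IntegerLookup.from_config(config)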
input_data = keras.Input(shape=(None,), dtype=tf.int64) - layer = integer_lookup.IntegerLookup(max_values=None, num_oov_indices=1) + layer = integer_lookup.IntegerLookup(max_tokens=None, num_oov_indices=1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) diff --git a/keras/layers/wrappers_test.py b/keras/layers/wrappers_test.py index 408196048b3f..e81c8c96ddfc 100644 --- a/keras/layers/wrappers_test.py +++ b/keras/layers/wrappers_test.py @@ -1022,13 +1022,10 @@ def compute_output_shape(self, input_shape): input_layer.compute_output_shape([None, 2, 4]).as_list(), [None, 2, 16]) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') def test_Bidirectional_last_output_with_masking(self): - if tf.test.is_built_with_rocm(): - # testcase uses input and/or output sequences which require padding - # leading to the following error on ROCm platform - # ROCm MIOpen only supports packed input output - # Skip this subtest for now - self.skipTest('Test not supported on the ROCm platform') rnn = keras.layers.LSTM samples = 2 dim = 5 @@ -1055,13 +1052,10 @@ def test_Bidirectional_last_output_with_masking(self): self.assertAllClose(y[0], np.concatenate([y[1], y[3]], axis=1)) @parameterized.parameters([keras.layers.LSTM, keras.layers.GRU]) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm MIOpen does not support padded input yet.') def test_Bidirectional_sequence_output_with_masking(self, rnn): - if tf.test.is_built_with_rocm(): - # testcase uses input and/or output sequences which require padding - # leading to the following error on ROCm platform - # ROCm MIOpen only supports packed input output - # Skip this subtest for now - self.skipTest('Test not supported on the ROCm platform') samples = 2 dim = 5 timesteps = 3 @@ -1263,10 +1257,10 @@ def test_wrapped_rnn_cell(self): batch_size=10) @parameterized.parameters(['ave', 'concat', 'mul']) + @tf.test.disable_with_predicate( + pred=tf.test.is_built_with_rocm, + skip_message='Skipping as ROCm RNN does not support ragged tensors yet.') def test_Bidirectional_ragged_input(self, merge_mode): - if tf.test.is_built_with_rocm(): - # ragged tenors are not supported in ROCM RNN implementation - self.skipTest('Test not supported on the ROCm platform') np.random.seed(100) rnn = keras.layers.LSTM units = 3 diff --git a/keras/metrics.py b/keras/metrics.py index a42d4c9ee23c..e4e83974d9cd 100644 --- a/keras/metrics.py +++ b/keras/metrics.py @@ -14,6 +14,7 @@ # ============================================================================== # pylint: disable=unused-import # pylint: disable=g-classes-have-attributes +# pylint: disable=g-doc-return-or-yield """Built-in metrics. 
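The wrappers test changes above swap the inline `is_built_with_rocm()`/`skipTest` boilerplate for the `tf.test.disable_with_predicate` decorator. The general shape of that pattern, with a hypothetical test body, is:

import tensorflow as tf

class PaddedInputTest(tf.test.TestCase):

  @tf.test.disable_with_predicate(
      pred=tf.test.is_built_with_rocm,
      skip_message='Skipping as ROCm MIOpen does not support padded input yet.')
  def test_padded_sequences(self):
    # Hypothetical body; the decorator skips it automatically on ROCm builds.
    self.assertEqual(2, 1 + 1)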
""" from __future__ import absolute_import @@ -24,10 +25,10 @@ import abc import types +import warnings import numpy as np import six -from tensorflow.python.eager import def_function from keras import activations from keras import backend as K from keras.engine import base_layer @@ -47,6 +48,7 @@ from keras.losses import sparse_categorical_crossentropy from keras.losses import squared_hinge from keras.saving.saved_model import metric_serialization +from keras.utils import generic_utils from keras.utils import losses_utils from keras.utils import metrics_utils from keras.utils import tf_inspect @@ -157,7 +159,7 @@ def update_state_fn(*args, **kwargs): ag_update_state = tf.__internal__.autograph.tf_convert(obj_update_state, control_status) return ag_update_state(*args, **kwargs) else: - if isinstance(obj.update_state, def_function.Function): + if isinstance(obj.update_state, tf.__internal__.function.Function): update_state_fn = obj.update_state else: update_state_fn = tf.function(obj.update_state) @@ -225,13 +227,20 @@ def get_config(self): """Returns the serializable config of the metric.""" return {'name': self.name, 'dtype': self.dtype} - def reset_states(self): + def reset_state(self): """Resets all of the metric state variables. This function is called between epochs/steps, when a metric is evaluated during training. """ - K.batch_set_value([(v, 0) for v in self.variables]) + if not generic_utils.is_default(self.reset_states): + warnings.warn('Metric %s implements a `reset_states()` method; rename it ' + 'to `reset_state()` (without the final "s"). The name ' + '`reset_states()` has been deprecated to improve API ' + 'consistency.' % (self.__class__.__name__,)) + return self.reset_states() + else: + K.batch_set_value([(v, 0) for v in self.variables]) @abc.abstractmethod def update_state(self, *args, **kwargs): @@ -323,6 +332,13 @@ def non_trainable_weights(self): def _trackable_saved_model_saver(self): return metric_serialization.MetricSavedModelSaver(self) + @generic_utils.default + @doc_controls.do_not_generate_docs + def reset_states(self): + # Backwards compatibility alias of `reset_state`. New classes should + # only implement `reset_state`. + return self.reset_state() + class Reduce(Metric): """Encapsulates metrics that perform a reduce operation on the values. @@ -475,7 +491,7 @@ class Mean(Reduce): >>> m.update_state([1, 3, 5, 7]) >>> m.result().numpy() 4.0 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([1, 3, 5, 7], sample_weight=[1, 1, 0, 0]) >>> m.result().numpy() 2.0 @@ -669,7 +685,7 @@ class Accuracy(MeanMetricWrapper): >>> m.result().numpy() 0.75 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[1], [2], [3], [4]], [[0], [2], [3], [4]], ... sample_weight=[1, 1, 0, 0]) >>> m.result().numpy() @@ -713,7 +729,7 @@ class BinaryAccuracy(MeanMetricWrapper): >>> m.result().numpy() 0.75 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[1], [1], [0], [0]], [[0.98], [1], [0], [0.6]], ... sample_weight=[1, 0, 0, 1]) >>> m.result().numpy() @@ -763,7 +779,7 @@ class CategoricalAccuracy(MeanMetricWrapper): >>> m.result().numpy() 0.5 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], ... [0.05, 0.95, 0]], ... sample_weight=[0.7, 0.3]) @@ -815,7 +831,7 @@ class SparseCategoricalAccuracy(MeanMetricWrapper): >>> m.result().numpy() 0.5 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[2], [1]], [[0.1, 0.6, 0.3], [0.05, 0.95, 0]], ... 
sample_weight=[0.7, 0.3]) >>> m.result().numpy() @@ -854,7 +870,7 @@ class TopKCategoricalAccuracy(MeanMetricWrapper): >>> m.result().numpy() 0.5 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 0, 1], [0, 1, 0]], ... [[0.1, 0.9, 0.8], [0.05, 0.95, 0]], ... sample_weight=[0.7, 0.3]) @@ -892,7 +908,7 @@ class SparseTopKCategoricalAccuracy(MeanMetricWrapper): >>> m.result().numpy() 0.5 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([2, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]], ... sample_weight=[0.7, 0.3]) >>> m.result().numpy() @@ -969,7 +985,7 @@ def result(self): result = self.accumulator return tf.convert_to_tensor(result) - def reset_states(self): + def reset_state(self): num_thresholds = len(to_list(self.thresholds)) K.batch_set_value( [(v, np.zeros((num_thresholds,))) for v in self.variables]) @@ -1007,7 +1023,7 @@ class FalsePositives(_ConfusionMatrixConditionCount): >>> m.result().numpy() 2.0 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 1, 0, 0], [0, 0, 1, 1], sample_weight=[0, 0, 1, 0]) >>> m.result().numpy() 1.0 @@ -1056,7 +1072,7 @@ class FalseNegatives(_ConfusionMatrixConditionCount): >>> m.result().numpy() 2.0 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 1, 1, 1], [0, 1, 0, 0], sample_weight=[0, 0, 1, 0]) >>> m.result().numpy() 1.0 @@ -1105,7 +1121,7 @@ class TrueNegatives(_ConfusionMatrixConditionCount): >>> m.result().numpy() 2.0 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 1, 0, 0], [1, 1, 0, 0], sample_weight=[0, 0, 1, 0]) >>> m.result().numpy() 1.0 @@ -1154,7 +1170,7 @@ class TruePositives(_ConfusionMatrixConditionCount): >>> m.result().numpy() 2.0 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) >>> m.result().numpy() 1.0 @@ -1219,7 +1235,7 @@ class Precision(Metric): >>> m.result().numpy() 0.6666667 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) >>> m.result().numpy() 1.0 @@ -1299,7 +1315,7 @@ def result(self): self.true_positives + self.false_positives) return result[0] if len(self.thresholds) == 1 else result - def reset_states(self): + def reset_state(self): num_thresholds = len(to_list(self.thresholds)) K.batch_set_value( [(v, np.zeros((num_thresholds,))) for v in self.variables]) @@ -1356,7 +1372,7 @@ class Recall(Metric): >>> m.result().numpy() 0.6666667 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) >>> m.result().numpy() 1.0 @@ -1424,7 +1440,7 @@ def result(self): self.true_positives + self.false_negatives) return result[0] if len(self.thresholds) == 1 else result - def reset_states(self): + def reset_state(self): num_thresholds = len(to_list(self.thresholds)) K.batch_set_value( [(v, np.zeros((num_thresholds,))) for v in self.variables]) @@ -1447,11 +1463,17 @@ class SensitivitySpecificityBase(Metric): [the following](https://en.wikipedia.org/wiki/Sensitivity_and_specificity). 
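The wholesale `reset_states` -> `reset_state` renames in this file rely on the base-class shim added near the top of `metrics.py`: `reset_state` only falls back to a legacy `reset_states` override when the subclass actually provides one, detected via `generic_utils.default`/`is_default`. Roughly, and with the marker attribute name below being an illustrative assumption rather than the real implementation detail:

def default(method):
  # Mark a base-class method so overrides can be detected later.
  method._is_default = True  # attribute name is illustrative only
  return method

def is_default(method):
  # True while the bound method still carries the base-class marker.
  return getattr(method, '_is_default', False)

class MetricSketch(object):

  def reset_state(self):
    if not is_default(self.reset_states):
      # Subclass overrode the legacy name: warn (omitted here) and delegate.
      return self.reset_states()
    pass  # default behaviour: zero out the metric's state variables

  @default
  def reset_states(self):
    # Backwards-compatibility alias; new subclasses override reset_state.
    return self.reset_state()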
""" - def __init__(self, value, num_thresholds=200, name=None, dtype=None): + def __init__(self, + value, + num_thresholds=200, + class_id=None, + name=None, + dtype=None): super(SensitivitySpecificityBase, self).__init__(name=name, dtype=dtype) if num_thresholds <= 0: raise ValueError('`num_thresholds` must be > 0.') self.value = value + self.class_id = class_id self.true_positives = self.add_weight( 'true_positives', shape=(num_thresholds,), @@ -1500,13 +1522,19 @@ def update_state(self, y_true, y_pred, sample_weight=None): y_true, y_pred, thresholds=self.thresholds, + class_id=self.class_id, sample_weight=sample_weight) - def reset_states(self): + def reset_state(self): num_thresholds = len(self.thresholds) K.batch_set_value( [(v, np.zeros((num_thresholds,))) for v in self.variables]) + def get_config(self): + config = {'class_id': self.class_id} + base_config = super(SensitivitySpecificityBase, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + def _find_max_under_constraint(self, constrained, dependent, predicate): """Returns the maximum of dependent_statistic that satisfies the constraint. @@ -1550,6 +1578,11 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase): If `sample_weight` is `None`, weights default to 1. Use `sample_weight` of 0 to mask values. + If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is above the threshold predictions, + and computing the fraction of them for which `class_id` is indeed a correct + label. + For additional information about specificity and sensitivity, see [the following](https://en.wikipedia.org/wiki/Sensitivity_and_specificity). @@ -1557,6 +1590,9 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase): specificity: A scalar value in range `[0, 1]`. num_thresholds: (Optional) Defaults to 200. The number of thresholds to use for matching the given specificity. + class_id: (Optional) Integer class ID for which we want binary metrics. + This must be in the half-open interval `[0, num_classes)`, where + `num_classes` is the last dimension of predictions. name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. @@ -1567,7 +1603,7 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase): >>> m.result().numpy() 0.5 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8], ... sample_weight=[1, 1, 2, 2, 1]) >>> m.result().numpy() @@ -1583,13 +1619,22 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase): ``` """ - def __init__(self, specificity, num_thresholds=200, name=None, dtype=None): + def __init__(self, + specificity, + num_thresholds=200, + class_id=None, + name=None, + dtype=None): if specificity < 0 or specificity > 1: raise ValueError('`specificity` must be in the range [0, 1].') self.specificity = specificity self.num_thresholds = num_thresholds super(SensitivityAtSpecificity, self).__init__( - specificity, num_thresholds=num_thresholds, name=name, dtype=dtype) + specificity, + num_thresholds=num_thresholds, + class_id=class_id, + name=name, + dtype=dtype) def result(self): specificities = tf.math.divide_no_nan( @@ -1625,6 +1670,11 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase): If `sample_weight` is `None`, weights default to 1. Use `sample_weight` of 0 to mask values. 
+ If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is above the threshold predictions, + and computing the fraction of them for which `class_id` is indeed a correct + label. + For additional information about specificity and sensitivity, see [the following](https://en.wikipedia.org/wiki/Sensitivity_and_specificity). @@ -1632,6 +1682,9 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase): sensitivity: A scalar value in range `[0, 1]`. num_thresholds: (Optional) Defaults to 200. The number of thresholds to use for matching the given sensitivity. + class_id: (Optional) Integer class ID for which we want binary metrics. + This must be in the half-open interval `[0, num_classes)`, where + `num_classes` is the last dimension of predictions. name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. @@ -1642,7 +1695,7 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase): >>> m.result().numpy() 0.66666667 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8], ... sample_weight=[1, 1, 2, 2, 2]) >>> m.result().numpy() @@ -1658,13 +1711,22 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase): ``` """ - def __init__(self, sensitivity, num_thresholds=200, name=None, dtype=None): + def __init__(self, + sensitivity, + num_thresholds=200, + class_id=None, + name=None, + dtype=None): if sensitivity < 0 or sensitivity > 1: raise ValueError('`sensitivity` must be in the range [0, 1].') self.sensitivity = sensitivity self.num_thresholds = num_thresholds super(SpecificityAtSensitivity, self).__init__( - sensitivity, num_thresholds=num_thresholds, name=name, dtype=dtype) + sensitivity, + num_thresholds=num_thresholds, + class_id=class_id, + name=name, + dtype=dtype) def result(self): sensitivities = tf.math.divide_no_nan( @@ -1695,10 +1757,18 @@ class PrecisionAtRecall(SensitivitySpecificityBase): If `sample_weight` is `None`, weights default to 1. Use `sample_weight` of 0 to mask values. + If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is above the threshold predictions, + and computing the fraction of them for which `class_id` is indeed a correct + label. + Args: recall: A scalar value in range `[0, 1]`. num_thresholds: (Optional) Defaults to 200. The number of thresholds to use for matching the given recall. + class_id: (Optional) Integer class ID for which we want binary metrics. + This must be in the half-open interval `[0, num_classes)`, where + `num_classes` is the last dimension of predictions. name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. @@ -1709,7 +1779,7 @@ class PrecisionAtRecall(SensitivitySpecificityBase): >>> m.result().numpy() 0.5 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 0, 0, 1, 1], [0, 0.3, 0.8, 0.3, 0.8], ... 
sample_weight=[2, 2, 2, 1, 1]) >>> m.result().numpy() @@ -1725,7 +1795,12 @@ class PrecisionAtRecall(SensitivitySpecificityBase): ``` """ - def __init__(self, recall, num_thresholds=200, name=None, dtype=None): + def __init__(self, + recall, + num_thresholds=200, + class_id=None, + name=None, + dtype=None): if recall < 0 or recall > 1: raise ValueError('`recall` must be in the range [0, 1].') self.recall = recall @@ -1733,6 +1808,7 @@ def __init__(self, recall, num_thresholds=200, name=None, dtype=None): super(PrecisionAtRecall, self).__init__( value=recall, num_thresholds=num_thresholds, + class_id=class_id, name=name, dtype=dtype) @@ -1765,10 +1841,18 @@ class RecallAtPrecision(SensitivitySpecificityBase): If `sample_weight` is `None`, weights default to 1. Use `sample_weight` of 0 to mask values. + If `class_id` is specified, we calculate precision by considering only the + entries in the batch for which `class_id` is above the threshold predictions, + and computing the fraction of them for which `class_id` is indeed a correct + label. + Args: precision: A scalar value in range `[0, 1]`. num_thresholds: (Optional) Defaults to 200. The number of thresholds to use for matching the given precision. + class_id: (Optional) Integer class ID for which we want binary metrics. + This must be in the half-open interval `[0, num_classes)`, where + `num_classes` is the last dimension of predictions. name: (Optional) string name of the metric instance. dtype: (Optional) data type of the metric result. @@ -1779,7 +1863,7 @@ class RecallAtPrecision(SensitivitySpecificityBase): >>> m.result().numpy() 0.5 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9], ... sample_weight=[1, 0, 0, 1]) >>> m.result().numpy() @@ -1795,7 +1879,12 @@ class RecallAtPrecision(SensitivitySpecificityBase): ``` """ - def __init__(self, precision, num_thresholds=200, name=None, dtype=None): + def __init__(self, + precision, + num_thresholds=200, + class_id=None, + name=None, + dtype=None): if precision < 0 or precision > 1: raise ValueError('`precision` must be in the range [0, 1].') self.precision = precision @@ -1803,6 +1892,7 @@ def __init__(self, precision, num_thresholds=200, name=None, dtype=None): super(RecallAtPrecision, self).__init__( value=precision, num_thresholds=num_thresholds, + class_id=class_id, name=name, dtype=dtype) @@ -1917,7 +2007,7 @@ class AUC(Metric): >>> m.result().numpy() 0.75 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9], ... sample_weight=[1, 0, 0, 1]) >>> m.result().numpy() @@ -2265,7 +2355,7 @@ def result(self): tf.multiply(x[:self.num_thresholds - 1] - x[1:], heights), name=self.name) - def reset_states(self): + def reset_state(self): if self.multi_label: K.batch_set_value([(v, np.zeros((self.num_thresholds, self._num_labels))) for v in self.variables]) @@ -2323,7 +2413,7 @@ class CosineSimilarity(MeanMetricWrapper): >>> m.result().numpy() 0.49999997 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]], ... sample_weight=[0.3, 0.7]) >>> m.result().numpy() @@ -2359,7 +2449,7 @@ class MeanAbsoluteError(MeanMetricWrapper): >>> m.result().numpy() 0.25 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], ... 
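The new `class_id` argument threaded through `SensitivitySpecificityBase` and its subclasses restricts the confusion-matrix update to a single class of one-hot labels and per-class predictions. A sketch mirroring the accompanying tests (assuming a build that already includes this change):

import tensorflow as tf

# Only class 2 of a 3-class problem is considered when updating the metric.
m = tf.keras.metrics.PrecisionAtRecall(recall=0.6, class_id=2)
pred_values = [0.0, 0.1, 0.2, 0.5, 0.6, 0.2, 0.5, 0.6, 0.8, 0.9]
y_pred = tf.transpose(tf.constant([pred_values] * 3))
y_true = tf.one_hot([0, 0, 0, 0, 0, 2, 2, 2, 2, 2], depth=3)
m.update_state(y_true, y_pred)
print(m.result().numpy())  # 0.75 for this data, per the new test case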
sample_weight=[1, 0]) >>> m.result().numpy() @@ -2395,7 +2485,7 @@ class MeanAbsolutePercentageError(MeanMetricWrapper): >>> m.result().numpy() 250000000.0 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], ... sample_weight=[1, 0]) >>> m.result().numpy() @@ -2431,7 +2521,7 @@ class MeanSquaredError(MeanMetricWrapper): >>> m.result().numpy() 0.25 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], ... sample_weight=[1, 0]) >>> m.result().numpy() @@ -2467,7 +2557,7 @@ class MeanSquaredLogarithmicError(MeanMetricWrapper): >>> m.result().numpy() 0.12011322 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], ... sample_weight=[1, 0]) >>> m.result().numpy() @@ -2506,7 +2596,7 @@ class Hinge(MeanMetricWrapper): >>> m.result().numpy() 1.3 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], ... sample_weight=[1, 0]) >>> m.result().numpy() @@ -2541,7 +2631,7 @@ class SquaredHinge(MeanMetricWrapper): >>> m.result().numpy() 1.86 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], ... sample_weight=[1, 0]) >>> m.result().numpy() @@ -2576,7 +2666,7 @@ class CategoricalHinge(MeanMetricWrapper): >>> m.result().numpy() 1.4000001 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], ... sample_weight=[1, 0]) >>> m.result().numpy() @@ -2607,7 +2697,7 @@ class RootMeanSquaredError(Mean): >>> m.result().numpy() 0.5 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], ... sample_weight=[1, 0]) >>> m.result().numpy() @@ -2668,7 +2758,7 @@ class LogCoshError(MeanMetricWrapper): >>> m.result().numpy() 0.10844523 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], ... sample_weight=[1, 0]) >>> m.result().numpy() @@ -2704,7 +2794,7 @@ class Poisson(MeanMetricWrapper): >>> m.result().numpy() 0.49999997 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], ... sample_weight=[1, 0]) >>> m.result().numpy() @@ -2740,7 +2830,7 @@ class KLDivergence(MeanMetricWrapper): >>> m.result().numpy() 0.45814306 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], ... sample_weight=[1, 0]) >>> m.result().numpy() @@ -2793,7 +2883,7 @@ class MeanIoU(Metric): >>> m.result().numpy() 0.33333334 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([0, 0, 1, 1], [0, 1, 0, 1], ... sample_weight=[0.3, 0.3, 0.3, 0.1]) >>> m.result().numpy() @@ -2881,7 +2971,7 @@ def result(self): return tf.math.divide_no_nan( tf.reduce_sum(iou, name='mean_iou'), num_valid_entries) - def reset_states(self): + def reset_state(self): K.set_value(self.total_cm, np.zeros((self.num_classes, self.num_classes))) def get_config(self): @@ -3008,7 +3098,7 @@ def result(self): ) return tf.math.divide_no_nan(self.total, self.count) - def reset_states(self): + def reset_state(self): if self._built: K.batch_set_value( [(v, np.zeros(self._shape.as_list())) for v in self.variables]) @@ -3038,7 +3128,7 @@ class BinaryCrossentropy(MeanMetricWrapper): >>> m.result().numpy() 0.81492424 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], ... 
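All of the built-in metrics above now document and implement `reset_state`; user-defined metrics should follow the same convention. A minimal custom metric written against the new name (overriding the old `reset_states` still works via the alias, but triggers a deprecation warning):

import tensorflow as tf

class SumOfSquaredError(tf.keras.metrics.Metric):

  def __init__(self, name='sum_of_squared_error', **kwargs):
    super(SumOfSquaredError, self).__init__(name=name, **kwargs)
    self.total = self.add_weight(name='total', initializer='zeros')

  def update_state(self, y_true, y_pred, sample_weight=None):
    y_true = tf.cast(y_true, self.dtype)
    y_pred = tf.cast(y_pred, self.dtype)
    self.total.assign_add(tf.reduce_sum(tf.square(y_pred - y_true)))

  def result(self):
    return self.total

  def reset_state(self):
    # New-style override; there is no need to also define reset_states().
    self.total.assign(0.0)

m = SumOfSquaredError()
m.update_state([0., 1.], [1., 3.])
print(m.result().numpy())  # 5.0
m.reset_state()
print(m.result().numpy())  # 0.0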
sample_weight=[1, 0]) >>> m.result().numpy() @@ -3101,7 +3191,7 @@ class CategoricalCrossentropy(MeanMetricWrapper): >>> m.result().numpy() 1.1769392 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([[0, 1, 0], [0, 0, 1]], ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]], ... sample_weight=tf.constant([0.3, 0.7])) @@ -3172,7 +3262,7 @@ class SparseCategoricalCrossentropy(MeanMetricWrapper): >>> m.result().numpy() 1.1769392 - >>> m.reset_states() + >>> m.reset_state() >>> m.update_state([1, 2], ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]], ... sample_weight=tf.constant([0.3, 0.7])) diff --git a/keras/metrics_confusion_matrix_test.py b/keras/metrics_confusion_matrix_test.py index 5d169c44268a..33298e21f232 100644 --- a/keras/metrics_confusion_matrix_test.py +++ b/keras/metrics_confusion_matrix_test.py @@ -24,12 +24,12 @@ from absl.testing import parameterized import numpy as np -from scipy.special import expit from keras import combinations from keras import layers from keras import metrics from keras import models from keras.utils import metrics_utils +from tensorflow.python.platform import tf_logging @combinations.generate(combinations.combine(mode=['graph', 'eager'])) @@ -729,11 +729,15 @@ class SensitivityAtSpecificityTest(tf.test.TestCase, parameterized.TestCase): def test_config(self): s_obj = metrics.SensitivityAtSpecificity( - 0.4, num_thresholds=100, name='sensitivity_at_specificity_1') + 0.4, + num_thresholds=100, + class_id=12, + name='sensitivity_at_specificity_1') self.assertEqual(s_obj.name, 'sensitivity_at_specificity_1') self.assertLen(s_obj.variables, 4) self.assertEqual(s_obj.specificity, 0.4) self.assertEqual(s_obj.num_thresholds, 100) + self.assertEqual(s_obj.class_id, 12) # Check save and restore config s_obj2 = metrics.SensitivityAtSpecificity.from_config(s_obj.get_config()) @@ -741,6 +745,7 @@ def test_config(self): self.assertLen(s_obj2.variables, 4) self.assertEqual(s_obj2.specificity, 0.4) self.assertEqual(s_obj2.num_thresholds, 100) + self.assertEqual(s_obj.class_id, 12) def test_value_is_idempotent(self): s_obj = metrics.SensitivityAtSpecificity(0.7) @@ -797,6 +802,17 @@ def test_unweighted_low_specificity(self): result = s_obj(y_true, y_pred) self.assertAlmostEqual(0.6, self.evaluate(result)) + def test_unweighted_class_id(self): + s_obj = metrics.SpecificityAtSensitivity(0.4, class_id=2) + pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26] + label_values = [0, 0, 0, 0, 0, 2, 2, 2, 2, 2] + + y_pred = tf.compat.v1.transpose([pred_values] * 3) + y_true = tf.one_hot(label_values, depth=3) + self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) + result = s_obj(y_true, y_pred) + self.assertAlmostEqual(0.6, self.evaluate(result)) + @parameterized.parameters([tf.bool, tf.int32, tf.float32]) def test_weighted(self, label_dtype): s_obj = metrics.SensitivityAtSpecificity(0.4) @@ -826,11 +842,15 @@ class SpecificityAtSensitivityTest(tf.test.TestCase, parameterized.TestCase): def test_config(self): s_obj = metrics.SpecificityAtSensitivity( - 0.4, num_thresholds=100, name='specificity_at_sensitivity_1') + 0.4, + num_thresholds=100, + class_id=12, + name='specificity_at_sensitivity_1') self.assertEqual(s_obj.name, 'specificity_at_sensitivity_1') self.assertLen(s_obj.variables, 4) self.assertEqual(s_obj.sensitivity, 0.4) self.assertEqual(s_obj.num_thresholds, 100) + self.assertEqual(s_obj.class_id, 12) # Check save and restore config s_obj2 = metrics.SpecificityAtSensitivity.from_config(s_obj.get_config()) @@ -838,6 +858,7 @@ def 
test_config(self): self.assertLen(s_obj2.variables, 4) self.assertEqual(s_obj2.sensitivity, 0.4) self.assertEqual(s_obj2.num_thresholds, 100) + self.assertEqual(s_obj.class_id, 12) def test_value_is_idempotent(self): s_obj = metrics.SpecificityAtSensitivity(0.7) @@ -893,6 +914,17 @@ def test_unweighted_low_sensitivity(self): result = s_obj(y_true, y_pred) self.assertAlmostEqual(0.6, self.evaluate(result)) + def test_unweighted_class_id(self): + s_obj = metrics.SpecificityAtSensitivity(0.4, class_id=2) + pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26] + label_values = [0, 0, 0, 0, 0, 2, 2, 2, 2, 2] + + y_pred = tf.compat.v1.transpose([pred_values] * 3) + y_true = tf.one_hot(label_values, depth=3) + self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) + result = s_obj(y_true, y_pred) + self.assertAlmostEqual(0.6, self.evaluate(result)) + @parameterized.parameters([tf.bool, tf.int32, tf.float32]) def test_weighted(self, label_dtype): s_obj = metrics.SpecificityAtSensitivity(0.4) @@ -922,11 +954,12 @@ class PrecisionAtRecallTest(tf.test.TestCase, parameterized.TestCase): def test_config(self): s_obj = metrics.PrecisionAtRecall( - 0.4, num_thresholds=100, name='precision_at_recall_1') + 0.4, num_thresholds=100, class_id=12, name='precision_at_recall_1') self.assertEqual(s_obj.name, 'precision_at_recall_1') self.assertLen(s_obj.variables, 4) self.assertEqual(s_obj.recall, 0.4) self.assertEqual(s_obj.num_thresholds, 100) + self.assertEqual(s_obj.class_id, 12) # Check save and restore config s_obj2 = metrics.PrecisionAtRecall.from_config(s_obj.get_config()) @@ -934,6 +967,7 @@ def test_config(self): self.assertLen(s_obj2.variables, 4) self.assertEqual(s_obj2.recall, 0.4) self.assertEqual(s_obj2.num_thresholds, 100) + self.assertEqual(s_obj.class_id, 12) def test_value_is_idempotent(self): s_obj = metrics.PrecisionAtRecall(0.7) @@ -991,6 +1025,18 @@ def test_unweighted_low_recall(self): # For 0.2 < decision threshold < 0.5. self.assertAlmostEqual(0.75, self.evaluate(result)) + def test_unweighted_class_id(self): + s_obj = metrics.PrecisionAtRecall(0.6, class_id=2) + pred_values = [0.0, 0.1, 0.2, 0.5, 0.6, 0.2, 0.5, 0.6, 0.8, 0.9] + label_values = [0, 0, 0, 0, 0, 2, 2, 2, 2, 2] + + y_pred = tf.compat.v1.transpose([pred_values] * 3) + y_true = tf.one_hot(label_values, depth=3) + self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) + result = s_obj(y_true, y_pred) + # For 0.2 < decision threshold < 0.5. 
+ self.assertAlmostEqual(0.75, self.evaluate(result)) + @parameterized.parameters([tf.bool, tf.int32, tf.float32]) def test_weighted(self, label_dtype): s_obj = metrics.PrecisionAtRecall(7.0/8) @@ -1021,11 +1067,12 @@ class RecallAtPrecisionTest(tf.test.TestCase, parameterized.TestCase): def test_config(self): s_obj = metrics.RecallAtPrecision( - 0.4, num_thresholds=100, name='recall_at_precision_1') + 0.4, num_thresholds=100, class_id=12, name='recall_at_precision_1') self.assertEqual(s_obj.name, 'recall_at_precision_1') self.assertLen(s_obj.variables, 4) self.assertEqual(s_obj.precision, 0.4) self.assertEqual(s_obj.num_thresholds, 100) + self.assertEqual(s_obj.class_id, 12) # Check save and restore config s_obj2 = metrics.RecallAtPrecision.from_config(s_obj.get_config()) @@ -1033,6 +1080,7 @@ def test_config(self): self.assertLen(s_obj2.variables, 4) self.assertEqual(s_obj2.precision, 0.4) self.assertEqual(s_obj2.num_thresholds, 100) + self.assertEqual(s_obj.class_id, 12) def test_value_is_idempotent(self): s_obj = metrics.RecallAtPrecision(0.7) @@ -1096,6 +1144,21 @@ def test_unweighted_low_precision(self): # The precision 5/7 can be reached at thresholds 00.3<=t<0.35. self.assertAlmostEqual(5. / 6, self.evaluate(result)) + def test_unweighted_class_id(self): + s_obj = metrics.RecallAtPrecision(2.0 / 3, class_id=2) + pred_values = [ + 0.05, 0.1, 0.2, 0.3, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.9, 0.95 + ] + label_values = [0, 2, 0, 0, 0, 2, 2, 0, 2, 2, 0, 2] + # precisions: [1/2, 6/11, 1/2, 5/9, 5/8, 5/7, 2/3, 3/5, 3/5, 2/3, 1/2, 1]. + # recalls: [1, 1, 5/6, 5/6, 5/6, 5/6, 2/3, 1/2, 1/2, 1/3, 1/6, 1/6]. + y_pred = tf.compat.v1.transpose([pred_values] * 3) + y_true = tf.one_hot(label_values, depth=3) + self.evaluate(tf.compat.v1.variables_initializer(s_obj.variables)) + result = s_obj(y_true, y_pred) + # The precision 5/7 can be reached at thresholds 00.3<=t<0.35. + self.assertAlmostEqual(5. 
/ 6, self.evaluate(result)) + @parameterized.parameters([tf.bool, tf.int32, tf.float32]) def test_weighted(self, label_dtype): s_obj = metrics.RecallAtPrecision(0.75) @@ -1409,16 +1472,20 @@ def test_invalid_summation_method(self): metrics.AUC(summation_method='Invalid') def test_extra_dims(self): - self.setup() - logits = expit(-np.array([[[-10., 10., -10.], [10., -10., 10.]], - [[-12., 12., -12.], [12., -12., 12.]]], - dtype=np.float32)) - labels = np.array([[[1, 0, 0], [1, 0, 0]], - [[0, 1, 1], [0, 1, 1]]], dtype=np.int64) - auc_obj = metrics.AUC() - self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) - result = auc_obj(labels, logits) - self.assertEqual(self.evaluate(result), 0.5) + try: + from scipy import special # pylint: disable=g-import-not-at-top + self.setup() + logits = special.expit(-np.array([[[-10., 10., -10.], [10., -10., 10.]], + [[-12., 12., -12.], [12., -12., 12.]]], + dtype=np.float32)) + labels = np.array([[[1, 0, 0], [1, 0, 0]], [[0, 1, 1], [0, 1, 1]]], + dtype=np.int64) + auc_obj = metrics.AUC() + self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) + result = auc_obj(labels, logits) + self.assertEqual(self.evaluate(result), 0.5) + except ImportError as e: + tf_logging.warn('Cannot test special functions: %s' % str(e)) @combinations.generate(combinations.combine(mode=['graph', 'eager'])) @@ -1696,14 +1763,14 @@ def test_keras_model_compiles(self): metrics=[metrics.AUC(multi_label=True)] ) - def test_reset_states(self): + def test_reset_state(self): with self.test_session(): self.setup() auc_obj = metrics.AUC(num_thresholds=self.num_thresholds, multi_label=True) self.evaluate(tf.compat.v1.variables_initializer(auc_obj.variables)) auc_obj(self.y_true_good, self.y_pred) - auc_obj.reset_states() + auc_obj.reset_state() self.assertAllEqual(auc_obj.true_positives, np.zeros((5, 2))) diff --git a/keras/metrics_test.py b/keras/metrics_test.py index 4ab5c4c00c58..e84273da4ccd 100644 --- a/keras/metrics_test.py +++ b/keras/metrics_test.py @@ -64,8 +64,8 @@ def test_sum(self): self.assertAlmostEqual(self.evaluate(m.result()), 106) self.assertEqual(self.evaluate(m.total), 106) # 100 + 1 + 5 - # check reset_states() - m.reset_states() + # check reset_state() + m.reset_state() self.assertEqual(self.evaluate(m.total), 0) def test_sum_with_sample_weight(self): @@ -189,8 +189,8 @@ def test_mean(self): self.assertEqual(self.evaluate(m.total), 106) # 100 + 1 + 5 self.assertEqual(self.evaluate(m.count), 3) - # check reset_states() - m.reset_states() + # check reset_state() + m.reset_state() self.assertEqual(self.evaluate(m.total), 0) self.assertEqual(self.evaluate(m.count), 0) @@ -205,10 +205,10 @@ def test_mean(self): def test_function_wrapped_reset_state(self): m = metrics.Mean(name='my_mean') - # check reset_states in function. + # check reset_state in function. 
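The `test_extra_dims` change above makes the SciPy dependency optional: the test body only runs when `scipy` imports, and otherwise a warning is logged instead of the whole test module failing at import time. The same guard, restructured slightly as a standalone sketch:

from tensorflow.python.platform import tf_logging

def expit_if_available(values):
  # Run the SciPy-dependent computation only when scipy is installed,
  # mirroring the guarded import in test_extra_dims.
  try:
    from scipy import special  # pylint: disable=g-import-not-at-top
  except ImportError as e:
    tf_logging.warn('Cannot test special functions: %s' % str(e))
    return None
  return special.expit(values)

print(expit_if_available([-2.0, 0.0, 2.0]))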
@tf.function def reset_in_fn(): - m.reset_states() + m.reset_state() return m.update_state(100) for _ in range(5): @@ -1415,8 +1415,8 @@ def test_unweighted(self): self.assertAllClose(self.evaluate(m.total), [101, 45]) self.assertAllClose(self.evaluate(m.count), [2, 2]) - # check reset_states() - m.reset_states() + # check reset_state() + m.reset_state() self.assertAllClose(self.evaluate(m.total), [0, 0]) self.assertAllClose(self.evaluate(m.count), [0, 0]) @@ -2116,7 +2116,7 @@ def _get_model(compile_metrics): @keras_parameterized.run_all_keras_modes class ResetStatesTest(keras_parameterized.TestCase): - def test_reset_states_false_positives(self): + def test_reset_state_false_positives(self): fp_obj = metrics.FalsePositives() model = _get_model([fp_obj]) x = np.ones((100, 4)) @@ -2126,7 +2126,7 @@ def test_reset_states_false_positives(self): model.evaluate(x, y) self.assertEqual(self.evaluate(fp_obj.accumulator), 100.) - def test_reset_states_false_negatives(self): + def test_reset_state_false_negatives(self): fn_obj = metrics.FalseNegatives() model = _get_model([fn_obj]) x = np.zeros((100, 4)) @@ -2136,7 +2136,7 @@ def test_reset_states_false_negatives(self): model.evaluate(x, y) self.assertEqual(self.evaluate(fn_obj.accumulator), 100.) - def test_reset_states_true_negatives(self): + def test_reset_state_true_negatives(self): tn_obj = metrics.TrueNegatives() model = _get_model([tn_obj]) x = np.zeros((100, 4)) @@ -2146,7 +2146,7 @@ def test_reset_states_true_negatives(self): model.evaluate(x, y) self.assertEqual(self.evaluate(tn_obj.accumulator), 100.) - def test_reset_states_true_positives(self): + def test_reset_state_true_positives(self): tp_obj = metrics.TruePositives() model = _get_model([tp_obj]) x = np.ones((100, 4)) @@ -2156,7 +2156,7 @@ def test_reset_states_true_positives(self): model.evaluate(x, y) self.assertEqual(self.evaluate(tp_obj.accumulator), 100.) - def test_reset_states_precision(self): + def test_reset_state_precision(self): p_obj = metrics.Precision() model = _get_model([p_obj]) x = np.concatenate((np.ones((50, 4)), np.ones((50, 4)))) @@ -2168,7 +2168,7 @@ def test_reset_states_precision(self): self.assertEqual(self.evaluate(p_obj.true_positives), 50.) self.assertEqual(self.evaluate(p_obj.false_positives), 50.) - def test_reset_states_recall(self): + def test_reset_state_recall(self): r_obj = metrics.Recall() model = _get_model([r_obj]) x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4)))) @@ -2180,7 +2180,7 @@ def test_reset_states_recall(self): self.assertEqual(self.evaluate(r_obj.true_positives), 50.) self.assertEqual(self.evaluate(r_obj.false_negatives), 50.) - def test_reset_states_sensitivity_at_specificity(self): + def test_reset_state_sensitivity_at_specificity(self): s_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1) model = _get_model([s_obj]) x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), @@ -2195,7 +2195,7 @@ def test_reset_states_sensitivity_at_specificity(self): self.assertEqual(self.evaluate(s_obj.false_negatives), 25.) self.assertEqual(self.evaluate(s_obj.true_negatives), 25.) - def test_reset_states_specificity_at_sensitivity(self): + def test_reset_state_specificity_at_sensitivity(self): s_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1) model = _get_model([s_obj]) x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), @@ -2210,7 +2210,7 @@ def test_reset_states_specificity_at_sensitivity(self): self.assertEqual(self.evaluate(s_obj.false_negatives), 25.) 
self.assertEqual(self.evaluate(s_obj.true_negatives), 25.) - def test_reset_states_precision_at_recall(self): + def test_reset_state_precision_at_recall(self): s_obj = metrics.PrecisionAtRecall(recall=0.5, num_thresholds=1) model = _get_model([s_obj]) x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), @@ -2225,7 +2225,7 @@ def test_reset_states_precision_at_recall(self): self.assertEqual(self.evaluate(s_obj.false_negatives), 25.) self.assertEqual(self.evaluate(s_obj.true_negatives), 25.) - def test_reset_states_recall_at_precision(self): + def test_reset_state_recall_at_precision(self): s_obj = metrics.RecallAtPrecision(precision=0.5, num_thresholds=1) model = _get_model([s_obj]) x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), @@ -2240,7 +2240,7 @@ def test_reset_states_recall_at_precision(self): self.assertEqual(self.evaluate(s_obj.false_negatives), 25.) self.assertEqual(self.evaluate(s_obj.true_negatives), 25.) - def test_reset_states_auc(self): + def test_reset_state_auc(self): auc_obj = metrics.AUC(num_thresholds=3) model = _get_model([auc_obj]) x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), @@ -2255,7 +2255,7 @@ def test_reset_states_auc(self): self.assertEqual(self.evaluate(auc_obj.false_negatives[1]), 25.) self.assertEqual(self.evaluate(auc_obj.true_negatives[1]), 25.) - def test_reset_states_auc_from_logits(self): + def test_reset_state_auc_from_logits(self): auc_obj = metrics.AUC(num_thresholds=3, from_logits=True) model_layers = [layers.Dense(1, kernel_initializer='ones', use_bias=False)] @@ -2278,7 +2278,7 @@ def test_reset_states_auc_from_logits(self): self.assertEqual(self.evaluate(auc_obj.false_negatives[1]), 25.) self.assertEqual(self.evaluate(auc_obj.true_negatives[1]), 25.) - def test_reset_states_auc_manual_thresholds(self): + def test_reset_state_auc_manual_thresholds(self): auc_obj = metrics.AUC(thresholds=[0.5]) model = _get_model([auc_obj]) x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)), @@ -2293,7 +2293,7 @@ def test_reset_states_auc_manual_thresholds(self): self.assertEqual(self.evaluate(auc_obj.false_negatives[1]), 25.) self.assertEqual(self.evaluate(auc_obj.true_negatives[1]), 25.) - def test_reset_states_mean_iou(self): + def test_reset_state_mean_iou(self): m_obj = metrics.MeanIoU(num_classes=2) model = _get_model([m_obj]) x = np.asarray([[0, 0, 0, 0], [1, 1, 1, 1], [1, 0, 1, 0], [0, 1, 0, 1]], @@ -2306,7 +2306,7 @@ def test_reset_states_mean_iou(self): self.assertArrayNear(self.evaluate(m_obj.total_cm)[0], [1, 0], 1e-1) self.assertArrayNear(self.evaluate(m_obj.total_cm)[1], [3, 0], 1e-1) - def test_reset_states_recall_float64(self): + def test_reset_state_recall_float64(self): # Test case for GitHub issue 36790. try: backend.set_floatx('float64') diff --git a/keras/models.py b/keras/models.py index 5822dece01da..2222a4cf974b 100644 --- a/keras/models.py +++ b/keras/models.py @@ -386,22 +386,23 @@ def _clone_sequential_model(model, input_tensors=None, layer_fn=_clone_layer): @keras_export('keras.models.clone_model') def clone_model(model, input_tensors=None, clone_function=None): - """Clone any `Model` instance. + """Clone a Functional or Sequential `Model` instance. Model cloning is similar to calling a model on new inputs, except that it creates new layers (and thus new weights) instead of sharing the weights of the existing layers. + Note that `clone_model` will not preserve the uniqueness of shared objects within the model (e.g. 
a single variable attached to two distinct layers will be restored as two separate variables). Args: model: Instance of `Model` - (could be a functional model or a Sequential model). + (could be a Functional model or a Sequential model). input_tensors: optional list of input tensors or InputLayer objects to build the model upon. If not provided, - placeholders will be created. + new `Input` objects will be created. clone_function: Callable to be used to clone each layer in the target model (except `InputLayer` instances). It takes as argument the layer instance to be cloned, and returns the corresponding layer instance to @@ -414,14 +415,34 @@ def clone_model(model, input_tensors=None, clone_function=None): `Bidirectional(LSTM(...))` instances, for example). Returns: - An instance of `Model` reproducing the behavior - of the original model, on top of new inputs tensors, - using newly instantiated weights. The cloned model might behave - differently from the original model if a custom clone_function - modifies the layer. - - Raises: - ValueError: in case of invalid `model` argument value. + An instance of `Model` reproducing the behavior + of the original model, on top of new input tensors, + using newly instantiated weights. The cloned model may behave + differently from the original model if a custom `clone_function` + modifies the layer. + + Example: + + ```python + # Create a test Sequential model. + model = keras.Sequential([ + keras.Input(shape=(728,)), + keras.layers.Dense(32, activation='relu'), + keras.layers.Dense(1, activation='sigmoid'), + ]) + # Create a copy of the test model (with freshly initialized weights). + new_model = clone_model(model) + ``` + + Note that subclassed models cannot be cloned, since their internal + layer structure is not known. To achieve functionality equivalent + to `clone_model` in the case of a subclassed model, simply make sure + that the model class implements `get_config()` + (and optionally `from_config()`), and call: + + ```python + new_model = model.__class__.from_config(model.get_config()) + ``` """ with generic_utils.DisableSharedObjectScope(): if clone_function is None: diff --git a/keras/optimizer_v2/adadelta.py b/keras/optimizer_v2/adadelta.py index a59cd09859c7..fc650c4e47fe 100644 --- a/keras/optimizer_v2/adadelta.py +++ b/keras/optimizer_v2/adadelta.py @@ -33,42 +33,34 @@ class Adadelta(optimizer_v2.OptimizerV2): Adadelta optimization is a stochastic gradient descent method that is based on adaptive learning rate per dimension to address two drawbacks: - - The continual decay of learning rates throughout training - - The need for a manually selected global learning rate + - The continual decay of learning rates throughout training. + - The need for a manually selected global learning rate. Adadelta is a more robust extension of Adagrad that adapts learning rates based on a moving window of gradient updates, instead of accumulating all past gradients. This way, Adadelta continues learning even when many updates have been done. Compared to Adagrad, in the original version of Adadelta you - don't have to set an initial learning rate. In this version, initial + don't have to set an initial learning rate. In this version, the initial learning rate can be set, as in most other Keras optimizers. - According to section 4.3 ("Effective Learning rates"), near the end of - training step sizes converge to 1 which is effectively a high learning - rate which would cause divergence.
This occurs only near the end of the - training as gradients and step sizes are small, and the epsilon constant - in the numerator and denominator dominate past gradients and parameter - updates which converge the learning rate to 1. - - According to section 4.4("Speech Data"),where a large neural network with - 4 hidden layers was trained on a corpus of US English data, ADADELTA was - used with 100 network replicas.The epsilon used is 1e-6 with rho=0.95 - which converged faster than ADAGRAD, by the following construction: - def __init__(self, lr=1.0, rho=0.95, epsilon=1e-6, decay=0., **kwargs): - Args: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. - To match the exact form in the original paper use 1.0. + learning_rate: Initial value for the learning rate: + either a floating point value, + or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. + Defaults to 0.001. + Note that `Adadelta` tends to benefit from higher initial learning rate + values compared to other optimizers. + To match the exact form in the original paper, use 1.0. rho: A `Tensor` or a floating point value. The decay rate. - epsilon: A `Tensor` or a floating point value. A constant epsilon used - to better conditioning the grad update. + epsilon: Small floating point value used to maintain numerical stability. name: Optional name prefix for the operations created when applying gradients. Defaults to `"Adadelta"`. **kwargs: Keyword arguments. Allowed to be one of `"clipnorm"` or `"clipvalue"`. - `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips - gradients by value. + `"clipnorm"` (float) clips gradients by norm and represents + the maximum norm of each parameter's gradient; + `"clipvalue"` (float) clips gradients by value and represents the + maximum absolute value of each gradient entry. Reference: - [Zeiler, 2012](http://arxiv.org/abs/1212.5701) diff --git a/keras/optimizer_v2/adadelta_test.py b/keras/optimizer_v2/adadelta_test.py index cef0c96fb8f6..2ce4412aa1c0 100644 --- a/keras/optimizer_v2/adadelta_test.py +++ b/keras/optimizer_v2/adadelta_test.py @@ -25,7 +25,10 @@ from keras import combinations from keras.optimizer_v2 import adadelta -_DATA_TYPES = [tf.half, tf.float32, tf.float64, tf.complex64, tf.complex128] +_DATA_TYPES = [ + tf.half, tf.float32, tf.float64, tf.complex64, + tf.complex128 +] class AdadeltaOptimizerTest(tf.test.TestCase, parameterized.TestCase): diff --git a/keras/optimizer_v2/adagrad.py b/keras/optimizer_v2/adagrad.py index 0aa6bb395be7..910cff8598f5 100644 --- a/keras/optimizer_v2/adagrad.py +++ b/keras/optimizer_v2/adagrad.py @@ -36,17 +36,25 @@ class Adagrad(optimizer_v2.OptimizerV2): the smaller the updates. Args: - learning_rate: A `Tensor`, floating point value, or a schedule that is a - `tf.keras.optimizers.schedules.LearningRateSchedule`. The learning rate. - initial_accumulator_value: A floating point value. - Starting value for the accumulators, must be non-negative. - epsilon: A small floating point value to avoid zero denominator. + learning_rate: Initial value for the learning rate: + either a floating point value, + or a `tf.keras.optimizers.schedules.LearningRateSchedule` instance. + Defaults to 0.001. + Note that `Adagrad` tends to benefit from higher initial learning rate + values compared to other optimizers. + To match the exact form in the original paper, use 1.0. + initial_accumulator_value: Floating point value.
+ Starting value for the accumulators (per-parameter momentum values). + Must be non-negative. + epsilon: Small floating point value used to maintain numerical stability. name: Optional name prefix for the operations created when applying gradients. Defaults to `"Adagrad"`. **kwargs: Keyword arguments. Allowed to be one of `"clipnorm"` or `"clipvalue"`. - `"clipnorm"` (float) clips gradients by norm; `"clipvalue"` (float) clips - gradients by value. + `"clipnorm"` (float) clips gradients by norm and represents + the maximum L2 norm of each weight variable's gradient; + `"clipvalue"` (float) clips gradients by value and represents the + maximum absolute value of each gradient entry. Reference: - [Duchi et al., 2011]( diff --git a/keras/optimizer_v2/adagrad_test.py b/keras/optimizer_v2/adagrad_test.py index ebff419eb15a..f5055ab21c3e 100644 --- a/keras/optimizer_v2/adagrad_test.py +++ b/keras/optimizer_v2/adagrad_test.py @@ -28,7 +28,10 @@ from keras.optimizer_v2 import adagrad from keras.optimizer_v2 import learning_rate_schedule -_DATA_TYPES = [tf.half, tf.float32, tf.float64, tf.complex64, tf.complex128] +_DATA_TYPES = [ + tf.half, tf.float32, tf.float64, tf.complex64, + tf.complex128 +] def adagrad_update_numpy(param, accum, g_t, lr=0.001, epsilon=1e-7): diff --git a/keras/optimizer_v2/learning_rate_schedule.py b/keras/optimizer_v2/learning_rate_schedule.py index 4cc5be8164fc..41d1983d534c 100644 --- a/keras/optimizer_v2/learning_rate_schedule.py +++ b/keras/optimizer_v2/learning_rate_schedule.py @@ -27,12 +27,46 @@ @keras_export("keras.optimizers.schedules.LearningRateSchedule") class LearningRateSchedule(object): - """A serializable learning rate decay schedule. + """The learning rate schedule base class. - `LearningRateSchedule`s can be passed in as the learning rate of optimizers in - `tf.keras.optimizers`. They can be serialized and deserialized using - `tf.keras.optimizers.schedules.serialize` and - `tf.keras.optimizers.schedules.deserialize`. + You can use a learning rate schedule to modulate how the learning rate + of your optimizer changes over time. + + Several built-in learning rate schedules are available, such as + `tf.keras.optimizers.schedules.ExponentialDecay` or + `tf.keras.optimizers.schedules.PiecewiseConstantDecay`: + + ```python + lr_schedule = keras.optimizers.schedules.ExponentialDecay( + initial_learning_rate=1e-2, + decay_steps=10000, + decay_rate=0.9) + optimizer = keras.optimizers.SGD(learning_rate=lr_schedule) + ``` + + A `LearningRateSchedule` instance can be passed in as the `learning_rate` + argument of any optimizer. + + To implement your own schedule object, you should override the `__call__` + method, which takes a `step` argument (scalar integer tensor, the + current training step count). + As with any other Keras object, you can also optionally + make your object serializable by implementing the `get_config` + and `from_config` methods.
+ + Example: + + ```python + class MyLRSchedule(tf.keras.optimizers.schedules.LearningRateSchedule): + + def __init__(self, initial_learning_rate): + self.initial_learning_rate = initial_learning_rate + + def __call__(self, step): + return self.initial_learning_rate / (step + 1) + + optimizer = tf.keras.optimizers.SGD(learning_rate=MyLRSchedule(0.1)) + ``` """ @abc.abstractmethod diff --git a/keras/optimizer_v2/rmsprop_test.py b/keras/optimizer_v2/rmsprop_test.py index 03271b1ed82b..e265d9ebf845 100644 --- a/keras/optimizer_v2/rmsprop_test.py +++ b/keras/optimizer_v2/rmsprop_test.py @@ -32,7 +32,10 @@ from keras.optimizer_v2 import learning_rate_schedule from keras.optimizer_v2 import rmsprop -_DATA_TYPES = [tf.half, tf.float32, tf.float64, tf.complex64, tf.complex128] +_DATA_TYPES = [ + tf.half, tf.float32, tf.float64, tf.complex64, + tf.complex128 +] _TEST_PARAM_VALUES = [ # learning_rate, rho, momentum, epsilon, centered diff --git a/keras/optimizers.py b/keras/optimizers.py index 8a3b892dc2b7..30151d182255 100644 --- a/keras/optimizers.py +++ b/keras/optimizers.py @@ -44,6 +44,22 @@ @keras_export('keras.optimizers.serialize') def serialize(optimizer): + """Serialize the optimizer configuration to a JSON-compatible Python dict. + + The configuration can be used for persistence and to reconstruct the + `Optimizer` instance again. + + >>> tf.keras.optimizers.serialize(tf.keras.optimizers.SGD()) + {'class_name': 'SGD', 'config': {'name': 'SGD', 'learning_rate': 0.01, + 'decay': 0.0, 'momentum': 0.0, + 'nesterov': False}} + + Args: + optimizer: An `Optimizer` instance to serialize. + + Returns: + A Python dict containing the configuration of the input optimizer. + """ return serialize_keras_object(optimizer) diff --git a/keras/saving/saved_model/load.py b/keras/saving/saved_model/load.py index 3b0cc090a34d..9f2739e7e13e 100644 --- a/keras/saving/saved_model/load.py +++ b/keras/saving/saved_model/load.py @@ -40,7 +40,6 @@ from keras.utils.generic_utils import LazyLoader from tensorflow.python.platform import tf_logging as logging from tensorflow.python.saved_model import loader_impl -from tensorflow.python.saved_model import nested_structure_coder from tensorflow.python.saved_model import revived_types from tensorflow.python.training.tracking import base as trackable @@ -749,7 +748,7 @@ def _search_for_child_node(self, parent_id, path_to_child): def _infer_inputs(self, layer_node_id, convert_to_shapes=False): """Infers input shape of layer from SavedModel functions.""" - coder = nested_structure_coder.StructureCoder() + coder = tf.__internal__.saved_model.StructureCoder() call_fn_id = self._search_for_child_node( layer_node_id, ['call_and_return_all_conditional_losses']) if call_fn_id is None: diff --git a/keras/saving/saved_model/save_impl.py b/keras/saving/saved_model/save_impl.py index 90a29f1da35e..e32735ea8e70 100644 --- a/keras/saving/saved_model/save_impl.py +++ b/keras/saving/saved_model/save_impl.py @@ -26,8 +26,6 @@ import functools import threading import weakref - -from tensorflow.python.eager import def_function from keras import backend as K from keras.engine import base_layer_utils from keras.engine import input_spec @@ -420,7 +418,7 @@ def _generate_input_signature(self, layer): List of possibly nested TensorSpecs of the layer call function inputs. The list does not contain the `training` argument.
""" - if (isinstance(layer.call, def_function.Function) and + if (isinstance(layer.call, tf.__internal__.function.Function) and layer.call.input_signature is not None): return layer.call.input_signature elif isinstance(layer, training_lib.Model): @@ -687,19 +685,19 @@ def _wrap_unconditional_loss(loss_fn, index): """Wraps callable/unconditional loss, returning a serializable function.""" # Extract original loss function from partial function fn = loss_fn.args[0] if isinstance(loss_fn, functools.partial) else loss_fn - if isinstance(fn, def_function.Function): + if isinstance(fn, tf.__internal__.function.Function): return fn else: - return def_function.Function( + return tf.__internal__.function.Function( fn, 'loss_fn_{}'.format(index), input_signature=[]) def _wrap_activity_regularizer(layer): """Wraps the activity regularizer.""" # pylint: disable=protected-access - if isinstance(layer._activity_regularizer, def_function.Function): + if isinstance(layer._activity_regularizer, tf.__internal__.function.Function): return layer._activity_regularizer - return def_function.Function( + return tf.__internal__.function.Function( layer._activity_regularizer, '{}_activity_regularizer'.format(layer.name), input_signature=[ @@ -709,6 +707,6 @@ def _wrap_activity_regularizer(layer): def _get_layer_call_method(layer): - if isinstance(layer.call, (def_function.Function)): + if isinstance(layer.call, (tf.__internal__.function.Function)): return layer.call.python_function return layer.call diff --git a/keras/saving/saved_model/serialized_attributes.py b/keras/saving/saved_model/serialized_attributes.py index a17005983124..c981184e3061 100644 --- a/keras/saving/saved_model/serialized_attributes.py +++ b/keras/saving/saved_model/serialized_attributes.py @@ -19,8 +19,6 @@ from __future__ import print_function import tensorflow.compat.v2 as tf - -from tensorflow.python.eager import def_function from keras.saving.saved_model import constants from keras.saving.saved_model import save_impl from keras.utils.generic_utils import LazyLoader @@ -196,7 +194,7 @@ def set_and_validate_functions(self, function_dict): if key in function_dict: if (function_dict[key] is not None and # Not all functions are required not isinstance(function_dict[key], - (def_function.Function, save_impl.LayerCall))): + (tf.__internal__.function.Function, save_impl.LayerCall))): raise ValueError( 'Function dictionary contained a non-function object: {} (for key' ' {})'.format(function_dict[key], key)) diff --git a/keras/saving/saving_utils.py b/keras/saving/saving_utils.py index 7a532065de2b..51644a384131 100644 --- a/keras/saving/saving_utils.py +++ b/keras/saving/saving_utils.py @@ -23,8 +23,6 @@ import copy import os import six - -from tensorflow.python.eager import def_function from keras import backend as K from keras import losses from keras import optimizer_v1 @@ -114,7 +112,7 @@ def trace_model_call(model, input_signature=None): ValueError: if input signature cannot be inferred from the model. 
""" if input_signature is None: - if isinstance(model.call, def_function.Function): + if isinstance(model.call, tf.__internal__.function.Function): input_signature = model.call.input_signature if input_signature is None: diff --git a/keras/utils/dataset_creator.py b/keras/utils/dataset_creator.py index dbe82f2fbf59..8b86a9a2841c 100644 --- a/keras/utils/dataset_creator.py +++ b/keras/utils/dataset_creator.py @@ -19,14 +19,17 @@ from __future__ import print_function import tensorflow.compat.v2 as tf +from tensorflow.python.util.tf_export import keras_export +@keras_export('keras.utils.experimental.DatasetCreator', v1=[]) class DatasetCreator(object): """Object that returns a `tf.data.Dataset` upon invoking. - `DatasetCreator` is designated as a supported type for `x`, or the input, in - `tf.keras.Model.fit`. Pass an instance of this class to `fit` when using a - callable (with a `input_context` argument) that returns a `tf.data.Dataset`. + `tf.keras.utils.experimental.DatasetCreator` is designated as a supported type + for `x`, or the input, in `tf.keras.Model.fit`. Pass an instance of this class + to `fit` when using a callable (with a `input_context` argument) that returns + a `tf.data.Dataset`. ```python model = tf.keras.Sequential([tf.keras.layers.Dense(10)]) @@ -45,6 +48,22 @@ def dataset_fn(input_context): model.fit(DatasetCreator(dataset_fn), epochs=10, steps_per_epoch=10) ``` + `Model.fit` usage with `DatasetCreator` is intended to work across all + `tf.distribute.Strategy`s, as long as `Strategy.scope` is used at model + creation: + + ```python + strategy = tf.distribute.experimental.ParameterServerStrategy( + cluster_resolver) + with strategy.scope(): + model = tf.keras.Sequential([tf.keras.layers.Dense(10)]) + model.compile(tf.keras.optimizers.SGD(), loss="mse") + ... + ``` + + Note: When using `DatasetCreator`, `steps_per_epoch` argument in `Model.fit` + must be provided as the cardinality of such input cannot be inferred. + Args: dataset_fn: A callable that takes a single argument of type `tf.distribute.InputContext`, which is used for batch size calculation and diff --git a/keras/utils/np_utils.py b/keras/utils/np_utils.py index de724edd26d6..569f8a6c35e0 100644 --- a/keras/utils/np_utils.py +++ b/keras/utils/np_utils.py @@ -83,24 +83,15 @@ def to_categorical(y, num_classes=None, dtype='float32'): @keras_export('keras.utils.normalize') def normalize(x, axis=-1, order=2): - """Normalizes a NumPy array. + """Normalizes a Numpy array. Args: - x: NumPy array to normalize. - axis: Axis along which to normalize. For instance, `axis=-1` corresponds - to feature-wise normalization. - order: Normalization order (e.g. `order=2` for the L2 norm). + x: Numpy array to normalize. + axis: axis along which to normalize. + order: Normalization order (e.g. `order=2` for L2 norm). Returns: A normalized copy of the array. - - Example: - - >>> array = np.random.random(size=(32, 3)) - >>> normalized_array = tf.keras.utils.normalize(array, axis=-1) - >>> # Every element in the batch has now a unit norm - >>> for i in range(32): - >>> np.testing.assert_allclose(np.square(normalized_array[i, :]).sum(), 1) """ l2 = np.atleast_1d(np.linalg.norm(x, order, axis)) l2[l2 == 0] = 1