Docs: formatting nits #32247

Merged (6 commits, Jul 30, 2024)
Changes from all commits
2 changes: 1 addition & 1 deletion docs/source/en/model_sharing.md
@@ -77,7 +77,7 @@ Then use `notebook_login` to sign-in to the Hub, and follow the link [here](http

To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly.

Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework.
Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework.

<frameworkcontent>
<pt>
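The paragraph above describes exporting both PyTorch and TensorFlow checkpoints. As a rough illustration of that workflow (a minimal sketch; `my-awesome-model` is a placeholder checkpoint name, not anything from this PR):

```python
from transformers import AutoModel, TFAutoModel

# Load the original PyTorch weights, then re-load them in TensorFlow with `from_pt=True`,
# which converts the checkpoint on the fly so a native TF checkpoint can be saved or pushed.
pt_model = AutoModel.from_pretrained("my-awesome-model")
tf_model = TFAutoModel.from_pretrained("my-awesome-model", from_pt=True)

# Pushing both versions spares downstream users the on-the-fly conversion cost.
pt_model.push_to_hub("my-awesome-model")
tf_model.push_to_hub("my-awesome-model")
```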
2 changes: 1 addition & 1 deletion scripts/benchmark/trainer-benchmark.py
@@ -147,7 +147,7 @@ def get_original_command(max_width=80, full_python_path=False):
Return the original command line string that can be replayed nicely and wrapped for 80 char width.

Args:
max_width (`int`, `optional`, defaults to 80):
max_width (`int`, *optional*, defaults to 80):
The width to wrap for.
full_python_path (`bool`, `optional`, defaults to `False`):
Whether to replicate the full path or just the last segment (i.e. `python`).
2 changes: 1 addition & 1 deletion src/transformers/agents/evaluate_agent.py
@@ -113,7 +113,7 @@ class Problem:
The inputs that will be fed to the tools. For this testing environment, only strings are accepted as
values. Pass along a dictionary when you want to specify the values of each inputs, or just the list of
inputs expected (the value used will be `<<input_name>>` in this case).
answer (`str` or `list[str`]):
answer (`str` or `list[str]`):
The theoretical answer (or list of possible valid answers) to the problem, as code.
"""

2 changes: 1 addition & 1 deletion src/transformers/audio_utils.py
@@ -663,7 +663,7 @@ def spectrogram_batch(
Specifies log scaling strategy; options are None, "log", "log10", "dB".
reference (`float`, *optional*, defaults to 1.0):
Reference value for dB conversion in log_mel.
min_value (`float`, °optional*, defaults to 1e-10):
min_value (`float`, *optional*, defaults to 1e-10):
Minimum floor value for log scale conversions.
db_range (`float`, *optional*):
Dynamic range for dB scale spectrograms.
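For context on how the `reference`, `min_value`, and `db_range` arguments documented above usually interact, here is a hedged sketch of a power-to-dB conversion; the helper name is illustrative, not the library API:

```python
import numpy as np

def power_to_db_sketch(spectrogram, reference=1.0, min_value=1e-10, db_range=None):
    # Floor both the spectrogram and the reference so log10 never sees zero.
    log_spec = 10.0 * np.log10(np.maximum(min_value, spectrogram))
    log_spec -= 10.0 * np.log10(np.maximum(min_value, reference))
    if db_range is not None:
        # Keep only the top `db_range` decibels relative to the peak.
        log_spec = np.maximum(log_spec, log_spec.max() - db_range)
    return log_spec

db = power_to_db_sketch(np.random.rand(80, 100) ** 2, db_range=80.0)
```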
8 changes: 4 additions & 4 deletions src/transformers/cache_utils.py
@@ -539,7 +539,7 @@ class QuantoQuantizedCache(QuantizedCache):
Quantized Cache class that uses `quanto` as a backend to perform quantization. Current implementation supports `int2` and `int4` dtypes only.

Parameters:
cache_config (`QuantizedCacheConfig`,):
cache_config (`QuantizedCacheConfig`):
A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size.
"""

@@ -580,7 +580,7 @@ class HQQQuantizedCache(QuantizedCache):
Quantized Cache class that uses `HQQ` as a backend to perform quantization. Current implementation supports `int2`, `int4`, `int8` dtypes.

Parameters:
cache_config (`QuantizedCacheConfig`,):
cache_config (`QuantizedCacheConfig`):
A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size.
"""

@@ -790,7 +790,7 @@ class StaticCache(Cache):
Static Cache class to be used with `torch.compile(model)`.

Parameters:
config (`PretrainedConfig):
config (`PretrainedConfig`):
The configuration file defining the shape-related attributes required to initialize the static cache.
max_batch_size (`int`):
The maximum batch size with which the model will be used.
@@ -914,7 +914,7 @@ class SlidingWindowCache(StaticCache):
We overwrite the cache using these, then we always write at cache_position (clamped to `sliding_window`)

Parameters:
config (`PretrainedConfig):
config (`PretrainedConfig`):
The configuration file defining the shape-related attributes required to initialize the static cache.
max_batch_size (`int`):
The maximum batch size with which the model will be used.
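As a usage note for the cache classes touched above, recent releases expose them through `generate`; treat the exact keyword names below as assumptions that may differ between versions:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The KV cache can be", return_tensors="pt")

# Quantized KV cache backed by quanto (int4), i.e. the QuantoQuantizedCache path.
out = model.generate(**inputs, max_new_tokens=20,
                     cache_implementation="quantized",
                     cache_config={"backend": "quanto", "nbits": 4})

# Fixed-shape StaticCache, intended for use together with torch.compile(model).
out = model.generate(**inputs, max_new_tokens=20, cache_implementation="static")
```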
2 changes: 1 addition & 1 deletion src/transformers/image_transforms.py
@@ -225,7 +225,7 @@ def get_resize_output_image_size(
Args:
input_image (`np.ndarray`):
The image to resize.
size (`int` or `Tuple[int, int]` or List[int] or Tuple[int]):
size (`int` or `Tuple[int, int]` or List[int] or `Tuple[int]`):
The size to use for resizing the image. If `size` is a sequence like (h, w), output size will be matched to
this.

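To make the `size` behaviour documented above concrete, a small sketch (the keyword names beyond `size` are assumptions based on the docstring):

```python
import numpy as np
from transformers.image_transforms import get_resize_output_image_size

image = np.zeros((3, 480, 640), dtype=np.uint8)  # (channels, height, width)

# An int requests a shortest-edge resize that keeps the aspect ratio,
# while an explicit (h, w) tuple is used as-is.
print(get_resize_output_image_size(image, size=256, default_to_square=False))  # roughly (256, 341)
print(get_resize_output_image_size(image, size=(256, 256)))                    # (256, 256)
```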
2 changes: 1 addition & 1 deletion src/transformers/integrations/integration_utils.py
@@ -1389,7 +1389,7 @@ class NeptuneCallback(TrainerCallback):
You can find and copy the name in Neptune from the project settings -> Properties. If None (default), the
value of the `NEPTUNE_PROJECT` environment variable is used.
name (`str`, *optional*): Custom name for the run.
base_namespace (`str`, optional, defaults to "finetuning"): In the Neptune run, the root namespace
base_namespace (`str`, *optional*, defaults to "finetuning"): In the Neptune run, the root namespace
that will contain all of the metadata logged by the callback.
log_parameters (`bool`, *optional*, defaults to `True`):
If True, logs all Trainer arguments and model parameters provided by the Trainer.
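For orientation, wiring the callback into a `Trainer` looks roughly like this (a sketch: the project name is a placeholder, `NEPTUNE_API_TOKEN` is assumed to be set, and `model`/datasets are assumed to exist elsewhere):

```python
from transformers import Trainer, TrainingArguments
from transformers.integrations import NeptuneCallback

neptune_callback = NeptuneCallback(
    project="my-workspace/my-project",  # or rely on the NEPTUNE_PROJECT env var
    name="my-finetuning-run",
    base_namespace="finetuning",        # root namespace for everything the callback logs
    log_parameters=True,
)

trainer = Trainer(
    model=model,  # assumed to be defined elsewhere
    args=TrainingArguments(output_dir="out", report_to="none"),
    callbacks=[neptune_callback],
)
```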
2 changes: 1 addition & 1 deletion src/transformers/modeling_attn_mask_utils.py
@@ -266,7 +266,7 @@ def _ignore_causal_mask_sdpa(
# or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is hard-coded. If a user exports a model with q_len > 1, the exported model will hard-code `is_causal=True` which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108).
# Thus, we only set `ignore_causal_mask = True` if the model is set to training.
#
# Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` (`TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor`).
# Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` ("TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor").
if (
(is_training or not is_tracing)
and (query_length == 1 or key_value_length == query_length)
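The comment above enables the mask-skipping path only when `is_causal` can safely replace the mask. A minimal sketch of what the SDPA call then looks like, and why `is_causal` must be a plain Python bool:

```python
import torch
import torch.nn.functional as F

q = torch.randn(1, 8, 5, 64)  # (batch, heads, q_len, head_dim)
k = torch.randn(1, 8, 5, 64)
v = torch.randn(1, 8, 5, 64)

# With the mask dropped, causality is conveyed through `is_causal` instead.
# `q.shape[-2] > 1` is a Python bool here; under jit.trace the comparison can become
# a Tensor, which is exactly what triggers the quoted TypeError.
out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=q.shape[-2] > 1)
```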
6 changes: 3 additions & 3 deletions src/transformers/modeling_flash_attention_utils.py
@@ -39,7 +39,7 @@ def _get_unpad_data(attention_mask: torch.Tensor) -> Tuple[torch.Tensor, torch.T
Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

Return:
indices (`torch.Tensor):
indices (`torch.Tensor`):
The indices of non-masked tokens from the flattened input sequence.
cu_seqlens (`torch.Tensor`):
The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,).
@@ -83,7 +83,7 @@ def _upad_input(
Target length.

Return:
query_layer (`torch.Tensor):
query_layer (`torch.Tensor`):
Query state without padding. Shape: (total_target_length, num_heads, head_dim).
key_layer (`torch.Tensor`):
Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
@@ -149,7 +149,7 @@ def prepare_fa2_from_position_ids(query, key, value, position_ids):
Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid.

Return:
query (`torch.Tensor):
query (`torch.Tensor`):
Query state without padding. Shape: (total_target_length, num_heads, head_dim).
key (`torch.Tensor`):
Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim).
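As background for the `indices`/`cu_seqlens` values documented in these hunks, a hedged sketch of how they are typically derived from a padding mask (1 = real token, 0 = padding):

```python
import torch
import torch.nn.functional as F

attention_mask = torch.tensor([[1, 1, 1, 0, 0],
                               [1, 1, 1, 1, 1]])

seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)             # tensor([3, 5])
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()  # positions of real tokens
max_seqlen_in_batch = int(seqlens_in_batch.max())                            # 5
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))  # tensor([0, 3, 8])

# `indices` gathers the unpadded tokens out of a flattened (batch * seq_len, ...) tensor,
# and `cu_seqlens` marks where each ragged sequence starts and ends, shape (batch_size + 1,).
```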
4 changes: 2 additions & 2 deletions src/transformers/modeling_tf_utils.py
@@ -1444,7 +1444,7 @@ def prepare_tf_dataset(
Args:
dataset (`Any`):
A [~`datasets.Dataset`] to be wrapped as a `tf.data.Dataset`.
batch_size (`int`, defaults to 8):
batch_size (`int`, *optional*, defaults to 8):
The size of batches to return.
shuffle (`bool`, defaults to `True`):
Whether to return samples from the dataset in random order. Usually `True` for training datasets and
@@ -3442,7 +3442,7 @@ class TFSequenceSummary(keras.layers.Layer):
- **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation.
- **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation.

initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation to use to initialize the weights.
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`.
"""
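A usage sketch for the `prepare_tf_dataset` arguments documented above (the checkpoint and dataset are arbitrary examples, not anything this PR touches):

```python
from datasets import load_dataset
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

ds = load_dataset("glue", "sst2", split="train[:1%]")
ds = ds.map(lambda batch: tokenizer(batch["sentence"], truncation=True), batched=True)

# Wraps the datasets.Dataset as a tf.data.Dataset with dynamically padded batches.
tf_train_ds = model.prepare_tf_dataset(ds, batch_size=8, shuffle=True, tokenizer=tokenizer)
model.compile(optimizer="adam")  # TF transformers models can compile without an explicit loss
model.fit(tf_train_ds, epochs=1)
```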
@@ -105,10 +105,10 @@ class AutoformerConfig(PretrainedConfig):
label_length (`int`, *optional*, defaults to 10):
Start token length of the Autoformer decoder, which is used for direct multi-step prediction (i.e.
non-autoregressive generation).
moving_average (`int`, defaults to 25):
moving_average (`int`, *optional*, defaults to 25):
The window size of the moving average. In practice, it's the kernel size in AvgPool1d of the Decomposition
Layer.
autocorrelation_factor (`int`, defaults to 3):
autocorrelation_factor (`int`, *optional*, defaults to 3):
"Attention" (i.e. AutoCorrelation mechanism) factor which is used to find top k autocorrelations delays.
It's recommended in the paper to set it to a number between 1 and 5.

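To see the documented defaults in context, a hedged configuration sketch (the prediction and context lengths are arbitrary choices for illustration):

```python
from transformers import AutoformerConfig, AutoformerForPrediction

config = AutoformerConfig(
    prediction_length=24,      # forecasting horizon (illustrative value)
    context_length=48,
    label_length=10,           # start-token length for non-autoregressive decoding
    moving_average=25,         # kernel size of the decomposition layer's AvgPool1d
    autocorrelation_factor=3,  # top-k factor for the AutoCorrelation mechanism
)
model = AutoformerForPrediction(config)
```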
2 changes: 1 addition & 1 deletion src/transformers/models/bert/modeling_bert.py
@@ -1219,7 +1219,7 @@ def forward(

- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.

Returns:
2 changes: 1 addition & 1 deletion src/transformers/models/bert/modeling_tf_bert.py
@@ -1291,7 +1291,7 @@ def call(

- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.

Return:
2 changes: 1 addition & 1 deletion src/transformers/models/big_bird/modeling_big_bird.py
@@ -2290,7 +2290,7 @@ def forward(

- 0 indicates sequence B is a continuation of sequence A,
- 1 indicates sequence B is a random sequence.
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.

Returns:
20 changes: 10 additions & 10 deletions src/transformers/models/bloom/modeling_bloom.py
@@ -56,7 +56,7 @@ def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torc
Returns tensor shaped (batch_size * num_heads, 1, max_seq_len)
attention_mask (`torch.Tensor`):
Token-wise attention mask, this should be of shape (batch_size, max_seq_len).
num_heads (`int`, *required*):
num_heads (`int`):
number of heads
dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`):
dtype of the output tensor
@@ -93,13 +93,13 @@ def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training:
Dropout add function

Args:
x (`torch.tensor`, *required*):
x (`torch.tensor`):
input tensor
residual (`torch.tensor`, *required*):
residual (`torch.tensor`):
residual tensor
prob (`float`, *required*):
prob (`float`):
dropout probability
training (`bool`, *required*):
training (`bool`):
training mode
"""
out = F.dropout(x, p=prob, training=training)
@@ -113,7 +113,7 @@ def bloom_gelu_forward(x: torch.Tensor) -> torch.Tensor:
make the model jitable.

Args:
x (`torch.tensor`, *required*):
x (`torch.tensor`):
input hidden states
"""
return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
@@ -125,9 +125,9 @@ def bloom_gelu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor:
0.3989423 * x * torch.exp(-0.5 * x * x)

Args:
g (`torch.tensor`, *required*):
g (`torch.tensor`):
gradient output tensor
x (`torch.tensor`, *required*):
x (`torch.tensor`):
input tensor
"""
x = x[0] # x is a tuple of 1 element, needs to unpack it first
@@ -202,7 +202,7 @@ def _split_heads(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Ten
storage as `fused_qkv`

Args:
fused_qkv (`torch.tensor`, *required*): [batch_size, seq_length, num_heads * 3 * head_dim]
fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

Returns:
query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
@@ -217,7 +217,7 @@ def _merge_heads(self, x: torch.Tensor) -> torch.Tensor:
Merge heads together over the last dimension

Args:
x (`torch.tensor`, *required*): [batch_size * num_heads, seq_length, head_dim]
x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

Returns:
torch.tensor: [batch_size, seq_length, num_heads * head_dim]
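The `bloom_gelu_forward`/`bloom_gelu_back` pair above hand-codes the tanh approximation of GELU so it stays jit-friendly; a quick numerical check against PyTorch's built-in approximation shows they agree up to float32 rounding:

```python
import torch

x = torch.linspace(-4, 4, steps=101)
bloom_gelu = x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)))
reference = torch.nn.functional.gelu(x, approximate="tanh")
print(torch.max(torch.abs(bloom_gelu - reference)))  # tiny (float32 rounding): numerically the same function
```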
@@ -247,7 +247,7 @@ def resize(
Image to resize.
size (`Dict[str, int]`):
Controls the size of the output image. Should be of the form `{"shortest_edge": int}`.
size_divisor (`int`, defaults to 32):
size_divisor (`int`, *optional*, defaults to 32):
The image is resized to a size that is a multiple of this value.
resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use when resizing the image.
2 changes: 1 addition & 1 deletion src/transformers/models/camembert/modeling_camembert.py
@@ -972,7 +972,7 @@ def forward(
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
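The `-100` labelling convention referenced in this hunk (and in the BERT/Data2Vec ones) works because the loss is computed with `ignore_index=-100`; a minimal sketch with made-up token ids:

```python
import torch

input_ids = torch.tensor([[101, 2023, 2003, 103, 102]])  # 103 standing in for a [MASK] token
labels = torch.tensor([[-100, -100, -100, 2307, -100]])  # supervise only the masked position

vocab_size = 30522
logits = torch.randn(1, 5, vocab_size)
loss = torch.nn.functional.cross_entropy(
    logits.view(-1, vocab_size), labels.view(-1), ignore_index=-100
)
```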
2 changes: 1 addition & 1 deletion src/transformers/models/clvp/feature_extraction_clvp.py
@@ -173,7 +173,7 @@ def __call__(
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return Numpy `np.ndarray` objects.
padding_value (`float`, defaults to 0.0):
padding_value (`float`, *optional*, defaults to 0.0):
The value that is used to fill the padding values / vectors.
max_length (`int`, *optional*):
The maximum input length of the inputs.
4 changes: 2 additions & 2 deletions src/transformers/models/convnext/configuration_convnext.py
@@ -41,9 +41,9 @@ class ConvNextConfig(BackboneConfigMixin, PretrainedConfig):
Args:
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
patch_size (`int`, optional, defaults to 4):
patch_size (`int`, *optional*, defaults to 4):
Patch size to use in the patch embedding layer.
num_stages (`int`, optional, defaults to 4):
num_stages (`int`, *optional*, defaults to 4):
The number of stages in the model.
hidden_sizes (`List[int]`, *optional*, defaults to [96, 192, 384, 768]):
Dimensionality (hidden size) at each stage.
@@ -35,9 +35,9 @@ class ConvNextV2Config(BackboneConfigMixin, PretrainedConfig):
Args:
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
patch_size (`int`, optional, defaults to 4):
patch_size (`int`, *optional*, defaults to 4):
Patch size to use in the patch embedding layer.
num_stages (`int`, optional, defaults to 4):
num_stages (`int`, *optional*, defaults to 4):
The number of stages in the model.
hidden_sizes (`List[int]`, *optional*, defaults to `[96, 192, 384, 768]`):
Dimensionality (hidden size) at each stage.
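The two config hunks above document the same defaults; written out explicitly they look like this (values taken straight from the docstrings):

```python
from transformers import ConvNextConfig, ConvNextV2Config

cfg_v1 = ConvNextConfig(num_channels=3, patch_size=4, num_stages=4,
                        hidden_sizes=[96, 192, 384, 768])
cfg_v2 = ConvNextV2Config(num_channels=3, patch_size=4, num_stages=4,
                          hidden_sizes=[96, 192, 384, 768])
```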
@@ -175,7 +175,7 @@ class TFConvNextV2Layer(keras.layers.Layer):
Model configuration class.
dim (`int`):
Number of input channels.
drop_path (`float`, defaults to 0.0):
drop_path (`float`, *optional*, defaults to 0.0):
Stochastic depth rate.
"""

2 changes: 1 addition & 1 deletion src/transformers/models/data2vec/modeling_data2vec_text.py
@@ -1077,7 +1077,7 @@ def forward(
Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
kwargs (`Dict[str, any]`, optional, defaults to *{}*):
kwargs (`Dict[str, any]`, *optional*, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
12 changes: 6 additions & 6 deletions src/transformers/models/dbrx/configuration_dbrx.py
@@ -37,8 +37,8 @@ class DbrxAttentionConfig(PretrainedConfig):
The dropout probability for the attention layers.
clip_qkv (`float`, *optional*):
If set, clip the queries, keys, and values in the attention layer to this value.
kv_n_heads (`Optional[int]`, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
rope_theta (`float`, defaults to 10000.0): The base frequency for rope.
kv_n_heads (`int`, *optional*, defaults to 1): For grouped_query_attention only, allow user to specify number of kv heads.
rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope.
"""

def __init__(
@@ -92,11 +92,11 @@ class DbrxFFNConfig(PretrainedConfig):
ffn_act_fn (`dict`, *optional*, defaults to `None`): A dict specifying activation function for the FFN.
The dict should have a key 'name' with the value being the name of the activation function along with
any additional keyword arguments. If `None`, then set to `{"name": "silu"}`.
ffn_hidden_size (`int`, defaults to 3584): The hidden size of the feedforward network.
moe_num_experts (`int`, defaults to 4): The number of experts in the mixture of experts layer.
moe_top_k (`int`, defaults to 1): The number of experts to use in the mixture of experts layer.
ffn_hidden_size (`int`, *optional*, defaults to 3584): The hidden size of the feedforward network.
moe_num_experts (`int`, *optional*, defaults to 4): The number of experts in the mixture of experts layer.
moe_top_k (`int`, *optional*, defaults to 1): The number of experts to use in the mixture of experts layer.
moe_jitter_eps (`float`, *optional*, defaults to `None`): If not `None`, the jitter epsilon for the mixture of experts layer.
moe_loss_weight (`float`, defaults to 0.01): The loss weight for the mixture of experts layer.
moe_loss_weight (`float`, *optional*, defaults to 0.01): The loss weight for the mixture of experts layer.
moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
"""

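For reference, the DBRX sub-configs above with their documented defaults spelled out (a sketch; the attention-dropout parameter name, the `clip_qkv` value, and the import path are assumptions):

```python
from transformers.models.dbrx.configuration_dbrx import DbrxAttentionConfig, DbrxFFNConfig

attn_cfg = DbrxAttentionConfig(attn_pdrop=0.0, clip_qkv=8.0, kv_n_heads=1, rope_theta=10000.0)
ffn_cfg = DbrxFFNConfig(ffn_act_fn={"name": "silu"}, ffn_hidden_size=3584,
                        moe_num_experts=4, moe_top_k=1, moe_loss_weight=0.01)
```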