
Add DBRX Model #29921

Merged: 137 commits, Apr 18, 2024

Changes from 3 commits (the diffs below show an early 3-commit snapshot of the branch, not the final merged state).

Commits (137)
7042915
wip
abhi-databricks Mar 27, 2024
c7dda8c
fix __init__.py
abhi-databricks Mar 27, 2024
18495d0
add docs
abhi-databricks Mar 27, 2024
292836b
Apply suggestions from code review
abhi-mosaic Mar 28, 2024
a27c69a
address comments 1
abhi-databricks Mar 28, 2024
5417623
work on make fixup
abhi-databricks Mar 28, 2024
46b45c1
pass configs down
abhi-databricks Mar 28, 2024
76c2e9c
add sdpa attention
abhi-databricks Mar 28, 2024
4e74661
remove DbrxBlock
Mar 29, 2024
120df40
add to configuration_auto
Mar 29, 2024
56d841e
docstring now passes formatting test
Mar 29, 2024
450ae2d
fix style
Mar 29, 2024
cec7356
update READMEs
Mar 29, 2024
b5d4a6e
add dbrx to modeling_auto
Mar 29, 2024
3d9fd16
make fix-copies generated this
Mar 29, 2024
2bff6b9
add DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP
Mar 29, 2024
ea940a6
config docstring passes formatting test
Mar 29, 2024
990f196
rename moe_loss_weight to router_aux_loss_coef
Mar 29, 2024
4a6f47a
add to flash-attn documentation
Mar 29, 2024
9268388
fix model-path in tests
Mar 29, 2024
54d98a4
Explicitly make `"suli"` the default `ffn_act_fn`
eitanturok Mar 31, 2024
370f578
default to using router_aux_loss_coef over ffn_config[moe_loss_weight]
Mar 31, 2024
7aba29f
fix _flash_attn_uses_top_left_mask and is_causal
Mar 31, 2024
9475675
fix tests path
Mar 31, 2024
3450fd1
don't use token type IDs
Mar 31, 2024
46c9547
follow Llama and remove token_type_ids from test
Mar 31, 2024
0ed3675
init ConfigTester differently so tests pass
Mar 31, 2024
a08b27d
remove multiple choice test
Mar 31, 2024
c98d9f2
remove question + answer test
Mar 31, 2024
598c9a0
remove sequence classification test
Mar 31, 2024
c73c590
remove token classification test
Mar 31, 2024
e58f1b2
copy Llama tests and remove token_type_ids from test inputs
Mar 31, 2024
32ceb87
do not test pruning or headmasking; style code
Mar 31, 2024
daabaec
add _tied_weights_keys parameter to pass test
Mar 31, 2024
dabcca0
add type hints
Mar 31, 2024
191ec1e
fix type check
Mar 31, 2024
3dad3bd
update config tester
Mar 31, 2024
5c837c9
remove masked_lm test
Mar 31, 2024
58a4f15
remove encoder tests
Mar 31, 2024
60662cb
initialize DbrxModelTester with correct params
Apr 1, 2024
e829922
style
Apr 1, 2024
7cca86a
torch_dtype does not rely on torch
Apr 1, 2024
1e21729
run make fixup, fix-copies
Apr 1, 2024
7e4b7fd
use https://huggingface.co/v2ray/dbrx-base-fixed/blob/main/modeling_d…
Apr 1, 2024
4d9da54
add copyright info
Apr 1, 2024
9b8f912
fix imports and DbrxRotaryEmbedding
Apr 1, 2024
3692a90
update DbrxModel docstring
Apr 1, 2024
f050499
use copies
Apr 1, 2024
a075df2
change model path in docstring
Apr 1, 2024
1dc3073
use config in DbrxFFN
Apr 1, 2024
7df8369
fix flashattention2, sdpaattention
Apr 1, 2024
aa8c55d
input config to DbrXAttention, DbrxNormAttentionNorm
Apr 1, 2024
4b01cdc
more fixes
Apr 1, 2024
1c5816e
fix
Apr 1, 2024
09f601e
fix again!
Apr 1, 2024
5a52bb9
add informative comment
Apr 1, 2024
cc6e5d8
fix ruff?
Apr 1, 2024
4c5e127
remove print statement + style
Apr 1, 2024
0f562aa
change doc-test
Apr 1, 2024
62a512e
fix doc-test
Apr 1, 2024
aae8045
fix docstring
Apr 1, 2024
c3870bc
delete commented out text
Apr 1, 2024
efd10b8
make defaults match dbrx-instruct
Apr 1, 2024
ea836a8
replace `router_aux_loss_coef` with `moe_loss_weight`
Apr 1, 2024
c46e06b
is_decoder=True
Apr 1, 2024
aab6fd6
remove is_decoder from configtester
Apr 1, 2024
179834b
implement sdpa properly
Apr 1, 2024
f053b7b
make is_decoder pass tests
Apr 1, 2024
cdea470
start on the GenerationTesterMixin tests
Apr 1, 2024
b7dafdd
add dbrx to sdpa documentation
Apr 1, 2024
351bff2
skip weight typing test
Apr 1, 2024
fca26d4
style
Apr 1, 2024
cfef3ec
initialize smaller model
eitanturok Apr 2, 2024
d0f7bef
Add DBRX to toctree
Rocketknight1 Apr 2, 2024
99dcef7
skip test_new_cache_format
Apr 2, 2024
fb5ed67
make config defaults smaller again
Apr 2, 2024
24b28b5
add pad_token_id
Apr 2, 2024
f57e672
remove pad_token_id from config
Apr 2, 2024
6b6655d
Remove all references to DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP
Rocketknight1 Apr 3, 2024
88f350f
Update src/transformers/models/dbrx/__init__.py
eitanturok Apr 3, 2024
a6c21eb
Update src/transformers/models/dbrx/modeling_dbrx.py
eitanturok Apr 3, 2024
a91e45f
Update docs/source/en/model_doc/dbrx.md
eitanturok Apr 3, 2024
5397d42
Update src/transformers/models/dbrx/configuration_dbrx.py
eitanturok Apr 4, 2024
ea571d1
Update docs/source/en/model_doc/dbrx.md
eitanturok Apr 4, 2024
2e12f34
fix typo
Apr 10, 2024
a5bebcb
Apply suggestions from code review
abhi-mosaic Apr 10, 2024
331db58
update docs, fix configuration_auto.py
abhi-databricks Apr 10, 2024
ce758df
address pr comments
abhi-databricks Apr 10, 2024
8df05d9
remove is_decoder flag
Apr 11, 2024
0c80857
Merge branch 'main' into dbrx
abhi-databricks Apr 11, 2024
c89f1a1
slice
Apr 11, 2024
dbd8b14
fix requires grad
Apr 11, 2024
a7ee563
remove grad
Apr 12, 2024
2e3bd86
disconnect differently
Apr 12, 2024
826947d
remove grad
Apr 12, 2024
35aca3a
enable grads
Apr 12, 2024
7ffb9f8
patch
Apr 13, 2024
a8237bd
detach expert
Apr 13, 2024
99eba88
nissan al ghaib
Apr 14, 2024
2e774b5
Merge branch 'dbrx' into mvpatel2000/dbrx-chunk
mvpatel2000 Apr 14, 2024
ab9d85f
Update modeling_dbrx.py
mvpatel2000 Apr 15, 2024
8c320f1
Merge pull request #4 from mvpatel2000/mvpatel2000/dbrx-chunk
mvpatel2000 Apr 15, 2024
43976c0
Update src/transformers/models/dbrx/modeling_dbrx.py
eitanturok Apr 15, 2024
2980330
replace "Gemma" with "Dbrx"
Apr 15, 2024
8e28942
remove # type: ignore
Apr 15, 2024
b265e23
don't hardcode vocab_size
Apr 15, 2024
2ab56d3
remove ToDo
Apr 15, 2024
8ea3258
Merge branch 'main' into dbrx
eitanturok Apr 15, 2024
dc30f2c
Re-add removed idefics2 line
Rocketknight1 Apr 16, 2024
3771843
Update test to use tiny-random!
Rocketknight1 Apr 17, 2024
661bf9d
Remove TODO
Rocketknight1 Apr 17, 2024
8dd5de7
Remove one more case of loading the entire dbrx-instruct in the tests
Rocketknight1 Apr 17, 2024
12cd8c8
Update src/transformers/models/dbrx/modeling_dbrx.py
eitanturok Apr 18, 2024
834801f
address some comments
Apr 18, 2024
e281916
small model
Apr 18, 2024
63b3db8
add dbrx to tokenization_auto
Apr 18, 2024
236d815
More docstrings with add_start_docstrings
Apr 18, 2024
2dc5445
Dbrx for now
Apr 18, 2024
745dc47
add PipelineTesterMixin
Apr 18, 2024
d115cb4
Update src/transformers/models/dbrx/configuration_dbrx.py
eitanturok Apr 18, 2024
15fb1eb
remove flash-attn2 import error
Apr 18, 2024
7c3cc3b
Merge branch 'dbrx' of https://github.com/abhi-mosaic/transformers in…
Apr 18, 2024
29c3e4d
fix docstring
eitanturok Apr 18, 2024
9608197
add useage example
Apr 18, 2024
93920d0
put on one line
eitanturok Apr 18, 2024
cad0b9d
fix ffn_act_fn
eitanturok Apr 18, 2024
49bcacc
change "dbrx" to "DBRX" for display purposes.
Apr 18, 2024
9e26850
fix __init__.py?
eitanturok Apr 18, 2024
d714986
fix __init__.py
eitanturok Apr 18, 2024
cac26a1
fix README
Apr 18, 2024
fe12d2a
return the aux_loss
Apr 18, 2024
58c8342
remove extra spaces
Apr 18, 2024
d04c870
fix configuration_auto.py
eitanturok Apr 18, 2024
22804bf
fix format in tokenization_auto
eitanturok Apr 18, 2024
95b327f
remove new line
Apr 18, 2024
c6cbbda
add more useage examples
Apr 18, 2024
8ee48c9
Merge branch 'main' into dbrx
eitanturok Apr 18, 2024
docs/source/en/model_doc/dbrx.md: 52 additions, 0 deletions
@@ -0,0 +1,52 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# DBRX

## Overview

DBRX is a [transformer-based](https://www.isattentionallyouneed.com/) decoder-only large language model (LLM) that was trained using next-token prediction.
It uses a *fine-grained* mixture-of-experts (MoE) architecture with 132B total parameters of which 36B parameters are active on any input.
It was pre-trained on 12T tokens of text and code data.
Compared to other open MoE models like Mixtral-8x7B and Grok-1, DBRX is fine-grained, meaning it uses a larger number of smaller experts. DBRX has 16 experts and chooses 4, while Mixtral-8x7B and Grok-1 have 8 experts and choose 2.
This provides 65x more possible combinations of experts, and we found that this improves model quality.
DBRX uses rotary position encodings (RoPE), gated linear units (GLU), and grouped query attention (GQA).
It uses the GPT-4 tokenizer as described in the [tiktoken](https://github.com/openai/tiktoken) repository.
We made these choices based on exhaustive evaluation and scaling experiments.
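
The 65x figure is straightforward binomial arithmetic: choosing 4 of 16 experts gives C(16, 4) = 1820 routing combinations per layer, versus C(8, 2) = 28 for an 8-expert top-2 model, and 1820 / 28 = 65. A quick check in Python:

```python
import math

# Ways to route a token when `top_k` of `n_experts` are active in a layer.
dbrx_combos = math.comb(16, 4)    # 16 experts, 4 active -> 1820
mixtral_combos = math.comb(8, 2)  # 8 experts, 2 active  -> 28

print(dbrx_combos / mixtral_combos)  # 65.0
```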

DBRX was pretrained on 12T tokens of carefully curated data with a maximum context length of 32K tokens.
We estimate that this data is at least 2x better token-for-token than the data we used to pretrain the MPT family of models.
This new dataset was developed using the full suite of Databricks tools, including Apache Spark™ and Databricks notebooks for data processing, and Unity Catalog for data management and governance.
We used curriculum learning for pretraining, changing the data mix during training in ways we found to substantially improve model quality.


More detailed information about DBRX Instruct and DBRX Base can be found in our [technical blog post](https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm).


This model was contributed by [abhi-db](https://huggingface.co/abhi-db). The original code can be found [here](https://github.com/databricks/dbrx).

> **Review comment (Collaborator):** If the model can use SDPA and flash attention, this should be added to the model page alongside a graph of expected speedups, e.g. like here.
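
Both backends do land in this PR (see the "add sdpa attention" and flash-attention commits above). A minimal loading sketch using the standard `attn_implementation` argument; treat the exact flags as illustrative rather than as the final documented usage:

```python
import torch
from transformers import AutoModelForCausalLM

# Select the attention backend at load time: "sdpa" (PyTorch scaled-dot-product
# attention) or "flash_attention_2" (requires the flash-attn package).
model = AutoModelForCausalLM.from_pretrained(
    "databricks/dbrx-instruct",
    torch_dtype=torch.bfloat16,
    attn_implementation="sdpa",
    device_map="auto",
)
```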

## DbrxConfig

[[autodoc]] DbrxConfig
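
A minimal configuration sketch. The argument names below (`d_model`, `n_heads`, `n_layers`, `max_seq_len`) follow the MPT-style naming this PR uses in `configuration_dbrx.py`, and the tiny values are hypothetical, chosen for smoke-testing rather than matching any released checkpoint:

```python
from transformers import DbrxConfig, DbrxModel

# Hypothetical toy-sized config; the real DBRX has 132B total / 36B active parameters.
config = DbrxConfig(d_model=256, n_heads=4, n_layers=2, max_seq_len=512)
model = DbrxModel(config)
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")
```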


## DbrxModel

[[autodoc]] DbrxModel
- forward


## DbrxForCausalLM

[[autodoc]] DbrxForCausalLM
- forward
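
A minimal generation sketch in the spirit of the usage examples added near the end of this PR (the "add useage example" commits). The full checkpoint is very large, so `device_map="auto"` and substantial GPU memory are assumed:

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "databricks/dbrx-instruct", torch_dtype=torch.bfloat16, device_map="auto"
)

inputs = tokenizer("What does it take to build a great LLM?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```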

src/transformers/__init__.py: 17 additions, 0 deletions
@@ -131,6 +131,7 @@
],
"models": [],
# Models
"models.dbrx": ["DbrxConfig"],
"models.albert": ["ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "AlbertConfig"],
"models.align": [
"ALIGN_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -1442,6 +1443,15 @@

# PyTorch models structure

_import_structure["models.dbrx"].extend(
[
"DbrxForCausalLM",
"DbrxBlock",
"DbrxModel",
"DbrxPreTrainedModel",
]
)

_import_structure["models.albert"].extend(
[
"ALBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -7786,6 +7796,13 @@
)

# PyTorch model imports

from .models.dbrx import (
DbrxForCausalLM,
DbrxBlock,
DbrxModel,
DbrxPreTrainedModel,
)
from .models.seamless_m4t import (
SEAMLESS_M4T_PRETRAINED_MODEL_ARCHIVE_LIST,
SeamlessM4TCodeHifiGan,
src/transformers/models/dbrx/__init__.py: 65 additions, 0 deletions
@@ -0,0 +1,65 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING

from ...utils import _LazyModule, OptionalDependencyNotAvailable
from ...utils import is_torch_available




_import_structure = {
"configuration_dbrx": ["DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP", "DbrxConfig"],
}

try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_dbrx"] = [
"DBRX_PRETRAINED_MODEL_ARCHIVE_LIST",
"DbrxForCausalLM",
"DbrxBlock",
"DbrxModel",
"DbrxPreTrainedModel",
]




if TYPE_CHECKING:
from .configuration_dbrx import DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP, DbrxConfig

try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_dbrx import (
DBRX_PRETRAINED_MODEL_ARCHIVE_LIST,
DbrxForCausalLM,
DbrxBlock,
DbrxModel,
DbrxPreTrainedModel,
)



else:
import sys

sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
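
This follows the standard transformers lazy-module pattern: `_LazyModule` defers the real imports until a name is first accessed, so `import transformers` stays cheap, and the torch-only model classes are simply not registered when torch is absent. A sketch of the resulting import surface as of this commit (note that `DbrxBlock` is removed later in the PR):

```python
# Resolved lazily through _LazyModule; no torch required for the config.
from transformers.models.dbrx import DbrxConfig

# Importing a model class triggers the actual modeling_dbrx import,
# which fails with an ImportError if torch is unavailable.
from transformers.models.dbrx import DbrxModel, DbrxForCausalLM
```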