
Device Abstraction IFU #27

Merged: 56 commits, May 10, 2024

Commits (56)
67e7ee3
first draft
stevhliu Mar 26, 2024
e3376ab
toctree
stevhliu Mar 26, 2024
c17fb8e
Fix 4bit quantization with blocksize=4096
matthewdouglas Mar 29, 2024
a471456
fix formatting for install_cuda.py
matthewdouglas Mar 29, 2024
494de20
Bump the minor-patch group with 1 update (#1162)
dependabot[bot] Apr 2, 2024
bed0860
Tests: improve memory usage (#1147)
matthewdouglas Apr 2, 2024
2965c76
CHANGELOG.md: mention accuracy changes when quantizing post v0.42
Titus-von-Koeller Apr 2, 2024
76885a4
Merge pull request #1160 from matthewdouglas/quant4bit-blocksize4096
Titus-von-Koeller Apr 2, 2024
bfe2118
README: include download badges
Titus-von-Koeller Apr 4, 2024
0c64a0d
Merge pull request #1148 from stevhliu/fsdp-qlora
Titus-von-Koeller Apr 5, 2024
b2a85a4
Update matplotlib requirement from ~=3.8.3 to ~=3.8.4 in the major group
dependabot[bot] Apr 8, 2024
c0ad874
Build workflow: Add CUDA 12.4 to build matrix
matthewdouglas Apr 8, 2024
ebac862
Exclude Windows from CUDA 12.4.0 build for now
matthewdouglas Apr 8, 2024
af9a073
Merge pull request #1171 from matthewdouglas/build-cu124
Titus-von-Koeller Apr 9, 2024
0c887b7
Merge pull request #1169 from TimDettmers/dependabot/pip/major-45b123…
Titus-von-Koeller Apr 9, 2024
6be3d0f
[docs] Install from source (#1149)
stevhliu Apr 9, 2024
c54053d
Bump scipy from 1.12.0 to 1.13.0 in the minor-patch group (#1170)
dependabot[bot] Apr 9, 2024
7449d71
[`Core`] Change 8-bit serialization weight format format (#1164)
younesbelkada Apr 10, 2024
d62516f
(backends) Stub out additional backends; move more functions to backe…
matthewdouglas Apr 11, 2024
13ad630
Add int8 ops for Intel CPU & XPU
Xia-Weiwen Apr 11, 2024
4743ff0
CHANGELOG: to reverse chron order + mdformat
Titus-von-Koeller Apr 11, 2024
0c33c0d
ignore CHANGELOG reordering + formatting commit
Titus-von-Koeller Apr 11, 2024
f92c536
CHANGELOG: add v0.43.1
Titus-von-Koeller Apr 11, 2024
4a6fb35
bump version to 0.43.1
Titus-von-Koeller Apr 11, 2024
7b0c4cd
small fix in changelog
Titus-von-Koeller Apr 11, 2024
127788a
bump version to next dev
Titus-von-Koeller Apr 11, 2024
77be40b
Remove XPU code; remove cpu example; add UT
Xia-Weiwen Apr 15, 2024
8d0b695
Fix igemmlt correctness issue
Xia-Weiwen Apr 15, 2024
6cecb65
Update pandas requirement from ~=2.2.1 to ~=2.2.2 in the major group …
dependabot[bot] Apr 17, 2024
ffd7d0d
(docs) integrations: fix omission in bf16 related warning (#1183)
Titus-von-Koeller Apr 17, 2024
67d8661
Bug fix for double_quant
Xia-Weiwen Apr 18, 2024
92900f6
Remove torch.compile for double_quant
Xia-Weiwen Apr 18, 2024
717245d
refine pytest.skip message
Xia-Weiwen Apr 19, 2024
93e04b5
Fix lint issues
Xia-Weiwen Apr 25, 2024
e1b60d3
Fix backward
Xia-Weiwen Apr 26, 2024
5b9ef77
Bump the minor-patch group with 2 updates (#1192)
dependabot[bot] Apr 30, 2024
7f13c8f
merge changes from main
Titus-von-Koeller May 3, 2024
95c29a6
Fix lint issue
Xia-Weiwen May 6, 2024
749e06f
Merge pull request #1173 from matthewdouglas/backend-stubs
Titus-von-Koeller May 6, 2024
b0dec0a
Update bitsandbytes/backends/cpu_xpu_common.py
Xia-Weiwen May 7, 2024
97e41b8
Merge remote-tracking branch 'upstream/multi-backend-refactor' into m…
Xia-Weiwen May 7, 2024
295bb97
Fix lint issue
Xia-Weiwen May 7, 2024
37b0582
Fix lint issue
Xia-Weiwen May 7, 2024
8561f09
Merge pull request #1178 from Xia-Weiwen/multi-backend-refactor-cpu-x…
Titus-von-Koeller May 7, 2024
2af8568
Merge remote-tracking branch 'upstream/multi-backend-refactor' into d…
pnunna93 May 9, 2024
06f6b25
skip linear no igemmlt test
pnunna93 May 9, 2024
2359452
Remove archive functional file
pnunna93 May 9, 2024
f76d6ab
Sync README with upstream
pnunna93 May 9, 2024
576b62c
Remove bnb_accuracy file
pnunna93 May 9, 2024
dfb531b
Remove cuda_setup
pnunna93 May 9, 2024
31b1cbc
Remove test_delete_later.c
pnunna93 May 9, 2024
ed77476
Sync with upstream
pnunna93 May 9, 2024
943c57a
Sync files with upstream
pnunna93 May 9, 2024
71d1702
Fix lint errors
pnunna93 May 10, 2024
6886bc8
Exclude hip files from typo checks
pnunna93 May 8, 2024
0d445f4
update ops.hip
pnunna93 May 10, 2024
3 changes: 3 additions & 0 deletions .git-blame-ignore-revs
@@ -12,3 +12,6 @@ ea7c14f8ef64924f2d0ff80df3cdabf2c7299848

# Reformat with ruff-format
5a4263f4dc05fe8f78f4111beab9f68a81deeab1

# CHANGELOG: to reverse chron order + mdformat
4743ff0d43e04e4cc3e5d8b9e7cd016c0defa36d
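For context, the revisions listed in this file only affect `git blame` when git is told to read it, either per invocation with `git blame --ignore-revs-file .git-blame-ignore-revs <path>` or once per clone via `git config blame.ignoreRevsFile .git-blame-ignore-revs`; GitHub's blame view picks the file up automatically.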
4 changes: 3 additions & 1 deletion .github/workflows/python-package.yml
@@ -63,10 +63,12 @@ jobs:
os: [ubuntu-latest, windows-latest]
arch: [x86_64, aarch64]
cuda_version:
["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2"]
["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.0"]
exclude:
- os: windows-latest # This probably requires arm64 Windows agents
arch: aarch64
- os: windows-latest # The Jimver/cuda-toolkit action used for Windows builds is not updated for 12.4 yet.
cuda_version: "12.4.0"
- os: ubuntu-latest # Temporary. Takes too long, not ready yet.
arch: aarch64
runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
@@ -21,3 +21,4 @@ repos:
rev: v1.18.2
hooks:
- id: typos
exclude: ^.*\.hip$
511 changes: 291 additions & 220 deletions CHANGELOG.md

Large diffs are not rendered by default.

39 changes: 3 additions & 36 deletions README.md
@@ -6,42 +6,9 @@ The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom functions

The library includes quantization primitives for 8-bit & 4-bit operations through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit`, and 8-bit optimizers through the `bitsandbytes.optim` module.
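For orientation, here is a minimal usage sketch of these primitives (not part of the diff; it assumes a CUDA-capable install, and the layer sizes and hyperparameters are arbitrary):

```python
import torch
import torch.nn as nn
import bitsandbytes as bnb

# Build a 4-bit replacement for a dense layer; the weights are quantized
# when the module is moved to the accelerator device.
dense = nn.Linear(64, 64, bias=False)
quantized = bnb.nn.Linear4bit(64, 64, bias=False, compute_dtype=torch.float16, quant_type="nf4")
quantized.load_state_dict(dense.state_dict())
quantized = quantized.to("cuda")  # quantization happens here

x = torch.randn(1, 64, dtype=torch.float16, device="cuda")
out = quantized(x)

# 8-bit optimizer as a drop-in replacement for torch.optim.Adam
optimizer = bnb.optim.Adam8bit(dense.parameters(), lr=1e-4)
```

The same construct-then-load pattern, followed by moving the module to the device, applies to `Linear8bitLt` for int8 inference.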

**Installation for ROCm:**

To install develop version:
```bash
git clone --recurse https://github.com/ROCm/bitsandbytes
cd bitsandbytes
git checkout rocm_enabled
pip install -r requirements-dev.txt
cmake -DCOMPUTE_BACKEND=hip -S . (Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch)
make
pip install .
```

For ROCm specific versions:

Install Dependencies:
```bash
# hipblaslt installation needed only for rocm<6.0
apt install hipblaslt
pip install --upgrade pip
pip install einops lion_pytorch accelerate
pip install git+https://github.com/ROCm/transformers.git
```
Install Bitsandbytes:
```bash
git clone --recurse https://github.com/ROCm/bitsandbytes
cd bitsandbytes
# Checkout branch as needed
# for rocm 5.7 - rocm5.7_internal_testing
# for rocm 6.x - rocm6.2_internal_testing
git checkout <branch>
make hip
python setup.py install
```

**For more details, please head to the official documentation page:**
There are ongoing efforts to support further hardware backends, e.g. Intel CPU + GPU, AMD GPU, and Apple Silicon. Windows support is also quite far along.

**Please head to the official documentation page:**

**[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)**

26 changes: 0 additions & 26 deletions benchmarking/accuracy/bnb_accuracy.py

This file was deleted.

48 changes: 42 additions & 6 deletions bitsandbytes/__init__.py
@@ -3,6 +3,8 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch

from . import research, utils
from .autograd._functions import (
MatmulLtState,
@@ -12,19 +12,53 @@
matmul_cublas,
mm_cublas,
)
from .backends import register_backend
from .backends.cpu import CPUBackend
from .cextension import lib
from .nn import modules

if lib and lib.compiled_with_cuda:
from .backends import register_backend
from .backends.cuda import CUDABackend
from .optim import adam
# Always register the CPU backend.
register_backend("cpu", CPUBackend())

# Register either CUDA or ROCm backend, if available.
# Only one of these backends can be used at a time, since the torch.device semantics are
# the same for both torch+rocm and torch+cuda (e.g. device name is "cuda")
if torch.cuda.is_available():
# TODO: Consider deferring loading of cextension - should backend class implement that?

if torch.version.cuda:
from .backends.cuda import CUDABackend

register_backend("cuda", CUDABackend())
elif torch.version.hip:
from .backends.rocm import ROCmBackend

register_backend("cuda", ROCmBackend())

# Register MPS backend, if available.
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
from .backends.mps import MPSBackend

register_backend("mps", MPSBackend())

# Register Intel XPU backend, if available.
if hasattr(torch, "xpu") and torch.xpu.is_available():
from .backends.xpu import XPUBackend

register_backend("xpu", XPUBackend())

# TODO: Other potential backends:
# XLA - Google TPU / PJRT runtime
# HPU - Habana / Intel Gaudi
# IPU - Graphcore
# NPU - Ascend
# Note that we may not map 1:1 with a device type, e.g. SYCL, XLA
# In this case, it will be up to each backend to dispatch as needed

register_backend("cuda", CUDABackend())
__pdoc__ = {
"libbitsandbytes": False,
"optim.optimizer.Optimizer8bit": False,
"optim.optimizer.MockArgs": False,
}

__version__ = "0.44.0.dev"
__version__ = "0.43.2.dev"