diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index d953c93dd..648e437f4 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -12,3 +12,6 @@ ea7c14f8ef64924f2d0ff80df3cdabf2c7299848
 
 # Reformat with ruff-format
 5a4263f4dc05fe8f78f4111beab9f68a81deeab1
+
+# CHANGELOG: to reverse chron order + mdformat
+4743ff0d43e04e4cc3e5d8b9e7cd016c0defa36d
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index ba5961f72..72e1b099a 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -63,10 +63,12 @@ jobs:
         os: [ubuntu-latest, windows-latest]
         arch: [x86_64, aarch64]
         cuda_version:
-          ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2"]
+          ["11.7.1", "11.8.0", "12.0.1", "12.1.1", "12.2.2", "12.3.2", "12.4.0"]
         exclude:
           - os: windows-latest # This probably requires arm64 Windows agents
             arch: aarch64
+          - os: windows-latest # The Jimver/cuda-toolkit action used for Windows builds is not updated for 12.4 yet.
+            cuda_version: "12.4.0"
           - os: ubuntu-latest # Temporary. Takes too long, not ready yet.
             arch: aarch64
     runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a859d05af..9babbc0cc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,3 +21,4 @@ repos:
     rev: v1.18.2
     hooks:
       - id: typos
+        exclude: ^.*\.hip$
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 397dceb77..c456fa9e5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,368 +1,439 @@
-### 0.0.21
-- Ampere, RTX 30 series GPUs now compatible with the library.
+### 0.43.1
-### 0.0.22:
+#### Improvements:
-- Fixed an error where a `reset_parameters()` call on the `StableEmbedding` would lead to an error in older PyTorch versions (from 1.7.0).
+- Improved the serialization format for 8-bit weights; this change is fully backwards compatible. (#1164, thanks to @younesbelkada for the contributions and @akx for the review).
+- Added CUDA 12.4 support to the Linux x86-64 build workflow, expanding the library's compatibility with the latest CUDA versions. (#1171, kudos to @matthewdouglas for this addition).
+- Docs enhancement: Improved the instructions for installing the library from source. (#1149, special thanks to @stevhliu for the enhancements).
-### 0.0.23:
+#### Bug Fixes:
-Bugs:
- - Unified quantization API: each quantization function now returns `Q, S` where `Q` is the quantized tensor and `S` the quantization state which may hold absolute max values, a quantization map or more. For dequantization all functions now accept the inputs `Q, S` so that `Q` is dequantized with the quantization state `S`.
- - Fixed an issue where the CUDA 11.1 binary was not compiled with the right headers
+- Fix 4bit quantization with blocksize = 4096, where an illegal memory access was encountered. (#1160, thanks @matthewdouglas for fixing and @YLGH for reporting)
-API changes:
- - Block-wise quantization for optimizers now enabled by default
+#### Internal Improvements:
-Features:
- - Block-wise quantization routines now support CPU Tensors.
+- Tests: improve memory usage (#1147, thanks @matthewdouglas) +- Add CUDA 12.4 to docs/install helper (#1136, thanks @matthewdouglas) +- Minor type/doc fixes (#1128, thanks @akx) +- Reformat Python code with Ruff (#1081, thanks @akx) +- Rework of CUDA/native-library setup and diagnostics (#1041, thanks @akx) +### 0.43.0 -### 0.0.24: +#### Improvements and New Features: -- Fixed a bug where a float/half conversion led to a compilation error for CUDA 11.1 on Turning GPUs. -- removed Apex dependency for bnb LAMB +- QLoRA + FSDP official support is now live! https://github.com/TimDettmers/bitsandbytes/pull/970 by @warner-benjamin and team - with FSDP you can train very large models (70b scale) on multiple 24GB consumer-type GPUs. See https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html for more details. +- Introduced improvements to the CI process for enhanced performance and efficiency during builds, specifically enabling more effective cross-compilation on Linux platforms. This was accomplished by deprecating Make and migrating to Cmake, as well as implementing new corresponding workflows. Huge thanks go to @wkpark, @rickardp, @matthewdouglas and @younesbelkada; #1055, #1050, #1111. +- Windows should be officially supported in bitsandbytes if you install the library from source. See: https://huggingface.co/docs/bitsandbytes/main/en/index for more details +- Updated installation instructions to provide more comprehensive guidance for users. This includes clearer explanations and additional tips for various setup scenarios, making the library more accessible to a broader audience (@rickardp, #1047). +- Enhanced the library's compatibility and setup process, including fixes for CPU-only installations and improvements in CUDA setup error messaging. This effort aims to streamline the installation process and improve user experience across different platforms and setups (@wkpark, @akx, #1038, #996, #1012). +- Setup a new documentation at https://huggingface.co/docs/bitsandbytes/main with extensive new sections and content to help users better understand and utilize the library. Especially notable are the new API docs. (big thanks to @stevhliu and @mishig25 from HuggingFace #1012). The API docs have been also addressed in #1075. -### 0.0.25: +#### Bug Fixes: -Features: - - Added `skip_zeros` for block-wise and 32-bit optimizers. This ensures correct updates for sparse gradients and sparse models. - - Added support for Kepler GPUs. (#4) - - Added Analysis Adam to track 8-bit vs 32-bit quantization errors over time. - - Make compilation more user friendly. +- Addressed a race condition in kEstimateQuantiles, enhancing the reliability of quantile estimation in concurrent environments (@pnunna93, #1061). +- Fixed various minor issues, including typos in code comments and documentation, to improve code clarity and prevent potential confusion (@Brian Vaughan, #1063). -Bug fixes: - - fixed "undefined symbol: \_\_fatbinwrap_38" error for P100 GPUs on CUDA 10.1 (#5) +#### Backwards Compatibility -Docs: - - Added docs with instructions to compile from source. +- After upgrading from `v0.42` to `v0.43`, when using 4bit quantization, models may generate slightly different outputs (approximately up to the 2nd decimal place) due to a fix in the code. For anyone interested in the details, [see this comment](https://github.com/TimDettmers/bitsandbytes/discussions/1094#discussioncomment-8984069). 
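The output differences noted above are easy to check with a quick round-trip; the sketch below is illustrative only (it assumes a CUDA-enabled install and uses `quantize_fp4`/`dequantize_fp4` from `bitsandbytes.functional`, the same helpers exercised by the benchmarking script removed later in this diff):

```python
# Sketch: FP4 quantize/dequantize round-trip error for a random matrix.
# The exact error depends on the bitsandbytes version, which is what the
# backwards-compatibility note above refers to.
import torch
from bitsandbytes import functional as F

x = torch.randn(4096, 4096, device="cuda", dtype=torch.float16)
qx, qstate = F.quantize_fp4(x, blocksize=64)   # 4-bit payload plus quantization state
dq = F.dequantize_fp4(qx, qstate)              # reconstruct at the original precision

print((x - dq).abs().max())  # small, but may differ slightly across versions
```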
+#### Internal and Build System Enhancements: -### 0.26.0: +- Implemented several enhancements to the internal and build systems, including adjustments to the CI workflows, portability improvements, and build artifact management. These changes contribute to a more robust and flexible development process, ensuring the library's ongoing quality and maintainability (@rickardp, @akx, @wkpark, @matthewdouglas; #949, #1053, #1045, #1037). + +#### Contributors: + +This release is made possible thanks to the many active contributors that submitted PRs and many others who contributed to discussions, reviews, and testing. Your efforts greatly enhance the library's quality and user experience. It's truly inspiring to work with such a dedicated and competent group of volunteers and professionals! + +We give a special thanks to @TimDettmers for managing to find a little bit of time for valuable consultations on critical topics, despite preparing for and touring the states applying for professor positions. We wish him the utmost success! + +We also extend our gratitude to the broader community for your continued support, feedback, and engagement, which play a crucial role in driving the library's development forward. + +### 0.42.0 Features: - - Added Adagrad (without grad clipping) as 32-bit and 8-bit block-wise optimizer. - - Added AdamW (copy of Adam with weight decay init 1e-2). #10 - - Introduced ModuleConfig overrides which can be seamlessly be used at initialization time of a module. - - Added `bnb.nn.Embedding` layer which runs at 32-bit but without the layernorm. This works well if you need to fine-tune pretrained models that do not have a embedding layer norm. #19 -Bug fixes: - - Fixed a bug where weight decay was incorrectly applied to 32-bit Adam. #13 - - Fixed an unsafe use of eval. #8 - - Fixed a bug where the StableEmbedding layer 32-bit optimizer override would not work without registering the whole model first (`bnb.optim.GlobalOptimManager.get_instance().register_parameters(model.parameters())`). #13 #15 +- 4-bit serialization now supported. This enables 4-bit load/store. Thank you @poedator #753 +- the bitsandbytes library now has a version attribute: `bitsandbytes.__version__` @rasbt #710 -Docs: - - Added instructions how to solve "\_\_fatbinwrap_" errors. +Bug fixes: +- Fixed bugs in dynamic exponent data type creation. Thank you @RossM, @KohakuBlueleaf, @ArrowM #659 #227 #262 #152 +- Fixed an issue where 4-bit serialization would fail for layers without double quantization #868. Thank you, @poedator +- Fixed an issue where calling .to() or .cuda() on a 4-bit layer twice would result in an error #867. 
Thank you, @jph00 +- Fixed a bug where a missing access permission in a path searched for CUDA would lead to an error @osma #677 +- Fixed a bug where the GOOGLE_VM_CONFIG_LOCK_FILE variable could cause errors in colab environments @akrentsel @xaptronic #715 #883 #622 +- Fixed a bug where kgetColRowStats (LLM.int8()) would fail for certain dimensions @LucQueen @905 +- Fixed a bug where the adjusted regular Embedding layer was not available via bnb.nn.Embedding @neel04 #563 +- Fixed added missing scipy requirement @dulalbert #525 -### 0.30.0 +### 0.41.3 -#### 8-bit Inference Update +Bug fixes: -Features: - - Added 8-bit matrix multiplication form cuBLAS, and cuBLASLt as well as multiple GEMM kernels (GEMM, GEMMEx, GEMMLt) - - Added 8-bit Linear layers with 8-bit Params that perform memory efficient inference with an option for 8-bit mixed precision matrix decomposition for inference without performance degradation - - Added quantization methods for "fake" quantization as well as optimized kernels vector-wise quantization and equalization as well as optimized cuBLASLt transformations - - CPU only build now available (Thank you, @mryab) +- Fixed an issue where 4-bit serialization would fail for layers without double quantization #868. Thank you, @poedator +- Fixed an issue where calling .to() or .cuda() on a 4-bit layer twice would result in an error #867. Thank you, @jph00 -Deprecated: - - Pre-compiled release for CUDA 9.2, 10.0, 10.2 no longer available +### 0.41.2 -### 0.31.0 +Feature: -#### 8-bit Inference and Packaging Update +- 4-bit serialization now supported. This enables 4-bit load/store. Thank you @poedator #753 -Features: - - added direct outlier extraction. This enables outlier extraction without fp16 weights without performance degradation. - - Added automatic CUDA SETUP procedure and packaging all binaries into a single bitsandbytes package. +### 0.41.1 -### 0.32.0 +Bug fixes: -#### 8-bit Inference Performance Enhancements +- Fixed bugs in dynamic exponent data type creation. Thank you @RossM, @KohakuBlueleaf, @ArrowM #659 #227 #262 #152 -We added performance enhancements for small models. This makes small models about 2x faster for LLM.int8() inference. +### 0.41.0 Features: - - Int32 dequantization now supports fused biases. - - Linear8bitLt now uses a fused bias implementation. - - Change `.data.storage().data_ptr()` to `.data.data_ptr()` to enhance inference performance. + +- Added precompiled CUDA 11.8 binaries to support H100 GPUs without compilation #571 +- CUDA SETUP now no longer looks for libcuda and libcudart and relies PyTorch CUDA libraries. To manually override this behavior see: how_to_use_nonpytorch_cuda.md. Thank you @rapsealk Bug fixes: - - Now throws and error if LLM.int8() is used on a GPU that is not supported. - - Enhances error messaging if CUDA SETUP fails. +- Fixed a bug where the default type of absmax was undefined which leads to errors if the default type is different than torch.float32. # 553 +- Fixed a missing scipy dependency in requirements.txt. #544 +- Fixed a bug, where a view operation could cause an error in 8-bit layers. +- Fixed a bug where CPU bitsandbytes would during the import. #593 Thank you @bilelomrani +- Fixed a but where a non-existent LD_LIBRARY_PATH variable led to a failure in python -m bitsandbytes #588 +- Removed outdated get_cuda_lib_handle calls that lead to errors. #595 Thank you @ihsanturk +- Fixed bug where read-permission was assumed for a file. 
#497 +- Fixed a bug where prefetchAsync lead to errors on GPUs that do not support unified memory but not prefetching (Maxwell, SM52). #470 #451 #453 #477 Thank you @jllllll and @stoperro -### 0.33.0 +Documentation: -#### Various bug fixes +- Improved documentation for GPUs that do not support 8-bit matmul. #529 +- Added description and pointers for the NF4 data type. #543 -Features: - - CPU quantization now supports a variable `blocksize` variable to enhance quantization speed or precision. +User experience: -Bug fixes: - - fixed an issue in CPU quantization where tensors with more than 2^31 elements would fail 19a7adca7a6c9bf7061a384d7e9d9b13676a1a88 - - fixed a bug where cpu binaries would fail if no GPU would be detected eab4d8232d558f2e6bd7f7cc3d00e2e6e94f4e80 - - fixed an issue where cpu binaries cause additional stdout messages 92a3363096e10ad6a5c4e944af898bd1186d806a - - fixed an import of bnb.utils 2e630b55f51d454f3bd723dffda68a07ef93190c +- Improved handling of default compute_dtype for Linear4bit Layers, so that compute_dtype = input_dtype if the input data type is stable enough (float32, bfloat16, but not float16). -We thank @mryab, @mbrukman, @chessgecko, @dbaranchuk for pull request with bug fixes and new features. +Performance: +- improved 4-bit inference performance for A100 GPUs. This degraded performance for A40/RTX3090 and RTX 4090 GPUs slightly. -### 0.34.0 +### 0.40.2 -#### Bug fixes and memory efficient backprop +Bug fixes: + +- Fixed a but where a non-existent LD_LIBRARY_PATH variable led to a failure in python -m bitsandbytes #588 +- Removed outdated get_cuda_lib_handle calls that lead to errors. #595 Thank you @ihsanturk +- Fixed bug where read-permission was assumed for a file. #497 +- Fixed a bug where prefetchAsync lead to errors on GPUs that do not support unified memory but not prefetching (Maxwell, SM52). #470 #451 #453 #477 Thank you @jllllll and @stoperro + +### 0.40.1 Features: - - Linear8bitLt layer now supports `memory_efficient_backward=True` which enables backprop of gradients through frozen weights. + +- Added precompiled CUDA 11.8 binaries to support H100 GPUs without compilation #571 +- CUDA SETUP now no longer looks for libcuda and libcudart and relies PyTorch CUDA libraries. To manually override this behavior see: how_to_use_nonpytorch_cuda.md. Thank you @rapsealk Bug fixes: - - fixed an issue where too many threads were created in blockwise quantization on the CPU for large tensors +- Fixed a bug where the default type of absmax was undefined which leads to errors if the default type is different than torch.float32. # 553 +- Fixed a missing scipy dependency in requirements.txt. #544 +- Fixed a bug, where a view operation could cause an error in 8-bit layers. +- Fixed a bug where CPU bitsandbytes would during the import. #593 Thank you @bilelomrani -### 0.35.0 +Documentation: -#### CUDA 11.8 support and bug fixes +- Improved documentation for GPUs that do not support 8-bit matmul. #529 +- Added description and pointers for the NF4 data type. #543 + +### 0.40.0 Features: - - CUDA 11.8 support added and binaries added to the PyPI release. + +- Added 4-bit inference kernels for batch size=1. Currently support are the NF4, FP4 data types. +- Added support for quantizations of bfloat16 input data. 
Bug fixes: - - fixed a bug where too long directory names would crash the CUDA SETUP #35 (thank you @tomaarsen) - - fixed a bug where CPU installations on Colab would run into an error #34 (thank you @tomaarsen) - - fixed an issue where the default CUDA version with fast-DreamBooth was not supported #52 -### 0.35.1 +- Added `device` variable for bitsandbytes layers to be compatible with PyTorch layers. -Features: - - Added CUDA instruction generator to fix some installations. +Deprecated: -Bug fixes: - - Fixed a problem where warning messages would be displayed even though everything worked correctly. +- Binaries for CUDA 11.2, 11.6 no longer ship with `pip install bitsandbytes` and need to be compiled from source. -### 0.35.2 +### 0.39.0 -Bug fixes: - - Fixed a bug where the CUDA setup failed due to a wrong function call. +Features: -### 0.35.3 +- 4-bit matrix multiplication for Float4 and NormalFloat4 data types. +- Added 4-bit quantization routines +- Doubled quantization routines for 4-bit quantization +- Paged optimizers for Adam and Lion. +- bfloat16 gradient / weight support for Adam and Lion with 8 or 32-bit states. Bug fixes: - - Fixed a bug in the CUDA Setup which led to an incomprehensible error if no GPU was detected. -### 0.35.4 +- Fixed a bug where 8-bit models consumed twice the memory as expected after serialization -Bug fixes: - - Fixed a bug in the CUDA Setup failed with the cuda runtime was found, but not the cuda library. - - Fixed a bug where not finding the cuda runtime led to an incomprehensible error. +Deprecated: +- Kepler binaries (GTX 700s and Tesla K40/K80) are not longer provided via pip and need to be compiled from source. Kepler support might be fully removed in the future. -### 0.36.0 +### 0.38.1 -#### Improvements, Ada/Hopper support, fake k-bit quantization. +Features: + +- Added Int8 SwitchBack layers +- Added Fake FP8 layers for research purposes (available under `bnb.research.nn. ...`) + +### 0.38.0 + +#### 8-bit Lion, Load/Store 8-bit Models directly from/to HF Hub Features: - - CUDA 11.8 and 12.0 support added - - support for Ada and Hopper GPUs added (compute capability 8.9 and 9.0) - - support for fake k-bit block-wise quantization for Int, Float, quantile quantization, and dynamic exponent data types added - - Added CUDA instruction generator to fix some installations. - - Added additional block sizes for quantization {64, 128, 256, 512, 1024} - - Added SRAM Quantile algorithm to quickly estimate less than 256 quantiles - - Added option to suppress the bitsandbytes welcome message (@Cyberes) -Regression: - - Compute capability 3.0 removed: GTX 600s and 700s series is no longer supported (except GTX 780 and GTX 780 Ti) +- Support for 32 and 8-bit Lion has been added. Thank you @lucidrains +- Support for serialization of Linear8bitLt layers (LLM.int8()). This allows to store and load 8-bit weights directly from the HuggingFace Hub. Thank you @myrab +- New bug report features `python -m bitsandbytes` now gives extensive debugging details to debug CUDA setup failures. Bug fixes: - - fixed a bug where too long directory names would crash the CUDA SETUP #35 (@tomaarsen) - - fixed a bug where CPU installations on Colab would run into an error #34 (@tomaarsen) - - fixed an issue where the default CUDA version with fast-DreamBooth was not supported #52 - - fixed a bug where the CUDA setup failed due to a wrong function call. - - fixed a bug in the CUDA Setup which led to an incomprehensible error if no GPU was detected. 
- - fixed a bug in the CUDA Setup failed with the cuda runtime was found, but not the cuda library. - - fixed a bug where not finding the cuda runtime led to an incomprehensible error. - - fixed a bug where with missing CUDA the default was an error instead of the loading the CPU library - - fixed a bug where the CC version of the GPU was not detected appropriately (@BlackHC) - - fixed a bug in CPU quantization which lead to errors when the input buffer exceeded 2^31 elements + +- Fixed a bug where some bitsandbytes methods failed in a model-parallel setup on multiple GPUs. Thank you @tonylins +- Fixed a bug where cudart.so libraries could not be found in newer PyTorch releases. Improvements: - - multiple improvements in formatting, removal of unused imports, and slight performance improvements (@tomaarsen) - - StableEmbedding layer now has device and dtype parameters to make it 1:1 replaceable with regular Embedding layers (@lostmsu) - - runtime performance of block-wise quantization slightly improved - - added error message for the case multiple libcudart.so are installed and bitsandbytes picks the wrong one +- Improved the CUDA Setup procedure by doing a more extensive search for CUDA libraries + +Deprecated: + +- Devices with compute capability 3.0 (GTX 700s, K10) and 3.2 (Tegra K1, Jetson TK1) are now deprecated and support will be removed in 0.39.0. +- Support for CUDA 10.0 and 10.2 will be removed in bitsandbytes 0.39.0 ### 0.37.0 #### Int8 Matmul + backward support for all GPUs Features: - - Int8 MatmulLt now supports backward through inversion of the ColTuring/ColAmpere format. Slow, but memory efficient. Big thanks to @borzunov - - Int8 now supported on all GPUs. On devices with compute capability < 7.5, the Int weights are cast to 16/32-bit for the matrix multiplication. Contributed by @borzunov + +- Int8 MatmulLt now supports backward through inversion of the ColTuring/ColAmpere format. Slow, but memory efficient. Big thanks to @borzunov +- Int8 now supported on all GPUs. On devices with compute capability \< 7.5, the Int weights are cast to 16/32-bit for the matrix multiplication. Contributed by @borzunov Improvements: - - Improved logging for the CUDA detection mechanism. -### 0.38.0 +- Improved logging for the CUDA detection mechanism. -#### 8-bit Lion, Load/Store 8-bit Models directly from/to HF Hub +### 0.36.0 + +#### Improvements, Ada/Hopper support, fake k-bit quantization. Features: - - Support for 32 and 8-bit Lion has been added. Thank you @lucidrains - - Support for serialization of Linear8bitLt layers (LLM.int8()). This allows to store and load 8-bit weights directly from the HuggingFace Hub. Thank you @myrab - - New bug report features `python -m bitsandbytes` now gives extensive debugging details to debug CUDA setup failures. + +- CUDA 11.8 and 12.0 support added +- support for Ada and Hopper GPUs added (compute capability 8.9 and 9.0) +- support for fake k-bit block-wise quantization for Int, Float, quantile quantization, and dynamic exponent data types added +- Added CUDA instruction generator to fix some installations. 
+- Added additional block sizes for quantization {64, 128, 256, 512, 1024} +- Added SRAM Quantile algorithm to quickly estimate less than 256 quantiles +- Added option to suppress the bitsandbytes welcome message (@Cyberes) + +Regression: + +- Compute capability 3.0 removed: GTX 600s and 700s series is no longer supported (except GTX 780 and GTX 780 Ti) Bug fixes: - - Fixed a bug where some bitsandbytes methods failed in a model-parallel setup on multiple GPUs. Thank you @tonylins - - Fixed a bug where cudart.so libraries could not be found in newer PyTorch releases. + +- fixed a bug where too long directory names would crash the CUDA SETUP #35 (@tomaarsen) +- fixed a bug where CPU installations on Colab would run into an error #34 (@tomaarsen) +- fixed an issue where the default CUDA version with fast-DreamBooth was not supported #52 +- fixed a bug where the CUDA setup failed due to a wrong function call. +- fixed a bug in the CUDA Setup which led to an incomprehensible error if no GPU was detected. +- fixed a bug in the CUDA Setup failed with the cuda runtime was found, but not the cuda library. +- fixed a bug where not finding the cuda runtime led to an incomprehensible error. +- fixed a bug where with missing CUDA the default was an error instead of the loading the CPU library +- fixed a bug where the CC version of the GPU was not detected appropriately (@BlackHC) +- fixed a bug in CPU quantization which lead to errors when the input buffer exceeded 2^31 elements Improvements: - - Improved the CUDA Setup procedure by doing a more extensive search for CUDA libraries -Deprecated: - - Devices with compute capability 3.0 (GTX 700s, K10) and 3.2 (Tegra K1, Jetson TK1) are now deprecated and support will be removed in 0.39.0. - - Support for CUDA 10.0 and 10.2 will be removed in bitsandbytes 0.39.0 +- multiple improvements in formatting, removal of unused imports, and slight performance improvements (@tomaarsen) +- StableEmbedding layer now has device and dtype parameters to make it 1:1 replaceable with regular Embedding layers (@lostmsu) +- runtime performance of block-wise quantization slightly improved +- added error message for the case multiple libcudart.so are installed and bitsandbytes picks the wrong one +### 0.35.4 -### 0.38.1 +Bug fixes: -Features: - - Added Int8 SwitchBack layers - - Added Fake FP8 layers for research purposes (available under `bnb.research.nn. ...`) +- Fixed a bug in the CUDA Setup failed with the cuda runtime was found, but not the cuda library. +- Fixed a bug where not finding the cuda runtime led to an incomprehensible error. +### 0.35.3 -### 0.39.0 +Bug fixes: +- Fixed a bug in the CUDA Setup which led to an incomprehensible error if no GPU was detected. + +### 0.35.2 + +Bug fixes: + +- Fixed a bug where the CUDA setup failed due to a wrong function call. + +### 0.35.1 Features: - - 4-bit matrix multiplication for Float4 and NormalFloat4 data types. - - Added 4-bit quantization routines - - Doubled quantization routines for 4-bit quantization - - Paged optimizers for Adam and Lion. - - bfloat16 gradient / weight support for Adam and Lion with 8 or 32-bit states. + +- Added CUDA instruction generator to fix some installations. Bug fixes: - - Fixed a bug where 8-bit models consumed twice the memory as expected after serialization -Deprecated: - - Kepler binaries (GTX 700s and Tesla K40/K80) are not longer provided via pip and need to be compiled from source. Kepler support might be fully removed in the future. 
+- Fixed a problem where warning messages would be displayed even though everything worked correctly. +### 0.35.0 -### 0.40.0 +#### CUDA 11.8 support and bug fixes Features: - - Added 4-bit inference kernels for batch size=1. Currently support are the NF4, FP4 data types. - - Added support for quantizations of bfloat16 input data. + +- CUDA 11.8 support added and binaries added to the PyPI release. Bug fixes: - - Added `device` variable for bitsandbytes layers to be compatible with PyTorch layers. -Deprecated: - - Binaries for CUDA 11.2, 11.6 no longer ship with `pip install bitsandbytes` and need to be compiled from source. +- fixed a bug where too long directory names would crash the CUDA SETUP #35 (thank you @tomaarsen) +- fixed a bug where CPU installations on Colab would run into an error #34 (thank you @tomaarsen) +- fixed an issue where the default CUDA version with fast-DreamBooth was not supported #52 +### 0.34.0 -### 0.40.1 +#### Bug fixes and memory efficient backprop Features: - - Added precompiled CUDA 11.8 binaries to support H100 GPUs without compilation #571 - - CUDA SETUP now no longer looks for libcuda and libcudart and relies PyTorch CUDA libraries. To manually override this behavior see: how_to_use_nonpytorch_cuda.md. Thank you @rapsealk + +- Linear8bitLt layer now supports `memory_efficient_backward=True` which enables backprop of gradients through frozen weights. Bug fixes: - - Fixed a bug where the default type of absmax was undefined which leads to errors if the default type is different than torch.float32. # 553 - - Fixed a missing scipy dependency in requirements.txt. #544 - - Fixed a bug, where a view operation could cause an error in 8-bit layers. - - Fixed a bug where CPU bitsandbytes would during the import. #593 Thank you @bilelomrani -Documentation: - - Improved documentation for GPUs that do not support 8-bit matmul. #529 - - Added description and pointers for the NF4 data type. #543 +- fixed an issue where too many threads were created in blockwise quantization on the CPU for large tensors -### 0.40.2 +### 0.33.0 + +#### Various bug fixes + +Features: + +- CPU quantization now supports a variable `blocksize` variable to enhance quantization speed or precision. Bug fixes: - - Fixed a but where a non-existent LD_LIBRARY_PATH variable led to a failure in python -m bitsandbytes #588 - - Removed outdated get_cuda_lib_handle calls that lead to errors. #595 Thank you @ihsanturk - - Fixed bug where read-permission was assumed for a file. #497 - - Fixed a bug where prefetchAsync lead to errors on GPUs that do not support unified memory but not prefetching (Maxwell, SM52). #470 #451 #453 #477 Thank you @jllllll and @stoperro +- fixed an issue in CPU quantization where tensors with more than 2^31 elements would fail 19a7adca7a6c9bf7061a384d7e9d9b13676a1a88 +- fixed a bug where cpu binaries would fail if no GPU would be detected eab4d8232d558f2e6bd7f7cc3d00e2e6e94f4e80 +- fixed an issue where cpu binaries cause additional stdout messages 92a3363096e10ad6a5c4e944af898bd1186d806a +- fixed an import of bnb.utils 2e630b55f51d454f3bd723dffda68a07ef93190c -### 0.41.0 +We thank @mryab, @mbrukman, @chessgecko, @dbaranchuk for pull request with bug fixes and new features. + +### 0.32.0 + +#### 8-bit Inference Performance Enhancements + +We added performance enhancements for small models. This makes small models about 2x faster for LLM.int8() inference. 
Features: - - Added precompiled CUDA 11.8 binaries to support H100 GPUs without compilation #571 - - CUDA SETUP now no longer looks for libcuda and libcudart and relies PyTorch CUDA libraries. To manually override this behavior see: how_to_use_nonpytorch_cuda.md. Thank you @rapsealk + +- Int32 dequantization now supports fused biases. +- Linear8bitLt now uses a fused bias implementation. +- Change `.data.storage().data_ptr()` to `.data.data_ptr()` to enhance inference performance. Bug fixes: - - Fixed a bug where the default type of absmax was undefined which leads to errors if the default type is different than torch.float32. # 553 - - Fixed a missing scipy dependency in requirements.txt. #544 - - Fixed a bug, where a view operation could cause an error in 8-bit layers. - - Fixed a bug where CPU bitsandbytes would during the import. #593 Thank you @bilelomrani - - Fixed a but where a non-existent LD_LIBRARY_PATH variable led to a failure in python -m bitsandbytes #588 - - Removed outdated get_cuda_lib_handle calls that lead to errors. #595 Thank you @ihsanturk - - Fixed bug where read-permission was assumed for a file. #497 - - Fixed a bug where prefetchAsync lead to errors on GPUs that do not support unified memory but not prefetching (Maxwell, SM52). #470 #451 #453 #477 Thank you @jllllll and @stoperro -Documentation: - - Improved documentation for GPUs that do not support 8-bit matmul. #529 - - Added description and pointers for the NF4 data type. #543 +- Now throws and error if LLM.int8() is used on a GPU that is not supported. +- Enhances error messaging if CUDA SETUP fails. -User experience: - - Improved handling of default compute_dtype for Linear4bit Layers, so that compute_dtype = input_dtype if the input data type is stable enough (float32, bfloat16, but not float16). +### 0.31.0 -Performance: - - improved 4-bit inference performance for A100 GPUs. This degraded performance for A40/RTX3090 and RTX 4090 GPUs slightly. +#### 8-bit Inference and Packaging Update -### 0.41.1 +Features: -Bug fixes: - - Fixed bugs in dynamic exponent data type creation. Thank you @RossM, @KohakuBlueleaf, @ArrowM #659 #227 #262 #152 +- added direct outlier extraction. This enables outlier extraction without fp16 weights without performance degradation. +- Added automatic CUDA SETUP procedure and packaging all binaries into a single bitsandbytes package. -### 0.41.2 +### 0.30.0 -Feature: - - 4-bit serialization now supported. This enables 4-bit load/store. Thank you @poedator #753 +#### 8-bit Inference Update -### 0.41.3 +Features: + +- Added 8-bit matrix multiplication form cuBLAS, and cuBLASLt as well as multiple GEMM kernels (GEMM, GEMMEx, GEMMLt) +- Added 8-bit Linear layers with 8-bit Params that perform memory efficient inference with an option for 8-bit mixed precision matrix decomposition for inference without performance degradation +- Added quantization methods for "fake" quantization as well as optimized kernels vector-wise quantization and equalization as well as optimized cuBLASLt transformations +- CPU only build now available (Thank you, @mryab) + +Deprecated: + +- Pre-compiled release for CUDA 9.2, 10.0, 10.2 no longer available + +### 0.26.0: + +Features: + +- Added Adagrad (without grad clipping) as 32-bit and 8-bit block-wise optimizer. +- Added AdamW (copy of Adam with weight decay init 1e-2). #10 +- Introduced ModuleConfig overrides which can be seamlessly be used at initialization time of a module. +- Added `bnb.nn.Embedding` layer which runs at 32-bit but without the layernorm. 
This works well if you need to fine-tune pretrained models that do not have a embedding layer norm. #19 Bug fixes: - - Fixed an issue where 4-bit serialization would fail for layers without double quantization #868. Thank you, @poedator - - Fixed an issue where calling .to() or .cuda() on a 4-bit layer twice would result in an error #867. Thank you, @jph00 -### 0.42.0 +- Fixed a bug where weight decay was incorrectly applied to 32-bit Adam. #13 +- Fixed an unsafe use of eval. #8 +- Fixed a bug where the StableEmbedding layer 32-bit optimizer override would not work without registering the whole model first (`bnb.optim.GlobalOptimManager.get_instance().register_parameters(model.parameters())`). #13 #15 + +Docs: + +- Added instructions how to solve "\_\_fatbinwrap\_" errors. + +### 0.0.25: Features: - - 4-bit serialization now supported. This enables 4-bit load/store. Thank you @poedator #753 - - the bitsandbytes library now has a version attribute: `bitsandbytes.__version__` @rasbt #710 + +- Added `skip_zeros` for block-wise and 32-bit optimizers. This ensures correct updates for sparse gradients and sparse models. +- Added support for Kepler GPUs. (#4) +- Added Analysis Adam to track 8-bit vs 32-bit quantization errors over time. +- Make compilation more user friendly. Bug fixes: - - Fixed bugs in dynamic exponent data type creation. Thank you @RossM, @KohakuBlueleaf, @ArrowM #659 #227 #262 #152 - - Fixed an issue where 4-bit serialization would fail for layers without double quantization #868. Thank you, @poedator - - Fixed an issue where calling .to() or .cuda() on a 4-bit layer twice would result in an error #867. Thank you, @jph00 - - Fixed a bug where a missing access permission in a path searched for CUDA would lead to an error @osma #677 - - Fixed a bug where the GOOGLE_VM_CONFIG_LOCK_FILE variable could cause errors in colab environments @akrentsel @xaptronic #715 #883 #622 - - Fixed a bug where kgetColRowStats (LLM.int8()) would fail for certain dimensions @LucQueen @905 - - Fixed a bug where the adjusted regular Embedding layer was not available via bnb.nn.Embedding @neel04 #563 - - Fixed added missing scipy requirement @dulalbert #525 -### 0.43.0 +- fixed "undefined symbol: \_\_fatbinwrap_38" error for P100 GPUs on CUDA 10.1 (#5) -#### Improvements and New Features: -- QLoRA + FSDP official support is now live! https://github.com/TimDettmers/bitsandbytes/pull/970 by @warner-benjamin and team - with FSDP you can train very large models (70b scale) on multiple 24GB consumer-type GPUs. See https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html for more details. -- Introduced improvements to the CI process for enhanced performance and efficiency during builds, specifically enabling more effective cross-compilation on Linux platforms. This was accomplished by deprecating Make and migrating to Cmake, as well as implementing new corresponding workflows. Huge thanks go to @wkpark, @rickardp, @matthewdouglas and @younesbelkada; #1055, #1050, #1111. -- Windows should be officially supported in bitsandbytes if you install the library from source. See: https://huggingface.co/docs/bitsandbytes/main/en/index for more details -- Updated installation instructions to provide more comprehensive guidance for users. This includes clearer explanations and additional tips for various setup scenarios, making the library more accessible to a broader audience (@rickardp, #1047). 
-- Enhanced the library's compatibility and setup process, including fixes for CPU-only installations and improvements in CUDA setup error messaging. This effort aims to streamline the installation process and improve user experience across different platforms and setups (@wkpark, @akx, #1038, #996, #1012). -- Setup a new documentation at https://huggingface.co/docs/bitsandbytes/main with extensive new sections and content to help users better understand and utilize the library. Especially notable are the new API docs. (big thanks to @stevhliu and @mishig25 from HuggingFace #1012). The API docs have been also addressed in #1075. +Docs: -#### Bug Fixes: -- Addressed a race condition in kEstimateQuantiles, enhancing the reliability of quantile estimation in concurrent environments (@pnunna93, #1061). -- Fixed various minor issues, including typos in code comments and documentation, to improve code clarity and prevent potential confusion (@Brian Vaughan, #1063). +- Added docs with instructions to compile from source. -#### Internal and Build System Enhancements: -- Implemented several enhancements to the internal and build systems, including adjustments to the CI workflows, portability improvements, and build artifact management. These changes contribute to a more robust and flexible development process, ensuring the library's ongoing quality and maintainability (@rickardp, @akx, @wkpark, @matthewdouglas; #949, #1053, #1045, #1037). +### 0.0.24: -#### Contributors: -This release is made possible thanks to the many active contributors that submitted PRs and many others who contributed to discussions, reviews, and testing. Your efforts greatly enhance the library's quality and user experience. It's truly inspiring to work with such a dedicated and competent group of volunteers and professionals! +- Fixed a bug where a float/half conversion led to a compilation error for CUDA 11.1 on Turning GPUs. +- removed Apex dependency for bnb LAMB -We give a special thanks to @TimDettmers for managing to find a little bit of time for valuable consultations on critical topics, despite preparing for and touring the states applying for professor positions. We wish him the utmost success! +### 0.0.23: -We also extend our gratitude to the broader community for your continued support, feedback, and engagement, which play a crucial role in driving the library's development forward. +Bugs: + +- Unified quantization API: each quantization function now returns `Q, S` where `Q` is the quantized tensor and `S` the quantization state which may hold absolute max values, a quantization map or more. For dequantization all functions now accept the inputs `Q, S` so that `Q` is dequantized with the quantization state `S`. +- Fixed an issue where the CUDA 11.1 binary was not compiled with the right headers + +API changes: + +- Block-wise quantization for optimizers now enabled by default + +Features: + +- Block-wise quantization routines now support CPU Tensors. + +### 0.0.22: + +- Fixed an error where a `reset_parameters()` call on the `StableEmbedding` would lead to an error in older PyTorch versions (from 1.7.0). + +### 0.0.21 + +- Ampere, RTX 30 series GPUs now compatible with the library. 
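As a rough illustration of the `Q, S` convention described in the 0.0.23 entry above, a minimal sketch (assuming a current bitsandbytes install; the block-wise routines also accept CPU tensors, as that entry notes):

```python
# Sketch of the unified quantization API: quantization returns (Q, S) and
# dequantization consumes (Q, S), where S holds absmax values, the code map, etc.
import torch
import bitsandbytes.functional as F

a = torch.randn(1024, 1024)                    # CPU tensor; the blockwise path supports it
q, state = F.quantize_blockwise(a)             # Q: uint8 codes, S: quantization state
a_restored = F.dequantize_blockwise(q, state)  # back to float32

print((a - a_restored).abs().mean())           # small block-wise quantization error
```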
diff --git a/README.md b/README.md index 377ca2e86..2cf630dcb 100644 --- a/README.md +++ b/README.md @@ -6,42 +6,9 @@ The `bitsandbytes` library is a lightweight Python wrapper around CUDA custom fu The library includes quantization primitives for 8-bit & 4-bit operations, through `bitsandbytes.nn.Linear8bitLt` and `bitsandbytes.nn.Linear4bit` and 8-bit optimizers through `bitsandbytes.optim` module. -**Installation for ROCm:** - -To install develop version: -```bash -git clone --recurse https://github.com/ROCm/bitsandbytes -cd bitsandbytes -git checkout rocm_enabled -pip install -r requirements-dev.txt -cmake -DCOMPUTE_BACKEND=hip -S . (Use -DBNB_ROCM_ARCH="gfx90a;gfx942" to target specific gpu arch) -make -pip install . -``` - -For ROCm specific versions: - -Install Dependencies: -```bash -# hipblaslt installation needed only for rocm<6.0 -apt install hipblaslt -pip install --upgrade pip -pip install einops lion_pytorch accelerate -pip install git+https://github.com/ROCm/transformers.git -``` -Install Bitsandbytes: -```bash -git clone --recurse https://github.com/ROCm/bitsandbytes -cd bitsandbytes -# Checkout branch as needed -# for rocm 5.7 - rocm5.7_internal_testing -# for rocm 6.x - rocm6.2_internal_testing -git checkout -make hip -python setup.py install -``` - -**For more details, please head to the official documentation page:** +There are ongoing efforts to support further hardware backends, i.e. Intel CPU + GPU, AMD GPU, Apple Silicon. Windows support is quite far along and is on its way as well. + +**Please head to the official documentation page:** **[https://huggingface.co/docs/bitsandbytes/main](https://huggingface.co/docs/bitsandbytes/main)** diff --git a/benchmarking/accuracy/bnb_accuracy.py b/benchmarking/accuracy/bnb_accuracy.py deleted file mode 100644 index 2860338ec..000000000 --- a/benchmarking/accuracy/bnb_accuracy.py +++ /dev/null @@ -1,26 +0,0 @@ -import torch - -from bitsandbytes import functional as F - - -def debug_blocksize(block): - x = torch.randn(4096, 4096).cuda() - qx, qstate = F.quantize_fp4(x, blocksize=block) - dq = F.dequantize_fp4(qx, qstate) - return torch.sum(torch.linalg.norm(x - dq, ord="fro")) - - -def test_blocksize(block): - x = torch.randn(10, 10).cuda() - qx, qstate = F.quantize_fp4(x, blocksize=block) - print(x) - print("---------------") - print(qx) - print("---------------") - print(qstate) - - -for block in [128, 256, 512, 1024, 2048]: - print(debug_blocksize(block)) - -# test_blocksize(2048) diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py index 019a4f6ab..760a8eda4 100644 --- a/bitsandbytes/__init__.py +++ b/bitsandbytes/__init__.py @@ -3,6 +3,8 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. +import torch + from . import research, utils from .autograd._functions import ( MatmulLtState, @@ -12,19 +14,53 @@ matmul_cublas, mm_cublas, ) +from .backends import register_backend +from .backends.cpu import CPUBackend from .cextension import lib from .nn import modules -if lib and lib.compiled_with_cuda: - from .backends import register_backend - from .backends.cuda import CUDABackend - from .optim import adam +# Always register the CPU backend. +register_backend("cpu", CPUBackend()) + +# Register either CUDA or ROCm backend, if available. +# Only one of these backends can be used at a time, since the torch.device semantics are +# the same for both torch+rocm and torch+cuda (e.g. 
device name is "cuda") +if torch.cuda.is_available(): + # TODO: Consider deferring loading of cextension - should backend class implement that? + + if torch.version.cuda: + from .backends.cuda import CUDABackend + + register_backend("cuda", CUDABackend()) + elif torch.version.hip: + from .backends.rocm import ROCmBackend + + register_backend("cuda", ROCmBackend()) + +# Register MPS backend, if available. +if torch.backends.mps.is_available() and torch.backends.mps.is_built(): + from .backends.mps import MPSBackend + + register_backend("mps", MPSBackend()) + +# Register Intel XPU backend, if available. +if hasattr(torch, "xpu") and torch.xpu.is_available(): + from .backends.xpu import XPUBackend + + register_backend("xpu", XPUBackend()) + +# TODO: Other potential backends: +# XLA - Google TPU / PJRT runtime +# HPU - Habana / Intel Gaudi +# IPU - Graphcore +# NPU - Ascend +# Note that we may not map 1:1 with a device type, e.g. SYCL, XLA +# In this case, it will be up to each backend to dispatch as needed - register_backend("cuda", CUDABackend()) __pdoc__ = { "libbitsandbytes": False, "optim.optimizer.Optimizer8bit": False, "optim.optimizer.MockArgs": False, } -__version__ = "0.44.0.dev" +__version__ = "0.43.2.dev" diff --git a/bitsandbytes/archive_functional.py b/bitsandbytes/archive_functional.py deleted file mode 100644 index dac7430ed..000000000 --- a/bitsandbytes/archive_functional.py +++ /dev/null @@ -1,2466 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. -import ctypes as ct -from functools import reduce # Required in Python 3 -import itertools -import operator -from typing import Tuple - -import numpy as np -from scipy.stats import norm -import torch -from torch import Tensor - -from .cextension import COMPILED_WITH_CUDA, lib - - -# math.prod not compatible with python < 3.8 -def prod(iterable): - return reduce(operator.mul, iterable, 1) - - -name2qmap = {} - -if COMPILED_WITH_CUDA: - """C FUNCTIONS FOR OPTIMIZERS""" - str2optimizer32bit = {} - str2optimizer32bit["adam"] = (lib.cadam32bit_grad_fp32, lib.cadam32bit_grad_fp16) # , lib.cadam32bit_grad_bf16) - str2optimizer32bit["momentum"] = ( - lib.cmomentum32bit_grad_32, - lib.cmomentum32bit_grad_16, - ) - str2optimizer32bit["rmsprop"] = ( - lib.crmsprop32bit_grad_32, - lib.crmsprop32bit_grad_16, - ) - str2optimizer32bit["lion"] = (lib.clion32bit_grad_fp32, lib.clion32bit_grad_fp16) # , lib.clion32bit_grad_bf16) - str2optimizer32bit["adagrad"] = ( - lib.cadagrad32bit_grad_32, - lib.cadagrad32bit_grad_16, - ) - - str2optimizer8bit = {} - str2optimizer8bit["adam"] = ( - lib.cadam_static_8bit_grad_32, - lib.cadam_static_8bit_grad_16, - ) - str2optimizer8bit["momentum"] = ( - lib.cmomentum_static_8bit_grad_32, - lib.cmomentum_static_8bit_grad_16, - ) - str2optimizer8bit["rmsprop"] = ( - lib.crmsprop_static_8bit_grad_32, - lib.crmsprop_static_8bit_grad_16, - ) - str2optimizer8bit["lion"] = ( - lib.clion_static_8bit_grad_32, - lib.clion_static_8bit_grad_16, - ) - str2optimizer8bit["lamb"] = ( - lib.cadam_static_8bit_grad_32, - lib.cadam_static_8bit_grad_16, - ) - str2optimizer8bit["lars"] = ( - lib.cmomentum_static_8bit_grad_32, - lib.cmomentum_static_8bit_grad_16, - ) - - str2optimizer8bit_blockwise = {} - str2optimizer8bit_blockwise["adam"] = ( - lib.cadam_8bit_blockwise_grad_fp32, - lib.cadam_8bit_blockwise_grad_fp16, - # lib.cadam_8bit_blockwise_grad_bf16, - ) - 
str2optimizer8bit_blockwise["momentum"] = ( - lib.cmomentum_8bit_blockwise_grad_fp32, - lib.cmomentum_8bit_blockwise_grad_fp16, - ) - str2optimizer8bit_blockwise["rmsprop"] = ( - lib.crmsprop_8bit_blockwise_grad_fp32, - lib.crmsprop_8bit_blockwise_grad_fp16, - ) - str2optimizer8bit_blockwise["lion"] = ( - lib.clion_8bit_blockwise_grad_fp32, - lib.clion_8bit_blockwise_grad_fp16, - # lib.clion_8bit_blockwise_grad_bf16, - ) - str2optimizer8bit_blockwise["adagrad"] = ( - lib.cadagrad_8bit_blockwise_grad_fp32, - lib.cadagrad_8bit_blockwise_grad_fp16, - ) - - -class GlobalPageManager: - _instance = None - - def __init__(self): - raise RuntimeError("Call get_instance() instead") - - def initialize(self): - self.paged_tensors = [] - - @classmethod - def get_instance(cls): - if cls._instance is None: - cls._instance = cls.__new__(cls) - cls._instance.initialize() - return cls._instance - - def prefetch_all(self, to_cpu=False): - # assume the first added, will be the - # ones that are used first, so swap them in last - # in the case they are evicted again - for t in self.paged_tensors[::-1]: - prefetch_tensor(t, to_cpu) - - -class CUBLAS_Context: - _instance = None - - def __init__(self): - raise RuntimeError("Call get_instance() instead") - - def initialize(self): - self.context = {} - - @classmethod - def get_instance(cls): - if cls._instance is None: - cls._instance = cls.__new__(cls) - cls._instance.initialize() - return cls._instance - - def get_context(self, device): - if device.index not in self.context: - prev_device = torch.cuda.current_device() - torch.cuda.set_device(device) - self.context[device.index] = ct.c_void_p(lib.get_context()) - torch.cuda.set_device(prev_device) - return self.context[device.index] - - -class Cusparse_Context: - _instance = None - - def __init__(self): - raise RuntimeError("Call get_instance() instead") - - def initialize(self): - # self.context = ct.c_void_p(lib.get_cusparse()) - if torch.version.cuda: - self.context = ct.c_void_p(lib.get_cusparse()) - elif torch.version.hip: - self.context = ct.c_void_p(lib.get_hipsparse()) - - @classmethod - def get_instance(cls): - if cls._instance is None: - cls._instance = cls.__new__(cls) - cls._instance.initialize() - return cls._instance - - -dtype2bytes = {} -dtype2bytes[torch.float32] = 4 -dtype2bytes[torch.float16] = 2 -dtype2bytes[torch.bfloat16] = 2 -dtype2bytes[torch.uint8] = 1 -dtype2bytes[torch.int8] = 1 - - -def get_paged(*shape, dtype=torch.float32, device=torch.device("cuda", index=0)): - num_bytes = dtype2bytes[dtype] * prod(shape) - cuda_ptr = lib.cget_managed_ptr(ct.c_size_t(num_bytes)) - c_ptr = ct.cast(cuda_ptr, ct.POINTER(ct.c_int)) - new_array = np.ctypeslib.as_array(c_ptr, shape=shape) - out = torch.frombuffer(new_array, dtype=dtype, count=prod(shape)).view(shape) - out.is_paged = True - out.page_deviceid = device.index - return out - - -def prefetch_tensor(A, to_cpu=False): - assert A.is_paged, "Only paged tensors can be prefetched!" 
- if to_cpu: - deviceid = -1 - else: - deviceid = A.page_deviceid - - num_bytes = dtype2bytes[A.dtype] * A.numel() - lib.cprefetch(get_ptr(A), ct.c_size_t(num_bytes), ct.c_int32(deviceid)) - - -def elementwise_func(func_name, A, B, value, prefetch=True): - func = None - if A.dtype == torch.float32: - func = getattr(lib, f"c{func_name}_fp32", None) - cvalue = ct.c_float(value) - elif A.dtype == torch.uint8: - func = getattr(lib, f"c{func_name}_uint8", None) - cvalue = ct.c_uint8(value) - - if func is None: - raise NotImplementedError(f"Function not implemented: {func_name}") - - is_managed = getattr(A, "is_managed", False) - if is_managed and prefetch: - prefetch_tensor(A) - if B is not None: - prefetch_tensor(B) - - func(get_ptr(A), get_ptr(B), cvalue, ct.c_int64(A.numel())) - if A.is_paged or B.is_paged: - # paged function are fully asynchronous - # if we return from this function, we want to the tensor - # to be in the correct state, that is the final state after the - # operation occurred. So we synchronize. - torch.cuda.synchronize() - - -def fill(A, value, device=None, prefetch=True): - elementwise_func("fill", A, None, value) - - -def arange(A, device=None): - elementwise_func("arange", A, None, 0) - - -def _mul(A, B, device=None): - elementwise_func("_mul", A, B, 0) - - -def create_linear_map(signed=True, total_bits=8, add_zero=True): - sign = -1.0 if signed else 0.0 - total_values = 2**total_bits - if add_zero or total_bits < 8: - # add a zero - # since we simulate less bits by having zeros in the data type, we - # we need to center the quantization around zero and as such lose - # a single value - total_values = 2**total_bits if not signed else 2**total_bits - 1 - - values = torch.linspace(sign, 1.0, total_values) - gap = 256 - values.numel() - if gap == 0: - return values - else: - l = values.numel() // 2 - return torch.Tensor(values[:l].tolist() + [0] * gap + values[l:].tolist()) - - -def create_normal_map(offset=0.9677083, use_extra_value=True): - if use_extra_value: - # one more positive value, this is an asymmetric type - v1 = norm.ppf(torch.linspace(offset, 0.5, 9)[:-1]).tolist() - v2 = [0] * (256 - 15) ## we have 15 non-zero values in this data type - v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist() - v = v1 + v2 + v3 - else: - v1 = norm.ppf(torch.linspace(offset, 0.5, 8)[:-1]).tolist() - v2 = [0] * (256 - 14) ## we have 14 non-zero values in this data type - v3 = (-norm.ppf(torch.linspace(offset, 0.5, 8)[:-1])).tolist() - v = v1 + v2 + v3 - - values = torch.Tensor(v) - values = values.sort().values - values /= values.max() - assert values.numel() == 256 - return values - - -def create_fp8_map(signed=True, exponent_bits=5, precision_bits=2, total_bits=8): - e = exponent_bits - p = precision_bits - has_sign = 1 if signed else 0 - assert e + p == total_bits - has_sign - # the exponent is biased to 2^(e-1) -1 == 0 - evalues = [] - pvalues = [] - for i, val in enumerate(range(-(2 ** (exponent_bits - has_sign)), 2 ** (exponent_bits - has_sign), 1)): - evalues.append(2**val) - - values = [] - lst = list(itertools.product([0, 1], repeat=precision_bits)) - # for ev in evalues: - bias = 2 ** (exponent_bits - 1) - for evalue in range(2 ** (exponent_bits)): - for bit_pattern in lst: - value = 1 if evalue != 0 else 0 - for i, pval in enumerate(list(bit_pattern)): - value += pval * (2 ** -(i + 1)) - if evalue == 0: - # subnormals - value = value * 2**-(bias) - else: - # normals - value = value * 2 ** -(evalue - bias - 1) - values.append(value) - if signed: - 
values.append(-value) - - assert len(values) == 2**total_bits - values.sort() - if total_bits < 8: - gap = 256 - len(values) - for i in range(gap): - values.append(0) - values.sort() - code = torch.Tensor(values) - code /= code.max() - - return code - - -def create_dynamic_map(signed=True, max_exponent_bits=7, total_bits=8): - """ - Creates the dynamic quantiztion map. - - The dynamic data type is made up of a dynamic exponent and - fraction. As the exponent increase from 0 to -7 the number - of bits available for the fraction shrinks. - - This is a generalization of the dynamic type where a certain - number of the bits and be reserved for the linear quantization - region (the fraction). n determines the maximum number of - exponent bits. - - For more details see - (8-Bit Approximations for Parallelism in Deep Learning)[https://arxiv.org/abs/1511.04561] - """ - - data = [] - # these are additional items that come from the case - # where all the exponent bits are zero and no - # indicator bit is present - non_sign_bits = total_bits - (1 if signed else 0) - additional_items = 2 ** (non_sign_bits - max_exponent_bits) - 1 - if not signed: - additional_items = 2 * additional_items - for i in range(max_exponent_bits): - fraction_items = int( - 2 ** (i + non_sign_bits - max_exponent_bits) + 1 - if signed - else 2 ** (i + non_sign_bits - max_exponent_bits + 1) + 1 - ) - boundaries = torch.linspace(0.1, 1, fraction_items) - means = (boundaries[:-1] + boundaries[1:]) / 2.0 - data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist() - if signed: - data += (-(10 ** (-(max_exponent_bits - 1) + i)) * means).tolist() - - if additional_items > 0: - boundaries = torch.linspace(0.1, 1, additional_items + 1) - means = (boundaries[:-1] + boundaries[1:]) / 2.0 - data += ((10 ** (-(max_exponent_bits - 1) + i)) * means).tolist() - if signed: - data += (-(10 ** (-(max_exponent_bits - 1) + i)) * means).tolist() - - data.append(0) - data.append(1.0) - - gap = 256 - len(data) - for i in range(gap): - data.append(0) - - data.sort() - return Tensor(data) - - -def create_quantile_map(A, total_bits=8): - q = estimate_quantiles(A, num_quantiles=2**total_bits - 1) - q = q.tolist() - q.append(0) - - gap = 256 - len(q) - for i in range(gap): - q.append(0) - - q.sort() - - q = Tensor(q) - q = q / q.abs().max() - return q - - -def get_special_format_str(): - if not torch.cuda.is_available(): - return "col_turing" - major, _minor = torch.cuda.get_device_capability() - if major <= 7: - return "col_turing" - if major == 8: - return "col_ampere" - return "col_turing" - - -def is_on_gpu(tensors): - on_gpu = True - gpu_ids = set() - for t in tensors: - if t is None: - continue # NULL pointers are fine - is_paged = getattr(t, "is_paged", False) - on_gpu &= t.device.type == "cuda" or is_paged - if not is_paged: - gpu_ids.add(t.device.index) - if not on_gpu: - raise TypeError( - f"All input tensors need to be on the same GPU, but found some tensors to not be on a GPU:\n {[(t.shape, t.device) for t in tensors]}" - ) - if len(gpu_ids) > 1: - raise TypeError( - f"Input tensors need to be on the same GPU, but found the following tensor and device combinations:\n {[(t.shape, t.device) for t in tensors]}" - ) - return on_gpu - - -def get_ptr(A: Tensor) -> ct.c_void_p: - """ - Get the ctypes pointer from a PyTorch Tensor. - - Parameters - ---------- - A : torch.tensor - The PyTorch tensor. 
- - Returns - ------- - ctypes.c_void_p - """ - if A is None: - return None - else: - return ct.c_void_p(A.data.data_ptr()) - - -def pre_call(device): - prev_device = torch.cuda.current_device() - torch.cuda.set_device(device) - return prev_device - - -def post_call(prev_device): - torch.cuda.set_device(prev_device) - - -def get_transform_func(dtype, orderA, orderOut, transpose=False): - name = f'ctransform_{(8 if dtype == torch.int8 else 32)}_{orderA}_to_{orderOut}_{"t" if transpose else "n"}' - if not hasattr(lib, name): - print(name) - raise ValueError( - f"Transform function not supported: {orderA} to {orderOut} for data type {dtype} and transpose={transpose}" - ) - else: - return getattr(lib, name) - - -def get_transform_buffer(shape, dtype, device, to_order, from_order="row", transpose=False): - # init_func = torch.empty - init_func = torch.zeros - dims = len(shape) - - if dims == 2: - rows = shape[0] - elif dims == 3: - rows = shape[0] * shape[1] - cols = shape[-1] - - state = (shape, to_order) - if transpose: - # swap dims - tmp = rows - rows = cols - cols = tmp - state = (shape[::-1], to_order) - - if to_order == "row" or to_order == "col": - return init_func(shape, dtype=dtype, device=device), state - elif to_order == "col32": - # blocks of 32 columns (padded) - cols = 32 * ((cols + 31) // 32) - return init_func((rows, cols), dtype=dtype, device=device), state - elif to_order == "col_turing": - # blocks of 32 columns and 8 rows - cols = 32 * ((cols + 31) // 32) - rows = 8 * ((rows + 7) // 8) - return init_func((rows, cols), dtype=dtype, device=device), state - elif to_order == "col_ampere": - # blocks of 32 columns and 32 rows - cols = 32 * ((cols + 31) // 32) - rows = 32 * ((rows + 31) // 32) - return init_func((rows, cols), dtype=dtype, device=device), state - else: - raise NotImplementedError(f"To_order not supported: {to_order}") - - -def nvidia_transform( - A, - to_order, - from_order="row", - out=None, - transpose=False, - state=None, - ld=None, -): - if state is None: - state = (A.shape, from_order) - else: - from_order = state[1] - if out is None: - out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1]) - else: - new_state = (state[1], to_order) - func = get_transform_func(A.dtype, from_order, to_order, transpose) - - shape = state[0] - if len(shape) == 2: - dim1 = ct.c_int32(shape[0]) - dim2 = ct.c_int32(shape[1]) - elif ld is not None: - n = prod(shape) - dim1 = prod([shape[i] for i in ld]) - dim2 = ct.c_int32(n // dim1) - dim1 = ct.c_int32(dim1) - else: - dim1 = ct.c_int32(shape[0] * shape[1]) - dim2 = ct.c_int32(shape[2]) - - ptr = CUBLAS_Context.get_instance().get_context(A.device) - func(ptr, get_ptr(A), get_ptr(out), dim1, dim2) - - return out, new_state - - -def estimate_quantiles(A: Tensor, out: Tensor = None, offset: float = 1 / 512, num_quantiles=256) -> Tensor: - """ - Estimates 256 equidistant quantiles on the input tensor eCDF. - - Uses SRAM-Quantiles algorithm to quickly estimate 256 equidistant quantiles - via the eCDF of the input tensor `A`. This is a fast but approximate algorithm - and the extreme quantiles close to 0 and 1 have high variance / large estimation - errors. These large errors can be avoided by using the offset variable which trims - the distribution. The default offset value of 1/512 ensures minimum entropy encoding -- it - trims 1/512 = 0.2% from each side of the distrivution. An offset value of 0.01 to 0.02 - usually has a much lower error but is not a minimum entropy encoding. 
Given an offset - of 0.02 equidistance points in the range [0.02, 0.98] are used for the quantiles. - - Parameters - ---------- - A : torch.Tensor - The input tensor. Any shape. - out : torch.Tensor - Tensor with the 256 estimated quantiles. - offset : float - The offset for the first and last quantile from 0 and 1. Default: 1/(2*num_quantiles) - num_quantiles : int - The number of equally spaced quantiles. - - Returns - ------- - torch.Tensor: - The 256 quantiles in float32 datatype. - """ - if A.numel() < 256: - raise NotImplementedError( - f"Quantile estimation needs at least 256 values in the Tensor, but Tensor had only {A.numel()} values." - ) - if num_quantiles > 256: - raise NotImplementedError( - f"Currently only a maximum of 256 equally spaced quantiles are supported, but the argument num_quantiles={num_quantiles}" - ) - if num_quantiles < 256 and offset == 1 / (512): - # override default arguments - offset = 1 / (2 * num_quantiles) - - if out is None: - out = torch.zeros((256,), dtype=torch.float32, device=A.device) - is_on_gpu([A, out]) - device = pre_call(A.device) - if A.dtype == torch.float32: - lib.cestimate_quantiles_fp32(get_ptr(A), get_ptr(out), ct.c_float(offset), ct.c_int(A.numel())) - elif A.dtype == torch.float16: - lib.cestimate_quantiles_fp16(get_ptr(A), get_ptr(out), ct.c_float(offset), ct.c_int(A.numel())) - else: - raise NotImplementedError(f"Not supported data type {A.dtype}") - post_call(device) - - if num_quantiles < 256: - step = round(256 / num_quantiles) - idx = torch.linspace(0, 255, num_quantiles).long().to(A.device) - out = out[idx] - - return out - - -def quantize_blockwise( - A: Tensor, code: Tensor = None, absmax: Tensor = None, out: Tensor = None, blocksize=4096, nested=False -) -> Tensor: - """ - Quantize tensor A in blocks of size 4096 values. - - Quantizes tensor A by dividing it into blocks of 4096 values. - Then the absolute maximum value within these blocks is calculated - for the non-linear quantization. - - Parameters - ---------- - A : torch.Tensor - The input tensor. - code : torch.Tensor - The quantization map. - absmax : torch.Tensor - The absmax values. - out : torch.Tensor - The output tensor (8-bit). - - Returns - ------- - torch.Tensor: - The 8-bit tensor. - tuple(torch.Tensor, torch.Tensor): - The quantization state to undo the quantization. 
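The per-block absolute maxima described above can be reproduced in a few lines of plain PyTorch. This is only a reference for what the kernels compute (the real path additionally maps the scaled values through the 8-bit code); blockwise_absmax is an illustrative name.

import torch

def blockwise_absmax(A, blocksize=4096):
    # per-block absolute maxima; dividing each block by its absmax bounds the
    # block's values to [-1, 1], the range the quantization code expects
    flat = A.reshape(-1).float()
    pad = (-flat.numel()) % blocksize
    if pad:
        flat = torch.cat([flat, flat.new_zeros(pad)])
    return flat.reshape(-1, blocksize).abs().amax(dim=1)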
- """ - - if code is None: - if "dynamic" not in name2qmap: - name2qmap["dynamic"] = create_dynamic_map().to(A.device) - code = name2qmap["dynamic"] - - if absmax is None: - n = A.numel() - blocks = n // blocksize - blocks += 1 if n % blocksize > 0 else 0 - absmax = torch.zeros((blocks,), device=A.device) - - if out is None: - out = torch.zeros_like(A, dtype=torch.uint8) - - if A.device.type != "cpu": - assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] - cblocksize = ct.c_int32(blocksize) - prev_device = pre_call(A.device) - code = code.to(A.device) - is_on_gpu([code, A, out, absmax]) - if A.dtype == torch.float32: - lib.cquantize_blockwise_fp32( - get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()) - ) - elif A.dtype == torch.float16: - lib.cquantize_blockwise_fp16( - get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), cblocksize, ct.c_int(A.numel()) - ) - else: - raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") - post_call(A.device) - else: - # cpu - code = code.cpu() - lib.cquantize_blockwise_cpu_fp32( - get_ptr(code), - get_ptr(A), - get_ptr(absmax), - get_ptr(out), - ct.c_longlong(blocksize), - ct.c_longlong(A.numel()), - ) - - if nested: - offset = absmax.mean() - absmax -= offset - qabsmax, state2 = quantize_blockwise(absmax, blocksize=blocksize, nested=False) - state = [qabsmax, code, blocksize, nested, offset, state2] - else: - state = [absmax, code, blocksize, nested, None, None] - - return out, state - - -def dequantize_blockwise( - A: Tensor, - quant_state: Tuple[Tensor, Tensor] = None, - absmax: Tensor = None, - code: Tensor = None, - out: Tensor = None, - blocksize: int = 4096, - nested=False, -) -> Tensor: - """ - Dequantizes blockwise quantized values. - - Dequantizes the tensor A with maximum absolute values absmax in - blocks of size 4096. - - Parameters - ---------- - A : torch.Tensor - The input 8-bit tensor. - quant_state : tuple(torch.Tensor, torch.Tensor) - Tuple of code and absmax values. - absmax : torch.Tensor - The absmax values. - code : torch.Tensor - The quantization map. - out : torch.Tensor - Dequantized output tensor (default: float32) - - - Returns - ------- - torch.Tensor: - Dequantized tensor (default: float32) - """ - assert quant_state is not None or absmax is not None - if code is None and quant_state is None: - if "dynamic" not in name2qmap: - name2qmap["dynamic"] = create_dynamic_map().to(A.device) - code = name2qmap["dynamic"] - - if out is None: - out = torch.zeros_like(A, dtype=torch.float32) - - if quant_state is None: - quant_state = (absmax, code, blocksize) - assert absmax is not None and out is not None - else: - absmax, code, blocksize, nested, offset, state2 = quant_state - if nested: - absmax = dequantize_blockwise(absmax, state2) - absmax += offset - - if A.device.type != "cpu": - device = pre_call(A.device) - code = code.to(A.device) - if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: - raise ValueError( - f"The blockwise of {blocksize} is not supported. 
Supported values: [2048, 4096, 1024, 512, 256, 128, 64]" - ) - is_on_gpu([A, absmax, out]) - if out.dtype == torch.float32: - lib.cdequantize_blockwise_fp32( - get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()) - ) - elif out.dtype == torch.float16: - lib.cdequantize_blockwise_fp16( - get_ptr(code), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(A.numel()) - ) - else: - raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") - post_call(A.device) - else: - code = code.cpu() - lib.cdequantize_blockwise_cpu_fp32( - get_ptr(quant_state[1]), - get_ptr(A), - get_ptr(quant_state[0]), - get_ptr(out), - ct.c_longlong(blocksize), - ct.c_longlong(A.numel()), - ) - - return out - - -def quantize_fp4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False): - return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "fp4") - - -def quantize_nf4(A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False): - return quantize_4bit(A, absmax, out, blocksize, compress_statistics, "nf4") - - -def quantize_4bit( - A: Tensor, absmax: Tensor = None, out: Tensor = None, blocksize=64, compress_statistics=False, quant_type="fp4" -) -> Tensor: - """ - Quantize tensor A in blocks of 4-bit values. - - Quantizes tensor A by dividing it into blocks which are independently quantized to FP4. - - Parameters - ---------- - A : torch.Tensor - The input tensor. - absmax : torch.Tensor - The absmax values. - out : torch.Tensor - The output tensor (8-bit). - blocksize : int - The blocksize used in quantization. - quant_type : str - The 4-bit quantization data type {fp4, nf4} - - Returns - ------- - torch.Tensor: - The 8-bit tensor with packed 4-bit values. - tuple(torch.Tensor, torch.Size, torch.dtype, int): - The quantization state to undo the quantization. 
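A typical round trip through the 4-bit helpers defined in this file looks as follows. This is a usage sketch under the assumption that a CUDA device is available, since the 4-bit kernels here are CUDA-only.

import torch

if torch.cuda.is_available():
    A = torch.randn(1024, 1024, dtype=torch.float16, device="cuda")
    packed, state = quantize_nf4(A, blocksize=64)  # uint8 output, two 4-bit values per byte
    A_hat = dequantize_nf4(packed, state)          # approximately A, same shape and dtype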
- """ - if A.device.type != "cuda": - raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}") - if quant_type not in ["fp4", "nf4"]: - raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.") - - n = A.numel() - input_shape = A.shape - - if absmax is None: - blocks = n // blocksize - blocks += 1 if n % blocksize > 0 else 0 - absmax = torch.zeros((blocks,), device=A.device) - - if out is None: - out = torch.zeros(((n + 1) // 2, 1), dtype=torch.uint8, device=A.device) - - assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64] - - prev_device = pre_call(A.device) - is_on_gpu([A, out, absmax]) - - if A.dtype == torch.float32: - if quant_type == "fp4": - lib.cquantize_blockwise_fp32_fp4( - get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n) - ) - else: - lib.cquantize_blockwise_fp32_nf4( - get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n) - ) - elif A.dtype == torch.float16: - if quant_type == "fp4": - lib.cquantize_blockwise_fp16_fp4( - get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n) - ) - else: - lib.cquantize_blockwise_fp16_nf4( - get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int32(blocksize), ct.c_int(n) - ) - else: - raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") - post_call(A.device) - - if compress_statistics: - offset = absmax.mean() - absmax -= offset - # code = create_custom_map().to(absmax.device) - # qabsmax, state2 = quantize_blockwise(absmax, code=code, blocksize=256) - qabsmax, state2 = quantize_blockwise(absmax, blocksize=256) - del absmax - state = [qabsmax, input_shape, A.dtype, blocksize, [offset, state2], quant_type] - else: - state = [absmax, input_shape, A.dtype, blocksize, None, quant_type] - - return out, state - - -def dequantize_fp4( - A: Tensor, - quant_state: Tuple[Tensor, Tensor] = None, - absmax: Tensor = None, - out: Tensor = None, - blocksize: int = 64, -) -> Tensor: - return dequantize_4bit(A, quant_state, absmax, out, blocksize, "fp4") - - -def dequantize_nf4( - A: Tensor, - quant_state: Tuple[Tensor, Tensor] = None, - absmax: Tensor = None, - out: Tensor = None, - blocksize: int = 64, -) -> Tensor: - return dequantize_4bit(A, quant_state, absmax, out, blocksize, "nf4") - - -def dequantize_4bit( - A: Tensor, - quant_state: Tuple[Tensor, Tensor] = None, - absmax: Tensor = None, - out: Tensor = None, - blocksize: int = 64, - quant_type="fp4", -) -> Tensor: - """ - Dequantizes FP4 blockwise quantized values. - - Dequantizes the tensor A with maximum absolute values absmax in blocks of size blocksize. - - Parameters - ---------- - A : torch.Tensor - The input 8-bit tensor (packed 4-bit values). - quant_state : tuple(torch.Tensor, torch.Size, torch.dtype) - Tuple of absmax values, original tensor shape and original dtype. - absmax : torch.Tensor - The absmax values. - out : torch.Tensor - Dequantized output tensor. - blocksize : int - The blocksize used in quantization. - quant_type : str - The 4-bit quantization data type {fp4, nf4} - - - Returns - ------- - torch.Tensor: - Dequantized tensor. - """ - if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]: - raise ValueError( - f"The blockwise of {blocksize} is not supported. 
Supported values: [2048, 4096, 1024, 512, 256, 128, 64]" - ) - if quant_type not in ["fp4", "nf4"]: - raise NotImplementedError(f"4-bit quantization data type {quant_type} is not implemented.") - - if quant_state is None: - assert absmax is not None and out is not None - shape = out.shape - dtype = out.dtype - else: - absmax, shape, dtype, blocksize, compressed_stats, quant_type = quant_state - - if compressed_stats is not None: - offset, state2 = compressed_stats - absmax = dequantize_blockwise(absmax, state2) - absmax += offset - - if out is None: - out = torch.empty(shape, dtype=dtype, device=A.device) - - n = out.numel() - - device = pre_call(A.device) - is_on_gpu([A, absmax, out]) - if out.dtype == torch.float32: - if quant_type == "fp4": - lib.cdequantize_blockwise_fp32_fp4( - get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n) - ) - else: - lib.cdequantize_blockwise_fp32_nf4( - get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n) - ) - elif out.dtype == torch.float16: - if quant_type == "fp4": - lib.cdequantize_blockwise_fp16_fp4( - get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n) - ) - else: - lib.cdequantize_blockwise_fp16_nf4( - get_ptr(None), get_ptr(A), get_ptr(absmax), get_ptr(out), ct.c_int(blocksize), ct.c_int(n) - ) - else: - raise ValueError(f"Blockwise quantization only supports 16/32-bit floats, but got {A.dtype}") - post_call(A.device) - - is_transposed = True if A.shape[0] == 1 else False - if is_transposed: - return out.t() - else: - return out - - -def quantize(A: Tensor, code: Tensor = None, out: Tensor = None) -> Tensor: - if code is None: - if "dynamic" not in name2qmap: - name2qmap["dynamic"] = create_dynamic_map().to(A.device) - code = name2qmap["dynamic"] - code = code.to(A.device) - - absmax = torch.abs(A).max() - inp = A / absmax - out = quantize_no_absmax(inp, code, out) - return out, (absmax, code) - - -def dequantize( - A: Tensor, - quant_state: Tuple[Tensor, Tensor] = None, - absmax: Tensor = None, - code: Tensor = None, - out: Tensor = None, -) -> Tensor: - assert quant_state is not None or absmax is not None - if code is None and quant_state is None: - if "dynamic" not in name2qmap: - name2qmap["dynamic"] = create_dynamic_map().to(A.device) - code = name2qmap["dynamic"] - code = code.to(A.device) - - if quant_state is None: - quant_state = (absmax, code) - out = dequantize_no_absmax(A, quant_state[1], out) - return out * quant_state[0] - - -def quantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor: - """ - Quantizes input tensor to 8-bit. - - Quantizes the 32-bit input tensor `A` to the 8-bit output tensor - `out` using the quantization map `code`. - - Parameters - ---------- - A : torch.Tensor - The input tensor. - code : torch.Tensor - The quantization map. - out : torch.Tensor, optional - The output tensor. Needs to be of type byte. - - Returns - ------- - torch.Tensor: - Quantized 8-bit tensor. - """ - prev_device = pre_call(A.device) - if out is None: - out = torch.zeros_like(A, dtype=torch.uint8) - is_on_gpu([A, out]) - lib.cquantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel())) - post_call(prev_device) - return out - - -def dequantize_no_absmax(A: Tensor, code: Tensor, out: Tensor = None) -> Tensor: - """ - Dequantizes the 8-bit tensor to 32-bit. - - Dequantizes the 8-bit tensor `A` to the 32-bit tensor `out` via - the quantization map `code`. 
- - Parameters - ---------- - A : torch.Tensor - The 8-bit input tensor. - code : torch.Tensor - The quantization map. - out : torch.Tensor - The 32-bit output tensor. - - Returns - ------- - torch.Tensor: - 32-bit output tensor. - """ - prev_device = pre_call(A.device) - if out is None: - out = torch.zeros_like(A, dtype=torch.float32) - is_on_gpu([code, A, out]) - lib.cdequantize(get_ptr(code), get_ptr(A), get_ptr(out), ct.c_int(A.numel())) - post_call(prev_device) - return out - - -def optimizer_update_32bit( - optimizer_name: str, - g: Tensor, - p: Tensor, - state1: Tensor, - beta1: float, - eps: float, - step: int, - lr: float, - state2: Tensor = None, - beta2: float = 0.0, - weight_decay: float = 0.0, - gnorm_scale: float = 1.0, - unorm_vec: Tensor = None, - max_unorm: float = 0.0, - skip_zeros=False, -) -> None: - """ - Performs an inplace optimizer update with one or two optimizer states. - - Universal optimizer update for 32-bit state and 32/16-bit gradients/weights. - - Parameters - ---------- - optimizer_name : str - The name of the optimizer: {adam}. - g : torch.Tensor - Gradient tensor. - p : torch.Tensor - Parameter tensor. - state1 : torch.Tensor - Optimizer state 1. - beta1 : float - Optimizer beta1. - eps : float - Optimizer epsilon. - weight_decay : float - Weight decay. - step : int - Current optimizer step. - lr : float - The learning rate. - state2 : torch.Tensor - Optimizer state 2. - beta2 : float - Optimizer beta2. - gnorm_scale : float - The factor to rescale the gradient to the max clip value. - unorm_vec : torch.Tensor - The tensor for the update norm. - max_unorm : float - The maximum update norm relative to the weight norm. - skip_zeros : bool - Whether to skip zero-valued gradients or not (default: False). - """ - - param_norm = 0.0 - if max_unorm > 0.0: - param_norm = torch.norm(p.data.float()) - - optim_func = None - if g.dtype == torch.float32: - optim_func = str2optimizer32bit[optimizer_name][0] - elif g.dtype == torch.float16: - optim_func = str2optimizer32bit[optimizer_name][1] - elif g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name]) == 3: - optim_func = str2optimizer32bit[optimizer_name][2] - else: - raise ValueError( - f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}" - ) - - is_on_gpu([g, p, state1, state2, unorm_vec]) - prev_device = pre_call(g.device) - optim_func( - get_ptr(g), - get_ptr(p), - get_ptr(state1), - get_ptr(state2), - get_ptr(unorm_vec), - ct.c_float(max_unorm), - ct.c_float(param_norm), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_float(weight_decay), - ct.c_int32(step), - ct.c_float(lr), - ct.c_float(gnorm_scale), - ct.c_bool(skip_zeros), - ct.c_int32(g.numel()), - ) - post_call(prev_device) - - -def optimizer_update_8bit( - optimizer_name: str, - g: Tensor, - p: Tensor, - state1: Tensor, - state2: Tensor, - beta1: float, - beta2: float, - eps: float, - step: int, - lr: float, - qmap1: Tensor, - qmap2: Tensor, - max1: Tensor, - max2: Tensor, - new_max1: Tensor, - new_max2: Tensor, - weight_decay: float = 0.0, - gnorm_scale: float = 1.0, - unorm_vec: Tensor = None, - max_unorm: float = 0.0, -) -> None: - """ - Performs an inplace Adam update. - - Universal Adam update for 32/8-bit state and 32/16-bit gradients/weights. - Uses AdamW formulation if weight decay > 0.0. - - Parameters - ---------- - optimizer_name : str - The name of the optimizer. Choices {adam, momentum} - g : torch.Tensor - Gradient tensor. 
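For orientation, the fused 32-bit update above with optimizer_name="adam" corresponds to the standard Adam/AdamW step. The sketch below is a plain-PyTorch reference with illustrative names; it ignores gnorm_scale, max_unorm and skip_zeros, and the exact placement of eps may differ slightly from the kernel.

import torch

def adam_step_reference(p, g, m, v, beta1, beta2, eps, step, lr, weight_decay=0.0):
    # in-place update: m and v play the role of state1 and state2
    m.mul_(beta1).add_(g, alpha=1 - beta1)           # first moment
    v.mul_(beta2).addcmul_(g, g, value=1 - beta2)    # second moment
    m_hat = m / (1 - beta1 ** step)                  # bias correction
    v_hat = v / (1 - beta2 ** step)
    if weight_decay > 0.0:
        p.mul_(1 - lr * weight_decay)                # decoupled (AdamW-style) decay
    p.add_(-lr * m_hat / (v_hat.sqrt() + eps))
    return p, m, v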
- p : torch.Tensor - Parameter tensor. - state1 : torch.Tensor - Adam state 1. - state2 : torch.Tensor - Adam state 2. - beta1 : float - Adam beta1. - beta2 : float - Adam beta2. - eps : float - Adam epsilon. - weight_decay : float - Weight decay. - step : int - Current optimizer step. - lr : float - The learning rate. - qmap1 : torch.Tensor - Quantization map for first Adam state. - qmap2 : torch.Tensor - Quantization map for second Adam state. - max1 : torch.Tensor - Max value for first Adam state update. - max2 : torch.Tensor - Max value for second Adam state update. - new_max1 : torch.Tensor - Max value for the next Adam update of the first state. - new_max2 : torch.Tensor - Max value for the next Adam update of the second state. - gnorm_scale : float - The factor to rescale the gradient to the max clip value. - unorm_vec : torch.Tensor - The tensor for the update norm. - max_unorm : float - The maximum update norm relative to the weight norm. - """ - - param_norm = 0.0 - if max_unorm > 0.0: - param_norm = torch.norm(p.data.float()) - - prev_device = pre_call(g.device) - is_on_gpu([g, p, state1, state2, unorm_vec, qmap1, qmap2, max1, max2, new_max1, new_max2]) - if g.dtype == torch.float32 and state1.dtype == torch.uint8: - str2optimizer8bit[optimizer_name][0]( - get_ptr(p), - get_ptr(g), - get_ptr(state1), - get_ptr(state2), - get_ptr(unorm_vec), - ct.c_float(max_unorm), - ct.c_float(param_norm), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_int32(step), - ct.c_float(lr), - get_ptr(qmap1), - get_ptr(qmap2), - get_ptr(max1), - get_ptr(max2), - get_ptr(new_max1), - get_ptr(new_max2), - ct.c_float(weight_decay), - ct.c_float(gnorm_scale), - ct.c_int32(g.numel()), - ) - elif g.dtype == torch.float16 and state1.dtype == torch.uint8: - str2optimizer8bit[optimizer_name][1]( - get_ptr(p), - get_ptr(g), - get_ptr(state1), - get_ptr(state2), - get_ptr(unorm_vec), - ct.c_float(max_unorm), - ct.c_float(param_norm), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_int32(step), - ct.c_float(lr), - get_ptr(qmap1), - get_ptr(qmap2), - get_ptr(max1), - get_ptr(max2), - get_ptr(new_max1), - get_ptr(new_max2), - ct.c_float(weight_decay), - ct.c_float(gnorm_scale), - ct.c_int32(g.numel()), - ) - else: - raise ValueError( - f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}" - ) - post_call(prev_device) - - -def optimizer_update_8bit_blockwise( - optimizer_name: str, - g: Tensor, - p: Tensor, - state1: Tensor, - state2: Tensor, - beta1: float, - beta2: float, - eps: float, - step: int, - lr: float, - qmap1: Tensor, - qmap2: Tensor, - absmax1: Tensor, - absmax2: Tensor, - weight_decay: float = 0.0, - gnorm_scale: float = 1.0, - skip_zeros=False, -) -> None: - optim_func = None - prev_device = pre_call(g.device) - is_on_gpu([g, p, state1, state2, qmap1, qmap2, absmax1, absmax2]) - if g.dtype == torch.float32 and state1.dtype == torch.uint8: - optim_func = str2optimizer8bit_blockwise[optimizer_name][0] - elif g.dtype == torch.float16 and state1.dtype == torch.uint8: - optim_func = str2optimizer8bit_blockwise[optimizer_name][1] - elif ( - g.dtype == torch.bfloat16 - and state1.dtype == torch.uint8 - and len(str2optimizer8bit_blockwise[optimizer_name]) == 3 - ): - optim_func = str2optimizer8bit_blockwise[optimizer_name][2] - else: - raise ValueError( - f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}" - ) - post_call(prev_device) - - is_on_gpu([p, g, state1, state2, 
qmap1, qmap2, absmax1, absmax2]) - - prev_device = pre_call(g.device) - optim_func( - get_ptr(p), - get_ptr(g), - get_ptr(state1), - get_ptr(state2), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_int32(step), - ct.c_float(lr), - get_ptr(qmap1), - get_ptr(qmap2), - get_ptr(absmax1), - get_ptr(absmax2), - ct.c_float(weight_decay), - ct.c_float(gnorm_scale), - ct.c_bool(skip_zeros), - ct.c_int32(g.numel()), - ) - post_call(prev_device) - - -def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5): - """Applies percentile clipping - - grad: torch.Tensor - The gradient tensor. - gnorm_vec: torch.Tensor - Vector of gradient norms. 100 elements expected. - step: int - The current optimiation steps (number of past gradient norms). - - """ - prev_device = pre_call(grad.device) - is_on_gpu([grad, gnorm_vec]) - if grad.dtype == torch.float32: - lib.cpercentile_clipping_g32( - get_ptr(grad), - get_ptr(gnorm_vec), - ct.c_int32(step), - ct.c_int32(grad.numel()), - ) - elif grad.dtype == torch.float16: - lib.cpercentile_clipping_g16( - get_ptr(grad), - get_ptr(gnorm_vec), - ct.c_int32(step), - ct.c_int32(grad.numel()), - ) - else: - raise ValueError(f"Gradient type {grad.dtype} not supported!") - post_call(prev_device) - - current_gnorm = torch.sqrt(gnorm_vec[step % 100]) - vals, idx = torch.sort(gnorm_vec) - clip_value = torch.sqrt(vals[percentile]) - gnorm_scale = 1.0 - - if current_gnorm > clip_value: - gnorm_scale = clip_value / current_gnorm - - return current_gnorm, clip_value, gnorm_scale - - -def histogram_scatter_add_2d(histogram: Tensor, index1: Tensor, index2: Tensor, source: Tensor): - assert len(histogram.shape) == 2 - assert histogram.dtype == torch.float32 - assert source.dtype == torch.float32 - assert index1.dtype == torch.int32 - assert index2.dtype == torch.int32 - - assert histogram.device.type == "cuda" - assert index1.device.type == "cuda" - assert index2.device.type == "cuda" - assert source.device.type == "cuda" - - maxdim1 = ct.c_int32(histogram.shape[0]) - n = ct.c_int32(index1.numel()) - is_on_gpu([histogram, index1, index2, source]) - lib.chistogram_scatter_add_2d(get_ptr(histogram), get_ptr(index1), get_ptr(index2), get_ptr(source), maxdim1, n) - - -def check_matmul(A, B, out, transposed_A, transposed_B, expected_type=torch.int8): - if not torch.cuda.is_initialized(): - torch.cuda.init() - if A.dtype != expected_type or B.dtype != expected_type: - raise TypeError(f"Expected torch.int8 input tensors A and B, but got {A.dtype} and {B.dtype}") - - sA = A.shape - sB = B.shape - tA = transposed_A - tB = transposed_B - - correct = True - - if len(sA) == 2 and len(sB) == 2: - if not tA and not tB and A.shape[1] != B.shape[0]: - correct = False - elif tA and not tB and A.shape[0] != B.shape[0]: - correct = False - elif tA and tB and A.shape[0] != B.shape[1]: - correct = False - elif not tA and tB and A.shape[1] != B.shape[1]: - correct = False - elif len(sA) == 3 and len(sB) == 2: - if not tA and not tB and A.shape[2] != B.shape[0]: - correct = False - elif tA and not tB and A.shape[1] != B.shape[0]: - correct = False - elif tA and tB and A.shape[1] != B.shape[1]: - correct = False - elif not tA and tB and A.shape[2] != B.shape[1]: - correct = False - elif len(sA) == 3 and len(sB) == 3: - if not tA and not tB and A.shape[2] != B.shape[1]: - correct = False - elif tA and not tB and A.shape[1] != B.shape[1]: - correct = False - elif tA and tB and A.shape[1] != B.shape[2]: - correct = False - elif not tA and tB and A.shape[2] != 
B.shape[2]: - correct = False - - if out is not None: - sout = out.shape - # special case common in backprop - if not correct and len(sA) == 3 and len(sB) == 3: - if sout[0] == sA[2] and sout[1] == sB[2] and sA[0] == sB[0] and sA[1] == sB[1]: - correct = True - else: - if len(sA) == 2 and len(sB) == 2: - if not tA and not tB: - sout = (sA[0], sB[1]) - elif tA and tB: - sout = (sA[1], sB[0]) - elif tA and not tB: - sout = (sA[1], sB[1]) - elif not tA and tB: - sout = (sA[0], sB[0]) - elif len(sA) == 3 and len(sB) == 2: - if not tA and not tB: - sout = (sA[0], sA[1], sB[1]) - elif tA and tB: - sout = (sA[0], sA[2], sB[0]) - elif tA and not tB: - sout = (sA[0], sA[2], sB[1]) - elif not tA and tB: - sout = (sA[0], sA[1], sB[0]) - elif len(sA) == 3 and len(sB) == 3: - if not tA and not tB: - sout = (sA[0], sA[1], sB[2]) - elif tA and tB: - sout = (sA[0], sA[2], sB[1]) - elif tA and not tB: - sout = (sA[0], sA[2], sB[2]) - elif not tA and tB: - sout = (sA[0], sA[1], sB[1]) - - if not correct: - raise ValueError( - f"Tensor dimensions incorrect for matrix mulitiplication: A x B: {sA} x {sB} with transpose for A x B: {tA} x {tB}." - ) - - return sout - - -def cutlass3_gemm(A: Tensor, B: Tensor, out: Tensor = None, transposed_A=False, transposed_B=False, state=None): - # sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype) - if state is None: - Bshape = B.shape - bout = Bshape[1] - else: - Bshape = state[1] - bout = Bshape[0] - if out is None: - out = torch.zeros(size=(A.shape[0], bout), dtype=A.dtype, device=A.device) - - sA = A.shape - sB = B.shape - if transposed_A and len(sA) == 2: - sA = (sA[1], sA[0]) - elif transposed_A and len(sA) == 3: - sA = (sA[0], sA[2], sA[0]) - if transposed_B and len(sB) == 2: - sB = (sB[1], sB[0]) - elif transposed_B and len(sB) == 3: - sB = (sB[0], sB[2], sB[0]) - # this is a mess: cuBLAS expect column major, but PyTorch is row major. 
- # So to perform the matrix multiplication, we have to treat A, B, and C matrices - # (transpose of row major is column major) - # This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these - - # matrices in the input arguments for cuBLAS - # column major: A @ B = C: [m, k] @ [k, n] = [m, n] - # row major: B^T @ A^T = C^T: [m, k] @ [k, n] = [m, n] - # column major with row major layout: B^T @ A^T = C^T: [k, m] @ [n, k] = [n, m] - if len(sB) == 2: - if B.stride()[0] == B.shape[1]: - transposed_B = False - elif B.stride()[1] == B.shape[0]: - transposed_B = True - if len(A.shape) == 2: - if A.stride()[0] == A.shape[1]: - transposed_A = False - elif A.stride()[1] == A.shape[0]: - transposed_A = True - else: - if A.stride()[1] == A.shape[2]: - transposed_A = False - elif A.stride()[2] == A.shape[1]: - transposed_A = True - - if len(sA) == 2: - n = sA[0] - ldb = A.stride()[1 if transposed_A else 0] - elif len(sA) == 3 and len(sB) == 2: - n = sA[0] * sA[1] - ldb = sA[2] - - m = sB[1] - k = sB[0] - lda = B.stride()[0] - ldc = sB[1] - elif len(sB) == 3: - # special case - assert len(sA) == 3 - if not (sA[0] == sB[0] and sA[1] == sB[1]): - raise ValueError( - f"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: {sA} x {sB}" - ) - - transposed_A = True - transposed_B = False - - m = sB[2] - n = sA[2] - k = sB[0] * sB[1] - - lda = n - ldb = sA[2] - ldc = m - - ptr = CUBLAS_Context.get_instance().get_context(A.device) - - # B^T @ A^T = C^T - # [km, nk -> mn] - # lda = ldb = ldc = 1 - # lda = 1 - if state is not None: - m = Bshape[0] - k = Bshape[1] - lda = Bshape[0] - ldc = Bshape[0] - ldb = (ldb + 1) // 2 - # print(m, n, k, lda, ldb, ldc) - is_on_gpu([B, A, out]) - m = ct.c_int32(m) - n = ct.c_int32(n) - k = ct.c_int32(k) - lda = ct.c_int32(lda) - ldb = ct.c_int32(ldb) - ldc = ct.c_int32(ldc) - - if B.dtype == torch.uint8: - lib.cgemm_4bit_inference( - m, n, k, get_ptr(A), get_ptr(B), get_ptr(state[0]), get_ptr(out), lda, ldb, ldc, ct.c_int32(state[3]) - ) - elif A.dtype == torch.float32: - lib.cgemm_host_fp32(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc) - elif A.dtype == torch.float16: - lib.cgemm_host_fp16(m, n, k, get_ptr(A), get_ptr(B), get_ptr(out), lda, ldb, ldc) - else: - raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}") - - return out - - -def igemm( - A: Tensor, - B: Tensor, - out: Tensor = None, - transposed_A=False, - transposed_B=False, -): - sout = check_matmul(A, B, out, transposed_A, transposed_B) - if out is None: - out = torch.zeros(size=sout, dtype=torch.int32, device=A.device) - if len(A.shape) == 3 and len(B.shape) == 3: - if A.shape[0] == B.shape[0] and A.shape[2] == B.shape[1]: - return batched_igemm(A, B, out) - - sA = A.shape - sB = B.shape - if transposed_A and len(sA) == 2: - sA = (sA[1], sA[0]) - elif transposed_A and len(sA) == 3: - sA = (sA[0], sA[2], sA[0]) - if transposed_B and len(sB) == 2: - sB = (sB[1], sB[0]) - elif transposed_B and len(sB) == 3: - sB = (sB[0], sB[2], sB[0]) - # this is a mess: cuBLAS expect column major, but PyTorch is row major. 
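The layout mismatch noted above is resolved with the identity A @ B = (B^T @ A^T)^T, which is why the operands are swapped before calling cuBLAS, as the rest of the comment explains. A quick numerical check of that identity:

import torch

A = torch.randn(3, 4)
B = torch.randn(4, 5)
# row-major A @ B equals the transpose of the "column-major style" product B^T @ A^T
assert torch.allclose(A @ B, (B.t() @ A.t()).t(), atol=1e-6)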
- # So to perform the matrix multiplication, we have to treat A, B, and C matrices - # (transpose of row major is column major) - # This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these - - # matrices in the input arguments for cuBLAS - # column major: A @ B = C: [m, k] @ [k, n] = [m, n] - # row major: B^T @ A^T = C^T: [m, k] @ [k, n] = [m, n] - # column major with row major layout: B^T @ A^T = C^T: [k, m] @ [n, k] = [n, m] - if len(sB) == 2: - if B.stride()[0] == B.shape[1]: - transposed_B = False - elif B.stride()[1] == B.shape[0]: - transposed_B = True - if len(A.shape) == 2: - if A.stride()[0] == A.shape[1]: - transposed_A = False - elif A.stride()[1] == A.shape[0]: - transposed_A = True - else: - if A.stride()[1] == A.shape[2]: - transposed_A = False - elif A.stride()[2] == A.shape[1]: - transposed_A = True - - if len(sA) == 2: - n = sA[0] - ldb = A.stride()[1 if transposed_A else 0] - elif len(sA) == 3 and len(sB) == 2: - n = sA[0] * sA[1] - ldb = sA[2] - - m = sB[1] - k = sB[0] - lda = B.stride()[(1 if transposed_B else 0)] - ldc = sB[1] - elif len(sB) == 3: - # special case - assert len(sA) == 3 - if not (sA[0] == sB[0] and sA[1] == sB[1]): - raise ValueError( - f"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: {sA} x {sB}" - ) - - transposed_A = True - transposed_B = False - - m = sB[2] - n = sA[2] - k = sB[0] * sB[1] - - lda = m - ldb = sA[2] - ldc = m - - ptr = CUBLAS_Context.get_instance().get_context(A.device) - - # B^T @ A^T = C^T - # [km, nk -> mn] - is_on_gpu([B, A, out]) - lib.cigemm( - ptr, - ct.c_bool(transposed_B), - ct.c_bool(transposed_A), - ct.c_int32(m), - ct.c_int32(n), - ct.c_int32(k), - get_ptr(B), - get_ptr(A), - get_ptr(out), - ct.c_int32(lda), - ct.c_int32(ldb), - ct.c_int32(ldc), - ) - return out - - -def batched_igemm( - A: Tensor, - B: Tensor, - out: Tensor = None, - transposed_A=False, - transposed_B=False, -): - if not len(A.shape) == 3 or not len(B.shape) == 3: - raise ValueError(f"Expected 3-dimensional tensors for bmm, but got shapes A and B: {A.shape} and {B.shape}") - sout = check_matmul(A, B, out, transposed_A, transposed_B) - if out is None: - out = torch.zeros(size=sout, dtype=torch.int32, device=A.device) - - if B.is_contiguous(): - lda = B.stride()[1] - transposed_A = False - else: - s = B.stride() - if s[0] != B.shape[0]: - B = B.contiguous() - lda = B.stride()[1] - elif s[2] == B.shape[1]: - transposed_A = True - lda = B.stride()[2] - else: - if s[2] == 1: - B = B.contiguous() - lda = B.stride()[1] - elif s[1] == 1: - B = B.contiguous() - lda = B.stride()[1] - else: - B = B.contiguous() - lda = B.stride()[1] - - if A.is_contiguous(): - ldb = A.stride()[1] - transposed_B = False - else: - s = A.stride() - if s[0] != A.shape[0]: - A = A.contiguous() - ldb = A.stride()[1] - transposed_B = False - elif s[2] == A.shape[1]: - ldb = A.stride()[2] - transposed_B = True - else: - A = A.contiguous() - ldb = A.stride()[1] - transposed_B = False - - # this is a mess: cuBLAS expect column major, but PyTorch is row major. 
- # So to perform the matrix multiplication, we have to treat A, B, and C matrices - # (transpose of row major is column major) - # This means we compute B^T A^T = C^T and we explicitly switch the dimensions of each of these - # matrices in the input arguments for cuBLAS - - # column major: A @ B = C: [batch, m, k] @ [batch, k, n] = [batch, m, n] - # row major: B^T @ A^T = C^T: [batch, m, k] @ [batch, k, n] = [batch, m, n] - # column major with row major layout: B^T @ A^T = C^T: [batch, k, m] @ [batch, n, k] = [batch, n, m] - num_batch = A.shape[0] - n = A.shape[1] - m = B.shape[2] - k = B.shape[1] - - ldc = m - - strideA = B.shape[1] * B.shape[2] - strideB = A.shape[1] * A.shape[2] - strideC = A.shape[1] * B.shape[2] - - ptr = CUBLAS_Context.get_instance().get_context(A.device) - - is_on_gpu([B, A, out]) - lib.cbatched_igemm( - ptr, - ct.c_bool(transposed_B), - ct.c_bool(transposed_A), - ct.c_int32(m), - ct.c_int32(n), - ct.c_int32(k), - get_ptr(B), - get_ptr(A), - get_ptr(out), - ct.c_int32(lda), - ct.c_int32(ldb), - ct.c_int32(ldc), - ct.c_long(strideA), - ct.c_long(strideB), - ct.c_long(strideC), - ct.c_uint32(num_batch), - ) - return out - - -def igemmlt(A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): - shapeA = SA[0] - shapeB = SB[0] - dimsA = len(shapeA) - dimsB = len(shapeB) - assert dimsB == 2, "Only two dimensional matrices are supported for argument B" - if dimsA == 2: - m = shapeA[0] - elif dimsA == 3: - m = shapeA[0] * shapeA[1] - - rows = n = shapeB[0] - assert prod(list(shapeA)) > 0, f"Input tensor dimensions need to be > 0: {shapeA}" - - # if the tensor is empty, return a transformed empty tensor with the right dimensions - if shapeA[0] == 0 and dimsA == 2: - return torch.empty((0, shapeB[0]), device=A.device, dtype=torch.float16) - elif shapeA[1] == 0 and dimsA == 3: - return torch.empty(tuple(shapeA[:2] + [shapeB[0]]), device=A.device, dtype=torch.float16) - - if dimsA == 2 and out is None: - out, Sout = get_transform_buffer((shapeA[0], shapeB[0]), dtype, A.device, "col32", "row") - elif dimsA == 3 and out is None: - out, Sout = get_transform_buffer((shapeA[0], shapeA[1], shapeB[0]), dtype, A.device, "col32", "row") - - assert dimsB != 3, "len(B.shape)==3 not supported" - assert A.device.type == "cuda" - assert B.device.type == "cuda" - assert A.dtype == torch.int8 - assert B.dtype == torch.int8 - assert out.dtype == dtype - assert SA[1] == "col32" - assert SB[1] in ["col_turing", "col_ampere"] - assert Sout[1] == "col32" - assert ( - shapeA[-1] == shapeB[-1] - ), f"Matmullt only supports A @ B^T. 
Inner matrix dimensions do not match: A @ B = {shapeA} @ {shapeB}" - formatB = SB[1] - prev_device = A.device - torch.cuda.set_device(A.device) - - ptr = CUBLAS_Context.get_instance().get_context(A.device) - ptrA = get_ptr(A) - ptrB = get_ptr(B) - ptrC = get_ptr(out) - - k = shapeA[-1] - lda = ct.c_int32(m * 32) - if formatB == "col_turing": - # turing: tiles with rows filled up to multiple of 8 rows by 32 columns - # n = rows - ldb = ct.c_int32(((rows + 7) // 8) * 8 * 32) - else: - # ampere: tiles with rows filled up to multiple of 32 rows by 32 columns - # n = rows - ldb = ct.c_int32(((rows + 31) // 32) * 32 * 32) - - ldc = ct.c_int32(m * 32) - m = ct.c_int32(m) - n = ct.c_int32(n) - k = ct.c_int32(k) - - has_error = 0 - ptrRowScale = get_ptr(None) - is_on_gpu([A, B, out]) - if formatB == "col_turing": - if dtype == torch.int32: - has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) - else: - has_error = lib.cigemmlt_turing_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) - elif formatB == "col_ampere": - if dtype == torch.int32: - has_error = lib.cigemmlt_ampere_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) - else: - has_error = lib.cigemmlt_ampere_8(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc) - - if has_error == 1: - print(f"A: {shapeA}, B: {shapeB}, C: {Sout[0]}; (lda, ldb, ldc): {(lda, ldb, ldc)}; (m, n, k): {(m, n, k)}") - raise Exception("cublasLt ran into an error!") - - torch.cuda.set_device(prev_device) - - return out, Sout - - -def mm_dequant(A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None): - assert A.dtype == torch.int32 - if bias is not None: - assert bias.dtype == torch.float16 - out_shape = quant_state[0] - if len(out_shape) == 3: - out_shape = (out_shape[0] * out_shape[1], out_shape[2]) - - if out is None: - out = torch.empty(out_shape, dtype=torch.float16, device=A.device) - if new_row_stats is None: - new_row_stats = torch.empty(out_shape[0], dtype=torch.float32, device=A.device) - if new_col_stats is None: - new_col_stats = torch.empty(out_shape[1], dtype=torch.float32, device=A.device) - assert new_row_stats.shape[0] == row_stats.shape[0], f"{new_row_stats.shape} vs {row_stats.shape}" - assert new_col_stats.shape[0] == col_stats.shape[0], f"{new_col_stats.shape} vs {col_stats.shape}" - - prev_device = pre_call(A.device) - ptrA = get_ptr(A) - ptrOut = get_ptr(out) - ptrRowStats = get_ptr(row_stats) - ptrColStats = get_ptr(col_stats) - ptrNewRowStats = get_ptr(new_row_stats) - ptrNewColStats = get_ptr(new_col_stats) - ptrBias = get_ptr(bias) - numRows = ct.c_int32(out_shape[0]) - numCols = ct.c_int32(out_shape[1]) - - is_on_gpu([A, row_stats, col_stats, out, new_row_stats, new_col_stats, bias]) - lib.cdequant_mm_int32_fp16( - ptrA, ptrRowStats, ptrColStats, ptrOut, ptrNewRowStats, ptrNewColStats, ptrBias, numRows, numCols - ) - post_call(prev_device) - - return out - - -def get_colrow_absmax(A, row_stats=None, col_stats=None, nnz_block_ptr=None, threshold=0.0): - assert A.dtype == torch.float16 - device = A.device - - cols = A.shape[-1] - if len(A.shape) == 3: - rows = A.shape[0] * A.shape[1] - else: - rows = A.shape[0] - - col_tiles = (cols + 255) // 256 - tiled_rows = ((rows + 15) // 16) * 16 - if row_stats is None: - row_stats = torch.empty((rows,), dtype=torch.float32, device=device).fill_(-50000.0) - if col_stats is None: - col_stats = torch.empty((cols,), dtype=torch.float32, device=device).fill_(-50000.0) - - if nnz_block_ptr 
is None and threshold > 0.0: - nnz_block_ptr = torch.zeros(((tiled_rows * col_tiles) + 1,), dtype=torch.int32, device=device) - - ptrA = get_ptr(A) - ptrRowStats = get_ptr(row_stats) - ptrColStats = get_ptr(col_stats) - ptrNnzrows = get_ptr(nnz_block_ptr) - rows = ct.c_int32(rows) - cols = ct.c_int32(cols) - - prev_device = pre_call(A.device) - is_on_gpu([A, row_stats, col_stats, nnz_block_ptr]) - lib.cget_col_row_stats(ptrA, ptrRowStats, ptrColStats, ptrNnzrows, ct.c_float(threshold), rows, cols) - post_call(prev_device) - - if threshold > 0.0: - nnz_block_ptr.cumsum_(0) - - return row_stats, col_stats, nnz_block_ptr - - -class COOSparseTensor: - def __init__(self, rows, cols, nnz, rowidx, colidx, values): - assert rowidx.dtype == torch.int32 - assert colidx.dtype == torch.int32 - assert values.dtype == torch.float16 - assert values.numel() == nnz - assert rowidx.numel() == nnz - assert colidx.numel() == nnz - - self.rows = rows - self.cols = cols - self.nnz = nnz - self.rowidx = rowidx - self.colidx = colidx - self.values = values - - -class CSRSparseTensor: - def __init__(self, rows, cols, nnz, rowptr, colidx, values): - assert rowptr.dtype == torch.int32 - assert colidx.dtype == torch.int32 - assert values.dtype == torch.float16 - assert values.numel() == nnz - assert colidx.numel() == nnz - assert rowptr.numel() == rows + 1 - - self.rows = rows - self.cols = cols - self.nnz = nnz - self.rowptr = rowptr - self.colidx = colidx - self.values = values - - -class CSCSparseTensor: - def __init__(self, rows, cols, nnz, colptr, rowidx, values): - assert colptr.dtype == torch.int32 - assert rowidx.dtype == torch.int32 - assert values.dtype == torch.float16 - assert values.numel() == nnz - assert rowidx.numel() == nnz - assert colptr.numel() == cols + 1 - - self.rows = rows - self.cols = cols - self.nnz = nnz - self.colptr = colptr - self.rowidx = rowidx - self.values = values - - -def coo2csr(cooA): - values, counts = torch.unique(cooA.rowidx, return_counts=True) - values.add_(1) - rowptr = torch.zeros((cooA.rows + 1,), dtype=torch.int32, device=cooA.rowidx.device) - rowptr.scatter_(index=values.long(), src=counts.int(), dim=0) - rowptr.cumsum_(0) - return CSRSparseTensor(cooA.rows, cooA.cols, cooA.nnz, rowptr, cooA.colidx, cooA.values) - - -def coo2csc(cooA): - val, col2rowidx = torch.sort(cooA.colidx) - rowidx = cooA.rowidx[col2rowidx] - values = cooA.values[col2rowidx] - colvalues, counts = torch.unique(val, return_counts=True) - colvalues.add_(1) - colptr = torch.zeros((cooA.cols + 1,), dtype=torch.int32, device=cooA.colidx.device) - colptr.scatter_(index=colvalues.long(), src=counts.int(), dim=0) - colptr.cumsum_(0) - return CSCSparseTensor(cooA.rows, cooA.cols, cooA.nnz, colptr, rowidx, values) - - -def coo_zeros(rows, cols, nnz, device, dtype=torch.half): - rowidx = torch.zeros((nnz,), dtype=torch.int32, device=device) - colidx = torch.zeros((nnz,), dtype=torch.int32, device=device) - values = torch.zeros((nnz,), dtype=dtype, device=device) - return COOSparseTensor(rows, cols, nnz, rowidx, colidx, values) - - -def double_quant(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0): - device = A.device - assert A.dtype == torch.half - assert device.type == "cuda" - prev_device = pre_call(A.device) - - cols = A.shape[-1] - if len(A.shape) == 3: - rows = A.shape[0] * A.shape[1] - else: - rows = A.shape[0] - - if row_stats is None or col_stats is None: - row_stats, col_stats, nnz_row_ptr = get_colrow_absmax(A, threshold=threshold) - - if out_col is None: - out_col = 
torch.zeros(A.shape, device=device, dtype=torch.int8) - if out_row is None: - out_row = torch.zeros(A.shape, device=device, dtype=torch.int8) - - coo_tensor = None - ptrA = get_ptr(A) - ptrColStats = get_ptr(col_stats) - ptrRowStats = get_ptr(row_stats) - ptrOutCol = get_ptr(out_col) - ptrOutRow = get_ptr(out_row) - - is_on_gpu([A, col_stats, row_stats, out_col, out_row]) - if threshold > 0.0: - nnz = nnz_row_ptr[-1].item() - if nnz > 0: - coo_tensor = coo_zeros(A.shape[0], A.shape[1], nnz_row_ptr[-1].item(), device) - ptrRowIdx = get_ptr(coo_tensor.rowidx) - ptrColIdx = get_ptr(coo_tensor.colidx) - ptrVal = get_ptr(coo_tensor.values) - ptrRowPtr = get_ptr(nnz_row_ptr) - - lib.cdouble_rowcol_quant( - ptrA, - ptrRowStats, - ptrColStats, - ptrOutCol, - ptrOutRow, - ptrRowIdx, - ptrColIdx, - ptrVal, - ptrRowPtr, - ct.c_float(threshold), - ct.c_int32(rows), - ct.c_int32(cols), - ) - val, idx = torch.sort(coo_tensor.rowidx) - coo_tensor.rowidx = val - coo_tensor.colidx = coo_tensor.colidx[idx] - coo_tensor.values = coo_tensor.values[idx] - else: - lib.cdouble_rowcol_quant( - ptrA, - ptrRowStats, - ptrColStats, - ptrOutCol, - ptrOutRow, - None, - None, - None, - None, - ct.c_float(0.0), - ct.c_int32(rows), - ct.c_int32(cols), - ) - else: - lib.cdouble_rowcol_quant( - ptrA, - ptrRowStats, - ptrColStats, - ptrOutCol, - ptrOutRow, - None, - None, - None, - None, - ct.c_float(threshold), - ct.c_int32(rows), - ct.c_int32(cols), - ) - post_call(prev_device) - - return out_row, out_col, row_stats, col_stats, coo_tensor - - -def transform(A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None): - prev_device = pre_call(A.device) - if state is None: - state = (A.shape, from_order) - else: - from_order = state[1] - if out is None: - out, new_state = get_transform_buffer(state[0], A.dtype, A.device, to_order, state[1], transpose) - else: - new_state = (state[0], to_order) # (shape, order) - - shape = state[0] - if len(shape) == 2: - dim1 = ct.c_int32(shape[0]) - dim2 = ct.c_int32(shape[1]) - else: - dim1 = ct.c_int32(shape[0] * shape[1]) - dim2 = ct.c_int32(shape[2]) - - is_on_gpu([A, out]) - if to_order == "col32": - if transpose: - lib.ctransform_row2col32T(get_ptr(A), get_ptr(out), dim1, dim2) - else: - lib.ctransform_row2col32(get_ptr(A), get_ptr(out), dim1, dim2) - elif to_order == "col_turing": - if transpose: - lib.ctransform_row2turingT(get_ptr(A), get_ptr(out), dim1, dim2) - else: - lib.ctransform_row2turing(get_ptr(A), get_ptr(out), dim1, dim2) - elif to_order == "col_ampere": - if transpose: - lib.ctransform_row2ampereT(get_ptr(A), get_ptr(out), dim1, dim2) - else: - lib.ctransform_row2ampere(get_ptr(A), get_ptr(out), dim1, dim2) - elif to_order == "row": - if from_order == "col_turing": - lib.ctransform_turing2row(get_ptr(A), get_ptr(out), dim1, dim2) - elif from_order == "col_ampere": - lib.ctransform_ampere2row(get_ptr(A), get_ptr(out), dim1, dim2) - else: - raise NotImplementedError(f"Transform function not implemented: From {from_order} to {to_order}") - - post_call(prev_device) - - return out, new_state - - -def spmm_coo(cooA, B, out=None): - if out is None: - out = torch.empty((cooA.rows, B.shape[1]), device=B.device, dtype=B.dtype) - nnz = cooA.nnz - assert cooA.rowidx.numel() == nnz - assert cooA.colidx.numel() == nnz - assert cooA.values.numel() == nnz - assert cooA.cols == B.shape[0] - - transposed_B = False if B.is_contiguous() else True - - ldb = B.stride()[(1 if transposed_B else 0)] - ldc = B.shape[1] - - ptr = Cusparse_Context.get_instance().context - - 
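As a correctness reference for spmm_coo (not the cuSPARSE path itself), the same product can be formed from the COO fields with PyTorch's built-in sparse support. The helper name is illustrative; float32 is used because half-precision sparse matmul support varies by build.

import torch

def spmm_coo_reference(cooA, B):
    # dense-output reference built from the same rowidx/colidx/values fields
    indices = torch.stack([cooA.rowidx.long(), cooA.colidx.long()])
    S = torch.sparse_coo_tensor(indices, cooA.values.float(), (cooA.rows, cooA.cols))
    return torch.sparse.mm(S, B.float()).to(B.dtype)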
ptrRowidx = get_ptr(cooA.rowidx) - ptrColidx = get_ptr(cooA.colidx) - ptrValues = get_ptr(cooA.values) - ptrB = get_ptr(B) - ptrC = get_ptr(out) - cnnz = ct.c_int32(cooA.nnz) - crowsA = ct.c_int32(cooA.rows) - ccolsA = ct.c_int32(cooA.cols) - ccolsB = ct.c_int32(B.shape[1]) - cldb = ct.c_int32(ldb) - cldc = ct.c_int32(ldc) - - is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out]) - lib.cspmm_coo( - ptr, - ptrRowidx, - ptrColidx, - ptrValues, - cnnz, - crowsA, - ccolsA, - ccolsB, - cldb, - ptrB, - cldc, - ptrC, - ct.c_bool(transposed_B), - ) - - return out - - -def spmm_coo_very_sparse(cooA, B, dequant_stats=None, out=None): - if out is None: - out = torch.zeros((cooA.rows, B.shape[1]), device=B.device, dtype=cooA.values.dtype) - nnz = cooA.nnz - prev_device = pre_call(B.device) - assert cooA.rowidx.numel() == nnz - assert cooA.colidx.numel() == nnz - assert cooA.values.numel() == nnz - assert cooA.cols == B.shape[0], f"{cooA.cols} vs {B.shape}" - - transposed_B = False if B.is_contiguous() else True - - ldb = B.stride()[(1 if transposed_B else 0)] - ldc = B.shape[1] - - values, counts = torch.unique(cooA.rowidx, return_counts=True) - offset = counts.cumsum(0).int() - max_count, max_idx = torch.sort(counts, descending=True) - max_idx = max_idx.int() - max_count = max_count.int() - assert max_count[0] <= 32, f"Current max count per row is 8 but found {max_count[0]}." - assert B.dtype in [torch.float16, torch.int8] - ptrOffset = get_ptr(offset) - ptrMaxCount = get_ptr(max_count) - ptrMaxIdx = get_ptr(max_idx) - - ptrRowidx = get_ptr(cooA.rowidx) - ptrColidx = get_ptr(cooA.colidx) - ptrValues = get_ptr(cooA.values) - ptrB = get_ptr(B) - ptrC = get_ptr(out) - ptrDequantStats = get_ptr(dequant_stats) - cnnz_rows = ct.c_int32(counts.numel()) - cnnz = ct.c_int32(cooA.nnz) - crowsA = ct.c_int32(cooA.rows) - ccolsA = ct.c_int32(cooA.cols) - crowsB = ct.c_int32(B.shape[1]) - ccolsB = ct.c_int32(B.shape[1]) - cldb = ct.c_int32(ldb) - cldc = ct.c_int32(ldc) - - is_on_gpu([cooA.rowidx, cooA.colidx, cooA.values, B, out, dequant_stats]) - if B.dtype == torch.float16: - lib.cspmm_coo_very_sparse_naive_fp16( - ptrMaxCount, - ptrMaxIdx, - ptrOffset, - ptrRowidx, - ptrColidx, - ptrValues, - ptrB, - ptrC, - ptrDequantStats, - cnnz_rows, - cnnz, - crowsA, - crowsB, - ccolsB, - ) - elif B.dtype == torch.int8: - lib.cspmm_coo_very_sparse_naive_int8( - ptrMaxCount, - ptrMaxIdx, - ptrOffset, - ptrRowidx, - ptrColidx, - ptrValues, - ptrB, - ptrC, - ptrDequantStats, - cnnz_rows, - cnnz, - crowsA, - crowsB, - ccolsB, - ) - # else: assertion error - post_call(prev_device) - - return out - - -C = 127.0 - - -def vectorwise_quant(x, dim=1, quant_type="vector"): - if quant_type == "linear": - max1 = torch.abs(x).max().float() - xq = torch.round(x / max1 * 127).to(torch.int8) - return xq, max1 - elif quant_type in ["vector", "row"]: - max1 = torch.amax(torch.abs(x), dim=dim, keepdim=True) - xq = torch.round(x * (C / max1)).to(torch.int8) - return xq, max1 - elif quant_type == "zeropoint": - dtype = x.dtype - x = x.float() - dyna = x.max() - x.min() - if dyna == 0: - dyna = 1 - qx = 255.0 / dyna - minx = x.min() - zpx = torch.round(minx * qx) - x = torch.round(qx * x - zpx) + zpx - return x, qx - elif quant_type in ["vector-zeropoint", "row-zeropoint"]: - dtype = x.dtype - x = x.float() - dyna = torch.amax(x, dim=dim, keepdim=True) - torch.amin(x, dim=dim, keepdim=True) - dyna[dyna == 0] = 1 - qx = 255.0 / dyna - minx = torch.amin(x, dim=dim, keepdim=True) - zpx = torch.round(minx * qx) - x = torch.round(qx * x - zpx) + 
zpx - return x, qx - elif quant_type == "truncated-vector": - with torch.no_grad(): - absx = torch.abs(x) - max1 = torch.amax(absx, dim=dim, keepdim=True) - max1 = max1 * 0.7 - idx = absx > max1.expand_as(absx) - sign = torch.sign(x[idx]) - x[idx] = max1.expand_as(absx)[idx] * sign - xq = torch.round(x / max1 * C).to(torch.int8) - return xq, max1 - else: - return None - - -def vectorwise_dequant(xq, max1, quant_type="vector"): - if quant_type == "vector": - x = (xq / C * max1).to(torch.float32) - return x - else: - return None - - -def vectorwise_mm_dequant(xq, S1, S2, dtype=torch.half, quant_type="vector"): - if quant_type == "linear": - norm = S1 * S2 / (C * C) - # double cast needed to prevent overflows - return (xq.float() * norm).to(dtype) - elif quant_type == "zeropoint": - norm = 1.0 / (S1 * S2) - return (xq.float() * norm).to(dtype) - elif quant_type == "row-zeropoint": - norm = 1.0 / (S1 * S2) - x = xq.float() - if len(S1.shape) == 3 and len(x.shape) == 2: - S1 = S1.squeeze(0) - if len(S2.shape) == 3 and len(x.shape) == 2: - S2 = S2.squeeze(0) - if len(S1.shape) == 2: - x *= norm - else: - x *= norm - return x.to(dtype) - elif quant_type == "vector-zeropoint": - x = xq.float() - if len(S1.shape) == 3 and len(x.shape) == 2: - S1 = S1.squeeze(0) - if len(S2.shape) == 3 and len(x.shape) == 2: - S2 = S2.squeeze(0) - if len(S1.shape) == 2: - x *= 1.0 / S1 - else: - x *= 1.0 / S1 - x *= 1.0 / S2.t() - return x.to(dtype) - elif quant_type == "row": - x = xq.float() - if len(S1.shape) == 3 and len(x.shape) == 2: - S1 = S1.squeeze(0) - if len(S2.shape) == 3 and len(x.shape) == 2: - S2 = S2.squeeze(0) - if len(S1.shape) == 2: - x *= S1 * S2 / (C * C) - else: - x *= S1 * S2 / (C * C) - return x.to(dtype) - elif quant_type in ["truncated-vector", "vector"]: - x = xq.float() - if len(S1.shape) == 3 and len(x.shape) == 2: - S1 = S1.squeeze(0) - if len(S2.shape) == 3 and len(x.shape) == 2: - S2 = S2.squeeze(0) - if len(S1.shape) == 2: - x *= S1 / C - else: - x *= S1 / C - x *= S2 / C - return x.to(dtype) - else: - return None - - -def dequant_min_max(xq, A, B, SA, SB, dtype=torch.half): - offset = B.float().t().sum(0) * (SA[0] + SA[1]) - x = xq.float() - if len(xq.shape) == 2 and len(SB.shape) == 3: - SB = SB.squeeze(0) - if len(SB.shape) == 2: - x *= SB.t() / 127 - else: - x *= SB / 127 - x *= SA[1] / 127 - x += offset - return x.to(dtype) - - -def extract_outliers(A, SA, idx): - shapeA = SA[0] - formatA = SA[1] - assert formatA in ["col_turing", "col_ampere"] - assert A.device.type == "cuda" - - out = torch.zeros((shapeA[0], idx.numel()), dtype=torch.int8, device=A.device) - - idx_size = ct.c_int32(idx.numel()) - rows = ct.c_int32(shapeA[0]) - cols = ct.c_int32(shapeA[1]) - ptrA = get_ptr(A) - ptrIdx = get_ptr(idx) - ptrOut = get_ptr(out) - - prev_device = pre_call(A.device) - if formatA == "col_turing": - lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols) - elif formatA == "col_ampere": - lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols) - post_call(prev_device) - - return out - - -def pipeline_test(A, batch_size): - out = torch.zeros_like(A) - lib.cpipeline_test(get_ptr(A), get_ptr(out), ct.c_size_t(A.numel()), ct.c_size_t(batch_size)) - return out diff --git a/bitsandbytes/autograd/_functions.py b/bitsandbytes/autograd/_functions.py index 18ca66b17..71943915b 100644 --- a/bitsandbytes/autograd/_functions.py +++ b/bitsandbytes/autograd/_functions.py @@ -95,6 +95,9 @@ def undo_layout(permuted_tensor: torch.Tensor, tile_indices: torch.LongTensor) - 
:param tile_indices: reverse transformation indices, from get_inverse_transform_indices :return: contiguous row-major tensor """ + # CPU has no change on layout + if permuted_tensor.device.type == "cpu": + return permuted_tensor (rows, cols), (tile_rows, tile_cols) = permuted_tensor.shape, tile_indices.shape assert rows % tile_rows == cols % tile_cols == 0, "tensor must contain a whole number of tiles" tensor = permuted_tensor.reshape(-1, tile_indices.numel()).t() @@ -218,6 +221,8 @@ def backward(ctx, grad_output): def supports_igemmlt(device: torch.device) -> bool: """check if this device supports the optimized int8 kernel""" + if device == torch.device("cpu"): + return True if torch.version.hip: return False if BNB_HIP_VERSION < 601 else True if torch.cuda.get_device_capability(device=device) < (7, 5): @@ -315,13 +320,16 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState): state.outlier_pool = GlobalOutlierPooler.get_instance() # Cast A to fp16 - if A.dtype != torch.float16: - warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization") + A_dtype = torch.float16 + if A.device == torch.device("cpu"): + A_dtype = torch.bfloat16 + if A.dtype != A_dtype: + warnings.warn(f"MatMul8bitLt: inputs will be cast from {A.dtype} to {A_dtype} during quantization") # 1. Quantize A if len(A.shape) == 3: A = A.reshape(-1, A.shape[-1]) - CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A.to(torch.float16), threshold=state.threshold) + CA, CAt, SCA, SCAt, coo_tensorA = F.double_quant(A.to(A_dtype), threshold=state.threshold) if state.threshold > 0.0 and coo_tensorA is not None: if state.has_fp16_weights: @@ -396,7 +404,7 @@ def forward(ctx, A, B, out=None, bias=None, state=MatmulLtState): if using_igemmlt: C32A, SA = F.transform(CA, "col32") out32, Sout32 = F.igemmlt(C32A, state.CxB, SA, state.SB) - if bias is None or bias.dtype == torch.float16: + if bias is None or bias.dtype == A_dtype: # we apply the fused bias here output = F.mm_dequant(out32, Sout32, SCA, state.SCB, bias=bias) output = output.to(A.dtype) diff --git a/bitsandbytes/backends/base.py b/bitsandbytes/backends/base.py index 8232d17c1..2e73c3d6e 100644 --- a/bitsandbytes/backends/base.py +++ b/bitsandbytes/backends/base.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Optional, Tuple +from typing import Literal, Optional, Tuple, Union import torch @@ -12,11 +12,11 @@ class Backend(ABC): @abstractmethod def double_quant( self, - A, - col_stats=None, - row_stats=None, - out_col=None, - out_row=None, + A: torch.Tensor, + col_stats: Optional[torch.Tensor] = None, + row_stats: Optional[torch.Tensor] = None, + out_col: Optional[torch.Tensor] = None, + out_row: Optional[torch.Tensor] = None, threshold=0.0, ): raise NotImplementedError @@ -24,36 +24,50 @@ def double_quant( @abstractmethod def transform( self, - A, - to_order, + A: torch.Tensor, + to_order: str, from_order="row", - out=None, + out: Optional[torch.Tensor] = None, transpose=False, - state=None, + state: Optional[Tuple[torch.Size, str]] = None, ld=None, ): raise NotImplementedError @abstractmethod - def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): + def igemmlt( + self, + A: torch.Tensor, + B: torch.Tensor, + SA: Tuple[torch.Size, str], + SB: Tuple[torch.Size, str], + out: Optional[torch.Tensor] = None, + Sout: Optional[Tuple[torch.Size, str]] = None, + dtype=torch.int32, + ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]: raise NotImplementedError 
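The abstract Backend interface above implies per-device dispatch. Below is a hypothetical sketch of how callers might select an implementation by device type; the registry names are illustrative and not the library's actual mechanism.

from typing import Dict

import torch

_backends: Dict[str, "Backend"] = {}

def register_backend(device_type: str, backend: "Backend") -> None:
    # e.g. register_backend("cpu", CPUBackend()) once an implementation exists
    _backends[device_type] = backend

def backend_for(A: torch.Tensor) -> "Backend":
    # pick the implementation matching the tensor's device ("cpu", "cuda", ...)
    return _backends[A.device.type]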
@abstractmethod def mm_dequant( self, - A, - quant_state, - row_stats, - col_stats, - out=None, - new_row_stats=None, - new_col_stats=None, - bias=None, - ): + A: torch.Tensor, + quant_state: Tuple[torch.Size, str], + row_stats: torch.Tensor, + col_stats: torch.Tensor, + out: Optional[torch.Tensor] = None, + new_row_stats: Optional[torch.Tensor] = None, + new_col_stats: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: raise NotImplementedError @abstractmethod - def extract_outliers(self, A, SA, idx): + def extract_outliers( + self, + A: torch.Tensor, + SA: Tuple[torch.Size, str], + idx: torch.Tensor, + ) -> torch.Tensor: raise NotImplementedError @abstractmethod @@ -64,7 +78,7 @@ def quantize_4bit( out: Optional[torch.Tensor] = None, blocksize=64, compress_statistics=False, - quant_type="fp4", + quant_type: Literal["fp4", "nf4"] = "fp4", quant_storage=torch.uint8, ) -> Tuple[torch.Tensor, QuantState]: """ @@ -102,7 +116,7 @@ def dequantize_4bit( absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: int = 64, - quant_type="fp4", + quant_type: Literal["fp4", "nf4"] = "fp4", ) -> torch.Tensor: """ Dequantizes FP4 blockwise quantized values. @@ -131,3 +145,128 @@ def dequantize_4bit( Dequantized tensor. """ raise NotImplementedError + + @abstractmethod + def gemv_4bit( + self, + A: torch.Tensor, + B: torch.Tensor, + out: Optional[torch.Tensor] = None, + transposed_A=False, + transposed_B=False, + state: QuantState = None, + ) -> torch.Tensor: + raise NotImplementedError + + @abstractmethod + def quantize_blockwise( + self, + A: torch.Tensor, + code: Optional[torch.Tensor] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=4096, + nested=False, + ) -> Tuple[torch.Tensor, QuantState]: + raise NotImplementedError + + @abstractmethod + def dequantize_blockwise( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + code: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 4096, + nested=False, + ) -> torch.Tensor: + raise NotImplementedError + + @abstractmethod + def optimizer_update_8bit_blockwise( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + state2: Optional[torch.Tensor], + beta1: float, + beta2: float, + eps: float, + step: int, + lr: float, + qmap1: torch.Tensor, + qmap2: Optional[torch.Tensor], + absmax1: torch.Tensor, + absmax2: Optional[torch.Tensor], + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + skip_zeros=False, + ) -> None: + """ + Performs an in-place optimizer update with one or two optimizer states. + + Args: + optimizer_name (`str`): The name of the optimizer, e.g. `adam` + g (`torch.Tensor`): Gradient tensor. + p (`torch.Tensor`): Parameter tensor. + state1 (`torch.Tensor`): Optimizer state 1. + state2 (`torch.Tensor`, optional): Optimizer state 2. + beta1 (`float`): Optimizer beta1. + beta2 (`float`): Optimizer beta2. + eps (`float`): Optimizer epsilon. + step (`int`): Current optimizer step. + lr (`float`): The learning rate. + qmap1 (`torch.Tensor`): Quantization map for the first state. + qmap2 (`torch.Tensor`, optional): Quantization map for the second state. + absmax1 (`torch.Tensor`): Max value for the first state update. + absmax2 (`torch.Tensor`, optional): Max value for the second state update. + weight_decay (`float`, optional): Weight decay. Defaults to 0.0. 
+ gnorm_scale (`float`, optional): The factor to rescale the gradient to the max clip value. Defaults to 1.0. + skip_zeros (`bool`, optional): Whether to skip zero-valued gradients or not. Defaults to False. + """ + raise NotImplementedError + + @abstractmethod + def optimizer_update_32bit( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + beta1: float, + eps: float, + step: int, + lr: float, + state2: Optional[torch.Tensor] = None, + beta2: float = 0.0, + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + unorm_vec: Optional[torch.Tensor] = None, + max_unorm: float = 0.0, + skip_zeros=False, + ) -> None: + """ + Performs an in-place optimizer update with one or two optimizer states. + + Universal optimizer update for 32-bit state and 32/16-bit gradients/weights + + Args: + optimizer_name (`str`): The name of the optimizer, e.g. `adam` + g (`torch.Tensor`): Gradient tensor. + p (`torch.Tensor`): Parameter tensor. + state1 (`torch.Tensor`): Optimizer state 1. + beta1 (`float`): Optimizer beta1. + eps (`float`): Optimizer epsilon. + step (`int`): Current optimizer step. + lr (`float`): The learning rate. + state2 (`torch.Tensor`, optional): Optimizer state 2. Defaults to None. + beta2 (`float`, optional): Optimizer beta2. Defaults to 0.0. + weight_decay (`float`, optional): Defaults to 0.0. + gnorm_scale (`float`, optional): The factor to rescale the gradient to the max clip value. Defaults to 1.0. + unorm_vec (`torch.Tensor`, optional): The tensor for the update norm. Defaults to None. + max_unorm (`float`, optional): The maximum update norm relative to the weight norm.. Defaults to 0.0. + skip_zeros (`bool`, optional): Whether to skip zero-valued gradients or not. Defaults to False. + """ + raise NotImplementedError diff --git a/bitsandbytes/backends/cpu.py b/bitsandbytes/backends/cpu.py new file mode 100644 index 000000000..d6a9192e4 --- /dev/null +++ b/bitsandbytes/backends/cpu.py @@ -0,0 +1,222 @@ +from typing import Literal, Optional, Tuple, Union + +import torch + +from bitsandbytes.utils import QuantState + +from .base import Backend +from .cpu_xpu_common import ( + double_quant_impl, + igemmlt_impl, + mm_dequant_impl, +) + +Tensor = torch.Tensor + + +def assert_on_cpu(tensors): + on_cpu = True + for t in tensors: + if t is None: + continue # NULL pointers are fine + on_cpu &= t.device.type == "cpu" + if not on_cpu: + raise TypeError( + "All input tensors need to be on CPU, but found some tensors to not be on CPU:\n" + f" {[(t.shape, t.device) if isinstance(t, Tensor) else None for t in tensors]}" + ) + return on_cpu + + +class CPUBackend(Backend): + mm_dequant_compute_dtype = torch.bfloat16 + mm_dequant_output_dtype = torch.bfloat16 + + def double_quant( + self, + A: torch.Tensor, + col_stats: Optional[torch.Tensor] = None, + row_stats: Optional[torch.Tensor] = None, + out_col: Optional[torch.Tensor] = None, + out_row: Optional[torch.Tensor] = None, + threshold=0.0, + ): + assert_on_cpu([A, col_stats, row_stats, out_col, out_row]) + return double_quant_impl(A, col_stats, row_stats, out_col, out_row, threshold) + + def transform( + self, + A: torch.Tensor, + to_order: str, + from_order="row", + out: Optional[torch.Tensor] = None, + transpose=False, + state: Optional[Tuple[torch.Size, str]] = None, + ld=None, + ): + """ + Transform tensor A to to_order. It is originally designed for CUDA. + For CPU, it returns the original tensor if transpose=False. 
+ Otherwise, it returns the transpose of A + """ + assert_on_cpu([A, out]) + if transpose: + if out is not None: + out.copy_(A.T) + else: + out = A.T + else: + if out is not None: + out.copy_(A) + else: + out = A + return out, state + + def igemmlt( + self, + A: torch.Tensor, + B: torch.Tensor, + SA: Tuple[torch.Size, str], + SB: Tuple[torch.Size, str], + out: Optional[torch.Tensor] = None, + Sout: Optional[Tuple[torch.Size, str]] = None, + dtype=torch.int32, + ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]: + assert_on_cpu([A, B]) + return igemmlt_impl(A, B, SA, SB, out, Sout, dtype) + + def mm_dequant( + self, + A: torch.Tensor, + quant_state: Tuple[torch.Size, str], + row_stats: torch.Tensor, + col_stats: torch.Tensor, + out: Optional[torch.Tensor] = None, + new_row_stats: Optional[torch.Tensor] = None, + new_col_stats: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + assert_on_cpu([A, row_stats, col_stats, out, bias]) + return mm_dequant_impl( + A, + quant_state, + row_stats, + col_stats, + out, + new_row_stats, + new_col_stats, + bias, + self.mm_dequant_compute_dtype, + self.mm_dequant_output_dtype, + ) + + def extract_outliers( + self, + A: torch.Tensor, + SA: Tuple[torch.Size, str], + idx: torch.Tensor, + ) -> torch.Tensor: + """ + Extract columns of A by idx + """ + assert_on_cpu([A]) + return A[:, idx].contiguous() + + def quantize_4bit( + self, + A: torch.Tensor, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=64, + compress_statistics=False, + quant_type: Literal["fp4", "nf4"] = "fp4", + quant_storage=torch.uint8, + ) -> Tuple[torch.Tensor, QuantState]: + raise NotImplementedError("Not yet implemented for CPU backend") + + def dequantize_4bit( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 64, + quant_type: Literal["fp4", "nf4"] = "fp4", + ) -> torch.Tensor: + raise NotImplementedError("Not yet implemented for CPU backend") + + def gemv_4bit( + self, + A: torch.Tensor, + B: torch.Tensor, + out: Optional[torch.Tensor] = None, + transposed_A=False, + transposed_B=False, + state: QuantState = None, + ) -> torch.Tensor: + raise NotImplementedError("Not yet implemented for CPU backend") + + def dequantize_blockwise( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + code: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 4096, + nested=False, + ) -> torch.Tensor: + raise NotImplementedError("Not yet implemented for CPU backend") + + def quantize_blockwise( + self, + A: torch.Tensor, + code: Optional[torch.Tensor] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=4096, + nested=False, + ) -> Tuple[torch.Tensor, QuantState]: + raise NotImplementedError("Not yet implemented for CPU backend") + + def optimizer_update_8bit_blockwise( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + state2: Optional[torch.Tensor], + beta1: float, + beta2: float, + eps: float, + step: int, + lr: float, + qmap1: torch.Tensor, + qmap2: Optional[torch.Tensor], + absmax1: torch.Tensor, + absmax2: Optional[torch.Tensor], + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + skip_zeros=False, + ) -> None: + raise NotImplementedError("Not yet implemented for CPU backend") 
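# Illustrative sketch, not part of the library API: the threshold/outlier split that
# the CPU backend's double_quant performs for LLM.int8() (see double_quant_impl below)
# and that extract_outliers above complements. Entries with |a| >= threshold are pulled
# out (here as plain index/value tensors instead of the library's COOSparseTensor) and
# zeroed before row-wise int8 quantization; the function name is hypothetical.
import torch


def split_outliers_sketch(A: torch.Tensor, threshold: float = 6.0):
    outlier_mask = A.abs() >= threshold           # boolean mask of outlier entries
    coords = outlier_mask.nonzero()               # (nnz, 2) row/col coordinates
    values = A[outlier_mask]                      # outlier values, kept in the input dtype
    A_inliers = A.masked_fill(outlier_mask, 0.0)  # zero outliers before taking the absmax
    row_stats = A_inliers.abs().amax(dim=1, keepdim=True).clamp_min(1e-12)
    A_i8 = torch.clamp(torch.round(A_inliers * (127.0 / row_stats)), -128, 127).to(torch.int8)
    return A_i8, row_stats, coords, values


# The unique outlier columns, coords[:, 1].unique(), are what a caller would route through
# extract_outliers() and a regular higher-precision matmul, while the rest stays int8.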
+ + def optimizer_update_32bit( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + beta1: float, + eps: float, + step: int, + lr: float, + state2: Optional[torch.Tensor] = None, + beta2: float = 0.0, + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + unorm_vec: Optional[torch.Tensor] = None, + max_unorm: float = 0.0, + skip_zeros=False, + ) -> None: + raise NotImplementedError("Not yet implemented for CPU backend") diff --git a/bitsandbytes/backends/cpu_xpu_common.py b/bitsandbytes/backends/cpu_xpu_common.py new file mode 100644 index 000000000..f4e5ed3ec --- /dev/null +++ b/bitsandbytes/backends/cpu_xpu_common.py @@ -0,0 +1,230 @@ +import warnings + +import torch + +try: + # to support Intel CPU/GPU (XPU) backend + import intel_extension_for_pytorch as ipex + + ipex_cpu = ipex if ipex._C._has_cpu() else None + ipex_xpu = ipex if ipex._C._has_xpu() else None +except BaseException: + ipex_cpu = None + ipex_xpu = None + + +Tensor = torch.Tensor + + +def _torch_version_prereq(major, minor): + ver_major = int(torch.__version__.split(".")[0]) + ver_minor = int(torch.__version__.split(".")[1]) + return ver_major * 32 + ver_minor >= major * 32 + minor + + +def _ipex_cpu_version_prereq(major, minor): + if ipex_cpu is not None: + ver_major = ipex_cpu.__version__.split(".")[0] + ver_minor = ipex_cpu.__version__.split(".")[1] + return int(ver_major) * 32 + int(ver_minor) >= major * 32 + minor + return False + + +def _ipex_xpu_version_prereq(major, minor): + if ipex_xpu is not None: + ver_major = ipex_xpu.__version__.split(".")[0] + ver_minor = ipex_xpu.__version__.split(".")[1] + return int(ver_major) * 32 + int(ver_minor) >= major * 32 + minor + return False + + +def _maybe_torch_compile(func): + # torch.compile requires pytorch >= 2.0 + if _torch_version_prereq(2, 0): + options = {} + # fx_graph_cache requires pytorch >= 2.2 + if _torch_version_prereq(2, 2): + options.update({"fx_graph_cache": True}) + return torch.compile(func, dynamic=True, options=options) + return func + + +# Don't use torch.compile for now due to PyTorch issue https://github.com/pytorch/pytorch/issues/124382 +def double_quant_impl(A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0): + """ + Find absolute max values of each row/column of a tensor, and symmetrically quantize it to int8. + If threshold > 0.0, only values <= threshold are counted. All outliers are zeroed out in + the original tensor and they are kept in COO format: (rows, cols, values) + If threshold == 0.0, there are no outliers. + Args: + A The tensor to be analyzed and quantized. + col_stats Absolute max values of each column of A. If it is not None, use the values directly. + Otherwise, find the values. + row_stats Absolute max values of each row of A. If it is not None, use the values directly. + Otherwise, find the values. + out_col Output buffer for the result quantized per column if it is not None + out_row Output buffer for the result quantized per row if it is not None + threshold The threshold for finding outliers if it is > 0.0. Otherwise it has no effect. 
+ Return: + A tuple of output quantized per row, output quantized per column, absolute max values of + each row of A, absolute max values of each column of A, outliers in COO format + """ + from ..functional import COOSparseTensor + + cols = A.shape[-1] + if len(A.shape) == 3: + rows = A.shape[0] * A.shape[1] + else: + assert A.dim() == 2, f"double_quant: Input tensor should be 2d or 3d but got {A.dim()}d" + rows = A.shape[0] + A = A.reshape(rows, cols) + + coo_tensor = None + + def get_row_col_stats(A): + row_stats = torch.max(torch.abs(A), 1).values # absolute max of each row + col_stats = torch.max(torch.abs(A), 0).values # absolute max of each col + return row_stats, col_stats + + def quant_to_int8(A, stats): + return torch.clamp(torch.round(A * (127.0 / stats)), -128, 127).to(torch.int8) + + if threshold == 0.0: + if row_stats is None or col_stats is None: + row_stats, col_stats = get_row_col_stats(A) + else: + outlier_indices = torch.abs(A) >= threshold # find outliers + outlier_coord = outlier_indices.nonzero() # get outlier coordinates + outlier_rows = outlier_coord[:, 0] # outlier row for COO sparse tensor + outlier_cols = outlier_coord[:, 1] # outlier column for COO sparse tensor + outlier_values = A[outlier_indices] # outlier values for COO sparse tensor + coo_tensor = COOSparseTensor( + A.shape[0], A.shape[1], outlier_values.numel(), outlier_rows.int(), outlier_cols.int(), outlier_values + ) + if row_stats is None or col_stats is None: + A[outlier_indices] = 0 # zero out outliers + row_stats, col_stats = get_row_col_stats(A) + + quant_by_row = quant_to_int8(A, row_stats.unsqueeze(-1)) + quant_by_col = quant_to_int8(A, col_stats.unsqueeze(0)) + + if coo_tensor is not None: + A[outlier_indices] = outlier_values # restore outliers for later use + + if out_row is not None: + out_row.copy_(quant_by_row) + else: + out_row = quant_by_row + if out_col is not None: + out_col.copy_(quant_by_col) + else: + out_col = quant_by_col + # Return float stats to align with CUDA impl + return out_row, out_col, row_stats.float(), col_stats.float(), coo_tensor + + +def igemmlt_impl(A, B, SA=None, SB=None, out=None, Sout=None, dtype=torch.int32): + """ + Do GEMMM computation. Data type: int8 * int8 -> int32. 
+ Args: + A Activation of linear, data type is int8 + B Weight of linear, data type is int8 + SA Not used for CPU/XPU + SB Not used for CPU/XPU + out Specified output tensor if it is not None + Sout Not used for CPU/XPU but returned as is + dtype Data type of output + Return: + A tuple of GEMM result in dtype and Sout + """ + assert A.dtype == torch.int8 + assert B.dtype == torch.int8 + if out is not None: + assert out.dtype == dtype + + dimsA = A.ndim + dimsB = B.ndim + shapeA = A.shape + shapeB = B.shape + assert dimsA in [2, 3], "Only two or three dimensional matrices are supported for argument A" + assert dimsB == 2, "Only two dimensional matrices are supported for argument B" + + if dimsA == 2: + m = shapeA[0] + elif dimsA == 3: + m = shapeA[0] * shapeA[1] + n = shapeB[0] + k = shapeA[-1] + assert shapeA[-1] == shapeB[-1], f"Shapes of A and B do not match, got {shapeA} and {shapeB}" + + # if the tensor is empty, return a transformed empty tensor with the right dimensions + if shapeA[0] == 0 and dimsA == 2: + return torch.empty((0, n), device=A.device, dtype=A.dtype) + elif shapeA[1] == 0 and dimsA == 3: + return torch.empty(tuple(shapeA[:2] + [n]), device=A.device, dtype=A.dtype) + + A_reshaped = A.reshape(m, k) + + # torch._int_mm is available on CPU since torch 2.4 + if _torch_version_prereq(2, 4): + C = torch._int_mm(A_reshaped, B.T).to(dtype) + else: + C = torch.matmul(A_reshaped.float(), B.t().float()).to(dtype) + if C.ndim != dimsA: + assert dimsA == 3 + shapeOut = (shapeA[0], m // shapeA[0], C.shape[-1]) + C = C.reshape(shapeOut) + if out is not None: + out.copy_(C) + else: + out = C + + return out, Sout + + +@_maybe_torch_compile +def mm_dequant_impl( + A, + quant_state, + row_stats, + col_stats, + out=None, + new_row_stats=None, + new_col_stats=None, + bias=None, + compute_dtype=torch.float32, + output_dtype=torch.float32, +): + """ + Dequant and add bias + out = A_int32 * (abs_max_A * abs_max_B) / 127 * 127 + bias + Args: + A The output of int8 gemm, whose dtype is int32 + quant_state Not used for CPU + row_stats Absolute max value of each row of input (A) of gemm + col_stats Absolute max value of each row of weight (B) of gemm + out Output buffer + new_row_stats Not used for CPU/XPU + new_col_stats Not used for CPU/XPU + bias Bias of linear + compute_dtype Data type for computation + output_dtype Data type for output + Return: + The result + """ + assert A.dtype == torch.int32 + out_shape = A.shape + if len(out_shape) == 3: + out_shape = (out_shape[0] * out_shape[1], out_shape[2]) + + if compute_dtype not in [torch.float32, torch.bfloat16]: + warnings.warn(f"mm_dequant_{A.device}: compute_dtype {compute_dtype} is not supported, will use float instead") + compute_dtype = torch.float32 + A_reshaped = A.reshape(out_shape).to(compute_dtype) + row_stats = row_stats.reshape(-1).unsqueeze(-1).to(compute_dtype) + col_stats = col_stats.reshape(-1).unsqueeze(0).to(compute_dtype) + out = A_reshaped * row_stats * col_stats / (127 * 127) + if bias is not None: + out = out + bias.to(compute_dtype) + out = out.to(output_dtype) + return out diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py index a449b493c..57f9e953f 100644 --- a/bitsandbytes/backends/cuda.py +++ b/bitsandbytes/backends/cuda.py @@ -1,5 +1,5 @@ import ctypes as ct -from typing import Optional, Tuple +from typing import Literal, Optional, Tuple import torch @@ -24,9 +24,69 @@ from .base import Backend +if lib and lib.compiled_with_cuda: + """C FUNCTIONS FOR OPTIMIZERS""" + str2optimizer32bit = { + 
"adam": ( + lib.cadam32bit_grad_fp32, + lib.cadam32bit_grad_fp16, + lib.cadam32bit_grad_bf16, + ), + "momentum": ( + lib.cmomentum32bit_grad_32, + lib.cmomentum32bit_grad_16, + ), + "rmsprop": ( + lib.crmsprop32bit_grad_32, + lib.crmsprop32bit_grad_16, + ), + "lion": ( + lib.clion32bit_grad_fp32, + lib.clion32bit_grad_fp16, + lib.clion32bit_grad_bf16, + ), + "adagrad": ( + lib.cadagrad32bit_grad_32, + lib.cadagrad32bit_grad_16, + ), + } + + str2optimizer8bit_blockwise = { + "adam": ( + lib.cadam_8bit_blockwise_grad_fp32, + lib.cadam_8bit_blockwise_grad_fp16, + lib.cadam_8bit_blockwise_grad_bf16, + ), + "momentum": ( + lib.cmomentum_8bit_blockwise_grad_fp32, + lib.cmomentum_8bit_blockwise_grad_fp16, + ), + "rmsprop": ( + lib.crmsprop_8bit_blockwise_grad_fp32, + lib.crmsprop_8bit_blockwise_grad_fp16, + ), + "lion": ( + lib.clion_8bit_blockwise_grad_fp32, + lib.clion_8bit_blockwise_grad_fp16, + lib.clion_8bit_blockwise_grad_bf16, + ), + "adagrad": ( + lib.cadagrad_8bit_blockwise_grad_fp32, + lib.cadagrad_8bit_blockwise_grad_fp16, + ), + } + class CUDABackend(Backend): - def double_quant(self, A, col_stats=None, row_stats=None, out_col=None, out_row=None, threshold=0.0): + def double_quant( + self, + A: torch.Tensor, + col_stats: Optional[torch.Tensor] = None, + row_stats: Optional[torch.Tensor] = None, + out_col: Optional[torch.Tensor] = None, + out_row: Optional[torch.Tensor] = None, + threshold=0.0, + ): device = A.device assert A.dtype == torch.half assert device.type == "cuda" @@ -115,7 +175,16 @@ def double_quant(self, A, col_stats=None, row_stats=None, out_col=None, out_row= return out_row, out_col, row_stats, col_stats, coo_tensor - def transform(self, A, to_order, from_order="row", out=None, transpose=False, state=None, ld=None): + def transform( + self, + A: torch.Tensor, + to_order: str, + from_order="row", + out: Optional[torch.Tensor] = None, + transpose=False, + state: Optional[Tuple[torch.Size, str]] = None, + ld=None, + ): if HIP_ENVIRONMENT: return nvidia_transform(A, to_order, from_order, out, transpose, state, ld) @@ -170,7 +239,16 @@ def transform(self, A, to_order, from_order="row", out=None, transpose=False, st return out, new_state - def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): + def igemmlt( + self, + A: torch.Tensor, + B: torch.Tensor, + SA: Tuple[torch.Size, str], + SB: Tuple[torch.Size, str], + out: Optional[torch.Tensor] = None, + Sout: Optional[Tuple[torch.Size, str]] = None, + dtype=torch.int32, + ): shapeA = SA[0] shapeB = SB[0] dimsA = len(shapeA) @@ -280,7 +358,15 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32): return out, Sout def mm_dequant( - self, A, quant_state, row_stats, col_stats, out=None, new_row_stats=None, new_col_stats=None, bias=None + self, + A: torch.Tensor, + quant_state: Tuple[torch.Size, str], + row_stats: torch.Tensor, + col_stats: torch.Tensor, + out: Optional[torch.Tensor] = None, + new_row_stats: Optional[torch.Tensor] = None, + new_col_stats: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, ): if HIP_ENVIRONMENT: A, quant_state = nvidia_transform(A, "row", state=quant_state) @@ -319,7 +405,7 @@ def mm_dequant( return out - def extract_outliers(self, A, SA, idx): + def extract_outliers(self, A: torch.Tensor, SA: Tuple[torch.Size, str], idx: torch.Tensor): shapeA = SA[0] formatA = SA[1] if not HIP_ENVIRONMENT: @@ -355,7 +441,7 @@ def quantize_4bit( out: Optional[torch.Tensor] = None, blocksize: Optional[int] = None, compress_statistics=False, - quant_type="fp4", + 
quant_type: Literal["fp4", "nf4"] = "fp4", quant_storage=torch.uint8, ) -> Tuple[torch.Tensor, QuantState]: if blocksize is None: @@ -425,7 +511,7 @@ def quantize_4bit( if compress_statistics: offset = absmax.mean() absmax -= offset - qabsmax, state2 = quantize_blockwise(absmax, blocksize=256) + qabsmax, state2 = self.quantize_blockwise(absmax, blocksize=256) del absmax state = QuantState( absmax=qabsmax, @@ -452,7 +538,7 @@ def dequantize_4bit( absmax: Optional[torch.Tensor] = None, out: Optional[torch.Tensor] = None, blocksize: Optional[int] = None, - quant_type="fp4", + quant_type: Literal["fp4", "nf4"] = "fp4", ) -> torch.Tensor: if blocksize is None: blocksize = 64 if not HIP_ENVIRONMENT else 128 @@ -477,7 +563,7 @@ def dequantize_4bit( absmax = quant_state.absmax if quant_state.nested: - absmax = dequantize_blockwise(quant_state.absmax, quant_state.state2) + absmax = self.dequantize_blockwise(quant_state.absmax, quant_state.state2) absmax += quant_state.offset if absmax.dtype != torch.float32: absmax = absmax.float() @@ -561,3 +647,240 @@ def dequantize_4bit( return out.t() else: return out + + def gemv_4bit( + self, + A: torch.Tensor, + B: torch.Tensor, + out: Optional[torch.Tensor] = None, + transposed_A=False, + transposed_B=False, + state: QuantState = None, + ): + prev_device = pre_call(A.device) + + if state is None: + raise ValueError("state cannot be None. gemv_4bit() requires the state from quantize_4bit()") + + if A.numel() != A.shape[-1]: + raise ValueError( + 'Dimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]', + ) + + Bshape = state.shape + bout = Bshape[0] + absmax = state.absmax + if state.nested: + absmax = self.dequantize_blockwise(state.absmax, state.state2) + absmax += state.offset + + if out is None: + if len(A.shape) == 3: + out = torch.empty(size=(A.shape[0], A.shape[1], bout), dtype=A.dtype, device=A.device) + else: + out = torch.empty(size=(A.shape[0], bout), dtype=A.dtype, device=A.device) + + n = 1 + m = Bshape[0] + k = Bshape[1] + lda = Bshape[0] + ldc = Bshape[0] + ldb = (A.shape[-1] + 1) // 2 + is_on_gpu([B, A, out, absmax, state.code]) + m = ct.c_int32(m) + n = ct.c_int32(n) + k = ct.c_int32(k) + lda = ct.c_int32(lda) + ldb = ct.c_int32(ldb) + ldc = ct.c_int32(ldc) + + inference_args = [ + m, + n, + k, + get_ptr(A), + get_ptr(B), + get_ptr(absmax), + get_ptr(state.code), + get_ptr(out), + lda, + ldb, + ldc, + ct.c_int32(state.blocksize), + ] + + if B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32]: + if A.dtype == torch.float16: + lib.cgemm_4bit_inference_naive_fp16(*inference_args) + elif A.dtype == torch.bfloat16: + lib.cgemm_4bit_inference_naive_bf16(*inference_args) + elif A.dtype == torch.float32: + lib.cgemm_4bit_inference_naive_fp32(*inference_args) + else: + raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}") + + else: + raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}") + + post_call(prev_device) + + return out + + def dequantize_blockwise( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + code: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 4096, + nested=False, + ) -> torch.Tensor: + # TODO: Move from bnb.functional + return dequantize_blockwise( + A, + quant_state=quant_state, + absmax=absmax, + code=code, + out=out, + blocksize=blocksize, + nested=nested, + ) + + def quantize_blockwise( + self, + A: torch.Tensor, + 
code: Optional[torch.Tensor] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=4096, + nested=False, + ) -> Tuple[torch.Tensor, QuantState]: + # TODO: Move from bnb.functional + return quantize_blockwise( + A, + absmax=absmax, + code=code, + out=out, + blocksize=blocksize, + nested=nested, + ) + + def optimizer_update_8bit_blockwise( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + state2: Optional[torch.Tensor], + beta1: float, + beta2: float, + eps: float, + step: int, + lr: float, + qmap1: torch.Tensor, + qmap2: Optional[torch.Tensor], + absmax1: torch.Tensor, + absmax2: Optional[torch.Tensor], + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + skip_zeros=False, + ) -> None: + optim_func = None + prev_device = pre_call(g.device) + is_on_gpu([g, p, state1, state2, qmap1, qmap2, absmax1, absmax2]) + if g.dtype == torch.float32 and state1.dtype == torch.uint8: + optim_func = str2optimizer8bit_blockwise[optimizer_name][0] + elif g.dtype == torch.float16 and state1.dtype == torch.uint8: + optim_func = str2optimizer8bit_blockwise[optimizer_name][1] + elif ( + g.dtype == torch.bfloat16 + and state1.dtype == torch.uint8 + and len(str2optimizer8bit_blockwise[optimizer_name]) == 3 + ): + optim_func = str2optimizer8bit_blockwise[optimizer_name][2] + else: + raise ValueError( + f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}", + ) + post_call(prev_device) + + is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2]) + + prev_device = pre_call(g.device) + optim_func( + get_ptr(p), + get_ptr(g), + get_ptr(state1), + get_ptr(state2), + ct.c_float(beta1), + ct.c_float(beta2), + ct.c_float(eps), + ct.c_int32(step), + ct.c_float(lr), + get_ptr(qmap1), + get_ptr(qmap2), + get_ptr(absmax1), + get_ptr(absmax2), + ct.c_float(weight_decay), + ct.c_float(gnorm_scale), + ct.c_bool(skip_zeros), + ct.c_int32(g.numel()), + ) + post_call(prev_device) + + def optimizer_update_32bit( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + beta1: float, + eps: float, + step: int, + lr: float, + state2: Optional[torch.Tensor] = None, + beta2: float = 0.0, + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + unorm_vec: Optional[torch.Tensor] = None, + max_unorm: float = 0.0, + skip_zeros=False, + ) -> None: + param_norm = 0.0 + if max_unorm > 0.0: + param_norm = torch.norm(p.data.float()) + + optim_func = None + if g.dtype == torch.float32: + optim_func = str2optimizer32bit[optimizer_name][0] + elif g.dtype == torch.float16: + optim_func = str2optimizer32bit[optimizer_name][1] + elif g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name]) == 3: + optim_func = str2optimizer32bit[optimizer_name][2] + else: + raise ValueError( + f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}", + ) + + is_on_gpu([g, p, state1, state2, unorm_vec]) + prev_device = pre_call(g.device) + optim_func( + get_ptr(g), + get_ptr(p), + get_ptr(state1), + get_ptr(state2), + get_ptr(unorm_vec), + ct.c_float(max_unorm), + ct.c_float(param_norm), + ct.c_float(beta1), + ct.c_float(beta2), + ct.c_float(eps), + ct.c_float(weight_decay), + ct.c_int32(step), + ct.c_float(lr), + ct.c_float(gnorm_scale), + ct.c_bool(skip_zeros), + ct.c_int32(g.numel()), + ) + post_call(prev_device) diff --git a/bitsandbytes/backends/mps.py b/bitsandbytes/backends/mps.py new file mode 100644 index 
000000000..5b7eda0c7 --- /dev/null +++ b/bitsandbytes/backends/mps.py @@ -0,0 +1,164 @@ +from typing import Literal, Optional, Tuple, Union + +import torch + +from bitsandbytes.utils import QuantState + +from .base import Backend + + +class MPSBackend(Backend): + def double_quant( + self, + A: torch.Tensor, + col_stats: Optional[torch.Tensor] = None, + row_stats: Optional[torch.Tensor] = None, + out_col: Optional[torch.Tensor] = None, + out_row: Optional[torch.Tensor] = None, + threshold=0.0, + ): + raise NotImplementedError + + def transform( + self, + A: torch.Tensor, + to_order: str, + from_order="row", + out: Optional[torch.Tensor] = None, + transpose=False, + state: Optional[Tuple[torch.Size, str]] = None, + ld=None, + ): + raise NotImplementedError + + def igemmlt( + self, + A: torch.Tensor, + B: torch.Tensor, + SA: Tuple[torch.Size, str], + SB: Tuple[torch.Size, str], + out: Optional[torch.Tensor] = None, + Sout: Optional[Tuple[torch.Size, str]] = None, + dtype=torch.int32, + ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]: + raise NotImplementedError + + def mm_dequant( + self, + A: torch.Tensor, + quant_state: Tuple[torch.Size, str], + row_stats: torch.Tensor, + col_stats: torch.Tensor, + out: Optional[torch.Tensor] = None, + new_row_stats: Optional[torch.Tensor] = None, + new_col_stats: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + raise NotImplementedError + + def extract_outliers( + self, + A: torch.Tensor, + SA: Tuple[torch.Size, str], + idx: torch.Tensor, + ) -> torch.Tensor: + raise NotImplementedError + + def quantize_4bit( + self, + A: torch.Tensor, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=64, + compress_statistics=False, + quant_type: Literal["fp4", "nf4"] = "fp4", + quant_storage=torch.uint8, + ) -> Tuple[torch.Tensor, QuantState]: + raise NotImplementedError + + def dequantize_4bit( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 64, + quant_type: Literal["fp4", "nf4"] = "fp4", + ) -> torch.Tensor: + raise NotImplementedError + + def gemv_4bit( + self, + A: torch.Tensor, + B: torch.Tensor, + out: Optional[torch.Tensor] = None, + transposed_A=False, + transposed_B=False, + state: QuantState = None, + ) -> torch.Tensor: + raise NotImplementedError + + def dequantize_blockwise( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + code: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 4096, + nested=False, + ) -> torch.Tensor: + raise NotImplementedError + + def quantize_blockwise( + self, + A: torch.Tensor, + code: Optional[torch.Tensor] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=4096, + nested=False, + ) -> Tuple[torch.Tensor, QuantState]: + raise NotImplementedError + + def optimizer_update_8bit_blockwise( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + state2: Optional[torch.Tensor], + beta1: float, + beta2: float, + eps: float, + step: int, + lr: float, + qmap1: torch.Tensor, + qmap2: Optional[torch.Tensor], + absmax1: torch.Tensor, + absmax2: Optional[torch.Tensor], + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + skip_zeros=False, + ) -> None: + raise NotImplementedError + + def optimizer_update_32bit( + 
self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + beta1: float, + eps: float, + step: int, + lr: float, + state2: Optional[torch.Tensor] = None, + beta2: float = 0.0, + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + unorm_vec: Optional[torch.Tensor] = None, + max_unorm: float = 0.0, + skip_zeros=False, + ) -> None: + raise NotImplementedError diff --git a/bitsandbytes/backends/rocm.py b/bitsandbytes/backends/rocm.py new file mode 100644 index 000000000..d74f10ead --- /dev/null +++ b/bitsandbytes/backends/rocm.py @@ -0,0 +1,12 @@ +from .cuda import CUDABackend + + +class ROCmBackend(CUDABackend): + """ + Backend for AMD ROCm implementation. + + The interface is largely the same as the CUDA implementation, so only any + differences need to be implemented here. + """ + + pass diff --git a/bitsandbytes/backends/xpu.py b/bitsandbytes/backends/xpu.py new file mode 100644 index 000000000..3976c4d5a --- /dev/null +++ b/bitsandbytes/backends/xpu.py @@ -0,0 +1,164 @@ +from typing import Literal, Optional, Tuple, Union + +import torch + +from bitsandbytes.utils import QuantState + +from .base import Backend + + +class XPUBackend(Backend): + def double_quant( + self, + A: torch.Tensor, + col_stats: Optional[torch.Tensor] = None, + row_stats: Optional[torch.Tensor] = None, + out_col: Optional[torch.Tensor] = None, + out_row: Optional[torch.Tensor] = None, + threshold=0.0, + ): + raise NotImplementedError + + def transform( + self, + A: torch.Tensor, + to_order: str, + from_order="row", + out: Optional[torch.Tensor] = None, + transpose=False, + state: Optional[Tuple[torch.Size, str]] = None, + ld=None, + ): + raise NotImplementedError + + def igemmlt( + self, + A: torch.Tensor, + B: torch.Tensor, + SA: Tuple[torch.Size, str], + SB: Tuple[torch.Size, str], + out: Optional[torch.Tensor] = None, + Sout: Optional[Tuple[torch.Size, str]] = None, + dtype=torch.int32, + ) -> Union[torch.Tensor, Tuple[Optional[Tuple[torch.Tensor, Tuple[torch.Size, str]]]]]: + raise NotImplementedError + + def mm_dequant( + self, + A: torch.Tensor, + quant_state: Tuple[torch.Size, str], + row_stats: torch.Tensor, + col_stats: torch.Tensor, + out: Optional[torch.Tensor] = None, + new_row_stats: Optional[torch.Tensor] = None, + new_col_stats: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + raise NotImplementedError + + def extract_outliers( + self, + A: torch.Tensor, + SA: Tuple[torch.Size, str], + idx: torch.Tensor, + ) -> torch.Tensor: + raise NotImplementedError + + def quantize_4bit( + self, + A: torch.Tensor, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=64, + compress_statistics=False, + quant_type: Literal["fp4", "nf4"] = "fp4", + quant_storage=torch.uint8, + ) -> Tuple[torch.Tensor, QuantState]: + raise NotImplementedError + + def dequantize_4bit( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 64, + quant_type: Literal["fp4", "nf4"] = "fp4", + ) -> torch.Tensor: + raise NotImplementedError + + def gemv_4bit( + self, + A: torch.Tensor, + B: torch.Tensor, + out: Optional[torch.Tensor] = None, + transposed_A=False, + transposed_B=False, + state: QuantState = None, + ) -> torch.Tensor: + raise NotImplementedError + + def dequantize_blockwise( + self, + A: torch.Tensor, + quant_state: Optional[QuantState] = None, + absmax: Optional[torch.Tensor] = None, + code: 
Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize: int = 4096, + nested=False, + ) -> torch.Tensor: + raise NotImplementedError + + def quantize_blockwise( + self, + A: torch.Tensor, + code: Optional[torch.Tensor] = None, + absmax: Optional[torch.Tensor] = None, + out: Optional[torch.Tensor] = None, + blocksize=4096, + nested=False, + ) -> Tuple[torch.Tensor, QuantState]: + raise NotImplementedError + + def optimizer_update_8bit_blockwise( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + state2: Optional[torch.Tensor], + beta1: float, + beta2: float, + eps: float, + step: int, + lr: float, + qmap1: torch.Tensor, + qmap2: Optional[torch.Tensor], + absmax1: torch.Tensor, + absmax2: Optional[torch.Tensor], + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + skip_zeros=False, + ) -> None: + raise NotImplementedError + + def optimizer_update_32bit( + self, + optimizer_name: str, + g: torch.Tensor, + p: torch.Tensor, + state1: torch.Tensor, + beta1: float, + eps: float, + step: int, + lr: float, + state2: Optional[torch.Tensor] = None, + beta2: float = 0.0, + weight_decay: float = 0.0, + gnorm_scale: float = 1.0, + unorm_vec: Optional[torch.Tensor] = None, + max_unorm: float = 0.0, + skip_zeros=False, + ) -> None: + raise NotImplementedError diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py deleted file mode 100644 index b2f9214a4..000000000 --- a/bitsandbytes/cuda_setup/main.py +++ /dev/null @@ -1,451 +0,0 @@ -""" -extract factors the build is dependent on: -[X] compute capability - [ ] TODO: Q - What if we have multiple GPUs of different makes? -- CUDA version -- Software: - - CPU-only: only CPU quantization functions (no optimizer, no matrix multiplication) - - CuBLAS-LT: full-build 8-bit optimizer - - no CuBLAS-LT: no 8-bit matrix multiplication (`nomatmul`) - -evaluation: - - if paths faulty, return meaningful error - - else: - - determine CUDA version - - determine capabilities - - based on that set the default path -""" - -import ctypes as ct -import errno -import os -from pathlib import Path -from typing import Set, Union -from warnings import warn - -import torch - -from .env_vars import get_potentially_lib_path_containing_env_vars - -# these are the most common libs names -# libcudart.so is missing by default for a conda install with PyTorch 2.0 and instead -# we have libcudart.so.11.0 which causes a lot of errors before -# not sure if libcudart.so.12.0 exists in pytorch installs, but it does not hurt -CUDA_RUNTIME_LIBS: list = [ - "libcudart.so", - "libcudart.so.11.0", - "libcudart.so.12.0", - "libcudart.so.12.1", - "libcudart.so.12.2", -] - -# this is a order list of backup paths to search CUDA in, if it cannot be found in the main environmental paths -backup_paths = [] -backup_paths.append("$CONDA_PREFIX/lib/libcudart.so.11.0") - - -class CUDASetup: - _instance = None - - def __init__(self): - raise RuntimeError("Call get_instance() instead") - - def generate_instructions(self): - if getattr(self, "error", False): - return - print(self.error) - self.error = True - if not self.cuda_available: - self.add_log_entry( - "CUDA SETUP: Problem: The main issue seems to be that the main CUDA library was not detected or CUDA not installed." - ) - self.add_log_entry( - "CUDA SETUP: Solution 1): Your paths are probably not up-to-date. You can update them via: sudo ldconfig." 
- ) - self.add_log_entry("CUDA SETUP: Solution 2): If you do not have sudo rights, you can do the following:") - self.add_log_entry( - "CUDA SETUP: Solution 2a): Find the cuda library via: find / -name libcuda.so 2>/dev/null" - ) - self.add_log_entry( - "CUDA SETUP: Solution 2b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_2a" - ) - self.add_log_entry( - "CUDA SETUP: Solution 2c): For a permanent solution add the export from 2b into your .bashrc file, located at ~/.bashrc" - ) - self.add_log_entry( - "CUDA SETUP: Solution 3): For a missing CUDA runtime library (libcudart.so), use `find / -name libcudart.so* and follow with step (2b)" - ) - return - - if self.cudart_path is None: - self.add_log_entry( - "CUDA SETUP: Problem: The main issue seems to be that the main CUDA runtime library was not detected." - ) - self.add_log_entry( - "CUDA SETUP: Solution 1: To solve the issue the libcudart.so location needs to be added to the LD_LIBRARY_PATH variable" - ) - self.add_log_entry( - "CUDA SETUP: Solution 1a): Find the cuda runtime library via: find / -name libcudart.so 2>/dev/null" - ) - self.add_log_entry( - "CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a" - ) - self.add_log_entry( - "CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc" - ) - self.add_log_entry("CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.") - self.add_log_entry( - "CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh" - ) - self.add_log_entry( - "CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO." - ) - self.add_log_entry( - 'CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local' - ) - - return - - make_cmd = f"CUDA_VERSION={self.cuda_version_string}" - if len(self.cuda_version_string) < 3: - make_cmd += " make cuda92" - elif self.cuda_version_string == "110": - make_cmd += " make cuda110" - elif self.cuda_version_string[:2] == "11" and int(self.cuda_version_string[2]) > 0: - make_cmd += " make cuda11x" - elif self.cuda_version_string[:2] == "12" and 1 >= int(self.cuda_version_string[2]) >= 0: - make_cmd += " make cuda12x" - elif self.cuda_version_string == "100": - self.add_log_entry("CUDA SETUP: CUDA 10.0 not supported. Please use a different CUDA version.") - self.add_log_entry( - "CUDA SETUP: Before you try again running bitsandbytes, make sure old CUDA 10.0 versions are uninstalled and removed from $LD_LIBRARY_PATH variables." - ) - return - - has_cublaslt = is_cublasLt_compatible(self.cc) - if not has_cublaslt: - make_cmd += "_nomatmul" - - self.add_log_entry("CUDA SETUP: Something unexpected happened. 
Please compile from source:") - self.add_log_entry("git clone https://github.com/TimDettmers/bitsandbytes.git") - self.add_log_entry("cd bitsandbytes") - self.add_log_entry(make_cmd) - self.add_log_entry("python setup.py install") - - def initialize(self): - if not getattr(self, "initialized", False): - self.has_printed = False - self.lib = None - self.initialized = False - self.error = False - - def manual_override(self): - if torch.cuda.is_available(): - if "BNB_CUDA_VERSION" in os.environ: - if len(os.environ["BNB_CUDA_VERSION"]) > 0: - warn( - f'\n\n{"="*80}\n' - 'WARNING: Manual override via BNB_CUDA_VERSION env variable detected!\n' - 'BNB_CUDA_VERSION=XXX can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.\n' - 'If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=\n' - 'If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH\n' - 'For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH: Set[Path]: - return {Path(ld_path) for ld_path in paths_list_candidate.split(":") if ld_path} - - -def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]: - existent_directories: Set[Path] = set() - for path in candidate_paths: - try: - if path.exists(): - existent_directories.add(path) - except PermissionError as pex: - # Handle the PermissionError first as it is a subtype of OSError - # https://docs.python.org/3/library/exceptions.html#exception-hierarchy - pass - except OSError as exc: - if exc.errno != errno.ENAMETOOLONG: - raise exc - - non_existent_directories: Set[Path] = candidate_paths - existent_directories - if non_existent_directories: - CUDASetup.get_instance().add_log_entry( - "The following directories listed in your path were found to " - f"be non-existent: {non_existent_directories}", - is_warning=False, - ) - - return existent_directories - - -def get_cuda_runtime_lib_paths(candidate_paths: Set[Path]) -> Set[Path]: - paths = set() - for libname in CUDA_RUNTIME_LIBS: - for path in candidate_paths: - try: - if (path / libname).is_file(): - paths.add(path / libname) - except PermissionError: - pass - return paths - - -def resolve_paths_list(paths_list_candidate: str) -> Set[Path]: - """ - Searches a given environmental var for the CUDA runtime library, - i.e. `libcudart.so`. - """ - return remove_non_existent_dirs(extract_candidate_paths(paths_list_candidate)) - - -def find_cuda_lib_in(paths_list_candidate: str) -> Set[Path]: - return get_cuda_runtime_lib_paths(resolve_paths_list(paths_list_candidate)) - - -def warn_in_case_of_duplicates(results_paths: Set[Path]) -> None: - if len(results_paths) > 1: - warning_msg = ( - f"Found duplicate {CUDA_RUNTIME_LIBS} files: {results_paths}.. " - "We select the PyTorch default libcudart.so, which is {torch.version.cuda}," - "but this might mismatch with the CUDA version that is needed for bitsandbytes." - "To override this behavior set the BNB_CUDA_VERSION= environmental variable" - "For example, if you want to use the CUDA version 122" - "BNB_CUDA_VERSION=122 python ..." - "OR set the environmental variable in your .bashrc: export BNB_CUDA_VERSION=122" - "In the case of a manual override, make sure you set the LD_LIBRARY_PATH, e.g." 
- "export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-11.2" - ) - CUDASetup.get_instance().add_log_entry(warning_msg, is_warning=True) - - -def determine_cuda_runtime_lib_path() -> Union[Path, None]: - """ - Searches for a cuda installations, in the following order of priority: - 1. active conda env - 2. LD_LIBRARY_PATH - 3. any other env vars, while ignoring those that - - are known to be unrelated (see `bnb.cuda_setup.env_vars.to_be_ignored`) - - don't contain the path separator `/` - - If multiple libraries are found in part 3, we optimistically try one, - while giving a warning message. - """ - candidate_env_vars = get_potentially_lib_path_containing_env_vars() - - cuda_runtime_libs = set() - if "CONDA_PREFIX" in candidate_env_vars: - conda_libs_path = Path(candidate_env_vars["CONDA_PREFIX"]) / "lib" - - conda_cuda_libs = find_cuda_lib_in(str(conda_libs_path)) - warn_in_case_of_duplicates(conda_cuda_libs) - - if conda_cuda_libs: - cuda_runtime_libs.update(conda_cuda_libs) - - CUDASetup.get_instance().add_log_entry( - f'{candidate_env_vars["CONDA_PREFIX"]} did not contain ' - f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', - is_warning=True, - ) - - if "LD_LIBRARY_PATH" in candidate_env_vars: - lib_ld_cuda_libs = find_cuda_lib_in(candidate_env_vars["LD_LIBRARY_PATH"]) - - if lib_ld_cuda_libs: - cuda_runtime_libs.update(lib_ld_cuda_libs) - warn_in_case_of_duplicates(lib_ld_cuda_libs) - - CUDASetup.get_instance().add_log_entry( - f'{candidate_env_vars["LD_LIBRARY_PATH"]} did not contain ' - f'{CUDA_RUNTIME_LIBS} as expected! Searching further paths...', - is_warning=True, - ) - - remaining_candidate_env_vars = { - env_var: value - for env_var, value in candidate_env_vars.items() - if env_var not in {"CONDA_PREFIX", "LD_LIBRARY_PATH"} - } - - cuda_runtime_libs = set() - for env_var, value in remaining_candidate_env_vars.items(): - cuda_runtime_libs.update(find_cuda_lib_in(value)) - - if len(cuda_runtime_libs) == 0: - CUDASetup.get_instance().add_log_entry( - "CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching in backup paths..." - ) - cuda_runtime_libs.update(find_cuda_lib_in("/usr/local/cuda/lib64")) - - warn_in_case_of_duplicates(cuda_runtime_libs) - - cuda_setup = CUDASetup.get_instance() - cuda_setup.add_log_entry(f"DEBUG: Possible options found for libcudart.so: {cuda_runtime_libs}") - - return next(iter(cuda_runtime_libs)) if cuda_runtime_libs else None - - -# https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART____VERSION.html#group__CUDART____VERSION -def get_cuda_version(): - major, minor = map(int, torch.version.cuda.split(".")) - - if major < 11: - CUDASetup.get_instance().add_log_entry( - "CUDA SETUP: CUDA version lower than 11 are currently not supported for LLM.int8(). You will be only to use 8-bit optimizers and quantization routines!!" - ) - - return f"{major}{minor}" - - -def get_compute_capabilities(): - ccs = [] - for i in range(torch.cuda.device_count()): - cc_major, cc_minor = torch.cuda.get_device_capability(torch.cuda.device(i)) - ccs.append(f"{cc_major}.{cc_minor}") - - ccs.sort(key=lambda v: tuple(map(int, str(v).split(".")))) - - return ccs - - -def evaluate_cuda_setup(): - cuda_setup = CUDASetup.get_instance() - if "BITSANDBYTES_NOWELCOME" not in os.environ or str(os.environ["BITSANDBYTES_NOWELCOME"]) == "0": - cuda_setup.add_log_entry("") - cuda_setup.add_log_entry("=" * 35 + "BUG REPORT" + "=" * 35) - cuda_setup.add_log_entry( - ("Welcome to bitsandbytes. 
For bug reports, please run\n\npython -m bitsandbytes\n\n"), - ( - "and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues" - ), - ) - cuda_setup.add_log_entry("=" * 80) - if not torch.cuda.is_available(): - return "libbitsandbytes_cpu.so", None, None, None - if torch.version.hip: - return "libbitsandbytes_hip_nohipblaslt.so", None, None, None - - cudart_path = determine_cuda_runtime_lib_path() - ccs = get_compute_capabilities() - ccs.sort() - cc = ccs[-1] # we take the highest capability - cuda_version_string = get_cuda_version() - - cuda_setup.add_log_entry( - f"CUDA SETUP: PyTorch settings found: CUDA_VERSION={cuda_version_string}, Highest Compute Capability: {cc}." - ) - cuda_setup.add_log_entry( - "CUDA SETUP: To manually override the PyTorch CUDA version please see:" - "https://github.com/TimDettmers/bitsandbytes/blob/main/how_to_use_nonpytorch_cuda.md" - ) - - # 7.5 is the minimum CC vor cublaslt - has_cublaslt = is_cublasLt_compatible(cc) - - # TODO: - # (1) CUDA missing cases (no CUDA installed by CUDA driver (nvidia-smi accessible) - # (2) Multiple CUDA versions installed - - # we use ls -l instead of nvcc to determine the cuda version - # since most installations will have the libcudart.so installed, but not the compiler - - if has_cublaslt: - binary_name = f"libbitsandbytes_cuda{cuda_version_string}.so" - else: - "if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt.so" - binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt.so" - - return binary_name, cudart_path, cc, cuda_version_string diff --git a/bitsandbytes/functional.py b/bitsandbytes/functional.py index 4ab375910..c807ba17a 100644 --- a/bitsandbytes/functional.py +++ b/bitsandbytes/functional.py @@ -27,31 +27,6 @@ def prod(iterable): if lib and lib.compiled_with_cuda: """C FUNCTIONS FOR OPTIMIZERS""" - str2optimizer32bit = { - "adam": ( - lib.cadam32bit_grad_fp32, - lib.cadam32bit_grad_fp16, - lib.cadam32bit_grad_bf16, - ), - "momentum": ( - lib.cmomentum32bit_grad_32, - lib.cmomentum32bit_grad_16, - ), - "rmsprop": ( - lib.crmsprop32bit_grad_32, - lib.crmsprop32bit_grad_16, - ), - "lion": ( - lib.clion32bit_grad_fp32, - lib.clion32bit_grad_fp16, - lib.clion32bit_grad_bf16, - ), - "adagrad": ( - lib.cadagrad32bit_grad_32, - lib.cadagrad32bit_grad_16, - ), - } - str2optimizer8bit = { "adam": ( lib.cadam_static_8bit_grad_32, @@ -79,31 +54,6 @@ def prod(iterable): ), } - str2optimizer8bit_blockwise = { - "adam": ( - lib.cadam_8bit_blockwise_grad_fp32, - lib.cadam_8bit_blockwise_grad_fp16, - lib.cadam_8bit_blockwise_grad_bf16, - ), - "momentum": ( - lib.cmomentum_8bit_blockwise_grad_fp32, - lib.cmomentum_8bit_blockwise_grad_fp16, - ), - "rmsprop": ( - lib.crmsprop_8bit_blockwise_grad_fp32, - lib.crmsprop_8bit_blockwise_grad_fp16, - ), - "lion": ( - lib.clion_8bit_blockwise_grad_fp32, - lib.clion_8bit_blockwise_grad_fp16, - lib.clion_8bit_blockwise_grad_bf16, - ), - "adagrad": ( - lib.cadagrad_8bit_blockwise_grad_fp32, - lib.cadagrad_8bit_blockwise_grad_fp16, - ), - } - class GlobalPageManager: _instance = None @@ -930,11 +880,12 @@ def get_4bit_type(typename, device=None, blocksize=64): if data is None: raise NotImplementedError(f"Typename {typename} not supported") - data = Tensor(data) - data /= data.abs().max() + data = torch.tensor(data, device=device) + data.div_(data.abs().max()) + assert data.numel() == 16 - return data.to(device) + return data def quantize_fp4( @@ -1191,82 +1142,24 @@ def optimizer_update_32bit( max_unorm: float 
= 0.0, skip_zeros=False, ) -> None: - """ - Performs an inplace optimizer update with one or two optimizer states. - - Universal optimizer update for 32-bit state and 32/16-bit gradients/weights. - - Parameters - ---------- - optimizer_name : str - The name of the optimizer: {adam}. - g : torch.Tensor - Gradient tensor. - p : torch.Tensor - Parameter tensor. - state1 : torch.Tensor - Optimizer state 1. - beta1 : float - Optimizer beta1. - eps : float - Optimizer epsilon. - weight_decay : float - Weight decay. - step : int - Current optimizer step. - lr : float - The learning rate. - state2 : torch.Tensor - Optimizer state 2. - beta2 : float - Optimizer beta2. - gnorm_scale : float - The factor to rescale the gradient to the max clip value. - unorm_vec : torch.Tensor - The tensor for the update norm. - max_unorm : float - The maximum update norm relative to the weight norm. - skip_zeros : bool - Whether to skip zero-valued gradients or not (default: False). - """ - - param_norm = 0.0 - if max_unorm > 0.0: - param_norm = torch.norm(p.data.float()) - - optim_func = None - if g.dtype == torch.float32: - optim_func = str2optimizer32bit[optimizer_name][0] - elif g.dtype == torch.float16: - optim_func = str2optimizer32bit[optimizer_name][1] - elif g.dtype == torch.bfloat16 and len(str2optimizer32bit[optimizer_name]) == 3: - optim_func = str2optimizer32bit[optimizer_name][2] - else: - raise ValueError( - f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}", - ) - - is_on_gpu([g, p, state1, state2, unorm_vec]) - prev_device = pre_call(g.device) - optim_func( - get_ptr(g), - get_ptr(p), - get_ptr(state1), - get_ptr(state2), - get_ptr(unorm_vec), - ct.c_float(max_unorm), - ct.c_float(param_norm), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_float(weight_decay), - ct.c_int32(step), - ct.c_float(lr), - ct.c_float(gnorm_scale), - ct.c_bool(skip_zeros), - ct.c_int32(g.numel()), + ensure_backend_is_available(g.device.type) + return backends[g.device.type].optimizer_update_32bit( + optimizer_name=optimizer_name, + g=g, + p=p, + state1=state1, + beta1=beta1, + eps=eps, + step=step, + lr=lr, + state2=state2, + beta2=beta2, + weight_decay=weight_decay, + gnorm_scale=gnorm_scale, + unorm_vec=unorm_vec, + max_unorm=max_unorm, + skip_zeros=skip_zeros, ) - post_call(prev_device) def optimizer_update_8bit( @@ -1421,48 +1314,26 @@ def optimizer_update_8bit_blockwise( gnorm_scale: float = 1.0, skip_zeros=False, ) -> None: - optim_func = None - prev_device = pre_call(g.device) - is_on_gpu([g, p, state1, state2, qmap1, qmap2, absmax1, absmax2]) - if g.dtype == torch.float32 and state1.dtype == torch.uint8: - optim_func = str2optimizer8bit_blockwise[optimizer_name][0] - elif g.dtype == torch.float16 and state1.dtype == torch.uint8: - optim_func = str2optimizer8bit_blockwise[optimizer_name][1] - elif ( - g.dtype == torch.bfloat16 - and state1.dtype == torch.uint8 - and len(str2optimizer8bit_blockwise[optimizer_name]) == 3 - ): - optim_func = str2optimizer8bit_blockwise[optimizer_name][2] - else: - raise ValueError( - f"Gradient+optimizer bit data type combination not supported: grad {g.dtype}, optimizer {state1.dtype}", - ) - post_call(prev_device) - - is_on_gpu([p, g, state1, state2, qmap1, qmap2, absmax1, absmax2]) - - prev_device = pre_call(g.device) - optim_func( - get_ptr(p), - get_ptr(g), - get_ptr(state1), - get_ptr(state2), - ct.c_float(beta1), - ct.c_float(beta2), - ct.c_float(eps), - ct.c_int32(step), - ct.c_float(lr), - get_ptr(qmap1), 
- get_ptr(qmap2), - get_ptr(absmax1), - get_ptr(absmax2), - ct.c_float(weight_decay), - ct.c_float(gnorm_scale), - ct.c_bool(skip_zeros), - ct.c_int32(g.numel()), + ensure_backend_is_available(g.device.type) + return backends[g.device.type].optimizer_update_8bit_blockwise( + optimizer_name=optimizer_name, + g=g, + p=p, + state1=state1, + state2=state2, + beta1=beta1, + beta2=beta2, + eps=eps, + step=step, + lr=lr, + qmap1=qmap1, + qmap2=qmap2, + absmax1=absmax1, + absmax2=absmax2, + weight_decay=weight_decay, + gnorm_scale=gnorm_scale, + skip_zeros=skip_zeros, ) - post_call(prev_device) def percentile_clipping(grad: Tensor, gnorm_vec: Tensor, step: int, percentile: int = 5): @@ -1617,98 +1488,15 @@ def gemv_4bit( transposed_B=False, state=None, ): - prev_device = pre_call(A.device) - # sout = check_matmul(A, B, out, transposed_A, transposed_B, expected_type=A.dtype) - if state is None: - raise ValueError("state cannot None. gem_4bit( ) requires the state from quantize_4bit( )") - - if A.numel() != A.shape[-1]: - raise ValueError( - 'Dimensions of A are invalid. Must be a vector with the leading dimensions of "1", e.g. [1, 1, 2048]', - ) - - Bshape = state.shape - bout = Bshape[0] - absmax = state.absmax - if state.nested: - absmax = dequantize_blockwise(state.absmax, state.state2) - absmax += state.offset - - if out is None: - if len(A.shape) == 3: - out = torch.empty(size=(A.shape[0], A.shape[1], bout), dtype=A.dtype, device=A.device) - else: - out = torch.empty(size=(A.shape[0], bout), dtype=A.dtype, device=A.device) - - n = 1 - m = Bshape[0] - k = Bshape[1] - lda = Bshape[0] - ldc = Bshape[0] - ldb = (A.shape[-1] + 1) // 2 - is_on_gpu([B, A, out, absmax, state.code]) - m = ct.c_int32(m) - n = ct.c_int32(n) - k = ct.c_int32(k) - lda = ct.c_int32(lda) - ldb = ct.c_int32(ldb) - ldc = ct.c_int32(ldc) - - if B.dtype in [torch.uint8, torch.bfloat16, torch.float16, torch.float32]: - if A.dtype == torch.float16: - lib.cgemm_4bit_inference_naive_fp16( - m, - n, - k, - get_ptr(A), - get_ptr(B), - get_ptr(absmax), - get_ptr(state.code), - get_ptr(out), - lda, - ldb, - ldc, - ct.c_int32(state.blocksize), - ) - elif A.dtype == torch.bfloat16: - lib.cgemm_4bit_inference_naive_bf16( - m, - n, - k, - get_ptr(A), - get_ptr(B), - get_ptr(absmax), - get_ptr(state.code), - get_ptr(out), - lda, - ldb, - ldc, - ct.c_int32(state.blocksize), - ) - elif A.dtype == torch.float32: - lib.cgemm_4bit_inference_naive_fp32( - m, - n, - k, - get_ptr(A), - get_ptr(B), - get_ptr(absmax), - get_ptr(state.code), - get_ptr(out), - lda, - ldb, - ldc, - ct.c_int32(state.blocksize), - ) - else: - raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}") - - else: - raise NotImplementedError(f"Matmul not implemented for data type {A.dtype}") - - post_call(prev_device) - - return out + ensure_backend_is_available(A.device.type) + return backends[A.device.type].gemv_4bit( + A, + B, + out=out, + transposed_A=transposed_A, + transposed_B=transposed_B, + state=state, + ) def igemm( @@ -1969,7 +1757,10 @@ class COOSparseTensor: def __init__(self, rows, cols, nnz, rowidx, colidx, values): assert rowidx.dtype == torch.int32 assert colidx.dtype == torch.int32 - assert values.dtype == torch.float16 + if values.device == torch.device("cpu"): + assert values.dtype in [torch.bfloat16, torch.half, torch.float] + else: + assert values.dtype == torch.float16 assert values.numel() == nnz assert rowidx.numel() == nnz assert colidx.numel() == nnz @@ -2198,7 +1989,7 @@ def vectorwise_quant(x, dim=1, quant_type="vector"): return 
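The rewritten `optimizer_update_32bit`, `optimizer_update_8bit_blockwise`, and `gemv_4bit` above now share one pattern: check that a backend exists for the tensor's device type, then forward all keyword arguments to `backends[device_type]`. The sketch below illustrates that dispatch shape; `backends` and `ensure_backend_is_available` appear in the diff, while the `Backend` base class and `register_backend` helper are assumed names for illustration only.

```py
# Minimal sketch of the per-device backend dispatch used above. `backends` and
# `ensure_backend_is_available` mirror the diff; `Backend` and `register_backend`
# are illustrative names only.
from abc import ABC, abstractmethod

backends = {}  # maps a torch.device.type string (e.g. "cuda", "cpu") to a backend object


class Backend(ABC):
    @abstractmethod
    def optimizer_update_32bit(self, *, optimizer_name, g, p, state1, **kwargs) -> None: ...


def register_backend(device_type: str, backend: Backend) -> None:
    backends[device_type] = backend


def ensure_backend_is_available(device_type: str) -> None:
    if device_type not in backends:
        raise NotImplementedError(f"No backend is registered for device type: {device_type}")


def optimizer_update_32bit(optimizer_name, g, p, state1, **kwargs) -> None:
    # Dispatch on the gradient tensor's device, mirroring the rewritten function above.
    ensure_backend_is_available(g.device.type)
    return backends[g.device.type].optimizer_update_32bit(
        optimizer_name=optimizer_name, g=g, p=p, state1=state1, **kwargs
    )
```

Keeping the public functions in `functional.py` as thin wrappers means the ctypes plumbing being deleted here (the `get_ptr`/`ct.c_*` calls) presumably lives entirely inside the device-specific backend implementations.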
xq, max1 elif quant_type in ["vector", "row"]: max1 = torch.amax(torch.abs(x), dim=dim, keepdim=True) - xq = torch.round(x * (C / max1)).to(torch.int8) + xq = torch.clamp(torch.round(x * (C / max1)), -128, 127).to(torch.int8) return xq, max1 elif quant_type == "zeropoint": dtype = x.dtype diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index 3684badf6..79b31f51f 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -15,7 +15,11 @@ from bitsandbytes.cextension import HIP_ENVIRONMENT from bitsandbytes.functional import QuantState from bitsandbytes.optim import GlobalOptimManager -from bitsandbytes.utils import OutlierTracer +from bitsandbytes.utils import ( + INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING, + LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING, + OutlierTracer, +) T = TypeVar("T", bound="torch.nn.Module") @@ -585,6 +589,19 @@ def cuda(self, device): return self + def cpu(self): + # we store the 8-bit rows-major weight + B = self.data.contiguous().bfloat16().cpu() + CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B) + if CBt is not None: + del CBt + if SCBt is not None: + del SCBt + self.data = CB + self.CB = CB + self.SCB = SCB + return self + @overload def to( self: T, @@ -602,8 +619,10 @@ def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T: ... def to(self, *args, **kwargs): device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs) - if device is not None and device.type == "cuda" and self.data.device.type == "cpu": + if device.type == "cuda" and self.data.device.type == "cpu": return self.cuda(device) + elif device.type == "cpu" and self.data.dtype != torch.int8: + return self.cpu() else: new_param = Int8Params( super().to(device=device, dtype=dtype, non_blocking=non_blocking), @@ -623,6 +642,16 @@ def maybe_rearrange_weight(state_dict, prefix, local_metadata, strict, missing_k return weight_format = state_dict.pop(f"{prefix}weight_format", "row") + if isinstance(weight_format, torch.Tensor): + weight_format = weight_format.item() + + # For new weights format storage type, we explicitly check + # if weights_format is on the mapping + if isinstance(weight_format, int) and weight_format not in INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING: + raise ValueError(f"Expected supported weight format - got {weight_format}") + elif isinstance(weight_format, int) and weight_format in INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING: + weight_format = INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING[weight_format] + if weight_format != "row": tile_indices = get_tile_inds(weight_format, weight.device) state_dict[f"{prefix}weight"] = undo_layout(weight, tile_indices) @@ -715,13 +744,20 @@ def _save_to_state_dict(self, destination, prefix, keep_vars): if not self.state.has_fp16_weights: if param_from_weight is not None: destination[key_name] = param_from_weight if keep_vars else param_from_weight.detach() - destination[format_name] = "row" + destination[format_name] = torch.tensor(0, dtype=torch.uint8) elif param_from_state is not None and not layout_reordered: destination[key_name] = param_from_state if keep_vars else param_from_state.detach() - destination[format_name] = "row" + destination[format_name] = torch.tensor(0, dtype=torch.uint8) elif param_from_state is not None: destination[key_name] = param_from_state if keep_vars else param_from_state.detach() - destination[format_name] = self.state.formatB + weights_format = self.state.formatB + # At this point `weights_format` is an str + if weights_format not in 
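Returning to the `vectorwise_quant` change near the top of this hunk: the added `torch.clamp` saturates the scaled values at the int8 limits before the cast, so nothing that rounds outside `[-128, 127]` ever reaches the integer cast out of range. A standalone sketch of the same saturate-then-cast step (the helper name and the `C = 127.0` constant are illustrative; only the clamp pattern comes from the diff):

```py
# Row-wise int8 quantization with saturation, mirroring the clamp added to
# vectorwise_quant. The helper name and C = 127.0 are illustrative.
import torch

C = 127.0


def rowwise_quant_int8(x: torch.Tensor, dim: int = 1):
    max1 = torch.amax(torch.abs(x), dim=dim, keepdim=True)
    scaled = torch.round(x * (C / max1))
    # Clamp first so borderline values saturate at the int8 limits instead of
    # being handed to the integer cast out of range.
    xq = torch.clamp(scaled, -128, 127).to(torch.int8)
    return xq, max1


x = torch.randn(4, 16)
xq, max1 = rowwise_quant_int8(x)
assert xq.dtype == torch.int8 and int(xq.abs().max()) <= 127
x_hat = xq.float() * max1 / C  # approximate dequantization for comparison
```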
LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING: + raise ValueError(f"Unrecognized weights format {weights_format}") + + weights_format = LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING[weights_format] + + destination[format_name] = torch.tensor(weights_format, dtype=torch.uint8) def _load_from_state_dict( self, diff --git a/bitsandbytes/utils.py b/bitsandbytes/utils.py index 92744dead..fa9a7eb70 100644 --- a/bitsandbytes/utils.py +++ b/bitsandbytes/utils.py @@ -374,3 +374,7 @@ def __eq__(self, other): else self.state2 is other.state2 ) ) + + +LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING = {"row": 0, "col32": 1, "col_turing": 2, "col_ampere": 3} +INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING = {val: name for (name, val) in LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING.items()} diff --git a/csrc/ops.cu b/csrc/ops.cu index 796211fed..3a6ffdda8 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -58,7 +58,7 @@ template void quantizeBlockwise(floa num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1; if(blocksize == 4096) - kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); + kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 2048) kQuantizeBlockwise<<>>(code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 1024) diff --git a/csrc/ops.hip b/csrc/ops.hip index 67cece5c1..157e84629 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -64,7 +64,7 @@ template void quantizeBlockwise(floa num_blocks = n % blocksize == 0 ? num_blocks : num_blocks + 1; if(blocksize == 4096) - hipLaunchKernelGGL(( kQuantizeBlockwise), dim3(num_blocks), dim3(1024), 0, 0, code, A, absmax, out, rand, rand_offset, n); + hipLaunchKernelGGL(( kQuantizeBlockwise), dim3(num_blocks), dim3(1024), 0, 0, code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 2048) hipLaunchKernelGGL(( kQuantizeBlockwise), dim3(num_blocks), dim3(512), 0, 0, code, A, absmax, out, rand, rand_offset, n); else if(blocksize == 1024) diff --git a/csrc/test_delete_later.c b/csrc/test_delete_later.c deleted file mode 100644 index 21dab4580..000000000 --- a/csrc/test_delete_later.c +++ /dev/null @@ -1,375 +0,0 @@ -// !!! This is a file automatically generated by hipify!!! -// Copyright (c) Facebook, Inc. and its affiliates. -// -// This source code is licensed under the MIT license found in the -// LICENSE file in the root directory of this source tree. - -#if BUILD_CUDA -#include -#endif -#include - -// We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary. -// We use macro functions to expand all the different optimizers. 
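The two mapping tables added to `bitsandbytes/utils.py` above back the serialization change in `Linear8bitLt._save_to_state_dict`: the weight format is now stored as a `torch.uint8` scalar tensor, and `maybe_rearrange_weight` converts it back to a string on load while still accepting the old string form. A short round-trip sketch (the dicts are copied from the diff; the surrounding control flow condenses the load path):

```py
# Round-tripping the 8-bit weight format through the integer encoding used by
# the new serialization code. The two dicts are the ones added to utils.py.
import torch

LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING = {"row": 0, "col32": 1, "col_turing": 2, "col_ampere": 3}
INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING = {v: k for k, v in LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING.items()}

# Saving: encode the format string as a small uint8 tensor.
formatB = "col_turing"
stored = torch.tensor(LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING[formatB], dtype=torch.uint8)

# Loading: accept either the legacy string or the new tensor encoding.
weight_format = stored
if isinstance(weight_format, torch.Tensor):
    weight_format = weight_format.item()
if isinstance(weight_format, int):
    if weight_format not in INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING:
        raise ValueError(f"Expected supported weight format - got {weight_format}")
    weight_format = INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING[weight_format]

assert weight_format == formatB
```

Accepting both the integer tensor and the legacy string on load is what keeps the new on-disk format backwards compatible with checkpoints written by earlier releases.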
Looks ugly, and is ugly, but its better than to -// maintain all that boilerplate -//=================================================================================== -// UNMANGLED CALLS -//=================================================================================== - -#if BUILD_CUDA -void estimateQuantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles(A, code, offset, n); } -void estimateQuantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles(A, code, offset, n); } - - -//void gemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc) -//{ gemm_host(M, N, K, A, B, out, lda, ldb, ldc, 32); } -void gemm_host_fp16(int M, int N, int K, half * A, half* B, half * out, int lda, int ldb, int ldc) -{ gemm_host(M, N, K, A, B, out, lda, ldb, ldc, 16); } - -void gemm_4bit_inference(int m, int n, int k, half * A, unsigned char* B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize) -{ gemm_4bit_inference(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); } - -#define MAKE_ELEMENTWISE_FUNC(fname, type_name, ctype, FUNC) \ -void fname##_##type_name(ctype *A, ctype *B, ctype value, long n){ func(A, B, value, n); } \ - -MAKE_ELEMENTWISE_FUNC(fill, fp32, float, FILL) -MAKE_ELEMENTWISE_FUNC(fill, uint8, unsigned char, FILL) -MAKE_ELEMENTWISE_FUNC(arange, fp32, float, ARANGE) -MAKE_ELEMENTWISE_FUNC(_mul, fp32, float, _MUL) - - -#define MAKE_FUNC32(fname, oname, gtype, gbits) \ -void fname##32bit_grad_##gbits(gtype *g, gtype *p, \ - float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \ - const float beta1, const float beta2, const float eps, const float weight_decay, \ - const int step, const float lr, float gnorm_scale, bool skip_zeros, const int n) \ -{ optimizer32bit(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n); } \ - -MAKE_FUNC32(momentum, MOMENTUM, float, 32) -MAKE_FUNC32(momentum, MOMENTUM, half, 16) -MAKE_FUNC32(adam, ADAM, float, fp32) -MAKE_FUNC32(adam, ADAM, half, fp16) -MAKE_FUNC32(adam, ADAM, __nv_bfloat16, bf16) -MAKE_FUNC32(rmsprop, RMSPROP, float, 32) -MAKE_FUNC32(rmsprop, RMSPROP, half, 16) -MAKE_FUNC32(lion, LION, float, fp32) -MAKE_FUNC32(lion, LION, half, fp16) -MAKE_FUNC32(lion, LION, __nv_bfloat16, bf16) -MAKE_FUNC32(adagrad, ADAGRAD, float, 32) -MAKE_FUNC32(adagrad, ADAGRAD, half, 16) - -#define MAKE_FUNC8(fname, oname, gtype, gbits) \ -void fname##_static_8bit_grad_##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \ - float *unorm, float max_unorm, float param_norm, \ - float beta1, float beta2, \ - float eps, int step, float lr, \ - float* quantiles1, float* quantiles2, \ - float* max1, float* max2, float* new_max1, float* new_max2, \ - float weight_decay, float gnorm_scale, int n) \ -{ \ - optimizerStatic8bit(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \ - quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \ -} \ - -MAKE_FUNC8(adam, ADAM, float, 32) -MAKE_FUNC8(adam, ADAM, half, 16) -MAKE_FUNC8(momentum, MOMENTUM, float, 32) -MAKE_FUNC8(momentum, MOMENTUM, half, 16) -MAKE_FUNC8(rmsprop, RMSPROP, float, 32) -MAKE_FUNC8(rmsprop, RMSPROP, half, 16) -MAKE_FUNC8(lion, LION, float, 32) -MAKE_FUNC8(lion, LION, half, 16) - -#define MAKE_BLOCKWISE8(fname, optim_name, gtype, gbits) \ -void fname##_8bit_blockwise_grad_##gbits(gtype* p, gtype* g, \ - unsigned char* state1, unsigned char* 
state2, float beta1, float beta2, float eps, int step, float lr, \ - float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n)\ -{ optimizerStatic8bitBlockwise(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n); }\ - -MAKE_BLOCKWISE8(adam, ADAM, half, fp16) -MAKE_BLOCKWISE8(adam, ADAM, float, fp32) -MAKE_BLOCKWISE8(momentum, MOMENTUM, half, fp16) -MAKE_BLOCKWISE8(momentum, MOMENTUM, float, fp32) -MAKE_BLOCKWISE8(rmsprop, RMSPROP, half, fp16) -MAKE_BLOCKWISE8(rmsprop, RMSPROP, float, fp32) -MAKE_BLOCKWISE8(adagrad, ADAGRAD, half, fp16) -MAKE_BLOCKWISE8(adagrad, ADAGRAD, float, fp32) -MAKE_BLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16) -MAKE_BLOCKWISE8(lion, LION, half, fp16) -MAKE_BLOCKWISE8(lion, LION, float, fp32) -MAKE_BLOCKWISE8(lion, LION, __nv_bfloat16, bf16) - - -void percentileClipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping(g, gnorm_vec, step, n); } -void percentileClipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping(g, gnorm_vec, step, n); } - -void quantizeBlockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, blocksize, n); } -void quantizeBlockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(code, A, absmax, out, NULL, 0, blocksize, n); } -void quantizeBlockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } -void quantizeBlockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } -void quantizeBlockwise_fp16_nf4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } -void quantizeBlockwise_fp32_nf4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise(NULL, A, absmax, out, NULL, 0, blocksize, n); } - -void dequantizeBlockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, out, blocksize, n); } \ -void dequantizeBlockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(code, A, absmax, out, blocksize, n); } -void dequantizeBlockwise_fp16_fp4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, out, blocksize, n); } \ -void dequantizeBlockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, out, blocksize, n); } -void dequantizeBlockwise_fp16_nf4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, out, blocksize, n); } \ -void dequantizeBlockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise(NULL, A, absmax, out, blocksize, n); } - - -#define MAKE_FUNC_TRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \ -void 
transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(cublasLtHandle_t ltHandle, dtype *A, dtype *out, int dim1, int dim2) \ -{ \ - transform(ltHandle, A, out, dim1, dim2); \ -} \ - -MAKE_FUNC_TRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8); -MAKE_FUNC_TRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8); -MAKE_FUNC_TRANSFORM(8, row, col32, n, int8_t, ROW, COL32, false, 8); -MAKE_FUNC_TRANSFORM(32, row, col32, n, int32_t, ROW, COL32, false, 32); -MAKE_FUNC_TRANSFORM(8, row, col_turing, n, int8_t, ROW, COL_TURING, false, 8); -MAKE_FUNC_TRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8); -MAKE_FUNC_TRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8); -MAKE_FUNC_TRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32); - -void transform_row2col32(char * A, char *out, int rows, int cols){ transformRowToFormat(A, out, rows, cols); } -void transform_row2col32T(char * A, char *out, int rows, int cols){ transformRowToFormat(A, out, rows, cols); } -void transform_row2turing(char * A, char *out, int rows, int cols){ transformRowToFormat(A, out, rows, cols); } -void transform_row2turingT(char * A, char *out, int rows, int cols){ transformRowToFormat(A, out, rows, cols); } -void transform_row2ampere(char * A, char *out, int rows, int cols){ transformRowToFormat(A, out, rows, cols); } -void transform_row2ampereT(char * A, char *out, int rows, int cols){ transformRowToFormat(A, out, rows, cols); } - -void extractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers(A, idx, out, idx_size, rows, cols); } -void extractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers(A, idx, out, idx_size, rows, cols); } - - int igemmlt_turing_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - - int igemmlt_turing_8(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - - int igemmlt_turing_8_rowscale(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - - int igemmlt_ampere_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - - int igemmlt_ampere_8(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - - int igemmlt_ampere_8_rowscale(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - -void spmm_coo_very_sparse_naive_fp16(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB) -{ spmm_coo_very_sparse_naive(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, 
colsB); } - -void spmm_coo_very_sparse_naive_int8(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB) -{ spmm_coo_very_sparse_naive(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); } -#endif - -extern "C" -{ -#if BUILD_CUDA - void cestimate_quantiles_fp32(float *A, float *code, float offset, int n){ estimateQuantiles_fp32(A, code, offset, n); } - void cestimate_quantiles_fp16(half *A, float *code, float offset, int n){ estimateQuantiles_fp16(A, code, offset, n); } - void cquantize(float *code, float *A, unsigned char *out, int n){ quantize(code, A, out, n); } - void cdequantize(float *code, unsigned char *A, float *out, int n){ dequantize(code, A, out, n); } - void cquantize_blockwise_fp16(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); } - void cquantize_blockwise_fp32(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); } - - void cdequantize_blockwise_fp16(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16(code, A, absmax, out, blocksize, n); } - void cdequantize_blockwise_fp32(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32(code, A, absmax, out, blocksize, n); } - - void cquantize_blockwise_fp16_fp4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16_fp4(code, A, absmax, out, blocksize, n); } - void cquantize_blockwise_fp32_fp4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); } - void cdequantize_blockwise_fp16_fp4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16_fp4(code, A, absmax, out, blocksize, n); } - void cdequantize_blockwise_fp32_fp4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_fp4(code, A, absmax, out, blocksize, n); } - void cquantize_blockwise_fp16_nf4(float * code, half *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp16_nf4(code, A, absmax, out, blocksize, n); } - void cquantize_blockwise_fp32_nf4(float * code, float *A, float *absmax, unsigned char *out, int blocksize, const int n){ quantizeBlockwise_fp32_nf4(code, A, absmax, out, blocksize, n); } - void cdequantize_blockwise_fp16_nf4(float *code, unsigned char *A, float *absmax, half *out, int blocksize, const int n){ dequantizeBlockwise_fp16_nf4(code, A, absmax, out, blocksize, n); } - void cdequantize_blockwise_fp32_nf4(float *code, unsigned char *A, float *absmax, float *out, int blocksize, const int n){ dequantizeBlockwise_fp32_nf4(code, A, absmax, out, blocksize, n); } - - #define MAKE_CFUNC32(name, gtype, gbits) \ - void c##name##32bit_grad_##gbits(gtype *g, gtype *p, \ - float* state1, float* state2, float *unorm, float max_unorm, float param_norm, \ - const float beta1, const float beta2, const float eps, const float weight_decay, \ - const int step, const float lr, const float gnorm_scale, bool skip_zeros, const int n) \ - { name##32bit_grad_##gbits(g, p, state1, 
state2, unorm, max_unorm, param_norm, beta1, beta2, eps, weight_decay, step, lr, gnorm_scale, skip_zeros, n); } \ - - MAKE_CFUNC32(adam, float, fp32) - MAKE_CFUNC32(adam, half, fp16) - MAKE_CFUNC32(adam, __nv_bfloat16, bf16) - MAKE_CFUNC32(momentum, float, 32) - MAKE_CFUNC32(momentum, half, 16) - MAKE_CFUNC32(rmsprop, float, 32) - MAKE_CFUNC32(rmsprop, half, 16) - MAKE_CFUNC32(lion, float, fp32) - MAKE_CFUNC32(lion, half, fp16) - MAKE_CFUNC32(lion, __nv_bfloat16, bf16) - MAKE_CFUNC32(adagrad, float, 32) - MAKE_CFUNC32(adagrad, half, 16) - - #define MAKE_CFUNC8(name, gtype, gbits) \ - void c##name##_static_8bit_grad_##gbits(gtype* p, gtype* g, unsigned char* state1, unsigned char* state2, \ - float *unorm, float max_unorm, float param_norm, \ - float beta1, float beta2, \ - float eps, int step, float lr, \ - float* quantiles1, float* quantiles2, \ - float* max1, float* max2, float* new_max1, float* new_max2, \ - float weight_decay, float gnorm_scale, int n) \ - { \ - name##_static_8bit_grad_##gbits(g, p, state1, state2, unorm, max_unorm, param_norm, beta1, beta2, eps, step, lr, \ - quantiles1, quantiles2, max1, max2, new_max1, new_max2, weight_decay, gnorm_scale, n); \ - } \ - - MAKE_CFUNC8(adam, float, 32) - MAKE_CFUNC8(adam, half, 16) - MAKE_CFUNC8(momentum, float, 32) - MAKE_CFUNC8(momentum, half, 16) - MAKE_CFUNC8(rmsprop, float, 32) - MAKE_CFUNC8(rmsprop, half, 16) - MAKE_CFUNC8(lion, float, 32) - MAKE_CFUNC8(lion, half, 16) - - #define MAKE_CBLOCKWISE8(fname, optim_name, gtype, gbits) \ - void c##fname##_8bit_blockwise_grad_##gbits(gtype* p, gtype* g, \ - unsigned char* state1, unsigned char* state2, float beta1, float beta2, float eps, int step, float lr, \ - float* quantiles1, float* quantiles2, float* absmax1, float* absmax2, float weight_decay, const float gnorm_scale, bool skip_zeros, int n) \ - { fname##_8bit_blockwise_grad_##gbits(p, g, state1, state2, beta1, beta2, eps, step, lr, quantiles1, quantiles2, absmax1, absmax2, weight_decay, gnorm_scale, skip_zeros, n); } \ - - MAKE_CBLOCKWISE8(adam, ADAM, half, fp16) - MAKE_CBLOCKWISE8(adam, ADAM, float, fp32) - MAKE_CBLOCKWISE8(momentum, MOMENTUM, half, fp16) - MAKE_CBLOCKWISE8(momentum, MOMENTUM, float, fp32) - MAKE_CBLOCKWISE8(rmsprop, RMSPROP, half, fp16) - MAKE_CBLOCKWISE8(rmsprop, RMSPROP, float, fp32) - MAKE_CBLOCKWISE8(adagrad, ADAGRAD, half, fp16) - MAKE_CBLOCKWISE8(adagrad, ADAGRAD, float, fp32) - MAKE_CBLOCKWISE8(adam, ADAM, __nv_bfloat16, bf16) - MAKE_CBLOCKWISE8(lion, LION, half, fp16) - MAKE_CBLOCKWISE8(lion, LION, float, fp32) - MAKE_CBLOCKWISE8(lion, LION, __nv_bfloat16, bf16) - - void cpercentile_clipping_g32(float * g, float *gnorm_vec, int step, const int n){ percentileClipping_g32(g, gnorm_vec, step, n); } - void cpercentile_clipping_g16(half * g, float *gnorm_vec, int step, const int n){ percentileClipping_g16(g, gnorm_vec, step, n); } - void chistogram_scatter_add_2d(float* histogram, int *index1, int *index2, float *src, int maxidx1, int n){ histogramScatterAdd2D(histogram, index1, index2, src, maxidx1, n); } - - void cigemm(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc) - { gemmex(context, transposeA, transposeB, m, n, k, A, B, C, lda, ldb, ldc); } - void cbatched_igemm(Context *context, bool transposeA, bool transposeB, int m, int n, int k, void *A, void *B, void *C, int lda, int ldb, int ldc, - long strideA, long strideB, long strideC, int batchCount) - { strided_gemmex(context, transposeA, transposeB, m, n, k, A, B, C, lda, 
ldb, ldc, strideA, strideB, strideC, batchCount); } - - Context *get_context(){ return new Context(); } - ContextCusparse *get_cusparse(){ return new ContextCusparse(); } - - int cigemmlt_turing_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt_turing_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - //{ (cublasLtHandle_t)context->m_handle; return 0; } - //{ return 0; }//igemmlt_turing_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - - int cigemmlt_turing_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt_turing_8((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - - int cigemmlt_turing_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt_turing_8_rowscale((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - - int cigemmlt_ampere_32(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt_ampere_32((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - - int cigemmlt_ampere_8_rowscale(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt_ampere_8_rowscale((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - - int cigemmlt_ampere_8(Context *context, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc) - { return igemmlt_ampere_8((cublasLtHandle_t) context->m_handle, m, n, k, A, B, C, row_scale, lda, ldb, ldc); } - - #define MAKE_FUNC_CTRANSFORM(fbits, fsrc, ftrgt, ftranspose, dtype, src, target, transpose, bits) \ - void ctransform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose(Context *context, dtype *A, dtype *out, int dim1, int dim2) \ - { \ - transform_##fbits##_##fsrc##_to_##ftrgt##_##ftranspose((cublasLtHandle_t) context->m_handle, A, out, dim1, dim2); \ - } \ - - MAKE_FUNC_CTRANSFORM(8, row, col, n, int8_t, ROW, COL, false, 8) - MAKE_FUNC_CTRANSFORM(8, row, row, n, int8_t, ROW, ROW, false, 8) - MAKE_FUNC_CTRANSFORM(8, row, col32, n, int8_t, ROW, COL32, false, 8) - MAKE_FUNC_CTRANSFORM(32, row, col32, n, int32_t, ROW, COL32, false, 32) - MAKE_FUNC_CTRANSFORM(8, row, col_turing, n, int8_t, ROW, COL_TURING, false, 8) - MAKE_FUNC_CTRANSFORM(8, row, col_ampere, n, int8_t, ROW, COL_AMPERE, false, 8) - MAKE_FUNC_CTRANSFORM(8, col32, row, n, int8_t, COL32, ROW, false, 8) - MAKE_FUNC_CTRANSFORM(32, col32, row, n, int32_t, COL32, ROW, false, 32) - - void cdequant_mm_int32_fp16(int *A, float *rowStats, float *colStats, half *out, float* newRowStats, float* newcolStats, half* bias, int numRows, int numCols) - { dequant_mm_int32_fp16(A, rowStats, colStats, out, newRowStats, newcolStats, bias, numRows, numCols); } - void cget_col_row_stats(half * A, float *rowStats, float *colStats, int *nnz_count_row, float nnz_threshold, int rows, int cols) - { getColRowStats(A, rowStats, colStats, nnz_count_row, nnz_threshold, rows, cols); } - - void cdouble_rowcol_quant(half * A, float *rowStats, float *colStats, char *out_col_normed, char *out_row_normed, int *rowidx, int 
*colidx, half *val, int *nnz_row_ptr, float threshold, int rows, int cols) - { doubleRowColQuant(A, rowStats, colStats, out_col_normed, out_row_normed, rowidx, colidx, val, nnz_row_ptr, threshold, rows, cols); } - - void ctransform_row2col32(char * A, char *out, int rows, int cols) - { transform_row2col32(A, out, rows, cols); } - - void ctransform_row2col32T(char * A, char *out, int rows, int cols) - { transform_row2col32T(A, out, rows, cols); } - - void ctransform_row2turing(char * A, char *out, int rows, int cols) - { transform_row2turing(A, out, rows, cols); } - - void ctransform_row2turingT(char * A, char *out, int rows, int cols) - { transform_row2turingT(A, out, rows, cols); } - - void ctransform_row2ampere(char * A, char *out, int rows, int cols) - { transform_row2ampere(A, out, rows, cols); } - - void ctransform_row2ampereT(char * A, char *out, int rows, int cols) - { transform_row2ampereT(A, out, rows, cols); } - - void cspmm_coo(ContextCusparse *context, int *A_rowidx, int *A_colidx, half *A_vals, int A_nnz, int A_rows, int A_cols, int B_cols, int ldb, half *B, int ldc, half* C, bool transposed_B) - { spmm_coo((hipsparseHandle_t) context->m_handle, A_rowidx, A_colidx, A_vals, A_nnz, A_rows, A_cols, B_cols, ldb, B, ldc, C, transposed_B); } - - void cspmm_coo_very_sparse_naive_fp16(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB) - { spmm_coo_very_sparse_naive_fp16(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); } - - void cspmm_coo_very_sparse_naive_int8(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB) - { spmm_coo_very_sparse_naive_int8(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); } - - void cextractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_turing(A, idx, out, idx_size, rows, cols); } - void cextractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_ampere(A, idx, out, idx_size, rows, cols); } - - //void cgemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc) - //{ gemm_host_fp32(M, N, K, A, B, out, lda, ldb, ldc); } - - void cgemm_host_fp16(int M, int N, int K, half * A, half* B, half * out, int lda, int ldb, int ldc) - { gemm_host_fp16(M, N, K, A, B, out, lda, ldb, ldc); } - - void cgemm_4bit_inference(int m, int n, int k, half * A, unsigned char* B, float *absmax, half * out, int lda, int ldb, int ldc, int blocksize) - { gemm_4bit_inference(m, n, k, A, B, absmax, out, lda, ldb, ldc, blocksize); } - - void *cget_managed_ptr(size_t bytes) - { - void *ptr; - CUDA_CHECK_RETURN(hipMallocManaged(&ptr, bytes, hipMemAttachHost)); - CUDA_CHECK_RETURN(hipPeekAtLastError()); - - return ptr; - } - - void cprefetch(void *ptr, size_t bytes, int device) - { - CUDA_CHECK_RETURN(hipMemPrefetchAsync(ptr, bytes, device, 0)); - CUDA_CHECK_RETURN(hipPeekAtLastError()); - } - - #define CMAKE_ELEMENTWISE_FUNC(fname, type_name, ctype, FUNC) \ - void c##fname##_##type_name(ctype *A, ctype *B, ctype value, long n){ fname##_##type_name(A, B, value, n); } \ - - CMAKE_ELEMENTWISE_FUNC(fill, fp32, float, FILL) - CMAKE_ELEMENTWISE_FUNC(fill, 
uint8, unsigned char, FILL) - CMAKE_ELEMENTWISE_FUNC(arange, fp32, float, ARANGE) - CMAKE_ELEMENTWISE_FUNC(_mul, fp32, float, _MUL) - -#endif - void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } - void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } -} diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 2184cce8c..fdfe19ee4 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -12,6 +12,8 @@ title: 8-bit optimizers - local: algorithms title: Algorithms + - local: fsdp_qlora + title: FSDP-QLoRA - local: integrations title: Integrations - local: errors diff --git a/docs/source/fsdp_qlora.md b/docs/source/fsdp_qlora.md new file mode 100644 index 000000000..47922cfcc --- /dev/null +++ b/docs/source/fsdp_qlora.md @@ -0,0 +1,106 @@ +# FSDP-QLoRA + +FSDP-QLoRA combines data parallelism (FSDP enables sharding model parameters, optimizer states, and gradients across GPUs), 4-bit quantization, and LoRA to train LLMs up to 70B parameters on a dual 24GB GPU system. This technique was released by [Answer.AI](https://www.answer.ai/posts/2024-03-06-fsdp-qlora) in collaboration with bitsandbytes to make training LLMs more efficient and accessible for everyone. + +This guide provides a brief guide on how bitsandbytes supports storing quantized weights to enable FSDP-QLoRA, and how to run training with the Hugging Face libraries. + +> [!TIP] +> Other changes required for bitsandbytes to support FSDP-QLoRA, such as reconstructing the weights from the quantization metadata and preventing quantizing already quantized weights when they're moved from a CPU to GPU, are documented in this [Pull Request](https://github.com/TimDettmers/bitsandbytes/pull/970) and described in the [Enabling 70B Finetuning on Consumer GPUs](https://www.answer.ai/posts/2024-03-14-fsdp-qlora-deep-dive) blog post. We highly recommend reading these resources for a better understanding of FSDP-QLoRA! + +## Quantized data storage + +FSDP only supports sharding float data types which can be problematic because quantized weights are typically stored as integer data types (uint8). bitsandbytes doesn't have this problem because it uses `StoreChar` to read and write quantized weights regardless of the data type storage. This makes it simple to add a `quant_storage` parameter to the [`~nn.Linear4bit`] and [`~nn.Params4bit`] classes and set it to `torch.uint8` to maintain backward compatibility with the codebase. + +```py +import torch +import bitsandbytes as bnb + +model = bnb.nn.Linear4bit( + input_features, + output_features, + quant_type="fp4", + quant_storage=torch.uint8, +) +``` + +With the `quant_storage` parameter, you can select any of the FSDP supported data types to shard [`~nn.Linear4bit`] with such as bfloat16, float16 or float32. + +## Training + +bitsandbytes is deeply integrated with the Hugging Face ecosystem, making it easy to use with libraries like [Transformers](https://hf/co/docs/transformers), [PEFT](https://hf/co/docs/peft), and [TRL](https://hf/co/docs/trl). + +Before you begin, make sure you have the latest libraries installed. 
+ +```bash +pip install -U bitsandbytes accelerate transformers peft trl +``` + +> [!TIP] +> PEFT provides a configuration file ([fsdp_config_qlora.yaml](https://github.com/huggingface/peft/blob/main/examples/sft/configs/fsdp_config_qlora.yaml)), launch command ([run_peft_qlora_fsdp.sh](https://github.com/huggingface/peft/blob/main/examples/sft/run_peft_qlora_fsdp.sh)), and training script ([train.py](https://github.com/huggingface/peft/blob/main/examples/sft/train.py)) for FSDP-QLoRA. To learn more, check out the [Use PEFT QLoRA and FSDP for finetuning large models on multiple GPUs](https://huggingface.co/docs/peft/main/en/accelerate/fsdp#use-peft-qlora-and-fsdp-for-finetuning-large-models-on-multiple-gpus) documentation. + +The important change that enables FSDP-QLoRA training is the `bnb_4bit_quant_storage` parameter in the [`~transformers.BitsAndBytesConfig`] class. This allows you to set the storage data type of the quantized weights to a float data type. + +```py +from transformers import BitsAndBytesConfig + +bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_storage=torch.bfloat16, +) +``` + +Pass the [`~transformers.BitsAndBytesConfig`] to a model to set it up for FSDP-QLoRA. You should set the `torch_dtype` parameter to match `bnb_4bit_quant_storage` so that the [`~nn.Linear4bit`] layers are wrapped identically to the `Linear` layers. If the storage types do not match, then each [`~nn.Linear4bit`] layer is wrapped individually. + +```py +from transformers import AutoModelForCausalLM + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-70b", + quantization_config=bnb_config, + torch_dtype=torch.bfloat16, +) +``` + +Configure the [`~peft.LoraConfig`] class for QLoRA training by setting `target_modules="all-linear"`. + +```py +from peft import LoraConfig + +peft_config = LoraConfig( + lora_alpha=16, + lora_dropout=0.1, + r=64, + bias="none", + task_type="CAUSAL_LM", + target_modules="all-linear", +) +``` + +Now you can pass everything to the [`~trl.SFTTrainer`] for training. + +```py +from trl import SFTTrainer + +trainer = SFTTrainer( + model=model, + train_dataset=dataset, + peft_config=peft_config, + dataset_text_field="text", + max_seq_length=max_seq_length, + tokenizer=tokenizer, + args=training_arguments, +) +trainer.train() +``` + +## Resources + +To learn more about FSDP and QLoRA, check out the following resources: + +- The [AnswerDotAI/fsdp_qlora](https://github.com/AnswerDotAI/fsdp_qlora) repository. +- The introductory [You can now train a 70b language model at home](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) blog post by Answer.AI. +- For an introduction to FSDP, read the [Introducing PyTorch Fully Sharded Data Parallel (FSDP) API](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api) blog post. +- For more details about QLoRA, take a look at the [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes) blog post. diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index d0dd7ba76..caf22488f 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -1,9 +1,17 @@ # Installation -bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**. Select your operating system below to see the installation instructions. 
+bitsandbytes is only supported on CUDA GPUs for CUDA versions **11.0 - 12.3**. - - +The latest version of bitsandbytes (v0.43.0) builds on: + +| OS | CUDA | Compiler | +|---|---|---| +| Linux | 11.7 - 12.3 | GCC 11.4 | +| | 12.4+ | GCC 13.2 | +| Windows | 11.7 - 12.4 | MSVC 19.38+ (VS2022 17.8.0+) | + +> [!TIP] +> MacOS support is still a work in progress! Subscribe to this [issue](https://github.com/TimDettmers/bitsandbytes/issues/1020) to get notified about discussions and to track the integration progress. For Linux systems, make sure your hardware meets the following requirements to use bitsandbytes features. @@ -23,13 +31,26 @@ pip install bitsandbytes ## Compile from source +For Linux and Windows systems, you can compile bitsandbytes from source. Installing from source allows for more build options with different CMake configurations. + + + + To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. Make sure you have a compiler installed to compile C++ (gcc, make, headers, etc.). For example, to install a compiler and CMake on Ubuntu: ```bash apt-get install -y build-essential cmake ``` -You should also install CUDA Toolkit by following the [NVIDIA CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) guide from NVIDIA. +You should also install CUDA Toolkit by following the [NVIDIA CUDA Installation Guide for Linux](https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html) guide from NVIDIA. The current expected CUDA Toolkit version is **11.1+** and it is recommended to install **GCC >= 7.3** and required to have at least **GCC >= 6**. + +Refer to the following table if you're using another CUDA Toolkit version. + +| CUDA Toolkit | GCC | +|---|---| +| >= 11.4.1 | >= 11 | +| >= 12.0 | >= 12 | +| >= 12.4 | >= 13 | Now to install the bitsandbytes package from source, run the following commands: @@ -49,7 +70,13 @@ pip install . Windows systems require Visual Studio with C++ support as well as an installation of the CUDA SDK. -You'll need to build bitsandbytes from source. To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA. +To compile from source, you need CMake >= **3.22.1** and Python >= **3.8** installed. You should also install CUDA Toolkit by following the [CUDA Installation Guide for Windows](https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html) guide from NVIDIA. + +Refer to the following table if you're using another CUDA Toolkit version. + +| CUDA Toolkit | MSVC | +|---|---| +| >= 11.6 | 19.30+ (VS2022) | ```bash git clone https://github.com/TimDettmers/bitsandbytes.git && cd bitsandbytes/ @@ -61,12 +88,6 @@ python -m build --wheel Big thanks to [wkpark](https://github.com/wkpark), [Jamezo97](https://github.com/Jamezo97), [rickardp](https://github.com/rickardp), [akx](https://github.com/akx) for their amazing contributions to make bitsandbytes compatible with Windows. - - - -> [!TIP] -> MacOS support is still a work in progress! Subscribe to this [issue](https://github.com/TimDettmers/bitsandbytes/issues/1020) to get notified about discussions and to track the integration progress. 
- diff --git a/docs/source/integrations.mdx b/docs/source/integrations.mdx index 4badece49..42b8edf03 100644 --- a/docs/source/integrations.mdx +++ b/docs/source/integrations.mdx @@ -12,7 +12,7 @@ With Transformers, it's very easy to load any model in 4 or 8-bit and quantize t For example, to load and quantize a model to 4-bits and use the bfloat16 data type for compute: > [!WARNING] -> bfloat16 is the optimal compute data type if your hardware supports it. The default is float32 for backward compatibility and numerical stability, but it can often lead to numerical instabilities. bfloat16 provides the best of both worlds, numerical stability equivalent to float32, but combined with the memory footprint and significant computation speedup of a 16-bit data type. Make sure to check if your hardware supports bfloat16 and if it does, configure it using the `bnb_4bit_compute_dtype` parameter in [`~transformers.BitsAndBytesConfig`]! +> bfloat16 is the ideal `compute_dtype` if your hardware supports it. While the default `compute_dtype`, float32, ensures backward compatibility (due to wide-ranging hardware support) and numerical stability, it is large and slows down computations. In contrast, float16 is smaller and faster but can lead to numerical instabilities. bfloat16 combines the best aspects of both; it offers the numerical stability of float32 and the reduced memory footprint and speed of a 16-bit data type. Check if your hardware supports bfloat16 and configure it using the `bnb_4bit_compute_dtype` parameter in [`~transformers.BitsAndBytesConfig`]! ```py from transformers import AutoModelForCausalLM, BitsAndBytesConfig diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h index 91dded6f4..547ca9955 100644 --- a/include/Algo-Direct2.h +++ b/include/Algo-Direct2.h @@ -94,8 +94,8 @@ struct AlgoVecBase::val __m128 vxp = _mm_shuffle_ps(xp01, xp23, (1) + (3 << 2) + (1 << 4) + (3 << 6)); #endif IVec i(u.vec); - IVec vlem = operator< (vz, vxm); - IVec vlep = operator< (vz, vxp); + IVec vlem = vz < vxm; + IVec vlep = vz < vxp; i = i + vlem + vlep; i.store(pr); } @@ -124,8 +124,8 @@ struct AlgoVecBase::val __m128d vxp = _mm_shuffle_pd(vx0, vx1, 3); IVec i(b1, b0); - IVec vlem = operator< (vz, vxm); - IVec vlep = operator< (vz, vxp); + IVec vlem = (vz < vxm); + IVec vlep = (vz < vxp); i = i + vlem + vlep; union { @@ -229,8 +229,8 @@ struct AlgoVecBase::val #endif - IVec vlem = operator< (vz, vxm); - IVec vlep = operator< (vz, vxp); + IVec vlem = vz < vxm; + IVec vlep = vz < vxp; ip = ip + vlem + vlep; ip.store(pr); @@ -279,8 +279,8 @@ struct AlgoVecBase::val // FVec vxp = _mm256_insertf128_pd(_mm256_castpd128_pd256(h01p), h23p, 1); IVec i(u.vec); - IVec vlem = operator< (vz, vxm); - IVec vlep = operator< (vz, vxp); + IVec vlem = vz < vxm; + IVec vlep = vz < vxp; i = i + vlem + vlep; i.extractLo32s().store(pr); } diff --git a/include/SIMD.h b/include/SIMD.h index e97f5fc33..9d1410c73 100644 --- a/include/SIMD.h +++ b/include/SIMD.h @@ -307,7 +307,7 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec< FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_ps( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_ps( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttps_epi32(a); } -#if !defined(__clang__) || defined(__HIP_PLATFORM_AMD__) // Conflicts with builtin operator +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmple_ps( 
a, b ) ); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); } @@ -363,7 +363,7 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_pd( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_pd( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttpd_epi32(a); } -#if !defined(__clang__) || defined(__HIP_PLATFORM_AMD__) // Conflicts with builtin operator +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); } diff --git a/requirements-ci.txt b/requirements-ci.txt index 61f92018a..24e2db324 100644 --- a/requirements-ci.txt +++ b/requirements-ci.txt @@ -1,6 +1,6 @@ # Requirements used for GitHub actions -pytest==8.1.1 -einops==0.7.0 -lion-pytorch==0.1.2 +pytest==8.2.0 +einops==0.8.0 +lion-pytorch==0.1.4 scipy==1.10.1; python_version < "3.9" -scipy==1.12.0; python_version >= "3.9" +scipy==1.13.0; python_version >= "3.9" diff --git a/requirements-dev.txt b/requirements-dev.txt index fc5449ba7..0334896be 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,9 +1,9 @@ # Requirements used for local development setuptools>=63 -pytest~=8.1.1 -einops~=0.7.0 +pytest~=8.2.0 +einops~=0.8.0 wheel~=0.43.0 -lion-pytorch~=0.1.2 -scipy~=1.12.0 -pandas~=2.2.1 -matplotlib~=3.8.3 +lion-pytorch~=0.1.4 +scipy~=1.13.0 +pandas~=2.2.2 +matplotlib~=3.8.4 diff --git a/setup.py b/setup.py index a51b3867c..f8d6a92a1 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,7 @@ def has_ext_modules(self): setup( name="bitsandbytes", - version="0.44.0.dev", + version="0.43.2.dev", author="Tim Dettmers", author_email="dettmers@cs.washington.edu", description="k-bit optimizers and matrix multiplication routines.", diff --git a/tests/conftest.py b/tests/conftest.py index 17ffd281c..59146963d 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,3 +1,5 @@ +import gc + import pytest import torch @@ -20,6 +22,13 @@ def pytest_runtest_call(item): raise +@pytest.hookimpl(trylast=True) +def pytest_runtest_teardown(item, nextitem): + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + @pytest.fixture(scope="session") def requires_cuda() -> bool: cuda_available = torch.cuda.is_available() diff --git a/tests/test_functional.py b/tests/test_functional.py index 8acd5395d..8ddee9f9a 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -12,7 +12,14 @@ import bitsandbytes as bnb from bitsandbytes import functional as F from bitsandbytes.cextension import BNB_HIP_VERSION, HIP_ENVIRONMENT, ROCM_GPU_ARCH -from tests.helpers import BOOLEAN_TUPLES, TRUE_FALSE, describe_dtype, get_blocksizes, get_test_dims, id_formatter +from tests.helpers import ( + BOOLEAN_TUPLES, + TRUE_FALSE, + describe_dtype, + get_blocksizes, + get_test_dims, + id_formatter, +) torch.set_printoptions(precision=5, sci_mode=False, linewidth=120, edgeitems=20, threshold=10000) k = 20 @@ -575,28 +582,37 @@ def test_nvidia_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, trans 
@pytest.mark.parametrize("dim4", get_test_dims(32, 1024, n=1), ids=id_formatter("dim4")) @pytest.mark.parametrize("dims", (2, 3), ids=id_formatter("dims")) @pytest.mark.parametrize("ldb", (0,), ids=id_formatter("ldb")) -def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb): +@pytest.mark.parametrize("device", ("cuda", "cpu"), ids=id_formatter("device")) +def test_igemmlt_int(dim1, dim2, dim3, dim4, dims, ldb, device): for i in range(k): if dims == 2: - A = torch.randint(-128, 127, size=(dim1, dim3), device="cuda").to(torch.int8) + A = torch.randint(-128, 127, size=(dim1, dim3), device=device).to(torch.int8) elif dims == 3: - A = torch.randint(-128, 127, size=(dim1, dim2, dim3), device="cuda").to(torch.int8) - B = torch.randint(-128, 127, size=(dim4, dim3), device="cuda").to(torch.int8) + A = torch.randint(-128, 127, size=(dim1, dim2, dim3), device=device).to(torch.int8) + B = torch.randint(-128, 127, size=(dim4, dim3), device=device).to(torch.int8) C1 = torch.matmul(A.float(), B.t().float()) A2, SA = F.transform(A, "col32") B2, SB = F.transform(B, "col_turing") C2, SC = F.igemmlt(A2, B2, SA, SB) - C3, S = F.nvidia_transform(C2, "row", state=SC) + if device == "cpu": + assert SC is None + if device == "cuda": + C3, S = F.nvidia_transform(C2, "row", state=SC) + else: + C3, S = C2, None torch.testing.assert_close(C1, C3.float()) # transpose - B = torch.randint(-128, 127, size=(dim3, dim4), device="cuda").to(torch.int8) + B = torch.randint(-128, 127, size=(dim3, dim4), device=device).to(torch.int8) C1 = torch.matmul(A.float(), B.float()) B2t, SBt = F.transform(B, "col_turing", transpose=True) C2, SC = F.igemmlt(A2, B2t, SA, SBt) - C3, S = F.nvidia_transform(C2, "row", state=SC) + if device == "cuda": + C3, S = F.nvidia_transform(C2, "row", state=SC) + else: + C3, S = C2, None torch.testing.assert_close(C1, C3.float()) @@ -845,6 +861,33 @@ def test_dequant_mm(dim1, dim4, dims, formatB, has_bias): assert_all_approx_close(C1, C4, atol=0.015, rtol=0.1, count=int(0.01 * n)) +@pytest.mark.parametrize("dim1", get_test_dims(64, 256, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim4", get_test_dims(64, 1024, n=2), ids=id_formatter("dim4")) +@pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims")) +@pytest.mark.parametrize("has_bias", TRUE_FALSE, ids=id_formatter("has_bias")) +def test_dequant_mm_cpu(dim1, dim4, dims, has_bias): + inner = torch.randint(1, 128, size=(1,)).item() + bias = None + if has_bias: + bias = torch.randn(dim4, device="cpu", dtype=torch.bfloat16) + for i in range(1): + A = torch.randn(dim1, inner, device="cpu") + B = torch.randn(dim4, inner, device="cpu") + + A1, maxA = F.vectorwise_quant(A, dim=1) + B1, maxB = F.vectorwise_quant(B, dim=1) + + C2, SC = F.igemmlt(A1, B1, SA=None, SB=None) + assert SC is None + + C3 = F.vectorwise_mm_dequant(C2.bfloat16(), maxA, maxB.t()) + if has_bias: + C3 += bias + + C4 = F.mm_dequant(C2, SC, maxA.flatten(), maxB.flatten(), bias=bias) + torch.testing.assert_close(C3.float(), C4.float(), atol=0.05, rtol=0.1) + + @pytest.mark.parametrize("dim1", [1 * 1024], ids=id_formatter("dim1")) @pytest.mark.parametrize("dim2", [1 * 1024], ids=id_formatter("dim2")) @pytest.mark.parametrize("dims", (2,), ids=id_formatter("dims")) @@ -891,9 +934,13 @@ def test_colrow_absmax(dim1, dim2, dims): @pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1")) @pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2")) -def test_double_quant(dim1, dim2): 
+@pytest.mark.parametrize("device", ["cuda", "cpu"], ids=id_formatter("device")) +@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16], ids=id_formatter("dtype")) +def test_double_quant(dim1, dim2, device, dtype): + if device == "cuda" and dtype == torch.bfloat16: + pytest.skip("bfloat16 is not implemented for this operation on CUDA backend") for i in range(k): - A = torch.randn(dim1, dim2, device="cuda").half() + A = torch.randn(dim1, dim2, device=device).to(dtype) out_col1, Scol = F.vectorwise_quant(A, dim=0) out_row1, Srow = F.vectorwise_quant(A, dim=1) @@ -1125,6 +1172,33 @@ def test_transform(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): torch.testing.assert_close(out1, out2) +@pytest.mark.parametrize("dim1", get_test_dims(2, 1024, n=2), ids=id_formatter("dim1")) +@pytest.mark.parametrize("dim2", get_test_dims(2, 1024, n=2), ids=id_formatter("dim2")) +@pytest.mark.parametrize("dim3", [0], ids=id_formatter("dim3")) +@pytest.mark.parametrize("dims", [2], ids=id_formatter("dims")) +@pytest.mark.parametrize("dtype", [torch.int8], ids=describe_dtype) +@pytest.mark.parametrize("orderA", ["row"], ids=id_formatter("orderA")) +@pytest.mark.parametrize("orderOut", ["col32", "col_turing", "col_ampere"], ids=id_formatter("orderOut")) +@pytest.mark.parametrize("transpose", TRUE_FALSE, ids=id_formatter("transpose")) +def test_transform_cpu(dim1, dim2, dim3, dims, dtype, orderA, orderOut, transpose): + for i in range(k): + if dims == 2: + A = torch.randint(10, 99, size=(dim1, dim2), device="cpu").to(dtype) + elif dims == 3: + A = torch.randint(10, 99, size=(dim1, dim2, dim3), device="cpu").to(dtype) + + A.view(-1)[-1] = -1 + if transpose: + out1 = A.t().contiguous() + else: + out1 = A + out2, S2 = F.transform(A, to_order=orderOut, transpose=transpose) + + assert S2 is None + + torch.testing.assert_close(out1, out2) + + @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet") def test_overflow(): formatB = F.get_special_format_str() @@ -1142,15 +1216,21 @@ def test_overflow(): @pytest.mark.parametrize("dim1", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim1")) @pytest.mark.parametrize("dim2", get_test_dims(1, 4 * 1024, n=2), ids=id_formatter("dim2")) -def test_coo_double_quant(dim1, dim2): +@pytest.mark.parametrize("device", ["cuda", "cpu"], ids=id_formatter("device")) +@pytest.mark.parametrize("dtype", [torch.half, torch.bfloat16], ids=id_formatter("dtype")) +def test_coo_double_quant(dim1, dim2, device, dtype): + if device == "cuda" and dtype == torch.bfloat16: + pytest.skip("bfloat16 is not implemented for this operation on CUDA backend") threshold = 3.00 for i in range(k): - A = torch.randn(dim1, dim2, device="cuda").half() + A = torch.randn(dim1, dim2, device=device).to(dtype) idx = torch.abs(A) >= threshold CA2, CAt, statsA, statsAt, coo_tensor = F.double_quant(A) CA, CAt, statsA, statsAt, coo_tensor = F.double_quant(A, threshold=threshold) + if idx.sum() > 0: + assert coo_tensor is not None if coo_tensor is not None: A1 = A * idx A2 = torch.zeros_like(A) @@ -1158,8 +1238,8 @@ def test_coo_double_quant(dim1, dim2): torch.testing.assert_close(A1, A2) A1 = A * (idx == 0) - A2 = (CA.float() * statsA.unsqueeze(1) / 127).half() - torch.testing.assert_close(A * (idx == 0), A2, rtol=0.05, atol=1.5e-2) + A2 = (CA.float() * statsA.unsqueeze(1) / 127).to(dtype) + torch.testing.assert_close(A1, A2, rtol=0.05, atol=1.5e-2) @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet") @@ -1735,12 +1815,12 @@ def 
quant_zp(x): @pytest.mark.skipif(0 < BNB_HIP_VERSION < 601, reason="this test is supported on ROCm from 6.1") -def test_extract_outliers(): +@pytest.mark.parametrize("device", ["cuda", "cpu"]) +def test_extract_outliers(device): for i in range(k): shapeA = (4096, 4096 * 4) - idx = torch.unique(torch.randint(0, shapeA[1], size=(10,)).int()).cuda() - # idx = torch.Tensor([0]).int().cuda() - A = torch.randint(-128, 127, size=shapeA, device="cuda").to(torch.int8) + idx = torch.unique(torch.randint(0, shapeA[1], size=(10,)).int()).to(device=device) + A = torch.randint(-128, 127, size=shapeA, device=device).to(torch.int8) outliers1 = A[:, idx.long()] CA, SA = F.transform(A, "col_turing") @@ -1935,7 +2015,9 @@ def test_bench_dequantization(): @pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet") @pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16], ids=describe_dtype) -def test_fp4_quant(dtype): +@pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) +@pytest.mark.parametrize("blocksize", [64, 128, 256, 512, 1024, 2048, 4096]) +def test_4bit_quant(dtype, quant_type, blocksize): vals = list(product([0, 1], repeat=4)) code = {} @@ -1960,8 +2042,8 @@ def test_fp4_quant(dtype): code[idx] = result A1 = torch.randn(1024, 1024, device="cuda", dtype=dtype) - qa, SA = F.quantize_fp4(A1, blocksize=64) - A2 = F.dequantize_fp4(qa, SA) + qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type) + A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type) err = (A1 - A2).abs().float() relerr = (err / (A1.abs().float() + 1e-8)).mean() @@ -1969,8 +2051,24 @@ def test_fp4_quant(dtype): err = err.mean() assert A2.dtype == dtype - assert err.item() < 0.1 - assert relerr.item() < 0.28 + + # With larger block sizes, we can expect this to blow up. + # At blocksize>=1024, don't even bother looking at relerr. + if blocksize <= 64: + assert err.item() < 0.1 + assert relerr.item() < 0.28 + elif blocksize <= 256: + assert err.item() < 0.11 + assert relerr.item() < 0.30 + elif blocksize <= 512: + assert err.item() < 0.12 + assert relerr.item() < 0.31 + elif quant_type == "fp4": + # 1024 => 0.48, 2048 => 0.52, 4096 => 0.56 + assert err.item() < 0.08 + math.log2(blocksize) * 4e-2 + else: + # 1024 => 0.8, 2048 => 0.88, 4096 => 0.96 + assert err.item() < math.log2(blocksize) * 8e-2 @pytest.mark.parametrize("quant_type", ["fp4", "nf4"]) diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index ca52f312e..2a4bd02e2 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -40,6 +40,7 @@ def test_layout_exact_match(): assert torch.all(torch.eq(restored_x, x)) +@pytest.mark.skipif(HIP_ENVIRONMENT, reason="this test is not supported on ROCm yet") def test_linear_no_igemmlt(): linear = torch.nn.Linear(1024, 3072) x = torch.randn(3, 1024, dtype=torch.half)
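Finally, the reworked `test_4bit_quant` earlier in this patch measures absolute and relative round-trip error for both `fp4` and `nf4`, with thresholds that loosen as the block size grows. The snippet below reproduces that measurement outside the test suite, using the same `quantize_4bit`/`dequantize_4bit` calls as the test; it assumes a CUDA device and an installed bitsandbytes build.

```py
# Round-trip error of 4-bit block-wise quantization, following test_4bit_quant.
# Requires a CUDA device and an installed bitsandbytes build.
import torch
from bitsandbytes import functional as F

A1 = torch.randn(1024, 1024, device="cuda", dtype=torch.bfloat16)

for quant_type in ("fp4", "nf4"):
    for blocksize in (64, 256, 1024):
        qa, SA = F.quantize_4bit(A1, blocksize=blocksize, quant_type=quant_type)
        A2 = F.dequantize_4bit(qa, SA, blocksize=blocksize, quant_type=quant_type)

        err = (A1 - A2).abs().float().mean()
        relerr = ((A1 - A2).abs().float() / (A1.abs().float() + 1e-8)).mean()
        # Larger blocks share one absmax across more values, so both error
        # metrics are expected to grow with the block size.
        print(f"{quant_type} blocksize={blocksize}: err={err:.4f} relerr={relerr:.4f}")
```

The growth with block size is roughly logarithmic, which is what the test's `math.log2(blocksize)`-based thresholds encode.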