ROCm · pnunna93 · May 8, 2024 · May 6, 2024 · May 6, 2024 · May 6, 2024
diff --git a/bitsandbytes/backends/cuda.py b/bitsandbytes/backends/cuda.py
@@ -3,7 +3,7 @@
 
 import torch
 
-from bitsandbytes.cextension import lib, HIP_ENVIRONMENT
+from bitsandbytes.cextension import HIP_ENVIRONMENT, lib
 from bitsandbytes.functional import (
     CUBLAS_Context,
     coo_zeros,
@@ -13,8 +13,8 @@
     get_colrow_absmax,
     get_ptr,
     get_transform_buffer,
-    nvidia_transform,
     is_on_gpu,
+    nvidia_transform,
     post_call,
     pre_call,
     prod,
@@ -254,7 +254,7 @@ def igemmlt(self, A, B, SA, SB, out=None, Sout=None, dtype=torch.int32):
         ptrRowScale = get_ptr(None)
         is_on_gpu([A, B, out])
 
-        if formatB == "col_turing"  or HIP_ENVIRONMENT:
+        if formatB == "col_turing" or HIP_ENVIRONMENT:
             if dtype == torch.int32:
                 has_error = lib.cigemmlt_turing_32(ptr, m, n, k, ptrA, ptrB, ptrC, ptrRowScale, lda, ldb, ldc)
             else:
@@ -322,7 +322,10 @@ def mm_dequant(
     def extract_outliers(self, A, SA, idx):
         shapeA = SA[0]
         formatA = SA[1]
-        assert formatA in ["col_turing", "col_ampere"]
+        if not HIP_ENVIRONMENT:
+            assert formatA in ["col_turing", "col_ampere"]
+        else:
+            assert formatA in ["col"]
         assert A.device.type == "cuda"
 
         out = torch.zeros((shapeA[0], idx.numel()), dtype=torch.int8, device=A.device)
@@ -336,7 +339,7 @@ def extract_outliers(self, A, SA, idx):
 
         prev_device = pre_call(A.device)
 
-        if formatA == "col_turing":
+        if formatA == "col_turing" or HIP_ENVIRONMENT:
             lib.cextractOutliers_turing(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
         elif formatA == "col_ampere":
             lib.cextractOutliers_ampere(ptrA, ptrIdx, ptrOut, idx_size, rows, cols)
@@ -350,11 +353,13 @@ def quantize_4bit(
         A: torch.Tensor,
         absmax: Optional[torch.Tensor] = None,
         out: Optional[torch.Tensor] = None,
-        blocksize=64,
+        blocksize: Optional[int] = None,
         compress_statistics=False,
         quant_type="fp4",
         quant_storage=torch.uint8,
     ) -> Tuple[torch.Tensor, QuantState]:
+        if blocksize is None:
+            blocksize = 64 if not HIP_ENVIRONMENT else 128
         if A.device.type != "cuda":
             raise NotImplementedError(f"Device type not supported for FP4 quantization: {A.device.type}")
         if quant_type not in ["fp4", "nf4"]:
@@ -372,7 +377,10 @@ def quantize_4bit(
             mod = dtype2bytes[quant_storage] * 2
             out = torch.zeros(((n + 1) // mod, 1), dtype=quant_storage, device=A.device)
 
-        assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+        if not HIP_ENVIRONMENT:
+            assert blocksize in [4096, 2048, 1024, 512, 256, 128, 64]
+        else:
+            assert blocksize in [4096, 2048, 1024, 512, 256, 128]
 
         prev_device = pre_call(A.device)
         is_on_gpu([A, out, absmax])
@@ -443,12 +451,17 @@ def dequantize_4bit(
         quant_state: Optional[QuantState] = None,
         absmax: Optional[torch.Tensor] = None,
         out: Optional[torch.Tensor] = None,
-        blocksize: int = 64,
+        blocksize: Optional[int] = None,
         quant_type="fp4",
     ) -> torch.Tensor:
-        if blocksize not in [2048, 4096, 1024, 512, 256, 128, 64]:
+        if blocksize is None:
+            blocksize = 64 if not HIP_ENVIRONMENT else 128
+        supported_blocksizes = [2048, 4096, 1024, 512, 256, 128, 64]
+        if HIP_ENVIRONMENT:
+            supported_blocksizes = supported_blocksizes[:-1]
+        if blocksize not in supported_blocksizes:
             raise ValueError(
-                f"The blockwise of {blocksize} is not supported. Supported values: [2048, 4096, 1024, 512, 256, 128, 64]"
+                f"The blockwise of {blocksize} is not supported. Supported values: {supported_blocksizes}"
             )
 
         if quant_type not in ["fp4", "nf4"]:

diff --git a/bitsandbytes/cuda_specs.py b/bitsandbytes/cuda_specs.py
@@ -52,7 +52,7 @@ def get_rocm_gpu_arch() -> str:
     try:
         if torch.version.hip:
             result = subprocess.run(["rocminfo"], capture_output=True, text=True)
-            match = re.search(r"Name:\s+gfx(\d+)", result.stdout)
+            match = re.search(r"Name:\s+gfx([a-zA-Z\d]+)", result.stdout)
             if match:
                 return "gfx" + match.group(1)
             else: