From c620f4d677ec659e27f9b93f576142814f454a1d Mon Sep 17 00:00:00 2001 From: JohannesGaessler Date: Fri, 1 Sep 2023 20:15:49 +0200 Subject: [PATCH] KV cache quantized to q8_0 --- common/common.cpp | 31 +- common/common.h | 3 +- examples/llama-bench/llama-bench.cpp | 68 +- examples/main/README.md | 4 +- examples/main/main.cpp | 17 +- examples/quantize-stats/quantize-stats.cpp | 2 +- examples/save-load-state/save-load-state.cpp | 2 +- examples/server/README.md | 2 +- examples/server/server.cpp | 29 +- ggml-cuda.cu | 876 +++++++++++++------ ggml.c | 163 +++- ggml.h | 22 + llama.cpp | 166 +++- llama.h | 3 +- run_with_preset.py | 6 +- 15 files changed, 1024 insertions(+), 370 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 6d655fd5548c5..134f2e00d6dbd 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -198,8 +198,30 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.rope_freq_scale = 1.0f/std::stof(argv[i]); + } else if (arg == "--kv-type" || arg == "-kvt") { + if (++i >= argc) { + invalid_param = true; + break; + } + + std::string type_name(argv[i]); + for (char & c : type_name) { + c = std::tolower(c); + } + + if (type_name == "q8_0") { + params.kv_type = GGML_TYPE_Q8_0; + } else if (type_name == "f16") { + params.kv_type = GGML_TYPE_F16; + } else if (type_name == "f32") { + params.kv_type = GGML_TYPE_F32; + } else { + fprintf(stderr, "error: unknown KV type: %s. Known types: Q8_0, F16, F32.\n", argv[i]); + invalid_param = true; + break; + } } else if (arg == "--memory-f32") { - params.memory_f16 = false; + params.kv_type = GGML_TYPE_F32; } else if (arg == "--top-p") { if (++i >= argc) { invalid_param = true; @@ -652,8 +674,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --rope-freq-scale N RoPE frequency linear scaling factor, inverse of --rope-scale (default: %g)\n", params.rope_freq_scale); printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); + printf(" -kvt, --kv-type the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n"); printf(" --temp N temperature (default: %.1f)\n", (double)params.temp); printf(" --perplexity compute perplexity over each ctx window of the prompt\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); @@ -735,7 +756,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param lparams.low_vram = params.low_vram; lparams.mul_mat_q = params.mul_mat_q; lparams.seed = params.seed; - lparams.f16_kv = params.memory_f16; + lparams.kv_type = params.kv_type; lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; lparams.logits_all = params.perplexity; @@ -1201,6 +1222,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false"); fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? 
"true" : "false"); fprintf(stream, "keep: %d # default: 0\n", params.n_keep); + fprintf(stream, "kv_type: %s # default: false\n", ggml_type_name(params.kv_type)); fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str()); fprintf(stream, "logit_bias:\n"); @@ -1215,7 +1237,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "lora_base: %s\n", params.lora_base.c_str()); fprintf(stream, "low_vram: %s # default: false\n", params.low_vram ? "true" : "false"); fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu); - fprintf(stream, "memory_f32: %s # default: false\n", !params.memory_f16 ? "true" : "false"); fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", params.mirostat); fprintf(stream, "mirostat_ent: %f # default: 5.0\n", params.mirostat_tau); fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta); diff --git a/common/common.h b/common/common.h index f9dfd4a2c524c..b4fb2a3e102dc 100644 --- a/common/common.h +++ b/common/common.h @@ -94,9 +94,10 @@ struct gpt_params { bool hellaswag = false; // compute HellaSwag score over random tasks from datafile supplied in prompt size_t hellaswag_tasks = 400; // number of tasks to use when computing the HellaSwag score + ggml_type kv_type = GGML_TYPE_Q8_0; // the type to use for the KV cache + bool low_vram = false; // if true, reduce VRAM usage at the cost of performance bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS - bool memory_f16 = true; // use f16 instead of f32 for memory kv bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 34ddfde39d295..f48af6eb93dda 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -127,7 +127,7 @@ struct cmd_params { std::vector n_prompt; std::vector n_gen; std::vector n_batch; - std::vector f32_kv; + std::vector kv_type; std::vector n_threads; std::vector n_gpu_layers; std::vector main_gpu; @@ -144,7 +144,7 @@ static const cmd_params cmd_params_defaults = { /* n_prompt */ {512}, /* n_gen */ {128}, /* n_batch */ {512}, - /* f32_kv */ {false}, + /* kv_type */ {GGML_TYPE_Q8_0}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, /* main_gpu */ {0}, @@ -165,7 +165,16 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" --memory-f32 <0|1> (default: %s)\n", join(cmd_params_defaults.f32_kv, ",").c_str()); + + std::string kv_type_default; + for (unsigned int i = 0; i < cmd_params_defaults.kv_type.size(); ++i) { + if (i > 0) { + kv_type_default += ","; + } + kv_type_default += ggml_type_name(cmd_params_defaults.kv_type[i]); + } + printf(" -kvt, --kv_type (default: %s)\n", kv_type_default.c_str()); + printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -ngl N, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); printf(" -mg i, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); @@ -177,7 +186,6 @@ static void 
print_usage(int /* argc */, char ** argv) { printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); - } static cmd_params parse_cmd_params(int argc, char ** argv) { @@ -228,13 +236,32 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end()); - } else if (arg == "--memory-f32") { + } else if (arg == "-kvt" || arg == "--kv-type") { if (++i >= argc) { invalid_param = true; break; } - auto p = split(argv[i], split_delim); - params.f32_kv.insert(params.f32_kv.end(), p.begin(), p.end()); + auto p = split(argv[i], split_delim); + + std::vector kvt; + for (const std::string & type_name : p) { + if (type_name == "q8_0") { + kvt.push_back(GGML_TYPE_Q8_0); + } else if (type_name == "f16") { + kvt.push_back(GGML_TYPE_F16); + } else if (type_name == "f32") { + kvt.push_back(GGML_TYPE_F32); + } else { + invalid_param = true; + break; + } + } + if (invalid_param) { + fprintf(stderr, "error: unknown KV type: %s. Known types: Q8_0, F16, F32.\n", argv[i]); + break; + } + + params.kv_type.insert(params.kv_type.end(), kvt.begin(), kvt.end()); } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; @@ -332,7 +359,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_prompt.empty()) { params.n_prompt = cmd_params_defaults.n_prompt; } if (params.n_gen.empty()) { params.n_gen = cmd_params_defaults.n_gen; } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } - if (params.f32_kv.empty()) { params.f32_kv = cmd_params_defaults.f32_kv; } + if (params.kv_type.empty()) { params.kv_type = cmd_params_defaults.kv_type; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } @@ -348,7 +375,7 @@ struct cmd_params_instance { int n_prompt; int n_gen; int n_batch; - bool f32_kv; + ggml_type kv_type; int n_threads; int n_gpu_layers; int main_gpu; @@ -360,7 +387,7 @@ struct cmd_params_instance { llama_context_params lparams = llama_context_default_params(); lparams.n_ctx = n_prompt + n_gen; lparams.n_batch = n_batch; - lparams.f16_kv = !f32_kv; + lparams.kv_type = kv_type; lparams.n_gpu_layers = n_gpu_layers; lparams.main_gpu = main_gpu; lparams.mul_mat_q = mul_mat_q; @@ -376,7 +403,7 @@ static std::vector get_cmd_params_instances_int(const cmd_p for (const auto & m : params.model) for (const auto & nb : params.n_batch) - for (const auto & fk : params.f32_kv) + for (const auto & kvt : params.kv_type) for (const auto & nl : params.n_gpu_layers) for (const auto & mg : params.main_gpu) for (const auto & mmq : params.mul_mat_q) @@ -388,7 +415,7 @@ static std::vector get_cmd_params_instances_int(const cmd_p /* .n_prompt = */ n_prompt, /* .n_gen = */ n_gen, /* .n_batch = */ nb, - /* .f32_kv = */ fk, + /* .kv_type = */ kvt, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, /* .main_gpu = */ mg, @@ -439,7 +466,7 @@ struct test { uint64_t model_n_params; int n_batch; int n_threads; - bool f32_kv; + ggml_type kv_type; int n_gpu_layers; int main_gpu; bool mul_mat_q; @@ -459,7 +486,7 @@ struct test { model_n_params = llama_model_n_params(lmodel); n_batch = inst.n_batch; 
n_threads = inst.n_threads; - f32_kv = inst.f32_kv; + kv_type = inst.kv_type; n_gpu_layers = inst.n_gpu_layers; main_gpu = inst.main_gpu; mul_mat_q = inst.mul_mat_q; @@ -523,7 +550,7 @@ struct test { "cuda", "opencl", "metal", "gpu_blas", "blas", "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", - "n_batch", "n_threads", "f16_kv", + "n_batch", "n_threads", "kv_type", "n_gpu_layers", "main_gpu", "mul_mat_q", "low_vram", "tensor_split", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", @@ -543,7 +570,7 @@ struct test { return INT; } if (field == "cuda" || field == "opencl" || field == "metal" || field == "gpu_blas" || field == "blas" || - field == "f16_kv" || field == "mul_mat_q" || field == "low_vram") { + field == "mul_mat_q" || field == "low_vram") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -573,7 +600,7 @@ struct test { std::to_string(cuda), std::to_string(opencl), std::to_string(metal), std::to_string(gpu_blas), std::to_string(blas), cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), - std::to_string(n_batch), std::to_string(n_threads), std::to_string(!f32_kv), + std::to_string(n_batch), std::to_string(n_threads), std::string(ggml_type_name(kv_type)), std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(mul_mat_q), std::to_string(low_vram), tensor_split_str, std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), @@ -757,8 +784,8 @@ struct markdown_printer : public printer { if (params.n_batch.size() > 1 || params.n_batch != cmd_params_defaults.n_batch) { fields.push_back("n_batch"); } - if (params.f32_kv.size() > 1 || params.f32_kv != cmd_params_defaults.f32_kv) { - fields.push_back("f16_kv"); + if (params.kv_type.size() > 1 || params.kv_type != cmd_params_defaults.kv_type) { + fields.push_back("kv_type"); } if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { fields.push_back("main_gpu"); @@ -826,6 +853,9 @@ struct markdown_printer : public printer { } else if (field == "t/s") { snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); value = buf; + } else if (field == "kv_type") { + snprintf(buf, sizeof(buf), "%s", ggml_type_name(t.kv_type)); + value = buf; } else if (vmap.find(field) != vmap.end()) { value = vmap.at(field); } else { diff --git a/examples/main/README.md b/examples/main/README.md index 26e1e28dd08c1..8c6d23edc5c16 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -276,9 +276,9 @@ These options help improve the performance and memory usage of the LLaMA models. - `--numa`: Attempt optimizations that help on some systems with non-uniform memory access. This currently consists of pinning an equal proportion of the threads to the cores on each NUMA node, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root. 
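The new KV cache type option documented in the README section that follows boils down to mapping a type name onto a `ggml_type` and forwarding it through the context parameters, exactly as the `gpt_params_parse` change earlier in this patch does. A minimal sketch of that mapping; the standalone `kv_type_from_name()` helper is hypothetical and not part of the patch:

```cpp
// Hypothetical helper mirroring the --kv-type / -kvt parsing added to
// common.cpp and server.cpp in this patch; not part of the patch itself.
#include <cctype>
#include <string>

#include "ggml.h"

static ggml_type kv_type_from_name(std::string name) {
    for (char & c : name) {
        c = std::tolower(c);   // the option is matched case-insensitively
    }
    if (name == "q8_0") { return GGML_TYPE_Q8_0; }
    if (name == "f16")  { return GGML_TYPE_F16;  }
    if (name == "f32")  { return GGML_TYPE_F32;  }
    return GGML_TYPE_Q8_0;     // fall back to the new default
}

// Usage sketch: the parsed value is forwarded into the context parameters,
// as llama_context_params_from_gpt_params() now does with the new field:
//
//     llama_context_params lparams = llama_context_default_params();
//     lparams.kv_type = kv_type_from_name("q8_0");
```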
-### Memory Float 32 +### KV cache type -- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. This doubles the context memory requirement and cached prompt file size but does not appear to increase generation quality in a measurable way. Not recommended. +- `-kvt, --kv-type`: The data type to use for the KV cache. Uses q8_0 by default. Alternatives are f16 and f32. The alternatives increase memory consumption for marginal quality differences. ### Batch Size diff --git a/examples/main/main.cpp b/examples/main/main.cpp index d78112260de08..ae8d85a0e5355 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -36,7 +36,7 @@ static llama_context ** g_ctx; static llama_model ** g_model; static gpt_params * g_params; -static std::vector * g_input_tokens; +static std::vector * g_embd_inp; static std::ostringstream * g_output_ss; static std::vector * g_output_tokens; static bool is_interacting = false; @@ -44,7 +44,7 @@ static bool is_interacting = false; static void write_logfile( const llama_context * ctx, const gpt_params & params, const llama_model * model, - const std::vector & input_tokens, const std::string & output, + const std::vector & embd_inp, const std::string & output, const std::vector & output_tokens ) { if (params.logdir.empty()) { @@ -71,7 +71,7 @@ static void write_logfile( fprintf(logfile, "binary: main\n"); char model_desc[128]; llama_model_desc(model, model_desc, sizeof(model_desc)); - dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); + dump_non_result_info_yaml(logfile, params, ctx, timestamp, embd_inp, model_desc); fprintf(logfile, "\n"); fprintf(logfile, "######################\n"); @@ -95,7 +95,7 @@ static void sigint_handler(int signo) { console::cleanup(); printf("\n"); llama_print_timings(*g_ctx); - write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); + write_logfile(*g_ctx, *g_params, *g_model, *g_embd_inp, g_output_ss->str(), *g_output_tokens); _exit(130); } } @@ -238,7 +238,7 @@ int main(int argc, char ** argv) { const bool add_bos = llama_vocab_type(ctx) == LLAMA_VOCAB_TYPE_SPM; LOG("add_bos: %d\n", add_bos); - std::vector embd_inp; + std::vector embd_inp; g_embd_inp = &embd_inp; if (params.interactive_first || params.instruct || !params.prompt.empty() || session_tokens.empty()) { LOG("tokenize the prompt\n"); @@ -465,7 +465,6 @@ int main(int argc, char ** argv) { int n_session_consumed = 0; int n_past_guidance = 0; - std::vector input_tokens; g_input_tokens = &input_tokens; std::vector output_tokens; g_output_tokens = &output_tokens; std::ostringstream output_ss; g_output_ss = &output_ss; @@ -661,9 +660,7 @@ int main(int argc, char ** argv) { const std::string token_str = llama_token_to_piece(ctx, id); printf("%s", token_str.c_str()); - if (embd.size() > 1) { - input_tokens.push_back(id); - } else { + if (embd.size() == 1) { output_tokens.push_back(id); output_ss << token_str; } @@ -843,7 +840,7 @@ int main(int argc, char ** argv) { } llama_print_timings(ctx); - write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + write_logfile(ctx, params, model, embd_inp, output_ss.str(), output_tokens); if (ctx_guidance) { llama_free(ctx_guidance); } llama_free(ctx); diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp index 9f930dede4bcf..1bb72874617f9 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/examples/quantize-stats/quantize-stats.cpp @@ -312,7 +312,7 @@ int 
main(int argc, char ** argv) { lparams.n_ctx = 256; lparams.seed = 1; - lparams.f16_kv = false; + lparams.kv_type = GGML_TYPE_F32; lparams.use_mlock = false; model = llama_load_model_from_file(params.model.c_str(), lparams); diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp index eac307904fbc1..8894f2b42c96b 100644 --- a/examples/save-load-state/save-load-state.cpp +++ b/examples/save-load-state/save-load-state.cpp @@ -26,7 +26,7 @@ int main(int argc, char ** argv) { lparams.n_ctx = params.n_ctx; lparams.seed = params.seed; - lparams.f16_kv = params.memory_f16; + lparams.kv_type = params.kv_type; lparams.use_mmap = params.use_mmap; lparams.use_mlock = params.use_mlock; diff --git a/examples/server/README.md b/examples/server/README.md index 5176080463839..8c5a96de4400f 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -13,7 +13,7 @@ Command line options: - `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS. - `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS. - `-b N`, `--batch-size N`: Set the batch size for prompt processing. Default: `512`. -- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended. +- `-kvt, --kv-type`: The data type to use for the KV cache. Uses q8_0 by default. Alternatives are f16 and f32. The alternatives increase memory consumption for marginal quality differences. - `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped. - `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed. - `--numa`: Attempt optimizations that help on some NUMA systems. 
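For reference, the q8_0 layout that the KV cache now defaults to stores each block of 32 values as a single scale plus 32 signed 8-bit quants, using the same `d = amax/127`, `q = round(x/d)` arithmetic as the `quantize_q8_1` and `cpy_f32_q8_0` CUDA kernels further down in this patch. A simplified CPU-side sketch under those assumptions; the `block_q8_0_ref` struct here is illustrative only (ggml's real `block_q8_0` stores the scale as fp16):

```cpp
// Illustrative CPU reference for the q8_0 block quantization scheme used by
// the KV cache in this patch; not the ggml implementation.
#include <cmath>
#include <cstdint>

constexpr int QK8_0_REF = 32;            // values per block, as in ggml

struct block_q8_0_ref {
    float  d;                            // per-block scale (fp16 in ggml)
    int8_t qs[QK8_0_REF];                // quantized values
};

static void quantize_block_q8_0_ref(const float * x, block_q8_0_ref & dst) {
    // find the absolute maximum of the block
    float amax = 0.0f;
    for (int i = 0; i < QK8_0_REF; ++i) {
        amax = std::fmax(amax, std::fabs(x[i]));
    }

    // scale so that the largest magnitude maps to +/-127
    const float d = amax / 127.0f;
    dst.d = d;

    // quantize; an all-zero block keeps q = 0 and avoids dividing by zero,
    // matching the guard in the CUDA kernels
    for (int i = 0; i < QK8_0_REF; ++i) {
        dst.qs[i] = amax == 0.0f ? 0 : (int8_t) std::round(x[i] / d);
    }
}

// Dequantization is simply x[i] ~= d * qs[i], which is what the KV cache
// matrix multiplications consume.
```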
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 1bb8e92c0f95e..a1f410e065b2a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -704,8 +704,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); printf(" --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); + printf(" -kvt, --kv-type the type to use for the KV cache (default: q8_0; alternatives: f16, f32)\n"); if (llama_mlock_supported()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); @@ -838,9 +837,33 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.rope_freq_scale = std::stof(argv[i]); } + else if (arg == "--kv-type" || arg == "-kvt") + { + if (++i >= argc) { + invalid_param = true; + break; + } + + std::string type_name(argv[i]); + for (char & c : type_name) { + c = std::tolower(c); + } + + if (type_name == "q8_0") { + params.kv_type = GGML_TYPE_Q8_0; + } else if (type_name == "f16") { + params.kv_type = GGML_TYPE_F16; + } else if (type_name == "f32") { + params.kv_type = GGML_TYPE_F32; + } else { + fprintf(stderr, "error: unknown KV type: %s. Known types: q8_0, f16, f32.\n", argv[i]); + invalid_param = true; + break; + } + } else if (arg == "--memory-f32" || arg == "--memory_f32") { - params.memory_f16 = false; + params.kv_type = GGML_TYPE_F32; } else if (arg == "--threads" || arg == "-t") { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 5346b9e09519a..06f63b589fbdd 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -59,6 +59,7 @@ #define cudaMemcpyHostToDevice hipMemcpyHostToDevice #define cudaMemcpyKind hipMemcpyKind #define cudaMemset hipMemset +#define cudaMemsetAsync hipMemsetAsync #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags @@ -1515,23 +1516,30 @@ static __device__ void convert_f16(const void * vx, const int ib, const int iqs, v.y = x[ib + iqs + 1]; } -static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded) { +static __global__ void quantize_q8_1( + const float * __restrict__ src, void * __restrict__ vdst, const int kx, const int kx_padded, const int ky, + const int ky_stride, const int channel_stride) { + const int ix = blockDim.x*blockIdx.x + threadIdx.x; if (ix >= kx_padded) { return; } - const int iy = blockDim.y*blockIdx.y + threadIdx.y; + const int iy = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; - const int i_padded = iy*kx_padded + ix; + // padded and contiguous: + const int i_padded = channel*ky*kx_padded + iy*kx_padded + ix; - block_q8_1 * y = (block_q8_1 *) vy; + block_q8_1 * dst = (block_q8_1 *) vdst; const int ib = i_padded / QK8_1; // block index const int iqs = i_padded % QK8_1; // quant index - const float xi = ix < kx ? x[iy*kx + ix] : 0.0f; + // not padded and not necessarily contiguous: + const float xi = ix < kx ? 
src[channel*channel_stride + iy*ky_stride + ix] : 0.0f; + float amax = fabsf(xi); float sum = xi; @@ -1544,14 +1552,14 @@ static __global__ void quantize_q8_1(const float * __restrict__ x, void * __rest const float d = amax / 127; const int8_t q = amax == 0.0f ? 0 : roundf(xi / d); - y[ib].qs[iqs] = q; + dst[ib].qs[iqs] = q; if (iqs > 0) { return; } - reinterpret_cast(y[ib].ds.x) = d; - reinterpret_cast(y[ib].ds.y) = sum; + reinterpret_cast(dst[ib].ds.x) = d; + reinterpret_cast(dst[ib].ds.y) = sum; } template @@ -3389,10 +3397,11 @@ template static __device__ __forceinline__ void mul_mat_q( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const int row_stride_x, const int channel_stride_x, const int channel_stride_y) { - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; + const block_q_t * x = ((const block_q_t *) vx) + blockIdx.z*channel_stride_x; + const block_q8_1 * y = ((const block_q8_1 *) vy) + blockIdx.z*channel_stride_y; const int blocks_per_row_x = ncols_x / qk; const int blocks_per_col_y = nrows_y / QK8_1; @@ -3420,8 +3429,8 @@ static __device__ __forceinline__ void mul_mat_q( for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) { - load_tiles(x + row_x_0*blocks_per_row_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, - threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, blocks_per_row_x); + load_tiles(x + row_x_0*row_stride_x + ib0, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc, + threadIdx.y, nrows_x-row_x_0-1, threadIdx.x, row_stride_x); #pragma unroll for (int ir = 0; ir < qr; ++ir) { @@ -3490,7 +3499,7 @@ static __device__ __forceinline__ void mul_mat_q( continue; } - dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; + dst[blockIdx.z*ncols_dst*nrows_dst + col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps]; } } } @@ -3516,7 +3525,8 @@ template static __global__ void #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q4_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const int row_stride_x, const int channel_stride_x, const int channel_stride_y) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -3531,8 +3541,7 @@ template static __global__ void mul_mat_q, load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q4_0_AMPERE; const int mmq_y = MMQ_Y_Q4_0_AMPERE; @@ -3540,8 +3549,7 @@ template static __global__ void mul_mat_q, load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q4_0_PASCAL; const int mmq_y = MMQ_Y_Q4_0_PASCAL; @@ -3549,9 +3557,8 @@ template static 
__global__ void mul_mat_q, load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #else - (void) vec_dot_q4_0_q8_1_mul_mat; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING } @@ -3579,7 +3586,8 @@ template static __global__ void #endif // __CUDA_ARCH__ < CC_TURING mul_mat_q4_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const int row_stride_x, const int channel_stride_x, const int channel_stride_y) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -3594,8 +3602,7 @@ template static __global__ void mul_mat_q, load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q4_1_AMPERE; const int mmq_y = MMQ_Y_Q4_1_AMPERE; @@ -3603,8 +3610,7 @@ template static __global__ void mul_mat_q, load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q4_1_PASCAL; const int mmq_y = MMQ_Y_Q4_1_PASCAL; @@ -3612,9 +3618,8 @@ template static __global__ void mul_mat_q, load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #else - (void) vec_dot_q4_1_q8_1_mul_mat; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING } @@ -3640,7 +3645,8 @@ template static __global__ void #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q5_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const int row_stride_x, const int channel_stride_x, const int channel_stride_y) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -3655,8 +3661,7 @@ template static __global__ void mul_mat_q, load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q5_0_AMPERE; const int mmq_y = MMQ_Y_Q5_0_AMPERE; @@ -3664,8 +3669,7 @@ template static __global__ void mul_mat_q, load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = 
MMQ_X_Q5_0_PASCAL; const int mmq_y = MMQ_Y_Q5_0_PASCAL; @@ -3673,9 +3677,8 @@ template static __global__ void mul_mat_q, load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #else - (void) vec_dot_q5_0_q8_1_mul_mat; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING } @@ -3701,7 +3704,8 @@ template static __global__ void #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q5_1( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const int row_stride_x, const int channel_stride_x, const int channel_stride_y) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -3716,8 +3720,7 @@ mul_mat_q5_1( mul_mat_q, load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q5_1_AMPERE; const int mmq_y = MMQ_Y_Q5_1_AMPERE; @@ -3725,8 +3728,7 @@ mul_mat_q5_1( mul_mat_q, load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q5_1_PASCAL; const int mmq_y = MMQ_Y_Q5_1_PASCAL; @@ -3734,9 +3736,8 @@ mul_mat_q5_1( mul_mat_q, load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #else - (void) vec_dot_q5_1_q8_1_mul_mat; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING } @@ -3762,7 +3763,8 @@ template static __global__ void #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q8_0( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const int row_stride_x, const int channel_stride_x, const int channel_stride_y) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -3777,8 +3779,7 @@ template static __global__ void mul_mat_q, load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q8_0_AMPERE; const int mmq_y = MMQ_Y_Q8_0_AMPERE; @@ -3786,8 +3787,7 @@ template static __global__ void mul_mat_q, load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, 
channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q8_0_PASCAL; const int mmq_y = MMQ_Y_Q8_0_PASCAL; @@ -3795,9 +3795,8 @@ template static __global__ void mul_mat_q, load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #else - (void) vec_dot_q8_0_q8_1_mul_mat; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING } @@ -3823,7 +3822,8 @@ template static __global__ void #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q2_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const int row_stride_x, const int channel_stride_x, const int channel_stride_y) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -3838,8 +3838,7 @@ mul_mat_q2_K( mul_mat_q, load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q2_K_AMPERE; const int mmq_y = MMQ_Y_Q2_K_AMPERE; @@ -3847,8 +3846,7 @@ mul_mat_q2_K( mul_mat_q, load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q2_K_PASCAL; const int mmq_y = MMQ_Y_Q2_K_PASCAL; @@ -3856,9 +3854,8 @@ mul_mat_q2_K( mul_mat_q, load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #else - (void) vec_dot_q2_K_q8_1_mul_mat; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING } @@ -3886,7 +3883,8 @@ template static __global__ void #endif // __CUDA_ARCH__ < CC_TURING mul_mat_q3_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const int row_stride_x, const int channel_stride_x, const int channel_stride_y) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -3901,8 +3899,7 @@ template static __global__ void mul_mat_q, load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q3_K_AMPERE; const int mmq_y = MMQ_Y_Q3_K_AMPERE; @@ -3910,8 +3907,7 @@ template static __global__ void mul_mat_q, load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, 
ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q3_K_PASCAL; const int mmq_y = MMQ_Y_Q3_K_PASCAL; @@ -3919,9 +3915,8 @@ template static __global__ void mul_mat_q, load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #else - (void) vec_dot_q3_K_q8_1_mul_mat; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING } @@ -3949,7 +3944,8 @@ template static __global__ void #endif // __CUDA_ARCH__ < CC_TURING mul_mat_q4_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const int row_stride_x, const int channel_stride_x, const int channel_stride_y) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -3964,8 +3960,7 @@ template static __global__ void mul_mat_q, load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q4_K_AMPERE; const int mmq_y = MMQ_Y_Q4_K_AMPERE; @@ -3973,8 +3968,7 @@ template static __global__ void mul_mat_q, load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q4_K_PASCAL; const int mmq_y = MMQ_Y_Q4_K_PASCAL; @@ -3982,9 +3976,8 @@ template static __global__ void mul_mat_q, load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #else - (void) vec_dot_q4_K_q8_1_mul_mat; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING } @@ -4010,7 +4003,8 @@ template static __global__ void #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) mul_mat_q5_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const int row_stride_x, const int channel_stride_x, const int channel_stride_y) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -4025,8 +4019,7 @@ mul_mat_q5_K( mul_mat_q, load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q5_K_AMPERE; const int mmq_y = MMQ_Y_Q5_K_AMPERE; @@ -4034,8 +4027,7 @@ mul_mat_q5_K( mul_mat_q, load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, 
ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q5_K_PASCAL; const int mmq_y = MMQ_Y_Q5_K_PASCAL; @@ -4043,9 +4035,8 @@ mul_mat_q5_K( mul_mat_q, load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #else - (void) vec_dot_q5_K_q8_1_mul_mat; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING } @@ -4073,7 +4064,8 @@ template static __global__ void #endif // __CUDA_ARCH__ < CC_TURING mul_mat_q6_K( const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) { + const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, + const int row_stride_x, const int channel_stride_x, const int channel_stride_y) { #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) #if defined(RDNA3) || defined(RDNA2) @@ -4088,8 +4080,7 @@ template static __global__ void mul_mat_q, load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= CC_TURING const int mmq_x = MMQ_X_Q6_K_AMPERE; const int mmq_y = MMQ_Y_Q6_K_AMPERE; @@ -4097,8 +4088,7 @@ template static __global__ void mul_mat_q, load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); - + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #elif __CUDA_ARCH__ >= MIN_CC_DP4A const int mmq_x = MMQ_X_Q6_K_PASCAL; const int mmq_y = MMQ_Y_Q6_K_PASCAL; @@ -4106,32 +4096,36 @@ template static __global__ void mul_mat_q, load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, row_stride_x, channel_stride_x, channel_stride_y); #else - (void) vec_dot_q6_K_q8_1_mul_mat; assert(false); #endif // __CUDA_ARCH__ >= CC_TURING } template -static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows) { +static __global__ void mul_mat_vec_q( + const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, + const int ncols, const int nrows, const int row_stride, const int channel_stride_x, const int channel_stride_y) { + const int row = blockIdx.y*blockDim.y + threadIdx.y; if (row >= nrows) { return; } + const int channel = blockIdx.z*blockDim.z + threadIdx.z; + const int blocks_per_row = ncols / qk; const int blocks_per_warp = vdr * WARP_SIZE / qi; // partial sum for each thread float tmp = 0.0f; - const block_q_t * x = (const block_q_t *) vx; - const block_q8_1 * y = (const block_q8_1 *) vy; + const block_q_t * x = ((const block_q_t *) vx) + channel*channel_stride_x; + const block_q8_1 * y = ((const block_q8_1 *) vy) + channel*channel_stride_y; for (int i = 0; i < blocks_per_row; i += blocks_per_warp) { - const int ibx = row*blocks_per_row + i + threadIdx.x / (qi/vdr); // x block index + const int 
ibx = row*row_stride + i + threadIdx.x / (qi/vdr); // x block index const int iby = (i + threadIdx.x / (qi/vdr)) * (qk/QK8_1); // y block index that aligns with ibx @@ -4147,17 +4141,20 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * } if (threadIdx.x == 0) { - dst[row] = tmp; + dst[channel*nrows + row] = tmp; } } -template -static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows) { +template +static __global__ void dequantize_mul_mat_vec( + const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, + const int ncols, const int nrows_x, const int nrows_y) { + // qk = quantized weights per x block // qr = number of quantized weights per data value in x block const int row = blockIdx.y*blockDim.y + threadIdx.y; - if (row >= nrows) { + if (row >= nrows_x) { return; } @@ -4190,16 +4187,22 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons dfloat2 v; dequantize_kernel(vx, ib, iqs + j/qr, v); + const int iy0 = iybs + iqs + j/qr + 0; + const int iy1 = iybs + iqs + j/qr + y_offset; + // matrix multiplication // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2 #ifdef GGML_CUDA_F16 - tmp += __hmul2(v, { - y[iybs + iqs + j/qr + 0], - y[iybs + iqs + j/qr + y_offset] - }); + const half yi0 = need_check && iy0 >= nrows_y ? __float2half(0.0f) : y[iy0]; + const half yi1 = need_check && iy1 >= nrows_y ? __float2half(0.0f) : y[iy1]; + + tmp += __hmul2(v, {yi0, yi1}); #else - tmp += v.x * y[iybs + iqs + j/qr + 0]; - tmp += v.y * y[iybs + iqs + j/qr + y_offset]; + const float yi0 = need_check && iy0 >= nrows_y ? 0.0f : y[iy0]; + const float yi1 = need_check && iy1 >= nrows_y ? 0.0f : y[iy1]; + + tmp += v.x * yi0; + tmp += v.y * yi1; #endif // GGML_CUDA_F16 } } @@ -4354,6 +4357,60 @@ static __global__ void cpy_f32_f16(const char * cx, char * cdst, const int ne, cpy_1(cx + x_offset, cdst + dst_offset); } +template +static __global__ void cpy_f32_q8_0( + const char * cx, char * cdst, const int i_blck_0, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb11, const int nb12) { + + const int i0 = blockDim.x*blockIdx.x + threadIdx.x; + const int i1 = blockDim.y*blockIdx.y + threadIdx.y; + const int i2 = blockDim.z*blockIdx.z + threadIdx.z; + + float * x = (float *) (cx + (i0 - i_blck_0)*nb00 + i1*nb01 + i2*nb02); + block_q8_0 * dst = (block_q8_0 *) (cdst + i1*nb11 + i2*nb12); + dst += i0 / QK8_0; + const int iqs = i0 % QK8_0; + + float zero = 0.0f; + void * src = x; + + if (first_incomplete && i0 < i_blck_0) { + src = &dst[1 + iqs/8].qs[sizeof(float) * (iqs % 8)]; + } + if (last_incomplete && i0 >= (i_blck_0 + ne00)) { + src = &zero; + } + + float val; + if (first_incomplete) { + memcpy(&val, src, sizeof(float)); + } else { + val = *((float *) src); + } + + if (save_unquantized && last_incomplete && i0 / QK8_0 == (i_blck_0 + ne00) / QK8_0) { + memcpy(&dst[1 + iqs/8].qs[sizeof(float) * (iqs % 8)], src, sizeof(float)); + } + + float amax = fabsf(val); + +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) { + amax = fmaxf(amax, __shfl_xor_sync(0xffffffff, amax, mask, 32)); + } + + const float d = amax / 127; + const int8_t q = amax == 0.0f ? 
0 : roundf(val / d); + + dst->qs[iqs] = q; + + if (threadIdx.x != 0) { + return; + } + + dst->d = d; +} + // rope == RoPE == rotary positional embedding static __global__ void rope_f32(const float * x, float * dst, const int ncols, const float p0, const float p_delta, const int p_delta_rows, const float theta_scale) { @@ -4571,11 +4628,14 @@ static void rms_norm_f32_cuda(const float * x, float * dst, const int ncols, con } } -static void quantize_row_q8_1_cuda(const float * x, void * vy, const int kx, const int ky, const int kx_padded, cudaStream_t stream) { +static void quantize_row_q8_1_cuda( + const float * x, void * vy, const int kx, const int ky, const int kx_padded, const int nchannels, + const int row_stride, const int channel_stride, cudaStream_t stream) { + const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - const dim3 num_blocks(block_num_x, ky, 1); + const dim3 num_blocks(block_num_x, ky, nchannels); const dim3 block_size(CUDA_DEQUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>(x, vy, kx, kx_padded); + quantize_q8_1<<>>(x, vy, kx, kx_padded, ky, row_stride, channel_stride); } static void dequantize_row_q4_0_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -4644,181 +4704,240 @@ static void dequantize_row_q6_K_cuda(const void * vx, float * y, const int k, cu #endif } -static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q4_0_cuda( + const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows_x, const int nrows_y, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const int block_num_y = (nrows_x + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols, nrows); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows_x, nrows_y); } -static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q4_1_cuda( + const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows_x, const int nrows_y, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const int block_num_y = (nrows_x + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols, nrows); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows_x, nrows_y); } -static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q5_0_cuda( + const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows_x, const int nrows_y, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const int block_num_y = (nrows_x + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols, nrows); + 
dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows_x, nrows_y); } -static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q5_1_cuda( + const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows_x, const int nrows_y, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const int block_num_y = (nrows_x + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols, nrows); + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows_x, nrows_y); } -static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { - GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; +static void dequantize_mul_mat_vec_q8_0_cuda( + const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows_x, const int nrows_y, cudaStream_t stream) { + + const int block_num_y = (nrows_x + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec - <<>>(vx, y, dst, ncols, nrows); + if (ncols == nrows_y && ncols % GGML_CUDA_DMMV_X == 0) { + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows_x, nrows_y); + } else { + dequantize_mul_mat_vec + <<>>(vx, y, dst, ncols, nrows_x, nrows_y); + } } -static void dequantize_mul_mat_vec_q2_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q2_K_cuda( + const void * vx, const float * y, float * dst, const int ncols, const int nrows, const int nrows_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2 const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(32, ny, 1); dequantize_mul_mat_vec_q2_k<<>>(vx, y, dst, ncols, nrows); + (void) nrows_y; } -static void dequantize_mul_mat_vec_q3_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q3_K_cuda( + const void * vx, const float * y, float * dst, const int ncols, const int nrows, const int nrows_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(32, ny, 1); dequantize_mul_mat_vec_q3_k<<>>(vx, y, dst, ncols, nrows); + (void) nrows_y; } -static void dequantize_mul_mat_vec_q4_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q4_K_cuda( + const void * vx, const float * y, float * dst, const int ncols, const int nrows, const int nrows_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(32, ny, 1); dequantize_mul_mat_vec_q4_k<<>>(vx, y, dst, ncols, nrows); + (void) 
nrows_y; } -static void dequantize_mul_mat_vec_q5_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q5_K_cuda( + const void * vx, const float * y, float * dst, const int ncols, const int nrows, const int nrows_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); const dim3 block_dims(32, 1, 1); dequantize_mul_mat_vec_q5_k<<>>(vx, y, dst, ncols); + (void) nrows_y; } -static void dequantize_mul_mat_vec_q6_K_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void dequantize_mul_mat_vec_q6_K_cuda( + const void * vx, const float * y, float * dst, const int ncols, const int nrows, const int nrows_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); const int ny = 2 / K_QUANTS_PER_ITERATION; const int block_num_y = (nrows + ny - 1) / ny; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(32, ny, 1); dequantize_mul_mat_vec_q6_k<<>>(vx, y, dst, ncols, nrows); + (void) nrows_y; } -static void mul_mat_vec_q4_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q4_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels, + const int row_stride, const int channel_stride, const int channel_stride_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK4_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(1, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, row_stride, channel_stride, channel_stride_y); } -static void mul_mat_vec_q4_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q4_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels, + const int row_stride, const int channel_stride, const int channel_stride_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK4_1 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(1, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, row_stride, channel_stride, channel_stride_y); } -static void mul_mat_vec_q5_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q5_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels, + const int row_stride, const int channel_stride, const int channel_stride_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK5_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(1, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, row_stride, channel_stride, channel_stride_y); } -static void mul_mat_vec_q5_1_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) 
{ +static void mul_mat_vec_q5_1_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels, + const int row_stride, const int channel_stride, const int channel_stride_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK5_1 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(1, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, row_stride, channel_stride, channel_stride_y); } -static void mul_mat_vec_q8_0_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q8_0_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels, + const int row_stride, const int channel_stride, const int channel_stride_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK8_0 == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(1, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, row_stride, channel_stride, channel_stride_y); } -static void mul_mat_vec_q2_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q2_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels, + const int row_stride, const int channel_stride, const int channel_stride_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(1, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, row_stride, channel_stride, channel_stride_y); } -static void mul_mat_vec_q3_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q3_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels, + const int row_stride, const int channel_stride, const int channel_stride_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(1, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, row_stride, channel_stride, channel_stride_y); } -static void mul_mat_vec_q4_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q4_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels, + const int row_stride, const int channel_stride, const int channel_stride_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(1, 
block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, row_stride, channel_stride, channel_stride_y); } -static void mul_mat_vec_q5_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q5_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels, + const int row_stride, const int channel_stride, const int channel_stride_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(1, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, row_stride, channel_stride, channel_stride_y); } -static void mul_mat_vec_q6_K_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void mul_mat_vec_q6_K_q8_1_cuda( + const void * vx, const void * vy, float * dst, const int ncols, const int nrows, const int nchannels, + const int row_stride, const int channel_stride, const int channel_stride_y, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; - const dim3 block_nums(1, block_num_y, 1); + const dim3 block_nums(1, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); mul_mat_vec_q - <<>>(vx, vy, dst, ncols, nrows); + <<>>(vx, vy, dst, ncols, nrows, row_stride, channel_stride, channel_stride_y); } static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, cudaStream_t stream) { @@ -4826,13 +4945,15 @@ static void convert_fp16_to_fp32_cuda(const void * vx, float * y, const int k, c dequantize_block<1, 1, convert_f16><<>>(vx, y, k); } -static void convert_mul_mat_vec_f16_cuda(const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) { +static void convert_mul_mat_vec_f16_cuda( + const void * vx, const dfloat * y, float * dst, const int ncols, const int nrows_x, const int nrows_y, cudaStream_t stream) { + GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0); - const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const int block_num_y = (nrows_x + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; const dim3 block_nums(1, block_num_y, 1); const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); - dequantize_mul_mat_vec<1, 1, convert_f16> - <<>>(vx, y, dst, ncols, nrows); + dequantize_mul_mat_vec<1, 1, convert_f16, false> + <<>>(vx, y, dst, ncols, nrows_x, nrows_y); } static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { @@ -4866,7 +4987,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { static void ggml_mul_mat_q4_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + const int ncols_y, const int nrows_y, const int nrows_dst, const int nchannels, + const int row_stride, const int channel_stride_x, const int channel_stride_y, cudaStream_t stream) { int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -4895,23 +5017,26 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + 
mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_nums(block_num_x, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; mul_mat_q4_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } else { const bool need_check = true; mul_mat_q4_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } } static void ggml_mul_mat_q4_1_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + const int ncols_y, const int nrows_y, const int nrows_dst, const int nchannels, + const int row_stride, const int channel_stride_x, const int channel_stride_y, cudaStream_t stream) { int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -4940,23 +5065,26 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_nums(block_num_x, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; mul_mat_q4_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } else { const bool need_check = true; mul_mat_q4_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } } static void ggml_mul_mat_q5_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + const int ncols_y, const int nrows_y, const int nrows_dst, const int nchannels, + const int row_stride, const int channel_stride_x, const int channel_stride_y, cudaStream_t stream) { int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -4985,23 +5113,26 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_nums(block_num_x, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; mul_mat_q5_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } else { const bool need_check = true; mul_mat_q5_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } } static void ggml_mul_mat_q5_1_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + const int ncols_y, const int nrows_y, const int nrows_dst, const int nchannels, + const int row_stride, const int channel_stride_x, const int 
channel_stride_y, cudaStream_t stream) { int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -5030,23 +5161,26 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_nums(block_num_x, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; mul_mat_q5_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } else { const bool need_check = true; mul_mat_q5_1<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } } static void ggml_mul_mat_q8_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + const int ncols_y, const int nrows_y, const int nrows_dst, const int nchannels, + const int row_stride, const int channel_stride_x, const int channel_stride_y, cudaStream_t stream) { int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -5075,23 +5209,26 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_nums(block_num_x, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; mul_mat_q8_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } else { const bool need_check = true; mul_mat_q8_0<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } } static void ggml_mul_mat_q2_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + const int ncols_y, const int nrows_y, const int nrows_dst, const int nchannels, + const int row_stride, const int channel_stride_x, const int channel_stride_y, cudaStream_t stream) { int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -5120,23 +5257,26 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_nums(block_num_x, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; mul_mat_q2_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } else { const bool need_check = true; mul_mat_q2_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } } static void ggml_mul_mat_q3_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int 
nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + const int ncols_y, const int nrows_y, const int nrows_dst, const int nchannels, + const int row_stride, const int channel_stride_x, const int channel_stride_y, cudaStream_t stream) { #if QK_K == 256 @@ -5167,24 +5307,27 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_nums(block_num_x, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; mul_mat_q3_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } else { const bool need_check = true; mul_mat_q3_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } #endif } static void ggml_mul_mat_q4_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + const int ncols_y, const int nrows_y, const int nrows_dst, const int nchannels, + const int row_stride, const int channel_stride_x, const int channel_stride_y, cudaStream_t stream) { int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -5213,23 +5356,26 @@ static void ggml_mul_mat_q4_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_nums(block_num_x, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; mul_mat_q4_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } else { const bool need_check = true; mul_mat_q4_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } } static void ggml_mul_mat_q5_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + const int ncols_y, const int nrows_y, const int nrows_dst, const int nchannels, + const int row_stride, const int channel_stride_x, const int channel_stride_y, cudaStream_t stream) { int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -5258,23 +5404,26 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_nums(block_num_x, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; mul_mat_q5_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } else { const bool need_check = true; mul_mat_q5_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, 
ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } } static void ggml_mul_mat_q6_K_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, - const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { + const int ncols_y, const int nrows_y, const int nrows_dst, const int nchannels, + const int row_stride, const int channel_stride_x, const int channel_stride_y, cudaStream_t stream) { int id; CUDA_CHECK(cudaGetDevice(&id)); @@ -5303,17 +5452,19 @@ static void ggml_mul_mat_q6_K_q8_1_cuda( const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y; const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x; - const dim3 block_nums(block_num_x, block_num_y, 1); + const dim3 block_nums(block_num_x, block_num_y, nchannels); const dim3 block_dims(WARP_SIZE, nwarps, 1); if (nrows_x % mmq_y == 0) { const bool need_check = false; mul_mat_q6_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } else { const bool need_check = true; mul_mat_q6_K<<>> - (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); + (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, + row_stride, channel_stride_x, channel_stride_y); } } @@ -5356,6 +5507,43 @@ static void ggml_cpy_f32_f16_cuda( (cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12); } +static void ggml_cpy_f32_q8_0_cuda( + const char * cx, char * cdst, const int i_blck_0, const int ne00, const int ne01, const int ne02, + const int nb00, const int nb01, const int nb02, const int nb11, const int nb12, const bool pad, cudaStream_t stream) { + + const int num_blocks_x = (i_blck_0 + ne00 + WARP_SIZE - 1) / WARP_SIZE; + const dim3 block_nums(num_blocks_x, ne01, ne02); + const dim3 block_dims(WARP_SIZE, 1 , 1); + + const bool first_incomplete = i_blck_0 != 0; + const bool last_incomplete = (i_blck_0 + ne00) % QK8_0 != 0; + + if (first_incomplete && last_incomplete) { + GGML_ASSERT(i_blck_0 + ne00 < QK8_0); // otherwise there would be a race condition + GGML_ASSERT(pad == false); + cpy_f32_q8_0<<>> + (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12); + } else if (first_incomplete && !last_incomplete) { + GGML_ASSERT(pad == false); + cpy_f32_q8_0<<>> + (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12); + } else if (!first_incomplete && last_incomplete && pad) { + cpy_f32_q8_0<<>> + (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12); + } else if (!first_incomplete && last_incomplete && !pad) { + cpy_f32_q8_0<<>> + (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12); + } else if (!first_incomplete && !last_incomplete && pad) { + cpy_f32_q8_0<<>> + (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12); + } else if (!first_incomplete && !last_incomplete && !pad) { + cpy_f32_q8_0<<>> + (cx, cdst, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, nb11, nb12); + } else { + GGML_ASSERT(false); + } +} + static void scale_f32_cuda(const float * x, float * dst, const float scale, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE; scale_f32<<>>(x, dst, scale, k); @@ -5756,18 +5944,23 @@ inline void ggml_cuda_op_rms_norm( (void) src1_dd; } +template inline void ggml_cuda_op_mul_mat_q( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * 
src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, const cudaStream_t & stream) { const int64_t ne00 = src0->ne[0]; + const int64_t ne02 = src0->ne[2]; const int64_t ne10 = src1->ne[0]; - GGML_ASSERT(ne10 % QK8_1 == 0); + const int64_t ne11 = src1->ne[1]; const int64_t ne0 = dst->ne[0]; + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + const int64_t row_diff = row_high - row_low; int id; @@ -5777,36 +5970,64 @@ inline void ggml_cuda_op_mul_mat_q( // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff; + const int64_t nchannels = buffers_contiguous ? 1 : ne02; + + const int64_t src0_blck_size = ggml_blck_size(src0->type); + const int64_t ne10_whole_blck = ne10 % src0_blck_size == 0 ? ne10 : ne10 - ne10 % src0_blck_size + src0_blck_size; + const int64_t row_stride = buffers_contiguous ? ne10_whole_blck / ggml_blck_size(src0->type) : nb01 / ggml_type_size(src0->type); + const int64_t channel_stride_x = buffers_contiguous ? ne10_whole_blck*ne11 / ggml_blck_size(src0->type) : nb02 / ggml_type_size(src0->type); + const int64_t channel_stride_y = src1_padded_row_size*ne11 / QK8_1; + switch (src0->type) { case GGML_TYPE_Q4_0: - ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + ggml_mul_mat_q4_0_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, nchannels, + row_stride, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q4_1: - ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + ggml_mul_mat_q4_1_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, nchannels, + row_stride, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q5_0: - ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + ggml_mul_mat_q5_0_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, nchannels, + row_stride, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q5_1: - ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + ggml_mul_mat_q5_1_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, nchannels, + row_stride, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q8_0: - ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + ggml_mul_mat_q8_0_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, nchannels, + row_stride, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q2_K: - ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + ggml_mul_mat_q2_K_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, nchannels, + row_stride, channel_stride_x, channel_stride_y, stream); break; 
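Each case above forwards the same stride set; when buffers_contiguous is true the strides are rebuilt from a densely packed buffer whose row length is first rounded up to a whole quantization block (the ne10_whole_blck expression). A standalone restatement of that rounding, assuming only a positive block size:

    // Equivalent form of the ne10_whole_blck computation, written as a helper.
    // For q8_0 (block size 32), e.g. 4097 columns pad up to 4128.
    #include <cassert>
    #include <cstdint>

    static inline int64_t round_up_to_blocks(int64_t n, int64_t blck_size) {
        assert(blck_size > 0);
        return ((n + blck_size - 1) / blck_size) * blck_size;
    }

The same rounding idiom also replaces the old src1_padded_col_size computation in ggml_cuda_op_mul_mat further down.
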
case GGML_TYPE_Q3_K: - ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + ggml_mul_mat_q3_K_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, nchannels, + row_stride, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q4_K: - ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + ggml_mul_mat_q4_K_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, nchannels, + row_stride, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q5_K: - ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + ggml_mul_mat_q5_K_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, nchannels, + row_stride, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q6_K: - ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream); + ggml_mul_mat_q6_K_q8_1_cuda( + src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, nchannels, + row_stride, channel_stride_x, channel_stride_y, stream); break; default: GGML_ASSERT(false); @@ -5877,44 +6098,64 @@ static int64_t get_row_rounding(ggml_type type) { #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } +template inline void ggml_cuda_op_mul_mat_vec_q( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, const cudaStream_t & stream) { const int64_t ne00 = src0->ne[0]; + const int64_t ne02 = src0->ne[2]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; + const int64_t row_diff = row_high - row_low; + const int nchannels = buffers_contiguous ? 1 : ne02; + const int row_stride_x = buffers_contiguous ? ne00 / ggml_blck_size(src0->type) : nb01 / ggml_type_size(src0->type); + const int channel_stride_x = buffers_contiguous ? 
ne00*1 / ggml_blck_size(src0->type) : nb02 / ggml_type_size(src0->type); + const int channel_stride_y = src1_padded_row_size / QK8_1; switch (src0->type) { case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, nchannels, + row_stride_x, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, nchannels, + row_stride_x, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, nchannels, + row_stride_x, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, nchannels, + row_stride_x, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, nchannels, + row_stride_x, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, nchannels, + row_stride_x, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, nchannels, + row_stride_x, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, nchannels, + row_stride_x, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, nchannels, + row_stride_x, channel_stride_x, channel_stride_y, stream); break; case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, nchannels, + row_stride_x, channel_stride_x, channel_stride_y, stream); break; default: GGML_ASSERT(false); @@ -5934,6 +6175,9 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( const int64_t src1_padded_row_size, const cudaStream_t & stream) { const int64_t ne00 = src0->ne[0]; + + const int64_t ne10 = src1->ne[0]; + const int64_t row_diff = row_high - row_low; // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics @@ -5957,37 +6201,37 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec( switch (src0->type) { case GGML_TYPE_Q4_0: - dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, stream); break; case GGML_TYPE_Q4_1: - 
dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, stream); break; case GGML_TYPE_Q5_0: - dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, stream); break; case GGML_TYPE_Q5_1: - dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, stream); break; case GGML_TYPE_Q8_0: - dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, stream); break; case GGML_TYPE_Q2_K: - dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, ne10, stream); break; case GGML_TYPE_Q3_K: - dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, ne10, stream); break; case GGML_TYPE_Q4_K: - dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, ne10, stream); break; case GGML_TYPE_Q5_K: - dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, ne10, stream); break; case GGML_TYPE_Q6_K: - dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream); + dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, ne10, stream); break; case GGML_TYPE_F16: - convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream); + convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, ne10, stream); break; default: GGML_ASSERT(false); @@ -6321,6 +6565,9 @@ static void ggml_cuda_op_mul_mat( const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; + const int nb11 = src1->nb[1]; + const int nb12 = src1->nb[2]; + const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; @@ -6346,8 +6593,7 @@ static void ggml_cuda_op_mul_mat( const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); - const int64_t src1_padded_col_size = ne10 % MATRIX_ROW_PADDING == 0 ? - ne10 : ne10 - ne10 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; + const int64_t src1_padded_col_size = ((ne10 + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING) * MATRIX_ROW_PADDING; const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; GGML_ASSERT(!(split && ne02 > 1)); @@ -6405,8 +6651,9 @@ static void ggml_cuda_op_mul_mat( if (src0_on_device && src0_is_contiguous) { src0_dd[id] = (char *) src0_extra->data_device[id]; } else { + GGML_ASSERT(!split || (ne02 == 1 && ne03 == 1)); const size_t size_src0_ddq = split ? 
(row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); - src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]); + src0_dd[id] = (char *) ggml_cuda_pool_malloc(size_src0_ddq, &src0_as[id]); } if (src1_on_device && src1_is_contiguous) { @@ -6418,8 +6665,9 @@ static void ggml_cuda_op_mul_mat( if (convert_src1_to_q8_1) { src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); - if (split && src1_on_device && src1_is_contiguous) { - quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); + if (src1_on_device && src1_is_contiguous) { + quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, ne11, src1_padded_col_size, + ne02, nb11/sizeof(float), nb12/sizeof(float), stream); CUDA_CHECK(cudaGetLastError()); } } @@ -6500,8 +6748,9 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) { - quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); + if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous))) { + quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, 1, + ne10, ne10*ne11, stream); CUDA_CHECK(cudaGetLastError()); } @@ -6632,25 +6881,73 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te return false; } +void ggml_cuda_mul_mat_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ + GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); + GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); + GGML_ASSERT(dst->backend == GGML_BACKEND_GPU); + GGML_ASSERT(ggml_is_quantized(src0->type)); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + + const int64_t src1_padded_col_size = ((ne10 + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING) * MATRIX_ROW_PADDING; + + CUDA_CHECK(cudaSetDevice(g_main_device)); + const cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + char * src0_ddq = (char *) src0_extra->data_device[g_main_device]; + + struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + size_t as; + char * src1_ddq = (char *) ggml_cuda_pool_malloc(src1_padded_col_size*ggml_nrows(src1), &as); + quantize_row_q8_1_cuda(src1_ddf, src1_ddq, ne10, ne11, src1_padded_col_size, ne02, + nb11/sizeof(float), nb12/sizeof(float), main_stream); + + ggml_cuda_op_mul_mat_q(src0, src1, dst, src0_ddq, nullptr, src1_ddq, dst_ddf, + 0, ne01, ne11, src1_padded_col_size, main_stream); + CUDA_CHECK(cudaGetLastError()); + + ggml_cuda_pool_free(src1_ddq, as); +} + void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && 
src1->nb[2] <= src1->nb[3]); // 0213 permutation - GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; const int64_t ne12 = src1->ne[2]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + + const int64_t src1_padded_col_size = ((ne10 + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING) * MATRIX_ROW_PADDING; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - void * src0_ddq = src0_extra->data_device[g_main_device]; + char * src0_ddq = (char *) src0_extra->data_device[g_main_device]; struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; @@ -6658,30 +6955,50 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; - ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); + if (src0->type == GGML_TYPE_F16) { + ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); + } else if (ggml_is_quantized(src0->type)) { + size_t as; + char * src1_ddq = (char *) ggml_cuda_pool_malloc(src1_padded_col_size*ggml_nrows(src1), &as); + quantize_row_q8_1_cuda(src1_ddf, src1_ddq, ne10, ne11, src1_padded_col_size, ne02, + nb11/sizeof(float), nb12/sizeof(float), main_stream); + + ggml_cuda_op_mul_mat_vec_q(src0, src1, dst, src0_ddq, nullptr, src1_ddq, dst_ddf, + 0, ne01, ne10, src1_padded_col_size, main_stream); + + ggml_cuda_pool_free(src1_ddq, as); + } else { + GGML_ASSERT(false); + } } void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); - GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; const int64_t ne12 = src1->ne[2]; const int64_t nb01 = src0->nb[1]; const int64_t nb02 = src0->nb[2]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + + const int64_t src1_padded_col_size = ((ne10 + MATRIX_ROW_PADDING - 1) / MATRIX_ROW_PADDING) * MATRIX_ROW_PADDING; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - void * src0_ddq = src0_extra->data_device[g_main_device]; + char * src0_ddq = (char *) src0_extra->data_device[g_main_device]; struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; @@ -6689,15 +7006,30 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; - 
const int64_t row_stride_x = nb01 / sizeof(half); - const int64_t channel_stride_x = nb02 / sizeof(half); + if (src0->type == GGML_TYPE_F16) { + const int row_stride_x = nb01 / sizeof(half); + const int channel_stride_x = nb02 / sizeof(half); - ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); + ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); + } else if (ggml_is_quantized(src0->type)) { + size_t as; + char * src1_ddq = (char *) ggml_cuda_pool_malloc(src1_padded_col_size*ggml_nrows(src1), &as); + quantize_row_q8_1_cuda(src1_ddf, src1_ddq, ne10, ne11, src1_padded_col_size, ne02, + nb11/sizeof(float), nb12/sizeof(float), main_stream); + + ggml_cuda_op_mul_mat_vec_q(src0, src1, dst, src0_ddq, nullptr, src1_ddq, dst_ddf, + 0, ne01, ne10, src1_padded_col_size, main_stream); + + ggml_cuda_pool_free(src1_ddq, as); + } else { + GGML_ASSERT(false); + } } void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && + const bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; + const bool src0_is_quantized = ggml_is_quantized(src0->type); int64_t min_compute_capability = INT_MAX; for (int64_t id = 0; id < g_device_count; ++id) { @@ -6707,9 +7039,12 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ } } - if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + // no quantized non-contiguous support for lower CC kernels implemented + const bool nc_okay = src0->type == GGML_TYPE_F16 || g_compute_capabilities[g_main_device] >= MIN_CC_DP4A; + + if (all_on_device && nc_okay && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_p021(src0, src1, dst); - } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { + } else if (all_on_device && nc_okay && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { ggml_cuda_mul_mat_vec_nc(src0, src1, dst); }else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); @@ -6723,13 +7058,17 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_ #endif // GGML_CUDA_FORCE_DMMV if (use_mul_mat_vec_q) { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); } else { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); } } else { if (g_mul_mat_q && ggml_is_quantized(src0->type) && min_compute_capability >= MIN_CC_DP4A) { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); + if (all_on_device && nc_okay && src0->backend != GGML_BACKEND_GPU_SPLIT) { + ggml_cuda_mul_mat_nc(src0, src1, dst); + } else { + ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); + } } else { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } @@ -6744,9 +7083,6 @@ void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_te } void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - 
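The dispatch change above is the key enabler: the permuted and non-contiguous single-column fast paths, as well as the batched ggml_cuda_mul_mat_nc path, now accept a quantized src0 (i.e. the q8_0 KV cache), but only when all tensors live on the GPU and the device has DP4A support. Restated as a standalone predicate (the names below are illustrative stand-ins, not the patch's symbols):

    // Sketch of the new gating logic in ggml_cuda_mul_mat: quantized non-contiguous
    // inputs require the *_q8_1 kernels and therefore a DP4A-capable device.
    enum class SrcType { F16, Quantized, F32 };

    static bool nc_kernels_okay(bool all_on_device, SrcType src0_type,
                                int compute_capability, int min_cc_dp4a) {
        const bool nc_okay = src0_type == SrcType::F16 || compute_capability >= min_cc_dp4a;
        return all_on_device && nc_okay;
    }
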
const int64_t ne = ggml_nelements(src0); - GGML_ASSERT(ne == ggml_nelements(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_GPU); GGML_ASSERT(src1->backend == GGML_BACKEND_GPU); @@ -6755,6 +7091,7 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens const int64_t ne00 = src0->ne[0]; const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; GGML_ASSERT(src0->ne[3] == 1); const int64_t nb00 = src0->nb[0]; @@ -6769,6 +7106,16 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens const int64_t nb11 = src1->nb[1]; const int64_t nb12 = src1->nb[2]; + const int64_t blck_size = ggml_blck_size(src1->type); + const int64_t ne00_padded = ((ne00 + blck_size - 1) / blck_size) * blck_size; + const int64_t ne = ggml_nelements(src0); + const bool pad = dst->op_params[0] & 1; + if (pad) { + GGML_ASSERT(ne00_padded * ggml_nrows(src0) == ggml_nelements(src1)); + } else { + GGML_ASSERT(ne == ggml_nelements(src1)); + } + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; @@ -6784,6 +7131,23 @@ void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tens } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) { ggml_cpy_f32_f16_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream); + } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) { + GGML_ASSERT(nb10 == sizeof(block_q8_0)); + + size_t i_blck_0 = 0; + if (src1->op == GGML_OP_VIEW) { + const size_t * op_params = (const size_t *) src1->op_params; + i_blck_0 = op_params[1]; + } + + if (ggml_is_contiguous(src1)) { + ggml_cpy_f32_q8_0_cuda( + src0_ddc, src1_ddc, i_blck_0, ne00, ne01, ne02, nb00, nb01, nb02, + ne00_padded*sizeof(block_q8_0)/QK8_0, ne00_padded*ne01*sizeof(block_q8_0)/QK8_0, pad, main_stream); + } else { + ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, i_blck_0, ne00, ne01, ne02, + nb00, nb01, nb02, nb11, nb12, pad, main_stream); + } } else { GGML_ASSERT(false); } diff --git a/ggml.c b/ggml.c index a0be068d6c9f7..6d11cf9f10d36 100644 --- a/ggml.c +++ b/ggml.c @@ -1117,11 +1117,19 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { assert(QK8_0 == 32); - assert(k % QK8_0 == 0); const int nb = k / QK8_0; block_q8_0 * restrict y = vy; + if (k % QK8_0 != 0) { + float x_end[QK8_0] = {0}; + memcpy(x_end, x + nb*QK8_0, sizeof(float) * (k % QK8_0)); + + block_q8_0 * y_end = y + nb; + + quantize_row_q8_0(x_end, y_end, QK8_0); + } + #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { float32x4_t srcv [8]; @@ -4384,8 +4392,13 @@ static inline bool ggml_is_matrix(const struct ggml_tensor * tensor) { static inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return (t0->ne[0] == t1->ne[0]) && - (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable + const int64_t blck_size = ggml_blck_size(t0->type); + + const int64_t nblcks00_padded = (t0->ne[0] + blck_size - 1) / blck_size; + const int64_t nblcks10_padded = (t1->ne[0] + blck_size - 1) / blck_size; + + return (nblcks00_padded == nblcks10_padded) && // ensure same number of blocks after padding + (t1->ne[2]%t0->ne[2] == 0) && // verify t0 is broadcastable (t1->ne[3]%t0->ne[3] == 0); } @@ -6333,8 +6346,15 @@ static struct ggml_tensor 
* ggml_cpy_impl( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, - bool inplace) { - GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); + const bool inplace, + const bool pad) { + if (pad) { + const int64_t blck_size = ggml_blck_size(b->type); + const int64_t ne00_padded = ((a->ne[0] + blck_size - 1) / blck_size) * blck_size; + GGML_ASSERT(ne00_padded*ggml_nrows(a) == ggml_nelements(b)); + } else { + GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); + } bool is_node = false; @@ -6350,6 +6370,8 @@ static struct ggml_tensor * ggml_cpy_impl( ggml_format_name(result, "%s (copy)", a->name); } + ggml_set_op_params_i32(result, 0, pad ? 1 : 0); + result->op = GGML_OP_CPY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; @@ -6362,14 +6384,21 @@ struct ggml_tensor * ggml_cpy( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, false); + return ggml_cpy_impl(ctx, a, b, false, false); } struct ggml_tensor * ggml_cpy_inplace( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, true); + return ggml_cpy_impl(ctx, a, b, true, false); +} + +struct ggml_tensor * ggml_cpy_pad( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + return ggml_cpy_impl(ctx, a, b, false, true); } // ggml_cont @@ -6544,7 +6573,8 @@ static struct ggml_tensor * ggml_view_impl( struct ggml_tensor * a, int n_dims, const int64_t * ne, - size_t offset) { + size_t offset, + size_t i_blck) { bool is_node = false; @@ -6555,7 +6585,8 @@ static struct ggml_tensor * ggml_view_impl( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, n_dims, ne, a, offset); ggml_format_name(result, "%s (view)", a->name); - ggml_set_op_params(result, &offset, sizeof(offset)); + size_t params[2] = {offset, i_blck}; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_VIEW; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -6572,7 +6603,7 @@ struct ggml_tensor * ggml_view_1d( int64_t ne0, size_t offset) { - struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset, 0); return result; } @@ -6589,7 +6620,7 @@ struct ggml_tensor * ggml_view_2d( const int64_t ne[2] = { ne0, ne1 }; - struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset, 0); result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; @@ -6612,7 +6643,7 @@ struct ggml_tensor * ggml_view_3d( const int64_t ne[3] = { ne0, ne1, ne2 }; - struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 3, ne, offset, 0); result->nb[1] = nb1; result->nb[2] = nb2; @@ -6637,7 +6668,7 @@ struct ggml_tensor * ggml_view_4d( const int64_t ne[4] = { ne0, ne1, ne2, ne3 }; - struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset); + struct ggml_tensor * result = ggml_view_impl(ctx, a, 4, ne, offset, 0); result->nb[1] = nb1; result->nb[2] = nb2; @@ -6646,6 +6677,42 @@ struct ggml_tensor * ggml_view_4d( return result; } +// ggml_view_blck_1d + +struct ggml_tensor * ggml_view_blck_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + size_t offset, + size_t i_blck) { + + struct ggml_tensor * result = ggml_view_impl(ctx, a, 1, &ne0, offset, i_blck); + + return result; +} + +// ggml_view_blck_2d + +struct ggml_tensor * ggml_view_blck_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, + size_t offset, + size_t i_blck) { + + const int64_t ne[2] = { ne0, ne1 }; + + struct ggml_tensor * result = ggml_view_impl(ctx, a, 2, ne, offset, i_blck); + + result->nb[1] = nb1; + result->nb[2] = result->nb[1]*ne1; + result->nb[3] = result->nb[2]; + + return result; +} + // ggml_permute struct ggml_tensor * ggml_permute( @@ -8216,6 +8283,8 @@ static void ggml_compute_forward_dup_f16( GGML_TENSOR_UNARY_OP_LOCALS; + GGML_ASSERT(dst->op_params[0] == 0); + const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -8479,14 +8548,21 @@ static void ggml_compute_forward_dup_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { - GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } GGML_TENSOR_UNARY_OP_LOCALS; + const bool pad = dst->op_params[0] & 1; + const int blck_size = ggml_blck_size(dst->type); + const int ne00_padded = ((ne00 + blck_size - 1) / blck_size) * blck_size; + if (pad) { + GGML_ASSERT(ggml_nelements(dst) == ne00_padded*ggml_nrows(src0)); + } else { + GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0)); + } + const int ith = params->ith; // thread index const int nth = params->nth; // number of threads @@ -8544,15 +8620,20 @@ static void ggml_compute_forward_dup_f32( ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; size_t id = 0; - size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type)); + const size_t rs = nb0 * ne00_padded / blck_size; char * dst_ptr = (char *) dst->data; + float src0_padded[ne00_padded]; for (int i03 = 0; i03 < ne03; i03++) { for (int i02 = 0; i02 < ne02; i02++) { id += rs * ir0; for (int i01 = ir0; i01 < ir1; i01++) { const float * src0_ptr = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + 
i03*nb03); - quantize_row_q(src0_ptr, dst_ptr + id, ne00); + if (ne00 != ne00_padded) { + memcpy(src0_padded, src0_ptr, ne00*sizeof(float)); + memset(src0_padded + ne00, 0, (ne00_padded - ne00) * sizeof(float)); + } + quantize_row_q(ne00 == ne00_padded ? src0_ptr : src0_padded, dst_ptr + id, ne00_padded); id += rs; } id += rs * (ne01 - ir1); @@ -8719,6 +8800,48 @@ static void ggml_compute_forward_dup_f32( } } } + } else if (type_traits[dst->type].from_float) { + GGML_ASSERT(!pad); + GGML_ASSERT(ne00 == ne0); + GGML_ASSERT(ne01 == ne1); + GGML_ASSERT(ne02 == ne2); + GGML_ASSERT(ne03 == ne3); + + size_t blck_index_0 = 0; + if (dst->src[1]->op == GGML_OP_VIEW) { + const size_t * op_params = (const size_t *) dst->src[1]->op_params; + blck_index_0 = op_params[1]; + } + + ggml_from_float_t const quantize_row_q = type_traits[dst->type].from_float; + + for (int i03 = 0; i03 < ne03; i03++) { + for (int i02 = 0; i02 < ne02; i02++) { + for (int i01 = ir0; i01 < ir1; i01++) { + const char * src0_row_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03; + char * dst_row_ptr = (char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3; + size_t blck_index = blck_index_0; + + for (int i00 = 0; i00 < ne00; ++i00) { + char * dst_ptr = dst_row_ptr + + ggml_element_size(dst) * ((i00 + blck_index_0) / ggml_blck_size(dst->type)); + float * dst_tmp_ptr = (float *) (dst_ptr + ggml_element_size(dst)); + + if (blck_index == 0) { + memset(dst_tmp_ptr, 0, ggml_blck_size(dst->type)*sizeof(float)); + } + + dst_tmp_ptr[blck_index] = *((const float *) (src0_row_ptr + i00*nb00)); + + blck_index = (blck_index + 1) % ggml_blck_size(dst->type); + + if (blck_index == 0 || i00 == (ne00 - 1)) { + quantize_row_q(dst_tmp_ptr, dst_ptr, ggml_blck_size(dst->type)); + } + } + } + } + } } else { GGML_ASSERT(false); // TODO: implement } @@ -11333,7 +11456,8 @@ static void ggml_compute_forward_mul_mat( if (params->type == GGML_TASK_INIT) { if (src1->type != vec_dot_type) { char * wdata = params->wdata; - const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + const size_t row_size = ggml_type_size(vec_dot_type)*((ne10 + ggml_blck_size(vec_dot_type) - 1) + / ggml_blck_size(vec_dot_type)); for (int64_t i13 = 0; i13 < ne13; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { @@ -11353,7 +11477,8 @@ static void ggml_compute_forward_mul_mat( } const void * wdata = (src1->type == vec_dot_type) ? 
src1->data : params->wdata; - const size_t row_size = ne10*ggml_type_size(vec_dot_type)/ggml_blck_size(vec_dot_type); + const size_t row_size = ggml_type_size(vec_dot_type)*((ne10 + ggml_blck_size(vec_dot_type) - 1) + / ggml_blck_size(vec_dot_type)); const int64_t nr0 = ne01; // src0 rows const int64_t nr1 = ne11*ne12*ne13; // src1 rows diff --git a/ggml.h b/ggml.h index f45456876da62..0bc5eb82aae17 100644 --- a/ggml.h +++ b/ggml.h @@ -1062,6 +1062,12 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + // a -> b, pad row size of a to a multiple of block size of b, return view(b) + GGML_API struct ggml_tensor * ggml_cpy_pad( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b); + // make contiguous GGML_API struct ggml_tensor * ggml_cont( struct ggml_context * ctx, @@ -1146,6 +1152,22 @@ extern "C" { size_t nb3, size_t offset); + GGML_API struct ggml_tensor * ggml_view_blck_1d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + size_t offset, + size_t i_blck); + + GGML_API struct ggml_tensor * ggml_view_blck_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + int64_t ne0, + int64_t ne1, + size_t nb1, // row stride in bytes + size_t offset, + size_t i_blck); + GGML_API struct ggml_tensor * ggml_permute( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/llama.cpp b/llama.cpp index 79b48897d8bbe..e6bd8a0b7ae1b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -963,12 +963,33 @@ struct llama_hparams { return n_embd/n_gqa(); } - size_t kv_size() const { - size_t result = 2ull; - result *= (size_t) n_embd_gqa(); + size_t kv_size(ggml_type type) const { + return kv_size_k(type) + kv_size_v(type); + } + + size_t kv_size_k(ggml_type type) const { + const int64_t blck_size = ggml_blck_size(type); + const int64_t n_embd_head_padded = ((n_embd_head() + blck_size - 1) / blck_size) * blck_size; + + size_t result = 1ull; + result *= (size_t) n_embd_head_padded; + result *= (size_t) n_head_kv; result *= (size_t) n_ctx; result *= (size_t) n_layer; - result *= sizeof(ggml_fp16_t); + result *= ggml_type_size(type); + result /= blck_size; + return result; + } + + size_t kv_size_v(ggml_type type) const { + const size_t row_padding = type == GGML_TYPE_Q8_0 ? 
128 : 0; + + size_t result = 1ull; + result *= (size_t) n_embd_gqa(); + result *= (size_t) n_ctx + row_padding; + result *= (size_t) n_layer; + result *= ggml_type_size(type); + result /= ggml_blck_size(type); return result; } }; @@ -1165,6 +1186,9 @@ struct llama_context { // key + value cache for the self attention struct llama_kv_cache kv_self; + std::vector<llama_token> token_history; + int64_t previous_v_blck; + // decode output (2-dimensional array: [n_tokens][n_vocab]) std::vector<float> logits; bool logits_all = false; @@ -1200,13 +1224,25 @@ static bool llama_kv_cache_init( ggml_type wtype, int n_ctx, int n_gpu_layers) { - const int n_embd = hparams.n_embd_gqa(); - const int n_layer = hparams.n_layer; + const int blck_size = ggml_blck_size(wtype); + const int n_embd_head = hparams.n_embd_head(); + const int n_embd_head_padded = ((n_embd_head + blck_size - 1) / blck_size) * blck_size; + const int n_head_kv = hparams.n_head_kv; + const int n_layer = hparams.n_layer; + + if (n_ctx % ggml_blck_size(wtype) != 0) { + LLAMA_LOG_ERROR("error: for KV type %s n_ctx must be a multiple of %d but received n_ctx=%d\n", + ggml_type_name(wtype), ggml_blck_size(wtype), n_ctx); + return false; + } - const int64_t n_mem = n_layer*n_ctx; - const int64_t n_elements = n_embd*n_mem; + // if the KV cache is quantized we need a little extra space for each row to store the + // unquantized values between evals (this avoids precision loss when rebuilding the block) + const int64_t n_mem = n_layer*n_ctx; + const int64_t n_elements_k = n_embd_head_padded * n_head_kv * n_mem; + const int64_t n_elements_v = n_embd_head * n_head_kv * (n_mem + (wtype == GGML_TYPE_Q8_0 ? 128*n_layer : 0)); - cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); + cache.buf.resize((n_elements_k + n_elements_v)*ggml_type_size(wtype)/ggml_blck_size(wtype) + 2u*MB); cache.n = 0; struct ggml_init_params params; @@ -1221,8 +1257,8 @@ static bool llama_kv_cache_init( return false; } - cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); + cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements_k); + cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements_v); ggml_set_name(cache.k, "cache_k"); ggml_set_name(cache.v, "cache_v"); @@ -2305,15 +2341,13 @@ static void llm_load_tensors( // print memory requirements { - const size_t scale = memory_type == GGML_TYPE_F32 ?
2 : 1; - // this is the total memory required to run the inference size_t mem_required = ctx_size + mmapped_size - vram_weights; // weights in VRAM not in memory // this is the memory required by one llama_state - const size_t mem_required_state = scale*hparams.kv_size(); + const size_t mem_required_state = hparams.kv_size(memory_type); LLAMA_LOG_INFO("%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); @@ -2337,7 +2371,7 @@ static void llm_load_tensors( LLAMA_LOG_INFO("%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); } else { LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__); - vram_kv_cache += hparams.kv_size() / 2; + vram_kv_cache += hparams.kv_size_v(memory_type); } } if (n_gpu_layers > (int) hparams.n_layer + 2) { @@ -2345,7 +2379,7 @@ static void llm_load_tensors( LLAMA_LOG_WARN("%s: cannot offload k cache to GPU due to low VRAM option\n", __func__); } else { LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__); - vram_kv_cache += hparams.kv_size() / 2; + vram_kv_cache += hparams.kv_size_k(memory_type); } } #elif defined(GGML_USE_CLBLAST) @@ -2454,13 +2488,17 @@ static struct ggml_cgraph * llm_build_llama( GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; - const int64_t n_layer = hparams.n_layer; - const int64_t n_ctx = hparams.n_ctx; - const int64_t n_head = hparams.n_head; - const int64_t n_head_kv = hparams.n_head_kv; - const int64_t n_embd_head = hparams.n_embd_head(); - const int64_t n_embd_gqa = hparams.n_embd_gqa(); + const int64_t blck_size_k = ggml_blck_size(kv_self.k->type); + const int64_t blck_size_v = ggml_blck_size(kv_self.v->type); + + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_embd_head_padded = ((n_embd_head + blck_size_k - 1) / blck_size_k) * blck_size_k; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); GGML_ASSERT(n_embd_head == hparams.n_rot); @@ -2597,19 +2635,23 @@ static struct ggml_cgraph * llm_build_llama( offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d( + ctx0, kv_self.k, N*n_embd_head_padded*n_head_kv, + (ggml_element_size(kv_self.k)*n_embd_head_padded*n_head_kv)*(il*n_ctx + n_past)/blck_size_k); offload_func_kq(k); ggml_set_name(k, "k"); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); + const int64_t v_row_size = kv_self.v->type == GGML_TYPE_Q8_0 ? n_ctx + 128 : n_ctx; + struct ggml_tensor * v = ggml_view_blck_2d(ctx0, kv_self.v, N, n_embd_gqa, + ( v_row_size)*ggml_element_size(kv_self.v)/blck_size_v, + (il*v_row_size)*ggml_element_size(kv_self.v)*n_embd_gqa/blck_size_v + ggml_element_size(kv_self.v)*(n_past/blck_size_v), + n_past % blck_size_v); offload_func_v(v); ggml_set_name(v, "v"); // important: storing RoPE-ed version of K in the KV cache! 
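
The copy below goes through ggml_cpy_pad because each K row is rounded up to a whole number of quantization blocks before it is written into the cache, which is why the k and K view strides carry a /blck_size_k factor. As a standalone illustration (not part of the patch), the rounding arithmetic looks like the sketch below; the q8_0 constants (32 values and 34 bytes per block) and the head/layout numbers are assumptions chosen for the example.

    // standalone sketch: ceil-to-block rounding as used for n_embd_head_padded and the K row size
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t blck_size   = 32; // assumed q8_0 block size (QK8_0)
        const int64_t type_size   = 34; // assumed bytes per q8_0 block: 32 x int8 + one fp16 scale
        const int64_t n_embd_head = 80; // hypothetical head size that is not a multiple of 32
        const int64_t n_head_kv   = 8;  // hypothetical number of KV heads

        // round each head row up to a whole number of quantization blocks
        const int64_t n_embd_head_padded = ((n_embd_head + blck_size - 1) / blck_size) * blck_size;

        // bytes occupied by the K entries of one token across all KV heads
        const int64_t k_bytes_per_token = n_embd_head_padded*n_head_kv*type_size/blck_size;

        printf("padded head size: %lld elements, K cache: %lld bytes per token\n",
               (long long) n_embd_head_padded, (long long) k_bytes_per_token);
        return 0;
    }

For f16 or f32 caches the block size is 1, so the same expressions reduce to the unpadded layout used before this change.
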
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + ggml_build_forward_expand(gf, ggml_cpy_pad(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); } struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3); @@ -2618,10 +2660,10 @@ static struct ggml_cgraph * llm_build_llama( struct ggml_tensor * K = ggml_view_3d(ctx0, kv_self.k, - n_embd_head, n_past + N, n_head_kv, - ggml_element_size(kv_self.k)*n_embd_gqa, - ggml_element_size(kv_self.k)*n_embd_head, - ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il); + n_embd_head_padded, n_past + N, n_head_kv, + ggml_element_size(kv_self.k)*n_embd_head_padded*n_head_kv/blck_size_k, + ggml_element_size(kv_self.k)*n_embd_head_padded/blck_size_k, + ggml_element_size(kv_self.k)*n_embd_head_padded*n_head_kv*n_ctx*il/blck_size_k); offload_func_kq(K); ggml_set_name(K, "K"); @@ -2646,13 +2688,16 @@ static struct ggml_cgraph * llm_build_llama( offload_func_v(KQ_soft_max); ggml_set_name(KQ_soft_max, "KQ_soft_max"); + // split cached V into n_head heads + const int64_t v_ne0_padded = ((n_past + N + blck_size_v - 1) / blck_size_v) * blck_size_v; // ne0 padded to multiple of blck_size_v + const int64_t v_row_size = kv_self.v->type == GGML_TYPE_Q8_0 ? n_ctx + 128 : n_ctx; // maximum ne0 + space for temporarily storing unquantized values struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd_head, n_head_kv, - ggml_element_size(kv_self.v)*n_ctx, - ggml_element_size(kv_self.v)*n_ctx*n_embd_head, - ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il); + v_ne0_padded, n_embd_head, n_head_kv, + ggml_element_size(kv_self.v)*v_row_size/blck_size_v, + ggml_element_size(kv_self.v)*v_row_size*n_embd_head/blck_size_v, + ggml_element_size(kv_self.v)*v_row_size*n_embd_gqa*il/blck_size_v); offload_func_v(V); ggml_set_name(V, "V"); @@ -3744,9 +3789,29 @@ static bool llama_eval_internal( const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; + std::vector<llama_token> tokens_v_redo; + const int64_t v_blck_size = ggml_blck_size(kv_self.v->type); + const int64_t current_v_blck = n_past / v_blck_size; + + // if the v component of the KV cache is q8_0 the unquantized temporary values may have already been overwritten + // in that case we need to roll back to the beginning of a q8_0 block + const int64_t n_v_redo = lctx.previous_v_blck > current_v_blck ? n_past % v_blck_size : 0; + if (n_v_redo > 0) { + tokens_v_redo.insert(tokens_v_redo.end(), + lctx.token_history.begin() + n_past - n_v_redo, + lctx.token_history.begin() + n_past); + for (int64_t i = 0; i < n_tokens; ++i) { + tokens_v_redo.push_back(tokens[i]); + } + + n_tokens = tokens_v_redo.size(); + n_past -= n_v_redo; + } + const llama_token * tokens_eff = n_v_redo > 0 ? tokens_v_redo.data() : tokens; + ggml_allocr_reset(lctx.alloc); - ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past); + ggml_cgraph * gf = llama_build_graph(lctx, tokens_eff, embd, n_tokens, n_past); ggml_allocr_alloc_graph(lctx.alloc, gf); @@ -3773,7 +3838,7 @@ static bool llama_eval_internal( // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering // with the BLAS calls.
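
The tokens_v_redo logic above exists because a partially filled q8_0 block in the V cache keeps its unquantized scratch values only until a later eval starts the next block; when evaluation resumes from an earlier n_past, the tail of that block has to be recomputed. A minimal standalone sketch of that bookkeeping (not part of the patch), assuming a block size of 32 and using plain ints in place of llama_token:

    // standalone sketch: deciding how many cached tokens must be re-evaluated for a quantized V cache
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const int64_t v_blck_size = 32;           // assumed q8_0 block size
        std::vector<int> token_history(4096, 1);  // stand-in for lctx.token_history
        std::vector<int> tokens = {11, 22, 33};   // hypothetical new tokens for this eval

        int64_t n_past          = 70;  // resume 6 tokens into the third block (70 = 2*32 + 6)
        int64_t previous_v_blck = 3;   // the cache had been filled past block 2, e.g. before a context swap
        int64_t n_tokens        = (int64_t) tokens.size();

        // the scratch floats of the partially filled block are gone -> roll back to the block start
        const int64_t current_v_blck = n_past / v_blck_size;
        const int64_t n_v_redo = previous_v_blck > current_v_blck ? n_past % v_blck_size : 0;

        std::vector<int> tokens_v_redo;
        if (n_v_redo > 0) {
            tokens_v_redo.insert(tokens_v_redo.end(),
                                 token_history.begin() + n_past - n_v_redo,
                                 token_history.begin() + n_past);
            tokens_v_redo.insert(tokens_v_redo.end(), tokens.begin(), tokens.end());
            n_tokens = (int64_t) tokens_v_redo.size();
            n_past  -= n_v_redo;
        }

        printf("re-evaluating %lld cached tokens, batch of %lld starting at n_past=%lld\n",
               (long long) n_v_redo, (long long) n_tokens, (long long) n_past);
        return 0;
    }

This is also why the logits are read back with an extra n_vocab*n_v_redo offset further down: the first n_v_redo positions of the batch are re-evaluations of tokens whose logits the caller already received.
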
need a better solution - if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { + if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) { n_threads = std::min(4, n_threads); } @@ -3827,11 +3892,11 @@ static bool llama_eval_internal( if (lctx.logits_all) { logits_out.resize(n_vocab * N); - memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + n_vocab*n_v_redo, sizeof(float)*n_vocab*N); } else { // return result for just the last token logits_out.resize(n_vocab); - memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(logits_out.data(), (float *) ggml_get_data(res) + n_vocab*(n_v_redo+N-1), sizeof(float)*n_vocab); } } @@ -3843,6 +3908,12 @@ static bool llama_eval_internal( memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd); } + // update token history and how far the v component of the KV cache was filled (for q8_0 rollback) + for (int64_t i = 0; i < n_tokens; ++i) { + lctx.token_history[n_past + i] = tokens_eff[i]; + } + lctx.previous_v_blck = (n_past + n_tokens) / v_blck_size; + // measure the performance only for the single-token evals if (N == 1) { lctx.t_eval_us += ggml_time_us() - t_start_us; @@ -6192,9 +6263,9 @@ struct llama_context_params llama_context_default_params() { /*.rope_freq_scale =*/ 1.0f, /*.progress_callback =*/ nullptr, /*.progress_callback_user_data =*/ nullptr, + /*.kv_type =*/ GGML_TYPE_Q8_0, /*.low_vram =*/ false, /*.mul_mat_q =*/ true, - /*.f16_kv =*/ true, /*.logits_all =*/ false, /*.vocab_only =*/ false, /*.use_mmap =*/ true, @@ -6269,8 +6340,6 @@ struct llama_model * llama_load_model_from_file( llama_model * model = new llama_model; - ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32; - unsigned cur_percentage = 0; if (params.progress_callback == NULL) { params.progress_callback_user_data = &cur_percentage; @@ -6289,7 +6358,7 @@ struct llama_model * llama_load_model_from_file( if (!llama_model_load(path_model, *model, params.n_ctx, params.n_batch, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.mul_mat_q, params.rope_freq_base, params.rope_freq_scale, - params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, + params.low_vram, params.kv_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { LLAMA_LOG_ERROR("%s: failed to load model\n", __func__); delete model; @@ -6320,11 +6389,9 @@ struct llama_context * llama_new_context_with_model( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; - // reserve memory for context buffers if (!params.vocab_only) { - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { + if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, params.kv_type, ctx->model.hparams.n_ctx, params.n_gpu_layers)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -6337,6 +6404,9 @@ struct llama_context * llama_new_context_with_model( const auto & hparams = ctx->model.hparams; + ctx->token_history.resize(hparams.n_ctx); + ctx->previous_v_blck = 0; + // resized during inference if (params.logits_all) { ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab); diff --git a/llama.h b/llama.h index 369be048c0012..4a950dee5dfe5 100644 --- a/llama.h +++ b/llama.h @@ -140,10 +140,11 @@ extern "C" { // context pointer passed to the progress callback void * progress_callback_user_data; + enum ggml_type kv_type; // the type to use for the KV cache + // Keep the booleans together to avoid misalignment during copy-by-value. bool low_vram; // if true, reduce VRAM usage at the cost of performance bool mul_mat_q; // if true, use experimental mul_mat_q kernels - bool f16_kv; // use fp16 for KV cache bool logits_all; // the llama_eval() call computes all logits, not just the last one bool vocab_only; // only load the vocabulary, no weights bool use_mmap; // use mmap if possible diff --git a/run_with_preset.py b/run_with_preset.py index 9b4d7ecbe82d4..df416828ec122 100755 --- a/run_with_preset.py +++ b/run_with_preset.py @@ -11,9 +11,9 @@ "batch-size", "cfg-negative-prompt", "cfg-scale", "chunks", "color", "ctx-size", "escape", "export", "file", "frequency-penalty", "grammar", "grammar-file", "hellaswag", "hellaswag-tasks", "ignore-eos", "in-prefix", "in-prefix-bos", "in-suffix", "instruct", - "interactive", "interactive-first", "keep", "logdir", "logit-bias", "lora", "lora-base", - "low-vram", "main-gpu", "memory-f32", "mirostat", "mirostat-ent", "mirostat-lr", "mlock", - "model", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q", + "interactive", "interactive-first", "keep", "kv_type", "logdir", "logit-bias", "lora", + "lora-base", "low-vram", "main-gpu", "mirostat", "mirostat-ent", "mirostat-lr", "mlock", + "model", "mtest", "multiline-input", "n-gpu-layers", "n-predict", "no-mmap", "no-mul-mat-q", "np-penalize-nl", "numa", "ppl-output-type", "ppl-stride", "presence-penalty", "prompt", "prompt-cache", "prompt-cache-all", "prompt-cache-ro", "random-prompt", "repeat-last-n", "repeat-penalty", "reverse-prompt", "rope-freq-base", "rope-freq-scale", "rope-scale", "seed",