Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vulkan: Dynamic subgroup size support for Q6_K mat_vec #10536

Merged
merged 6 commits on Nov 30, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 15 additions & 15 deletions ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q6_k.comp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@

#include "mul_mat_vec_base.comp"

layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

shared FLOAT_TYPE tmp[32];
layout (constant_id = 0) const uint SUBGROUP_SIZE = 32;
netrunnereve marked this conversation as resolved.
Show resolved Hide resolved

shared FLOAT_TYPE tmp[SUBGROUP_SIZE];

void main() {
const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
Expand All @@ -21,21 +23,19 @@ void main() {
const uint num_blocks_per_row = p.ncols / QUANT_K;
const uint ib0 = a_offset / QUANT_K + row*num_blocks_per_row;

const uint tid = gl_LocalInvocationID.x/K_QUANTS_PER_ITERATION; // 0...31 or 0...16
const uint ix = gl_LocalInvocationID.x%K_QUANTS_PER_ITERATION; // 0 or 0, 1
uint it_size = gl_WorkGroupSize.x/16;

const uint tid = gl_LocalInvocationID.x;
const uint itid = tid/it_size; // 0...16
const uint ix = tid%it_size;
netrunnereve marked this conversation as resolved.
Show resolved Hide resolved

const uint step = 16/K_QUANTS_PER_ITERATION; // 16 or 8
const uint step = 8;

const uint v_im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = tid - step*v_im; // 0...15 or 0...7
const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const uint v_in = itid - step*v_im; // 0...15 or 0...7

#if K_QUANTS_PER_ITERATION == 1
const uint l0 = v_in; // 0...15
const uint is = 0;
#else
const uint l0 = 4 * v_in; // 0, 4, 8, ..., 28
const uint is = v_in / 4;
#endif

const uint ql_offset = 64*v_im + l0;
const uint qh_offset = 32*v_im + l0;
Expand All @@ -44,7 +44,7 @@ void main() {

FLOAT_TYPE temp = FLOAT_TYPE(0.0); // partial sum for thread in warp

[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
const uint y_idx = i * QUANT_K + y_offset;

const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);
Expand Down Expand Up @@ -95,10 +95,10 @@ void main() {
}

tmp[gl_LocalInvocationID.x] = temp;

// sum up partial sums and write back result

barrier();
[[unroll]] for (uint s = 16; s > 0; s >>= 1) {
[[unroll]] for (uint s = gl_WorkGroupSize.x/2; s > 0; s >>= 1) {
if (tid < s) {
tmp[tid] += tmp[tid + s];
}
Expand Down