Add environment variable GGML_KLEIDIAI_SME
chaxu01 committed Jan 30, 2025
1 parent 6adca19 commit 119d3bf
Showing 14 changed files with 38 additions and 48 deletions.
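Summary of the change: the KleidiAI CPU backend no longer infers whether to use the SME (Scalable Matrix Extension) micro-kernels from the thread count. Instead, SME is opted into explicitly through the new GGML_KLEIDIAI_SME environment variable, and the n_threads plumbing through llama_model_params, the model loader, and the extra-buffer-type APIs is removed. A minimal sketch of how the flag is interpreted follows; the helper name kleidiai_sme_requested() is illustrative and not part of the llama.cpp API.

// Minimal sketch (not llama.cpp API): init_kleidiai_context() reads the flag with
// getenv + atoi, so any non-zero value opts in, e.g. launching the process with
// GGML_KLEIDIAI_SME=1 in its environment.
#include <cstdlib>   // getenv, atoi

static bool kleidiai_sme_requested(void) {
    const char * env_var = getenv("GGML_KLEIDIAI_SME");
    return env_var != NULL && atoi(env_var) != 0;
}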
2 changes: 0 additions & 2 deletions common/common.cpp
@@ -1099,8 +1099,6 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.kv_overrides = params.kv_overrides.data();
}

- mparams.n_threads = params.cpuparams.n_threads;

return mparams;
}

2 changes: 1 addition & 1 deletion ggml/include/ggml-backend.h
@@ -189,7 +189,7 @@ extern "C" {
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
- typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device, int n_threads);
+ typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// Set the abort callback for the backend
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
6 changes: 3 additions & 3 deletions ggml/src/ggml-cpu/CMakeLists.txt
@@ -325,9 +325,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)

# Fetch KleidiAI sources:
include(FetchContent)
- set(KLEIDIAI_COMMIT_SHA "v1.2.0")
- set(KLEIDIAI_DOWNLOAD_URL "https://gitlab.arm.com/kleidi/kleidiai/-/archive/${KLEIDIAI_COMMIT_SHA}/kleidiai-${KLEIDIAI_COMMIT_SHA}.tar.gz")
- set(KLEIDIAI_ARCHIVE_MD5 "cebcb660079bf15626e7bdaecd18f49c")
+ set(KLEIDIAI_COMMIT_TAG "v1.2.0")
+ set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
+ set(KLEIDIAI_ARCHIVE_MD5 "6634fefce7357ecfee9eace2068bc68b")

if (POLICY CMP0135)
cmake_policy(SET CMP0135 NEW)
4 changes: 2 additions & 2 deletions ggml/src/ggml-cpu/ggml-cpu-traits.cpp
@@ -10,7 +10,7 @@ extra_buffer_type::~extra_buffer_type() {}
} // namespace ggml::cpu

bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * op) {
- for (auto extra : ggml_backend_cpu_get_extra_buffers_type(params->nth)) {
+ for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
if (extra && extra->context) {
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
auto tensor_traits = buf_extra->get_tensor_traits(op);
Expand All @@ -23,7 +23,7 @@ bool ggml_cpu_extra_compute_forward(struct ggml_compute_params * params, struct
}

bool ggml_cpu_extra_work_size(int n_threads, const struct ggml_tensor * op, size_t * size) {
- for (auto extra : ggml_backend_cpu_get_extra_buffers_type(n_threads)) {
+ for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
if (extra && extra->context) {
auto buf_extra = (ggml::cpu::extra_buffer_type *) extra->context;
auto tensor_traits = buf_extra->get_tensor_traits(op);
2 changes: 1 addition & 1 deletion ggml/src/ggml-cpu/ggml-cpu-traits.h
@@ -33,6 +33,6 @@ class extra_buffer_type {
} // namespace ggml::cpu

// implemented in ggml-cpu.cpp.
- std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type(int n_threads);
+ std::vector<ggml_backend_buffer_type_t> & ggml_backend_cpu_get_extra_buffers_type();

#endif
20 changes: 9 additions & 11 deletions ggml/src/ggml-cpu/ggml-cpu.cpp
@@ -33,8 +33,8 @@

// ggml-backend interface

- std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type(int n_threads) {
- static std::vector<ggml_backend_buffer_type_t> bufts = [n_threads]() {
+ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type() {
+ static std::vector<ggml_backend_buffer_type_t> bufts = []() {
std::vector<ggml_backend_buffer_type_t> bufts;

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
@@ -44,8 +44,8 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
#endif

#ifdef GGML_USE_CPU_KLEIDIAI
- if (ggml_backend_cpu_kleidiai_buffer_type(n_threads)) {
- bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type(n_threads));
+ if (ggml_backend_cpu_kleidiai_buffer_type()) {
+ bufts.push_back(ggml_backend_cpu_kleidiai_buffer_type());
}
#endif

@@ -58,21 +58,19 @@ std::vector<ggml_backend_buffer_type_t>& ggml_backend_cpu_get_extra_buffers_type
bufts.push_back(NULL);

return bufts;
-
- GGML_UNUSED(n_threads);
}();

return bufts;
}

- static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device, int n_threads) {
- return ggml_backend_cpu_get_extra_buffers_type(n_threads).data();
+ static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_type(ggml_backend_dev_t device) {
+ return ggml_backend_cpu_get_extra_buffers_type().data();

GGML_UNUSED(device);
}

static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
- for (auto extra : ggml_backend_cpu_get_extra_buffers_type(-1)) {
+ for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
if (extra && extra == buft) return true;
}
return false;
@@ -387,7 +385,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
}

// extra_buffer_op?
- for (auto extra : ggml_backend_cpu_get_extra_buffers_type(-1)) {
+ for (auto extra : ggml_backend_cpu_get_extra_buffers_type()) {
if (extra) {
auto buf_extra = (ggml::cpu::extra_buffer_type*) extra->context;
if (buf_extra && buf_extra->supports_op(dev, op)) {
@@ -577,7 +575,7 @@ static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t r
features.push_back({ "OPENMP", "1" });
#endif
#ifdef GGML_USE_CPU_KLEIDIAI
features.push_back({ "KLEIDIAI_REPACK", "1" });
features.push_back({ "KLEIDIAI", "1" });
#endif
#ifdef GGML_USE_CPU_AARCH64
features.push_back({ "AARCH64_REPACK", "1" });
26 changes: 14 additions & 12 deletions ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.cpp
@@ -34,25 +34,25 @@ struct ggml_kleidiai_context {
ggml_kleidiai_kernels * kernels;
} static ctx = { NULL };

- static void init_kleidiai_context(int n_threads) {
+ static void init_kleidiai_context(void) {
static bool initialized = false;

if (!initialized) {
- GGML_ASSERT(n_threads > 0);
-
initialized = true;
+ const char *env_var = getenv("GGML_KLEIDIAI_SME");
+ int sme_enabled = 0;

cpu_feature features = (ggml_cpu_has_dotprod() ? CPU_FEATURE_DOTPROD : CPU_FEATURE_NONE) |
(ggml_cpu_has_matmul_int8() ? CPU_FEATURE_I8MM : CPU_FEATURE_NONE) |
(ggml_cpu_has_sve() ? CPU_FEATURE_SVE : CPU_FEATURE_NONE);

- #if defined(__APPLE__)
- if (n_threads == 1) {
+ if (env_var) {
+ sme_enabled = atoi(env_var);
+ }
+
+ if (sme_enabled != 0) {
features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
}
- #else
- features |= ggml_cpu_has_sme() ? CPU_FEATURE_SME : CPU_FEATURE_NONE;
- #endif
ctx.kernels = ggml_kleidiai_select_kernels(features);
}
}
@@ -162,6 +162,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
ctx.kernels->rhs_info.pack_func(1, n, k, nr, kr, sr, k_q4_0_block_size, (const uint8_t *)data, NULL, tensor->data, 0, &params);

return 0;
+
+ GGML_UNUSED(data_size);
}
};

@@ -223,7 +225,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {
op->src[0]->type == GGML_TYPE_Q4_0 &&
op->src[0]->buffer &&
(ggml_n_dims(op->src[0]) == 2) &&
- op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type(-1) && ctx.kernels
+ op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type() && ctx.kernels
) {
if (op->src[1]->buffer && !ggml_backend_buft_is_host(op->src[1]->buffer->buft)) {
return false;
@@ -237,7 +239,7 @@ class extra_buffer_type : ggml::cpu::extra_buffer_type {

ggml::cpu::tensor_traits * get_tensor_traits(const struct ggml_tensor * op) override {
if (op->op == GGML_OP_MUL_MAT) {
- if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type(-1)) {
+ if (op->src[0]->buffer && op->src[0]->buffer->buft == ggml_backend_cpu_kleidiai_buffer_type()) {
return (ggml::cpu::tensor_traits *) op->src[0]->extra;
}
}
@@ -246,7 +248,7 @@
};
} // namespace ggml::cpu::kleidiai

- ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(int n_threads) {
+ ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void) {
static ggml::cpu::kleidiai::extra_buffer_type ctx;
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_kleidiai = {
/* .iface = */ {
@@ -261,7 +263,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(int n_threads)
/* .context = */ &ctx,
};

- init_kleidiai_context(n_threads);
+ init_kleidiai_context();

return &ggml_backend_cpu_buffer_type_kleidiai;
}
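Note on the shape of the refactor above: ggml_backend_cpu_kleidiai_buffer_type() is now parameter-free, and init_kleidiai_context() runs lazily on the first call, deciding once whether to add the SME feature bit from the environment. A condensed, self-contained sketch of that pattern follows; FEATURE_SME and select_kernels() are placeholders, not the real cpu_feature / ggml_kleidiai_select_kernels() machinery.

#include <cstdlib>   // getenv, atoi

// Placeholder feature bits and selector, standing in for the real backend types.
enum { FEATURE_BASE = 1 << 0, FEATURE_SME = 1 << 1 };
static int select_kernels(int features) { return features; }

static int init_context_once(void) {
    // The initializer of a function-local static runs exactly once, on first use,
    // analogous to the `initialized` flag guarding init_kleidiai_context() above.
    static const int kernels = []() {
        int features = FEATURE_BASE;
        const char * env_var = getenv("GGML_KLEIDIAI_SME");
        if (env_var && atoi(env_var) != 0) {
            features |= FEATURE_SME;   // SME is opt-in via the environment variable
        }
        return select_kernels(features);
    }();
    return kernels;
}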
2 changes: 1 addition & 1 deletion ggml/src/ggml-cpu/ggml-kleidiai/ggml-kleidiai.h
@@ -11,7 +11,7 @@
extern "C" {
#endif

- ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(int n_threads);
+ ggml_backend_buffer_type_t ggml_backend_cpu_kleidiai_buffer_type(void);

#ifdef __cplusplus
}
2 changes: 0 additions & 2 deletions include/llama.h
@@ -304,8 +304,6 @@ extern "C" {
bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM
bool check_tensors; // validate model tensor data
-
- int n_threads;
};

// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
4 changes: 1 addition & 3 deletions src/llama-model-loader.cpp
@@ -445,8 +445,7 @@ llama_model_loader::llama_model_loader(
std::vector<std::string> & splits,
bool use_mmap,
bool check_tensors,
- const struct llama_model_kv_override * param_overrides_p,
- int n_threads) {
+ const struct llama_model_kv_override * param_overrides_p) {
int trace = 0;
if (getenv("LLAMA_TRACE")) {
trace = atoi(getenv("LLAMA_TRACE"));
@@ -684,7 +683,6 @@

this->use_mmap = use_mmap;
this->check_tensors = check_tensors;
- this->n_threads = n_threads;
}

std::string llama_model_loader::get_arch_name() const {
5 changes: 1 addition & 4 deletions src/llama-model-loader.h
@@ -77,8 +77,6 @@ struct llama_model_loader {

llama_mmaps mappings;

- int n_threads;

std::map<std::string, struct llama_tensor_weight, weight_name_comparer> weights_map;
std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

@@ -97,8 +95,7 @@
std::vector<std::string> & splits, // optional, only need if the split does not follow naming scheme
bool use_mmap,
bool check_tensors,
- const struct llama_model_kv_override * param_overrides_p,
- int n_threads);
+ const struct llama_model_kv_override * param_overrides_p);

template<typename T>
typename std::enable_if<std::is_integral<T>::value, bool>::type
7 changes: 3 additions & 4 deletions src/llama-model.cpp
@@ -247,7 +247,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
}

// CPU: ACCEL -> CPU extra -> GPU host -> CPU
- static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, int n_threads) {
+ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
buft_list_t buft_list;

// add ACCEL buffer types
@@ -268,7 +268,7 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
if (ggml_backend_dev_get_extra_bufts_fn) {
- ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev, n_threads);
+ ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
while (extra_bufts && *extra_bufts) {
buft_list.emplace_back(cpu_dev, *extra_bufts);
++extra_bufts;
@@ -1264,7 +1264,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
const bool use_mmap_buffer = true;

// build a list of buffer types for the CPU and GPU devices
- pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.n_threads);
+ pimpl->cpu_buft_list = make_cpu_buft_list(devices);
for (auto * dev : devices) {
buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
// add CPU buffer types as a fallback
@@ -3768,7 +3768,6 @@ struct llama_model_params llama_model_default_params() {
/*.use_mmap =*/ true,
/*.use_mlock =*/ false,
/*.check_tensors =*/ false,
- /*.n_threads =*/ GGML_DEFAULT_N_THREADS,
};

#ifdef GGML_USE_METAL
2 changes: 1 addition & 1 deletion src/llama-quant.cpp
@@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
}

std::vector<std::string> splits = {};
- llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nthread);
+ llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides);
ml.init_mappings(false); // no prefetching

llama_model model(llama_model_default_params());
2 changes: 1 addition & 1 deletion src/llama.cpp
@@ -40,7 +40,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
model.t_start_us = tm.t_start_us;

try {
- llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.n_threads);
+ llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides);

ml.print_info();

