From ec581de73f754340c1363b9fcd47aea6f929dd48 Mon Sep 17 00:00:00 2001 From: leo-pony Date: Fri, 20 Dec 2024 07:15:11 +0000 Subject: [PATCH] Add ggml Ascend CANN backend compile and Ascend GPU detect --- Makefile | 3 + discover/ascend_linux.go | 101 + discover/gpu.go | 50 +- discover/gpu_info.h | 1 + discover/gpu_info_ascend.c | 178 + discover/gpu_info_ascend.h | 78 + llama/Makefile.detect | 27 + llama/ggml-cann.h | 149 + llama/ggml-cann/acl_tensor.cpp | 201 + llama/ggml-cann/acl_tensor.h | 284 ++ llama/ggml-cann/aclnn_ops.cpp | 3453 +++++++++++++++++ llama/ggml-cann/aclnn_ops.h | 618 +++ llama/ggml-cann/common.h | 312 ++ llama/ggml-cann/ggml-cann.cpp | 2214 +++++++++++ llama/ggml-cann/kernels/CMakeLists.txt | 37 + llama/ggml-cann/kernels/ascendc_kernels.h | 45 + llama/ggml-cann/kernels/dup.cpp | 262 ++ llama/ggml-cann/kernels/get_row_f16.cpp | 223 ++ llama/ggml-cann/kernels/get_row_f32.cpp | 216 ++ llama/ggml-cann/kernels/get_row_q4_0.cpp | 230 ++ llama/ggml-cann/kernels/get_row_q8_0.cpp | 217 ++ llama/ggml-cann/kernels/quantize_f16_q8_0.cpp | 244 ++ llama/ggml-cann/kernels/quantize_f32_q8_0.cpp | 242 ++ .../kernels/quantize_float_to_q4_0.cpp | 321 ++ llama/llama.go | 4 + make/Makefile.cann | 115 + make/Makefile.sync | 12 +- make/gpu.make | 26 +- 28 files changed, 9850 insertions(+), 13 deletions(-) create mode 100644 discover/ascend_linux.go create mode 100644 discover/gpu_info_ascend.c create mode 100644 discover/gpu_info_ascend.h create mode 100644 llama/Makefile.detect create mode 100644 llama/ggml-cann.h create mode 100644 llama/ggml-cann/acl_tensor.cpp create mode 100644 llama/ggml-cann/acl_tensor.h create mode 100644 llama/ggml-cann/aclnn_ops.cpp create mode 100644 llama/ggml-cann/aclnn_ops.h create mode 100644 llama/ggml-cann/common.h create mode 100644 llama/ggml-cann/ggml-cann.cpp create mode 100644 llama/ggml-cann/kernels/CMakeLists.txt create mode 100644 llama/ggml-cann/kernels/ascendc_kernels.h create mode 100644 llama/ggml-cann/kernels/dup.cpp create mode 100644 llama/ggml-cann/kernels/get_row_f16.cpp create mode 100644 llama/ggml-cann/kernels/get_row_f32.cpp create mode 100644 llama/ggml-cann/kernels/get_row_q4_0.cpp create mode 100644 llama/ggml-cann/kernels/get_row_q8_0.cpp create mode 100644 llama/ggml-cann/kernels/quantize_f16_q8_0.cpp create mode 100644 llama/ggml-cann/kernels/quantize_f32_q8_0.cpp create mode 100644 llama/ggml-cann/kernels/quantize_float_to_q4_0.cpp create mode 100644 make/Makefile.cann diff --git a/Makefile b/Makefile index 383354ee907..bafb80fdcd0 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,9 @@ ifneq ($(HIP_COMPILER),) endif endif +ifeq ($(OLLAMA_SKIP_CANN_GENERATE),) + RUNNER_TARGETS += cann +endif all: runners exe diff --git a/discover/ascend_linux.go b/discover/ascend_linux.go new file mode 100644 index 00000000000..02fd27b5162 --- /dev/null +++ b/discover/ascend_linux.go @@ -0,0 +1,101 @@ +//go:build linux + +package discover + +/* +#cgo linux LDFLAGS: -lrt -lpthread -ldl -lstdc++ -lm +#cgo windows LDFLAGS: -lpthread + +#include "gpu_info_ascend.h" + +*/ +import "C" +import ( + "fmt" + "log/slog" + "os" + "path/filepath" + "strings" + "unsafe" +) + +var AscendLinuxGlobs = []string{ + "/usr/local/Ascend/latest/aarch64-linux/lib64/libascendcl.so*", + "/usr/local/Ascend/ascend-toolkit/latest/aarch64-linux/lib64/libascendcl.so*", +} + +var AscendMgmtName = "libascendcl.so" + +var ( + ascendLibPath string +) + +type ascendHandles struct { + ascend *C.ascend_handle_t + deviceCount int +} + +type AscendGPUInfo struct { + GpuInfo + index int 
//nolint:unused,nolintlint +} +type AscendGPUInfoList []AscendGPUInfo + +func initAscendHandles() *ascendHandles { + aHandles := &ascendHandles{} + + // Short Circuit if we already know which library to use + if ascendLibPath != "" { + aHandles.deviceCount, aHandles.ascend, _ = LoadAscendMgmt([]string{ascendLibPath}) + return aHandles + } + + ascendToolkitHome := os.Getenv("ASCEND_TOOLKIT_HOME") + if ascendToolkitHome != "" { + AscendLinuxGlobs = append(AscendLinuxGlobs, filepath.Join(ascendToolkitHome, "/lib64/libascendcl.so*")) + } + + ascendLibPaths := FindGPULibs(AscendMgmtName, AscendLinuxGlobs) + if len(ascendLibPaths) > 0 { + deviceCount, ascend, libPath := LoadAscendMgmt(ascendLibPaths) + if ascend != nil { + slog.Debug("detected GPUs", "count", deviceCount, "library", libPath) + aHandles.ascend = ascend + aHandles.deviceCount = deviceCount + ascendLibPath = libPath + return aHandles + } + } + + return aHandles +} + +func LoadAscendMgmt(ascendLibPath []string) (int, *C.ascend_handle_t, string) { + var resp C.ascend_init_resp_t + resp.ah.verbose = getVerboseState() + for _, libPath := range ascendLibPath { + lib := C.CString(libPath) + defer C.free(unsafe.Pointer(lib)) + C.ascend_init(lib, &resp) + if resp.err != nil { + slog.Debug(fmt.Sprintf("Unable to load ascend management library %s: %s", libPath, C.GoString(resp.err))) + C.free(unsafe.Pointer(resp.err)) + } else { + return int(resp.num_devices), &resp.ah, libPath + } + } + return 0, nil, "" +} + +func ascendGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) { + ids := []string{} + for _, info := range gpuInfo { + if info.Library != "ascend" { + // TODO shouldn't happen if things are wired correctly... + slog.Debug("ascendGetVisibleDevicesEnv skipping over non-ascend device", "library", info.Library) + continue + } + ids = append(ids, info.ID) + } + return "ASCEND_RT_VISIBLE_DEVICES", strings.Join(ids, ",") +} diff --git a/discover/gpu.go b/discover/gpu.go index e76c844fef2..760b92dc59c 100644 --- a/discover/gpu.go +++ b/discover/gpu.go @@ -55,6 +55,7 @@ var ( nvmlLibPath string rocmGPUs []RocmGPUInfo oneapiGPUs []OneapiGPUInfo + ascendGPUs []AscendGPUInfo // If any discovered GPUs are incompatible, report why unsupportedGPUs []UnsupportedGPUInfo @@ -202,6 +203,7 @@ func GetGPUInfo() GpuInfoList { needRefresh := true var cHandles *cudaHandles var oHandles *oneapiHandles + var aHandles *ascendHandles defer func() { if cHandles != nil { if cHandles.cudart != nil { @@ -220,6 +222,9 @@ func GetGPUInfo() GpuInfoList { C.oneapi_release(*oHandles.oneapi) } } + if aHandles.ascend != nil { + C.ascend_release(*aHandles.ascend) + } }() if !bootstrapped { @@ -387,8 +392,31 @@ func GetGPUInfo() GpuInfoList { if err != nil { bootstrapErrors = append(bootstrapErrors, err) } + + // Then Ascend + aHandles = initAscendHandles() + + for i := range aHandles.deviceCount { + if aHandles.ascend != nil { + gpuInfo := AscendGPUInfo{ + GpuInfo: GpuInfo{ + Library: "ascend", + }, + index: i, + } + C.ascend_bootstrap(*aHandles.ascend, C.int(i), &memInfo) + gpuInfo.TotalMemory = uint64(memInfo.total) + gpuInfo.FreeMemory = uint64(memInfo.free) + gpuInfo.ID = C.GoString(&memInfo.gpu_id[0]) + gpuInfo.Name = C.GoString(&memInfo.gpu_name[0]) + ascendGPUs = append(ascendGPUs, gpuInfo) + + slog.Info(fmt.Sprintf("[%d] Name:%s: FreeMemory:%d, TotalMemory: %d", gpuInfo.ID, gpuInfo.Name, gpuInfo.FreeMemory, gpuInfo.TotalMemory)) + } + } + bootstrapped = true - if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 { + if len(cudaGPUs) == 0 && 
len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 && len(ascendGPUs) == 0 { slog.Info("no compatible GPUs were discovered") } @@ -492,6 +520,19 @@ func GetGPUInfo() GpuInfoList { if err != nil { slog.Debug("problem refreshing ROCm free memory", "error", err) } + if aHandles == nil && len(ascendGPUs) > 0 { + aHandles = initAscendHandles() + } + + for i, gpu := range ascendGPUs { + if aHandles.ascend == nil { + // shouldn't happen + slog.Warn("nil ascend handle with device count", "count", aHandles.deviceCount) + continue + } + C.ascend_bootstrap(*aHandles.ascend, C.int(gpu.index), &memInfo) + ascendGPUs[i].FreeMemory = uint64(memInfo.free) + } } resp := []GpuInfo{} @@ -504,6 +545,9 @@ func GetGPUInfo() GpuInfoList { for _, gpu := range oneapiGPUs { resp = append(resp, gpu.GpuInfo) } + for _, gpu := range ascendGPUs { + resp = append(resp, gpu.GpuInfo) + } if len(resp) == 0 { resp = append(resp, cpus[0].GpuInfo) } @@ -558,6 +602,8 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string { var err error for ; err == nil; tmp, err = os.Readlink(libPath) { if !filepath.IsAbs(tmp) { + // Resolve possible Symlinks in libPath + libPath, _ = filepath.EvalSymlinks(libPath) tmp = filepath.Join(filepath.Dir(libPath), tmp) } libPath = tmp @@ -710,6 +756,8 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) { return rocmGetVisibleDevicesEnv(l) case "oneapi": return oneapiGetVisibleDevicesEnv(l) + case "ascend": + return ascendGetVisibleDevicesEnv(l) default: slog.Debug("no filter required for library " + l[0].Library) return "", "" diff --git a/discover/gpu_info.h b/discover/gpu_info.h index 094b791a885..cfefad3810c 100644 --- a/discover/gpu_info.h +++ b/discover/gpu_info.h @@ -65,6 +65,7 @@ void cpu_check_ram(mem_info_t *resp); #include "gpu_info_nvcuda.h" #include "gpu_info_nvml.h" #include "gpu_info_oneapi.h" +#include "gpu_info_ascend.h" #endif // __GPU_INFO_H__ #endif // __APPLE__ diff --git a/discover/gpu_info_ascend.c b/discover/gpu_info_ascend.c new file mode 100644 index 00000000000..63967f13927 --- /dev/null +++ b/discover/gpu_info_ascend.c @@ -0,0 +1,178 @@ +#ifndef __APPLE__ + +#include "gpu_info_ascend.h" + +#include + + +void ascend_init(char *ascend_lib_path, ascend_init_resp_t *resp) +{ + aclError ret = -1; + resp->err = NULL; + resp->num_devices = 0; + const int buflen = 256; + char buf[buflen + 1]; + int i; + struct lookup + { + char *s; + void **p; + } l[] = { + {"aclInit", (void *)&resp->ah.aclInit}, + {"aclFinalize", (void *)&resp->ah.aclFinalize}, + {"aclrtSetDevice", (void *)&resp->ah.aclrtSetDevice}, + {"aclrtResetDevice", (void *)&resp->ah.aclrtResetDevice}, + {"aclrtGetVersion", (void *)&resp->ah.aclrtGetVersion}, + {"aclrtGetDeviceCount", (void *)&resp->ah.aclrtGetDeviceCount}, + {"aclrtQueryDeviceStatus", (void *)&resp->ah.aclrtQueryDeviceStatus}, + {"aclrtGetMemInfo", (void *)&resp->ah.aclrtGetMemInfo}, + {"aclrtGetSocName", (void *)&resp->ah.aclrtGetSocName}, + {"aclGetRecentErrMsg", (void *)&resp->ah.aclGetRecentErrMsg}, + {NULL, NULL}, + }; + + resp->ah.handle = LOAD_LIBRARY(ascend_lib_path, RTLD_LAZY); + if (!resp->ah.handle) { + char *msg = LOAD_ERR(); + LOG(resp->ah.verbose, "library %s load err: %s\n", ascend_lib_path, msg); + snprintf(buf, buflen, + "Unable to load %s library to query for ascend GPUs: %s", + ascend_lib_path, msg); + free(msg); + resp->err = strdup(buf); + return; + } + + for (i = 0; l[i].s != NULL; i++) { + // TODO once we've squashed the remaining corner cases remove this log + LOG(resp->ah.verbose, "dlsym: %s\n", l[i].s); + + 
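+    // Look up each ACL entry point by name; a single missing symbol aborts
+    // initialization with resp->err set, so callers never receive a partially
+    // wired library handle.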
*l[i].p = LOAD_SYMBOL(resp->ah.handle, l[i].s); + if (!*(l[i].p)) { + resp->ah.handle = NULL; + char *msg = LOAD_ERR(); + LOG(resp->ah.verbose, "dlerr: %s\n", msg); + UNLOAD_LIBRARY(resp->ah.handle); + snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg); + free(msg); + resp->err = strdup(buf); + return; + } + } + LOG(resp->ah.verbose, "calling aclInit\n"); + ret = (*resp->ah.aclInit)(NULL); + // A process must only call the aclInit function once. + // If the init function is called, the error will be ignored. + if (ret != ACL_SUCCESS && ret != ACL_ERROR_REPEAT_INITIALIZE ) { + LOG(resp->ah.verbose, "aclInit err: %d\n", ret); + UNLOAD_LIBRARY(resp->ah.handle); + resp->ah.handle = NULL; + snprintf(buf, buflen, "ascend init failure: %d", ret); + resp->err = strdup(buf); + return; + } + + int32_t majorVersion; + int32_t minorVersion; + int32_t patchVersion; + resp->ah.driver_major = 0; + resp->ah.driver_minor = 0; + + // Report driver version if we're in verbose mode, ignore errors + ret = (*resp->ah.aclrtGetVersion)(&majorVersion, &minorVersion, &patchVersion); + if (ret != ACL_SUCCESS) { + LOG(resp->ah.verbose, "aclrtGetVersion failed: %d\n", ret); + } else { + resp->ah.driver_major = majorVersion; + resp->ah.driver_minor = minorVersion; + LOG(resp->ah.verbose, "ascend driver version: %d.%d\n", resp->ah.driver_major, resp->ah.driver_minor); + } + + ret = (*resp->ah.aclrtGetDeviceCount)(&resp->num_devices); + if (ret != ACL_SUCCESS) { + LOG(resp->ah.verbose, "aclrtGetDeviceCount err: %d\n", ret); + UNLOAD_LIBRARY(resp->ah.handle); + resp->ah.handle = NULL; + snprintf(buf, buflen, "unable to get device count: %d", ret); + resp->err = strdup(buf); + return; + } +} + +void ascend_bootstrap(ascend_handle_t h, int device_id, mem_info_t *resp) { + resp->err = NULL; + aclError aclRet; + const int buflen = 256; + char buf[buflen + 1]; + + if (h.handle == NULL) { + resp->err = strdup("ascend handle isn't initialized"); + return; + } + + snprintf(&resp->gpu_id[0], GPU_ID_LEN, "%d", device_id); + + aclRet = (*h.aclrtSetDevice)(device_id); + if (aclRet != ACL_SUCCESS) { + snprintf(buf, buflen, "ascend device failed to set: %u\n", device_id); + resp->err = strdup(buf); + return; + } + + aclrtDeviceStatus device_status; + aclRet = (*h.aclrtQueryDeviceStatus)(device_id, &device_status); + if (aclRet != ACL_SUCCESS) { + printf("aclrtQueryDeviceStatus %u fail with %d.\n", device_id, aclRet); + (*h.aclFinalize)(); + return; + } + + if (device_status != ACL_RT_DEVICE_STATUS_NORMAL) { + printf("invalid device %u status: %d", device_id, device_status); + (*h.aclFinalize)(); + return; + } + + const char *soc_version = (*h.aclrtGetSocName)(); + char soc_name[11] = {0}; + strncpy(soc_name, soc_version, 10); + snprintf(&resp->gpu_name[0], GPU_NAME_LEN, "%s", soc_name); + + size_t free = 0; + size_t total = 0; + aclRet = (*h.aclrtGetMemInfo)(ACL_DDR_MEM, &free, &total); + if (aclRet != ACL_SUCCESS) { + printf("aclrtGetMemInfo to DDR failed: %u\n", device_id); + return; + } + resp->free += free; + resp->total += total; + + aclRet = (*h.aclrtGetMemInfo)(ACL_HBM_MEM, &free, &total); + if (aclRet != ACL_SUCCESS) { + printf("aclrtGetMemInfo to HBM failed: %u\n", device_id); + return; + } + resp->free += free; + resp->total += total; + + aclRet = (*h.aclrtResetDevice)(device_id); + if (aclRet != ACL_SUCCESS) { + printf("aclrtResetDevice failed: %u\n", device_id); + return; + } +} + +void ascend_release(ascend_handle_t h) { + int d; + LOG(h.verbose, "releasing ascned library\n"); + aclError ret; + ret = 
(*h.aclFinalize)(); + if (ret != ACL_SUCCESS && ret != ACL_ERROR_REPEAT_FINALIZE) { + LOG(1, "error during aclFinalize %d", ret); + } + UNLOAD_LIBRARY(h.handle); + h.handle = NULL; +} + +#endif // __APPLE__ \ No newline at end of file diff --git a/discover/gpu_info_ascend.h b/discover/gpu_info_ascend.h new file mode 100644 index 00000000000..012ebd9bd04 --- /dev/null +++ b/discover/gpu_info_ascend.h @@ -0,0 +1,78 @@ +#ifndef __APPLE__ +#ifndef __GPU_INFO_ASCEND_H__ +#define __GPU_INFO_ASCEND_H__ +#include "gpu_info.h" + +typedef int aclError; + +typedef struct aclrtUtilizationExtendInfo aclrtUtilizationExtendInfo; + +typedef struct aclrtUtilizationInfo { + int32_t cubeUtilization; + int32_t vectorUtilization; + int32_t aicpuUtilization; + int32_t memoryUtilization; + aclrtUtilizationExtendInfo *utilizationExtend; +} aclrtUtilizationInfo; + +typedef enum aclrtMemAttr { + ACL_DDR_MEM, + ACL_HBM_MEM, + ACL_DDR_MEM_HUGE, + ACL_DDR_MEM_NORMAL, + ACL_HBM_MEM_HUGE, + ACL_HBM_MEM_NORMAL, + ACL_DDR_MEM_P2P_HUGE, + ACL_DDR_MEM_P2P_NORMAL, + ACL_HBM_MEM_P2P_HUGE, + ACL_HBM_MEM_P2P_NORMAL, +} aclrtMemAttr; + +typedef enum aclrtDeviceStatus { + ACL_RT_DEVICE_STATUS_NORMAL = 0, + ACL_RT_DEVICE_STATUS_ABNORMAL, + ACL_RT_DEVICE_STATUS_END = 0xFFFF, +} aclrtDeviceStatus; + +// Just enough typedef's to dlopen/dlsym for memory information +typedef enum ascendError_enum { + ACL_SUCCESS = 0, + ACL_ERROR_REPEAT_INITIALIZE = 100002, + ACL_ERROR_REPEAT_FINALIZE = 100037, + // Other values omitted for now... +} ACLresult; + +typedef struct ascend_handle +{ + void *handle; + uint16_t verbose; + + int driver_major; + int driver_minor; + + aclError (*aclInit)(char *configPath); + aclError (*aclFinalize)(void); + aclError (*aclrtSetDevice)(int32_t deviceId); + aclError (*aclrtResetDevice)(int32_t deviceId); + aclError (*aclrtGetVersion)(int32_t *majorVersion, int32_t *minorVersion, int32_t *patchVersion); + aclError (*aclrtGetDeviceCount)(uint32_t *count); + aclError (*aclrtQueryDeviceStatus)(int32_t deviceId, aclrtDeviceStatus *deviceStatus); + aclError (*aclrtGetMemInfo)(aclrtMemAttr attr, size_t *free, size_t *total); + + const char *(*aclrtGetSocName)(void); + const char *(*aclGetRecentErrMsg)(void); +} ascend_handle_t; + +typedef struct ascend_init_resp +{ + char *err; // If err is non-null handle is invalid + int num_devices; + ascend_handle_t ah; +} ascend_init_resp_t; + +void ascend_init(char *ascend_lib_path, ascend_init_resp_t *resp); +void ascend_bootstrap(ascend_handle_t h, int device_id, mem_info_t *resp); +void ascend_release(ascend_handle_t h); + +#endif // __GPU_INFO_ASCEND_H__ +#endif // __APPLE__ diff --git a/llama/Makefile.detect b/llama/Makefile.detect new file mode 100644 index 00000000000..cc322d3a4bd --- /dev/null +++ b/llama/Makefile.detect @@ -0,0 +1,27 @@ +SOC_VERSION := +SOC_TYPE := + +# Function to detect Ascend SOC type +detect_ascend_soc_type = $(shell npu-smi info | awk -F' ' 'NF > 0 && NR==7 {print $$3}') +$(info CANN detect_ascend_soc_type auto-detected is $(detect_ascend_soc_type)) + +ifeq ($(SOC_TYPE),) + SOC_VERSION := $(call detect_ascend_soc_type) + ifeq ($(SOC_VERSION),) + $(error Auto-detect ascend soc type failed, please specify manually or check ascend device working normally.) + endif + SOC_TYPE := $(SOC_VERSION) + $(info CANN SOC_VERSION auto-detected is $(SOC_VERSION)) +endif + +SOC_VERSION := $(shell echo $(SOC_TYPE) | tr '[:upper:]' '[:lower:]') + +# Construct Soc specify compile option: ASCEND_SOC_MAJOR_SN, Such as ASCEND_910B, ASCEND_310P. 
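+# Illustrative example (hypothetical SOC_TYPE value, assuming an Ascend 910B part):
+#   SOC_TYPE "Ascend910B3" becomes SOC_VERSION "ascend910b3" after lower-casing,
+#   the grep below extracts the major serial number "910b", and the final
+#   compile option becomes ASCEND_910B after the ASCEND_ prefix and upper-casing.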
+SOC_TYPE_MAJOR_SN := $(shell echo $(SOC_VERSION) | grep -o [0-9][0-9][0-9][a-zA-Z]*) +SOC_TYPE_COMPILE_OPTION := ASCEND_$(SOC_TYPE_MAJOR_SN) +SOC_TYPE_COMPILE_OPTION := $(shell echo $(SOC_TYPE_COMPILE_OPTION) | tr '[:lower:]' '[:upper:]') + +detect-ascend: + @echo "SOC_VERSION: $(SOC_VERSION)" + @echo "SOC_TYPE_MAJOR_SN: $(SOC_TYPE_MAJOR_SN)" + @echo "SOC_TYPE_COMPILE_OPTION: $(SOC_TYPE_COMPILE_OPTION)" diff --git a/llama/ggml-cann.h b/llama/ggml-cann.h new file mode 100644 index 00000000000..0a0febb0465 --- /dev/null +++ b/llama/ggml-cann.h @@ -0,0 +1,149 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#pragma once + +#include "ggml-backend.h" +#include "ggml.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief Maximum number of CANN devices supported. + */ +#define GGML_CANN_MAX_DEVICES 16 + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void); + +/** + * @brief Initializes the CANN backend for a specified device. + * + * This function initializes the CANN backend for the given device. + * It verifies the device index, allocates a context, and creates a backend + * instance. + * + * @param device The index of the device to initialize. 
+ * @return A pointer to the initialized backend instance, or nullptr on failure. + */ +GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device); + +/** + * @brief Checks if a given backend is a CANN backend. + * + * This function verifies if the provided backend is a CANN backend by comparing + * its GUID with the CANN backend's GUID. + * + * @param backend The backend instance to check. + * @return True if the backend is a CANN backend, false otherwise. + */ +GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend); + +/** + * @brief Retrieves the CANN buffer type for a specified device. + * + * This function initializes and returns the buffer type interface associated + * with the given device. It ensures thread-safe access using a mutex. + * + * @param device The device index for which to retrieve the buffer type. + * @return A pointer to the buffer type interface for the specified device, or + * nullptr if the device index is out of range. + */ +GGML_BACKEND_API ggml_backend_buffer_type_t +ggml_backend_cann_buffer_type(int32_t device); + +/** + * @brief Retrieves the number of CANN devices available. + * + * This function returns the number of CANN devices available based on + * information obtained from `ggml_cann_info()`. + * + * @return The number of CANN devices available. + */ +GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void); + +/** + * @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU. + * + * @return A pointer to the host buffer type interface. + */ +GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void); + +/** + * @brief Retrieves the description of a specific CANN device. + * + * This function sets the specified device, retrieves the SoC name, + * and writes it into the provided description buffer. + * + * @param device The device index to retrieve the description for. + * @param description Pointer to a buffer where the description will be written. + * @param description_size Size of the description buffer. + */ +GGML_BACKEND_API void ggml_backend_cann_get_device_description( + int32_t device, char* description, size_t description_size); + +/** + * @brief Retrieves the memory information of a specific CANN device. + * + * This function sets the specified device, retrieves the free and total + * memory information of the specified type (ACL_HBM_MEM), and stores them + * in the provided pointers. + * + * @param device The device index to retrieve memory information for. + * @param free Pointer to a variable where the free memory size will be stored. + * @param total Pointer to a variable where the total memory size will be + * stored. 
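+ *
+ * Minimal usage sketch (not part of the upstream documentation; the device
+ * index 0 is illustrative only):
+ * @code
+ * size_t free = 0, total = 0;
+ * ggml_backend_cann_get_device_memory(0, &free, &total);
+ * @endcode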
+ */ +GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device, + size_t* free, + size_t* total); + +#ifdef __cplusplus +} +#endif diff --git a/llama/ggml-cann/acl_tensor.cpp b/llama/ggml-cann/acl_tensor.cpp new file mode 100644 index 00000000000..8e74199a1b5 --- /dev/null +++ b/llama/ggml-cann/acl_tensor.cpp @@ -0,0 +1,201 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "acl_tensor.h" + +#include +#include + +aclDataType ggml_cann_type_mapping(ggml_type type) { + switch (type) { + case GGML_TYPE_F32: + return ACL_FLOAT; + case GGML_TYPE_F16: + return ACL_FLOAT16; + case GGML_TYPE_I8: + return ACL_INT8; + case GGML_TYPE_I16: + return ACL_INT16; + case GGML_TYPE_I32: + return ACL_INT32; + case GGML_TYPE_Q4_0: + return ACL_INT4; + case GGML_TYPE_Q8_0: + return ACL_INT8; + default: + return ACL_DT_UNDEFINED; + } + return ACL_DT_UNDEFINED; +} + +aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne, + size_t* nb, int64_t dims, aclFormat format, + size_t offset) { + // If tensor is bcasted, Up to GGML_MAX_DIMS additional dimensions will be + // added. 
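+    // Hence the buffers below are sized GGML_MAX_DIMS * 2: broadcasting may
+    // split every original dimension into two (see ggml_cann_get_bcast_shape).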
+ int64_t acl_ne[GGML_MAX_DIMS * 2], acl_stride[GGML_MAX_DIMS * 2]; + + int64_t acl_storage_len = 0; + if (ne == nullptr) { + acl_storage_len = ggml_nbytes(tensor); + for (int i = 0; i < GGML_MAX_DIMS; i++) { + acl_ne[i] = tensor->ne[i]; + // The step size of acl is in elements. + acl_stride[i] = tensor->nb[i] / ggml_element_size(tensor); + } + } else { + // With bcast + for (int i = 0; i < dims; i++) { + acl_storage_len += (ne[i] - 1) * nb[i]; + acl_ne[i] = ne[i]; + acl_stride[i] = nb[i] / ggml_element_size(tensor); + } + } + + // Reverse ne and stride. + int64_t final_dims = (dims == 0 ? GGML_MAX_DIMS : dims); + std::reverse(acl_ne, acl_ne + final_dims); + std::reverse(acl_stride, acl_stride + final_dims); + + aclTensor* acl_tensor = aclCreateTensor( + acl_ne, final_dims, ggml_cann_type_mapping(tensor->type), acl_stride, + offset / ggml_element_size(tensor), format, &acl_storage_len, 1, + tensor->data); + + return acl_tensor; +} + +bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) { + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (t1->ne[i] != t0->ne[i] && t1->ne[i] != 1) { + return true; + } + } + return false; +} + +int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, + const ggml_tensor* src1, + int64_t* bcast_src0_ne, + int64_t* bcast_src1_ne, size_t* bcast_src0_nb, + size_t* bcast_src1_nb) { + GGML_ASSERT(ggml_can_repeat(src1, src0)); + int bcast_dim_cnt = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + int64_t nr = src0->ne[i] / src1->ne[i]; + bcast_src0_ne[bcast_dim_cnt] = src0->ne[i] / nr; + bcast_src1_ne[bcast_dim_cnt] = src1->ne[i]; + bcast_src0_nb[bcast_dim_cnt] = src0->nb[i]; + bcast_src1_nb[bcast_dim_cnt] = src1->nb[i]; + bcast_dim_cnt++; + if (nr != 1) { + // Need to add an extra dim. + bcast_src0_ne[bcast_dim_cnt] = nr; + bcast_src1_ne[bcast_dim_cnt] = 1; + bcast_src0_nb[bcast_dim_cnt] = bcast_src0_nb[bcast_dim_cnt - 1] * + bcast_src0_ne[bcast_dim_cnt - 1]; + bcast_src1_nb[bcast_dim_cnt] = bcast_src1_nb[bcast_dim_cnt - 1] * + bcast_src1_ne[bcast_dim_cnt - 1]; + bcast_dim_cnt++; + } + } + return bcast_dim_cnt; +} + +int64_t ggml_cann_get_mulmat_bcast_shape( + const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne, + const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb, + int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne, + size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb) { + // input and dst shoule in same shape, except first two dims. + GGML_ASSERT(input_ne[2] == dst_ne[2]); + GGML_ASSERT(input_ne[3] == dst_ne[3]); + + int bcast_dim_cnt = 0; + + // For mul_mat, a dimension needs to be added before the dimension that + // weight needs to be expanded to satisfy the bcast rule of matrix + // multiplication. + for (int i = 0; i < GGML_MAX_DIMS; i++) { + int64_t nr = input_ne[i] / weight_ne[i]; + // Do not use bcast in the first two dimensions because we only support + // the bcast batch dimension. Just copy them. + if (i < 2 || nr == 1) { + bcast_input_ne[bcast_dim_cnt] = input_ne[i]; + bcast_weight_ne[bcast_dim_cnt] = weight_ne[i]; + bcast_dst_ne[bcast_dim_cnt] = dst_ne[i]; + + bcast_input_nb[bcast_dim_cnt] = input_nb[i]; + bcast_weight_nb[bcast_dim_cnt] = weight_nb[i]; + bcast_dst_nb[bcast_dim_cnt] = dst_nb[i]; + bcast_dim_cnt++; + } else { + // Need to add an extra dim. 
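+            // The input batch dim is nr times larger than the weight's, so it
+            // is split into (nr, input_ne[i] / nr) while the weight keeps a
+            // size-1 dim in the inserted position; matmul broadcasting then
+            // repeats the weight nr times across that dimension.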
+ bcast_input_ne[bcast_dim_cnt] = nr; + bcast_dst_ne[bcast_dim_cnt] = nr; + bcast_weight_ne[bcast_dim_cnt] = 1; + bcast_input_nb[bcast_dim_cnt] = input_nb[i]; + bcast_dst_nb[bcast_dim_cnt] = dst_nb[i]; + bcast_weight_nb[bcast_dim_cnt] = weight_nb[i]; + bcast_dim_cnt++; + + bcast_input_ne[bcast_dim_cnt] = input_ne[i] / nr; + bcast_dst_ne[bcast_dim_cnt] = dst_ne[i] / nr; + bcast_weight_ne[bcast_dim_cnt] = weight_ne[i]; + bcast_input_nb[bcast_dim_cnt] = bcast_input_nb[bcast_dim_cnt - 1] * + bcast_input_ne[bcast_dim_cnt - 1]; + bcast_dst_nb[bcast_dim_cnt] = bcast_dst_nb[bcast_dim_cnt - 1] * + bcast_dst_ne[bcast_dim_cnt - 1]; + bcast_weight_nb[bcast_dim_cnt] = + bcast_weight_nb[bcast_dim_cnt - 1] * + bcast_weight_ne[bcast_dim_cnt - 1]; + bcast_dim_cnt++; + } + } + return bcast_dim_cnt; +} diff --git a/llama/ggml-cann/acl_tensor.h b/llama/ggml-cann/acl_tensor.h new file mode 100644 index 00000000000..187ac7159d0 --- /dev/null +++ b/llama/ggml-cann/acl_tensor.h @@ -0,0 +1,284 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef CANN_ACL_TENSOR_H +#define CANN_ACL_TENSOR_H + +#include +#include + +#include +#include "common.h" + +/** + * @brief Maps a ggml_type to its corresponding aclDataType. + * + * @details This function takes a ggml_type as input and returns the corresponding + * aclDataType. It supports mapping for various ggml_types. If the input type + * does not match any of the predefined ggml_types, the function returns + * ACL_DT_UNDEFINED. + * + * @param type The ggml_type to be mapped. + * @return The corresponding aclDataType. If the input type is not recognized, + * ACL_DT_UNDEFINED is returned. + */ +aclDataType ggml_cann_type_mapping(ggml_type type); + +/** + * @brief Creates an ACL tensor from a ggml_tensor with optional shape. + * + * @details This function creates an ACL tensor based on the properties of the + * provided ggml_tensor. It supports customer shape by adjusting dimensions + * and strides accordingly. If customer shape is applied, additional + * dimensions and strides are calculated based on the provided parameters. + * + * @param tensor Pointer to the ggml_tensor to be converted to ACL tensor. + * @param ne Pointer to an array containing dimensions. Defaults to nullptr + * if no customer shape is applied. + * @param nb Pointer to an array containing strides. Defaults to nullptr + * if no customer shape is applied. + * @param dims Number of dimensions in the tensor. Defaults to 0 if no customer + * shape is applied. + * @param format ACL tensor format. Defaults to ACL_FORMAT_ND. + * @param offset Offset in bytes for the ACL tensor data. Defaults to 0. + * @return Pointer to the created ACL tensor. + */ +aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = nullptr, + size_t* nb = nullptr, int64_t dims = 0, + aclFormat format = ACL_FORMAT_ND, + size_t offset = 0); + +/** + * @brief Template for creating an ACL tensor from provided parameters. typename TYPE + * should be size_t or float. + * + * @details This function creates an ACL tensor using the provided data pointer, + * data type, dimensions, strides, format, offset, and additional parameters. + * It calculates necessary dimensions and strides based on the provided ne and nb + * arrays, adjusting them for the ACL tensor creation. The ACL storage length + * is also calculated based on the provided dimensions and strides. + * + * @param data_ptr Pointer to the data buffer for the ACL tensor. + * @param dtype ACL data type of the tensor. + * @param type_size Size of each element in the tensor data buffer. + * @param ne Pointer to an array containing tensor dimensions. + * @param nb Pointer to an array containing tensor strides. + * @param dims Number of dimensions of the tensor. + * @param format ACL tensor format. Defaults to ACL_FORMAT_ND. + * @param offset Offset in bytes for the ACL tensor data. Defaults to 0. + * @return Pointer to the created ACL tensor. 
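+ *
+ * Hedged usage sketch (buffer, ne and nb are caller-provided and purely
+ * illustrative; they are not defined by this header):
+ * @code
+ * int64_t ne[2] = {64, 32};
+ * size_t  nb[2] = {sizeof(float), 64 * sizeof(float)};
+ * aclTensor* t = ggml_cann_create_tensor(buffer, ACL_FLOAT, sizeof(float),
+ *                                        ne, nb, 2);
+ * @endcode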
+ */ +template +aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype, + TYPE type_size, int64_t* ne, TYPE* nb, + int64_t dims, + aclFormat format = ACL_FORMAT_ND, + size_t offset = 0) { + int64_t tmp_ne[GGML_MAX_DIMS * 2]; + int64_t tmp_stride[GGML_MAX_DIMS * 2]; + + memcpy(tmp_ne, ne, dims * sizeof(int64_t)); + for (int i = 0; i < dims; i++) { + tmp_stride[i] = nb[i] / type_size; + } + + std::reverse(tmp_ne, tmp_ne + dims); + std::reverse(tmp_stride, tmp_stride + dims); + + int64_t acl_storage_len = 0; + for (int i = 0; i < dims; i++) { + acl_storage_len += (ne[i] - 1) * nb[i]; + } + + aclTensor* acl_tensor = + aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size, + format, &acl_storage_len, 1, data_ptr); + + return acl_tensor; +} + +/** + * @brief Checks if tensors require broadcasting based on their shapes. + * + * @details This function determines if two ggml_tensors need to be broadcasted for + * element-wise operations. Broadcasting is necessary if the shapes of the + * tensors are not identical and no dimension in either tensor equals 1. + * + * @param t0 Pointer to the first ggml_tensor. + * @param t1 Pointer to the second ggml_tensor. + * @return True if broadcasting is needed, False otherwise. + * + * @remarks This function iterates over the dimensions of t0 and t1. It checks if each + * dimension in t1 differs from t0's corresponding dimension and is not equal + * to 1. If such a dimension is found, broadcasting is required to align t1 + * with t0 for element-wise operations. + */ +bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1); + +/** + * @brief Computes broadcast shapes and strides for two ggml_tensors. + * + * @details This function calculates the broadcast shapes and strides for two ggml_tensors, + * following the broadcasting rules similar to numpy. It adjusts dimensions and + * strides to ensure compatibility for element-wise operations where one tensor + * can be broadcasted to match the shape of another tensor. + * + * @param src0 Pointer to the first ggml_tensor. + * @param src1 Pointer to the second ggml_tensor. + * @param bcast_ne_src0 Output array to store broadcasted dimensions for src0. + * @param bcast_ne_src1 Output array to store broadcasted dimensions for src1. + * @param bcast_nb_src0 Output array to store broadcasted strides for src0. + * @param bcast_nb_src1 Output array to store broadcasted strides for src1. + * @return Number of dimensions in the broadcasted shape. + * + * @pre ggml_can_repeat(src1, src0) must return true, indicating src1 can be broadcasted + * to match src0. + * + * @remarks This function iterates over the dimensions of src0 and src1, calculating the + * necessary broadcast dimensions and strides. If a dimension requires broadcasting + * (i.e., its size in src1 is smaller than in src0), an additional dimension is + * added with size calculated to match src0's dimension. This adjustment ensures + * that src1 can be element-wise broadcasted to src0's shape. + * + * How it works: + * + * if dim0 has padding. 
+ * a -> (2, 2) padding = 2 + * a: [[1, 2, *, *] + * [2, 3, *, *]] + * nb = (8, 4, 2) + * + * if a should bcast with b -> (2, 4) + * b' -> (2, 2, 2) + * b : [[1, 2, 3, 4, *, *] + * [5, 6, 7, 8, *, *]] + * nb = (12, 6, 1) + * + * after bcast: + * a' -> (2, 1, 2) + * a': [[[1, 2], *, *] + * [[2, 3], *, *]] + * nb = (8, 4, 2, 1) + * + * b' : [[[1, 2], [3, 4], *, *] + * [[5, 6], [7, 8], *, *]] + * nb = (12, 6, 2, 1) + * \endcode + * + * dim1 in a inserted dim, should add nb for dim1, + * and all other nb moves to next in order. + */ +int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0, const ggml_tensor* src1, + int64_t* bcast_ne_src0, int64_t* bcast_ne_src1, + size_t* bcast_nb_src0, size_t* bcast_nb_src1); + +// Bcast macro to avoid duplicate code. +#define BCAST_SHAPE(src0, src1) \ + int64_t bcast_##src0##_ne[GGML_MAX_DIMS * 2]; \ + int64_t bcast_##src1##_ne[GGML_MAX_DIMS * 2]; \ + size_t bcast_##src0##_nb[GGML_MAX_DIMS * 2]; \ + size_t bcast_##src1##_nb[GGML_MAX_DIMS * 2]; \ + int64_t bcast_dims = ggml_cann_get_bcast_shape( \ + src0, src1, bcast_##src0##_ne, bcast_##src1##_ne, bcast_##src0##_nb, \ + bcast_##src1##_nb); + +#define BCAST_PARAM(tensor) bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims + +/** + * @brief Calculates broadcast shapes for matrix multiplication. + * + * @details This function computes the broadcast shapes required for matrix multiplication + * based on the input, weight, and destination tensor shapes. It ensures that the + * dimensions of weight tensors are expanded appropriately to satisfy matrix + * multiplication broadcast rules. + * + * @param input_ne Array containing the dimensions of the input tensor. + * @param weight_ne Array containing the dimensions of the weight tensor. + * @param dst_ne Array containing the dimensions of the destination tensor. + * @param input_nb Array containing the strides of the input tensor. + * @param weight_nb Array containing the strides of the weight tensor. + * @param dst_nb Array containing the strides of the destination tensor. + * @param bcast_input_ne Output array for broadcasted input tensor dimensions. + * @param bcast_weight_ne Output array for broadcasted weight tensor dimensions. + * @param bcast_dst_ne Output array for broadcasted destination tensor dimensions. + * @param bcast_input_nb Output array for broadcasted input tensor strides. + * @param bcast_weight_nb Output array for broadcasted weight tensor strides. + * @param bcast_dst_nb Output array for broadcasted destination tensor strides. + * @return The number of dimensions in the broadcasted tensors. + * + * @remarks This function iterates over the tensor dimensions and calculates the broadcast + * shapes needed for matrix multiplication. It ensures that dimensions where + * weight tensor requires expansion are appropriately handled to conform with + * broadcasting rules. + * @note compare with ggml_cann_get_bcast_shape, mul_mat broadcast need add this new dim + * before cast dim. + * @sa ggml_cann_get_bcast_shape + */ +int64_t ggml_cann_get_mulmat_bcast_shape( + const int64_t* input_ne, const int64_t* weight_ne, const int64_t* dst_ne, + const size_t* input_nb, const size_t* weight_nb, const size_t* dst_nb, + int64_t* bcast_input_ne, int64_t* bcast_weight_ne, int64_t* bcast_dst_ne, + size_t* bcast_input_nb, size_t* bcast_weight_nb, size_t* bcast_dst_nb); + +// Bcast macro to avoid duplicate code. 
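+// Sketch of intended use: BCAST_MUL_MAT_SHAPE(input, weight, dst) declares the
+// bcast_*_ne / bcast_*_nb arrays plus bcast_dims in the caller's scope, which
+// can then be forwarded as a group via BCAST_MUL_MAT_PARAM(tensor).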
+#define BCAST_MUL_MAT_SHAPE(input, weight, dst) \ + int64_t bcast_##input##_ne[GGML_MAX_DIMS * 2]; \ + int64_t bcast_##weight##_ne[GGML_MAX_DIMS * 2]; \ + int64_t bcast_##dst##_ne[GGML_MAX_DIMS * 2]; \ + size_t bcast_##input##_nb[GGML_MAX_DIMS * 2]; \ + size_t bcast_##weight##_nb[GGML_MAX_DIMS * 2]; \ + size_t bcast_##dst##_nb[GGML_MAX_DIMS * 2]; \ + int64_t bcast_dims = ggml_cann_get_mulmat_bcast_shape( \ + input->ne, weight->ne, dst->ne, input->nb, weight->nb, dst->nb, \ + bcast_##input##_ne, bcast_##weight##_ne, bcast_##dst##_ne, \ + bcast_##input##_nb, bcast_##weight##_nb, bcast_##dst##_nb); + +#define BCAST_MUL_MAT_PARAM(tensor) \ + bcast_##tensor##_ne, bcast_##tensor##_nb, bcast_dims + +#endif // CANN_ACL_TENSOR_H diff --git a/llama/ggml-cann/aclnn_ops.cpp b/llama/ggml-cann/aclnn_ops.cpp new file mode 100644 index 00000000000..cb032f09faf --- /dev/null +++ b/llama/ggml-cann/aclnn_ops.cpp @@ -0,0 +1,3453 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "aclnn_ops.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "llama/ggml-impl.h" +#include "llama/ggml-cann/kernels/ascendc_kernels.h" + +#define GGML_COMMON_DECL_C + +#include "llama/ggml-common.h" + +/** + * @brief Repeats elements of a tensor along each dimension according to the + * specified repeat array. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor to be repeated. + * @param acl_dst The destination tensor after repeating. + * @param repeat_array The array specifying the number of repetitions along each + * dimension. + */ +static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t* repeat_array) { + // repeat tensor along each dim with repeat_array + aclIntArray* repeats = aclCreateIntArray(repeat_array, GGML_MAX_DIMS); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnRepeatGetWorkspaceSize(acl_src, repeats, acl_dst, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + // Memory from allocator will "free" immediately, and this memory + // will be alloced to other pointers, but it won't access before + // this async task end because all tasks in same stream will execute + // in queue. + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + ACL_CHECK( + aclnnRepeat(workspaceAddr, workspaceSize, executor, ctx.stream())); + ACL_CHECK(aclDestroyIntArray(repeats)); +} + +void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + GGML_ASSERT(ggml_can_repeat(src, dst)); + + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + int64_t repeatsArray[] = {dst->ne[3] / src->ne[3], dst->ne[2] / src->ne[2], + dst->ne[1] / src->ne[1], dst->ne[0] / src->ne[0]}; + + aclnn_repeat(ctx, acl_src, acl_dst, repeatsArray); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +/** + * @brief Adds two tensors element-wise and stores the result in a destination + * tensor. + * + * This function performs the operation: + * \f[ + * dst = acl\_src0 + alpha \times acl\_src1 + * \f] + * where alpha is a scalar value and defaults to 1.0f. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src0 The first source tensor. + * @param acl_src1 The second source tensor. + * @param acl_dst The destination tensor where the result will be stored. 
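+ *
+ * @note In ggml_cann_add below, sources that differ in shape but satisfy
+ *       ggml_can_repeat are first wrapped in broadcast views via BCAST_SHAPE /
+ *       BCAST_PARAM before this helper is invoked.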
+ */ +static void aclnn_add(ggml_backend_cann_context& ctx, aclTensor* acl_src0, + aclTensor* acl_src1, aclTensor* acl_dst) { + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyScalar(alpha)); +} + +void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + aclTensor* acl_src0; + aclTensor* acl_src1; + aclTensor* acl_dst; + + // Need bcast + if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) { + BCAST_SHAPE(src0, src1) + acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0)); + acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1)); + acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0)); + } else { + acl_src0 = ggml_cann_create_tensor(src0); + acl_src1 = ggml_cann_create_tensor(src1); + acl_dst = ggml_cann_create_tensor(dst); + } + + aclnn_add(ctx, acl_src0, acl_src1, acl_dst); + + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + float negative_slope; + memcpy(&negative_slope, dst->op_params, sizeof(float)); + aclScalar* acl_negative_slope = + aclCreateScalar(&negative_slope, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnLeakyReluGetWorkspaceSize( + acl_src, acl_negative_slope, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnLeakyRelu(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyScalar(acl_negative_slope)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +/** + * @brief Concatenates a list of tensors along a specified dimension and stores + * the result in a destination tensor. + * + * @param ctx The context for the CANN backend operations. + * @param tensorList The list of tensors to be concatenated. + * @param acl_dst The destination tensor where the concatenated result will be + * stored. + * @param concat_dim The dimension along which the tensors will be concatenated. 
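+ *
+ * @note concat_dim is an ACL dimension index. ggml_cann_concat maps the ggml
+ *       dim to it as (3 - dim), because ne/nb are reversed when the ACL
+ *       tensors are created.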
+ */ +static void aclnn_concat(ggml_backend_cann_context& ctx, + aclTensorList* tensorList, aclTensor* acl_dst, + int64_t concat_dim) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnCatGetWorkspaceSize(tensorList, concat_dim, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnCat(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + aclTensor* acl_src0 = ggml_cann_create_tensor(src0); + aclTensor* acl_src1 = ggml_cann_create_tensor(src1); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + const int32_t dim = ggml_get_op_params_i32(dst, 0); + + GGML_ASSERT(dim >= 0 && dim < 4); + int32_t acl_dim = 3 - dim; + + aclTensor* tensors[] = {acl_src0, acl_src1}; + aclTensorList* tensorList = aclCreateTensorList(tensors, 2); + aclnn_concat(ctx, tensorList, acl_dst, acl_dim); + + ACL_CHECK(aclDestroyTensorList(tensorList)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +/** + * @brief Creates a tensor with values starting from `start`, incremented by + * `step`, and ending before `stop`. + * + * This function performs the operation: + * \f[ + * \text {out }_{i+1}=\text {out }_i+\text {step} + * \f] + * the range is [start, stop). + * + * @param ctx The context for the CANN backend operations. + * @param acl_dst The destination tensor where the values will be stored. + * @param start The starting value of the range. + * @param stop The ending value of the range (exclusive). + * @param step The step size between consecutive values. + * @param n_elements The number of elements in the destination tensor. 
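+ *
+ * For example, start = 0, stop = 5 and step = 1 fill the destination with
+ * {0, 1, 2, 3, 4}, so n_elements must equal 5.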
+ */ +static void aclnn_arange(ggml_backend_cann_context& ctx, aclTensor* acl_dst, + float start, float stop, float step, + int64_t n_elements) { + int64_t steps = (int64_t)std::ceil((stop - start) / step); + GGML_ASSERT(n_elements == steps); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + aclScalar* acl_start = aclCreateScalar(&start, aclDataType::ACL_FLOAT); + aclScalar* acl_end = aclCreateScalar(&stop, aclDataType::ACL_FLOAT); + aclScalar* acl_step = aclCreateScalar(&step, aclDataType::ACL_FLOAT); + + ACL_CHECK(aclnnArangeGetWorkspaceSize(acl_start, acl_end, acl_step, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnArange(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyScalar(acl_start)); + ACL_CHECK(aclDestroyScalar(acl_end)); + ACL_CHECK(aclDestroyScalar(acl_step)); +} + +void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + int64_t n_elements = ggml_nelements(dst); + float start; + float stop; + float step; + memcpy(&start, (float*)dst->op_params + 0, sizeof(float)); + memcpy(&stop, (float*)dst->op_params + 1, sizeof(float)); + memcpy(&step, (float*)dst->op_params + 2, sizeof(float)); + + aclnn_arange(ctx, acl_dst, start, stop, step, n_elements); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + dst->src[1] = dst->src[0]; + ggml_cann_mul_div(ctx, dst); +} + +void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + float min; + float max; + memcpy(&min, dst->op_params, sizeof(float)); + memcpy(&max, (float*)dst->op_params + 1, sizeof(float)); + + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + aclScalar* acl_min = aclCreateScalar(&min, aclDataType::ACL_FLOAT); + aclScalar* acl_max = aclCreateScalar(&max, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnClampGetWorkspaceSize(acl_src, acl_min, acl_max, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnClamp(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyScalar(acl_min)); + ACL_CHECK(aclDestroyScalar(acl_max)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + // scale factor + float v; + memcpy(&v, dst->op_params, sizeof(float)); + + aclScalar* scale = aclCreateScalar(&v, aclDataType::ACL_FLOAT); + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, scale, acl_dst, &workspaceSize, + &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = 
workspace_allocator.get(); + } + + ACL_CHECK(aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyScalar(scale)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + enum ggml_sort_order order = (enum ggml_sort_order)dst->op_params[0]; + + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + ggml_cann_pool_alloc temp_buffer_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(int64_t)); + void* buffer = temp_buffer_allocator.get(); + aclTensor* tmp_tensor = + ggml_cann_create_tensor(buffer, ACL_INT64, ggml_type_size(dst->type), + dst->ne, dst->nb, GGML_MAX_DIMS); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnArgsortGetWorkspaceSize( + acl_src, -1, (order == GGML_SORT_ORDER_DESC ? true : false), tmp_tensor, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnArgsort(workspaceAddr, workspaceSize, executor, ctx.stream())); + + workspaceSize = 0; + ACL_CHECK(aclnnCastGetWorkspaceSize(tmp_tensor, + ggml_cann_type_mapping(dst->type), + acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(tmp_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + std::vector normData = {dst->ne[0]}; + aclIntArray* norm = aclCreateIntArray(normData.data(), normData.size()); + ACL_CHECK(aclnnLayerNormGetWorkspaceSize(acl_src, norm, nullptr, nullptr, + eps, acl_dst, nullptr, nullptr, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnLayerNorm(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyIntArray(norm)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + int n_groups = dst->op_params[0]; + + float eps; + memcpy(&eps, dst->op_params + 1, sizeof(float)); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + int64_t N = src->ne[3]; + int64_t C = src->ne[2]; + int64_t HxW = src->ne[1] * src->ne[0]; + + size_t type_size = ggml_type_size(src->type); + int64_t ne[] = {n_groups, N}; + size_t nb[] = {type_size, type_size * n_groups}; + size_t n_bytes = N * n_groups; + + ggml_cann_pool_alloc temp_buffer_allocator(ctx.pool(), n_bytes * 2); + void* buffer = 
temp_buffer_allocator.get(); + aclTensor* acl_mean_out = ggml_cann_create_tensor( + buffer, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND); + aclTensor* acl_rstd_out = ggml_cann_create_tensor( + (char*)buffer + n_bytes, ACL_FLOAT, type_size, ne, nb, ACL_FORMAT_ND); + + ACL_CHECK(aclnnGroupNormGetWorkspaceSize( + acl_src, nullptr, nullptr, N, C, HxW, n_groups, eps, acl_dst, + acl_mean_out, acl_rstd_out, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnGroupNorm(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyTensor(acl_mean_out)); + ACL_CHECK(aclDestroyTensor(acl_rstd_out)); +} + +void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + + size_t nb1 = ((int32_t*)dst->op_params)[0]; + size_t nb2 = ((int32_t*)dst->op_params)[1]; + size_t nb3 = ((int32_t*)dst->op_params)[2]; + size_t offset = ((int32_t*)dst->op_params)[3]; + bool inplace = (bool)((int32_t*)dst->op_params)[4]; + + size_t param_nb[] = {ggml_element_size(src0), nb1, nb2, nb3}; + + aclTensor* acl_dst = ggml_cann_create_tensor( + dst, src1->ne, param_nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset); + aclTensor* acl_src1 = ggml_cann_create_tensor(src1); + + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + if (!inplace) { + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src0->data, cpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + aclTensor* acl_src0 = ggml_cann_create_tensor( + src0, src1->ne, src0->nb, GGML_MAX_DIMS, ACL_FORMAT_ND, offset); + ACL_CHECK(aclnnAddGetWorkspaceSize(acl_src0, acl_src1, alpha, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + ACL_CHECK( + aclnnAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); + ACL_CHECK(aclDestroyTensor(acl_src0)); + } else { + ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src1, alpha, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + ACL_CHECK(aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, + ctx.stream())); + } + + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = ggml_cann_create_tensor(src); + + GGML_ASSERT(dst->ne[0] == 1); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + int64_t reduce_dims_host[] = {3}; + aclIntArray* reduce_dims = aclCreateIntArray(reduce_dims_host, 1); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnReduceSumGetWorkspaceSize( + acl_src, reduce_dims, true, ggml_cann_type_mapping(src->type), acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + 
        aclnnReduceSum(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
+                                  ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    aclTensor* acl_src =
+        ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+    std::vector<int64_t> output_size{dst->ne[1], dst->ne[0]};
+    auto output_size_array = aclCreateIntArray(output_size.data(), 2);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnUpsampleNearest2dGetWorkspaceSize(
+        acl_src, output_size_array, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnUpsampleNearest2d(workspaceAddr, workspaceSize, executor,
+                                     ctx.stream()));
+
+    ACL_CHECK(aclDestroyIntArray(output_size_array));
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+}
+
+/**
+ * @brief Pads a tensor with a specified value along each dimension.
+ *
+ * This function performs padding of the source tensor `acl_src` and stores the
+ * result in the destination tensor `acl_dst`. The padding values for each
+ * dimension are specified in the `paddings` array.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor to be padded.
+ * @param acl_dst The destination tensor where the padded result will be stored.
+ * @param paddings An array specifying the padding values for each dimension.
+ * The size of the array should be twice the number of dimensions of the tensor.
+ * @param value The value to be used for padding. The default value is 0.0.
+ */
+static void aclnn_pad(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                      aclTensor* acl_dst, int64_t* paddings,
+                      float value = 0.0f) {
+    aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
+    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnConstantPadNdGetWorkspaceSize(
+        acl_src, acl_pad, acl_value, acl_dst, &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnConstantPadNd(workspaceAddr, workspaceSize, executor,
+                                 ctx.stream()));
+
+    ACL_CHECK(aclDestroyIntArray(acl_pad));
+    ACL_CHECK(aclDestroyScalar(acl_value));
+}
+
+void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    aclTensor* acl_src = ggml_cann_create_tensor(src);
+    aclTensor* acl_dst = ggml_cann_create_tensor(dst);
+
+    // padding: each value in the array gives the amount of padding to add;
+    // the position of an element selects the dimension and side to pad:
+    // [dim0.front, dim0.behind, dim1.front, dim1.behind,
+    //  dim2.front, dim2.behind, dim3.front, dim3.behind]
+    int64_t paddings[] = {
+        0, dst->ne[0] - src->ne[0], 0, dst->ne[1] - src->ne[1],
+        0, dst->ne[2] - src->ne[2], 0, dst->ne[3] - src->ne[3]};
+    aclnn_pad(ctx, acl_src, acl_dst, paddings);
+
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ACL_CHECK(aclDestroyTensor(acl_src));
+}
+
+/**
+ * @brief Performs 2D average pooling on the input tensor and stores the result
+ * in the destination tensor.
+ *
+ * This function performs average pooling on the source tensor and stores the
+ * result in the destination tensor. The pooling parameters (kernel size,
+ * strides, padding) are specified in the `op_params` of the destination tensor.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result will be stored. The source
+ * tensor is referenced by `dst->src[0]`.
+ */
+static void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx,
+                                 ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    aclTensor* acl_src =
+        ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+    const int32_t* opts = (const int32_t*)dst->op_params;
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    std::vector<int64_t> kernel_dims = {k1, k0};
+    std::vector<int64_t> stride_dims = {s1, s0};
+    std::vector<int64_t> padding_avg_dims = {p1, p0};  // (padH, padW)
+
+    auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+    auto* strides = aclCreateIntArray(stride_dims.data(), 2);
+    auto* paddings_avg = aclCreateIntArray(padding_avg_dims.data(), 2);
+
+    bool ceil_mode = false;
+    bool count_include_pad = true;
+    int64_t divisor_override = 0;
+    int8_t cube_math_type = 0;
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnAvgPool2dGetWorkspaceSize(
+        acl_src, kernel_size, strides, paddings_avg, ceil_mode,
+        count_include_pad, divisor_override, cube_math_type, acl_dst,
+        &workspaceSize, &executor));
+
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+    ACL_CHECK(
+        aclnnAvgPool2d(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ACL_CHECK(aclDestroyIntArray(kernel_size));
+    ACL_CHECK(aclDestroyIntArray(strides));
+    ACL_CHECK(aclDestroyIntArray(paddings_avg));
+}
+
+/**
+ * @brief Performs 2D max pooling on the input tensor and stores the result in
+ * the destination tensor.
+ *
+ * This function performs max pooling on the source tensor and stores the result
+ * in the destination tensor. The pooling parameters (kernel size, strides,
+ * padding) are specified in the `op_params` of the destination tensor.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param dst The destination tensor where the result will be stored. The source
+ * tensor is referenced by `dst->src[0]`.
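+ *
+ * @note The pooling configuration is read from `dst->op_params` as int32
+ * values, mirroring the layout used by ggml_cann_avg_pool2d above:
+ * @code
+ *   const int32_t* opts = (const int32_t*)dst->op_params;
+ *   // opts[0] = pool op, opts[1]/opts[2] = kernel k0/k1,
+ *   // opts[3]/opts[4] = stride s0/s1, opts[5]/opts[6] = padding p0/p1
+ * @endcode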
+ */
+static void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx,
+                                 ggml_tensor* dst) {
+    ggml_tensor* src = dst->src[0];
+    GGML_ASSERT(src->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    aclTensor* acl_src =
+        ggml_cann_create_tensor(src, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+    aclTensor* acl_dst =
+        ggml_cann_create_tensor(dst, nullptr, nullptr, 0, ACL_FORMAT_NCHW);
+
+    const int32_t* opts = (const int32_t*)dst->op_params;
+    const int k0 = opts[1];
+    const int k1 = opts[2];
+    const int s0 = opts[3];
+    const int s1 = opts[4];
+    const int p0 = opts[5];
+    const int p1 = opts[6];
+
+    int64_t temp_ne[] = {src->ne[0] + p0 * 2, src->ne[1] + p1 * 2, src->ne[2],
+                         src->ne[3]};
+    size_t temp_nb[GGML_MAX_DIMS];
+
+    temp_nb[0] = ggml_element_size(src);
+    for (int i = 1; i < GGML_MAX_DIMS; i++) {
+        temp_nb[i] = temp_nb[i - 1] * temp_ne[i - 1];
+    }
+
+    ggml_cann_pool_alloc temp_buffer_allocator(
+        ctx.pool(), ggml_nbytes(src) + p0 * 2 + p1 * 2 * src->nb[1]);
+    void* buffer = temp_buffer_allocator.get();
+    aclTensor* tmp_tensor = ggml_cann_create_tensor(
+        buffer, ACL_FLOAT, ggml_element_size(src), temp_ne, temp_nb,
+        GGML_MAX_DIMS, ACL_FORMAT_NCHW);
+
+    // pad: see padding in ggml_cann_pad()
+    int64_t paddings[] = {p0, p0, p1, p1, 0, 0, 0, 0};
+    float value = -FLT_MAX;
+    aclnn_pad(ctx, acl_src, tmp_tensor, paddings, value);
+
+    // max_pool
+    std::vector<int64_t> kernel_dims = {k1, k0};
+    std::vector<int64_t> stride_dims = {s1, s0};
+    // padding_max_dims: [dim0_start, dim0_end, dim1_start, dim1_end]
+    std::vector<int64_t> padding_max_dims = {0, 0, 0, 0};
+    std::vector<int64_t> dilation_size = {1, 1};
+    auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+    auto* strides = aclCreateIntArray(stride_dims.data(), 2);
+    auto* paddings_max = aclCreateIntArray(padding_max_dims.data(), 4);
+    auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
+
+    bool ceil_mode = false;
+    int64_t auto_pads = 0;
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnMaxPoolGetWorkspaceSize(
+        tmp_tensor, kernel_size, strides, auto_pads, paddings_max, dilations,
+        ceil_mode, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnMaxPool(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    ACL_CHECK(aclDestroyTensor(acl_src));
+    ACL_CHECK(aclDestroyTensor(acl_dst));
+    ACL_CHECK(aclDestroyTensor(tmp_tensor));
+    ACL_CHECK(aclDestroyIntArray(kernel_size));
+    ACL_CHECK(aclDestroyIntArray(strides));
+    ACL_CHECK(aclDestroyIntArray(paddings_max));
+    ACL_CHECK(aclDestroyIntArray(dilations));
+}
+
+void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    const int32_t* opts = (const int32_t*)dst->op_params;
+    enum ggml_op_pool op = static_cast<ggml_op_pool>(opts[0]);
+    switch (op) {
+        case GGML_OP_POOL_AVG:
+            ggml_cann_avg_pool2d(ctx, dst);
+            break;
+        case GGML_OP_POOL_MAX:
+            ggml_cann_max_pool2d(ctx, dst);
+            break;
+        case GGML_OP_POOL_COUNT:
+            GGML_ABORT("fatal error");
+            break;
+    }
+}
+
+/**
+ * @brief Copies data from the source tensor to the destination tensor.
+ *
+ * This function copies data from the source tensor `acl_src` to the destination
+ * tensor `acl_dst`.
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The source tensor from which data will be copied.
+ * @param acl_dst The destination tensor where the data will be copied to.
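+ *
+ * @note Minimal usage sketch, assuming `acl_a` and `acl_b` are pre-created
+ * aclTensor handles of matching shape and type (illustrative names):
+ * @code
+ *   cann_copy(ctx, acl_a, acl_b);  // acl_b now holds the contents of acl_a
+ * @endcode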
+ */ +static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceCopyGetWorkspaceSize(acl_dst, acl_src, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnInplaceCopy(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); + ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); + src->extra = src_extra_allocator.get(); + dst->extra = dst_extra_allocator.get(); + ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src, + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); + ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); + + if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) && + ggml_are_same_shape(src, dst)) { + cann_copy(ctx, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + // TODO: simplify + if (src->type == GGML_TYPE_F16) { + if (dst->type == GGML_TYPE_Q8_0) { + aclrtlaunch_ascendc_quantize_f16_q8_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } + if (dst->type == GGML_TYPE_Q4_0) { + aclrtlaunch_ascendc_quantize_f16_to_q4_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } + if (dst->type == GGML_TYPE_F16) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + + aclrtlaunch_ascendc_dup_by_rows_fp16( + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + return; + } + GGML_ABORT("fatal error"); + } + GGML_ABORT("fatal error"); + } + if (dst->type == GGML_TYPE_F32) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32( + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + return; + } + GGML_ABORT("fatal error"); + } + GGML_ABORT("fatal error"); + } + // TODO + GGML_ABORT("fatal error"); + } else if 
(src->type == GGML_TYPE_F32) { + // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size + // && nb0 == type_size) + if (dst->type == GGML_TYPE_Q8_0) { + aclrtlaunch_ascendc_quantize_f32_q8_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } + if (dst->type == GGML_TYPE_Q4_0) { + aclrtlaunch_ascendc_quantize_f32_to_q4_0( + 24, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne); + return; + } + if (dst->type == GGML_TYPE_F32) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + aclrtlaunch_ascendc_dup_by_rows_fp32( + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + return; + } + GGML_ABORT("fatal error"); + } else { + // TODO: dst not contiguous + GGML_ABORT("fatal error"); + } + } + if (dst->type == GGML_TYPE_F16) { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + if (ggml_is_contiguous(dst)) { + const size_t src_type_size = ggml_type_size(src->type); + if (src->nb[0] == src_type_size) { + // src0 is contigous on first dimension, copy by rows + int64_t rows_num = ggml_nrows(src); + aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16( + rows_num, ctx.stream(), src->data, dst->data, + ((ggml_tensor*)src->extra)->ne, + ((ggml_tensor*)src->extra)->nb, + ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + return; + } + GGML_ABORT("fatal error"); + } + } + // TODO + GGML_ABORT("fatal error"); + } else { + if (ggml_are_same_shape(src, dst)) { + cann_copy(ctx, acl_src, acl_dst); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + return; + } + GGML_ABORT("fatal error"); + } +} + +#ifdef __cplusplus +extern "C" { +#endif +aclnnStatus aclnnRmsNormGetWorkspaceSize(const aclTensor* x, + const aclTensor* gamma, double epsilon, + const aclTensor* yOut, + const aclTensor* rstdOout, + uint64_t* workspaceSize, + aclOpExecutor** executor); +aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize, + aclOpExecutor* executor, aclrtStream stream); +#ifdef __cplusplus +} +#endif + +/** + * @brief Creates an ACL tensor initialized with zeros using a provided buffer. + * + * This function initializes a tensor with zeros using the specified buffer and + * tensor parameters. + * + * @param ctx The context for the CANN backend operations. + * @param buffer The buffer to be used for the tensor data. + * @param n_bytes The size of the buffer in bytes. + * @param ne An array specifying the extents (sizes) of each dimension of the + * tensor. + * @param dims The number of dimensions of the tensor. + * @param type The data type of the tensor. + * @param type_size The size of each element in the tensor data type. + * @return An ACL tensor initialized with zeros. 
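+ *
+ * @note Sketch of creating an 8-element zero-filled F32 tensor from a pool
+ * allocation (buffer and extent names are illustrative):
+ * @code
+ *   int64_t ne[] = {8};
+ *   ggml_cann_pool_alloc alloc(ctx.pool(), 8 * sizeof(float));
+ *   aclTensor* zeros = aclnn_zero(ctx, alloc.get(), 8 * sizeof(float), ne, 1,
+ *                                 ACL_FLOAT, sizeof(float));
+ * @endcode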
+ */ +static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer, + size_t n_bytes, int64_t* ne, int64_t dims, + aclDataType type, size_t type_size) { + size_t nb[GGML_MAX_DIMS]; + nb[0] = type_size; + for (int i = 1; i < dims; i++) { + nb[i] = nb[i - 1] * ne[i - 1]; + } + + ACL_CHECK(aclrtMemsetAsync(buffer, n_bytes, 0, n_bytes, ctx.stream())); + aclTensor* zero = + ggml_cann_create_tensor(buffer, type, type_size, ne, nb, dims); + return zero; +} + +/** + * @brief Creates an ACL tensor initialized with value using a provided buffer. + * + * This function initializes a tensor with value using the specified buffer and + * tensor parameters. + * + * @param ctx The context for the CANN backend operations. + * @param buffer The buffer to be used for the tensor data. + * @param n_bytes The size of the buffer in bytes. + * @param ne An array specifying the extents (sizes) of each dimension of the + * tensor. + * @param dims The number of dimensions of the tensor. + * @param type The data type of the tensor. + * @param type_size The size of each element in the tensor data type. + * @param value The value to be used for initializing the tensor (default + * is 1.0). + * @return An ACL tensor initialized with value. + */ +static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer, + size_t n_bytes, int64_t* ne, int64_t dims, + aclDataType type, size_t type_size, + float value = 1.0f) { + aclTensor* acl_tensor = + aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size); + float alpha_host = 1.0f; + aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT); + aclScalar* other = aclCreateScalar(&value, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceAddsGetWorkspaceSize(acl_tensor, other, alpha, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + ACL_CHECK( + aclnnInplaceAdds(workspaceAddr, workspaceSize, executor, ctx.stream())); + + return acl_tensor; +} + +void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + float eps; + memcpy(&eps, dst->op_params, sizeof(float)); + + GGML_ASSERT(eps > 0.0f); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src); + ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); + + aclTensor* acl_gamma = aclnn_values( + ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1, + ggml_cann_type_mapping(src->type), ggml_element_size(src)); + + size_t zero_tensor_n_bytes = + src->ne[1] * src->ne[2] * src->ne[3] * ggml_element_size(src); + ggml_cann_pool_alloc zero_tensor_allocator(ctx.pool(), zero_tensor_n_bytes); + aclTensor* acl_rstd = + aclnn_zero(ctx, zero_tensor_allocator.get(), zero_tensor_n_bytes, + src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), + ggml_element_size(src)); + + ACL_CHECK(aclnnRmsNormGetWorkspaceSize( + acl_src, acl_gamma, eps, acl_dst, acl_rstd, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnRmsNorm(workspaceAddr, 
workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyTensor(acl_gamma)); + ACL_CHECK(aclDestroyTensor(acl_rstd)); +} + +// TODO: performace is low. +void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, + float value) { + ggml_tensor* src = dst->src[0]; + + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + const int n_past = ((int32_t*)dst->op_params)[0]; + + size_t one_tensor_n_bytes = src->ne[0] * src->ne[1] * src->ne[2] * + src->ne[3] * ggml_element_size(src); + ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes); + + aclTensor* mask_tensor = + aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, + src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type), + ggml_element_size(src), value); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnTril(workspaceAddr, workspaceSize, executor, ctx.stream())); + + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, mask_tensor, alpha, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + ACL_CHECK( + aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyScalar(alpha)); + ACL_CHECK(aclDestroyTensor(mask_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +/** + * @brief Casts the data type of a source tensor to a destination tensor. + * + * This function casts the data type of the source tensor `acl_src` to the + * specified data type `cast_data_type` and stores the result in the destination + * tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose data type will be casted. + * @param acl_dst The destination tensor where the casted result will be stored. + * @param cast_data_type The target data type to which the source tensor will be + * casted. + */ +static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, aclDataType cast_data_type) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +/** + * @brief Permutes the dimensions of a tensor according to a specified order. 
+ * + * This function permutes the dimensions of the source tensor `acl_src` + * according to the order specified in the `new_dim` array and stores the result + * in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose dimensions will be permuted. + * @param acl_dst The destination tensor where the permuted result will be + * stored. + * @param new_dim An array specifying the new order of dimensions for the + * tensor. + * @param dims The number of dimensions in the tensor. + */ +static void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t* new_dim, uint64_t dims) { + aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyIntArray(acl_dims)); +} + +#ifdef __cplusplus +extern "C" { +#endif +aclnnStatus aclnnIm2colGetWorkspaceSize(const aclTensor* self, + const aclIntArray* kernelSize, + const aclIntArray* dilation, + const aclIntArray* padding, + const aclIntArray* stride, + aclTensor* out, uint64_t* workspaceSize, + aclOpExecutor** executor); +aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize, + aclOpExecutor* executor, aclrtStream stream); +#ifdef __cplusplus +} +#endif + +static void ggml_cann_im2col_2d_post_process(ggml_backend_cann_context& ctx, + ggml_tensor* dst, + ggml_tensor* src1, + aclTensor* tmp_cast_tensor, + aclTensor* tmp_im2col_tensor) { + // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW] + int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]}; + size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]}; + aclTensor* acl_dst = + ggml_cann_create_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1); + + int64_t permute_dim[] = {0, 2, 1}; + if (src1->type != dst->type) { + aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3); + } else { + aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3); + } + + // release + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +static void ggml_cann_im2col_1d_post_process( + ggml_backend_cann_context& ctx, ggml_tensor* dst, ggml_tensor* src1, + aclTensor* tmp_cast_tensor, aclTensor* tmp_im2col_tensor, + const std::vector& im2col_op_params) { + // get params + const int64_t KH = im2col_op_params[0]; + const int64_t KW = im2col_op_params[1]; + const int64_t IW = im2col_op_params[2]; + const int64_t IC = im2col_op_params[3]; + const int64_t N = im2col_op_params[4]; + const int64_t OH = im2col_op_params[5]; + const int64_t OW = im2col_op_params[6]; + const int64_t s0 = im2col_op_params[7]; + const int64_t p0 = im2col_op_params[8]; + const int64_t d0 = im2col_op_params[9]; + const int64_t n_bytes_factor = im2col_op_params[10]; + + // Permute: [N, IC * KH * KW, OW * OH] -> + // [N, OW * OH * n_bytes_factor, IC * KH * KW] + aclTensor* tmp_permute_tensor = nullptr; + ggml_cann_pool_alloc tmp_permute_allocator(ctx.pool()); + tmp_permute_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor); + void* tmp_permute_buffer = tmp_permute_allocator.get(); + + int64_t tmp_permute_ne[] = {IC * KH * KW, OW * OH * n_bytes_factor, N}; + 
size_t tmp_permute_nb[GGML_MAX_DIMS - 1]; + tmp_permute_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { + tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; + } + + tmp_permute_tensor = ggml_cann_create_tensor( + tmp_permute_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_permute_ne, tmp_permute_nb, + GGML_MAX_DIMS - 1, ACL_FORMAT_ND); + + int64_t permute_dim[] = {0, 2, 1}; + if (src1->type != dst->type) { + aclnn_permute(ctx, tmp_cast_tensor, tmp_permute_tensor, permute_dim, 3); + } else { + aclnn_permute(ctx, tmp_im2col_tensor, tmp_permute_tensor, permute_dim, + 3); + } + + // number of times the kernel moves in W dimension + const int n_step_w = (IW + 2 * p0 - d0 * (KW - 1) - 1) / s0 + 1; + size_t offset; + void *cur_dst_buffer = dst->data, *cur_permute_buffer = tmp_permute_buffer; + + // memory copy with offset to restore 1D im2col from 2d + if (IC > 1) { + offset = IC * KH * KW * n_step_w * ggml_type_size(dst->type); + size_t size_cpy = KH * KW * ggml_type_size(dst->type); + + for (int c = 0; c < IC; c++) { + cur_permute_buffer = (char*)tmp_permute_buffer + offset + + KH * KW * c * ggml_type_size(dst->type); + cur_dst_buffer = (char*)dst->data + + c * KH * KW * n_step_w * ggml_type_size(dst->type); + + for (int i = 0; i < n_step_w; i++) { + ACL_CHECK(aclrtMemcpyAsync( + cur_dst_buffer, size_cpy, cur_permute_buffer, size_cpy, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + cur_dst_buffer = + (char*)cur_dst_buffer + KH * KW * ggml_type_size(dst->type); + cur_permute_buffer = (char*)cur_permute_buffer + + KH * KW * IC * ggml_type_size(dst->type); + } + } + } else { + offset = KH * KW * n_step_w * + ggml_type_size(dst->type); // equal to ggml_nbytes(dst) + ACL_CHECK(aclrtMemcpyAsync(dst->data, offset, + (char*)tmp_permute_buffer + offset, offset, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + } + + // release + ACL_CHECK(aclDestroyTensor(tmp_permute_tensor)); +} + +void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // kernel + ggml_tensor* src1 = dst->src[1]; // input + + GGML_TENSOR_BINARY_OP_LOCALS; + + // aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D + // im2col and do post-processing to restore it to 1D. + const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1; + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = is_2D ? ((const int32_t*)(dst->op_params))[1] : 1; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = is_2D ? ((const int32_t*)(dst->op_params))[3] : 1; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = is_2D ? ((const int32_t*)(dst->op_params))[5] : 1; + + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t KH = ne01; + const int64_t KW = ne00; + const int64_t IW = ne10; + + const int64_t OH = is_2D ? ne2 : 1; + const int64_t OW = ne1; + + // memory allocated increased to 3x when is_2D == false + const int64_t n_bytes_factor = is_2D ? 1 : 3; + + // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH * n_bytes_factor] + aclTensor* acl_src1 = ggml_cann_create_tensor(src1); + int64_t tmp_im2col_ne[] = {OW * OH * n_bytes_factor, IC * KH * KW, N}; + size_t tmp_im2col_nb[GGML_MAX_DIMS - 1]; + + tmp_im2col_nb[0] = ggml_type_size(src1->type); + for (int i = 1; i < GGML_MAX_DIMS - 1; i++) { + tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1]; + } + + // Calculate im2col. 
+    // If dst is f16, tmp_buffer is f32: allocate src type size *
+    // dst element count bytes.
+    ggml_cann_pool_alloc im2col_allocator(
+        ctx.pool(),
+        ggml_nelements(dst) * ggml_element_size(src1) * n_bytes_factor);
+    void* tmp_im2col_buffer = im2col_allocator.get();
+
+    aclTensor* tmp_im2col_tensor = ggml_cann_create_tensor(
+        tmp_im2col_buffer, ggml_cann_type_mapping(src1->type),
+        ggml_type_size(src1->type), tmp_im2col_ne, tmp_im2col_nb,
+        GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+
+    std::vector<int64_t> kernel_dims = {KH, KW};
+    std::vector<int64_t> dilation_size = {d1, d0};
+    std::vector<int64_t> padding_dims = {p1, p0};
+    std::vector<int64_t> stride_dims = {s1, s0};
+    auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+    auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
+    auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
+    auto* strides = aclCreateIntArray(stride_dims.data(), 2);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations,
+                                          paddings, strides, tmp_im2col_tensor,
+                                          &workspaceSize, &executor));
+
+    ggml_cann_pool_alloc workspace_allocator(ctx.pool());
+    if (workspaceSize > 0) {
+        workspace_allocator.alloc(workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(
+        aclnnIm2col(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+    // Cast if dst is f16.
+    aclTensor* tmp_cast_tensor = nullptr;
+    ggml_cann_pool_alloc tmp_cast_allocator(ctx.pool());
+    void* tmp_cast_buffer = nullptr;
+    if (src1->type != dst->type) {
+        tmp_cast_allocator.alloc(ggml_nbytes(dst) * n_bytes_factor);
+        tmp_cast_buffer = tmp_cast_allocator.get();
+        size_t temp_cast_nb[GGML_MAX_DIMS - 1];
+        temp_cast_nb[0] = ggml_type_size(dst->type);
+        for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+            temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
+        }
+
+        tmp_cast_tensor = ggml_cann_create_tensor(
+            tmp_cast_buffer, ggml_cann_type_mapping(dst->type),
+            ggml_type_size(dst->type), tmp_im2col_ne, temp_cast_nb,
+            GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+        aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor,
+                   ggml_cann_type_mapping(dst->type));
+    }
+
+    // post-processing
+    if (is_2D) {
+        ggml_cann_im2col_2d_post_process(ctx, dst, src1, tmp_cast_tensor,
+                                         tmp_im2col_tensor);
+    } else {
+        std::vector<int64_t> im2col_op_params = {
+            KH, KW, IW, IC, N, OH, OW, s0, p0, d0, n_bytes_factor};
+        ggml_cann_im2col_1d_post_process(ctx, dst, src1, tmp_cast_tensor,
+                                         tmp_im2col_tensor, im2col_op_params);
+    }
+
+    // release
+    ACL_CHECK(aclDestroyTensor(acl_src1));
+    ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
+    ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
+    ACL_CHECK(aclDestroyIntArray(kernel_size));
+    ACL_CHECK(aclDestroyIntArray(dilations));
+    ACL_CHECK(aclDestroyIntArray(paddings));
+    ACL_CHECK(aclDestroyIntArray(strides));
+}
+
+/**
+ * @brief Applies element-wise exponential function to the elements of a tensor.
+ *
+ * This function computes the exponential of each element in the source tensor
+ * `acl_src` and stores the result back into the same tensor.
+ * The operation is defined as:
+ * \f[
+ * \text{acl\_src}_i = e^{\text{acl\_src}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src The tensor on which the exponential function will be applied.
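+ *
+ * @note In-place usage sketch, assuming `acl_t` is an existing floating-point
+ * aclTensor (illustrative name):
+ * @code
+ *   aclnn_exp(ctx, acl_t);  // acl_t[i] <- e^(acl_t[i])
+ * @endcode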
+ */ +static void aclnn_exp(ggml_backend_cann_context& ctx, aclTensor* acl_src) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK( + aclnnInplaceExpGetWorkspaceSize(acl_src, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnInplaceExp(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +/** + * @brief Multiplies elements of a tensor by a scalar value, optionally + * in-place. + * + * This function multiplies each element of the source tensor `acl_src` by the + * scalar `scale` and stores the result in the destination tensor `acl_dst`. If + * `inplace` is true, `acl_dst` will not be used and the operation is performed + * in-place on `acl_src`. + * The operation is defined as: + * \f[ + * \text {acl_dst }_i=\text {acl_src }_i \times \text {scale} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be multiplied. + * @param scale The scalar value by which each element of `acl_src` will be + * multiplied. + * @param acl_dst The destination tensor where the result will be stored if + * `inplace` is false. + * @param inplace Flag indicating whether to perform the operation in-place on + * `acl_src`. + */ +static void aclnn_muls(ggml_backend_cann_context& ctx, aclTensor* acl_src, + float scale, aclTensor* acl_dst, bool inplace) { + aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + if (inplace) { + ACL_CHECK(aclnnInplaceMulsGetWorkspaceSize(acl_src, acl_scale, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnInplaceMuls(workspaceAddr, workspaceSize, executor, + ctx.stream())); + } else { + ACL_CHECK(aclnnMulsGetWorkspaceSize(acl_src, acl_scale, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnMuls(workspaceAddr, workspaceSize, executor, ctx.stream())); + } + + ACL_CHECK(aclDestroyScalar(acl_scale)); +} + +/** + * @brief Performs an in-place element-wise multiplication of two tensors. + * + * This function performs an element-wise multiplication of the tensors + * `acl_src` and `acl_other` and stores the result in `acl_src`. + * The operation is defined as: + * \f[ + * \text {acl_src }_i=\text {acl_src }_i \times \text {acl_other }_i + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor where the multiplication result will be + * stored. + * @param acl_other The tensor whose elements will be multiplied with `acl_src`. 
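+ *
+ * @note In-place usage sketch, assuming both tensors are pre-created and
+ * shape-compatible (illustrative names):
+ * @code
+ *   aclnn_inplace_mul(ctx, acl_a, acl_b);  // acl_a[i] *= acl_b[i]
+ * @endcode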
+ */ +static void aclnn_inplace_mul(ggml_backend_cann_context& ctx, + aclTensor* acl_src, aclTensor* acl_other) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceMulGetWorkspaceSize(acl_src, acl_other, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnInplaceMul(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +/** + * @brief Performs element-wise multiplication of two tensors and stores the + * result in a destination tensor. + * + * This function performs element-wise multiplication of the tensors `acl_src` + * and `acl_other` and stores the result in the destination tensor `acl_dst`. + * The operation is defined as: + * \f[ + * \text {acl_dst }_i=\text {acl_src }_i \times \text {acl_other }_i + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The first tensor for element-wise multiplication. + * @param acl_other The second tensor for element-wise multiplication. + * @param acl_dst The destination tensor where the result will be stored. + */ +static void aclnn_mul(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_other, aclTensor* acl_dst) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMulGetWorkspaceSize(acl_src, acl_other, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnMul(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +/** + * @brief Applies element-wise cosine function to the elements of a tensor. + * + * This function computes the cosine of each element in the source tensor + * `acl_src` and stores the result in the destination tensor `acl_dst`. The + * operation is defined as: \f[ \text {acl_dst }_i=\cos \left(\text {acl_src + * }_i\right) \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor on which the cosine function will be + * applied. + * @param acl_dst The destination tensor where the cosine results will be + * stored. + */ +static void aclnn_cos(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK( + aclnnCosGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnCos(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +/** + * @brief Applies element-wise sine function to the elements of a tensor. + * + * This function computes the sine of each element in the source tensor + `acl_src` + * and stores the result in the destination tensor `acl_dst`. + * The operation is defined as: + * \f[ + * \text {acl_dst }_i=\sin \left(\text {acl_src }_i\right) + * \f] + + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor on which the sine function will be applied. + * @param acl_dst The destination tensor where the sine results will be stored. 
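+ *
+ * @note aclnn_cos() and aclnn_sin() share the same calling pattern; a sketch
+ * with pre-created tensors of matching shape (illustrative names):
+ * @code
+ *   aclnn_cos(ctx, acl_arg, acl_cos_out);  // acl_cos_out[i] = cos(acl_arg[i])
+ *   aclnn_sin(ctx, acl_arg, acl_sin_out);  // acl_sin_out[i] = sin(acl_arg[i])
+ * @endcode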
+ */
+static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                      aclTensor* acl_dst) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+
+    ACL_CHECK(
+        aclnnSinGetWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
+}
+
+/**
+ * @brief Performs element-wise division of tensor1 by tensor2, multiplies the
+ * result by the scalar value, and adds it to self in place.
+ *
+ * The operation is defined as:
+ * \f[
+ * \text{out}_i = \text{self}_i + \text{value} \times
+ * \frac{\text{tensor1}_i}{\text{tensor2}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_self The tensor to which the scaled quotient is added in place.
+ * @param tensor1 Numerator tensor.
+ * @param tensor2 Denominator tensor.
+ * @param value The scalar coefficient applied to the quotient.
+ */
+static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
+                                  aclTensor* acl_self, aclTensor* tensor1,
+                                  aclTensor* tensor2, float value) {
+    uint64_t workspaceSize = 0;
+    aclOpExecutor* executor;
+    void* workspaceAddr = nullptr;
+    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
+
+    ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
+        acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
+    if (workspaceSize > 0) {
+        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
+        workspaceAddr = workspace_allocator.get();
+    }
+
+    ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
+                                  ctx.stream()));
+}
+
+/**
+ * @brief Element-wise tensor division, optionally in-place.
+ *
+ * This function divides each element of the source tensor `acl_src` by the
+ * corresponding element of `acl_other` and stores the result in the destination
+ * tensor `acl_dst`. If `inplace` is true, `acl_dst` is not used and the
+ * operation is performed in-place on `acl_src`. The operation is defined as:
+ * \f[
+ * \text{dst}_i = \frac{\text{acl\_src}_i}{\text{acl\_other}_i}
+ * \f]
+ *
+ * @param ctx The context for the CANN backend operations.
+ * @param acl_src Numerator tensor.
+ * @param acl_other Denominator tensor.
+ * @param acl_dst The destination tensor where the result will be stored if
+ * `inplace` is false.
+ * @param inplace Flag indicating whether to perform the operation in-place on
+ * `acl_src`.
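+ *
+ * @note Sketch of the out-of-place form, assuming all three tensors are
+ * pre-created and shape-compatible (illustrative names):
+ * @code
+ *   aclnn_div_tensor(ctx, acl_num, acl_den, acl_quot, false);  // quot = num / den
+ * @endcode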
+ */ +static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_other, aclTensor* acl_dst, + bool inplace) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + if (inplace) { + ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor, + ctx.stream())); + } else { + ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream())); + } +} + +void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { + const ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const int dim = dst->op_params[0]; + const int max_period = dst->op_params[1]; + int half = dim / 2; + + aclTensor* acl_src = ggml_cann_create_tensor(src); + + // arange: [0, ..., half) + float start = 0; + float stop = half; + float step = 1; + int64_t n_elements_arange = half; + int64_t tmp_arange_ne[] = {half}; + size_t tmp_arange_nb[] = {sizeof(dst->type)}; + + ggml_cann_pool_alloc arange_allocator(ctx.pool(), half * sizeof(dst->type)); + void* tmp_arange_buffer = arange_allocator.get(); + aclTensor* tmp_arange_tensor = ggml_cann_create_tensor( + tmp_arange_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_arange_ne, tmp_arange_nb, + GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + + aclnn_arange(ctx, tmp_arange_tensor, start, stop, step, n_elements_arange); + + // freq + float freq_param = -logf(max_period) / half; + bool inplace = true; + aclnn_muls(ctx, tmp_arange_tensor, freq_param, nullptr, inplace); + aclnn_exp(ctx, tmp_arange_tensor); + + // permute: src [0,1,2,3]->[0,1,3,2] + int64_t tmp_permute_ne[] = {src->ne[1], src->ne[0], src->ne[2], src->ne[3]}; + size_t tmp_permute_nb[GGML_MAX_DIMS]; + tmp_permute_nb[0] = ggml_type_size(src->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_permute_nb[i] = tmp_permute_nb[i - 1] * tmp_permute_ne[i - 1]; + } + + ggml_cann_pool_alloc permute_allocator(ctx.pool(), ggml_nbytes(src)); + void* tmp_permute_buffer = permute_allocator.get(); + aclTensor* tmp_permute_tenosr = ggml_cann_create_tensor( + tmp_permute_buffer, ggml_cann_type_mapping(src->type), + ggml_type_size(src->type), tmp_permute_ne, tmp_permute_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); + int64_t permute_dim[] = {0, 1, 3, 2}; + int64_t num_dims = 4; + aclnn_permute(ctx, acl_src, tmp_permute_tenosr, permute_dim, num_dims); + + // timestep * freq + int64_t tmp_mul_ne[] = {src->ne[1] * half, src->ne[0], src->ne[2], + src->ne[3]}; + size_t tmp_mul_nb[GGML_MAX_DIMS]; + tmp_mul_nb[0] = ggml_type_size(src->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_mul_nb[i] = tmp_mul_nb[i - 1] * tmp_mul_ne[i - 1]; + } + + int mul_nelements = + src->ne[1] * half * src->ne[0] * src->ne[2] * src->ne[3]; + + ggml_cann_pool_alloc mul_allocator( + ctx.pool(), mul_nelements * ggml_type_size(src->type)); + void* tmp_mul_buffer = mul_allocator.get(); + aclTensor* tmp_mul_tensor = ggml_cann_create_tensor( + tmp_mul_buffer, 
ggml_cann_type_mapping(src->type), + ggml_type_size(src->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, + ACL_FORMAT_ND); + aclnn_mul(ctx, tmp_permute_tenosr, tmp_arange_tensor, tmp_mul_tensor); + + // cos + ggml_cann_pool_alloc cos_allocator( + ctx.pool(), mul_nelements * ggml_type_size(src->type)); + void* tmp_cos_buffer = cos_allocator.get(); + aclTensor* tmp_cos_tensor = ggml_cann_create_tensor( + tmp_cos_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, + ACL_FORMAT_ND); + + aclnn_cos(ctx, tmp_mul_tensor, tmp_cos_tensor); + + // sin + ggml_cann_pool_alloc sin_allocator( + ctx.pool(), mul_nelements * ggml_type_size(src->type)); + void* tmp_sin_buffer = sin_allocator.get(); + aclTensor* tmp_sin_tensor = ggml_cann_create_tensor( + tmp_sin_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_mul_ne, tmp_mul_nb, GGML_MAX_DIMS, + ACL_FORMAT_ND); + + aclnn_sin(ctx, tmp_mul_tensor, tmp_sin_tensor); + + // concat + int64_t concat_dim = 3; + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + aclTensor* tensors[] = {tmp_cos_tensor, tmp_sin_tensor}; + aclTensorList* tensorList = aclCreateTensorList(tensors, 2); + aclnn_concat(ctx, tensorList, acl_dst, concat_dim); + + // release + // segmentation fault when delete both tensorList and his elements. + ACL_CHECK(aclDestroyTensorList(tensorList)); + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(tmp_arange_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_permute_tenosr)); + ACL_CHECK(aclDestroyTensor(tmp_mul_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +/** + * @brief Fills a tensor with a scalar value. + * + * This function fills the destination tensor `acl_dst` with the scalar value + * `scalar`. + * + * @param ctx The context for the CANN backend operations. + * @param scalar The scalar value used to fill the tensor. + * @param acl_dst The destination tensor to be filled with the scalar value. + */ +static void aclnn_fill_scalar(ggml_backend_cann_context& ctx, float scalar, + aclTensor* acl_dst) { + auto acl_scalar = aclCreateScalar(&scalar, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceFillScalarGetWorkspaceSize( + acl_dst, acl_scalar, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnInplaceFillScalar(workspaceAddr, workspaceSize, executor, + ctx.stream())); + ACL_CHECK(aclDestroyScalar(acl_scalar)); +} + +/** + * @brief Raises each element of a tensor to the power of the corresponding + * element in another tensor. + * + * This function computes the element-wise power of the destination tensor + * `acl_dst` raised to the power of the exponent tensor `acl_exp`. + * The operation is defined as: + * \f[ + * \text {acl_dst }_i=acl\_dst_i^{\text {acl_exp }_i} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_dst The destination tensor, which also serves as the base tensor. + * @param acl_exp The exponent tensor, each element of which is used to raise + * the corresponding element in the destination tensor. 
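+ *
+ * @note In-place usage sketch, assuming `acl_base` and `acl_exp` are
+ * pre-created tensors of matching shape (illustrative names):
+ * @code
+ *   // acl_base[i] = pow(acl_base[i], acl_exp[i])
+ *   aclnn_pow_tensor_tensor(ctx, acl_base, acl_exp);
+ * @endcode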
+ */ +static void aclnn_pow_tensor_tensor(ggml_backend_cann_context& ctx, + aclTensor* acl_dst, aclTensor* acl_exp) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplacePowTensorTensorGetWorkspaceSize( + acl_dst, acl_exp, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnInplacePowTensorTensor(workspaceAddr, workspaceSize, + executor, ctx.stream())); +} + +/** + * @brief Applies the Alibi (Attention with Linear Biases) mechanism to the + * @details This function implements the Alibi mechanism, which introduces + * learnable biases into the attention scores to simulate relative + * position encoding without the need for explicit positional + * embeddings. + * + * @param ctx The backend CANN context for executing operations. + * @param acl_src The source tensor representing the query or key. + * @param acl_position The position tensor containing relative positions. + * @param acl_dst The destination tensor where the result will be stored. + * @param n_head The number of attention heads. + * @param src_ne The dimensions of the source tensor. + * @param src_nb0 The byte size of the first dimension of the source + tensor. + * @param max_bias The maximum bias value used in the Alibi mechanism. + * @param dst The destination tensor object for additional metadata. + * + * The function performs the following steps: + * 1. Calculates the logarithm floor of the number of heads to determine the + base for bias calculation. + * 2. Initializes arrays with arithmetic sequences and fills them with bias + values. + * 3. Computes the bias tensor based on the calculated biases and arithmetic + sequences. + * 4. Reshapes the bias tensor to match the dimensions of the input tensors. + * 5. Multiplies the position tensor by the bias tensor. + * 6. Adds the result of the multiplication to the source tensor to produce the + final output. 
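+ *
+ * @note Illustrative example: with n_head = 8, max_bias = 8.0f and a single
+ *       batch (src_ne[3] == 1), n_heads_log2_floor is 8, m0 is
+ *       2^(-8/8) = 0.5, and the resulting per-head slopes are
+ *       0.5^1, 0.5^2, ..., 0.5^8.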
+ */ +static void aclnn_alibi(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_position, aclTensor* acl_dst, + const int n_head, int64_t* src_ne, const size_t src_nb0, + float max_bias, ggml_tensor* dst) { + const int64_t ne2_ne3 = src_ne[2] * src_ne[3]; + GGML_ASSERT(src_nb0 == sizeof(float)); + GGML_ASSERT(n_head == src_ne[2]); + + const int n_heads_log2_floor = 1u << (uint32_t)floor(log2(n_head)); + + float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor); + float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor); + + // init arange + ggml_cann_pool_alloc arange_allocator(ctx.pool(), + ne2_ne3 * ggml_type_size(dst->type)); + void* tmp_arange_buffer = arange_allocator.get(); + + // arange1: [1, ..., n_heads_log2_floor+1) + float start = 1; + float stop = n_heads_log2_floor + 1; + float step = 1; + int64_t n_elements_arange = n_heads_log2_floor; + + int64_t tmp_arange1_ne[] = {n_heads_log2_floor}; + size_t tmp_arange1_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_arange1_tensor = ggml_cann_create_tensor( + tmp_arange_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_arange1_ne, tmp_arange1_nb, + GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + + aclnn_arange(ctx, tmp_arange1_tensor, start, stop, step, n_elements_arange); + + aclTensor* tmp_arange2_tensor = nullptr; + if (n_heads_log2_floor < ne2_ne3) { + // arange2: [1, ..., 2 * (k - n_heads_log2_floor) + 1) + start = 1; + stop = 2 * (ne2_ne3 - n_heads_log2_floor) + 1; + step = 2; + n_elements_arange = ne2_ne3 - n_heads_log2_floor; + int64_t tmp_arange2_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t tmp_arange2_nb[] = {sizeof(dst->type)}; + + aclTensor* tmp_arange2_tensor = ggml_cann_create_tensor( + (char*)tmp_arange_buffer + + n_heads_log2_floor * ggml_type_size(dst->type), + ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), + tmp_arange2_ne, tmp_arange2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_arange(ctx, tmp_arange2_tensor, start, stop, step, + n_elements_arange); + } + + // init mk_base + ggml_cann_pool_alloc mk_base_allocator(ctx.pool(), + ne2_ne3 * ggml_type_size(dst->type)); + void* tmp_mk_base_buffer = mk_base_allocator.get(); + int64_t tmp_mk_base1_ne[] = {n_heads_log2_floor}; + size_t tmp_mk_base1_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_mk_base1_tensor = ggml_cann_create_tensor( + tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_mk_base1_ne, tmp_mk_base1_nb, + GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + + aclnn_fill_scalar(ctx, m0, tmp_mk_base1_tensor); + + aclTensor* tmp_mk_base2_tensor = nullptr; + if (n_heads_log2_floor < ne2_ne3) { + int64_t tmp_mk_base2_ne[] = {ne2_ne3 - n_heads_log2_floor}; + size_t tmp_mk_base2_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_mk_base2_tensor = ggml_cann_create_tensor( + (char*)tmp_mk_base_buffer + + n_heads_log2_floor * ggml_type_size(dst->type), + ggml_cann_type_mapping(dst->type), ggml_type_size(dst->type), + tmp_mk_base2_ne, tmp_mk_base2_nb, GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_fill_scalar(ctx, m1, tmp_mk_base2_tensor); + } + + // init mk + int64_t tmp_mk_base_ne[] = {ne2_ne3}; + size_t tmp_mk_base_nb[] = {sizeof(dst->type)}; + aclTensor* tmp_mk_base_tensor = ggml_cann_create_tensor( + tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_mk_base_ne, tmp_mk_base_nb, + GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclTensor* tmp_arange_tensor = ggml_cann_create_tensor( + tmp_arange_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), 
tmp_mk_base_ne, tmp_mk_base_nb, + GGML_MAX_DIMS - 3, ACL_FORMAT_ND); + aclnn_pow_tensor_tensor(ctx, tmp_mk_base_tensor, tmp_arange_tensor); + + // reshape mk + int64_t tmp_mk_ne[] = {1, 1, src_ne[2], src_ne[3]}; + size_t tmp_mk_nb[GGML_MAX_DIMS]; + tmp_mk_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_mk_nb[i] = tmp_mk_nb[i - 1] * tmp_mk_ne[i - 1]; + } + aclTensor* tmp_mk_tensor = ggml_cann_create_tensor( + tmp_mk_base_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_mk_ne, tmp_mk_nb, GGML_MAX_DIMS, + ACL_FORMAT_ND); + + // acl_position * mk + int64_t tmp_output_ne[] = {src_ne[0], src_ne[1], src_ne[2], src_ne[3]}; + size_t tmp_output_nb[GGML_MAX_DIMS]; + tmp_output_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_output_nb[i] = tmp_output_nb[i - 1] * tmp_output_ne[i - 1]; + } + ggml_cann_pool_alloc output_allocator(ctx.pool(), ggml_nbytes(dst)); + void* tmp_output_buffer = output_allocator.get(); + aclTensor* tmp_output_tensor = ggml_cann_create_tensor( + tmp_output_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), tmp_output_ne, tmp_output_nb, GGML_MAX_DIMS, + ACL_FORMAT_ND); + aclnn_mul(ctx, acl_position, tmp_mk_tensor, tmp_output_tensor); + + // add + aclnn_add(ctx, tmp_output_tensor, acl_src, acl_dst); + + ACL_CHECK(aclDestroyTensor(tmp_arange1_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_arange2_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_base1_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_base2_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_base_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_arange_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mk_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_output_tensor)); +} + +void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_cann_dup(ctx, dst); +} + +/** + * @brief Performs element-wise addition of two tensors in place. + * + * This function adds the source tensor `acl_src` to the destination tensor + * `acl_dst` element-wise and stores the result in the destination tensor + * `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor to be added. + * @param acl_dst The destination tensor which will hold the result of the + * addition. + */ +static void aclnn_inplace_add(ggml_backend_cann_context& ctx, + aclTensor* acl_src, aclTensor* acl_dst) { + aclScalar* alpha = nullptr; + float alphaValue = 1.0f; + alpha = aclCreateScalar(&alphaValue, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceAddGetWorkspaceSize(acl_dst, acl_src, alpha, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnInplaceAdd(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyScalar(alpha)); +} + +/** + * @brief Applies the softmax function to a tensor along a specified dimension. + * + * This function computes the softmax of the source tensor `acl_src` along the + * specified dimension `dim` and stores the result in the destination tensor + * `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor on which the softmax function will be + * applied. + * @param dim The dimension along which the softmax function will be computed. 
+ * @param acl_dst The destination tensor where the softmax results will be + * stored. + */ +static void aclnn_softmax(ggml_backend_cann_context& ctx, aclTensor* acl_src, + int64_t dim, aclTensor* acl_dst) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnSoftmaxGetWorkspaceSize(acl_src, dim, acl_dst, + &workspaceSize, &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + aclrtStream stream = ctx.stream(); + ACL_CHECK(aclnnSoftmax(workspaceAddr, workspaceSize, executor, stream)); +} + +void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; // mask + + aclTensor* acl_src0 = ggml_cann_create_tensor(src0); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + float scale = 1.0f; + float max_bias = 0.0f; + + memcpy(&scale, (float*)dst->op_params + 0, sizeof(float)); + memcpy(&max_bias, (float*)dst->op_params + 1, sizeof(float)); + + // input mul scale + aclScalar* acl_scale = aclCreateScalar(&scale, aclDataType::ACL_FLOAT); + + size_t n_bytes = ggml_nbytes(src0); + ggml_cann_pool_alloc mul_scale_allocator(ctx.pool(), n_bytes); + void* input_mul_scale_buffer = mul_scale_allocator.get(); + aclTensor* acl_input_mul_scale_tensor = ggml_cann_create_tensor( + input_mul_scale_buffer, ACL_FLOAT, ggml_type_size(src0->type), src0->ne, + src0->nb, GGML_MAX_DIMS); + + bool inplace = false; + aclnn_muls(ctx, acl_src0, scale, acl_input_mul_scale_tensor, inplace); + + // mask + aclTensor* acl_src1_fp32_tensor = nullptr; + aclTensor* tmp_mask_tensor = nullptr; + ggml_cann_pool_alloc src1_fp32_allocator(ctx.pool()); + if (src1) { + const bool use_f16 = src1->type == GGML_TYPE_F16; + if (use_f16) { + // cast to fp32 + size_t n_bytes = ggml_nelements(src1) * sizeof(float_t); + size_t src1_fp32_nb[GGML_MAX_DIMS]; + src1_fp32_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src1_fp32_nb[i] = src1_fp32_nb[i - 1] * src1->ne[i - 1]; + } + src1_fp32_allocator.alloc(n_bytes); + void* src1_fp32_buffer = src1_fp32_allocator.get(); + acl_src1_fp32_tensor = ggml_cann_create_tensor( + src1_fp32_buffer, ACL_FLOAT, sizeof(float), src1->ne, + src1_fp32_nb, GGML_MAX_DIMS); + aclTensor* acl_src1 = ggml_cann_create_tensor(src1); + aclnn_cast(ctx, acl_src1, acl_src1_fp32_tensor, ACL_FLOAT); + + ACL_CHECK(aclDestroyTensor(acl_src1)); + } else { + acl_src1_fp32_tensor = ggml_cann_create_tensor(src1); + } + + // broadcast the mask across rows, only use ne11 of ne01 in mask + if (src1->ne[1] != src0->ne[1]) { + // mask shape: [1,1,ne11,ne10] + int64_t tmp_mask_ne[] = {src0->ne[0], src0->ne[1], 1, 1}; + size_t tmp_mask_nb[GGML_MAX_DIMS]; + tmp_mask_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + tmp_mask_nb[i] = tmp_mask_nb[i - 1] * tmp_mask_ne[i - 1]; + } + tmp_mask_tensor = ggml_cann_create_tensor( + src1->data, ACL_FLOAT, sizeof(float), tmp_mask_ne, tmp_mask_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); + } + + // alibi + const int n_head = src0->ne[2]; + const size_t src_nb0 = src0->nb[0]; + + n_bytes = ggml_nbytes(dst); + ggml_cann_pool_alloc output_allocator(ctx.pool(), n_bytes); + void* output_buffer = output_allocator.get(); + aclTensor* alibi_output_tensor = ggml_cann_create_tensor( + output_buffer, ACL_FLOAT, ggml_type_size(dst->type), dst->ne, + dst->nb, GGML_MAX_DIMS); + if (max_bias <= 0.0f) { + // slope = 1.0 + if 
(tmp_mask_tensor) { + aclnn_add(ctx, tmp_mask_tensor, acl_input_mul_scale_tensor, + alibi_output_tensor); + } else { + aclnn_add(ctx, acl_src1_fp32_tensor, acl_input_mul_scale_tensor, + alibi_output_tensor); + } + } else { + // slope != 1.0 + if (tmp_mask_tensor) { + aclnn_alibi(ctx, acl_input_mul_scale_tensor, tmp_mask_tensor, + alibi_output_tensor, n_head, src0->ne, src_nb0, + max_bias, dst); + } else { + aclnn_alibi(ctx, acl_input_mul_scale_tensor, + acl_src1_fp32_tensor, alibi_output_tensor, n_head, + src0->ne, src_nb0, max_bias, dst); + } + } + + // softmax + aclnn_softmax(ctx, alibi_output_tensor, 3, acl_dst); + ACL_CHECK(aclDestroyTensor(alibi_output_tensor)); + } else { + aclnn_softmax(ctx, acl_input_mul_scale_tensor, 3, acl_dst); + } + + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_src1_fp32_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); + ACL_CHECK(aclDestroyScalar(acl_scale)); + ACL_CHECK(aclDestroyTensor(acl_input_mul_scale_tensor)); + ACL_CHECK(aclDestroyTensor(tmp_mask_tensor)); +} + +void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + + ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); + ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); + ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); + src0->extra = src0_extra_allocator.get(); + src1->extra = src1_extra_allocator.get(); + dst->extra = dst_extra_allocator.get(); + ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0, + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); + ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1, + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); + ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, + sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, + ctx.stream())); + + switch (src0->type) { + case GGML_TYPE_F32: { +#ifdef ASCEND_310P + // Special operation for get_row_f32 kernel of 310P: clear the + // content of dest data buffer when row is not aligned to 32 bytes + if ((src0->ne[0] % 8) != 0) { + size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * + src0->ne[0] * ggml_type_size(GGML_TYPE_F32); + ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + } +#endif + aclrtlaunch_ascendc_get_row_f32( + 24, ctx.stream(), src0->data, src1->data, dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src0->extra)->nb, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + break; + } + case GGML_TYPE_F16: { +#ifdef ASCEND_310P + // Special operation for get_row_f16 kernel of 310P: clear the + // content of dest data buffer when row is not aligned to 32 bytes + if ((src0->ne[0] % 16) != 0) { + size_t dst_len = + src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * + ggml_type_size( + GGML_TYPE_F32); // out is also f32, even input is f16 + ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + } +#endif + aclrtlaunch_ascendc_get_row_f16( + 24, ctx.stream(), src0->data, src1->data, dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src0->extra)->nb, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + break; + } + case GGML_TYPE_Q4_0: + aclrtlaunch_ascendc_get_row_q4_0( + 24, ctx.stream(), src0->data, src1->data, 
dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + break; + case GGML_TYPE_Q8_0: + aclrtlaunch_ascendc_get_row_q8_0( + 24, ctx.stream(), src0->data, src1->data, dst->data, + ((ggml_tensor*)src0->extra)->ne, + ((ggml_tensor*)src1->extra)->ne, + ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, + ((ggml_tensor*)dst->extra)->nb); + break; + default: + GGML_ABORT("fatal error"); + break; + } +} + +/** + * @brief Repeats elements of a tensor along a specified dimension. + * + * This function repeats each element of the source tensor `acl_src` a specified + * number of times (`repeats`) along the specified dimension `dim` and stores + * the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be repeated. + * @param acl_dst The destination tensor where the repeated elements will be + * stored. + * @param dim The dimension along which the elements will be repeated. + * @param repeats The number of times each element will be repeated. + * @param output_size The size of the output tensor. + */ +static void aclnn_repeat_interleave(ggml_backend_cann_context& ctx, + aclTensor* acl_src, aclTensor* acl_dst, + int64_t dim, int64_t repeats, + int64_t output_size) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnRepeatInterleaveIntWithDimGetWorkspaceSize( + acl_src, repeats, dim, output_size, acl_dst, &workspaceSize, + &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnRepeatInterleaveIntWithDim(workspaceAddr, workspaceSize, + executor, ctx.stream())); +} + +/** + * @brief Performs matrix multiplication of two tensors. + * + * This function computes the matrix multiplication of the input tensor + * `acl_input` and the weight tensor `acl_weight`, and stores the result in the + * destination tensor `acl_dst`. + * The operation is defined as: + * \f[ + * \text {acl_dst}=\text {acl_input@acl_weight} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_input The input tensor for the matrix multiplication. + * @param acl_weight The weight tensor for the matrix multiplication. + * @param acl_dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input, + aclTensor* acl_weight, aclTensor* acl_dst) { + int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is + // fp32, atlas a2 will transpose it to HFLOAT32. + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMatmulGetWorkspaceSize(acl_input, acl_weight, acl_dst, + cube_math_type, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +/** + * @brief Performs matrix multiplication of two 2D tensors. + * + * This function computes the matrix multiplication of the input tensor + * `acl_input` and the weight tensor `acl_weight`, and stores the result in the + * destination tensor `acl_dst`. 
+ * The operation is defined as: + * \f[ + * \text {acl_dst}=\text {acl_input@acl_weight} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_input The input tensor for the matrix multiplication. + * @param acl_weight The weight tensor for the matrix multiplication. + * @param acl_dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx, + aclTensor* acl_input, aclTensor* acl_weight, + aclTensor* acl_dst) { + int8_t cube_math_type = 2; + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst, + cube_math_type, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +/** + * @brief Performs matrix multiplication of two 3D tensors. + * + * This function computes the matrix multiplication of the input tensor + * `acl_input` and the weight tensor `acl_weight`, and stores the result in the + * destination tensor `acl_dst`. + * The operation is defined as: + * \f[ + * \text {acl_dst}=\text {acl_input@acl_weight} + * \f] + * + * @param ctx The context for the CANN backend operations. + * @param acl_input The input tensor for the matrix multiplication. + * @param acl_weight The weight tensor for the matrix multiplication. + * @param acl_dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx, + aclTensor* acl_input, aclTensor* acl_weight, + aclTensor* acl_dst) { + int8_t cube_math_type = 2; + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst, + cube_math_type, &workspaceSize, + &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK( + aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + +/** + * @brief Performs matrix multiplication with floating-point precision on + * tensors using the CANN backend. + * + * This function performs matrix multiplication of the input tensor and the + * weight tensor, handling broadcasting and transposing as needed, and stores + * the result in the destination tensor `dst`. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx, + ggml_tensor* dst) { + ggml_tensor* weight = dst->src[0]; // weight + ggml_tensor* input = dst->src[1]; // input + + // when weight ne2 or ne3 is 1, aclnnMatmulGetWorkspaceSize will auto + // broadcast, when weight ne2 or ne3 is not 1, weight need repeat. 
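+    // Dispatch sketch: BCAST_MUL_MAT_SHAPE expands both operands into the
+    // bcast_* arrays used below; aclnnMm is chosen when every batch dim
+    // (ne2, ne3) reduces to 1, aclnnBatchMatMul when both ne3 are 1 and only
+    // the weight keeps a real ne2, and the generic aclnnMatmul path covers
+    // the remaining broadcast cases.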
+ BCAST_MUL_MAT_SHAPE(input, weight, dst); + + int64_t n_dims = bcast_dims; + if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) { + if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) { + n_dims = 2; + } else if (bcast_input_ne[2] == 1) { + n_dims = 3; + } + } + + aclTensor* acl_input_tensor = + ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims); + int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0], + bcast_weight_ne[2], bcast_weight_ne[3], + bcast_weight_ne[4], bcast_weight_ne[5]}; + size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0], + bcast_weight_nb[2], bcast_weight_nb[3], + bcast_weight_nb[4], bcast_weight_nb[5]}; + aclTensor* acl_weight_tensor = + ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims); + aclTensor* acl_dst = + ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims); + + switch (n_dims) { + case 2: + aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + case 3: + aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + default: + aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst); + break; + } + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +/** + * @brief Performs matrix multiplication with quantized weights and + * floating-point inputs using the CANN backend. + * + * This function performs matrix multiplication of the input tensor `src1` and + * the weight tensor `src0`, handling broadcasting, transposing, and + * quantization as needed, and stores the result in the destination tensor + * `dst`. + * + * @param ctx The context for the CANN backend operations. + * @param dst The destination tensor where the result of the matrix + * multiplication will be stored. + */ +static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, + ggml_tensor* dst, + const enum ggml_type type) { + ggml_tensor* src0 = dst->src[0]; // weight + ggml_tensor* src1 = dst->src[1]; // input + + // The shape of the weight is NCHW. + // Matrix multiplication uses HW dims. + // HC is regarded as batch. + // weight need transpose. + float weight_elem_size; + if (type == GGML_TYPE_Q4_0) { + weight_elem_size = float(sizeof(uint8_t)) / 2; + } else if (type == GGML_TYPE_Q8_0) { + weight_elem_size = float(sizeof(uint8_t)); + } else { + GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT"); + } + float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size}; + size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size; + size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3]; + + // scale stored at the end of weight. Also need transpose. 
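+    // Layout recap: the packed quants take weight_stride bytes per ne2/ne3
+    // batch (weight_size bytes in total), and the fp16 scales, one per QK8_0
+    // block along ne0, are stored directly after them, which is why
+    // scale_offset starts at src0->data + weight_size.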
+ size_t scale_elem_size = sizeof(uint16_t); + size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size, + scale_elem_size}; + size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size; + char* scale_offset = (char*)src0->data + weight_size; + + // input + size_t input_elem_size = sizeof(uint16_t); + int64_t input_ne[] = {src1->ne[0], src1->ne[1]}; + size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size}; + size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size; + ggml_cann_pool_alloc input_alloctor(ctx.pool()); + void* input_buffer = src1->data; + + // case in + if (src1->type != GGML_TYPE_F16) { + aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1); + input_buffer = + input_alloctor.alloc(ggml_nelements(src1) * input_elem_size); + + int64_t* input_cast_ne = src1->ne; + size_t input_cast_nb[GGML_MAX_DIMS]; + input_cast_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_cast_nb[i] = input_cast_nb[i - 1] * input_cast_ne[i - 1]; + } + + aclTensor* acl_input_tensor = ggml_cann_create_tensor( + input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne, + input_cast_nb, GGML_MAX_DIMS); + aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16); + + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src1_tensor)); + } + + // output + size_t output_elem_size = sizeof(uint16_t); + size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size}; + ggml_cann_pool_alloc output_allocator(ctx.pool()); + void* output_buffer = + output_allocator.alloc(ggml_nelements(dst) * output_elem_size); + size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size; + + // aclnn + int64_t max_elem_size = 65535; + int64_t split_size = (src0->ne[1] / max_elem_size) + 1; + ggml_cann_pool_alloc workspace_allocator(ctx.pool()); + aclOpExecutor* executor = nullptr; + uint64_t workspaceSize = 0; + void* workspaceAddr = nullptr; + for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) { + for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) { + int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]); + int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]); + + int64_t batch1 = (n1 * src1->ne[2]) + c1; + int64_t batch0 = (n0 * src0->ne[2]) + c0; + + aclTensor* acl_input_tensor = ggml_cann_create_tensor( + (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16, + input_elem_size, input_ne, input_nb, 2); + + // first split + int64_t weight_ne_offset = 0; + int64_t weight_ne[2] = { + max_elem_size > src0->ne[1] ? 
src0->ne[1] : max_elem_size, + src0->ne[0]}; + int64_t scale_ne_offset = 0; + int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0}; + int64_t output_ne_offset = 0; + int64_t output_ne[2] = {weight_ne[0], dst->ne[1]}; + + aclTensor* acl_weight_tensor = ggml_cann_create_tensor( + (char*)src0->data + batch0 * weight_stride, + ggml_cann_type_mapping(type), weight_elem_size, weight_ne, + weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset); + aclTensor* acl_scale_tensor = ggml_cann_create_tensor( + scale_offset + batch0 * scale_stride, ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND, + scale_ne_offset); + aclTensor* acl_output_tensor = ggml_cann_create_tensor( + (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, + output_ne_offset); + + ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( + acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, + nullptr, nullptr, nullptr, QK8_0, acl_output_tensor, + &workspaceSize, &executor)); + if (workspaceAddr == nullptr) { + workspaceAddr = workspace_allocator.alloc(workspaceSize); + } + ACL_CHECK(aclnnWeightQuantBatchMatmulV2( + workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + + // other splits + for (int64_t split = 1; split < split_size; split++) { + weight_ne_offset += + weight_elem_size * weight_ne[0] * weight_ne[1]; + weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1] + ? src0->ne[1] - (max_elem_size * split) + : max_elem_size; + scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1]; + scale_ne[0] = weight_ne[0]; + output_ne_offset += + output_elem_size * output_ne[0] * output_ne[1]; + output_ne[0] = weight_ne[0]; + + acl_weight_tensor = ggml_cann_create_tensor( + (char*)src0->data + batch0 * weight_stride, + ggml_cann_type_mapping(type), weight_elem_size, weight_ne, + weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset); + acl_scale_tensor = ggml_cann_create_tensor( + scale_offset + batch0 * scale_stride, ACL_FLOAT16, + scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND, + scale_ne_offset); + acl_output_tensor = ggml_cann_create_tensor( + (char*)output_buffer + batch1 * output_stride, ACL_FLOAT16, + output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND, + output_ne_offset); + + ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( + acl_input_tensor, acl_weight_tensor, acl_scale_tensor, + nullptr, nullptr, nullptr, nullptr, QK8_0, + acl_output_tensor, &workspaceSize, &executor)); + ACL_CHECK(aclnnWeightQuantBatchMatmulV2( + workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_weight_tensor)); + ACL_CHECK(aclDestroyTensor(acl_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + } + + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + } + } + + // cast out + if (dst->type != GGML_TYPE_F16) { + int64_t* output_cast_ne = dst->ne; + size_t output_cast_nb[GGML_MAX_DIMS]; + output_cast_nb[0] = sizeof(uint16_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1]; + } + + aclTensor* acl_output_tensor = ggml_cann_create_tensor( + output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne, + output_cast_nb, GGML_MAX_DIMS); + aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst); + aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, + ggml_cann_type_mapping(dst->type)); 
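+        // The fp16 results written by aclnnWeightQuantBatchMatmulV2 have now
+        // been cast from output_buffer into dst->data in the destination's
+        // own type; only the temporary ACL tensors remain to be released.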
+ + ACL_CHECK(aclDestroyTensor(acl_output_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst_tensor)); + } +} + +void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + const enum ggml_type type = dst->src[0]->type; + switch (type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + ggml_cann_mat_mul_fp(ctx, dst); + break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + ggml_cann_mul_mat_quant(ctx, dst, type); + break; + default: + GGML_ABORT("fatal error"); + break; + } +} + +/** + * @brief Rolls the elements of a tensor along a specified dimension. + * + * This function rolls the elements of the source tensor `acl_src` by the + * specified shifts `shifts` along the specified dimensions `dims`, and stores + * the result in the destination tensor `acl_dst`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be rolled. + * @param acl_dst The destination tensor where the rolled elements will be + * stored. + * @param shifts An array specifying the number of positions by which elements + * are shifted. + * @param dims An array specifying the dimensions along which elements are + * shifted. + */ +static void aclnn_roll(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, int64_t* shifts, int64_t* dims) { + aclIntArray* acl_shifts = aclCreateIntArray(shifts, 1); + aclIntArray* acl_dims = aclCreateIntArray(dims, 1); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnRollGetWorkspaceSize(acl_src, acl_shifts, acl_dims, acl_dst, + &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnRoll(workspaceAddr, workspaceSize, executor, ctx.stream())); + + ACL_CHECK(aclDestroyIntArray(acl_shifts)); + ACL_CHECK(aclDestroyIntArray(acl_dims)); +} + +/** + * @brief Fills specified positions of a tensor with a scalar value. + * + * This function fills the positions in the source tensor `acl_src` specified by + * `index` along the dimension `dim` with the scalar value `value`. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor where the positions will be filled. + * @param dim The dimension along which the positions are specified. + * @param index An array specifying the positions to be filled. + * @param index_num The number of positions specified in the index array. + * @param value The scalar value used to fill the specified positions. 
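+ *
+ * @note Illustrative example: with dim = 3, index = {0, 2} and
+ *       value = -1.0f, the elements at positions 0 and 2 along dimension 3
+ *       of acl_src are overwritten with -1.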
+ */ +static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx, + aclTensor* acl_src, int64_t dim, + int64_t* index, int64_t index_num, + float value) { + aclIntArray* acl_index = aclCreateIntArray(index, index_num); + aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnInplaceIndexFillTensorGetWorkspaceSize( + acl_src, dim, acl_index, acl_value, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnInplaceIndexFillTensor(workspaceAddr, workspaceSize, + executor, ctx.stream())); + + ACL_CHECK(aclDestroyIntArray(acl_index)); + ACL_CHECK(aclDestroyScalar(acl_value)); +} + +static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst, + aclTensor* acl_cos_repeat_tensor, + aclTensor* acl_sin_repeat_tensor, + float theta_scale, float freq_scale, + float attn_factor, bool is_neox) { + // int sin/cos cache, cache has different repeat method depond on + // @param.is_neox + + ggml_tensor* src0 = dst->src[0]; // input + ggml_tensor* src1 = dst->src[1]; // position + ggml_tensor* src2 = dst->src[2]; // freq_factors + + // arange, [0,1,...,ne0/2] + int64_t arange_length = src0->ne[0] / 2; + ggml_cann_pool_alloc arange_allocator(ctx.pool(), + arange_length * sizeof(float_t)); + void* arange_buffer = arange_allocator.get(); + int64_t arange_ne[] = {arange_length, 1, 1, 1}; + size_t arange_nb[] = {sizeof(float_t), sizeof(float_t), sizeof(float_t), + arange_length * sizeof(float_t)}; + + aclTensor* acl_arange_tensor = + ggml_cann_create_tensor(arange_buffer, ACL_FLOAT, sizeof(float_t), + arange_ne, arange_nb, GGML_MAX_DIMS); + float start = 0; + float step = 1; + float stop = src0->ne[0] / 2; + float n_elements = src0->ne[0] / 2; + aclnn_arange(ctx, acl_arange_tensor, start, stop, step, n_elements); + + // power + // aclnnPowScalarTensor(): @param self is tensor which should be scalar, so + // use aclnn_pow_tensor_tensor() until fixed. 
aclScalar* acl_theta_scale = + // aclCreateScalar(&theta_scale, aclDataType::ACL_FLOAT); + // aclnn_power_scalar_tensor(ctx, acl_theta_scale, acl_arange_tensor, + // acl_power_tensor); + ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(), + arange_length * sizeof(float_t)); + void* theta_scale_buffer = theta_scale_allocator.get(); + aclTensor* acl_theta_scale_tensor = aclnn_values( + ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne, + GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale); + aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor); + + // freq_scale + if (freq_scale != 1) { + aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true); + } + + // freq_factors + if (src2) { + aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor( + src2->data, ggml_cann_type_mapping(src2->type), + ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS); + aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor, + nullptr, true); + ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor)); + } + + // position + GGML_ASSERT(src1->type == GGML_TYPE_I32); + int64_t position_length = src1->ne[0]; + int64_t position_ne[] = {1, position_length, 1, 1}; + size_t position_nb[] = {sizeof(int32_t), sizeof(int32_t), + sizeof(int32_t) * position_length, + sizeof(int32_t) * position_length}; + aclTensor* acl_position_tensor = ggml_cann_create_tensor( + src1->data, ggml_cann_type_mapping(src1->type), + ggml_type_size(src1->type), position_ne, position_nb, GGML_MAX_DIMS); + + // power * position + int64_t theta_length = arange_length * position_length; + ggml_cann_pool_alloc theta_allocator(ctx.pool(), + theta_length * sizeof(float_t)); + void* theta_buffer = theta_allocator.get(); + int64_t theta_ne[] = {arange_length, position_length, 1, 1}; + size_t theta_nb[GGML_MAX_DIMS]; + theta_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + theta_nb[i] = theta_nb[i - 1] * theta_ne[i - 1]; + } + aclTensor* acl_theta_tensor = + ggml_cann_create_tensor(theta_buffer, ACL_FLOAT, sizeof(float_t), + theta_ne, theta_nb, GGML_MAX_DIMS); + aclnn_mul(ctx, acl_position_tensor, acl_theta_scale_tensor, + acl_theta_tensor); + + // permute: [0,1,2,3]->[0,2,1,3] + int64_t permute_ne[] = {arange_length, 1, position_length, 1}; + size_t permute_nb[GGML_MAX_DIMS]; + permute_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + permute_nb[i] = permute_nb[i - 1] * permute_ne[i - 1]; + } + ggml_cann_pool_alloc permute_allocator(ctx.pool(), + theta_length * sizeof(float_t)); + void* permute_buffer = permute_allocator.get(); + aclTensor* acl_permute_tensor = ggml_cann_create_tensor( + permute_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); + int64_t permute_dim[] = {0, 2, 1, 3}; + int64_t num_dims = 4; + aclnn_permute(ctx, acl_theta_tensor, acl_permute_tensor, permute_dim, + num_dims); + + // sin/cos + ggml_cann_pool_alloc sin_allocator(ctx.pool(), + theta_length * sizeof(float_t)); + void* sin_buffer = sin_allocator.get(); + aclTensor* acl_sin_tensor = ggml_cann_create_tensor( + sin_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb, + GGML_MAX_DIMS, ACL_FORMAT_ND); + aclnn_sin(ctx, acl_permute_tensor, acl_sin_tensor); + + ggml_cann_pool_alloc cos_allocator(ctx.pool(), + theta_length * sizeof(float_t)); + void* cos_buffer = cos_allocator.get(); + aclTensor* acl_cos_tensor = ggml_cann_create_tensor( + cos_buffer, ACL_FLOAT, sizeof(float_t), permute_ne, permute_nb, + GGML_MAX_DIMS, 
ACL_FORMAT_ND); + aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor); + + // attn_factor + if (attn_factor != 1) { + aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true); + aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true); + } + + // repeat + if (is_neox) { + int64_t repeatsArray[] = {1, 1, 1, 2}; + aclnn_repeat(ctx, acl_sin_tensor, acl_sin_repeat_tensor, repeatsArray); + aclnn_repeat(ctx, acl_cos_tensor, acl_cos_repeat_tensor, repeatsArray); + } else { + int64_t num_repeats = 2; + int64_t dim = 3; + int64_t output_size = arange_length * num_repeats; + aclnn_repeat_interleave(ctx, acl_sin_tensor, acl_sin_repeat_tensor, dim, + num_repeats, output_size); + aclnn_repeat_interleave(ctx, acl_cos_tensor, acl_cos_repeat_tensor, dim, + num_repeats, output_size); + } + + // release + ACL_CHECK(aclDestroyTensor(acl_arange_tensor)); + ACL_CHECK(aclDestroyTensor(acl_theta_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_position_tensor)); + ACL_CHECK(aclDestroyTensor(acl_theta_tensor)); + ACL_CHECK(aclDestroyTensor(acl_permute_tensor)); + ACL_CHECK(aclDestroyTensor(acl_sin_tensor)); + ACL_CHECK(aclDestroyTensor(acl_cos_tensor)); +} + +#ifdef __cplusplus +extern "C" { +#endif +aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize( + const aclTensor* x, const aclTensor* cos, const aclTensor* sin, + int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize, + aclOpExecutor** executor); +aclnnStatus aclnnRotaryPositionEmbedding(void* workspace, + uint64_t workspaceSize, + aclOpExecutor* executor, + aclrtStream stream); +#ifdef __cplusplus +} +#endif + +void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + // TODO: use ascendc + // Only test with LLAMA model. + ggml_tensor* src0 = dst->src[0]; // input + ggml_tensor* src2 = dst->src[2]; // freq_factors + + // param + float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; + // const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t*)dst->op_params)[1]; + const int mode = ((int32_t*)dst->op_params)[2]; + // const int n_ctx = ((int32_t *) dst->op_params)[3]; + const int n_ctx_orig = ((int32_t*)dst->op_params)[4]; + + GGML_TENSOR_UNARY_OP_LOCALS + + memcpy(&freq_base, (int32_t*)dst->op_params + 5, sizeof(float)); + memcpy(&freq_scale, (int32_t*)dst->op_params + 6, sizeof(float)); + memcpy(&ext_factor, (int32_t*)dst->op_params + 7, sizeof(float)); + memcpy(&attn_factor, (int32_t*)dst->op_params + 8, sizeof(float)); + memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float)); + memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float)); + + // TODO: n_dims <= ne0 + GGML_ASSERT(n_dims == ne0); + GGML_ASSERT(n_dims % 2 == 0); + // TODO: ext_factor != 0 + GGML_ASSERT(ext_factor == 0); + + const float theta_scale = powf(freq_base, -2.0f / n_dims); + + float corr_dims[2]; + ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, + beta_slow, corr_dims); + + const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; + + // init cos/sin cache + ggml_cann_pool_alloc sin_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t)); + ggml_cann_pool_alloc cos_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * sizeof(float_t)); + void* sin_buffer = sin_allocator.get(); + void* cos_buffer = cos_allocator.get(); + + int64_t sin_reshape_ne[4] = {src0->ne[0], 1, src0->ne[2], 1}; + size_t sin_reshape_nb[GGML_MAX_DIMS]; + sin_reshape_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + sin_reshape_nb[i] = sin_reshape_nb[i - 1] * sin_reshape_ne[i - 
1]; + } + aclTensor* acl_sin_reshape_tensor = + ggml_cann_create_tensor(sin_buffer, ACL_FLOAT, sizeof(float_t), + sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); + aclTensor* acl_cos_reshape_tensor = + ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t), + sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS); + aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor, + theta_scale, freq_scale, attn_factor, is_neox); + + aclTensor* acl_src = ggml_cann_create_tensor(src0); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + +#ifdef ASCEND_310P + // Special ROPE operation for 310P + + // roll input + void* input_roll_buffer; + aclTensor* acl_minus_one_tensor; + void* minus_one_scale_buffer = nullptr; + ggml_cann_pool_alloc roll_allocator(ctx.pool(), ggml_nbytes(src0)); + ggml_cann_pool_alloc minus_one_scale_allocator( + ctx.pool(), sizeof(float_t) * src0->ne[0]); + if (!is_neox) { + // roll input: [q0,q1,q2,q3,...] -> [q1,q0,q3,q2,...] + input_roll_buffer = roll_allocator.get(); + int64_t input_roll_ne[4] = {2, src0->ne[1] * (src0->ne[0] / 2), + src0->ne[2], src0->ne[3]}; + size_t input_roll_nb[GGML_MAX_DIMS]; + input_roll_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_roll_nb[i] = input_roll_nb[i - 1] * input_roll_ne[i - 1]; + } + aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( + input_roll_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), input_roll_ne, input_roll_nb, + GGML_MAX_DIMS); + aclTensor* acl_input_tensor = ggml_cann_create_tensor( + src0->data, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), input_roll_ne, input_roll_nb, + GGML_MAX_DIMS); + + int64_t shifts[] = {1}; + int64_t dims[] = {3}; + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); + ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + + // init [-1, 1, -1, 1, ...] + minus_one_scale_buffer = minus_one_scale_allocator.get(); + + int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; + size_t minus_one_nb[GGML_MAX_DIMS]; + minus_one_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; + } + acl_minus_one_tensor = aclnn_values( + ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], + minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); + int64_t dim = 3; + int64_t* index = new int64_t[src0->ne[0]]; + for (int i = 0; i < src0->ne[0]; i++) { + index[i] = i / 2 * 2; + } + int64_t index_num = src0->ne[0]; + float value = -1; + aclnn_index_fill_tensor(ctx, acl_minus_one_tensor, dim, index, + index_num, value); + } else { + // roll input: [q0,q1,q2,...] -> + // [q_half,q_half+1,...,q_end,q0,q1,...q_half-1] + input_roll_buffer = roll_allocator.get(); + aclTensor* acl_input_roll_tensor = ggml_cann_create_tensor( + input_roll_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, src0->nb, GGML_MAX_DIMS); + aclTensor* acl_input_tensor = ggml_cann_create_tensor(src0); + + int64_t shifts[] = {src0->ne[0] / 2}; + int64_t dims[] = {3}; + aclnn_roll(ctx, acl_input_tensor, acl_input_roll_tensor, shifts, dims); + + ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_tensor)); + // init [-1, -1, -1, 1, 1,1,...] 
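+        // The buffer is first filled with 1.0f via aclnn_values(), then the
+        // leading ne0/2 entries are multiplied by -1 below, producing the
+        // [-1, ..., -1, 1, ..., 1] sign mask applied to the rolled input.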
+ minus_one_scale_buffer = minus_one_scale_allocator.get(); + int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1}; + size_t minus_one_nb[GGML_MAX_DIMS]; + minus_one_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1]; + } + acl_minus_one_tensor = aclnn_values( + ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0], + minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1); + // -1 * first half + int64_t first_half_ne[4] = {src0->ne[0] / 2, 1, 1, 1}; + size_t first_half_nb[GGML_MAX_DIMS]; + first_half_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + first_half_nb[i] = first_half_nb[i - 1] * first_half_ne[i - 1]; + } + aclTensor* acl_first_half_tensor = ggml_cann_create_tensor( + minus_one_scale_buffer, ACL_FLOAT, sizeof(float_t), first_half_ne, + first_half_nb, GGML_MAX_DIMS); + bool inplace = true; + float scale = -1; + aclnn_muls(ctx, acl_first_half_tensor, scale, nullptr, inplace); + ACL_CHECK(aclDestroyTensor(acl_first_half_tensor)); + } + + // TODO: n_dims < ne0 + GGML_ASSERT(n_dims == src0->ne[0]); + + // input * scale + ggml_cann_pool_alloc roll_mul_scale_allocator(ctx.pool(), + ggml_nbytes(src0)); + void* input_roll_mul_scale_buffer = roll_mul_scale_allocator.get(); + size_t input_nb[GGML_MAX_DIMS]; + input_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_nb[i] = input_nb[i - 1] * src0->ne[i - 1]; + } + aclTensor* acl_input_roll_mul_scale_tensor = ggml_cann_create_tensor( + input_roll_mul_scale_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); + aclTensor* acl_input_roll_reshape_tensor = ggml_cann_create_tensor( + input_roll_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), src0->ne, input_nb, GGML_MAX_DIMS); + + aclnn_mul(ctx, acl_input_roll_reshape_tensor, acl_minus_one_tensor, + acl_input_roll_mul_scale_tensor); + + // output + void* output_fp32_buffer; + if (src0->type == GGML_TYPE_F32) { + aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor); + aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor, + acl_sin_reshape_tensor); + aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst); + // TODO: ne0 != n_dims in mode2 + } else if (src0->type == GGML_TYPE_F16) { + size_t input_fp32_nb[GGML_MAX_DIMS]; + input_fp32_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + input_fp32_nb[i] = input_fp32_nb[i - 1] * dst->ne[i - 1]; + } + ggml_cann_pool_alloc fp32_allocator1( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + void* input_fp32_buffer1 = fp32_allocator1.get(); + aclTensor* input_fp32_tensor1 = ggml_cann_create_tensor( + input_fp32_buffer1, ACL_FLOAT, sizeof(float_t), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); + ggml_cann_pool_alloc fp32_allocator2( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + void* input_fp32_buffer2 = fp32_allocator2.get(); + aclTensor* input_fp32_tensor2 = ggml_cann_create_tensor( + input_fp32_buffer2, ACL_FLOAT, sizeof(float_t), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); + + ggml_cann_pool_alloc fp32_allocator( + ctx.pool(), ggml_nelements(dst) * sizeof(float_t)); + output_fp32_buffer = fp32_allocator.get(); + aclTensor* output_fp32_tensor = ggml_cann_create_tensor( + output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne, + input_fp32_nb, GGML_MAX_DIMS); + aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1); + aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, 
acl_sin_reshape_tensor, + input_fp32_tensor2); + aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2, + output_fp32_tensor); + aclnn_cast(ctx, output_fp32_tensor, acl_dst, ACL_FLOAT16); + + ACL_CHECK(aclDestroyTensor(input_fp32_tensor1)); + ACL_CHECK(aclDestroyTensor(input_fp32_tensor2)); + ACL_CHECK(aclDestroyTensor(output_fp32_tensor)); + ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor)); + ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_src)); + } + return; +#endif + + // src0 == GGML_TYPE_F16 + // TODO: optimization this `if` code + if (src0->type == GGML_TYPE_F16) { + ggml_cann_pool_alloc sin_final_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type)); + ggml_cann_pool_alloc cos_final_allocator( + ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type)); + void* sin_final_buffer = sin_final_allocator.get(); + void* cos_final_buffer = cos_final_allocator.get(); + + int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1}; + size_t sin_final_nb[GGML_MAX_DIMS]; + sin_final_nb[0] = ggml_type_size(src0->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1]; + } + aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor( + sin_final_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), sin_final_ne, sin_final_nb, + GGML_MAX_DIMS); + aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor( + cos_final_buffer, ggml_cann_type_mapping(src0->type), + ggml_type_size(src0->type), sin_final_ne, sin_final_nb, + GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor, + ggml_cann_type_mapping(src0->type)); + aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor, + ggml_cann_type_mapping(src0->type)); + ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); + acl_sin_reshape_tensor = acl_sin_final_tensor; + acl_cos_reshape_tensor = acl_cos_final_tensor; + } + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + + void* workspaceAddr = nullptr; + + int acl_mode = mode; + if (mode == 0) { + acl_mode = 1; + } + + ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize( + acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode, + acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize, + executor, ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} diff --git a/llama/ggml-cann/aclnn_ops.h b/llama/ggml-cann/aclnn_ops.h new file mode 100644 index 00000000000..7624903454a --- /dev/null +++ b/llama/ggml-cann/aclnn_ops.h @@ -0,0 +1,618 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CANN_ACLNN_OPS +#define CANN_ACLNN_OPS + +/** + * @file acl_tensor + * @brief This file contains related functions of ggml_tensor and acl_tensor. + * Contains conversion from ggml_tensor to acl_tensor, broadcast and other + * functions. + * @author hipudding + * @author wangshuai09 <391746016@qq.com> + * @date July 15, 2024 + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "acl_tensor.h" +#include "common.h" + +/** + * @brief Repeats a ggml tensor along each dimension to match the dimensions + * of another tensor. + * + * @details This function repeats the elements of a source ggml tensor along + * each dimension to create a destination tensor with the specified + * dimensions. The operation is performed using the ACL backend and + * executed asynchronously on the device. + * + * @param ctx The CANN context used for operations. + * @param dst The ggml tensor representing the destination, which op is + * GGML_OP_REPEAT and specifies the desired dimensions. + */ +void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Adds two ggml tensors using the CANN backend. + * + * @details This function performs an element-wise addition of two tensors. 
In + * case the tensors do not have the same shape, one or both tensors + * will be broadcasted to match the shape of the other before the + * addition is performed.The formula for the operation is given by: + * \f[ + * \text{dst} = \text{acl_src0} + \alpha \cdot \text{acl_src1} + * \f] + * + * @param ctx The CANN context used for operations. + * @param dst The ggml tensor representing the destination, result of the + * addition is stored at dst->data, and dst->op is `GGML_OP_ADD` + */ +void ggml_cann_add(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Applies the Leaky ReLU activation function to a tensor using the CANN + * backend. + * + * @details This function computes the Leaky ReLU activation for each element of + * the input tensor. The Leaky ReLU function allows a small gradient + * when the unit is not active (i.e., when the input is negative). The + * Leaky ReLU function is defined as: + * \f[ + * \text{dst} = \max(0, src) + \text{negativeSlope} \cdot \min(0, + * src) + * \f] + * `negativeSlope` is in dst->params. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the result of the Leaky ReLU + * activation is stored, which op is `GGML_OP_LEAKY_RELU` + */ +void ggml_cann_leaky_relu(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Concatenates multiple tensors along a specified dimension using the + * CANN backend. + * + * @param ctx The CANN context used for operations. + * @param tensorList A pointer to the list of tensors to be concatenated. + * @param dst The destination tensor where the result of the + * concatenation is stored. dst->op is `GGML_OP_CONCAT`. + * @param concat_dim The dimension along which the tensors are concatenated. + * + * @attention tensorList length should be 2 and the dimension using for concat + * default to 1. + */ +void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Generates a sequence of evenly spaced values within a specified + * interval for a ggml tensor using the CANN backend. + * + * @details This function creates a sequence of numbers over a specified i + * nterval, starting from `start`, ending before `stop`, and + * incrementing by `step`. The sequence is stored in the destination + * tensor `dst`. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the generated sequence will be stored. + * `start`, 'stop' and 'step' are in dst->op_params and dst->op is + * `GGML_OP_ARANGE`. + */ +void ggml_cann_arange(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Computes the square of the elements of a ggml tensor using the CANN + * backend. + * @details The function sets the second source tensor of the destination + * tensor `dst` to be equal to the first source tensor. This is + * effectively squaring the elements since the multiplication becomes + * `element * element`. + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the squared values will be stored, + * which dst->op is `GGML_OP_SQR`. + */ +void ggml_cann_sqr(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Applies a clamp operation to the elements of a ggml tensor using the + * CANN backend. + * + * @details This function clamps the elements of the input tensor `src` to a + * specified range defined by `min` and `max` values. The result is + * stored in the destination tensor `dst`. 
The operation is defined as: + * \f[ + * y = \max(\min(x, max\_value), min\_value) + * \f] + * where `x` is an element of the input tensor, and `y` is the + * corresponding element in the output tensor. + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the clamped values will be stored. + * dst->op is `GGML_OP_CLAMP`, `min` and `max` value is in dst->params. + */ +void ggml_cann_clamp(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Scales the elements of a ggml tensor by a constant factor using the + * CANN backend. + * + * @details This function multiplies each element of the input tensor `src` by + * a scaling factor `scale`, storing the result in the destination + * tensor `dst`. The operation is defined as: + * \f[ + * dst = src \times scale + * \f] + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the scaled values will be stored. + * dst->op is `GGML_OP_SCALE` and `scale` value is in dst->params. + */ +void ggml_cann_scale(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Sorts the elements of a ggml tensor and returns the indices that + * would sort the tensor using the CANN backend. + * + * @details This function performs an argsort operation on the input tensor + * `src`. It sorts the elements of `src` in either ascending or + * descending order, depending on the `GGML_SORT_ORDER_DESC`, + * and returns the indices that would sort the original tensor. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the sorted indices will be stored. + * dst->op is `GGML_OP_ARGSORT`. + */ +void ggml_cann_argsort(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Computes the Layer Normalization for a ggml tensor using the CANN + * backend. + * + * @details This function applies the Layer Normalization operation on the + * input tensor `src` and stores the result in the destination tensor + * `dst`. Layer Normalization normalizes the features at each sample in + * a mini-batch independently. It is commonly used in neural networks + * to normalize the activations of a layer by adjusting and scaling + * the outputs. + * The operation is defined as: + * \f[ + * \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}} + * \f] + * `Var` defaults dst->ne[0]. `eps` is in dst->params. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the normalized values will be stored. + * @attention `Var` defaults to dst->ne[0]. + */ +void ggml_cann_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Computes the Group Normalization for a ggml tensor using the CANN + * backend. + * + * @brief This function applies the Group Normalization operation on the input + * tensor `src` and stores the result in the destination tensor `dst`. + * Group Normalization divides the channels into groups and normalizes + * the features within each group across spatial locations. + * It is commonly used in convolutional neural networks to improve + * training stability and performance. + * The operation is defined as: + * \f[ + * \text { out }=\frac{x-\mathrm{E}[x]}{\sqrt{\text{Var}[x]+eps}} + * \f] + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the normalized values will be stored. + * `n_groups` is in dst->params, which split C channel to `n_groups`. + * dst->op is `GGML_OP_GROUP_NORM`. 
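As a point of reference for the normalization formula quoted above, a plain CPU sketch of per-row normalization over the last dimension could look as follows. It is for intuition only and is not the aclnn-based path; the row length n and eps stand in for dst->ne[0] and the epsilon read from dst->op_params.

// Reference sketch only: out = (x - E[x]) / sqrt(Var[x] + eps) over one row.
#include <cmath>
#include <cstddef>

void layer_norm_row(const float* x, float* out, std::size_t n, float eps) {
    float mean = 0.0f;
    for (std::size_t i = 0; i < n; ++i) mean += x[i];
    mean /= (float)n;

    float var = 0.0f;
    for (std::size_t i = 0; i < n; ++i) var += (x[i] - mean) * (x[i] - mean);
    var /= (float)n;

    const float inv = 1.0f / std::sqrt(var + eps);
    for (std::size_t i = 0; i < n; ++i) out[i] = (x[i] - mean) * inv;
}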
+ * + * @attention eps defaults to 1e-6f. + */ +void ggml_cann_group_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Computes the accumulation of tensors using the CANN backend. + * + * @details This function performs an accumulation operation on two tensors. + * Depending on the `inplace` flag, it either updates the destination + * tensor `dst` in place by adding `alpha * src1` to it, or it creates + * a new tensor as the result of `src0 + alpha * src1` and stores it in + * `dst`. + * The operation is defined as: + * \f[ + * dst = src0 + alpha \times src1 + * \f] + * if `inplace` is `true`, `src0` is equal to 'dst'. + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the accumulated values will be stored. + * `inplace` is in dst->params, and dst->op is `GGML_OP_ACC`. + */ +void ggml_cann_acc(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Computes the sum of elements along the last dimension of a ggml tensor + * using the CANN backend. + * + * @details This function performs a reduction sum operation along the last + * dimension of the input tensor `src`. The result of the sum is stored + * in the destination tensor `dst`. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the reduced values will be stored。 + * dst->op is `GGML_OP_SUM_ROWS`. + * + * @attention `reduce_dims` defaults to 3, which means the last dimension. + */ +void ggml_cann_sum_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Upsamples a ggml tensor using nearest neighbor interpolation using + * the CANN backend. + * + * @details This function performs upsampling of the input tensor `src` using + * nearest neighbor interpolation. The upsampling is applied to the + * height and width dimensions (last two dimensions) of the tensor. The + * result is stored in the destination tensor `dst`, which must have + * the appropriate dimensions for the upsampled output. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the upsampled values will be stored. + * dst->op is `GGML_OP_UPSCALE`. + */ +void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx, + ggml_tensor* dst); + +/** + * @brief Pads a ggml tensor to match the dimensions of the destination tensor + * using the CANN backend. + * + * @details This function pads the input tensor `src` so that it matches the + * dimensions of the destination tensor `dst`. The amount of padding + * is calculated based on the difference in sizes between `src` and + * `dst` along each dimension. The padded tensor is stored in `dst`. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor, which specifies the target dimensions for + * padding. dst->op is `GGML_OP_PAD`. + */ +void ggml_cann_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Executes a 2D pooling operation on a ggml tensor using the CANN + * backend. + * + * @details This function dispatches the execution of a 2D pooling operation on + * the input tensor `dst`. The type of pooling (average or max) is + * determined by the `op` parameter, which is read from the operation + * parameters of `dst`. The function supports average pooling + * (`GGML_OP_POOL_AVG`) and max pooling (`GGML_OP_POOL_MAX`). If an + * invalid operation is encountered, the function asserts a failure. + * + * @param ctx The CANN context used for operations. 
+ * @param dst The destination tensor on which the pooling operation is to be + * performed. dst->op is `GGML_OP_POOL_2D`. + */ +void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Duplicates a ggml tensor using the CANN backend. + * + * @details This function duplicates the contents of the source tensor `src` to + * the destination tensor `dst`. The function supports various tensor + * types and configurations, including handling of extra data, type + * conversions, and special cases for contiguous and non-contiguous + * tensors. + * + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the duplicated data will be stored. + * dst->op is `GGML_OP_DUP` + * + * @attention Only support Fp16/FP32. Not support when src and dst have + * different shape and dst is no-contiguous. + * @note: This func need to simplify. + */ +void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Computes the Root Mean Square (RMS) normalization of a ggml tensor + * using the CANN backend. + * + * @details This function applies RMS normalization to the input tensor `src` + * and stores the result in the destination tensor `dst`. RMS + * normalization involves computing the root mean square of the input + * tensor along a specified dimension and then dividing each element of + * the tensor by this value, adjusted by a small epsilon value to + * prevent division by zero. + * The operation is defined as: + * \f[ + * \text{RmsNorm}\left(x_i\right)=\frac{x_i}{\text{Rms}(\mathbf{x})} g_i, + * \quad \text { where } \text{Rms}(\mathbf{x})=\sqrt{\frac{1}{n} \sum_{i=1}^n x_i^2+e p s} + * \f] + * `eps` is in dst->op_params. + * @param ctx The CANN context used for operations. + * @param dst The destination tensor where the normalized values will be stored. + * dst->op is `GGML_OP_RMS_NORM`. + */ +void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Applies a diagonal mask to the tensor with a specified value. + * + * @details This function creates a mask tensor filled with ones, then applies + * an upper triangular and lower triangular operation to it based on + * the number of past elements specified. Afterward, it adds the masked + * tensor to the destination tensor in-place. + * + * @param ctx The backend CANN context used for operations. + * @param dst The destination tensor where the result will be stored. dst->op is + * `GGML_OP_DIAG_MASK` + * @param value The value to use for masking. + */ +void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value); + +/** + * @brief Performs an image-to-column transformation on the input tensor. + * + * @details This function takes an input tensor and applies an image-to-column + * operation, converting spatial dimensions into column-like + * structures suitable for convolutional operations. It supports both + * half-precision (F16) and single-precision (F32) floating-point data + * types. + * + * @param ctx The backend CANN context for executing operations. + * @param dst The destination tensor that stores the result of the operation. + * dst->op is `GGML_OP_IM2COL`. + */ +void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Computes time step embeddings using sine and cosine functions. 
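The RMS-norm formula quoted in the ggml_cann_rms_norm comment above reduces to dividing each element of a row by the root mean square of that row. A minimal CPU sketch follows, for intuition only; the per-channel gain g_i is left out, since ggml applies it as a separate multiply.

// Reference sketch only: x_i / sqrt(mean(x^2) + eps) over one row of length n.
#include <cmath>
#include <cstddef>

void rms_norm_row(const float* x, float* out, std::size_t n, float eps) {
    float sum_sq = 0.0f;
    for (std::size_t i = 0; i < n; ++i) sum_sq += x[i] * x[i];
    const float rms = std::sqrt(sum_sq / (float)n + eps);
    for (std::size_t i = 0; i < n; ++i) out[i] = x[i] / rms;
}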
+ * + * @details This function calculates time step embeddings by applying sine and + * cosine transformations to a given input tensor, which is typically + * used in temporal models like diffusion models or transformers to + * encode time information effectively. + * + * @param ctx The backend CANN context for executing operations. + * @param dst The destination tensor where the result of the embedding operation + * will be stored. dst->op is `GGML_OP_TIMESTEP_EMBEDDING`. + */ +void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +// @see ggml_cann_dup. +void ggml_cann_cpy(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Computes the softmax activation with optional masking. + * + * @details This function computes the softmax activation over the input tensor, + * optionally applying a mask and scaling factor. It supports both FP16 + * and FP32 data types and can handle masking by broadcasting the mask + * across rows if necessary. + * The function performs the following steps: + * 1. Multiplies the input tensor by a scale factor. + * 2. Optionally casts the mask tensor to FP32 if it is in FP16 format. + * 3. Broadcasts the mask tensor if its dimensions do not match the + * input tensor's dimensions. + * 4. Adds the mask to the scaled input tensor. + * 5. Applies the softmax activation function along the specified + * dimension. + * + * @param ctx The backend CANN context for executing operations. + * @param dst The destination tensor where the result will be stored. dst->op is + * `GGML_OP_SOFTMAX`. + */ +void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Extracts specific rows from a tensor based on indices. + * + * @details This function retrieves rows from a source tensor src0 according to + * the indices provided in another tensor src1 and stores the result in + * a destination tensor (\p dst). It supports different data types + * including F32, F16, Q4_0, and Q8_0. + * + * @param ctx The backend CANN context for executing operations. + * @param dst The destination tensor where the extracted rows will be stored. + * dst->op is `GGML_OP_GET_ROWS`. + */ +void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Executes matrix multiplication for the given tensor. + * + * @details This function performs matrix multiplication on the source tensors + * associated with the destination tensor. It supports matrix + * multiplication F32, F16, and Q8_0. + * + * @param ctx The backend CANN context for executing operations. + * @param dst The destination tensor for storing the result of the matrix + * multiplication. dst->op is `GGML_OP_MUL_MAT`. + */ +void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +/** + * @brief Applies Rotary Positional Embedding (RoPE) to the input tensor. + * + * @details This function implements the RoPE mechanism, which is a method to + * encode positional information into sequence data, particularly + * useful in transformer models. It supports both F32 and F16 data + * types. + * + * @param ctx The backend CANN context for executing operations. + * @param dst The destination tensor where the RoPE-transformed data will be + * stored. dst->op is `GGML_OP_ROPE`. + * + * @note The function currently does not support cases where the n_dims is less + * than the input tensor's first dimension. + * @note The function currently does not support cases where the freq_factors is + * not NULL. 
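For readers who want the arithmetic behind the RoPE operator documented above, the rotation applied to each channel pair is the standard one sketched below. This is an illustrative CPU sketch only; which channels are paired and how the angle theta is derived per position and dimension depend on the mode and frequency parameters handled by the real implementation.

// Illustrative sketch: rotate one channel pair (x0, x1) by angle theta,
// which is what rotary position embedding does for every pair in a head.
#include <cmath>

void rope_rotate_pair(float& x0, float& x1, float theta) {
    const float c = std::cos(theta);
    const float s = std::sin(theta);
    const float r0 = x0 * c - x1 * s;
    const float r1 = x0 * s + x1 * c;
    x0 = r0;
    x1 = r1;
}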
+ * @note The function currently does not support cases where the ext_factor is + * not equal 0. + * @note The function currently does not support cases where the freq_scale is + * not equal 1. + */ +void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst); + +template +void ggml_cann_mul_div(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; + ggml_tensor* src1 = dst->src[1]; + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + aclTensor* acl_src0; + aclTensor* acl_src1; + aclTensor* acl_dst; + + // Need bcast + if (!ggml_are_same_shape(src0, src1) && ggml_cann_need_bcast(src0, src1)) { + BCAST_SHAPE(src0, src1) + acl_src0 = ggml_cann_create_tensor(src0, BCAST_PARAM(src0)); + acl_src1 = ggml_cann_create_tensor(src1, BCAST_PARAM(src1)); + acl_dst = ggml_cann_create_tensor(dst, BCAST_PARAM(src0)); + } else { + acl_src0 = ggml_cann_create_tensor(src0); + acl_src1 = ggml_cann_create_tensor(src1); + acl_dst = ggml_cann_create_tensor(dst); + } + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(getWorkspaceSize(acl_src0, acl_src1, acl_dst, &workspaceSize, + &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(acl_src1)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +// Activation functions template. +template +void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +// Activation functions template for const aclTensors. 
+template +void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src = dst->src[0]; + + GGML_ASSERT(src->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_dst = ggml_cann_create_tensor(dst); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(getWorkspaceSize(acl_src, acl_dst, &workspaceSize, &executor)); + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + aclrtStream main_stream = ctx.stream(); + ACL_CHECK(execute(workspaceAddr, workspaceSize, executor, main_stream)); + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); +} + +#endif // CANN_ACLNN_OPS diff --git a/llama/ggml-cann/common.h b/llama/ggml-cann/common.h new file mode 100644 index 00000000000..eeccae33521 --- /dev/null +++ b/llama/ggml-cann/common.h @@ -0,0 +1,312 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#ifndef CANN_COMMON_H +#define CANN_COMMON_H + +#include + +#include +#include +#include +#include +#include +#include + +#include "llama/ggml-cann.h" +#include "llama/ggml.h" + +#define MATRIX_ROW_PADDING 512 +#define GGML_CANN_MAX_STREAMS 8 + +/** + * @brief Handles CANN-related errors by printing an error message and + * terminating the program. + * @param stmt The statement that caused the error. + * @param func The function in which the error occurred. + * @param file The file in which the error occurred. + * @param line The line number at which the error occurred. + * @param msg The error message. + */ +[[noreturn]] void ggml_cann_error(const char* stmt, const char* func, + const char* file, int line, const char* msg); + +/** + * @brief Checks the result of a CANN function call and invokes the error + * handler if the call fails. + * @param stmt The CANN function call to check. + * @param success The success code that indicates the call was successful. + * @param error_fn The function to call to retrieve the error message. + */ +#define ACL_CHECK_GEN(stmt, success, error_fn) \ + do { \ + int err_code = (stmt); \ + if (err_code != (success)) { \ + ggml_cann_error(#stmt, __func__, __FILE__, __LINE__, error_fn()); \ + } \ + } while (0); + +#define ACL_CHECK(stmt) ACL_CHECK_GEN(stmt, 0, aclGetRecentErrMsg) + +/** + * @brief Contains information about CANN devices. + */ +struct ggml_cann_device_info { + /** + * @brief Number of CANN devices available. + */ + int32_t device_count; + + /** + * @brief Information about a single CANN device. + */ + struct cann_device_info { + int cc; /**< Compute capability. */ + size_t smpb; /**< Maximum shared memory per block. */ + bool vmm; /**< Virtual memory support. */ + size_t vmm_granularity; /**< Granularity of virtual memory. */ + size_t total_vram; /**< Total video RAM available on the device. */ + }; + + cann_device_info devices[GGML_CANN_MAX_DEVICES] = + {}; /**< Array of CANN device information. */ +}; + +const ggml_cann_device_info& ggml_cann_info(); + +void ggml_cann_set_device(int32_t device); +int32_t ggml_cann_get_device(); + +/** + * @brief Abstract base class for memory pools used by CANN. + */ +struct ggml_cann_pool { + /** + * @brief Virtual destructor for the memory pool. + */ + virtual ~ggml_cann_pool() = default; + + /** + * @brief Allocates memory from the pool. + * + * @param size The size of the memory block to allocate. + * @param actual_size Pointer to a variable where the actual allocated size + * will be stored. + * @return Pointer to the allocated memory block. + */ + virtual void* alloc(size_t size, size_t* actual_size) = 0; + + /** + * @brief Frees a previously allocated memory block. + * + * @param ptr Pointer to the memory block to free. + * @param size Size of the memory block to free. + * @note Note that all CANN opertors are running async. Make sure memory is + * still avaiable before this operator finished. + */ + virtual void free(void* ptr, size_t size) = 0; +}; + +/** + * @brief RAII wrapper for managing memory allocations from a CANN memory pool. + */ +struct ggml_cann_pool_alloc { + ggml_cann_pool* pool = nullptr; /**< Pointer to the memory pool. */ + void* ptr = nullptr; /**< Pointer to the allocated memory block. */ + size_t actual_size = 0; /**< Actual size of the allocated memory block. */ + + /** + * @brief Default constructor. + */ + ggml_cann_pool_alloc() = default; + + /** + * @brief Constructor that initializes the memory pool. + * @param pool Reference to the memory pool. 
+ */ + explicit ggml_cann_pool_alloc(ggml_cann_pool& pool) : pool(&pool) {} + + /** + * @brief Constructor that initializes the memory pool and allocates memory. + * @param pool Reference to the memory pool. + * @param size Size of the memory block to allocate. + */ + ggml_cann_pool_alloc(ggml_cann_pool& pool, size_t size) : pool(&pool) { + alloc(size); + } + + /** + * @brief Destructor that frees the allocated memory block. + */ + ~ggml_cann_pool_alloc() { + if (ptr != nullptr) { + pool->free(ptr, actual_size); + } + } + + /** + * @brief Allocates memory from the pool. + * @param size Size of the memory block to allocate. + * @return Pointer to the allocated memory block. + */ + void* alloc(size_t size) { + GGML_ASSERT(pool != nullptr); + GGML_ASSERT(ptr == nullptr); + ptr = pool->alloc(size, &this->actual_size); + return ptr; + } + + /** + * @brief Allocates memory from a specific memory pool. + * @param pool Reference to the memory pool. + * @param size Size of the memory block to allocate. + * @return Pointer to the allocated memory block. + */ + void* alloc(ggml_cann_pool& pool, size_t size) { + this->pool = &pool; + return alloc(size); + } + + /** + * @brief Gets the pointer to the allocated memory block. + * @return Pointer to the allocated memory block. + */ + void* get() { return ptr; } + + // Deleted copy constructor + ggml_cann_pool_alloc(const ggml_cann_pool_alloc&) = delete; + + // Deleted move constructor + ggml_cann_pool_alloc(ggml_cann_pool_alloc&&) = delete; + + // Deleted copy assignment operator + ggml_cann_pool_alloc& operator=(const ggml_cann_pool_alloc&) = delete; + + // Deleted move assignment operator + ggml_cann_pool_alloc& operator=(ggml_cann_pool_alloc&&) = delete; +}; + +/** + * @brief Context for managing CANN backend operations. + */ +struct ggml_backend_cann_context { + int32_t device; /**< Device ID. */ + std::string name; /**< Name of the device. */ + std::string description; /**< Description of the device. */ + aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */ + + aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */ + + /** + * @brief Constructor for initializing the context with a given device. + * @param device Device ID. + */ + explicit ggml_backend_cann_context(int device) + : device(device), name("CANN" + std::to_string(device)) { + ggml_cann_set_device(device); + description = aclrtGetSocName(); + } + + /** + * @brief Destructor for cleaning up resources. + */ + ~ggml_backend_cann_context() { + ggml_cann_set_device(device); + if (copy_event != nullptr) { + ACL_CHECK(aclrtDestroyEvent(copy_event)); + } + for (int i = 0; i < GGML_CANN_MAX_STREAMS; ++i) { + if (streams[i] != nullptr) { + ACL_CHECK(aclrtDestroyStream(streams[i])); + } + } + } + + /** + * @brief Get or create a stream for a given index. + * @param stream Index of the stream. + * @return The stream corresponding to the given index. + */ + aclrtStream stream(int stream) { + if (streams[stream] == nullptr) { + ggml_cann_set_device(device); + ACL_CHECK(aclrtCreateStream(&streams[stream])); + } + return streams[stream]; + } + + /** + * @brief Get or create the default stream (index 0). + * @return The default stream. + */ + aclrtStream stream() { return stream(0); } + + // TODO: each stream should have a memory pool. + std::unique_ptr + mem_pool; /**< Memory pool for the device. */ + + /** + * @brief Create a new memory pool for a given device. + * @param device Device ID. 
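ggml_cann_pool_alloc above is the guard the aclnn helpers use to hold their temporary workspaces: allocate inside a scope and the destructor returns the memory to the pool. A self-contained sketch of the same RAII idea with a toy malloc-backed pool follows; the toy types are stand-ins, not the CANN pool.

// RAII sketch: a toy pool plus a scope guard mirroring ggml_cann_pool_alloc.
#include <cstdlib>
#include <cstddef>
#include <cstdio>

struct toy_pool {
    void* alloc(std::size_t size, std::size_t* actual) {
        *actual = size;
        return std::malloc(size);
    }
    void free(void* p, std::size_t) { std::free(p); }
};

struct toy_pool_guard {
    toy_pool&   pool;
    void*       ptr    = nullptr;
    std::size_t actual = 0;

    toy_pool_guard(toy_pool& p, std::size_t size) : pool(p) {
        ptr = pool.alloc(size, &actual);
    }
    ~toy_pool_guard() { if (ptr) pool.free(ptr, actual); }  // returned on scope exit
    toy_pool_guard(const toy_pool_guard&) = delete;
    toy_pool_guard& operator=(const toy_pool_guard&) = delete;
};

int main() {
    toy_pool pool;
    {
        toy_pool_guard ws(pool, 1024);  // like: ggml_cann_pool_alloc ws(ctx.pool(), n);
        std::printf("got %zu bytes at %p\n", ws.actual, ws.ptr);
    }                                   // freed here
    return 0;
}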
+ * @return A unique pointer to the new memory pool. + */ + static std::unique_ptr new_pool_for_device(int device); + + /** + * @brief Get or create the memory pool for the context. + * @return Reference to the memory pool. + */ + ggml_cann_pool& pool() { + if (mem_pool == nullptr) { + mem_pool = new_pool_for_device(device); + } + return *mem_pool; + } +}; + +#endif // CANN_COMMON_H diff --git a/llama/ggml-cann/ggml-cann.cpp b/llama/ggml-cann/ggml-cann.cpp new file mode 100644 index 00000000000..5b96bf47d99 --- /dev/null +++ b/llama/ggml-cann/ggml-cann.cpp @@ -0,0 +1,2214 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "llama/ggml-cann.h" + +#include +#include + +#include +#include +#include +#include + +#include "llama/ggml-impl.h" +#include "llama/ggml-backend-impl.h" +#include "llama/ggml-cann/aclnn_ops.h" +#include "llama/ggml-cann/common.h" + +#define GGML_COMMON_DECL_C + +#include "llama/ggml-common.h" + +#define GGML_CANN_NAME "CANN" + +/** + * @brief Handles CANN errors by printing an error message and aborting. + * + * @param stmt The statement that caused the error. + * @param func The function in which the error occurred. 
+ * @param file The file in which the error occurred. + * @param line The line number where the error occurred. + * @param msg The error message. + */ +[[noreturn]] void ggml_cann_error(const char* stmt, const char* func, + const char* file, int line, const char* msg) { + int32_t id = -1; + aclrtGetDevice(&id); + + GGML_LOG_ERROR("CANN error: %s\n", msg); + GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, + file, line); + GGML_LOG_ERROR(" %s\n", stmt); + // abort with GGML_ASSERT to get a stack trace + GGML_ABORT("CANN error"); +} + +/** + * @brief Sets the device to be used by CANN. + * + * @param device The device ID to set. + */ +void ggml_cann_set_device(const int32_t device) { + // TODO: uncomment these lines after empty context has fixed. + // int current_device; + // ACL_CHECK(aclrtGetDevice(¤t_device)); + + // if (device == current_device) { + // return; + // } + ACL_CHECK(aclrtSetDevice(device)); +} + +/** + * @brief Retrieves the current device ID. + * + * @return The current device ID. + */ +int32_t ggml_cann_get_device() { + int32_t id; + ACL_CHECK(aclrtGetDevice(&id)); + return id; +} + +/** + * @brief Initialize the CANN device information. + * + * This function initializes the CANN device information by obtaining the + * device count and setting the memory allocation granularity for each device. + * + * @return A structure containing the device information. + */ +static ggml_cann_device_info ggml_cann_init() { + ggml_cann_device_info info = {}; + + aclError err = aclrtGetDeviceCount((uint32_t*)&info.device_count); + + if (err != ACL_SUCCESS) { + GGML_LOG_ERROR("%s: failed to initialize CANN: %s\n", + __func__, aclGetRecentErrMsg()); + return info; + } + + GGML_ASSERT(info.device_count <= GGML_CANN_MAX_DEVICES); + + for (int id = 0; id < info.device_count; ++id) { + aclrtPhysicalMemProp prop = {}; + prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; + prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; + prop.memAttr = ACL_HBM_MEM_HUGE; + prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = id; + prop.reserve = 0; + ACL_CHECK(aclrtMemGetAllocationGranularity( + &prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED, + &info.devices[id].vmm_granularity)); + + size_t free, total; + ggml_backend_cann_get_device_memory(id, &free, &total); + info.devices[id].total_vram = free; + } + + // TODO: add more device info later. + return info; +} + +/** + * @brief Retrieve the CANN device information. + * + * This function returns a reference to a structure containing the CANN device + * information. The device information is initialized once and reused on + * subsequent calls. + * + * @return A reference to the structure containing the device information. + */ +const ggml_cann_device_info& ggml_cann_info() { + static ggml_cann_device_info info = ggml_cann_init(); + return info; +} + +//#define DEBUG_CANN_MALLOC +/** + * @brief A pool of CANN buffers(legacy). + * + * This class manages a pool of CANN buffers for a specific device. + */ +struct ggml_cann_pool_leg : public ggml_cann_pool { + /** + * @brief The maximum number of buffers in the pool. + */ + static const int MAX_BUFFERS = 256; + + /** + * @brief The device ID associated with this buffer pool. + */ + int device; + + /** + * @brief Structure representing a CANN buffer. + */ + struct ggml_cann_buffer { + void* ptr = nullptr; ///< Pointer to the buffer memory. + size_t size = 0; ///< Size of the buffer. + }; + + /** + * @brief Array of CANN buffers in the pool. 
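ggml_cann_info() above relies on a function-local static, so ggml_cann_init() runs exactly once on first use and is thread-safe under C++11 initialization rules. A minimal sketch of that lazy-initialization pattern, with a stand-in init function rather than the real runtime query:

// Sketch of lazy one-time initialization via a function-local static.
#include <cstdio>

struct info_stub { int device_count; };

static info_stub init_once() {
    std::puts("init runs only once");
    return info_stub{2};                   // stand-in for querying the runtime
}

const info_stub& get_info() {
    static info_stub info = init_once();   // constructed on first call only
    return info;
}

int main() {
    std::printf("%d devices\n", get_info().device_count);
    std::printf("%d devices\n", get_info().device_count);   // no second init
    return 0;
}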
+ */ + ggml_cann_buffer buffer_pool[MAX_BUFFERS] = {}; + + /** + * @brief Total size of all buffers in the pool. + */ + size_t pool_size = 0; + + /** + * @brief Constructor to initialize the buffer pool for a specific device. + * + * @param device The device ID to associate with this buffer pool. + */ + explicit ggml_cann_pool_leg(int device) : device(device) {} + + /** + * @brief Destructor to free all buffers in the pool. + */ + ~ggml_cann_pool_leg() { + ggml_cann_set_device(device); + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cann_buffer& b = buffer_pool[i]; + if (b.ptr != nullptr) { + ACL_CHECK(aclrtFree(b.ptr)); + pool_size -= b.size; + } + } + GGML_ASSERT(pool_size == 0); + } + + /** + * @brief Allocate a buffer of the given size. + * + * @param size The size of the buffer to allocate. + * @param actual_size A pointer to a variable to receive the actual size of + * the allocated buffer. + * @return A pointer to the allocated buffer. + */ + void* alloc(size_t size, size_t* actual_size) override { + const size_t alignment = 128; + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } +#ifdef DEBUG_CANN_MALLOC + int nnz = 0; + size_t max_size = 0; +#endif + size_t best_diff = 1ull << 36; + int ibest = -1; + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cann_buffer& b = buffer_pool[i]; + if (b.ptr != nullptr) { +#ifdef DEBUG_CANN_MALLOC + ++nnz; + if (b.size > max_size) max_size = b.size; +#endif + if (b.size >= size) { + size_t diff = b.size - size; + if (diff < best_diff) { + best_diff = diff; + ibest = i; + if (!best_diff) { + void* ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + } + } + } + } + if (ibest >= 0) { + ggml_cann_buffer& b = buffer_pool[ibest]; + void* ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + void* ptr; + ggml_cann_set_device(device); + ACL_CHECK( + aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST)); + *actual_size = size; + pool_size += size; +#ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO( + "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, " + "requested %u MB\n", + __func__, device, nnz, (uint32_t)(max_size / 1024 / 1024), + (uint32_t)(pool_size / 1024 / 1024), + (uint32_t)(size / 1024 / 1024)); +#endif + return ptr; + } + + /** + * @brief Free a buffer and return it to the pool. + * + * @param ptr Pointer to the buffer to free. + * @param size Size of the buffer to free. + */ + void free(void* ptr, size_t size) override { + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cann_buffer& b = buffer_pool[i]; + if (b.ptr == nullptr) { + b.ptr = ptr; + b.size = size; + return; + } + } + // memory should always buffered. these memory may still needed by + // tasks in stream. + // TODO, fix me. + GGML_ABORT("Cann buffer pool full, increase MAX_CANN_BUFFERS\n"); + } +}; + +/** + * @brief A pool of CANN buffers with virtual memory. + * + * This class manages a pool of CANN buffers with virtual memory for a specific + * device. + */ +struct ggml_cann_pool_vmm : public ggml_cann_pool { + /** + * @brief The maximum size of the virtual memory pool (32 GB). + */ + size_t max_size; + + /** + * @brief The device ID associated with this buffer pool. + */ + int device; + + /** + * @brief Pointer to the start of the virtual memory pool. + */ + void* pool_addr = 0; + + /** + * @brief Amount of virtual memory used in the pool. + */ + size_t pool_used = 0; + + /** + * @brief Total size of the virtual memory pool. 
+ */ + size_t pool_size = 0; + + /** + * @brief Allocation granularity for the virtual memory pool. + */ + size_t granularity; + + /** + * @brief Handles for the physical memory allocated. + */ + std::vector handles; + + /** + * @brief Offsets for the mapped memory regions. + */ + std::vector map_offsets; + + /** + * @brief Constructor to initialize the buffer pool with virtual memory for + * a specific device. + * + * @param device The device ID to associate with this buffer pool. + */ + explicit ggml_cann_pool_vmm(int device) + : device(device), + granularity(ggml_cann_info().devices[device].vmm_granularity) { + auto dev = ggml_cann_info().devices[device]; + granularity = dev.vmm_granularity; + max_size = dev.total_vram; + } + + /** + * @brief Destructor to free all buffers in the virtual memory pool. + */ + ~ggml_cann_pool_vmm() { + if (pool_addr != 0) { + for (auto& offset : map_offsets) { + ACL_CHECK(aclrtUnmapMem(offset)); + } + for (auto& handle : handles) { + ACL_CHECK(aclrtFreePhysical(handle)); + } + ACL_CHECK(aclrtReleaseMemAddress(pool_addr)); + } + } + + /** + * @brief Allocate a buffer of the given size in the virtual memory pool. + * + * @param size The size of the buffer to allocate. + * @param actual_size A pointer to a variable to receive the actual size of + * the allocated buffer. + * @return A pointer to the allocated buffer. + */ + void* alloc(size_t size, size_t* actual_size) override { + // round up the allocation size to the alignment to ensure that all + // allocations are aligned for all data types + const size_t alignment = 128; + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } + + size_t avail = pool_size - pool_used; + + if (size > avail) { + // round up to the next multiple of the granularity + size_t reserve_size = size - avail; + reserve_size = GGML_PAD(reserve_size, granularity); + + GGML_ASSERT(pool_size + reserve_size <= max_size); + + // allocate more physical memory + aclrtPhysicalMemProp prop = {}; + prop.handleType = ACL_MEM_HANDLE_TYPE_NONE; + prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED; + prop.memAttr = ACL_HBM_MEM_HUGE; + prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + prop.reserve = 0; + aclrtDrvMemHandle handle; + ACL_CHECK(aclrtMallocPhysical(&handle, reserve_size, &prop, 0)); + + // reserve virtual address space (if not already reserved) + if (pool_addr == 0) { + ACL_CHECK(aclrtReserveMemAddress( + &pool_addr, max_size, 0, NULL, 1)); + } + + // map at the end of the pool + ACL_CHECK(aclrtMapMem((char*)pool_addr + pool_size, reserve_size, 0, + handle, 0)); + + handles.push_back(handle); + map_offsets.push_back((char*)pool_addr + pool_size); + + // add to the pool + pool_size += reserve_size; + +#ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n", + device, (unsigned long long) (pool_size/1024/1024), + (unsigned long long) (reserve_size/1024/1024)); +#endif + } + + GGML_ASSERT(pool_addr != 0); + + void* ptr = (void*)((char*)pool_addr + pool_used); + *actual_size = size; + pool_used += size; + +#ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO("cann pool[%d]: allocated %llu bytes at %llx\n", device, + (unsigned long long)size, (unsigned long long)ptr); +#endif + return ptr; + } + + /** + * @brief Free a buffer and return it to the virtual memory pool. + * + * @param ptr Pointer to the buffer to free. + * @param size Size of the buffer to free. 
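Conceptually, the virtual-memory pool above is a bump allocator over one large reserved address range: pool_used only moves forward on alloc, more physical memory is mapped when it would pass pool_size, and frees are expected in reverse (stack) order. The toy host-side sketch below mirrors that bookkeeping, with a plain heap block standing in for the reserved-and-mapped device range and the backing allocated up front for simplicity.

// Toy bump allocator mirroring the pool_size/pool_used bookkeeping above.
#include <cassert>
#include <cstddef>

struct toy_vmm_pool {
    char*       base;
    std::size_t pool_size;       // backing memory available
    std::size_t pool_used = 0;   // bump pointer

    explicit toy_vmm_pool(std::size_t cap) : base(new char[cap]), pool_size(cap) {}
    ~toy_vmm_pool() { delete[] base; }

    void* alloc(std::size_t size) {
        size = (size + 127) & ~std::size_t(127);   // 128-byte alignment, like GGML_PAD
        assert(pool_used + size <= pool_size);     // the real pool maps more memory here
        void* p = base + pool_used;
        pool_used += size;
        return p;
    }

    void free(void* p, std::size_t size) {
        size = (size + 127) & ~std::size_t(127);
        pool_used -= size;
        assert(p == base + pool_used);             // frees must come in reverse order
    }
};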
+ */ + void free(void* ptr, size_t size) override { +#ifdef DEBUG_CANN_MALLOC + GGML_LOG_INFO("cann pool[%d]: freed %llu bytes at %llx\n", device, + (unsigned long long)size, (unsigned long long)ptr); +#endif + + pool_used -= size; + + // all deallocations must be in reverse order of the allocations + GGML_ASSERT(ptr == (void*)((char*)pool_addr + pool_used)); + } +}; + +/** + * @brief Create a new CANN pool for a specific device. + * + * Factory method to create a new CANN pool object based on the device type. + * + * @param device The device ID for which to create the pool. + * @return A unique pointer to the created CANN pool. + */ +std::unique_ptr ggml_backend_cann_context::new_pool_for_device( + int device) { + return std::unique_ptr(new ggml_cann_pool_vmm(device)); +} + +// cann buffer +/** + * @brief Context for managing a CANN buffer associated with a specific device. + * + * This structure holds information about a CANN buffer, including the device + * ID, device pointer, and a name derived from GGML_CANN_NAME and the device ID. + */ +struct ggml_backend_cann_buffer_context { + int32_t device; ///< The device ID associated with this buffer context. + void* dev_ptr = + nullptr; ///< Pointer to the device memory allocated for the buffer. + + /** + * @brief Constructor to initialize the CANN buffer context. + * + * @param device The device ID associated with this buffer context. + * @param dev_ptr Pointer to the device memory allocated for the buffer. + */ + ggml_backend_cann_buffer_context(int32_t device, void* dev_ptr) + : device(device), + dev_ptr(dev_ptr) {} + + /** + * @brief Destructor to free the device memory allocated for the buffer. + */ + ~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); } +}; + +/** + * @brief Check if a buffer is a CANN buffer. + * + * This function checks if a given buffer is a CANN buffer by comparing its + * `get_name` function pointer to `ggml_backend_cann_buffer_get_name`. + * + * @param buffer The buffer to check. + * @return true if the buffer is a CANN buffer, false otherwise. + */ +static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft); +static bool ggml_backend_buffer_is_cann( + ggml_backend_buffer_t buffer) { + return ggml_backend_buft_is_cann(buffer->buft); +} + +/** + * @brief Free resources associated with a CANN buffer. + * + * This function frees the resources associated with a CANN buffer, including + * its context. + * + * @param buffer The CANN buffer to free. + */ +static void ggml_backend_cann_buffer_free_buffer( + ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + delete ctx; +} + +/** + * @brief Retrieve the base pointer of a CANN buffer. + * + * This function returns the base pointer of a CANN buffer, which points to the + * device memory allocated for the buffer. + * + * @param buffer The CANN buffer whose base pointer is to be retrieved. + * @return A pointer to the base of the device memory allocated for the buffer. + */ +static void* ggml_backend_cann_buffer_get_base( + ggml_backend_buffer_t buffer) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + return ctx->dev_ptr; +} + +/** + * @brief Transform quantized Q4.0 tensor data into a format suitable for CANN + * processing. + * + * This function transforms quantized Q4.0 tensor data into a format suitable + * for CANN processing. 
It extracts quantization values and scales from the + * source data and prepares them in a format expected by CANN operations. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source data in Q4.0 format. + * @param dst Pointer to the destination buffer where transformed data will be + * stored. + */ +static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor, + const void* src, + void* dst) { + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_0; + size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; + + uint8_t* quant_offset = (uint8_t*)dst; + uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes); + + for (int i = 0; i < groups; i++) { + const block_q4_0* group = + (const block_q4_0*)((const char*)src + i * sizeof(block_q4_0)); + *scale_offset = group->d; + scale_offset++; + + // 0-15 + for (int j = 0; j < QK4_0 / 2; j += 2) { + (*quant_offset) = (group->qs[j] & 0x0F); + (*quant_offset) |= ((group->qs[j + 1] << 4)); + quant_offset++; + } + + // 16-31 + for (int j = 0; j < QK4_0 / 2; j += 2) { + (*quant_offset) = (group->qs[j] >> 4); + (*quant_offset) |= (group->qs[j + 1] & 0xF0); + quant_offset++; + } + } + + // put (uint4b_t -8) into int4b_t + for (quant_offset = (uint8_t*)dst; + quant_offset < (uint8_t*)dst + quant_bytes; quant_offset++) { + (*quant_offset) ^= 0x88; + } +} + +/** + * @brief Transform CANN processed data back into quantized Q4.0 format. + * + * This function transforms CANN processed data back into quantized Q4.0 format. + * It reverses the transformation performed by + * ggml_backend_cann_transform_q4_0(), converting the data back into its + * original quantized form. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source buffer containing transformed data. + * @param dst Pointer to the destination buffer where the Q4.0 formatted data + * will be stored. + */ +static void ggml_backend_cann_transform_back_q4_0( + const ggml_tensor* tensor, void* src, void* dst) { + + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK4_0; + size_t quant_bytes = n_elems * sizeof(uint8_t) / 2; + + uint8_t* quant_offset = (uint8_t*)src; + uint16_t* scale_offset = (uint16_t*)((char*)src + quant_bytes); + + for (; quant_offset < (uint8_t*)src + quant_bytes; quant_offset++) { + (*quant_offset) ^= 0x88; + } + quant_offset = (uint8_t*)src; + + for (int i = 0; i < groups; i++) { + block_q4_0* group = (block_q4_0*)((char*)dst + i * sizeof(block_q4_0)); + group->d = *scale_offset; + scale_offset++; + + // 0-15 + for (int j = 0; j < QK4_0 / 2; j += 2) { + group->qs[j] = ((*quant_offset) & 0x0F); + group->qs[j + 1] = ((*quant_offset) >> 4); + quant_offset++; + } + + // 16-31 + for (int j = 0; j < QK4_0 / 2; j += 2) { + group->qs[j] |= ((*quant_offset) << 4); + group->qs[j + 1] |= ((*quant_offset) & 0xF0); + quant_offset++; + } + } +} + +/** + * @brief Transform quantized Q8.0 tensor data into a format suitable for CANN + * processing. + * + * This function transforms quantized Q8.0 tensor data into a format suitable + * for CANN processing. It extracts quantization values and scales from the + * source data and prepares them in a format expected by CANN operations. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source data in Q8.0 format. + * @param dst Pointer to the destination buffer where transformed data will be + * stored. 
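The final loop of the Q4.0 transform above ("put (uint4b_t -8) into int4b_t") works because, for a 4-bit field that stores v + 8, subtracting 8 modulo 16 is the same as flipping the nibble's top bit; XOR with 0x88 does that for both nibbles of a byte at once. A small self-contained check of the single-nibble identity:

// Verifies: for a 4-bit field holding u = v + 8 (v in [-8, 7]),
// (u ^ 0x8) is the two's-complement int4 encoding of v.
#include <cassert>
#include <cstdint>

static int8_t int4_from_bits(uint8_t nibble) {
    // sign-extend a 4-bit two's-complement value
    return (int8_t)((nibble & 0x8) ? (nibble | 0xF0) : nibble);
}

int main() {
    for (int v = -8; v <= 7; ++v) {
        uint8_t u = (uint8_t)(v + 8) & 0x0F;   // Q4_0 stores v + 8 as unsigned
        assert(int4_from_bits(u ^ 0x8) == v);  // XOR 0x8 == subtract 8 (mod 16)
    }
    return 0;
}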
+ */ +static void ggml_backend_cann_transform_q8_0(ggml_tensor* tensor, + const void* src, + void* dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_0; + size_t quant_bytes = n_elems * sizeof(uint8_t); + + uint8_t* quant_offset = (uint8_t*)dst; + uint16_t* scale_offset = (uint16_t*)((char*)dst + quant_bytes); + + for (int i = 0; i < groups; i++) { + const block_q8_0* group = + (const block_q8_0*)((const char*)src + i * sizeof(block_q8_0)); + *scale_offset = group->d; + scale_offset++; + size_t group_quant_size = QK8_0 * sizeof(uint8_t); + memcpy(quant_offset, group->qs, group_quant_size); + quant_offset += group_quant_size; + } +} + +/** + * @brief Transform CANN processed data back into quantized Q8.0 format. + * + * This function transforms CANN processed data back into quantized Q8.0 format. + * It reverses the transformation performed by + * ggml_backend_cann_transform_q8_0(), converting the data back into its + * original quantized form. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source buffer containing transformed data. + * @param dst Pointer to the destination buffer where the Q8.0 formatted data + * will be stored. + */ +static void ggml_backend_cann_transform_back_q8_0( + const ggml_tensor* tensor, const void* src, void* dst) { + int64_t n_elems = ggml_nelements(tensor); + int64_t groups = n_elems / QK8_0; + size_t quant_bytes = n_elems * sizeof(uint8_t); + + const uint8_t* quant_offset = (const uint8_t*)src; + const uint16_t* scale_offset = + (const uint16_t*)((const char*)src + quant_bytes); + + for (int i = 0; i < groups; i++) { + block_q8_0* group = (block_q8_0*)((char*)dst + i * sizeof(block_q8_0)); + group->d = *scale_offset; + scale_offset++; + size_t group_quant_size = QK8_0 * sizeof(uint8_t); + memcpy(group->qs, quant_offset, group_quant_size); + quant_offset += group_quant_size; + } +} + +/** + * @brief Transform tensor data based on its type for CANN processing. + * + * This function transforms tensor data based on its quantization type for CANN + * processing. It dispatches the transformation based on the tensor's type to + * specialized functions handling Q4.0 and Q8.0 formats. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source data to be transformed. + * @param dst Pointer to the destination buffer where transformed data will be + * stored. + */ +static void ggml_backend_cann_transform(ggml_tensor* tensor, + const void* src, void* dst) { + switch (tensor->type) { + case GGML_TYPE_Q4_0: + ggml_backend_cann_transform_q4_0(tensor, src, dst); + break; + case GGML_TYPE_Q8_0: + ggml_backend_cann_transform_q8_0(tensor, src, dst); + break; + default: + break; + } +} + +/** + * @brief Transform CANN processed data back into tensor data based on its type. + * + * This function transforms CANN processed data back into tensor data based on + * its quantization type for Q4.0 and Q8.0 formats. It dispatches the + * transformation based on the tensor's type to specialized functions. + * + * @param tensor Pointer to the tensor information. + * @param src Pointer to the source data containing CANN processed data. + * @param dst Pointer to the destination buffer where transformed tensor data + * will be stored. 
+ */ +static void ggml_backend_cann_transform_back( + const ggml_tensor* tensor, void* src, void* dst) { + switch (tensor->type) { + case GGML_TYPE_Q4_0: + ggml_backend_cann_transform_back_q4_0(tensor, src, dst); + break; + case GGML_TYPE_Q8_0: + ggml_backend_cann_transform_back_q8_0(tensor, src, dst); + break; + default: + break; + } +} + +/** + * @brief Check if transformation is needed for a given tensor type. + * + * This function checks if transformation is needed for a given tensor type + * to prepare data for CANN processing. + * + * @param type The tensor type to check. + * @return true if transformation is needed, false otherwise. + */ +static bool need_transform(ggml_type type) { + switch (type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } +} + +/** + * @brief Initialize a tensor using data from a CANN buffer. + * + * This function initializes a tensor using data from a CANN buffer. + * It handles special cases such as views and quantization. + * + * @param buffer The CANN buffer from which to initialize the tensor. + * @param tensor Pointer to the tensor to be initialized. + */ +static void ggml_backend_cann_buffer_init_tensor( + ggml_backend_buffer_t buffer, ggml_tensor* tensor) { + if (tensor->view_src != NULL && tensor->view_offs == 0) { + GGML_ASSERT(tensor->view_src->buffer->buft == buffer->buft); + return; + } + + // TODO: can backend doesn't support quantized yet. Just leave the code + // here. + if (ggml_is_quantized(tensor->type)) { + // Initialize padding to 0 to avoid possible NaN values + size_t original_size = ggml_nbytes(tensor); + size_t padded_size = + ggml_backend_buft_get_alloc_size(buffer->buft, tensor); + + if (padded_size > original_size && tensor->view_src == nullptr) { + size_t memset_size = padded_size - original_size; + ACL_CHECK(aclrtMemset((char*)tensor->data + original_size, + memset_size, 0, memset_size)); + } + } +} + +// TODO: need handle tensor which has paddings. +/** + * @brief Set tensor data in a CANN buffer. + * + * This function sets tensor data in a CANN buffer, handling transformations + * if needed based on the tensor's type. + * + * @param buffer The CANN buffer where the tensor data will be set. + * @param tensor Pointer to the tensor whose data will be set. + * @param data Pointer to the source data to be copied into the tensor. + * @param offset Offset in the source data from where to start copying. + * @param size Size of the data to be copied, in bytes. + */ +static void ggml_backend_cann_buffer_set_tensor( + ggml_backend_buffer_t buffer, ggml_tensor *tensor, const void *data, + size_t offset, size_t size) { + ggml_backend_cann_buffer_context *ctx = + (ggml_backend_cann_buffer_context *)buffer->context; + + ggml_cann_set_device(ctx->device); + // TODO: refer to cann(#6017), it use thread's default stream. + // For acl, synchronous functions use this default stream. + // Why aclrtSynchronizeDevice? + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size, + ACL_MEMCPY_HOST_TO_DEVICE)); + } else { + void *transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, data, transform_buffer); + + ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, + transform_buffer, size, + ACL_MEMCPY_HOST_TO_DEVICE)); + free(transform_buffer); + } +} + +/** + * @brief Get tensor data from a CANN buffer. + * + * This function retrieves tensor data from a CANN buffer, handling + * transformations if needed based on the tensor's type. 
+ * + * @param buffer The CANN buffer from which to retrieve tensor data. + * @param tensor Pointer to the tensor whose data will be retrieved. + * @param data Pointer to the destination buffer where the tensor data will be + * copied. + * @param offset Offset in the destination buffer where to start copying. + * @param size Size of the data to be copied, in bytes. + */ +static void ggml_backend_cann_buffer_get_tensor( + ggml_backend_buffer_t buffer, const ggml_tensor* tensor, void* data, + size_t offset, size_t size) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + ggml_cann_set_device(ctx->device); + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpy(data, size, (char*)tensor->data + offset, size, + ACL_MEMCPY_DEVICE_TO_HOST)); + } else { + void* transform_buffer = malloc(size); + ACL_CHECK(aclrtMemcpy(transform_buffer, size, + (char*)tensor->data + offset, size, + ACL_MEMCPY_DEVICE_TO_HOST)); + ggml_backend_cann_transform_back(tensor, transform_buffer, data); + free(transform_buffer); + } +} + +/** + * @brief Copy tensor data between CANN buffers if possible. + * + * This function copies tensor data between CANN buffers if the source and + * destination buffers are CANN buffers and they meet the necessary conditions + * (same device or devices can access each other). + * + * @param buffer The destination CANN buffer where the tensor data will be + * copied. + * @param src Pointer to the source tensor whose data will be copied. + * @param dst Pointer to the destination tensor where the data will be copied. + * @return true if the copy operation succeeded, false otherwise. + */ +static bool ggml_backend_cann_buffer_cpy_tensor( + ggml_backend_buffer_t buffer, const ggml_tensor* src, ggml_tensor* dst) { + if (ggml_backend_buffer_is_cann(src->buffer)) { + ggml_backend_cann_buffer_context* src_ctx = + (ggml_backend_cann_buffer_context*)src->buffer->context; + ggml_backend_cann_buffer_context* dst_ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + size_t memcpy_size = ggml_nbytes(src); + // Same device. + if (src_ctx->device == dst_ctx->device) { + ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size, + (const char*)src->data, memcpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE)); + return true; + } else { + // Different device but can access by peer. + int32_t canAccessPeer = 0; + ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, src_ctx->device, + dst_ctx->device)); + if (canAccessPeer) { + ggml_cann_set_device(src_ctx->device); + ACL_CHECK(aclrtDeviceEnablePeerAccess(dst_ctx->device, 0)); + ACL_CHECK(aclrtMemcpy((char*)dst->data, memcpy_size, + (const char*)src->data, memcpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE)); + return true; + } + } + } + return false; +} + +/** + * @brief Clear a CANN buffer by setting all its memory to a specified value. + * + * This function clears a CANN buffer by setting all its memory to a specified + * value. + * + * @param buffer The CANN buffer to be cleared. + * @param value The value to which each byte in the buffer will be set. + */ +static void ggml_backend_cann_buffer_clear( + ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_cann_buffer_context* ctx = + (ggml_backend_cann_buffer_context*)buffer->context; + + ggml_cann_set_device(ctx->device); + ACL_CHECK(aclrtMemset(ctx->dev_ptr, buffer->size, value, buffer->size)); +} + +/** + * @brief Interface for a CANN buffer in the backend. 
+ * + * This structure defines function pointers to operations that can be performed + * on a CANN buffer within the backend. + */ +static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = { + /* .free_buffer = */ ggml_backend_cann_buffer_free_buffer, + /* .get_base = */ ggml_backend_cann_buffer_get_base, + /* .init_tensor = */ ggml_backend_cann_buffer_init_tensor, + /* .memset_tensor = */ NULL, + /* .set_tensor = */ ggml_backend_cann_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cann_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_cann_buffer_cpy_tensor, + /* .clear = */ ggml_backend_cann_buffer_clear, + /* .reset = */ NULL, +}; + +// cann buffer type +/** + * @brief Structure representing context information for a specific backend + * buffer type. + */ +struct ggml_backend_cann_buffer_type_context { + int32_t + device; /**< Device identifier associated with the buffer context. */ + std::string name; /**< Name associated with the buffer context. */ +}; + +/** + * @brief Retrieves the name associated with a CANN buffer type. + * + * This function returns the descriptive name associated with the specified + * CANN buffer type context. + * + * @param buft Pointer to the buffer type context. + * @return Const pointer to the C-style string containing the name. + */ +static const char* ggml_backend_cann_buffer_type_name( + ggml_backend_buffer_type_t buft) { + ggml_backend_cann_buffer_type_context* buft_ctx = + (ggml_backend_cann_buffer_type_context*)buft->context; + + return buft_ctx->name.c_str(); +} + +/** + * @brief Allocates a new CANN buffer of the specified type and size. + * + * This function allocates a new CANN buffer on the specified device with the + * given size. + * + * @param buft Pointer to the buffer type context. + * @param size Size in bytes of the buffer to allocate. + * @return Pointer to the allocated buffer, or nullptr if allocation fails. + */ +static ggml_backend_buffer_t +ggml_backend_cann_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, + size_t size) { + ggml_backend_cann_buffer_type_context* buft_ctx = + (ggml_backend_cann_buffer_type_context*)buft->context; + + ggml_cann_set_device(buft_ctx->device); + + size = std::max(size, (size_t)1); + + void* dev_ptr; + aclError err = aclrtMalloc(&dev_ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); + if (err != ACL_SUCCESS) { + GGML_LOG_ERROR( + "%s: allocating %.2f MiB on device %d: aclrtMalloc failed: %s\n", + __func__, size / 1024.0 / 1024.0, buft_ctx->device, + aclGetRecentErrMsg()); + return nullptr; + } + + ggml_backend_cann_buffer_context* ctx = + new ggml_backend_cann_buffer_context(buft_ctx->device, dev_ptr); + + return ggml_backend_buffer_init(buft, ggml_backend_cann_buffer_interface, + ctx, size); +} + +/** + * @brief Retrieves the memory alignment requirement for CANN buffers of this + * type. + * + * This function returns the alignment requirement in bytes for memory allocated + * by the CANN buffer type. + * + * @param buft Pointer to the buffer type context (unused in this + * implementation). + * @return The alignment requirement in bytes (fixed at 128 bytes for CANN + * buffers). + */ +static size_t ggml_backend_cann_buffer_type_get_alignment( + ggml_backend_buffer_type_t buft) { + return 128; + + GGML_UNUSED(buft); +} + +/** + * @brief Calculates the allocation size required for a tensor in a CANN buffer. + * + * Computes the total allocation size needed for storing the tensor's data in a + * CANN buffer, considering any necessary padding or adjustments for quantized + * types. 
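+ * For example, a Q8_0 tensor whose first dimension is not a multiple of
+ * MATRIX_ROW_PADDING gets one extra partial row of
+ * ggml_row_size(type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) bytes
+ * appended, matching the padding that the buffer init/set paths expect.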
+ * + * @param buft Pointer to the buffer type context (unused in this + * implementation). + * @param tensor Pointer to the tensor for which the allocation size is + * calculated. + * @return The total allocation size in bytes required for the tensor in the + * CANN buffer. + */ +static size_t ggml_backend_cann_buffer_type_get_alloc_size( + ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) { + size_t size = ggml_nbytes(tensor); + int64_t ne0 = tensor->ne[0]; + + // last line must bigger than 32, because every single op deal at + // least 32 bytes. + // TODO: quantized type? + // int64_t line_size = ne0 * ggml_element_size(tensor); + // int64_t line_size_align_32 = (line_size + 31) & ~31; + // size += (line_size_align_32 - line_size); + + // TODO: not support quantized yet. + // TODO: consider un-continue tensor. + if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size( + tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return size; + + GGML_UNUSED(buft); +} + +static bool ggml_backend_cann_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + GGML_UNUSED(buft); +} + +/** + * @brief Interface for managing CANN buffer types in the GGML backend. + * + * Provides function pointers for allocating, querying properties, and managing + * memory for CANN buffer types in the GGML backend. + */ +static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = { + /* .get_name = */ ggml_backend_cann_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size, + /* .is_host = */ ggml_backend_cann_buffer_type_is_host, +}; + +/** + * @brief Retrieves the CANN buffer type for a specified device. + * + * This function initializes and returns the buffer type interface associated + * with the given device. It ensures thread-safe access using a mutex. + * + * @param device The device index for which to retrieve the buffer type. + * @return A pointer to the buffer type interface for the specified device, or + * nullptr if the device index is out of range. + */ +ggml_backend_buffer_type_t +ggml_backend_cann_buffer_type(int32_t device) { + static std::mutex mutex; + std::lock_guard lock(mutex); + + if (device >= ggml_backend_cann_get_device_count()) { + return nullptr; + } + + static ggml_backend_buffer_type + ggml_backend_cann_buffer_types[GGML_CANN_MAX_DEVICES]; + + static bool ggml_backend_cann_buffer_type_initialized = false; + + if (!ggml_backend_cann_buffer_type_initialized) { + for (int32_t i = 0; i < ggml_cann_info().device_count; i++) { + ggml_backend_cann_buffer_types[i] = { + /* .iface = */ ggml_backend_cann_buffer_type_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i), + /* .context = */ + new ggml_backend_cann_buffer_type_context{ + i, "CANN" + std::to_string(i)}, + }; + } + ggml_backend_cann_buffer_type_initialized = true; + } + + return &ggml_backend_cann_buffer_types[device]; +} + +/** + * @brief Retrieves the name associated with a CANN host buffer type. + * + * This function returns the descriptive name associated with the specified + * CANN host buffer type context. + * + * @param buft Pointer to the host buffer type context. + * @return Const pointer to the C-style string containing the name. 
+ */ +static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "CANN_Host"; + + GGML_UNUSED(buft); +} + +/** + * @brief Retrieves the name associated with a CANN host buffer. + * + * This function returns the descriptive name associated with the specified + * CANN host buffer context. + * + * @param buft Pointer to the host buffer context. + * @return Const pointer to the C-style string containing the name. + */ +static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) { + return "CANN_Host"; + + GGML_UNUSED(buffer); +} + +/** + * @brief Free resources associated with a CANN host buffer. + * + * This function frees the resources associated with a CANN host buffer, including + * its context. + * + * @param buffer The CANN host buffer to free. + */ +static void ggml_backend_cann_host_buffer_free(ggml_backend_buffer_t buffer) { + ACL_CHECK(aclrtFreeHost(buffer->context)); +} + +/** + * @brief Allocates a new CANN host buffer of the specified size. + * + * This function allocates a new CANN host buffer with the given size. + * @param size Size in bytes of the host buffer to allocate. + * @return Pointer to the allocated host buffer, or nullptr if allocation fails. + */ +static void * ggml_cann_host_malloc(size_t size) { + if (getenv("GGML_CANN_NO_PINNED") != nullptr) { + return nullptr; + } + + const size_t alignment = 128; + size = GGML_PAD(size, alignment); + if (size == 0) { + size = alignment; + } + + void * hostPtr = nullptr; + aclError err = aclrtMallocHost((void **) &hostPtr, size); + if (err != ACL_SUCCESS) { + GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, + size / 1024.0 / 1024.0, aclGetRecentErrMsg()); + return nullptr; + } + return hostPtr; +} + +/** + * @brief Allocates a new CANN host buffer of the specified type and size. + * + * @param buft Pointer to the host buffer type context. + * @param size Size in bytes of the host buffer to allocate. + * @return Pointer to the allocated host buffer, or CPU buffer pointer if allocation fails. + */ +static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * hostPtr = ggml_cann_host_malloc(size); + + if (hostPtr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free; + + return buffer; +} + +/** + * @brief Interface for managing CANN host buffer types in the GGML backend. + * + * Provides function pointers for allocating, querying properties, and managing + * memory for CANN buffer types in the GGML backend. 
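+ *
+ * The returned buffer type allocates pinned host memory through
+ * ggml_cann_host_malloc (aclrtMallocHost) and transparently falls back to
+ * the regular CPU buffer type when pinning is disabled via
+ * GGML_CANN_NO_PINNED or the pinned allocation fails.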
+ */ +ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_cann_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cann_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cann_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0), + /* .context = */ nullptr, + }; + + return &ggml_backend_cann_buffer_type_host; +} + +/** + * @brief Computes the forward operation for a given tensor using CANN + * operations. + * + * This function selects the appropriate CANN operation based on the type of + * operation specified in the tensor and performs the computation. + * + * @param ctx The CANN context containing necessary resources and + * configurations. + * @param dst The destination tensor where the result of the computation will be + * stored. + * @return true if the computation was successful; false otherwise. + */ +static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx, + struct ggml_tensor* dst) { + switch (dst->op) { + case GGML_OP_REPEAT: + ggml_cann_repeat(ctx, dst); + break; + case GGML_OP_GET_ROWS: + ggml_cann_get_rows(ctx, dst); + break; + case GGML_OP_DUP: + ggml_cann_dup(ctx, dst); + break; + case GGML_OP_ADD: + ggml_cann_add(ctx, dst); + break; + case GGML_OP_ACC: + ggml_cann_acc(ctx, dst); + break; + case GGML_OP_MUL: + ggml_cann_mul_div(ctx, dst); + break; + case GGML_OP_DIV: + ggml_cann_mul_div(ctx, dst); + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_GELU: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_SILU: + ggml_cann_activation( + ctx, dst); + break; + // TODO: Use faster gelu?? 
+ case GGML_UNARY_OP_GELU_QUICK: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_TANH: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_RELU: + ggml_cann_activation( + ctx, dst); + break; + case GGML_UNARY_OP_HARDSIGMOID: + ggml_cann_activation(ctx, dst); + break; + case GGML_UNARY_OP_HARDSWISH: + ggml_cann_activation(ctx, dst); + break; + default: + return false; + } + break; + case GGML_OP_NORM: + ggml_cann_norm(ctx, dst); + break; + case GGML_OP_GROUP_NORM: + ggml_cann_group_norm(ctx, dst); + break; + case GGML_OP_CONCAT: + ggml_cann_concat(ctx, dst); + break; + case GGML_OP_UPSCALE: + ggml_cann_upsample_nearest2d(ctx, dst); + break; + case GGML_OP_PAD: + ggml_cann_pad(ctx, dst); + break; + case GGML_OP_ARANGE: + ggml_cann_arange(ctx, dst); + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggml_cann_timestep_embedding(ctx, dst); + break; + case GGML_OP_LEAKY_RELU: + ggml_cann_leaky_relu(ctx, dst); + break; + case GGML_OP_RMS_NORM: + ggml_cann_rms_norm(ctx, dst); + break; + case GGML_OP_MUL_MAT: + ggml_cann_mul_mat(ctx, dst); + break; + case GGML_OP_MUL_MAT_ID: + return false; + case GGML_OP_SCALE: + ggml_cann_scale(ctx, dst); + break; + case GGML_OP_SQR: + ggml_cann_sqr(ctx, dst); + break; + case GGML_OP_CLAMP: + ggml_cann_clamp(ctx, dst); + break; + case GGML_OP_CPY: + ggml_cann_cpy(ctx, dst); + break; + case GGML_OP_CONT: + ggml_cann_dup(ctx, dst); + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + case GGML_OP_DIAG_MASK_INF: + ggml_cann_diag_mask(ctx, dst, -INFINITY); + break; + case GGML_OP_SOFT_MAX: + ggml_cann_softmax(ctx, dst); + break; + case GGML_OP_ROPE: + ggml_cann_rope(ctx, dst); + break; + case GGML_OP_IM2COL: + ggml_cann_im2col(ctx, dst); + break; + case GGML_OP_POOL_2D: + ggml_cann_pool2d(ctx, dst); + break; + case GGML_OP_SUM_ROWS: + ggml_cann_sum_rows(ctx, dst); + break; + case GGML_OP_ARGSORT: + ggml_cann_argsort(ctx, dst); + break; + default: + return false; + } + + return true; +} + +// backend +/** + * @brief Retrieves the name associated with the CANN backend. + * + * This function returns the name assigned to the CANN backend, which is stored + * in the context of the provided backend structure. + * + * @param backend Pointer to the CANN backend structure. + * @return A pointer to a constant string representing the backend name. + */ +static const char* ggml_backend_cann_name(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + return cann_ctx->name.c_str(); +} + +/** + * @brief Frees resources associated with the CANN backend. + * + * This function releases resources associated with the CANN backend context + * and resets the device associated with the backend to its initial state. + * + * @param backend Pointer to the CANN backend structure to be freed. + */ +static void ggml_backend_cann_free(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + ACL_CHECK(aclrtSynchronizeDevice()); + ACL_CHECK(aclrtResetDevice(cann_ctx->device)); + + // finalize when last backend freed. + if (cann_ctx->device == ggml_backend_cann_get_device_count() - 1) { + ACL_CHECK(aclFinalize()); + } + + delete cann_ctx; + delete backend; +} + +/** + * @brief Sets tensor data asynchronously in the CANN backend. + * + * This function asynchronously sets tensor data in the CANN backend. 
Depending + * on the tensor type, it may perform data transformations before copying data + * to the device. + * + * @param backend Pointer to the CANN backend structure. + * @param tensor Pointer to the tensor structure to set data for. + * @param data Pointer to the host data to copy to the tensor. + * @param offset Offset in bytes within the host data. + * @param size Size of the data to copy in bytes. + */ +static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend, + ggml_tensor *tensor, + const void *data, + size_t offset, + size_t size) { + ggml_backend_cann_context *cann_ctx = + (ggml_backend_cann_context *)backend->context; + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpyAsync((char *)tensor->data + offset, size, data, + size, ACL_MEMCPY_HOST_TO_DEVICE, + cann_ctx->stream())); + } else { + void *transform_buffer = malloc(size); + ggml_backend_cann_transform(tensor, data, transform_buffer); + + ACL_CHECK(aclrtMemcpyAsync( + (char *)tensor->data + offset, size, transform_buffer, size, + ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream())); + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + free(transform_buffer); + } +} + +static void ggml_backend_cann_get_tensor_async( + ggml_backend_t backend, const ggml_tensor *tensor, void *data, + size_t offset, size_t size) { + ggml_backend_cann_context *cann_ctx = + (ggml_backend_cann_context *)backend->context; + ggml_backend_buffer_t buf = + tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + + GGML_ASSERT(buf->buft == ggml_backend_cann_buffer_type(cann_ctx->device) && + "unsupported buffer type"); + + if (!need_transform(tensor->type)) { + ACL_CHECK(aclrtMemcpyAsync(data, size, (char *)tensor->data + offset, + size, ACL_MEMCPY_DEVICE_TO_HOST, + cann_ctx->stream())); + } else { + void *transform_buffer = malloc(size); + ACL_CHECK(aclrtMemcpyAsync( + transform_buffer, size, (char *)tensor->data + offset, size, + ACL_MEMCPY_DEVICE_TO_HOST, cann_ctx->stream())); + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); + ggml_backend_cann_transform_back(tensor, transform_buffer, data); + free(transform_buffer); + } +} + +/** + * @brief Asynchronously copies tensor data between CANN backends. + * + * This function copies tensor data asynchronously between two CANN backends. It + * checks if both tensors reside in CANN buffers and whether the devices support + * peer-to-peer access for direct copying. If not, it returns false. + * + * @param backend_src Pointer to the source CANN backend structure. + * @param backend_dst Pointer to the destination CANN backend structure. + * @param src Pointer to the source tensor to copy data from. + * @param dst Pointer to the destination tensor to copy data to. + * @return true if the copy operation succeeds, false otherwise. + */ +static bool ggml_backend_cann_cpy_tensor_async( + ggml_backend_t backend_src, ggml_backend_t backend_dst, + const ggml_tensor* src, ggml_tensor* dst) { + GGML_ASSERT(ggml_backend_is_cann(backend_src) || + ggml_backend_is_cann(backend_dst)); + + if (!ggml_backend_buffer_is_cann(src->buffer) || + !ggml_backend_buffer_is_cann(dst->buffer)) { + return false; + } + + ggml_backend_buffer_t buf_src = + src->view_src ? src->view_src->buffer : src->buffer; + ggml_backend_buffer_t buf_dst = + dst->view_src ? 
dst->view_src->buffer : dst->buffer; + + ggml_backend_cann_context* cann_ctx_src = + (ggml_backend_cann_context*)backend_src->context; + ggml_backend_cann_context* cann_ctx_dst = + (ggml_backend_cann_context*)backend_dst->context; + + size_t copy_size = ggml_nbytes(dst); + if (backend_src != backend_dst) { + ggml_backend_cann_buffer_context* buf_ctx_src = + (ggml_backend_cann_buffer_context*)buf_src->context; + ggml_backend_cann_buffer_context* buf_ctx_dst = + (ggml_backend_cann_buffer_context*)buf_dst->context; + + GGML_ASSERT(cann_ctx_src->device == buf_ctx_src->device); + GGML_ASSERT(cann_ctx_dst->device == buf_ctx_dst->device); + + int32_t canAccessPeer = 0; + ACL_CHECK(aclrtDeviceCanAccessPeer(&canAccessPeer, cann_ctx_src->device, + cann_ctx_dst->device)); + if (!canAccessPeer) { + return false; + } + + // need open both directions for memcpyasync between devices. + ggml_cann_set_device(cann_ctx_dst->device); + ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_src->device, 0)); + ggml_cann_set_device(cann_ctx_src->device); + ACL_CHECK(aclrtDeviceEnablePeerAccess(cann_ctx_dst->device, 0)); + + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, + cann_ctx_src->stream())); + + //TODO: workaround for Event didn`t work here. + aclrtSynchronizeStream(cann_ctx_src->stream()); + } else { + // src and dst are on the same backend + ACL_CHECK(aclrtMemcpyAsync(dst->data, copy_size, src->data, copy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, + cann_ctx_dst->stream())); + } + + return true; +} + +/** + * @brief Synchronizes a CANN backend. + * + * This function synchronizes the specified CANN backend by waiting for all + * operations in its associated stream to complete. + * + * @param backend Pointer to the CANN backend structure to synchronize. + */ +static void ggml_backend_cann_synchronize(ggml_backend_t backend) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + ggml_cann_set_device(cann_ctx->device); + + ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream())); +} + +/** + * @brief Computes a computational graph using a CANN backend. + * + * This function computes the operations defined in the computational graph + * using the specified CANN backend. + * + * @param backend Pointer to the CANN backend structure to use for computation. + * @param cgraph Pointer to the computational graph structure containing nodes + * representing operations to be computed. + * @return enum ggml_status Returns GGML_STATUS_SUCCESS if computation + * completes successfully, otherwise an appropriate error status. + */ +static enum ggml_status ggml_backend_cann_graph_compute( + ggml_backend_t backend, ggml_cgraph* cgraph) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + + ggml_cann_set_device(cann_ctx->device); + + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor* node = cgraph->nodes[i]; + + if (ggml_is_empty(node) || node->op == GGML_OP_NONE) { + continue; + } + + bool ok = ggml_cann_compute_forward(*cann_ctx, node); + + if (!ok) { + GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__, + node->name, ggml_op_name(node->op)); + } + GGML_ASSERT(ok); + } + + return GGML_STATUS_SUCCESS; +} + +/** + * @brief Checks if the CANN backend supports a specific operation. + * + * This function checks whether the specified operation is supported by the + * CANN backend. + * + * @param backend Pointer to the CANN backend structure to check support for + * the operation. 
+ * @param op Pointer to the tensor representing the operation to check. + * @return bool Returns true if the operation is supported by the backend, + * otherwise false. + */ +static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, + const ggml_tensor* op) { + switch (op->op) { + case GGML_OP_UNARY: + switch (ggml_get_unary_op(op)) { + case GGML_UNARY_OP_GELU: + case GGML_UNARY_OP_SILU: + case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: + case GGML_UNARY_OP_GELU_QUICK: + case GGML_UNARY_OP_TANH: + return true; + default: + return false; + } + case GGML_OP_MUL_MAT: { + switch (op->src[0]->type) { + case GGML_TYPE_Q8_0: + // Current groupsize should not be greater than k-1 in + // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize + if (op->src[0]->ne[0] <= QK8_0) { + return false; + } + case GGML_TYPE_F16: + case GGML_TYPE_F32: + case GGML_TYPE_Q4_0: + return true; + default: + return false; + } + } + case GGML_OP_MUL_MAT_ID: + return false; + // embedding + case GGML_OP_GET_ROWS: { + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q8_0: + return true; + default: + return false; + } + } break; + case GGML_OP_CPY: { + switch (op->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q4_0: + return true; + default: + return false; + } + } + case GGML_OP_CONT: { + // TODO: support GGML_TYPE_BF16 + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + return true; + default: + return false; + } + } + case GGML_OP_ROPE: { + // TODO: with ops-test v == 1 + float * ext_factor = (float*)((int32_t*)op->op_params + 7); + // TODO: n_dims <= ne0 + if (op->src[0]->ne[0] != op->op_params[1]) { + return false; + } + // TODO: ext_factor != 0 + if (*ext_factor != 0) { + return false; + } + + const int mode = ((const int32_t *) op->op_params)[2]; + if (mode & GGML_ROPE_TYPE_MROPE) { + return false; + } + if (mode & GGML_ROPE_TYPE_VISION) { + return false; + } + + return true; + } + case GGML_OP_UPSCALE: { + // aclnnUpsampleNearest2dGetWorkspaceSize not support + // selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal + if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) { + return false; + } + return true; + } + case GGML_OP_IM2COL: + case GGML_OP_CONCAT: + case GGML_OP_DUP: + case GGML_OP_REPEAT: + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + case GGML_OP_NORM: + case GGML_OP_ADD: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_RMS_NORM: + case GGML_OP_SCALE: + case GGML_OP_SQR: + case GGML_OP_CLAMP: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_SOFT_MAX: + case GGML_OP_POOL_2D: + case GGML_OP_SUM_ROWS: + case GGML_OP_ARGSORT: + case GGML_OP_ACC: + case GGML_OP_GROUP_NORM: + case GGML_OP_PAD: + case GGML_OP_ARANGE: + case GGML_OP_TIMESTEP_EMBEDDING: + case GGML_OP_LEAKY_RELU: + return true; + default: + return false; + } + + GGML_UNUSED(dev); +} + +/** + * @brief Checks if the backend buffer type is associated with the CANN backend. + * + * This function checks whether the provided backend buffer type is associated + * with the CANN backend based on the comparison of its name retrieval function + * pointer. + * + * @param buft Pointer to the backend buffer type to check. + * @return bool Returns true if the buffer type is associated with the CANN + * backend, otherwise false. 
+ */ +static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_cann_buffer_type_name; +} + +/** + * @brief Determines if a tensor operation should be offloaded to the CANN + * backend. + * + * This function checks if a given tensor operation should be offloaded to the + * CANN backend based on the operation type and the size of the tensor. It + * returns true if the second dimension (ne[1]) of the tensor is greater than or + * equal to the minimum batch size and the operation is not GGML_OP_GET_ROWS. + * + * @param backend Pointer to the CANN backend. + * @param op Pointer to the tensor operation to check. + * @return bool Returns true if the operation should be offloaded, otherwise + * false. + */ +static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev, + const ggml_tensor* op) { + const int min_batch_size = 32; + GGML_UNUSED(dev); + + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; +} + +/** + * @brief Records an event on the CANN backend stream. + * + * This function records the given event on the ACL runtime stream associated + * with the backend context. + * + * @param event Pointer to the event structure to be recorded. + */ +static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream())); +} + +/** + * @brief Waits for a recorded event to complete on the CANN backend stream. + * + * This function makes the given backend wait for the event to complete on its + * ACL runtime stream. + * + * @param backend Pointer to the backend structure. + * @param event Pointer to the event structure that the backend needs to wait + * for. + */ +static void ggml_backend_cann_event_wait(ggml_backend_t backend, + ggml_backend_event_t event) { + ggml_backend_cann_context* cann_ctx = + (ggml_backend_cann_context*)backend->context; + if (ggml_backend_is_cann(backend)) { + ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(), + (aclrtEvent)event->context)); + } else { + GGML_ABORT("fatal error"); + } +} + +/** + * @brief Structure defining the interface for the CANN backend. + * + * This structure contains function pointers for various operations + * supported by the CANN backend, including name retrieval, memory + * management, tensor operations, synchronization, and event handling. + */ +static const ggml_backend_i ggml_backend_cann_interface = { + /* .get_name = */ ggml_backend_cann_name, + /* .free = */ ggml_backend_cann_free, + /* .set_tensor_async = */ ggml_backend_cann_set_tensor_async, + /* .get_tensor_async = */ ggml_backend_cann_get_tensor_async, + /* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async, + /* .synchronize = */ ggml_backend_cann_synchronize, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_update = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_cann_graph_compute, + /* .event_record = */ ggml_backend_cann_event_record, + /* .event_wait = */ ggml_backend_cann_event_wait, +}; + +/** + * @brief Return the hardcoded GUID for the CANN backend. + * + * This function returns a static GUID which uniquely identifies the CANN + * backend. + * + * @return A pointer to the static GUID. 
+ */ +static ggml_guid_t ggml_backend_cann_guid() { + static ggml_guid guid = {0xa1, 0x94, 0xaf, 0xac, 0xbd, 0x4f, 0x47, 0x34, + 0xbe, 0x1a, 0x9e, 0x71, 0x1f, 0x9e, 0xed, 0x64}; + return &guid; +} + +// backend device +struct ggml_backend_cann_device_context { + int device; + std::string name; + std::string description; +}; + +static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) { + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + return ctx->name.c_str(); +} + +static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) { + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + return ctx->description.c_str(); +} + +static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_get_device_memory(ctx->device, free, total); +} + +static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_GPU; +} + +static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) { + props->name = ggml_backend_cann_device_get_name(dev); + props->description = ggml_backend_cann_device_get_description(dev); + props->type = ggml_backend_cann_device_get_type(dev); + ggml_backend_cann_device_get_memory(dev, &props->memory_free, &props->memory_total); + + bool host_buffer = getenv("GGML_CANN_NO_PINNED") == nullptr; + + props->caps = { + /* .async = */ false, + /* .host_buffer = */ host_buffer, + /* .buffer_from_host_ptr = */ false, + /* .events = */ true, + }; +} + +static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(params); + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + return ggml_backend_cann_init(ctx->device); +} + +/** + * @brief Checks if the CANN backend supports a specific backend buffer type. + * + * This function determines whether the CANN backend supports the given backend + * buffer type by comparing the device context of the backend and buffer type. + * It returns true if the devices are same between the backend context and + * buffer type context. + * + * @param backend Pointer to the CANN backend. + * @param buft Pointer to the backend buffer type to check. + * @return bool Returns true if the CANN backend supports the buffer type, + * otherwise false. + */ +static bool ggml_backend_cann_supports_buft( + ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + if (ggml_backend_buft_is_cann(buft)) { + ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; + ggml_backend_cann_buffer_type_context * buft_ctx = + (ggml_backend_cann_buffer_type_context *)buft->context; + return buft_ctx->device == dev_ctx->device; + } + return false; +} + +static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_cann_device_context * ctx = (ggml_backend_cann_device_context *)dev->context; + return ggml_backend_cann_buffer_type(ctx->device); +} + +static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return ggml_backend_cann_host_buffer_type(); +} + +/** + * @brief Creates a new event for the CANN backend device. 
+ * + * This function initializes a new event for the CANN backend by setting the + * device and creating an ACL runtime event. The created event is then wrapped + * in a ggml_backend_event structure and returned. + * + * @param backend Pointer to the CANN backend. + * @return ggml_backend_event_t Returns a pointer to the new event structure. + */ +static ggml_backend_event_t ggml_backend_cann_device_event_new( + ggml_backend_dev_t dev) { + ggml_backend_cann_device_context * dev_ctx = (ggml_backend_cann_device_context *)dev->context; + + ggml_cann_set_device(dev_ctx->device); + + aclrtEvent event; + ACL_CHECK(aclrtCreateEvent(&event)); + + return new ggml_backend_event{ + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), dev_ctx->device), + /* .context = */ event, + }; +} + +/** + * @brief Frees a CANN backend event. + * + * This function destroys the ACL runtime event associated with the given CANN + * backend event and then deletes the event structure itself. + * + * @param event Pointer to the event structure to be freed. + */ +static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) { + ACL_CHECK(aclrtDestroyEvent((aclrtEvent)event->context)); + + delete event; + GGML_UNUSED(dev); +} + +/** + * @brief Synchronizes the given event on the CANN backend. + * + * This function waits for the specified event to complete on the ACL runtime. + * + * @param event Pointer to the event structure to be synchronized. + */ +static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) { + ACL_CHECK(aclrtSynchronizeEvent((aclrtEvent)event->context)); + + GGML_UNUSED(dev); +} + +static const ggml_backend_device_i ggml_backend_cann_device_interface = { + /* .get_name = */ ggml_backend_cann_device_get_name, + /* .get_description = */ ggml_backend_cann_device_get_description, + /* .get_memory = */ ggml_backend_cann_device_get_memory, + /* .get_type = */ ggml_backend_cann_device_get_type, + /* .get_props = */ ggml_backend_cann_device_get_props, + /* .init_backend = */ ggml_backend_cann_device_init, // called for every card + /* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type, + /* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type, + /* .buffer_from_host_ptr = */ NULL, // not supported for CANN + /* .supports_op = */ ggml_backend_cann_supports_op, + /* .supports_buft = */ ggml_backend_cann_supports_buft, + /* .offload_op = */ ggml_backend_cann_offload_op, + /* .event_new = */ ggml_backend_cann_device_event_new, + /* .event_free = */ ggml_backend_cann_device_event_free, + /* .event_synchronize = */ ggml_backend_cann_device_event_synchronize, +}; + + +// backend reg +struct ggml_backend_cann_reg_context { + std::vector devices; +}; + +static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_CANN_NAME; +} + +static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) { + ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context; + return ctx->devices.size(); +} + +static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) { + ggml_backend_cann_reg_context * ctx = (ggml_backend_cann_reg_context *)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return ctx->devices[index]; +} + +static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { + GGML_UNUSED(reg); + GGML_UNUSED(name); + // reserved 
for future use + return nullptr; +} + +static const ggml_backend_reg_i ggml_backend_cann_reg_interface = { + /* .get_name = */ ggml_backend_cann_reg_get_name, + /* .get_device_count = */ ggml_backend_cann_reg_get_device_count, + /* .get_device = */ ggml_backend_cann_reg_get_device, + /* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address, +}; + +// backend registry, called only once for cann backend +ggml_backend_reg_t ggml_backend_cann_reg() { + static ggml_backend_reg reg; + static bool initialized = false; + + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + aclInit(nullptr); + ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context; + + for (int i = 0; i < ggml_cann_info().device_count; i++) { + ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context(); + dev_ctx->description = aclrtGetSocName(); + dev_ctx->device = i; + dev_ctx->name = GGML_CANN_NAME + std::to_string(i); + ggml_cann_set_device(i); + ggml_backend_dev_t dev = new ggml_backend_device { + /* .iface = */ ggml_backend_cann_device_interface, + /* .reg = */ ®, + /* .context = */ dev_ctx + }; + ctx->devices.push_back(dev); + } + + reg = ggml_backend_reg { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_cann_reg_interface, + /* .context = */ ctx + }; + } + + initialized = true; + } + + return ® +} + +ggml_backend_t ggml_backend_cann_init(int32_t device) { + aclInit(nullptr); + if (device < 0 || device >= ggml_backend_cann_get_device_count()) { + GGML_LOG_ERROR("%s: error: invalid device %d\n", __func__, device); + return nullptr; + } + + ggml_backend_cann_context* ctx = new ggml_backend_cann_context(device); + if (ctx == nullptr) { + GGML_LOG_ERROR("%s: error: failed to allocate context\n", __func__); + return nullptr; + } + ggml_cann_set_device(ctx->device); + ggml_backend_t cann_backend = + new ggml_backend{/* .guid = */ ggml_backend_cann_guid(), + /* .interface = */ ggml_backend_cann_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device), + /* .context = */ ctx}; + + return cann_backend; +} + +bool ggml_backend_is_cann(ggml_backend_t backend) { + return backend != NULL && + ggml_guid_matches(backend->guid, ggml_backend_cann_guid()); +} + +int32_t ggml_backend_cann_get_device_count() { + return ggml_cann_info().device_count; +} + +void ggml_backend_cann_get_device_description( + int32_t device, char* description, size_t description_size) { + ggml_cann_set_device(device); + const char* soc_name = aclrtGetSocName(); + snprintf(description, description_size, "%s", soc_name); +} + +void ggml_backend_cann_get_device_memory(int32_t device, size_t* free, + size_t* total) { + ggml_cann_set_device(device); + ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total)); +} + +GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg) diff --git a/llama/ggml-cann/kernels/CMakeLists.txt b/llama/ggml-cann/kernels/CMakeLists.txt new file mode 100644 index 00000000000..e4bea66b1d2 --- /dev/null +++ b/llama/ggml-cann/kernels/CMakeLists.txt @@ -0,0 +1,37 @@ +cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories. 
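+
+# This sub-project builds the AscendC device kernels listed below into the
+# ascendc_kernels library that the CANN backend links against.
+# Minimal standalone configure sketch (assumptions: CANN_INSTALL_DIR and
+# SOC_TYPE are normally injected by the parent build; the toolkit path and
+# the Ascend910B3 SoC name are placeholders, adjust to the local install):
+#
+#   cmake -S . -B build \
+#         -DCANN_INSTALL_DIR=/usr/local/Ascend/ascend-toolkit/latest \
+#         -DSOC_TYPE=Ascend910B3
+#   cmake --build build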
+# Set policy CMP0057
+cmake_policy(SET CMP0057 NEW)
+
+# Other CMake configuration
+project("llama.cann.kernel")
+
+file(GLOB SRC_FILES
+    get_row_f32.cpp
+    get_row_f16.cpp
+    get_row_q4_0.cpp
+    get_row_q8_0.cpp
+    quantize_f32_q8_0.cpp
+    quantize_f16_q8_0.cpp
+    quantize_float_to_q4_0.cpp
+    dup.cpp
+)
+
+set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
+set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
+
+if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake)
+else()
+    message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.")
+endif()
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+ascendc_library(ascendc_kernels SHARED
+    ${SRC_FILES}
+)
+
+message(STATUS "CANN: compile ascend kernels with SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
+ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
+# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
diff --git a/llama/ggml-cann/kernels/ascendc_kernels.h b/llama/ggml-cann/kernels/ascendc_kernels.h
new file mode 100644
index 00000000000..e8816091943
--- /dev/null
+++ b/llama/ggml-cann/kernels/ascendc_kernels.h
@@ -0,0 +1,45 @@
+/**
+ * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file
+ *
+ * MIT License
+ *
+ * Copyright (c) 2023-2024 The ggml authors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ + +#ifndef ASCENDC_KERNELS_H +#define ASCENDC_KERNELS_H + +#include "aclrtlaunch_ascendc_get_row_f32.h" +#include "aclrtlaunch_ascendc_get_row_f16.h" +#include "aclrtlaunch_ascendc_get_row_q8_0.h" +#include "aclrtlaunch_ascendc_get_row_q4_0.h" + +#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" +#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" +#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h" +#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h" + +#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" +#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" +#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" +#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" + +#endif // ASCENDC_KERNELS_H diff --git a/llama/ggml-cann/kernels/dup.cpp b/llama/ggml-cann/kernels/dup.cpp new file mode 100644 index 00000000000..9d7faf262f1 --- /dev/null +++ b/llama/ggml-cann/kernels/dup.cpp @@ -0,0 +1,262 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "kernel_operator.h" + +#include + +using namespace AscendC; + +#define BUFFER_NUM 2 +const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template + +template +class DupByRows { + public: + __aicore__ inline DupByRows() {} + __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub, + size_t *input_nb_ub) { + /* Dup by rows when src is contigous on first dimension and dst is + contiguous, each kernel process one row. + */ + + // Input has four dims. 
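+        // Work split: one kernel block handles one row. The flat block index
+        // is decomposed into (idx_ne1, idx_ne2, idx_ne3) below; the source
+        // offset is rebuilt from the nb[] byte strides because src may be
+        // non-contiguous in dims 1..3, while dst is contiguous, so its offset
+        // is simply block_idx * ne[0] * sizeof(DST_T).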
+ int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + // param + num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; + num_elem = input_ne_ub[0]; + + // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3) + idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]); + idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])) + / (input_ne_ub[1]); + idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) + - idx_ne2 * input_ne_ub[1]; + + // src may not contiguous in dim [1,2,3], so stride decited by ne&nb + src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2 + + input_nb_ub[1] * idx_ne1; + + // dst is contiguous + dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T)); + + src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src + + src_stride)); + dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst + + dst_stride)); + + pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem + + 32 - 1) / 32 * 32); + pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem + + 32 - 1) / 32 * 32); + } + + __aicore__ inline void copy_in() { + LocalTensor src_local = src_queue.AllocTensor(); + const size_t elem_per_block = 32 / sizeof(SRC_T); + size_t tail = num_elem % elem_per_block; + size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem; + DataCopy(src_local, src_gm, cpy_elements_len); + src_queue.EnQue(src_local); + } + + __aicore__ inline void copy_out() { + LocalTensor dst_local = dst_queue.DeQue(); +#ifdef ASCEND_310P + const size_t elem_per_block = 32 / sizeof(DST_T); + size_t tail = num_elem % elem_per_block; + size_t len = num_elem & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(dst_gm, dst_local, len); + } + if(tail != 0) { + for (size_t i = tail; i < elem_per_block; i++) { + dst_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(dst_gm[len], dst_local[len], elem_per_block); + SetAtomicNone(); + } +#else + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = num_elem * sizeof(DST_T); + DataCopyPad(dst_gm, dst_local, dataCopyParams); +#endif + dst_queue.FreeTensor(dst_local); + } + + __aicore__ inline void dup() { + // main process, copy one row data from src to dst. + copy_in(); + + LocalTensor src_local = src_queue.DeQue(); + LocalTensor dst_local = dst_queue.AllocTensor(); + + int32_t BLOCK_NUM = 32 / sizeof(DST_T); + DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1) + / BLOCK_NUM * BLOCK_NUM); + dst_queue.EnQue(dst_local); + + src_queue.FreeTensor(src_local); + copy_out(); + } + + __aicore__ inline void dup_with_cast() { + // main process, copy one row data from src to dst. + // cast dtype from src to dst. 
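+        // Unlike dup(), which moves raw bytes with DataCopy, this path uses
+        // Cast with RoundMode::CAST_NONE so the element type is converted
+        // (e.g. fp32 -> fp16 or fp16 -> fp32) while the row is copied.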
+ copy_in(); + + LocalTensor src_local = src_queue.DeQue(); + LocalTensor dst_local = dst_queue.AllocTensor(); + + Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem); + dst_queue.EnQue(dst_local); + + src_queue.FreeTensor(src_local); + copy_out(); + } + + private: + + TPipe pipe; + GlobalTensor src_gm; + GlobalTensor dst_gm; + + int64_t num_rows; + int64_t num_elem; + int64_t idx_ne3; + int64_t idx_ne2; + int64_t idx_ne1; + int64_t src_stride; + int64_t dst_stride; + + TQue src_queue; + TQue dst_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16( + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, + GM_ADDR output_ne_gm, + GM_ADDR output_nb_gm) { + + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + DupByRows op; + op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); + op.dup(); +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32( + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, + GM_ADDR output_ne_gm, + GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + DupByRows op; + op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); + op.dup(); +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16( + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, + GM_ADDR output_ne_gm, + GM_ADDR output_nb_gm) { + + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + DupByRows op; + op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); + op.dup_with_cast(); +} + +extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32( + GM_ADDR src_gm, + GM_ADDR dst_gm, + GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, + GM_ADDR output_ne_gm, + GM_ADDR output_nb_gm) { + + // copy params from gm to ub. 
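+    // Each ne/nb array holds 4 elements of 8 bytes (int64_t / size_t), so
+    // exactly 32 bytes are staged per array by the copy_to_ub calls below.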
+ int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + DupByRows op; + op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); + op.dup_with_cast(); +} diff --git a/llama/ggml-cann/kernels/get_row_f16.cpp b/llama/ggml-cann/kernels/get_row_f16.cpp new file mode 100644 index 00000000000..d7a4ade650f --- /dev/null +++ b/llama/ggml-cann/kernels/get_row_f16.cpp @@ -0,0 +1,223 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. +using namespace AscendC; + +#define BUFFER_NUM 2 + +class GET_ROW_F16 { + public: + __aicore__ inline GET_ROW_F16() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *indices_ne_ub, size_t *indices_nb_ub, + int64_t *output_ne_ub, size_t *output_nb_ub) { + // TODO, use template for F16/f32 + int64_t op_block_num = GetBlockNum(); + op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. + uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ half *)input); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31) + & ~31); + uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31) + & ~31); + + local_buffer_elems = input_local_buffer_size / sizeof(half); + + // TODO, consider long row that can't put in UB. 
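+        // Local (UB) buffers are rounded up to a 32-byte multiple because
+        // DataCopy moves data in 32-byte blocks; copy_in()/copy_out() below
+        // pad or split the tail of a row accordingly.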
+ // All data should asign to 32. It's ok because all data is align to 32. + pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size); + pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size); + } + + __aicore__ inline void copy_in(uint32_t offset, size_t len) { + size_t origin_len = len; + LocalTensor input_local = input_queue.AllocTensor(); + const size_t elem_per_block = 32 / sizeof(half); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if(tail != 0) { + len += elem_per_block; + } + DataCopy(input_local, input_gm[offset], len); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset, size_t len) { + LocalTensor output_local = output_queue.DeQue(); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(output_gm[offset], output_local, len); + } + + if(tail != 0) { +#ifdef ASCEND_310P + for (size_t i = tail; i < elem_per_block; i++) { + output_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(output_gm[offset + len], output_local[len], elem_per_block); + SetAtomicNone(); +#else + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = tail * sizeof(float); + DataCopyPad(output_gm[offset + len], output_local[len], + dataCopyParams); +#endif + } + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_row(int64_t idx) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3]; + + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3]; + + copy_in(input_offset, input_ne[0]); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + + Cast(output_local, input_local, RoundMode::CAST_NONE, + local_buffer_elems); + output_queue.EnQue(output_local); + copy_out(output_offset, input_ne[0]); + + input_queue.FreeTensor(input_local); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + calculate_row(i); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + size_t local_buffer_elems; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + int64_t op_block_idx; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_f16( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + 
GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, + GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_F16 op; + op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, + indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} diff --git a/llama/ggml-cann/kernels/get_row_f32.cpp b/llama/ggml-cann/kernels/get_row_f32.cpp new file mode 100644 index 00000000000..630cfd16fa2 --- /dev/null +++ b/llama/ggml-cann/kernels/get_row_f32.cpp @@ -0,0 +1,216 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. +using namespace AscendC; + +#define BUFFER_NUM 2 + +class GET_ROW_F32 { + public: + __aicore__ inline GET_ROW_F32() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *indices_ne_ub, size_t *indices_nb_ub, + int64_t *output_ne_ub, size_t *output_nb_ub) { + int64_t op_block_num = GetBlockNum(); + op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. 
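+        // Rows are split evenly across AI cores, with the first `tails` cores
+        // taking one extra row. E.g. 10 rows on 3 cores: core 0 handles rows
+        // [0,4), core 1 rows [4,7), core 2 rows [7,10).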
+ uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ float *)input); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31); + local_buffer_elems = local_buffer_size / sizeof(float); + + // TODO, consider long row that can't put in UB. + // All data should asign to 32. It's ok because all data is align to 32. + pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size); + pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size); + } + + __aicore__ inline void copy_in(uint32_t offset, size_t len) { + LocalTensor input_local = input_queue.AllocTensor(); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if(tail != 0) { + len += elem_per_block; + } + DataCopy(input_local, input_gm[offset], len); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset, size_t len) { + LocalTensor output_local = output_queue.DeQue(); + const size_t elem_per_block = 32 / sizeof(float); + size_t tail = len % elem_per_block; + len = len & ~(elem_per_block - 1); + if (len > 0) { + DataCopy(output_gm[offset], output_local, len); + } + + if(tail != 0) { +#ifdef ASCEND_310P + for (size_t i = tail; i < elem_per_block; i++) { + output_local[len + i].SetValue(0, 0); + } + SetAtomicAdd(); + DataCopy(output_gm[offset + len], output_local[len], elem_per_block); + SetAtomicNone(); +#else + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = tail * sizeof(float); + DataCopyPad(output_gm[offset + len], output_local[len], + dataCopyParams); +#endif + } + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_row(int64_t idx) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3]; + + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3]; + + copy_in(input_offset, input_ne[0]); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + + DataCopy(output_local, input_local, local_buffer_elems); + output_queue.EnQue(output_local); + copy_out(output_offset, input_ne[0]); + + input_queue.FreeTensor(input_local); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + calculate_row(i); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + size_t 
local_buffer_elems; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + int64_t op_block_idx; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_f32( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, + GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_F32 op; + op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, + indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} diff --git a/llama/ggml-cann/kernels/get_row_q4_0.cpp b/llama/ggml-cann/kernels/get_row_q4_0.cpp new file mode 100644 index 00000000000..77dfab76dbb --- /dev/null +++ b/llama/ggml-cann/kernels/get_row_q4_0.cpp @@ -0,0 +1,230 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. +using namespace AscendC; +#ifdef ASCEND_310P // 310P not support 4bit get row + extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, + GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. 
+ printf("Ascend310P not support 4bit get row.\n"); + } +#else + +#define BUFFER_NUM 2 + +#define QK4_0 32 + +class GET_ROW_Q4_0 { + public: + __aicore__ inline GET_ROW_Q4_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, int64_t *indices_ne_ub, + size_t *indices_nb_ub, int64_t *output_ne_ub, + size_t *output_nb_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + scale_ne[i] = input_ne_ub[i]; + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // one scale for a group. + scale_ne[0] /= QK4_0; + + input_stride[0] = 1; + scale_stride[0] = 1; + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + group_size_in_row = input_ne[0] / QK4_0; + int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * + input_ne[3] / 2; + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. + uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ int4b_t *)input); + scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t)); + pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float)); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error? 
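+        // Each group is QK4_0 = 32 4-bit values packed two per byte (16 bytes).
+        // As this kernel reads it, all packed data comes first (hence
+        // scale_offset = n_elements / 2 bytes in init), followed by one half
+        // scale per group.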
+ DataCopy(input_local, input_gm[offset], QK4_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK4_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_group(int64_t idx, int64_t group) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3] + + group * QK4_0; + const int64_t scale_offset = selected_row_idx * scale_stride[1] + + indices_ne1_idx * scale_stride[2] + + indices_ne2_idx * scale_stride[3] + group; + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3] + + group * QK4_0; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor cast_local = cast_queue.AllocTensor(); + LocalTensor output_local = output_queue.AllocTensor(); + + // TODO: cast more data to speed up. + Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); + Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); + + // Only mul need compile by group. + half scale = scale_gm.GetValue(scale_offset); + + Muls(output_local, output_local, (float)scale, QK4_0); + + input_queue.FreeTensor(input_local); + cast_queue.FreeTensor(cast_local); + output_queue.EnQue(output_local); + + copy_out(output_offset); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + calculate_group(i, j); + } + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t scale_ne[4]; + size_t scale_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t ir; + int64_t dr; + + int64_t group_size_in_row; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue cast_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, + GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_Q4_0 op; + op.init(input_gm, 
indices_gm, output_gm, input_ne_ub, indices_ne_ub, + indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} + +#endif // #ifdef ASCEND_310P diff --git a/llama/ggml-cann/kernels/get_row_q8_0.cpp b/llama/ggml-cann/kernels/get_row_q8_0.cpp new file mode 100644 index 00000000000..776a3ead679 --- /dev/null +++ b/llama/ggml-cann/kernels/get_row_q8_0.cpp @@ -0,0 +1,217 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "kernel_operator.h" + +// optimize me. Use template to avoid copy code. +using namespace AscendC; + +#define BUFFER_NUM 2 + +#define QK8_0 32 + +class GET_ROW_Q8_0 { + public: + __aicore__ inline GET_ROW_Q8_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, + int64_t *input_ne_ub, int64_t *indices_ne_ub, + size_t *indices_nb_ub, int64_t *output_ne_ub, + size_t *output_nb_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + indices_ne[i] = indices_ne_ub[i]; + indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; + scale_ne[i] = input_ne_ub[i]; + output_ne[i] = output_ne_ub[i]; + output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; + } + + // one scale for a group. + scale_ne[0] /= QK8_0; + + input_stride[0] = 1; + scale_stride[0] = 1; + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + group_size_in_row = input_ne[0] / QK8_0; + int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * + input_ne[3] * sizeof(int8_t); + + // Indices has two dims. n_elements = all rows should get. + // dr, all rows should this thread get. 
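+        // Layout assumed here: ne0*ne1*ne2*ne3 int8 values first (see
+        // scale_offset above), then one half scale per 32-element group;
+        // scale_gm below is anchored at that trailing scale region.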
+ uint64_t n_elements = + indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; + dr = n_elements / op_block_num; + + uint64_t tails = n_elements % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + input_gm.SetGlobalBuffer((__gm__ int8_t *)input); + scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); + indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); + output_gm.SetGlobalBuffer((__gm__ float *)output); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); + pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float)); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + DataCopy(input_local, input_gm[offset], QK8_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK8_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void calculate_group(int64_t idx, int64_t group) { + const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); + const int64_t indices_ne1_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / + indices_ne[0]; + const int64_t indices_ne0_idx = + (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - + indices_ne1_idx * indices_ne[0]); + + const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + + indices_ne1_idx * indices_stride[1] + + indices_ne2_idx * indices_stride[2]; + const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); + + const int64_t input_offset = selected_row_idx * input_stride[1] + + indices_ne1_idx * input_stride[2] + + indices_ne2_idx * input_stride[3] + + group * QK8_0; + const int64_t scale_offset = selected_row_idx * scale_stride[1] + + indices_ne1_idx * scale_stride[2] + + indices_ne2_idx * scale_stride[3] + group; + const int64_t output_offset = indices_ne0_idx * output_stride[1] + + indices_ne1_idx * output_stride[2] + + indices_ne2_idx * output_stride[3] + + group * QK8_0; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor cast_local = cast_queue.AllocTensor(); + LocalTensor output_local = output_queue.AllocTensor(); + + // TODO: cast more data to speed up. + Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); + Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0); + + // Only mul need compile by group. 
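+        // Dequantize the group: the 32 int8 values were cast to half and then
+        // float above, so a single Muls by the group's half scale d gives
+        // y_i = d * q_i.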
+ half scale = scale_gm.GetValue(scale_offset); + Muls(output_local, output_local, (float)scale, QK8_0); + + input_queue.FreeTensor(input_local); + cast_queue.FreeTensor(cast_local); + output_queue.EnQue(output_local); + + copy_out(output_offset); + } + + __aicore__ inline void calculate() { + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + calculate_group(i, j); + } + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t scale_ne[4]; + size_t scale_stride[4]; + + int64_t indices_ne[4]; + size_t indices_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t ir; + int64_t dr; + + int64_t group_size_in_row; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor indices_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue cast_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_get_row_q8_0( + GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, + GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, + GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { + int64_t input_ne_ub[4]; + int64_t indices_ne_ub[4]; + size_t indices_nb_ub[4]; + int64_t output_ne_ub[4]; + size_t output_nb_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(indices_ne_gm, indices_ne_ub, 32); + copy_to_ub(indices_nb_gm, indices_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + copy_to_ub(output_nb_gm, output_nb_ub, 32); + + GET_ROW_Q8_0 op; + op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, + indices_nb_ub, output_ne_ub, output_nb_ub); + op.calculate(); +} diff --git a/llama/ggml-cann/kernels/quantize_f16_q8_0.cpp b/llama/ggml-cann/kernels/quantize_f16_q8_0.cpp new file mode 100644 index 00000000000..59ae6d2fa7d --- /dev/null +++ b/llama/ggml-cann/kernels/quantize_f16_q8_0.cpp @@ -0,0 +1,244 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "kernel_operator.h" + +using namespace AscendC; +#ifdef ASCEND_310P + extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. + printf("Ascend310P not support f16->8bit quantization.\n"); + } +#else + +#define BUFFER_NUM 2 +#define QK8_0 32 + +class QUANTIZE_F16_Q8_0 { + public: + __aicore__ inline QUANTIZE_F16_Q8_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *output_ne_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + } + + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; + } + + scale_ne = input_ne; + scale_stride[0] = 1; + scale_stride[1] = input_ne[0] / QK8_0; + for (int i = 2; i < 4; i++) { + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + // split input tensor by rows. + uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; + dr = nr / op_block_num; + + uint64_t tails = nr % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + group_size_in_row = scale_stride[1]; + int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * + output_ne[3] * sizeof(uint8_t); + + input_gm.SetGlobalBuffer((__gm__ half *)input); + output_gm.SetGlobalBuffer((__gm__ int8_t *)output); + scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir * + group_size_in_row * + sizeof(half))); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); + pipe.InitBuffer(work_queue, 1, 32); + pipe.InitBuffer(max_queue, 1, 32); + pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); + pipe.InitBuffer(scale_queue, 1, 32); + pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float)); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + DataCopy(input_local, input_gm[offset], QK8_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK8_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline half calculate_group(int64_t row, int64_t group) { + const int64_t i3 = row / (input_ne[1] * input_ne[2]); + const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; + const int64_t i1 = + row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; + + const int64_t input_offset = i1 * input_stride[1] + + i2 * input_stride[2] + + i3 * input_stride[3] + QK8_0 * group; + + const int64_t output_offset = i1 * output_stride[1] + + i2 * output_stride[2] + + i3 * output_stride[3] + QK8_0 * group; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + LocalTensor work_local = work_queue.AllocTensor(); + LocalTensor abs_local = abs_queue.AllocTensor(); + LocalTensor max_local = max_queue.AllocTensor(); + LocalTensor cast_local = cast_queue.AllocTensor(); + 
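+        // q8_0 quantization of one 32-element group, roughly:
+        //   d   = max(|x_i|) / 127   (returned and stored as the group's half scale)
+        //   q_i = round(x_i / d)     (stored as int8)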
+ Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); + Abs(abs_local, cast_local, QK8_0); + ReduceMax(max_local, abs_local, work_local, QK8_0); + + pipe_barrier(PIPE_ALL); + float d = max_local.GetValue(0); + d = d / ((1 << 7) - 1); + if (d != 0) { + Muls(cast_local, cast_local, 1.0f / d, QK8_0); + } + + Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0); + Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0); + Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0); + output_queue.EnQue(output_local); + copy_out(output_offset); + + input_queue.FreeTensor(input_local); + work_queue.FreeTensor(work_local); + abs_queue.FreeTensor(abs_local); + max_queue.FreeTensor(max_local); + cast_queue.FreeTensor(cast_local); + return (half)d; + } + + __aicore__ inline void calculate() { + LocalTensor scale_local = scale_queue.AllocTensor(); + uint32_t scale_local_offset = 0; + uint32_t scale_global_offset = 0; + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + half scale = calculate_group(i, j); + scale_local.SetValue(scale_local_offset++, scale); + if (scale_local_offset == 16) { + scale_local_offset = 0; + // TODO: OPTIMIZE ME + pipe_barrier(PIPE_ALL); + DataCopy(scale_gm[scale_global_offset], scale_local, 16); + pipe_barrier(PIPE_ALL); + scale_global_offset += 16; + } + } + } + + if (scale_local_offset != 0) { + pipe_barrier(PIPE_ALL); + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = scale_local_offset * sizeof(half); + DataCopyPad(scale_gm[scale_global_offset], scale_local, + dataCopyParams); + pipe_barrier(PIPE_ALL); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t *scale_ne; + size_t scale_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t group_size_in_row; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue work_queue; + TQue max_queue; + TQue abs_queue; + TQue scale_queue; + TQue cast_queue; + +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_F16_Q8_0 op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} + +#endif // #ifdef ASCEND_310P diff --git a/llama/ggml-cann/kernels/quantize_f32_q8_0.cpp b/llama/ggml-cann/kernels/quantize_f32_q8_0.cpp new file mode 100644 index 00000000000..ed7bd14a8ec --- /dev/null +++ b/llama/ggml-cann/kernels/quantize_f32_q8_0.cpp @@ -0,0 +1,242 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the 
rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "kernel_operator.h" + +using namespace AscendC; +#ifdef ASCEND_310P // 310P not support f32->8bit quantization + extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. + printf("Ascend310P not support f32->8bit quantization.\n"); + } +#else + +#define BUFFER_NUM 2 +#define QK8_0 32 + +class QUANTIZE_F32_Q8_0 { + public: + __aicore__ inline QUANTIZE_F32_Q8_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *output_ne_ub) { + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + + output_ne[i] = output_ne_ub[i]; + } + + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; + } + + scale_ne = input_ne; + scale_stride[0] = 1; + scale_stride[1] = input_ne[0] / QK8_0; + for (int i = 2; i < 4; i++) { + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + // split input tensor by rows. 
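+        // Each core quantizes its own slice of rows; the output buffer holds
+        // all int8 data first, then one half scale per 32-element group, and
+        // below scale_gm is offset by ir * group_size_in_row so cores never
+        // write overlapping scale ranges.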
+ uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; + dr = nr / op_block_num; + + uint64_t tails = nr % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + group_size_in_row = scale_stride[1]; + int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * + output_ne[3] * sizeof(uint8_t); + + input_gm.SetGlobalBuffer((__gm__ float *)input); + output_gm.SetGlobalBuffer((__gm__ int8_t *)output); + scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + + ir * group_size_in_row * + sizeof(half))); + + pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float)); + pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); + pipe.InitBuffer(work_queue, 1, 32); + pipe.InitBuffer(max_queue, 1, 32); + pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); + pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half)); + pipe.InitBuffer(scale_queue, 1, 32); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + DataCopy(input_local, input_gm[offset], QK8_0); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + LocalTensor output_local = output_queue.DeQue(); + DataCopy(output_gm[offset], output_local, QK8_0); + output_queue.FreeTensor(output_local); + } + + __aicore__ inline half calculate_group(int64_t row, int64_t group) { + const int64_t i3 = row / (input_ne[1] * input_ne[2]); + const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; + const int64_t i1 = + row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; + + const int64_t input_offset = i1 * input_stride[1] + + i2 * input_stride[2] + + i3 * input_stride[3] + QK8_0 * group; + + const int64_t output_offset = i1 * output_stride[1] + + i2 * output_stride[2] + + i3 * output_stride[3] + QK8_0 * group; + + copy_in(input_offset); + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + LocalTensor work_local = work_queue.AllocTensor(); + LocalTensor abs_local = abs_queue.AllocTensor(); + LocalTensor max_local = max_queue.AllocTensor(); + LocalTensor cast_local = cast_queue.AllocTensor(); + + Abs(abs_local, input_local, QK8_0); + ReduceMax(max_local, abs_local, work_local, QK8_0); + pipe_barrier(PIPE_ALL); + float d = max_local.GetValue(0); + d = d / ((1 << 7) - 1); + if (d != 0) { + Muls(input_local, input_local, 1.0f / d, QK8_0); + } + + Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0); + Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0); + Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0); + output_queue.EnQue(output_local); + copy_out(output_offset); + + input_queue.FreeTensor(input_local); + work_queue.FreeTensor(work_local); + abs_queue.FreeTensor(abs_local); + max_queue.FreeTensor(max_local); + cast_queue.FreeTensor(cast_local); + + return (half)d; + } + + __aicore__ inline void calculate() { + LocalTensor scale_local = scale_queue.AllocTensor(); + uint32_t scale_local_offset = 0; + uint32_t scale_global_offset = 0; + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + half scale = calculate_group(i, j); + scale_local.SetValue(scale_local_offset++, scale); + if (scale_local_offset == 16) { + scale_local_offset = 0; + // TODO: OPTIMIZE ME + pipe_barrier(PIPE_ALL); + DataCopy(scale_gm[scale_global_offset], scale_local, 16); + pipe_barrier(PIPE_ALL); + scale_global_offset += 16; + } + } + } + + if 
(scale_local_offset != 0) { + pipe_barrier(PIPE_ALL); + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = scale_local_offset * sizeof(half); + DataCopyPad(scale_gm[scale_global_offset], scale_local, + dataCopyParams); + pipe_barrier(PIPE_ALL); + } + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t *scale_ne; + size_t scale_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t group_size_in_row; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue work_queue; + TQue max_queue; + TQue abs_queue; + TQue cast_queue; + TQue scale_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_F32_Q8_0 op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} + +#endif // #ifdef ASCEND_310P diff --git a/llama/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/llama/ggml-cann/kernels/quantize_float_to_q4_0.cpp new file mode 100644 index 00000000000..a230c79ba5a --- /dev/null +++ b/llama/ggml-cann/kernels/quantize_float_to_q4_0.cpp @@ -0,0 +1,321 @@ +/** + * llama.cpp - commit ba1cb19cdd0d92e012e0f6e009e0620f854b6afd - do not edit this file + * + * MIT License + * + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "kernel_operator.h" + +using namespace AscendC; +#ifdef ASCEND_310P // 310P not support float->4bit quantization + extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. 
+ printf("Ascend310P not support f32->4bit quantization.\n"); + } + + extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. + printf("Ascend310P not support f16->4bit quantization.\n"); + } +#else + +#define BUFFER_NUM 2 +#define Group_Size 32 + +template +class QUANTIZE_FLOAT_TO_Q4_0 { + public: + __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {} + __aicore__ inline void init(GM_ADDR input, GM_ADDR output, + int64_t *input_ne_ub, size_t *input_nb_ub, + int64_t *output_ne_ub) { + // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4], + // permute=[0,0,0,0]): + // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL + int64_t op_block_num = GetBlockNum(); + int64_t op_block_idx = GetBlockIdx(); + + // input stride of data elements + for (int i = 0; i < 4; i++) { + input_ne[i] = input_ne_ub[i]; + input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; + output_ne[i] = output_ne_ub[i]; + } + + // output stride of data elements + output_stride[0] = 1; + for (int i = 1; i < 4; i++) { + output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; + } + + // scale saved one by one after data:. [group1_scale, group2_scale, ...] + scale_ne = input_ne; + scale_stride[0] = 1; + scale_stride[1] = input_ne[0] / Group_Size; + for (int i = 2; i < 4; i++) { + scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; + } + + // split input tensor by rows. + uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; + dr = nr / op_block_num; + + uint64_t tails = nr % op_block_num; + if (op_block_idx < tails) { + dr += 1; + ir = dr * op_block_idx; + } else { + ir = dr * op_block_idx + tails; + } + + group_size_in_row = scale_stride[1]; + int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] * + output_ne[3] * sizeof(uint8_t) / 2; + + input_gm.SetGlobalBuffer((__gm__ SRC_T *)input); + output_gm.SetGlobalBuffer((__gm__ int8_t *)output); + scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir * + group_size_in_row * + sizeof(half))); + + pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T)); + pipe.InitBuffer(output_queue, BUFFER_NUM, + Group_Size * sizeof(int8_t) / 2); + pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float)); + pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float)); + pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float)); + pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float)); + pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half)); + pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t)); + pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half)); + } + + __aicore__ inline void copy_in(uint32_t offset) { + LocalTensor input_local = input_queue.AllocTensor(); + DataCopy(input_local, input_gm[offset], Group_Size); + input_queue.EnQue(input_local); + } + + __aicore__ inline void copy_out(uint32_t offset) { + // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t, + // and using DataCopyPad to avoid 32 bits align. 
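+        // A 32-element q4_0 group packs two 4-bit values per byte, i.e. 16 bytes,
+        // only half of a 32-byte UB block; hence the int8_t reinterpret and the
+        // padded copy below. calculate_group further down maps each value into
+        // [-8, 7] and stores the group scale separately.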
+ LocalTensor output_local = output_queue.DeQue(); + LocalTensor output_int8_local = + output_local.ReinterpretCast(); + + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t); + DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams); + + output_queue.FreeTensor(output_local); + } + + __aicore__ inline void input_to_cast(LocalTensor cast_local, + LocalTensor input_local) { + DataCopy(cast_local, input_local, Group_Size); + } + + __aicore__ inline void input_to_cast(LocalTensor cast_local, + LocalTensor input_local) { + Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size); + } + + __aicore__ inline half calculate_group(int64_t row, int64_t group) { + const int64_t i3 = row / (input_ne[1] * input_ne[2]); + const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; + const int64_t i1 = + row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; + + const int64_t input_offset = i1 * input_stride[1] + + i2 * input_stride[2] + + i3 * input_stride[3] + Group_Size * group; + + // output_offset is stride for output_gm which datatype is int8_t and + // divided by 2 is needed for int4b_t. + const int64_t output_offset = (i1 * output_stride[1] + + i2 * output_stride[2] + + i3 * output_stride[3] + + Group_Size * group) / 2; + copy_in(input_offset); + + LocalTensor input_local = input_queue.DeQue(); + LocalTensor output_local = output_queue.AllocTensor(); + LocalTensor cast_local = cast_queue.AllocTensor(); + LocalTensor work_local = work_queue.AllocTensor(); + LocalTensor max_local = max_queue.AllocTensor(); + LocalTensor min_local = min_queue.AllocTensor(); + LocalTensor int8_local = int8_queue.AllocTensor(); + LocalTensor half_local = half_queue.AllocTensor(); + + input_to_cast(cast_local, input_local); + + ReduceMax(max_local, cast_local, work_local, Group_Size); + ReduceMin(min_local, cast_local, work_local, Group_Size); + const float max_value = max_local.GetValue(0); + const float min_value = min_local.GetValue(0); + float d = max_value; + if (min_value < 0 && (-1 * min_value) > max_value) { + d = min_value; + } + + d = d / (-8); + if (d != 0) { + Muls(cast_local, cast_local, 1.0f / d, Group_Size); + } + + // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7] + float scalar = 8.5f; + Adds(cast_local, cast_local, scalar, Group_Size); + Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size); + scalar = 15.0f; + Mins(cast_local, cast_local, scalar, Group_Size); + scalar = -8.0f; + Adds(cast_local, cast_local, scalar, Group_Size); + + // float->half->int4b + Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size); + Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size); + + output_queue.EnQue(output_local); + copy_out(output_offset); + + input_queue.FreeTensor(input_local); + work_queue.FreeTensor(work_local); + max_queue.FreeTensor(max_local); + min_queue.FreeTensor(min_local); + int8_queue.FreeTensor(int8_local); + half_queue.FreeTensor(half_local); + cast_queue.FreeTensor(cast_local); + return (half)d; + } + + __aicore__ inline void calculate() { + LocalTensor scale_local = scale_queue.AllocTensor(); + uint32_t scale_local_offset = 0; + uint32_t scale_global_offset = 0; + for (int64_t i = ir; i < ir + dr; i++) { + for (int64_t j = 0; j < group_size_in_row; j++) { + half scale = calculate_group(i, j); + scale_local.SetValue(scale_local_offset++, scale); + // Copy Group_Size/2 length data each time. 
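+                // Group_Size / 2 = 16 half scales = 32 bytes, exactly one UB
+                // block per DataCopy; any remainder left after the loops is
+                // flushed with DataCopyPad below.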
+ if (scale_local_offset == Group_Size / 2) { + scale_local_offset = 0; + // TODO: OPTIMIZE ME + pipe_barrier(PIPE_ALL); + DataCopy(scale_gm[scale_global_offset], scale_local, + Group_Size / 2); + pipe_barrier(PIPE_ALL); + scale_global_offset += Group_Size / 2; + } + } + } + + if (scale_local_offset != 0) { + pipe_barrier(PIPE_ALL); + DataCopyExtParams dataCopyParams; + dataCopyParams.blockCount = 1; + dataCopyParams.blockLen = scale_local_offset * sizeof(half); + DataCopyPad(scale_gm[scale_global_offset], scale_local, + dataCopyParams); + pipe_barrier(PIPE_ALL); + } + scale_queue.FreeTensor(scale_local); + } + + private: + int64_t input_ne[4]; + size_t input_stride[4]; + + int64_t *scale_ne; + size_t scale_stride[4]; + + int64_t output_ne[4]; + size_t output_stride[4]; + + int64_t group_size_in_row; + + int64_t ir; + int64_t dr; + + TPipe pipe; + GlobalTensor input_gm; + GlobalTensor scale_gm; + GlobalTensor output_gm; + TQue input_queue; + TQue output_queue; + TQue work_queue; + TQue max_queue; + TQue min_queue; + TQue scale_queue; + TQue cast_queue; + TQue int8_queue; + TQue half_queue; +}; + +template +__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { + auto gm_ptr = (__gm__ uint8_t *)gm; + auto ub_ptr = (uint8_t *)(ub); + for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { + *ub_ptr = *gm_ptr; + } +} + +extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_FLOAT_TO_Q4_0 op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} + +extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( + GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, + GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { + int64_t input_ne_ub[4]; + size_t input_nb_ub[4]; + int64_t output_ne_ub[4]; + + copy_to_ub(input_ne_gm, input_ne_ub, 32); + copy_to_ub(input_nb_gm, input_nb_ub, 32); + copy_to_ub(output_ne_gm, output_ne_ub, 32); + + QUANTIZE_FLOAT_TO_Q4_0 op; + op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); + op.calculate(); +} + +#endif // #ifdef ASCEND_310P diff --git a/llama/llama.go b/llama/llama.go index c11d53411d4..bf272a6174a 100644 --- a/llama/llama.go +++ b/llama/llama.go @@ -51,6 +51,10 @@ package llama #cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIP -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1 #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_rocm -lhipblas -lamdhip64 -lrocblas #cgo windows CFLAGS: -Wno-discarded-qualifiers -D_WIN32_WINNT=0x602 +#cgo linux,cann LDFLAGS: -lpthread -lrt -lresolv +#cgo cann CFLAGS: -DGGML_USE_CANN -DGGML_BUILD=1 +#cgo cann CXXFLAGS: -DGGML_USE_CANN -DGGML_BUILD=1 +#cgo cann LDFLAGS: -L${SRCDIR}/build/linux-arm64/runners/cann -lggml_cann -lascendc_kernels #cgo windows CXXFLAGS: -D_WIN32_WINNT=0x602 #cgo windows LDFLAGS: -lmsvcrt -static-libstdc++ -static-libgcc -static #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows-amd64 diff --git a/make/Makefile.cann b/make/Makefile.cann new file mode 100644 index 00000000000..261f50b3440 --- /dev/null +++ b/make/Makefile.cann @@ -0,0 +1,115 @@ +# Build rules for CANN runner +# +# Note: at present we only support a single CANN version + +include 
make/common-defs.make
+
+ifeq ($(OS),windows)
+    # CANN is Linux-only for now; fail fast rather than leaving recipe lines outside a rule
+    $(error CANN does not currently support windows)
+endif
+
+# If CANN_INSTALL_DIR is unset or empty, fall back to ASCEND_TOOLKIT_HOME when it is defined
+ifeq ($(strip $(CANN_INSTALL_DIR)),)
+    ifneq ($(strip $(ASCEND_TOOLKIT_HOME)),)
+        CANN_INSTALL_DIR := $(ASCEND_TOOLKIT_HOME)
+        $(info CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$(ASCEND_TOOLKIT_HOME))
+    endif
+endif
+
+GPU_LIB_DIR := $(CANN_INSTALL_DIR)/lib64
+GPU_TRANSITIVE_LIBS = $(empty)
+GPU_RUNNER_GO_TAGS := cann
+GPU_RUNNER_NAME := cann
+
+GPU_RUNNER_SRCS := \
+    $(wildcard llama/ggml-cann/*.cpp) \
+    llama/ggml.c llama/ggml-backend.cpp llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-threading.cpp
+
+GPU_RUNNER_HDRS := \
+    $(wildcard llama/ggml-cann/*.h)
+
+# Build the custom CANN kernels
+CANN_KERNEL_SRC_DIR := llama/ggml-cann/kernels
+CANN_BUILD_DIR := $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)
+CANN_KERNEL_BUILD_DIR := $(CANN_BUILD_DIR)/kernels
+CANN_KERNEL_CMAKE_FLAGS = -DCANN_INSTALL_DIR=$(CANN_INSTALL_DIR)
+GGML_SHARED_BUILD_PRE_JOB := $(CANN_BUILD_DIR)/$(SHARED_PREFIX)ascendc_kernels.$(SHARED_EXT)
+
+GPU_COMPILER_LINUX := $(shell X=$$(which g++ 2>/dev/null) && echo $$X)
+GPU_COMPILER:=$(GPU_COMPILER_LINUX)
+GPU_RUNNER_LIBS_SHORT := ascendcl nnopbase opapi acl_op_compiler ascendc_kernels
+GPU_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
+CANN_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)-cann/lib/ollama
+GPU_DIST_DEPS_LIBS = $(sort $(addprefix $(CANN_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS)) $(notdir $(GPU_TRANSITIVE_LIBS))))
+CANN_INCLUDES_DIRS = -I$(CANN_INSTALL_DIR)/include \
+    -I$(CANN_INSTALL_DIR)/include/aclnn \
+    -I$(CANN_INSTALL_DIR)/acllib/include \
+    -I$(CANN_KERNEL_BUILD_DIR)/include/ascendc_kernels
+
+GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++11
+CANN_COMPILER_CXXFLAGS = \
+    $(GPU_COMPILER_FPIC) \
+    -c \
+    -O3 \
+    -DGGML_USE_CANN=1 \
+    -DGGML_BUILD=1 \
+    -DGGML_NATIVE=OFF \
+    -DGGML_SCHED_MAX_COPIES=4 \
+    -DGGML_USE_LLAMAFILE \
+    -DNDEBUG \
+    -D_CRT_SECURE_NO_WARNINGS \
+    -D_GNU_SOURCE \
+    -D_XOPEN_SOURCE=600 \
+    -DUSE_PROF_API=1 \
+    -std=gnu++14 \
+    -Wno-expansion-to-defined \
+    -Wno-invalid-noreturn \
+    -Wno-ignored-attributes \
+    -Wno-pass-failed \
+    -Wno-deprecated-declarations \
+    -Wno-unused-result \
+    $(CANN_INCLUDES_DIRS) \
+    -I.
+
+GPU_COMPILER_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE -std=c11
+GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE $(CANN_COMPILER_CXXFLAGS)
+
+include make/gpu.make
+
+# MAKEFLAGS += -j1
+# $(GPU_RUNNER_NAME): $(CANN_BUILD_DIR)/$(SHARED_PREFIX)ascendc_kernels.$(SHARED_EXT) WAIT
+# First, detect the Ascend SoC type
+SOC_VERSION :=
+SOC_TYPE :=
+detect_ascend_soc_type = $(shell npu-smi info | awk -F' ' 'NF > 0 && NR==7 {print $$3}')
+$(info CANN detect_ascend_soc_type auto-detected is $(detect_ascend_soc_type))
+ifeq ($(SOC_TYPE),)
+    SOC_VERSION := $(call detect_ascend_soc_type)
+    ifeq ($(SOC_VERSION),)
+        $(error Failed to auto-detect the Ascend SoC type; set SOC_TYPE manually or check that the Ascend device is working)
+    endif
+    SOC_VERSION := Ascend$(SOC_VERSION)
+    SOC_TYPE := $(SOC_VERSION)
+    $(info CANN SOC_VERSION auto-detected is $(SOC_VERSION))
+endif
+
+SOC_VERSION := $(shell echo $(SOC_TYPE) | tr '[:upper:]' '[:lower:]')
+
+# Construct the SoC-specific compile option ASCEND_<major SN>, such as ASCEND_910B or ASCEND_310P.
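+# For example, if npu-smi reports a chip name like 910B3, SOC_VERSION becomes
+# ascend910b3, SOC_TYPE_MAJOR_SN extracts 910b, and the kernels are configured with
+# SOC_TYPE_COMPILE_OPTION=ASCEND_910B (the macro the kernel sources test, like ASCEND_310P).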
+SOC_TYPE_MAJOR_SN := $(shell echo $(SOC_VERSION) | grep -o [0-9][0-9][0-9][a-zA-Z]*) +SOC_TYPE_COMPILE_OPTION := ASCEND_$(SOC_TYPE_MAJOR_SN) +SOC_TYPE_COMPILE_OPTION := $(shell echo $(SOC_TYPE_COMPILE_OPTION) | tr '[:lower:]' '[:upper:]') +CANN_KERNEL_CMAKE_FLAGS += -DSOC_TYPE=$(SOC_TYPE) -DSOC_VERSION=$(SOC_VERSION) -DSOC_TYPE_COMPILE_OPTION=$(SOC_TYPE_COMPILE_OPTION) +# Second build cann llama.cpp custom kernel +$(CANN_BUILD_DIR)/$(SHARED_PREFIX)ascendc_kernels.$(SHARED_EXT): + @rm -rf $(CANN_KERNEL_BUILD_DIR) + @-mkdir -p $(CANN_KERNEL_BUILD_DIR) + $(info CANN_INSTALL_DIR: $(CANN_INSTALL_DIR)) + cmake $(CANN_KERNEL_SRC_DIR) -G "Unix Makefiles" $(CANN_KERNEL_CMAKE_FLAGS) -B $(CANN_KERNEL_BUILD_DIR) -C $(CANN_KERNEL_SRC_DIR)/CMakeLists.txt -DCMAKE_BUILD_TYPE=debug + cmake --build $(CANN_KERNEL_BUILD_DIR) --config debug + @-mkdir -p $(DIST_LIB_DIR) + $(CP) $(CANN_KERNEL_BUILD_DIR)/lib/$(SHARED_PREFIX)ascendc_kernels.$(SHARED_EXT) $(DIST_LIB_DIR) + $(CP) $(CANN_KERNEL_BUILD_DIR)/lib/$(SHARED_PREFIX)ascendc_kernels.$(SHARED_EXT) $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ + +# endef \ No newline at end of file diff --git a/make/Makefile.sync b/make/Makefile.sync index 07922131aca..2a78045009a 100644 --- a/make/Makefile.sync +++ b/make/Makefile.sync @@ -131,7 +131,8 @@ GGML_FILES= \ ggml/src/ggml-cpu/amx/amx.h \ ggml/src/ggml-cpu/amx/amx.cpp \ ggml/src/ggml-cpu/amx/mmq.cpp \ - ggml/src/ggml-cpu/amx/mmq.h + ggml/src/ggml-cpu/amx/mmq.h \ + ggml/include/ggml-cann.h $(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR)))) $(DEST_DIR)ggml-metal-embed.metal: $(DEST_DIR)ggml-common.h $(DEST_DIR)ggml-metal-impl.h @@ -173,6 +174,15 @@ GGML_VENDOR_FILES= ggml/src/ggml-cuda/vendors/*.h GGML_VENDOR_FILES_EXPANDED=$(addprefix ggml/src/ggml-cuda/vendors/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_VENDOR_FILES))))) $(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/vendors/))) +# ggml-cann -> llama/ggml-cann/ +GGML_CANN_FILES= ggml/src/ggml-cann.cpp ggml/src/ggml-cann/*.cpp ggml/src/ggml-cann/*.h +GGML_CANN_FILES_EXPANDED = $(addprefix ggml/src/ggml-cann/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CANN_FILES))))) +$(foreach name,$(GGML_CANN_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cann/))) + +GGML_CANN_KERNEL_FILES= ggml/src/ggml-cann/kernels/*.cpp ggml/src/ggml-cann/kernels/*.h CMakeLists.txt +GGML_CANN_KERNEL_FILES_EXPANDED = $(addprefix ggml/src/ggml-cann/kernels/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CANN_KERNEL_FILES))))) +$(foreach name,$(GGML_CANN_KERNEL_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$$(DEST_DIR)ggml-cann/kernels/))) + # llava -> llama/ LAVA_FILES= \ examples/llava/clip.cpp \ diff --git a/make/gpu.make b/make/gpu.make index 96e1ad224da..88dc9a43460 100644 --- a/make/gpu.make +++ b/make/gpu.make @@ -14,20 +14,12 @@ DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR) GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))) -GPU_RUNNER_SRCS := \ - $(filter-out $(wildcard llama/ggml-cuda/fattn*.cu),$(wildcard llama/ggml-cuda/*.cu)) \ - $(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \ - llama/ggml.c llama/ggml-backend.cpp llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-threading.cpp -GPU_RUNNER_HDRS := \ - $(wildcard llama/ggml-cuda/*.cuh) - - # Conditional flags and components to speed up developer builds ifneq ($(OLLAMA_FAST_BUILD),) GPU_COMPILER_CUFLAGS += \ 
-DGGML_DISABLE_FLASH_ATTN else - GPU_RUNNER_SRCS += \ + GPU_RUNNER_SPEED_UP_BUILD_SRCS = \ $(wildcard llama/ggml-cuda/fattn*.cu) \ $(wildcard llama/ggml-cuda/template-instances/fattn-wmma*.cu) \ $(wildcard llama/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \ @@ -35,6 +27,18 @@ else $(wildcard llama/ggml-cuda/template-instances/fattn-vec*f16-f16.cu) endif +ifndef GPU_RUNNER_SRCS + GPU_RUNNER_SRCS := \ + $(filter-out $(wildcard llama/ggml-cuda/fattn*.cu),$(wildcard llama/ggml-cuda/*.cu)) \ + $(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \ + llama/ggml.c llama/ggml-backend.cpp llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-threading.cpp + GPU_RUNNER_SRCS += $(GPU_RUNNER_SPEED_UP_BUILD_SRCS) +endif +ifndef GPU_RUNNER_HDRS + GPU_RUNNER_HDRS := \ + $(wildcard llama/ggml-cuda/*.cuh) +endif + GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT)) GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT))) @@ -53,7 +57,7 @@ $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $< $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c @-mkdir -p $(dir $@) - $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $< + $(CCACHE) gcc -c $(GPU_COMPILER_CFLAGS) -o $@ $< $(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp @-mkdir -p $(dir $@) $(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $< @@ -61,7 +65,7 @@ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_ $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS) @-mkdir -p $(dir $@) GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./cmd/runner -$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS) +$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GGML_SHARED_BUILD_PRE_JOB) $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS) @-mkdir -p $(dir $@) $(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
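
The gpu.make rule above builds the runner with `go build ... -tags $(GPU_RUNNER_GO_TAGS)`, which for this runner expands to `-tags cann`; that tag is what activates the `#cgo cann ...` lines added to llama/llama.go. The file below is a minimal, hypothetical cgo example (not part of the patch) showing the mechanism; it assumes libggml_cann and libascendc_kernels are on the linker path whenever the tag is enabled.

package main

/*
// These directives are inert by default; `go build -tags cann` activates them,
// mirroring the `#cgo cann ...` lines added to llama/llama.go.
#cgo cann CFLAGS: -DGGML_USE_CANN
#cgo cann LDFLAGS: -lggml_cann -lascendc_kernels
#include <stdio.h>

static void report(void) {
#ifdef GGML_USE_CANN
    puts("compiled with GGML_USE_CANN");
#else
    puts("compiled without GGML_USE_CANN");
#endif
}
*/
import "C"

func main() {
	C.report()
}

Because gpu.make already passes the tag list, setting GPU_RUNNER_GO_TAGS := cann in make/Makefile.cann is all that is needed to switch llama.go onto the CANN cgo flags at compile time.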
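The SoC auto-detection in make/Makefile.cann is a chain of npu-smi, awk, grep, and tr calls, which is hard to follow in shell-in-make form. The standalone Go sketch below mirrors the same pipeline for reference; it carries over the makefile's assumption that `npu-smi info` prints the SoC name in the third column of its seventh output line, and all function names here are illustrative only.

package main

import (
	"fmt"
	"os/exec"
	"regexp"
	"strings"
)

// detectSocType mirrors `npu-smi info | awk -F' ' 'NF > 0 && NR==7 {print $3}'`:
// it returns the third whitespace-separated field of the seventh output line.
func detectSocType() (string, error) {
	out, err := exec.Command("npu-smi", "info").Output()
	if err != nil {
		return "", fmt.Errorf("npu-smi failed: %w", err)
	}
	lines := strings.Split(string(out), "\n")
	if len(lines) < 7 {
		return "", fmt.Errorf("unexpected npu-smi output: fewer than 7 lines")
	}
	fields := strings.Fields(lines[6]) // awk NR==7 is the seventh line
	if len(fields) < 3 {
		return "", fmt.Errorf("unexpected npu-smi output line: %q", lines[6])
	}
	return fields[2], nil
}

// compileOption mirrors the grep/tr pipeline: extract the major serial number
// (three digits followed by letters) and prefix it with ASCEND_, upper-cased.
func compileOption(socVersion string) string {
	sn := regexp.MustCompile(`[0-9]{3}[a-zA-Z]*`).FindString(socVersion)
	return "ASCEND_" + strings.ToUpper(sn)
}

func main() {
	soc, err := detectSocType()
	if err != nil {
		fmt.Println("Failed to auto-detect the Ascend SoC type:", err)
		return
	}
	socType := "Ascend" + soc              // e.g. Ascend910B
	socVersion := strings.ToLower(socType) // e.g. ascend910b
	fmt.Println("SOC_VERSION:", socVersion)
	fmt.Println("SOC_TYPE_COMPILE_OPTION:", compileOption(socVersion)) // e.g. ASCEND_910B
}

Run on a machine with the Ascend driver installed, this should print the same SOC_VERSION and SOC_TYPE_COMPILE_OPTION values that the makefile derives and passes to the kernel CMake build.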