Skip to content

Commit

Permalink
adapt to test-backend-ops.cpp
Browse files Browse the repository at this point in the history
  • Loading branch information
zhou.weiguo committed Apr 25, 2024
1 parent 180ab5f commit eff9669
Show file tree
Hide file tree
Showing 9 changed files with 242 additions and 43 deletions.
6 changes: 4 additions & 2 deletions README-qnn.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,14 @@ Any **mainstream** Android phone based on Qualcomm's mobile SoC should be suppor
### II. Build llama.cpp + QNN backend


Please refer to [project kantv](https://github.com/zhouwg/kantv) firstly.
Please refer to [project kantv](https://github.com/zhouwg/kantv)


A small, standalone Android example (or reuse [the existing Android example in llama.cpp](https://github.com/ggerganov/llama.cpp/tree/master/examples/llama.android)) intended to help community developers participate in developing and verifying the QNN backend.
or


use [test-backend-ops.cpp](tests/ggml-qnn) to verify the QNN backend on a Qualcomm mobile SoC based Android phone.

### III. Run the inference on Qualcomm mobile SoC based Android phone


Expand Down
5 changes: 5 additions & 0 deletions ggml-backend.c
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
extern GGML_CALL void ggml_backend_kompute_reg_devices(void);
ggml_backend_kompute_reg_devices();
#endif

#ifdef GGML_USE_QNN
extern GGML_CALL int ggml_backend_qnn_reg_devices(void);
ggml_backend_qnn_reg_devices();
#endif
}

GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
Expand Down
70 changes: 34 additions & 36 deletions ggml-qnn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1176,7 +1176,6 @@ static void qnn_buf_buffer_put(qnn_buf_t * fifo, buf_element_t * element) {
fifo->qnn_buf_size++;
fifo->qnn_buf_data_size += element->size;

LOGJ("put:index %d, fifo->size is %d, self->buffer_pool_num_free %d\n", element->id, fifo->qnn_buf_size, fifo->buffer_pool_num_free);
pthread_cond_signal (&fifo->not_empty);

pthread_mutex_unlock (&fifo->mutex);
Expand Down Expand Up @@ -1426,9 +1425,12 @@ static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const
int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args);
if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) {
#if (defined __ANDROID__) || (defined ANDROID)
__android_log_print(level, "ggml-qnn", "%s", s_ggml_qnn_log_internal_buf);
//for Android APP
__android_log_print(level, "ggml-qnn", "%s\n", s_ggml_qnn_log_internal_buf);
//for Android terminal
printf("%s\n", s_ggml_qnn_log_internal_buf);
#else
printf("%s", buffer); //Qualcomm's QNN could running on Windows over ARM(aka WoA)
printf("%s\n", s_ggml_qnn_log_internal_buf);
#endif
}
va_end(args);
Expand Down Expand Up @@ -2125,9 +2127,9 @@ int qnn_instance::load_system() {

_qnn_interface.qnn_system_context_create(&_qnn_system_handle);
if (nullptr == _qnn_system_handle) {
LOGW("can not create QNN system contenxt\n");
QNN_LOG_WARN("can not create QNN system contenxt\n");
} else {
QNN_LOG_DEBUG("initialize qnn system successfully\n");
QNN_LOG_INFO("initialize qnn system successfully\n");
}

return 0;
Expand Down Expand Up @@ -2494,24 +2496,23 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * src0, const struct
if (dst->op == GGML_OP_ADD) {
return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) &&
(src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) &&
(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1)) &&
(src0->rank == src1->rank);
(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16) && ((ne00 > 1 && ne01 > 1 && ne10 > 1 && ne11 > 1));

}

if (dst->op == GGML_OP_MUL_MAT) {
#if 1 // log output have significant effect to performance but useful during development stage
QNN_LOG_DEBUG("GGML_OP_MUL_MAT");
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src0->name, src0->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src0->name,
src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
src0->nb[0], src0->nb[1], src0->nb[2]);
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src1->name, src1->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src1->name,
src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
src1->nb[0], src1->nb[1], src1->nb[2]);
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
dst->name, dst->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
dst->name,
dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
dst->nb[1], dst->nb[2]);
#endif
Expand Down Expand Up @@ -2576,18 +2577,18 @@ static void ggml_qnn_add(const ggml_tensor * src0, const ggml_tensor * src1, ggm
QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;

n_begin_time = ggml_time_us();
#if 0 //it works fine with whisper.cpp and llama.cpp. comment them because focus on mulmat in llama.cpp inference since 04-23-2024
#if 0
QNN_LOG_DEBUG("call %s\n", __func__);
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src0->name, src0->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src0->name,
src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
src0->nb[0], src0->nb[1], src0->nb[2]);
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src1->name, src1->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src1->name,
src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
src1->nb[0], src1->nb[1], src1->nb[2]);
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
dst->name, dst->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
dst->name,
dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
dst->nb[1], dst->nb[2]);
QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
Expand Down Expand Up @@ -2793,16 +2794,16 @@ static void ggml_qnn_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1,

n_begin_time = ggml_time_us();
QNN_LOG_DEBUG("call %s\n", __func__);
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src0->name, src0->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src0->name,
src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
src0->nb[0], src0->nb[1], src0->nb[2]);
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src1->name, src1->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src1->name,
src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
src1->nb[0], src1->nb[1], src1->nb[2]);
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
dst->name, dst->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
dst->name,
dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
dst->nb[1], dst->nb[2]);
QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
Expand Down Expand Up @@ -3000,16 +3001,16 @@ static void ggml_qnn_hanlde_op(const enum ggml_op ggmlop, const ggml_tensor * sr

n_begin_time = ggml_time_us();
QNN_LOG_DEBUG("call %s\n", __func__);
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src0->name, src0->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src0->name,
src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
src0->nb[0], src0->nb[1], src0->nb[2]);
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src1->name, src1->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
src1->name,
src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
src1->nb[0], src1->nb[1], src1->nb[2]);
QNN_LOG_INFO("%15s: rank = %d, type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
dst->name, dst->rank,
QNN_LOG_INFO("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
dst->name,
dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
dst->nb[1], dst->nb[2]);
QNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
Expand Down Expand Up @@ -4396,7 +4397,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
}


#if 0 //replaced with ggml_status ggml_backend_qnn_graph_compute_multithread
static void * ggml_graph_compute_thread(void * data) {
struct ggml_compute_state * state = (struct ggml_compute_state *) data;

Expand Down Expand Up @@ -4531,7 +4531,6 @@ static void * ggml_graph_compute_thread(void * data) {

return 0;
}
#endif


static ggml_status ggml_backend_qnn_graph_compute_multithread(ggml_backend_t backend, ggml_cgraph * cgraph) {
Expand Down Expand Up @@ -4830,8 +4829,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
}


extern "C" int ggml_backend_qnn_reg_devices();

extern "C" int ggml_backend_qnn_reg_devices(void);

int ggml_backend_qnn_reg_devices() {
for (size_t idx = 0; idx < GGML_QNN_MAX_DEVICES; idx++) {
Expand Down
6 changes: 1 addition & 5 deletions ggml-qnn.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ enum QNNBackend {
QNN_HTP,
};

GGML_API int ggml_backend_qnn_reg_devices();
GGML_API int ggml_backend_qnn_reg_devices(void);

/**
*
Expand All @@ -39,10 +39,6 @@ GGML_API void ggml_backend_qnn_get_device_description(int device, char

GGML_API ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t dev_num);

// TODO: this is a temporary API, should be removed in the future
GGML_API bool ggml_qnn_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);


#ifdef __cplusplus
}
#endif
3 changes: 3 additions & 0 deletions tests/ggml-qnn/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
out
android-ndk-r26c*
test-qnn*
80 changes: 80 additions & 0 deletions tests/ggml-qnn/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Builds tests/test-backend-ops.cpp together with the ggml core sources and the
# QNN backend (ggml-qnn.cpp) into a single Android executable.
#
# Required inputs (normally passed with -D on the cmake command line):
#   PROJECT_ROOT_PATH - root of the llama.cpp source tree
#   QNN_SDK_PATH      - root of the Qualcomm QNN SDK
#   TARGET_NAME       - name of the executable to produce
cmake_minimum_required(VERSION 3.22.1)
project(ggml-qnn)

# Fail at configure time with a clear message instead of producing paths such
# as "/ggml.c" or an add_executable() call without a target name.
foreach(required_var PROJECT_ROOT_PATH QNN_SDK_PATH TARGET_NAME)
    if (NOT DEFINED ${required_var} OR "${${required_var}}" STREQUAL "")
        message(FATAL_ERROR "${required_var} is not set; pass -D${required_var}=<value> to cmake")
    endif()
endforeach()

set(CMAKE_VERBOSE_MAKEFILE on)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# Selects the CPU tuning flags further below; edit here to switch.
set(TARGET_SNAPDRAGON_8_GEN3 OFF)

set(LLAMACPP_SRC_PATH ${PROJECT_ROOT_PATH})
set(QNN_INC_PATH ${QNN_SDK_PATH}/include/QNN)
set(QNN_LIB_PATH ${QNN_SDK_PATH}/lib/aarch64-android)

include_directories(${QNN_INC_PATH})
include_directories(${LLAMACPP_SRC_PATH})
include_directories(${LLAMACPP_SRC_PATH}/common)

set(SOURCE_FILES
    ${LLAMACPP_SRC_PATH}/ggml.c
    ${LLAMACPP_SRC_PATH}/ggml-alloc.c
    ${LLAMACPP_SRC_PATH}/ggml-backend.c
    ${LLAMACPP_SRC_PATH}/ggml-quants.c
    ${LLAMACPP_SRC_PATH}/ggml-qnn.cpp
    ${LLAMACPP_SRC_PATH}/tests/test-backend-ops.cpp
)


message("PROJECT_ROOT_PATH : ${PROJECT_ROOT_PATH}")
message("LLAMACPP_SRC_PATH : ${LLAMACPP_SRC_PATH}")
message("QNN_SDK_PATH : ${QNN_SDK_PATH}")
message("QNN_INC_PATH : ${QNN_INC_PATH}")
message("QNN_LIB_PATH : ${QNN_LIB_PATH}")
message("target name : ${TARGET_NAME}")


# Preprocessor definitions. GGML_USE_QNN enables the QNN registration code
# guarded by "#ifdef GGML_USE_QNN" in ggml-backend.c.
add_compile_definitions(TARGET_ANDROID)
add_compile_definitions(__ARM_NEON)
add_compile_definitions(GGML_USE_QNN)

add_compile_definitions(NDEBUG)

# Compiler options go through add_compile_options(); add_definitions() is meant
# for preprocessor definitions only.
add_compile_options(-O3)

if (TARGET_SNAPDRAGON_8_GEN3)
    add_compile_options(-march=armv8.7-a)
    add_compile_options(-mcpu=cortex-x1)
    add_compile_options(-mtune=cortex-x1)

else()

    # the build optimization below might work well on all mainstream Android phones
    add_compile_options(-mcpu=cortex-a72)

endif()

add_compile_options("-Wall" "-Wno-sign-compare")

if (GGML_JNI_QNN)
    # Collects the prebuilt QNN runtime libraries; the copy step is currently disabled,
    # so the glob result is unused.
    file(GLOB allPrebuiltQNNLibs "${QNN_LIB_PATH}/libQnn*.so")

    #file(COPY ${allPrebuiltQNNLibs} DESTINATION ${PREBUILT_LIB_PATH}/ )

endif()

find_library(LOG_LIB log)

# NOTE(review): PREBUILT_LIB_PATH is not set anywhere in this file and no target
# links against QNNCpu here - confirm whether this imported target is still needed.
add_library(QNNCpu
            SHARED
            IMPORTED)

set_target_properties(QNNCpu
                      PROPERTIES
                      IMPORTED_LOCATION
                      ${PREBUILT_LIB_PATH}/libQnnCpu.so)

# Applies to every target created after this point, i.e. the test executable below.
link_libraries(${LOG_LIB} android)

add_executable(${TARGET_NAME}
               ${SOURCE_FILES}
)
89 changes: 89 additions & 0 deletions tests/ggml-qnn/build-ggml-qnn.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/bin/bash

# Abort the whole script as soon as any command exits with a non-zero status.
set -e

#modify following lines to adapt to local dev envs
# Root of the llama.cpp source tree; forwarded to cmake as PROJECT_ROOT_PATH.
PROJECT_ROOT_PATH=~/github/llama.cpp/
# Download locations for the Qualcomm SDKs:
#https://qpm.qualcomm.com/#/main/tools/details/qualcomm_ai_engine_direct
#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools
# Root of the QNN SDK installation; forwarded to cmake as QNN_SDK_PATH.
QNN_SDK_PATH=/opt/qcom/aistack/qnn/2.20.0.240223/


# The NDK is expected (and, if missing, unpacked) in the directory the script is run from.
ANDROID_NDK=`pwd`/android-ndk-r26c
# Name of the executable produced by the build; forwarded to cmake as TARGET_NAME.
TARGET=ggml-qnn-test
# NOTE(review): build_arm64 also expands PROJECT_BUILD_TYPE and ANDROID_PLATFORM, which
# are not assigned in this script - they are empty unless provided by the environment.


function dump_vars()
{
    # Print the three path settings the build depends on, one "NAME: value" line each.
    local var_name
    for var_name in PROJECT_ROOT_PATH ANDROID_NDK QNN_SDK_PATH; do
        # ${!var_name} is bash indirect expansion: the value of the variable named by var_name.
        echo -e "${var_name}: ${!var_name}"
    done
}


function show_pwd()
{
    # Report the directory the script is running in, followed by an empty line.
    local cwd="$(pwd)"
    echo -e "current working path:${cwd}\n"
}


# Make sure Android NDK r26c is available at ${ANDROID_NDK}.
#
# The NDK counts as present when the directory exists and contains
# build/cmake/android.toolchain.cmake. Otherwise the release archive is
# downloaded into the current directory (unless it is already there) and
# unpacked. Exits the script with status 1 when the download or unpack fails.
function check_and_download_ndk()
{
    local ndk_archive=android-ndk-r26c-linux.zip
    local ndk_url=https://dl.google.com/android/repository/android-ndk-r26c-linux.zip

    is_android_ndk_exist=1

    if [ ! -d "${ANDROID_NDK}" ]; then
        is_android_ndk_exist=0
    fi

    if [ ! -f "${ANDROID_NDK}/build/cmake/android.toolchain.cmake" ]; then
        is_android_ndk_exist=0
    fi

    if [ ${is_android_ndk_exist} -ne 0 ]; then
        printf "android ndk already exist:%s \n\n" "${ANDROID_NDK}"
        return 0
    fi

    if [ ! -f "${ndk_archive}" ]; then
        # The command is tested directly because "set -e" would otherwise abort the
        # script before any cleanup could run. "wget -O" creates the output file even
        # when the transfer fails, so the partial archive is removed; otherwise the
        # next run would find it and skip the download.
        if ! wget --no-config --quiet --show-progress -O "${ndk_archive}" "${ndk_url}"; then
            rm -f "${ndk_archive}"
            printf "failed to download android ndk from %s \n" "${ndk_url}"
            exit 1
        fi
    fi

    # Same reasoning as above: a separate "$?" check after unzip is never reached
    # under "set -e", so the failure is handled in the condition itself.
    if ! unzip "${ndk_archive}"; then
        printf "failed to unzip %s to %s \n" "${ndk_archive}" "${ANDROID_NDK}"
        exit 1
    fi

    printf "android ndk saved to %s \n\n" "${ANDROID_NDK}"
}


# Configure and build the arm64-v8a test executable in ./out/arm64-v8a using the
# NDK's CMake toolchain file, then copy the resulting binary next to this script.
#
# NOTE(review): PROJECT_BUILD_TYPE and ANDROID_PLATFORM are not assigned in this
# script; unless the environment provides them they are passed to cmake empty.
function build_arm64
{
    # "-S" is the documented way to name the source directory ("-H<dir>" is an
    # undocumented legacy spelling). All expansions are quoted so that paths
    # containing spaces reach cmake as a single argument.
    cmake -S . -B ./out/arm64-v8a \
        -DPROJECT_ROOT_PATH="${PROJECT_ROOT_PATH}" \
        -DTARGET_NAME="${TARGET}" \
        -DCMAKE_BUILD_TYPE="${PROJECT_BUILD_TYPE}" \
        -DANDROID_ABI=arm64-v8a \
        -DANDROID_PLATFORM="${ANDROID_PLATFORM}" \
        -DANDROID_NDK="${ANDROID_NDK}" \
        -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
        -DQNN_SDK_PATH="${QNN_SDK_PATH}"

    cd ./out/arm64-v8a
    make

    # Show the built binary, then copy it two levels up (the directory cmake was run from).
    ls -lah "${TARGET}"
    /bin/cp "${TARGET}" ../../
    cd -
}


function remove_temp_dir()
{
    # Nothing to clean up when no "out" directory exists in the current directory.
    if [ ! -d out ]; then
        return 0
    fi

    # Delete the previous build tree so the next configure starts from scratch.
    echo "remove out directory in $(pwd)"
    rm -rf out
}



# Script entry point: the steps below run in order, and "set -e" stops at the first failure.
show_pwd
# Fetch and unpack the NDK if it is not already usable.
check_and_download_ndk
dump_vars
# Remove any previous ./out build tree before configuring again.
remove_temp_dir
build_arm64
Loading

0 comments on commit eff9669

Please sign in to comment.