whisper : Metal and ggml-alloc support (ggerganov#1270)
* metal : init

* whisper : factor out graph builds

* whisper : allocate encoder and decoder using ggml-alloc

* whisper : ggml-alloc is now supported

* whisper : CoreML support ggml-alloc

* build : fix ggml-alloc

* ios : update submodule

* extra : update sync-ggml.sh script to also sync ggml-alloc

* ci : see if this is causing the crash

* whisper : refactor ggml-alloc init

* whisper.android : try to fix build

* whisper : initial Metal version

* ci : try to debug vmem issue

* metal : decoder works on GPU!

* metal : add multi-decoder support

* ggml : fix ggml_nbytes (probably temp solution)

* metal : run "cross" step on the GPU

* whisper : remove ggml_repeat in the encoder

* whisper : offload the Encoder to Metal

* ggml : use simpler ggml_bytes() implementation

* ggml-alloc : try to make CI happy by reducing vram to 128GB

* whisper : add whisper_allocr to wrap ggml_allocr

* whisper : factor out alloc init in a function

* cmake : update to support Metal build

* whisper : add <functional> header

* objc : fix build (no Metal yet)

* ios : add Metal support

* swiftui : fix build

* metal : speed-up KQ multiplication

* metal : sync latest llama.cpp kernels

* readme : add Metal info

* ios : update submodule

* coreml : add code to toggle Core ML config (CPU, ANE, GPU)

* bench : fix timings by running a pre-heat

* bench : start benching the decoder

* whisper : add ggml_mul_mat_pad

* bench : fix uninitialized vars

* whisper : add comment for disabling mul-mat padding

* whisper : add description of ggml_mul_mat_pad

* whisper : clean-up ggml_mul_mat_pad

* metal : remove the "concurrent" flag

* bench : variable n_past

* ios : update SPM package
ggerganov authored and iThalay committed Sep 23, 2024
1 parent 66d962e commit d2df388
Showing 18 changed files with 1,537 additions and 934 deletions.
66 changes: 57 additions & 9 deletions CMakeLists.txt
@@ -1,4 +1,4 @@
cmake_minimum_required (VERSION 3.0)
cmake_minimum_required (VERSION 3.5)

project(whisper.cpp VERSION 1.4.2)

@@ -35,6 +35,12 @@ endif()

# options

if (APPLE)
set(WHISPER_METAL_DEFAULT ON)
else()
set(WHISPER_METAL_DEFAULT OFF)
endif()

option(BUILD_SHARED_LIBS "whisper: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})

option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON)
@@ -58,6 +64,8 @@ option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)

if (APPLE)
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
option(WHISPER_METAL "whisper: use Metal" ${WHISPER_METAL_DEFAULT})
option(WHISPER_METAL_NDEBUG "whisper: disable Metal debugging" OFF)
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
else()
@@ -113,6 +121,34 @@ if (APPLE)
endif()
endif()

if (WHISPER_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)

if (METAL_FRAMEWORK)
message(STATUS "Metal framework found")

set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_METAL)

if (WHISPER_METAL_NDEBUG)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_NDEBUG)
endif()
else()
message(WARNING "Metal framework not found")
endif()

set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
endif()

if (WHISPER_COREML)
find_library(FOUNDATION_FRAMEWORK Foundation)
find_library(COREML_FRAMEWORK CoreML)
@@ -177,7 +213,7 @@ if (WHISPER_CUBLAS)

enable_language(CUDA)

set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

add_compile_definitions(GGML_USE_CUBLAS)

@@ -228,7 +264,7 @@ if (WHISPER_CLBLAST)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")

set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)

add_compile_definitions(GGML_USE_CLBLAST)

@@ -426,8 +462,11 @@ set(TARGET whisper)
add_library(${TARGET}
ggml.h
ggml.c
${GGML_CUDA_SOURCES}
${GGML_OPENCL_SOURCES}
ggml-alloc.h
ggml-alloc.c
${GGML_SOURCES_METAL}
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
whisper.h
whisper.cpp
)
@@ -468,9 +507,15 @@ if (BUILD_SHARED_LIBS)
WHISPER_BUILD
GGML_BUILD
)

if (WHISPER_METAL)
# TODO: I think this should make ggml-metal.m "see" the ggml-metal.metal file from the "bin" directory
# but for some reason it does not work here like it does in llama.cpp
set_target_properties(${TARGET} PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
endif()
endif()

if (GGML_CUDA_SOURCES)
if (GGML_SOURCES_CUDA)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
@@ -486,10 +531,13 @@ target_compile_definitions(${TARGET} PUBLIC

set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "whisper.h")

include(GNUInstallDirs)

install(TARGETS ${TARGET}
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib/static
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib/static
RUNTIME DESTINATION bin
RESOURCE DESTINATION bin
PUBLIC_HEADER DESTINATION include
)

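With these CMake changes in place, Metal support defaults to ON on Apple platforms via `WHISPER_METAL_DEFAULT`. A minimal configure-and-build sketch, assuming an out-of-tree `build` directory (the exact invocation is illustrative and not part of this diff):

```
# Metal is on by default on macOS; the flag is shown explicitly for clarity
cmake -B build -DWHISPER_METAL=ON
cmake --build build --config Release

# opt out of Metal
cmake -B build -DWHISPER_METAL=OFF
```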
23 changes: 22 additions & 1 deletion Makefile
@@ -18,7 +18,7 @@ ifndef NVCC_VERSION
endif
endif

CCV := $(shell $(CC) --version | head -n 1)
CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)

# Mac OS + Arm can report x86_64
@@ -182,6 +182,15 @@ ifdef WHISPER_COREML_ALLOW_FALLBACK
endif
endif

ifndef WHISPER_NO_METAL
ifeq ($(UNAME_S),Darwin)
WHISPER_METAL := 1

CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
endif
endif

ifdef WHISPER_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
LDFLAGS += -lopenblas
@@ -288,6 +297,11 @@ $(info )
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@

ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@

WHISPER_OBJ += ggml-alloc.o

whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
$(CXX) $(CXXFLAGS) -c $< -o $@

@@ -303,6 +317,13 @@ whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-imp
WHISPER_OBJ += whisper.o whisper-encoder.o whisper-encoder-impl.o
endif

ifdef WHISPER_METAL
ggml-metal.o: ggml-metal.m ggml-metal.h
$(CC) $(CFLAGS) -c $< -o $@

WHISPER_OBJ += ggml-metal.o
endif

libwhisper.a: ggml.o $(WHISPER_OBJ)
$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)

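On the Makefile side, Metal is now enabled automatically on Darwin unless `WHISPER_NO_METAL` is defined, and `ggml-alloc.o` is always added to `WHISPER_OBJ`. A usage sketch (assuming a standard macOS toolchain):

```
# default build on macOS: Metal and ggml-alloc are compiled in
make clean && make -j

# build without Metal
make clean && WHISPER_NO_METAL=1 make -j
```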
8 changes: 6 additions & 2 deletions README.md
@@ -11,14 +11,14 @@ Beta: [v1.4.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.2) / S
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

- Plain C/C++ implementation without dependencies
- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate framework and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
- AVX intrinsics support for x86 architectures
- VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision
- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
- Low memory usage (Flash Attention)
- Zero memory allocations at runtime
- Runs on the CPU
- Support for CPU-only inference
- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
- [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
- [BLAS CPU support via OpenBLAS](https://github.com/ggerganov/whisper.cpp#blas-cpu-support-via-openblas)
@@ -50,6 +50,10 @@ You can also easily make your own offline voice assistant application: [command]

https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4

On Apple Silicon, the inference runs fully on the GPU via Metal:

https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225

Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)

## Implementation details
8 changes: 7 additions & 1 deletion coreml/whisper-encoder.mm
@@ -22,7 +22,13 @@

NSURL * url_model = [NSURL fileURLWithPath: path_model_str];

const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model error:nil]);
// select which device to run the Core ML model on
MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
config.computeUnits = MLComputeUnitsCPUAndGPU;
//config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
//config.computeUnits = MLComputeUnitsAll;

const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);

if (data == NULL) {
return NULL;
43 changes: 40 additions & 3 deletions examples/bench/bench.cpp
@@ -44,13 +44,13 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
fprintf(stderr, " %-7s 0 - whisper encoder\n", "");
fprintf(stderr, " %-7s 0 - whisper\n", "");
fprintf(stderr, " %-7s 1 - memcpy\n", "");
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
fprintf(stderr, "\n");
}

int whisper_bench_encoder(const whisper_params & params) {
int whisper_bench_full(const whisper_params & params) {
// whisper init

struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@@ -69,12 +69,49 @@ int whisper_bench_encoder(const whisper_params & params) {
fprintf(stderr, "error: failed to set mel: %d\n", ret);
return 3;
}
// heat encoder
if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}

whisper_token tokens[512];
memset(tokens, 0, sizeof(tokens));

// prompt heat
if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}

// text-generation heat
if (int ret = whisper_decode(ctx, tokens, 1, 256, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}

whisper_reset_timings(ctx);

// actual run
if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}

for (int i = 0; i < 16; i++) {
if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}
}

for (int i = 0; i < 256; i++) {
if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}
}

whisper_print_timings(ctx);
whisper_free(ctx);

@@ -103,7 +140,7 @@ int main(int argc, char ** argv) {
int ret = -1;

switch (params.what) {
case 0: ret = whisper_bench_encoder(params); break;
case 0: ret = whisper_bench_full(params); break;
case 1: ret = whisper_bench_memcpy(params.n_threads); break;
case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
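The bench example now pre-heats the encoder and both decoder modes before resetting the timings, and then times the full encoder + decoder pipeline. A usage sketch (the binary name follows the example directory, and the model path is an assumption; substitute your own):

```
# full whisper benchmark: encoder + 16 prompt decodes + 256 single-token decodes
./bench -m models/ggml-base.en.bin -w 0 -t 4

# micro-benchmarks
./bench -w 1   # memcpy
./bench -w 2   # ggml_mul_mat
```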
2 changes: 1 addition & 1 deletion examples/talk-llama/CMakeLists.txt
@@ -7,7 +7,7 @@ if (WHISPER_SDL2)

# TODO: this is temporary
# need to export ggml symbols for MSVC, but too lazy ..
add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../ggml-alloc.c ../../whisper.cpp)

target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
examples/whisper.android (JNI whisper CMakeLists.txt)
@@ -8,6 +8,7 @@ set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
set(
SOURCE_FILES
${WHISPER_LIB_DIR}/ggml.c
${WHISPER_LIB_DIR}/ggml-alloc.c
${WHISPER_LIB_DIR}/whisper.cpp
${CMAKE_SOURCE_DIR}/jni.c
)
@@ -20,7 +21,7 @@ function(build_library target_name)
SHARED
${SOURCE_FILES}
)

target_link_libraries(${target_name} ${LOG_LIB} android)

if (${target_name} STREQUAL "whisper_v8fp16_va")
12 changes: 12 additions & 0 deletions examples/whisper.objc/README.md
@@ -28,10 +28,22 @@ This can significantly improve the performance of the transcription:

<img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">

## Core ML

If you want to enable Core ML support, you can add the `-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK` compiler flags for `whisper.cpp` in Build Phases:

<img width="1072" alt="image" src="https://github.com/ggerganov/whisper.cpp/assets/3001525/103e8f57-6eb6-490d-a60c-f6cf6c319324">

Then follow the [`Core ML support` section of the readme](../../README.md#core-ml-support) to convert the model.

This project also adds `-O3 -DNDEBUG` to `Other C Flags`, but adding flags at the app-project level is not ideal in a real-world setup (they apply to all C/C++ files); consider splitting the xcodeproj into a workspace in your own project.

## Metal

You can also enable Metal to make the inference run on the GPU of your device. This may or may not be more efficient than Core ML, depending on the model and device you use.

To enable Metal, just add `-DGGML_USE_METAL` instead of the `-DWHISPER_USE_COREML` flag and you are ready.
This will make both the Encoder and the Decoder run on the GPU.

If you want to run the Encoder with Core ML and the Decoder with Metal, then simply add both `-DWHISPER_USE_COREML -DGGML_USE_METAL` flags. That's all!
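For reference, a combined `Other C Flags` entry for the Core ML Encoder + Metal Decoder setup described above might look like this (a sketch assembled from the flags mentioned in this section):

```
-O3 -DNDEBUG -DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK -DGGML_USE_METAL
```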
