whisper : Metal and ggml-alloc support (ggerganov#1270)
* metal : init

* whisper : factor out graph builds

* whisper : allocate encoder and decoder using ggml-alloc

* whisper : ggml-alloc is now supported

* whisper : CoreML support ggml-alloc

* build : fix ggml-alloc

* ios : update submodule

* extra : update sync-ggml.sh script to also sync ggml-alloc

* ci : see if this is causing the crash

* whisper : refactor ggml-alloc init

* whisper.android : try to fix build

* whisper : initial Metal version

* ci : try to debug vmem issue

* metal : decoder works on GPU!

* metal : add multi-decoder support

* ggml : fix ggml_nbytes (probably temp solution)

* metal : run "cross" step on the GPU

* whisper : remove ggml_repeat in the encoder

* whisper : offload the Encoder to Metal

* ggml : use simpler ggml_bytes() implementation

* ggml-alloc : try to make CI happy by reducing vram to 128GB

* whisper : add whisper_allocr to wrap ggml_allocr

* whisper : factor out alloc init in a function

* cmake : update to support Metal build

* whisper : add <functional> header

* objc : fix build (no Metal yet)

* ios : add Metal support

* swiftui : fix build

* metal : speed-up KQ multiplication

* metal : sync latest llama.cpp kernels

* readme : add Metal info

* ios : update submodule

* coreml : add code to toggle Core ML config (CPU, ANE, GPU)

* bench : fix timings by running a pre-heat

* bench : start benching the decoder

* whisper : add ggml_mul_mat_pad

* bench : fix uninitialized vars

* whisper : add comment for disabling mul-mat padding

* whisper : add description of ggml_mul_mat_pad

* whisper : clean-up ggml_mul_mat_pad

* metal : remove the "concurrent" flag

* bench : variable n_past

* ios : update SPM package
ggerganov authored and iThalay committed Sep 23, 2024
1 parent 66d962e commit d2df388
Showing 18 changed files with 1,537 additions and 934 deletions.
66 changes: 57 additions & 9 deletions CMakeLists.txt
@@ -1,4 +1,4 @@
cmake_minimum_required (VERSION 3.0)
cmake_minimum_required (VERSION 3.5)

project(whisper.cpp VERSION 1.4.2)

@@ -35,6 +35,12 @@ endif()

# options

if (APPLE)
set(WHISPER_METAL_DEFAULT ON)
else()
set(WHISPER_METAL_DEFAULT OFF)
endif()

option(BUILD_SHARED_LIBS "whisper: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})

option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON)
@@ -58,6 +64,8 @@ option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)

if (APPLE)
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
option(WHISPER_METAL "whisper: use Metal" ${WHISPER_METAL_DEFAULT})
option(WHISPER_METAL_NDEBUG "whisper: disable Metal debugging" OFF)
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
else()
@@ -113,6 +121,34 @@ if (APPLE)
endif()
endif()

if (WHISPER_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)

if (METAL_FRAMEWORK)
message(STATUS "Metal framework found")

set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_METAL)

if (WHISPER_METAL_NDEBUG)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_NDEBUG)
endif()
else()
message(WARNING "Metal framework not found")
endif()

set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)

# copy ggml-metal.metal to bin directory
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
endif()

if (WHISPER_COREML)
find_library(FOUNDATION_FRAMEWORK Foundation)
find_library(COREML_FRAMEWORK CoreML)
@@ -177,7 +213,7 @@ if (WHISPER_CUBLAS)

enable_language(CUDA)

set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)

add_compile_definitions(GGML_USE_CUBLAS)

@@ -228,7 +264,7 @@ if (WHISPER_CLBLAST)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")

set(GGML_OPENCL_SOURCES ggml-opencl.cpp ggml-opencl.h)
set(GGML_SOURCES_OPENCL ggml-opencl.cpp ggml-opencl.h)

add_compile_definitions(GGML_USE_CLBLAST)

@@ -426,8 +462,11 @@ set(TARGET whisper)
add_library(${TARGET}
ggml.h
ggml.c
${GGML_CUDA_SOURCES}
${GGML_OPENCL_SOURCES}
ggml-alloc.h
ggml-alloc.c
${GGML_SOURCES_METAL}
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
whisper.h
whisper.cpp
)
@@ -468,9 +507,15 @@ if (BUILD_SHARED_LIBS)
WHISPER_BUILD
GGML_BUILD
)

if (WHISPER_METAL)
# TODO: I think this should make ggml-metal.m "see" the ggml-metal.metal file from the "bin" directory
# but for some reason it does not work here like it does in llama.cpp
set_target_properties(${TARGET} PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
endif()
endif()

if (GGML_CUDA_SOURCES)
if (GGML_SOURCES_CUDA)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
@@ -486,10 +531,13 @@ target_compile_definitions(${TARGET} PUBLIC

set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "whisper.h")

include(GNUInstallDirs)

install(TARGETS ${TARGET}
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib/static
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib/static
RUNTIME DESTINATION bin
RESOURCE DESTINATION bin
PUBLIC_HEADER DESTINATION include
)

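With these CMake changes in place, Metal support defaults to ON on Apple platforms via `WHISPER_METAL_DEFAULT`. A minimal configure-and-build sketch, assuming an out-of-tree `build` directory (the exact invocation is illustrative and not part of this diff):

```
# Metal is on by default on macOS; the flag is shown explicitly for clarity
cmake -B build -DWHISPER_METAL=ON
cmake --build build --config Release

# opt out of Metal
cmake -B build -DWHISPER_METAL=OFF
```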
23 changes: 22 additions & 1 deletion Makefile
@@ -18,7 +18,7 @@ ifndef NVCC_VERSION
endif
endif

CCV := $(shell $(CC) --version | head -n 1)
CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)

# Mac OS + Arm can report x86_64
@@ -182,6 +182,15 @@ ifdef WHISPER_COREML_ALLOW_FALLBACK
endif
endif

ifndef WHISPER_NO_METAL
ifeq ($(UNAME_S),Darwin)
WHISPER_METAL := 1

CXXFLAGS += -DGGML_USE_METAL
LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
endif
endif

ifdef WHISPER_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
LDFLAGS += -lopenblas
@@ -288,6 +297,11 @@ $(info )
ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c $< -o $@

ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@

WHISPER_OBJ += ggml-alloc.o

whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
$(CXX) $(CXXFLAGS) -c $< -o $@

@@ -303,6 +317,13 @@ whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-imp
WHISPER_OBJ += whisper.o whisper-encoder.o whisper-encoder-impl.o
endif

ifdef WHISPER_METAL
ggml-metal.o: ggml-metal.m ggml-metal.h
$(CC) $(CFLAGS) -c $< -o $@

WHISPER_OBJ += ggml-metal.o
endif

libwhisper.a: ggml.o $(WHISPER_OBJ)
$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)

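On the Makefile side, Metal is now enabled automatically on Darwin unless `WHISPER_NO_METAL` is defined, and `ggml-alloc.o` is always added to `WHISPER_OBJ`. A usage sketch (assuming a standard macOS toolchain):

```
# default build on macOS: Metal and ggml-alloc are compiled in
make clean && make -j

# build without Metal
make clean && WHISPER_NO_METAL=1 make -j
```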
8 changes: 6 additions & 2 deletions README.md
@@ -11,14 +11,14 @@ Beta: [v1.4.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.2) / S
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

- Plain C/C++ implementation without dependencies
- Apple silicon first-class citizen - optimized via ARM NEON, Accelerate framework and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
- AVX intrinsics support for x86 architectures
- VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision
- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
- Low memory usage (Flash Attention)
- Zero memory allocations at runtime
- Runs on the CPU
- Support for CPU-only inference
- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
- [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
- [BLAS CPU support via OpenBLAS](https://github.com/ggerganov/whisper.cpp#blas-cpu-support-via-openblas)
@@ -50,6 +50,10 @@ You can also easily make your own offline voice assistant application: [command]

https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4

On Apple Silicon, the inference runs fully on the GPU via Metal:

https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225

Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)

## Implementation details
8 changes: 7 additions & 1 deletion coreml/whisper-encoder.mm
@@ -22,7 +22,13 @@

NSURL * url_model = [NSURL fileURLWithPath: path_model_str];

const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model error:nil]);
// select which device to run the Core ML model on
MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
config.computeUnits = MLComputeUnitsCPUAndGPU;
//config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
//config.computeUnits = MLComputeUnitsAll;

const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);

if (data == NULL) {
return NULL;
43 changes: 40 additions & 3 deletions examples/bench/bench.cpp
@@ -44,13 +44,13 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
fprintf(stderr, " %-7s 0 - whisper encoder\n", "");
fprintf(stderr, " %-7s 0 - whisper\n", "");
fprintf(stderr, " %-7s 1 - memcpy\n", "");
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
fprintf(stderr, "\n");
}

int whisper_bench_encoder(const whisper_params & params) {
int whisper_bench_full(const whisper_params & params) {
// whisper init

struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
@@ -69,12 +69,49 @@ int whisper_bench_encoder(const whisper_params & params) {
fprintf(stderr, "error: failed to set mel: %d\n", ret);
return 3;
}
// heat encoder
if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}

whisper_token tokens[512];
memset(tokens, 0, sizeof(tokens));

// prompt heat
if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}

// text-generation heat
if (int ret = whisper_decode(ctx, tokens, 1, 256, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}

whisper_reset_timings(ctx);

// actual run
if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}

for (int i = 0; i < 16; i++) {
if (int ret = whisper_decode(ctx, tokens, 256, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}
}

for (int i = 0; i < 256; i++) {
if (int ret = whisper_decode(ctx, tokens, 1, i, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}
}

whisper_print_timings(ctx);
whisper_free(ctx);

@@ -103,7 +140,7 @@ int main(int argc, char ** argv) {
int ret = -1;

switch (params.what) {
case 0: ret = whisper_bench_encoder(params); break;
case 0: ret = whisper_bench_full(params); break;
case 1: ret = whisper_bench_memcpy(params.n_threads); break;
case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
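The bench example now pre-heats the encoder and both decoder modes before resetting the timings, and then times the full encoder + decoder pipeline. A usage sketch (the binary name follows the example directory, and the model path is an assumption; substitute your own):

```
# full whisper benchmark: encoder + 16 prompt decodes + 256 single-token decodes
./bench -m models/ggml-base.en.bin -w 0 -t 4

# micro-benchmarks
./bench -w 1   # memcpy
./bench -w 2   # ggml_mul_mat
```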
2 changes: 1 addition & 1 deletion examples/talk-llama/CMakeLists.txt
@@ -7,7 +7,7 @@ if (WHISPER_SDL2)

# TODO: this is temporary
# need to export ggml symbols for MSVC, but too lazy ..
add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../ggml-alloc.c ../../whisper.cpp)

target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
examples/whisper.android (JNI whisper CMakeLists.txt)
@@ -8,6 +8,7 @@ set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
set(
SOURCE_FILES
${WHISPER_LIB_DIR}/ggml.c
${WHISPER_LIB_DIR}/ggml-alloc.c
${WHISPER_LIB_DIR}/whisper.cpp
${CMAKE_SOURCE_DIR}/jni.c
)
@@ -20,7 +21,7 @@ function(build_library target_name)
SHARED
${SOURCE_FILES}
)

target_link_libraries(${target_name} ${LOG_LIB} android)

if (${target_name} STREQUAL "whisper_v8fp16_va")
12 changes: 12 additions & 0 deletions examples/whisper.objc/README.md
@@ -28,10 +28,22 @@ This can significantly improve the performance of the transcription:

<img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">

## Core ML

If you want to enable Core ML support, you can add the `-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK` compiler flags for `whisper.cpp` in Build Phases:

<img width="1072" alt="image" src="https://github.com/ggerganov/whisper.cpp/assets/3001525/103e8f57-6eb6-490d-a60c-f6cf6c319324">

Then follow the [`Core ML support` section of the readme](../../README.md#core-ml-support) to convert the model.

This project also adds `-O3 -DNDEBUG` to `Other C Flags`, but adding flags at the app-project level is not ideal in a real-world setup (they apply to all C/C++ files); consider splitting the xcodeproj into a workspace in your own project.

## Metal

You can also enable Metal to make the inference run on the GPU of your device. This may or may not be more efficient than Core ML, depending on the model and device you use.

To enable Metal, just add `-DGGML_USE_METAL` instead of the `-DWHISPER_USE_COREML` flag and you are ready.
This will make both the Encoder and the Decoder run on the GPU.

If you want to run the Encoder with Core ML and the Decoder with Metal, then simply add both `-DWHISPER_USE_COREML -DGGML_USE_METAL` flags. That's all!
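For reference, a combined `Other C Flags` entry for the Core ML Encoder + Metal Decoder setup described above might look like this (a sketch assembled from the flags mentioned in this section):

```
-O3 -DNDEBUG -DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK -DGGML_USE_METAL
```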
