Commit aaf9649: Merge branch 'master' into gpu-param
jhen0409 committed Nov 5, 2023
2 parents: 52c3dd6 + f96e1c5

Showing 48 changed files with 30,893 additions and 7,740 deletions.
28 changes: 28 additions & 0 deletions .devops/cublas.Dockerfile
@@ -0,0 +1,28 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1

# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
apt-get install -y build-essential git cmake

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV WHISPER_CUBLAS=1

RUN make

ENTRYPOINT ["/app/main"]
22 changes: 13 additions & 9 deletions CMakeLists.txt
@@ -117,7 +117,7 @@ if (APPLE)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
else()
message(WARNING "Accelerate framework not found")
message(FATAL_ERROR "Accelerate framework not found")
endif()
endif()

@@ -140,7 +140,7 @@ if (APPLE)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_NDEBUG)
endif()
else()
message(WARNING "Metal framework not found")
message(FATAL_ERROR "Metal framework not found")
endif()

set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
@@ -158,7 +158,7 @@ if (APPLE)

set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
else()
message(WARNING "CoreML framework not found")
message(FATAL_ERROR "CoreML framework not found")
endif()

if (WHISPER_COREML_ALLOW_FALLBACK)
@@ -181,13 +181,13 @@ if (WHISPER_BLAS)
include_directories($ENV{OPENBLAS_PATH}/include)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
else ()
message(WARNING "BLAS library was not found. Environment variable OPENBLAS_PATH not defined.")
message(FATAL_ERROR "BLAS library was not found. Environment variable OPENBLAS_PATH not defined.")
endif ()
else ()
set(BLA_STATIC 1)
set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
- # set(BLA_PREFER_PKGCONFIG 1)
set(BLA_SIZEOF_INTEGER 8)
+ set(BLA_PREFER_PKGCONFIG 1)
find_package(BLAS)

if(BLAS_FOUND)
@@ -198,7 +198,7 @@ if (WHISPER_BLAS)
include_directories(${BLAS_INCLUDE_DIRS})
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
else()
message(WARNING "BLAS library was not found")
message(FATAL_ERROR "BLAS library was not found")
endif()
endif ()
endif ()
@@ -224,7 +224,7 @@ if (WHISPER_CUBLAS)
endif()

else()
message(WARNING "cuBLAS not found")
message(FATAL_ERROR "cuBLAS not found")
endif()
endif()

@@ -255,7 +255,7 @@ if (WHISPER_HIPBLAS)
endif()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ggml-rocm)
else()
message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
message(FATAL_ERROR "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()
endif()

@@ -270,7 +270,7 @@ if (WHISPER_CLBLAST)

set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} clblast)
else()
message(WARNING "CLBlast not found")
message(FATAL_ERROR "CLBlast not found")
endif()
endif()

@@ -464,6 +464,10 @@ add_library(${TARGET}
ggml.c
ggml-alloc.h
ggml-alloc.c
+ ggml-backend.h
+ ggml-backend.c
+ ggml-quants.h
+ ggml-quants.c
${GGML_SOURCES_METAL}
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
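A side effect of switching these messages from WARNING to FATAL_ERROR is that configuration now aborts when a requested backend is missing, instead of silently building without it. A configuration enabling the cuBLAS path might look like this (the flag is the one tested above; the build directory name is arbitrary):

    cmake -B build -DWHISPER_CUBLAS=ON
    cmake --build build --config Release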
8 changes: 7 additions & 1 deletion Makefile
@@ -301,7 +301,13 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@

- WHISPER_OBJ += ggml-alloc.o
+ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
+ 	$(CC) $(CFLAGS) -c $< -o $@
+
+ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+ 	$(CC) $(CFLAGS) -c $< -o $@
+
+ WHISPER_OBJ += ggml-alloc.o ggml-backend.o ggml-quants.o

whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
$(CXX) $(CXXFLAGS) -c $< -o $@
2 changes: 1 addition & 1 deletion README.md
@@ -50,7 +50,7 @@ You can also easily make your own offline voice assistant application: [command]

https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4

- On Apply Silicon, the inference runs fully on the GPU via Metal:
+ On Apple Silicon, the inference runs fully on the GPU via Metal:

https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225

6 changes: 6 additions & 0 deletions bindings/go/params.go
@@ -118,6 +118,11 @@ func (p *Params) SetMaxTokensPerSegment(n int) {
p.max_tokens = C.int(n)
}

+ // Set audio encoder context
+ func (p *Params) SetAudioCtx(n int) {
+ 	p.audio_ctx = C.int(n)
+ }
+
///////////////////////////////////////////////////////////////////////////////
// PRIVATE METHODS

@@ -141,6 +146,7 @@ func (p *Params) String() string {
str += fmt.Sprintf(" n_max_text_ctx=%d", p.n_max_text_ctx)
str += fmt.Sprintf(" offset_ms=%d", p.offset_ms)
str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
if p.translate {
str += " translate"
}
7 changes: 6 additions & 1 deletion bindings/go/pkg/whisper/context.go
@@ -82,7 +82,7 @@ func (context *context) SetSpeedup(v bool) {
}

func (context *context) SetSplitOnWord(v bool) {
- context.params.SetSplitOnWord(v)
+ context.params.SetSplitOnWord(v)
}

// Set number of threads to use
@@ -125,6 +125,11 @@ func (context *context) SetMaxTokensPerSegment(n uint) {
context.params.SetMaxTokensPerSegment(int(n))
}

+ // Set audio encoder context
+ func (context *context) SetAudioCtx(n uint) {
+ 	context.params.SetAudioCtx(int(n))
+ }
+
// ResetTimings resets the mode timings. Should be called before processing
func (context *context) ResetTimings() {
context.model.ctx.Whisper_reset_timings()
1 change: 1 addition & 0 deletions bindings/go/pkg/whisper/interface.go
@@ -48,6 +48,7 @@ type Context interface {
SetMaxSegmentLength(uint) // Set max segment length in characters
SetTokenTimestamps(bool) // Set token timestamps flag
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
+ SetAudioCtx(uint) // Set audio encoder context

// Process mono audio data and return any errors.
// If defined, newly generated segments are passed to the
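For illustration, the new setter can be driven end to end through the Go bindings. A minimal sketch, assuming a model file at models/ggml-base.en.bin and with error handling elided for brevity:

    package main

    import (
    	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
    )

    func main() {
    	model, _ := whisper.New("models/ggml-base.en.bin") // assumed model path
    	defer model.Close()

    	ctx, _ := model.NewContext()
    	ctx.SetAudioCtx(512) // smaller encoder context trades accuracy for speed
    }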
6 changes: 6 additions & 0 deletions bindings/ruby/ext/extconf.rb
@@ -3,8 +3,14 @@
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")


87 changes: 87 additions & 0 deletions bindings/ruby/ext/ggml-backend-impl.h
@@ -0,0 +1,87 @@
#pragma once

// ggml-backend internal header

#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

//
// Backend buffer
//

typedef void * ggml_backend_buffer_context_t;

struct ggml_backend_buffer_i {
void (*free_buffer) (ggml_backend_buffer_t buffer);
void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
};

struct ggml_backend_buffer {
struct ggml_backend_buffer_i iface;

ggml_backend_t backend;
ggml_backend_buffer_context_t context;

size_t size;
};

GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
struct ggml_backend * backend,
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size);

//
// Backend
//

typedef void * ggml_backend_context_t;

struct ggml_backend_i {
const char * (*get_name)(ggml_backend_t backend);

void (*free)(ggml_backend_t backend);

// buffer allocation
ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);

// get buffer alignment
size_t (*get_alignment)(ggml_backend_t backend);

// tensor data access
// these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*synchronize) (ggml_backend_t backend);

// (optional) copy tensor between different backends, allow for single-copy transfers
void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

// compute graph with a plan
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

// compute graph without a plan
void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

// check if the backend supports an operation
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
};

struct ggml_backend {
struct ggml_backend_i iface;

ggml_backend_context_t context;
};

#ifdef __cplusplus
}
#endif
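
The comment on set_tensor_async/get_tensor_async above says synchronous helpers are layered on top of the async entry points. A minimal sketch of such a wrapper, using only names defined in this header (the real helpers live in ggml-backend.c and may differ in detail):

    // Synchronous upload: issue the async copy, then block until the
    // backend reports completion via its synchronize callback.
    static void backend_tensor_set_sync(ggml_backend_t backend,
                                        struct ggml_tensor * tensor,
                                        const void * data,
                                        size_t offset, size_t size) {
        backend->iface.set_tensor_async(backend, tensor, data, offset, size);
        backend->iface.synchronize(backend);
    }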
(Diffs for the remaining 39 changed files are not shown.)