Commit aaf9649: Merge branch 'master' into gpu-param
jhen0409 committed Nov 5, 2023
2 parents: 52c3dd6 + f96e1c5

Showing 48 changed files with 30,893 additions and 7,740 deletions.
28 changes: 28 additions & 0 deletions .devops/cublas.Dockerfile
@@ -0,0 +1,28 @@
ARG UBUNTU_VERSION=22.04

# This needs to generally match the container host's environment.
ARG CUDA_VERSION=11.7.1

# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} as build

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all

RUN apt-get update && \
apt-get install -y build-essential git cmake

WORKDIR /app

COPY . .

# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV WHISPER_CUBLAS=1

RUN make

ENTRYPOINT ["/app/main"]
22 changes: 13 additions & 9 deletions CMakeLists.txt
@@ -117,7 +117,7 @@ if (APPLE)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
else()
message(WARNING "Accelerate framework not found")
message(FATAL_ERROR "Accelerate framework not found")
endif()
endif()

@@ -140,7 +140,7 @@ if (APPLE)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_NDEBUG)
endif()
else()
message(WARNING "Metal framework not found")
message(FATAL_ERROR "Metal framework not found")
endif()

set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
@@ -158,7 +158,7 @@ if (APPLE)

set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
else()
message(WARNING "CoreML framework not found")
message(FATAL_ERROR "CoreML framework not found")
endif()

if (WHISPER_COREML_ALLOW_FALLBACK)
@@ -181,13 +181,13 @@ if (WHISPER_BLAS)
include_directories($ENV{OPENBLAS_PATH}/include)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
else ()
message(WARNING "BLAS library was not found. Environment variable OPENBLAS_PATH not defined.")
message(FATAL_ERROR "BLAS library was not found. Environment variable OPENBLAS_PATH not defined.")
endif ()
else ()
set(BLA_STATIC 1)
set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
- # set(BLA_PREFER_PKGCONFIG 1)
set(BLA_SIZEOF_INTEGER 8)
+ set(BLA_PREFER_PKGCONFIG 1)
find_package(BLAS)

if(BLAS_FOUND)
@@ -198,7 +198,7 @@ if (WHISPER_BLAS)
include_directories(${BLAS_INCLUDE_DIRS})
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
else()
message(WARNING "BLAS library was not found")
message(FATAL_ERROR "BLAS library was not found")
endif()
endif ()
endif ()
@@ -224,7 +224,7 @@ if (WHISPER_CUBLAS)
endif()

else()
message(WARNING "cuBLAS not found")
message(FATAL_ERROR "cuBLAS not found")
endif()
endif()

@@ -255,7 +255,7 @@ if (WHISPER_HIPBLAS)
endif()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ggml-rocm)
else()
message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
message(FATAL_ERROR "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()
endif()

@@ -270,7 +270,7 @@ if (WHISPER_CLBLAST)

set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} clblast)
else()
message(WARNING "CLBlast not found")
message(FATAL_ERROR "CLBlast not found")
endif()
endif()

@@ -464,6 +464,10 @@ add_library(${TARGET}
ggml.c
ggml-alloc.h
ggml-alloc.c
+ ggml-backend.h
+ ggml-backend.c
+ ggml-quants.h
+ ggml-quants.c
${GGML_SOURCES_METAL}
${GGML_SOURCES_CUDA}
${GGML_SOURCES_OPENCL}
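A side effect of switching these messages from WARNING to FATAL_ERROR is that configuration now aborts when a requested backend is missing, instead of silently building without it. A configuration enabling the cuBLAS path might look like this (the flag is the one tested above; the build directory name is arbitrary):

    cmake -B build -DWHISPER_CUBLAS=ON
    cmake --build build --config Release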
8 changes: 7 additions & 1 deletion Makefile
@@ -301,7 +301,13 @@ ggml.o: ggml.c ggml.h ggml-cuda.h
ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
$(CC) $(CFLAGS) -c $< -o $@

- WHISPER_OBJ += ggml-alloc.o
+ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
+ 	$(CC) $(CFLAGS) -c $< -o $@
+
+ ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+ 	$(CC) $(CFLAGS) -c $< -o $@
+
+ WHISPER_OBJ += ggml-alloc.o ggml-backend.o ggml-quants.o

whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
$(CXX) $(CXXFLAGS) -c $< -o $@
2 changes: 1 addition & 1 deletion README.md
@@ -50,7 +50,7 @@ You can also easily make your own offline voice assistant application: [command]

https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4

- On Apply Silicon, the inference runs fully on the GPU via Metal:
+ On Apple Silicon, the inference runs fully on the GPU via Metal:

https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225

6 changes: 6 additions & 0 deletions bindings/go/params.go
@@ -118,6 +118,11 @@ func (p *Params) SetMaxTokensPerSegment(n int) {
p.max_tokens = C.int(n)
}

+ // Set audio encoder context
+ func (p *Params) SetAudioCtx(n int) {
+ 	p.audio_ctx = C.int(n)
+ }
+
///////////////////////////////////////////////////////////////////////////////
// PRIVATE METHODS

@@ -141,6 +146,7 @@ func (p *Params) String() string {
str += fmt.Sprintf(" n_max_text_ctx=%d", p.n_max_text_ctx)
str += fmt.Sprintf(" offset_ms=%d", p.offset_ms)
str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
if p.translate {
str += " translate"
}
7 changes: 6 additions & 1 deletion bindings/go/pkg/whisper/context.go
@@ -82,7 +82,7 @@ func (context *context) SetSpeedup(v bool) {
}

func (context *context) SetSplitOnWord(v bool) {
- context.params.SetSplitOnWord(v)
+ context.params.SetSplitOnWord(v)
}

// Set number of threads to use
@@ -125,6 +125,11 @@ func (context *context) SetMaxTokensPerSegment(n uint) {
context.params.SetMaxTokensPerSegment(int(n))
}

+ // Set audio encoder context
+ func (context *context) SetAudioCtx(n uint) {
+ 	context.params.SetAudioCtx(int(n))
+ }
+
// ResetTimings resets the mode timings. Should be called before processing
func (context *context) ResetTimings() {
context.model.ctx.Whisper_reset_timings()
1 change: 1 addition & 0 deletions bindings/go/pkg/whisper/interface.go
@@ -48,6 +48,7 @@ type Context interface {
SetMaxSegmentLength(uint) // Set max segment length in characters
SetTokenTimestamps(bool) // Set token timestamps flag
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
+ SetAudioCtx(uint) // Set audio encoder context

// Process mono audio data and return any errors.
// If defined, newly generated segments are passed to the
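For illustration, the new setter can be driven end to end through the Go bindings. A minimal sketch, assuming a model file at models/ggml-base.en.bin and with error handling elided for brevity:

    package main

    import (
    	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
    )

    func main() {
    	model, _ := whisper.New("models/ggml-base.en.bin") // assumed model path
    	defer model.Close()

    	ctx, _ := model.NewContext()
    	ctx.SetAudioCtx(512) // smaller encoder context trades accuracy for speed
    }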
6 changes: 6 additions & 0 deletions bindings/ruby/ext/extconf.rb
@@ -3,8 +3,14 @@
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")


87 changes: 87 additions & 0 deletions bindings/ruby/ext/ggml-backend-impl.h
@@ -0,0 +1,87 @@
#pragma once

// ggml-backend internal header

#include "ggml-backend.h"

#ifdef __cplusplus
extern "C" {
#endif

//
// Backend buffer
//

typedef void * ggml_backend_buffer_context_t;

struct ggml_backend_buffer_i {
void (*free_buffer) (ggml_backend_buffer_t buffer);
void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
};

struct ggml_backend_buffer {
struct ggml_backend_buffer_i iface;

ggml_backend_t backend;
ggml_backend_buffer_context_t context;

size_t size;
};

GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
struct ggml_backend * backend,
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size);

//
// Backend
//

typedef void * ggml_backend_context_t;

struct ggml_backend_i {
const char * (*get_name)(ggml_backend_t backend);

void (*free)(ggml_backend_t backend);

// buffer allocation
ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);

// get buffer alignment
size_t (*get_alignment)(ggml_backend_t backend);

// tensor data access
// these functions can be asynchronous, helper functions are provided for synchronous access that automatically call synchronize
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*synchronize) (ggml_backend_t backend);

// (optional) copy tensor between different backends, allow for single-copy transfers
void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);

// compute graph with a plan
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

// compute graph without a plan
void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

// check if the backend supports an operation
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
};

struct ggml_backend {
struct ggml_backend_i iface;

ggml_backend_context_t context;
};

#ifdef __cplusplus
}
#endif
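
The comment on set_tensor_async/get_tensor_async above says synchronous helpers are layered on top of the async entry points. A minimal sketch of such a wrapper, using only names defined in this header (the real helpers live in ggml-backend.c and may differ in detail):

    // Synchronous upload: issue the async copy, then block until the
    // backend reports completion via its synchronize callback.
    static void backend_tensor_set_sync(ggml_backend_t backend,
                                        struct ggml_tensor * tensor,
                                        const void * data,
                                        size_t offset, size_t size) {
        backend->iface.set_tensor_async(backend, tensor, data, offset, size);
        backend->iface.synchronize(backend);
    }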
(Diffs for the remaining 39 changed files are not shown.)