Commit
Merge remote-tracking branch 'ggerganov/master'
* ggerganov/master: (60 commits)
  sync : ggml (#0)
  ggml : fix IQ3_XXS on Metal (llama/5219)
  sync : ggml (llama/0)
  Faster AVX2 dot product for IQ2_XS (llama/5187)
  SOTA 3-bit quants (llama/5196)
  ggml alloc: Fix for null dereference on alloc failure (llama/5200)
  Nomic Vulkan backend (llama/4456)
  ggml : add max buffer sizes to opencl and metal backends (llama/5181)
  metal : free metal objects (llama/5161)
  gguf : fix comparison (ggml/715)
  `ggml_cuda_cpy` support for 4d tensors and float16->float32 upcasting (ggml/686)
  gguf : add input validation, prevent integer overflows (ggml/709)
  ci : fix yolo URLs + fix metal capture (ggml/712)
  metal : add debug capture backend function (ggml/694)
  common : fix wav buffer detection (ggerganov#1819)
  server : add fields to `verbose_json` response (ggerganov#1802)
  make : update MSYS_NT (ggerganov#1813)
  talk-llama : sync llama.cpp
  sync : ggml
  ggml : add Vulkan backend (llama/2059)
  ...
bygreencn committed Feb 3, 2024
2 parents eda6990 + 7a74e92 commit 59bbd71
Showing 33 changed files with 6,811 additions and 2,939 deletions.
CMakeLists.txt (4 changes: 3 additions & 1 deletion)
@@ -320,7 +320,8 @@ if (WHISPER_ALL_WARNINGS)
endif()

if (NOT MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
    # TODO: temporarily disabled until we figure out ggml-metal.m
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
endif()
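
For context, `-Werror=vla` promotes GCC/Clang's variable-length-array warning to a hard error; the TODO suggests ggml-metal.m currently trips it. A minimal sketch of the construct the flag rejects (illustrative only, not code from ggml-metal.m):

```cpp
// Build with: g++ -Werror=vla example.cpp
int sum_first(int n) {
    int buf[n];  // VLA: size known only at run time. ISO C++ has no VLAs;
                 // GCC/Clang accept them as an extension, and -Werror=vla
                 // turns this line into a compile error instead of a warning.
    int s = 0;
    for (int i = 0; i < n; ++i) {
        buf[i] = i;
        s += buf[i];
    }
    return s;
}
```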

@@ -509,6 +510,7 @@ else()
endif()

if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${TARGET} PUBLIC
${CMAKE_DL_LIBS}
)
Makefile (2 changes: 1 addition & 1 deletion)
@@ -117,7 +117,7 @@ ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686 amd64))
CPUINFO_CMD := sysctl machdep.cpu.features machdep.cpu.leaf7_features
else ifeq ($(UNAME_S),Linux)
CPUINFO_CMD := cat /proc/cpuinfo
else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
else ifneq (,$(filter MINGW32_NT% MINGW64_NT% MSYS_NT%,$(UNAME_S)))
CPUINFO_CMD := cat /proc/cpuinfo
else ifneq (,$(filter DragonFly FreeBSD,$(UNAME_S)))
CPUINFO_CMD := grep Features /var/run/dmesg.boot
README.md (140 changes: 73 additions & 67 deletions)

Large diffs are not rendered by default.

bindings/javascript/README.md (4 changes: 2 additions & 2 deletions)
@@ -41,7 +41,7 @@ make publish-npm

## Sample run

```java
```text
$ node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js
whisper_model_load: loading model from 'whisper.bin'
@@ -63,7 +63,7 @@ whisper_model_load: ggml ctx size = 140.60 MB
whisper_model_load: memory size = 22.83 MB
whisper_model_load: model size = 140.54 MB
system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 |
operator(): processing 176000 samples, 11.0 sec, 8 threads, 1 processors, lang = en, task = transcribe ...
examples/common-ggml.cpp (2 changes: 2 additions & 0 deletions)
@@ -64,6 +64,7 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
case GGML_FTYPE_MOSTLY_IQ2_XXS:
case GGML_FTYPE_MOSTLY_IQ2_XS:
case GGML_FTYPE_MOSTLY_IQ3_XXS:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
@@ -195,6 +196,7 @@ bool ggml_common_quantize_0(
case GGML_TYPE_Q8_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
examples/common.cpp (21 changes: 21 additions & 0 deletions)
@@ -615,6 +615,21 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(

}

bool is_wav_buffer(const std::string buf) {
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
if (buf.size() < 12 || buf.substr(0, 4) != "RIFF" || buf.substr(8, 4) != "WAVE") {
return false;
}

uint32_t chunk_size = *reinterpret_cast<const uint32_t*>(buf.data() + 4);
if (chunk_size + 8 != buf.size()) {
return false;
}

return true;
}

bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
drwav wav;
std::vector<uint8_t> wav_data; // used for pipe input from stdin
@@ -639,6 +654,12 @@ bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector

fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
}
else if (is_wav_buffer(fname)) {
if (drwav_init_memory(&wav, fname.c_str(), fname.size(), nullptr) == false) {
fprintf(stderr, "error: failed to open WAV file from fname buffer\n");
return false;
}
}
else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) {
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str());
return false;
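
Together, these two hunks let read_wav() accept raw WAV bytes in its fname argument, with is_wav_buffer() deciding which path to take. A minimal caller sketch (hypothetical input.wav path; assumes examples/common.h is on the include path):

```cpp
#include "common.h"   // is_wav_buffer(), read_wav(), COMMON_SAMPLE_RATE

#include <fstream>
#include <sstream>
#include <string>
#include <vector>

int main() {
    // Load a file into memory; "input.wav" is a placeholder path.
    std::ifstream f("input.wav", std::ios::binary);
    std::stringstream ss;
    ss << f.rdbuf();
    const std::string data = ss.str();

    std::vector<float> pcmf32;                 // mono F32 PCM
    std::vector<std::vector<float>> pcmf32s;   // stereo F32 PCM (if requested)

    // read_wav() treats its first argument as an in-memory WAV buffer when
    // is_wav_buffer() recognizes the RIFF/WAVE header, else as a file path.
    if (is_wav_buffer(data)) {
        read_wav(data, pcmf32, pcmf32s, /*stereo=*/false);
    } else {
        read_wav("input.wav", pcmf32, pcmf32s, /*stereo=*/false);
    }
}
```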
examples/common.h (4 changes: 4 additions & 0 deletions)
@@ -135,7 +135,11 @@ gpt_vocab::id gpt_sample_top_k_top_p_repeat(
// Audio utils
//

// Check if a buffer is a WAV audio file
bool is_wav_buffer(const std::string buf);

// Read WAV audio file and store the PCM data into pcmf32
// fname can be a buffer of WAV data instead of a filename
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
bool read_wav(
examples/server/server.cpp (184 changes: 153 additions & 31 deletions)
@@ -18,7 +18,7 @@
#endif

using namespace httplib;
using json = nlohmann::json;
using json = nlohmann::ordered_json;

namespace {

@@ -543,7 +543,76 @@ int main(int argc, char ** argv) {
{"Access-Control-Allow-Origin", "*"},
{"Access-Control-Allow-Headers", "content-type"}});

std::string const default_content = "<html>hello</html>";
std::string const default_content = R"(
<html>
<head>
<title>Whisper.cpp Server</title>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width">
<style>
body {
font-family: sans-serif;
}
form {
display: flex;
flex-direction: column;
align-items: flex-start;
}
label {
margin-bottom: 0.5rem;
}
input, select {
margin-bottom: 1rem;
}
button {
margin-top: 1rem;
}
</style>
</head>
<body>
<h1>Whisper.cpp Server</h1>
<h2>/inference</h2>
<pre>
curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/inference \
-H "Content-Type: multipart/form-data" \
-F file="@&lt;file-path&gt;" \
-F temperature="0.0" \
-F temperature_inc="0.2" \
-F response_format="json"
</pre>
<h2>/load</h2>
<pre>
curl 127.0.0.1:)" + std::to_string(sparams.port) + R"(/load \
-H "Content-Type: multipart/form-data" \
-F model="&lt;path-to-model-file&gt;"
</pre>
<div>
<h2>Try it out</h2>
<form action="/inference" method="POST" enctype="multipart/form-data">
<label for="file">Choose an audio file:</label>
<input type="file" id="file" name="file" accept="audio/*" required><br>
<label for="temperature">Temperature:</label>
<input type="number" id="temperature" name="temperature" value="0.0" step="0.01" placeholder="e.g., 0.0"><br>
<label for="response_format">Response Format:</label>
<select id="response_format" name="response_format">
<option value="verbose_json">Verbose JSON</option>
<option value="json">JSON</option>
<option value="text">Text</option>
<option value="srt">SRT</option>
<option value="vtt">VTT</option>
</select><br>
<button type="submit">Submit</button>
</form>
</div>
</body>
</html>
)";

// store default params so we can reset after each inference request
whisper_params default_params = params;
@@ -556,15 +625,14 @@

svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
// acquire whisper model mutex lock
whisper_mutex.lock();
std::lock_guard<std::mutex> lock(whisper_mutex);

// first check user requested fields of the request
if (!req.has_file("file"))
{
fprintf(stderr, "error: no 'file' field in the request\n");
const std::string error_resp = "{\"error\":\"no 'file' field in the request\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
auto audio_file = req.get_file_value("file");
@@ -579,35 +647,42 @@ int main(int argc, char ** argv) {
std::vector<float> pcmf32; // mono-channel F32 PCM
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM

// write to temporary file
const std::string temp_filename = "whisper_server_temp_file.wav";
std::ofstream temp_file{temp_filename, std::ios::binary};
temp_file << audio_file.content;
temp_file.close();

// if file is not wav, convert to wav

if (sparams.ffmpeg_converter) {
// if file is not wav, convert to wav
// write to temporary file
const std::string temp_filename = "whisper_server_temp_file.wav";
std::ofstream temp_file{temp_filename, std::ios::binary};
temp_file << audio_file.content;
temp_file.close();

std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
const bool is_converted = convert_to_wav(temp_filename, error_resp);
if (!is_converted) {
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
}

// read wav content into pcmf32
if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
res.set_content(error_resp, "application/json");
// read wav content into pcmf32
if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize))
{
fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
res.set_content(error_resp, "application/json");
std::remove(temp_filename.c_str());
return;
}
// remove temp file
std::remove(temp_filename.c_str());
whisper_mutex.unlock();
return;
} else {
if (!::read_wav(audio_file.content, pcmf32, pcmf32s, params.diarize))
{
fprintf(stderr, "error: failed to read WAV file\n");
const std::string error_resp = "{\"error\":\"failed to read WAV file\"}";
res.set_content(error_resp, "application/json");
return;
}
}
// remove temp file
std::remove(temp_filename.c_str());


printf("Successfully loaded %s\n", filename.c_str());

@@ -681,6 +756,7 @@ int main(int argc, char ** argv) {
wparams.logprob_thold = params.logprob_thold;

wparams.no_timestamps = params.no_timestamps;
wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;

whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

@@ -724,7 +800,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
const std::string error_resp = "{\"error\":\"failed to process audio\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
}
@@ -778,6 +853,59 @@ int main(int argc, char ** argv) {
ss << speaker << text << "\n\n";
}
res.set_content(ss.str(), "text/vtt");
} else if (params.response_format == vjson_format) {
/* try to match openai/whisper's Python format */
std::string results = output_str(ctx, params, pcmf32s);
json jres = json{
{"task", params.translate ? "translate" : "transcribe"},
{"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
{"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
{"text", results},
{"segments", json::array()}
};
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i)
{
json segment = json{
{"id", i},
{"text", whisper_full_get_segment_text(ctx, i)},
};

if (!params.no_timestamps) {
segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01;
segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01;
}

float total_logprob = 0;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
whisper_token_data token = whisper_full_get_token_data(ctx, i, j);
if (token.id >= whisper_token_eot(ctx)) {
continue;
}

segment["tokens"].push_back(token.id);
json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}};
if (!params.no_timestamps) {
word["start"] = token.t0 * 0.01;
word["end"] = token.t1 * 0.01;
}
word["probability"] = token.p;
total_logprob += token.plog;
segment["words"].push_back(word);
}

segment["temperature"] = params.temperature;
segment["avg_logprob"] = total_logprob / n_tokens;

// TODO compression_ratio and no_speech_prob are not implemented yet
// segment["compression_ratio"] = 0;
// segment["no_speech_prob"] = 0;

jres["segments"].push_back(segment);
}
res.set_content(jres.dump(-1, ' ', false, json::error_handler_t::replace),
"application/json");
}
// TODO add more output formats
else
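
The verbose_json branch above mirrors openai/whisper's Python output. whisper.cpp reports t0/t1 in 10 ms ticks, which the * 0.01 factors convert to seconds. A sketch of how a client might walk such a response with nlohmann::json (hypothetical print_words helper, not part of the server):

```cpp
#include <nlohmann/json.hpp>

#include <iostream>
#include <string>

using json = nlohmann::ordered_json;

// Walk a verbose_json response body; `body` would come from an HTTP
// client POSTing to /inference with response_format=verbose_json.
void print_words(const std::string & body) {
    const json j = json::parse(body);
    std::cout << "lang=" << j["language"] << " dur=" << j["duration"] << "s\n";
    for (const auto & seg : j["segments"]) {
        // "words" may be absent if a segment produced no text tokens
        for (const auto & word : seg.value("words", json::array())) {
            // start/end are seconds; probability is the token probability p
            std::cout << word["word"]
                      << " [" << word.value("start", 0.0)
                      << " - " << word.value("end", 0.0)
                      << "] p=" << word["probability"] << "\n";
        }
    }
}
```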
Expand All @@ -792,18 +920,14 @@ int main(int argc, char ** argv) {

// reset params to their defaults
params = default_params;

// return whisper model mutex lock
whisper_mutex.unlock();
});
svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
whisper_mutex.lock();
std::lock_guard<std::mutex> lock(whisper_mutex);
if (!req.has_file("model"))
{
fprintf(stderr, "error: no 'model' field in the request\n");
const std::string error_resp = "{\"error\":\"no 'model' field in the request\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}
std::string model = req.get_file_value("model").content;
@@ -812,7 +936,6 @@ int main(int argc, char ** argv) {
fprintf(stderr, "error: 'model': %s not found!\n", model.c_str());
const std::string error_resp = "{\"error\":\"model not found!\"}";
res.set_content(error_resp, "application/json");
whisper_mutex.unlock();
return;
}

Expand All @@ -835,7 +958,6 @@ int main(int argc, char ** argv) {
res.set_content(success, "application/text");

// check if the model is in the file system
whisper_mutex.unlock();
});

svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
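The whisper_mutex.unlock() deletions scattered through this file all fall out of one change: the handlers now acquire the model lock with std::lock_guard, whose destructor releases the mutex on every path out of the lambda. A minimal sketch of the pattern (hypothetical handler shape):

```cpp
#include <mutex>

std::mutex whisper_mutex;

void handle_request(bool bad_request) {
    // Locks here; the destructor unlocks when `lock` leaves scope.
    std::lock_guard<std::mutex> lock(whisper_mutex);

    if (bad_request) {
        return;  // previously required an explicit whisper_mutex.unlock()
    }

    // ... run inference while the model is held exclusively ...
}   // mutex released here, even if an exception propagates
```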
