llama : add vision support for mllama (wip)
danbev committed Jan 22, 2025
1 parent 28caaed commit 5805490
Showing 27 changed files with 20,726 additions and 15 deletions.
2 changes: 1 addition & 1 deletion common/arg.cpp
@@ -867,7 +867,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params) {
params.warmup = false;
}
-).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
+).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_VISION}));
add_opt(common_arg(
{"--spm-infill"},
string_format(
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -54,6 +54,7 @@ else()
add_subdirectory(tts)
add_subdirectory(gen-docs)
add_subdirectory(vision)
+add_subdirectory(vision-mllama)
if (NOT GGML_BACKEND_DL)
# these examples use the backends directly and cannot be built with dynamic loading
add_subdirectory(convert-llama2c-to-ggml)
5 changes: 5 additions & 0 deletions examples/vision-mllama/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET llama-vision-mllama)
add_executable(${TARGET} vision.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE llama common ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
82 changes: 82 additions & 0 deletions examples/vision-mllama/README.md
@@ -0,0 +1,82 @@
# llama.cpp/example/vision-mllama

This example tries to mimic the vision example, which uses the new Vision API.
The Llama 3.2 Vision model instead uses cross-attention, where the image patch
embeddings are consumed by the cross-attention layers.
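Below is a minimal sketch, written against the ggml API that llama.cpp builds its
compute graphs with, of what such a cross-attention block looks like: the queries
come from the text hidden states, while the keys and values come from the image
patch embeddings produced by the vision encoder. The function and tensor names are
hypothetical, and the single-head, ungated layout is a simplification rather than
the actual mllama implementation:

```c++
// A hypothetical, single-head cross-attention block in ggml (a sketch,
// not the actual mllama implementation). Queries come from the text
// stream; keys and values come from the image patch embeddings. In
// self-attention all three would be projected from `cur`.
#include "ggml.h"
#include <cmath>

static struct ggml_tensor * cross_attn_sketch(
        struct ggml_context * ctx,
        struct ggml_tensor  * cur,      // [n_embd, n_text]  text hidden states
        struct ggml_tensor  * img_embd, // [n_embd, n_patch] image patch embeddings
        struct ggml_tensor  * wq,       // [n_embd, n_embd]  query projection
        struct ggml_tensor  * wk,       // [n_embd, n_embd]  key projection
        struct ggml_tensor  * wv) {     // [n_embd, n_embd]  value projection
    struct ggml_tensor * q = ggml_mul_mat(ctx, wq, cur);      // [n_embd, n_text]
    struct ggml_tensor * k = ggml_mul_mat(ctx, wk, img_embd); // [n_embd, n_patch]
    struct ggml_tensor * v = ggml_mul_mat(ctx, wv, img_embd); // [n_embd, n_patch]

    // scaled dot-product scores: for every text position, a distribution
    // over the image patches
    struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);        // [n_patch, n_text]
    kq = ggml_scale(ctx, kq, 1.0f / sqrtf((float) cur->ne[0]));
    kq = ggml_soft_max(ctx, kq);

    // weighted sum of patch values; V is transposed so its first dimension
    // matches kq's, as ggml_mul_mat requires
    struct ggml_tensor * vt = ggml_cont(ctx, ggml_transpose(ctx, v)); // [n_patch, n_embd]
    return ggml_mul_mat(ctx, vt, kq);                                 // [n_embd, n_text]
}
```

In the real model the attention is split across heads and the cross-attention
output is gated before being added back to the residual stream, but the data flow
shown here, text queries attending over image patch keys and values, is the core
difference from self-attention.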

> **This is a work in progress**
> A number of shortcuts were taken in the code, as the main goal was to
> get something working that can be iterated upon if this is worth pursuing.

### Model conversion
To convert the Llama 3.2 Vision Instruct model to GGUF, we first need to install
the required Python packages:
```console
$ python3 -m venv venv
$ source venv/bin/activate
(venv) pip install -r requirements.txt
```

Convert the Llama 3.2 Vision Instruct model to GGUF format:
```console
(venv) python ./convert_hf_to_gguf.py --verbose /path/to/Llama-3.2-11B-Vision-Instruct --outfile models/llama-3-2-11b-f32.gguf --outtype f32
```

Quantize the model to a lower precision:
```console
(venv) ./build/bin/llama-quantize models/llama-3-2-11b-f32.gguf models/llama-3-2-11b-Q4_K.gguf Q4_K
```

### Building the example
The example can be built with CMake using the following commands.

CUDA:
```console
$ cmake -S . -B build -DGGML_CUDA=On
```
Metal:
```console
$ cmake -S . -B build
```

Then build the example:
```console
$ cmake --build build --target llama-vision-mllama -- -j8
```

### Running the example
An image is required for this example; the following image is included in the
repository, but any JPEG image should work:

![New York skyline](ny.jpg)

```console
$ ./build/bin/llama-vision-mllama -m models/llama-3-2-11b-Q4_K.gguf --image ny.jpg
ggml_cuda_init: GGML_CUDA_FORCE_MMQ: no
ggml_cuda_init: GGML_CUDA_FORCE_CUBLAS: no
ggml_cuda_init: found 1 CUDA devices:
Device 0: NVIDIA GeForce RTX 4070, compute capability 8.9, VMM: yes
register_backend: registered backend CUDA (1 devices)
register_device: registered device CUDA0 (NVIDIA GeForce RTX 4070)
register_backend: registered backend CPU (1 devices)
register_device: registered device CPU (12th Gen Intel(R) Core(TM) i7-1260P)
build: 4511 (d76578a9) with cc (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0 for x86_64-linux-gnu (debug)
llama_model_load_from_file_impl: using device CUDA0 (NVIDIA GeForce RTX 4070) - 11743 MiB free
llama_model_loader: loaded meta data with 58 key-value pairs and 908 tensors from models/llama-3-2-11b-Q4_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv 0: general.architecture str = mllama
llama_model_loader: - kv 1: general.type str = model
llama_model_loader: - kv 2: general.name str = Llama 3.2 11B Vision Instruct
llama_model_loader: - kv 3: general.finetune str = vision-instruct
llama_model_loader: - kv 4: general.basename str = llama-3.2
llama_model_loader: - kv 5: general.size_label str = 11B
llama_model_loader: - kv 6: general.license str = llama3.2
llama_model_loader: - kv 7: general.tags arr[str,6] = ["facebook", "meta", "pytorch", "llam...
llama_model_loader: - kv 8: general.languages arr[str,8] = ["en", "de", "fr", "it", "pt", "hi", ...
llama_model_loader: - kv 9: mllama.vocab_size u32 = 128256
...

The image depicts a cityscape, likely New York City, with the Empire State
Building prominently visible in the center.
```
Binary file added examples/vision-mllama/ny.jpg
