Merge branch 'master' into gg/ggml-common-decl
ggerganov committed Mar 11, 2024
2 parents f33ab14 + 332bdfd commit 6075b27
Showing 15 changed files with 1,646 additions and 454 deletions.
16 changes: 14 additions & 2 deletions CMakeLists.txt
@@ -116,6 +116,7 @@ option(LLAMA_MPI "llama: use MPI"
option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
option(LLAMA_SYCL "llama: use SYCL" OFF)
option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
@@ -207,7 +208,7 @@ if (LLAMA_METAL)
enable_language(ASM)
add_compile_definitions(GGML_METAL_EMBED_LIBRARY)

set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")

@@ -534,6 +535,10 @@ if (LLAMA_HIPBLAS)
endif()

if (LLAMA_SYCL)
if (NOT LLAMA_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
endif()

if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
endif()
@@ -555,14 +560,21 @@ if (LLAMA_SYCL)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
endif()

set(GGML_HEADERS_SYCL ggml-sycl.h)
set(GGML_SOURCES_SYCL ggml-sycl.cpp)

if (WIN32)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl pthread m dl onemkl)
endif()
endif()
endif()

26 changes: 26 additions & 0 deletions README-sycl.md
@@ -73,6 +73,29 @@ For iGPU, please make sure the shared memory from host memory is enough. For lla

For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.

## Nvidia GPU

### Verified

|Nvidia GPU| Status | Verified Model|
|-|-|-|
|Ampere Series| Support| A100|

### oneMKL

The current oneMKL release does not contain the oneMKL cuBLAS backend.
As a result, for Nvidia GPUs, oneMKL must be built from source.

```
git clone https://github.com/oneapi-src/oneMKL
cd oneMKL
mkdir build
cd build
cmake -G Ninja .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON
ninja
# Add paths as necessary
```

## Docker

Note:
@@ -186,6 +209,9 @@ source /opt/intel/oneapi/setvars.sh
# Or, for FP32:
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

# For Nvidia GPUs
cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

# Build example/main only
#cmake --build . --config Release --target main

7 changes: 2 additions & 5 deletions README.md
@@ -8,11 +8,6 @@

Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

> [!IMPORTANT]
> **Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962**
>
> Vote for which quantization type provides better responses, all other parameters being the same.
### Recent API changes

- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
@@ -21,6 +16,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

### Hot topics

- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328

----
53 changes: 51 additions & 2 deletions examples/llama.android/app/src/main/cpp/llama-android.cpp
@@ -33,6 +33,45 @@ jclass la_int_var;
jmethodID la_int_var_value;
jmethodID la_int_var_inc;

std::string cached_token_chars;

bool is_valid_utf8(const char * string) {
if (!string) {
return true;
}

const unsigned char * bytes = (const unsigned char *)string;
int num;

while (*bytes != 0x00) {
if ((*bytes & 0x80) == 0x00) {
// U+0000 to U+007F
num = 1;
} else if ((*bytes & 0xE0) == 0xC0) {
// U+0080 to U+07FF
num = 2;
} else if ((*bytes & 0xF0) == 0xE0) {
// U+0800 to U+FFFF
num = 3;
} else if ((*bytes & 0xF8) == 0xF0) {
// U+10000 to U+10FFFF
num = 4;
} else {
return false;
}

bytes += 1;
for (int i = 1; i < num; ++i) {
if ((*bytes & 0xC0) != 0x80) {
return false;
}
bytes += 1;
}
}

return true;
}

static void log_callback(ggml_log_level level, const char * fmt, void * data) {
if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);
@@ -295,6 +334,8 @@ Java_com_example_llama_Llm_completion_1init(
jint n_len
) {

cached_token_chars.clear();

const auto text = env->GetStringUTFChars(jtext, 0);
const auto context = reinterpret_cast<llama_context *>(context_pointer);
const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);
@@ -372,8 +413,16 @@ Java_com_example_llama_Llm_completion_1loop(
}

auto new_token_chars = llama_token_to_piece(context, new_token_id);
LOGi("new_token_chars: `%s`", new_token_chars.c_str());
auto new_token = env->NewStringUTF(new_token_chars.c_str());
cached_token_chars += new_token_chars;

jstring new_token = nullptr;
if (is_valid_utf8(cached_token_chars.c_str())) {
new_token = env->NewStringUTF(cached_token_chars.c_str());
LOGi("cached: %s, new_token_chars: `%s`, id: %d", cached_token_chars.c_str(), new_token_chars.c_str(), new_token_id);
cached_token_chars.clear();
} else {
new_token = env->NewStringUTF("");
}

llama_batch_clear(*batch);
llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
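
The JNI bridge previously handed every detokenized piece straight to `env->NewStringUTF()`, which misbehaves (and aborts under CheckJNI) on bytes that are not valid UTF-8, and a single token can end in the middle of a multi-byte character. The patch therefore accumulates pieces in `cached_token_chars` and only passes the buffer to Java once it validates, returning an empty string in the meantime. A minimal standalone sketch of that buffering loop, using a validator that mirrors the one added above and a hypothetical token stream that splits a 4-byte emoji across two pieces:

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Same byte-pattern checks as the is_valid_utf8() added in llama-android.cpp,
// compressed for brevity. A string cut off mid-sequence fails the check.
static bool is_valid_utf8(const char * string) {
    if (!string) return true;
    const unsigned char * bytes = (const unsigned char *) string;
    while (*bytes != 0x00) {
        int num;
        if      ((*bytes & 0x80) == 0x00) num = 1;  // U+0000..U+007F
        else if ((*bytes & 0xE0) == 0xC0) num = 2;  // U+0080..U+07FF
        else if ((*bytes & 0xF0) == 0xE0) num = 3;  // U+0800..U+FFFF
        else if ((*bytes & 0xF8) == 0xF0) num = 4;  // U+10000..U+10FFFF
        else return false;
        bytes += 1;
        for (int i = 1; i < num; ++i) {
            if ((*bytes & 0xC0) != 0x80) return false;  // missing continuation byte
            bytes += 1;
        }
    }
    return true;
}

int main() {
    // Hypothetical detokenized pieces: the first two are the halves of "🦙".
    const std::vector<std::string> pieces = { "\xF0\x9F", "\xA6\x99", " llama" };

    std::string cached_token_chars;
    for (const auto & piece : pieces) {
        cached_token_chars += piece;
        if (is_valid_utf8(cached_token_chars.c_str())) {
            // Only now would the JNI code call NewStringUTF() on the buffer.
            printf("emit: `%s`\n", cached_token_chars.c_str());
            cached_token_chars.clear();
        } else {
            printf("hold: incomplete UTF-8, caching %zu byte(s)\n", cached_token_chars.size());
        }
    }
    return 0;
}
```

Both halves of the fix matter: the C++ side never emits broken UTF-8, and the Kotlin caller (next file) now stops only on `null`, so the empty strings returned while bytes are being cached no longer terminate the completion loop.
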
examples/llama.android/app/src/main/java/com/example/llama/Llm.kt
@@ -71,7 +71,7 @@ class Llm {
batch: Long,
nLen: Int,
ncur: IntVar
): String
): String?

private external fun kv_cache_clear(context: Long)

@@ -115,7 +115,7 @@ class Llm {
val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
while (ncur.value <= nlen) {
val str = completion_loop(state.context, state.batch, nlen, ncur)
if (str.isEmpty()) {
if (str == null) {
break
}
emit(str)
7 changes: 4 additions & 3 deletions examples/server/server.cpp
@@ -3195,11 +3195,12 @@ int main(int argc, char ** argv) {
ctx_server.queue_results.add_waiting_task_id(id_task);
ctx_server.request_completion(id_task, -1, data, false, false);

const auto completion_id = gen_chatcmplid();
if (!json_value(data, "stream", false)) {
server_task_result result = ctx_server.queue_results.recv(id_task);

if (!result.error && result.stop) {
json result_oai = format_final_response_oaicompat(data, result.data);
json result_oai = format_final_response_oaicompat(data, result.data, completion_id);

res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
} else {
@@ -3208,11 +3209,11 @@
}
ctx_server.queue_results.remove_waiting_task_id(id_task);
} else {
const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) {
const auto chunked_content_provider = [id_task, &ctx_server, completion_id](size_t, httplib::DataSink & sink) {
while (true) {
server_task_result result = ctx_server.queue_results.recv(id_task);
if (!result.error) {
std::vector<json> result_array = format_partial_response_oaicompat(result.data);
std::vector<json> result_array = format_partial_response_oaicompat(result.data, completion_id);

for (auto it = result_array.begin(); it != result_array.end(); ++it) {
if (!it->empty()) {
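
Previously each streamed chunk and the final response generated its own id via `gen_chatcmplid()`, so a single OpenAI-compatible request could return mismatched `id` fields. The handler now creates the id once, before branching on `stream`, and the streaming lambda captures it by value. A small self-contained sketch of that pattern, with a hypothetical `gen_id()` and a plain lambda standing in for `gen_chatcmplid()` and httplib's chunked content provider:

```cpp
#include <cstdio>
#include <random>
#include <string>

// Hypothetical stand-in for the server's gen_chatcmplid().
static std::string gen_id() {
    static std::mt19937 rng{std::random_device{}()};
    return "chatcmpl-" + std::to_string(rng());
}

int main() {
    // Generate the id once per request, before deciding between
    // the streaming and non-streaming paths.
    const std::string completion_id = gen_id();

    // Capture by value: the provider can outlive the handler's stack frame,
    // and every chunk it emits must carry the same id.
    const auto chunked_content_provider = [completion_id](int chunk_index) {
        printf("chunk %d -> id: %s\n", chunk_index, completion_id.c_str());
    };

    for (int i = 0; i < 3; ++i) {
        chunked_content_provider(i);  // all three chunks share one id
    }
    return 0;
}
```

Capturing the string by value rather than by reference is deliberate: the real content provider runs asynchronously while the response streams, after the handler's locals may have been destroyed.
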
12 changes: 6 additions & 6 deletions examples/server/utils.hpp
@@ -378,7 +378,7 @@ static json oaicompat_completion_params_parse(
return llama_params;
}

static json format_final_response_oaicompat(const json & request, json result, bool streaming = false) {
static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
bool stopped_word = result.count("stopped_word") != 0;
bool stopped_eos = json_value(result, "stopped_eos", false);
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
@@ -412,7 +412,7 @@ static json format_final_response_oaicompat(const json & request, json result, b
{"prompt_tokens", num_prompt_tokens},
{"total_tokens", num_tokens_predicted + num_prompt_tokens}
}},
{"id", gen_chatcmplid()}
{"id", completion_id}
};

if (server_verbose) {
@@ -427,7 +427,7 @@ static json format_final_response_oaicompat(const json & request, json result, b
}

// return value is vector as there is one case where we might need to generate two responses
static std::vector<json> format_partial_response_oaicompat(json result) {
static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
return std::vector<json>({result});
}
@@ -471,7 +471,7 @@ static std::vector<json> format_partial_response_oaicompat(json result) {
{"role", "assistant"}
}}}})},
{"created", t},
{"id", gen_chatcmplid()},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}};

@@ -482,7 +482,7 @@ static std::vector<json> format_partial_response_oaicompat(json result) {
{"content", content}}}
}})},
{"created", t},
{"id", gen_chatcmplid()},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}};

@@ -509,7 +509,7 @@ static std::vector<json> format_partial_response_oaicompat(json result) {
json ret = json {
{"choices", choices},
{"created", t},
{"id", gen_chatcmplid()},
{"id", completion_id},
{"model", modelname},
{"object", "chat.completion.chunk"}
};
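
With the signature change, both formatters receive the caller's `completion_id` instead of minting their own. The sketch below, assuming nlohmann::json (the `json` type used in the server code) and heavily simplified stand-in formatters, shows a final response and two partial chunks all carrying the same id, which is how OpenAI-style clients correlate streamed chunks:

```cpp
#include <iostream>
#include <string>
#include <nlohmann/json.hpp>

using json = nlohmann::json;

// Simplified stand-ins for the real formatters: only the fields relevant
// to the change (the shared "id") are shown.
static json make_chunk(const std::string & content, const std::string & completion_id) {
    return json{
        {"id", completion_id},
        {"object", "chat.completion.chunk"},
        {"choices", json::array({ json{{"index", 0}, {"delta", json{{"content", content}}}} })}
    };
}

static json make_final(const std::string & content, const std::string & completion_id) {
    return json{
        {"id", completion_id},
        {"object", "chat.completion"},
        {"choices", json::array({ json{{"index", 0}, {"message", json{{"role", "assistant"}, {"content", content}}}} })}
    };
}

int main() {
    const std::string completion_id = "chatcmpl-example";  // generated once in the handler
    std::cout << make_chunk("Hel", completion_id).dump() << "\n";
    std::cout << make_chunk("lo",  completion_id).dump() << "\n";
    std::cout << make_final("Hello", completion_id).dump() << "\n";  // same id everywhere
    return 0;
}
```
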
(diffs for the remaining 8 changed files not loaded)
