Merge branch 'master' into gg/ggml-common-decl
ggerganov committed Mar 10, 2024
2 parents 689a4ec + b838b53 · commit 470e865
Showing 29 changed files with 990 additions and 754 deletions.
46 changes: 45 additions & 1 deletion .github/workflows/server.yml
@@ -47,6 +47,8 @@ jobs:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Dependencies
id: depends
@@ -58,7 +60,6 @@
cmake \
python3-pip \
wget \
psmisc \
language-pack-en
- name: Build
@@ -90,3 +91,46 @@ jobs:
run: |
cd examples/server/tests
PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
server-windows:
runs-on: windows-latest

steps:
- name: Clone
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Build
id: cmake_build
run: |
mkdir build
cd build
cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
- name: Python setup
id: setup_python
uses: actions/setup-python@v5
with:
python-version: '3.11'

- name: Tests dependencies
id: test_dependencies
run: |
pip install -r examples/server/tests/requirements.txt
- name: Tests
id: server_integration_tests
run: |
cd examples/server/tests
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
- name: Slow tests
id: server_integration_tests_slow
if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
run: |
cd examples/server/tests
behave.exe --stop --no-skipped --no-capture --tags slow
1 change: 1 addition & 0 deletions .gitignore
@@ -45,6 +45,7 @@ models-mnt
/embedding
/gguf
/gguf-llama-simple
/gritlm
/imatrix
/infill
/libllama.so
6 changes: 5 additions & 1 deletion Makefile
@@ -2,7 +2,7 @@
BUILD_TARGETS = \
main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

# Binaries only useful for tests
TEST_TARGETS = \
@@ -724,6 +724,10 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
11 changes: 6 additions & 5 deletions README.md
@@ -8,6 +8,11 @@

Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

> [!IMPORTANT]
> **Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962**
>
> Vote for which quantization type provides better responses, all other parameters being the same.
### Recent API changes

- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
@@ -16,11 +21,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

### Hot topics

- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761))
- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328

----

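The API-change entry in the README diff above notes that `llama_kv_cache_seq_rm()` now returns a `bool` and that the new `llama_n_max_seq()` reports the upper limit of acceptable `seq_id` values in batches. Below is a minimal caller-side sketch; it is illustrative only (not part of this commit), assumes the `llama.h` declarations described in that entry, and the `clear_whole_sequence` helper name is made up for the example.

```cpp
// Illustrative sketch only: assumes the llama.h API as of this commit;
// the llama_context is created elsewhere by the caller.
#include "llama.h"
#include <cstdio>

void clear_whole_sequence(struct llama_context * ctx, llama_seq_id seq) {
    // As of 2024 Mar 8, llama_kv_cache_seq_rm() returns a bool instead of void,
    // so callers can detect when the requested removal was not performed.
    const bool removed = llama_kv_cache_seq_rm(ctx, seq, /*p0=*/-1, /*p1=*/-1);
    if (!removed) {
        fprintf(stderr, "KV cache removal failed for seq_id %d\n", (int) seq);
    }

    // The new llama_n_max_seq() reports the upper limit of acceptable seq_id
    // values in batches (relevant when juggling multiple sequences).
    fprintf(stderr, "seq_id limit for this context: %d\n", (int) llama_n_max_seq(ctx));
}
```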
16 changes: 16 additions & 0 deletions common/grammar-parser.cpp
@@ -278,6 +278,22 @@ namespace grammar_parser {
while (*pos) {
pos = parse_rule(state, pos);
}
// Validate the state to ensure that all rules are defined
for (const auto & rule : state.rules) {
for (const auto & elem : rule) {
if (elem.type == LLAMA_GRETYPE_RULE_REF) {
// Ensure that the rule at that location exists
if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
// Get the name of the rule that is missing
for (const auto & kv : state.symbol_ids) {
if (kv.second == elem.value) {
throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
}
}
}
}
}
}
return state;
} catch (const std::exception & err) {
fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());
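To see what the new validation in grammar-parser.cpp catches, here is a hedged usage sketch (not part of the commit). It assumes the `grammar-parser.h` header from `common/` is on the include path and that `parse()` still returns an empty `parse_state` after printing the error, as in the surrounding code.

```cpp
// Illustrative only: a grammar whose root references an undefined rule.
#include "grammar-parser.h" // assumed to resolve to common/grammar-parser.h
#include <cstdio>

int main() {
    // "expr" is referenced but never defined, so the validation loop added in
    // this change throws: Undefined rule identifier 'expr'
    const char * src =
        "root ::= \"answer: \" expr\n";

    grammar_parser::parse_state state = grammar_parser::parse(src);

    // parse() catches the exception, prints the error to stderr and (in the
    // current implementation) hands back a state with no rules.
    fprintf(stderr, "parsed %zu rules\n", state.rules.size());
    return 0;
}
```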
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -20,6 +20,7 @@ else()
add_subdirectory(convert-llama2c-to-ggml)
add_subdirectory(embedding)
add_subdirectory(finetune)
add_subdirectory(gritlm)
add_subdirectory(infill)
add_subdirectory(llama-bench)
add_subdirectory(llava)
6 changes: 2 additions & 4 deletions examples/benchmark/benchmark-matmult.cpp
@@ -189,12 +189,10 @@ int main(int argc, char ** argv) {

int32_t nelements = sizex*sizey;

std::vector<int64_t> hist_cur(1 << 4, 0);

// Set up a the benchmark matrices
// printf("Creating new tensor q11 & Running quantize\n");
struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], hist_cur.data(), nullptr);
ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);

// Set up a the compute graph
// printf("Creating new tensor q31\n");
@@ -207,7 +205,7 @@ int main(int argc, char ** argv) {
// Set up a second graph computation to make sure we override the CPU cache lines
// printf("Creating new tensor q12 & Running quantize\n");
struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], hist_cur.data(), nullptr);
ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);

// printf("Creating new tensor q32\n");
struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
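The two call-site changes in benchmark-matmult.cpp reflect that `ggml_quantize_chunk()` no longer takes a per-call histogram buffer. A hedged sketch of the updated call outside the benchmark might look like the following; `quantize_rows_q4_0` is a made-up helper and the signature is assumed from the calls shown above.

```cpp
// Illustrative only: quantize nrows rows of float data to Q4_0 with the
// post-change signature (type, src, dst, start, nrows, n_per_row, imatrix).
#include "ggml.h"
#include <cstdint>
#include <vector>

std::vector<uint8_t> quantize_rows_q4_0(const std::vector<float> & src,
                                        int64_t nrows, int64_t n_per_row) {
    // src is expected to hold nrows * n_per_row floats.
    std::vector<uint8_t> dst(ggml_row_size(GGML_TYPE_Q4_0, n_per_row) * nrows);
    // The histogram buffer is gone; pass nullptr when no importance matrix
    // (imatrix) is used, exactly as the benchmark now does.
    ggml_quantize_chunk(GGML_TYPE_Q4_0, src.data(), dst.data(),
                        /*start=*/0, nrows, n_per_row, /*imatrix=*/nullptr);
    return dst;
}
```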
5 changes: 5 additions & 0 deletions examples/gritlm/CMakeLists.txt
@@ -0,0 +1,5 @@
set(TARGET gritlm)
add_executable(${TARGET} gritlm.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)