Commit 9b8e05b

Merge commit 'f4444d99' into tokenizer-fixes

jaime-m-p committed Jul 11, 2024
2 parents: c4956e4 + f4444d9
Showing 79 changed files with 3,376 additions and 410 deletions.
16 changes: 16 additions & 0 deletions .devops/nix/package.nix
@@ -89,6 +89,22 @@ let
ps.tiktoken
ps.torchWithoutCuda
ps.transformers

# server bench
ps.matplotlib

# server tests
ps.openai
ps.behave
ps.prometheus-client

# for examples/pydantic-models-to-grammar-examples.py
ps.docstring-parser
ps.pydantic

# for scripts/compare-llama-bench.py
ps.gitpython
ps.tabulate
]
);

4 changes: 3 additions & 1 deletion .github/labeler.yml
@@ -16,7 +16,9 @@ SYCL:
- any-glob-to-any-file:
- ggml/include/ggml-sycl.h
- ggml/src/ggml-sycl.cpp
- README-sycl.md
- ggml/src/ggml-sycl/**
- docs/backend/SYCL.md
- examples/sycl/**
Nvidia GPU:
- changed-files:
- any-glob-to-any-file:
38 changes: 38 additions & 0 deletions .github/workflows/python-type-check.yml
@@ -0,0 +1,38 @@
name: Python Type-Check

on:
  push:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - '**/requirements*.txt'
  pull_request:
    paths:
      - '.github/workflows/python-type-check.yml'
      - '**.py'
      - '**/requirements*.txt'

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  python-type-check:
    runs-on: ubuntu-latest
    name: pyright type-check
    steps:
      - name: Check out source repository
        uses: actions/checkout@v4
      - name: Set up Python environment
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Python dependencies
        # TODO: use a venv
        run: pip install -r requirements/requirements-all.txt
      - name: Type-check with Pyright
        uses: jakebailey/pyright-action@v2
        with:
          version: 1.1.370
          level: warning
          warnings: true
9 changes: 4 additions & 5 deletions CMakeLists.txt
@@ -50,9 +50,6 @@ endif()
# option list
#

# general
option(LLAMA_CCACHE "llama: use ccache if available" ON)

# debug
option(LLAMA_ALL_WARNINGS "llama: enable all compiler warnings" ON)
option(LLAMA_ALL_WARNINGS_3RD_PARTY "llama: enable all compiler warnings in 3rd party libs" OFF)
@@ -77,7 +74,6 @@ option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

# override ggml options
set(GGML_CCACHE ${LLAMA_CCACHE})
set(GGML_SANITIZE_THREAD ${LLAMA_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS ${LLAMA_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${LLAMA_SANITIZE_UNDEFINED})
@@ -115,7 +111,10 @@ llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16)
# build the library
#

add_subdirectory(ggml)
if (NOT TARGET ggml)
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
add_subdirectory(src)

#
108 changes: 96 additions & 12 deletions Makefile
@@ -64,10 +64,14 @@ TEST_TARGETS = \
tests/test-tokenizer-1-spm

# Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
LEGACY_TARGETS = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama \
retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm

# Legacy build targets that were renamed in #7809, but for which we still want to build binaries that output a deprecation warning if people try to use them.
# We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
LEGACY_TARGETS_BUILD = main quantize perplexity embedding server finetune

# Deprecation aliases
ifdef LLAMA_CUBLAS
$(error LLAMA_CUBLAS is removed. Use GGML_CUDA instead.)
@@ -193,7 +197,7 @@ ifdef GGML_RPC
BUILD_TARGETS += rpc-server
endif

default: $(BUILD_TARGETS)
default: $(BUILD_TARGETS) $(LEGACY_TARGETS_BUILD)

test: $(TEST_TARGETS)
@failures=0; \
@@ -228,7 +232,7 @@ test: $(TEST_TARGETS)
fi
@echo 'All tests passed.'

all: $(BUILD_TARGETS) $(TEST_TARGETS)
all: $(BUILD_TARGETS) $(TEST_TARGETS) $(LEGACY_TARGETS_BUILD)

ifdef RISCV_CROSS_COMPILE
CC := riscv64-unknown-linux-gnu-gcc
@@ -245,17 +249,22 @@ MK_CFLAGS = -std=c11 -fPIC
MK_CXXFLAGS = -std=c++11 -fPIC
MK_NVCCFLAGS = -std=c++11

ifndef LLAMA_NO_CCACHE
ifdef LLAMA_NO_CCACHE
GGML_NO_CCACHE := 1
DEPRECATE_WARNING := 1
endif

ifndef GGML_NO_CCACHE
CCACHE := $(shell which ccache)
ifdef CCACHE
export CCACHE_SLOPPINESS = time_macros
$(info I ccache found, compilation results will be cached. Disable with LLAMA_NO_CCACHE.)
$(info I ccache found, compilation results will be cached. Disable with GGML_NO_CCACHE.)
CC := $(CCACHE) $(CC)
CXX := $(CCACHE) $(CXX)
else
$(info I ccache not found. Consider installing it for faster compilation.)
endif # CCACHE
endif # LLAMA_NO_CCACHE
endif # GGML_NO_CCACHE

# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
@@ -545,7 +554,7 @@ endif # GGML_BLIS

ifndef GGML_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJ_GGML += ggml/src/sgemm.o
OBJ_GGML += ggml/src/llamafile/sgemm.o
endif

ifdef GGML_RPC
@@ -826,7 +835,8 @@ OBJ_GGML += \
ggml/src/ggml.o \
ggml/src/ggml-alloc.o \
ggml/src/ggml-backend.o \
ggml/src/ggml-quants.o
ggml/src/ggml-quants.o \
ggml/src/ggml-aarch64.o

OBJ_LLAMA = \
src/llama.o \
@@ -926,6 +936,7 @@ $(info - LLAMA_NO_LLAMAFILE)
$(info - LLAMA_NO_ACCELERATE)
$(info - LLAMA_NO_OPENMP)
$(info - LLAMA_NO_METAL)
$(info - LLAMA_NO_CCACHE)
$(info )
endif

@@ -959,15 +970,22 @@ ggml/src/ggml-quants.o: \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@

ggml/src/ggml-aarch64.o: \
ggml/src/ggml-aarch64.c \
ggml/include/ggml.h \
ggml/src/ggml-aarch64.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@

ggml/src/ggml-blas.o: \
ggml/src/ggml-blas.cpp \
ggml/include/ggml-blas.h
$(CXX) $(CXXFLAGS) -c $< -o $@

ifndef GGML_NO_LLAMAFILE
ggml/src/sgemm.o: \
ggml/src/sgemm.cpp \
ggml/src/sgemm.h \
ggml/src/llamafile/sgemm.o: \
ggml/src/llamafile/sgemm.cpp \
ggml/src/llamafile/sgemm.h \
ggml/include/ggml.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_NO_LLAMAFILE
@@ -1092,7 +1110,7 @@ clean:
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
rm -rvf $(BUILD_TARGETS)
rm -rvf $(TEST_TARGETS)
rm -rvf $(LEGACY_TARGETS)
rm -rvf $(LEGACY_TARGETS_CLEAN)
find examples pocs -type f -name "*.o" -delete

#
@@ -1488,3 +1506,69 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

#
# Deprecated binaries that we want to keep around long enough for people to migrate to the new filenames, then these can be removed.
#
# Mark legacy binary targets as .PHONY so that they are always checked.
.PHONY: main quantize perplexity embedding server finetune

main: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard main))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'main' binary is deprecated. Please use 'llama-cli' instead."
@echo " Remove the 'main' binary to remove this warning."
@echo "#########"
endif

quantize: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard quantize))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
@echo " Remove the 'quantize' binary to remove this warning."
@echo "#########"
endif

perplexity: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard perplexity))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
@echo " Remove the 'perplexity' binary to remove this warning."
@echo "#########"
endif

embedding: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard embedding))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
@echo " Remove the 'embedding' binary to remove this warning."
@echo "#########"
endif

server: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard server))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'server' binary is deprecated. Please use 'llama-server' instead."
@echo " Remove the 'server' binary to remove this warning."
@echo "#########"
endif

finetune: examples/deprecation-warning/deprecation-warning.cpp
ifneq (,$(wildcard finetune))
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo "#########"
@echo "WARNING: The 'finetune' binary is deprecated. Please use 'llama-finetune' instead."
@echo " Remove the 'finetune' binary to remove this warning."
@echo "#########"
endif
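
Each of the stub rules above compiles examples/deprecation-warning/deprecation-warning.cpp under an old binary name, so anyone still invoking that name is pointed to its replacement. As a rough sketch of how such a source file can work (an illustration under assumed behavior, not the actual file from the repository), it only needs to inspect argv[0] and print the renamed tool:

    // deprecation-warning.cpp -- illustrative sketch only; the real file in
    // examples/deprecation-warning/ may differ.
    #include <cstdio>
    #include <string>

    int main(int /*argc*/, char ** argv) {
        // Strip any directory prefix from the invoked name, e.g. "./main" -> "main".
        std::string name = (argv && argv[0]) ? argv[0] : "unknown";
        const auto pos = name.find_last_of("/\\");
        if (pos != std::string::npos) {
            name = name.substr(pos + 1);
        }

        // Most tools were renamed to "llama-<old name>"; "main" became "llama-cli".
        const std::string replacement = (name == "main") ? "llama-cli" : "llama-" + name;

        fprintf(stderr, "WARNING: the '%s' binary is deprecated, please use '%s' instead.\n",
                name.c_str(), replacement.c_str());
        return 1; // fail loudly so scripts that still call the old name notice
    }

Note that the stubs are only built when an old binary is already present in the tree (the $(wildcard ...) guard), so fresh checkouts are not cluttered with them.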
1 change: 1 addition & 0 deletions Package.swift
@@ -10,6 +10,7 @@ var sources = [
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.c",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-aarch64.c",
]

var resources: [Resource] = []
5 changes: 3 additions & 2 deletions README.md
@@ -96,8 +96,9 @@ Typically finetunes of the base models below are supported as well.
- [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
- [x] [OLMo](https://allenai.org/olmo)
- [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
- [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)

(instructions for supporting more models: [HOWTO-add-model.md](./docs/HOWTO-add-model.md))
(instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

**Multimodal models:**

@@ -452,7 +453,7 @@ To learn more how to measure perplexity using llama.cpp, [read this documentatio
- [How to build](./docs/build.md)
- [Running on Docker](./docs/docker.md)
- [Build on Android](./docs/android.md)
- [Performance troubleshooting](./docs/token_generation_performance_tips.md)
- [Performance troubleshooting](./docs/development/token_generation_performance_tips.md)
- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)

**Seminal papers and background on the models**
4 changes: 4 additions & 0 deletions common/common.cpp
@@ -1,3 +1,7 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif

#include "common.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
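
The new guard at the top of common/common.cpp silences MSVC's C++17 deprecation warning for the <codecvt> facilities that common.h pulls in indirectly; such _SILENCE_* macros only take effect if they are defined before the offending header is included. A minimal, self-contained illustration of the pattern (a hypothetical example, not code from this commit):

    // The macro must be defined before any header that (directly or
    // transitively) includes <codecvt>, otherwise the warning still fires.
    #if defined(_MSC_VER)
    #define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
    #endif

    #include <codecvt>   // deprecated in C++17; warning suppressed by the macro above
    #include <locale>
    #include <string>

    int main() {
        std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
        std::wstring wide = conv.from_bytes("hello");
        return wide.size() == 5 ? 0 : 1;
    }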
2 changes: 1 addition & 1 deletion common/log.h
@@ -630,7 +630,7 @@ inline std::string LOG_TOKENS_TOSTR_PRETTY(const C & ctx, const T & tokens)
buf << "[ ";

bool first = true;
for (const auto &token : tokens)
for (const auto & token : tokens)
{
if (!first) {
buf << ", ";
11 changes: 6 additions & 5 deletions common/sampling.cpp
@@ -282,8 +282,6 @@ static llama_token llama_sampling_sample_impl(
GGML_ASSERT(!original_logits.empty());
}
llama_token id = 0;
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);

if (temp < 0.0) {
// greedy sampling, with probs
@@ -324,6 +322,9 @@ static llama_token llama_sampling_sample_impl(
}

if (ctx_sampling->grammar != NULL && !is_resampling) {
// Get a pointer to the logits
float * logits = llama_get_logits_ith(ctx_main, idx);

// Create an array with a single token data element for the sampled id
llama_token_data single_token_data = {id, logits[id], 0.0f};
llama_token_data_array single_token_data_array = { &single_token_data, 1, false };
@@ -377,7 +378,7 @@ static llama_token_data_array llama_sampling_prepare_impl(
if (ctx_sampling->grammar != NULL && !apply_grammar) {
GGML_ASSERT(original_logits != NULL);
// Only make a copy of the original logits if we are not applying grammar checks, not sure if I actually have to do this.
*original_logits = {logits, logits + llama_n_vocab(llama_get_model(ctx_main))};
*original_logits = {logits, logits + n_vocab};
}

// apply params.logit_bias map
@@ -390,10 +391,10 @@ static llama_token_data_array llama_sampling_prepare_impl(
llama_sample_apply_guidance(ctx_main, logits, logits_guidance, params.cfg_scale);
}

cur.clear();
cur.resize(n_vocab);

for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
cur.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
cur[token_id] = llama_token_data{token_id, logits[token_id], 0.0f};
}

llama_token_data_array cur_p = { cur.data(), cur.size(), false };
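
The final hunk above replaces clear() plus one emplace_back per token with a single resize() followed by indexed assignment, removing the per-element size and capacity bookkeeping from a loop that runs n_vocab times. A small standalone sketch of the before and after patterns (simplified stand-in types, not the actual llama.cpp structures):

    // Sketch of the change in llama_sampling_prepare_impl: instead of growing
    // the candidate vector one emplace_back at a time, size it once and assign
    // by index. token_data is a simplified stand-in for llama_token_data.
    #include <cstdio>
    #include <vector>

    struct token_data {
        int   id;
        float logit;
        float p;
    };

    int main() {
        const int n_vocab = 32000;                 // stand-in vocabulary size
        std::vector<float> logits(n_vocab, 0.0f);  // stand-in for llama_get_logits_ith()

        std::vector<token_data> cur;

        // Old pattern: per-element size/capacity checks inside emplace_back.
        cur.clear();
        for (int id = 0; id < n_vocab; id++) {
            cur.emplace_back(token_data{id, logits[id], 0.0f});
        }

        // New pattern: one resize, then plain indexed stores.
        cur.resize(n_vocab);
        for (int id = 0; id < n_vocab; id++) {
            cur[id] = token_data{id, logits[id], 0.0f};
        }

        printf("%zu candidates prepared\n", cur.size());
        return 0;
    }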