llama: update vendored code to commit 40c6d79f (ollama#7875)
jmorganca authored Dec 11, 2024
1 parent a37f4a8 commit 527cc97
Showing 289 changed files with 62,543 additions and 45,797 deletions.
18 changes: 18 additions & 0 deletions .github/workflows/test.yaml
@@ -269,6 +269,15 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - name: Add msys paths
        if: ${{ startsWith(matrix.os, 'windows-') }}
        run: |
          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: Install msys2 tools
        if: ${{ startsWith(matrix.os, 'windows-') }}
        run: |
          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
@@ -300,6 +309,15 @@ jobs:
      - uses: actions/checkout@v4
        with:
          submodules: recursive
      - name: Add msys paths
        if: ${{ startsWith(matrix.os, 'windows-') }}
        run: |
          echo "c:\msys64\usr\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
          echo "C:\msys64\clang64\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
      - name: Install msys2 tools
        if: ${{ startsWith(matrix.os, 'windows-') }}
        run: |
          Start-Process "c:\msys64\usr\bin\pacman.exe" -ArgumentList @("-S", "--noconfirm", "mingw-w64-clang-x86_64-gcc-compat", "mingw-w64-clang-x86_64-clang") -NoNewWindow -Wait
      - uses: actions/setup-go@v5
        with:
          go-version-file: go.mod
2 changes: 0 additions & 2 deletions api/types.go
@@ -216,7 +216,6 @@ type Options struct {
    TopK        int     `json:"top_k,omitempty"`
    TopP        float32 `json:"top_p,omitempty"`
    MinP        float32 `json:"min_p,omitempty"`
    TFSZ        float32 `json:"tfs_z,omitempty"`
    TypicalP    float32 `json:"typical_p,omitempty"`
    RepeatLastN int     `json:"repeat_last_n,omitempty"`
    Temperature float32 `json:"temperature,omitempty"`
@@ -595,7 +594,6 @@ func DefaultOptions() Options {
        Temperature:   0.8,
        TopK:          40,
        TopP:          0.9,
        TFSZ:          1.0,
        TypicalP:      1.0,
        RepeatLastN:   64,
        RepeatPenalty: 1.1,
1 change: 0 additions & 1 deletion docs/api.md
@@ -387,7 +386,6 @@ curl http://localhost:11434/api/generate -d '{
"top_k": 20,
"top_p": 0.9,
"min_p": 0.0,
"tfs_z": 0.5,
"typical_p": 0.7,
"repeat_last_n": 33,
"temperature": 0.8,
222 changes: 222 additions & 0 deletions llama/amx.cpp
@@ -0,0 +1,222 @@
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#include "amx.h"
#include "common.h"
#include "mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include "ggml-cpu.h"

#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif

#include <cstdlib>
#include <cstring>
#include <memory>

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)

// AMX buffer interface
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context);
}

static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
    return (void *)(buffer->context);
}

static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    memset((char *)tensor->data + offset, value, size);

    GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    if (qtype_has_amx_kernels(tensor->type)) {
        ggml_backend_amx_convert_weight(tensor, data, offset, size);
    } else {
        memcpy((char *)tensor->data + offset, data, size);
    }

    GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));
    memcpy(data, (const char *)tensor->data + offset, size);

    GGML_UNUSED(buffer);
}

static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    if (ggml_backend_buffer_is_host(src->buffer)) {
        if (qtype_has_amx_kernels(src->type)) {
            ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_nbytes(dst));
        } else {
            memcpy(dst->data, src->data, ggml_nbytes(src));
        }
        return true;
    }
    return false;

    GGML_UNUSED(buffer);
}

static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer->context, value, buffer->size);
}

static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
    /* .free_buffer   = */ ggml_backend_amx_buffer_free_buffer,
    /* .get_base      = */ ggml_backend_amx_buffer_get_base,
    /* .init_tensor   = */ NULL, // no initialization required
    /* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
    /* .set_tensor    = */ ggml_backend_amx_buffer_set_tensor,
    /* .get_tensor    = */ ggml_backend_amx_buffer_get_tensor,
    /* .cpy_tensor    = */ ggml_backend_amx_buffer_cpy_tensor,
    /* .clear         = */ ggml_backend_amx_buffer_clear,
    /* .reset         = */ NULL,
};

static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    return "AMX";

    GGML_UNUSED(buft);
}

static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * data = aligned_alloc(TENSOR_ALIGNMENT, size);
    if (data == NULL) {
        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
    }

    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}

static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    return TENSOR_ALIGNMENT;

    GGML_UNUSED(buft);
}

static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
    return ggml_backend_amx_get_alloc_size(tensor);

    GGML_UNUSED(buft);
}

static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    return false;

    GGML_UNUSED(buft);
}

#define ARCH_GET_XCOMP_PERM     0x1022
#define ARCH_REQ_XCOMP_PERM     0x1023
#define XFEATURE_XTILECFG       17
#define XFEATURE_XTILEDATA      18

static bool ggml_amx_init() {
#if defined(__gnu_linux__)
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
        fprintf(stderr, "AMX is not ready to be used!\n");
        return false;
    }
    return true;
#elif defined(_WIN32)
    return true;
#endif
}

ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
    static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
        /* .iface = */ {
            /* .get_name       = */ ggml_backend_amx_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_amx_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_amx_buffer_type_get_alignment,
            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
            /* .is_host        = */ ggml_backend_amx_buffer_type_is_host,
        },
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
        /* .context = */ NULL,
    };

    if (!ggml_amx_init()) {
        return NULL;
    }

    return &ggml_backend_buffer_type_amx;
}

bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft) {
    return buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
}

bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op) {
    // handle only 2d gemm for now
    auto is_contiguous_2d = [](const struct ggml_tensor * t) {
        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
    };

    switch (op->op) {
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            return true;

        case GGML_OP_MUL_MAT: {
            const struct ggml_tensor * src0 = op->src[0];
            const struct ggml_tensor * src1 = op->src[1];

            const enum ggml_type type = src0->type;
            const int64_t ne0 = op->ne[0];

            // amx kernels enabled for Q4_0, Q4_1, Q8_0, F16
            // Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
            bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);

            bool can_use_amx =
                is_contiguous_2d(src0) &&       // src0 must be contiguous
                is_contiguous_2d(src1) &&       // src1 must be contiguous
                src1->type == GGML_TYPE_F32 &&  // src1 must be float32
                has_amx_kernels &&              // with amx kernel impls
                ne0 % (TILE_N * 2) == 0;        // out_features is 32x

            return can_use_amx;
        }
        default:
            return false;
    }
}

#endif // defined(__AMX_INT8__) && defined(__AVX512VNNI__)
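
Note: the new AMX buffer type is consumed through the generic ggml-backend API rather than called directly. A minimal sketch of that flow (illustrative only, not part of this commit; ggml_backend_buft_alloc_buffer and ggml_backend_buffer_free are existing ggml-backend entry points):

#include "amx.h"
#include "ggml-backend.h"
#include <cstddef>

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
// Sketch: allocate a weight buffer backed by the AMX buffer type. Returns
// NULL when AMX is unavailable (e.g. the kernel denied the XTILEDATA
// permission request inside ggml_amx_init()).
static ggml_backend_buffer_t alloc_amx_weight_buffer(size_t size) {
    ggml_backend_buffer_type_t buft = ggml_backend_amx_buffer_type();
    if (buft == NULL) {
        return NULL;
    }
    // Tensors written into this buffer via set_tensor are repacked into the
    // AMX tile layout when qtype_has_amx_kernels(type) holds (see above).
    return ggml_backend_buft_alloc_buffer(buft, size);
}
#endif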
46 changes: 46 additions & 0 deletions llama/amx.h
@@ -0,0 +1,46 @@
/**
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
*
* MIT License
*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

#include "ggml-backend.h"
#include "ggml-cpu-impl.h"

#ifdef __cplusplus
extern "C" {
#endif

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)

ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
bool ggml_backend_amx_buft_is_amx(ggml_backend_buffer_type_t buft);
bool ggml_backend_amx_device_supports_op(const struct ggml_tensor * op);
void ggml_backend_amx_mul_mat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
size_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);

#endif

#ifdef __cplusplus
}
#endif
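
A hedged sketch of how the two compute entry points declared here pair up; this assumes ggml_compute_params carries the wsize scratch field it has in the vendored CPU backend, and the real call sites live there, not in this header:

#include "amx.h"

#if defined(__AMX_INT8__) && defined(__AVX512VNNI__)
// Sketch only (assumed calling convention): size the scratch buffer with
// ggml_backend_amx_desired_wsize() before dispatching ggml_backend_amx_mul_mat().
static void amx_mul_mat_checked(struct ggml_compute_params * params, struct ggml_tensor * dst) {
    const size_t wsize = ggml_backend_amx_desired_wsize(dst);
    if (params->wsize < wsize) {
        return; // caller must provide at least wsize bytes of scratch space
    }
    ggml_backend_amx_mul_mat(params, dst);
}
#endif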
2 changes: 1 addition & 1 deletion llama/build-info.cpp
@@ -1,4 +1,4 @@
int LLAMA_BUILD_NUMBER = 0;
char const *LLAMA_COMMIT = "3f1ae2e32cde00c39b96be6d01c2997c29bae555";
char const *LLAMA_COMMIT = "40c6d79fb52f995f47507fedfeaae2ac05d9b35c";
char const *LLAMA_COMPILER = "";
char const *LLAMA_BUILD_TARGET = "";
27 changes: 22 additions & 5 deletions llama/clip.cpp
@@ -1,5 +1,5 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
*
* MIT License
*
@@ -30,13 +30,18 @@
// Note: Even when using identical normalized image inputs (see normalize_image_u8_to_f32()) we have a significant difference in resulting embeddings compared to pytorch
#include "clip.h"
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif

#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif

#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
@@ -65,10 +70,17 @@
#include <cinttypes>
#include <limits>

#define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#define LOG_DBG(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
#if defined(LLAVA_LOG_OFF)
# define LOG_INF(...)
# define LOG_WRN(...)
# define LOG_ERR(...)
# define LOG_DBG(...)
#else // defined(LLAVA_LOG_OFF)
# define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
# define LOG_WRN(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_ERR(...) do { fprintf(stderr, __VA_ARGS__); } while (0)
# define LOG_DBG(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif // defined(LLAVA_LOG_OFF)

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
@@ -1200,6 +1212,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
    LOG_INF("%s: CLIP using Vulkan backend\n", __func__);
#endif

#ifdef GGML_USE_SYCL
    new_clip->backend = ggml_backend_sycl_init(0);
    LOG_INF("%s: CLIP using SYCL backend\n", __func__);
#endif

    if (!new_clip->backend) {
        new_clip->backend = ggml_backend_cpu_init();
        LOG_INF("%s: CLIP using CPU backend\n", __func__);
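
The clip.cpp logging change above makes the llava LOG_* macros compile-time switchable and routes LOG_DBG to stdout instead of stderr. A standalone sketch of the same pattern (illustrative, not part of the commit); build with -DLLAVA_LOG_OFF to compile the calls out:

#include <cstdio>

#if defined(LLAVA_LOG_OFF)
#  define LOG_INF(...)
#else
#  define LOG_INF(...) do { fprintf(stdout, __VA_ARGS__); } while (0)
#endif

int main() {
    // expands to nothing when LLAVA_LOG_OFF is defined at compile time
    LOG_INF("%s: CLIP using CPU backend\n", "clip_model_load");
    return 0;
}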
2 changes: 1 addition & 1 deletion llama/clip.h
@@ -1,5 +1,5 @@
/**
* llama.cpp - commit 3f1ae2e32cde00c39b96be6d01c2997c29bae555 - do not edit this file
* llama.cpp - commit 40c6d79fb52f995f47507fedfeaae2ac05d9b35c - do not edit this file
*
* MIT License
*