Skip to content

Commit

Permalink
Add FP8 support to gguf/llama:
Browse files Browse the repository at this point in the history
E5M2 & E4M3: for use with FP8 distributed model
E4M3_Q & E3M4_Q: for gguf quantized model.

E5M2 and A4M3 type are use like FP16 / BF16 native.
E4M3_Q and E3M4_Q are define like Q8_0 with bloc size of 256 (like QK_K)
  • Loading branch information
Djip007 committed Nov 30, 2024
1 parent 43b5d9e commit 038b5fa
Show file tree
Hide file tree
Showing 20 changed files with 746 additions and 124 deletions.
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ if (NOT DEFINED GGML_LLAMAFILE)
set(GGML_LLAMAFILE_DEFAULT ON)
endif()

if (NOT DEFINED GGML_OPENMP_SIMD)
set(GGML_OPENMP_SIMD_DEFAULT ON)
endif()

if (NOT DEFINED GGML_AMX)
set(GGML_AMX ON)
endif()
Expand Down
32 changes: 20 additions & 12 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,10 @@ GGML_NO_OPENMP := 1
DEPRECATE_WARNING := 1
endif

ifdef LLAMA_NO_OPENMP_SIMD
GGML_NO_OPENMP_SIMD := 1
endif

ifdef LLAMA_NO_METAL
GGML_NO_METAL := 1
DEPRECATE_WARNING := 1
Expand Down Expand Up @@ -542,6 +546,12 @@ ifndef GGML_NO_OPENMP
MK_CXXFLAGS += -fopenmp
endif # GGML_NO_OPENMP

ifndef GGML_NO_OPENMP_SIMD
MK_CPPFLAGS += -DGGML_USE_OPENMP_SIMD
MK_CFLAGS += -fopenmp-simd
MK_CXXFLAGS += -fopenmp-simd
endif # GGML_NO_OPENMP_SIMD

ifdef GGML_OPENBLAS
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
Expand Down Expand Up @@ -948,12 +958,14 @@ OBJ_GGML = \
$(DIR_GGML)/src/ggml-alloc.o \
$(DIR_GGML)/src/ggml-backend.o \
$(DIR_GGML)/src/ggml-backend-reg.o \
$(DIR_GGML)/src/ggml-fp8_cpp11.o \
$(DIR_GGML)/src/ggml-opt.o \
$(DIR_GGML)/src/ggml-quants.o \
$(DIR_GGML)/src/ggml-threading.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp11.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-fp8_cpp11.o \
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
$(OBJ_GGML_EXT)

Expand Down Expand Up @@ -1094,17 +1106,13 @@ DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
# Default target
all: $(BUILD_TARGETS)

# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
$(DIR_GGML)/src/ggml-cpu/ggml-cpu-cpp.o: \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/include/ggml-backend.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-impl.h
$(CXX) $(CXXFLAGS) -c $< -o $@
# for c++17 build
$(DIR_GGML)/%_cpp17.o: $(DIR_GGML)/%.cpp
$(CXX) $(CXXFLAGS) -MMD -std=c++17 -c $< -o $@

# for c++11 build
$(DIR_GGML)/%_cpp11.o: $(DIR_GGML)/%.cpp
$(CXX) $(CXXFLAGS) -MMD -std=c++11 -c $< -o $@

# Rules for building object files
$(DIR_GGML)/%.o: $(DIR_GGML)/%.c
Expand Down
1 change: 1 addition & 0 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ var sources = [
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
"ggml/src/ggml-threading.cpp",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-fp8.cpp",
]

var resources: [Resource] = []
Expand Down
2 changes: 2 additions & 0 deletions examples/quantize/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q4_0_4_4", LLAMA_FTYPE_MOSTLY_Q4_0_4_4, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_4_8", LLAMA_FTYPE_MOSTLY_Q4_0_4_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "Q4_0_8_8", LLAMA_FTYPE_MOSTLY_Q4_0_8_8, " 4.34G, +0.4685 ppl @ Llama-3-8B", },
{ "E4M3_Q", LLAMA_FTYPE_MOSTLY_E4M3_Q, "12.21G, 0.0050 kld @ Mistral-Nemo", },
{ "E3M4_Q", LLAMA_FTYPE_MOSTLY_E3M4_Q, "12.21G, 0.0016 kld @ Mistral-Nemo", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, +0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
Expand Down
5 changes: 5 additions & 0 deletions ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ if (NOT GGML_LLAMAFILE_DEFAULT)
set(GGML_LLAMAFILE_DEFAULT OFF)
endif()

if (NOT GGML_OPENMP_SIMD_DEFAULT)
set(GGML_OPENMP_SIMD_DEFAULT OFF)
endif()

if (NOT GGML_CUDA_GRAPHS_DEFAULT)
set(GGML_CUDA_GRAPHS_DEFAULT OFF)
endif()
Expand Down Expand Up @@ -112,6 +116,7 @@ option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_SVE "ggml: enable SVE" OFF)
option(GGML_OPENMP_SIMD "ggml: enable OPENMP_SIMD" ${GGML_OPENMP_SIMD_DEFAULT})

if (WIN32)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
Expand Down
8 changes: 8 additions & 0 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,10 @@ extern "C" {
GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_E5M2 = 39,
GGML_TYPE_E4M3 = 40,
GGML_TYPE_E4M3_Q = 41,
GGML_TYPE_E3M4_Q = 42,
GGML_TYPE_COUNT,
};

Expand Down Expand Up @@ -436,6 +440,10 @@ extern "C" {
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
GGML_FTYPE_MOSTLY_E5M2 = 28, // except 1d tensors
GGML_FTYPE_MOSTLY_E4M3 = 29, // except 1d tensors
GGML_FTYPE_MOSTLY_E4M3_Q = 30, // except 1d tensors
GGML_FTYPE_MOSTLY_E3M4_Q = 31, // except 1d tensors
};

// available tensor operations:
Expand Down
5 changes: 4 additions & 1 deletion ggml/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,10 @@ add_library(ggml-base
ggml-quants.c
ggml-quants.h
ggml-aarch64.c
ggml-aarch64.h)
ggml-aarch64.h
ggml-fp8.cpp
ggml-fp8.h
)

target_include_directories(ggml-base PRIVATE .)

Expand Down
76 changes: 59 additions & 17 deletions ggml/src/ggml-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,20 @@
typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CPP)
#include <cstdint>

typedef uint16_t ggml_half;
typedef uint32_t ggml_half2;

// std-c++ allow anonymous unions but some compiler warn on it
#define GGML_COMMON_AGGR_U data
// std-c++ do not allow it.
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_METAL)
Expand All @@ -15,7 +28,8 @@ typedef uint32_t ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_CUDA)
Expand All @@ -29,7 +43,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_HIP)
Expand All @@ -39,7 +54,8 @@ typedef half2 ggml_half2;
typedef half ggml_half;
typedef half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#elif defined(GGML_COMMON_DECL_SYCL)
Expand All @@ -49,7 +65,8 @@ typedef half2 ggml_half2;
typedef sycl::half ggml_half;
typedef sycl::half2 ggml_half2;

#define GGML_COMMON_AGGR data
#define GGML_COMMON_AGGR_U
#define GGML_COMMON_AGGR_S data

#define GGML_COMMON_DECL
#endif
Expand Down Expand Up @@ -154,9 +171,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == 2 * sizeof(ggml_half) + QK4_1 / 2, "wrong q4_1 block size/padding");
Expand All @@ -175,9 +192,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half m; // min
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
Expand All @@ -196,9 +213,9 @@ typedef struct {
struct {
ggml_half d; // delta
ggml_half s; // d * sum(qs[i])
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 ds;
};
} GGML_COMMON_AGGR_U;
int8_t qs[QK8_1]; // quants
} block_q8_1;
static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_half) + QK8_1, "wrong q8_1 block size/padding");
Expand Down Expand Up @@ -261,9 +278,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
} block_q2_K;
static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_half) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");

Expand All @@ -288,9 +305,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qs[QK_K/2]; // 4--bit quants
} block_q4_K;
Expand All @@ -305,9 +322,9 @@ typedef struct {
struct {
ggml_half d; // super-block scale for quantized scales
ggml_half dmin; // super-block scale for quantized mins
} GGML_COMMON_AGGR;
} GGML_COMMON_AGGR_S;
ggml_half2 dm;
};
} GGML_COMMON_AGGR_U;
uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
uint8_t qh[QK_K/8]; // quants, high bit
uint8_t qs[QK_K/2]; // quants, low 4 bits
Expand Down Expand Up @@ -424,6 +441,24 @@ typedef struct {
} block_iq4_nlx4;
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");

// fp8 support
// - fp8 simple type
typedef struct { uint8_t bits; } ggml_e5m2_t;
typedef struct { uint8_t bits; } ggml_e4m3_t;

// - fp8 with bloc delta => 8.125 bpw
typedef struct {
float d; // delta
uint8_t qs[QK_K];
} block_e4m3_q;
static_assert(sizeof(block_e4m3_q) == sizeof(float) + QK_K, "wrong block_e4m3_q block size/padding");

typedef struct {
float d; // delta
uint8_t qs[QK_K];
} block_e3m4_q;
static_assert(sizeof(block_e3m4_q) == sizeof(float) + QK_K, "wrong block_e3m4_q block size/padding");

#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL

Expand All @@ -437,6 +472,13 @@ static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wro
#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_CPP)
#include <cstdint>

#define GGML_TABLE_BEGIN(type, name, size) static const type name[size] = {
#define GGML_TABLE_END() };

#define GGML_COMMON_IMPL
#elif defined(GGML_COMMON_IMPL_METAL)
#include <metal_stdlib>
Expand Down
22 changes: 22 additions & 0 deletions ggml/src/ggml-cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ list (APPEND GGML_CPU_SOURCES
ggml-cpu-aarch64.h
ggml-cpu-quants.c
ggml-cpu-quants.h
ggml-cpu-fp8.cpp
ggml-cpu-fp8.h
amx/amx.cpp
amx/amx.h
amx/mmq.cpp
Expand Down Expand Up @@ -45,6 +47,18 @@ if (GGML_OPENMP)
endif()
endif()

if (GGML_OPENMP_SIMD)
check_cxx_compiler_flag("-fopenmp-simd" SUPPORTS_OPENMP_SIMD)
if (SUPPORTS_OPENMP_SIMD)
# OpenMP_RUNTIME_MSVC=experimental / if (MSVC)
message(STATUS "Using OPENMP_SIMD.")
add_compile_definitions(GGML_USE_OPENMP_SIMD)
set(OPENMP_SIMD_FLAGS -fopenmp-simd)
else()
message(WARNING "C++ compiler lacks OPENMP_SIMD support.")
endif()
endif()

if (GGML_LLAMAFILE)
message(STATUS "Using llamafile")

Expand Down Expand Up @@ -304,3 +318,11 @@ set_source_files_properties(${GGML_CPU_SOURCES} PROPERTIES COMPILE_DEFINITIONS "
if (EMSCRIPTEN)
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
endif()

# FP8
if (OPENMP_SIMD_FLAGS)
# set_source_files_properties(ggml-cpu-fp8.cpp PROPERTIES COMPILE_FLAGS ${OPENMP_SIMD_FLAGS})
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS ${OPENMP_SIMD_FLAGS})
endif()


Loading

0 comments on commit 038b5fa

Please sign in to comment.