From bf290a49c56fd732e3a898188da2ddd5c91ce47f Mon Sep 17 00:00:00 2001 From: Vulcan <93451215+trholding@users.noreply.github.com> Date: Mon, 1 Apr 2024 16:54:07 +0530 Subject: [PATCH] AVX Support - run.c : AVX support based on https://github.com/karpathy/llama2.c/blob/feature/avx2/run.c but loop unrolled and other improvements - Makefile: Applied -march=native -mtune=native to most builds --- Makefile | 97 +++++++++++++++++++++++++++++--------------------------- run.c | 68 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 46 deletions(-) diff --git a/Makefile b/Makefile index 5de509a3..0f50602a 100644 --- a/Makefile +++ b/Makefile @@ -32,11 +32,11 @@ runq: runq_cc .PHONY: run_cc run_cc: ## - Standard build with basic optimizations - $(CC) -O3 -o run run.c -lm + $(CC) -O3 -march=native -mtune=native -o run run.c -lm .PHONY: runq_cc runq_cc: ## - Same for quantized build - $(CC) -O3 -o run runq.c -lm + $(CC) -O3 -march=native -mtune=native -o run runq.c -lm # https://gcc.gnu.org/onlinedocs/gcc/Optimize-Options.html # https://simonbyrne.github.io/notes/fastmath/ @@ -49,118 +49,123 @@ runq_cc: ## - Same for quantized build # In our specific application this is *probably* okay to use .PHONY: run_cc_fast run_cc_fast: ## - More Optimized build. Disregards strict standards compliance - $(CC) -Ofast -o run run.c -lm + $(CC) -Ofast -march=native -mtune=native -o run run.c -lm .PHONY: runq_cc_fast runq_cc_fast: ## - Same for quantized build - $(CC) -Ofast -o run runq.c -lm + $(CC) -Ofast -march=native -mtune=native -o run runq.c -lm # compiles with gnu99 standard flags for amazon linux, coreos, etc. compatibility .PHONY: run_cc_gnu run_cc_gnu: ## - Optimized Generic linux distro build - $(CC) -Ofast -std=gnu11 -o run run.c -lm + $(CC) -Ofast -march=native -mtune=native -std=gnu11 -o run run.c -lm .PHONY: runq_cc_gnu runq_cc_gnu: ## - Same for quantized build - $(CC) -Ofast -std=gnu11 -o run runq.c -lm + $(CC) -Ofast -march=native -mtune=native -std=gnu11 -o run runq.c -lm ##@ Accelerated Builds # additionally compiles with OpenMP, allowing multithreaded runs # make sure to also enable multiple threads when running, e.g.: # OMP_NUM_THREADS=4 ./run out/model.bin + +.PHONY: run_cc_avx +run_cc_avx: ## - ***NEW*** AVX accelerated build + $(CC) -D OPENMP -D ACCELAVX -Ofast -fopenmp -mavx -march=native -mtune=native run.c -lm -o run + .PHONY: run_cc_openmp run_cc_openmp: ## - OpenMP accelerated build - $(CC) -D OPENMP -Ofast -fopenmp -march=native run.c -lm -o run + $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c -lm -o run .PHONY: runq_cc_openmp runq_cc_openmp: ## - Same for quantized build - $(CC) -D OPENMP -Ofast -fopenmp -march=native runq.c -lm -o run + $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c -lm -o run .PHONY: run_cc_openacc run_cc_openacc: ## - OpenACC accelerated build - $(CC) -D OPENACC -Ofast -fopenacc -march=native run.c -lm -o run + $(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native run.c -lm -o run .PHONY: runq_cc_openacc runq_cc_openacc: ## - Same for quantized build - $(CC) -D OPENACC -Ofast -fopenacc -march=native runq.c -lm -o run + $(CC) -D OPENACC -Ofast -fopenacc -march=native -mtune=native runq.c -lm -o run .PHONY: run_cc_omp_gnu run_cc_omp_gnu: ## - Generic linux distro + OpenMP build - $(CC) -D OPENMP -Ofast -fopenmp -std=gnu11 run.c -lm -o run + $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 run.c -lm -o run .PHONY: runq_cc_omp_gnu runq_cc_omp_gnu: ## - Same for quantized build - $(CC) -D OPENMP -Ofast -fopenmp -std=gnu11 runq.c -lm -o run + $(CC) -D OPENMP -Ofast -fopenmp -march=native -mtune=native -std=gnu11 runq.c -lm -o run .PHONY: run_cc_clblast run_cc_clblast: ## - CLBlast OpenCL CBLAS GPU accelerated build - $(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native run.c -lm -lclblast -o run + $(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native run.c -lm -lclblast -o run .PHONY: runq_cc_clblast runq_cc_clblast: ## - Same for quantized build - $(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native runq.c -lm -lclblast -o run + $(CC) -D OPENMP -D CLBLAST -Ofast -fopenmp -march=native -mtune=native runq.c -lm -lclblast -o run .PHONY: run_cc_openblas run_cc_openblas: ## - Openblas CBLAS accelerated build - $(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -I$(OPENBLAS_INC) run.c -lm -lopenblas -o run + $(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) run.c -lm -lopenblas -o run .PHONY: runq_cc_openblas runq_cc_openblas: ## - Same for quantized build - $(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -I$(OPENBLAS_INC) runq.c -lm -lopenblas -o run + $(CC) -D OPENMP -D OPENBLAS -Ofast -fopenmp -march=native -mtune=native -I$(OPENBLAS_INC) runq.c -lm -lopenblas -o run .PHONY: run_cc_cblas run_cc_cblas: ## - Generic CBLAS accelerated build - $(CC) -D CBLAS -Ofast -fopenmp -march=native run.c -lm -lcblas -o run + $(CC) -D CBLAS -Ofast -fopenmp -march=native -mtune=native run.c -lm -lcblas -o run .PHONY: runq_cc_cblas runq_cc_cblas: ## - Same for quantized build - $(CC) -D CBLAS -Ofast -fopenmp -march=native runq.c -lm -lcblas -o run + $(CC) -D CBLAS -Ofast -fopenmp -march=native -mtune=native runq.c -lm -lcblas -o run .PHONY: run_cc_blis run_cc_blis: ## - BLIS accelerated build - $(CC) -D BLIS -Ofast -fopenmp -march=native -I$(BLIS_INC) run.c -lm -lblis -o run + $(CC) -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) run.c -lm -lblis -o run .PHONY: runq_cc_blis runq_cc_blis: ## - Same for quantized build - $(CC) -D BLIS -Ofast -fopenmp -march=native -I$(BLIS_INC) runq.c -lm -lblis -o run + $(CC) -D BLIS -Ofast -fopenmp -march=native -mtune=native -I$(BLIS_INC) runq.c -lm -lblis -o run ##@ Special Builds ##@ ---> x86_64 # amd64 (x86_64) / Intel Mac (WIP) Do not use! .PHONY: run_cc_mkl run_cc_mkl: ## - OpenMP + Intel MKL CBLAS build (x86_64 / intel Mac) (WIP) - $(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native run.c -lm -lblis -o run + $(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c -lm -lblis -o run .PHONY: runq_cc_mkl runq_cc_mkl: ## - Same for quantized build - $(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native runq.c -lm -lblis -o run + $(CC) -D MKL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c -lm -lblis -o run ##@ ---> ARM64 / aarch64 .PHONY: run_cc_armpl run_cc_armpl: ## - ARM PL BLAS accelerated build (ARM64 & Mac) (WIP) - $(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native run.c -lm -larmpl_lp64_mp -o run + $(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c -lm -larmpl_lp64_mp -o run .PHONY: runq_cc_armpl runq_cc_armpl: ## - Same for quantized build - $(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native runq.c -lm -larmpl_lp64_mp -o run + $(CC) -D ARMPL -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c -lm -larmpl_lp64_mp -o run ##@ ---> Macintosh .PHONY: run_cc_mac_accel run_cc_mac_accel: ## - Mac OS OPENMP + CBLAS via Accelerate Framework build (WIP/TEST) - $(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native run.c -lm -framework Accelerate -o run + $(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native run.c -lm -framework Accelerate -o run .PHONY: runq_cc_mac_accel runq_cc_mac_accel: ## - Same for quantized build - $(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native runq.c -lm -framework Accelerate -o run + $(CC) -D AAF -D OPENMP -Ofast -fopenmp -march=native -mtune=native runq.c -lm -framework Accelerate -o run ##@ ---> Windows .PHONY: run_win64 run_win: ## - Optimized Windows build with MinGW-w64 toolchain - x86_64-w64-mingw32-gcc -Ofast -D_WIN32 -o run.exe -I. run.c win.c + x86_64-w64-mingw32-gcc -Ofast -march=native -mtune=native -D_WIN32 -o run.exe -I. run.c win.c .PHONY: runq_win64 runq_win: ## - Same for quantized build - x86_64-w64-mingw32-gcc -Ofast -D_WIN32 -o run.exe -I. runq.c win.c + x86_64-w64-mingw32-gcc -Ofast -march=native -mtune=native -D_WIN32 -o run.exe -I. runq.c win.c .PHONY: run_win_msvc run_win_msvc: ## - OpenMP accelerated Windows build with MSVC toolchain (Untested) @@ -220,105 +225,105 @@ runq_cosmocc_strlit: ## - Same for quantized build # GCC OpenMP + embedded model & tokenizer .PHONY: run_gcc_openmp_incbin run_gcc_openmp_incbin: ## - Gcc + OpenMP + embedded model fast build - gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run + gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run .PHONY: runq_gcc_openmp_incbin runq_gcc_openmp_incbin: ## - Same for quantized build - gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run + gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run .PHONY: run_gcc_openmp_strlit run_gcc_openmp_strlit: ## - Gcc + OpenMP + embedded model build gcc -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -D STRLIT -D LLOOP run.c -lm -o run + gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -mtune=native -D STRLIT -D LLOOP run.c -lm -o run .PHONY: runq_gcc_openmp_strlit runq_gcc_openmp_strlit: ## - Same for quantized build gcc -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -D STRLIT -D LLOOP runq.c -lm -o run + gcc -D OPENMP -Ofast -fopenmp -foffload-options="-Ofast -lm" -march=native -mtune=native -D STRLIT -D LLOOP runq.c -lm -o run # Clang OpenMP + embedded model & tokenizer .PHONY: run_clang_openmp_incbin run_clang_openmp_incbin: ## - Clang + OpenMP + embedded model fast build - clang -D OPENMP -Ofast -fopenmp -march=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run + clang -D OPENMP -Ofast -fopenmp -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run .PHONY: runq_clang_openmp_incbin runq_clang_openmp_incbin: ## - Same for quantized build - clang -D OPENMP -Ofast -fopenmp -march=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run + clang -D OPENMP -Ofast -fopenmp -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run .PHONY: run_clang_openmp_strlit run_clang_openmp_strlit: ## - Clang + OpenMP + embedded model build clang -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - clang -D OPENMP -Ofast -fopenmp -march=native -D STRLIT -D LLOOP run.c -lm -o run + clang -D OPENMP -Ofast -fopenmp -march=native -mtune=native -D STRLIT -D LLOOP run.c -lm -o run .PHONY: runq_clang_openmp_strlit runq_clang_openmp_strlit: ## - Same for quantized build clang -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - clang -D OPENMP -Ofast -fopenmp -march=native -D STRLIT -D LLOOP runq.c -lm -o run + clang -D OPENMP -Ofast -fopenmp -march=native -mtune=native -D STRLIT -D LLOOP runq.c -lm -o run ##@ ---> GCC/Clang Embedded Model Builds ---> Statically Linked # GCC static + embedded model & tokenizer .PHONY: run_gcc_static_incbin run_gcc_static_incbin: ## - Optimized Static gcc + embedded model fast build - gcc -Ofast -static -march=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run + gcc -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run .PHONY: runq_gcc_static_incbin runq_gcc_static_incbin: ## - Same for quantized build - gcc -Ofast -static -march=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run + gcc -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run .PHONY: run_gcc_static_strlit run_gcc_static_strlit: ## - Optimized Static gcc + embedded model build gcc -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - gcc -Ofast -static -march=native -D STRLIT -D LLOOP run.c -lm -o run + gcc -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP run.c -lm -o run .PHONY: runq_gcc_static_strlit runq_gcc_static_strlit: ## - Same for quantized build gcc -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - gcc -Ofast -static -march=native -D STRLIT -D LLOOP runq.c -lm -o run + gcc -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP runq.c -lm -o run # Clang static + embedded model & tokenizer .PHONY: run_clang_static_incbin run_clang_static_incbin: ## - Optimized Static clang + embedded model fast build - clang -Ofast -static -march=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run + clang -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP run.c -lm -o run .PHONY: runq_clang_static_incbin runq_clang_static_incbin: ## - Same for quantized build - clang -Ofast -static -march=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run + clang -Ofast -static -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -D LLOOP runq.c -lm -o run .PHONY: run_clang_static_strlit run_clang_static_strlit: ## - Optimized Static clang + embedded model build clang -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - clang -Ofast -static -march=native -D STRLIT -D LLOOP run.c -lm -o run + clang -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP run.c -lm -o run .PHONY: runq_clang_static_strlit runq_clang_static_strlit: ## - Same for quantized build clang -Ofast strliteral.c -o strlit ./strlit -i emb_Model_data $(MOD_PATH) model.h ./strlit -i emb_Tokenizer_data $(TOK_PATH) tokenizer.h - clang -Ofast -static -march=native -D STRLIT -D LLOOP runq.c -lm -o run + clang -Ofast -static -march=native -mtune=native -D STRLIT -D LLOOP runq.c -lm -o run # Build for termux on Android ##@ ---> Android .PHONY: run_incbin_tmux run_incbin_tmux: get_model ## - Optimized build + Embedded Model for Termux on Android - $(CC) -Ofast -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -o run run.c -lm + $(CC) -Ofast -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -o run run.c -lm .PHONY: runq_incbin_tmux runq_incbin_tmux: get_model ## - Same for quantized build - $(CC) -Ofast -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -o run runq.c -lm + $(CC) -Ofast -march=native -mtune=native -D INC_BIN -D MODPATH=$(MOD_PATH) -D TOKPATH=$(TOK_PATH) -o run runq.c -lm ##@ ---> L2E Unikernel (Asteroid) # Unikraft Unikernel build diff --git a/run.c b/run.c index 082464db..8a17170b 100644 --- a/run.c +++ b/run.c @@ -114,6 +114,13 @@ __static_yoink("zipos"); #include #endif +// ---------------------------------------------------------------------------- +// AVX Support + +#ifdef ACCELAVX +#include +#endif + // ---------------------------------------------------------------------------- // OpenMP and OpenACC Support @@ -378,6 +385,65 @@ void softmax(float* x, int size) { } } +// L2E Addition +#ifdef ACCELAVX +// 4x loop unrolled avx matmul +void avx_matmul(float* xout, const float* x, const float* w, int n, int d) { + int nn = n / 8 * 8; // ensure n is a multiple of 8 + int i; + __m256 sum_vec; + #ifdef ACCEL + #pragma omp parallel for private(i, sum_vec) + #endif + for (i = 0; i < d; i++) { + sum_vec = _mm256_setzero_ps(); // for AVX2, sum of 8 floats + int i_n = i * n; + #ifdef ACCEL + #pragma omp simd + #endif + for (int j = 0; j < nn; j += 32) { + // Load 32 values from w and x + __m256 w_vec0 = _mm256_loadu_ps(&w[i_n + j]); + __m256 w_vec1 = _mm256_loadu_ps(&w[i_n + j + 8]); + __m256 w_vec2 = _mm256_loadu_ps(&w[i_n + j + 16]); + __m256 w_vec3 = _mm256_loadu_ps(&w[i_n + j + 24]); + __m256 x_vec0 = _mm256_loadu_ps(&x[j]); + __m256 x_vec1 = _mm256_loadu_ps(&x[j + 8]); + __m256 x_vec2 = _mm256_loadu_ps(&x[j + 16]); + __m256 x_vec3 = _mm256_loadu_ps(&x[j + 24]); + + // Multiply and accumulate + __m256 prod_vec0 = _mm256_mul_ps(w_vec0, x_vec0); + __m256 prod_vec1 = _mm256_mul_ps(w_vec1, x_vec1); + __m256 prod_vec2 = _mm256_mul_ps(w_vec2, x_vec2); + __m256 prod_vec3 = _mm256_mul_ps(w_vec3, x_vec3); + sum_vec = _mm256_add_ps(sum_vec, prod_vec0); + sum_vec = _mm256_add_ps(sum_vec, prod_vec1); + sum_vec = _mm256_add_ps(sum_vec, prod_vec2); + sum_vec = _mm256_add_ps(sum_vec, prod_vec3); + } + + // Perform horizontal add + sum_vec = _mm256_hadd_ps(sum_vec, sum_vec); + sum_vec = _mm256_hadd_ps(sum_vec, sum_vec); + float vals[8]; + _mm256_storeu_ps(vals, sum_vec); + float val = vals[0] + vals[4]; + + // handle remainder if n is not a multiple of 8 + int j; + #ifdef ACCEL + #pragma omp simd reduction(+:val) + #endif + for (j = nn; j < n; j++) { + val += w[i_n + j] * x[j]; + } + xout[i] = val; + } +} +#endif +// END L2E Addition + void matmul(float* xout, float* x, float* w, int n, int d) { // W (d,n) @ x (n,) -> xout (d,) // by far the most amount of time is spent inside this little function @@ -385,6 +451,8 @@ void matmul(float* xout, float* x, float* w, int n, int d) { // L2E Addition #ifdef BLAS cblas_sgemv(CblasRowMajor, CblasNoTrans, d, n, 1.0f, w, n, x, 1, 0.0f, xout, 1); + #elif defined(ACCELAVX) + avx_matmul(xout, x, w, n, d); #else #ifdef ACCEL ACCEL(i) // OMP/OACC Macro