diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index a0f5e4d4ef6..6ea615539da 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -85,13 +85,12 @@ jobs:
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make
+          make dist
         name: make
       - uses: actions/upload-artifact@v4
         with:
           name: generate-windows-cpu
           path: |
-            build/**/*
             dist/windows-amd64/**
 
   # ROCm generation step
@@ -143,13 +142,12 @@ jobs:
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make -C llama print-HIP_PATH print-HIP_LIB_DIR
-          make rocm
+          make help-runners
+          make dist_rocm
       - uses: actions/upload-artifact@v4
         with:
           name: generate-windows-rocm
           path: |
-            build/**/*
             dist/windows-amd64/**
 
   # CUDA generation step
@@ -226,12 +224,11 @@ jobs:
           import-module 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -vsinstallpath 'C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise' -skipautomaticlocation -DevCmdArguments '-arch=x64 -no_logo'
           if (!(gcc --version | select-string -quiet clang)) { throw "wrong gcc compiler detected - must be clang" }
-          make cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
+          make dist_cuda_v$(($env:CUDA_PATH | split-path -leaf) -replace 'v(\d+).*', '$1')
       - uses: actions/upload-artifact@v4
         with:
           name: generate-windows-cuda-${{ matrix.cuda.version }}
           path: |
-            build/**/*
             dist/windows-amd64/**
 
   # windows arm64 generate, go build, and zip file (no installer)
diff --git a/.gitignore b/.gitignore
index d4785d9c21c..caa62a52418 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,9 +10,6 @@ ollama
 .idea
 test_data
 *.crt
-llm/build
-build/*/*/*
-!build/**/placeholder
 llama/build
 __debug_bin*
 llama/vendor
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index baf259d4e8e..c9bbdea770e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,9 +1,6 @@
 ARG GOLANG_VERSION=1.22.8
-ARG CMAKE_VERSION=3.22.1
 ARG CUDA_VERSION_11=11.3.1
-ARG CUDA_V11_ARCHITECTURES="50;52;53;60;61;62;70;72;75;80;86"
 ARG CUDA_VERSION_12=12.4.0
-ARG CUDA_V12_ARCHITECTURES="60;61;62;70;72;75;80;86;87;89;90;90a"
 ARG ROCM_VERSION=6.1.2
 ARG JETPACK_6=r36.2.0
 ARG JETPACK_5=r35.4.1
@@ -15,24 +12,22 @@ ARG JETPACK_5=r35.4.1
 #
 ### Then incremental builds will be much faster in this container
 #
-# make -j 10 && go build -trimpath -o dist/linux-amd64/ollama .
+# make -j 10 dist
 #
 FROM --platform=linux/amd64 rocm/dev-centos-7:${ROCM_VERSION}-complete AS unified-builder-amd64
-ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG CUDA_VERSION_11
 ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
 ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/cuda/bin:$PATH
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
-ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo && \
     dnf clean all && \
     dnf install -y \
     zsh \
-    cuda-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
-    cuda-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
+    cuda-toolkit-$(echo ${CUDA_VERSION_11} | cut -f1-2 -d. | sed -e "s/\./-/g") \
+    cuda-toolkit-$(echo ${CUDA_VERSION_12} | cut -f1-2 -d. | sed -e "s/\./-/g")
 # TODO intel oneapi goes here...
 ENV GOARCH amd64
 ENV CGO_ENABLED 1
@@ -46,12 +41,11 @@ ENTRYPOINT [ "zsh" ]
 # docker run --platform linux/arm64 --rm -it -v $(pwd):/go/src/github.com/ollama/ollama/ builder-arm64
 #
 FROM --platform=linux/arm64 rockylinux:8 AS unified-builder-arm64
-ARG CMAKE_VERSION
 ARG GOLANG_VERSION
 ARG CUDA_VERSION_11
 ARG CUDA_VERSION_12
 COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
+RUN GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
 RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/sbsa/cuda-rhel8.repo && \
     dnf config-manager --set-enabled appstream && \
     dnf clean all && \
@@ -62,37 +56,28 @@ RUN yum-config-manager --add-repo https://developer.download.nvidia.com/compute/
 ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH:/usr/local/cuda/bin
 ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/cuda/lib64
 ENV LIBRARY_PATH=/usr/local/cuda/lib64/stubs:/opt/amdgpu/lib64
-ENV GOARCH amd64
+ENV GOARCH arm64
 ENV CGO_ENABLED 1
 WORKDIR /go/src/github.com/ollama/ollama/
 ENTRYPOINT [ "zsh" ]
 
-FROM --platform=linux/amd64 unified-builder-amd64 AS runners-amd64
+FROM --platform=linux/amd64 unified-builder-amd64 AS build-amd64
 COPY . .
 ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
 ARG OLLAMA_SKIP_ROCM_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
 ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
     if grep "^flags" /proc/cpuinfo|grep avx>/dev/null; then \
-        make -j $(expr $(nproc) / 2 ) ; \
+        make -j $(expr $(nproc) / 2 ) dist ; \
     else \
-        make -j 5 ; \
+        make -j 5 dist ; \
+    fi
+RUN cd dist/linux-$GOARCH && \
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+RUN if [ -z "${OLLAMA_SKIP_ROCM_GENERATE}" ] ; then \
+    cd dist/linux-$GOARCH-rocm && \
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
     fi
-
-FROM --platform=linux/arm64 unified-builder-arm64 AS runners-arm64
-COPY . .
-ARG OLLAMA_SKIP_CUDA_GENERATE
-ARG OLLAMA_SKIP_CUDA_11_GENERATE
-ARG OLLAMA_SKIP_CUDA_12_GENERATE
-ARG CUDA_V11_ARCHITECTURES
-ARG CUDA_V12_ARCHITECTURES
-ARG OLLAMA_FAST_BUILD
-RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5
 
 # Jetsons need to be built in discrete stages
 FROM --platform=linux/arm64 nvcr.io/nvidia/l4t-jetpack:${JETPACK_5} AS runners-jetpack5-arm64
@@ -107,10 +92,9 @@ COPY . .
 ARG CGO_CFLAGS
 ENV GOARCH arm64
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 cuda_v11 \
+    make -j 5 dist_cuda_v11 \
         CUDA_ARCHITECTURES="72;87" \
         GPU_RUNNER_VARIANT=_jetpack5 \
-        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
         DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama \
         DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ollama/cuda_jetpack5
 
@@ -126,69 +110,26 @@ COPY . .
 ARG CGO_CFLAGS
 ENV GOARCH arm64
 RUN --mount=type=cache,target=/root/.ccache \
-    make -j 5 cuda_v12 \
+    make -j 5 dist_cuda_v12 \
         CUDA_ARCHITECTURES="87" \
         GPU_RUNNER_VARIANT=_jetpack6 \
-        CGO_EXTRA_LDFLAGS_LINUX=-L/usr/local/cuda/lib64/stubs \
         DIST_LIB_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama \
         DIST_GPU_RUNNER_DEPS_DIR=/go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ollama/cuda_jetpack6
 
-
-# Intermediate stages used for ./scripts/build_linux.sh
-FROM --platform=linux/amd64 centos:7 AS builder-amd64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/devtoolset-10/root/usr/bin:$PATH
-ENV CGO_ENABLED 1
-ENV GOARCH amd64
-WORKDIR /go/src/github.com/ollama/ollama
-
-FROM --platform=linux/amd64 builder-amd64 AS build-amd64
+FROM --platform=linux/arm64 unified-builder-arm64 AS build-arm64
 COPY . .
-COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-amd64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-ARG OLLAMA_SKIP_ROCM_GENERATE
+ARG OLLAMA_SKIP_CUDA_GENERATE
+ARG OLLAMA_FAST_BUILD
 RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
-RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
-RUN if [ -z ${OLLAMA_SKIP_ROCM_GENERATE} ] ; then \
-    cd dist/linux-$GOARCH-rocm && \
-    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-rocm.tgz ;\
-    fi
-
-FROM --platform=linux/arm64 rockylinux:8 AS builder-arm64
-ARG CMAKE_VERSION
-ARG GOLANG_VERSION
-COPY ./scripts/rh_linux_deps.sh /
-RUN CMAKE_VERSION=${CMAKE_VERSION} GOLANG_VERSION=${GOLANG_VERSION} sh /rh_linux_deps.sh
-ENV PATH /opt/rh/gcc-toolset-10/root/usr/bin:$PATH
-ENV CGO_ENABLED 1
-ENV GOARCH arm64
-WORKDIR /go/src/github.com/ollama/ollama
-
-FROM --platform=linux/arm64 builder-arm64 AS build-arm64
-COPY . .
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/build/ build/
+    make -j 5 dist
 COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/build/ build/
 COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/ dist/
-COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/build/ build/
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
 RUN cd dist/linux-$GOARCH && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH.tgz
 RUN cd dist/linux-$GOARCH-jetpack5 && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack5.tgz
 RUN cd dist/linux-$GOARCH-jetpack6 && \
-    tar --exclude runners -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz
+    tar -cf - . | pigz --best > ../ollama-linux-$GOARCH-jetpack6.tgz
 
 FROM --platform=linux/amd64 scratch AS dist-amd64
 COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz /
@@ -197,30 +138,13 @@ COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/ollama-linux-*.tgz
 FROM dist-$TARGETARCH AS dist
 
 
-# Optimized container images do not cary nested payloads
-FROM --platform=linux/amd64 builder-amd64 AS container-build-amd64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-amd64/bin/ollama .
-
-FROM --platform=linux/arm64 builder-arm64 AS container-build-arm64
-WORKDIR /go/src/github.com/ollama/ollama
-COPY . .
-ARG GOFLAGS
-ARG CGO_CFLAGS
-RUN --mount=type=cache,target=/root/.ccache \
-    go build -trimpath -o dist/linux-arm64/bin/ollama .
-
 # For amd64 container images, filter out cuda/rocm to minimize size
-FROM runners-amd64 AS runners-cuda-amd64
+FROM build-amd64 AS runners-cuda-amd64
 RUN rm -rf \
     ./dist/linux-amd64/lib/ollama/libggml_hipblas.so \
     ./dist/linux-amd64/lib/ollama/runners/rocm*
 
-FROM runners-amd64 AS runners-rocm-amd64
+FROM build-amd64 AS runners-rocm-amd64
 RUN rm -rf \
     ./dist/linux-amd64/lib/ollama/libggml_cuda*.so \
     ./dist/linux-amd64/lib/ollama/libcu*.so* \
@@ -230,15 +154,15 @@ FROM --platform=linux/amd64 ubuntu:22.04 AS runtime-amd64
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-cuda-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 
 FROM --platform=linux/arm64 ubuntu:22.04 AS runtime-arm64
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
-COPY --from=runners-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/bin/ /bin/
+COPY --from=build-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64/lib/ /lib/
 COPY --from=runners-jetpack5-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack5/lib/ /lib/
 COPY --from=runners-jetpack6-arm64 /go/src/github.com/ollama/ollama/dist/linux-arm64-jetpack6/lib/ /lib/
 
@@ -251,7 +175,7 @@ COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64-rocm/l
 RUN apt-get update && \
     apt-get install -y ca-certificates && \
     apt-get clean && rm -rf /var/lib/apt/lists/*
-COPY --from=container-build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
+COPY --from=build-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/bin/ /bin/
 COPY --from=runners-rocm-amd64 /go/src/github.com/ollama/ollama/dist/linux-amd64/lib/ /lib/
 
 EXPOSE 11434
diff --git a/Makefile b/Makefile
index f59e072cbd7..26dc418d222 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,105 @@
-GOALS := $(or $(MAKECMDGOALS),all)
-.PHONY: $(GOALS)
-$(GOALS):
-	$(MAKE) -C llama $@
\ No newline at end of file
+# top level makefile for Ollama
+include make/common-defs.make
+
+
+# Determine which if any GPU runners we should build
+include make/cuda-v11-defs.make
+include make/cuda-v12-defs.make
+include make/rocm-defs.make
+
+ifeq ($(CUSTOM_CPU_FLAGS),)
+ifneq ($(OS),darwin)
+ifeq ($(ARCH),amd64)
+	RUNNER_TARGETS=cpu
+endif
+endif
+# Without CUSTOM_CPU_FLAGS we default to build both v11 and v12 if present
+ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
+ifneq ($(CUDA_11_COMPILER),)
+	RUNNER_TARGETS += cuda_v11
+endif
+ifneq ($(CUDA_12_COMPILER),)
+	RUNNER_TARGETS += cuda_v12
+endif
+endif
+else # CUSTOM_CPU_FLAGS is set, we'll build only the latest cuda version detected
+ifneq ($(CUDA_12_COMPILER),)
+	RUNNER_TARGETS += cuda_v12
+else ifneq ($(CUDA_11_COMPILER),)
+	RUNNER_TARGETS += cuda_v11
+endif
+endif
+
+ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
+ifneq ($(HIP_COMPILER),)
+	RUNNER_TARGETS += rocm
+endif
+endif
+
+
+all: runners exe
+
+dist: $(addprefix dist_, $(RUNNER_TARGETS)) dist_exe
+
+dist_%:
+	@$(MAKE) --no-print-directory -f make/Makefile.$* dist
+
+runners: $(RUNNER_TARGETS)
+
+$(RUNNER_TARGETS):
+	@$(MAKE) --no-print-directory -f make/Makefile.$@
+
+exe dist_exe:
+	@$(MAKE) --no-print-directory -f make/Makefile.ollama $@
+
+help-sync apply-patches create-patches sync sync-clean:
+	@$(MAKE) --no-print-directory -f make/Makefile.sync $@
+
+test integration lint:
+	@$(MAKE) --no-print-directory -f make/Makefile.test $@
+
+clean:
+	rm -rf $(BUILD_DIR) $(DIST_LIB_DIR) $(OLLAMA_EXE) $(DIST_OLLAMA_EXE)
+	go clean -cache
+
+help:
+	@echo "The following make targets will help you build Ollama"
+	@echo ""
+	@echo "	make all   		# (default target) Build Ollama llm subprocess runners, and the primary ollama executable"
+	@echo "	make runners		# Build Ollama llm subprocess runners; after you may use 'go build .' to build the primary ollama executable"
+	@echo "	make <runner>		# Build specific runners. Enabled: '$(RUNNER_TARGETS)'"
+	@echo "	make dist		# Build the runners and primary ollama executable for distribution"
+	@echo "	make help-sync 		# Help information on vendor update targets"
+	@echo "	make help-runners 	# Help information on runner targets"
+	@echo ""
+	@echo "The following make targets will help you test Ollama"
+	@echo ""
+	@echo "	make test   		# Run unit tests"
+	@echo "	make integration	# Run integration tests.  You must 'make all' first"
+	@echo "	make lint   		# Run lint and style tests"
+	@echo ""
+	@echo "For more information see 'docs/development.md'"
+	@echo ""
+
+
+help-runners:
+	@echo "The following runners will be built based on discovered GPU libraries: '$(RUNNER_TARGETS)'"
+	@echo ""
+	@echo "GPU Runner CPU Flags: '$(GPU_RUNNER_CPU_FLAGS)'  (Override with CUSTOM_CPU_FLAGS)"
+	@echo ""
+	@echo "# CUDA_PATH sets the location where CUDA toolkits are present"
+	@echo "CUDA_PATH=$(CUDA_PATH)"
+	@echo "	CUDA_11_PATH=$(CUDA_11_PATH)"
+	@echo "	CUDA_11_COMPILER=$(CUDA_11_COMPILER)"
+	@echo "	CUDA_12_PATH=$(CUDA_12_PATH)"
+	@echo "	CUDA_12_COMPILER=$(CUDA_12_COMPILER)"
+	@echo ""
+	@echo "# HIP_PATH sets the location where the ROCm toolkit is present"
+	@echo "HIP_PATH=$(HIP_PATH)"
+	@echo "	HIP_COMPILER=$(HIP_COMPILER)"
+
+.PHONY: all exe dist dist_exe help help-sync help-runners apply-patches create-patches sync sync-clean test integration lint runners clean $(RUNNER_TARGETS)
+
+# Handy debugging for make variables
+print-%:
+	@echo '$*=$($*)'
diff --git a/build/darwin/amd64/placeholder b/build/darwin/amd64/placeholder
deleted file mode 100644
index 87dc2738141..00000000000
--- a/build/darwin/amd64/placeholder
+++ /dev/null
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
diff --git a/build/darwin/arm64/placeholder b/build/darwin/arm64/placeholder
deleted file mode 100644
index 87dc2738141..00000000000
--- a/build/darwin/arm64/placeholder
+++ /dev/null
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
diff --git a/build/embed_darwin_amd64.go b/build/embed_darwin_amd64.go
deleted file mode 100644
index af1458ea937..00000000000
--- a/build/embed_darwin_amd64.go
+++ /dev/null
@@ -1,8 +0,0 @@
-package build
-
-import "embed"
-
-// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
-
-//go:embed darwin/amd64/*
-var EmbedFS embed.FS
diff --git a/build/embed_darwin_arm64.go b/build/embed_darwin_arm64.go
deleted file mode 100644
index d885365d0cf..00000000000
--- a/build/embed_darwin_arm64.go
+++ /dev/null
@@ -1,8 +0,0 @@
-package build
-
-import "embed"
-
-// Darwin payloads separated by architecture to avoid duplicate payloads when cross compiling
-
-//go:embed darwin/arm64/*
-var EmbedFS embed.FS
diff --git a/build/embed_linux.go b/build/embed_linux.go
deleted file mode 100644
index 4cf7be4c389..00000000000
--- a/build/embed_linux.go
+++ /dev/null
@@ -1,6 +0,0 @@
-package build
-
-import "embed"
-
-//go:embed linux/*
-var EmbedFS embed.FS
diff --git a/build/embed_unused.go b/build/embed_unused.go
deleted file mode 100644
index 00fbe02e80e..00000000000
--- a/build/embed_unused.go
+++ /dev/null
@@ -1,8 +0,0 @@
-//go:build !linux && !darwin
-
-package build
-
-import "embed"
-
-// unused on windows
-var EmbedFS embed.FS
diff --git a/build/linux/amd64/placeholder b/build/linux/amd64/placeholder
deleted file mode 100644
index 87dc2738141..00000000000
--- a/build/linux/amd64/placeholder
+++ /dev/null
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
diff --git a/build/linux/arm64/placeholder b/build/linux/arm64/placeholder
deleted file mode 100644
index 87dc2738141..00000000000
--- a/build/linux/arm64/placeholder
+++ /dev/null
@@ -1 +0,0 @@
-This is here to make sure the build/ directory exists for the go:embed command
diff --git a/cmd/cmd.go b/cmd/cmd.go
index 29608400f36..f934a267903 100644
--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -37,6 +37,8 @@ import (
 	"github.com/ollama/ollama/api"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/llama"
+	"github.com/ollama/ollama/llama/runner"
 	"github.com/ollama/ollama/parser"
 	"github.com/ollama/ollama/progress"
 	"github.com/ollama/ollama/server"
@@ -1420,6 +1422,19 @@ func NewCLI() *cobra.Command {
 		RunE:    DeleteHandler,
 	}
 
+	runnerCmd := &cobra.Command{
+		Use:    "runner",
+		Short:  llama.PrintSystemInfo(),
+		Hidden: true,
+		RunE: func(cmd *cobra.Command, args []string) error {
+			return runner.Execute(os.Args[1:])
+		},
+		FParseErrWhitelist: cobra.FParseErrWhitelist{UnknownFlags: true},
+	}
+	runnerCmd.SetHelpFunc(func(cmd *cobra.Command, args []string) {
+		_ = runner.Execute(args[1:])
+	})
+
 	envVars := envconfig.AsMap()
 
 	envs := []envconfig.EnvVar{envVars["OLLAMA_HOST"]}
@@ -1476,6 +1491,7 @@ func NewCLI() *cobra.Command {
 		psCmd,
 		copyCmd,
 		deleteCmd,
+		runnerCmd,
 	)
 
 	return rootCmd
diff --git a/cmd/runner/main.go b/cmd/runner/main.go
new file mode 100644
index 00000000000..34b0e9d2124
--- /dev/null
+++ b/cmd/runner/main.go
@@ -0,0 +1,21 @@
+// Command runner is a standalone entry point that hands its command-line
+// arguments to the llama runner.
+package main
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/ollama/ollama/llama/runner"
+)
+
+// main passes every argument after the program name to runner.Execute; if an
+// error comes back it is written to stderr and the process exits with status 1.
+func main() {
+	err := runner.Execute(os.Args[1:])
+	if err == nil {
+		return
+	}
+	fmt.Fprintf(os.Stderr, "error: %s\n", err)
+	os.Exit(1)
+}
diff --git a/discover/amd_linux.go b/discover/amd_linux.go
index d092f6b5aa2..ecf91056d87 100644
--- a/discover/amd_linux.go
+++ b/discover/amd_linux.go
@@ -77,6 +77,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 
 	gfxOverride := envconfig.HsaOverrideGfxVersion()
 	var supported []string
+	depPaths := LibraryDirs()
 	libDir := ""
 
 	// The amdgpu driver always exposes the host CPU(s) first, but we have to skip them and subtract
@@ -300,8 +301,11 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 			})
 			continue
 		}
-
-		if int(major) < RocmComputeMin {
+		minVer, err := strconv.Atoi(RocmComputeMajorMin)
+		if err != nil {
+			slog.Error("invalid RocmComputeMajorMin setting", "value", RocmComputeMajorMin, "error", err)
+		}
+		if int(major) < minVer {
 			reason := fmt.Sprintf("amdgpu too old gfx%d%x%x", major, minor, patch)
 			slog.Warn(reason, "gpu", gpuID)
 			unsupportedGPUs = append(unsupportedGPUs, UnsupportedGPUInfo{
@@ -349,8 +353,9 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				})
 				return nil, err
 			}
+			depPaths = append(depPaths, libDir)
 		}
-		gpuInfo.DependencyPath = []string{libDir}
+		gpuInfo.DependencyPath = depPaths
 
 		if gfxOverride == "" {
 			// Only load supported list once
diff --git a/discover/amd_windows.go b/discover/amd_windows.go
index efa5cc23b0c..37a496643b9 100644
--- a/discover/amd_windows.go
+++ b/discover/amd_windows.go
@@ -50,12 +50,14 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 		slog.Info(err.Error())
 		return nil, err
 	}
+	depPaths := LibraryDirs()
 	libDir, err := AMDValidateLibDir()
 	if err != nil {
 		err = fmt.Errorf("unable to verify rocm library: %w", err)
 		slog.Warn(err.Error())
 		return nil, err
 	}
+	depPaths = append(depPaths, libDir)
 
 	var supported []string
 	gfxOverride := envconfig.HsaOverrideGfxVersion()
@@ -111,7 +113,7 @@ func AMDGetGPUInfo() ([]RocmGPUInfo, error) {
 				UnreliableFreeMemory: true,
 
 				ID:             strconv.Itoa(i), // TODO this is probably wrong if we specify visible devices
-				DependencyPath: []string{libDir},
+				DependencyPath: depPaths,
 				MinimumMemory:  rocmMinimumMemory,
 				Name:           name,
 				Compute:        gfx,
diff --git a/discover/cpu_common.go b/discover/cpu_common.go
index 0faac24acf0..242e487912a 100644
--- a/discover/cpu_common.go
+++ b/discover/cpu_common.go
@@ -5,21 +5,8 @@ import (
 	"path/filepath"
 	"runtime"
 	"strings"
-
-	"golang.org/x/sys/cpu"
 )
 
-func GetCPUCapability() CPUCapability {
-	if cpu.X86.HasAVX2 {
-		return CPUCapabilityAVX2
-	}
-	if cpu.X86.HasAVX {
-		return CPUCapabilityAVX
-	}
-	// else LCD
-	return CPUCapabilityNone
-}
-
 func IsNUMA() bool {
 	if runtime.GOOS != "linux" {
 		// numa support in llama.cpp is linux only
diff --git a/discover/gpu.go b/discover/gpu.go
index cf34b904531..e76c844fef2 100644
--- a/discover/gpu.go
+++ b/discover/gpu.go
@@ -16,12 +16,14 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"unsafe"
 
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/runners"
 )
 
 type cudaHandles struct {
@@ -45,7 +47,6 @@ const (
 var (
 	gpuMutex      sync.Mutex
 	bootstrapped  bool
-	cpuCapability CPUCapability
 	cpus          []CPUInfo
 	cudaGPUs      []CudaGPUInfo
 	nvcudaLibPath string
@@ -64,9 +65,13 @@ var (
 )
 
 // With our current CUDA compile flags, older than 5.0 will not work properly
-var CudaComputeMin = [2]C.int{5, 0}
+// (string values used to allow ldflags overrides at build time)
+var (
+	CudaComputeMajorMin = "5" // major part of the minimum CUDA compute version; parsed with strconv.Atoi
+	CudaComputeMinorMin = "0" // minor part of the minimum CUDA compute version; parsed with strconv.Atoi
+)
 
-var RocmComputeMin = 9
+var RocmComputeMajorMin = "9" // minimum AMD gfx major version; a string like the CUDA values above, parsed with strconv.Atoi
 
 // TODO find a better way to detect iGPU instead of minimum memory
 const IGPUMemLimit = 1 * format.GibiByte // 512G is what they typically report, so anything less than 1G must be iGPU
@@ -101,9 +106,9 @@ func initCudaHandles() *cudaHandles {
 		localAppData := os.Getenv("LOCALAPPDATA")
 		cudartMgmtPatterns = []string{filepath.Join(localAppData, "Programs", "Ollama", CudartMgmtName)}
 	}
-	libDir := LibraryDir()
-	if libDir != "" {
-		cudartMgmtPatterns = []string{filepath.Join(libDir, CudartMgmtName)}
+	libDirs := LibraryDirs()
+	for _, d := range libDirs {
+		cudartMgmtPatterns = append(cudartMgmtPatterns, filepath.Join(d, CudartMgmtName))
 	}
 	cudartMgmtPatterns = append(cudartMgmtPatterns, CudartGlobs...)
 
@@ -219,16 +224,23 @@ func GetGPUInfo() GpuInfoList {
 
 	if !bootstrapped {
 		slog.Info("looking for compatible GPUs")
+		cudaComputeMajorMin, err := strconv.Atoi(CudaComputeMajorMin)
+		if err != nil {
+			slog.Error("invalid CudaComputeMajorMin setting", "value", CudaComputeMajorMin, "error", err)
+		}
+		cudaComputeMinorMin, err := strconv.Atoi(CudaComputeMinorMin)
+		if err != nil {
+			slog.Error("invalid CudaComputeMinorMin setting", "value", CudaComputeMinorMin, "error", err)
+		}
 		bootstrapErrors = []error{}
 		needRefresh = false
-		cpuCapability = GetCPUCapability()
 		var memInfo C.mem_info_t
 
 		mem, err := GetCPUMem()
 		if err != nil {
 			slog.Warn("error looking up system memory", "error", err)
 		}
-		depPath := LibraryDir()
+		depPaths := LibraryDirs()
 		details, err := GetCPUDetails()
 		if err != nil {
 			slog.Warn("failed to lookup CPU details", "error", err)
@@ -238,24 +250,14 @@ func GetGPUInfo() GpuInfoList {
 				GpuInfo: GpuInfo{
 					memInfo:        mem,
 					Library:        "cpu",
-					Variant:        cpuCapability.String(),
+					Variant:        runners.GetCPUCapability().String(),
 					ID:             "0",
-					DependencyPath: []string{depPath},
+					DependencyPath: depPaths,
 				},
 				CPUs: details,
 			},
 		}
 
-		// Fallback to CPU mode if we're lacking required vector extensions on x86
-		if cpuCapability < GPURunnerCPUCapability && runtime.GOARCH == "amd64" {
-			err := fmt.Errorf("CPU does not have minimum vector extensions, GPU inference disabled.  Required:%s  Detected:%s", GPURunnerCPUCapability, cpuCapability)
-			slog.Warn(err.Error())
-			bootstrapErrors = append(bootstrapErrors, err)
-			bootstrapped = true
-			// No need to do any GPU discovery, since we can't run on them
-			return GpuInfoList{cpus[0].GpuInfo}
-		}
-
 		// Load ALL libraries
 		cHandles = initCudaHandles()
 
@@ -292,19 +294,23 @@ func GetGPUInfo() GpuInfoList {
 				gpuInfo.DriverMajor = driverMajor
 				gpuInfo.DriverMinor = driverMinor
 				variant := cudaVariant(gpuInfo)
-				if depPath != "" {
-					gpuInfo.DependencyPath = []string{depPath}
+				if depPaths != nil {
+					gpuInfo.DependencyPath = depPaths
 					// Check for variant specific directory
 					if variant != "" {
-						if _, err := os.Stat(filepath.Join(depPath, "cuda_"+variant)); err == nil {
-							gpuInfo.DependencyPath = []string{filepath.Join(depPath, "cuda_"+variant), depPath}
+						for _, d := range depPaths {
+							if _, err := os.Stat(filepath.Join(d, "cuda_"+variant)); err == nil {
+								// Put the variant directory first in the search path to avoid runtime linking to the wrong library
+								gpuInfo.DependencyPath = append([]string{filepath.Join(d, "cuda_"+variant)}, gpuInfo.DependencyPath...)
+								break
+							}
 						}
 					}
 				}
 				gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
 				gpuInfo.Variant = variant
 
-				if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+				if int(memInfo.major) < cudaComputeMajorMin || (int(memInfo.major) == cudaComputeMajorMin && int(memInfo.minor) < cudaComputeMinorMin) {
 					unsupportedGPUs = append(unsupportedGPUs,
 						UnsupportedGPUInfo{
 							GpuInfo: gpuInfo.GpuInfo,
@@ -370,7 +376,7 @@ func GetGPUInfo() GpuInfoList {
 						gpuInfo.FreeMemory = uint64(memInfo.free)
 						gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
 						gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-						gpuInfo.DependencyPath = []string{depPath}
+						gpuInfo.DependencyPath = depPaths
 						oneapiGPUs = append(oneapiGPUs, gpuInfo)
 					}
 				}
@@ -385,6 +391,8 @@ func GetGPUInfo() GpuInfoList {
 		if len(cudaGPUs) == 0 && len(rocmGPUs) == 0 && len(oneapiGPUs) == 0 {
 			slog.Info("no compatible GPUs were discovered")
 		}
+
+		// TODO verify we have runners for the discovered GPUs, filter out any that aren't supported with good error messages
 	}
 
 	// For detected GPUs, load library if not loaded
@@ -509,7 +517,10 @@ func FindGPULibs(baseLibName string, defaultPatterns []string) []string {
 	slog.Debug("Searching for GPU library", "name", baseLibName)
 
 	// Start with our bundled libraries
-	patterns := []string{filepath.Join(LibraryDir(), baseLibName)}
+	patterns := []string{}
+	for _, d := range LibraryDirs() {
+		patterns = append(patterns, filepath.Join(d, baseLibName))
+	}
 
 	switch runtime.GOOS {
 	case "windows":
@@ -705,32 +716,26 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 	}
 }
 
-func LibraryDir() string {
-	// On Windows/linux we bundle the dependencies at the same level as the executable
+// LibraryDirs lists the directories to search for dependency libraries: lib/ollama relative to the executable (only if it exists) and the parent directory of the path returned by runners.Locate() (e.g. build tree for developers).
+// This can be simplified once we no longer carry runners as payloads
+func LibraryDirs() []string {
+	paths := []string{}
 	appExe, err := os.Executable()
 	if err != nil {
 		slog.Warn("failed to lookup executable path", "error", err)
+	} else {
+		appRelative := filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe(), "lib", "ollama")
+		if _, err := os.Stat(appRelative); err == nil {
+			paths = append(paths, appRelative)
+		}
 	}
-	cwd, err := os.Getwd()
-	if err != nil {
-		slog.Warn("failed to lookup working directory", "error", err)
-	}
-	// Scan for any of our dependeices, and pick first match
-	for _, root := range []string{filepath.Dir(appExe), filepath.Join(filepath.Dir(appExe), envconfig.LibRelativeToExe()), cwd} {
-		libDep := filepath.Join("lib", "ollama")
-		if _, err := os.Stat(filepath.Join(root, libDep)); err == nil {
-			return filepath.Join(root, libDep)
-		}
-		// Developer mode, local build
-		if _, err := os.Stat(filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
-			return filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH, libDep)
-		}
-		if _, err := os.Stat(filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)); err == nil {
-			return filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH, libDep)
-		}
+	rDir := runners.Locate()
+	if rDir == "" { // Locate has no error result, so test its value, not the stale err above; assumes "" means nothing was found - TODO confirm
+		slog.Warn("unable to locate gpu dependency libraries")
+	} else {
+		paths = append(paths, filepath.Dir(rDir))
 	}
-	slog.Warn("unable to locate gpu dependency libraries")
-	return ""
+	return paths
 }
 
 func GetSystemInfo() SystemInfo {
diff --git a/discover/gpu_darwin.go b/discover/gpu_darwin.go
index d3f0303f793..15f8f799614 100644
--- a/discover/gpu_darwin.go
+++ b/discover/gpu_darwin.go
@@ -15,6 +15,7 @@ import (
 	"syscall"
 
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/runners"
 )
 
 const (
@@ -27,7 +28,7 @@ func GetGPUInfo() GpuInfoList {
 		return []GpuInfo{
 			{
 				Library: "cpu",
-				Variant: GetCPUCapability().String(),
+				Variant: runners.GetCPUCapability().String(),
 				memInfo: mem,
 			},
 		}
@@ -50,7 +51,7 @@ func GetCPUInfo() GpuInfoList {
 	return []GpuInfo{
 		{
 			Library: "cpu",
-			Variant: GetCPUCapability().String(),
+			Variant: runners.GetCPUCapability().String(),
 			memInfo: mem,
 		},
 	}
diff --git a/discover/types.go b/discover/types.go
index 3112d003eab..4568e3b8503 100644
--- a/discover/types.go
+++ b/discover/types.go
@@ -5,6 +5,7 @@ import (
 	"log/slog"
 
 	"github.com/ollama/ollama/format"
+	"github.com/ollama/ollama/runners"
 )
 
 type memInfo struct {
@@ -47,6 +48,13 @@ type GpuInfo struct { // TODO better name maybe "InferenceProcessor"?
 	// TODO other performance capability info to help in scheduling decisions
 }
 
+func (gpu GpuInfo) RunnerName() string {
+	if gpu.Variant == "" { // no variant set: the name is just the library
+		return gpu.Library
+	}
+	return gpu.Library + "_" + gpu.Variant // otherwise "<library>_<variant>"
+}
+
 type CPUInfo struct {
 	GpuInfo
 	CPUs []CPU
@@ -99,7 +107,7 @@ func (l GpuInfoList) ByLibrary() []GpuInfoList {
 	for _, info := range l {
 		found := false
 		requested := info.Library
-		if info.Variant != CPUCapabilityNone.String() {
+		if info.Variant != runners.CPUCapabilityNone.String() {
 			requested += "_" + info.Variant
 		}
 		for i, lib := range libs {
@@ -140,29 +148,6 @@ func (a ByFreeMemory) Len() int           { return len(a) }
 func (a ByFreeMemory) Swap(i, j int)      { a[i], a[j] = a[j], a[i] }
 func (a ByFreeMemory) Less(i, j int) bool { return a[i].FreeMemory < a[j].FreeMemory }
 
-type CPUCapability uint32
-
-// Override at build time when building base GPU runners
-var GPURunnerCPUCapability = CPUCapabilityAVX
-
-const (
-	CPUCapabilityNone CPUCapability = iota
-	CPUCapabilityAVX
-	CPUCapabilityAVX2
-	// TODO AVX512
-)
-
-func (c CPUCapability) String() string {
-	switch c {
-	case CPUCapabilityAVX:
-		return "avx"
-	case CPUCapabilityAVX2:
-		return "avx2"
-	default:
-		return "no vector extensions"
-	}
-}
-
 type SystemInfo struct {
 	System          CPUInfo              `json:"system"`
 	GPUs            []GpuInfo            `json:"gpus"`
diff --git a/docs/development.md b/docs/development.md
index 13457ae3b8f..b1c944eed34 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -3,35 +3,24 @@
 Install required tools:
 
 - go version 1.22 or higher
-- gcc version 11.4.0 or higher
+- OS specific C/C++ compiler (see below)
+- GNU Make
 
 
-### MacOS
-
-[Download Go](https://go.dev/dl/)
+## Overview
 
-Optionally enable debugging and more verbose logging:
+Ollama uses a mix of Go and C/C++ code to interface with GPUs.  The C/C++ code is compiled with both CGO and GPU library specific compilers.  A set of GNU Makefiles are used to compile the project.  GPU Libraries are auto-detected based on the typical environment variables used by the respective libraries, but can be overridden if necessary.  The default make target will build the runners and primary Go Ollama application that will run within the repo directory.  Throughout the examples below `-j 5` is suggested for 5 parallel jobs to speed up the build.  You can adjust the job count based on your CPU Core count to reduce build times.  If you want to relocate the built binaries, use the `dist` target and recursively copy the files in `./dist/$OS-$ARCH/` to your desired location. To learn more about the other make targets use `make help`
 
-```bash
-# At build time
-export CGO_CFLAGS="-g"
+Once you have built the GPU/CPU runners, you can compile the main application with `go build .` 
 
-# At runtime
-export OLLAMA_DEBUG=1
-```
+### MacOS
 
-Get the required libraries and build the native LLM code:  (Adjust the job count based on your number of processors for a faster build)
+[Download Go](https://go.dev/dl/)
 
 ```bash
 make -j 5
 ```
 
-Then build ollama:
-
-```bash
-go build .
-```
-
 Now you can run `ollama`:
 
 ```bash
@@ -51,64 +40,42 @@ _Your operating system distribution may already have packages for NVIDIA CUDA. D
 Install `make`, `gcc` and `golang` as well as [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
 development and runtime packages.
 
-Typically the build scripts will auto-detect CUDA, however, if your Linux distro
-or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `CUDA_LIB_DIR` to the location of the shared
-libraries, and `CUDACXX` to the location of the nvcc compiler. You can customize
-a set of target CUDA architectures by setting `CMAKE_CUDA_ARCHITECTURES` (e.g. "50;60;70")
-
-Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)
+Typically the makefile will auto-detect CUDA, however, if your Linux distro
+or installation approach uses alternative paths, you can specify the location by
+overriding `CUDA_PATH` to the location of the CUDA toolkit. You can customize
+a set of target CUDA architectures by setting `CUDA_ARCHITECTURES` (e.g. `CUDA_ARCHITECTURES="50;60;70"`)
 
 ```
 make -j 5
 ```
 
-Then build the binary:
+If both v11 and v12 toolkits are detected, runners for both major versions will be built by default.  You can build just v12 with `make cuda_v12`
 
-```
-go build .
-```
+#### Older Linux CUDA (NVIDIA)
+
+To support older GPUs with Compute Capability 3.5 or 3.7, you will need to use an older version of the Driver from [Unix Driver Archive](https://www.nvidia.com/en-us/drivers/unix/) (tested with 470) and [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive) (tested with cuda V11).  When you build Ollama, you will need to set two make variables to adjust the minimum compute capability Ollama supports via `make -j 5 CUDA_ARCHITECTURES="35;37;50;52" EXTRA_GOLDFLAGS="\"-X=github.com/ollama/ollama/discover.CudaComputeMajorMin=3\" \"-X=github.com/ollama/ollama/discover.CudaComputeMinorMin=5\""`.  To find the Compute Capability of your older GPU, refer to [GPU Compute Capability](https://developer.nvidia.com/cuda-gpus).
 
 #### Linux ROCm (AMD)
 
-_Your operating system distribution may already have packages for AMD ROCm and CLBlast. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
+_Your operating system distribution may already have packages for AMD ROCm. Distro packages are often preferable, but instructions are distro-specific. Please consult distro-specific docs for dependencies if available!_
 
-Install [CLBlast](https://github.com/CNugteren/CLBlast/blob/master/doc/installation.md) and [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.
+Install [ROCm](https://rocm.docs.amd.com/en/latest/) development packages first, as well as `make`, `gcc`, and `golang`.
 
 Typically the build scripts will auto-detect ROCm, however, if your Linux distro
 or installation approach uses unusual paths, you can specify the location by
-specifying an environment variable `ROCM_PATH` to the location of the ROCm
-install (typically `/opt/rocm`), and `CLBlast_DIR` to the location of the
-CLBlast install (typically `/usr/lib/cmake/CLBlast`). You can also customize
-the AMD GPU targets by setting AMDGPU_TARGETS (e.g. `AMDGPU_TARGETS="gfx1101;gfx1102"`)
-
-Then generate dependencies:  (Adjust the job count based on your number of processors for a faster build)
+specifying an environment variable `HIP_PATH` to the location of the ROCm
+install (typically `/opt/rocm`). You can also customize
+the AMD GPU targets by setting `HIP_ARCHS` (e.g. `HIP_ARCHS="gfx1101;gfx1102"`)
 
 ```
 make -j 5
 ```
 
-Then build the binary:
-
-```
-go build .
-```
-
 ROCm requires elevated privileges to access the GPU at runtime. On most distros you can add your user account to the `render` group, or run as root.
 
-#### Advanced CPU Settings
-
-By default, running `make` will compile a few different variations
-of the LLM library based on common CPU families and vector math capabilities,
-including a lowest-common-denominator which should run on almost any 64 bit CPU
-somewhat slowly. At runtime, Ollama will auto-detect the optimal variation to
-load. 
-
-Custom CPU settings are not currently supported in the new Go server build but will be added back after we complete the transition.
-
 #### Containerized Linux Build
 
-If you have Docker available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting binary is placed in `./dist`
+If you have Docker and buildx available, you can build linux binaries with `./scripts/build_linux.sh` which has the CUDA and ROCm dependencies included. The resulting artifacts are placed in `./dist`  and by default the script builds both arm64 and amd64 binaries.  If you want to build only amd64, you can build with `PLATFORM=linux/amd64 ./scripts/build_linux.sh`
 
 ### Windows
 
@@ -126,12 +93,8 @@ The following tools are required as a minimal development environment to build C
 > [!NOTE]  
 > Due to bugs in the GCC C++ library for unicode support, Ollama should be built with clang on windows.
 
-Then, build the `ollama` binary:
-
-```powershell
-$env:CGO_ENABLED="1"
-make -j 8
-go build .
+```
+make -j 5
 ```
 
 #### GPU Support
@@ -173,3 +136,30 @@ pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw
 ```
 
 You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`)
+
+
+## Advanced CPU Vector Settings
+
+On x86, running `make` will compile several CPU runners which can run on different CPU families. At runtime, Ollama will auto-detect the best variation to load.  If GPU libraries are present at build time, Ollama also compiles GPU runners with the `AVX` CPU vector feature enabled.  This provides a good performance balance when loading large models that split across GPU and CPU with broad compatibility.  Some users may prefer no vector extensions (e.g. older Xeon/Celeron processors, or hypervisors that mask the vector features) while other users may prefer turning on many more vector extensions to further improve performance for split model loads.
+
+To customize the set of CPU vector features enabled for a CPU runner and all GPU runners, use CUSTOM_CPU_FLAGS during the build.
+
+To build without any vector flags:
+
+```
+make CUSTOM_CPU_FLAGS=""
+```
+
+To build with both AVX and AVX2:
+```
+make CUSTOM_CPU_FLAGS=avx,avx2
+```
+
+To build with AVX512 features turned on:
+
+```
+make CUSTOM_CPU_FLAGS=avx,avx2,avx512,avx512vbmi,avx512vnni,avx512bf16
+```
+
+> [!NOTE]  
+> If you are experimenting with different flags, make sure to do a `make clean` between each change to ensure everything is rebuilt with the new compiler flags
diff --git a/docs/gpu.md b/docs/gpu.md
index 2de5bd243fe..691746d0dd8 100644
--- a/docs/gpu.md
+++ b/docs/gpu.md
@@ -28,6 +28,7 @@ Check your compute compatibility to see if your card is supported:
 | 5.0                | GeForce GTX         | `GTX 750 Ti` `GTX 750` `NVS 810`                                                                            |
 |                    | Quadro              | `K2200` `K1200` `K620` `M1200` `M520` `M5000M` `M4000M` `M3000M` `M2000M` `M1000M` `K620M` `M600M` `M500M`  |
 
+For building locally to support older GPUs, see [development.md](./development.md#older-linux-cuda-nvidia).
 
 ### GPU Selection
 
diff --git a/docs/linux.md b/docs/linux.md
index 8204ece5e7d..13655f423f4 100644
--- a/docs/linux.md
+++ b/docs/linux.md
@@ -10,6 +10,9 @@ curl -fsSL https://ollama.com/install.sh | sh
 
 ## Manual install
 
+> [!NOTE]
+> If you are upgrading from a prior version, you should remove the old libraries with `sudo rm -rf /usr/lib/ollama` first.
+
 Download and extract the package:
 
 ```shell
diff --git a/docs/windows.md b/docs/windows.md
index 0bdaf08563b..80bebed4782 100644
--- a/docs/windows.md
+++ b/docs/windows.md
@@ -83,3 +83,6 @@ If you'd like to install or integrate Ollama as a service, a standalone
 and GPU library dependencies for Nvidia and AMD. This allows for embedding
 Ollama in existing applications, or running it as a system service via `ollama
 serve` with tools such as [NSSM](https://nssm.cc/).
+
+> [!NOTE]  
+> If you are upgrading from a prior version, you should remove the old directories first.
diff --git a/envconfig/config.go b/envconfig/config.go
index 29c7fa4ffd6..c10095a646b 100644
--- a/envconfig/config.go
+++ b/envconfig/config.go
@@ -175,7 +175,6 @@ func String(s string) func() string {
 
 var (
 	LLMLibrary = String("OLLAMA_LLM_LIBRARY")
-	TmpDir     = String("OLLAMA_TMPDIR")
 
 	CudaVisibleDevices    = String("CUDA_VISIBLE_DEVICES")
 	HipVisibleDevices     = String("HIP_VISIBLE_DEVICES")
@@ -250,7 +249,6 @@ func AsMap() map[string]EnvVar {
 		"OLLAMA_NUM_PARALLEL":      {"OLLAMA_NUM_PARALLEL", NumParallel(), "Maximum number of parallel requests"},
 		"OLLAMA_ORIGINS":           {"OLLAMA_ORIGINS", Origins(), "A comma separated list of allowed origins"},
 		"OLLAMA_SCHED_SPREAD":      {"OLLAMA_SCHED_SPREAD", SchedSpread(), "Always schedule model across all GPUs"},
-		"OLLAMA_TMPDIR":            {"OLLAMA_TMPDIR", TmpDir(), "Location for temporary files"},
 		"OLLAMA_MULTIUSER_CACHE":   {"OLLAMA_MULTIUSER_CACHE", MultiUserCache(), "Optimize prompt caching for multi-user scenarios"},
 
 		// Informational
diff --git a/llama/Makefile b/llama/Makefile
deleted file mode 100644
index 47a87a75546..00000000000
--- a/llama/Makefile
+++ /dev/null
@@ -1,57 +0,0 @@
-# top level makefile for Go server
-include make/common-defs.make
-
-RUNNER_TARGETS := default
-
-# Determine which if any GPU runners we should build
-ifeq ($(OS),windows)
-	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
-	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
-	CUDA_11:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
-	CUDA_12:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
-	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
-else ifeq ($(OS),linux)
-	HIP_PATH?=/opt/rocm
-	HIP_LIB_DIR := $(shell ls -d $(HIP_PATH)/lib 2>/dev/null)
-	CUDA_PATH?=/usr/local/cuda
-	CUDA_11:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
-	CUDA_12:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
-endif
-
-ifeq ($(OLLAMA_SKIP_CUDA_GENERATE),)
-ifneq ($(CUDA_11),)
-	RUNNER_TARGETS += cuda_v11
-endif
-ifneq ($(CUDA_12),)
-	RUNNER_TARGETS += cuda_v12
-endif
-endif
-ifeq ($(OLLAMA_SKIP_ROCM_GENERATE),)
-ifneq ($(HIP_LIB_DIR),)
-	RUNNER_TARGETS += rocm
-endif
-endif
-
-
-all: clean-payload .WAIT runners
-
-runners: $(RUNNER_TARGETS)
-
-$(RUNNER_TARGETS):
-	$(MAKE) -f make/Makefile.$@
-
-help-sync apply-patches create-patches sync:
-	$(MAKE) -f make/Makefile.sync $@
-
-clean:
-	rm -rf $(BUILD_DIR) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
-	go clean -cache
-
-clean-payload:
-	rm -rf $(addprefix $(RUNNERS_PAYLOAD_DIR)/, $(RUNNER_TARGETS) metal cpu cpu_avx cpu_avx2)
-
-.PHONY: all runners clean clean-payload $(RUNNER_TARGETS) .WAIT
-
-# Handy debugging for make variables
-print-%:
-	@echo '$*=$($*)'
diff --git a/llama/llama.go b/llama/llama.go
index 97b58663c4b..8154b1efd46 100644
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -9,22 +9,24 @@ package llama
 #cgo amd64,avx CXXFLAGS: -mavx
 #cgo amd64,avx2 CFLAGS: -mavx2 -mfma
 #cgo amd64,avx2 CXXFLAGS: -mavx2 -mfma
+#cgo amd64,avx512 CFLAGS: -mavx512f -mavx512dq -mavx512bw
+#cgo amd64,avx512 CXXFLAGS: -mavx512f -mavx512dq -mavx512bw
+#cgo amd64,avx512bf16 CFLAGS: -mavx512bf16 -D__AVX512BF16__
+#cgo amd64,avx512bf16 CXXFLAGS: -mavx512bf16 -D__AVX512BF16__
+#cgo amd64,avx512vbmi CFLAGS: -mavx512vbmi -D__AVX512VBMI__
+#cgo amd64,avx512vbmi CXXFLAGS: -mavx512vbmi -D__AVX512VBMI__
+#cgo amd64,avx512vnni CFLAGS: -mavx512vnni -D__AVX512VNNI__
+#cgo amd64,avx512vnni CXXFLAGS: -mavx512vnni -D__AVX512VNNI__
 #cgo amd64,f16c CFLAGS: -mf16c
 #cgo amd64,f16c CXXFLAGS: -mf16c
 #cgo amd64,fma CFLAGS: -mfma
 #cgo amd64,fma CXXFLAGS: -mfma
-#cgo avx CFLAGS: -mavx
-#cgo avx CXXFLAGS: -mavx
-#cgo avx2 CFLAGS: -mavx2 -mfma -mf16c
-#cgo avx2 CXXFLAGS: -mavx2 -mfma -mf16c
 #cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
-#cgo cuda CFLAGS: -fPIE -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
-#cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo cuda CXXFLAGS: -DGGML_USE_CUDA -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
-#cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5 -L/usr/local/cuda-11/lib64
-#cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6 -L/usr/local/cuda-12/lib64
-#cgo cuda_v11 LDFLAGS: -lggml_cuda_v11 -L/usr/local/cuda-11/lib64
-#cgo cuda_v12 LDFLAGS: -lggml_cuda_v12 -L/usr/local/cuda-12/lib64
+#cgo cuda_jetpack5 LDFLAGS: -lggml_cuda_jetpack5
+#cgo cuda_jetpack6 LDFLAGS: -lggml_cuda_jetpack6
+#cgo cuda_v11 LDFLAGS: -lggml_cuda_v11
+#cgo cuda_v12 LDFLAGS: -lggml_cuda_v12
 #cgo darwin,amd64 CFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
 #cgo darwin,amd64 CXXFLAGS: -Wno-incompatible-pointer-types-discards-qualifiers
 #cgo darwin,amd64 LDFLAGS: -framework Foundation
@@ -36,28 +38,24 @@ package llama
 #cgo darwin,arm64 LDFLAGS: -framework Foundation -framework Metal -framework MetalKit -framework Accelerate
 #cgo linux CFLAGS: -D_GNU_SOURCE
 #cgo linux CXXFLAGS: -D_GNU_SOURCE
-#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
-#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/Linux/amd64
+#cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux-amd64
 #cgo linux,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo linux,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
-#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/Linux/arm64
+#cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux-arm64
 #cgo linux,arm64,sve CFLAGS: -march=armv8.6-a+sve
 #cgo linux,arm64,sve CXXFLAGS: -march=armv8.6-a+sve
 #cgo linux,cuda LDFLAGS: -lcuda -lcudart -lcublas -lcublasLt -lpthread -ldl -lrt -lresolv
-#cgo linux,rocm LDFLAGS: -L/opt/rocm/lib -lpthread -ldl -lrt -lresolv
+#cgo linux,rocm LDFLAGS: -lpthread -ldl -lrt -lresolv
 #cgo rocm CFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo rocm CXXFLAGS: -DGGML_USE_CUDA -DGGML_USE_HIPBLAS -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 -DGGML_CUDA_MMV_Y=1 -DGGML_BUILD=1
 #cgo rocm LDFLAGS: -L${SRCDIR} -lggml_rocm -lhipblas -lamdhip64 -lrocblas
 #cgo windows CFLAGS: -Wno-discarded-qualifiers -D_WIN32_WINNT=0x602
 #cgo windows CXXFLAGS: -D_WIN32_WINNT=0x602
-#cgo windows LDFLAGS: -lmsvcrt
 #cgo windows LDFLAGS: -lmsvcrt -static-libstdc++ -static-libgcc -static
-#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/Windows/amd64
-#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/Windows/amd64
+#cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows-amd64
 #cgo windows,arm64 CFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
 #cgo windows,arm64 CXXFLAGS: -D__aarch64__ -D__ARM_NEON -D__ARM_FEATURE_FMA
-#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/Windows/arm64
-#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/Windows/arm64
+#cgo windows,arm64 LDFLAGS: -L${SRCDIR}/build/windows-arm64
 #cgo windows,cuda LDFLAGS: -lcuda -lcudart -lcublas -lcublasLt
 #cgo windows,rocm LDFLAGS: -lggml_rocm -lhipblas -lamdhip64 -lrocblas
 
diff --git a/llama/make/Makefile.default b/llama/make/Makefile.default
deleted file mode 100644
index 95b13a7373e..00000000000
--- a/llama/make/Makefile.default
+++ /dev/null
@@ -1,54 +0,0 @@
-# Build the default runner(s) for the platform which do not rely on 3rd party GPU libraries
-# On Mac arm64, this builds the metal runner
-# On other platforms this builds the CPU runner(s)
-
-include make/common-defs.make
-
-CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(TARGET_CPU_FLAGS))\" $(TARGET_LDFLAGS)"
-DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)
-RUNNERS := $(DEFAULT_RUNNER)
-ifeq ($(ARCH),amd64)
-ifeq ($(CUSTOM_CPU_FLAGS),)
-	RUNNERS += cpu_avx cpu_avx2
-endif
-endif
-
-DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS)))
-ifneq ($(OS),windows)
-PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(RUNNERS)))
-endif
-BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS)))
-
-all: $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
-
-$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS)
-$(RUNNERS_BUILD_DIR)/$(DEFAULT_RUNNER)/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
-	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS)))  -o $@ ./runner
-
-$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx"
-$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
-	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner
-
-$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2"
-$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
-	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./runner
-
-$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
-	@-mkdir -p $(dir $@)
-	cp $< $@
-
-$(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server$(EXE_EXT).gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server$(EXE_EXT)
-	@-mkdir -p $(dir $@)
-	${GZIP} --best -c $< > $@
-
-clean: 
-	rm -f $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
-
-.PHONY: clean all
-
-# Handy debugging for make variables
-print-%:
-	@echo '$*=$($*)'
diff --git a/llama/make/cuda.make b/llama/make/cuda.make
deleted file mode 100644
index 7a4b1036975..00000000000
--- a/llama/make/cuda.make
+++ /dev/null
@@ -1,50 +0,0 @@
-# Common definitions for all cuda versions
-
-ifndef GPU_RUNNER_VARIANT
-dummy:
-	$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
-endif
-
-
-GPU_RUNNER_NAME := cuda$(GPU_RUNNER_VARIANT)
-GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT)
-GPU_RUNNER_DRIVER_LIB_LINK := -lcuda
-GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt
-GPU_LIB_DIR_WIN = $(GPU_PATH_ROOT_WIN)/bin
-GPU_LIB_DIR_LINUX = $(GPU_PATH_ROOT_LINUX)/lib64
-CGO_EXTRA_LDFLAGS_WIN = -L"$(GPU_PATH_ROOT_WIN)/lib/x64"
-GPU_COMPILER_WIN = $(GPU_PATH_ROOT_WIN)/bin/nvcc
-GPU_COMPILER_LINUX = $(GPU_PATH_ROOT_LINUX)/bin/nvcc
-GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
-GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
-GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT)*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
-
-ifeq ($(OS),linux)
-	CUDA_PATH?=/usr/local/cuda
-	GPU_COMPILER_FPIC = -fPIC -Wno-unused-function -std=c++11
-endif
-GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \
-	-DGGML_CUDA_USE_GRAPHS=1
-GPU_COMPILER_CUFLAGS = \
-	$(GPU_COMPILER_FPIC) \
-	-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(_OS_GPU_RUNNER_CPU_FLAGS))" \
-	-t2 \
-	-DGGML_CUDA_DMMV_X=32 \
-	-DGGML_CUDA_MMV_Y=1 \
-	-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
-	-DGGML_USE_CUDA=1 \
-	-DGGML_SHARED=1 \
-	-DGGML_BUILD=1 \
-	-DGGML_USE_LLAMAFILE \
-	-DK_QUANTS_PER_ITERATION=2 \
-	-DNDEBUG \
-	-D_GNU_SOURCE \
-	-D_XOPEN_SOURCE=600 \
-	-Wno-deprecated-gpu-targets \
-	--forward-unknown-to-host-compiler \
-	-use_fast_math \
-	-I. \
-	-O3
diff --git a/llama/make/gpu.make b/llama/make/gpu.make
deleted file mode 100644
index fbd8dbca539..00000000000
--- a/llama/make/gpu.make
+++ /dev/null
@@ -1,122 +0,0 @@
-# Generalized GPU runner build
-
-ifndef GPU_RUNNER_NAME
-dummy:
-	$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
-endif
-
-ifeq ($(OS),windows)
-	GPU_COMPILER:=$(GPU_COMPILER_WIN)
-	GPU_LIB_DIR:=$(GPU_LIB_DIR_WIN)
-	CGO_EXTRA_LDFLAGS:=$(CGO_EXTRA_LDFLAGS_WIN)
-	GPU_COMPILER_CFLAGS = $(GPU_COMPILER_CFLAGS_WIN)
-	GPU_COMPILER_CXXFLAGS = $(GPU_COMPILER_CXXFLAGS_WIN)
-else ifeq ($(OS),linux)
-	GPU_COMPILER:=$(GPU_COMPILER_LINUX)
-	GPU_LIB_DIR:=$(GPU_LIB_DIR_LINUX)
-	CGO_EXTRA_LDFLAGS:=$(CGO_EXTRA_LDFLAGS_LINUX)
-	GPU_COMPILER_CFLAGS = $(GPU_COMPILER_CFLAGS_LINUX)
-	GPU_COMPILER_CXXFLAGS = $(GPU_COMPILER_CXXFLAGS_LINUX)
-endif
-
-GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS))\" $(TARGET_LDFLAGS)"
-
-# TODO Unify how we handle dependencies in the dist/packaging and install flow
-# today, cuda is bundled, but rocm is split out.  Should split them each out by runner
-DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)
-
-ifeq ($(OS),windows)
-	_OS_GPU_RUNNER_CPU_FLAGS=$(call uc,$(GPU_RUNNER_CPU_FLAGS))
-else ifeq ($(OS),linux)
-	_OS_GPU_RUNNER_CPU_FLAGS=$(GPU_RUNNER_CPU_FLAGS)
-endif
-
-GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
-DIST_GPU_RUNNER_LIB_DEPS = $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_RUNNER_LIBS)))
-
-GPU_RUNNER_SRCS := \
-	ggml-cuda.cu \
-	$(filter-out $(wildcard ggml-cuda/fattn*.cu),$(wildcard ggml-cuda/*.cu)) \
-	$(wildcard ggml-cuda/template-instances/mmq*.cu) \
-	ggml.c ggml-backend.c ggml-alloc.c ggml-quants.c sgemm.cpp ggml-aarch64.c
-GPU_RUNNER_HDRS := \
-	$(wildcard ggml-cuda/*.cuh)
-
-
-# Conditional flags and components to speed up developer builds
-ifneq ($(OLLAMA_FAST_BUILD),)
-	GPU_COMPILER_CUFLAGS += 	\
-		-DGGML_DISABLE_FLASH_ATTN
-else
-	GPU_RUNNER_SRCS += \
-		$(wildcard ggml-cuda/fattn*.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-wmma*.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
-		$(wildcard ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
-endif
-
-GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
-GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
-GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT)))
-
-DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)))
-ifneq ($(OS),windows)
-PAYLOAD_RUNNERS = $(addprefix $(RUNNERS_PAYLOAD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT).gz,$(GPU_RUNNER_NAME)))
-endif
-BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)))
-
-
-$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
-
-# Build targets
-$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $<
-$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $<
-$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/" $(CGO_EXTRA_LDFLAGS)
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) *.go ./runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
-	@-mkdir -p $(dir $@)
-	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie  $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./runner
-$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(DIST_GPU_RUNNER_LIB_DEPS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
-	@-mkdir -p $(dir $@)
-	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
-
-# Distribution targets
-$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
-	@-mkdir -p $(dir $@)
-	$(CP) $< $@
-$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_DEPS_LIBS)
-$(DIST_LIB_DIR)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
-	@-mkdir -p $(dir $@)
-	$(CP) $< $@
-$(DIST_GPU_RUNNER_LIB_DEPS): 
-	@-mkdir -p $(dir $@)
-	$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@)
-$(GPU_DIST_DEPS_LIBS): 
-	@-mkdir -p $(dir $@)
-	$(CP) $(dir $(filter %$(notdir $@),$(GPU_LIBS) $(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)
-
-# Payload targets
-$(RUNNERS_PAYLOAD_DIR)/%/ollama_llama_server.gz: $(RUNNERS_BUILD_DIR)/%/ollama_llama_server 
-	@-mkdir -p $(dir $@)
-	${GZIP} --best -c $< > $@
-$(RUNNERS_PAYLOAD_DIR)/$(GPU_RUNNER_NAME)/%.gz: $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)/%
-	@-mkdir -p $(dir $@)
-	${GZIP} --best -c $< > $@
-
-clean: 
-	rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS) $(PAYLOAD_RUNNERS)
-
-.PHONY: clean $(GPU_RUNNER_NAME)
-
-
-# Handy debugging for make variables
-print-%:
-	@echo '$*=$($*)'
-
diff --git a/llama/runner/cache.go b/llama/runner/cache.go
index 0f5f0a09a10..e8a2d2994c7 100644
--- a/llama/runner/cache.go
+++ b/llama/runner/cache.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"errors"
diff --git a/llama/runner/cache_test.go b/llama/runner/cache_test.go
index 79cd93cbff2..9c838ed3345 100644
--- a/llama/runner/cache_test.go
+++ b/llama/runner/cache_test.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"testing"
diff --git a/llama/runner/image.go b/llama/runner/image.go
index 70058290006..c1932443c86 100644
--- a/llama/runner/image.go
+++ b/llama/runner/image.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"errors"
diff --git a/llama/runner/image_test.go b/llama/runner/image_test.go
index 4f1d265a749..d5c3bc1e2cb 100644
--- a/llama/runner/image_test.go
+++ b/llama/runner/image_test.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"reflect"
diff --git a/llama/runner/requirements.go b/llama/runner/requirements.go
deleted file mode 100644
index 71b3b9aad11..00000000000
--- a/llama/runner/requirements.go
+++ /dev/null
@@ -1,19 +0,0 @@
-package main
-
-import (
-	"encoding/json"
-	"os"
-
-	"github.com/ollama/ollama/llama"
-	"github.com/ollama/ollama/version"
-)
-
-func printRequirements(fp *os.File) {
-	attrs := map[string]string{
-		"system_info":  llama.PrintSystemInfo(),
-		"version":      version.Version,
-		"cpu_features": llama.CpuFeatures,
-	}
-	enc := json.NewEncoder(fp)
-	_ = enc.Encode(attrs)
-}
diff --git a/llama/runner/runner.go b/llama/runner/runner.go
index 660e8cffa84..be04cfd8f36 100644
--- a/llama/runner/runner.go
+++ b/llama/runner/runner.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"context"
@@ -895,32 +895,37 @@ func (s *Server) loadModel(
 	s.ready.Done()
 }
 
-func main() {
-	mpath := flag.String("model", "", "Path to model binary file")
-	ppath := flag.String("mmproj", "", "Path to projector binary file")
-	parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
-	batchSize := flag.Int("batch-size", 512, "Batch size")
-	nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
-	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
-	flashAttention := flag.Bool("flash-attn", false, "Enable flash attention")
-	kvSize := flag.Int("ctx-size", 2048, "Context (or KV cache) size")
-	kvCacheType := flag.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
-	port := flag.Int("port", 8080, "Port to expose the server on")
-	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
-	verbose := flag.Bool("verbose", false, "verbose output (default: disabled)")
-	noMmap := flag.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
-	mlock := flag.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
-	tensorSplit := flag.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
-	multiUserCache := flag.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
-	requirements := flag.Bool("requirements", false, "print json requirement information")
+func Execute(args []string) error {
+	if len(args) > 0 && args[0] == "runner" {
+		args = args[1:]
+	}
+	fs := flag.NewFlagSet("runner", flag.ExitOnError)
+	mpath := fs.String("model", "", "Path to model binary file")
+	ppath := fs.String("mmproj", "", "Path to projector binary file")
+	parallel := fs.Int("parallel", 1, "Number of sequences to handle simultaneously")
+	batchSize := fs.Int("batch-size", 512, "Batch size")
+	nGpuLayers := fs.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	mainGpu := fs.Int("main-gpu", 0, "Main GPU")
+	flashAttention := fs.Bool("flash-attn", false, "Enable flash attention")
+	kvSize := fs.Int("ctx-size", 2048, "Context (or KV cache) size")
+	kvCacheType := fs.String("kv-cache-type", "", "quantization type for KV cache (default: f16)")
+	port := fs.Int("port", 8080, "Port to expose the server on")
+	threads := fs.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
+	verbose := fs.Bool("verbose", false, "verbose output (default: disabled)")
+	noMmap := fs.Bool("no-mmap", false, "do not memory-map model (slower load but may reduce pageouts if not using mlock)")
+	mlock := fs.Bool("mlock", false, "force system to keep model in RAM rather than swapping or compressing")
+	tensorSplit := fs.String("tensor-split", "", "fraction of the model to offload to each GPU, comma-separated list of proportions")
+	multiUserCache := fs.Bool("multiuser-cache", false, "optimize input cache algorithm for multiple users")
 
 	var lpaths multiLPath
-	flag.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
+	fs.Var(&lpaths, "lora", "Path to lora layer file (can be specified multiple times)")
 
-	flag.Parse()
-	if *requirements {
-		printRequirements(os.Stdout)
-		return
+	fs.Usage = func() {
+		fmt.Fprintf(fs.Output(), "Runner usage\n")
+		fs.PrintDefaults()
+	}
+	if err := fs.Parse(args); err != nil {
+		return err
 	}
 	level := slog.LevelInfo
 	if *verbose {
@@ -983,7 +988,8 @@ func main() {
 	listener, err := net.Listen("tcp", addr)
 	if err != nil {
 		fmt.Println("Listen error:", err)
-		return
+		cancel()
+		return err
 	}
 	defer listener.Close()
 
@@ -999,7 +1005,9 @@ func main() {
 	log.Println("Server listening on", addr)
 	if err := httpServer.Serve(listener); err != nil {
 		log.Fatal("server error:", err)
+		return err
 	}
 
 	cancel()
+	return nil
 }
diff --git a/llama/runner/stop.go b/llama/runner/stop.go
index c05f5e3d5cf..8dcb08d331d 100644
--- a/llama/runner/stop.go
+++ b/llama/runner/stop.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"strings"
diff --git a/llama/runner/stop_test.go b/llama/runner/stop_test.go
index 51b35fde358..31dc161f379 100644
--- a/llama/runner/stop_test.go
+++ b/llama/runner/stop_test.go
@@ -1,4 +1,4 @@
-package main
+package runner
 
 import (
 	"reflect"
diff --git a/llm/server.go b/llm/server.go
index fadf10ef817..79724ca66ce 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -25,7 +25,6 @@ import (
 	"golang.org/x/sync/semaphore"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/build"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
@@ -144,20 +143,13 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	// Loop through potential servers
 	finalErr := errors.New("no suitable llama servers found")
 
-	rDir, err := runners.Refresh(build.EmbedFS)
-	if err != nil {
-		return nil, err
-	}
+	availableServers := runners.GetAvailableServers()
 
-	availableServers := runners.GetAvailableServers(rDir)
-	if len(availableServers) == 0 {
-		return nil, finalErr
-	}
 	var servers []string
 	if cpuRunner != "" {
 		servers = []string{cpuRunner}
 	} else {
-		servers = runners.ServersForGpu(gpus[0]) // All GPUs in the list are matching Library and Variant
+		servers = runners.ServersForGpu(gpus[0].RunnerName()) // All GPUs in the list are matching Library and Variant
 	}
 	demandLib := envconfig.LLMLibrary()
 	if demandLib != "" {
@@ -167,7 +159,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		} else {
 			slog.Info("user override", "OLLAMA_LLM_LIBRARY", demandLib, "path", serverPath)
 			servers = []string{demandLib}
-			if strings.HasPrefix(demandLib, "cpu") {
+			if strings.HasPrefix(demandLib, "cpu") || (!(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") && demandLib == runners.BuiltinName()) {
 				// Omit the GPU flag to silence the warning
 				opts.NumGPU = -1
 			}
@@ -279,15 +271,16 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 	}
 
 	for i := range servers {
-		dir := availableServers[servers[i]]
-		if dir == "" {
+		builtin := servers[i] == runners.BuiltinName()
+		server := availableServers[servers[i]]
+		if server == "" {
 			// Shouldn't happen
 			finalErr = fmt.Errorf("[%d] server %s not listed in available servers %v", i, servers[i], availableServers)
 			slog.Error("server list inconsistent", "error", finalErr)
 			continue
 		}
 
-		if strings.HasPrefix(servers[i], "cpu") {
+		if strings.HasPrefix(servers[i], "cpu") || (builtin && !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64")) {
 			gpus = discover.GetCPUInfo()
 		}
 
@@ -304,14 +297,16 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			slog.Debug("ResolveTCPAddr failed ", "error", err)
 			port = rand.Intn(65535-49152) + 49152 // get a random port in the ephemeral range
 		}
-		finalParams := append(params, "--port", strconv.Itoa(port))
+		finalParams := []string{"runner"}
+		finalParams = append(finalParams, params...)
+		finalParams = append(finalParams, "--port", strconv.Itoa(port))
 
 		pathEnv := "LD_LIBRARY_PATH"
 		if runtime.GOOS == "windows" {
 			pathEnv = "PATH"
 		}
 		// Start with the server directory for the LD_LIBRARY_PATH/PATH
-		libraryPaths := []string{dir}
+		libraryPaths := []string{filepath.Dir(server)}
 
 		if libraryPath, ok := os.LookupEnv(pathEnv); ok {
 			// favor our bundled library dependencies over system libraries
@@ -325,22 +320,6 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 			libraryPaths = append(gpus[0].DependencyPath, libraryPaths...)
 		}
 
-		server := filepath.Join(dir, "ollama_llama_server")
-		if runtime.GOOS == "windows" {
-			server += ".exe"
-		}
-
-		// Detect tmp cleaners wiping out the file
-		_, err := os.Stat(server)
-		if errors.Is(err, os.ErrNotExist) {
-			slog.Warn("llama server disappeared, reinitializing payloads", "path", server, "error", err)
-			_, err = runners.Refresh(build.EmbedFS)
-			if err != nil {
-				slog.Warn("failed to reinitialize payloads", "error", err)
-				return nil, err
-			}
-		}
-
 		// TODO - once fully switched to the Go runner, load the model here for tokenize/detokenize cgo access
 		s := &llmServer{
 			port:        port,
@@ -417,7 +396,7 @@ func NewLlamaServer(gpus discover.GpuInfoList, model string, ggml *GGML, adapter
 		if err = s.cmd.Start(); err != nil {
 			// Detect permission denied and augment the message about noexec
 			if errors.Is(err, os.ErrPermission) {
-				finalErr = fmt.Errorf("unable to start server %w.  %s may have noexec set.  Set OLLAMA_TMPDIR for server to a writable executable directory", err, dir)
+				finalErr = fmt.Errorf("unable to start server %w.  %s may have noexec set.  Set OLLAMA_TMPDIR for server to a writable executable directory", err, server)
 				continue
 			}
 			msg := ""
diff --git a/make/Makefile.cpu b/make/Makefile.cpu
new file mode 100644
index 00000000000..254039eb4b2
--- /dev/null
+++ b/make/Makefile.cpu
@@ -0,0 +1,40 @@
+# Build the discrete cpu runner(s) for the platform which do not rely on 3rd party GPU libraries
+
+include make/common-defs.make
+
+CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(TARGET_CPU_FLAGS))\" $(TARGET_LDFLAGS)"
+ifeq ($(ARCH),amd64)
+ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
+	RUNNERS = cpu_avx cpu_avx2
+endif
+endif
+
+DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS)))
+BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(RUNNERS)))
+
+cpu: $(BUILD_RUNNERS)
+
+dist: $(DIST_RUNNERS)
+
+$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx"
+$(RUNNERS_BUILD_DIR)/cpu_avx/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
+	@-mkdir -p $(dir $@)
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./cmd/runner
+
+$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): TARGET_CPU_FLAGS="avx avx2"
+$(RUNNERS_BUILD_DIR)/cpu_avx2/ollama_llama_server$(EXE_EXT): ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
+	@-mkdir -p $(dir $@)
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(TARGET_CPU_FLAGS)) -o $@ ./cmd/runner
+
+$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
+	@-mkdir -p $(dir $@)
+	cp $< $@
+
+clean:
+	rm -f $(BUILD_RUNNERS) $(DIST_RUNNERS)
+
+.PHONY: clean cpu dist
+
+# Handy debugging for make variables
+print-%:
+	@echo '$*=$($*)'
diff --git a/llama/make/Makefile.cuda_v11 b/make/Makefile.cuda_v11
similarity index 53%
rename from llama/make/Makefile.cuda_v11
rename to make/Makefile.cuda_v11
index 528e0efe463..a6a81823e89 100644
--- a/llama/make/Makefile.cuda_v11
+++ b/make/Makefile.cuda_v11
@@ -1,12 +1,13 @@
 # Build rules for CUDA v11 runner
 
 include make/common-defs.make
-
+include make/cuda-v11-defs.make
 
 GPU_RUNNER_VARIANT := _v11
-GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v11.? 2>/dev/null)
-GPU_PATH_ROOT_LINUX=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
+GPU_COMPILER=$(CUDA_11_COMPILER)
 CUDA_ARCHITECTURES?=50;52;53;60;61;62;70;72;75;80;86
+GPU_LIB_DIR = $(CUDA_11_LIB_DIR)
+CGO_EXTRA_LDFLAGS = $(CUDA_11_CGO_EXTRA_LDFLAGS)
 
 include make/cuda.make
 include make/gpu.make
\ No newline at end of file
diff --git a/llama/make/Makefile.cuda_v12 b/make/Makefile.cuda_v12
similarity index 54%
rename from llama/make/Makefile.cuda_v12
rename to make/Makefile.cuda_v12
index 2418ef0047a..7c50b27b57f 100644
--- a/llama/make/Makefile.cuda_v12
+++ b/make/Makefile.cuda_v12
@@ -1,12 +1,13 @@
 # Build rules for CUDA v12 runner
 
 include make/common-defs.make
-
+include make/cuda-v12-defs.make
 
 GPU_RUNNER_VARIANT := _v12
-GPU_PATH_ROOT_WIN=$(shell ls -d $(dir $(shell cygpath -m -s "$(CUDA_PATH)\.."))/v12.? 2>/dev/null)
-GPU_PATH_ROOT_LINUX=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
+GPU_COMPILER=$(CUDA_12_COMPILER)
 CUDA_ARCHITECTURES?=60;61;62;70;72;75;80;86;87;89;90;90a
+GPU_LIB_DIR = $(CUDA_12_LIB_DIR)
+CGO_EXTRA_LDFLAGS = $(CUDA_12_CGO_EXTRA_LDFLAGS)
 
 include make/cuda.make
 include make/gpu.make
\ No newline at end of file
diff --git a/make/Makefile.ollama b/make/Makefile.ollama
new file mode 100644
index 00000000000..51ef7713f15
--- /dev/null
+++ b/make/Makefile.ollama
@@ -0,0 +1,19 @@
+# Makefile for building top-level ollama binary
+
+include make/common-defs.make
+
+exe: $(OLLAMA_EXE)
+dist_exe dist_ollama: $(DIST_OLLAMA_EXE)
+
+GO_DEPS=$(foreach dir,$(shell go list -deps -f '{{.Dir}}' . ),$(wildcard $(dir)/*.go))
+CPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(TARGET_CPU_FLAGS))\" $(EXTRA_GOLDLAGS) $(TARGET_LDFLAGS)"
+
+$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): TARGET_CPU_FLAGS=$(CUSTOM_CPU_FLAGS)
+$(OLLAMA_EXE) $(DIST_OLLAMA_EXE): $(COMMON_SRCS) $(COMMON_HDRS) $(GO_DEPS)
+	GOARCH=$(ARCH) go build -buildmode=pie $(CPU_GOFLAGS) -trimpath $(if $(CUSTOM_CPU_FLAGS),-tags $(subst $(space),$(comma),$(CUSTOM_CPU_FLAGS))) -o $@ .
+
+.PHONY: ollama dist_ollama exe dist_exe
+
+# Handy debugging for make variables
+print-%:
+	@echo '$*=$($*)'
diff --git a/llama/make/Makefile.rocm b/make/Makefile.rocm
similarity index 55%
rename from llama/make/Makefile.rocm
rename to make/Makefile.rocm
index 4ab176b4d47..7be3f60d108 100644
--- a/llama/make/Makefile.rocm
+++ b/make/Makefile.rocm
@@ -4,22 +4,25 @@
 # unlike CUDA where we'll build both a v11 and v12 variant.
 
 include make/common-defs.make
+include make/rocm-defs.make
 
 HIP_ARCHS_COMMON := gfx900 gfx940 gfx941 gfx942 gfx1010 gfx1012 gfx1030 gfx1100 gfx1101 gfx1102
 HIP_ARCHS_LINUX := gfx906:xnack- gfx908:xnack- gfx90a:xnack+ gfx90a:xnack-
 
 ifeq ($(OS),windows)
-	GPU_LIB_DIR_WIN := $(shell cygpath -m -s "$(HIP_PATH)/bin")
-	CGO_EXTRA_LDFLAGS_WIN := -L$(shell cygpath -m -s "$(HIP_PATH)/lib")
-	GPU_COMPILER_WIN := $(HIP_PATH)/bin/hipcc.bin.exe
-	GPU_COMPILER:=$(GPU_COMPILER_WIN)
+	GPU_LIB_DIR := $(shell cygpath -m -s "$(HIP_PATH)/bin")
+	CGO_EXTRA_LDFLAGS := -L$(shell cygpath -m -s "$(HIP_PATH)/lib")
+	HIP_ARCHS?=$(HIP_ARCHS_COMMON)
+	GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
+	GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
 else ifeq ($(OS),linux)
-	GPU_LIB_DIR_LINUX := $(HIP_PATH)/lib
-	GPU_COMPILER_LINUX := $(shell X=$$(which hipcc 2>/dev/null) && echo $$X)
-	GPU_COMPILER:=$(GPU_COMPILER_LINUX)
-	ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' '  | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf))
-	GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL))
+	GPU_LIB_DIR := $(strip $(shell ls -d $(HIP_PATH)/lib64 2>/dev/null || ls -d $(HIP_PATH)/lib 2>/dev/null))
+	CGO_EXTRA_LDFLAGS := -L$(GPU_LIB_DIR)
+	HIP_ARCHS?=$(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX)
+	GPU_COMPILER_CFLAGS = $(CFLAGS) -fPIC -D_GNU_SOURCE
+	GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
 endif
+GPU_COMPILER=$(HIP_COMPILER)
 
 # TODO future multi-variant support for ROCm
 # ROCM_VERSION = $(subst $(space),.,$(wordlist 1,2,$(subst .,$(space),$(word 3,$(subst -,$(space),$(filter HIP version: %,$(shell $(GPU_COMPILER) --version)))))))
@@ -31,36 +34,37 @@ GPU_RUNNER_GO_TAGS := rocm
 GPU_RUNNER_NAME := rocm$(GPU_RUNNER_VARIANT)
 GPU_RUNNER_DRIVER_LIB_LINK := -lamdhip64
 GPU_RUNNER_LIBS_SHORT := hipblas rocblas
-GPU_PATH_ROOT_WIN=$(dir $(GPU_LIB_DIR_WIN))
-GPU_PATH_ROOT_LINUX=$(dir $(GPU_LIB_DIR_LINUX))
-GPU_COMPILER_CFLAGS_WIN = $(CFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CFLAGS_LINUX = $(CFLAGS) -fPIC -D_GNU_SOURCE
-GPU_COMPILER_CXXFLAGS_WIN = $(CXXFLAGS) -D_WIN32_WINNT=0x602
-GPU_COMPILER_CXXFLAGS_LINUX = $(CXXFLAGS) -fPIC -D_GNU_SOURCE
 
-GPU_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
+# Note: ROCm requires an extra step of discovering and copying the transitive dependencies on linux
 ifeq ($(OS),windows)
-	ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))/lib/ollama
+	ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)/lib/ollama
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
 else ifeq ($(OS),linux)
-	ROCM_DIST_DEPS_DIR = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH)-rocm)/lib/ollama
+	ROCM_DIST_DEPS_DIR = ./dist/$(OS)-$(ARCH)-rocm/lib/ollama
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
+	ROCM_TRANSITIVE_LIBS_INITIAL = $(sort $(shell ldd $(GPU_LIBS) | grep "=>" | cut -f2 -d= | cut -f2 -d' '  | grep -e rocm -e amdgpu -e libtinfo -e libnuma -e libelf))
+	GPU_TRANSITIVE_LIBS = $(sort $(shell readlink -f $(ROCM_TRANSITIVE_LIBS_INITIAL)) $(ROCM_TRANSITIVE_LIBS_INITIAL))
+	FILTERED_GPU_TRANSITIVE_LIBS=$(sort $(filter-out $(addprefix %,$(notdir $(GPU_LIBS))), $(GPU_TRANSITIVE_LIBS)))
+	GPU_DIST_TRANSITIVE_LIB_DEPS = $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(FILTERED_GPU_TRANSITIVE_LIBS))))
 endif
-GPU_DIST_DEPS_LIBS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS)) $(notdir $(GPU_TRANSITIVE_LIBS))))
+GPU_DIST_LIB_DEPS= $(sort $(addprefix $(ROCM_DIST_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
 ROCBLAS_DIST_DEP_MANIFEST = $(ROCM_DIST_DEPS_DIR)/rocblas/library/TensileManifest.txt
 
 ifeq ($(OS),linux)
 	GPU_COMPILER_FPIC := -fPIC -Wno-unused-function -std=gnu++11
-	GPU_RUNNER_ARCH_FLAGS := $(foreach arch, $(HIP_ARCHS_COMMON) $(HIP_ARCHS_LINUX), --offload-arch=$(arch))
 else ifeq ($(OS),windows)
 	GPU_COMPILER_FPIC := -Xclang --dependent-lib=msvcrt
-	GPU_RUNNER_ARCH_FLAGS := $(foreach arch, $(HIP_ARCHS_COMMON), --offload-arch=$(arch))
 endif
+GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(HIP_ARCHS)),--offload-arch=$(arch))
+
+# HIPCC uses clang which requires avx512 -> -mavx512f -mavx512dq -mavx512bw
+GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
 
 GPU_COMPILER_CUFLAGS = \
 	$(GPU_COMPILER_FPIC) \
-	$(addprefix -m,$(GPU_RUNNER_CPU_FLAGS)) \
+	$(addprefix -m,$(GPU_VECTOR_FLAGS)) \
 	-mf16c \
 	-mfma \
-	-parallel-jobs=2 \
 	-c \
 	-O3 \
 	-DGGML_USE_CUDA \
@@ -90,7 +94,7 @@ GPU_COMPILER_CUFLAGS = \
 	-Wno-pass-failed \
 	-Wno-deprecated-declarations \
 	-Wno-unused-result \
-	-I.
+	-I./llama/
 
 # Workaround buggy P2P copy on some windows multi-GPU setups
 # This workaround breaks linux systems with small system RAM, so only enable on windows
@@ -101,9 +105,13 @@ endif
 include make/gpu.make
 
 # Adjust the rules from gpu.make to handle the ROCm dependencies properly
-$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST)
+$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(ROCBLAS_DIST_DEP_MANIFEST) $(GPU_DIST_TRANSITIVE_LIB_DEPS)
 $(ROCBLAS_DIST_DEP_MANIFEST):
 	@-mkdir -p $(dir $@)
 	@echo "Copying rocblas library..."
-	cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . | (cd $(dir $@) && tar xf - )
+	(cd $(GPU_LIB_DIR)/rocblas/library/ && tar cf - . ) | (cd $(dir $@) && tar xf - )
 	@echo "rocblas library copy complete"
+
+$(GPU_DIST_TRANSITIVE_LIB_DEPS):
+	@-mkdir -p $(dir $@)
+	$(CP) $(dir $(filter %$(notdir $@),$(GPU_TRANSITIVE_LIBS)))/$(notdir $@) $(dir $@)
diff --git a/llama/make/Makefile.sync b/make/Makefile.sync
similarity index 79%
rename from llama/make/Makefile.sync
rename to make/Makefile.sync
index a6a7124fdc9..8fbc4039e47 100644
--- a/llama/make/Makefile.sync
+++ b/make/Makefile.sync
@@ -1,23 +1,25 @@
 # Helpers for managing our vendored llama.cpp repo and patch set
 
-REPO_ROOT:=$(dir $(patsubst %/,%,$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))))
-DST_DIR:=$(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
+REPO_ROOT:=./
+DEST_DIR:=./llama/
 
-include $(REPO_ROOT)llama/vendoring
+include $(DEST_DIR)vendoring
 
-LLAMACPP_REPO := $(REPO_ROOT)llama/vendor/
+LLAMACPP_REPO := ./llama/vendor/
 
-LLAMACPP_PATCH_DIR := $(DST_DIR)patches/
+# Relative to the vendor dir
+VENDOR_RELATIVE_PATCH_DIR := ../patches/
 
 
 help-sync:
 	@echo "The following make targets will help you update llama.cpp to a new base commit, or work on new features/fixes"
 	@echo ""
-	@echo "\tmake apply-patches   # Establish the tracking repo if not already present, reset to the base commit, and apply our patch set"
-	@echo "\tmake sync            # Vendor llama.cpp and ggml from the tracking repo working tree"
-	@echo "\tmake create-patches  # Generate the patch set based on the current commits in the tracking repo since the base commit"
+	@echo "	make apply-patches	# Establish the tracking repo if not already present, reset to the base commit, and apply our patch set"
+	@echo "	make sync		# Vendor llama.cpp and ggml from the tracking repo working tree"
+	@echo "	make sync-clean		# Remove all vendored files"
+	@echo "	make create-patches	# Generate the patch set based on the current commits in the tracking repo since the base commit"
 	@echo ""
-	@echo "For more details on the workflow, see the Vendoring section in ../docs/development.md"
+	@echo "For more details on the workflow, see the Vendoring section in 'docs/development.md'"
 
 apply-patches: $(LLAMACPP_REPO)
 	@if ! git -C $(LLAMACPP_REPO) --no-pager diff --exit-code ; then \
@@ -29,7 +31,7 @@ apply-patches: $(LLAMACPP_REPO)
 	@git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT) || \
 		git -C $(LLAMACPP_REPO) fetch --all && git -C $(LLAMACPP_REPO) checkout -q $(LLAMACPP_BASE_COMMIT)
 	@echo "Applying ollama patches..."
-	@git -c 'user.name=nobody' -c 'user.email=<>' -C $(LLAMACPP_REPO) am -3 $(LLAMACPP_PATCH_DIR)/*.patch || \
+	@cd $(LLAMACPP_REPO) && git -c 'user.name=nobody' -c 'user.email=<>' am -3 $(VENDOR_RELATIVE_PATCH_DIR)*.patch || \
 		echo "Please resolve the conflicts in $(LLAMACPP_REPO), and run 'git am --continue' to continue applying subsequent patches"
 	@echo ""
 	@echo "The tracking repo $(LLAMACPP_REPO) is now in a detached state with all patches applied."
@@ -44,7 +46,7 @@ create-patches: $(LLAMACPP_REPO)
   		echo "ERROR: Your llama.cpp repo is dirty.  You must commit any pending changes for format-patch to generate patches"; \
   		exit 1; \
 	fi
-	git -C $(LLAMACPP_REPO) format-patch --no-signature --no-numbered --zero-commit -o $(LLAMACPP_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)
+	@cd $(LLAMACPP_REPO) && git format-patch --no-signature --no-numbered --zero-commit -o $(VENDOR_RELATIVE_PATCH_DIR) $(LLAMACPP_BASE_COMMIT)
 
 # Vendoring template logic
 EXCLUDED_FILES=sgemm.cpp sgemm.h sampling_ext.cpp sampling_ext.h stb_image.h json.hpp llama_darwin.c base64.hpp
@@ -86,12 +88,12 @@ LLAMACPP_FILES=\
 	include/llama.h \
 	ggml/src/llamafile/sgemm.cpp \
 	ggml/src/llamafile/sgemm.h
-$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
+$(foreach name,$(LLAMACPP_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
 
 # llama.cpp files -> llama/llamafile
 LLAMAFILE_FILES= \
 	ggml/src/llamafile/sgemm.h
-$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR)llamafile/)))
+$(foreach name,$(LLAMAFILE_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR)llamafile/)))
 
 # ggml files -> llama/
 GGML_FILES= \
@@ -115,10 +117,10 @@ GGML_FILES= \
 	ggml/src/ggml-cpu-impl.h \
 	ggml/include/ggml-blas.h \
 	ggml/src/ggml-blas.cpp
-$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
+$(foreach name,$(GGML_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
 
 # TODO generalize renaming pattern if we have more of these
-$(DST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
+$(DEST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
 	@echo "vendoring $(subst $(LLAMACPP_REPO),,$<)"; \
 		mkdir -p $(dir $@) && \
 		echo "/**" > $@ && \
@@ -128,20 +130,20 @@ $(DST_DIR)ggml-metal_darwin_arm64.m : $(LLAMACPP_REPO)ggml/src/ggml-metal.m
 		echo " */" >> $@ && \
 		echo "" >> $@ && \
 		cat $< >> $@
-VENDORED_FILES += $(DST_DIR)ggml-metal_darwin_arm64.m
+VENDORED_FILES += $(DEST_DIR)ggml-metal_darwin_arm64.m
 
 # ggml-cuda -> llama/ggml-cuda/
 GGML_CUDA_FILES= ggml/src/ggml-cuda/*.cu ggml/src/ggml-cuda/*.cuh
 GGML_CUDA_FILES_EXPANDED = $(addprefix ggml/src/ggml-cuda/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_CUDA_FILES)))))
-$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/)))
+$(foreach name,$(GGML_CUDA_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/)))
 
 GGML_TEMPLATE_FILES= ggml/src/ggml-cuda/template-instances/*.cu
 GGML_TEMPLATE_FILES_EXPANDED = 	$(addprefix ggml/src/ggml-cuda/template-instances/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_TEMPLATE_FILES)))))
-$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/template-instances/)))
+$(foreach name,$(GGML_TEMPLATE_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/template-instances/)))
 
 GGML_VENDOR_FILES= ggml/src/ggml-cuda/vendors/*.h
 GGML_VENDOR_FILES_EXPANDED=$(addprefix ggml/src/ggml-cuda/vendors/,$(notdir $(wildcard $(addprefix $(LLAMACPP_REPO),$(GGML_VENDOR_FILES)))))
-$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DST_DIR)ggml-cuda/vendors/)))
+$(foreach name,$(GGML_VENDOR_FILES_EXPANDED),$(eval $(call vendor_file,$(name),$(DEST_DIR)ggml-cuda/vendors/)))
 
 # llava -> llama/
 LAVA_FILES= \
@@ -163,27 +165,30 @@ LAVA_FILES+= \
 	common/json-schema-to-grammar.cpp \
 	common/json-schema-to-grammar.h \
 	common/base64.hpp
-$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DST_DIR))))
+$(foreach name,$(LAVA_FILES),$(eval $(call vendor_file,$(name),$(DEST_DIR))))
 
-$(DST_DIR)build-info.cpp:
+$(DEST_DIR)build-info.cpp:
 	@echo "Generating $@"
 	@echo "int LLAMA_BUILD_NUMBER = 0;" > $@
 	@echo "char const *LLAMA_COMMIT = \"$(LLAMACPP_BASE_COMMIT)\";" >> $@
 	@echo "char const *LLAMA_COMPILER = \"\";" >> $@
 	@echo "char const *LLAMA_BUILD_TARGET = \"\";" >> $@
-VENDORED_FILES += $(DST_DIR)build-info.cpp
+VENDORED_FILES += $(DEST_DIR)build-info.cpp
 
 
 sync: $(LLAMACPP_REPO) .WAIT $(VENDORED_FILES) .WAIT remove-stale-files
 
+sync-clean:
+	rm -f $(VENDORED_FILES) $(EXTRA_NATIVE_FILES)
+
 PATS=*.c *.h *.cpp *.m *.metal *.cu *.cuh
-NATIVE_DIRS=$(DST_DIR) $(DST_DIR)llamafile/ $(DST_DIR)ggml-cuda/ $(DST_DIR)ggml-cuda/template-instances/ $(DST_DIR)ggml-cuda/vendors/
+NATIVE_DIRS=$(DEST_DIR) $(DEST_DIR)llamafile/ $(DEST_DIR)ggml-cuda/ $(DEST_DIR)ggml-cuda/template-instances/ $(DEST_DIR)ggml-cuda/vendors/
 ALL_NATIVE_FILES=$(foreach dir,$(NATIVE_DIRS),$(wildcard $(addprefix $(dir),$(PATS))))
-EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES))
+EXTRA_NATIVE_FILES=$(filter-out $(VENDORED_FILES) $(addprefix $(DEST_DIR),$(OLLAMA_NATIVE_FILES)), $(ALL_NATIVE_FILES))
 remove-stale-files:
 	@rm -f $(EXTRA_NATIVE_FILES)
 
-.PHONY: help-sync apply-patches sync create-patches remove-stale-fails .WAIT 
+.PHONY: help-sync apply-patches sync sync-clean create-patches remove-stale-files .WAIT
 
 
 # Handy debugging for make variables
diff --git a/make/Makefile.test b/make/Makefile.test
new file mode 100644
index 00000000000..3b27d0dbef3
--- /dev/null
+++ b/make/Makefile.test
@@ -0,0 +1,26 @@
+# Targets to assist in running tests
+#
+# Recipes run from the directory make is invoked in, which is the same
+# directory the 'include' path and $(OLLAMA_EXE) below are relative to.
+
+include make/common-defs.make
+
+# None of these targets produce a file, and 'integration' is also the name of
+# the ./integration package directory tested below, so declare them phony.
+.PHONY: test integration lint
+
+test:
+	go test ./...
+
+integration: $(OLLAMA_EXE)
+	go test --tags=integration ./integration -v
+
+lint:
+	golangci-lint run -v
+
+# Note: in this makefile we error instead of building to allow finer-grained control of testing flows
+$(OLLAMA_EXE):
+	@echo ""
+	@echo "ERROR: You must build ollama first - use 'make all' to build the ollama binaries"
+	@echo ""
+	@exit 1
\ No newline at end of file
diff --git a/llama/make/common-defs.make b/make/common-defs.make
similarity index 72%
rename from llama/make/common-defs.make
rename to make/common-defs.make
index 8ba33501842..03504a690aa 100644
--- a/llama/make/common-defs.make
+++ b/make/common-defs.make
@@ -21,37 +21,43 @@ export CGO_CXXFLAGS_ALLOW = -mfma|-mf16c
 export HIP_PLATFORM = amd
 export CGO_ENABLED=1
 
-SRC_DIR := $(dir $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))))
-BUILD_DIR = $(SRC_DIR)build/$(OS)-$(ARCH)
-DIST_BASE = $(abspath $(SRC_DIR)/../dist/$(OS)-$(ARCH))
+BUILD_DIR = ./llama/build/$(OS)-$(ARCH)
+DIST_BASE = ./dist/$(OS)-$(ARCH)
+
+ifeq ($(OS),windows)
+	# Absolute paths with cygpath to convert to 8.3 without spaces
+	PWD="$(shell pwd)"
+	DIST_OLLAMA_EXE=$(DIST_BASE)/ollama$(EXE_EXT)
+else
+	CCACHE:=$(shell command -v ccache 2>/dev/null || echo "")
+	DIST_OLLAMA_EXE=$(DIST_BASE)/bin/ollama$(EXE_EXT)
+endif
 DIST_LIB_DIR = $(DIST_BASE)/lib/ollama
 RUNNERS_DIST_DIR = $(DIST_LIB_DIR)/runners
-RUNNERS_PAYLOAD_DIR = $(abspath $(SRC_DIR)/../build/$(OS)/$(ARCH))
 RUNNERS_BUILD_DIR = $(BUILD_DIR)/runners
-DEFAULT_RUNNER := $(if $(and $(filter darwin,$(OS)),$(filter arm64,$(ARCH))),metal,cpu)
-GZIP:=$(shell command -v pigz 2>/dev/null || echo "gzip")
-ifneq ($(OS),windows)
-	CCACHE:=$(shell command -v ccache 2>/dev/null || echo "")
-endif
 VERSION?=$(shell git describe --tags --first-parent --abbrev=7 --long --dirty --always | sed -e "s/^v//g")
 
 # Conditionally enable ccache for cgo builds too
 ifneq ($(CCACHE),)
-	CC=$(CCACHE) gcc
-	CXX=$(CCACHE) g++
+	CC:=$(if $(filter-out default undefined,$(origin CC)),$(CC),$(CCACHE) gcc)
+	CXX:=$(if $(filter-out default undefined,$(origin CXX)),$(CXX),$(CCACHE) g++)
 	export CC
 	export CXX
 endif
 
 
-# Override in environment space separated to tune GPU runner CPU vector flags
+# Override in environment to tune CPU vector flags
 ifeq ($(ARCH),amd64)
-	GPU_RUNNER_CPU_FLAGS ?= avx
+ifeq ($(origin CUSTOM_CPU_FLAGS),undefined)
+	GPU_RUNNER_CPU_FLAGS=avx
+	GPU_RUNNER_EXTRA_VARIANT=_avx
+else
+	GPU_RUNNER_CPU_FLAGS=$(subst $(comma),$(space),$(CUSTOM_CPU_FLAGS))
+endif
 endif
 
 ifeq ($(OS),windows)
 	CP := cp
-	SRC_DIR := $(shell cygpath -m -s "$(SRC_DIR)")
 	OBJ_EXT := obj
 	SHARED_EXT := dll
 	EXE_EXT := .exe
@@ -63,22 +69,23 @@ ifneq ($(HIP_PATH),)
 	export HIP_PATH
 endif
 else ifeq ($(OS),linux)
-	CP := cp -af
+	CP := cp -df
 	OBJ_EXT := o
 	SHARED_EXT := so
 	SHARED_PREFIX := lib
 	CPU_FLAG_PREFIX := -m
-	HIP_PATH?=/opt/rocm
 else
 	OBJ_EXT := o
 	SHARED_EXT := so
 	CPU_FLAG_PREFIX := -m
-	CP := cp -af
+	CP := cp -df
 endif
 
 COMMON_SRCS := \
-	$(wildcard *.c) \
-	$(wildcard *.cpp)
+	$(wildcard ./llama/*.c) \
+	$(wildcard ./llama/*.cpp)
 COMMON_HDRS := \
-	$(wildcard *.h) \
-	$(wildcard *.hpp)
+	$(wildcard ./llama/*.h) \
+	$(wildcard ./llama/*.hpp)
+
+OLLAMA_EXE=./ollama$(EXE_EXT)
\ No newline at end of file
diff --git a/make/cuda-v11-defs.make b/make/cuda-v11-defs.make
new file mode 100644
index 00000000000..264407ddc66
--- /dev/null
+++ b/make/cuda-v11-defs.make
@@ -0,0 +1,17 @@
+# Common definitions for the various Makefiles which set cuda settings
+# No rules are defined here so this is safe to include at the beginning of other makefiles
+
+ifeq ($(OS),windows)
+	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
+	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
+	CUDA_11_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v11.? 2>/dev/null)
+	CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc.exe)
+	CUDA_11_LIB_DIR = $(strip $(shell ls -d $(CUDA_11_PATH)/bin 2>/dev/null))
+	CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_PATH)/lib/x64"
+else ifeq ($(OS),linux)
+	CUDA_PATH?=/usr/local/cuda
+	CUDA_11_PATH:=$(shell ls -d $(CUDA_PATH)-11 2>/dev/null)
+	CUDA_11_COMPILER:=$(wildcard $(CUDA_11_PATH)/bin/nvcc)
+	CUDA_11_LIB_DIR=$(strip $(shell ls -d $(CUDA_11_PATH)/lib64 2>/dev/null || ls -d $(CUDA_11_PATH)/lib 2>/dev/null))
+	CUDA_11_CGO_EXTRA_LDFLAGS = -L"$(CUDA_11_LIB_DIR)" -L"$(CUDA_11_LIB_DIR)/stubs"
+endif
diff --git a/make/cuda-v12-defs.make b/make/cuda-v12-defs.make
new file mode 100644
index 00000000000..f7c182b6f1d
--- /dev/null
+++ b/make/cuda-v12-defs.make
@@ -0,0 +1,17 @@
+# Common definitions for the various Makefiles which set cuda settings
+# No rules are defined here so this is safe to include at the beginning of other makefiles
+
+ifeq ($(OS),windows)
+	CUDA_PATH?=$(shell cygpath -m -s "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\" 2>/dev/null)unknown
+	CUDA_BASE_DIR := $(dir $(shell cygpath -m -s "$(CUDA_PATH)\\.." 2>/dev/null))
+	CUDA_12_PATH:=$(shell ls -d $(CUDA_BASE_DIR)/v12.? 2>/dev/null)
+	CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc.exe)
+	CUDA_12_LIB_DIR = $(strip $(shell ls -d $(CUDA_12_PATH)/bin 2>/dev/null))
+	CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_PATH)/lib/x64"
+else ifeq ($(OS),linux)
+	CUDA_PATH?=/usr/local/cuda
+	CUDA_12_PATH:=$(shell ls -d $(CUDA_PATH)-12 2>/dev/null)
+	CUDA_12_COMPILER:=$(wildcard $(CUDA_12_PATH)/bin/nvcc)
+	CUDA_12_LIB_DIR=$(strip $(shell ls -d $(CUDA_12_PATH)/lib64 2>/dev/null || ls -d $(CUDA_12_PATH)/lib 2>/dev/null))
+	CUDA_12_CGO_EXTRA_LDFLAGS = -L"$(CUDA_12_LIB_DIR)" -L"$(CUDA_12_LIB_DIR)/stubs" 
+endif
diff --git a/make/cuda.make b/make/cuda.make
new file mode 100644
index 00000000000..a40db358116
--- /dev/null
+++ b/make/cuda.make
@@ -0,0 +1,54 @@
+# Common definitions for all cuda versions
+
+ifndef GPU_RUNNER_VARIANT
+dummy:
+	$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
+endif
+
+
+GPU_RUNNER_NAME := cuda$(GPU_RUNNER_VARIANT)
+GPU_RUNNER_GO_TAGS := cuda cuda$(GPU_RUNNER_VARIANT)
+GPU_RUNNER_DRIVER_LIB_LINK := -lcuda
+GPU_RUNNER_LIBS_SHORT := cublas cudart cublasLt
+
+ifeq ($(OS),windows)
+	# On windows, nvcc uses msvc which does not support avx512vbmi avx512vnni avx512bf16, but macros can turn them on
+	GPU_VECTOR_FLAGS=$(call uc,$(filter-out avx512bf16,$(filter-out avx512vnni,$(filter-out avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)))))
+	GPU_COMPILER_EXTRA_FLAGS=$(if $(filter avx512vbmi,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VBMI__)
+	GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512vnni,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512VNNI__)
+	GPU_COMPILER_EXTRA_FLAGS+=$(if $(filter avx512bf16,$(GPU_RUNNER_CPU_FLAGS)),-D__AVX512BF16__)
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT),$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
+	GPU_COMPILER_CFLAGS = $(CFLAGS) -D_WIN32_WINNT=0x602
+	GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -D_WIN32_WINNT=0x602
+else ifeq ($(OS),linux)
+	# On linux, nvcc requires avx512 -> -mavx512f -mavx512dq -mavx512bw
+	GPU_VECTOR_FLAGS=$(if $(filter avx512,$(GPU_RUNNER_CPU_FLAGS)),avx512f avx512dq avx512bw) $(filter-out avx512,$(GPU_RUNNER_CPU_FLAGS))
+	GPU_COMPILER_EXTRA_FLAGS = -fPIC -Wno-unused-function -std=c++11
+	GPU_LIBS = $(sort $(wildcard $(addsuffix *.$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT)))))
+	GPU_COMPILER_CFLAGS = $(CFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
+	GPU_COMPILER_CXXFLAGS = $(CXXFLAGS) -Xcompiler -fPIC -D_GNU_SOURCE
+endif
+GPU_DIST_LIB_DEPS= $(sort $(addprefix $(DIST_GPU_RUNNER_DEPS_DIR)/,$(notdir $(GPU_LIBS))))
+
+GPU_RUNNER_ARCH_FLAGS := $(foreach arch,$(subst ;,$(space),$(CUDA_ARCHITECTURES)),--generate-code=arch=compute_$(arch)$(comma)code=[compute_$(arch)$(comma)sm_$(arch)]) \
+	-DGGML_CUDA_USE_GRAPHS=1
+GPU_COMPILER_CUFLAGS = \
+	$(GPU_COMPILER_EXTRA_FLAGS) \
+	-Xcompiler "$(addprefix $(CPU_FLAG_PREFIX),$(GPU_VECTOR_FLAGS))" \
+	-t2 \
+	-DGGML_CUDA_DMMV_X=32 \
+	-DGGML_CUDA_MMV_Y=1 \
+	-DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \
+	-DGGML_USE_CUDA=1 \
+	-DGGML_SHARED=1 \
+	-DGGML_BUILD=1 \
+	-DGGML_USE_LLAMAFILE \
+	-DK_QUANTS_PER_ITERATION=2 \
+	-DNDEBUG \
+	-D_GNU_SOURCE \
+	-D_XOPEN_SOURCE=600 \
+	-Wno-deprecated-gpu-targets \
+	--forward-unknown-to-host-compiler \
+	-use_fast_math \
+	-I./llama/  \
+	-O3
diff --git a/make/gpu.make b/make/gpu.make
new file mode 100644
index 00000000000..9cfb45158b8
--- /dev/null
+++ b/make/gpu.make
@@ -0,0 +1,90 @@
+# Generalized GPU runner build
+
+ifndef GPU_RUNNER_NAME
+dummy:
+	$(error This makefile is not meant to build directly, but instead included in other Makefiles that set required variables)
+endif
+
+GPU_GOFLAGS="-ldflags=-w -s \"-X=github.com/ollama/ollama/version.Version=$(VERSION)\" \"-X=github.com/ollama/ollama/llama.CpuFeatures=$(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS))\" $(EXTRA_GOLDLAGS) $(TARGET_LDFLAGS)"
+
+# TODO Unify how we handle dependencies in the dist/packaging and install flow
+# today, cuda is bundled, but rocm is split out.  Should split them each out by runner
+DIST_GPU_RUNNER_DEPS_DIR = $(DIST_LIB_DIR)
+
+
+GPU_RUNNER_LIBS = $(wildcard $(addsuffix .$(SHARED_EXT).*,$(addprefix $(GPU_LIB_DIR)/$(SHARED_PREFIX),$(GPU_RUNNER_LIBS_SHORT))))
+
+GPU_RUNNER_SRCS := \
+	llama/ggml-cuda.cu \
+	$(filter-out $(wildcard llama/ggml-cuda/fattn*.cu),$(wildcard llama/ggml-cuda/*.cu)) \
+	$(wildcard llama/ggml-cuda/template-instances/mmq*.cu) \
+	llama/ggml.c llama/ggml-backend.c llama/ggml-alloc.c llama/ggml-quants.c llama/sgemm.cpp llama/ggml-aarch64.c
+GPU_RUNNER_HDRS := \
+	$(wildcard llama/ggml-cuda/*.cuh)
+
+
+# Conditional flags and components to speed up developer builds
+ifneq ($(OLLAMA_FAST_BUILD),)
+	GPU_COMPILER_CUFLAGS += 	\
+		-DGGML_DISABLE_FLASH_ATTN
+else
+	GPU_RUNNER_SRCS += \
+		$(wildcard llama/ggml-cuda/fattn*.cu) \
+		$(wildcard llama/ggml-cuda/template-instances/fattn-wmma*.cu) \
+		$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu) \
+		$(wildcard llama/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu) \
+		$(wildcard llama/ggml-cuda/template-instances/fattn-vec*f16-f16.cu)
+endif
+
+GPU_RUNNER_OBJS := $(GPU_RUNNER_SRCS:.cu=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
+GPU_RUNNER_OBJS := $(GPU_RUNNER_OBJS:.c=.$(GPU_RUNNER_NAME).$(OBJ_EXT))
+GPU_RUNNER_OBJS := $(addprefix $(BUILD_DIR)/,$(GPU_RUNNER_OBJS:.cpp=.$(GPU_RUNNER_NAME).$(OBJ_EXT)))
+
+DIST_RUNNERS = $(addprefix $(RUNNERS_DIST_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
+BUILD_RUNNERS = $(addprefix $(RUNNERS_BUILD_DIR)/,$(addsuffix /ollama_llama_server$(EXE_EXT),$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)))
+
+
+$(GPU_RUNNER_NAME): $(BUILD_RUNNERS) 
+
+dist: $(DIST_RUNNERS)
+
+# Build targets
+$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cu
+	@-mkdir -p $(dir $@)
+	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) $(GPU_COMPILER_CUFLAGS) $(GPU_RUNNER_ARCH_FLAGS) -o $@ $<
+$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.c
+	@-mkdir -p $(dir $@)
+	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CFLAGS) -o $@ $<
+$(BUILD_DIR)/%.$(GPU_RUNNER_NAME).$(OBJ_EXT): %.cpp
+	@-mkdir -p $(dir $@)
+	$(CCACHE) $(GPU_COMPILER) -c $(GPU_COMPILER_CXXFLAGS) -o $@ $<
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): TARGET_CGO_LDFLAGS = $(CGO_EXTRA_LDFLAGS) -L"$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/"
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) ./llama/*.go ./llama/runner/*.go $(COMMON_SRCS) $(COMMON_HDRS)
+	@-mkdir -p $(dir $@)
+	GOARCH=$(ARCH) CGO_LDFLAGS="$(TARGET_CGO_LDFLAGS)" go build -buildmode=pie $(GPU_GOFLAGS) -trimpath -tags $(subst $(space),$(comma),$(GPU_RUNNER_CPU_FLAGS) $(GPU_RUNNER_GO_TAGS)) -o $@ ./cmd/runner
+$(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(GPU_RUNNER_OBJS) $(COMMON_HDRS) $(GPU_RUNNER_HDRS)
+	@-mkdir -p $(dir $@)
+	$(CCACHE) $(GPU_COMPILER) --shared -L$(GPU_LIB_DIR) $(GPU_RUNNER_DRIVER_LIB_LINK) -L${DIST_GPU_RUNNER_DEPS_DIR} $(foreach lib, $(GPU_RUNNER_LIBS_SHORT), -l$(lib)) $(GPU_RUNNER_OBJS) -o $@
+
+# Distribution targets
+$(RUNNERS_DIST_DIR)/%: $(RUNNERS_BUILD_DIR)/%
+	@-mkdir -p $(dir $@)
+	$(CP) $< $@
+$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/ollama_llama_server$(EXE_EXT): $(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT) $(GPU_DIST_LIB_DEPS)
+$(RUNNERS_DIST_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT): $(RUNNERS_BUILD_DIR)/$(GPU_RUNNER_NAME)$(GPU_RUNNER_EXTRA_VARIANT)/$(SHARED_PREFIX)ggml_$(GPU_RUNNER_NAME).$(SHARED_EXT)
+	@-mkdir -p $(dir $@)
+	$(CP) $< $@
+$(GPU_DIST_LIB_DEPS):
+	@-mkdir -p $(dir $@)
+	$(CP) $(GPU_LIB_DIR)/$(notdir $@) $(dir $@)
+
+clean: 
+	rm -f $(GPU_RUNNER_OBJS) $(BUILD_RUNNERS) $(DIST_RUNNERS)
+
+.PHONY: clean $(GPU_RUNNER_NAME)
+
+
+# Handy debugging for make variables
+print-%:
+	@echo '$*=$($*)'
+
diff --git a/make/rocm-defs.make b/make/rocm-defs.make
new file mode 100644
index 00000000000..76a11f29679
--- /dev/null
+++ b/make/rocm-defs.make
@@ -0,0 +1,9 @@
+# Common definitions for the various Makefiles which set rocm settings
+# No rules are defined here so this is safe to include at the beginning of other makefiles
+
+ifeq ($(OS),windows)
+	HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc.bin.exe)
+else ifeq ($(OS),linux)
+	HIP_PATH?=$(shell ls -d /opt/rocm 2>/dev/null)
+	HIP_COMPILER:=$(wildcard $(HIP_PATH)/bin/hipcc)
+endif
diff --git a/runners/common.go b/runners/common.go
index 19014d75c1c..287a6716a28 100644
--- a/runners/common.go
+++ b/runners/common.go
@@ -1,287 +1,117 @@
 package runners
 
 import (
-	"compress/gzip"
-	"errors"
-	"fmt"
-	"io"
-	"io/fs"
 	"log/slog"
 	"os"
 	"path/filepath"
 	"runtime"
 	"slices"
-	"strconv"
 	"strings"
 	"sync"
-	"syscall"
 
-	"golang.org/x/sync/errgroup"
+	"golang.org/x/sys/cpu"
 
-	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 )
 
-const (
-	binGlob = "*/*/*/*"
-)
-
 var (
-	lock       sync.Mutex
 	runnersDir = ""
+	once       = sync.Once{}
 )
 
-// Return the location where runners are stored
-// If runners are payloads, this will either extract them
-// or refresh them if any have disappeared due to tmp cleaners
-func Refresh(payloadFS fs.FS) (string, error) {
-	lock.Lock()
-	defer lock.Unlock()
-	var err error
+type CPUCapability uint32
 
-	// Wire up extra logging on our first load
-	if runnersDir == "" {
-		defer func() {
-			var runners []string
-			for v := range GetAvailableServers(runnersDir) {
-				runners = append(runners, v)
-			}
-			slog.Info("Dynamic LLM libraries", "runners", runners)
-			slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
-		}()
-	}
+// Override at build time when building base GPU runners
+// var GPURunnerCPUCapability = CPUCapabilityAVX
 
-	if hasPayloads(payloadFS) {
-		if runnersDir == "" {
-			runnersDir, err = extractRunners(payloadFS)
-		} else {
-			err = refreshRunners(payloadFS, runnersDir)
-		}
-	} else if runnersDir == "" {
-		runnersDir, err = locateRunners()
-	}
-
-	return runnersDir, err
-}
+const (
+	CPUCapabilityNone CPUCapability = iota
+	CPUCapabilityAVX
+	CPUCapabilityAVX2
+	// TODO AVX512
+)
 
-func Cleanup(payloadFS fs.FS) {
-	lock.Lock()
-	defer lock.Unlock()
-	if hasPayloads(payloadFS) && runnersDir != "" {
-		// We want to fully clean up the tmpdir parent of the payloads dir
-		tmpDir := filepath.Clean(filepath.Join(runnersDir, ".."))
-		slog.Debug("cleaning up", "dir", tmpDir)
-		err := os.RemoveAll(tmpDir)
-		if err != nil {
-			slog.Warn("failed to clean up", "dir", tmpDir, "err", err)
-		}
+func (c CPUCapability) String() string {
+	switch c {
+	case CPUCapabilityAVX:
+		return "avx"
+	case CPUCapabilityAVX2:
+		return "avx2"
+	default:
+		return "no vector extensions"
 	}
 }
 
-func locateRunners() (string, error) {
-	exe, err := os.Executable()
-	if err != nil {
-		return "", err
-	}
-
-	cwd, err := os.Getwd()
-	if err != nil {
-		return "", err
-	}
-
-	var paths []string
-	for _, root := range []string{filepath.Dir(exe), filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe()), cwd} {
-		paths = append(paths,
-			root,
-			filepath.Join(root, runtime.GOOS+"-"+runtime.GOARCH),
-			filepath.Join(root, "dist", runtime.GOOS+"-"+runtime.GOARCH),
-		)
+func GetCPUCapability() CPUCapability {
+	if cpu.X86.HasAVX2 {
+		return CPUCapabilityAVX2
 	}
-
-	// Try a few variations to improve developer experience when building from source in the local tree
-	for _, path := range paths {
-		candidate := filepath.Join(path, "lib", "ollama", "runners")
-		if _, err := os.Stat(candidate); err == nil {
-			return candidate, nil
-		}
+	if cpu.X86.HasAVX {
+		return CPUCapabilityAVX
 	}
-	return "", fmt.Errorf("unable to locate runners in any search path %v", paths)
+	// else fall back to the lowest common denominator (no vector extensions)
+	return CPUCapabilityNone
 }
 
-// Return true if we're carying nested payloads for the runners
-func hasPayloads(payloadFS fs.FS) bool {
-	files, err := fs.Glob(payloadFS, binGlob)
-	if err != nil || len(files) == 0 || (len(files) == 1 && strings.Contains(files[0], "placeholder")) {
-		return false
-	}
-	return true
+// Locate returns the directory where runners were found.
+// An empty string indicates only the builtin runner is present.
+func Locate() string {
+	once.Do(locateRunnersOnce)
+	return runnersDir
 }
 
-func extractRunners(payloadFS fs.FS) (string, error) {
-	cleanupTmpDirs()
-	tmpDir, err := os.MkdirTemp(envconfig.TmpDir(), "ollama")
-	if err != nil {
-		return "", fmt.Errorf("failed to generate tmp dir: %w", err)
-	}
-	// Track our pid so we can clean up orphaned tmpdirs
-	n := filepath.Join(tmpDir, "ollama.pid")
-	if err := os.WriteFile(n, []byte(strconv.Itoa(os.Getpid())), 0o644); err != nil {
-		slog.Warn("failed to write pid file", "file", n, "error", err)
-	}
-	// We create a distinct subdirectory for payloads within the tmpdir
-	// This will typically look like /tmp/ollama3208993108/runners on linux
-	rDir := filepath.Join(tmpDir, "runners")
-
-	slog.Info("extracting embedded files", "dir", rDir)
-	return rDir, refreshRunners(payloadFS, rDir)
-}
-
-func refreshRunners(payloadFS fs.FS, rDir string) error {
-	// extract or refresh server libraries
-	err := extractFiles(payloadFS, rDir, binGlob)
+// searches for runners in a prioritized set of locations
+// 1. local build, with executable at the top of the tree
+// 2. lib directory relative to executable
+func locateRunnersOnce() {
+	exe, err := os.Executable()
 	if err != nil {
-		return fmt.Errorf("extract binaries: %v", err)
+		slog.Debug("runner locate", "error", err)
 	}
-	return nil
-}
-
-// extract extracts the embedded files to the target directory
-func extractFiles(payloadFS fs.FS, targetDir string, glob string) error {
-	files, err := fs.Glob(payloadFS, glob)
-	if err != nil || len(files) == 0 {
-		// Should not happen
-		return fmt.Errorf("extractFiles called without payload present")
-	}
-
-	if err := os.MkdirAll(targetDir, 0o755); err != nil {
-		return fmt.Errorf("extractFiles could not mkdir %s: %v", targetDir, err)
-	}
-
-	g := new(errgroup.Group)
 
-	// $OS/$GOARCH/$RUNNER/$FILE
-	for _, file := range files {
-		filename := file
-
-		runner := filepath.Base(filepath.Dir(filename))
-
-		slog.Debug("extracting", "runner", runner, "payload", filename)
-
-		g.Go(func() error {
-			srcf, err := payloadFS.Open(filename)
-			if err != nil {
-				return err
-			}
-			defer srcf.Close()
-
-			src := io.Reader(srcf)
-			if strings.HasSuffix(filename, ".gz") {
-				src, err = gzip.NewReader(src)
-				if err != nil {
-					return fmt.Errorf("decompress payload %s: %v", filename, err)
-				}
-				filename = strings.TrimSuffix(filename, ".gz")
-			}
-
-			runnerDir := filepath.Join(targetDir, runner)
-			if err := os.MkdirAll(runnerDir, 0o755); err != nil {
-				return fmt.Errorf("extractFiles could not mkdir %s: %v", runnerDir, err)
-			}
-
-			base := filepath.Base(filename)
-			destFilename := filepath.Join(runnerDir, base)
-
-			_, err = os.Stat(destFilename)
-			switch {
-			case errors.Is(err, os.ErrNotExist):
-				destFile, err := os.OpenFile(destFilename, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0o755)
-				if err != nil {
-					return fmt.Errorf("write payload %s: %v", filename, err)
-				}
-				defer destFile.Close()
-				if _, err := io.Copy(destFile, src); err != nil {
-					return fmt.Errorf("copy payload %s: %v", filename, err)
-				}
-			case err != nil:
-				return fmt.Errorf("stat payload %s: %v", filename, err)
-			}
-			return nil
-		})
+	paths := []string{
+		filepath.Join(filepath.Dir(exe), "llama", "build", runtime.GOOS+"-"+runtime.GOARCH, "runners"),
+		filepath.Join(filepath.Dir(exe), envconfig.LibRelativeToExe(), "lib", "ollama", "runners"),
 	}
-
-	err = g.Wait()
-	if err != nil {
-		slog.Error("failed to extract files", "error", err)
-		// If we fail to extract, the payload dir is most likely unusable, so cleanup whatever we extracted
-		err := os.RemoveAll(targetDir)
-		if err != nil {
-			slog.Warn("failed to cleanup incomplete payload dir", "dir", targetDir, "error", err)
+	for _, path := range paths {
+		if _, err := os.Stat(path); err == nil {
+			runnersDir = path
+			slog.Debug("runners located", "dir", runnersDir)
+			return
 		}
-		return err
 	}
-	return nil
+	// Fall back to built-in
+	slog.Debug("no dynamic runners detected, using only built-in")
+	runnersDir = ""
 }
 
-// Best effort to clean up prior tmpdirs
-func cleanupTmpDirs() {
-	tmpDir := envconfig.TmpDir()
-	if tmpDir == "" {
-		tmpDir = os.TempDir()
-	}
-	matches, err := filepath.Glob(filepath.Join(tmpDir, "ollama*", "ollama.pid"))
-	if err != nil {
-		return
-	}
-
-	for _, match := range matches {
-		raw, err := os.ReadFile(match)
-		if errors.Is(err, os.ErrNotExist) {
-			slog.Debug("not a ollama runtime directory, skipping", "path", match)
-			continue
-		} else if err != nil {
-			slog.Warn("could not read ollama.pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		pid, err := strconv.Atoi(string(raw))
-		if err != nil {
-			slog.Warn("invalid pid, skipping", "path", match, "error", err)
-			continue
-		}
-
-		p, err := os.FindProcess(pid)
-		if err == nil && !errors.Is(p.Signal(syscall.Signal(0)), os.ErrProcessDone) {
-			slog.Warn("process still running, skipping", "pid", pid, "path", match)
-			continue
-		}
-
-		if err := os.Remove(match); err != nil {
-			slog.Warn("could not cleanup stale pidfile", "path", match, "error", err)
-		}
-
-		runners := filepath.Join(filepath.Dir(match), "runners")
-		if err := os.RemoveAll(runners); err != nil {
-			slog.Warn("could not cleanup stale runners", "path", runners, "error", err)
-		}
-
-		if err := os.Remove(filepath.Dir(match)); err != nil {
-			slog.Warn("could not cleanup stale tmpdir", "path", filepath.Dir(match), "error", err)
-		}
+// Return the well-known name of the builtin runner for the given platform
+func BuiltinName() string {
+	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
+		return "metal"
 	}
+	return "cpu"
 }
 
 // directory names are the name of the runner and may contain an optional
 // variant prefixed with '_' as the separator. For example, "cuda_v11" and
 // "cuda_v12" or "cpu" and "cpu_avx2". Any library without a variant is the
 // lowest common denominator
-func GetAvailableServers(payloadsDir string) map[string]string {
-	if payloadsDir == "" {
-		slog.Error("empty runner dir")
-		return nil
+func GetAvailableServers() map[string]string {
+	once.Do(locateRunnersOnce)
+
+	servers := make(map[string]string)
+	exe, err := os.Executable()
+	if err == nil {
+		servers[BuiltinName()] = exe
+	}
+
+	if runnersDir == "" {
+		return servers
 	}
 
-	// glob payloadsDir for files that start with ollama_
-	pattern := filepath.Join(payloadsDir, "*", "ollama_*")
+	// glob runnersDir for files that start with ollama_
+	pattern := filepath.Join(runnersDir, "*", "ollama_*")
 
 	files, err := filepath.Glob(pattern)
 	if err != nil {
@@ -289,96 +119,88 @@ func GetAvailableServers(payloadsDir string) map[string]string {
 		return nil
 	}
 
-	servers := make(map[string]string)
 	for _, file := range files {
 		slog.Debug("availableServers : found", "file", file)
-		servers[filepath.Base(filepath.Dir(file))] = filepath.Dir(file)
+		runnerName := filepath.Base(filepath.Dir(file))
+		// Special case for our GPU runners - if compiled with standard AVX flag
+		// detect incompatible system
+		// Custom builds will omit this and it's up to the user to ensure compatibility
+		parsed := strings.Split(runnerName, "_")
+		if len(parsed) == 3 && parsed[2] == "avx" && !cpu.X86.HasAVX {
+			slog.Info("GPU runner incompatible with host system, CPU does not have AVX", "runner", runnerName)
+			continue
+		}
+		servers[runnerName] = file
 	}
 
 	return servers
 }
 
-// serversForGpu returns a list of compatible servers give the provided GPU
-// info, ordered by performance. assumes Init() has been called
-// TODO - switch to metadata based mapping
-func ServersForGpu(info discover.GpuInfo) []string {
+// ServersForGpu returns a list of compatible servers given the provided GPU library/variant
+func ServersForGpu(requested string) []string {
 	// glob workDir for files that start with ollama_
-	availableServers := GetAvailableServers(runnersDir)
-	requested := info.Library
-	if info.Variant != discover.CPUCapabilityNone.String() {
-		requested += "_" + info.Variant
+	availableServers := GetAvailableServers()
+
+	// Short circuit if the only option is built-in
+	if _, ok := availableServers[BuiltinName()]; ok && len(availableServers) == 1 {
+		return []string{BuiltinName()}
 	}
 
+	bestCPUVariant := GetCPUCapability()
+	requestedLib := strings.Split(requested, "_")[0]
 	servers := []string{}
 
 	// exact match first
 	for a := range availableServers {
-		if a == requested {
+		short := a
+		parsed := strings.Split(a, "_")
+		if len(parsed) == 3 {
+			// Strip off optional _avx for comparison
+			short = parsed[0] + "_" + parsed[1]
+		}
+		if a == requested || short == requested {
 			servers = []string{a}
-
-			if a == "metal" {
-				return servers
-			}
-
-			break
 		}
 	}
 
-	alt := []string{}
-
-	// Then for GPUs load alternates and sort the list for consistent load ordering
-	if info.Library != "cpu" {
+	// If no exact match, then try without variant
+	if len(servers) == 0 {
+		alt := []string{}
 		for a := range availableServers {
-			if info.Library == strings.Split(a, "_")[0] && a != requested {
+			if requestedLib == strings.Split(a, "_")[0] && a != requested {
 				alt = append(alt, a)
 			}
 		}
-
 		slices.Sort(alt)
 		servers = append(servers, alt...)
 	}
 
-	if !(runtime.GOOS == "darwin" && runtime.GOARCH == "arm64") {
-		// Load up the best CPU variant if not primary requested
-		if info.Library != "cpu" {
-			variant := discover.GetCPUCapability()
-			// If no variant, then we fall back to default
-			// If we have a variant, try that if we find an exact match
-			// Attempting to run the wrong CPU instructions will panic the
-			// process
-			if variant != discover.CPUCapabilityNone {
-				for cmp := range availableServers {
-					if cmp == "cpu_"+variant.String() {
-						servers = append(servers, cmp)
-						break
-					}
-				}
-			} else {
-				servers = append(servers, "cpu")
+	// Finally append the best CPU option if found, then builtin
+	if bestCPUVariant != CPUCapabilityNone {
+		for cmp := range availableServers {
+			if cmp == "cpu_"+bestCPUVariant.String() {
+				servers = append(servers, cmp)
+				break
 			}
 		}
-
-		if len(servers) == 0 {
-			servers = []string{"cpu"}
-		}
 	}
-
+	servers = append(servers, BuiltinName())
 	return servers
 }
 
 // Return the optimal server for this CPU architecture
 func ServerForCpu() string {
 	if runtime.GOOS == "darwin" && runtime.GOARCH == "arm64" {
-		return "metal"
+		return BuiltinName()
 	}
-	variant := discover.GetCPUCapability()
-	availableServers := GetAvailableServers(runnersDir)
-	if variant != discover.CPUCapabilityNone {
+	variant := GetCPUCapability()
+	availableServers := GetAvailableServers()
+	if variant != CPUCapabilityNone {
 		for cmp := range availableServers {
 			if cmp == "cpu_"+variant.String() {
 				return cmp
 			}
 		}
 	}
-	return "cpu"
+	return BuiltinName()
 }
diff --git a/runners/runners_test.go b/runners/runners_test.go
deleted file mode 100644
index e6439448db9..00000000000
--- a/runners/runners_test.go
+++ /dev/null
@@ -1,50 +0,0 @@
-package runners
-
-import (
-	"log/slog"
-	"os"
-	"path"
-	"runtime"
-	"strings"
-	"testing"
-	"testing/fstest"
-)
-
-func TestRefreshRunners(t *testing.T) {
-	slog.SetLogLoggerLevel(slog.LevelDebug)
-
-	payloadFS := fstest.MapFS{
-		path.Join(runtime.GOOS, runtime.GOARCH, "foo", "ollama_llama_server"): {Data: []byte("hello, world\n")},
-	}
-	tmpDir, err := os.MkdirTemp("", "testing")
-	if err != nil {
-		t.Fatalf("failed to make tmp dir %s", err)
-	}
-	t.Setenv("OLLAMA_TMPDIR", tmpDir)
-	rDir, err := Refresh(payloadFS)
-	if err != nil {
-		t.Fatalf("failed to extract to %s %s", tmpDir, err)
-	}
-	if !strings.Contains(rDir, tmpDir) {
-		t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
-	}
-
-	// spot check results
-	servers := GetAvailableServers(rDir)
-	if len(servers) < 1 {
-		t.Fatalf("expected at least 1 server")
-	}
-
-	// Refresh contents
-	rDir, err = extractRunners(payloadFS)
-	if err != nil {
-		t.Fatalf("failed to extract to %s %s", tmpDir, err)
-	}
-	if !strings.Contains(rDir, tmpDir) {
-		t.Fatalf("runner dir %s was not in tmp dir %s", rDir, tmpDir)
-	}
-
-	cleanupTmpDirs()
-
-	Cleanup(payloadFS)
-}
diff --git a/scripts/build_darwin.sh b/scripts/build_darwin.sh
index fd370f48eda..595c3ce7120 100755
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -14,16 +14,14 @@ export CGO_CFLAGS=-mmacosx-version-min=11.3
 export CGO_CXXFLAGS=-mmacosx-version-min=11.3
 export CGO_LDFLAGS=-mmacosx-version-min=11.3
 
-for TARGETARCH in arm64 amd64; do
-    echo "Building Go runner darwin $TARGETARCH"
-    rm -rf llama/build
-    GOOS=darwin ARCH=$TARGETARCH GOARCH=$TARGETARCH make -C llama -j 8
-    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -o dist/ollama-darwin-$TARGETARCH
-    CGO_ENABLED=1 GOOS=darwin GOARCH=$TARGETARCH go build -trimpath -cover -o dist/ollama-darwin-$TARGETARCH-cov
-done
-
-lipo -create -output dist/ollama dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
-rm -f dist/ollama-darwin-arm64 dist/ollama-darwin-amd64
+rm -rf llama/build dist/darwin-*
+echo "Building darwin arm64"
+GOOS=darwin ARCH=arm64 GOARCH=arm64 make -j 8 dist
+echo "Building darwin amd64 with AVX enabled"
+GOOS=darwin ARCH=amd64 GOARCH=amd64 CUSTOM_CPU_FLAGS="avx" make -j 8 dist
+
+
+lipo -create -output dist/ollama dist/darwin-arm64/bin/ollama dist/darwin-amd64/bin/ollama
 if [ -n "$APPLE_IDENTITY" ]; then
     codesign --deep --force --options=runtime --sign "$APPLE_IDENTITY" --timestamp dist/ollama
 else
diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1
index 32ba765238d..0a69c60c617 100644
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -82,7 +82,7 @@ function buildOllama() {
     if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) {
         write-host "Building ollama runners"
         Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}"
-        & make -C llama -j 12
+        & make -j 12 dist
         if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
     } else {
         write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set"
diff --git a/scripts/install.sh b/scripts/install.sh
index bc7b5f58e14..9e146e508e8 100644
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -71,29 +71,20 @@ for BINDIR in /usr/local/bin /usr/bin /bin; do
 done
 OLLAMA_INSTALL_DIR=$(dirname ${BINDIR})
 
+if [ -d "$OLLAMA_INSTALL_DIR/lib/ollama" ] ; then
+    status "Cleaning up old version at $OLLAMA_INSTALL_DIR/lib/ollama"
+    $SUDO rm -rf "$OLLAMA_INSTALL_DIR/lib/ollama"
+fi
 status "Installing ollama to $OLLAMA_INSTALL_DIR"
 $SUDO install -o0 -g0 -m755 -d $BINDIR
 $SUDO install -o0 -g0 -m755 -d "$OLLAMA_INSTALL_DIR"
-if curl -I --silent --fail --location "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" >/dev/null ; then
-    status "Downloading Linux ${ARCH} bundle"
-    curl --fail --show-error --location --progress-bar \
-        "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
-        $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
-    BUNDLE=1
-    if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
-        status "Making ollama accessible in the PATH in $BINDIR"
-        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
-    fi
-else
-    status "Downloading Linux ${ARCH} CLI"
-    curl --fail --show-error --location --progress-bar -o "$TEMP_DIR/ollama"\
-    "https://ollama.com/download/ollama-linux-${ARCH}${VER_PARAM}"
-    $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $OLLAMA_INSTALL_DIR/ollama
-    BUNDLE=0
-    if [ "$OLLAMA_INSTALL_DIR/ollama" != "$BINDIR/ollama" ] ; then
-        status "Making ollama accessible in the PATH in $BINDIR"
-        $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
-    fi
+status "Downloading Linux ${ARCH} bundle"
+curl --fail --show-error --location --progress-bar \
+    "https://ollama.com/download/ollama-linux-${ARCH}.tgz${VER_PARAM}" | \
+    $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
+if [ "$OLLAMA_INSTALL_DIR/bin/ollama" != "$BINDIR/ollama" ] ; then
+    status "Making ollama accessible in the PATH in $BINDIR"
+    $SUDO ln -sf "$OLLAMA_INSTALL_DIR/ollama" "$BINDIR/ollama"
 fi
 
 # Check for NVIDIA JetPack systems with additional downloads
@@ -230,31 +221,11 @@ if ! check_gpu lspci nvidia && ! check_gpu lshw nvidia && ! check_gpu lspci amdg
 fi
 
 if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
-    if [ $BUNDLE -ne 0 ]; then
-        status "Downloading Linux ROCm ${ARCH} bundle"
-        curl --fail --show-error --location --progress-bar \
-            "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
-            $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
-
-        install_success
-        status "AMD GPU ready."
-        exit 0
-    fi
-    # Look for pre-existing ROCm v6 before downloading the dependencies
-    for search in "${HIP_PATH:-''}" "${ROCM_PATH:-''}" "/opt/rocm" "/usr/lib64"; do
-        if [ -n "${search}" ] && [ -e "${search}/libhipblas.so.2" -o -e "${search}/lib/libhipblas.so.2" ]; then
-            status "Compatible AMD GPU ROCm library detected at ${search}"
-            install_success
-            exit 0
-        fi
-    done
+    status "Downloading Linux ROCm ${ARCH} bundle"
+    curl --fail --show-error --location --progress-bar \
+        "https://ollama.com/download/ollama-linux-${ARCH}-rocm.tgz${VER_PARAM}" | \
+        $SUDO tar -xzf - -C "$OLLAMA_INSTALL_DIR"
 
-    status "Downloading AMD GPU dependencies..."
-    $SUDO rm -rf /usr/share/ollama/lib
-    $SUDO chmod o+x /usr/share/ollama
-    $SUDO install -o ollama -g ollama -m 755 -d /usr/share/ollama/lib/rocm
-    curl --fail --show-error --location --progress-bar "https://ollama.com/download/ollama-linux-amd64-rocm.tgz${VER_PARAM}" \
-        | $SUDO tar zx --owner ollama --group ollama -C /usr/share/ollama/lib/rocm .
     install_success
     status "AMD GPU ready."
     exit 0
diff --git a/server/routes.go b/server/routes.go
index 9e97565c994..7e8f664b948 100644
--- a/server/routes.go
+++ b/server/routes.go
@@ -27,7 +27,6 @@ import (
 	"golang.org/x/sync/errgroup"
 
 	"github.com/ollama/ollama/api"
-	"github.com/ollama/ollama/build"
 	"github.com/ollama/ollama/discover"
 	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/llm"
@@ -1264,13 +1263,16 @@ func Serve(ln net.Listener) error {
 		srvr.Close()
 		schedDone()
 		sched.unloadAllRunners()
-		runners.Cleanup(build.EmbedFS)
 		done()
 	}()
 
-	if _, err := runners.Refresh(build.EmbedFS); err != nil {
-		return fmt.Errorf("unable to initialize llm runners %w", err)
+	// Locate and log what runners are present at startup
+	var runnerNames []string
+	for v := range runners.GetAvailableServers() {
+		runnerNames = append(runnerNames, v)
 	}
+	slog.Info("Dynamic LLM libraries", "runners", runnerNames)
+	slog.Debug("Override detection logic by setting OLLAMA_LLM_LIBRARY")
 
 	s.sched.Run(schedCtx)