sycl: Use syclcompat::dp4a #10267

Merged 5 commits on Nov 15, 2024 (changes shown below are from 3 commits)
2 changes: 1 addition & 1 deletion .github/workflows/build.yml
@@ -917,7 +917,7 @@ jobs:
shell: bash

env:
- WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
+ WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/b380d914-366b-4b77-a74a-05e3c38b3514/intel-oneapi-base-toolkit-2025.0.0.882_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
2 changes: 2 additions & 0 deletions docs/backend/SYCL.md
@@ -41,6 +41,8 @@ The following release is verified with good quality:

## News

+ - 2024.11
+   - Use syclcompat to improve the performance on some backends. This requires to use oneAPI 2025.0 or more recent.
Collaborator:

"some backends" sounds strange coming from the SYCL backend itself. Maybe "some platforms" or "some GPUs".

"more recent" -> "newer"

Collaborator (Author):

Reworded this in 1c16516


- 2024.8
- Use oneDNN as the default GEMM library, improve the compatibility for new Intel GPUs.
27 changes: 0 additions & 27 deletions ggml/src/ggml-sycl/dpct/helper.hpp
@@ -1830,33 +1830,6 @@ namespace dpct
: id);
}

- template <typename T>
- sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
- {
-     return sycl::vec<T, 1>(val)
-         .template as<sycl::vec<
-             std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
-         .template convert<T>();
- }
-
- template <typename T1, typename T2>
- using dot_product_acc_t =
-     std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
-                        uint32_t, int32_t>;
-
- template <typename T1, typename T2, typename T3>
- inline auto dp4a(T1 a, T2 b, T3 c)
- {
-     dot_product_acc_t<T1, T2> res = c;
-     auto va = extract_and_sign_or_zero_extend4(a);
-     auto vb = extract_and_sign_or_zero_extend4(b);
-     res += va[0] * vb[0];
-     res += va[1] * vb[1];
-     res += va[2] * vb[2];
-     res += va[3] * vb[3];
-     return res;
- }

struct sub_sat
{
template <typename T>

Review thread on the removed dp4a():

Collaborator:

I suggest replacing the dp4a() implementation with syclcompat::dp4a():

  1. No code changes in other modules.
  2. Easy to optimize for different cases in the future if needed.

Collaborator @Alcpz, Nov 13, 2024:

We tried this approach some time ago in a different PR, but it was closed because faster implementations require asm and intrinsics for every backend, and we agreed to limit ourselves to pure SYCL code. Right now, there is no way to get visibility of int intrinsics (dp4a equivalents), and the syclcompat layer shipped as part of oneAPI is trying to bridge that gap (and others) until they are made available through SYCL or an extension. With this approach, backend-specific improvements are kept out of the app itself.

Do you think we could use this PR to agree on what to do with regard to syclcompat? The main problem is that dp4a is a major performance gap compared with other backends, due to the software implementation.

Collaborator @NeoZhangJianyu, Nov 14, 2024:

I think I didn't make my idea clear. I mean that dpct::dp4a() should call syclcompat::dp4a() directly. Other modules would still call dpct::dp4a(), but the code path would forward to syclcompat::dp4a().

Since there is no test data for Intel GPUs yet, if performance turns out to be bad we can add a code branch in dpct::dp4a() that keeps the old code for Intel GPUs.

If all modules call syclcompat::dp4a() directly, as in this PR, it becomes complex to implement the case with more branches later.

Collaborator @Alcpz, Nov 14, 2024:

We have to be careful with branching inside dp4a though, as we would introduce branching inside the kernels. Thanks for the clarification!

Collaborator (Author):

As long as we don't add any branching, I'm fine with wrapping syclcompat::dp4a inside dpct::dp4a. This is done in 3eff3c3. I hope this is what you meant.
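For reference, a minimal sketch of the wrapper agreed on in the thread above, assuming commit 3eff3c3 takes the straightforward forwarding form described there (the actual commit may differ). dpct::dp4a keeps its signature, so call sites in other modules stay unchanged, while the implementation comes from syclcompat, which ships with oneAPI 2025.0:

    namespace dpct {
        // Sketch only: forward dpct::dp4a to syclcompat::dp4a. Deliberately
        // no branching here, per the review discussion, so no divergence is
        // introduced inside kernels; a platform-specific path could be added
        // later if measurements on Intel GPUs call for it.
        template <typename T1, typename T2, typename T3>
        inline auto dp4a(T1 a, T2 b, T3 c) {
            return syclcompat::dp4a(a, b, c);
        }
    } // namespace dpct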
18 changes: 9 additions & 9 deletions ggml/src/ggml-sycl/mmq.cpp
@@ -575,8 +575,8 @@ vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,

#pragma unroll
for (int i = i0; i < i0 + QI8_1/2; ++i) {
- sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
- sumi_m = dpct::dp4a(m, u[i],
+ sumi_d_sc = syclcompat::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
+ sumi_m = syclcompat::dp4a(m, u[i],
sumi_m); // multiply sum of q8_1 values with m
}

@@ -730,7 +730,7 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
int sumi_sc = 0;

for (int i = i0; i < i0 + QI8_1/2; ++i) {
- sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+ sumi_sc = syclcompat::dp4a(v[i], u[i], sumi_sc); // SIMD dot product
}

sumi += sumi_sc * scales[i0 / (QI8_1/2)];
@@ -873,7 +873,7 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(

#pragma unroll
for (int j = 0; j < QI8_1; ++j) {
- sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F,
+ sumi_d = syclcompat::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F,
u[i * QI8_1 + j], sumi_d); // SIMD dot product
}

@@ -1018,7 +1018,7 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(

#pragma unroll
for (int j = 0; j < QI8_1; ++j) {
- sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j],
+ sumi_d = syclcompat::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j],
sumi_d); // SIMD dot product
}

@@ -1156,14 +1156,14 @@ vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,

#pragma unroll
for (int i = i0; i < i0 + 2; ++i) {
- sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0],
+ sumi_d.x() = syclcompat::dp4a(v[2 * i + 0], u[2 * i + 0],
sumi_d.x()); // SIMD dot product
- sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1],
+ sumi_d.x() = syclcompat::dp4a(v[2 * i + 1], u[2 * i + 1],
sumi_d.x()); // SIMD dot product

- sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4],
+ sumi_d.y() = syclcompat::dp4a(v[2 * i + 4], u[2 * i + 4],
sumi_d.y()); // SIMD dot product
- sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5],
+ sumi_d.y() = syclcompat::dp4a(v[2 * i + 5], u[2 * i + 5],
sumi_d.y()); // SIMD dot product
}
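For context on what these dp4a calls compute: each one treats its two 32-bit integer operands as four packed 8-bit lanes, multiplies them lane-wise, and accumulates the four products into the 32-bit argument c, which is what the "SIMD dot product" comments refer to. Below is a standalone scalar reference, an illustrative sketch rather than code from the PR (dp4a_ref, main, and the test values are invented for this example, and only signed lanes are shown; the removed dpct helper also handled unsigned operands via dot_product_acc_t):

    #include <cstdint>
    #include <cstdio>

    // Scalar reference for dp4a on signed 8-bit lanes: unpack each 32-bit
    // operand into four bytes, multiply lane-wise, and accumulate into c.
    static int32_t dp4a_ref(int32_t a, int32_t b, int32_t c) {
        for (int i = 0; i < 4; ++i) {
            const int8_t ai = static_cast<int8_t>(a >> (8 * i));
            const int8_t bi = static_cast<int8_t>(b >> (8 * i));
            c += static_cast<int32_t>(ai) * static_cast<int32_t>(bi);
        }
        return c;
    }

    int main() {
        // Lanes of a are 4, 3, 2, 1 and lanes of b are all 1, so the four
        // lane products sum to 10; starting from c = 5, this prints 15.
        printf("%d\n", dp4a_ref(0x01020304, 0x01010101, 5));
        return 0;
    }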
