Skip to content

Commit

Permalink
[GPU] Micro sdpa (openvinotoolkit#24656)
Browse files Browse the repository at this point in the history
### Details:
- Added SDPA impl based on microkernels using internal onednn API and
related infra
 - Current limitations:
- a fused transpose must not change the order of the innermost dimension (head size).
   - is_causal = true is not supported
   - fp16 only
   - num heads dimension must be static
   - no indirect kv support
- Initial version of KV Cache + SDPA func test
- Enabled Transpose+SDPA fusion for static shape too

### Tickets:
 - CVS-141761
  • Loading branch information
vladimir-paramuzov authored Jul 1, 2024
1 parent a3d2b6a commit 2918322
Show file tree
Hide file tree
Showing 35 changed files with 2,216 additions and 79 deletions.
6 changes: 4 additions & 2 deletions src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ class Plugin : public ov::IPlugin {

std::map<std::string, std::shared_ptr<RemoteContextImpl>> get_default_contexts() const;

std::shared_ptr<ov::Model> clone_and_transform_model(const std::shared_ptr<const ov::Model>& network, const ExecutionConfig& config) const;
void transform_model(std::shared_ptr<ov::Model>& model, const ExecutionConfig& config) const;
std::shared_ptr<ov::Model> clone_and_transform_model(const std::shared_ptr<const ov::Model>& network,
const ExecutionConfig& config,
const std::shared_ptr<RemoteContextImpl>& context) const;
void transform_model(std::shared_ptr<ov::Model>& model, const ExecutionConfig& config, const std::shared_ptr<RemoteContextImpl>& context) const;
void register_primitives() const;
std::string get_device_id_from_config(const ov::AnyMap& config) const;
std::string get_device_id(const ov::AnyMap& config) const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <memory>

#include "intel_gpu/plugin/remote_context.hpp"
#include "openvino/core/model.hpp"

#include "intel_gpu/runtime/execution_config.hpp"
Expand All @@ -16,12 +17,13 @@ namespace intel_gpu {

// Applies the GPU plugin's ngraph/ov transformation pipeline to a model.
// Holds the execution config by reference and the remote context by shared
// ownership; the device info is copied out of the context's engine.
class TransformationsPipeline {
public:
    // Legacy constructor taking the device info directly (no remote context).
    // NOTE(review): `conf` is stored by reference — the ExecutionConfig must
    // outlive this pipeline object.
    explicit TransformationsPipeline(const ExecutionConfig &conf, const cldnn::device_info &device_info)
        : config(conf), device_info(device_info) {}
    // Preferred constructor: derives the device info from the remote context's
    // engine and keeps the context alive for transformations that need it.
    explicit TransformationsPipeline(const ExecutionConfig &conf, const std::shared_ptr<RemoteContextImpl>& context)
        : config(conf), m_context(context), device_info(context->get_engine().get_device_info()) {}
    // Runs the transformation passes on `func` in place.
    void apply(std::shared_ptr<ov::Model> func);

private:
    const ExecutionConfig& config;                  // non-owning; must outlive the pipeline
    std::shared_ptr<RemoteContextImpl> m_context;   // may be null when the legacy ctor is used
    cldnn::device_info device_info;                 // device capabilities snapshot
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ enum class LogLevel : int8_t {
#else
#define SEPARATE '/'
#endif
#define __FILENAME__ (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__)
#define GPU_FILENAME (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__)
#define GPU_DEBUG_IF(cond) if (cond)
#define GPU_DEBUG_CODE(...) __VA_ARGS__
#define GPU_DEBUG_DEFINE_MEM_LOGGER(stage) \
Expand All @@ -62,9 +62,9 @@ enum class LogLevel : int8_t {
#define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \
((cldnn::debug_configuration::get_instance()->verbose_color == 0) ? GPU_DEBUG_LOG_PREFIX : GPU_DEBUG_LOG_COLOR_PREFIX)
#define GPU_DEBUG_LOG_RAW(min_verbose_level) GPU_DEBUG_LOG_RAW_INT(static_cast<std::underlying_type<ov::intel_gpu::LogLevel>::type>(min_verbose_level))
#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << __FILENAME__ << ":" <<__LINE__ << ":" << __func__ << ": "
#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << GPU_FILENAME << ":" <<__LINE__ << ":" << __func__ << ": "
#define GPU_DEBUG_LOG_COLOR_PREFIX std::cout << DARK_GRAY << cldnn::debug_configuration::prefix << \
BLUE << __FILENAME__ << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET
BLUE << GPU_FILENAME << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET
#define DARK_GRAY "\033[1;30m"
#define BLUE "\033[1;34m"
#define PURPLE "\033[1;35m"
Expand Down
14 changes: 13 additions & 1 deletion src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include <string>
#include <vector>
#include <tuple>
#include <array>

namespace cldnn {
/// @addtogroup cpp_api C++ API
Expand All @@ -25,6 +24,17 @@ enum class device_type {
discrete_gpu = 1
};

/// @brief Intel GPU architecture generations, ordered from oldest to newest.
/// NOTE(review): the numeric values are cast directly to
/// kernel_selector::gpu_arch elsewhere in the plugin, so keep both enums in
/// sync and do not renumber existing entries.
enum class gpu_arch {
    unknown = 0,   ///< Architecture could not be determined
    gen9 = 1,
    gen11 = 2,
    xe_lp = 3,
    xe_hp = 4,
    xe_hpg = 5,
    xe_hpc = 6,
    xe2 = 7,
};

/// @brief Defines version of GFX IP
struct gfx_version {
uint16_t major;
Expand Down Expand Up @@ -77,6 +87,8 @@ struct device_info {
device_type dev_type; ///< Defines type of current GPU device (integrated or discrete)

gfx_version gfx_ver; ///< Defines GFX IP version
gpu_arch arch; ///< Defines arch human readable name
uint32_t ip_version; ///< Defines raw GFX IP version
uint32_t device_id; ///< ID of current GPU
uint32_t num_slices; ///< Number of slices
uint32_t num_sub_slices_per_slice; ///< Number of subslices in a slice
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,9 @@ struct kernel_string {
std::string options;
std::string entry_point;
bool batch_compilation;
bool has_microkernels;

kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false) {}
kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false), has_microkernels(false) {}

std::string get_str() const { return str + jit + undefs + options + entry_point; }
size_t get_hash() const { return std::hash<std::string>()(get_str()); }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "intel_gpu/graph/program.hpp"

#include "kernel_selector_helper.h"
#include "intel_gpu/runtime/device_info.hpp"
#include "kernel_selector_params.h"
#include "to_string_utils.h"
#include "program_node.h"
Expand Down Expand Up @@ -32,7 +33,6 @@
#include "intel_gpu/primitives/extract_image_patches.hpp"

#include "activation_inst.h"
#include "depth_to_space_inst.h"
#include "eltwise_inst.h"
#include "quantize_inst.h"
#include "reorder_inst.h"
Expand All @@ -44,9 +44,9 @@
#include "kernel_selector/kernels/reorder/reorder_kernel_base.h"

#include "runtime/kernels_cache.hpp"
#include "kernel_base.h"

#include <string>
#include <type_traits>
#include <vector>

namespace {
Expand Down Expand Up @@ -119,6 +119,48 @@ bool query_local_block_io_supported(engine& e, const ExecutionConfig& config) {

namespace cldnn {

// Checks whether microkernel-based implementations can be used with the given
// engine/config pair by JIT-compiling a tiny probe kernel that exercises the
// required inline-vISA features. A successful build means "supported".
// The (expensive) probe result is cached per physical device; the whole query
// is serialized by a mutex, which also guards the cache.
bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config) {
    auto device = e.get_device().get();

    static std::mutex m;
    std::lock_guard<std::mutex> lock(m);
    static std::map<cldnn::device*, bool> cache;
    // Single lookup instead of find() + at().
    auto cached = cache.find(device);
    if (cached != cache.end()) {
        return cached->second;
    }

    auto kernel_string = std::make_shared<kernel_selector::KernelString>();
    // This program checks that all required vISA features are supported by the current IGC version.
    const char* kernel_code = R""""(
        kernel void igc_check() {
            __asm__ volatile(
                ".decl AA0 v_type=G type=ud num_elts=1\n"
                ".decl AA1 v_type=G type=ud num_elts=1\n"
                ".implicit_PSEUDO_INPUT AA0 offset=256 size=4\n"
                ".implicit_PSEUDO_INPUT AA1 offset=256 size=4\n"
                "mov (M1_NM,1) AA0(0,0)<1> AA1(0,0)<0;1,0>\n"
            );
        }
    )"""";

    kernel_string->str = kernel_code;
    kernel_string->options = "";
    kernel_string->entry_point = "igc_check";
    kernel_string->batch_compilation = true;

    bool supported = false;
    try {
        cldnn::kernel_impl_params dummy_params;
        // Build through a throw-away kernels_cache; any compilation failure
        // means the required vISA features are unavailable on this stack.
        auto kernels_cache_query = std::make_unique<cldnn::kernels_cache>(e, config, 0);
        kernels_cache_query->add_kernels_source(dummy_params, {kernel_string}, false);
        kernels_cache_query->build_all();
        supported = true;
    } catch (const std::exception&) {
        supported = false;
    }
    cache[device] = supported;
    return supported;
}

kernel_selector::data_type to_data_type(data_types dt) {
switch (dt) {
case cldnn::data_types::i4:
Expand Down Expand Up @@ -1081,6 +1123,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.bOptHintsSupport = false;

params.engineInfo.bLocalBlockIOSupport = query_local_block_io_supported(engine, config);
params.engineInfo.supports_microkernels = query_microkernels_supported(engine, config);
params.engineInfo.deviceType = get_device_type(device_info.dev_type);
params.engineInfo.maxWorkGroupSize = device_info.max_work_group_size;
params.engineInfo.maxLocalMemSize = device_info.max_local_mem_size;
Expand All @@ -1092,6 +1135,8 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.driverVersion = device_info.driver_version;
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
params.engineInfo.vendor_id = device_info.vendor_id;
params.engineInfo.ip_version = device_info.ip_version;
params.engineInfo.arch = kernel_selector::gpu_arch(static_cast<std::underlying_type<gpu_arch>::type>(device_info.arch));

auto impl_forcing = config.get_property(ov::intel_gpu::force_implementations);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -294,4 +294,6 @@ inline void update_shapes(kernel_selector::Params& p, const kernel_impl_params&
}
}

bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config);

} // namespace cldnn
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive<scaled_dot_prod
// buffers number and its' sizes (since update_dispatch_data is called for both kernels too), and
// do not double memory allocations during reallocate_if_needed() function call
std::vector<layout> layouts;
if (_kernels_data.size() > 0) {
if (_kernels_data.size() > 0 && !_kernels_data[0].internalBufferSizes.empty()) {
auto dtype = from_data_type(_kernels_data[0].internalBufferDataType);
const auto bpp = data_type_traits::size_of(dtype);
for (auto size : _kernels_data[0].internalBufferSizes) {
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ void program::init_program() {
if (_task_executor == nullptr)
_task_executor = program::make_task_executor(_config);
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, _config, prog_id, _task_executor,
kernel_selector::KernelBase::get_db().get_batch_header_str()));
kernel_selector::KernelBase::get_db().get_batch_headers()));

if (!_compilation_context)
_compilation_context = program::make_compilation_context(_config);
Expand Down
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ target_include_directories(${TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${INCLUDE_DIR
target_compile_options(${TARGET_NAME} PRIVATE
$<$<CONFIG:Release>:$<IF:$<CXX_COMPILER_ID:MSVC>,/Os,-Os>>)

# Microkernel-based GPU kernels (e.g. micro SDPA) use internal oneDNN API,
# so link the oneDNN GPU target whenever oneDNN support is enabled.
if (ENABLE_ONEDNN_FOR_GPU)
    target_link_libraries(${TARGET_NAME} PRIVATE onednn_gpu_tgt)
endif()

if(COMMAND add_cpplint_target)
add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
endif()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_INTEL_OCL_GENERIC_VECTOR_OPS_H
#define GPU_INTEL_OCL_GENERIC_VECTOR_OPS_H

// 1-element vector aliases: OpenCL C has no built-in half1/uint1/float1, so
// declare them via clang's ext_vector_type attribute, letting scalar-width and
// vector-width code share the same generic helpers below.
typedef half __attribute__((ext_vector_type(1))) half1;
typedef uint __attribute__((ext_vector_type(1))) uint1;
typedef float __attribute__((ext_vector_type(1))) float1;

// vmad: element-wise multiply-add (a * b + c) for float vectors of any
// supported width. The 1-wide overload indexes element 0 and calls the scalar
// mad() builtin — presumably because the builtin has no overload for
// 1-element ext_vector types.
float1 __attribute__((overloadable)) vmad(float1 a, float1 b, float1 c) {
    c[0] = mad(a[0], b[0], c[0]);
    return c;
}
float2 __attribute__((overloadable)) vmad(float2 a, float2 b, float2 c) {
    return mad(a, b, c);
}
float4 __attribute__((overloadable)) vmad(float4 a, float4 b, float4 c) {
    return mad(a, b, c);
}
float8 __attribute__((overloadable)) vmad(float8 a, float8 b, float8 c) {
    return mad(a, b, c);
}
float16 __attribute__((overloadable)) vmad(float16 a, float16 b, float16 c) {
    return mad(a, b, c);
}

// native_vrecip: element-wise reciprocal via the fast native_recip builtin
// (implementation-defined precision per the OpenCL C spec). The 1-wide
// overload forwards element 0 to the scalar builtin.
float1 __attribute__((overloadable)) native_vrecip(float1 x) {
    x[0] = native_recip(x[0]);
    return x;
}
float2 __attribute__((overloadable)) native_vrecip(float2 x) {
    return native_recip(x);
}
float4 __attribute__((overloadable)) native_vrecip(float4 x) {
    return native_recip(x);
}
float8 __attribute__((overloadable)) native_vrecip(float8 x) {
    return native_recip(x);
}
float16 __attribute__((overloadable)) native_vrecip(float16 x) {
    return native_recip(x);
}

// native_vexp2: element-wise base-2 exponential via the fast native_exp2
// builtin (implementation-defined precision per the OpenCL C spec). The
// 1-wide overload forwards element 0 to the scalar builtin.
float1 __attribute__((overloadable)) native_vexp2(float1 x) {
    x[0] = native_exp2(x[0]);
    return x;
}
float2 __attribute__((overloadable)) native_vexp2(float2 x) {
    return native_exp2(x);
}
float4 __attribute__((overloadable)) native_vexp2(float4 x) {
    return native_exp2(x);
}
float8 __attribute__((overloadable)) native_vexp2(float8 x) {
    return native_exp2(x);
}
float16 __attribute__((overloadable)) native_vexp2(float16 x) {
    return native_exp2(x);
}

#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_OCL_SDPA_UTILS_H
#define GPU_OCL_SDPA_UTILS_H

// Linear offset of element (x0, x1, x2, x3) in a 4D blocked layout described
// by per-tensor macros: for each dim i, tag##_B<i> is the block size,
// tag##_SB<i> the stride within a block, and tag##_S<i> the stride between
// blocks. NOTE(review): B/SB/S semantics inferred from the div/mod pattern —
// confirm against the host-side code that injects these defines.
#define _4D_OFF(tag, x0, x1, x2, x3) \
(((x0) % tag##_B0) * tag##_SB0 + ((x0) / tag##_B0) * tag##_S0 \
+ ((x1) % tag##_B1) * tag##_SB1 + ((x1) / tag##_B1) * tag##_S1 \
+ ((x2) % tag##_B2) * tag##_SB2 + ((x2) / tag##_B2) * tag##_S2 \
+ ((x3) % tag##_B3) * tag##_SB3 + ((x3) / tag##_B3) * tag##_S3)

// Per-tensor convenience wrappers over _4D_OFF for the SDPA inputs.
#define QRY_OFF(x0, x1, x2, x3) _4D_OFF(QRY, x0, x1, x2, x3)
#define KEY_OFF(x0, x1, x2, x3) _4D_OFF(KEY, x0, x1, x2, x3)
#define VAL_OFF(x0, x1, x2, x3) _4D_OFF(VAL, x0, x1, x2, x3)
#define MSK_OFF(x0, x1, x2, x3) _4D_OFF(MSK, x0, x1, x2, x3)

// NOTE(review): DST_OFF accepts five indices but only x0/x1 contribute to the
// offset — presumably the destination is addressed as 2D while keeping a
// 5-argument signature for call-site parity; confirm against the kernels
// that use it.
#define DST_OFF(x0, x1, d, h, w) \
(((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \
+ ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1)

#endif
Loading

0 comments on commit 2918322

Please sign in to comment.