From 29183228c18b591fc5e4e6d70620e10639183f51 Mon Sep 17 00:00:00 2001
From: Vladimir Paramuzov
Date: Mon, 1 Jul 2024 18:29:21 +0400
Subject: [PATCH] [GPU] Micro sdpa (#24656)

### Details:
- Added an SDPA implementation based on microkernels using the internal oneDNN API and related infrastructure
- Current limitations:
  - the fused transpose must not change the order of the innermost dim (head size)
  - is_causal = true is not supported
  - fp16 only
  - the num heads dimension must be static
  - no indirect KV support
- Initial version of the KV Cache + SDPA functional test
- Enabled Transpose+SDPA fusion for static shapes as well

(Illustrative sketches of the online-softmax recurrence and of the applicability gating are appended after the diff excerpt below.)

### Tickets:
- CVS-141761
---
 .../include/intel_gpu/plugin/plugin.hpp       |   6 +-
 .../plugin/transformations_pipeline.hpp       |   6 +-
 .../intel_gpu/runtime/debug_configuration.hpp |   6 +-
 .../include/intel_gpu/runtime/device_info.hpp |  14 +-
 .../include/intel_gpu/runtime/kernel_args.hpp |   3 +-
 .../impls/ocl/kernel_selector_helper.cpp      |  49 +-
 .../graph/impls/ocl/kernel_selector_helper.h  |   2 +
 .../ocl/scaled_dot_product_attention.cpp      |   2 +-
 src/plugins/intel_gpu/src/graph/program.cpp   |   2 +-
 .../src/kernel_selector/CMakeLists.txt        |   4 +
 .../batch_headers/generic_vector_ops.cl       |  75 +++
 .../include/batch_headers/sdpa_utils.cl       |  35 +
 .../include/batch_headers/tile_ops.cl         | 628 +++++++++++++++++
 .../include/batch_headers/vec_typedefs.cl     |  16 +-
 .../kernel_selector/cl_kernels/sdpa_micro.cl  | 461 +++++++++++++
 .../kernel_selector/kernel_selector_common.h  |   5 +
 .../kernel_selector/kernel_selector_params.h  |  18 +
 .../kernels/sdpa/sdpa_kernel_micro.cpp        | 630 ++++++++++++++++++
 .../kernels/sdpa/sdpa_kernel_micro.h          |  47 ++
 .../kernels/sdpa/sdpa_kernel_opt.cpp          |   4 +-
 .../kernels/sdpa/sdpa_kernel_selector.cpp     |   4 +
 .../src/kernel_selector/micro_utils.hpp       |  58 ++
 .../src/kernel_selector/primitive_db.cpp      |   2 +-
 .../src/kernel_selector/primitive_db.h        |   4 +-
 .../src/kernel_selector/primitive_db_gen.py   |   7 +-
 .../ops/scaled_dot_product_attention.cpp      |   9 +-
 src/plugins/intel_gpu/src/plugin/plugin.cpp   |  23 +-
 .../transformations/transpose_fusion.cpp      |  14 +-
 .../src/plugin/transformations_pipeline.cpp   |  11 +-
 .../intel_gpu/src/runtime/kernels_cache.cpp   |  58 +-
 .../intel_gpu/src/runtime/kernels_cache.hpp   |  26 +-
 .../intel_gpu/src/runtime/ocl/ocl_device.cpp  |  43 +-
 .../intel_gpu/src/runtime/ocl/ocl_wrapper.hpp |   2 +
 .../tests/common/subgraphs_builders.hpp       |   2 +-
 .../intel_gpu/thirdparty/CMakeLists.txt       |  19 +-
 35 files changed, 2216 insertions(+), 79 deletions(-)
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/generic_vector_ops.cl
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/sdpa_utils.cl
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/tile_ops.cl
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.h
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
index a5ea28b95f78ba..28a20fa737da76 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
@@ -26,8 +26,10 @@ class Plugin : public ov::IPlugin {
     std::map> get_default_contexts() const;
-    std::shared_ptr
clone_and_transform_model(const std::shared_ptr& network, const ExecutionConfig& config) const; - void transform_model(std::shared_ptr& model, const ExecutionConfig& config) const; + std::shared_ptr clone_and_transform_model(const std::shared_ptr& network, + const ExecutionConfig& config, + const std::shared_ptr& context) const; + void transform_model(std::shared_ptr& model, const ExecutionConfig& config, const std::shared_ptr& context) const; void register_primitives() const; std::string get_device_id_from_config(const ov::AnyMap& config) const; std::string get_device_id(const ov::AnyMap& config) const; diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/transformations_pipeline.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/transformations_pipeline.hpp index f0c0361ce72077..365deb66ad2a4d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/transformations_pipeline.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/transformations_pipeline.hpp @@ -6,6 +6,7 @@ #include +#include "intel_gpu/plugin/remote_context.hpp" #include "openvino/core/model.hpp" #include "intel_gpu/runtime/execution_config.hpp" @@ -16,12 +17,13 @@ namespace intel_gpu { class TransformationsPipeline { public: - explicit TransformationsPipeline(const ExecutionConfig &conf, const cldnn::device_info &device_info) - : config(conf), device_info(device_info) {} + explicit TransformationsPipeline(const ExecutionConfig &conf, const std::shared_ptr& context) + : config(conf), m_context(context), device_info(context->get_engine().get_device_info()) {} void apply(std::shared_ptr func); private: const ExecutionConfig& config; + std::shared_ptr m_context; cldnn::device_info device_info; }; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index b3220136c6c8a8..d9a30e3244b4ae 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -48,7 +48,7 @@ enum class LogLevel : int8_t { #else #define SEPARATE '/' #endif -#define __FILENAME__ (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__) +#define GPU_FILENAME (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__) #define GPU_DEBUG_IF(cond) if (cond) #define GPU_DEBUG_CODE(...) __VA_ARGS__ #define GPU_DEBUG_DEFINE_MEM_LOGGER(stage) \ @@ -62,9 +62,9 @@ enum class LogLevel : int8_t { #define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \ ((cldnn::debug_configuration::get_instance()->verbose_color == 0) ? 
GPU_DEBUG_LOG_PREFIX : GPU_DEBUG_LOG_COLOR_PREFIX) #define GPU_DEBUG_LOG_RAW(min_verbose_level) GPU_DEBUG_LOG_RAW_INT(static_cast::type>(min_verbose_level)) -#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << __FILENAME__ << ":" <<__LINE__ << ":" << __func__ << ": " +#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << GPU_FILENAME << ":" <<__LINE__ << ":" << __func__ << ": " #define GPU_DEBUG_LOG_COLOR_PREFIX std::cout << DARK_GRAY << cldnn::debug_configuration::prefix << \ - BLUE << __FILENAME__ << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET + BLUE << GPU_FILENAME << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET #define DARK_GRAY "\033[1;30m" #define BLUE "\033[1;34m" #define PURPLE "\033[1;35m" diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp index 7a18cd311e1fd4..0c83877851b48b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp @@ -10,7 +10,6 @@ #include #include #include -#include namespace cldnn { /// @addtogroup cpp_api C++ API @@ -25,6 +24,17 @@ enum class device_type { discrete_gpu = 1 }; +enum class gpu_arch { + unknown = 0, + gen9 = 1, + gen11 = 2, + xe_lp = 3, + xe_hp = 4, + xe_hpg = 5, + xe_hpc = 6, + xe2 = 7, +}; + /// @brief Defines version of GFX IP struct gfx_version { uint16_t major; @@ -77,6 +87,8 @@ struct device_info { device_type dev_type; ///< Defines type of current GPU device (integrated or discrete) gfx_version gfx_ver; ///< Defines GFX IP version + gpu_arch arch; ///< Defines arch human readable name + uint32_t ip_version; ///< Defines raw GFX IP version uint32_t device_id; ///< ID of current GPU uint32_t num_slices; ///< Number of slices uint32_t num_sub_slices_per_slice; ///< Number of subslices in a slice diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp index 6bce66e64ef808..09dfcf68f05725 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp @@ -121,8 +121,9 @@ struct kernel_string { std::string options; std::string entry_point; bool batch_compilation; + bool has_microkernels; - kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false) {} + kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false), has_microkernels(false) {} std::string get_str() const { return str + jit + undefs + options + entry_point; } size_t get_hash() const { return std::hash()(get_str()); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 12e20536407d45..1f492e14c9fc7e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -5,6 +5,7 @@ #include "intel_gpu/graph/program.hpp" #include "kernel_selector_helper.h" +#include "intel_gpu/runtime/device_info.hpp" #include "kernel_selector_params.h" #include "to_string_utils.h" #include "program_node.h" @@ -32,7 +33,6 @@ #include "intel_gpu/primitives/extract_image_patches.hpp" #include "activation_inst.h" -#include "depth_to_space_inst.h" #include "eltwise_inst.h" 
#include "quantize_inst.h" #include "reorder_inst.h" @@ -44,9 +44,9 @@ #include "kernel_selector/kernels/reorder/reorder_kernel_base.h" #include "runtime/kernels_cache.hpp" -#include "kernel_base.h" #include +#include #include namespace { @@ -119,6 +119,48 @@ bool query_local_block_io_supported(engine& e, const ExecutionConfig& config) { namespace cldnn { +bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config) { + auto device = e.get_device().get(); + + static std::mutex m; + std::lock_guard lock(m); + static std::map cache; + if (cache.find(device) != cache.end()) { + return cache.at(device); + } + + std::shared_ptr kernel_string = std::make_shared(); + // This program check that all required vISA features are supported by current IGC version + const char* kernel_code = R""""( + kernel void igc_check() { + __asm__ volatile( + ".decl AA0 v_type=G type=ud num_elts=1\n" + ".decl AA1 v_type=G type=ud num_elts=1\n" + ".implicit_PSEUDO_INPUT AA0 offset=256 size=4\n" + ".implicit_PSEUDO_INPUT AA1 offset=256 size=4\n" + "mov (M1_NM,1) AA0(0,0)<1> AA1(0,0)<0;1,0>\n" + ); + } + )""""; + + kernel_string->str = kernel_code; + kernel_string->options = ""; + kernel_string->entry_point = "igc_check"; + kernel_string->batch_compilation = true; + + try { + cldnn::kernel_impl_params dummy_params; + auto _kernels_cache_device_query = std::unique_ptr(new cldnn::kernels_cache(e, config, 0)); + _kernels_cache_device_query->add_kernels_source(dummy_params, {kernel_string}, false); + _kernels_cache_device_query->build_all(); + cache[device] = true; + } catch (std::exception&) { + cache[device] = false; + } + + return cache.at(device); +} + kernel_selector::data_type to_data_type(data_types dt) { switch (dt) { case cldnn::data_types::i4: @@ -1081,6 +1123,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p params.engineInfo.bOptHintsSupport = false; params.engineInfo.bLocalBlockIOSupport = query_local_block_io_supported(engine, config); + params.engineInfo.supports_microkernels = query_microkernels_supported(engine, config); params.engineInfo.deviceType = get_device_type(device_info.dev_type); params.engineInfo.maxWorkGroupSize = device_info.max_work_group_size; params.engineInfo.maxLocalMemSize = device_info.max_local_mem_size; @@ -1092,6 +1135,8 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p params.engineInfo.driverVersion = device_info.driver_version; params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes; params.engineInfo.vendor_id = device_info.vendor_id; + params.engineInfo.ip_version = device_info.ip_version; + params.engineInfo.arch = kernel_selector::gpu_arch(static_cast::type>(device_info.arch)); auto impl_forcing = config.get_property(ov::intel_gpu::force_implementations); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h index de746e94661ff7..67e2127a064001 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h @@ -294,4 +294,6 @@ inline void update_shapes(kernel_selector::Params& p, const kernel_impl_params& } } +bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config); + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp 
index 681237b2bcd02b..4bc6e16ce55fa9 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -58,7 +58,7 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive layouts; - if (_kernels_data.size() > 0) { + if (_kernels_data.size() > 0 && !_kernels_data[0].internalBufferSizes.empty()) { auto dtype = from_data_type(_kernels_data[0].internalBufferDataType); const auto bpp = data_type_traits::size_of(dtype); for (auto size : _kernels_data[0].internalBufferSizes) { diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 4b0cc049013ada..1c526ea38188d2 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -223,7 +223,7 @@ void program::init_program() { if (_task_executor == nullptr) _task_executor = program::make_task_executor(_config); _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, - kernel_selector::KernelBase::get_db().get_batch_header_str())); + kernel_selector::KernelBase::get_db().get_batch_headers())); if (!_compilation_context) _compilation_context = program::make_compilation_context(_config); diff --git a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt index 52c3fb052ff093..993ef596a21256 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt @@ -67,6 +67,10 @@ target_include_directories(${TARGET_NAME} PUBLIC $:$,/Os,-Os>>) +if (ENABLE_ONEDNN_FOR_GPU) + target_link_libraries(${TARGET_NAME} PRIVATE onednn_gpu_tgt) +endif() + if(COMMAND add_cpplint_target) add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}) endif() diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/generic_vector_ops.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/generic_vector_ops.cl new file mode 100644 index 00000000000000..b9a3df7cc863ea --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/generic_vector_ops.cl @@ -0,0 +1,75 @@ +/******************************************************************************* + * Copyright 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/ + +#ifndef GPU_INTEL_OCL_GENERIC_VECTOR_OPS_H +#define GPU_INTEL_OCL_GENERIC_VECTOR_OPS_H + +typedef half __attribute__((ext_vector_type(1))) half1; +typedef uint __attribute__((ext_vector_type(1))) uint1; +typedef float __attribute__((ext_vector_type(1))) float1; + +float1 __attribute__((overloadable)) vmad(float1 a, float1 b, float1 c) { + c[0] = mad(a[0], b[0], c[0]); + return c; +} +float2 __attribute__((overloadable)) vmad(float2 a, float2 b, float2 c) { + return mad(a, b, c); +} +float4 __attribute__((overloadable)) vmad(float4 a, float4 b, float4 c) { + return mad(a, b, c); +} +float8 __attribute__((overloadable)) vmad(float8 a, float8 b, float8 c) { + return mad(a, b, c); +} +float16 __attribute__((overloadable)) vmad(float16 a, float16 b, float16 c) { + return mad(a, b, c); +} + +float1 __attribute__((overloadable)) native_vrecip(float1 x) { + x[0] = native_recip(x[0]); + return x; +} +float2 __attribute__((overloadable)) native_vrecip(float2 x) { + return native_recip(x); +} +float4 __attribute__((overloadable)) native_vrecip(float4 x) { + return native_recip(x); +} +float8 __attribute__((overloadable)) native_vrecip(float8 x) { + return native_recip(x); +} +float16 __attribute__((overloadable)) native_vrecip(float16 x) { + return native_recip(x); +} + +float1 __attribute__((overloadable)) native_vexp2(float1 x) { + x[0] = native_exp2(x[0]); + return x; +} +float2 __attribute__((overloadable)) native_vexp2(float2 x) { + return native_exp2(x); +} +float4 __attribute__((overloadable)) native_vexp2(float4 x) { + return native_exp2(x); +} +float8 __attribute__((overloadable)) native_vexp2(float8 x) { + return native_exp2(x); +} +float16 __attribute__((overloadable)) native_vexp2(float16 x) { + return native_exp2(x); +} + +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/sdpa_utils.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/sdpa_utils.cl new file mode 100644 index 00000000000000..5943f23251bb7a --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/sdpa_utils.cl @@ -0,0 +1,35 @@ +/******************************************************************************* + * Copyright 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/ + +#ifndef GPU_OCL_SDPA_UTILS_H +#define GPU_OCL_SDPA_UTILS_H + +#define _4D_OFF(tag, x0, x1, x2, x3) \ + (((x0) % tag##_B0) * tag##_SB0 + ((x0) / tag##_B0) * tag##_S0 \ + + ((x1) % tag##_B1) * tag##_SB1 + ((x1) / tag##_B1) * tag##_S1 \ + + ((x2) % tag##_B2) * tag##_SB2 + ((x2) / tag##_B2) * tag##_S2 \ + + ((x3) % tag##_B3) * tag##_SB3 + ((x3) / tag##_B3) * tag##_S3) + +#define QRY_OFF(x0, x1, x2, x3) _4D_OFF(QRY, x0, x1, x2, x3) +#define KEY_OFF(x0, x1, x2, x3) _4D_OFF(KEY, x0, x1, x2, x3) +#define VAL_OFF(x0, x1, x2, x3) _4D_OFF(VAL, x0, x1, x2, x3) +#define MSK_OFF(x0, x1, x2, x3) _4D_OFF(MSK, x0, x1, x2, x3) + +#define DST_OFF(x0, x1, d, h, w) \ +(((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \ ++ ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1) + +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/tile_ops.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/tile_ops.cl new file mode 100644 index 00000000000000..b2ce72c33fc31c --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/tile_ops.cl @@ -0,0 +1,628 @@ +/******************************************************************************* + * Copyright 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/ + +#ifndef GPU_OCL_TILE_OPS_H +#define GPU_OCL_TILE_OPS_H + +float __builtin_IB_atomic_max_local_f32(__local float *, float); + +__attribute__((overloadable)) float local_atomic_max(local float *p, float v) { + return __builtin_IB_atomic_max_local_f32(p, v); +} + +__attribute__((overloadable)) half local_atomic_max( + local half *p, half v) { /* not implemented */ + return v; +} + +__attribute__((overloadable)) uint local_atomic_max(local uint *p, uint v) { + return atomic_max(p, v); +} + +__attribute__((overloadable)) int local_atomic_max(local int *p, int v) { + return atomic_max(p, v); +} + +#define DEF_BLOCK_LOAD_STORE(type, itype, suffix, n) \ + __attribute__((overloadable)) type##n block_load( \ + const global type *p, int vlen) \ + __attribute__((enable_if(vlen == n, "wrong vector length"))) { \ + return as_##type##n( \ + intel_sub_group_block_read##suffix##n((global void *)p)); \ + } \ + __attribute__((overloadable)) void block_store( \ + global type *p, type##n v) { \ + intel_sub_group_block_write##suffix##n( \ + (global itype *)p, as_##itype##n(v)); \ + } + +#define DEF_BLOCK_LOAD_STORE1(type, itype, suffix) \ + __attribute__((overloadable)) \ + type##1 block_load(const global type *p, int vlen) __attribute__( \ + (enable_if(vlen == 1, "wrong vector length"))) { \ + type##1 x; \ + x[0] = as_##type( \ + intel_sub_group_block_read##suffix((global void *)p)); \ + return x; \ + } \ + __attribute__((overloadable)) void block_store( \ + global type *p, type##1 v) { \ + intel_sub_group_block_write##suffix( \ + (global itype *)p, as_##itype(v[0])); \ + } + +DEF_BLOCK_LOAD_STORE1(half, ushort, _us) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 2) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 4) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 8) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 16) +DEF_BLOCK_LOAD_STORE1(uint, uint, ) +DEF_BLOCK_LOAD_STORE(uint, uint, , 2) +DEF_BLOCK_LOAD_STORE(uint, uint, , 4) +DEF_BLOCK_LOAD_STORE(uint, uint, , 8) + +#define DEF_BLOCK2D_LOAD_STORE(type, itype, vl, SG, suffix, BR, BC) \ + itype##vl __builtin_IB_subgroup_block_read_flat_##suffix( \ + long, int, int, int, int2); \ + void __builtin_IB_subgroup_block_write_flat_##suffix( \ + long, int, int, int, int2, itype##vl); \ + __attribute__((overloadable)) type##vl block2d_load(const global type *p, \ + int w, int h, int ld, int x, int y, int br, int bc, \ + int sg) __attribute__((enable_if(br == BR, "wrong #rows"))) \ + __attribute__((enable_if(bc == BC, "wrong #columns"))) \ + __attribute__( \ + (enable_if(sg == SG, "wrong subgroup size"))) { \ + ulong pp = as_long(p); \ + ulong prem = pp & 0x3F; \ + pp &= ~0x3F; \ + x += (prem / sizeof(type)); \ + w += prem; \ + int2 coord = {x, y}; \ + return as_##type##vl(__builtin_IB_subgroup_block_read_flat_##suffix( \ + pp, w - 1, h - 1, ld - 1, coord)); \ + } \ + __attribute__((overloadable)) void block2d_store(type##vl v, \ + global type *p, int w, int h, int ld, int x, int y, int br, \ + int bc, \ + int sg) __attribute__((enable_if(br == BR, "wrong #rows"))) \ + __attribute__((enable_if(bc == BC, "wrong #columns"))) \ + __attribute__( \ + (enable_if(sg == SG, "wrong subgroup size"))) { \ + ulong pp = as_long(p); \ + ulong prem = pp & 0x3F; \ + pp &= ~0x3F; \ + x += (prem / sizeof(type)); \ + w += prem; \ + int2 coord = {x, y}; \ + __builtin_IB_subgroup_block_write_flat_##suffix( \ + pp, w - 1, h - 1, ld - 1, coord, as_##itype##vl(v)); \ + } + +DEF_BLOCK2D_LOAD_STORE(half, ushort, 8, 16, u16_m8k16v1, 
16, 8) +DEF_BLOCK2D_LOAD_STORE(half, ushort, 8, 16, u16_m4k32v1, 32, 4) +DEF_BLOCK2D_LOAD_STORE(half, ushort, 16, 16, u16_m8k32v1, 32, 8) + +#define tile_fill(t, v) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t.x[i] \ + = v; \ + } while (0) + +#define tile_elementwise(t, f) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t.x[i] \ + = f(t.x[i]); \ + } while (0) + +#define tile_elementwise_s(t, f) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) { \ + _Pragma("unroll") for (int s = 0; \ + s < sizeof(t.x[0]) / sizeof(t.x[0][0]); \ + s++) t.x[i][s] \ + = f(t.x[i][s]); \ + } \ + } while (0) + +#define tile_binary(t, t2, f) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t.x[i] \ + = f(t.x[i], t2.x[i]); \ + } while (0) + +#define tile_copy(t, t_new) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t_new.x[i] \ + = __builtin_convertvector(t.x[i], __typeof__(t_new.x[i])); \ + } while (0) + +#define tile_copy_to_half2(t, t_new) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) { \ + _Pragma("unroll") for (int s = 0; \ + s < sizeof(t.x[0]) / sizeof(t.x[0][0]) / 2; \ + s++) { \ + half2 v = {t.x[i][2 * s], t.x[i][2 * s + 1]}; \ + t_new.x[i][s] = as_uint(v); \ + } \ + } \ + } while (0) + +#define tile_access(t, i0, j, sg, br, bc, nbr) \ + (t).x[(i0) / (br) + (nbr) * ((j) / (bc))] \ + [((i0) % (br)) / (sg) + ((j) % (bc)) * ((br) / (sg))] + +#define xlane_tile_access(t, i, j, sg, br, bc, nbr) \ + sub_group_broadcast(tile_access(t, i, j, sg, br, bc, nbr), i % sg) + +#define DECLARE_2D_TILE_OPS(tile_type, element_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_full(tile_type *t, \ + const global element_type *ptr, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[i]; \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load_full(tile_type *t, \ + const local element_type *ptr, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[i]; \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load(tile_type *t, \ + const global element_type *ptr, int m, int n, int ld, \ + int offset_r, int offset_c) { \ + if (m >= offset_r + br * nbr && n >= offset_c + bc * nbc) { \ + tile_load_full(t, ptr, ld, offset_r, offset_c); \ + return; \ + } \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + if (offset_c + j < n) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + if (offset_r + i < m) \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[i]; \ + } \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load(tile_type *t, \ + const global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_load(t, ptr, m, n, m, offset_r, offset_c); \ + } \ + __attribute__((overloadable)) void 
tile_load_t_full(tile_type *t, \ + const global element_type *ptr, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_r + offset_c; \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg, ptr += ld*sg) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[get_sub_group_local_id() * ld + j]; \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load_t(tile_type *t, \ + const global element_type *ptr, int m, int n, int ld, \ + int offset_r, int offset_c) { \ + if (m >= offset_r + br * nbr && n >= offset_c + bc * nbc) { \ + tile_load_t_full(t, ptr, ld, offset_r, offset_c); \ + return; \ + } \ + ptr += ld * offset_r + offset_c; \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg, ptr += ld*sg) { \ + int i = i0 + get_sub_group_local_id(); \ + if (offset_r + i < m) \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + if (offset_c + j < n) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[get_sub_group_local_id() * ld + j]; \ + } \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load_t(tile_type *t, \ + const global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_load(t, ptr, m, n, n, offset_r, offset_c); \ + } \ + __attribute__((overloadable)) void tile_store_full(tile_type t, \ + local element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + ptr[i] = tile_access(t, i0, j, sg, br, bc, nbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_store_full(tile_type t, \ + global element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + ptr[i] = tile_access(t, i0, j, sg, br, bc, nbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_store(tile_type t, \ + global element_type *ptr, int m, int n, int ld, int offset_r, \ + int offset_c) { \ + if (m >= offset_r + br * nbr && n >= offset_c + bc * nbc) { \ + tile_store_full(t, ptr, ld, offset_r, offset_c); \ + return; \ + } \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + if (offset_c + j < n) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + if (offset_r + i < m) \ + ptr[i] = tile_access(t, i0, j, sg, br, bc, nbr); \ + } \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_store(tile_type t, \ + global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_store(t, ptr, m, n, m, offset_r, offset_c); \ + } \ + __attribute__((overloadable)) void tile_store_t_sys_src1(tile_type t, \ + local element_type *ptr, int ld, int offset_r, int offset_c) { \ + offset_c += get_sub_group_local_id(); \ + int offset_r0 = offset_r & (sg - 1); \ + int offset_r1 = offset_r & ~(sg - 1); \ + ptr += offset_r0 + sg * offset_c + ld * offset_r1; \ + _Pragma("unroll") for (int j0 = 0; j0 < br * nbr; \ + j0 += sg, ptr += sg * sg) { \ + _Pragma("unroll") for (int i = 0; i < bc * nbc; i++) ptr[i] \ + = tile_access(t, j0, i, sg, br, bc, nbr); \ + } \ + } \ + __attribute__((overloadable)) void tile_store_t_sys_src2(tile_type t, \ + 
local element_type *ptr, int tile_n, int ld, int offset_r, \ + int offset_c) { \ + const int cp = 32 / sizeof(element_type); \ + offset_c += get_sub_group_local_id(); \ + int offset_r0 = offset_r & (cp - 1); \ + int offset_r1 = offset_r & ~(cp - 1); \ + ptr += offset_r0 + tile_n * offset_r1; \ + _Pragma("unroll") for (int j0 = 0; j0 < br * nbr; \ + j0 += sg, offset_c += sg) { \ + int offset_c0 = offset_c & (tile_n - 1); \ + int offset_c1 = offset_c & ~(tile_n - 1); \ + local element_type *ptr_j = ptr + cp * offset_c0 + ld * offset_c1; \ + _Pragma("unroll") for (int i = 0; i < bc * nbc; i++) { \ + *ptr_j = tile_access(t, j0, i, sg, br, bc, nbr); \ + ptr_j++; \ + if ((~i & (cp - 1)) == 0) ptr_j += cp * (tile_n - 1); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_atomic_max_full(tile_type t, \ + local element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + (void)local_atomic_max( \ + ptr + i, tile_access(t, i0, j, sg, br, bc, nbr)); \ + } \ + } \ + } + +#define DECLARE_2D_TILE_VREDUCE(tile_type, sg, br, bc, nbr, nbc, rtile_type, \ + rsg, rbr, rbc, rnbr, rnbc) \ + __attribute__((overloadable)) void tile_vreduce_add( \ + tile_type t, rtile_type *tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*tr, i0, 0, rsg, rbr, rbc, rnbr) \ + += tile_access(t, i0, j, sg, br, bc, nbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_vreduce_max( \ + tile_type t, rtile_type *tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*tr, i0, 0, rsg, rbr, rbc, rnbr) \ + = max(tile_access(t, i0, j, sg, br, bc, nbr), \ + tile_access(*tr, i0, 0, rsg, rbr, rbc, rnbr)); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_vbroadcast_sub( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + -= tile_access(tr, i0, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_vbroadcast_mul( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + *= tile_access(tr, i0, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } + +#define DECLARE_2D_TILE_HREDUCE(tile_type, sg, br, bc, nbr, nbc, rtile_type, \ + rsg, rbr, rbc, rnbr, rnbc) \ + __attribute__((overloadable)) void tile_hbroadcast_add( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + += xlane_tile_access(tr, j, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_hbroadcast_mul( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + *= xlane_tile_access(tr, j, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_hbroadcast_min( \ + 
tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) = min( \ + tile_access(*t, i0, j, sg, br, bc, nbr), \ + xlane_tile_access(tr, j, 0, rsg, rbr, rbc, rnbr)); \ + } \ + } \ + } + +#define DECLARE_2D_TILE_RSELECT(tile_type0, sg0, br0, bc0, nbr0, nbc0, \ + tile_type1, sg1, br1, bc1, nbr1, nbc1) \ + __attribute__((overloadable)) void tile_rselect( \ + tile_type0 *t0, tile_type1 t1, int idx) { \ + _Pragma("unroll") for (int j = 0; j < bc0 * nbc0; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br0 * nbr0; i0 += sg0) { \ + tile_access(*t0, i0, j, sg0, br0, bc0, nbr0) \ + = tile_access(t1, i0, j, sg1, br1, bc1, nbr1); \ + _Pragma("unroll") for (int z = 1; \ + z < (br1 * nbr1 / br0 * nbr0); \ + z++) if (z == idx) { \ + tile_access(*t0, i0, j, sg0, br0, bc0, nbr0) \ + = tile_access(t1, i0 + z * br0 * nbr0, j, sg1, \ + br1, bc1, nbr1); \ + } \ + } \ + } \ + } + +#define DECLARE_2D_TILE_COPY_REBLOCK(tile_type0, sg0, br0, bc0, nbr0, nbc0, \ + tile_type1, sg1, br1, bc1, nbr1, nbc1) \ + __attribute__((overloadable)) void tile_copy_reblock( \ + tile_type0 t0, tile_type1 *t1) { \ + _Pragma("unroll") for (int j = 0; j < bc0 * nbc0; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br0 * nbr0; i0 += sg0) { \ + tile_access(*t1, i0, j, sg1, br1, bc1, nbr1) \ + = tile_access(t0, i0, j, sg0, br0, bc0, nbr0); \ + } \ + } \ + } + +#define DECLARE_2D_TILE(tile_type, element_type, sg, br, bc, nbr, nbc) \ + typedef element_type __attribute__((ext_vector_type(br * bc / sg))) \ + _e_##tile_type; \ + typedef struct { \ + _e_##tile_type x[nbr * nbc]; \ + } tile_type; \ + DECLARE_2D_TILE_OPS(tile_type, element_type, sg, br, bc, nbr, nbc) + +/* Requires bc == 1 currently */ +#define DECLARE_2D_TILE_BLOCK_OPS( \ + tile_type, element_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_block(tile_type *t, \ + const global element_type *ptr, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++, ptr += ld * bc) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++)(t) \ + ->x[ii + nbr * jj] \ + = block_load(ptr + ii * br, br / SUBGROUP_SIZE); \ + } \ + } \ + __attribute__((overloadable)) void tile_store_block(tile_type t, \ + global element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++, ptr += ld * bc) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++) \ + block_store(ptr + ii * br, (t).x[ii + nbr * jj]); \ + } \ + } \ + __attribute__((overloadable)) void tile_load_block(tile_type *t, \ + const global element_type *ptr, int n, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + n -= offset_c; \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++, ptr += ld * bc) { \ + if (jj < n) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++)(t) \ + ->x[ii + nbr * jj] \ + = block_load(ptr + ii * br, br / SUBGROUP_SIZE); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_store_block(tile_type t, \ + global element_type *ptr, int n, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + n -= offset_c; \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++, ptr += ld * bc) { \ + if (jj < n) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++) \ + block_store(ptr + ii * br, (t).x[ii + nbr * jj]); \ + } \ + } \ + 
} + +#define DECLARE_2D_TILE_BLOCK2D_OPS( \ + tile_type, element_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_block2d(tile_type *t, \ + const global element_type *ptr, int m, int n, int ld, \ + int offset_r, int offset_c) { \ + const int e = sizeof(element_type); \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++)(t) \ + ->x[ii + nbr * jj] \ + = block2d_load(ptr, m * e, n, ld * e, offset_r + ii * br, \ + offset_c + jj * bc, br, bc, sg); \ + } \ + } \ + __attribute__((overloadable)) void tile_load_block2d(tile_type *t, \ + const global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_load_block2d(t, ptr, m, n, m, offset_r, offset_c); \ + } \ + __attribute__((overloadable)) void tile_store_block2d(tile_type t, \ + global element_type *ptr, int m, int n, int ld, int offset_r, \ + int offset_c) { \ + const int e = sizeof(element_type); \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++) block2d_store( \ + (t).x[ii + nbr * jj], ptr, m *e, n, ld *e, \ + offset_r + ii * br, offset_c + jj * bc, br, bc, sg); \ + } \ + } \ + __attribute__((overloadable)) void tile_store_block2d(tile_type t, \ + const global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_store_block2d(t, ptr, m, n, m, offset_r, offset_c); \ + } + +#define DECLARE_2D_TILE_LOAD_PACKED_HALF(tile_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_packed_half(tile_type *t, \ + const global half *ptr, int m, int n, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + if (offset_c + j < n) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = 2 * (i0 + get_sub_group_local_id()); \ + half2 loaded = 0; \ + if (offset_r + i < m) loaded.s0 = ptr[i]; \ + if (offset_r + i + 1 < m) loaded.s1 = ptr[i + 1]; \ + tile_access(*t, i0, j, sg, br, bc, nbr) = as_uint(loaded); \ + } \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load_packed_half(tile_type *t, \ + const global half *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_load_packed_half(t, ptr, m, n, m, offset_r, offset_c); \ + } + +#define cooperative_prefetch_2d(ptr, r, c, ld, sg_id, n_sg, sg_size, caching) \ + cooperative_prefetch_2d_internal((const global char *)ptr, \ + (r) * sizeof(*(ptr)), c, (ld) * sizeof(*(ptr)), sg_id, n_sg, \ + sg_size, caching) + +#define cooperative_prefetch_2d_rem( \ + ptr, r, c, rmax, cmax, ld, sg_id, n_sg, sg_size, caching) \ + cooperative_prefetch_2d_internal((const global char *)ptr, \ + (r) * sizeof(*(ptr)), c, (rmax) * sizeof(*(ptr)), cmax, \ + (ld) * sizeof(*(ptr)), sg_id, n_sg, sg_size, caching) + +/* IGC prefetch intrinsics */ +enum LSC_LDCC { + LSC_LDCC_DEFAULT = 0, + LSC_LDCC_L1UC_L3UC = 1, + LSC_LDCC_L1UC_L3C = 2, + LSC_LDCC_L1C_L3UC = 3, + LSC_LDCC_L1C_L3C = 4, + LSC_LDCC_L1S_L3UC = 5, + LSC_LDCC_L1S_L3C = 6, + LSC_LDCC_L1IAR_L3C = 7, +}; + +extern void __builtin_IB_lsc_prefetch_global_uchar( + const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); + +extern void __builtin_IB_lsc_prefetch_global_uint( + const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); + +__attribute__((overloadable)) void cooperative_prefetch_2d_internal( + const global char *ptr, uint rbytes, uint c, uint ld_bytes, uint sg_id, + uint n_sg, uint sg_size, enum LSC_LDCC 
caching) { + const uint cl_per_col = (rbytes + 63) >> 6; + const uint cl = cl_per_col * c; + const uint cl_per_sg = (cl + n_sg - 1) / n_sg; + const uint cl_iters = (cl_per_sg + sg_size - 1) / sg_size; +#pragma unroll + for (uint ii_cl = 0; ii_cl < cl_iters; ii_cl++) { + uint i_cl = ii_cl + (sg_id * cl_per_sg) + get_sub_group_local_id(); + uint r_cl = i_cl % cl_per_col; + uint c_cl = i_cl / cl_per_col; + if (i_cl < cl) { + __builtin_IB_lsc_prefetch_global_uint( + (const global uint *)(ptr + r_cl * 64 + c_cl * ld_bytes), 0, + caching); + } + } +} + +__attribute__((overloadable)) void cooperative_prefetch_2d_internal( + const global char *ptr, uint rbytes, uint c, uint rbytes_max, + uint c_max, uint ld_bytes, uint sg_id, uint n_sg, uint sg_size, + enum LSC_LDCC caching) { + const uint cl_per_col = (rbytes_max + 63) >> 6; + const uint cl = cl_per_col * c_max; + const uint cl_per_sg = (cl + n_sg - 1) / n_sg; + const uint cl_iters = (cl_per_sg + sg_size - 1) / sg_size; + const uint max_off = rbytes - 1 + (c - 1) * ld_bytes; +#pragma unroll + for (uint ii_cl = 0; ii_cl < cl_iters; ii_cl++) { + uint i_cl = ii_cl + (sg_id * cl_per_sg) + get_sub_group_local_id(); + uint r_cl = i_cl % cl_per_col; + uint c_cl = i_cl / cl_per_col; + uint pf_off = min(r_cl * 64 + c_cl * ld_bytes, max_off); + if (i_cl < cl) { + __builtin_IB_lsc_prefetch_global_uchar( + (const global uchar *)(ptr + pf_off), 0, caching); + } + } +} + +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/vec_typedefs.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/vec_typedefs.cl index ae4adfaf717fbd..29ca3ddb7185f7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/vec_typedefs.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/vec_typedefs.cl @@ -2,27 +2,25 @@ // SPDX-License-Identifier: Apache-2.0 // -typedef struct half1 { half s0; } half1; typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5; typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6; typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7; -typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9; -typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; half s9; } half10; -typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; half s9; half sa; } half11; -typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; half s9; half sa; half sb;} half12; -typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; half s9; half sa; half sb; half sc;} half13; -typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5; 
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half se;} half14; -typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; half s9; half sa; half sb; half sc; half se; half sf;} half15; typedef struct half0 { half s0; } half0; //never used but makes compiler happy. -typedef struct float1 { float s0; } float1; typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5; typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6; typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl new file mode 100644 index 00000000000000..575d9fdff5bd32 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl @@ -0,0 +1,461 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "include/batch_headers/generic_vector_ops.cl" +#include "include/batch_headers/sdpa_utils.cl" +#include "include/batch_headers/tile_ops.cl" + +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#define DIV_UP(x, y) (((x) + (y)-1) / (y)) + +#define sg_per_wg (ugemm_kq_sg_per_wg_m * ugemm_kq_sg_per_wg_n) +#define q_tile_sg_n DIV_UP(ugemm_kq_wg_tile_n, sg_per_wg) + +/* Instantiate tile types and operations */ +typedef ugemm_kq_c_type s_tile_type; +typedef ugemm_vs_c_type a_tile_type; + +DECLARE_2D_TILE(q_tile_type, uint, SUBGROUP_SIZE, D_MAX / 2, 1, 1, q_tile_sg_n) + +#ifdef BLOCK_Q +DECLARE_2D_TILE_BLOCK_OPS( + q_tile_type, uint, SUBGROUP_SIZE, D_MAX / 2, 1, 1, q_tile_sg_n) +#elif Q_ALIGN < 4 +DECLARE_2D_TILE_LOAD_PACKED_HALF( + q_tile_type, SUBGROUP_SIZE, D_MAX / 2, 1, 1, q_tile_sg_n) +#endif + +#ifdef BLOCK_A +DECLARE_2D_TILE(a_tile_type_half, half, SUBGROUP_SIZE, ugemm_vs_sg_tile_m, 1, 1, + ugemm_vs_sg_tile_n) +#else +DECLARE_2D_TILE(a_tile_type_half, half, SUBGROUP_SIZE, ugemm_vs_sg_tile_m, 8, 1, + ugemm_vs_sg_tile_n / 8) +#endif + +DECLARE_2D_TILE(s_tile_type_half2, uint, SUBGROUP_SIZE, ugemm_kq_c_type_block0, + ugemm_kq_c_type_block1 / 2, ugemm_kq_c_type_nblock0, + ugemm_kq_c_type_nblock1) + +DECLARE_2D_TILE( + s_sum_tile_type, float, SUBGROUP_SIZE, ugemm_kq_sg_tile_n, 1, 1, 1) + +DECLARE_2D_TILE( + a_scale_tile_type, float, SUBGROUP_SIZE, ugemm_vs_sg_tile_n, 1, 1, 1) + + +DECLARE_2D_TILE(mask_tile_type, half, SUBGROUP_SIZE, ugemm_kq_c_type_block0, ugemm_kq_c_type_block1, ugemm_kq_c_type_nblock0, ugemm_kq_c_type_nblock1) +DECLARE_2D_TILE(mask_tile_type_float, float, SUBGROUP_SIZE, ugemm_kq_c_type_block0, ugemm_kq_c_type_block1, ugemm_kq_c_type_nblock0, ugemm_kq_c_type_nblock1) + +#ifdef BLOCK_A +DECLARE_2D_TILE_BLOCK_OPS(a_tile_type_half, half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 1, 1, ugemm_vs_sg_tile_n) +#endif +#ifdef BLOCK_2D_A +DECLARE_2D_TILE_BLOCK2D_OPS(a_tile_type_half, half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 8, 1, ugemm_vs_sg_tile_n / 8) +#endif + +#ifdef BLOCK_A +DECLARE_2D_TILE_COPY_REBLOCK(a_tile_type, SUBGROUP_SIZE, ugemm_vs_c_type_block0, + ugemm_vs_c_type_block1, ugemm_vs_c_type_nblock0, + ugemm_vs_c_type_nblock1, a_tile_type_half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 1, 1, ugemm_vs_sg_tile_n) +#else +DECLARE_2D_TILE_COPY_REBLOCK(a_tile_type, SUBGROUP_SIZE, ugemm_vs_c_type_block0, + ugemm_vs_c_type_block1, ugemm_vs_c_type_nblock0, + ugemm_vs_c_type_nblock1, a_tile_type_half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 8, 1, ugemm_vs_sg_tile_n / 8) +#endif + +DECLARE_2D_TILE_VREDUCE(s_tile_type, SUBGROUP_SIZE, ugemm_kq_c_type_block0, + ugemm_kq_c_type_block1, ugemm_kq_c_type_nblock0, + ugemm_kq_c_type_nblock1, s_sum_tile_type, SUBGROUP_SIZE, + ugemm_kq_sg_tile_n, 1, 1, 1) + +DECLARE_2D_TILE_HREDUCE(s_tile_type, SUBGROUP_SIZE, ugemm_kq_c_type_block0, + ugemm_kq_c_type_block1, ugemm_kq_c_type_nblock0, + ugemm_kq_c_type_nblock1, mask_tile_type_float, SUBGROUP_SIZE, + ugemm_kq_sg_tile_m, 1, 1, 1) + +DECLARE_2D_TILE_HREDUCE(a_tile_type, SUBGROUP_SIZE, ugemm_vs_c_type_block0, + ugemm_vs_c_type_block1, ugemm_vs_c_type_nblock0, + ugemm_vs_c_type_nblock1, a_scale_tile_type, SUBGROUP_SIZE, + ugemm_vs_sg_tile_n, 1, 1, 1) + +#if ugemm_kq_wg_tile_n == ugemm_vs_wg_tile_n \ + && (ugemm_kq_sg_tile_n % ugemm_vs_sg_tile_n) == 0 +DECLARE_2D_TILE_RSELECT(a_scale_tile_type, SUBGROUP_SIZE, ugemm_vs_sg_tile_n, 1, + 1, 1, s_sum_tile_type, SUBGROUP_SIZE, ugemm_kq_sg_tile_n, 1, 1, 1) +#endif + +#if PREFETCH_REMAINDER +#define cooperative_prefetch_2d_maybe_rem cooperative_prefetch_2d_rem +#else +#define cooperative_prefetch_2d_maybe_rem( \ + ptr, r, c, rmax, cmax, ld, sg_id, n_sg, sg_size, caching) \ + cooperative_prefetch_2d(ptr, rmax, cmax, ld, sg_id, n_sg, sg_size, caching) 
+#endif + +#if TRANSPOSE_K +#define cooperative_prefetch_2d_k( \ + ptr, r, c, rmax, cmax, ld, sg_id, n_sg, sg_size, caching) \ + cooperative_prefetch_2d_maybe_rem( \ + ptr, c, r, cmax, rmax, ld, sg_id, n_sg, sg_size, caching) +#else +#define cooperative_prefetch_2d_k cooperative_prefetch_2d_maybe_rem +#endif + +#if REMAINDER_Q +#define tile_load_block_rem_q tile_load_block +#define tile_store_block_rem_q tile_store_block +#else +#define tile_load_block_rem_q(t, ptr, n, ld, off_r, off_c) \ + tile_load_block(t, ptr, ld, off_r, off_c) +#define tile_store_block_rem_q(t, ptr, n, ld, off_r, off_c) \ + tile_store_block(t, ptr, ld, off_r, off_c) +#endif + +#define binary_add(x, y) ((x) + (y)) + +__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) +KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG + const global half *K, const global half *Q, const global half *V, + global half *A, +#if WITH_ATTN_MASK + const global half *msk, +#endif +#if WITH_SCALE + global SCALE_DATA_T *scale_ptr, +#endif + int d, int k, int q) { + uint sg_ij = sub_group_broadcast(get_local_id(1), 0); + uint b0 = get_group_id(1); + uint b1 = get_group_id(2); + + uint wg_j0 = get_group_id(0) * ugemm_kq_wg_tile_n; + + /* Leading dimension for matrices */ + uint ldk = TRANSPOSE_K ? KEY_S3 : KEY_S2; + uint ldq = QRY_S2; + uint ldv = VAL_S2; + uint lda = DST_S2; + + /* Subgroup IDs for each GEMM */ + uint sg_i_kq = sg_ij % ugemm_kq_sg_per_wg_m; + uint sg_j_kq = sg_ij / ugemm_kq_sg_per_wg_m; + + uint sg_i_vs = sg_ij % ugemm_vs_sg_per_wg_m; + uint sg_j_vs = sg_ij / ugemm_vs_sg_per_wg_m; + + /* SLM allocations -- place in one array to work around compiler bug */ +#define Q_slm_size (D_MAX * ugemm_kq_wg_tile_n * sizeof(half)) +#define S_slm_size (ugemm_kq_wg_tile_m * ugemm_kq_wg_tile_n * sizeof(half)) +#define S_sum_slm_size \ + (ugemm_kq_wg_tile_n * ugemm_kq_sg_per_wg_m * sizeof(float)) +#define S_max_slm_size (ugemm_kq_wg_tile_n * sizeof(float)) +#define ugemm_slm_size MAX(ugemm_kq_slm_size, ugemm_vs_slm_size) + + local char slm[Q_slm_size + S_slm_size + S_sum_slm_size + S_max_slm_size + + ugemm_slm_size]; + + local half *Q_slm = (local half *)&slm[0]; + local half *S_slm = (local half *)&slm[Q_slm_size]; + local float *S_sum_slm = (local float *)&slm[Q_slm_size + S_slm_size]; + local float *S_max_slm + = (local float *)&slm[Q_slm_size + S_slm_size + S_sum_slm_size]; + local uint *ugemm_slm = (local uint *)&slm[Q_slm_size + S_slm_size + + S_sum_slm_size + S_max_slm_size]; + + const bool need_sum_barrier = (ugemm_vs_barrier_count == 0); + + /* Locate K/Q/V/A matrices within batch */ + K += KEY_OFF(b1, (b0 / KV_GROUP_SIZE), 0, 0); + Q += QRY_OFF(b1, b0, 0, 0); + V += VAL_OFF(b1, (b0 / KV_GROUP_SIZE), 0, 0); + A += DST_OFF(b1, b0, 0, 0, 0); + + __builtin_assume_aligned(K, K_ALIGN); + __builtin_assume_aligned(Q, Q_ALIGN); + __builtin_assume_aligned(V, V_ALIGN); + __builtin_assume_aligned(A, A_ALIGN); + + /* Load Q tile, destined for SLM */ + q_tile_type Q_tile; + uint q0_copy = q_tile_sg_n * sg_ij; +#ifdef BLOCK_Q + tile_load_block_rem_q( + &Q_tile, (global uint *)Q, q, ldq >> 1, 0, wg_j0 + q0_copy); +#elif Q_ALIGN >= 4 + tile_load(&Q_tile, (global uint *)Q, (d + 1) >> 1, q, ldq >> 1, 0, + wg_j0 + q0_copy); +#else + tile_load_packed_half(&Q_tile, Q, d, q, ldq, 0, wg_j0 + q0_copy); +#endif + +#if WITH_SCALE + /* Load scale */ + #if INVERT_SCALE + float iscale = convert_float(*scale_ptr); + float scale = native_recip(iscale); + #else + float scale = convert_float(*scale_ptr); + float iscale = native_recip(scale); + #endif +#else + float 
iscale = sqrt(convert_float(INPUT1_SIZE_X)); + float scale = native_recip(iscale); +#endif + scale *= 1.442695f; // log2(e) + +#ifdef PREFETCH_K0 + /* Prefetch first K tile. */ + cooperative_prefetch_2d_k(K, k, d, ugemm_kq_wg_tile_m, PREFETCH_D_MAX, ldk, + sg_ij, sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif + + /* Initialize S column sums in SLM to -inf */ + const uint n_col_sg = DIV_UP(ugemm_kq_wg_tile_n, SUBGROUP_SIZE * sg_per_wg); + const float neg_inf = -INFINITY; + +#pragma unroll + for (int q = 0; q < n_col_sg; q++) + intel_sub_group_block_write( + (local uint *)&S_max_slm[(q + sg_ij * n_col_sg) + * SUBGROUP_SIZE], + as_uint(neg_inf)); + + /* Clear accumulator */ + a_tile_type A_tile; + tile_fill(A_tile, 0.0f); + + /* Store Q tile to SLM */ + tile_store_t_sys_src1( + Q_tile, (local uint *)&Q_slm[0], D_MAX / 2, q0_copy, 0); + + /* Clear S column sums/maxes */ + s_sum_tile_type S_sum_tile; + s_sum_tile_type S_max_tile, S_max_tile_old; + tile_fill(S_sum_tile, 0.0f); + tile_fill(S_max_tile, -INFINITY); + + /* Wait for Q data to reach SLM */ + barrier(CLK_LOCAL_MEM_FENCE); + + /* Main loop over k blocks */ + for (int k0 = 0; k0 < k; k0 += ugemm_kq_wg_tile_m) { + bool first = (k0 == 0); + bool last = (k0 + ugemm_kq_wg_tile_m >= k); + + uint sg_i0_kq = sg_i_kq * ugemm_kq_sg_tile_m; + uint sg_j0_kq = sg_j_kq * ugemm_kq_sg_tile_n; + +#if WITH_ATTN_MASK + mask_tile_type mask_tile; + tile_load_t(&mask_tile, msk, q, k, q, sg_j0_kq + wg_j0, k0 + sg_i0_kq); +#endif + +#if REMAINDER_K + /* Prepare k mask: NaN in bounds, -inf out of bounds */ + mask_tile_type_float k_mask; +#pragma unroll + for (int ii = 0; ii < ugemm_kq_sg_tile_m / SUBGROUP_SIZE; ii++) + k_mask.x[0][ii] = (k0 + sg_i0_kq + ii * SUBGROUP_SIZE + + get_sub_group_local_id() + < k) + ? nan(0u) + : -INFINITY; +#endif + + /* Calculate S = (K^T) * Q */ + s_tile_type S_tile + = ugemm_kq(K, ldk, Q_slm, D_MAX, k, ugemm_kq_wg_tile_n, d, k0, + 0, 0, sg_i_kq, sg_j_kq, (local char *)ugemm_slm); + + /* Apply attention mask */ +#if WITH_ATTN_MASK +#define unscale(x) ((x)*iscale) + mask_tile_type_float mask_tile_float; + tile_copy(mask_tile, mask_tile_float); + tile_elementwise(mask_tile_float, unscale); + tile_binary(S_tile, mask_tile_float, binary_add); +#endif + + /* Apply k mask */ +#if REMAINDER_K + tile_hbroadcast_min(&S_tile, k_mask); +#endif + + /* Before softmax, we will need to scale columns by maximum values to avoid overflow. */ + + /* Compute our maxima and reduce across SLM */ + tile_vreduce_max(S_tile, &S_max_tile); + tile_atomic_max_full( + S_max_tile, S_max_slm, ugemm_kq_wg_tile_n, sg_j0_kq, 0); + intel_work_group_barrier_arrive(CLK_LOCAL_MEM_FENCE); + +#ifdef PREFETCH_V + /* Prefetch V tile. 
*/ + cooperative_prefetch_2d_maybe_rem(V, d, k - k0, D_MAX, + (ugemm_kq_wg_tile_m * PREFETCH_D_MAX) / D_MAX, ldv, sg_ij, + sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif + +#ifndef ALT_MAX + /* Read back WG-wide maxima */ + intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + tile_load_full(&S_max_tile, S_max_slm, ugemm_kq_wg_tile_n, sg_j0_kq, 0); +#endif + + tile_vbroadcast_sub(&S_tile, S_max_tile); + +/* Scale + exponentiate */ +#define scaled_exp(x) native_vexp2(x *scale) + tile_elementwise(S_tile, scaled_exp); + +#ifdef ALT_MAX + /* Read back WG-wide maxima and adjust S to match */ + intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + s_sum_tile_type S_max_tile1; + tile_copy(S_max_tile, S_max_tile1); + tile_load_full(&S_max_tile, S_max_slm, ugemm_kq_wg_tile_n, sg_j0_kq, 0); + +#define binary_exp_neg(x, y) native_vexp2(scale *((x) - (y))) + tile_binary(S_max_tile1, S_max_tile, binary_exp_neg); + tile_vbroadcast_mul(&S_tile, S_max_tile1); +#endif + + /* Accumulate sums. S tile is transposed for easy summation. */ + s_sum_tile_type S_sum_tile1; + tile_fill(S_sum_tile1, 0.0f); + tile_vreduce_add(S_tile, &S_sum_tile1); + + /* Convert to half, VNNI format */ + s_tile_type_half2 S_tile_half2; + tile_copy_to_half2(S_tile, S_tile_half2); + + /* Store to SLM, in packed format */ + tile_store_t_sys_src2(S_tile_half2, (local uint *)S_slm, + ugemm_vs_sg_tile_n, ugemm_kq_wg_tile_m / 2, sg_i0_kq / 2, + sg_j0_kq); + intel_work_group_barrier_arrive(CLK_LOCAL_MEM_FENCE); + + /* Rescale existing accumulator and sums to match new maxima */ + if (!first) { +#define binary_exp_sub(x, y) native_vexp2(scale *((x) - (y))) +#define binary_mul(x, y) ((x) * (y)) + tile_binary(S_max_tile_old, S_max_tile, binary_exp_sub); + tile_binary(S_sum_tile, S_max_tile_old, binary_mul); + + /* Find the subset of sums that applies to the accumulation tile */ + a_scale_tile_type A_scale_tile; +#if ugemm_kq_wg_tile_n == ugemm_vs_wg_tile_n \ + && ugemm_kq_sg_tile_n == ugemm_vs_sg_tile_n + tile_copy(S_max_tile_old, A_scale_tile); +#elif ugemm_kq_wg_tile_n == ugemm_vs_wg_tile_n \ + && (ugemm_kq_sg_tile_n % ugemm_vs_sg_tile_n) == 0 + tile_rselect(&A_scale_tile, S_max_tile_old, + sg_j_vs % (ugemm_kq_sg_tile_n / ugemm_vs_sg_tile_n)); +#else +#error unimplemented +#endif + tile_hbroadcast_mul(&A_tile, A_scale_tile); + } + +/* Accumulate sums */ + tile_binary(S_sum_tile, S_sum_tile1, binary_add); + + /* Save maxima */ + tile_copy(S_max_tile, S_max_tile_old); + + /* Last iteration: store column sums in SLM */ + if (last) { + tile_store_full(S_sum_tile, S_sum_slm, ugemm_kq_wg_tile_n, sg_j0_kq, + sg_i_kq); + } + +#ifdef PREFETCH_K + /* Prefetch next K tile. */ + if (!last) { +#if TRANSPOSE_K + const uint stride_k = ldk; +#else + const uint stride_k = 1; +#endif + cooperative_prefetch_2d_k(K + (k0 + ugemm_kq_wg_tile_m) * stride_k, + k - k0 - ugemm_kq_wg_tile_m, d, ugemm_kq_wg_tile_m, + PREFETCH_D_MAX, ldk, sg_ij, sg_per_wg, SUBGROUP_SIZE, + LSC_LDCC_L1C_L3C); + } +#endif +#if WITH_ATTN_MASK && defined(PREFETCH_MASK) + /* Prefetch next mask tile. 
*/ + if (!last) { + cooperative_prefetch_2d(msk + k0 + ugemm_kq_wg_tile_m + sg_i0_kq + (sg_j0_kq + wg_j0) * q, + ugemm_kq_sg_tile_m, ugemm_kq_sg_tile_n, 0, 0, 1, SUBGROUP_SIZE, + LSC_LDCC_L1UC_L3C); + } +#endif + + /* Wait for S stores */ + intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + + /* Last iteration: signal column sums are ready */ + if (last && need_sum_barrier) + intel_work_group_barrier_arrive(CLK_LOCAL_MEM_FENCE); + + /* Accumulate A += V * S */ + int k_chunk = min(k - k0, ugemm_kq_wg_tile_m); + a_tile_type A_tile1 = ugemm_vs(V, ldv, S_slm, ugemm_kq_wg_tile_m, d, + ugemm_kq_wg_tile_n, k_chunk, 0, 0, 0, sg_i_vs, sg_j_vs, + (local char *)ugemm_slm); + V += ldv * ugemm_kq_wg_tile_m; + tile_binary(A_tile, A_tile1, binary_add); + } + + /* Wait for column sums to be ready */ + if (need_sum_barrier) intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + + /* Load column sums from SLM + reduce in registers */ + a_scale_tile_type A_scale_tile, A_scale_tile_load; + tile_fill(A_scale_tile, 0.0f); + +#pragma unroll + for (uint sg1 = 0; sg1 < ugemm_kq_sg_per_wg_m; sg1++) { + tile_load_full(&A_scale_tile_load, S_sum_slm, ugemm_kq_wg_tile_n, + ugemm_vs_sg_tile_n * sg_j_vs, sg1); + tile_binary(A_scale_tile, A_scale_tile_load, binary_add); + } + + /* Rescale by 1 / (column sums) */ + tile_elementwise(A_scale_tile, native_vrecip); + tile_hbroadcast_mul(&A_tile, A_scale_tile); + + /* Convert to half precision and store */ + a_tile_type_half A_tile_half; + tile_copy_reblock(A_tile, &A_tile_half); + + uint sg_i0_vs = sg_i_vs * ugemm_vs_sg_tile_m; + uint sg_j0_vs = sg_j_vs * ugemm_vs_sg_tile_n + wg_j0; + +#ifdef BLOCK_2D_A + tile_store_block2d(A_tile_half, A, d, q, lda, sg_i0_vs, sg_j0_vs); +#elif defined(BLOCK_A) + tile_store_block_rem_q(A_tile_half, A, q, lda, sg_i0_vs, sg_j0_vs); +#else + tile_store(A_tile_half, A, d, q, lda, sg_i0_vs, sg_j0_vs); +#endif +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h index 627fdf758586a3..40ac211b1d1026 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h @@ -19,6 +19,10 @@ #define EXE_MODE_AGE_BASED "-cl-no-subgroup-ifp" #define EXE_MODE_NO_PRERA_SCH "-cl-intel-no-prera-scheduling" +namespace micro { +struct MicroKernelPackage; +} // namspace + namespace kernel_selector { #ifndef UNUSED @@ -64,6 +68,7 @@ struct KernelCode { struct clKernelData { KernelCode code; KernelParams params; + std::vector> micro_kernels; bool skip_execution = false; }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h index 2154a6c9a51a60..4d776bf41f2439 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h @@ -357,6 +357,21 @@ enum class dev_type { discrete_gpu = 1 }; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Arch type +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +enum class gpu_arch { + unknown = 0, + gen9 = 1, + gen11 = 2, + xe_lp = 3, + xe_hp = 4, + xe_hpg = 5, + xe_hpc = 6, + xe2 = 7, +}; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // EngineInfo 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -377,9 +392,12 @@ struct EngineInfo { bool enable_sub_groups_emulation = false; bool bOptHintsSupport = false; bool bLocalBlockIOSupport = false; + bool supports_microkernels = false; uint32_t vendor_id = 0x0; dev_type deviceType = dev_type::integrated_gpu; uint32_t computeUnitsCount = 0; + uint32_t ip_version = 0; + gpu_arch arch = gpu_arch::unknown; uint32_t maxThreadsPerExecutionUnit = 0; uint32_t maxThreadsPerDevice = 0; uint64_t maxWorkGroupSize = 0; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp new file mode 100644 index 00000000000000..e3604a481a8f09 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -0,0 +1,630 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef ENABLE_ONEDNN_FOR_GPU + +#include "sdpa_kernel_micro.h" +#include "common_tools.h" +#include "common_types.h" +#include "jitter.h" +#include "kernel_selector_common.h" +#include "kernel_selector_params.h" +#include "micro_utils.hpp" +#include "tensor_type.h" + +#include +#include +#include +#include + +namespace kernel_selector { + +namespace { + +size_t subgroup_size(gpu_arch arch) { + switch (arch) { + case gpu_arch::gen9: + case gpu_arch::gen11: + case gpu_arch::xe_lp: + case gpu_arch::xe_hp: + case gpu_arch::xe_hpg: return 8; + case gpu_arch::xe_hpc: + case gpu_arch::xe2: return 16; + default: return 0; + } +} + +inline int64_t get_d_max(int64_t head_size) { + for (int64_t i = 32; i <= 1024; i *= 2) + if (head_size <= i) + return i; + return head_size; +} + +micro::Type convert_type(Datatype t) { + switch (t) { + case Datatype::F32: return micro::Type::f32; + case Datatype::F16: return micro::Type::f16; + default: break; + } + OPENVINO_THROW("Unsupported dt: ", toString(t)); +} + +Tensor::NDims normalize_dims(const DataTensor& qkv) { + auto dims = qkv.GetDims(); // xyfb + std::reverse(dims.begin(), dims.end()); // bfyx + return dims; +} + +Tensor::Dim get_num_heads(const DataTensor& qkv, const std::vector& order) { + return normalize_dims(qkv)[order[1]]; +} + +Tensor::Dim get_seq_length(const DataTensor& qkv, const std::vector& order) { + return normalize_dims(qkv)[order[2]]; +} + +struct sdpa_config_t { + int unroll_m_kq, unroll_n_kq; // Subgroup tile sizes for K*Q GEMM + int unroll_m_vs, unroll_n_vs; // Subgroup tile sizes for V*S GEMM + int wg_m_kq, wg_n_kq; // Workgroup configuration for K*Q GEMM + int wg_m_vs, wg_n_vs; // Workgroup configuration for V*S GEMM +}; + +// Kernel configurations: +// h -- maximum head size = N +// s -- target sequence length = M +// 2nd -- second token (thin Q) +sdpa_config_t xehpg_h32 = {32, 16, 16, 16, 2, 16, 2, 16}; +sdpa_config_t xehpg_h32_s256 = {16, 16, 16, 16, 2, 8, 2, 8}; +sdpa_config_t xehpg_h32_s64 = {16, 16, 16, 8, 4, 4, 2, 8}; +sdpa_config_t xehpg_h32_s32 = {8, 8, 8, 8, 4, 4, 4, 4}; +sdpa_config_t xehpg_h32_2nd = {8, 32, 16, 8, 8, 1, 2, 4}; + +sdpa_config_t xehpg_h64 = {32, 16, 16, 16, 4, 8, 4, 8}; +sdpa_config_t xehpg_h64_s128 = {16, 16, 16, 16, 4, 8, 4, 8}; +sdpa_config_t xehpg_h64_s64 = {32, 16, 16, 8, 8, 4, 4, 8}; +sdpa_config_t xehpg_h64_2nd = {8, 16, 16, 8, 8, 1, 4, 2}; + +sdpa_config_t xehpg_h128 = {16, 16, 32, 8, 8, 4, 4, 8}; +sdpa_config_t xehpg_h128_s32 = {16, 16, 16, 8, 16, 2, 8, 4}; +sdpa_config_t xehpg_h128_2nd = {8, 16, 16, 8, 
16, 1, 8, 2}; +sdpa_config_t xehpg_h128_s256_2nd = {8, 16, 32, 8, 8, 1, 4, 2}; + +sdpa_config_t xehpg_h256 = {16, 16, 32, 8, 16, 2, 8, 4}; +sdpa_config_t xehpg_h256_s128 = {8, 16, 32, 16, 8, 4, 8, 4}; +sdpa_config_t xehpg_h256_s32 = {8, 16, 32, 8, 16, 2, 8, 4}; +sdpa_config_t xehpg_h256_2nd = {8, 8, 16, 8, 16, 1, 16, 1}; +sdpa_config_t xehpg_h256_s64_2nd = {16, 8, 16, 8, 16, 1, 16, 1}; +sdpa_config_t xehpg_h256_s32_2nd = {16, 16, 32, 8, 16, 1, 8, 2}; + +sdpa_config_t xehpc_h32 = {16, 64, 32, 16, 4, 2, 1, 8}; +sdpa_config_t xehpc_h32_s32 = {16, 16, 16, 16, 2, 4, 2, 4}; +sdpa_config_t xehpc_h32_2nd = {16, 64, 16, 16, 8, 1, 2, 4}; + +sdpa_config_t xehpc_h64 = {16, 64, 32, 16, 8, 2, 2, 8}; +sdpa_config_t xehpc_h64_s64 = {32, 32, 32, 16, 4, 2, 2, 4}; +sdpa_config_t xehpc_h64_s32 = {16, 16, 16, 16, 4, 2, 4, 2}; +sdpa_config_t xehpc_h64_2nd = {32, 32, 32, 16, 4, 1, 2, 2}; +sdpa_config_t xehpc_h64_s64_2nd = {16, 16, 16, 16, 4, 1, 4, 1}; + +sdpa_config_t xehpc_h128 = {16, 64, 32, 16, 16, 2, 4, 8}; +sdpa_config_t xehpc_h128_s64 = {16, 32, 32, 32, 4, 2, 4, 2}; +sdpa_config_t xehpc_h128_s32 = {16, 16, 16, 16, 8, 2, 8, 2}; +sdpa_config_t xehpc_h128_2nd = {32, 32, 32, 16, 8, 1, 4, 2}; + +sdpa_config_t xehpc_h256 = {16, 32, 32, 32, 8, 4, 8, 4}; +sdpa_config_t xehpc_h256_s64 = {16, 32, 32, 32, 8, 1, 8, 1}; +sdpa_config_t xehpc_h256_2nd = {16, 16, 16, 16, 16, 1, 16, 1}; + +sdpa_config_t *choose_config_xehpg(int head_size, int seq, bool thin_q) { + if (head_size <= 32) { + if (thin_q) return &xehpg_h32_2nd; + if (seq <= 32) return &xehpg_h32_s32; + if (seq <= 64) return &xehpg_h32_s64; + if (seq <= 256) return &xehpg_h32_s256; + return &xehpg_h32; + } else if (head_size <= 64) { + if (thin_q) return &xehpg_h64_2nd; + if (seq <= 64) return &xehpg_h64_s64; + if (seq <= 128) return &xehpg_h64_s128; + return &xehpg_h64; + } else if (head_size <= 128) { + if (thin_q) { + if (seq <= 256) return &xehpg_h128_s256_2nd; + return &xehpg_h128_2nd; + } + if (seq <= 32) return &xehpg_h128_s32; + return &xehpg_h128; + } else if (head_size <= 256) { + if (thin_q) { + if (seq <= 32) return &xehpg_h256_s32_2nd; + if (seq <= 64) return &xehpg_h256_s64_2nd; + return &xehpg_h256_2nd; + } + if (seq <= 32) return &xehpg_h256_s32; + if (seq <= 128) return &xehpg_h256_s128; + return &xehpg_h256; + } + return nullptr; +} + +sdpa_config_t *choose_config_xehpc(int head_size, int seq, bool thin_q) { + if (head_size <= 32) { + if (thin_q) return &xehpc_h32_2nd; + if (seq <= 32) return &xehpc_h32_s32; + return &xehpc_h32; + } else if (head_size <= 64) { + if (thin_q) { + if (seq <= 64) return &xehpc_h64_s64_2nd; + return &xehpc_h64_2nd; + } + if (seq <= 32) return &xehpc_h64_s32; + if (seq <= 64) return &xehpc_h64_s64; + return &xehpc_h64; + } else if (head_size <= 128) { + if (thin_q) return &xehpc_h128_2nd; + if (seq <= 32) return &xehpc_h128_s32; + if (seq <= 64) return &xehpc_h128_s64; + return &xehpc_h128; + } else if (head_size <= 256) { + if (thin_q) return &xehpc_h256_2nd; + if (seq <= 64) return &xehpc_h256_s64; + return &xehpc_h256; + } + return nullptr; +} + +} // namespace + +std::mutex SDPAKernelMicro::m; + +void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Package& gemm_kq, micro::Package& gemm_vs, bool is_prefill) const { + // TODO: Remove once micro API is thread safe + std::lock_guard l(m); + const auto& Q = params.inputs[0]; + const auto& K = params.inputs[1]; + const auto& V = params.inputs[2]; + + auto& out = params.outputs[0]; + const auto head_size = params.conf.head_size; + const auto d_max 
= get_d_max(head_size); + const Tensor::Dim n_keys = get_seq_length(K, params.input1_order); + const Tensor::Dim n_queries = get_seq_length(Q, params.input0_order); + const Tensor::Dim n_values = V.X(); + const auto batch = out.Batch().v * out.Feature().v; + + /* Retrieve pre-tuned kernel configuration */ + sdpa_config_t *config = nullptr; + bool thin_q = (!n_queries.is_dynamic && (n_queries.v <= 16)) || !is_prefill; + + switch (params.engineInfo.arch) { + case gpu_arch::xe_hpg: { + config = choose_config_xehpg(static_cast(head_size), static_cast(n_keys.v), thin_q); + break; + } + case gpu_arch::xe_hpc: + case gpu_arch::xe2: { + config = choose_config_xehpc(static_cast(head_size), static_cast(n_keys.v), thin_q); + break; + } + default: break; + } + + /* Get device information */ + micro::HWInformation hw_info; + hw_info.euCount = params.engineInfo.computeUnitsCount; + hw_info.gmdid = params.engineInfo.ip_version; + hw_info.systolicAvailable = params.engineInfo.supports_immad; + + /* Set up GEMMProblem structure for first GEMM: K^T * Q */ + micro::GEMMProblem problem; + problem.Ta = problem.Ta_ext = convert_type(K.GetDType()); + problem.Tb = problem.Tb_ext = convert_type(Q.GetDType()); + problem.Tc = problem.Tc_ext = micro::Type::f32; + problem.Ts = problem.Tc; + + auto problem_kq = problem; + problem_kq.A.layout = micro::MatrixLayout::T; + problem_kq.B.layout = micro::MatrixLayout::Pr; + problem_kq.C.layout = micro::MatrixLayout::T; + problem_kq.A.setAlignment(micro::alignment_for_ld(head_size * problem.Ta)); + problem_kq.B.setAlignment(64); // Q is packed in VNNI format in SLM + problem_kq.B.crosspack = 2; + problem_kq.B.tileR = d_max; + problem_kq.B.tileC = static_cast(subgroup_size(params.engineInfo.arch)); + + /* Set up problem size information */ + micro::SizeParams sizes; + sizes.m = static_cast(n_keys.v); + sizes.n = static_cast(n_queries.v); + sizes.k = static_cast(head_size); + sizes.batch = static_cast(batch); + + /* Set up microkernel requirements */ + std::vector reqs_kq; + reqs_kq.push_back(micro::StrategyRequirement::UnrollM == config->unroll_m_kq); + reqs_kq.push_back(micro::StrategyRequirement::UnrollN == config->unroll_n_kq); + reqs_kq.push_back(micro::StrategyRequirement::WGM == config->wg_m_kq); + reqs_kq.push_back(micro::StrategyRequirement::WGN == config->wg_n_kq); + + /* Set up microkernel options */ + micro::GEMMProtocol::Options opts_kq; + opts_kq.localB = true; + opts_kq.slmPtr = true; + + /* Ask microkernel provider for microkernel */ + gemm_kq = micro::select_gemm_microkernel(opts_kq, hw_info, sizes, problem_kq, reqs_kq); + + /* Update for second GEMM: V*S */ + auto problem_vs = problem; + problem_vs.Ta = problem_vs.Ta_ext = convert_type(V.GetDType()); + problem_vs.A.layout = micro::MatrixLayout::N; + problem_vs.B.layout = micro::MatrixLayout::Pr; + problem_vs.C.layout = micro::MatrixLayout::N; + problem_vs.A.setAlignment(micro::alignment_for_ld(head_size * problem.Ta)); + problem_vs.B.setAlignment(64); // S is packed in SLM + problem_vs.B.crosspack = 16; + sizes.m = static_cast(n_values.v); + sizes.n = gemm_kq.getSetting("wg_tile_n"); + sizes.k = gemm_kq.getSetting("wg_tile_m"); + + /* Set up special kernel requirements */ + std::vector reqs_vs; + reqs_vs.push_back(micro::StrategyRequirement::UnrollM == config->unroll_m_vs); + reqs_vs.push_back(micro::StrategyRequirement::UnrollN == config->unroll_n_vs); + reqs_vs.push_back(micro::StrategyRequirement::WGM == config->wg_m_vs); + reqs_vs.push_back(micro::StrategyRequirement::WGN == config->wg_n_vs); + + 
micro::GEMMProtocol::Options opts_vs; + opts_vs.localB = true; + opts_vs.slmPtr = true; + + auto adjust_vs = [](micro::GEMMStrategy &strategy) { + /* Enable dpasw */ + strategy.dpasw |= strategy.fused; + }; + /* Ask microkernel provider for microkernel */ + gemm_vs = micro::select_gemm_microkernel(opts_vs, hw_info, sizes, problem_vs, reqs_vs, adjust_vs); +} + +ParamsKey SDPAKernelMicro::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F16); + + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + + k.EnableDifferentTypes(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDynamicShapesSupport(); + + return k; +} + +bool SDPAKernelMicro::Validate(const Params& p) const { + if (!Parent::Validate(p)) + return false; + + const sdpa_params& params = static_cast(p); + + if (params.engineInfo.arch < gpu_arch::xe_hpg || !params.engineInfo.supports_microkernels) + return false; + + if (params.conf.is_causal) + return false; + + if (params.indirect_axis != -1) + return false; + + auto Q_num_heads_dim = get_num_heads(params.inputs[0], params.input0_order); + auto K_num_heads_dim = get_num_heads(params.inputs[1], params.input1_order); + auto V_num_heads_dim = get_num_heads(params.inputs[2], params.input2_order); + + if (params.input0_order != params.input1_order || params.input0_order != params.input2_order) + return false; + + if (params.input0_order[3] != 3) + return false; + + if (Q_num_heads_dim.is_dynamic || K_num_heads_dim.is_dynamic || V_num_heads_dim.is_dynamic || K_num_heads_dim.v != V_num_heads_dim.v) + return false; + + return true; +} + +JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const micro::Package& gemm_kq, const micro::Package& gemm_vs) const { + auto jit = MakeBaseParamsJitConstants(params); + const auto& prim_params = dynamic_cast(params); + + const auto& Q = prim_params.inputs[0]; + const auto& K = prim_params.inputs[1]; + const auto& V = prim_params.inputs[2]; + + const auto head_size = prim_params.conf.head_size; + + auto ldq = head_size * Q.ElementSize(); + auto ldk = head_size * K.ElementSize(); + auto ldv = head_size * V.ElementSize(); + auto lda = head_size * prim_params.outputs[0].ElementSize(); + + const auto d_max = get_d_max(head_size); + const auto n_keys = get_seq_length(K, prim_params.input1_order); + const auto n_queries = get_seq_length(Q, prim_params.input0_order); + const auto n_values = V.X(); + + jit.AddConstant(MakeJitConstant("D_MAX", d_max)); + jit.AddConstant(MakeJitConstant("SUBGROUP_SIZE", subgroup_size(prim_params.engineInfo.arch))); + jit.AddConstant(MakeJitConstant("INVERT_SCALE", false)); + jit.AddConstant(MakeJitConstant("SCALE_DATA_T", "half")); + + jit.AddConstant(MakeJitConstant("WITH_ATTN_MASK", params.inputs.size() > 3)); + jit.AddConstant(MakeJitConstant("WITH_SCALE", params.inputs.size() > 4)); + jit.AddConstant(MakeJitConstant("Q_ALIGN", micro::alignment_for_ld(ldq))); + jit.AddConstant(MakeJitConstant("K_ALIGN", micro::alignment_for_ld(ldk))); + jit.AddConstant(MakeJitConstant("V_ALIGN", micro::alignment_for_ld(ldv))); + jit.AddConstant(MakeJitConstant("A_ALIGN", micro::alignment_for_ld(lda))); + + jit.AddConstant(MakeJitConstant("TRANSPOSE_K", false)); + + int tile_k = gemm_kq.getSetting("wg_tile_m"); + int tile_q = gemm_kq.getSetting("wg_tile_n"); + int tile_v = gemm_vs.getSetting("wg_tile_m"); + + bool d_full = (head_size == d_max); + bool v_full = (head_size == tile_v); + bool 
k_full = !n_keys.is_dynamic && (n_keys.v % tile_k) == 0; + bool q_full = !n_queries.is_dynamic && (n_queries.v % tile_q) != 0; + + auto Q_num_heads_dim = get_num_heads(Q, params.input0_order); + auto K_num_heads_dim = get_num_heads(K, params.input1_order); + + jit.AddConstant(MakeJitConstant("REMAINDER_K", !k_full)); + jit.AddConstant(MakeJitConstant("KV_GROUP_SIZE", Q_num_heads_dim.v / K_num_heads_dim.v)); + + if (d_full) { + if (ldq % 4 == 0) + jit.AddConstant(MakeJitConstant("BLOCK_Q", 1)); + if (lda % 4 == 0 && v_full) + jit.AddConstant(MakeJitConstant("BLOCK_A", 1)); + jit.AddConstant(MakeJitConstant("REMAINDER_Q", !q_full)); + } else if (params.engineInfo.arch >= gpu_arch::xe_hpc) { + auto vbytes = n_values.v * V.ElementSize(); + if (lda % 16 == 0 && vbytes % 4 == 0) + jit.AddConstant(MakeJitConstant("BLOCK_2D_A", 1)); + } + + if (params.engineInfo.arch >= gpu_arch::xe_hpc) { + jit.AddConstant(MakeJitConstant("PREFETCH_MASK", 1)); + jit.AddConstant(MakeJitConstant("PREFETCH_K0", 1)); + jit.AddConstant(MakeJitConstant("PREFETCH_K", 1)); + jit.AddConstant(MakeJitConstant("PREFETCH_V", 1)); + bool no_rem = d_full && v_full && k_full; + jit.AddConstant(MakeJitConstant("PREFETCH_REMAINDER", !no_rem)); + jit.AddConstant(MakeJitConstant("PREFETCH_D_MAX", std::min(d_max, 64))); + } + + auto unit_parameters = [](std::string prefix) { + JitConstants definitions({}); + for (size_t i = 0; i < 4; i++) { + definitions.AddConstant(MakeJitConstant(prefix + "_B" + std::to_string(i), 1)); + definitions.AddConstant(MakeJitConstant(prefix + "_SB" + std::to_string(i), 1)); + } + + return definitions; + }; + + auto convert_strides = [](std::string target_prefix, std::string source_prefix, const std::vector order) { + JitConstants definitions({}); + + std::vector target_definitions = { + target_prefix + "_S0", + target_prefix + "_S1", + target_prefix + "_S2", + target_prefix + "_S3", + }; + + std::vector source_definitions = { + source_prefix + "_BATCH_PITCH", + source_prefix + "_FEATURE_PITCH", + source_prefix + "_Y_PITCH", + source_prefix + "_X_PITCH", + }; + + for (size_t i = 0; i < target_definitions.size(); i++) { + definitions.AddConstant(MakeJitConstant(target_definitions[order[i]], source_definitions[i])); + } + + return definitions; + }; + + jit.Merge(convert_strides("QRY", "INPUT0", prim_params.input0_order)); + jit.Merge(convert_strides("KEY", "INPUT1", prim_params.input1_order)); + jit.Merge(convert_strides("VAL", "INPUT2", prim_params.input2_order)); + jit.Merge(convert_strides("DST", "OUTPUT", prim_params.output_order)); + + jit.Merge(unit_parameters("QRY")); + jit.Merge(unit_parameters("KEY")); + jit.Merge(unit_parameters("VAL")); + jit.Merge(unit_parameters("DST")); + + return jit; +} + +CommonDispatchData SDPAKernelMicro::SetDefault(const sdpa_params& params, const micro::Package& gemm_kq, const micro::Package& gemm_vs) const { + CommonDispatchData dispatch_data; + + auto wg_tile_q = gemm_kq.getSetting("wg_tile_n"); + auto sg_per_wg = gemm_kq.getSetting("sg_per_wg_m") * gemm_kq.getSetting("sg_per_wg_n"); + + dispatch_data.lws = {subgroup_size(params.engineInfo.arch), (size_t)sg_per_wg, 1}; + dispatch_data.gws = dispatch_data.lws; + + dispatch_data.gws[0] *= CeilDiv(get_seq_length(params.inputs[0], params.input0_order).v, wg_tile_q); + dispatch_data.gws[1] *= params.outputs[0].Feature().v; + dispatch_data.gws[2] *= params.outputs[0].Batch().v; + + return dispatch_data; +} + +clKernelData SDPAKernelMicro::get_kernel_data(const sdpa_params& params, bool is_prefill) const { + auto name = 
kernelName + (is_prefill ? "_prefill" : "_generate"); + + std::vector gemms(2); // KQ and VS + init_microkernels(params, gemms[kq_id], gemms[vs_id], is_prefill); + auto dispatch_data = SetDefault(params, gemms[kq_id], gemms[vs_id]); + auto entry_point = GetEntryPoint(name, params.layerID, params); + auto jit = CreateJit(name, GetJitConstants(params, gemms[kq_id], gemms[vs_id]), entry_point); + clKernelData kernel; + + FillCLKernelData(kernel, dispatch_data, params.engineInfo, kernelName, jit, entry_point, + "", false, false, static_cast(params.inputs.size()), + GetFusedPrimitiveInputsCount(params), 1, params.is_shape_agnostic); + + kernel.params.arguments.clear(); + if (params.is_shape_agnostic ) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0}); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1}); // K + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); // Q + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 2}); // V + kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 0}); // A + + if (params.inputs.size() >= 4) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 3}); // mask + if (params.inputs.size() >= 5) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 4}); // Scale + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // D + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // K + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 2}); // Q + + const auto& Q = params.inputs[0]; + const auto& K = params.inputs[1]; + + const auto n_queries = get_seq_length(Q, params.input0_order); + const auto n_keys = get_seq_length(K, params.input1_order); + + auto head_size = params.conf.head_size; + + ScalarDescriptor s_d; + s_d.t = ScalarDescriptor::Types::INT32; + s_d.v.s32 = static_cast(head_size); + + ScalarDescriptor s_k; + s_k.t = ScalarDescriptor::Types::INT32; + s_k.v.s32 = static_cast(n_keys.v); + + ScalarDescriptor s_q; + s_q.t = ScalarDescriptor::Types::INT32; + s_q.v.s32 = static_cast(n_queries.v); + + kernel.params.scalars.push_back(s_d); + kernel.params.scalars.push_back(s_k); + kernel.params.scalars.push_back(s_q); + + /* Generate microkernel shims */ + micro::ShimOptions shim_options; + shim_options.subgroupSize = static_cast(subgroup_size(params.engineInfo.arch)); + shim_options.useTileOps = true; + shim_options.decorator = "kq"; + + kernel.code.kernelString->jit += generateShim(gemms[kq_id], micro::HostLanguage::OpenCL_C, shim_options); + + shim_options.microkernelID++; + shim_options.decorator = "vs"; + kernel.code.kernelString->jit += generateShim(gemms[vs_id], micro::HostLanguage::OpenCL_C, shim_options); + + if (gemms[kq_id].grfMin > 128 || gemms[vs_id].grfMin > 128) + kernel.code.kernelString->options += " -cl-intel-256-GRF-per-thread"; + + std::string extra_options = " -Dcl_intel_dot_accumulate"; + extra_options += " -Dcl_intel_global_float_atomic"; + extra_options += " -Dcl_intel_subgroup_matrix_multiply_accumulate"; + extra_options += " -Dcl_intel_subgroup_split_matrix_multiply_accumulate"; + kernel.code.kernelString->options += extra_options; + + kernel.code.kernelString->batch_compilation = false; + kernel.code.kernelString->has_microkernels = true; + + for (auto& p : gemms) { + kernel.micro_kernels.push_back(std::make_shared(p)); + } + + return kernel; +} + +KernelsData SDPAKernelMicro::GetKernelsData(const Params& params) const { + const 
size_t num_kernels = 2; + KernelData kd = KernelData::Default(params, num_kernels); + const auto& prim_params = dynamic_cast(params); + + if (!Validate(params)) { + return {}; + } + + for (size_t i = 0; i < num_kernels; i++) { + kd.kernels[i] = get_kernel_data(prim_params, i == prefill_id); + } + + GetUpdateDispatchDataFunc(kd); + + return { kd }; +} + +void SDPAKernelMicro::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kernel_data) { + const auto& prim_params = static_cast(params); + const auto& Q = prim_params.inputs[0]; + const auto& K = prim_params.inputs[1]; + + const auto n_queries = get_seq_length(Q, prim_params.input0_order); + const auto n_keys = get_seq_length(K, prim_params.input1_order); + + auto head_size = prim_params.conf.head_size; + + ScalarDescriptor s_d; + s_d.t = ScalarDescriptor::Types::INT32; + s_d.v.s32 = static_cast(head_size); + + ScalarDescriptor s_k; + s_k.t = ScalarDescriptor::Types::INT32; + s_k.v.s32 = static_cast(n_keys.v); + + ScalarDescriptor s_q; + s_q.t = ScalarDescriptor::Types::INT32; + s_q.v.s32 = static_cast(n_queries.v); + + // TODO: Currently 2nd token version works slower than prefill version + const bool is_prefill = true;//n_queries.v > 1; + + OPENVINO_ASSERT(kernel_data.kernels.size() == 2, "[GPU] Invalid kernels size for update dispatch data func"); + + size_t target_kernel = is_prefill ? prefill_id : generate_id; + + kernel_data.kernels[prefill_id].skip_execution = true; + kernel_data.kernels[generate_id].skip_execution = true; + + const auto& gemms = kernel_data.kernels[target_kernel].micro_kernels; + auto dispatchData = SetDefault(prim_params, gemms[kq_id]->p, gemms[vs_id]->p); + kernel_data.kernels[target_kernel].params.workGroups.global = dispatchData.gws; + kernel_data.kernels[target_kernel].params.workGroups.local = dispatchData.lws; + kernel_data.kernels[target_kernel].skip_execution = KernelData::SkipKernelExecution(prim_params); + + kernel_data.kernels[target_kernel].params.scalars.clear(); + kernel_data.kernels[target_kernel].params.scalars.push_back(s_d); + kernel_data.kernels[target_kernel].params.scalars.push_back(s_k); + kernel_data.kernels[target_kernel].params.scalars.push_back(s_q); + }; +} + +KernelsPriority SDPAKernelMicro::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_1; +} +} // namespace kernel_selector + +#endif // ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.h new file mode 100644 index 00000000000000..f282c52f8437c9 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.h @@ -0,0 +1,47 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef ENABLE_ONEDNN_FOR_GPU + +#pragma once + +#include "sdpa_kernel_base.h" +#include "micro_utils.hpp" + +namespace kernel_selector { +class SDPAKernelMicro : public SDPAKernelBase { +public: + using Parent = SDPAKernelBase; + SDPAKernelMicro() : SDPAKernelBase("sdpa_micro") {} + virtual ~SDPAKernelMicro() {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params& p) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; + CommonDispatchData SetDefault(const 
sdpa_params& params, const micro::Package& gemm_kq, const micro::Package& gemm_vs) const; + JitConstants GetJitConstants(const sdpa_params& params, const micro::Package& gemm_kq, const micro::Package& gemm_vs) const; + std::vector GetSupportedFusedOps() const override { + return {}; + } + + void init_microkernels(const sdpa_params& params, micro::Package& gemm_kq, micro::Package& gemm_vs, bool is_prefill) const; + clKernelData get_kernel_data(const sdpa_params& params, bool is_prefill) const; + +private: + static constexpr size_t prefill_id = 0; + static constexpr size_t generate_id = 1; + + static constexpr size_t kq_id = 0; + static constexpr size_t vs_id = 1; + + static std::mutex m; +}; +} // namespace kernel_selector + + +#endif // ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp index 23dc964eb18eea..e53338e7c4522d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp @@ -273,7 +273,7 @@ void SDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const { }; } -KernelsPriority SDPAKernelOpt::GetKernelsPriority(const Params& /*params*/) const { - return FORCE_PRIORITY_1; +KernelsPriority SDPAKernelOpt::GetKernelsPriority(const Params& params) const { + return params.engineInfo.supports_immad ? FORCE_PRIORITY_2 : FORCE_PRIORITY_1; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp index b58f04f23e2643..7e88d0ad71fed1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp @@ -5,12 +5,16 @@ #include "sdpa_kernel_selector.h" #include "sdpa_kernel_ref.h" #include "sdpa_kernel_opt.h" +#include "sdpa_kernel_micro.h" namespace kernel_selector { sdpa_kernel_selector::sdpa_kernel_selector() { Attach(); Attach(); +#ifdef ENABLE_ONEDNN_FOR_GPU + Attach(); +#endif } KernelsData sdpa_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp new file mode 100644 index 00000000000000..828c9016d8669e --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#ifdef ENABLE_ONEDNN_FOR_GPU + +#ifdef UNUSED +# undef UNUSED +#endif + +#ifndef NOMINMAX +# define NOMINMAX +#endif + +#include "gpu/intel/microkernels/package.hpp" +#include "gpu/intel/jit/gemm/microkernel_provider.hpp" +#include "gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp" +#include "gpu/intel/microkernels/shim.hpp" + +namespace micro { + +using Package = dnnl::impl::gpu::intel::micro::Package; +using HWInformation = dnnl::impl::gpu::intel::jit::HWInformation; +using GEMMProblem = dnnl::impl::gpu::intel::jit::GEMMProblem; +using GEMMStrategy = dnnl::impl::gpu::intel::jit::GEMMStrategy; +using GEMMProtocol = dnnl::impl::gpu::intel::micro::GEMMProtocol; +using MatrixLayout = dnnl::impl::gpu::intel::jit::MatrixLayout; +using Type = dnnl::impl::gpu::intel::jit::Type; +using SizeParams = dnnl::impl::gpu::intel::jit::SizeParams; +using 
StrategyRequirement = dnnl::impl::gpu::intel::jit::StrategyRequirement; +using ShimOptions = dnnl::impl::gpu::intel::micro::ShimOptions; +using HostLanguage = dnnl::impl::gpu::intel::micro::HostLanguage; + +// Wrapper for Package which is used in clKernelData with forward declaration +// to avoid including this header in many places in plugin +// which may cause symbols conflicts with oneDNN +struct MicroKernelPackage { + explicit MicroKernelPackage(Package _p) : p(_p) {} + Package p; +}; + +inline Package select_gemm_microkernel(GEMMProtocol protocol, HWInformation hw_info, SizeParams sizes, const GEMMProblem &problem, + const std::vector &reqs = std::vector(), + void (*strategyAdjuster)(GEMMStrategy &strategy) = nullptr) { + return dnnl::impl::gpu::intel::jit::selectGEMMMicrokernel(protocol, hw_info, sizes, problem, reqs, strategyAdjuster); +} + +static inline int alignment_for_ld(int ld) { + return dnnl::impl::gpu::intel::jit::alignmentForLD(ld); +} + +} // namespace micro + +#undef UNUSED + +#endif // ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp index 8863b47fe55bd2..cd8128baff37c9 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp @@ -22,7 +22,7 @@ primitive_db::primitive_db() : primitives({ #include "ks_primitive_db.inc" }), - batch_header_str({ + batch_headers({ #include "ks_primitive_db_batch_headers.inc" }) { } diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h index 73858754d35e39..e384f6c9879fb5 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h +++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h @@ -22,7 +22,7 @@ struct primitive_db { primitive_db(); std::vector get(const primitive_id& id) const; - std::vector get_batch_header_str() const { return std::move(batch_header_str); } + std::map get_batch_headers() const { return std::move(batch_headers); } private: struct case_insensitive_compare { @@ -35,7 +35,7 @@ struct primitive_db { } }; std::multimap primitives; - std::vector batch_header_str; + std::map batch_headers; }; } // namespace cache diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py b/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py index 4f746d8cc526f6..116844f3bccfc7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py +++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py @@ -198,7 +198,8 @@ def batch_headers_to_str(self): characters = 1 # Newline character above res = "" for h in self.batch_headers: - res += '(std::string) R"(\n' + header_name = h[:h.find('.cl')] + res += '{{"{}",\n(std::string) R"(\n'.format(header_name) header_file = os.path.abspath(os.path.join(os.path.dirname(self.kernels_folder + "/include/batch_headers"), "batch_headers/" + h)) content = [] with open(header_file) as f: @@ -207,11 +208,11 @@ def batch_headers_to_str(self): if line.startswith('#include'): continue if (i + 1) % max_lines == 0 or characters + len(line) + 1 > max_characters: - res += ')",' + ' (std::string) R"(' + res += ')"\n + (std::string) R"(' characters = 0 res += '{}\n'.format(line.rstrip()) characters += len(line) + 1 - res += ')",\n\n' + res += ')"},\n\n' return self.post_process_sources(res) def post_process_sources(self, content): diff --git 
a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp index d002c868ffd225..0ba780183b186d 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp @@ -3,7 +3,6 @@ // #include "intel_gpu/plugin/program_builder.hpp" -#include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/op/sdpa.hpp" #include "intel_gpu/op/indirect_sdpa.hpp" @@ -30,9 +29,15 @@ static void CreateScaledDotProductAttentionOp(ProgramBuilder& p, const std::shar auto layerName = layer_type_name_ID(op); bool is_causal = op->get_causal(); + auto order = ov::op::internal::SDPA::default_order(op->get_output_partial_shape(0).size()); auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, inputs, - is_causal); + is_causal, + -1, + order, + order, + order, + order); p.add_primitive(*op, sdpa_prim); } diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index b6db0233ee9093..e3f7c2cfe2f6b5 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -83,17 +83,18 @@ std::string Plugin::get_device_id(const ov::AnyMap& config) const { return id; } -void Plugin::transform_model(std::shared_ptr& model, const ExecutionConfig& config) const { +void Plugin::transform_model(std::shared_ptr& model, const ExecutionConfig& config, const std::shared_ptr& context) const { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::transform_model"); - auto deviceInfo = m_device_map.at(config.get_property(ov::device::id))->get_info(); - TransformationsPipeline transformations(config, deviceInfo); + TransformationsPipeline transformations(config, context); auto start = Time::now(); transformations.apply(model); GPU_DEBUG_LOG << "Transformations time: " << std::chrono::duration_cast(Time::now() - start).count() << " ms" << std::endl; } -std::shared_ptr Plugin::clone_and_transform_model(const std::shared_ptr& model, const ExecutionConfig& config) const { +std::shared_ptr Plugin::clone_and_transform_model(const std::shared_ptr& model, + const ExecutionConfig& config, + const std::shared_ptr& context) const { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::clone_and_transform_model"); GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_DEFINE_MEM_LOGGER("Plugin::clone_and_transform_model"); @@ -106,7 +107,7 @@ std::shared_ptr Plugin::clone_and_transform_model(const std::shared_p ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); } - transform_model(cloned_model, config); + transform_model(cloned_model, config, context); // Transformations for some reason may drop output tensor names, so here we copy those from the original model auto new_results = cloned_model->get_results(); @@ -169,7 +170,7 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< config.set_user_property(orig_config); config.apply_user_properties(context->get_engine().get_device_info()); - auto transformed_model = clone_and_transform_model(model, config); + auto transformed_model = clone_and_transform_model(model, config, context); { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::compile_model::CreateCompiledModel"); return std::make_shared(transformed_model, shared_from_this(), context, config); @@ -188,7 +189,7 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< config.set_user_property(orig_config); 
config.apply_user_properties(context_impl->get_engine().get_device_info()); - auto transformed_model = clone_and_transform_model(model, config); + auto transformed_model = clone_and_transform_model(model, config, context_impl); return std::make_shared(transformed_model, shared_from_this(), context_impl, config); } @@ -259,10 +260,10 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr& float query_model_ratio = config.get_property(ov::internal::query_model_ratio.name()).as(); auto supported = ov::get_supported_nodes(model, - [&config,this](std::shared_ptr& model) { + [&config,&ctx,this](std::shared_ptr& model) { std::map shapes; std::map> batch_dim; - transform_model(model, config); + transform_model(model, config, ctx); }, [&prog](std::shared_ptr node) { return prog.is_op_supported(node); @@ -709,7 +710,7 @@ uint32_t Plugin::get_max_batch_size(const ov::AnyMap& options) const { return static_cast(max_batch_size); } - TransformationsPipeline transformations(config, device_info); + TransformationsPipeline transformations(config, context); transformations.apply(cloned_model); program = std::make_shared(cloned_model, engine, config, true); std::pair device_memory_usage = program->get_compiled_program()->get_estimated_device_mem_usage(); @@ -778,7 +779,7 @@ uint32_t Plugin::get_optimal_batch_size(const ov::AnyMap& options) const { << ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl; } auto config = m_configs_map.at(device_id); - auto cloned_model = clone_and_transform_model(model, config); + auto cloned_model = clone_and_transform_model(model, config, context); ov::MemBandwidthPressure memPressure = ov::mem_bandwidth_pressure_tolerance(cloned_model, L3_cache_size); uint32_t batch = 1; if (memPressure.max_mem_tolerance != ov::MemBandwidthPressure::UNKNOWN) diff --git a/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp index 3a6315978577c8..d26e45e8fea9ba 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp @@ -45,14 +45,6 @@ TransposeSDPAMatcher::TransposeSDPAMatcher() { return std::dynamic_pointer_cast(output.get_node_shared_ptr()) == nullptr && is_fp_type(output); }; - auto is_dynamic = [](const ov::Output& output) -> bool { - bool is_dynamic = output.get_node_shared_ptr()->get_output_partial_shape(0).is_dynamic(); - size_t num_inputs = output.get_node_shared_ptr()->get_input_size(); - for (size_t idx = 0; idx < num_inputs; idx++) { - is_dynamic |= output.get_node_shared_ptr()->get_input_partial_shape(idx).is_dynamic(); - } - return is_dynamic; - }; auto input_q_m = any_input(not_transpose); auto input_k_m = any_input(not_transpose); @@ -70,10 +62,10 @@ TransposeSDPAMatcher::TransposeSDPAMatcher() { auto sdpa_in_k = std::make_shared(OutputVector{input_k_m, transpose_k_m}); auto sdpa_in_v = std::make_shared(OutputVector{input_v_m, transpose_v_m}); - auto sdpa_without_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v }, is_dynamic); - auto sdpa_with_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask }, is_dynamic); + auto sdpa_without_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v }); + auto sdpa_with_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask }); auto sdpa_with_attn_mask_and_scale_m = - wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask, input_scale }, is_dynamic); + 
wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask, input_scale }); auto sdpa_m = std::make_shared(OutputVector{sdpa_without_attn_mask_m, sdpa_with_attn_mask_m, sdpa_with_attn_mask_and_scale_m}); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 4b18dbf3fa4b71..bb7385cbe5dbb1 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -188,6 +188,10 @@ static bool is_non_supported_decompression_op(const std::shared_ptr func) { return false; } - // - Head size should be static dim - const auto head_size_dim = query_ps[query_ps.size() - 1]; - if (head_size_dim.is_dynamic()) - return false; + // For platforms with DPAS support we don't have any other shape-based limitations besides head_size being static and equal for QKV + if (device_info.supports_immad && cldnn::query_microkernels_supported(m_context->get_engine(), config)) + return true; // - Head size should be 128 for any model type; or should be in the range of 64 to 256 for stateful LLMs because of performance reasons. // This limitations is recommended to prevent performance drop in models with small head size, such as SD, diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp index 47483c6a1a0192..ac8d716882ef82 100644 --- a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp +++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp @@ -37,9 +37,32 @@ #include #endif +#ifdef ENABLE_ONEDNN_FOR_GPU +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include "gpu/intel/microkernels/fuser.hpp" +#endif + namespace { std::mutex cacheAccessMutex; +#ifdef ENABLE_ONEDNN_FOR_GPU +cl::Program fuse_microkernels(const cl::Context& context, const cl::Device& device, cl::Program& program, const std::string& code) { + using namespace dnnl::impl::gpu::intel; + std::vector> binaries = program.getInfo(); + OPENVINO_ASSERT(binaries.size() == 1); + std::vector binary = binaries[0]; + micro::fuseMicrokernels(binary, code.c_str()); + + cl::Program::Binaries fused_binary = { binary }; + cl::Program fused_program(context, {device}, fused_binary); + fused_program.build({device}); + + return fused_program; +} +#endif // ENABLE_ONEDNN_FOR_GPU + std::string reorder_options(const std::string& org_options) { std::stringstream ss(org_options); std::set sorted_options; @@ -128,7 +151,7 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code, const auto& batch_id = 0; // increase bucket id if and only if new bucket comes bucket_id = static_cast(program_buckets.size() - 1); - current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str)); + current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_headers)); } // This is a temporary walk-around to avoid severe performance drop. 
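Note on the kernels_cache change above: fuse_microkernels() is where the pre-generated microkernel ISA gets spliced into an already-compiled OpenCL program. The sketch below restates that flow outside the cache machinery, with the template arguments that the diff rendering dropped restored as assumptions (the real signatures live in kernels_cache.cpp and the OpenCL C++ wrapper); build_with_microkernels is a hypothetical helper name used only for illustration.

    // Minimal sketch of the binary-level fusion performed by fuse_microkernels().
    // Assumptions: the cl:: C++ wrapper (cl2-style) is available, and
    // fuseMicrokernels(binary, source) patches the microkernel blobs referenced by
    // the generated kernel source into the device binary.
    #include <string>
    #include <vector>
    #include <CL/opencl.hpp>
    #include "gpu/intel/microkernels/fuser.hpp"

    cl::Program build_with_microkernels(const cl::Context& ctx,
                                        const cl::Device& dev,
                                        const std::string& code,
                                        const std::string& options) {
        // 1. Regular OpenCL compilation of the generated source (kernel + shims).
        cl::Program program(ctx, code);
        program.build({dev}, options.c_str());

        // 2. Pull the device binary; a microkernel batch holds exactly one kernel.
        std::vector<std::vector<unsigned char>> binaries =
                program.getInfo<CL_PROGRAM_BINARIES>();
        std::vector<unsigned char> binary = binaries[0];

        // 3. Splice the pre-generated microkernel code into the binary.
        dnnl::impl::gpu::intel::micro::fuseMicrokernels(binary, code.c_str());

        // 4. Rebuild the program from the patched binary and return it.
        cl::Program::Binaries fused_binary = { binary };
        cl::Program fused(ctx, {dev}, fused_binary);
        fused.build({dev});
        return fused;
    }

This is consistent with the kernels_counter == 1 assertion in build_batch: the fuser is handed the source of the single kernel contained in a microkernel batch, which is also why such batches disable batch_compilation.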
@@ -159,13 +182,22 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code, || current_bucket.back().entry_point_to_id.find(entry_point) != current_bucket.back().entry_point_to_id.end() || need_separate_batch(entry_point)) { const auto& batch_id = static_cast(current_bucket.size()); - current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str)); + current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_headers)); } auto& current_batch = current_bucket.back(); current_batch.dump_custom_program = dump_custom_program; current_batch.entry_point_to_id.emplace(entry_point, std::make_pair(code.params, kernel_part_idx)); + current_batch.has_microkernels |= kernel_string->has_microkernels; + + // TODO: Technically, microkernels doesn't require specific headers, but we don't want to include + // some headers to all batches as it may lead to compilation error on some driver versions. + // Need to generalize work with headers to include only necessary parts + if (current_batch.has_microkernels) { + current_batch.source.insert(current_batch.source.begin(), current_batch.micro_headers.begin(), current_batch.micro_headers.end()); + } + current_batch.source.push_back(std::move(full_code)); current_batch.kernels_counter++; } @@ -195,12 +227,12 @@ kernels_cache::kernels_cache(engine& engine, const ExecutionConfig& config, uint32_t prog_id, std::shared_ptr task_executor, - const std::vector& batch_header_str) + const std::map& batch_headers) : _engine(engine) , _task_executor(task_executor) , _config(config) , _prog_id(prog_id) - , batch_header_str(std::move(batch_header_str)) { } + , batch_headers(std::move(batch_headers)) { } static std::vector getProgramBinaries(cl::Program program) { // Get the size of the program binary in bytes. @@ -288,6 +320,17 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program& dump_file << "*/\n"; } + if (batch.has_microkernels) { +#ifdef ENABLE_ONEDNN_FOR_GPU + OPENVINO_ASSERT(batch.kernels_counter == 1); + // Do we need full source code here (with batch headers)? + program = fuse_microkernels(cl_build_engine.get_cl_context(), cl_build_engine.get_cl_device(), program, batch.source.back()); +#else // ENABLE_ONEDNN_FOR_GPU + OPENVINO_THROW("[GPU] Can't compile kernel w/ microkernels as onednn is not available"); +#endif // ENABLE_ONEDNN_FOR_GPU + } + + program.createKernels(&kernels); if (is_cache_enabled()) { @@ -348,6 +391,7 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program& GPU_DEBUG_INFO << "-------- End of OpenCL build error" << std::endl; std::stringstream err_ss(err_log); std::string line; + std::stringstream err; int cnt = 0; while (std::getline(err_ss, line, '\n')) { @@ -355,14 +399,14 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program& cnt = 5; cnt--; if (cnt > 0) - std::cout << line << std::endl; + err << line << std::endl; else if (cnt == 0) - std::cout << "...." << std::endl; + err << "...." 
<< std::endl; } throw std::runtime_error("Program build failed(" + std::to_string(batch.bucket_id) + + "_part_" + std::to_string(batch.batch_id) - + "): You may enable OCL source dump to see the error log.\n"); + + "):\n" + err.str()); } } diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp index ca389718a49d23..d3775731bb01a5 100644 --- a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp +++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp @@ -54,19 +54,34 @@ class kernels_cache { size_t hash_value; uint32_t kernels_counter; source_code source; + source_code micro_headers; std::string options; bool dump_custom_program; + bool has_microkernels; std::map> entry_point_to_id; - explicit batch_program(int32_t _bucket_id, int32_t _batch_id, std::string _options, const std::vector& batch_header_str) + explicit batch_program(int32_t _bucket_id, int32_t _batch_id, std::string _options, const std::map& batch_headers) : bucket_id(_bucket_id), batch_id(_batch_id), hash_value(0), kernels_counter(0), - source(std::move(batch_header_str)), + source({}), options(_options), dump_custom_program(false), + has_microkernels(false), entry_point_to_id({}) { + static const std::vector micro_kernel_include_names { + "generic_vector_ops", + "tile_ops", + "sdpa_utils" + }; + for (const auto& kv : batch_headers) { + if (std::find(micro_kernel_include_names.begin(), micro_kernel_include_names.end(), kv.first) == micro_kernel_include_names.end()) { + source.push_back(kv.second); + } else { + micro_headers.push_back(kv.second); + } + } } }; @@ -83,7 +98,7 @@ class kernels_cache { compiled_kernels _kernels; std::map, uint32_t> _cached_binaries; std::unordered_map _cached_kernels; - std::vector batch_header_str; + std::map batch_headers; std::unordered_map _kernel_batch_hash; void get_program_source(const kernels_code& kernels_source_code, std::vector*) const; void build_batch(const engine& build_engine, const batch_program& batch, compiled_kernels& compiled_kernels); @@ -97,12 +112,9 @@ class kernels_cache { const ExecutionConfig& config, uint32_t prog_id, std::shared_ptr task_executor = nullptr, - const std::vector& batch_header_str = {}); + const std::map& batch_headers = {}); kernel::ptr get_kernel_from_cached_kernels(std::string id) const; std::vector get_kernels(kernel_impl_params params) const; - void set_batch_header_str(const std::vector &batch_headers) { - batch_header_str = std::move(batch_headers); - } bool validate_simple_kernel_execution(kernel::ptr kernel); diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index 15e379d5ad557b..74d9b033c64ca2 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -2,6 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 // +#ifdef ENABLE_ONEDNN_FOR_GPU +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include "gpu/intel/jit/jit_generator.hpp" +#endif // ENABLE_ONEDNN_FOR_GPU + #include "ocl_device.hpp" #include "ocl_common.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" @@ -43,6 +50,24 @@ namespace cldnn { namespace ocl { namespace { + +#ifdef ENABLE_ONEDNN_FOR_GPU +gpu_arch convert_ngen_arch(ngen::HW gpu_arch) { + switch (gpu_arch) { + case ngen::HW::Gen9: return gpu_arch::gen9; + case ngen::HW::Gen11: return gpu_arch::gen11; + case ngen::HW::XeLP: return gpu_arch::xe_lp; + case ngen::HW::XeHP: return gpu_arch::xe_hp; + case ngen::HW::XeHPG: return 
gpu_arch::xe_hpg; + case ngen::HW::XeHPC: return gpu_arch::xe_hpc; + case ngen::HW::Xe2: return gpu_arch::xe2; + case ngen::HW::Gen10: + case ngen::HW::Unknown: return gpu_arch::unknown; + } + return gpu_arch::unknown; +} +#endif + int driver_dev_id() { const std::vector unused_ids = { 0x4905, 0x4906, 0x4907, 0x4908 @@ -172,7 +197,7 @@ bool get_imad_support(const cl::Device& device) { return false; } -device_info init_device_info(const cl::Device& device) { +device_info init_device_info(const cl::Device& device, const cl::Context& context) { device_info info = {}; info.vendor_id = static_cast(device.getInfo()); info.dev_name = device.getInfo(); @@ -250,7 +275,8 @@ device_info init_device_info(const cl::Device& device) { bool nv_device_attr_supported = extensions.find("cl_nv_device_attribute_query") != std::string::npos; info.has_separate_cache = false; if (device_attr_supported) { - info.gfx_ver = parse_version(device.getInfo()); + info.ip_version = device.getInfo(); + info.gfx_ver = parse_version(info.ip_version); info.device_id = device.getInfo(); info.num_slices = device.getInfo(); info.num_sub_slices_per_slice = device.getInfo(); @@ -296,6 +322,17 @@ device_info init_device_info(const cl::Device& device) { info.num_ccs = std::max(num_queues, info.num_ccs); } + +#ifdef ENABLE_ONEDNN_FOR_GPU + using namespace dnnl::impl::gpu::intel::jit; + ngen::HW hw = ngen::HW::Unknown; + ngen::Product product = {ngen::ProductFamily::Unknown, 0}; + jit_generator::detectHWInfo(context.get(), device.get(), hw, product); + info.arch = convert_ngen_arch(hw); +#else // ENABLE_ONEDNN_FOR_GPU + info.arch = gpu_arch::unknown; +#endif // ENABLE_ONEDNN_FOR_GPU + return info; } @@ -331,7 +368,7 @@ ocl_device::ocl_device(const cl::Device dev, const cl::Context& ctx, const cl::P : _context(ctx) , _device(dev) , _platform(platform) -, _info(init_device_info(dev)) +, _info(init_device_info(dev, ctx)) , _mem_caps(init_memory_caps(dev, _info)) { } bool ocl_device::is_same(const device::ptr other) { diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_wrapper.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_wrapper.hpp index 57da8d5c748672..91a4d767ceaebe 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_wrapper.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_wrapper.hpp @@ -50,6 +50,7 @@ #pragma clang diagnostic ignored "-Wunused-variable" #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wignored-qualifiers" + #pragma clang diagnostic ignored "-Wdeprecated-declarations" #elif defined __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" @@ -57,6 +58,7 @@ #pragma GCC diagnostic ignored "-Wunused-variable" #pragma GCC diagnostic ignored "-Wunused-function" #pragma GCC diagnostic ignored "-Wignored-qualifiers" + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #if __GNUC__ >= 8 #pragma GCC diagnostic ignored "-Wcatch-value" #endif diff --git a/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp b/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp index 3ea2a654bb5416..8dd82c7f6f1bd8 100644 --- a/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp +++ b/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp @@ -14,6 +14,7 @@ #include "openvino/op/gather.hpp" #include "openvino/op/read_value.hpp" #include "openvino/op/reshape.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/op/shape_of.hpp" #include "openvino/op/transpose.hpp" #include "openvino/op/result.hpp" @@ -119,7 +120,6 @@ 
inline std::shared_ptr make_llm_kv_cache_pattern(ov::Dimension batch ov::ResultVector results{kv_present, matmul_out}; auto model = std::make_shared(results, params, "LLM-KV-Cache"); - ov::pass::VisualizeTree("model.svg").run_on_model(model); if (stateful) { ov::pass::MakeStateful({{in_kv_prev, kv_present}}).run_on_model(model); } diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index 8e835c371a6123..3f0a4d60a9dedf 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -12,7 +12,7 @@ if(ENABLE_ONEDNN_FOR_GPU) set(ONEDNN_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_build") set(ONEDNN_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_install" CACHE PATH "Installation path for oneDNN GPU library") set(ONEDNN_PREFIX_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_root") - set(ONEDNN_ENABLED_PRIMITIVES "CONCAT;CONVOLUTION;DECONVOLUTION;INNER_PRODUCT;MATMUL;REORDER;POOLING;REDUCTION") + set(ONEDNN_ENABLED_PRIMITIVES "CONCAT;CONVOLUTION;DECONVOLUTION;INNER_PRODUCT;MATMUL;REORDER;POOLING;REDUCTION;SDPA") set(ONEDNN_ENABLED_ISA "XEHPG;XEHPC;XE2") set(DNNL_GPU_LIBRARY_NAME "openvino_onednn_gpu" CACHE STRING "Name of oneDNN library for Intel GPU Plugin") @@ -141,11 +141,24 @@ if(ENABLE_ONEDNN_FOR_GPU) EXCLUDE_FROM_ALL ON ) + list(APPEND LIB_INCLUDE_DIRS ${ONEDNN_INSTALL_DIR}/include) + list(APPEND LIB_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src) + set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU + DNNL_DLL + DNNL_DLL_EXPORTS + DNNL_ENABLE_CPU_ISA_HINTS + DNNL_ENABLE_MAX_CPU_ISA + DNNL_X64=1 + NGEN_CPP11 + NGEN_NEO_INTERFACE + NGEN_NO_OP_NAMES + NGEN_SAFE + NGEN_WINDOWS_COMPAT) add_library(onednn_gpu_tgt INTERFACE) set_target_properties(onednn_gpu_tgt PROPERTIES INTERFACE_LINK_LIBRARIES $ - INTERFACE_INCLUDE_DIRECTORIES $ - INTERFACE_COMPILE_DEFINITIONS ENABLE_ONEDNN_FOR_GPU + INTERFACE_INCLUDE_DIRECTORIES "$" + INTERFACE_COMPILE_DEFINITIONS "${LIB_DEFINITIONS}" ) add_dependencies(onednn_gpu_tgt onednn_gpu_build)
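To tie the new CMake include directories and NGEN/DNNL definitions back to their consumer: they exist so that micro_utils.hpp can include oneDNN's internal headers directly from the plugin. The sketch below shows how a kernel implementation drives those wrappers to request a K*Q-style GEMM microkernel, following the flow of init_microkernels() above; it assumes an ENABLE_ONEDNN_FOR_GPU build, and the unroll values and the helper name request_kq_microkernel are illustrative placeholders, not the tuned sdpa_config_t entries.

    // Illustrative use of the micro_utils.hpp wrappers; tile sizes are placeholders.
    #include <cstdint>
    #include <vector>
    #include "micro_utils.hpp"

    micro::Package request_kq_microkernel(uint32_t eu_count, uint32_t ip_version, bool has_systolic,
                                          int64_t head_size, int64_t n_keys, int64_t n_queries, int64_t batch) {
        micro::HWInformation hw_info;
        hw_info.euCount = eu_count;
        hw_info.gmdid = ip_version;               // GMD id, i.e. the ip_version reported by the runtime
        hw_info.systolicAvailable = has_systolic; // mirrors engineInfo.supports_immad

        micro::GEMMProblem problem;
        problem.Ta = problem.Ta_ext = micro::Type::f16;  // K
        problem.Tb = problem.Tb_ext = micro::Type::f16;  // Q
        problem.Tc = problem.Tc_ext = micro::Type::f32;  // f32 accumulation
        problem.Ts = problem.Tc;
        problem.A.layout = micro::MatrixLayout::T;       // K is consumed transposed
        problem.B.layout = micro::MatrixLayout::Pr;      // Q is packed (VNNI) in SLM
        problem.C.layout = micro::MatrixLayout::T;

        micro::SizeParams sizes;
        sizes.m = n_keys;
        sizes.n = n_queries;
        sizes.k = head_size;
        sizes.batch = batch;

        std::vector<micro::StrategyRequirement> reqs;
        reqs.push_back(micro::StrategyRequirement::UnrollM == 32);  // placeholder, normally from sdpa_config_t
        reqs.push_back(micro::StrategyRequirement::UnrollN == 16);  // placeholder

        micro::GEMMProtocol::Options opts;
        opts.localB = true;  // B (Q) is read from SLM
        opts.slmPtr = true;

        return micro::select_gemm_microkernel(opts, hw_info, sizes, problem, reqs);
    }

The returned Package is what get_kernel_data() wraps into MicroKernelPackage instances on clKernelData, so the runtime can later hand the kernel plus its microkernels to the fusion step shown earlier.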