From 29183228c18b591fc5e4e6d70620e10639183f51 Mon Sep 17 00:00:00 2001
From: Vladimir Paramuzov
Date: Mon, 1 Jul 2024 18:29:21 +0400
Subject: [PATCH] [GPU] Micro sdpa (#24656)

### Details:
- Added an SDPA implementation based on microkernels using the internal oneDNN API and related infrastructure
- Current limitations:
  - the fused transpose must not change the order of the innermost dim (head size)
  - is_causal = true is not supported
  - fp16 only
  - the num heads dimension must be static
  - no indirect KV support
- Initial version of the KV Cache + SDPA functional test
- Enabled Transpose+SDPA fusion for static shapes as well

(Illustrative sketches of the online-softmax recurrence and of the applicability gating are appended after the diff excerpt below.)

### Tickets:
- CVS-141761
---
 .../include/intel_gpu/plugin/plugin.hpp       |   6 +-
 .../plugin/transformations_pipeline.hpp       |   6 +-
 .../intel_gpu/runtime/debug_configuration.hpp |   6 +-
 .../include/intel_gpu/runtime/device_info.hpp |  14 +-
 .../include/intel_gpu/runtime/kernel_args.hpp |   3 +-
 .../impls/ocl/kernel_selector_helper.cpp      |  49 +-
 .../graph/impls/ocl/kernel_selector_helper.h  |   2 +
 .../ocl/scaled_dot_product_attention.cpp      |   2 +-
 src/plugins/intel_gpu/src/graph/program.cpp   |   2 +-
 .../src/kernel_selector/CMakeLists.txt        |   4 +
 .../batch_headers/generic_vector_ops.cl       |  75 +++
 .../include/batch_headers/sdpa_utils.cl       |  35 +
 .../include/batch_headers/tile_ops.cl         | 628 +++++++++++++++++
 .../include/batch_headers/vec_typedefs.cl     |  16 +-
 .../kernel_selector/cl_kernels/sdpa_micro.cl  | 461 +++++++++++++
 .../kernel_selector/kernel_selector_common.h  |   5 +
 .../kernel_selector/kernel_selector_params.h  |  18 +
 .../kernels/sdpa/sdpa_kernel_micro.cpp        | 630 ++++++++++++++++++
 .../kernels/sdpa/sdpa_kernel_micro.h          |  47 ++
 .../kernels/sdpa/sdpa_kernel_opt.cpp          |   4 +-
 .../kernels/sdpa/sdpa_kernel_selector.cpp     |   4 +
 .../src/kernel_selector/micro_utils.hpp       |  58 ++
 .../src/kernel_selector/primitive_db.cpp      |   2 +-
 .../src/kernel_selector/primitive_db.h        |   4 +-
 .../src/kernel_selector/primitive_db_gen.py   |   7 +-
 .../ops/scaled_dot_product_attention.cpp      |   9 +-
 src/plugins/intel_gpu/src/plugin/plugin.cpp   |  23 +-
 .../transformations/transpose_fusion.cpp      |  14 +-
 .../src/plugin/transformations_pipeline.cpp   |  11 +-
 .../intel_gpu/src/runtime/kernels_cache.cpp   |  58 +-
 .../intel_gpu/src/runtime/kernels_cache.hpp   |  26 +-
 .../intel_gpu/src/runtime/ocl/ocl_device.cpp  |  43 +-
 .../intel_gpu/src/runtime/ocl/ocl_wrapper.hpp |   2 +
 .../tests/common/subgraphs_builders.hpp       |   2 +-
 .../intel_gpu/thirdparty/CMakeLists.txt       |  19 +-
 35 files changed, 2216 insertions(+), 79 deletions(-)
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/generic_vector_ops.cl
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/sdpa_utils.cl
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/tile_ops.cl
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.h
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
index a5ea28b95f78ba..28a20fa737da76 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
@@ -26,8 +26,10 @@ class Plugin : public ov::IPlugin {
     std::map> get_default_contexts() const;
-    std::shared_ptr
clone_and_transform_model(const std::shared_ptr& network, const ExecutionConfig& config) const; - void transform_model(std::shared_ptr& model, const ExecutionConfig& config) const; + std::shared_ptr clone_and_transform_model(const std::shared_ptr& network, + const ExecutionConfig& config, + const std::shared_ptr& context) const; + void transform_model(std::shared_ptr& model, const ExecutionConfig& config, const std::shared_ptr& context) const; void register_primitives() const; std::string get_device_id_from_config(const ov::AnyMap& config) const; std::string get_device_id(const ov::AnyMap& config) const; diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/transformations_pipeline.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/transformations_pipeline.hpp index f0c0361ce72077..365deb66ad2a4d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/plugin/transformations_pipeline.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/transformations_pipeline.hpp @@ -6,6 +6,7 @@ #include +#include "intel_gpu/plugin/remote_context.hpp" #include "openvino/core/model.hpp" #include "intel_gpu/runtime/execution_config.hpp" @@ -16,12 +17,13 @@ namespace intel_gpu { class TransformationsPipeline { public: - explicit TransformationsPipeline(const ExecutionConfig &conf, const cldnn::device_info &device_info) - : config(conf), device_info(device_info) {} + explicit TransformationsPipeline(const ExecutionConfig &conf, const std::shared_ptr& context) + : config(conf), m_context(context), device_info(context->get_engine().get_device_info()) {} void apply(std::shared_ptr func); private: const ExecutionConfig& config; + std::shared_ptr m_context; cldnn::device_info device_info; }; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp index b3220136c6c8a8..d9a30e3244b4ae 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/debug_configuration.hpp @@ -48,7 +48,7 @@ enum class LogLevel : int8_t { #else #define SEPARATE '/' #endif -#define __FILENAME__ (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__) +#define GPU_FILENAME (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__) #define GPU_DEBUG_IF(cond) if (cond) #define GPU_DEBUG_CODE(...) __VA_ARGS__ #define GPU_DEBUG_DEFINE_MEM_LOGGER(stage) \ @@ -62,9 +62,9 @@ enum class LogLevel : int8_t { #define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \ ((cldnn::debug_configuration::get_instance()->verbose_color == 0) ? 
GPU_DEBUG_LOG_PREFIX : GPU_DEBUG_LOG_COLOR_PREFIX) #define GPU_DEBUG_LOG_RAW(min_verbose_level) GPU_DEBUG_LOG_RAW_INT(static_cast::type>(min_verbose_level)) -#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << __FILENAME__ << ":" <<__LINE__ << ":" << __func__ << ": " +#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << GPU_FILENAME << ":" <<__LINE__ << ":" << __func__ << ": " #define GPU_DEBUG_LOG_COLOR_PREFIX std::cout << DARK_GRAY << cldnn::debug_configuration::prefix << \ - BLUE << __FILENAME__ << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET + BLUE << GPU_FILENAME << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET #define DARK_GRAY "\033[1;30m" #define BLUE "\033[1;34m" #define PURPLE "\033[1;35m" diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp index 7a18cd311e1fd4..0c83877851b48b 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp @@ -10,7 +10,6 @@ #include #include #include -#include namespace cldnn { /// @addtogroup cpp_api C++ API @@ -25,6 +24,17 @@ enum class device_type { discrete_gpu = 1 }; +enum class gpu_arch { + unknown = 0, + gen9 = 1, + gen11 = 2, + xe_lp = 3, + xe_hp = 4, + xe_hpg = 5, + xe_hpc = 6, + xe2 = 7, +}; + /// @brief Defines version of GFX IP struct gfx_version { uint16_t major; @@ -77,6 +87,8 @@ struct device_info { device_type dev_type; ///< Defines type of current GPU device (integrated or discrete) gfx_version gfx_ver; ///< Defines GFX IP version + gpu_arch arch; ///< Defines arch human readable name + uint32_t ip_version; ///< Defines raw GFX IP version uint32_t device_id; ///< ID of current GPU uint32_t num_slices; ///< Number of slices uint32_t num_sub_slices_per_slice; ///< Number of subslices in a slice diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp index 6bce66e64ef808..09dfcf68f05725 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/kernel_args.hpp @@ -121,8 +121,9 @@ struct kernel_string { std::string options; std::string entry_point; bool batch_compilation; + bool has_microkernels; - kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false) {} + kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false), has_microkernels(false) {} std::string get_str() const { return str + jit + undefs + options + entry_point; } size_t get_hash() const { return std::hash()(get_str()); } diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp index 12e20536407d45..1f492e14c9fc7e 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.cpp @@ -5,6 +5,7 @@ #include "intel_gpu/graph/program.hpp" #include "kernel_selector_helper.h" +#include "intel_gpu/runtime/device_info.hpp" #include "kernel_selector_params.h" #include "to_string_utils.h" #include "program_node.h" @@ -32,7 +33,6 @@ #include "intel_gpu/primitives/extract_image_patches.hpp" #include "activation_inst.h" -#include "depth_to_space_inst.h" #include "eltwise_inst.h" 
#include "quantize_inst.h" #include "reorder_inst.h" @@ -44,9 +44,9 @@ #include "kernel_selector/kernels/reorder/reorder_kernel_base.h" #include "runtime/kernels_cache.hpp" -#include "kernel_base.h" #include +#include #include namespace { @@ -119,6 +119,48 @@ bool query_local_block_io_supported(engine& e, const ExecutionConfig& config) { namespace cldnn { +bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config) { + auto device = e.get_device().get(); + + static std::mutex m; + std::lock_guard lock(m); + static std::map cache; + if (cache.find(device) != cache.end()) { + return cache.at(device); + } + + std::shared_ptr kernel_string = std::make_shared(); + // This program check that all required vISA features are supported by current IGC version + const char* kernel_code = R""""( + kernel void igc_check() { + __asm__ volatile( + ".decl AA0 v_type=G type=ud num_elts=1\n" + ".decl AA1 v_type=G type=ud num_elts=1\n" + ".implicit_PSEUDO_INPUT AA0 offset=256 size=4\n" + ".implicit_PSEUDO_INPUT AA1 offset=256 size=4\n" + "mov (M1_NM,1) AA0(0,0)<1> AA1(0,0)<0;1,0>\n" + ); + } + )""""; + + kernel_string->str = kernel_code; + kernel_string->options = ""; + kernel_string->entry_point = "igc_check"; + kernel_string->batch_compilation = true; + + try { + cldnn::kernel_impl_params dummy_params; + auto _kernels_cache_device_query = std::unique_ptr(new cldnn::kernels_cache(e, config, 0)); + _kernels_cache_device_query->add_kernels_source(dummy_params, {kernel_string}, false); + _kernels_cache_device_query->build_all(); + cache[device] = true; + } catch (std::exception&) { + cache[device] = false; + } + + return cache.at(device); +} + kernel_selector::data_type to_data_type(data_types dt) { switch (dt) { case cldnn::data_types::i4: @@ -1081,6 +1123,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p params.engineInfo.bOptHintsSupport = false; params.engineInfo.bLocalBlockIOSupport = query_local_block_io_supported(engine, config); + params.engineInfo.supports_microkernels = query_microkernels_supported(engine, config); params.engineInfo.deviceType = get_device_type(device_info.dev_type); params.engineInfo.maxWorkGroupSize = device_info.max_work_group_size; params.engineInfo.maxLocalMemSize = device_info.max_local_mem_size; @@ -1092,6 +1135,8 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p params.engineInfo.driverVersion = device_info.driver_version; params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes; params.engineInfo.vendor_id = device_info.vendor_id; + params.engineInfo.ip_version = device_info.ip_version; + params.engineInfo.arch = kernel_selector::gpu_arch(static_cast::type>(device_info.arch)); auto impl_forcing = config.get_property(ov::intel_gpu::force_implementations); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h index de746e94661ff7..67e2127a064001 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kernel_selector_helper.h @@ -294,4 +294,6 @@ inline void update_shapes(kernel_selector::Params& p, const kernel_impl_params& } } +bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config); + } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp 
index 681237b2bcd02b..4bc6e16ce55fa9 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -58,7 +58,7 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive layouts; - if (_kernels_data.size() > 0) { + if (_kernels_data.size() > 0 && !_kernels_data[0].internalBufferSizes.empty()) { auto dtype = from_data_type(_kernels_data[0].internalBufferDataType); const auto bpp = data_type_traits::size_of(dtype); for (auto size : _kernels_data[0].internalBufferSizes) { diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index 4b0cc049013ada..1c526ea38188d2 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -223,7 +223,7 @@ void program::init_program() { if (_task_executor == nullptr) _task_executor = program::make_task_executor(_config); _kernels_cache = std::unique_ptr(new kernels_cache(_engine, _config, prog_id, _task_executor, - kernel_selector::KernelBase::get_db().get_batch_header_str())); + kernel_selector::KernelBase::get_db().get_batch_headers())); if (!_compilation_context) _compilation_context = program::make_compilation_context(_config); diff --git a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt index 52c3fb052ff093..993ef596a21256 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt +++ b/src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt @@ -67,6 +67,10 @@ target_include_directories(${TARGET_NAME} PUBLIC $:$,/Os,-Os>>) +if (ENABLE_ONEDNN_FOR_GPU) + target_link_libraries(${TARGET_NAME} PRIVATE onednn_gpu_tgt) +endif() + if(COMMAND add_cpplint_target) add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME}) endif() diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/generic_vector_ops.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/generic_vector_ops.cl new file mode 100644 index 00000000000000..b9a3df7cc863ea --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/generic_vector_ops.cl @@ -0,0 +1,75 @@ +/******************************************************************************* + * Copyright 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/ + +#ifndef GPU_INTEL_OCL_GENERIC_VECTOR_OPS_H +#define GPU_INTEL_OCL_GENERIC_VECTOR_OPS_H + +typedef half __attribute__((ext_vector_type(1))) half1; +typedef uint __attribute__((ext_vector_type(1))) uint1; +typedef float __attribute__((ext_vector_type(1))) float1; + +float1 __attribute__((overloadable)) vmad(float1 a, float1 b, float1 c) { + c[0] = mad(a[0], b[0], c[0]); + return c; +} +float2 __attribute__((overloadable)) vmad(float2 a, float2 b, float2 c) { + return mad(a, b, c); +} +float4 __attribute__((overloadable)) vmad(float4 a, float4 b, float4 c) { + return mad(a, b, c); +} +float8 __attribute__((overloadable)) vmad(float8 a, float8 b, float8 c) { + return mad(a, b, c); +} +float16 __attribute__((overloadable)) vmad(float16 a, float16 b, float16 c) { + return mad(a, b, c); +} + +float1 __attribute__((overloadable)) native_vrecip(float1 x) { + x[0] = native_recip(x[0]); + return x; +} +float2 __attribute__((overloadable)) native_vrecip(float2 x) { + return native_recip(x); +} +float4 __attribute__((overloadable)) native_vrecip(float4 x) { + return native_recip(x); +} +float8 __attribute__((overloadable)) native_vrecip(float8 x) { + return native_recip(x); +} +float16 __attribute__((overloadable)) native_vrecip(float16 x) { + return native_recip(x); +} + +float1 __attribute__((overloadable)) native_vexp2(float1 x) { + x[0] = native_exp2(x[0]); + return x; +} +float2 __attribute__((overloadable)) native_vexp2(float2 x) { + return native_exp2(x); +} +float4 __attribute__((overloadable)) native_vexp2(float4 x) { + return native_exp2(x); +} +float8 __attribute__((overloadable)) native_vexp2(float8 x) { + return native_exp2(x); +} +float16 __attribute__((overloadable)) native_vexp2(float16 x) { + return native_exp2(x); +} + +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/sdpa_utils.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/sdpa_utils.cl new file mode 100644 index 00000000000000..5943f23251bb7a --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/sdpa_utils.cl @@ -0,0 +1,35 @@ +/******************************************************************************* + * Copyright 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/ + +#ifndef GPU_OCL_SDPA_UTILS_H +#define GPU_OCL_SDPA_UTILS_H + +#define _4D_OFF(tag, x0, x1, x2, x3) \ + (((x0) % tag##_B0) * tag##_SB0 + ((x0) / tag##_B0) * tag##_S0 \ + + ((x1) % tag##_B1) * tag##_SB1 + ((x1) / tag##_B1) * tag##_S1 \ + + ((x2) % tag##_B2) * tag##_SB2 + ((x2) / tag##_B2) * tag##_S2 \ + + ((x3) % tag##_B3) * tag##_SB3 + ((x3) / tag##_B3) * tag##_S3) + +#define QRY_OFF(x0, x1, x2, x3) _4D_OFF(QRY, x0, x1, x2, x3) +#define KEY_OFF(x0, x1, x2, x3) _4D_OFF(KEY, x0, x1, x2, x3) +#define VAL_OFF(x0, x1, x2, x3) _4D_OFF(VAL, x0, x1, x2, x3) +#define MSK_OFF(x0, x1, x2, x3) _4D_OFF(MSK, x0, x1, x2, x3) + +#define DST_OFF(x0, x1, d, h, w) \ +(((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \ ++ ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1) + +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/tile_ops.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/tile_ops.cl new file mode 100644 index 00000000000000..b2ce72c33fc31c --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/tile_ops.cl @@ -0,0 +1,628 @@ +/******************************************************************************* + * Copyright 2024 Intel Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ *******************************************************************************/ + +#ifndef GPU_OCL_TILE_OPS_H +#define GPU_OCL_TILE_OPS_H + +float __builtin_IB_atomic_max_local_f32(__local float *, float); + +__attribute__((overloadable)) float local_atomic_max(local float *p, float v) { + return __builtin_IB_atomic_max_local_f32(p, v); +} + +__attribute__((overloadable)) half local_atomic_max( + local half *p, half v) { /* not implemented */ + return v; +} + +__attribute__((overloadable)) uint local_atomic_max(local uint *p, uint v) { + return atomic_max(p, v); +} + +__attribute__((overloadable)) int local_atomic_max(local int *p, int v) { + return atomic_max(p, v); +} + +#define DEF_BLOCK_LOAD_STORE(type, itype, suffix, n) \ + __attribute__((overloadable)) type##n block_load( \ + const global type *p, int vlen) \ + __attribute__((enable_if(vlen == n, "wrong vector length"))) { \ + return as_##type##n( \ + intel_sub_group_block_read##suffix##n((global void *)p)); \ + } \ + __attribute__((overloadable)) void block_store( \ + global type *p, type##n v) { \ + intel_sub_group_block_write##suffix##n( \ + (global itype *)p, as_##itype##n(v)); \ + } + +#define DEF_BLOCK_LOAD_STORE1(type, itype, suffix) \ + __attribute__((overloadable)) \ + type##1 block_load(const global type *p, int vlen) __attribute__( \ + (enable_if(vlen == 1, "wrong vector length"))) { \ + type##1 x; \ + x[0] = as_##type( \ + intel_sub_group_block_read##suffix((global void *)p)); \ + return x; \ + } \ + __attribute__((overloadable)) void block_store( \ + global type *p, type##1 v) { \ + intel_sub_group_block_write##suffix( \ + (global itype *)p, as_##itype(v[0])); \ + } + +DEF_BLOCK_LOAD_STORE1(half, ushort, _us) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 2) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 4) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 8) +DEF_BLOCK_LOAD_STORE(half, ushort, _us, 16) +DEF_BLOCK_LOAD_STORE1(uint, uint, ) +DEF_BLOCK_LOAD_STORE(uint, uint, , 2) +DEF_BLOCK_LOAD_STORE(uint, uint, , 4) +DEF_BLOCK_LOAD_STORE(uint, uint, , 8) + +#define DEF_BLOCK2D_LOAD_STORE(type, itype, vl, SG, suffix, BR, BC) \ + itype##vl __builtin_IB_subgroup_block_read_flat_##suffix( \ + long, int, int, int, int2); \ + void __builtin_IB_subgroup_block_write_flat_##suffix( \ + long, int, int, int, int2, itype##vl); \ + __attribute__((overloadable)) type##vl block2d_load(const global type *p, \ + int w, int h, int ld, int x, int y, int br, int bc, \ + int sg) __attribute__((enable_if(br == BR, "wrong #rows"))) \ + __attribute__((enable_if(bc == BC, "wrong #columns"))) \ + __attribute__( \ + (enable_if(sg == SG, "wrong subgroup size"))) { \ + ulong pp = as_long(p); \ + ulong prem = pp & 0x3F; \ + pp &= ~0x3F; \ + x += (prem / sizeof(type)); \ + w += prem; \ + int2 coord = {x, y}; \ + return as_##type##vl(__builtin_IB_subgroup_block_read_flat_##suffix( \ + pp, w - 1, h - 1, ld - 1, coord)); \ + } \ + __attribute__((overloadable)) void block2d_store(type##vl v, \ + global type *p, int w, int h, int ld, int x, int y, int br, \ + int bc, \ + int sg) __attribute__((enable_if(br == BR, "wrong #rows"))) \ + __attribute__((enable_if(bc == BC, "wrong #columns"))) \ + __attribute__( \ + (enable_if(sg == SG, "wrong subgroup size"))) { \ + ulong pp = as_long(p); \ + ulong prem = pp & 0x3F; \ + pp &= ~0x3F; \ + x += (prem / sizeof(type)); \ + w += prem; \ + int2 coord = {x, y}; \ + __builtin_IB_subgroup_block_write_flat_##suffix( \ + pp, w - 1, h - 1, ld - 1, coord, as_##itype##vl(v)); \ + } + +DEF_BLOCK2D_LOAD_STORE(half, ushort, 8, 16, u16_m8k16v1, 
16, 8) +DEF_BLOCK2D_LOAD_STORE(half, ushort, 8, 16, u16_m4k32v1, 32, 4) +DEF_BLOCK2D_LOAD_STORE(half, ushort, 16, 16, u16_m8k32v1, 32, 8) + +#define tile_fill(t, v) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t.x[i] \ + = v; \ + } while (0) + +#define tile_elementwise(t, f) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t.x[i] \ + = f(t.x[i]); \ + } while (0) + +#define tile_elementwise_s(t, f) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) { \ + _Pragma("unroll") for (int s = 0; \ + s < sizeof(t.x[0]) / sizeof(t.x[0][0]); \ + s++) t.x[i][s] \ + = f(t.x[i][s]); \ + } \ + } while (0) + +#define tile_binary(t, t2, f) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t.x[i] \ + = f(t.x[i], t2.x[i]); \ + } while (0) + +#define tile_copy(t, t_new) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) t_new.x[i] \ + = __builtin_convertvector(t.x[i], __typeof__(t_new.x[i])); \ + } while (0) + +#define tile_copy_to_half2(t, t_new) \ + do { \ + _Pragma("unroll") for (int i = 0; i < sizeof(t.x) / sizeof(t.x[0]); \ + i++) { \ + _Pragma("unroll") for (int s = 0; \ + s < sizeof(t.x[0]) / sizeof(t.x[0][0]) / 2; \ + s++) { \ + half2 v = {t.x[i][2 * s], t.x[i][2 * s + 1]}; \ + t_new.x[i][s] = as_uint(v); \ + } \ + } \ + } while (0) + +#define tile_access(t, i0, j, sg, br, bc, nbr) \ + (t).x[(i0) / (br) + (nbr) * ((j) / (bc))] \ + [((i0) % (br)) / (sg) + ((j) % (bc)) * ((br) / (sg))] + +#define xlane_tile_access(t, i, j, sg, br, bc, nbr) \ + sub_group_broadcast(tile_access(t, i, j, sg, br, bc, nbr), i % sg) + +#define DECLARE_2D_TILE_OPS(tile_type, element_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_full(tile_type *t, \ + const global element_type *ptr, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[i]; \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load_full(tile_type *t, \ + const local element_type *ptr, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[i]; \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load(tile_type *t, \ + const global element_type *ptr, int m, int n, int ld, \ + int offset_r, int offset_c) { \ + if (m >= offset_r + br * nbr && n >= offset_c + bc * nbc) { \ + tile_load_full(t, ptr, ld, offset_r, offset_c); \ + return; \ + } \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + if (offset_c + j < n) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + if (offset_r + i < m) \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[i]; \ + } \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load(tile_type *t, \ + const global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_load(t, ptr, m, n, m, offset_r, offset_c); \ + } \ + __attribute__((overloadable)) void 
tile_load_t_full(tile_type *t, \ + const global element_type *ptr, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_r + offset_c; \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg, ptr += ld*sg) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[get_sub_group_local_id() * ld + j]; \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load_t(tile_type *t, \ + const global element_type *ptr, int m, int n, int ld, \ + int offset_r, int offset_c) { \ + if (m >= offset_r + br * nbr && n >= offset_c + bc * nbc) { \ + tile_load_t_full(t, ptr, ld, offset_r, offset_c); \ + return; \ + } \ + ptr += ld * offset_r + offset_c; \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg, ptr += ld*sg) { \ + int i = i0 + get_sub_group_local_id(); \ + if (offset_r + i < m) \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + if (offset_c + j < n) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) = ptr[get_sub_group_local_id() * ld + j]; \ + } \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load_t(tile_type *t, \ + const global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_load(t, ptr, m, n, n, offset_r, offset_c); \ + } \ + __attribute__((overloadable)) void tile_store_full(tile_type t, \ + local element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + ptr[i] = tile_access(t, i0, j, sg, br, bc, nbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_store_full(tile_type t, \ + global element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + ptr[i] = tile_access(t, i0, j, sg, br, bc, nbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_store(tile_type t, \ + global element_type *ptr, int m, int n, int ld, int offset_r, \ + int offset_c) { \ + if (m >= offset_r + br * nbr && n >= offset_c + bc * nbc) { \ + tile_store_full(t, ptr, ld, offset_r, offset_c); \ + return; \ + } \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + if (offset_c + j < n) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + if (offset_r + i < m) \ + ptr[i] = tile_access(t, i0, j, sg, br, bc, nbr); \ + } \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_store(tile_type t, \ + global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_store(t, ptr, m, n, m, offset_r, offset_c); \ + } \ + __attribute__((overloadable)) void tile_store_t_sys_src1(tile_type t, \ + local element_type *ptr, int ld, int offset_r, int offset_c) { \ + offset_c += get_sub_group_local_id(); \ + int offset_r0 = offset_r & (sg - 1); \ + int offset_r1 = offset_r & ~(sg - 1); \ + ptr += offset_r0 + sg * offset_c + ld * offset_r1; \ + _Pragma("unroll") for (int j0 = 0; j0 < br * nbr; \ + j0 += sg, ptr += sg * sg) { \ + _Pragma("unroll") for (int i = 0; i < bc * nbc; i++) ptr[i] \ + = tile_access(t, j0, i, sg, br, bc, nbr); \ + } \ + } \ + __attribute__((overloadable)) void tile_store_t_sys_src2(tile_type t, \ + 
local element_type *ptr, int tile_n, int ld, int offset_r, \ + int offset_c) { \ + const int cp = 32 / sizeof(element_type); \ + offset_c += get_sub_group_local_id(); \ + int offset_r0 = offset_r & (cp - 1); \ + int offset_r1 = offset_r & ~(cp - 1); \ + ptr += offset_r0 + tile_n * offset_r1; \ + _Pragma("unroll") for (int j0 = 0; j0 < br * nbr; \ + j0 += sg, offset_c += sg) { \ + int offset_c0 = offset_c & (tile_n - 1); \ + int offset_c1 = offset_c & ~(tile_n - 1); \ + local element_type *ptr_j = ptr + cp * offset_c0 + ld * offset_c1; \ + _Pragma("unroll") for (int i = 0; i < bc * nbc; i++) { \ + *ptr_j = tile_access(t, j0, i, sg, br, bc, nbr); \ + ptr_j++; \ + if ((~i & (cp - 1)) == 0) ptr_j += cp * (tile_n - 1); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_atomic_max_full(tile_type t, \ + local element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = i0 + get_sub_group_local_id(); \ + (void)local_atomic_max( \ + ptr + i, tile_access(t, i0, j, sg, br, bc, nbr)); \ + } \ + } \ + } + +#define DECLARE_2D_TILE_VREDUCE(tile_type, sg, br, bc, nbr, nbc, rtile_type, \ + rsg, rbr, rbc, rnbr, rnbc) \ + __attribute__((overloadable)) void tile_vreduce_add( \ + tile_type t, rtile_type *tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*tr, i0, 0, rsg, rbr, rbc, rnbr) \ + += tile_access(t, i0, j, sg, br, bc, nbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_vreduce_max( \ + tile_type t, rtile_type *tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*tr, i0, 0, rsg, rbr, rbc, rnbr) \ + = max(tile_access(t, i0, j, sg, br, bc, nbr), \ + tile_access(*tr, i0, 0, rsg, rbr, rbc, rnbr)); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_vbroadcast_sub( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + -= tile_access(tr, i0, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_vbroadcast_mul( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + *= tile_access(tr, i0, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } + +#define DECLARE_2D_TILE_HREDUCE(tile_type, sg, br, bc, nbr, nbc, rtile_type, \ + rsg, rbr, rbc, rnbr, rnbc) \ + __attribute__((overloadable)) void tile_hbroadcast_add( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + += xlane_tile_access(tr, j, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_hbroadcast_mul( \ + tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) \ + *= xlane_tile_access(tr, j, 0, rsg, rbr, rbc, rnbr); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_hbroadcast_min( \ + 
tile_type *t, rtile_type tr) { \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + tile_access(*t, i0, j, sg, br, bc, nbr) = min( \ + tile_access(*t, i0, j, sg, br, bc, nbr), \ + xlane_tile_access(tr, j, 0, rsg, rbr, rbc, rnbr)); \ + } \ + } \ + } + +#define DECLARE_2D_TILE_RSELECT(tile_type0, sg0, br0, bc0, nbr0, nbc0, \ + tile_type1, sg1, br1, bc1, nbr1, nbc1) \ + __attribute__((overloadable)) void tile_rselect( \ + tile_type0 *t0, tile_type1 t1, int idx) { \ + _Pragma("unroll") for (int j = 0; j < bc0 * nbc0; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br0 * nbr0; i0 += sg0) { \ + tile_access(*t0, i0, j, sg0, br0, bc0, nbr0) \ + = tile_access(t1, i0, j, sg1, br1, bc1, nbr1); \ + _Pragma("unroll") for (int z = 1; \ + z < (br1 * nbr1 / br0 * nbr0); \ + z++) if (z == idx) { \ + tile_access(*t0, i0, j, sg0, br0, bc0, nbr0) \ + = tile_access(t1, i0 + z * br0 * nbr0, j, sg1, \ + br1, bc1, nbr1); \ + } \ + } \ + } \ + } + +#define DECLARE_2D_TILE_COPY_REBLOCK(tile_type0, sg0, br0, bc0, nbr0, nbc0, \ + tile_type1, sg1, br1, bc1, nbr1, nbc1) \ + __attribute__((overloadable)) void tile_copy_reblock( \ + tile_type0 t0, tile_type1 *t1) { \ + _Pragma("unroll") for (int j = 0; j < bc0 * nbc0; j++) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br0 * nbr0; i0 += sg0) { \ + tile_access(*t1, i0, j, sg1, br1, bc1, nbr1) \ + = tile_access(t0, i0, j, sg0, br0, bc0, nbr0); \ + } \ + } \ + } + +#define DECLARE_2D_TILE(tile_type, element_type, sg, br, bc, nbr, nbc) \ + typedef element_type __attribute__((ext_vector_type(br * bc / sg))) \ + _e_##tile_type; \ + typedef struct { \ + _e_##tile_type x[nbr * nbc]; \ + } tile_type; \ + DECLARE_2D_TILE_OPS(tile_type, element_type, sg, br, bc, nbr, nbc) + +/* Requires bc == 1 currently */ +#define DECLARE_2D_TILE_BLOCK_OPS( \ + tile_type, element_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_block(tile_type *t, \ + const global element_type *ptr, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++, ptr += ld * bc) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++)(t) \ + ->x[ii + nbr * jj] \ + = block_load(ptr + ii * br, br / SUBGROUP_SIZE); \ + } \ + } \ + __attribute__((overloadable)) void tile_store_block(tile_type t, \ + global element_type *ptr, int ld, int offset_r, int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++, ptr += ld * bc) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++) \ + block_store(ptr + ii * br, (t).x[ii + nbr * jj]); \ + } \ + } \ + __attribute__((overloadable)) void tile_load_block(tile_type *t, \ + const global element_type *ptr, int n, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + n -= offset_c; \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++, ptr += ld * bc) { \ + if (jj < n) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++)(t) \ + ->x[ii + nbr * jj] \ + = block_load(ptr + ii * br, br / SUBGROUP_SIZE); \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_store_block(tile_type t, \ + global element_type *ptr, int n, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + n -= offset_c; \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++, ptr += ld * bc) { \ + if (jj < n) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++) \ + block_store(ptr + ii * br, (t).x[ii + nbr * jj]); \ + } \ + } \ + 
} + +#define DECLARE_2D_TILE_BLOCK2D_OPS( \ + tile_type, element_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_block2d(tile_type *t, \ + const global element_type *ptr, int m, int n, int ld, \ + int offset_r, int offset_c) { \ + const int e = sizeof(element_type); \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++)(t) \ + ->x[ii + nbr * jj] \ + = block2d_load(ptr, m * e, n, ld * e, offset_r + ii * br, \ + offset_c + jj * bc, br, bc, sg); \ + } \ + } \ + __attribute__((overloadable)) void tile_load_block2d(tile_type *t, \ + const global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_load_block2d(t, ptr, m, n, m, offset_r, offset_c); \ + } \ + __attribute__((overloadable)) void tile_store_block2d(tile_type t, \ + global element_type *ptr, int m, int n, int ld, int offset_r, \ + int offset_c) { \ + const int e = sizeof(element_type); \ + _Pragma("unroll") for (int jj = 0; jj < nbc; jj++) { \ + _Pragma("unroll") for (int ii = 0; ii < nbr; ii++) block2d_store( \ + (t).x[ii + nbr * jj], ptr, m *e, n, ld *e, \ + offset_r + ii * br, offset_c + jj * bc, br, bc, sg); \ + } \ + } \ + __attribute__((overloadable)) void tile_store_block2d(tile_type t, \ + const global element_type *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_store_block2d(t, ptr, m, n, m, offset_r, offset_c); \ + } + +#define DECLARE_2D_TILE_LOAD_PACKED_HALF(tile_type, sg, br, bc, nbr, nbc) \ + __attribute__((overloadable)) void tile_load_packed_half(tile_type *t, \ + const global half *ptr, int m, int n, int ld, int offset_r, \ + int offset_c) { \ + ptr += ld * offset_c + offset_r; \ + _Pragma("unroll") for (int j = 0; j < bc * nbc; j++, ptr += ld) { \ + if (offset_c + j < n) { \ + _Pragma("unroll") for (int i0 = 0; i0 < br * nbr; i0 += sg) { \ + int i = 2 * (i0 + get_sub_group_local_id()); \ + half2 loaded = 0; \ + if (offset_r + i < m) loaded.s0 = ptr[i]; \ + if (offset_r + i + 1 < m) loaded.s1 = ptr[i + 1]; \ + tile_access(*t, i0, j, sg, br, bc, nbr) = as_uint(loaded); \ + } \ + } \ + } \ + } \ + __attribute__((overloadable)) void tile_load_packed_half(tile_type *t, \ + const global half *ptr, int m, int n, int offset_r, \ + int offset_c) { \ + tile_load_packed_half(t, ptr, m, n, m, offset_r, offset_c); \ + } + +#define cooperative_prefetch_2d(ptr, r, c, ld, sg_id, n_sg, sg_size, caching) \ + cooperative_prefetch_2d_internal((const global char *)ptr, \ + (r) * sizeof(*(ptr)), c, (ld) * sizeof(*(ptr)), sg_id, n_sg, \ + sg_size, caching) + +#define cooperative_prefetch_2d_rem( \ + ptr, r, c, rmax, cmax, ld, sg_id, n_sg, sg_size, caching) \ + cooperative_prefetch_2d_internal((const global char *)ptr, \ + (r) * sizeof(*(ptr)), c, (rmax) * sizeof(*(ptr)), cmax, \ + (ld) * sizeof(*(ptr)), sg_id, n_sg, sg_size, caching) + +/* IGC prefetch intrinsics */ +enum LSC_LDCC { + LSC_LDCC_DEFAULT = 0, + LSC_LDCC_L1UC_L3UC = 1, + LSC_LDCC_L1UC_L3C = 2, + LSC_LDCC_L1C_L3UC = 3, + LSC_LDCC_L1C_L3C = 4, + LSC_LDCC_L1S_L3UC = 5, + LSC_LDCC_L1S_L3C = 6, + LSC_LDCC_L1IAR_L3C = 7, +}; + +extern void __builtin_IB_lsc_prefetch_global_uchar( + const __global uchar *base, int immElemOff, enum LSC_LDCC cacheOpt); + +extern void __builtin_IB_lsc_prefetch_global_uint( + const __global uint *base, int immElemOff, enum LSC_LDCC cacheOpt); + +__attribute__((overloadable)) void cooperative_prefetch_2d_internal( + const global char *ptr, uint rbytes, uint c, uint ld_bytes, uint sg_id, + uint n_sg, uint sg_size, enum LSC_LDCC 
caching) { + const uint cl_per_col = (rbytes + 63) >> 6; + const uint cl = cl_per_col * c; + const uint cl_per_sg = (cl + n_sg - 1) / n_sg; + const uint cl_iters = (cl_per_sg + sg_size - 1) / sg_size; +#pragma unroll + for (uint ii_cl = 0; ii_cl < cl_iters; ii_cl++) { + uint i_cl = ii_cl + (sg_id * cl_per_sg) + get_sub_group_local_id(); + uint r_cl = i_cl % cl_per_col; + uint c_cl = i_cl / cl_per_col; + if (i_cl < cl) { + __builtin_IB_lsc_prefetch_global_uint( + (const global uint *)(ptr + r_cl * 64 + c_cl * ld_bytes), 0, + caching); + } + } +} + +__attribute__((overloadable)) void cooperative_prefetch_2d_internal( + const global char *ptr, uint rbytes, uint c, uint rbytes_max, + uint c_max, uint ld_bytes, uint sg_id, uint n_sg, uint sg_size, + enum LSC_LDCC caching) { + const uint cl_per_col = (rbytes_max + 63) >> 6; + const uint cl = cl_per_col * c_max; + const uint cl_per_sg = (cl + n_sg - 1) / n_sg; + const uint cl_iters = (cl_per_sg + sg_size - 1) / sg_size; + const uint max_off = rbytes - 1 + (c - 1) * ld_bytes; +#pragma unroll + for (uint ii_cl = 0; ii_cl < cl_iters; ii_cl++) { + uint i_cl = ii_cl + (sg_id * cl_per_sg) + get_sub_group_local_id(); + uint r_cl = i_cl % cl_per_col; + uint c_cl = i_cl / cl_per_col; + uint pf_off = min(r_cl * 64 + c_cl * ld_bytes, max_off); + if (i_cl < cl) { + __builtin_IB_lsc_prefetch_global_uchar( + (const global uchar *)(ptr + pf_off), 0, caching); + } + } +} + +#endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/vec_typedefs.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/vec_typedefs.cl index ae4adfaf717fbd..29ca3ddb7185f7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/vec_typedefs.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/include/batch_headers/vec_typedefs.cl @@ -2,27 +2,25 @@ // SPDX-License-Identifier: Apache-2.0 // -typedef struct half1 { half s0; } half1; typedef struct half5 { half s0; half s1; half s2; half s3; half s4; } half5; typedef struct half6 { half s0; half s1; half s2; half s3; half s4; half s5; } half6; typedef struct half7 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; } half7; -typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half9 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; } half9; -typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half10 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; half s9; } half10; -typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half11 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; half s9; half sa; } half11; -typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half12 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; half s9; half sa; half sb;} half12; -typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half13 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; half s9; half sa; half sb; half sc;} half13; -typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half14 { half s0; half s1; half s2; half s3; half s4; half s5; 
half s6; half s7; half s8; half s9; half sa; half sb; half sc; half se;} half14; -typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; +typedef struct half15 { half s0; half s1; half s2; half s3; half s4; half s5; half s6; half s7; half s8; half s9; half sa; half sb; half sc; half se; half sf;} half15; typedef struct half0 { half s0; } half0; //never used but makes compiler happy. -typedef struct float1 { float s0; } float1; typedef struct float5 { float s0; float s1; float s2; float s3; float s4; } float5; typedef struct float6 { float s0; float s1; float s2; float s3; float s4; float s5; } float6; typedef struct float7 { float s0; float s1; float s2; float s3; float s4; float s5; float s6; } float7; diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl new file mode 100644 index 00000000000000..575d9fdff5bd32 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/sdpa_micro.cl @@ -0,0 +1,461 @@ +/******************************************************************************* +* Copyright 2024 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "include/batch_headers/generic_vector_ops.cl" +#include "include/batch_headers/sdpa_utils.cl" +#include "include/batch_headers/tile_ops.cl" + +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#define DIV_UP(x, y) (((x) + (y)-1) / (y)) + +#define sg_per_wg (ugemm_kq_sg_per_wg_m * ugemm_kq_sg_per_wg_n) +#define q_tile_sg_n DIV_UP(ugemm_kq_wg_tile_n, sg_per_wg) + +/* Instantiate tile types and operations */ +typedef ugemm_kq_c_type s_tile_type; +typedef ugemm_vs_c_type a_tile_type; + +DECLARE_2D_TILE(q_tile_type, uint, SUBGROUP_SIZE, D_MAX / 2, 1, 1, q_tile_sg_n) + +#ifdef BLOCK_Q +DECLARE_2D_TILE_BLOCK_OPS( + q_tile_type, uint, SUBGROUP_SIZE, D_MAX / 2, 1, 1, q_tile_sg_n) +#elif Q_ALIGN < 4 +DECLARE_2D_TILE_LOAD_PACKED_HALF( + q_tile_type, SUBGROUP_SIZE, D_MAX / 2, 1, 1, q_tile_sg_n) +#endif + +#ifdef BLOCK_A +DECLARE_2D_TILE(a_tile_type_half, half, SUBGROUP_SIZE, ugemm_vs_sg_tile_m, 1, 1, + ugemm_vs_sg_tile_n) +#else +DECLARE_2D_TILE(a_tile_type_half, half, SUBGROUP_SIZE, ugemm_vs_sg_tile_m, 8, 1, + ugemm_vs_sg_tile_n / 8) +#endif + +DECLARE_2D_TILE(s_tile_type_half2, uint, SUBGROUP_SIZE, ugemm_kq_c_type_block0, + ugemm_kq_c_type_block1 / 2, ugemm_kq_c_type_nblock0, + ugemm_kq_c_type_nblock1) + +DECLARE_2D_TILE( + s_sum_tile_type, float, SUBGROUP_SIZE, ugemm_kq_sg_tile_n, 1, 1, 1) + +DECLARE_2D_TILE( + a_scale_tile_type, float, SUBGROUP_SIZE, ugemm_vs_sg_tile_n, 1, 1, 1) + + +DECLARE_2D_TILE(mask_tile_type, half, SUBGROUP_SIZE, ugemm_kq_c_type_block0, ugemm_kq_c_type_block1, ugemm_kq_c_type_nblock0, ugemm_kq_c_type_nblock1) +DECLARE_2D_TILE(mask_tile_type_float, float, SUBGROUP_SIZE, ugemm_kq_c_type_block0, ugemm_kq_c_type_block1, ugemm_kq_c_type_nblock0, ugemm_kq_c_type_nblock1) + +#ifdef BLOCK_A +DECLARE_2D_TILE_BLOCK_OPS(a_tile_type_half, half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 1, 1, ugemm_vs_sg_tile_n) +#endif +#ifdef BLOCK_2D_A +DECLARE_2D_TILE_BLOCK2D_OPS(a_tile_type_half, half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 8, 1, ugemm_vs_sg_tile_n / 8) +#endif + +#ifdef BLOCK_A +DECLARE_2D_TILE_COPY_REBLOCK(a_tile_type, SUBGROUP_SIZE, ugemm_vs_c_type_block0, + ugemm_vs_c_type_block1, ugemm_vs_c_type_nblock0, + ugemm_vs_c_type_nblock1, a_tile_type_half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 1, 1, ugemm_vs_sg_tile_n) +#else +DECLARE_2D_TILE_COPY_REBLOCK(a_tile_type, SUBGROUP_SIZE, ugemm_vs_c_type_block0, + ugemm_vs_c_type_block1, ugemm_vs_c_type_nblock0, + ugemm_vs_c_type_nblock1, a_tile_type_half, SUBGROUP_SIZE, + ugemm_vs_sg_tile_m, 8, 1, ugemm_vs_sg_tile_n / 8) +#endif + +DECLARE_2D_TILE_VREDUCE(s_tile_type, SUBGROUP_SIZE, ugemm_kq_c_type_block0, + ugemm_kq_c_type_block1, ugemm_kq_c_type_nblock0, + ugemm_kq_c_type_nblock1, s_sum_tile_type, SUBGROUP_SIZE, + ugemm_kq_sg_tile_n, 1, 1, 1) + +DECLARE_2D_TILE_HREDUCE(s_tile_type, SUBGROUP_SIZE, ugemm_kq_c_type_block0, + ugemm_kq_c_type_block1, ugemm_kq_c_type_nblock0, + ugemm_kq_c_type_nblock1, mask_tile_type_float, SUBGROUP_SIZE, + ugemm_kq_sg_tile_m, 1, 1, 1) + +DECLARE_2D_TILE_HREDUCE(a_tile_type, SUBGROUP_SIZE, ugemm_vs_c_type_block0, + ugemm_vs_c_type_block1, ugemm_vs_c_type_nblock0, + ugemm_vs_c_type_nblock1, a_scale_tile_type, SUBGROUP_SIZE, + ugemm_vs_sg_tile_n, 1, 1, 1) + +#if ugemm_kq_wg_tile_n == ugemm_vs_wg_tile_n \ + && (ugemm_kq_sg_tile_n % ugemm_vs_sg_tile_n) == 0 +DECLARE_2D_TILE_RSELECT(a_scale_tile_type, SUBGROUP_SIZE, ugemm_vs_sg_tile_n, 1, + 1, 1, s_sum_tile_type, SUBGROUP_SIZE, ugemm_kq_sg_tile_n, 1, 1, 1) +#endif + +#if PREFETCH_REMAINDER +#define cooperative_prefetch_2d_maybe_rem cooperative_prefetch_2d_rem +#else +#define cooperative_prefetch_2d_maybe_rem( \ + ptr, r, c, rmax, cmax, ld, sg_id, n_sg, sg_size, caching) \ + cooperative_prefetch_2d(ptr, rmax, cmax, ld, sg_id, n_sg, sg_size, caching) 
+#endif + +#if TRANSPOSE_K +#define cooperative_prefetch_2d_k( \ + ptr, r, c, rmax, cmax, ld, sg_id, n_sg, sg_size, caching) \ + cooperative_prefetch_2d_maybe_rem( \ + ptr, c, r, cmax, rmax, ld, sg_id, n_sg, sg_size, caching) +#else +#define cooperative_prefetch_2d_k cooperative_prefetch_2d_maybe_rem +#endif + +#if REMAINDER_Q +#define tile_load_block_rem_q tile_load_block +#define tile_store_block_rem_q tile_store_block +#else +#define tile_load_block_rem_q(t, ptr, n, ld, off_r, off_c) \ + tile_load_block(t, ptr, ld, off_r, off_c) +#define tile_store_block_rem_q(t, ptr, n, ld, off_r, off_c) \ + tile_store_block(t, ptr, ld, off_r, off_c) +#endif + +#define binary_add(x, y) ((x) + (y)) + +__attribute__((intel_reqd_sub_group_size(SUBGROUP_SIZE))) +KERNEL(micro_sdpa)(OPTIONAL_SHAPE_INFO_ARG + const global half *K, const global half *Q, const global half *V, + global half *A, +#if WITH_ATTN_MASK + const global half *msk, +#endif +#if WITH_SCALE + global SCALE_DATA_T *scale_ptr, +#endif + int d, int k, int q) { + uint sg_ij = sub_group_broadcast(get_local_id(1), 0); + uint b0 = get_group_id(1); + uint b1 = get_group_id(2); + + uint wg_j0 = get_group_id(0) * ugemm_kq_wg_tile_n; + + /* Leading dimension for matrices */ + uint ldk = TRANSPOSE_K ? KEY_S3 : KEY_S2; + uint ldq = QRY_S2; + uint ldv = VAL_S2; + uint lda = DST_S2; + + /* Subgroup IDs for each GEMM */ + uint sg_i_kq = sg_ij % ugemm_kq_sg_per_wg_m; + uint sg_j_kq = sg_ij / ugemm_kq_sg_per_wg_m; + + uint sg_i_vs = sg_ij % ugemm_vs_sg_per_wg_m; + uint sg_j_vs = sg_ij / ugemm_vs_sg_per_wg_m; + + /* SLM allocations -- place in one array to work around compiler bug */ +#define Q_slm_size (D_MAX * ugemm_kq_wg_tile_n * sizeof(half)) +#define S_slm_size (ugemm_kq_wg_tile_m * ugemm_kq_wg_tile_n * sizeof(half)) +#define S_sum_slm_size \ + (ugemm_kq_wg_tile_n * ugemm_kq_sg_per_wg_m * sizeof(float)) +#define S_max_slm_size (ugemm_kq_wg_tile_n * sizeof(float)) +#define ugemm_slm_size MAX(ugemm_kq_slm_size, ugemm_vs_slm_size) + + local char slm[Q_slm_size + S_slm_size + S_sum_slm_size + S_max_slm_size + + ugemm_slm_size]; + + local half *Q_slm = (local half *)&slm[0]; + local half *S_slm = (local half *)&slm[Q_slm_size]; + local float *S_sum_slm = (local float *)&slm[Q_slm_size + S_slm_size]; + local float *S_max_slm + = (local float *)&slm[Q_slm_size + S_slm_size + S_sum_slm_size]; + local uint *ugemm_slm = (local uint *)&slm[Q_slm_size + S_slm_size + + S_sum_slm_size + S_max_slm_size]; + + const bool need_sum_barrier = (ugemm_vs_barrier_count == 0); + + /* Locate K/Q/V/A matrices within batch */ + K += KEY_OFF(b1, (b0 / KV_GROUP_SIZE), 0, 0); + Q += QRY_OFF(b1, b0, 0, 0); + V += VAL_OFF(b1, (b0 / KV_GROUP_SIZE), 0, 0); + A += DST_OFF(b1, b0, 0, 0, 0); + + __builtin_assume_aligned(K, K_ALIGN); + __builtin_assume_aligned(Q, Q_ALIGN); + __builtin_assume_aligned(V, V_ALIGN); + __builtin_assume_aligned(A, A_ALIGN); + + /* Load Q tile, destined for SLM */ + q_tile_type Q_tile; + uint q0_copy = q_tile_sg_n * sg_ij; +#ifdef BLOCK_Q + tile_load_block_rem_q( + &Q_tile, (global uint *)Q, q, ldq >> 1, 0, wg_j0 + q0_copy); +#elif Q_ALIGN >= 4 + tile_load(&Q_tile, (global uint *)Q, (d + 1) >> 1, q, ldq >> 1, 0, + wg_j0 + q0_copy); +#else + tile_load_packed_half(&Q_tile, Q, d, q, ldq, 0, wg_j0 + q0_copy); +#endif + +#if WITH_SCALE + /* Load scale */ + #if INVERT_SCALE + float iscale = convert_float(*scale_ptr); + float scale = native_recip(iscale); + #else + float scale = convert_float(*scale_ptr); + float iscale = native_recip(scale); + #endif +#else + float 
iscale = sqrt(convert_float(INPUT1_SIZE_X)); + float scale = native_recip(iscale); +#endif + scale *= 1.442695f; // log2(e) + +#ifdef PREFETCH_K0 + /* Prefetch first K tile. */ + cooperative_prefetch_2d_k(K, k, d, ugemm_kq_wg_tile_m, PREFETCH_D_MAX, ldk, + sg_ij, sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif + + /* Initialize S column sums in SLM to -inf */ + const uint n_col_sg = DIV_UP(ugemm_kq_wg_tile_n, SUBGROUP_SIZE * sg_per_wg); + const float neg_inf = -INFINITY; + +#pragma unroll + for (int q = 0; q < n_col_sg; q++) + intel_sub_group_block_write( + (local uint *)&S_max_slm[(q + sg_ij * n_col_sg) + * SUBGROUP_SIZE], + as_uint(neg_inf)); + + /* Clear accumulator */ + a_tile_type A_tile; + tile_fill(A_tile, 0.0f); + + /* Store Q tile to SLM */ + tile_store_t_sys_src1( + Q_tile, (local uint *)&Q_slm[0], D_MAX / 2, q0_copy, 0); + + /* Clear S column sums/maxes */ + s_sum_tile_type S_sum_tile; + s_sum_tile_type S_max_tile, S_max_tile_old; + tile_fill(S_sum_tile, 0.0f); + tile_fill(S_max_tile, -INFINITY); + + /* Wait for Q data to reach SLM */ + barrier(CLK_LOCAL_MEM_FENCE); + + /* Main loop over k blocks */ + for (int k0 = 0; k0 < k; k0 += ugemm_kq_wg_tile_m) { + bool first = (k0 == 0); + bool last = (k0 + ugemm_kq_wg_tile_m >= k); + + uint sg_i0_kq = sg_i_kq * ugemm_kq_sg_tile_m; + uint sg_j0_kq = sg_j_kq * ugemm_kq_sg_tile_n; + +#if WITH_ATTN_MASK + mask_tile_type mask_tile; + tile_load_t(&mask_tile, msk, q, k, q, sg_j0_kq + wg_j0, k0 + sg_i0_kq); +#endif + +#if REMAINDER_K + /* Prepare k mask: NaN in bounds, -inf out of bounds */ + mask_tile_type_float k_mask; +#pragma unroll + for (int ii = 0; ii < ugemm_kq_sg_tile_m / SUBGROUP_SIZE; ii++) + k_mask.x[0][ii] = (k0 + sg_i0_kq + ii * SUBGROUP_SIZE + + get_sub_group_local_id() + < k) + ? nan(0u) + : -INFINITY; +#endif + + /* Calculate S = (K^T) * Q */ + s_tile_type S_tile + = ugemm_kq(K, ldk, Q_slm, D_MAX, k, ugemm_kq_wg_tile_n, d, k0, + 0, 0, sg_i_kq, sg_j_kq, (local char *)ugemm_slm); + + /* Apply attention mask */ +#if WITH_ATTN_MASK +#define unscale(x) ((x)*iscale) + mask_tile_type_float mask_tile_float; + tile_copy(mask_tile, mask_tile_float); + tile_elementwise(mask_tile_float, unscale); + tile_binary(S_tile, mask_tile_float, binary_add); +#endif + + /* Apply k mask */ +#if REMAINDER_K + tile_hbroadcast_min(&S_tile, k_mask); +#endif + + /* Before softmax, we will need to scale columns by maximum values to avoid overflow. */ + + /* Compute our maxima and reduce across SLM */ + tile_vreduce_max(S_tile, &S_max_tile); + tile_atomic_max_full( + S_max_tile, S_max_slm, ugemm_kq_wg_tile_n, sg_j0_kq, 0); + intel_work_group_barrier_arrive(CLK_LOCAL_MEM_FENCE); + +#ifdef PREFETCH_V + /* Prefetch V tile. 
*/ + cooperative_prefetch_2d_maybe_rem(V, d, k - k0, D_MAX, + (ugemm_kq_wg_tile_m * PREFETCH_D_MAX) / D_MAX, ldv, sg_ij, + sg_per_wg, SUBGROUP_SIZE, LSC_LDCC_L1C_L3C); +#endif + +#ifndef ALT_MAX + /* Read back WG-wide maxima */ + intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + tile_load_full(&S_max_tile, S_max_slm, ugemm_kq_wg_tile_n, sg_j0_kq, 0); +#endif + + tile_vbroadcast_sub(&S_tile, S_max_tile); + +/* Scale + exponentiate */ +#define scaled_exp(x) native_vexp2(x *scale) + tile_elementwise(S_tile, scaled_exp); + +#ifdef ALT_MAX + /* Read back WG-wide maxima and adjust S to match */ + intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + s_sum_tile_type S_max_tile1; + tile_copy(S_max_tile, S_max_tile1); + tile_load_full(&S_max_tile, S_max_slm, ugemm_kq_wg_tile_n, sg_j0_kq, 0); + +#define binary_exp_neg(x, y) native_vexp2(scale *((x) - (y))) + tile_binary(S_max_tile1, S_max_tile, binary_exp_neg); + tile_vbroadcast_mul(&S_tile, S_max_tile1); +#endif + + /* Accumulate sums. S tile is transposed for easy summation. */ + s_sum_tile_type S_sum_tile1; + tile_fill(S_sum_tile1, 0.0f); + tile_vreduce_add(S_tile, &S_sum_tile1); + + /* Convert to half, VNNI format */ + s_tile_type_half2 S_tile_half2; + tile_copy_to_half2(S_tile, S_tile_half2); + + /* Store to SLM, in packed format */ + tile_store_t_sys_src2(S_tile_half2, (local uint *)S_slm, + ugemm_vs_sg_tile_n, ugemm_kq_wg_tile_m / 2, sg_i0_kq / 2, + sg_j0_kq); + intel_work_group_barrier_arrive(CLK_LOCAL_MEM_FENCE); + + /* Rescale existing accumulator and sums to match new maxima */ + if (!first) { +#define binary_exp_sub(x, y) native_vexp2(scale *((x) - (y))) +#define binary_mul(x, y) ((x) * (y)) + tile_binary(S_max_tile_old, S_max_tile, binary_exp_sub); + tile_binary(S_sum_tile, S_max_tile_old, binary_mul); + + /* Find the subset of sums that applies to the accumulation tile */ + a_scale_tile_type A_scale_tile; +#if ugemm_kq_wg_tile_n == ugemm_vs_wg_tile_n \ + && ugemm_kq_sg_tile_n == ugemm_vs_sg_tile_n + tile_copy(S_max_tile_old, A_scale_tile); +#elif ugemm_kq_wg_tile_n == ugemm_vs_wg_tile_n \ + && (ugemm_kq_sg_tile_n % ugemm_vs_sg_tile_n) == 0 + tile_rselect(&A_scale_tile, S_max_tile_old, + sg_j_vs % (ugemm_kq_sg_tile_n / ugemm_vs_sg_tile_n)); +#else +#error unimplemented +#endif + tile_hbroadcast_mul(&A_tile, A_scale_tile); + } + +/* Accumulate sums */ + tile_binary(S_sum_tile, S_sum_tile1, binary_add); + + /* Save maxima */ + tile_copy(S_max_tile, S_max_tile_old); + + /* Last iteration: store column sums in SLM */ + if (last) { + tile_store_full(S_sum_tile, S_sum_slm, ugemm_kq_wg_tile_n, sg_j0_kq, + sg_i_kq); + } + +#ifdef PREFETCH_K + /* Prefetch next K tile. */ + if (!last) { +#if TRANSPOSE_K + const uint stride_k = ldk; +#else + const uint stride_k = 1; +#endif + cooperative_prefetch_2d_k(K + (k0 + ugemm_kq_wg_tile_m) * stride_k, + k - k0 - ugemm_kq_wg_tile_m, d, ugemm_kq_wg_tile_m, + PREFETCH_D_MAX, ldk, sg_ij, sg_per_wg, SUBGROUP_SIZE, + LSC_LDCC_L1C_L3C); + } +#endif +#if WITH_ATTN_MASK && defined(PREFETCH_MASK) + /* Prefetch next mask tile. 
*/ + if (!last) { + cooperative_prefetch_2d(msk + k0 + ugemm_kq_wg_tile_m + sg_i0_kq + (sg_j0_kq + wg_j0) * q, + ugemm_kq_sg_tile_m, ugemm_kq_sg_tile_n, 0, 0, 1, SUBGROUP_SIZE, + LSC_LDCC_L1UC_L3C); + } +#endif + + /* Wait for S stores */ + intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + + /* Last iteration: signal column sums are ready */ + if (last && need_sum_barrier) + intel_work_group_barrier_arrive(CLK_LOCAL_MEM_FENCE); + + /* Accumulate A += V * S */ + int k_chunk = min(k - k0, ugemm_kq_wg_tile_m); + a_tile_type A_tile1 = ugemm_vs(V, ldv, S_slm, ugemm_kq_wg_tile_m, d, + ugemm_kq_wg_tile_n, k_chunk, 0, 0, 0, sg_i_vs, sg_j_vs, + (local char *)ugemm_slm); + V += ldv * ugemm_kq_wg_tile_m; + tile_binary(A_tile, A_tile1, binary_add); + } + + /* Wait for column sums to be ready */ + if (need_sum_barrier) intel_work_group_barrier_wait(CLK_LOCAL_MEM_FENCE); + + /* Load column sums from SLM + reduce in registers */ + a_scale_tile_type A_scale_tile, A_scale_tile_load; + tile_fill(A_scale_tile, 0.0f); + +#pragma unroll + for (uint sg1 = 0; sg1 < ugemm_kq_sg_per_wg_m; sg1++) { + tile_load_full(&A_scale_tile_load, S_sum_slm, ugemm_kq_wg_tile_n, + ugemm_vs_sg_tile_n * sg_j_vs, sg1); + tile_binary(A_scale_tile, A_scale_tile_load, binary_add); + } + + /* Rescale by 1 / (column sums) */ + tile_elementwise(A_scale_tile, native_vrecip); + tile_hbroadcast_mul(&A_tile, A_scale_tile); + + /* Convert to half precision and store */ + a_tile_type_half A_tile_half; + tile_copy_reblock(A_tile, &A_tile_half); + + uint sg_i0_vs = sg_i_vs * ugemm_vs_sg_tile_m; + uint sg_j0_vs = sg_j_vs * ugemm_vs_sg_tile_n + wg_j0; + +#ifdef BLOCK_2D_A + tile_store_block2d(A_tile_half, A, d, q, lda, sg_i0_vs, sg_j0_vs); +#elif defined(BLOCK_A) + tile_store_block_rem_q(A_tile_half, A, q, lda, sg_i0_vs, sg_j0_vs); +#else + tile_store(A_tile_half, A, d, q, lda, sg_i0_vs, sg_j0_vs); +#endif +} diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h index 627fdf758586a3..40ac211b1d1026 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_common.h @@ -19,6 +19,10 @@ #define EXE_MODE_AGE_BASED "-cl-no-subgroup-ifp" #define EXE_MODE_NO_PRERA_SCH "-cl-intel-no-prera-scheduling" +namespace micro { +struct MicroKernelPackage; +} // namspace + namespace kernel_selector { #ifndef UNUSED @@ -64,6 +68,7 @@ struct KernelCode { struct clKernelData { KernelCode code; KernelParams params; + std::vector> micro_kernels; bool skip_execution = false; }; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h index 2154a6c9a51a60..4d776bf41f2439 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernel_selector_params.h @@ -357,6 +357,21 @@ enum class dev_type { discrete_gpu = 1 }; +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +// Arch type +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +enum class gpu_arch { + unknown = 0, + gen9 = 1, + gen11 = 2, + xe_lp = 3, + xe_hp = 4, + xe_hpg = 5, + xe_hpc = 6, + xe2 = 7, +}; + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // EngineInfo 
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -377,9 +392,12 @@ struct EngineInfo { bool enable_sub_groups_emulation = false; bool bOptHintsSupport = false; bool bLocalBlockIOSupport = false; + bool supports_microkernels = false; uint32_t vendor_id = 0x0; dev_type deviceType = dev_type::integrated_gpu; uint32_t computeUnitsCount = 0; + uint32_t ip_version = 0; + gpu_arch arch = gpu_arch::unknown; uint32_t maxThreadsPerExecutionUnit = 0; uint32_t maxThreadsPerDevice = 0; uint64_t maxWorkGroupSize = 0; diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp new file mode 100644 index 00000000000000..e3604a481a8f09 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -0,0 +1,630 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef ENABLE_ONEDNN_FOR_GPU + +#include "sdpa_kernel_micro.h" +#include "common_tools.h" +#include "common_types.h" +#include "jitter.h" +#include "kernel_selector_common.h" +#include "kernel_selector_params.h" +#include "micro_utils.hpp" +#include "tensor_type.h" + +#include +#include +#include +#include + +namespace kernel_selector { + +namespace { + +size_t subgroup_size(gpu_arch arch) { + switch (arch) { + case gpu_arch::gen9: + case gpu_arch::gen11: + case gpu_arch::xe_lp: + case gpu_arch::xe_hp: + case gpu_arch::xe_hpg: return 8; + case gpu_arch::xe_hpc: + case gpu_arch::xe2: return 16; + default: return 0; + } +} + +inline int64_t get_d_max(int64_t head_size) { + for (int64_t i = 32; i <= 1024; i *= 2) + if (head_size <= i) + return i; + return head_size; +} + +micro::Type convert_type(Datatype t) { + switch (t) { + case Datatype::F32: return micro::Type::f32; + case Datatype::F16: return micro::Type::f16; + default: break; + } + OPENVINO_THROW("Unsupported dt: ", toString(t)); +} + +Tensor::NDims normalize_dims(const DataTensor& qkv) { + auto dims = qkv.GetDims(); // xyfb + std::reverse(dims.begin(), dims.end()); // bfyx + return dims; +} + +Tensor::Dim get_num_heads(const DataTensor& qkv, const std::vector& order) { + return normalize_dims(qkv)[order[1]]; +} + +Tensor::Dim get_seq_length(const DataTensor& qkv, const std::vector& order) { + return normalize_dims(qkv)[order[2]]; +} + +struct sdpa_config_t { + int unroll_m_kq, unroll_n_kq; // Subgroup tile sizes for K*Q GEMM + int unroll_m_vs, unroll_n_vs; // Subgroup tile sizes for V*S GEMM + int wg_m_kq, wg_n_kq; // Workgroup configuration for K*Q GEMM + int wg_m_vs, wg_n_vs; // Workgroup configuration for V*S GEMM +}; + +// Kernel configurations: +// h -- maximum head size = N +// s -- target sequence length = M +// 2nd -- second token (thin Q) +sdpa_config_t xehpg_h32 = {32, 16, 16, 16, 2, 16, 2, 16}; +sdpa_config_t xehpg_h32_s256 = {16, 16, 16, 16, 2, 8, 2, 8}; +sdpa_config_t xehpg_h32_s64 = {16, 16, 16, 8, 4, 4, 2, 8}; +sdpa_config_t xehpg_h32_s32 = {8, 8, 8, 8, 4, 4, 4, 4}; +sdpa_config_t xehpg_h32_2nd = {8, 32, 16, 8, 8, 1, 2, 4}; + +sdpa_config_t xehpg_h64 = {32, 16, 16, 16, 4, 8, 4, 8}; +sdpa_config_t xehpg_h64_s128 = {16, 16, 16, 16, 4, 8, 4, 8}; +sdpa_config_t xehpg_h64_s64 = {32, 16, 16, 8, 8, 4, 4, 8}; +sdpa_config_t xehpg_h64_2nd = {8, 16, 16, 8, 8, 1, 4, 2}; + +sdpa_config_t xehpg_h128 = {16, 16, 32, 8, 8, 4, 4, 8}; +sdpa_config_t xehpg_h128_s32 = {16, 16, 16, 8, 16, 2, 8, 4}; +sdpa_config_t xehpg_h128_2nd = {8, 16, 16, 8, 
16, 1, 8, 2}; +sdpa_config_t xehpg_h128_s256_2nd = {8, 16, 32, 8, 8, 1, 4, 2}; + +sdpa_config_t xehpg_h256 = {16, 16, 32, 8, 16, 2, 8, 4}; +sdpa_config_t xehpg_h256_s128 = {8, 16, 32, 16, 8, 4, 8, 4}; +sdpa_config_t xehpg_h256_s32 = {8, 16, 32, 8, 16, 2, 8, 4}; +sdpa_config_t xehpg_h256_2nd = {8, 8, 16, 8, 16, 1, 16, 1}; +sdpa_config_t xehpg_h256_s64_2nd = {16, 8, 16, 8, 16, 1, 16, 1}; +sdpa_config_t xehpg_h256_s32_2nd = {16, 16, 32, 8, 16, 1, 8, 2}; + +sdpa_config_t xehpc_h32 = {16, 64, 32, 16, 4, 2, 1, 8}; +sdpa_config_t xehpc_h32_s32 = {16, 16, 16, 16, 2, 4, 2, 4}; +sdpa_config_t xehpc_h32_2nd = {16, 64, 16, 16, 8, 1, 2, 4}; + +sdpa_config_t xehpc_h64 = {16, 64, 32, 16, 8, 2, 2, 8}; +sdpa_config_t xehpc_h64_s64 = {32, 32, 32, 16, 4, 2, 2, 4}; +sdpa_config_t xehpc_h64_s32 = {16, 16, 16, 16, 4, 2, 4, 2}; +sdpa_config_t xehpc_h64_2nd = {32, 32, 32, 16, 4, 1, 2, 2}; +sdpa_config_t xehpc_h64_s64_2nd = {16, 16, 16, 16, 4, 1, 4, 1}; + +sdpa_config_t xehpc_h128 = {16, 64, 32, 16, 16, 2, 4, 8}; +sdpa_config_t xehpc_h128_s64 = {16, 32, 32, 32, 4, 2, 4, 2}; +sdpa_config_t xehpc_h128_s32 = {16, 16, 16, 16, 8, 2, 8, 2}; +sdpa_config_t xehpc_h128_2nd = {32, 32, 32, 16, 8, 1, 4, 2}; + +sdpa_config_t xehpc_h256 = {16, 32, 32, 32, 8, 4, 8, 4}; +sdpa_config_t xehpc_h256_s64 = {16, 32, 32, 32, 8, 1, 8, 1}; +sdpa_config_t xehpc_h256_2nd = {16, 16, 16, 16, 16, 1, 16, 1}; + +sdpa_config_t *choose_config_xehpg(int head_size, int seq, bool thin_q) { + if (head_size <= 32) { + if (thin_q) return &xehpg_h32_2nd; + if (seq <= 32) return &xehpg_h32_s32; + if (seq <= 64) return &xehpg_h32_s64; + if (seq <= 256) return &xehpg_h32_s256; + return &xehpg_h32; + } else if (head_size <= 64) { + if (thin_q) return &xehpg_h64_2nd; + if (seq <= 64) return &xehpg_h64_s64; + if (seq <= 128) return &xehpg_h64_s128; + return &xehpg_h64; + } else if (head_size <= 128) { + if (thin_q) { + if (seq <= 256) return &xehpg_h128_s256_2nd; + return &xehpg_h128_2nd; + } + if (seq <= 32) return &xehpg_h128_s32; + return &xehpg_h128; + } else if (head_size <= 256) { + if (thin_q) { + if (seq <= 32) return &xehpg_h256_s32_2nd; + if (seq <= 64) return &xehpg_h256_s64_2nd; + return &xehpg_h256_2nd; + } + if (seq <= 32) return &xehpg_h256_s32; + if (seq <= 128) return &xehpg_h256_s128; + return &xehpg_h256; + } + return nullptr; +} + +sdpa_config_t *choose_config_xehpc(int head_size, int seq, bool thin_q) { + if (head_size <= 32) { + if (thin_q) return &xehpc_h32_2nd; + if (seq <= 32) return &xehpc_h32_s32; + return &xehpc_h32; + } else if (head_size <= 64) { + if (thin_q) { + if (seq <= 64) return &xehpc_h64_s64_2nd; + return &xehpc_h64_2nd; + } + if (seq <= 32) return &xehpc_h64_s32; + if (seq <= 64) return &xehpc_h64_s64; + return &xehpc_h64; + } else if (head_size <= 128) { + if (thin_q) return &xehpc_h128_2nd; + if (seq <= 32) return &xehpc_h128_s32; + if (seq <= 64) return &xehpc_h128_s64; + return &xehpc_h128; + } else if (head_size <= 256) { + if (thin_q) return &xehpc_h256_2nd; + if (seq <= 64) return &xehpc_h256_s64; + return &xehpc_h256; + } + return nullptr; +} + +} // namespace + +std::mutex SDPAKernelMicro::m; + +void SDPAKernelMicro::init_microkernels(const sdpa_params& params, micro::Package& gemm_kq, micro::Package& gemm_vs, bool is_prefill) const { + // TODO: Remove once micro API is thread safe + std::lock_guard l(m); + const auto& Q = params.inputs[0]; + const auto& K = params.inputs[1]; + const auto& V = params.inputs[2]; + + auto& out = params.outputs[0]; + const auto head_size = params.conf.head_size; + const auto d_max 
= get_d_max(head_size); + const Tensor::Dim n_keys = get_seq_length(K, params.input1_order); + const Tensor::Dim n_queries = get_seq_length(Q, params.input0_order); + const Tensor::Dim n_values = V.X(); + const auto batch = out.Batch().v * out.Feature().v; + + /* Retrieve pre-tuned kernel configuration */ + sdpa_config_t *config = nullptr; + bool thin_q = (!n_queries.is_dynamic && (n_queries.v <= 16)) || !is_prefill; + + switch (params.engineInfo.arch) { + case gpu_arch::xe_hpg: { + config = choose_config_xehpg(static_cast(head_size), static_cast(n_keys.v), thin_q); + break; + } + case gpu_arch::xe_hpc: + case gpu_arch::xe2: { + config = choose_config_xehpc(static_cast(head_size), static_cast(n_keys.v), thin_q); + break; + } + default: break; + } + + /* Get device information */ + micro::HWInformation hw_info; + hw_info.euCount = params.engineInfo.computeUnitsCount; + hw_info.gmdid = params.engineInfo.ip_version; + hw_info.systolicAvailable = params.engineInfo.supports_immad; + + /* Set up GEMMProblem structure for first GEMM: K^T * Q */ + micro::GEMMProblem problem; + problem.Ta = problem.Ta_ext = convert_type(K.GetDType()); + problem.Tb = problem.Tb_ext = convert_type(Q.GetDType()); + problem.Tc = problem.Tc_ext = micro::Type::f32; + problem.Ts = problem.Tc; + + auto problem_kq = problem; + problem_kq.A.layout = micro::MatrixLayout::T; + problem_kq.B.layout = micro::MatrixLayout::Pr; + problem_kq.C.layout = micro::MatrixLayout::T; + problem_kq.A.setAlignment(micro::alignment_for_ld(head_size * problem.Ta)); + problem_kq.B.setAlignment(64); // Q is packed in VNNI format in SLM + problem_kq.B.crosspack = 2; + problem_kq.B.tileR = d_max; + problem_kq.B.tileC = static_cast(subgroup_size(params.engineInfo.arch)); + + /* Set up problem size information */ + micro::SizeParams sizes; + sizes.m = static_cast(n_keys.v); + sizes.n = static_cast(n_queries.v); + sizes.k = static_cast(head_size); + sizes.batch = static_cast(batch); + + /* Set up microkernel requirements */ + std::vector reqs_kq; + reqs_kq.push_back(micro::StrategyRequirement::UnrollM == config->unroll_m_kq); + reqs_kq.push_back(micro::StrategyRequirement::UnrollN == config->unroll_n_kq); + reqs_kq.push_back(micro::StrategyRequirement::WGM == config->wg_m_kq); + reqs_kq.push_back(micro::StrategyRequirement::WGN == config->wg_n_kq); + + /* Set up microkernel options */ + micro::GEMMProtocol::Options opts_kq; + opts_kq.localB = true; + opts_kq.slmPtr = true; + + /* Ask microkernel provider for microkernel */ + gemm_kq = micro::select_gemm_microkernel(opts_kq, hw_info, sizes, problem_kq, reqs_kq); + + /* Update for second GEMM: V*S */ + auto problem_vs = problem; + problem_vs.Ta = problem_vs.Ta_ext = convert_type(V.GetDType()); + problem_vs.A.layout = micro::MatrixLayout::N; + problem_vs.B.layout = micro::MatrixLayout::Pr; + problem_vs.C.layout = micro::MatrixLayout::N; + problem_vs.A.setAlignment(micro::alignment_for_ld(head_size * problem.Ta)); + problem_vs.B.setAlignment(64); // S is packed in SLM + problem_vs.B.crosspack = 16; + sizes.m = static_cast(n_values.v); + sizes.n = gemm_kq.getSetting("wg_tile_n"); + sizes.k = gemm_kq.getSetting("wg_tile_m"); + + /* Set up special kernel requirements */ + std::vector reqs_vs; + reqs_vs.push_back(micro::StrategyRequirement::UnrollM == config->unroll_m_vs); + reqs_vs.push_back(micro::StrategyRequirement::UnrollN == config->unroll_n_vs); + reqs_vs.push_back(micro::StrategyRequirement::WGM == config->wg_m_vs); + reqs_vs.push_back(micro::StrategyRequirement::WGN == config->wg_n_vs); + + 
micro::GEMMProtocol::Options opts_vs; + opts_vs.localB = true; + opts_vs.slmPtr = true; + + auto adjust_vs = [](micro::GEMMStrategy &strategy) { + /* Enable dpasw */ + strategy.dpasw |= strategy.fused; + }; + /* Ask microkernel provider for microkernel */ + gemm_vs = micro::select_gemm_microkernel(opts_vs, hw_info, sizes, problem_vs, reqs_vs, adjust_vs); +} + +ParamsKey SDPAKernelMicro::GetSupportedKey() const { + ParamsKey k; + k.EnableInputDataType(Datatype::F16); + k.EnableOutputDataType(Datatype::F16); + + k.EnableInputLayout(DataLayout::bfyx); + k.EnableOutputLayout(DataLayout::bfyx); + + k.EnableDifferentTypes(); + k.EnableTensorOffset(); + k.EnableTensorPitches(); + k.EnableBatching(); + k.EnableDynamicShapesSupport(); + + return k; +} + +bool SDPAKernelMicro::Validate(const Params& p) const { + if (!Parent::Validate(p)) + return false; + + const sdpa_params& params = static_cast(p); + + if (params.engineInfo.arch < gpu_arch::xe_hpg || !params.engineInfo.supports_microkernels) + return false; + + if (params.conf.is_causal) + return false; + + if (params.indirect_axis != -1) + return false; + + auto Q_num_heads_dim = get_num_heads(params.inputs[0], params.input0_order); + auto K_num_heads_dim = get_num_heads(params.inputs[1], params.input1_order); + auto V_num_heads_dim = get_num_heads(params.inputs[2], params.input2_order); + + if (params.input0_order != params.input1_order || params.input0_order != params.input2_order) + return false; + + if (params.input0_order[3] != 3) + return false; + + if (Q_num_heads_dim.is_dynamic || K_num_heads_dim.is_dynamic || V_num_heads_dim.is_dynamic || K_num_heads_dim.v != V_num_heads_dim.v) + return false; + + return true; +} + +JitConstants SDPAKernelMicro::GetJitConstants(const sdpa_params& params, const micro::Package& gemm_kq, const micro::Package& gemm_vs) const { + auto jit = MakeBaseParamsJitConstants(params); + const auto& prim_params = dynamic_cast(params); + + const auto& Q = prim_params.inputs[0]; + const auto& K = prim_params.inputs[1]; + const auto& V = prim_params.inputs[2]; + + const auto head_size = prim_params.conf.head_size; + + auto ldq = head_size * Q.ElementSize(); + auto ldk = head_size * K.ElementSize(); + auto ldv = head_size * V.ElementSize(); + auto lda = head_size * prim_params.outputs[0].ElementSize(); + + const auto d_max = get_d_max(head_size); + const auto n_keys = get_seq_length(K, prim_params.input1_order); + const auto n_queries = get_seq_length(Q, prim_params.input0_order); + const auto n_values = V.X(); + + jit.AddConstant(MakeJitConstant("D_MAX", d_max)); + jit.AddConstant(MakeJitConstant("SUBGROUP_SIZE", subgroup_size(prim_params.engineInfo.arch))); + jit.AddConstant(MakeJitConstant("INVERT_SCALE", false)); + jit.AddConstant(MakeJitConstant("SCALE_DATA_T", "half")); + + jit.AddConstant(MakeJitConstant("WITH_ATTN_MASK", params.inputs.size() > 3)); + jit.AddConstant(MakeJitConstant("WITH_SCALE", params.inputs.size() > 4)); + jit.AddConstant(MakeJitConstant("Q_ALIGN", micro::alignment_for_ld(ldq))); + jit.AddConstant(MakeJitConstant("K_ALIGN", micro::alignment_for_ld(ldk))); + jit.AddConstant(MakeJitConstant("V_ALIGN", micro::alignment_for_ld(ldv))); + jit.AddConstant(MakeJitConstant("A_ALIGN", micro::alignment_for_ld(lda))); + + jit.AddConstant(MakeJitConstant("TRANSPOSE_K", false)); + + int tile_k = gemm_kq.getSetting("wg_tile_m"); + int tile_q = gemm_kq.getSetting("wg_tile_n"); + int tile_v = gemm_vs.getSetting("wg_tile_m"); + + bool d_full = (head_size == d_max); + bool v_full = (head_size == tile_v); + bool 
k_full = !n_keys.is_dynamic && (n_keys.v % tile_k) == 0; + bool q_full = !n_queries.is_dynamic && (n_queries.v % tile_q) != 0; + + auto Q_num_heads_dim = get_num_heads(Q, params.input0_order); + auto K_num_heads_dim = get_num_heads(K, params.input1_order); + + jit.AddConstant(MakeJitConstant("REMAINDER_K", !k_full)); + jit.AddConstant(MakeJitConstant("KV_GROUP_SIZE", Q_num_heads_dim.v / K_num_heads_dim.v)); + + if (d_full) { + if (ldq % 4 == 0) + jit.AddConstant(MakeJitConstant("BLOCK_Q", 1)); + if (lda % 4 == 0 && v_full) + jit.AddConstant(MakeJitConstant("BLOCK_A", 1)); + jit.AddConstant(MakeJitConstant("REMAINDER_Q", !q_full)); + } else if (params.engineInfo.arch >= gpu_arch::xe_hpc) { + auto vbytes = n_values.v * V.ElementSize(); + if (lda % 16 == 0 && vbytes % 4 == 0) + jit.AddConstant(MakeJitConstant("BLOCK_2D_A", 1)); + } + + if (params.engineInfo.arch >= gpu_arch::xe_hpc) { + jit.AddConstant(MakeJitConstant("PREFETCH_MASK", 1)); + jit.AddConstant(MakeJitConstant("PREFETCH_K0", 1)); + jit.AddConstant(MakeJitConstant("PREFETCH_K", 1)); + jit.AddConstant(MakeJitConstant("PREFETCH_V", 1)); + bool no_rem = d_full && v_full && k_full; + jit.AddConstant(MakeJitConstant("PREFETCH_REMAINDER", !no_rem)); + jit.AddConstant(MakeJitConstant("PREFETCH_D_MAX", std::min(d_max, 64))); + } + + auto unit_parameters = [](std::string prefix) { + JitConstants definitions({}); + for (size_t i = 0; i < 4; i++) { + definitions.AddConstant(MakeJitConstant(prefix + "_B" + std::to_string(i), 1)); + definitions.AddConstant(MakeJitConstant(prefix + "_SB" + std::to_string(i), 1)); + } + + return definitions; + }; + + auto convert_strides = [](std::string target_prefix, std::string source_prefix, const std::vector order) { + JitConstants definitions({}); + + std::vector target_definitions = { + target_prefix + "_S0", + target_prefix + "_S1", + target_prefix + "_S2", + target_prefix + "_S3", + }; + + std::vector source_definitions = { + source_prefix + "_BATCH_PITCH", + source_prefix + "_FEATURE_PITCH", + source_prefix + "_Y_PITCH", + source_prefix + "_X_PITCH", + }; + + for (size_t i = 0; i < target_definitions.size(); i++) { + definitions.AddConstant(MakeJitConstant(target_definitions[order[i]], source_definitions[i])); + } + + return definitions; + }; + + jit.Merge(convert_strides("QRY", "INPUT0", prim_params.input0_order)); + jit.Merge(convert_strides("KEY", "INPUT1", prim_params.input1_order)); + jit.Merge(convert_strides("VAL", "INPUT2", prim_params.input2_order)); + jit.Merge(convert_strides("DST", "OUTPUT", prim_params.output_order)); + + jit.Merge(unit_parameters("QRY")); + jit.Merge(unit_parameters("KEY")); + jit.Merge(unit_parameters("VAL")); + jit.Merge(unit_parameters("DST")); + + return jit; +} + +CommonDispatchData SDPAKernelMicro::SetDefault(const sdpa_params& params, const micro::Package& gemm_kq, const micro::Package& gemm_vs) const { + CommonDispatchData dispatch_data; + + auto wg_tile_q = gemm_kq.getSetting("wg_tile_n"); + auto sg_per_wg = gemm_kq.getSetting("sg_per_wg_m") * gemm_kq.getSetting("sg_per_wg_n"); + + dispatch_data.lws = {subgroup_size(params.engineInfo.arch), (size_t)sg_per_wg, 1}; + dispatch_data.gws = dispatch_data.lws; + + dispatch_data.gws[0] *= CeilDiv(get_seq_length(params.inputs[0], params.input0_order).v, wg_tile_q); + dispatch_data.gws[1] *= params.outputs[0].Feature().v; + dispatch_data.gws[2] *= params.outputs[0].Batch().v; + + return dispatch_data; +} + +clKernelData SDPAKernelMicro::get_kernel_data(const sdpa_params& params, bool is_prefill) const { + auto name = 
kernelName + (is_prefill ? "_prefill" : "_generate"); + + std::vector gemms(2); // KQ and VS + init_microkernels(params, gemms[kq_id], gemms[vs_id], is_prefill); + auto dispatch_data = SetDefault(params, gemms[kq_id], gemms[vs_id]); + auto entry_point = GetEntryPoint(name, params.layerID, params); + auto jit = CreateJit(name, GetJitConstants(params, gemms[kq_id], gemms[vs_id]), entry_point); + clKernelData kernel; + + FillCLKernelData(kernel, dispatch_data, params.engineInfo, kernelName, jit, entry_point, + "", false, false, static_cast(params.inputs.size()), + GetFusedPrimitiveInputsCount(params), 1, params.is_shape_agnostic); + + kernel.params.arguments.clear(); + if (params.is_shape_agnostic ) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0}); + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 1}); // K + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0}); // Q + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 2}); // V + kernel.params.arguments.push_back({ArgumentDescriptor::Types::OUTPUT, 0}); // A + + if (params.inputs.size() >= 4) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 3}); // mask + if (params.inputs.size() >= 5) + kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 4}); // Scale + + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 0}); // D + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 1}); // K + kernel.params.arguments.push_back({ArgumentDescriptor::Types::SCALAR, 2}); // Q + + const auto& Q = params.inputs[0]; + const auto& K = params.inputs[1]; + + const auto n_queries = get_seq_length(Q, params.input0_order); + const auto n_keys = get_seq_length(K, params.input1_order); + + auto head_size = params.conf.head_size; + + ScalarDescriptor s_d; + s_d.t = ScalarDescriptor::Types::INT32; + s_d.v.s32 = static_cast(head_size); + + ScalarDescriptor s_k; + s_k.t = ScalarDescriptor::Types::INT32; + s_k.v.s32 = static_cast(n_keys.v); + + ScalarDescriptor s_q; + s_q.t = ScalarDescriptor::Types::INT32; + s_q.v.s32 = static_cast(n_queries.v); + + kernel.params.scalars.push_back(s_d); + kernel.params.scalars.push_back(s_k); + kernel.params.scalars.push_back(s_q); + + /* Generate microkernel shims */ + micro::ShimOptions shim_options; + shim_options.subgroupSize = static_cast(subgroup_size(params.engineInfo.arch)); + shim_options.useTileOps = true; + shim_options.decorator = "kq"; + + kernel.code.kernelString->jit += generateShim(gemms[kq_id], micro::HostLanguage::OpenCL_C, shim_options); + + shim_options.microkernelID++; + shim_options.decorator = "vs"; + kernel.code.kernelString->jit += generateShim(gemms[vs_id], micro::HostLanguage::OpenCL_C, shim_options); + + if (gemms[kq_id].grfMin > 128 || gemms[vs_id].grfMin > 128) + kernel.code.kernelString->options += " -cl-intel-256-GRF-per-thread"; + + std::string extra_options = " -Dcl_intel_dot_accumulate"; + extra_options += " -Dcl_intel_global_float_atomic"; + extra_options += " -Dcl_intel_subgroup_matrix_multiply_accumulate"; + extra_options += " -Dcl_intel_subgroup_split_matrix_multiply_accumulate"; + kernel.code.kernelString->options += extra_options; + + kernel.code.kernelString->batch_compilation = false; + kernel.code.kernelString->has_microkernels = true; + + for (auto& p : gemms) { + kernel.micro_kernels.push_back(std::make_shared(p)); + } + + return kernel; +} + +KernelsData SDPAKernelMicro::GetKernelsData(const Params& params) const { + const 
size_t num_kernels = 2; + KernelData kd = KernelData::Default(params, num_kernels); + const auto& prim_params = dynamic_cast(params); + + if (!Validate(params)) { + return {}; + } + + for (size_t i = 0; i < num_kernels; i++) { + kd.kernels[i] = get_kernel_data(prim_params, i == prefill_id); + } + + GetUpdateDispatchDataFunc(kd); + + return { kd }; +} + +void SDPAKernelMicro::GetUpdateDispatchDataFunc(KernelData& kd) const { + kd.update_dispatch_data_func = [this](const Params& params, KernelData& kernel_data) { + const auto& prim_params = static_cast(params); + const auto& Q = prim_params.inputs[0]; + const auto& K = prim_params.inputs[1]; + + const auto n_queries = get_seq_length(Q, prim_params.input0_order); + const auto n_keys = get_seq_length(K, prim_params.input1_order); + + auto head_size = prim_params.conf.head_size; + + ScalarDescriptor s_d; + s_d.t = ScalarDescriptor::Types::INT32; + s_d.v.s32 = static_cast(head_size); + + ScalarDescriptor s_k; + s_k.t = ScalarDescriptor::Types::INT32; + s_k.v.s32 = static_cast(n_keys.v); + + ScalarDescriptor s_q; + s_q.t = ScalarDescriptor::Types::INT32; + s_q.v.s32 = static_cast(n_queries.v); + + // TODO: Currently 2nd token version works slower than prefill version + const bool is_prefill = true;//n_queries.v > 1; + + OPENVINO_ASSERT(kernel_data.kernels.size() == 2, "[GPU] Invalid kernels size for update dispatch data func"); + + size_t target_kernel = is_prefill ? prefill_id : generate_id; + + kernel_data.kernels[prefill_id].skip_execution = true; + kernel_data.kernels[generate_id].skip_execution = true; + + const auto& gemms = kernel_data.kernels[target_kernel].micro_kernels; + auto dispatchData = SetDefault(prim_params, gemms[kq_id]->p, gemms[vs_id]->p); + kernel_data.kernels[target_kernel].params.workGroups.global = dispatchData.gws; + kernel_data.kernels[target_kernel].params.workGroups.local = dispatchData.lws; + kernel_data.kernels[target_kernel].skip_execution = KernelData::SkipKernelExecution(prim_params); + + kernel_data.kernels[target_kernel].params.scalars.clear(); + kernel_data.kernels[target_kernel].params.scalars.push_back(s_d); + kernel_data.kernels[target_kernel].params.scalars.push_back(s_k); + kernel_data.kernels[target_kernel].params.scalars.push_back(s_q); + }; +} + +KernelsPriority SDPAKernelMicro::GetKernelsPriority(const Params& /*params*/) const { + return FORCE_PRIORITY_1; +} +} // namespace kernel_selector + +#endif // ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.h new file mode 100644 index 00000000000000..f282c52f8437c9 --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.h @@ -0,0 +1,47 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef ENABLE_ONEDNN_FOR_GPU + +#pragma once + +#include "sdpa_kernel_base.h" +#include "micro_utils.hpp" + +namespace kernel_selector { +class SDPAKernelMicro : public SDPAKernelBase { +public: + using Parent = SDPAKernelBase; + SDPAKernelMicro() : SDPAKernelBase("sdpa_micro") {} + virtual ~SDPAKernelMicro() {} + + KernelsData GetKernelsData(const Params& params) const override; + KernelsPriority GetKernelsPriority(const Params& params) const override; + ParamsKey GetSupportedKey() const override; + +protected: + bool Validate(const Params& p) const override; + void GetUpdateDispatchDataFunc(KernelData& kd) const override; + CommonDispatchData SetDefault(const 
sdpa_params& params, const micro::Package& gemm_kq, const micro::Package& gemm_vs) const; + JitConstants GetJitConstants(const sdpa_params& params, const micro::Package& gemm_kq, const micro::Package& gemm_vs) const; + std::vector GetSupportedFusedOps() const override { + return {}; + } + + void init_microkernels(const sdpa_params& params, micro::Package& gemm_kq, micro::Package& gemm_vs, bool is_prefill) const; + clKernelData get_kernel_data(const sdpa_params& params, bool is_prefill) const; + +private: + static constexpr size_t prefill_id = 0; + static constexpr size_t generate_id = 1; + + static constexpr size_t kq_id = 0; + static constexpr size_t vs_id = 1; + + static std::mutex m; +}; +} // namespace kernel_selector + + +#endif // ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp index 23dc964eb18eea..e53338e7c4522d 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_opt.cpp @@ -273,7 +273,7 @@ void SDPAKernelOpt::GetUpdateDispatchDataFunc(KernelData& kd) const { }; } -KernelsPriority SDPAKernelOpt::GetKernelsPriority(const Params& /*params*/) const { - return FORCE_PRIORITY_1; +KernelsPriority SDPAKernelOpt::GetKernelsPriority(const Params& params) const { + return params.engineInfo.supports_immad ? FORCE_PRIORITY_2 : FORCE_PRIORITY_1; } } // namespace kernel_selector diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp index b58f04f23e2643..7e88d0ad71fed1 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_selector.cpp @@ -5,12 +5,16 @@ #include "sdpa_kernel_selector.h" #include "sdpa_kernel_ref.h" #include "sdpa_kernel_opt.h" +#include "sdpa_kernel_micro.h" namespace kernel_selector { sdpa_kernel_selector::sdpa_kernel_selector() { Attach(); Attach(); +#ifdef ENABLE_ONEDNN_FOR_GPU + Attach(); +#endif } KernelsData sdpa_kernel_selector::GetBestKernels(const Params& params) const { diff --git a/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp new file mode 100644 index 00000000000000..828c9016d8669e --- /dev/null +++ b/src/plugins/intel_gpu/src/kernel_selector/micro_utils.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#ifdef ENABLE_ONEDNN_FOR_GPU + +#ifdef UNUSED +# undef UNUSED +#endif + +#ifndef NOMINMAX +# define NOMINMAX +#endif + +#include "gpu/intel/microkernels/package.hpp" +#include "gpu/intel/jit/gemm/microkernel_provider.hpp" +#include "gpu/intel/jit/gemm/gen_gemm_kernel_generator.hpp" +#include "gpu/intel/microkernels/shim.hpp" + +namespace micro { + +using Package = dnnl::impl::gpu::intel::micro::Package; +using HWInformation = dnnl::impl::gpu::intel::jit::HWInformation; +using GEMMProblem = dnnl::impl::gpu::intel::jit::GEMMProblem; +using GEMMStrategy = dnnl::impl::gpu::intel::jit::GEMMStrategy; +using GEMMProtocol = dnnl::impl::gpu::intel::micro::GEMMProtocol; +using MatrixLayout = dnnl::impl::gpu::intel::jit::MatrixLayout; +using Type = dnnl::impl::gpu::intel::jit::Type; +using SizeParams = dnnl::impl::gpu::intel::jit::SizeParams; +using 
StrategyRequirement = dnnl::impl::gpu::intel::jit::StrategyRequirement; +using ShimOptions = dnnl::impl::gpu::intel::micro::ShimOptions; +using HostLanguage = dnnl::impl::gpu::intel::micro::HostLanguage; + +// Wrapper for Package which is used in clKernelData with forward declaration +// to avoid including this header in many places in plugin +// which may cause symbols conflicts with oneDNN +struct MicroKernelPackage { + explicit MicroKernelPackage(Package _p) : p(_p) {} + Package p; +}; + +inline Package select_gemm_microkernel(GEMMProtocol protocol, HWInformation hw_info, SizeParams sizes, const GEMMProblem &problem, + const std::vector &reqs = std::vector(), + void (*strategyAdjuster)(GEMMStrategy &strategy) = nullptr) { + return dnnl::impl::gpu::intel::jit::selectGEMMMicrokernel(protocol, hw_info, sizes, problem, reqs, strategyAdjuster); +} + +static inline int alignment_for_ld(int ld) { + return dnnl::impl::gpu::intel::jit::alignmentForLD(ld); +} + +} // namespace micro + +#undef UNUSED + +#endif // ENABLE_ONEDNN_FOR_GPU diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp index 8863b47fe55bd2..cd8128baff37c9 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.cpp @@ -22,7 +22,7 @@ primitive_db::primitive_db() : primitives({ #include "ks_primitive_db.inc" }), - batch_header_str({ + batch_headers({ #include "ks_primitive_db_batch_headers.inc" }) { } diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h index 73858754d35e39..e384f6c9879fb5 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h +++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db.h @@ -22,7 +22,7 @@ struct primitive_db { primitive_db(); std::vector get(const primitive_id& id) const; - std::vector get_batch_header_str() const { return std::move(batch_header_str); } + std::map get_batch_headers() const { return std::move(batch_headers); } private: struct case_insensitive_compare { @@ -35,7 +35,7 @@ struct primitive_db { } }; std::multimap primitives; - std::vector batch_header_str; + std::map batch_headers; }; } // namespace cache diff --git a/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py b/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py index 4f746d8cc526f6..116844f3bccfc7 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py +++ b/src/plugins/intel_gpu/src/kernel_selector/primitive_db_gen.py @@ -198,7 +198,8 @@ def batch_headers_to_str(self): characters = 1 # Newline character above res = "" for h in self.batch_headers: - res += '(std::string) R"(\n' + header_name = h[:h.find('.cl')] + res += '{{"{}",\n(std::string) R"(\n'.format(header_name) header_file = os.path.abspath(os.path.join(os.path.dirname(self.kernels_folder + "/include/batch_headers"), "batch_headers/" + h)) content = [] with open(header_file) as f: @@ -207,11 +208,11 @@ def batch_headers_to_str(self): if line.startswith('#include'): continue if (i + 1) % max_lines == 0 or characters + len(line) + 1 > max_characters: - res += ')",' + ' (std::string) R"(' + res += ')"\n + (std::string) R"(' characters = 0 res += '{}\n'.format(line.rstrip()) characters += len(line) + 1 - res += ')",\n\n' + res += ')"},\n\n' return self.post_process_sources(res) def post_process_sources(self, content): diff --git 
a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp index d002c868ffd225..0ba780183b186d 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/scaled_dot_product_attention.cpp @@ -3,7 +3,6 @@ // #include "intel_gpu/plugin/program_builder.hpp" -#include "intel_gpu/plugin/common_utils.hpp" #include "intel_gpu/op/sdpa.hpp" #include "intel_gpu/op/indirect_sdpa.hpp" @@ -30,9 +29,15 @@ static void CreateScaledDotProductAttentionOp(ProgramBuilder& p, const std::shar auto layerName = layer_type_name_ID(op); bool is_causal = op->get_causal(); + auto order = ov::op::internal::SDPA::default_order(op->get_output_partial_shape(0).size()); auto sdpa_prim = cldnn::scaled_dot_product_attention(layerName, inputs, - is_causal); + is_causal, + -1, + order, + order, + order, + order); p.add_primitive(*op, sdpa_prim); } diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index b6db0233ee9093..e3f7c2cfe2f6b5 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -83,17 +83,18 @@ std::string Plugin::get_device_id(const ov::AnyMap& config) const { return id; } -void Plugin::transform_model(std::shared_ptr& model, const ExecutionConfig& config) const { +void Plugin::transform_model(std::shared_ptr& model, const ExecutionConfig& config, const std::shared_ptr& context) const { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::transform_model"); - auto deviceInfo = m_device_map.at(config.get_property(ov::device::id))->get_info(); - TransformationsPipeline transformations(config, deviceInfo); + TransformationsPipeline transformations(config, context); auto start = Time::now(); transformations.apply(model); GPU_DEBUG_LOG << "Transformations time: " << std::chrono::duration_cast(Time::now() - start).count() << " ms" << std::endl; } -std::shared_ptr Plugin::clone_and_transform_model(const std::shared_ptr& model, const ExecutionConfig& config) const { +std::shared_ptr Plugin::clone_and_transform_model(const std::shared_ptr& model, + const ExecutionConfig& config, + const std::shared_ptr& context) const { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::clone_and_transform_model"); GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_DEFINE_MEM_LOGGER("Plugin::clone_and_transform_model"); @@ -106,7 +107,7 @@ std::shared_ptr Plugin::clone_and_transform_model(const std::shared_p ov::pass::VisualizeTree(path_base + ".svg").run_on_model(cloned_model); } - transform_model(cloned_model, config); + transform_model(cloned_model, config, context); // Transformations for some reason may drop output tensor names, so here we copy those from the original model auto new_results = cloned_model->get_results(); @@ -169,7 +170,7 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< config.set_user_property(orig_config); config.apply_user_properties(context->get_engine().get_device_info()); - auto transformed_model = clone_and_transform_model(model, config); + auto transformed_model = clone_and_transform_model(model, config, context); { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::compile_model::CreateCompiledModel"); return std::make_shared(transformed_model, shared_from_this(), context, config); @@ -188,7 +189,7 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< config.set_user_property(orig_config); 
config.apply_user_properties(context_impl->get_engine().get_device_info()); - auto transformed_model = clone_and_transform_model(model, config); + auto transformed_model = clone_and_transform_model(model, config, context_impl); return std::make_shared(transformed_model, shared_from_this(), context_impl, config); } @@ -259,10 +260,10 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr& float query_model_ratio = config.get_property(ov::internal::query_model_ratio.name()).as(); auto supported = ov::get_supported_nodes(model, - [&config,this](std::shared_ptr& model) { + [&config,&ctx,this](std::shared_ptr& model) { std::map shapes; std::map> batch_dim; - transform_model(model, config); + transform_model(model, config, ctx); }, [&prog](std::shared_ptr node) { return prog.is_op_supported(node); @@ -709,7 +710,7 @@ uint32_t Plugin::get_max_batch_size(const ov::AnyMap& options) const { return static_cast(max_batch_size); } - TransformationsPipeline transformations(config, device_info); + TransformationsPipeline transformations(config, context); transformations.apply(cloned_model); program = std::make_shared(cloned_model, engine, config, true); std::pair device_memory_usage = program->get_compiled_program()->get_estimated_device_mem_usage(); @@ -778,7 +779,7 @@ uint32_t Plugin::get_optimal_batch_size(const ov::AnyMap& options) const { << ", L3_cache_size is (MB): " << float(L3_cache_size) / 1024 / 1024 << std::endl; } auto config = m_configs_map.at(device_id); - auto cloned_model = clone_and_transform_model(model, config); + auto cloned_model = clone_and_transform_model(model, config, context); ov::MemBandwidthPressure memPressure = ov::mem_bandwidth_pressure_tolerance(cloned_model, L3_cache_size); uint32_t batch = 1; if (memPressure.max_mem_tolerance != ov::MemBandwidthPressure::UNKNOWN) diff --git a/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp index 3a6315978577c8..d26e45e8fea9ba 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/transpose_fusion.cpp @@ -45,14 +45,6 @@ TransposeSDPAMatcher::TransposeSDPAMatcher() { return std::dynamic_pointer_cast(output.get_node_shared_ptr()) == nullptr && is_fp_type(output); }; - auto is_dynamic = [](const ov::Output& output) -> bool { - bool is_dynamic = output.get_node_shared_ptr()->get_output_partial_shape(0).is_dynamic(); - size_t num_inputs = output.get_node_shared_ptr()->get_input_size(); - for (size_t idx = 0; idx < num_inputs; idx++) { - is_dynamic |= output.get_node_shared_ptr()->get_input_partial_shape(idx).is_dynamic(); - } - return is_dynamic; - }; auto input_q_m = any_input(not_transpose); auto input_k_m = any_input(not_transpose); @@ -70,10 +62,10 @@ TransposeSDPAMatcher::TransposeSDPAMatcher() { auto sdpa_in_k = std::make_shared(OutputVector{input_k_m, transpose_k_m}); auto sdpa_in_v = std::make_shared(OutputVector{input_v_m, transpose_v_m}); - auto sdpa_without_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v }, is_dynamic); - auto sdpa_with_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask }, is_dynamic); + auto sdpa_without_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v }); + auto sdpa_with_attn_mask_m = wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask }); auto sdpa_with_attn_mask_and_scale_m = - wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask, input_scale }, is_dynamic); + 
wrap_type({ sdpa_in_q, sdpa_in_k, sdpa_in_v, input_attn_mask, input_scale }); auto sdpa_m = std::make_shared(OutputVector{sdpa_without_attn_mask_m, sdpa_with_attn_mask_m, sdpa_with_attn_mask_and_scale_m}); diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index 4b18dbf3fa4b71..bb7385cbe5dbb1 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -188,6 +188,10 @@ static bool is_non_supported_decompression_op(const std::shared_ptr func) { return false; } - // - Head size should be static dim - const auto head_size_dim = query_ps[query_ps.size() - 1]; - if (head_size_dim.is_dynamic()) - return false; + // For platforms with DPAS support we don't have any other shape-based limitations besides head_size being static and equal for QKV + if (device_info.supports_immad && cldnn::query_microkernels_supported(m_context->get_engine(), config)) + return true; // - Head size should be 128 for any model type; or should be in the range of 64 to 256 for stateful LLMs because of performance reasons. // This limitations is recommended to prevent performance drop in models with small head size, such as SD, diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp index 47483c6a1a0192..ac8d716882ef82 100644 --- a/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp +++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.cpp @@ -37,9 +37,32 @@ #include #endif +#ifdef ENABLE_ONEDNN_FOR_GPU +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include "gpu/intel/microkernels/fuser.hpp" +#endif + namespace { std::mutex cacheAccessMutex; +#ifdef ENABLE_ONEDNN_FOR_GPU +cl::Program fuse_microkernels(const cl::Context& context, const cl::Device& device, cl::Program& program, const std::string& code) { + using namespace dnnl::impl::gpu::intel; + std::vector> binaries = program.getInfo(); + OPENVINO_ASSERT(binaries.size() == 1); + std::vector binary = binaries[0]; + micro::fuseMicrokernels(binary, code.c_str()); + + cl::Program::Binaries fused_binary = { binary }; + cl::Program fused_program(context, {device}, fused_binary); + fused_program.build({device}); + + return fused_program; +} +#endif // ENABLE_ONEDNN_FOR_GPU + std::string reorder_options(const std::string& org_options) { std::stringstream ss(org_options); std::set sorted_options; @@ -128,7 +151,7 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code, const auto& batch_id = 0; // increase bucket id if and only if new bucket comes bucket_id = static_cast(program_buckets.size() - 1); - current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str)); + current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_headers)); } // This is a temporary walk-around to avoid severe performance drop. 
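Note on the kernels_cache change above: fuse_microkernels() is where the pre-generated microkernel ISA gets spliced into an already-compiled OpenCL program. The sketch below restates that flow outside the cache machinery, with the template arguments that the diff rendering dropped restored as assumptions (the real signatures live in kernels_cache.cpp and the OpenCL C++ wrapper); build_with_microkernels is a hypothetical helper name used only for illustration.

    // Minimal sketch of the binary-level fusion performed by fuse_microkernels().
    // Assumptions: the cl:: C++ wrapper (cl2-style) is available, and
    // fuseMicrokernels(binary, source) patches the microkernel blobs referenced by
    // the generated kernel source into the device binary.
    #include <string>
    #include <vector>
    #include <CL/opencl.hpp>
    #include "gpu/intel/microkernels/fuser.hpp"

    cl::Program build_with_microkernels(const cl::Context& ctx,
                                        const cl::Device& dev,
                                        const std::string& code,
                                        const std::string& options) {
        // 1. Regular OpenCL compilation of the generated source (kernel + shims).
        cl::Program program(ctx, code);
        program.build({dev}, options.c_str());

        // 2. Pull the device binary; a microkernel batch holds exactly one kernel.
        std::vector<std::vector<unsigned char>> binaries =
                program.getInfo<CL_PROGRAM_BINARIES>();
        std::vector<unsigned char> binary = binaries[0];

        // 3. Splice the pre-generated microkernel code into the binary.
        dnnl::impl::gpu::intel::micro::fuseMicrokernels(binary, code.c_str());

        // 4. Rebuild the program from the patched binary and return it.
        cl::Program::Binaries fused_binary = { binary };
        cl::Program fused(ctx, {dev}, fused_binary);
        fused.build({dev});
        return fused;
    }

This is consistent with the kernels_counter == 1 assertion in build_batch: the fuser is handed the source of the single kernel contained in a microkernel batch, which is also why such batches disable batch_compilation.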
@@ -159,13 +182,22 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code, || current_bucket.back().entry_point_to_id.find(entry_point) != current_bucket.back().entry_point_to_id.end() || need_separate_batch(entry_point)) { const auto& batch_id = static_cast(current_bucket.size()); - current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_header_str)); + current_bucket.push_back(batch_program(bucket_id, batch_id, options, batch_headers)); } auto& current_batch = current_bucket.back(); current_batch.dump_custom_program = dump_custom_program; current_batch.entry_point_to_id.emplace(entry_point, std::make_pair(code.params, kernel_part_idx)); + current_batch.has_microkernels |= kernel_string->has_microkernels; + + // TODO: Technically, microkernels doesn't require specific headers, but we don't want to include + // some headers to all batches as it may lead to compilation error on some driver versions. + // Need to generalize work with headers to include only necessary parts + if (current_batch.has_microkernels) { + current_batch.source.insert(current_batch.source.begin(), current_batch.micro_headers.begin(), current_batch.micro_headers.end()); + } + current_batch.source.push_back(std::move(full_code)); current_batch.kernels_counter++; } @@ -195,12 +227,12 @@ kernels_cache::kernels_cache(engine& engine, const ExecutionConfig& config, uint32_t prog_id, std::shared_ptr task_executor, - const std::vector& batch_header_str) + const std::map& batch_headers) : _engine(engine) , _task_executor(task_executor) , _config(config) , _prog_id(prog_id) - , batch_header_str(std::move(batch_header_str)) { } + , batch_headers(std::move(batch_headers)) { } static std::vector getProgramBinaries(cl::Program program) { // Get the size of the program binary in bytes. @@ -288,6 +320,17 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program& dump_file << "*/\n"; } + if (batch.has_microkernels) { +#ifdef ENABLE_ONEDNN_FOR_GPU + OPENVINO_ASSERT(batch.kernels_counter == 1); + // Do we need full source code here (with batch headers)? + program = fuse_microkernels(cl_build_engine.get_cl_context(), cl_build_engine.get_cl_device(), program, batch.source.back()); +#else // ENABLE_ONEDNN_FOR_GPU + OPENVINO_THROW("[GPU] Can't compile kernel w/ microkernels as onednn is not available"); +#endif // ENABLE_ONEDNN_FOR_GPU + } + + program.createKernels(&kernels); if (is_cache_enabled()) { @@ -348,6 +391,7 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program& GPU_DEBUG_INFO << "-------- End of OpenCL build error" << std::endl; std::stringstream err_ss(err_log); std::string line; + std::stringstream err; int cnt = 0; while (std::getline(err_ss, line, '\n')) { @@ -355,14 +399,14 @@ void kernels_cache::build_batch(const engine& build_engine, const batch_program& cnt = 5; cnt--; if (cnt > 0) - std::cout << line << std::endl; + err << line << std::endl; else if (cnt == 0) - std::cout << "...." << std::endl; + err << "...." 
<< std::endl; } throw std::runtime_error("Program build failed(" + std::to_string(batch.bucket_id) + + "_part_" + std::to_string(batch.batch_id) - + "): You may enable OCL source dump to see the error log.\n"); + + "):\n" + err.str()); } } diff --git a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp index ca389718a49d23..d3775731bb01a5 100644 --- a/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp +++ b/src/plugins/intel_gpu/src/runtime/kernels_cache.hpp @@ -54,19 +54,34 @@ class kernels_cache { size_t hash_value; uint32_t kernels_counter; source_code source; + source_code micro_headers; std::string options; bool dump_custom_program; + bool has_microkernels; std::map> entry_point_to_id; - explicit batch_program(int32_t _bucket_id, int32_t _batch_id, std::string _options, const std::vector& batch_header_str) + explicit batch_program(int32_t _bucket_id, int32_t _batch_id, std::string _options, const std::map& batch_headers) : bucket_id(_bucket_id), batch_id(_batch_id), hash_value(0), kernels_counter(0), - source(std::move(batch_header_str)), + source({}), options(_options), dump_custom_program(false), + has_microkernels(false), entry_point_to_id({}) { + static const std::vector micro_kernel_include_names { + "generic_vector_ops", + "tile_ops", + "sdpa_utils" + }; + for (const auto& kv : batch_headers) { + if (std::find(micro_kernel_include_names.begin(), micro_kernel_include_names.end(), kv.first) == micro_kernel_include_names.end()) { + source.push_back(kv.second); + } else { + micro_headers.push_back(kv.second); + } + } } }; @@ -83,7 +98,7 @@ class kernels_cache { compiled_kernels _kernels; std::map, uint32_t> _cached_binaries; std::unordered_map _cached_kernels; - std::vector batch_header_str; + std::map batch_headers; std::unordered_map _kernel_batch_hash; void get_program_source(const kernels_code& kernels_source_code, std::vector*) const; void build_batch(const engine& build_engine, const batch_program& batch, compiled_kernels& compiled_kernels); @@ -97,12 +112,9 @@ class kernels_cache { const ExecutionConfig& config, uint32_t prog_id, std::shared_ptr task_executor = nullptr, - const std::vector& batch_header_str = {}); + const std::map& batch_headers = {}); kernel::ptr get_kernel_from_cached_kernels(std::string id) const; std::vector get_kernels(kernel_impl_params params) const; - void set_batch_header_str(const std::vector &batch_headers) { - batch_header_str = std::move(batch_headers); - } bool validate_simple_kernel_execution(kernel::ptr kernel); diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp index 15e379d5ad557b..74d9b033c64ca2 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_device.cpp @@ -2,6 +2,13 @@ // SPDX-License-Identifier: Apache-2.0 // +#ifdef ENABLE_ONEDNN_FOR_GPU +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include "gpu/intel/jit/jit_generator.hpp" +#endif // ENABLE_ONEDNN_FOR_GPU + #include "ocl_device.hpp" #include "ocl_common.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" @@ -43,6 +50,24 @@ namespace cldnn { namespace ocl { namespace { + +#ifdef ENABLE_ONEDNN_FOR_GPU +gpu_arch convert_ngen_arch(ngen::HW gpu_arch) { + switch (gpu_arch) { + case ngen::HW::Gen9: return gpu_arch::gen9; + case ngen::HW::Gen11: return gpu_arch::gen11; + case ngen::HW::XeLP: return gpu_arch::xe_lp; + case ngen::HW::XeHP: return gpu_arch::xe_hp; + case ngen::HW::XeHPG: return 
gpu_arch::xe_hpg; + case ngen::HW::XeHPC: return gpu_arch::xe_hpc; + case ngen::HW::Xe2: return gpu_arch::xe2; + case ngen::HW::Gen10: + case ngen::HW::Unknown: return gpu_arch::unknown; + } + return gpu_arch::unknown; +} +#endif + int driver_dev_id() { const std::vector unused_ids = { 0x4905, 0x4906, 0x4907, 0x4908 @@ -172,7 +197,7 @@ bool get_imad_support(const cl::Device& device) { return false; } -device_info init_device_info(const cl::Device& device) { +device_info init_device_info(const cl::Device& device, const cl::Context& context) { device_info info = {}; info.vendor_id = static_cast(device.getInfo()); info.dev_name = device.getInfo(); @@ -250,7 +275,8 @@ device_info init_device_info(const cl::Device& device) { bool nv_device_attr_supported = extensions.find("cl_nv_device_attribute_query") != std::string::npos; info.has_separate_cache = false; if (device_attr_supported) { - info.gfx_ver = parse_version(device.getInfo()); + info.ip_version = device.getInfo(); + info.gfx_ver = parse_version(info.ip_version); info.device_id = device.getInfo(); info.num_slices = device.getInfo(); info.num_sub_slices_per_slice = device.getInfo(); @@ -296,6 +322,17 @@ device_info init_device_info(const cl::Device& device) { info.num_ccs = std::max(num_queues, info.num_ccs); } + +#ifdef ENABLE_ONEDNN_FOR_GPU + using namespace dnnl::impl::gpu::intel::jit; + ngen::HW hw = ngen::HW::Unknown; + ngen::Product product = {ngen::ProductFamily::Unknown, 0}; + jit_generator::detectHWInfo(context.get(), device.get(), hw, product); + info.arch = convert_ngen_arch(hw); +#else // ENABLE_ONEDNN_FOR_GPU + info.arch = gpu_arch::unknown; +#endif // ENABLE_ONEDNN_FOR_GPU + return info; } @@ -331,7 +368,7 @@ ocl_device::ocl_device(const cl::Device dev, const cl::Context& ctx, const cl::P : _context(ctx) , _device(dev) , _platform(platform) -, _info(init_device_info(dev)) +, _info(init_device_info(dev, ctx)) , _mem_caps(init_memory_caps(dev, _info)) { } bool ocl_device::is_same(const device::ptr other) { diff --git a/src/plugins/intel_gpu/src/runtime/ocl/ocl_wrapper.hpp b/src/plugins/intel_gpu/src/runtime/ocl/ocl_wrapper.hpp index 57da8d5c748672..91a4d767ceaebe 100644 --- a/src/plugins/intel_gpu/src/runtime/ocl/ocl_wrapper.hpp +++ b/src/plugins/intel_gpu/src/runtime/ocl/ocl_wrapper.hpp @@ -50,6 +50,7 @@ #pragma clang diagnostic ignored "-Wunused-variable" #pragma clang diagnostic ignored "-Wunused-function" #pragma clang diagnostic ignored "-Wignored-qualifiers" + #pragma clang diagnostic ignored "-Wdeprecated-declarations" #elif defined __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wsign-compare" @@ -57,6 +58,7 @@ #pragma GCC diagnostic ignored "-Wunused-variable" #pragma GCC diagnostic ignored "-Wunused-function" #pragma GCC diagnostic ignored "-Wignored-qualifiers" + #pragma GCC diagnostic ignored "-Wdeprecated-declarations" #if __GNUC__ >= 8 #pragma GCC diagnostic ignored "-Wcatch-value" #endif diff --git a/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp b/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp index 3ea2a654bb5416..8dd82c7f6f1bd8 100644 --- a/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp +++ b/src/plugins/intel_gpu/tests/common/subgraphs_builders.hpp @@ -14,6 +14,7 @@ #include "openvino/op/gather.hpp" #include "openvino/op/read_value.hpp" #include "openvino/op/reshape.hpp" +#include "openvino/op/scaled_dot_product_attention.hpp" #include "openvino/op/shape_of.hpp" #include "openvino/op/transpose.hpp" #include "openvino/op/result.hpp" @@ -119,7 +120,6 @@ 
inline std::shared_ptr make_llm_kv_cache_pattern(ov::Dimension batch ov::ResultVector results{kv_present, matmul_out}; auto model = std::make_shared(results, params, "LLM-KV-Cache"); - ov::pass::VisualizeTree("model.svg").run_on_model(model); if (stateful) { ov::pass::MakeStateful({{in_kv_prev, kv_present}}).run_on_model(model); } diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index 8e835c371a6123..3f0a4d60a9dedf 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -12,7 +12,7 @@ if(ENABLE_ONEDNN_FOR_GPU) set(ONEDNN_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_build") set(ONEDNN_INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_install" CACHE PATH "Installation path for oneDNN GPU library") set(ONEDNN_PREFIX_DIR "${CMAKE_CURRENT_BINARY_DIR}/onednn_gpu_root") - set(ONEDNN_ENABLED_PRIMITIVES "CONCAT;CONVOLUTION;DECONVOLUTION;INNER_PRODUCT;MATMUL;REORDER;POOLING;REDUCTION") + set(ONEDNN_ENABLED_PRIMITIVES "CONCAT;CONVOLUTION;DECONVOLUTION;INNER_PRODUCT;MATMUL;REORDER;POOLING;REDUCTION;SDPA") set(ONEDNN_ENABLED_ISA "XEHPG;XEHPC;XE2") set(DNNL_GPU_LIBRARY_NAME "openvino_onednn_gpu" CACHE STRING "Name of oneDNN library for Intel GPU Plugin") @@ -141,11 +141,24 @@ if(ENABLE_ONEDNN_FOR_GPU) EXCLUDE_FROM_ALL ON ) + list(APPEND LIB_INCLUDE_DIRS ${ONEDNN_INSTALL_DIR}/include) + list(APPEND LIB_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/onednn_gpu/src) + set(LIB_DEFINITIONS ENABLE_ONEDNN_FOR_GPU + DNNL_DLL + DNNL_DLL_EXPORTS + DNNL_ENABLE_CPU_ISA_HINTS + DNNL_ENABLE_MAX_CPU_ISA + DNNL_X64=1 + NGEN_CPP11 + NGEN_NEO_INTERFACE + NGEN_NO_OP_NAMES + NGEN_SAFE + NGEN_WINDOWS_COMPAT) add_library(onednn_gpu_tgt INTERFACE) set_target_properties(onednn_gpu_tgt PROPERTIES INTERFACE_LINK_LIBRARIES $ - INTERFACE_INCLUDE_DIRECTORIES $ - INTERFACE_COMPILE_DEFINITIONS ENABLE_ONEDNN_FOR_GPU + INTERFACE_INCLUDE_DIRECTORIES "$" + INTERFACE_COMPILE_DEFINITIONS "${LIB_DEFINITIONS}" ) add_dependencies(onednn_gpu_tgt onednn_gpu_build)
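To tie the new CMake include directories and NGEN/DNNL definitions back to their consumer: they exist so that micro_utils.hpp can include oneDNN's internal headers directly from the plugin. The sketch below shows how a kernel implementation drives those wrappers to request a K*Q-style GEMM microkernel, following the flow of init_microkernels() above; it assumes an ENABLE_ONEDNN_FOR_GPU build, and the unroll values and the helper name request_kq_microkernel are illustrative placeholders, not the tuned sdpa_config_t entries.

    // Illustrative use of the micro_utils.hpp wrappers; tile sizes are placeholders.
    #include <cstdint>
    #include <vector>
    #include "micro_utils.hpp"

    micro::Package request_kq_microkernel(uint32_t eu_count, uint32_t ip_version, bool has_systolic,
                                          int64_t head_size, int64_t n_keys, int64_t n_queries, int64_t batch) {
        micro::HWInformation hw_info;
        hw_info.euCount = eu_count;
        hw_info.gmdid = ip_version;               // GMD id, i.e. the ip_version reported by the runtime
        hw_info.systolicAvailable = has_systolic; // mirrors engineInfo.supports_immad

        micro::GEMMProblem problem;
        problem.Ta = problem.Ta_ext = micro::Type::f16;  // K
        problem.Tb = problem.Tb_ext = micro::Type::f16;  // Q
        problem.Tc = problem.Tc_ext = micro::Type::f32;  // f32 accumulation
        problem.Ts = problem.Tc;
        problem.A.layout = micro::MatrixLayout::T;       // K is consumed transposed
        problem.B.layout = micro::MatrixLayout::Pr;      // Q is packed (VNNI) in SLM
        problem.C.layout = micro::MatrixLayout::T;

        micro::SizeParams sizes;
        sizes.m = n_keys;
        sizes.n = n_queries;
        sizes.k = head_size;
        sizes.batch = batch;

        std::vector<micro::StrategyRequirement> reqs;
        reqs.push_back(micro::StrategyRequirement::UnrollM == 32);  // placeholder, normally from sdpa_config_t
        reqs.push_back(micro::StrategyRequirement::UnrollN == 16);  // placeholder

        micro::GEMMProtocol::Options opts;
        opts.localB = true;  // B (Q) is read from SLM
        opts.slmPtr = true;

        return micro::select_gemm_microkernel(opts, hw_info, sizes, problem, reqs);
    }

The returned Package is what get_kernel_data() wraps into MicroKernelPackage instances on clKernelData, so the runtime can later hand the kernel plus its microkernels to the fusion step shown earlier.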