Skip to content

Commit

Permalink
[GPU] Micro sdpa (openvinotoolkit#24656)
Browse files Browse the repository at this point in the history
### Details:
- Added SDPA impl based on microkernels using internal onednn API and
related infra
 - Current limitations:
- a fused transpose must not change the order of the innermost dimension (head size).
   - is_causal = true is not supported
   - fp16 only
   - num heads dimension must be static
   - no indirect kv support
- Initial version of KV Cache + SDPA func test
- Enabled Transpose+SDPA fusion for static shape too

### Tickets:
 - CVS-141761
  • Loading branch information
vladimir-paramuzov authored Jul 1, 2024
1 parent a3d2b6a commit 2918322
Show file tree
Hide file tree
Showing 35 changed files with 2,216 additions and 79 deletions.
6 changes: 4 additions & 2 deletions src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ class Plugin : public ov::IPlugin {

std::map<std::string, std::shared_ptr<RemoteContextImpl>> get_default_contexts() const;

std::shared_ptr<ov::Model> clone_and_transform_model(const std::shared_ptr<const ov::Model>& network, const ExecutionConfig& config) const;
void transform_model(std::shared_ptr<ov::Model>& model, const ExecutionConfig& config) const;
std::shared_ptr<ov::Model> clone_and_transform_model(const std::shared_ptr<const ov::Model>& network,
const ExecutionConfig& config,
const std::shared_ptr<RemoteContextImpl>& context) const;
void transform_model(std::shared_ptr<ov::Model>& model, const ExecutionConfig& config, const std::shared_ptr<RemoteContextImpl>& context) const;
void register_primitives() const;
std::string get_device_id_from_config(const ov::AnyMap& config) const;
std::string get_device_id(const ov::AnyMap& config) const;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <memory>

#include "intel_gpu/plugin/remote_context.hpp"
#include "openvino/core/model.hpp"

#include "intel_gpu/runtime/execution_config.hpp"
Expand All @@ -16,12 +17,13 @@ namespace intel_gpu {

// Applies the GPU plugin's ngraph/ov transformation pipeline to a model.
// Holds the execution config by reference and the remote context by shared
// ownership; the device info is copied out of the context's engine.
class TransformationsPipeline {
public:
    // Legacy constructor taking the device info directly (no remote context).
    // NOTE(review): `conf` is stored by reference — the ExecutionConfig must
    // outlive this pipeline object.
    explicit TransformationsPipeline(const ExecutionConfig &conf, const cldnn::device_info &device_info)
        : config(conf), device_info(device_info) {}
    // Preferred constructor: derives the device info from the remote context's
    // engine and keeps the context alive for transformations that need it.
    explicit TransformationsPipeline(const ExecutionConfig &conf, const std::shared_ptr<RemoteContextImpl>& context)
        : config(conf), m_context(context), device_info(context->get_engine().get_device_info()) {}
    // Runs the transformation passes on `func` in place.
    void apply(std::shared_ptr<ov::Model> func);

private:
    const ExecutionConfig& config;                  // non-owning; must outlive the pipeline
    std::shared_ptr<RemoteContextImpl> m_context;   // may be null when the legacy ctor is used
    cldnn::device_info device_info;                 // device capabilities snapshot
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ enum class LogLevel : int8_t {
#else
#define SEPARATE '/'
#endif
#define __FILENAME__ (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__)
#define GPU_FILENAME (strrchr(__FILE__, SEPARATE) ? strrchr(__FILE__, SEPARATE) + 1 : __FILE__)
#define GPU_DEBUG_IF(cond) if (cond)
#define GPU_DEBUG_CODE(...) __VA_ARGS__
#define GPU_DEBUG_DEFINE_MEM_LOGGER(stage) \
Expand All @@ -62,9 +62,9 @@ enum class LogLevel : int8_t {
#define GPU_DEBUG_LOG_RAW_INT(min_verbose_level) if (cldnn::debug_configuration::get_instance()->verbose >= min_verbose_level) \
((cldnn::debug_configuration::get_instance()->verbose_color == 0) ? GPU_DEBUG_LOG_PREFIX : GPU_DEBUG_LOG_COLOR_PREFIX)
#define GPU_DEBUG_LOG_RAW(min_verbose_level) GPU_DEBUG_LOG_RAW_INT(static_cast<std::underlying_type<ov::intel_gpu::LogLevel>::type>(min_verbose_level))
#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << __FILENAME__ << ":" <<__LINE__ << ":" << __func__ << ": "
#define GPU_DEBUG_LOG_PREFIX std::cout << cldnn::debug_configuration::prefix << GPU_FILENAME << ":" <<__LINE__ << ":" << __func__ << ": "
#define GPU_DEBUG_LOG_COLOR_PREFIX std::cout << DARK_GRAY << cldnn::debug_configuration::prefix << \
BLUE << __FILENAME__ << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET
BLUE << GPU_FILENAME << ":" << PURPLE << __LINE__ << ":" << CYAN << __func__ << ": " << RESET
#define DARK_GRAY "\033[1;30m"
#define BLUE "\033[1;34m"
#define PURPLE "\033[1;35m"
Expand Down
14 changes: 13 additions & 1 deletion src/plugins/intel_gpu/include/intel_gpu/runtime/device_info.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
#include <string>
#include <vector>
#include <tuple>
#include <array>

namespace cldnn {
/// @addtogroup cpp_api C++ API
Expand All @@ -25,6 +24,17 @@ enum class device_type {
discrete_gpu = 1
};

/// @brief Intel GPU architecture generations, ordered from oldest to newest.
/// NOTE(review): the numeric values are cast directly to
/// kernel_selector::gpu_arch elsewhere in the plugin, so keep both enums in
/// sync and do not renumber existing entries.
enum class gpu_arch {
    unknown = 0,   ///< Architecture could not be determined
    gen9 = 1,
    gen11 = 2,
    xe_lp = 3,
    xe_hp = 4,
    xe_hpg = 5,
    xe_hpc = 6,
    xe2 = 7,
};

/// @brief Defines version of GFX IP
struct gfx_version {
uint16_t major;
Expand Down Expand Up @@ -77,6 +87,8 @@ struct device_info {
device_type dev_type; ///< Defines type of current GPU device (integrated or discrete)

gfx_version gfx_ver; ///< Defines GFX IP version
gpu_arch arch; ///< Defines arch human readable name
uint32_t ip_version; ///< Defines raw GFX IP version
uint32_t device_id; ///< ID of current GPU
uint32_t num_slices; ///< Number of slices
uint32_t num_sub_slices_per_slice; ///< Number of subslices in a slice
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,9 @@ struct kernel_string {
std::string options;
std::string entry_point;
bool batch_compilation;
bool has_microkernels;

kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false) {}
kernel_string() : str(""), jit(""), undefs(""), options(""), entry_point(""), batch_compilation(false), has_microkernels(false) {}

std::string get_str() const { return str + jit + undefs + options + entry_point; }
size_t get_hash() const { return std::hash<std::string>()(get_str()); }
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "intel_gpu/graph/program.hpp"

#include "kernel_selector_helper.h"
#include "intel_gpu/runtime/device_info.hpp"
#include "kernel_selector_params.h"
#include "to_string_utils.h"
#include "program_node.h"
Expand Down Expand Up @@ -32,7 +33,6 @@
#include "intel_gpu/primitives/extract_image_patches.hpp"

#include "activation_inst.h"
#include "depth_to_space_inst.h"
#include "eltwise_inst.h"
#include "quantize_inst.h"
#include "reorder_inst.h"
Expand All @@ -44,9 +44,9 @@
#include "kernel_selector/kernels/reorder/reorder_kernel_base.h"

#include "runtime/kernels_cache.hpp"
#include "kernel_base.h"

#include <string>
#include <type_traits>
#include <vector>

namespace {
Expand Down Expand Up @@ -119,6 +119,48 @@ bool query_local_block_io_supported(engine& e, const ExecutionConfig& config) {

namespace cldnn {

// Checks whether microkernel-based implementations can be used with the given
// engine/config pair by JIT-compiling a tiny probe kernel that exercises the
// required inline-vISA features. A successful build means "supported".
// The (expensive) probe result is cached per physical device; the whole query
// is serialized by a mutex, which also guards the cache.
bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config) {
    auto device = e.get_device().get();

    static std::mutex m;
    std::lock_guard<std::mutex> lock(m);
    static std::map<cldnn::device*, bool> cache;
    // Single lookup instead of find() + at().
    auto cached = cache.find(device);
    if (cached != cache.end()) {
        return cached->second;
    }

    auto kernel_string = std::make_shared<kernel_selector::KernelString>();
    // This program checks that all required vISA features are supported by the current IGC version.
    const char* kernel_code = R""""(
        kernel void igc_check() {
            __asm__ volatile(
                ".decl AA0 v_type=G type=ud num_elts=1\n"
                ".decl AA1 v_type=G type=ud num_elts=1\n"
                ".implicit_PSEUDO_INPUT AA0 offset=256 size=4\n"
                ".implicit_PSEUDO_INPUT AA1 offset=256 size=4\n"
                "mov (M1_NM,1) AA0(0,0)<1> AA1(0,0)<0;1,0>\n"
            );
        }
    )"""";

    kernel_string->str = kernel_code;
    kernel_string->options = "";
    kernel_string->entry_point = "igc_check";
    kernel_string->batch_compilation = true;

    bool supported = false;
    try {
        cldnn::kernel_impl_params dummy_params;
        // Build through a throw-away kernels_cache; any compilation failure
        // means the required vISA features are unavailable on this stack.
        auto kernels_cache_query = std::make_unique<cldnn::kernels_cache>(e, config, 0);
        kernels_cache_query->add_kernels_source(dummy_params, {kernel_string}, false);
        kernels_cache_query->build_all();
        supported = true;
    } catch (const std::exception&) {
        supported = false;
    }
    cache[device] = supported;
    return supported;
}

kernel_selector::data_type to_data_type(data_types dt) {
switch (dt) {
case cldnn::data_types::i4:
Expand Down Expand Up @@ -1081,6 +1123,7 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.bOptHintsSupport = false;

params.engineInfo.bLocalBlockIOSupport = query_local_block_io_supported(engine, config);
params.engineInfo.supports_microkernels = query_microkernels_supported(engine, config);
params.engineInfo.deviceType = get_device_type(device_info.dev_type);
params.engineInfo.maxWorkGroupSize = device_info.max_work_group_size;
params.engineInfo.maxLocalMemSize = device_info.max_local_mem_size;
Expand All @@ -1092,6 +1135,8 @@ void set_params(const kernel_impl_params& param_info, kernel_selector::params& p
params.engineInfo.driverVersion = device_info.driver_version;
params.engineInfo.supportedSimdSizes = device_info.supported_simd_sizes;
params.engineInfo.vendor_id = device_info.vendor_id;
params.engineInfo.ip_version = device_info.ip_version;
params.engineInfo.arch = kernel_selector::gpu_arch(static_cast<std::underlying_type<gpu_arch>::type>(device_info.arch));

auto impl_forcing = config.get_property(ov::intel_gpu::force_implementations);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -294,4 +294,6 @@ inline void update_shapes(kernel_selector::Params& p, const kernel_impl_params&
}
}

bool query_microkernels_supported(cldnn::engine& e, const cldnn::ExecutionConfig& config);

} // namespace cldnn
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive<scaled_dot_prod
// buffers number and its' sizes (since update_dispatch_data is called for both kernels too), and
// do not double memory allocations during reallocate_if_needed() function call
std::vector<layout> layouts;
if (_kernels_data.size() > 0) {
if (_kernels_data.size() > 0 && !_kernels_data[0].internalBufferSizes.empty()) {
auto dtype = from_data_type(_kernels_data[0].internalBufferDataType);
const auto bpp = data_type_traits::size_of(dtype);
for (auto size : _kernels_data[0].internalBufferSizes) {
Expand Down
2 changes: 1 addition & 1 deletion src/plugins/intel_gpu/src/graph/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ void program::init_program() {
if (_task_executor == nullptr)
_task_executor = program::make_task_executor(_config);
_kernels_cache = std::unique_ptr<kernels_cache>(new kernels_cache(_engine, _config, prog_id, _task_executor,
kernel_selector::KernelBase::get_db().get_batch_header_str()));
kernel_selector::KernelBase::get_db().get_batch_headers()));

if (!_compilation_context)
_compilation_context = program::make_compilation_context(_config);
Expand Down
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/kernel_selector/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@ target_include_directories(${TARGET_NAME} PUBLIC $<BUILD_INTERFACE:${INCLUDE_DIR
target_compile_options(${TARGET_NAME} PRIVATE
$<$<CONFIG:Release>:$<IF:$<CXX_COMPILER_ID:MSVC>,/Os,-Os>>)

# Microkernel-based GPU kernels (e.g. micro SDPA) use internal oneDNN API,
# so link the oneDNN GPU target whenever oneDNN support is enabled.
if (ENABLE_ONEDNN_FOR_GPU)
    target_link_libraries(${TARGET_NAME} PRIVATE onednn_gpu_tgt)
endif()

if(COMMAND add_cpplint_target)
add_cpplint_target(${TARGET_NAME}_cpplint FOR_TARGETS ${TARGET_NAME})
endif()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_INTEL_OCL_GENERIC_VECTOR_OPS_H
#define GPU_INTEL_OCL_GENERIC_VECTOR_OPS_H

// 1-element vector aliases: OpenCL C has no built-in half1/uint1/float1, so
// declare them via clang's ext_vector_type attribute, letting scalar-width and
// vector-width code share the same generic helpers below.
typedef half __attribute__((ext_vector_type(1))) half1;
typedef uint __attribute__((ext_vector_type(1))) uint1;
typedef float __attribute__((ext_vector_type(1))) float1;

// vmad: element-wise multiply-add (a * b + c) for float vectors of any
// supported width. The 1-wide overload indexes element 0 and calls the scalar
// mad() builtin — presumably because the builtin has no overload for
// 1-element ext_vector types.
float1 __attribute__((overloadable)) vmad(float1 a, float1 b, float1 c) {
    c[0] = mad(a[0], b[0], c[0]);
    return c;
}
float2 __attribute__((overloadable)) vmad(float2 a, float2 b, float2 c) {
    return mad(a, b, c);
}
float4 __attribute__((overloadable)) vmad(float4 a, float4 b, float4 c) {
    return mad(a, b, c);
}
float8 __attribute__((overloadable)) vmad(float8 a, float8 b, float8 c) {
    return mad(a, b, c);
}
float16 __attribute__((overloadable)) vmad(float16 a, float16 b, float16 c) {
    return mad(a, b, c);
}

// native_vrecip: element-wise reciprocal via the fast native_recip builtin
// (implementation-defined precision per the OpenCL C spec). The 1-wide
// overload forwards element 0 to the scalar builtin.
float1 __attribute__((overloadable)) native_vrecip(float1 x) {
    x[0] = native_recip(x[0]);
    return x;
}
float2 __attribute__((overloadable)) native_vrecip(float2 x) {
    return native_recip(x);
}
float4 __attribute__((overloadable)) native_vrecip(float4 x) {
    return native_recip(x);
}
float8 __attribute__((overloadable)) native_vrecip(float8 x) {
    return native_recip(x);
}
float16 __attribute__((overloadable)) native_vrecip(float16 x) {
    return native_recip(x);
}

// native_vexp2: element-wise base-2 exponential via the fast native_exp2
// builtin (implementation-defined precision per the OpenCL C spec). The
// 1-wide overload forwards element 0 to the scalar builtin.
float1 __attribute__((overloadable)) native_vexp2(float1 x) {
    x[0] = native_exp2(x[0]);
    return x;
}
float2 __attribute__((overloadable)) native_vexp2(float2 x) {
    return native_exp2(x);
}
float4 __attribute__((overloadable)) native_vexp2(float4 x) {
    return native_exp2(x);
}
float8 __attribute__((overloadable)) native_vexp2(float8 x) {
    return native_exp2(x);
}
float16 __attribute__((overloadable)) native_vexp2(float16 x) {
    return native_exp2(x);
}

#endif
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
/*******************************************************************************
* Copyright 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_OCL_SDPA_UTILS_H
#define GPU_OCL_SDPA_UTILS_H

// Linear offset of element (x0, x1, x2, x3) in a 4D blocked layout described
// by per-tensor macros: for each dim i, tag##_B<i> is the block size,
// tag##_SB<i> the stride within a block, and tag##_S<i> the stride between
// blocks. NOTE(review): B/SB/S semantics inferred from the div/mod pattern —
// confirm against the host-side code that injects these defines.
#define _4D_OFF(tag, x0, x1, x2, x3) \
(((x0) % tag##_B0) * tag##_SB0 + ((x0) / tag##_B0) * tag##_S0 \
+ ((x1) % tag##_B1) * tag##_SB1 + ((x1) / tag##_B1) * tag##_S1 \
+ ((x2) % tag##_B2) * tag##_SB2 + ((x2) / tag##_B2) * tag##_S2 \
+ ((x3) % tag##_B3) * tag##_SB3 + ((x3) / tag##_B3) * tag##_S3)

// Per-tensor convenience wrappers over _4D_OFF for the SDPA inputs.
#define QRY_OFF(x0, x1, x2, x3) _4D_OFF(QRY, x0, x1, x2, x3)
#define KEY_OFF(x0, x1, x2, x3) _4D_OFF(KEY, x0, x1, x2, x3)
#define VAL_OFF(x0, x1, x2, x3) _4D_OFF(VAL, x0, x1, x2, x3)
#define MSK_OFF(x0, x1, x2, x3) _4D_OFF(MSK, x0, x1, x2, x3)

// NOTE(review): DST_OFF accepts five indices but only x0/x1 contribute to the
// offset — presumably the destination is addressed as 2D while keeping a
// 5-argument signature for call-site parity; confirm against the kernels
// that use it.
#define DST_OFF(x0, x1, d, h, w) \
(((x0) % DST_B0) * DST_SB0 + ((x0) / DST_B0) * DST_S0 \
+ ((x1) % DST_B1) * DST_SB1 + ((x1) / DST_B1) * DST_S1)

#endif
Loading

0 comments on commit 2918322

Please sign in to comment.