From af6c6778687d5dee98ca9963f6fbc544dfb53e00 Mon Sep 17 00:00:00 2001
From: Mingyu Kim
Date: Fri, 9 Feb 2024 17:25:02 +0900
Subject: [PATCH] [GPU] Exclude gemm from async compilation if the primitive's
 dynamic impl is an optimized kernel. (#22721)

### Details:
- Do not create spurious tasks: they leaked memory through the task_key and promise objects.
- Keys were generated before queueing and were never removed when the cache entry was not added to the LRU impl cache.
- This fixes most of the memory leak and reduces the permanent cache size.
- Remaining leak: futures are only appended and never freed.

### Tickets:
- 131417
---
 .../src/graph/compilation_context.cpp        |  2 +-
 .../intel_gpu/src/graph/primitive_inst.cpp   | 18 ++++++++++--------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/src/plugins/intel_gpu/src/graph/compilation_context.cpp b/src/plugins/intel_gpu/src/graph/compilation_context.cpp
index f811b84ec73c4f..f115cee970c777 100644
--- a/src/plugins/intel_gpu/src/graph/compilation_context.cpp
+++ b/src/plugins/intel_gpu/src/graph/compilation_context.cpp
@@ -28,8 +28,8 @@ class CompilationContext : public ICompilationContext {
         futures.emplace_back(promise->get_future());
 
         if (_task_keys.find(key) == _task_keys.end()) {
-            _task_keys.insert(key);
             if (_task_executor != nullptr) {
+                _task_keys.insert(key);
                 _task_executor->run([task, promise] {
                     task();
                     promise->set_value();
diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
index 21e1fdef6816b9..c752396de67a67 100644
--- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
+++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp
@@ -713,7 +713,15 @@ bool primitive_inst::use_async_compilation() {
         }
     }
 
-    return (_node->is_type<convolution>() || compile_fc_impls || _node->is_type<gemm>() ||
+    bool compile_gemm_impls = _node->is_type<gemm>();
+    if (compile_gemm_impls) {
+        // Do not async-compile if opt_gemm is chosen for iGPU
+        // Do async-compile if it is to be executed from onednn
+        compile_gemm_impls = _node->get_selected_impl() && _node->get_selected_impl()->get_kernel_name().find("gemm_ref") != std::string::npos;
+        compile_gemm_impls |= (_node->get_preferred_impl_type() == impl_types::onednn);
+    }
+
+    return (_node->is_type<convolution>() || compile_fc_impls || compile_gemm_impls ||
             (_node->is_type<softmax>() && _node->get_selected_impl() &&
              _node->get_selected_impl()->get_kernel_name().find("softmax_gpu_ref") != std::string::npos));
 }
@@ -830,13 +838,7 @@ bool primitive_inst::update_impl() {
         if (!can_be_optimized()) {
             auto impl = _node->type()->choose_impl(*_node, updated_params_no_dyn_pad);
-            // In the case of gemm, if current dynamic impl is not gemm_ref and newly chosen impl is gemm_ref,
-            // the newly chosen impl is not added to the impl cache for beffer performance.
-            if (_node->is_type<gemm>() &&
-                (_node->get_selected_impl() && _node->get_selected_impl()->get_kernel_name().find("gemm_ref") == std::string::npos) &&
-                impl->get_kernel_name().find("gemm_ref") != std::string::npos) {
-                return;
-            }
+
             if (impl->get_kernels_source().size() > 0) {
                 auto kernels = _program->get_kernels_cache().compile(updated_params_no_dyn_pad, impl->get_kernels_source());
                 impl->set_kernels(kernels);
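
For reviewers, a minimal self-contained sketch of the `push_task` pattern after the fix is shown below. It is illustrative only, not the plugin source: `MiniCompilationContext`, `TaskExecutor`, and the `size_t` key are assumptions standing in for the real `CompilationContext`, its task executor, and the `kernel_impl_params` key. The point it demonstrates is that the de-duplication key is recorded only when a task is actually handed to the executor, so a task that is never queued cannot leave a stale key (and leaked promise) behind.

```cpp
// Illustrative sketch only; names below are assumptions, not the plugin's classes.
#include <functional>
#include <future>
#include <memory>
#include <mutex>
#include <unordered_set>
#include <vector>

using Task = std::function<void()>;

// Stand-in for the plugin's task executor; runs work inline for simplicity.
struct TaskExecutor {
    void run(Task task) { task(); }
};

class MiniCompilationContext {
public:
    explicit MiniCompilationContext(std::shared_ptr<TaskExecutor> executor)
        : _task_executor(std::move(executor)) {}

    void push_task(size_t key, Task task) {
        std::lock_guard<std::mutex> lock(_mutex);
        auto promise = std::make_shared<std::promise<void>>();
        // Futures are appended unconditionally; this mirrors the remaining
        // leak called out in the description above.
        _futures.emplace_back(promise->get_future());

        if (_task_keys.find(key) == _task_keys.end()) {
            if (_task_executor != nullptr) {
                // Record the key only once the task is really queued
                // (the reordering this patch makes), so a skipped task
                // no longer blocks future compilations for the same key.
                _task_keys.insert(key);
                _task_executor->run([task, promise] {
                    task();
                    promise->set_value();
                });
            }
        }
    }

private:
    std::shared_ptr<TaskExecutor> _task_executor;
    std::unordered_set<size_t> _task_keys;
    std::vector<std::future<void>> _futures;
    std::mutex _mutex;
};
```

In this shape, calling `push_task` twice with the same key queues the work once; before the reordering, the key was inserted even when `_task_executor` was null, so the same compilation could never be queued again and the stale key stayed resident.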