Implement Core functionality for Lora Adapters #679

Closed
wants to merge 72 commits into from
Changes from 23 commits
Commits
72 commits
28e86c6
Lora begins
yuslepukhin Jun 26, 2024
52fa32e
Rework span
yuslepukhin Jun 27, 2024
fb0670e
Lora begins
yuslepukhin Jun 26, 2024
beb630f
Start public API
yuslepukhin Jun 27, 2024
d006037
Merge branch 'yuslepukhin/implement_lora_adapters' of https://github.…
yuslepukhin Jun 27, 2024
5ad527b
Add param
yuslepukhin Jun 28, 2024
22d65cb
Add C++ API
yuslepukhin Jun 28, 2024
94e2c83
Add LoraManagement unit tests
yuslepukhin Jul 1, 2024
ad00a14
Merge branch 'main' into yuslepukhin/implement_lora_adapters
yuslepukhin Jul 1, 2024
0a712a9
More tests
yuslepukhin Jul 1, 2024
11dc3f2
Add DeactiveAdapters
yuslepukhin Jul 2, 2024
7a98784
Add C and C++ API test
yuslepukhin Jul 2, 2024
db9b5fa
Adjust ExtraInputs.
yuslepukhin Jul 2, 2024
ed29925
Merge branch 'main' into yuslepukhin/implement_lora_adapters
yuslepukhin Jul 3, 2024
dbdc369
Add thread-safety, setup for potential caching
yuslepukhin Jul 3, 2024
fa42f95
Added device copy to LoraAdapter
yuslepukhin Jul 3, 2024
a104b20
Add input addition verification
yuslepukhin Jul 3, 2024
7348901
Introduce utilities
yuslepukhin Jul 4, 2024
9164a96
Merge branch 'yuslepukhin/implement_lora_adapters' of https://github.…
yuslepukhin Jul 5, 2024
be1e6b0
Add automatic span construction from an array
yuslepukhin Jul 5, 2024
754d3e7
Merge branch 'main' into yuslepukhin/implement_lora_adapters
yuslepukhin Jul 5, 2024
52b8aa5
Fix warnings
yuslepukhin Jul 5, 2024
1247461
Code issues
yuslepukhin Jul 5, 2024
40d939a
Flatbuffers begin
yuslepukhin Jul 10, 2024
89c00b3
Make flatbuffers tests run
yuslepukhin Jul 11, 2024
92bc527
Add python save_array_as_lora_parameter
yuslepukhin Jul 11, 2024
4e43b51
Add python test and readback in CXX
yuslepukhin Jul 11, 2024
24dc3c6
Add assert and cleanup
yuslepukhin Jul 12, 2024
297f894
Fix test issues
yuslepukhin Jul 12, 2024
f2b42bc
Merge branch 'main' into yuslepukhin/implement_lora_adapters
yuslepukhin Jul 12, 2024
4513168
Address review comments
yuslepukhin Jul 12, 2024
a100cde
include mutex
yuslepukhin Jul 12, 2024
de5fab0
Build errors
yuslepukhin Jul 15, 2024
ea905fc
Address review comments
yuslepukhin Jul 15, 2024
c21e540
Introduce saving multiple params into the same fb file
yuslepukhin Jul 16, 2024
82a99de
Add conversion utility
yuslepukhin Jul 16, 2024
bc84e1d
Add import for lora helpers
yuslepukhin Jul 16, 2024
2fe6e6c
Merge branch 'main' into yuslepukhin/implement_lora_adapters
yuslepukhin Jul 16, 2024
455ffdb
Merge branch 'yuslepukhin/implement_lora_adapters' into yuslepukhin/l…
yuslepukhin Jul 16, 2024
abf61b4
Add a tool to modify genai config and add adapters section
yuslepukhin Jul 16, 2024
5e58286
Work on config driven load
yuslepukhin Jul 17, 2024
41e3136
Fix up CUDA build
yuslepukhin Jul 18, 2024
eeba0c4
Merge branch 'main' into yuslepukhin/implement_lora_adapters
yuslepukhin Jul 18, 2024
9cdd53f
Merge branch 'yuslepukhin/lora_params_ondisk' into yuslepukhin/implem…
yuslepukhin Jul 18, 2024
7905942
Fix merge
yuslepukhin Jul 18, 2024
d0f1346
Address security warnings
yuslepukhin Jul 18, 2024
07617c0
Fix stray include
yuslepukhin Jul 18, 2024
7dd7274
Address build shortcomings
yuslepukhin Jul 18, 2024
2992140
Clang format
yuslepukhin Jul 18, 2024
c77fb44
Clang format
yuslepukhin Jul 18, 2024
287693f
Remove redundant methods
yuslepukhin Jul 19, 2024
f8de77e
Run test coverage, remove some dead code. Cover base case.
yuslepukhin Jul 19, 2024
79bad70
Add missing checks
yuslepukhin Jul 19, 2024
08059ff
Merge branch 'main' into yuslepukhin/implement_lora_adapters
yuslepukhin Jul 19, 2024
6a6cc3b
Address review comments
yuslepukhin Jul 22, 2024
e0088e3
Address build issues, refresh the test model
yuslepukhin Jul 22, 2024
4252975
Adjust file paths
yuslepukhin Jul 23, 2024
8814b47
Make it work end to end
yuslepukhin Jul 23, 2024
175ea54
Make FlatBuffers linkage public
yuslepukhin Jul 23, 2024
0bc4e3f
Move new test subfolder
yuslepukhin Jul 23, 2024
a77ab66
Add model
yuslepukhin Jul 23, 2024
ee6dcb9
Add fp16 model
yuslepukhin Jul 23, 2024
67bebd2
Adjust src
yuslepukhin Jul 23, 2024
57d4ef1
Adjust for ARM
yuslepukhin Jul 23, 2024
2318427
Create separate model copy and config to run on DML
yuslepukhin Jul 23, 2024
b902b70
Disable DML
yuslepukhin Jul 24, 2024
33a3ec2
Merge branch 'main' into yuslepukhin/implement_lora_adapters
yuslepukhin Jul 24, 2024
5dc65d7
Clang format
yuslepukhin Jul 24, 2024
c765a16
Address python related comments, correct faulty formatting for public…
yuslepukhin Jul 25, 2024
36f569c
Remove redundant linkage and includes
yuslepukhin Jul 25, 2024
3526f91
Rename python interface
yuslepukhin Jul 25, 2024
b936657
Correct function name
yuslepukhin Jul 25, 2024
2 changes: 1 addition & 1 deletion .clang-format
@@ -4,7 +4,7 @@ BasedOnStyle: Google

# Setting ColumnLimit to 0 so developer choices about where to break lines are maintained.
# Developers are responsible for adhering to the 120 character maximum.
ColumnLimit: 0
ColumnLimit: 120
SortIncludes: Never
DerivePointerAlignment: false

12 changes: 8 additions & 4 deletions src/models/captured_graph_pool.cpp
@@ -26,7 +26,8 @@ CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model,
// Multiple generators can reserve graphs in parallel, so we need to make it thread safe
std::unique_lock lock(captured_graph_mutex_);

auto key = std::make_unique<CapturedGraphKey>(params.max_batch_size, params.search.max_length, params.search.num_beams, params.extra_inputs);
auto key = std::make_unique<CapturedGraphKey>(params.max_batch_size, params.search.max_length,
params.search.num_beams, params.extra_inputs);
auto& captured_graphs = captured_graphs_map_[*key];

// If no graphs are available, create a graph with a new ID
@@ -59,7 +60,8 @@ CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model,
new_captured_graph->sb_kv_caches_.reserve(layer_count * 2);

for (int i = 0; i < layer_count * 2; ++i) {
new_captured_graph->sb_kv_caches_.push_back(std::make_unique<StaticBuffer>(allocator_device_, max_beam_batch_size));
new_captured_graph->sb_kv_caches_.push_back(
std::make_unique<StaticBuffer>(allocator_device_, max_beam_batch_size));
}

// Create the static buffer for the position ids, if needed
@@ -74,7 +76,8 @@ CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model,
#if USE_DML
// DML currently needs an additional static buffer for the mask
if (model.device_type_ == DeviceType::DML) {
new_captured_graph->sb_attention_mask_next_ = std::make_unique<StaticBuffer>(allocator_device_, max_beam_batch_size);
new_captured_graph->sb_attention_mask_next_ =
std::make_unique<StaticBuffer>(allocator_device_, max_beam_batch_size);
}
#endif
}
@@ -92,7 +95,8 @@ CapturedGraphInfoPtr CapturedGraphPool::ReserveCapturedGraph(const Model& model,
// Create the extra inputs
for (const auto& extra_input : params.extra_inputs) {
auto first_dim = extra_input.tensor->ort_tensor_->GetTensorTypeAndShapeInfo()->GetShape()[0];
new_captured_graph->sb_extra_inputs_[extra_input.name] = std::make_unique<StaticBuffer>(allocator_device_, first_dim);
new_captured_graph->sb_extra_inputs_[extra_input.name] =
std::make_unique<StaticBuffer>(allocator_device_, first_dim);
}

// Create the input embeddings if needed
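
The code above reserves captured graphs from a pool keyed by the generation parameters (max batch size, max length, beam count, and the extra inputs), with the whole reservation path guarded by a mutex so multiple generators can reserve in parallel; the hunks themselves only rewrap those lines for the new 120-column limit. The following is a minimal standalone sketch of that keying pattern, not the actual CapturedGraphPool implementation; GraphKey, CapturedGraph, and GraphPool (and the string-based extra-input field) are simplified stand-ins introduced for illustration, and returning graphs to the pool is omitted.

#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <tuple>
#include <vector>

// Simplified stand-in for CapturedGraphKey: the extra inputs take part in the lookup key.
struct GraphKey {
  int max_batch_size;
  int max_length;
  int num_beams;
  std::vector<std::string> extra_input_names;

  bool operator<(const GraphKey& other) const {
    return std::tie(max_batch_size, max_length, num_beams, extra_input_names) <
           std::tie(other.max_batch_size, other.max_length, other.num_beams, other.extra_input_names);
  }
};

struct CapturedGraph {
  int id;
};

class GraphPool {
 public:
  // Multiple generators may reserve graphs in parallel, so the map is mutex-guarded.
  std::shared_ptr<CapturedGraph> Reserve(const GraphKey& key) {
    std::unique_lock lock(mutex_);
    auto& graphs = graphs_map_[key];
    if (graphs.empty()) {
      // No cached graph exists for this parameter combination: create one with a new id.
      return std::make_shared<CapturedGraph>(CapturedGraph{next_id_++});
    }
    // Otherwise hand out a previously captured graph for the same key.
    auto graph = graphs.back();
    graphs.pop_back();
    return graph;
  }

 private:
  std::mutex mutex_;
  std::map<GraphKey, std::vector<std::shared_ptr<CapturedGraph>>> graphs_map_;
  int next_id_ = 0;
};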
18 changes: 9 additions & 9 deletions src/models/debugging.cpp
@@ -79,47 +79,47 @@ void DumpValues(std::ostream& stream, ONNXTensorElementDataType type, const void
break;

case Ort::TypeToTensorType<int8_t>::type:
DumpSpan(stream, std::span<const int8_t>{reinterpret_cast<const int8_t*>(p_values_raw), count});
DumpSpan(stream, std::span<const int8_t>(reinterpret_cast<const int8_t*>(p_values_raw), count));
break;

case Ort::TypeToTensorType<uint8_t>::type:
DumpSpan(stream, std::span<const uint8_t>{reinterpret_cast<const uint8_t*>(p_values_raw), count});
DumpSpan(stream, std::span<const uint8_t>(reinterpret_cast<const uint8_t*>(p_values_raw), count));
break;

case Ort::TypeToTensorType<int16_t>::type:
DumpSpan(stream, std::span<const int16_t>{reinterpret_cast<const int16_t*>(p_values_raw), count});
DumpSpan(stream, std::span<const int16_t>(reinterpret_cast<const int16_t*>(p_values_raw), count));
break;

case Ort::TypeToTensorType<uint16_t>::type:
DumpSpan(stream, std::span<const uint16_t>{reinterpret_cast<const uint16_t*>(p_values_raw), count});
DumpSpan(stream, std::span<const uint16_t>(reinterpret_cast<const uint16_t*>(p_values_raw), count));
break;

case Ort::TypeToTensorType<int32_t>::type:
DumpSpan(stream, std::span<const int32_t>{reinterpret_cast<const int32_t*>(p_values_raw), count});
break;

case Ort::TypeToTensorType<uint32_t>::type:
DumpSpan(stream, std::span<const uint32_t>{reinterpret_cast<const uint32_t*>(p_values_raw), count});
DumpSpan(stream, std::span<const uint32_t>(reinterpret_cast<const uint32_t*>(p_values_raw), count));
break;

case Ort::TypeToTensorType<int64_t>::type:
DumpSpan(stream, std::span<const int64_t>{reinterpret_cast<const int64_t*>(p_values_raw), count});
DumpSpan(stream, std::span<const int64_t>(reinterpret_cast<const int64_t*>(p_values_raw), count));
break;

case Ort::TypeToTensorType<uint64_t>::type:
DumpSpan(stream, std::span<const uint64_t>{reinterpret_cast<const uint64_t*>(p_values_raw), count});
break;

case Ort::TypeToTensorType<Ort::Float16_t>::type:
DumpSpan(stream, std::span<const Ort::Float16_t>{reinterpret_cast<const Ort::Float16_t*>(p_values_raw), count});
DumpSpan(stream, std::span<const Ort::Float16_t>(reinterpret_cast<const Ort::Float16_t*>(p_values_raw), count));
break;

case Ort::TypeToTensorType<float>::type:
DumpSpan(stream, std::span<const float>{reinterpret_cast<const float*>(p_values_raw), count});
DumpSpan(stream, std::span<const float>(reinterpret_cast<const float*>(p_values_raw), count));
break;

case Ort::TypeToTensorType<double>::type:
DumpSpan(stream, std::span<const double>{reinterpret_cast<const double*>(p_values_raw), count});
DumpSpan(stream, std::span<const double>(reinterpret_cast<const double*>(p_values_raw), count));
break;

default:
92 changes: 29 additions & 63 deletions src/models/extra_inputs.cpp
@@ -1,86 +1,52 @@
#include "../generators.h"
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "lora_adapter.h"
#include "model.h"
#include "extra_inputs.h"
#include "kernels.h"

namespace Generators {

ExtraInputs::ExtraInputs(const Model& model, State& state)
: model_{model},
state_{state} {
extra_inputs_.reserve(state_.params_->extra_inputs.size());
ExtraInputs::ExtraInputs(const Model& model, State& state) : model_{model}, state_{state} {
auto& lora_management = model_.GetLoraAdapterManagement();
const auto total_inputs = state_.params_->extra_inputs.size() + lora_management.GetParamNum();
extra_input_names_.reserve(total_inputs);
extra_inputs_.reserve(total_inputs);

if (state_.GetCapturedGraphInfo()) {
owned_extra_inputs_.reserve(state_.params_->extra_inputs.size());

for (int i = 0; i < state_.params_->extra_inputs.size(); ++i) {
auto type_and_shape_info = state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorTypeAndShapeInfo();
const auto& input_name = state_.params_->extra_inputs[i].name;

sb_extra_inputs_.emplace(input_name, state_.GetCapturedGraphInfo()->sb_extra_inputs_.at(input_name).get());
owned_extra_inputs_.push_back(sb_extra_inputs_.at(input_name)->CreateTensorOnStaticBuffer(type_and_shape_info->GetShape(), type_and_shape_info->GetElementType()));
extra_inputs_.push_back(owned_extra_inputs_.back().get());
auto* sb_extra = state_.GetCapturedGraphInfo()->sb_extra_inputs_.at(input_name).get();
auto ort_value =
sb_extra->CreateTensorOnStaticBuffer(type_and_shape_info->GetShape(), type_and_shape_info->GetElementType());

// Copy to value created on top of the StaticBuffer
CopyToDevice(model_, *state_.params_->extra_inputs[i].tensor->ort_tensor_, *ort_value);

extra_input_names_.push_back(input_name);
extra_inputs_.push_back(std::move(ort_value));
}
} else {
// We don't use graph capture, so simply use the existing pointers
// We don't use graph capture
for (auto& extra_input : state_.params_->extra_inputs) {
extra_inputs_.push_back(extra_input.tensor->ort_tensor_.get());
extra_input_names_.push_back(extra_input.name);
auto ort_value = DuplicateOrtValue(*extra_input.tensor->ort_tensor_);
extra_inputs_.push_back(std::move(ort_value));
}
}
}

#pragma warning(push)
#pragma warning(disable : 4065) // switch statement contains 'default' but no 'case' labels
#pragma warning(disable : 4189) // local variable is initialized but not referenced
#pragma warning(disable : 4702) // unreachable code
// Add Lora Parameters
lora_management.OutputAdaptersParameters(std::back_inserter(extra_input_names_), std::back_inserter(extra_inputs_));
}

void ExtraInputs::Add() {
// Add extra user inputs
for (int i = 0; i < state_.params_->extra_inputs.size(); ++i) {
state_.input_names_.push_back(state_.params_->extra_inputs[i].name.c_str());
state_.inputs_.push_back(extra_inputs_[i]);
}

// Copy the data from the CPU-backed ORT value to the static buffers
for (int i = 0; i < sb_extra_inputs_.size(); ++i) {
auto type_and_shape_info = extra_inputs_[i]->GetTensorTypeAndShapeInfo();
auto shape = type_and_shape_info->GetShape();
auto element_count = std::accumulate(shape.begin(), shape.end(), 1LL, std::multiplies<int64_t>());
auto copy_size_in_bytes = element_count * SizeOf(type_and_shape_info->GetElementType());

switch (model_.device_type_) {
#if USE_DML
case DeviceType::DML: {
ComPtr<ID3D12Resource> target_resource;
Ort::ThrowOnError(model_.GetOrtDmlApi()->GetD3D12ResourceFromAllocation(model_.allocator_device_, extra_inputs_[i]->GetTensorMutableRawData(), &target_resource));

auto source = std::span(state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorData<const uint8_t>(), copy_size_in_bytes);

model_.GetDmlUploadHeap()->BeginUploadToGpu(
target_resource.Get(),
0,
D3D12_RESOURCE_STATE_UNORDERED_ACCESS,
source);
} break;
#endif

#if USE_CUDA
case DeviceType::CUDA: {
cudaMemcpyAsync(
extra_inputs_[i]->GetTensorMutableRawData(),
state_.params_->extra_inputs[i].tensor->ort_tensor_->GetTensorMutableRawData(),
copy_size_in_bytes,
cudaMemcpyHostToDevice,
model_.cuda_stream_);
} break;
#endif

default:
throw std::runtime_error("Unsupported device for graph capture");
}
// Add extra user inputs to the state
for (size_t i = 0, lim = extra_input_names_.size(); i < lim; ++i) {
state_.input_names_.push_back(extra_input_names_[i].c_str());
state_.inputs_.push_back(extra_inputs_[i].get());
}
}

#pragma warning(pop)

} // namespace Generators
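
For reference, the rewritten constructor keeps two index-aligned vectors, extra_input_names_ and extra_inputs_, and appends the LoRA parameters to both through std::back_inserter, so the later Add() loop can push names and values onto the state in lockstep. The sketch below illustrates only that pairing pattern; Value, AdapterManager, EmitParameters, and the parameter names are hypothetical stand-ins for OrtValue and LoraAdapterManagement::OutputAdaptersParameters, not the real API.

#include <iterator>
#include <memory>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for an owned tensor value (the real code holds OrtValue).
struct Value {
  std::string data;
};

// Hypothetical adapter manager: emits each active parameter as a (name, value) pair
// through two output iterators, mirroring the role of OutputAdaptersParameters above.
struct AdapterManager {
  template <typename NameOut, typename ValueOut>
  void EmitParameters(NameOut names_out, ValueOut values_out) const {
    for (const auto& [name, value] : params_) {
      *names_out++ = name;
      *values_out++ = std::make_shared<Value>(value);
    }
  }
  std::vector<std::pair<std::string, Value>> params_{{"model.layers.0.lora_A", Value{"A"}},
                                                     {"model.layers.0.lora_B", Value{"B"}}};
};

int main() {
  std::vector<std::string> input_names;            // parallel vectors, as in ExtraInputs
  std::vector<std::shared_ptr<Value>> inputs;

  // User-supplied extra inputs would be appended here first...

  // ...then the adapter parameters are appended through back_inserters,
  // keeping both vectors index-aligned for the later Add() loop.
  AdapterManager manager;
  manager.EmitParameters(std::back_inserter(input_names), std::back_inserter(inputs));

  return input_names.size() == inputs.size() ? 0 : 1;
}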
20 changes: 16 additions & 4 deletions src/models/extra_inputs.h
@@ -1,19 +1,31 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "static_buffer.h"
#include "onnxruntime_api.h"

#include <memory>
#include <string>
#include <vector>

namespace Generators {

struct Model;
struct State;

struct ExtraInputs {
ExtraInputs(const Model& model, State& state);
ExtraInputs(const ExtraInputs&) = delete;
ExtraInputs& operator=(const ExtraInputs&) = delete;

void Add();

private:
const Model& model_;
State& state_;
std::vector<OrtValue*> extra_inputs_;
std::vector<std::unique_ptr<OrtValue>> owned_extra_inputs_;
std::unordered_map<std::string, StaticBuffer*> sb_extra_inputs_;
std::vector<std::string> extra_input_names_;
std::vector<std::shared_ptr<OrtValue>> extra_inputs_;
};

} // namespace Generators