Skip to content

Commit

Permalink
Update engine hash id generator with model name/model content/metadata (
Browse files Browse the repository at this point in the history
#13015)

**Update engine hash id generator with model name/model
content/metadata**

**Description**: 

* Updated the engine id generator, which now uses the model name/model inputs &
outputs/env metadata (instead of the model path) to generate the hash
* New bridged APIs were introduced in order to enable the id generator in the
TRT EP utility

**Motivation and Context**
- Why is this change required? What problem does it solve? To fix this
[issue](triton-inference-server/server#4587),
which was caused by the id generator using the model path

How to use:
* Call [TRTGenerateMetaDefId(const GraphViewer& graph_viewer, HashValue&
model_hash)](https://github.com/microsoft/onnxruntime/blob/0fcce74a565478b4c83fac5a3230e9786bb53ab3/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc#L715)
to generate hash id for TRT engine cache

How to test:
* On Windows, run:
* .\onnxruntime_test_all.exe
--gtest_filter=TensorrtExecutionProviderTest.TRTMetadefIdGeneratorUsingModelHashing
* .\onnxruntime_test_all.exe
--gtest_filter=TensorrtExecutionProviderTest.TRTSubgraphIdGeneratorUsingModelHashing

**Appendix**
* [Existing engine id generator that uses model
path](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/execution_provider.cc#L112-L182)
  • Loading branch information
yf711 authored Sep 21, 2022
1 parent 39e2068 commit 240aead
Show file tree
Hide file tree
Showing 9 changed files with 561 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -663,6 +663,10 @@ struct ProviderHost {
// Looks up an initialized tensor (initializer) on the wrapped Graph by name;
// returns true and sets 'value' when found.
virtual bool Graph__GetInitializedTensor(const Graph* p, const std::string& tensor_name, const ONNX_NAMESPACE::TensorProto*& value) = 0;

// Bridged Graph accessors added so the TensorRT EP's hash-id generator can walk
// parent graphs and read name/inputs without linking against Graph directly.
virtual const Node* Graph__ParentNode(const Graph* p) const = 0;
virtual const Graph* Graph__ParentGraph(const Graph* p) const = 0;
virtual const std::string& Graph__Name(const Graph* p) const noexcept = 0;
virtual const std::vector<const NodeArg*>& Graph__GetInputsIncludingInitializers(const Graph* p) const noexcept = 0;
// NOTE(review): unlike the four accessors above, this one is not declared
// const/noexcept — confirm whether that asymmetry is intentional.
virtual bool Graph__IsSubgraph(const Graph* p) = 0;

// GraphViewer
virtual void GraphViewer__operator_delete(GraphViewer* p) = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,10 @@ struct Graph final {
// Thin forwarders: each call is routed through the provider-bridge host
// (g_host) to the real onnxruntime::Graph implementation on the other side
// of the shared-library boundary.
bool GetInitializedTensor(const std::string& tensor_name, const ONNX_NAMESPACE::TensorProto*& value) const { return g_host->Graph__GetInitializedTensor(this, tensor_name, value); }

const Node* ParentNode() const { return g_host->Graph__ParentNode(this); }
// Enclosing graph when this graph is a subgraph; presumably nullptr otherwise — TODO confirm against host impl.
const Graph* ParentGraph() const { return g_host->Graph__ParentGraph(this); }
const std::string& Name() const noexcept { return g_host->Graph__Name(this); }
const std::vector<const NodeArg*>& GetInputsIncludingInitializers() const noexcept { return g_host->Graph__GetInputsIncludingInitializers(this); }
bool IsSubgraph() const { return g_host->Graph__IsSubgraph(this); }

PROVIDER_DISALLOW_ALL(Graph)
};
Expand Down
349 changes: 349 additions & 0 deletions onnxruntime/core/providers/tensorrt/murmurhash3.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,349 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "murmurhash3.h"

// Original source: https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.

// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.

/* Modifications Copyright (c) Microsoft. */

#include "core/framework/endian.h"

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio

#if defined(_MSC_VER)

#define FORCE_INLINE __forceinline

#include <stdlib.h>

#define ROTL32(x, y) _rotl(x, y)
#define ROTL64(x, y) _rotl64(x, y)

#define BIG_CONSTANT(x) (x)

// Other compilers

#else // defined(_MSC_VER)

#define FORCE_INLINE inline __attribute__((always_inline))

// Portable 32-bit left-rotate, the non-MSVC stand-in for _rotl.
// Callers pass rotation amounts strictly between 0 and 32.
inline uint32_t rotl32(uint32_t value, int8_t amount) {
  const uint32_t wrapped = value >> (32 - amount);
  return (value << amount) | wrapped;
}

// Portable 64-bit left-rotate, the non-MSVC stand-in for _rotl64.
// Callers pass rotation amounts strictly between 0 and 64.
inline uint64_t rotl64(uint64_t value, int8_t amount) {
  const uint64_t upper = value << amount;
  const uint64_t lower = value >> (64 - amount);
  return upper | lower;
}

#define ROTL32(x, y) rotl32(x, y)
#define ROTL64(x, y) rotl64(x, y)

#define BIG_CONSTANT(x) (x##LLU)

#endif // !defined(_MSC_VER)
#include <cstddef>
//-----------------------------------------------------------------------------
// Block read - on little-endian machines this is a single load,
// while on big-endian or unknown machines the byte accesses should
// still get optimized into the most efficient instruction.
//
// Changes to support big-endian from https://github.com/explosion/murmurhash/pull/27/
// were manually applied to original murmurhash3 source code.
// Reads the i-th 32-bit block from 'p'. Little-endian targets take the
// single-load fast path; other targets assemble the value byte-by-byte so the
// hash output stays identical across endiannesses (per the big-endian patch
// noted above).
FORCE_INLINE uint32_t getblock32(const uint32_t* p, int i) {
  if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) {
    return p[i];
  } else {
    const uint8_t* bytes = (const uint8_t*)&p[i];
    uint32_t block = (uint32_t)bytes[0];
    block |= (uint32_t)bytes[1] << 8;
    block |= (uint32_t)bytes[2] << 16;
    block |= (uint32_t)bytes[3] << 24;
    return block;
  }
}

// Reads the i-th 64-bit block from 'p'. Same endianness handling as
// getblock32: direct load on little-endian, byte assembly elsewhere.
FORCE_INLINE uint64_t getblock64(const uint64_t* p, int i) {
  if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) {
    return p[i];
  } else {
    const uint8_t* bytes = (const uint8_t*)&p[i];
    uint64_t block = (uint64_t)bytes[0];
    block |= (uint64_t)bytes[1] << 8;
    block |= (uint64_t)bytes[2] << 16;
    block |= (uint64_t)bytes[3] << 24;
    block |= (uint64_t)bytes[4] << 32;
    block |= (uint64_t)bytes[5] << 40;
    block |= (uint64_t)bytes[6] << 48;
    block |= (uint64_t)bytes[7] << 56;
    return block;
  }
}

//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche

// 32-bit finalization mix: xor-shift/multiply rounds that force every input
// bit to avalanche across the whole hash word.
FORCE_INLINE constexpr uint32_t fmix32(uint32_t h) {
  h = (h ^ (h >> 16)) * 0x85ebca6bu;
  h = (h ^ (h >> 13)) * 0xc2b2ae35u;
  return h ^ (h >> 16);
}

//----------

// 64-bit finalization mix (same avalanche structure as fmix32). The ULL
// literals are exactly what BIG_CONSTANT expands to on every supported
// compiler.
FORCE_INLINE constexpr uint64_t fmix64(uint64_t k) {
  k = (k ^ (k >> 33)) * 0xff51afd7ed558ccdULL;
  k = (k ^ (k >> 33)) * 0xc4ceb9fe1a85ec53ULL;
  return k ^ (k >> 33);
}

//-----------------------------------------------------------------------------

namespace onnxruntime {
// Computes the 32-bit MurmurHash3 (x86 variant) of 'len' bytes at 'key' using
// 'seed', and writes the result to 'out' (must hold at least 4 bytes).
// The body processes full 4-byte blocks, then the 0-3 leftover tail bytes,
// then runs the finalization avalanche. Must stay bit-exact with the
// reference implementation — do not reorder statements.
void MurmurHash3::x86_32(const void* key, int len,
uint32_t seed, void* out) {
const uint8_t* data = (const uint8_t*)key;
const int nblocks = len / 4;

uint32_t h1 = seed;

constexpr uint32_t c1 = 0xcc9e2d51;
constexpr uint32_t c2 = 0x1b873593;

//----------
// body

// 'blocks' points one-past-the-last full block; the loop below indexes it
// with negative offsets (a reference-implementation idiom).
const uint32_t* blocks = (const uint32_t*)(data + static_cast<ptrdiff_t>(nblocks) * 4);

for (int i = -nblocks; i; i++) {
uint32_t k1 = getblock32(blocks, i);

k1 *= c1;
k1 = ROTL32(k1, 15);
k1 *= c2;

h1 ^= k1;
h1 = ROTL32(h1, 13);
h1 = h1 * 5 + 0xe6546b64;
}

//----------
// tail: mix in the trailing 1-3 bytes; cases deliberately fall through.

const uint8_t* tail = (const uint8_t*)(data + static_cast<ptrdiff_t>(nblocks) * 4);

uint32_t k1 = 0;

switch (len & 3) {
case 3:
k1 ^= tail[2] << 16;
[[fallthrough]];
case 2:
k1 ^= tail[1] << 8;
[[fallthrough]];
case 1:
k1 ^= tail[0];
k1 *= c1;
k1 = ROTL32(k1, 15);
k1 *= c2;
h1 ^= k1;
};

//----------
// finalization: fold in the length, then avalanche.

h1 ^= len;

h1 = fmix32(h1);

*(uint32_t*)out = h1;
}

//-----------------------------------------------------------------------------

// Computes the 128-bit MurmurHash3 (x86 variant) of 'len' bytes at 'key' using
// 'seed', writing four 32-bit words to 'out' (must hold at least 16 bytes).
// Four parallel 32-bit lanes (h1-h4) consume 16-byte blocks, then the 0-15
// leftover tail bytes, then finalization cross-mixes the lanes. Must stay
// bit-exact with the reference implementation — do not reorder statements.
void MurmurHash3::x86_128(const void* key, int len, uint32_t seed, void* out) {
const uint8_t* data = (const uint8_t*)key;
const int nblocks = len / 16;

uint32_t h1 = seed;
uint32_t h2 = seed;
uint32_t h3 = seed;
uint32_t h4 = seed;

constexpr uint32_t c1 = 0x239b961b;
constexpr uint32_t c2 = 0xab0e9789;
constexpr uint32_t c3 = 0x38b34ae5;
constexpr uint32_t c4 = 0xa1e38b93;

//----------
// body

// 'blocks' points one-past-the-last full block; the loop indexes it with
// negative offsets (a reference-implementation idiom).
const uint32_t* blocks = (const uint32_t*)(data + static_cast<ptrdiff_t>(nblocks) * 16);

for (int i = -nblocks; i; i++) {
uint32_t k1 = getblock32(blocks, i * 4 + 0);
uint32_t k2 = getblock32(blocks, i * 4 + 1);
uint32_t k3 = getblock32(blocks, i * 4 + 2);
uint32_t k4 = getblock32(blocks, i * 4 + 3);

k1 *= c1;
k1 = ROTL32(k1, 15);
k1 *= c2;
h1 ^= k1;

h1 = ROTL32(h1, 19);
h1 += h2;
h1 = h1 * 5 + 0x561ccd1b;

k2 *= c2;
k2 = ROTL32(k2, 16);
k2 *= c3;
h2 ^= k2;

h2 = ROTL32(h2, 17);
h2 += h3;
h2 = h2 * 5 + 0x0bcaa747;

k3 *= c3;
k3 = ROTL32(k3, 17);
k3 *= c4;
h3 ^= k3;

h3 = ROTL32(h3, 15);
h3 += h4;
h3 = h3 * 5 + 0x96cd1c35;

k4 *= c4;
k4 = ROTL32(k4, 18);
k4 *= c1;
h4 ^= k4;

h4 = ROTL32(h4, 13);
h4 += h1;
h4 = h4 * 5 + 0x32ac3b17;
}

//----------
// tail: mix in the trailing 1-15 bytes; cases deliberately fall through from
// the highest lane/byte down to lane 1.

const uint8_t* tail = (const uint8_t*)(data + static_cast<ptrdiff_t>(nblocks) * 16);

uint32_t k1 = 0;
uint32_t k2 = 0;
uint32_t k3 = 0;
uint32_t k4 = 0;

switch (len & 15) {
case 15:
k4 ^= tail[14] << 16;
[[fallthrough]];
case 14:
k4 ^= tail[13] << 8;
[[fallthrough]];
case 13:
k4 ^= tail[12] << 0;
k4 *= c4;
k4 = ROTL32(k4, 18);
k4 *= c1;
h4 ^= k4;
[[fallthrough]];
case 12:
k3 ^= tail[11] << 24;
[[fallthrough]];
case 11:
k3 ^= tail[10] << 16;
[[fallthrough]];
case 10:
k3 ^= tail[9] << 8;
[[fallthrough]];
case 9:
k3 ^= tail[8] << 0;
k3 *= c3;
k3 = ROTL32(k3, 17);
k3 *= c4;
h3 ^= k3;
[[fallthrough]];
case 8:
k2 ^= tail[7] << 24;
[[fallthrough]];
case 7:
k2 ^= tail[6] << 16;
[[fallthrough]];
case 6:
k2 ^= tail[5] << 8;
[[fallthrough]];
case 5:
k2 ^= tail[4] << 0;
k2 *= c2;
k2 = ROTL32(k2, 16);
k2 *= c3;
h2 ^= k2;
[[fallthrough]];
case 4:
k1 ^= tail[3] << 24;
[[fallthrough]];
case 3:
k1 ^= tail[2] << 16;
[[fallthrough]];
case 2:
k1 ^= tail[1] << 8;
[[fallthrough]];
case 1:
k1 ^= tail[0] << 0;
k1 *= c1;
k1 = ROTL32(k1, 15);
k1 *= c2;
h1 ^= k1;
};

//----------
// finalization: fold in the length, cross-mix the lanes, avalanche each lane,
// then cross-mix again.

h1 ^= len;
h2 ^= len;
h3 ^= len;
h4 ^= len;

h1 += h2;
h1 += h3;
h1 += h4;
h2 += h1;
h3 += h1;
h4 += h1;

h1 = fmix32(h1);
h2 = fmix32(h2);
h3 = fmix32(h3);
h4 = fmix32(h4);

h1 += h2;
h1 += h3;
h1 += h4;
h2 += h1;
h3 += h1;
h4 += h1;

((uint32_t*)out)[0] = h1;
((uint32_t*)out)[1] = h2;
((uint32_t*)out)[2] = h3;
((uint32_t*)out)[3] = h4;
}

} // namespace onnxruntime
16 changes: 16 additions & 0 deletions onnxruntime/core/providers/tensorrt/murmurhash3.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include <cstdint>

namespace onnxruntime {
// Local copy of the MurmurHash3 non-cryptographic hash (Austin Appleby,
// public domain), x86 variants only. Per the commit description, it is used
// by the TensorRT EP to derive engine/cache hash ids from model name,
// content, and metadata instead of the model path.
struct MurmurHash3 {
// generate 32-bit hash from 'len' bytes at 'key' and write it to 'out'
// (caller supplies at least 4 bytes of storage)
static void x86_32(const void* key, int len, uint32_t seed, void* out);

// generate 128-bit hash from 'len' bytes at 'key' and write it to 'out'
// (caller supplies at least 16 bytes of storage)
static void x86_128(const void* key, int len, uint32_t seed, void* out);
};
}  // namespace onnxruntime
Original file line number Diff line number Diff line change
Expand Up @@ -712,7 +712,7 @@ std::unique_ptr<IndexedSubGraph> TensorrtExecutionProvider::GetSubGraph(SubGraph

// Generate unique kernel name for TRT subgraph
HashValue model_hash = 0;
int id = GenerateMetaDefId(graph, model_hash);
int id = TRTGenerateMetaDefId(graph, model_hash);
std::string subgraph_id = std::to_string(model_hash) + "_" + std::to_string(id);
auto meta_def = IndexedSubGraph_MetaDef::Create();
const std::string graph_type = graph.IsSubgraph() ? "subgraph" : "graph";
Expand Down
Loading

0 comments on commit 240aead

Please sign in to comment.