microsoft · sumitsays · Oct 8, 2022 · Sep 25, 2022 · Sep 27, 2022 · Sep 27, 2022
diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -62,6 +62,9 @@
 #include "core/optimizer/slice_elimination.h"
 #include "core/optimizer/transpose_optimizer/ort_transpose_optimizer.h"
 #include "core/optimizer/unsqueeze_elimination.h"
+#ifdef USE_DML
+#include "core/providers/dml/DmlExecutionProvider/src/GraphTransformer.h"
+#endif
 #ifdef ENABLE_TRAINING
 #include "orttraining/core/optimizer/bitmask_dropout_replacement.h"
 #include "orttraining/core/optimizer/bias_softmax_dropout_fusion.h"
@@ -283,6 +286,11 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
       // The QDQFinalCleanupTransformer must run AFTER other transformers that fuse Q/DQ nodes. Otherwise, their
       // fusions might be prevented if this one removes a Q/DQ node too early.
       transformers.emplace_back(std::make_unique<QDQFinalCleanupTransformer>(enable_quant_qdq_cleanup));
+#ifdef USE_DML
+      // This transformer applies DML-specific fusions that go beyond what ORT offers by default
+      const InlinedHashSet<std::string_view> dml_ep = {onnxruntime::kDmlExecutionProvider};
+      transformers.emplace_back(std::make_unique<Dml::GraphTransformer>("DmlOperatorFusionTransformer", dml_ep));
+#endif
     } break;
 
     case TransformerLevel::Level3: {

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.cpp
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlGraphFusionTransformer.h
@@ -0,0 +1,43 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+
+
+#include "core/optimizer/graph_transformer.h"
+#include "core/framework/execution_providers.h"
+
+namespace Dml
+{
+	class ExecutionProviderImpl;
+
+	// Graph transformer declared for the DML execution provider. It derives from
+	// onnxruntime::GraphTransformer and does its work in the ApplyImpl override below.
+	class DmlGraphFusionTransformer : public onnxruntime::GraphTransformer
+	{
+	public:
+		// name:     transformer name (presumably forwarded to the GraphTransformer base; the
+		//           definition is not part of this header).
+		// provider: execution provider this transformer is created for; presumably the source
+		//           of m_providerImpl -- confirm against the .cpp.
+		DmlGraphFusionTransformer(
+			const std::string& name,
+			const onnxruntime::IExecutionProvider* provider
+		);
+
+	public:
+		// String constants which, going by their names, are the name prefix and domain of fused nodes.
+		inline const static char* const DML_GRAPH_FUSION_NODE_NAME_PREFIX = "DmlFusedNode_";
+		inline const static char* const DML_GRAPH_FUSION_NODE_DOMAIN = "DmlFusedNodeDomain";
+
+	private:
+		// Transformer entry point; declared final, so it cannot be overridden further.
+		onnxruntime::common::Status ApplyImpl(onnxruntime::Graph& graph,
+											  bool& modified,
+											  int graph_level,
+											  const onnxruntime::logging::Logger& logger) const final;
+	private:
+		// Raw pointer to the DML provider implementation; nullptr unless set elsewhere
+		// (presumably by the constructor).
+		const ExecutionProviderImpl* m_providerImpl = nullptr;
+	};
+}  // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.cpp
@@ -526,14 +526,12 @@ namespace Dml
         std::string partitionKernelPrefix = std::to_string(m_partitionKernelPrefixVal++) + "_";
         uint32_t deviceDataTypeMask = GetSupportedDeviceDataTypeMask();
 
-        return PartitionGraph(
-            graph,
-            *m_internalRegInfoMap,
-            kernel_lookup,
-            deviceDataTypeMask,
-            m_kernelRegistry.get(),
-            partitionKernelPrefix
-        );
+        return LightWeightPartitionGraph(
+                graph,
+                *m_internalRegInfoMap,
+                kernel_lookup,
+                deviceDataTypeMask
+            );
     }
 
     bool IsGpuTensor(const onnxruntime::Tensor& tensor)

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/ExecutionProvider.h
@@ -158,6 +158,16 @@ namespace Dml
         std::shared_ptr<const Windows::AI::MachineLearning::Adapter::InternalRegistrationInfoMap>
         GetInternalRegistrationInfoMap() const;
 
+        void IncreasePartitionKernelPrefixVal() const  // const is possible because the counter member is declared mutable
+        {
+            m_partitionKernelPrefixVal++;  // NOTE(review): plain, unsynchronized increment -- confirm callers never race on it
+        }
+
+        uint64_t GetPartitionKernelPrefixVal() const  // read-only accessor for the partition kernel prefix counter
+        {
+            return m_partitionKernelPrefixVal;  // current value; the counter itself is not modified here
+        }
+
         onnxruntime::common::Status OnSessionInitializationEnd();
 
     private:
@@ -176,7 +186,6 @@ namespace Dml
         std::shared_ptr<onnxruntime::KernelRegistry> m_kernelRegistry;
         std::shared_ptr<const Windows::AI::MachineLearning::Adapter::InternalRegistrationInfoMap> m_internalRegInfoMap;
         mutable uint64_t m_partitionKernelPrefixVal = 0;
-
         bool m_closed = false;
     };
 

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.cpp
@@ -18,23 +18,18 @@ namespace Dml
 
         FusedGraphKernel(
             const onnxruntime::OpKernelInfo& kernelInfo,
-            const std::unordered_map<std::string, GraphNodeProperties> &graphNodePropertyMap,
+            ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator,  // stored in m_compiledExecutionPlanOperator
+            Windows::AI::MachineLearning::Adapter::EdgeShapes& outputShapes,  // bound to the reference member m_outputShapes
+            std::vector<DML_INPUT_GRAPH_EDGE_DESC>& inputEdges,  // forwarded to TranslateAndCompileGraph
+            bool reuseCommandList,  // forwarded to TranslateAndCompileGraph
+            std::vector<uint8_t>& inputsConstant,  // used to initialize m_inputsConstant
             std::unordered_map<std::string, onnx::TensorProto>& transferredInitializerMap,
-            const gsl::span<const std::string> fusedNodeInputArgOriginalNames,
-            const gsl::span<const std::string> fusedNodeOutputArgOriginalNames) : OpKernel(kernelInfo)
+            const gsl::span<const std::string> fusedNodeInputArgOriginalNames) :
+            OpKernel(kernelInfo),
+            m_compiledExecutionPlanOperator(compiledExecutionPlanOperator),
+            m_outputShapes(outputShapes),  // NOTE(review): reference member -- the caller's EdgeShapes must outlive this kernel; confirm
+            m_inputsConstant(inputsConstant)
         {       
-            // Get the graph for the function which was created according to the computational
-            // capacity returned by the execution provider's graph partitioner
-            auto& node = kernelInfo.node();
-            ORT_THROW_HR_IF(E_UNEXPECTED, node.NodeType() != onnxruntime::Node::Type::Fused);
-            auto func = node.GetFunctionBody();
-            const onnxruntime::Graph& graph = func->Body();
-
-            // Get the shapes for outputs of the overall graph.  These should be static, because 
-            // the partitioner checked that each node has static shapes before fusing into a 
-            // graph partition.
-            ORT_THROW_HR_IF(E_UNEXPECTED, !TryGetStaticOutputShapes(node, m_outputShapes));
-
             // Get the execution provider interfaces
             m_executionHandle = kernelInfo.GetExecutionProvider()->GetExecutionHandle();
             if (m_executionHandle)
@@ -48,49 +43,22 @@ namespace Dml
             }
 
             TranslateAndCompileGraph(
-                kernelInfo, 
-                graph, 
+                kernelInfo,
                 fusedNodeInputArgOriginalNames,
-                fusedNodeOutputArgOriginalNames, 
-                graphNodePropertyMap, 
-                transferredInitializerMap);
+                transferredInitializerMap,
+                inputEdges,
+                reuseCommandList);  // the graph description is no longer built here; the caller supplies the edges and the flag
         }
 
         void TranslateAndCompileGraph(
             const onnxruntime::OpKernelInfo& kernelInfo,
-            const onnxruntime::Graph& graph,
             const gsl::span<const std::string> fusedNodeInputArgOriginalNames,
-            const gsl::span<const std::string> fusedNodeOutputArgOriginalNames,
-            const std::unordered_map<std::string, GraphNodeProperties>& graphNodePropertyMap,
-            std::unordered_map<std::string, onnx::TensorProto>& transferredInitializerMap
+            std::unordered_map<std::string, onnx::TensorProto>& transferredInitializerMap,  // passed on to the input-binding setup call in the body
+            std::vector<DML_INPUT_GRAPH_EDGE_DESC>& inputEdges,  // takes the place of the graphDesc argument in that same call
+            bool reuseCommandList  // when true, BuildReusableCommandList() is invoked
         )
         {
-            ComPtr<IDMLDevice> device;
-            ORT_THROW_IF_FAILED(m_provider->GetDmlDevice(device.GetAddressOf()));
-
-            ComPtr<IDMLDevice1> device1;
-            ORT_THROW_IF_FAILED(device.As(&device1));
-
             const uint32_t graphInputCount = kernelInfo.GetInputCount();
-
-            m_inputsConstant.resize(graphInputCount);
-            for (uint32_t i = 0; i < graphInputCount; ++i)
-            {
-              m_inputsConstant[i] = GraphKernelHelper::GetGraphInputConstness(i, kernelInfo, fusedNodeInputArgOriginalNames, transferredInitializerMap);
-            }
-
-            GraphDescBuilder::GraphDesc graphDesc = GraphDescBuilder::BuildGraphDesc(
-                kernelInfo,
-                m_inputsConstant.data(),
-                m_inputsConstant.size(),
-                transferredInitializerMap,
-                graph,
-                fusedNodeInputArgOriginalNames,
-                fusedNodeOutputArgOriginalNames,
-                graphNodePropertyMap,
-                device.Get(),
-                m_executionHandle);
-
             // Populate input bindings for operator initialization
             std::vector<Microsoft::WRL::ComPtr<ID3D12Resource>> initInputResources;  // For lifetime control
             std::vector<DML_BUFFER_BINDING> initInputBindings(graphInputCount);
@@ -102,7 +70,7 @@ namespace Dml
                 m_winmlProvider.Get(),
                 m_inputsConstant,
                 kernelInfo,
-                graphDesc,
+                inputEdges,
                 fusedNodeInputArgOriginalNames,
                 m_inputsUsed,
                 initInputBindings,
@@ -111,42 +79,7 @@ namespace Dml
                 initializeResourceRefs,
                 nullptr,
                 transferredInitializerMap);
-
-            DML_GRAPH_DESC dmlGraphDesc = {};
-            std::vector<DML_OPERATOR_GRAPH_NODE_DESC> dmlOperatorGraphNodes(graphDesc.nodes.size());
-            std::vector<DML_GRAPH_NODE_DESC> dmlGraphNodes(graphDesc.nodes.size());
-
-            std::vector<DML_GRAPH_EDGE_DESC> dmlInputEdges(graphDesc.inputEdges.size());
-            std::vector<DML_GRAPH_EDGE_DESC> dmlOutputEdges(graphDesc.outputEdges.size());
-            std::vector<DML_GRAPH_EDGE_DESC> dmlIntermediateEdges(graphDesc.intermediateEdges.size());
-
-            GraphKernelHelper::ConvertGraphDesc(
-                graphDesc, 
-                dmlGraphDesc, 
-                kernelInfo,
-                dmlOperatorGraphNodes,
-                dmlGraphNodes,
-                dmlInputEdges,
-                dmlOutputEdges,
-                dmlIntermediateEdges);
-
-            DML_EXECUTION_FLAGS executionFlags = DML_EXECUTION_FLAG_NONE;
-            if (graphDesc.reuseCommandList)
-            {
-                executionFlags |= DML_EXECUTION_FLAG_DESCRIPTORS_VOLATILE;
-            }
-
-            // Query DML execution provider to see if metacommands is enabled
-            if (!m_provider->MetacommandsEnabled())
-            {
-                executionFlags |= DML_EXECUTION_FLAG_DISABLE_META_COMMANDS;
-            }
-
-            ORT_THROW_IF_FAILED(device1->CompileGraph(
-                &dmlGraphDesc,
-                executionFlags,
-                IID_PPV_ARGS(&m_compiledExecutionPlanOperator)));
-
+
             // Allocate a persistent resource and initialize the operator
             UINT64 persistentResourceSize = m_compiledExecutionPlanOperator->GetBindingProperties().PersistentResourceSize;
             if (persistentResourceSize > 0)
@@ -175,7 +108,7 @@ namespace Dml
                 [&](ComPtr<ID3D12Resource>& resource){ m_winmlProvider->QueueReference(WRAP_GRAPHICS_UNKNOWN(resource).Get()); }
             );  
 
-            if (graphDesc.reuseCommandList)
+            if (reuseCommandList)
             {
                 BuildReusableCommandList();
             }
@@ -492,7 +425,7 @@ namespace Dml
         const void* m_executionHandle = nullptr;
         ComPtr<IWinmlExecutionProvider> m_winmlProvider;
         ComPtr<Dml::IExecutionProvider> m_provider;
-        Windows::AI::MachineLearning::Adapter::EdgeShapes m_outputShapes;
+        Windows::AI::MachineLearning::Adapter::EdgeShapes& m_outputShapes;
 
         // Re-usable command list, supporting descriptor heap, and DML binding table to update that heap.
         ComPtr<ID3D12GraphicsCommandList> m_graphicsCommandList;
@@ -517,18 +450,24 @@ namespace Dml
     };
 
     onnxruntime::OpKernel* CreateFusedGraphKernel(
-        const onnxruntime::OpKernelInfo& info, 
-        const std::unordered_map<std::string, GraphNodeProperties> &graphNodePropertyMap,
+        const onnxruntime::OpKernelInfo& info,  // this and every argument below are forwarded to the FusedGraphKernel constructor
+        ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator,
+        Windows::AI::MachineLearning::Adapter::EdgeShapes& outputShapes,  // NOTE(review): the kernel keeps a reference to this object -- confirm its lifetime
+        std::vector<DML_INPUT_GRAPH_EDGE_DESC>& inputEdges,
+        bool reuseCommandList,
+        std::vector<uint8_t>& inputsConstant,
         std::unordered_map<std::string, onnx::TensorProto>& transferredInitializerMap,
-        const gsl::span<const std::string> fusedNodeInputArgOriginalNames,
-        const gsl::span<const std::string> fusedNodeOutputArgOriginalNames
+        const gsl::span<const std::string> fusedNodeInputArgOriginalNames
         )
     {
         return new FusedGraphKernel(
             info, 
-            graphNodePropertyMap, 
+            compiledExecutionPlanOperator,
+            outputShapes,
+            inputEdges,
+            reuseCommandList,
+            inputsConstant,
             transferredInitializerMap, 
-            fusedNodeInputArgOriginalNames, 
-            fusedNodeOutputArgOriginalNames);
+            fusedNodeInputArgOriginalNames);  // kernel is allocated with new; presumably the caller takes ownership -- confirm
     }
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/FusedGraphKernel.h
@@ -3,14 +3,18 @@
 
 #include "core/framework/op_kernel.h"
 #include "GraphDescBuilder.h"
+#include "DmlGraphFusionTransformer.h"
 
 namespace Dml
 {
     onnxruntime::OpKernel* CreateFusedGraphKernel(
-        const onnxruntime::OpKernelInfo& info, 
-        const std::unordered_map<std::string, GraphNodeProperties>& graphNodePropertyMap,
+        const onnxruntime::OpKernelInfo& info,
+        ComPtr<IDMLCompiledOperator> compiledExecutionPlanOperator,
+        Windows::AI::MachineLearning::Adapter::EdgeShapes& outputShapes,
+        std::vector<DML_INPUT_GRAPH_EDGE_DESC>& inputEdges,
+        bool reuseCommandList,
+        std::vector<uint8_t>& inputsConstant,
         std::unordered_map<std::string, onnx::TensorProto>& transferredInitializerMap,
-        const gsl::span<const std::string> fusedNodeInputArgOriginalNames,
-        const gsl::span<const std::string> fusedNodeOutputArgOriginalNames
+        const gsl::span<const std::string> fusedNodeInputArgOriginalNames  // element type must stay const std::string to match the definition in FusedGraphKernel.cpp
     );
 } // namespace Dml
diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp
@@ -33,7 +33,6 @@ namespace Dml::GraphDescBuilder
     #pragma warning(pop)
 
     GraphDesc BuildGraphDesc(
-        const onnxruntime::OpKernelInfo& kernelInfo,
         const uint8_t* isConstGpuGraphInput,
         const size_t isConstGpuGraphInputCount,
         std::unordered_map<std::string, onnx::TensorProto>& transferredInitializerMap,

diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.h
@@ -40,7 +40,6 @@ namespace Dml
         };
 
         GraphDesc BuildGraphDesc(
-            const onnxruntime::OpKernelInfo& kernelInfo,
             const uint8_t* isConstGpuGraphInput,
             const size_t isConstGpuGraphInputCount,
             std::unordered_map<std::string, onnx::TensorProto>& transferredInitializerMap,