From c72d018b791b35581c851977ba5740bb7a3c096d Mon Sep 17 00:00:00 2001
From: Zhengxing Li <zhengxingli@microsoft.com>
Date: Tue, 14 May 2024 11:42:40 -0700
Subject: [PATCH] More aggressive reassociations

Although DXC applied the upstream change, Reassociate: add global reassociation algorithm (llvm/llvm-project@b8a330c) in this PR (https://github.com/microsoft/DirectXShaderCompiler/pull/6598), it still might overlook some obvious common factors.

One case has been observed is:
  %Float4_0   = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %1, i32 1)
  %Float4_0.w = extractvalue %dx.types.CBufRet.f32 %Float4, 3
  %Float2_0   = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %1, i32 0)
  %Float2_0.y = extractvalue %dx.types.CBufRet.f32 %4, 1
  %Float4_1   = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %1, i32 1)  ---> %Float4_1 is redundant with %Float4_0 since they invokes cbufferLoadLegacy with same parameters
  %Float4_1.w = extractvalue %dx.types.CBufRet.f32 %6, 3                                                     ---> %Float4_1.w is redundant with %Float4_0.w
  %Float2_1   = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle %1, i32 0)  ---> %Float2_1 is redundant with %Float2_0 since they invokes cbufferLoadLegacy with same parameters
  %Float2_1.y = extractvalue %dx.types.CBufRet.f32 %8, 1                                                     ---> %Float2_1.y is redundant with %Float2_0.y
  ....
  %11 = fmul fast float %Float4_0.w, %10
  %12 = fmul fast float %11, %Float2_0.y
  ....
  %14 = fmul fast float %Float4_1.w, %13
  %15 = fmul fast float %14, %Float2_1.y    ---> (%Float4_0.w * %Float2_0.y) equals to (%Float4_1.w * %Float2_1.y) --> they should be reassociated to a common factor

The upstream change can't identify this common factor because DXC doesn't know (%Float4_0.w, %Float4_1.w) and (%Float2_0.y, %Float2_1.y) are redundant when running Reassociate pass. Those redundancies will be eliminated in GVN pass.

For DXC can identify more common factors, this PR will aggressively run Reassociate pass again after GVN pass and then run GVN pass again to remove the redundancies generared in this run of Reassociate pass.

Changing the order of floating point operations causes the precision issue. In case some shaders get unexpected results due to this PR, use "-opt-disable aggressive-reassociation" to disable this PR and roll back.

This is part 3 of the fix for #6593.
---
 lib/Transforms/IPO/PassManagerBuilder.cpp     | 11 +++
 .../fp-common-factor-optimization.hlsl        | 69 +++++++++++++++++++
 2 files changed, 80 insertions(+)
 create mode 100644 tools/clang/test/HLSLFileCheck/passes/llvm/reassociate/fp-common-factor-optimization.hlsl

diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 0495340dc6..118f3c6fcc 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -504,6 +504,17 @@ void PassManagerBuilder::populateModulePassManager(
   }
 
   // HLSL Change Begins.
+  {
+    // Run reassociate pass again after GVN since GVN will expose more
+    // opportunities for reassociation.
+    if (HLSLEnableAggressiveReassociation) {
+      MPM.add(createReassociatePass(true)); // Reassociate expressions
+      if (EnableGVN) {
+        MPM.add(createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
+      }
+    }
+  }
+
   // Use value numbering to figure out if regions are equivalent, and branch to only one.
   MPM.add(createDxilSimpleGVNEliminateRegionPass());
   // HLSL don't allow memcpy and memset.
diff --git a/tools/clang/test/HLSLFileCheck/passes/llvm/reassociate/fp-common-factor-optimization.hlsl b/tools/clang/test/HLSLFileCheck/passes/llvm/reassociate/fp-common-factor-optimization.hlsl
new file mode 100644
index 0000000000..9792144f7f
--- /dev/null
+++ b/tools/clang/test/HLSLFileCheck/passes/llvm/reassociate/fp-common-factor-optimization.hlsl
@@ -0,0 +1,69 @@
+// RUN: %dxc -T cs_6_3 -E cs_main %s -opt-enable aggressive-reassociation | FileCheck %s -check-prefixes=CHECK,COMMON_FACTOR
+// RUN: %dxc -T cs_6_3 -E cs_main %s -opt-disable aggressive-reassociation | FileCheck %s -check-prefixes=CHECK,NO_COMMON_FACTOR
+
+// Make sure DXC recognize the common factor and generate optimized dxils.
+
+// CHECK: [[TMP_CBUF:%.*]] = call %dx.types.Handle @dx.op.createHandle(i32 57, i8 2, i32 0, i32 10, i1 false)
+
+// CHECK: [[ELEM_1:%.*]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle [[TMP_CBUF]], i32 1)
+// CHECK: [[FACTOR_SRC1:%.*]] = extractvalue %dx.types.CBufRet.f32 [[ELEM_1]], 2
+
+// CHECK: [[ELEM_0:%.*]] = call %dx.types.CBufRet.f32 @dx.op.cbufferLoadLegacy.f32(i32 59, %dx.types.Handle [[TMP_CBUF]], i32 0)
+// CHECK: [[FACTOR_SRC0:%.*]] = extractvalue %dx.types.CBufRet.f32 [[ELEM_0]], 0
+
+// COMMON_FACTOR: [[FACTOR:%.*]] = fmul fast float [[FACTOR_SRC0]], [[FACTOR_SRC1]]
+// COMMON_FACTOR: fmul fast float [[FACTOR]],
+// COMMON_FACTOR: fmul fast float [[FACTOR]],
+// COMMON_FACTOR: fmul fast float [[FACTOR]],
+
+// NO_COMMON_FACTOR: [[EXPRESSION_0:%.*]] = fmul fast float [[FACTOR_SRC1]],
+// NO_COMMON_FACTOR:                        fmul fast float [[EXPRESSION_0]], [[FACTOR_SRC0]]
+// NO_COMMON_FACTOR: [[EXPRESSION_1:%.*]] = fmul fast float [[FACTOR_SRC1]],
+// NO_COMMON_FACTOR:                        fmul fast float [[EXPRESSION_1]], [[FACTOR_SRC0]] 
+// NO_COMMON_FACTOR: [[EXPRESSION_2:%.*]] = fmul fast float [[FACTOR_SRC1]],
+// NO_COMMON_FACTOR:                        fmul fast float [[EXPRESSION_2]], [[FACTOR_SRC0]]
+
+
+cbuffer TemporalAAData : register ( b10 )
+{
+    float2 viewportRelativeSize ;
+    float4 outputDimensions ;
+}
+
+Texture2D < float4 > HistoryColor : register ( t2 ) ;
+SamplerState s_Linear : register ( s1 ) ;
+RWTexture1D < float3 > outColorBuffer : register ( u0 ) ;
+
+float4 Test ( in Texture2D < float4 > tex , in SamplerState linearSampler , in float2 uv )
+{
+    float2 samplePos = uv;
+    float2 texPos1 = floor ( samplePos - 0.5f ) + 0.5f ;
+
+    float2 texPos0 = texPos1 - 1 ;
+    float2 texPos3 = texPos1 + 2 ;
+    float2 texPos12 = texPos1 + samplePos;
+
+    // DXC should recognize (outputDimensions . zw * viewportRelativeSize) is a common factor.
+    texPos0 *= outputDimensions . zw * viewportRelativeSize ;
+    texPos3 *= outputDimensions . zw * viewportRelativeSize ;
+    texPos12 *= outputDimensions . zw * viewportRelativeSize ;
+
+    float4 result = 0.0f ;
+    result += tex . SampleLevel ( linearSampler , float2 ( texPos0 . x , texPos0 . y ) , 0.0f );
+    result += tex . SampleLevel ( linearSampler , float2 ( texPos12 . x , texPos0 . y ) , 0.0f );
+    result += tex . SampleLevel ( linearSampler , float2 ( texPos3 . x , texPos0 . y ) , 0.0f );
+    result += tex . SampleLevel ( linearSampler , float2 ( texPos0 . x , texPos12 . y ) , 0.0f );
+    result += tex . SampleLevel ( linearSampler , float2 ( texPos12 . x , texPos12 . y ) , 0.0f );
+    result += tex . SampleLevel ( linearSampler , float2 ( texPos3 . x , texPos12 . y ) , 0.0f );
+    result += tex . SampleLevel ( linearSampler , float2 ( texPos0 . x , texPos3 . y ) , 0.0f );
+    result += tex . SampleLevel ( linearSampler , float2 ( texPos12 . x , texPos3 . y ) , 0.0f );
+    result += tex . SampleLevel ( linearSampler , float2 ( texPos3 . x , texPos3 . y ) , 0.0f );
+    return result ;
+}
+
+[ numthreads ( 8 , 8 , 1 ) ] void cs_main ( uint3 GroupID : SV_GroupID , uint GroupIndex : SV_GroupIndex , uint3 GTID : SV_GroupThreadID , uint3 DispatchThreadID : SV_DispatchThreadID )
+{
+    uint2 pixelCoord = GTID . xy ;
+
+    outColorBuffer [ pixelCoord.x ] = Test ( HistoryColor , s_Linear , outputDimensions . zw ) . rgb;
+}
\ No newline at end of file