
remove redundant cast in a test (#3852)
I was wondering why there is an additional no-op scheduler segment when the
input data is `float32`. It turns out to be due to a redundant cast: the cast
is only needed when the input is `float16`.
This PR removes that cast for fp32.
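
For context, `castOp` always inserts a cast expression, even when the tensor
already has the target dtype; for a `float32` input that trivial cast ends up
as its own segment handled by the no-op scheduler, while `maybeCastOp` skips
the cast when no conversion is needed. A minimal sketch of that behavior,
assuming nvFuser's `TensorView`, `DataType`, and `castOp` as used in the test
below (not the library's actual implementation):

// Sketch only, not nvFuser's implementation of maybeCastOp.
// Assumes nvFuser's TensorView/DataType types and castOp from ops/arith.h.
TensorView* maybeCastSketch(DataType dtype, TensorView* tv) {
  if (tv->getDataType() == dtype) {
    return tv; // already the requested dtype: no cast, no extra no-op segment
  }
  return castOp(dtype, tv); // e.g. a float16 input still needs a real cast
}

With this behavior, `maybeCastOp(DataType::Float, input)` is an identity for
fp32 inputs, so the segmenter no longer emits the extra `NoOp` group that the
old test had to special-case.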

---------

Co-authored-by: Jingyue Wu <wujingyue@gmail.com>
liqiangxl and wujingyue authored Feb 10, 2025
1 parent 39bc83a commit e12fe2c
Showing 1 changed file with 15 additions and 23 deletions.
38 changes: 15 additions & 23 deletions tests/cpp/test_persistent_buffer.cpp
@@ -1472,18 +1472,19 @@ TEST_P(LayerNormSharedMemoryTest, FusionLayerNormSharedMemoryBuffer_CUDA) {
   constexpr int64_t dim0 = 2048;
   std::vector<int64_t> input_shape{dim0, hidden_size};
   std::vector<int64_t> norm_shape{hidden_size};
-  auto input_half = makeContigTensor(2, dtype);
-  auto weight_half = makeContigTensor(1, dtype);
-  auto bias_half = makeContigTensor(1, dtype);
-  fusion.addInput(input_half);
-  fusion.addInput(weight_half);
-  fusion.addInput(bias_half);
-  auto input = castOp(DataType::Float, input_half);
-  auto weight = castOp(DataType::Float, weight_half);
-  auto bias = castOp(DataType::Float, bias_half);
+
+  auto input = makeContigTensor(2, dtype);
+  auto weight = makeContigTensor(1, dtype);
+  auto bias = makeContigTensor(1, dtype);
+  fusion.addInput(input);
+  fusion.addInput(weight);
+  fusion.addInput(bias);
+  input = maybeCastOp(DataType::Float, input);
+  weight = maybeCastOp(DataType::Float, weight);
+  bias = maybeCastOp(DataType::Float, bias);
   auto result = layer_norm(input, norm_shape, weight, bias, eps_ptr);
-  auto result_output = castOp(dtype, result.output);
-  fusion.addOutput(result_output);
+  result.output = maybeCastOp(dtype, result.output);
+  fusion.addOutput(result.output);
   fusion.addOutput(result.mean);
   fusion.addOutput(result.invstd);

@@ -1534,18 +1535,9 @@ TEST_P(LayerNormSharedMemoryTest, FusionLayerNormSharedMemoryBuffer_CUDA) {
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
   auto runtime = executor_cache.getMostRecentKernelRuntime();
   if (has_enough_regs_smem) {
-    // For dtype float, no op scheduler is also used.
-    if (dtype == DataType::Float) {
-      EXPECT_THAT(
-          runtime->fusionSegments()->groups(),
-          UnorderedElementsAre(
-              HeuristicIs(SchedulerType::NoOp),
-              HeuristicIs(SchedulerType::InnerPersistent)));
-    } else {
-      EXPECT_THAT(
-          runtime->fusionSegments()->groups(),
-          UnorderedElementsAre(HeuristicIs(SchedulerType::InnerPersistent)));
-    }
+    EXPECT_THAT(
+        runtime->fusionSegments()->groups(),
+        UnorderedElementsAre(HeuristicIs(SchedulerType::InnerPersistent)));
     Fusion* scheduled_fusion = runtime->executors()
                                    .back()
                                    ->as<KernelExecutor>()
