
remove redundant cast in a test (#3852)
I was wondering why there is an additional no-op scheduler segment when the
input data is `float32`. It turns out to be due to a redundant cast: the cast
is only needed when the input is `float16`.
This PR removes that cast for fp32.
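
For context, `castOp` always inserts a cast expression, even when the tensor
already has the target dtype; for a `float32` input that trivial cast ends up
as its own segment handled by the no-op scheduler, while `maybeCastOp` skips
the cast when no conversion is needed. A minimal sketch of that behavior,
assuming nvFuser's `TensorView`, `DataType`, and `castOp` as used in the test
below (not the library's actual implementation):

// Sketch only, not nvFuser's implementation of maybeCastOp.
// Assumes nvFuser's TensorView/DataType types and castOp from ops/arith.h.
TensorView* maybeCastSketch(DataType dtype, TensorView* tv) {
  if (tv->getDataType() == dtype) {
    return tv; // already the requested dtype: no cast, no extra no-op segment
  }
  return castOp(dtype, tv); // e.g. a float16 input still needs a real cast
}

With this behavior, `maybeCastOp(DataType::Float, input)` is an identity for
fp32 inputs, so the segmenter no longer emits the extra `NoOp` group that the
old test had to special-case.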

---------

Co-authored-by: Jingyue Wu <wujingyue@gmail.com>
liqiangxl and wujingyue authored Feb 10, 2025
1 parent 39bc83a commit e12fe2c
Showing 1 changed file with 15 additions and 23 deletions.
38 changes: 15 additions & 23 deletions tests/cpp/test_persistent_buffer.cpp
@@ -1472,18 +1472,19 @@ TEST_P(LayerNormSharedMemoryTest, FusionLayerNormSharedMemoryBuffer_CUDA) {
   constexpr int64_t dim0 = 2048;
   std::vector<int64_t> input_shape{dim0, hidden_size};
   std::vector<int64_t> norm_shape{hidden_size};
-  auto input_half = makeContigTensor(2, dtype);
-  auto weight_half = makeContigTensor(1, dtype);
-  auto bias_half = makeContigTensor(1, dtype);
-  fusion.addInput(input_half);
-  fusion.addInput(weight_half);
-  fusion.addInput(bias_half);
-  auto input = castOp(DataType::Float, input_half);
-  auto weight = castOp(DataType::Float, weight_half);
-  auto bias = castOp(DataType::Float, bias_half);
+
+  auto input = makeContigTensor(2, dtype);
+  auto weight = makeContigTensor(1, dtype);
+  auto bias = makeContigTensor(1, dtype);
+  fusion.addInput(input);
+  fusion.addInput(weight);
+  fusion.addInput(bias);
+  input = maybeCastOp(DataType::Float, input);
+  weight = maybeCastOp(DataType::Float, weight);
+  bias = maybeCastOp(DataType::Float, bias);
   auto result = layer_norm(input, norm_shape, weight, bias, eps_ptr);
-  auto result_output = castOp(dtype, result.output);
-  fusion.addOutput(result_output);
+  result.output = maybeCastOp(dtype, result.output);
+  fusion.addOutput(result.output);
   fusion.addOutput(result.mean);
   fusion.addOutput(result.invstd);

@@ -1534,18 +1535,9 @@ TEST_P(LayerNormSharedMemoryTest, FusionLayerNormSharedMemoryBuffer_CUDA) {
   auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);
   auto runtime = executor_cache.getMostRecentKernelRuntime();
   if (has_enough_regs_smem) {
-    // For dtype float, no op scheduler is also used.
-    if (dtype == DataType::Float) {
-      EXPECT_THAT(
-          runtime->fusionSegments()->groups(),
-          UnorderedElementsAre(
-              HeuristicIs(SchedulerType::NoOp),
-              HeuristicIs(SchedulerType::InnerPersistent)));
-    } else {
-      EXPECT_THAT(
-          runtime->fusionSegments()->groups(),
-          UnorderedElementsAre(HeuristicIs(SchedulerType::InnerPersistent)));
-    }
+    EXPECT_THAT(
+        runtime->fusionSegments()->groups(),
+        UnorderedElementsAre(HeuristicIs(SchedulerType::InnerPersistent)));
     Fusion* scheduled_fusion = runtime->executors()
                                    .back()
                                    ->as<KernelExecutor>()
