Add SDXL regression test on mi308 (#19747)
- Adds an mi308 runner for regression testing and specifies the SKU for tuned specs
- Adds a new UNet IR for SDXL with a 960x1024 image shape
- Adds a new tuning spec for the new UNet IR on the mi308 SKU
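A minimal sketch of how the new `sku` matrix value (exported as the `SKU` environment variable in the workflow below) might be consumed by a test harness to pick SKU-specific tuned specs; the helper and file names here are illustrative assumptions, not the actual test-suite code:

# Illustrative sketch only: select a SKU-specific tuning spec based on the SKU
# environment variable set from `matrix.sku`. File names are hypothetical.
import os

def pick_tuning_spec(spec_dir: str) -> str:
    sku = os.environ.get("SKU", "mi300")
    candidate = os.path.join(spec_dir, f"attention_and_matmul_spec_{sku}.mlir")
    default = os.path.join(spec_dir, "attention_and_matmul_spec_mi300.mlir")
    # Fall back to the default spec when no SKU-specific file exists.
    return candidate if os.path.exists(candidate) else default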

---------

Signed-off-by: Ian <ian.nordeng@amd.com>
Signed-off-by: ianNod <ian.nordeng@amd.com>
IanNod authored Jan 25, 2025
1 parent ec6e00b commit ea599e2
Showing 6 changed files with 467 additions and 1 deletion.
37 changes: 37 additions & 0 deletions .github/workflows/pkgci_regression_test.yml
@@ -42,11 +42,18 @@ jobs:
- name: amdgpu_rocm_mi250_gfx90a
rocm-chip: gfx90a
backend: rocm
sku: mi250
runs-on: nodai-amdgpu-mi250-x86-64
- name: amdgpu_rocm_mi300_gfx942
rocm-chip: gfx942
backend: rocm
sku: mi300
runs-on: nodai-amdgpu-mi300-x86-64
- name: amdgpu_rocm_mi308_gfx942
rocm-chip: gfx942
backend: rocm
sku: mi308
runs-on: nodai-amdgpu-mi308-x86-64
env:
PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite
@@ -95,6 +102,7 @@ jobs:
--durations=0
env:
ROCM_CHIP: ${{ matrix.rocm-chip }}
SKU: ${{ matrix.sku }}

- name: "Running SD3 special model tests"
if: "!cancelled()"
@@ -165,3 +173,32 @@ jobs:
--timeout=600 \
--retries 7
echo "$(<job_summary.md )" >> $GITHUB_STEP_SUMMARY
# Note: allowing 10% deviation from observed averages here to account for
# different runner conditions.
- name: "Running SDXL rocm pipeline benchmark (mi308)"
if: contains(matrix.name, 'rocm_mi308_gfx942')
run: |
source ${VENV_DIR}/bin/activate
pytest ./experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py \
--goldentime-tolerance-multiplier 1.1 \
--goldentime-rocm-e2e-ms 800.0 \
--goldentime-rocm-unet-ms 195.0 \
--goldentime-rocm-clip-ms 15.0 \
--goldentime-rocm-vae-ms 190.0 \
--goldendispatch-rocm-unet 1602 \
--goldendispatch-rocm-clip 1139 \
--goldendispatch-rocm-vae 246 \
--goldensize-rocm-unet-bytes 2270000 \
--goldensize-rocm-clip-bytes 860000 \
--goldensize-rocm-vae-bytes 840000 \
--goldentime-rocm-punet-int8-fp16-ms 140.0 \
--goldentime-rocm-punet-int8-fp8-ms 150 \
--goldendispatch-rocm-punet-int8-fp16 1424 \
--goldendispatch-rocm-punet-int8-fp8 1704 \
--goldensize-rocm-punet-int8-fp8-bytes 2800000 \
--goldensize-rocm-punet-int8-fp16-bytes 2560000 \
--rocm-chip gfx942 \
--log-cli-level=info \
--timeout=600 \
--retries 7
echo "$(<job_summary.md )" >> $GITHUB_STEP_SUMMARY
@@ -0,0 +1,255 @@
module attributes { transform.with_named_sequence } {
//===----------------------------------------------------------------------===//
// Tuning infra
//===----------------------------------------------------------------------===//

transform.named_sequence @apply_op_config(%op: !transform.any_op {transform.readonly},
%config: !transform.any_param {transform.readonly}) {
transform.annotate %op "compilation_info" = %config : !transform.any_op, !transform.any_param
// transform.print %op {name = "Applied"} : !transform.any_op
transform.yield
}

transform.named_sequence @apply_attn_op_config(%attention: !transform.any_op {transform.readonly},
%config: !transform.any_param {transform.readonly},
%decomposition_config: !transform.any_param {transform.readonly}) {
transform.annotate %attention "compilation_info" = %config : !transform.any_op, !transform.any_param
transform.annotate %attention "decomposition_config" = %decomposition_config : !transform.any_op, !transform.any_param
// transform.print %attention {name = "Applied attention config"} : !transform.any_op
transform.yield
}

transform.named_sequence @match_attention_f16(%attention: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param, !transform.any_param) {
transform.match.operation_name %attention ["iree_linalg_ext.attention"] : !transform.any_op
%in0 = transform.get_operand %attention[0] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %in0 = tensor<?x?x?x?xf16> : !transform.any_value

%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 0, 0, 0], reduction=[0, 0, 0, 0, 0, 64], promote_operands = [1, 2]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [64, 4]
subgroup_size = 64,
{llvm_func_attrs = { "amdgpu-waves-per-eu" = "2", "denormal-fp-math-f32" = "preserve-sign" }}>>
-> !transform.any_param

%decomposition_config = transform.param.constant {
qk_attrs = {attention_qk_matmul,
lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.virtual_mma_layout<intrinsic = VMFMA_F32_32x32x16_F16>,
subgroup_m_count = 4, subgroup_n_count = 1, promote_operands = [1] }>},
pv_attrs = {attention_pv_matmul,
lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
subgroup_m_count = 4, subgroup_n_count = 1, promote_operands = [1] }>}
} -> !transform.any_param

transform.yield %attention, %config, %decomposition_config : !transform.any_op, !transform.any_param, !transform.any_param
}

transform.named_sequence @match_mmt_f16_f16_f32(%root: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
transform.match.operation_name %root ["linalg.generic"] : !transform.any_op
// transform.print %root {name = "Generic"} : !transform.any_op
%ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root {
^bb0(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %out: tensor<?x?xf32>):
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
affine_map<(d0, d1, d2) -> (d1, d2)>,
affine_map<(d0, d1, d2) -> (d0, d1)>],
iterator_types = ["parallel", "parallel", "reduction"]}
ins(%lhs, %rhs : tensor<?x?xf16>, tensor<?x?xf16>) outs(%out : tensor<?x?xf32>) {
^bb0(%in: f16, %in_0: f16, %acc: f32):
%18 = arith.extf %in : f16 to f32
%19 = arith.extf %in_0 : f16 to f32
%20 = arith.mulf %18, %19 : f32
%21 = arith.addf %acc, %20 : f32
linalg.yield %21 : f32
} -> tensor<?x?xf32>
} : (!transform.any_op) -> (!transform.any_value, !transform.any_value)
transform.yield %root : !transform.any_op
}

// TUNING_SPEC_BEGIN DO NOT REMOVE

//===----------------------------------------------------------------------===//
// Matmul tuning
//===----------------------------------------------------------------------===//

transform.named_sequence @match_mmt_1920x10240x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<1920x1280xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<10240x1280xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
subgroup_m_count = 4, subgroup_n_count = 2,
reduction = [0, 0, 32],
workgroup = [128, 128, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [128, 4, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_1920x1280x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<1920x1280xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<1280x1280xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
subgroup_m_count = 4, subgroup_n_count = 2,
reduction = [0, 0, 32],
workgroup = [128, 128, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [128, 4, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_1920x1280x5120(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<1920x5120xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<1280x5120xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
subgroup_m_count = 4, subgroup_n_count = 2,
reduction = [0, 0, 32],
workgroup = [128, 128, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [128, 4, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_7680x5120x640(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<7680x640xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<5120x640xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
subgroup_m_count = 2, subgroup_n_count = 4,
reduction = [0, 0, 32],
workgroup = [128, 256, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [256, 2, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_128x1280x2048(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<1280x2048xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<1280x2048xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
subgroup_m_count = 2, subgroup_n_count = 1,
reduction = [0, 0, 128],
workgroup = [64, 16, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [64, 2, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_7680x640x640(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<7680x640xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<640x640xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
subgroup_m_count = 1, subgroup_n_count = 4,
reduction = [0, 0, 32],
workgroup = [256, 128, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [256, 1, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

transform.named_sequence @match_mmt_7680x640x2560(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
%mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
%lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
%rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
transform.iree.match.cast_compatible_type %lhs = tensor<7680x2560xf16> : !transform.any_value
transform.iree.match.cast_compatible_type %rhs = tensor<640x2560xf16> : !transform.any_value
%config = transform.param.constant #iree_codegen.compilation_info<
lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
subgroup_m_count = 4, subgroup_n_count = 2,
reduction = [0, 0, 32],
workgroup = [256, 128, 0]}>,
translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
workgroup_size = [128, 4, 1] subgroup_size = 64,
{gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}
}>> -> !transform.any_param
transform.yield %matmul, %config : !transform.any_op, !transform.any_param
}

//===----------------------------------------------------------------------===//
// Convolution tuning
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Batch matmul tuning
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Broadcast rhs mmt tuning
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Contraction tuning
//===----------------------------------------------------------------------===//

// TUNING_SPEC_END DO NOT REMOVE

//===----------------------------------------------------------------------===//
// Entry point
//===----------------------------------------------------------------------===//

transform.named_sequence @__kernel_config(%variant_op: !transform.any_op {transform.consumed}) {
transform.foreach_match in %variant_op
@match_attention_f16 -> @apply_attn_op_config

// TUNING_MATCH_BEGIN DO NOT REMOVE

// MMT.
, @match_mmt_1920x10240x1280 -> @apply_op_config
, @match_mmt_1920x1280x1280 -> @apply_op_config
, @match_mmt_1920x1280x5120 -> @apply_op_config
, @match_mmt_7680x5120x640 -> @apply_op_config
, @match_mmt_128x1280x2048 -> @apply_op_config
, @match_mmt_7680x640x640 -> @apply_op_config
, @match_mmt_7680x640x2560 -> @apply_op_config

// TUNING_MATCH_END DO NOT REMOVE
: (!transform.any_op) -> (!transform.any_op)
transform.yield
}
} //// module