diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 548e8985a..db7e8ac7b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,10 +52,10 @@ jobs: - name: Compile SW run: | echo "Compiling SW" - make sw CFG_OVERRIDE=target/sw/cfg/hemaia.hjson -j$(nproc) + make sw CFG_OVERRIDE=target/sw/cfg/hemaia_ci.hjson -j$(nproc) - name: Compile RTL run: | - make rtl CFG_OVERRIDE=target/rtl/cfg/hemaia.hjson + make rtl CFG_OVERRIDE=target/rtl/cfg/hemaia_ci.hjson - name: Compile Verilator Binary run: | make occamy_system_vlt -j$(nproc) diff --git a/Makefile b/Makefile index 6da73bc17..de7c844cf 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) MKFILE_DIR := $(dir $(MKFILE_PATH)) -CFG_OVERRIDE ?= target/rtl/cfg/hemaia.hjson +CFG_OVERRIDE ?= target/rtl/cfg/hemaia_tapeout.hjson CFG = $(realpath $(CFG_OVERRIDE)) clean: diff --git a/target/rtl/Makefile b/target/rtl/Makefile index 985a2faa0..80bcbcb82 100644 --- a/target/rtl/Makefile +++ b/target/rtl/Makefile @@ -58,7 +58,7 @@ CFG = $(TARGET_RTL)/cfg/lru.hjson $(CFG): FORCE @# If the LRU config file doesn't exist, we use the default config. @if [ ! -e $@ ] ; then \ - DEFAULT_CFG="$(TARGET_RTL)/cfg/hemaia.hjson"; \ + DEFAULT_CFG="$(TARGET_RTL)/cfg/hemaia_tapeout.hjson"; \ echo "Using default config file: $$DEFAULT_CFG"; \ cp $$DEFAULT_CFG $@; \ fi diff --git a/target/rtl/cfg/hemaia.hjson b/target/rtl/cfg/hemaia_ci.hjson similarity index 95% rename from target/rtl/cfg/hemaia.hjson rename to target/rtl/cfg/hemaia_ci.hjson index e802713b2..df5cb63c8 100755 --- a/target/rtl/cfg/hemaia.hjson +++ b/target/rtl/cfg/hemaia_ci.hjson @@ -108,10 +108,10 @@ cfg_base_offset: 65536 // 0x10000 }, clusters:[ + "snax_KUL_cluster", // snax_cgra_cluster "snax_KUL_cluster", - "snax_KUL_dse_cluster", "snax_hypercorex_cluster", - // "snax_dimc_cluster" + snax_dimc_cluster ], // peripherals diff --git a/target/rtl/cfg/hemaia_two_clusters.hjson b/target/rtl/cfg/hemaia_two_clusters.hjson deleted file mode 100755 index bae8873c3..000000000 --- a/target/rtl/cfg/hemaia_two_clusters.hjson +++ /dev/null @@ -1,198 +0,0 @@ -{ - bender_target: ["cv64a6_imafdc_sv39", "occamy"], - // Remote CFG, about to be removed - is_remote_quadrant: false, - remote_quadrants: [], - // Multi-chip configuration - hemaia_multichip: { - chip_id_width: 8 - } - addr_width: 48, - data_width: 64, - // XBARs - wide_xbar: { - max_slv_trans: 64, - max_mst_trans: 64, - fall_through: false, - }, - quadrant_inter_xbar_slv_id_width_no_rocache: 3, - quadrant_inter_xbar: { - max_slv_trans: 64, - max_mst_trans: 64, - fall_through: false, - }, - narrow_xbar: { - max_slv_trans: 32, - max_mst_trans: 32, - fall_through: false, - }, - cuts: { - narrow_to_quad: 3, - quad_to_narrow: 3, - wide_to_quad: 3, - quad_to_wide: 3, - narrow_to_cva6: 2, - narrow_conv_to_spm_narrow_pre: 2, - narrow_conv_to_spm_narrow: 1, - narrow_and_pcie: 3, - narrow_and_wide: 1, - wide_conv_to_spm_wide: 3, - wide_to_wide_zero_mem: 0, - wide_to_hbm: 3, - wide_and_inter: 3, - wide_and_hbi: 3, - narrow_and_hbi: 3, - pre_to_hbmx: 3, - hbmx_to_hbm: 3, - atomic_adapter_narrow: 1, - atomic_adapter_narrow_wide: 1, - // Give some flexibility in peripheral xbar placement - periph_axi_lite_narrow: 2, - periph_axi_lite: 2, - periph_axi_lite_narrow_hbm_xbar_cfg: 2, - // Non-right-side chip peripherals - periph_axi_lite_narrow_hbm_cfg: 3, - periph_axi_lite_narrow_pcie_cfg: 3, - periph_axi_lite_narrow_chip_ctrl_cfg: 3, - periph_axi_lite_narrow_hbi_narrow_cfg: 3, - 
periph_axi_lite_narrow_hbi_wide_cfg: 3, - periph_axi_lite_narrow_bootrom_cfg: 3, - periph_axi_lite_narrow_fll_system_cfg: 3, - periph_axi_lite_narrow_fll_periph_cfg: 3, - periph_axi_lite_narrow_fll_hbm2e_cfg: 3, - // Right-side or latency-invariant chip peripherals - periph_axi_lite_narrow_soc_ctrl_cfg: 1, - periph_axi_lite_narrow_uart_cfg: 1, - periph_axi_lite_narrow_i2c_cfg: 1, - periph_axi_lite_narrow_gpio_cfg: 1, - periph_axi_lite_narrow_clint_cfg: 1, - periph_axi_lite_narrow_plic_cfg: 1, - periph_axi_lite_narrow_spim_cfg: 1, - periph_axi_lite_narrow_timer_cfg: 1, - }, - txns: { - wide_and_inter: 128, - wide_to_hbm: 128, - narrow_and_wide: 16, - rmq: 4, - }, - narrow_xbar_slv_id_width: 4, - narrow_xbar_user_width: 3, // clog2(total number of clusters) - nr_s1_quadrant: 1, - s1_quadrant: { - // number of pending transactions on the narrow/wide network - narrow_trans: 32, - wide_trans: 32, - // Disable for easier flow trials. - ro_cache_cfg: { - width: 1024, - count: 128, - sets: 2, - max_trans: 32, - address_regions: 4, - } - wide_xbar: { - max_slv_trans: 32, - max_mst_trans: 32, - fall_through: false, - }, - wide_xbar_slv_id_width: 3 - narrow_xbar: { - max_slv_trans: 8, - max_mst_trans: 8, - fall_through: false, - }, - narrow_xbar_slv_id_width: 4, - narrow_xbar_user_width: 3, // clog2(total number of clusters) - cfg_base_addr: 184549376, // 0x0b000000 - cfg_base_offset: 65536 // 0x10000 - }, - clusters:[ - "snax_KUL_cluster", - "snax_KUL_dse_cluster" - ], - - // peripherals - peripherals: { - rom: { - address: 16777216, // 0x0100_0000 - length: 131072, // 128 kiB 0x2_0000 - }, - clint: { - address: 67108864, // 0x0400_0000 - length: 1048576, // 1 MiB 0x10_0000 - }, - axi_lite_peripherals: [ - { - name: "debug", - address: 0, // 0x0000_0000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "spis", // Only Master port, no slave port - } - ], - axi_lite_narrow_peripherals: [ - { - name: "soc_ctrl", - address: 33554432, // 0x0200_0000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "uart", - address: 33562624, // 0x0200_2000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "gpio", - address: 33566720, // 0x0200_3000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "i2c", - address: 33570816, // 0x0200_4000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "chip_ctrl", - address: 33574912, // 0x0200_5000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "timer", - address: 33579008, // 0x0200_6000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "spim", - address: 50331648, // 0x0300_0000 - length: 131072, // 4 kiB 0x2_0000 - }, - { - name: "plic", - address: 201326592, // 0x0C00_0000 - length: 67108864, // 64 MiB 0x400_0000 - }, - ], - }, - // non-peripheral IPs - spm_narrow: { - address: 1879048192, // 0x7000_0000 - length: 131072, // 128 kiB 0x2_0000 - }, - spm_wide: { - address: 2147483648, // 0x8000_0000 - length: 1048576, // 1 MiB 0x10_0000 - }, - wide_zero_mem: { - address: 68719476736, // 0x10_0000_0000 - length: 8589934592, // 8 GiB 0x11_0000_0000 - }, - sys_idma_cfg: { - address: 285212672, // 0x1100_0000 - length: 65536, // 64 kiB 0x1_0000 - }, - // backup boot address - backup_boot_addr: 2147483648 // 0x8000_0000 - -} diff --git a/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/datagen.py b/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/datagen.py index 9c474c708..24211b872 100755 --- a/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/datagen.py +++ b/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/datagen.py @@ -13,20 +13,11 @@ import sys import os -import 
subprocess - # Add data utility path -sys.path.append(os.path.join(os.path.dirname(__file__), - "../../../../../../../../util/sim/")) +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../../../../util/sim/")) from data_utils import format_scalar_definition, format_vector_definition # noqa E402 # Add golden model path -bender_command = subprocess.run(['bender', 'path', 'snitch_cluster'], - capture_output=True, text=True) -snax_utils_path = bender_command.stdout.strip() - -sys.path.append(snax_utils_path + "/util/sim/") - from snax_utils import ( # noqa E402 conv2d, im2col, diff --git a/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/params.hjson b/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/params.hjson index eb8db8552..e69dec035 100644 --- a/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/params.hjson +++ b/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/params.hjson @@ -23,7 +23,7 @@ channel_en_C: 1, // memory space configurations - interleaved_address: 1, + interleaved_address: 0, memory_size: 128, // hardware parameters diff --git a/target/sim/sw/device/apps/snax/snax-gemmx-conv/src/snax-gemmx-conv.c b/target/sim/sw/device/apps/snax/snax-gemmx-conv/src/snax-gemmx-conv.c index 8699c3eb6..e0620d958 100644 --- a/target/sim/sw/device/apps/snax/snax-gemmx-conv/src/snax-gemmx-conv.c +++ b/target/sim/sw/device/apps/snax/snax-gemmx-conv/src/snax-gemmx-conv.c @@ -14,136 +14,141 @@ // We use several nested loops to iterate over the input data and weights, // achieving implicit im2col int main() { - // Set err value for checking - int err = 0; - - // Prepare addresses pointers in TCDM for DMA - int8_t *local_a_dma, *local_b_dma; - int32_t *local_c_dma, *local_d32_dma; - int8_t *local_d8_dma; - - // Allocate space in TCDM for DMA - local_a_dma = (int8_t *)(snrt_l1_next() + delta_physical_a); - local_b_dma = (int8_t *)(snrt_l1_next() + delta_physical_b); - local_c_dma = (int32_t *)(snrt_l1_next() + delta_physical_c); - local_d32_dma = (int32_t *)(snrt_l1_next() + delta_physical_d32); - local_d8_dma = (int8_t *)(snrt_l1_next() + delta_physical_d8); - - // Prepare addresses pointers in TCDM for streamer - int8_t *local_a, *local_b; - int32_t *local_c, *local_d32; - int8_t *local_d8; - - // Allocate space in TCDM for streamer - local_a = (int8_t *)(snrt_l1_next() + delta_local_a); - local_b = (int8_t *)(snrt_l1_next() + delta_local_b); - local_c = (int32_t *)(snrt_l1_next() + delta_local_c); - local_d32 = (int32_t *)(snrt_l1_next() + delta_local_d32); - local_d8 = (int8_t *)(snrt_l1_next() + delta_local_d8); - - // Transfer data from L3 to L1 - // Using DMA only - if (snrt_is_dm_core()) { - if (interleaved_address == 1) { - snrt_dma_start_1d(local_a, A, - Nbatch * (H + 2 * pad_h) * (W + 2 * pad_w) * Cin * - sizeof(int8_t)); - snrt_dma_start_1d(local_b, B, - Cout * Kh * Kw * Cin * sizeof(int8_t)); - } else { - snrt_dma_start_2d( - local_a_dma, A, 64 * sizeof(int8_t), 256, 64, - Nbatch * (H + 2 * pad_h) * (W + 2 * pad_w) * Cin / 64); - snrt_dma_start_2d(local_b_dma, B, 64 * sizeof(int8_t), 256, 64, - Cout * Kh * Kw * Cin / 64); - } - snrt_dma_wait_all(); - } - - // Wait for DMA to finish - snrt_cluster_hw_barrier(); - if (snrt_is_dm_core()) { - if (interleaved_address == 1) { - snrt_dma_start_1d(local_c, C, - M * N * meshRow * meshCol * sizeof(int32_t)); - } else { - snrt_dma_start_2d(local_c_dma, C, 16 * sizeof(int32_t), 256, - 16 * sizeof(int32_t), - M * N * meshRow * meshCol / 16); + if (snrt_cluster_idx() == 1) { // Set err value for checking + int err = 0; + + 
// Prepare addresses pointers in TCDM for DMA + int8_t *local_a_dma, *local_b_dma; + int32_t *local_c_dma, *local_d32_dma; + int8_t *local_d8_dma; + + // Allocate space in TCDM for DMA + local_a_dma = (int8_t *)(snrt_l1_next() + delta_physical_a); + local_b_dma = (int8_t *)(snrt_l1_next() + delta_physical_b); + local_c_dma = (int32_t *)(snrt_l1_next() + delta_physical_c); + local_d32_dma = (int32_t *)(snrt_l1_next() + delta_physical_d32); + local_d8_dma = (int8_t *)(snrt_l1_next() + delta_physical_d8); + + // Prepare addresses pointers in TCDM for streamer + int8_t *local_a, *local_b; + int32_t *local_c, *local_d32; + int8_t *local_d8; + + // Allocate space in TCDM for streamer + local_a = (int8_t *)(snrt_l1_next() + delta_local_a); + local_b = (int8_t *)(snrt_l1_next() + delta_local_b); + local_c = (int32_t *)(snrt_l1_next() + delta_local_c); + local_d32 = (int32_t *)(snrt_l1_next() + delta_local_d32); + local_d8 = (int8_t *)(snrt_l1_next() + delta_local_d8); + + // Transfer data from L3 to L1 + // Using DMA only + if (snrt_is_dm_core()) { + if (interleaved_address == 1) { + snrt_dma_start_1d(local_a, A, + Nbatch * (H + 2 * pad_h) * (W + 2 * pad_w) * + Cin * sizeof(int8_t)); + snrt_dma_start_1d(local_b, B, + Cout * Kh * Kw * Cin * sizeof(int8_t)); + } else { + snrt_dma_start_2d( + local_a_dma, A, 64 * sizeof(int8_t), 256, 64, + Nbatch * (H + 2 * pad_h) * (W + 2 * pad_w) * Cin / 64); + snrt_dma_start_2d(local_b_dma, B, 64 * sizeof(int8_t), 256, 64, + Cout * Kh * Kw * Cin / 64); + } + snrt_dma_wait_all(); } - snrt_dma_wait_all(); - } - - snrt_cluster_hw_barrier(); - - if (snrt_global_core_idx() == 0) { - // Set Streamer configuration CSR for conv2d - set_gemmx_streamer_csr( - Aslstride0, Aslstride1, Atlbound0, Atlstride0, Atlbound1, - Atlstride1, Atlbound2, Atlstride2, Atlbound3, Atlstride3, Atlbound4, - Atlstride4, Atlbound5, Atlstride5, set_addr_remap_index_A, - - Bslstride0, Bslstride1, Btlbound0, Btlstride0, Btlbound1, - Btlstride1, Btlbound2, Btlstride2, set_addr_remap_index_B, - - D8slstride0, D8slstride1, D8tlbound0, D8tlstride0, D8tlbound1, - D8tlstride1, D8tlbound2, D8tlstride2, set_addr_remap_index_D8, - - Cslstride0, Cslstride1, Ctlbound0, Ctlstride0, Ctlbound1, - Ctlstride1, Ctlbound2, Ctlstride2, set_addr_remap_index_C, - - D32slstride0, D32slstride1, D32tlbound0, D32tlstride0, D32tlbound1, - D32tlstride1, D32tlbound2, D32tlstride2, set_addr_remap_index_D32, - - delta_local_a, delta_local_b, delta_local_d8, delta_local_c, - delta_local_d32, bypassSIMD, transposed_A, transposed_B, - channel_en_C, broadcast_C); - // Set GEMMX configuration CSR - uint32_t subtraction_setting = - gen_subtraction_config(subtraction_a, subtraction_b); - - uint32_t csr0 = - gen_csr0_config(input_zp_i, output_zp_i, max_int_i, min_int_i); - uint32_t csr1 = gen_csr1_config(double_round_i); - - set_gemmx_csr( - K, N, M, subtraction_setting, csr0, csr1, shared_bitpacked_shift0, - shared_bitpacked_shift1, shared_multiplier0, shared_multiplier1, - shared_multiplier2, shared_multiplier3, shared_multiplier4, - shared_multiplier5, shared_multiplier6, shared_multiplier7, M * N, - bypassSIMD); - - // Set CSR to start Streamer for conv2d - set_gemmx_streamer_start(); - - // Set CSR to start GEMM - set_gemmx_start(); - - // Poll until Streamer and GEMM accelerator finish - wait_gemmx_and_streamer(); - - // check the result of the implicit im2col convolution - if (interleaved_address == 1) { - if (!bypassSIMD) { - err += check_gemmx_result_D8(local_d8, D8, Batch, M, N, false); + // Wait for DMA to finish + 
snrt_cluster_hw_barrier(); + if (snrt_is_dm_core()) { + if (interleaved_address == 1) { + snrt_dma_start_1d(local_c, C, + M * N * meshRow * meshCol * sizeof(int32_t)); } else { - err += - check_gemmx_result_D32(local_d32, D32, Batch, M, N, false); + snrt_dma_start_2d(local_c_dma, C, 16 * sizeof(int32_t), 256, + 16 * sizeof(int32_t), + M * N * meshRow * meshCol / 16); } - } else { - if (!bypassSIMD) { - err += - check_gemmx_result_D8(local_d8_dma, D8, Batch, M, N, true); + snrt_dma_wait_all(); + } + + snrt_cluster_hw_barrier(); + + if (snrt_cluster_core_idx() == 0) { + // Set Streamer configuration CSR for conv2d + set_gemmx_streamer_csr( + Aslstride0, Aslstride1, Atlbound0, Atlstride0, Atlbound1, + Atlstride1, Atlbound2, Atlstride2, Atlbound3, Atlstride3, + Atlbound4, Atlstride4, Atlbound5, Atlstride5, + set_addr_remap_index_A, + + Bslstride0, Bslstride1, Btlbound0, Btlstride0, Btlbound1, + Btlstride1, Btlbound2, Btlstride2, set_addr_remap_index_B, + + D8slstride0, D8slstride1, D8tlbound0, D8tlstride0, D8tlbound1, + D8tlstride1, D8tlbound2, D8tlstride2, set_addr_remap_index_D8, + + Cslstride0, Cslstride1, Ctlbound0, Ctlstride0, Ctlbound1, + Ctlstride1, Ctlbound2, Ctlstride2, set_addr_remap_index_C, + + D32slstride0, D32slstride1, D32tlbound0, D32tlstride0, + D32tlbound1, D32tlstride1, D32tlbound2, D32tlstride2, + set_addr_remap_index_D32, + + delta_local_a, delta_local_b, delta_local_d8, delta_local_c, + delta_local_d32, bypassSIMD, transposed_A, transposed_B, + channel_en_C, broadcast_C); + + // Set GEMMX configuration CSR + uint32_t subtraction_setting = + gen_subtraction_config(subtraction_a, subtraction_b); + + uint32_t csr0 = + gen_csr0_config(input_zp_i, output_zp_i, max_int_i, min_int_i); + uint32_t csr1 = gen_csr1_config(double_round_i); + + set_gemmx_csr( + K, N, M, subtraction_setting, csr0, csr1, + shared_bitpacked_shift0, shared_bitpacked_shift1, + shared_multiplier0, shared_multiplier1, shared_multiplier2, + shared_multiplier3, shared_multiplier4, shared_multiplier5, + shared_multiplier6, shared_multiplier7, M * N, bypassSIMD); + + // Set CSR to start Streamer for conv2d + set_gemmx_streamer_start(); + + // Set CSR to start GEMM + set_gemmx_start(); + + // Poll until Streamer and GEMM accelerator finish + wait_gemmx_and_streamer(); + + // check the result of the implicit im2col convolution + if (interleaved_address == 1) { + if (!bypassSIMD) { + err += + check_gemmx_result_D8(local_d8, D8, Batch, M, N, false); + } else { + err += check_gemmx_result_D32(local_d32, D32, Batch, M, N, + false); + } } else { - err += check_gemmx_result_D32(local_d32_dma, D32, Batch, M, N, - true); + if (!bypassSIMD) { + err += check_gemmx_result_D8(local_d8_dma, D8, Batch, M, N, + true); + } else { + err += check_gemmx_result_D32(local_d32_dma, D32, Batch, M, + N, true); + } } - } - printf("SNAX GEMM Conv2d: %s, Error: %d . bypassSIMD = %d .\n", - err ? "FAIL" : "PASS", err, bypassSIMD); - }; + printf("SNAX GEMM Conv2d: %s, Error: %d . bypassSIMD = %d .\r\n", + err ? 
"FAIL" : "PASS", err, bypassSIMD); + }; - return err; + return err; + } else + return 0; } diff --git a/target/sim/sw/device/apps/snax/snax-gemmx-matmul/data/datagen.py b/target/sim/sw/device/apps/snax/snax-gemmx-matmul/data/datagen.py index e525f740d..1e0d76679 100755 --- a/target/sim/sw/device/apps/snax/snax-gemmx-matmul/data/datagen.py +++ b/target/sim/sw/device/apps/snax/snax-gemmx-matmul/data/datagen.py @@ -13,19 +13,11 @@ import sys import os -import subprocess - # Add data utility path -sys.path.append(os.path.join(os.path.dirname(__file__), - "../../../../../../../../util/sim/")) +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../../../../util/sim/")) from data_utils import format_scalar_definition, format_vector_definition # noqa E402 # Add golden model path -bender_command = subprocess.run(['bender', 'path', 'snitch_cluster'], - capture_output=True, text=True) -snax_utils_path = bender_command.stdout.strip() - -sys.path.append(snax_utils_path + "/util/sim/") from snax_utils import ( # noqa E402 conv2d, im2col, diff --git a/target/sim/sw/device/apps/snax/snax-gemmx-matmul/src/snax-gemmx-matmul.c b/target/sim/sw/device/apps/snax/snax-gemmx-matmul/src/snax-gemmx-matmul.c index efc0c088d..3ac105b28 100644 --- a/target/sim/sw/device/apps/snax/snax-gemmx-matmul/src/snax-gemmx-matmul.c +++ b/target/sim/sw/device/apps/snax/snax-gemmx-matmul/src/snax-gemmx-matmul.c @@ -14,99 +14,104 @@ // We use several nested loops to iterate over the input data and weights, // achieving implicit im2col int main() { - // Set err value for checking - int err = 0; - - // Prepare addresses in TCDM - int8_t *local_a, *local_b; - int32_t *local_c, *local_d32; - int8_t *local_d8; - - // Allocate space in TCDM - local_a = (int8_t *)(snrt_l1_next() + delta_local_a); - local_b = (int8_t *)(snrt_l1_next() + delta_local_b); - local_c = (int32_t *)(snrt_l1_next() + delta_local_c); - local_d32 = (int32_t *)(snrt_l1_next() + delta_local_d32); - local_d8 = (int8_t *)(snrt_l1_next() + delta_local_d8); - - // Transfer data from L3 to L1 - // Using DMA only - if (snrt_is_dm_core()) { - snrt_dma_start_1d(local_a, A, - M * K * meshRow * tileSize * sizeof(int8_t)); - snrt_dma_start_1d(local_b, B, - N * K * tileSize * meshCol * sizeof(int8_t)); - - snrt_dma_wait_all(); - } - - // Wait for DMA to finish - snrt_cluster_hw_barrier(); - if (snrt_is_dm_core()) { - snrt_dma_start_1d(local_c, C, - M * N * meshRow * meshCol * sizeof(int32_t)); - snrt_dma_wait_all(); - } - - snrt_cluster_hw_barrier(); - - if (snrt_global_core_idx() == 0) { - // Set Streamer configuration CSR for conv2d - set_gemmx_streamer_csr( - Aslstride0, Aslstride1, Atlbound0, Atlstride0, Atlbound1, - Atlstride1, Atlbound2, Atlstride2, Atlbound3, Atlstride3, Atlbound4, - Atlstride4, Atlbound5, Atlstride5, set_addr_remap_index_A, - - Bslstride0, Bslstride1, Btlbound0, Btlstride0, Btlbound1, - Btlstride1, Btlbound2, Btlstride2, set_addr_remap_index_B, - - D8slstride0, D8slstride1, D8tlbound0, D8tlstride0, D8tlbound1, - D8tlstride1, D8tlbound2, D8tlstride2, set_addr_remap_index_D8, - - Cslstride0, Cslstride1, Ctlbound0, Ctlstride0, Ctlbound1, - Ctlstride1, Ctlbound2, Ctlstride2, set_addr_remap_index_C, - - D32slstride0, D32slstride1, D32tlbound0, D32tlstride0, D32tlbound1, - D32tlstride1, D32tlbound2, D32tlstride2, set_addr_remap_index_D32, - - delta_local_a, delta_local_b, delta_local_d8, delta_local_c, - delta_local_d32, bypassSIMD, transposed_A, transposed_B, - channel_en_C, broadcast_C); - - // Set GEMMX configuration CSR - uint32_t 
subtraction_setting = - gen_subtraction_config(subtraction_a, subtraction_b); - - uint32_t csr0 = - gen_csr0_config(input_zp_i, output_zp_i, max_int_i, min_int_i); - uint32_t csr1 = gen_csr1_config(double_round_i); - - set_gemmx_csr( - K, N, M, subtraction_setting, csr0, csr1, shared_bitpacked_shift0, - shared_bitpacked_shift1, shared_multiplier0, shared_multiplier1, - shared_multiplier2, shared_multiplier3, shared_multiplier4, - shared_multiplier5, shared_multiplier6, shared_multiplier7, M * N, - bypassSIMD); - - // Set CSR to start Streamer for conv2d - set_gemmx_streamer_start(); - - // Set CSR to start GEMM - set_gemmx_start(); - - // Poll until Streamer and GEMM accelerator finish - wait_gemmx_and_streamer(); - - // check the result of the implicit im2col convolution - if (!bypassSIMD) { - err += check_gemmx_result_D8(local_d8, D8, Batch, M, N, false); - } else { - err += check_gemmx_result_D32(local_d32, D32, Batch, M, N, false); + if (snrt_cluster_idx() == 1) { // Set err value for checking + int err = 0; + + // Prepare addresses in TCDM + int8_t *local_a, *local_b; + int32_t *local_c, *local_d32; + int8_t *local_d8; + + // Allocate space in TCDM + local_a = (int8_t *)(snrt_l1_next() + delta_local_a); + local_b = (int8_t *)(snrt_l1_next() + delta_local_b); + local_c = (int32_t *)(snrt_l1_next() + delta_local_c); + local_d32 = (int32_t *)(snrt_l1_next() + delta_local_d32); + local_d8 = (int8_t *)(snrt_l1_next() + delta_local_d8); + + // Transfer data from L3 to L1 + // Using DMA only + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_a, A, + M * K * meshRow * tileSize * sizeof(int8_t)); + snrt_dma_start_1d(local_b, B, + N * K * tileSize * meshCol * sizeof(int8_t)); + + snrt_dma_wait_all(); } - printf("SNAX GEMM Matmul: %s, Error: %d . bypassSIMD = %d .\n", - err ? 
"FAIL" : "PASS", err, bypassSIMD); - }; + // Wait for DMA to finish + snrt_cluster_hw_barrier(); + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_c, C, + M * N * meshRow * meshCol * sizeof(int32_t)); + snrt_dma_wait_all(); + } + + snrt_cluster_hw_barrier(); + + if (snrt_cluster_core_idx() == 0) { + // Set Streamer configuration CSR for conv2d + set_gemmx_streamer_csr( + Aslstride0, Aslstride1, Atlbound0, Atlstride0, Atlbound1, + Atlstride1, Atlbound2, Atlstride2, Atlbound3, Atlstride3, + Atlbound4, Atlstride4, Atlbound5, Atlstride5, + set_addr_remap_index_A, + + Bslstride0, Bslstride1, Btlbound0, Btlstride0, Btlbound1, + Btlstride1, Btlbound2, Btlstride2, set_addr_remap_index_B, + + D8slstride0, D8slstride1, D8tlbound0, D8tlstride0, D8tlbound1, + D8tlstride1, D8tlbound2, D8tlstride2, set_addr_remap_index_D8, + + Cslstride0, Cslstride1, Ctlbound0, Ctlstride0, Ctlbound1, + Ctlstride1, Ctlbound2, Ctlstride2, set_addr_remap_index_C, + + D32slstride0, D32slstride1, D32tlbound0, D32tlstride0, + D32tlbound1, D32tlstride1, D32tlbound2, D32tlstride2, + set_addr_remap_index_D32, + + delta_local_a, delta_local_b, delta_local_d8, delta_local_c, + delta_local_d32, bypassSIMD, transposed_A, transposed_B, + channel_en_C, broadcast_C); + + // Set GEMMX configuration CSR + uint32_t subtraction_setting = + gen_subtraction_config(subtraction_a, subtraction_b); + + uint32_t csr0 = + gen_csr0_config(input_zp_i, output_zp_i, max_int_i, min_int_i); + uint32_t csr1 = gen_csr1_config(double_round_i); + + set_gemmx_csr( + K, N, M, subtraction_setting, csr0, csr1, + shared_bitpacked_shift0, shared_bitpacked_shift1, + shared_multiplier0, shared_multiplier1, shared_multiplier2, + shared_multiplier3, shared_multiplier4, shared_multiplier5, + shared_multiplier6, shared_multiplier7, M * N, bypassSIMD); + + // Set CSR to start Streamer for conv2d + set_gemmx_streamer_start(); + + // Set CSR to start GEMM + set_gemmx_start(); + + // Poll until Streamer and GEMM accelerator finish + wait_gemmx_and_streamer(); + + // check the result of the implicit im2col convolution + if (!bypassSIMD) { + err += check_gemmx_result_D8(local_d8, D8, Batch, M, N, false); + } else { + err += + check_gemmx_result_D32(local_d32, D32, Batch, M, N, false); + } + + printf("SNAX GEMM Matmul: %s, Error: %d . bypassSIMD = %d .\r\n", + err ? 
"FAIL" : "PASS", err, bypassSIMD); + }; - return err; + return err; + } else + return 0; } diff --git a/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c b/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c index 4ca19a5c6..42e706d60 100644 --- a/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c +++ b/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c @@ -22,7 +22,7 @@ int main() { if (snrt_cluster_idx() == 0) { if (snrt_is_dm_core()) { tcdm0_start_addr = (int8_t*)snrt_cluster_base_addrl(); - printf("The C0 TCDM ADDR is %p \n", tcdm0_start_addr); + printf("The C0 TCDM ADDR is %p \r\n", tcdm0_start_addr); } } snrt_global_barrier(); @@ -30,14 +30,14 @@ int main() { if (snrt_cluster_idx() == 1) { if (snrt_is_dm_core()) { tcdm1_start_addr = (int8_t*)snrt_cluster_base_addrl(); - printf("The C1 TCDM ADDR is %p \n", tcdm1_start_addr); + printf("The C1 TCDM ADDR is %p \r\n", tcdm1_start_addr); } } snrt_global_barrier(); // C0 Load the data from l3 -> l1 if (snrt_cluster_idx() == 0) { if (snrt_is_dm_core()) { - printf("[C0] Start to load data from %p\n", test_data); + printf("[C0] Start to load data from %p\r\n", test_data); snrt_dma_start_1d(tcdm0_start_addr, test_data, length_data); snrt_dma_wait_all(); } @@ -48,7 +48,7 @@ int main() { // Thenc C1 fetches data from C0 if (snrt_cluster_idx() == 1) { if (snrt_is_dm_core()) { - printf("[C1] Load data from C0 TCDM %p\n", tcdm0_start_addr); + printf("[C1] Load data from C0 TCDM %p\r\n", tcdm0_start_addr); snrt_dma_start_1d(tcdm1_start_addr, tcdm0_start_addr, length_data); snrt_dma_wait_all(); } @@ -59,12 +59,12 @@ int main() { // Start to check if (snrt_cluster_idx() == 0) { if (snrt_cluster_core_idx() == 0) { - printf("C0 Checking the results\n"); + printf("C0 Checking the results\r\n"); for (int i = 0; i < length_data; i++) { if (tcdm0_start_addr[i] != test_data[i]) { err++; - printf("C0 data is incorrect!\n"); - printf("tcdm0[%d]=%d, test_data[%d]=%d\n", i, + printf("C0 data is incorrect!\r\n"); + printf("tcdm0[%d]=%d, test_data[%d]=%d\r\n", i, tcdm0_start_addr[i], i, test_data[i]); return -1; } @@ -74,12 +74,12 @@ int main() { snrt_global_barrier(); if (snrt_cluster_idx() == 1) { if (snrt_cluster_core_idx() == 0) { - printf("C1 Checking the results\n"); + printf("C1 Checking the results\r\n"); for (int i = 0; i < length_data; i++) { if (tcdm1_start_addr[i] != test_data[i]) { err++; - printf("C1 data is incorrect!\n"); - printf("tcdm0[%d]=%d, test_data[%d]=%d\n", i, + printf("C1 data is incorrect!\r\n"); + printf("tcdm0[%d]=%d, test_data[%d]=%d\r\n", i, tcdm1_start_addr[i], i, test_data[i]); return -1; } @@ -90,7 +90,7 @@ int main() { snrt_global_barrier(); if (snrt_cluster_idx() == 0) { if (snrt_is_dm_core()) { - printf("Checking all done! No error!\n"); + printf("Checking all done! No error!\r\n"); } } diff --git a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/datagen.py b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/datagen.py index fccf81ebd..a3eb11fa7 100755 --- a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/datagen.py +++ b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/datagen.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 -# Copyright 2024 KU Leuven. +# Copyright 2023 KU Leuven. # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 # -# Fanchen Kong +# Xiaoling Yi import numpy as np import argparse @@ -12,19 +12,14 @@ import hjson import sys import os -import subprocess # Add data utility path sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../../../../util/sim/")) from data_utils import format_scalar_definition, format_vector_definition # noqa E402 -bender_command = subprocess.run(['bender', 'path', 'snitch_cluster'], - capture_output=True, text=True) -snax_utils_path = bender_command.stdout.strip() +# Add golden model path +from snax_utils import data_reshuffler_golden_model, max_pooling, im2col # noqa E402 -sys.path.append(snax_utils_path + "/util/sim/") - -from snax_utils import max_pooling # noqa E402 np.random.seed(42) @@ -32,79 +27,605 @@ def emit_header_file(**kwargs): emit_str = "#include \n\n" emit_str += "#include \n\n" - emit_str += emit_data(**kwargs) + emit_str += emit_data_reshuffler(**kwargs) return emit_str -def emit_data(**kwargs): - MIN = -128 - MAX = 127 - - data_str = "" - data_str += format_scalar_definition("int8_t", - "H", - kwargs["H"]) + "\n" - data_str += format_scalar_definition("int8_t", - "W", - kwargs["W"]) + "\n" - data_str += format_scalar_definition("int8_t", - "Cin", - kwargs["Cin"]) + "\n" - data_str += format_scalar_definition("int8_t", - "Kh", - kwargs["Kh"]) + "\n" - data_str += format_scalar_definition("int8_t", "Kw", kwargs["Kw"]) + "\n" - data_str += format_scalar_definition("int8_t", - "pad_h", kwargs["pad_h"]) + "\n" - data_str += format_scalar_definition("int8_t", - "pad_w", kwargs["pad_w"]) + "\n" - data_str += format_scalar_definition("int8_t", - "stride_h", kwargs["stride_h"]) + "\n" - data_str += format_scalar_definition("int8_t", - "stride_w", kwargs["stride_w"]) + "\n" - padded_h = kwargs["H"] + 2 * kwargs["pad_h"] - padded_w = kwargs["W"] + 2 * kwargs["pad_w"] - out_h = (kwargs["H"] + 2 * kwargs["pad_h"] - - kwargs["Kh"]) // kwargs["stride_h"] + 1 - out_w = (kwargs["W"] + 2 * kwargs["pad_w"] - - kwargs["Kw"]) // kwargs["stride_w"] + 1 - - data_str += format_scalar_definition("int8_t", "out_H", out_h) + "\n" - data_str += format_scalar_definition("int8_t", "out_W", out_w) + "\n" - data_str += format_scalar_definition("int8_t", "padded_H", padded_h) + "\n" - data_str += format_scalar_definition("int8_t", "padded_W", padded_w) + "\n" - - # Generating random input data vector - data_in = np.random.randint( - MIN, MAX, (1, kwargs["H"], kwargs["W"], kwargs["Cin"]) - ) - padded_data_in = np.pad( - data_in, - ( - (0, 0), - (kwargs["pad_h"], kwargs["pad_h"]), - (kwargs["pad_w"], kwargs["pad_w"]), - (0, 0), - ), - "constant", - ) - # Generating golden data - c_golden = max_pooling( - data_in, - kwargs["Kw"], - kwargs["Kh"], - kwargs["stride_w"], - kwargs["stride_h"], - kwargs["pad_w"], - kwargs["pad_h"], - "HWC", - ) - data_str += format_vector_definition("int8_t", - "padded_data_in", - padded_data_in.reshape(-1)) + "\n" - data_str += format_vector_definition("int8_t", - "golden_data_out", - c_golden.reshape(-1)) + "\n" +MIN = -128 +MAX = 127 + + +def emit_data_reshuffler(**kwargs): + data_str = [] + + assert ( + kwargs["ifMaxPool"] + kwargs["iftestIm2Col"] + kwargs["ifTestTransposer"] == 1 + ), "Only one kernel can be tested at a time" + + if kwargs["ifTestTransposer"] is True: + # Generating loop bounds settings + data_str += [ + format_scalar_definition("int32_t", "tempLoop0_in", kwargs["tempLoop0"]), + format_scalar_definition("int32_t", "tempLoop1_in", kwargs["tempLoop1"]), + format_scalar_definition("int32_t", 
"tempLoop2_in", 1), + format_scalar_definition("int32_t", "tempLoop3_in", 1), + format_scalar_definition("int32_t", "tempLoop4_in", 1), + format_scalar_definition("int32_t", "tempLoop0_out", kwargs["tempLoop0"]), + format_scalar_definition("int32_t", "tempLoop1_out", kwargs["tempLoop1"]), + format_scalar_definition("int32_t", "tempLoop2_out", 1), + format_scalar_definition( + "int32_t", + "input_data_len", + kwargs["tempLoop0"] * kwargs["tempLoop1"] * 8 * 8, + ), + format_scalar_definition( + "int32_t", + "output_data_len", + kwargs["tempLoop0"] * kwargs["tempLoop1"] * 8 * 8, + ), + ] + + # Generating temporal strides settings + data_str += [ + # data reshuffler input strides + format_scalar_definition( + "int32_t", "tempStride0_in", kwargs["tempStride0_in"] + ), + format_scalar_definition( + "int32_t", "tempStride1_in", kwargs["tempStride1_in"] + ), + format_scalar_definition("int32_t", "tempStride2_in", 0), + format_scalar_definition("int32_t", "tempStride3_in", 0), + format_scalar_definition("int32_t", "tempStride4_in", 0), + format_scalar_definition( + "int32_t", "spatialStride1_in", kwargs["spatialStride1_in"] + ), + # data reshuffler output strides + format_scalar_definition( + "int32_t", + "tempStride0_out", + kwargs["tempStride0_out"], + ), + format_scalar_definition( + "int32_t", "tempStride1_out", kwargs["tempStride1_out"] + ), + format_scalar_definition("int32_t", "tempStride2_out", 0), + format_scalar_definition( + "int32_t", "spatialStride1_out", kwargs["spatialStride1_out"] + ), + # Generating base address pointers + format_scalar_definition( + "int32_t", "delta_local_in", kwargs["delta_local_in"] + ), + format_scalar_definition( + "int32_t", "delta_local_out", kwargs["delta_local_out"] + ), + ] + + # Generating random input data vector + length_in = ( + kwargs["tempLoop0"] + * kwargs["tempLoop1"] + * kwargs["spatial_len_0"] + * kwargs["spatial_len_1"] + ) + + data_in = np.random.randint(MIN, MAX, length_in) + + op = kwargs["op"] + + # Generating golden data + # NOTE: using 4 loops to iterate through the + # input data and reshuffle the data. + # different from the hardware data reshuffler, + # the golden model uses the pure strided layout mapping equation, + # no 64 data granularity constraint, no need to transpose explicitly. 
+ if op == "rowmajor2tiledrowmajor": + c_golden = data_reshuffler_golden_model( + kwargs["tempLoop0"], + kwargs["tempLoop1"], + kwargs["spatial_len_0"], + kwargs["spatial_len_1"], + kwargs["tempStride0_in"], + kwargs["tempStride1_in"], + 1, + kwargs["spatialStride1_in"], + data_in, + ) + + if op == "rowmajor2tiledcolmajor": + c_golden = data_reshuffler_golden_model( + kwargs["tempLoop0"], + kwargs["tempLoop1"], + kwargs["spatial_len_0"], + kwargs["spatial_len_1"], + kwargs["tempStride0_in"], + kwargs["tempStride1_in"], + kwargs["tempLoop0"] * 8, + 1, + data_in, + ) + + if op == "tiledrowmajor2tiledcolmajor": + c_golden = data_reshuffler_golden_model( + kwargs["tempLoop0"], + kwargs["tempLoop1"], + kwargs["spatial_len_0"], + kwargs["spatial_len_1"], + kwargs["tempStride0_in"], + kwargs["tempStride1_in"], + 8, + 1, + data_in, + ) + + # Generating transpose flag for the data reshuffler hardware + if op == "rowmajor2tiledrowmajor": + transpose = 0 + elif op == "rowmajor2tiledcolmajor": + transpose = 1 + elif op == "tiledrowmajor2tiledcolmajor": + transpose = 1 + else: + print("Invalid operation") + + # set transpose or not + data_str += [ + format_scalar_definition( + "int", "TloopLen", kwargs["tempLoop0"] * kwargs["tempLoop1"] + ) + ] + data_str += [format_scalar_definition("int", "reduceLen", 1)] + data_str += [format_scalar_definition("int", "opcode", transpose)] + + # Writing testing data and golden data into data.h + data_str += [format_vector_definition("int8_t", "DataIn", data_in)] + data_str += [format_vector_definition("int8_t", "C_golden", c_golden)] + + elif kwargs["iftestIm2Col"] is True: + assert ( + kwargs["ifC8HW8datalayout"] is True + ), "Only C8HW8 data layout is supported for im2col testing" + + # Generating layer settings + Nbatch = kwargs["Nbatch"] + Cin8 = kwargs["Cin"] // 8 + H = kwargs["H"] + W = kwargs["W"] + Kh = kwargs["Kh"] + Kw = kwargs["Kw"] + stride_h, stride_w = (kwargs["stride_h"], kwargs["stride_w"]) + pad_h, pad_w = (kwargs["pad_h"], kwargs["pad_w"]) + + # make sure the output width is multiple of 8 + if W // stride_w % 8 != 0: + W = W + (stride_w * (8 - (W // stride_w) % 8)) % (stride_w * 8) + + # generate random input and kernel data + input_data = np.random.randint(-10, 10, size=(Nbatch, Cin8, H, W, 8)) + kernel = np.random.randint(-10, 10, size=(1, Cin8, Kh, Kw, 8, 8)) + + # Padding the input data + input_padding = np.pad( + input_data, + ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)), + mode="constant", + ) + + # Calculate the size of the output feature map + out_height = (H + 2 * pad_h - Kh) // stride_h + 1 + out_width = (W + 2 * pad_w - Kw) // stride_w + 1 + + assert out_width % 8 == 0, "out_width must be multiple of 8" + + tempLoop0_in = Kw + tempLoop1_in = Kh + tempLoop2_in = Cin8 + tempLoop3_in = out_width // 8 + tempLoop4_in = out_height + + spatialStride1_in = 8 * stride_w + + tempStride0_in = 8 + tempStride1_in = 8 * (W + 2 * pad_w) + tempStride2_in = 8 * (W + 2 * pad_w) * (H + 2 * pad_h) + tempStride3_in = 8 * 8 * stride_w + tempStride4_in = 8 * (W + 2 * pad_w) * stride_h + + tempLoop0_out = Cin8 * Kw * Kh + tempLoop1_out = out_width // 8 * out_height + tempLoop2_out = 1 + + spatialStride1_out = 8 + tempStride0_out = 8 * 8 + tempStride1_out = 8 * 8 * Cin8 * Kw * Kh + tempStride2_out = 0 + + assert ( + tempLoop0_in * tempLoop1_in * tempLoop2_in * tempLoop3_in * tempLoop4_in + == tempLoop0_out * tempLoop1_out * tempLoop2_out + ) + + input_data_len = input_padding.size + + explicit_im2col, _ = im2col( + input_data, kernel, 
stride=(stride_h, stride_w), padding=(pad_h, pad_w) + ) + output_data_len = explicit_im2col.size + + delta_local_in = 0 + delta_local_out = input_data_len + + data_str += [ + format_scalar_definition("int32_t", "tempLoop0_in", tempLoop0_in), + format_scalar_definition("int32_t", "tempLoop1_in", tempLoop1_in), + format_scalar_definition("int32_t", "tempLoop2_in", tempLoop2_in), + format_scalar_definition("int32_t", "tempLoop3_in", tempLoop3_in), + format_scalar_definition("int32_t", "tempLoop4_in", tempLoop4_in), + format_scalar_definition("int32_t", "tempLoop0_out", tempLoop0_out), + format_scalar_definition("int32_t", "tempLoop1_out", tempLoop1_out), + format_scalar_definition("int32_t", "tempLoop2_out", tempLoop2_out), + format_scalar_definition("int32_t", "input_data_len", input_data_len), + format_scalar_definition("int32_t", "output_data_len", output_data_len), + format_scalar_definition("int32_t", "spatialStride1_in", spatialStride1_in), + format_scalar_definition("int32_t", "tempStride0_in", tempStride0_in), + format_scalar_definition("int32_t", "tempStride1_in", tempStride1_in), + format_scalar_definition("int32_t", "tempStride2_in", tempStride2_in), + format_scalar_definition("int32_t", "tempStride3_in", tempStride3_in), + format_scalar_definition("int32_t", "tempStride4_in", tempStride4_in), + format_scalar_definition( + "int32_t", "spatialStride1_out", spatialStride1_out + ), + format_scalar_definition("int32_t", "tempStride0_out", tempStride0_out), + format_scalar_definition("int32_t", "tempStride1_out", tempStride1_out), + format_scalar_definition("int32_t", "tempStride2_out", tempStride2_out), + format_scalar_definition("int32_t", "delta_local_in", delta_local_in), + format_scalar_definition("int32_t", "delta_local_out", delta_local_out), + format_vector_definition("int8_t", "DataIn", input_padding.reshape(-1)), + format_vector_definition("int8_t", "C_golden", explicit_im2col.reshape(-1)), + ] + + TloopLen = ( + tempLoop0_in * tempLoop1_in * tempLoop2_in * tempLoop3_in * tempLoop4_in + ) + reduceLen = 1 + opcode = 0 + + data_str += [ + format_scalar_definition("int", "TloopLen", TloopLen), + format_scalar_definition("int", "reduceLen", reduceLen), + format_scalar_definition("int", "opcode", opcode), + ] + + # max pooling then + elif kwargs["ifC8HW8datalayout"] is True: + # data layout, C8HW8 + # Generating loop bounds settings + padded_input_tensor_w = kwargs["W"] + kwargs["pad_w"] * 2 + padded_input_tensor_h = kwargs["H"] + kwargs["pad_h"] * 2 + + padded_output_tensor_w = ( + kwargs["W"] + kwargs["pad_w"] * 2 - kwargs["Kw"] + ) // kwargs["stride_w"] + 1 + padded_output_tensor_h = ( + kwargs["H"] + kwargs["pad_h"] * 2 - kwargs["Kh"] + ) // kwargs["stride_h"] + 1 + + input_data_len = padded_input_tensor_w * padded_input_tensor_h * kwargs["Cin"] + output_data_len = ( + padded_output_tensor_w * padded_output_tensor_h * kwargs["Cin"] + ) + + assert padded_output_tensor_w % 8 == 0 + assert kwargs["Cin"] % 8 == 0 + + assert ( + input_data_len + output_data_len < 128 * 1024 + ), "Data size too large for 128 KB TCDM" + + data_str += [ + format_scalar_definition("int32_t", "input_data_len", input_data_len), + # input data reshuffler loop bounds settings + format_scalar_definition("int32_t", "tempLoop0_in", kwargs["Kw"]), + format_scalar_definition("int32_t", "tempLoop1_in", kwargs["Kh"]), + format_scalar_definition( + "int32_t", "tempLoop2_in", padded_output_tensor_w // 8 + ), + format_scalar_definition("int32_t", "tempLoop3_in", padded_output_tensor_h), + 
format_scalar_definition("int32_t", "tempLoop4_in", kwargs["Cin"] // 8), + ] + + assert padded_output_tensor_w % 8 == 0 + + # data reshuffler input strides + spatialStride1_in = kwargs["stride_w"] * 8 + tempStride0_in = 8 + tempStride1_in = padded_input_tensor_w * 8 + tempStride2_in = 8 * 8 * kwargs["stride_w"] + tempStride3_in = padded_input_tensor_w * 8 * kwargs["stride_h"] + tempStride4_in = padded_input_tensor_w * padded_input_tensor_h * 8 + data_str += [ + format_scalar_definition("int32_t", "delta_local_in", 0), + format_scalar_definition("int32_t", "spatialStride1_in", spatialStride1_in), + format_scalar_definition("int32_t", "tempStride0_in", tempStride0_in), + format_scalar_definition("int32_t", "tempStride1_in", tempStride1_in), + format_scalar_definition("int32_t", "tempStride2_in", tempStride2_in), + format_scalar_definition( + "int32_t", + "tempStride3_in", + tempStride3_in, + ), + format_scalar_definition( + "int32_t", + "tempStride4_in", + tempStride4_in, + ), + ] + + data_str += [ + # output data reshuffler loop bounds settings + format_scalar_definition( + "int32_t", "tempLoop0_out", padded_output_tensor_w // 8 + ), + format_scalar_definition( + "int32_t", "tempLoop1_out", padded_output_tensor_h + ), + format_scalar_definition("int32_t", "tempLoop2_out", kwargs["Cin"] // 8), + # data length setting + format_scalar_definition("int32_t", "output_data_len", output_data_len), + ] + + # data reshuffler output strides + delta_local_out = padded_input_tensor_h * padded_input_tensor_w * kwargs["Cin"] + spatialStride1_out = 8 + tempStride0_out = 8 * 8 + tempStride1_out = padded_output_tensor_w * 8 + tempStride2_out = padded_output_tensor_w * padded_output_tensor_h * 8 + data_str += [ + # Generating base address pointers + format_scalar_definition( + "int32_t", + "delta_local_out", + delta_local_out, + ), + format_scalar_definition( + "int32_t", "spatialStride1_out", spatialStride1_out + ), + format_scalar_definition( + "int32_t", + "tempStride0_out", + tempStride0_out, + ), + format_scalar_definition("int32_t", "tempStride1_out", tempStride1_out), + format_scalar_definition( + "int32_t", + "tempStride2_out", + tempStride2_out, + ), + ] + + assert delta_local_out % 8 == 0 + assert tempStride0_in % 8 == 0 + assert tempStride1_in % 8 == 0 + assert tempStride2_in % 8 == 0 + assert tempStride3_in % 8 == 0 + assert tempStride4_in % 8 == 0 + assert tempStride0_out % 8 == 0 + assert tempStride1_out % 8 == 0 + assert tempStride2_out % 8 == 0 + + # Generating random input data vector + data_in = np.random.randint( + MIN, MAX, (kwargs["Cin"] // 8, kwargs["H"], kwargs["W"], 8) + ) + + # Generating golden data + c_golden = max_pooling( + data_in, + kwargs["Kw"], + kwargs["Kh"], + kwargs["stride_w"], + kwargs["stride_h"], + kwargs["pad_w"], + kwargs["pad_h"], + "C8HW8", + ) + + padded_data_in = np.pad( + data_in, + ( + (0, 0), + (kwargs["pad_h"], kwargs["pad_h"]), + (kwargs["pad_w"], kwargs["pad_w"]), + (0, 0), + ), + "constant", + ) + + # datapath setting + # set opcode + data_str += [format_scalar_definition("int", "opcode", 2)] + # set TloopLen and reduceLen + data_str += [ + format_scalar_definition( + "int32_t", + "TloopLen", + padded_output_tensor_w + * padded_output_tensor_h + * kwargs["Cin"] + // 8 + // 8, + ), + format_scalar_definition( + "int32_t", "reduceLen", kwargs["Kw"] * kwargs["Kh"] + ), + ] + + # Writing testing data and golden data into data.h + assert padded_data_in.shape == ( + kwargs["Cin"] // 8, + padded_input_tensor_h, + padded_input_tensor_w, + 8, + ) + assert 
padded_data_in.reshape(-1).shape[0] == input_data_len + data_str += [ + format_vector_definition("int8_t", "DataIn", padded_data_in.reshape(-1)) + ] + + assert c_golden.shape == ( + kwargs["Cin"] // 8, + padded_output_tensor_h, + padded_output_tensor_w, + 8, + ) + assert c_golden.reshape(-1).shape[0] == output_data_len + + data_str += [ + format_vector_definition("int8_t", "C_golden", c_golden.reshape(-1)) + ] + + else: + # data layout HWCin + # Generating loop bounds settings + padded_input_tensor_w = kwargs["W"] + kwargs["pad_w"] * 2 + padded_input_tensor_h = kwargs["H"] + kwargs["pad_h"] * 2 + + padded_output_tensor_w = ( + kwargs["W"] + kwargs["pad_w"] * 2 - kwargs["Kw"] + ) // kwargs["stride_w"] + 1 + padded_output_tensor_h = ( + kwargs["H"] + kwargs["pad_h"] * 2 - kwargs["Kh"] + ) // kwargs["stride_h"] + 1 + + input_data_len = padded_input_tensor_w * padded_input_tensor_h * kwargs["Cin"] + output_data_len = ( + padded_output_tensor_w * padded_output_tensor_h * kwargs["Cin"] + ) + + assert padded_output_tensor_w == kwargs["W"] + assert padded_output_tensor_h == kwargs["H"] + + data_str += [ + # input data reshuffler loop bounds settings + format_scalar_definition("int32_t", "tempLoop0_in", kwargs["Kw"]), + format_scalar_definition("int32_t", "tempLoop1_in", kwargs["Kh"]), + format_scalar_definition("int32_t", "tempLoop2_in", kwargs["Cin"] // 8), + format_scalar_definition( + "int32_t", "tempLoop3_in", padded_output_tensor_w // 8 + ), + format_scalar_definition("int32_t", "tempLoop4_in", padded_output_tensor_h), + # output data reshuffler loop bounds settings + format_scalar_definition("int32_t", "tempLoop0_out", kwargs["Cin"] // 8), + format_scalar_definition( + "int32_t", "tempLoop1_out", padded_output_tensor_w // 8 + ), + format_scalar_definition( + "int32_t", "tempLoop2_out", padded_output_tensor_h + ), + # data length setting + format_scalar_definition("int32_t", "input_data_len", input_data_len), + format_scalar_definition("int32_t", "output_data_len", output_data_len), + format_scalar_definition( + "int32_t", + "TloopLen", + padded_output_tensor_w + * padded_output_tensor_h + * kwargs["Cin"] + // 8 + // 8, + ), + format_scalar_definition( + "int32_t", "reduceLen", kwargs["Kw"] * kwargs["Kh"] + ), + ] + + data_str += [ + # data reshuffler input strides + format_scalar_definition("int32_t", "spatialStride1_in", kwargs["Cin"]), + format_scalar_definition( + "int32_t", "tempStride0_in", kwargs["stride_w"] * kwargs["Cin"] + ), + format_scalar_definition( + "int32_t", "tempStride1_in", padded_input_tensor_w * kwargs["Cin"] + ), + format_scalar_definition("int32_t", "tempStride2_in", 8), + format_scalar_definition("int32_t", "tempStride3_in", 8 * kwargs["Cin"]), + format_scalar_definition( + "int32_t", "tempStride4_in", padded_input_tensor_w * kwargs["Cin"] + ), + # data reshuffler output strides + format_scalar_definition("int32_t", "spatialStride1_out", kwargs["Cin"]), + format_scalar_definition("int32_t", "tempStride0_out", 8), + format_scalar_definition("int32_t", "tempStride1_out", 8 * kwargs["Cin"]), + format_scalar_definition( + "int32_t", "tempStride2_out", padded_output_tensor_w * kwargs["Cin"] + ), + # Generating base address pointers + format_scalar_definition("int32_t", "delta_local_in", 0), + format_scalar_definition( + "int32_t", + "delta_local_out", + padded_input_tensor_h * padded_input_tensor_w * kwargs["Cin"], + ), + ] + + # Generating random input data vector + data_in = np.random.randint( + MIN, MAX, (1, kwargs["H"], kwargs["W"], kwargs["Cin"]) + ) + + # Generating 
golden data
+        c_golden = max_pooling(
+            data_in,
+            kwargs["Kw"],
+            kwargs["Kh"],
+            kwargs["stride_w"],
+            kwargs["stride_h"],
+            kwargs["pad_w"],
+            kwargs["pad_h"],
+            "HWC",
+        )
+
+        padded_data_in = np.pad(
+            data_in,
+            (
+                (0, 0),
+                (kwargs["pad_h"], kwargs["pad_h"]),
+                (kwargs["pad_w"], kwargs["pad_w"]),
+                (0, 0),
+            ),
+            "constant",
+        )
+
+        # set opcode
+        data_str += [format_scalar_definition("int", "opcode", 2)]
+
+        # Writing testing data and golden data into data.h
+        assert padded_data_in.shape == (
+            1,
+            padded_input_tensor_h,
+            padded_input_tensor_w,
+            kwargs["Cin"],
+        )
+        assert padded_data_in.reshape(-1).shape[0] == input_data_len
+        data_str += [
+            format_vector_definition("int8_t", "DataIn", padded_data_in.reshape(-1))
+        ]
+
+        assert c_golden.shape == (
+            1,
+            padded_output_tensor_h,
+            padded_output_tensor_w,
+            kwargs["Cin"],
+        )
+        assert c_golden.reshape(-1).shape[0] == output_data_len
+
+        data_str += [
+            format_vector_definition("int8_t", "C_golden", c_golden.reshape(-1))
+        ]
+
+    data_str = "\n\n".join(data_str)
     return data_str
diff --git a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/params.hjson b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/params.hjson
index 59df2f731..a608493ac 100644
--- a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/params.hjson
+++ b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/params.hjson
@@ -2,15 +2,38 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
-// Fanchen Kong
+// Xiaoling Yi
+
 {
-    H: 32,
-    W: 32,
+    ifMaxPool: true,
+    iftestIm2Col: false,
+    ifTestTransposer: false,
+
+    // parameters for maxpool
+    ifC8HW8datalayout: true,
+    Nbatch: 1,
+    H: 16,
+    W: 8,
     Cin: 8,
     Kh: 3,
     Kw: 3,
     pad_h: 1,
     pad_w: 1,
     stride_h: 1,
-    stride_w: 1
-}
\ No newline at end of file
+    stride_w: 1,
+
+    // parameters for data layout reshuffling
+    op: 'rowmajor2tiledrowmajor',
+    tempLoop0: 8,
+    tempLoop1: 8,
+    spatialStride1_in: 64,
+    tempStride0_in: 8,
+    tempStride1_in: 512,
+    spatialStride1_out: 8,
+    tempStride0_out: 64,
+    tempStride1_out: 512,
+    delta_local_in: 0,
+    delta_local_out: 4096,
+    spatial_len_0: 8,
+    spatial_len_1: 8
+}
diff --git a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/src/snax-xdma-maxpool.c b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/src/snax-xdma-maxpool.c
index 5ca5743e4..29c2ee19b 100644
--- a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/src/snax-xdma-maxpool.c
+++ b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/src/snax-xdma-maxpool.c
@@ -9,98 +9,95 @@
 #include "snrt.h"
 
 int main() {
-    // Set err value for checking
-    int err = 0;
-    // Obtain the start address of the TCDM memory
-    uint32_t dma_load_input_start;
-    uint32_t dma_load_input_end;
-    uint32_t *tcdm_baseaddress = (uint32_t *)snrt_l1_next();
-    // Put the input at the starting of tcdm
-    uint8_t *tcdm_in = tcdm_baseaddress;
-    // Put the output at the middle of tcdm
-    uint8_t *tcdm_out = tcdm_in + 0x10000 * sizeof(uint8_t);
+    if (snrt_cluster_idx() == 1) {
+        // Set err value for checking
+        int err = 0;
+        // Obtain the start address of the TCDM memory
+        uint32_t dma_load_input_start;
+        uint32_t dma_load_input_end;
+        uint32_t tcdm_baseaddress = snrt_cluster_base_addrl();
+        // Put the input at the starting of tcdm
+        uint8_t *tcdm_in = (uint8_t *)tcdm_baseaddress;
+        // Put the output at the middle of tcdm
+        uint8_t *tcdm_out = (uint8_t *)(tcdm_baseaddress + delta_local_out);
 
-    if (snrt_is_dm_core()) {
-        // The xdma core is the last compute core in the 
cluster - uint32_t sstride_src[1] = {8}; - uint32_t sstride_dst[1] = {8}; - uint32_t tstride_src[2] = {8, 512}; - uint32_t tbound_src[2] = {3, 3}; + if (snrt_is_dm_core()) { + // The xdma core is the last compute core in the cluster + uint32_t sstride_src[1] = {0}; + uint32_t sstride_dst[1] = {0}; + uint32_t tstride_src[5] = {0}; + uint32_t tbound_src[5] = {0}; + uint32_t tstride_dst[3] = {0}; + uint32_t tbound_dst[3] = {0}; - // First we need to transfer the input data from L3->TCDM - // Here we use the 2d iDMA transfer - dma_load_input_start = snrt_mcycle(); - snrt_dma_start_2d( - tcdm_in, padded_data_in, padded_W * Cin * sizeof(uint8_t), - 512 * sizeof(uint8_t), padded_W * Cin * sizeof(uint8_t), - padded_H * sizeof(uint8_t)); - snrt_dma_wait_all(); - dma_load_input_end = snrt_mcycle(); + // Load the CFG from data.h + sstride_src[0] = spatialStride1_in; + sstride_dst[0] = spatialStride1_out; + tstride_src[0] = tempStride0_in; + tstride_src[1] = tempStride1_in; + tstride_src[2] = tempStride2_in; + tstride_src[3] = tempStride3_in; + tstride_src[4] = tempStride4_in; + tbound_src[0] = tempLoop0_in; + tbound_src[1] = tempLoop1_in; + tbound_src[2] = tempLoop2_in; + tbound_src[3] = tempLoop3_in; + tbound_src[4] = tempLoop4_in; + tstride_dst[0] = tempStride0_out; + tstride_dst[1] = tempStride1_out; + tstride_dst[2] = tempStride2_out; + tbound_dst[0] = tempLoop0_out; + tbound_dst[1] = tempLoop1_out; + tbound_dst[2] = tempLoop2_out; - // --------------------- Configure the Ext --------------------- // + // First we need to transfer the input data from L3->TCDM + snrt_dma_start_1d(tcdm_in, DataIn, input_data_len * sizeof(int8_t)); + snrt_dma_wait_all(); - // There are three extensions in xdma - // VerilogMemset, Maxpool, Transposer - // 0 , 1 , 2 - // We want to only use Maxpool - // Hence we need to disable the 0 and 2 - // and we set the maxpool csr to 9 since we need 3x3 pooling - if (xdma_disable_dst_ext(0) != 0) { - printf("Error in disabling xdma extension 0 \r\n"); - err++; - } else { - printf("The xdma extension 0 is disabled \r\n"); - } + // --------------------- Configure the Ext --------------------- // - uint32_t ext_param_maxpool_size[1] = {9}; - if (xdma_enable_dst_ext(1, ext_param_maxpool_size) != 0) { - printf("Error in enabling xdma extension 1 \r\n"); - err++; - } else { - printf("The xdma extension 1 is enabled \r\n"); - } + if (xdma_disable_dst_ext(0) != 0) { + printf("Error in disabling xdma extension 0\r\n"); + err++; + } else { + printf("The xdma extension 0 is disabled\r\n"); + } - if (xdma_disable_dst_ext(2) != 0) { - printf("Error in disabling xdma extension 2 \r\n"); - err++; - } else { - printf("The xdma extension 2 is disabled \r\n"); - } + uint32_t ext_param_maxpool_size[1] = {reduceLen}; + if (xdma_enable_dst_ext(1, ext_param_maxpool_size) != 0) { + printf("Error in enabling xdma extension 1\r\n"); + err++; + } else { + printf("The xdma extension 1 is enabled\r\n"); + } - // --------------------- Configure the AGU --------------------- // - uint8_t *local_src_pointer; - uint8_t *local_dst_pointer; - int task_id; - for (int i = 0; i < out_H; i++) { - for (int j = 0; j < out_W / 8; j++) { - local_src_pointer = tcdm_in + j * 64 + i * 512; - local_dst_pointer = tcdm_out + j * 64 + i * 256; - if (xdma_memcpy_nd(local_src_pointer, local_dst_pointer, - sstride_src, sstride_dst, 2, tstride_src, - tbound_src, 0, NULL, NULL, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF) != 0) { - printf("Error in xdma agu configuration \r\n"); - err++; - } else { - printf("The xdma agu is configured 
\r\n"); - } - int task_id = xdma_start(); - xdma_wait(task_id); - printf("i = %d, j = %d is done \r\n", i, j); + if (xdma_disable_dst_ext(2) != 0) { + printf("Error in disabling xdma extension 2\r\n"); + err++; + } else { + printf("The xdma extension 2 is disabled\r\n"); } - } - // --------------------- Checking the Results --------------------- // - printf("Checking the results \r\n"); - for (int i = 0; i < out_H * out_W * Cin; i++) { - if ((int8_t)tcdm_out[i] != golden_data_out[i]) { - printf("The maxpool is incorrect! \r\n"); - printf("tcdm_out[%d]=%d, golden_data_out[%d]=%d", i, - (int8_t)tcdm_out[i], i, golden_data_out[i]); + // --------------------- Configure the AGU --------------------- // + xdma_memcpy_nd(tcdm_in, tcdm_out, sstride_src, sstride_dst, 5, + tstride_src, tbound_src, 3, tstride_dst, tbound_dst, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); + int task_id = xdma_start(); + xdma_wait(task_id); + + // --------------------- Checking the Results --------------------- + // // + for (int i = 0; i < output_data_len; i++) { + if ((int8_t)tcdm_out[i] != C_golden[i]) { + printf("The maxpool is incorrect!\r\n"); + printf("tcdm_out[%d]=%d, C_golden[%d]=%d", i, + (int8_t)tcdm_out[i], i, C_golden[i]); + } } + printf("Checking is done. All values are right\r\n"); } - printf("Checking is done. All values are right \r\n"); - } - return 0; -} \ No newline at end of file + return 0; + } else + return 0; +} diff --git a/target/sim/sw/device/apps/snax/snax-xdma-memset/src/snax-xdma-memset.c b/target/sim/sw/device/apps/snax/snax-xdma-memset/src/snax-xdma-memset.c index 83d246edd..a6d33972f 100644 --- a/target/sim/sw/device/apps/snax/snax-xdma-memset/src/snax-xdma-memset.c +++ b/target/sim/sw/device/apps/snax/snax-xdma-memset/src/snax-xdma-memset.c @@ -9,192 +9,201 @@ #include "snrt.h" int main() { - // Set err value for checking - int err = 0; - - // Obtain the start address of the TCDM memory - uint8_t *tcdm_baseaddress = (uint8_t *)snrt_l1_next(); - uint8_t *tcdm_0 = tcdm_baseaddress; - uint8_t *tcdm_16 = tcdm_baseaddress + 0x4000 * sizeof(uint8_t); - uint8_t *tcdm_32 = tcdm_baseaddress + 0x8000 * sizeof(uint8_t); - uint8_t *tcdm_48 = tcdm_baseaddress + 0xc000 * sizeof(uint8_t); - uint8_t *tcdm_64 = tcdm_baseaddress + 0x10000 * sizeof(uint8_t); - uint8_t *tcdm_80 = tcdm_baseaddress + 0x14000 * sizeof(uint8_t); - uint8_t *tcdm_96 = tcdm_baseaddress + 0x18000 * sizeof(uint8_t); - uint8_t *tcdm_112 = tcdm_baseaddress + 0x1c000 * sizeof(uint8_t); - - // Using xdma core only - if (snrt_is_dm_core()) { - // The xdma core is the last compute core in the cluster - - // Test 1: Setting the 0-16KB region to 0xFF - printf("Core %d is xdma core. 
\r\n", snrt_cluster_core_idx()); - printf("Test 1: Setting the 0-16KB region to 0xFF \r\n"); - if (xdma_memcpy_1d(tcdm_0, tcdm_0, 0x4000 * sizeof(uint8_t)) != 0) { - printf("Error in xdma agu configuration \r\n"); - err++; - } else { - printf("The xdma agu is configured \r\n"); - } - - uint32_t ext_param_t1[1] = {0xFFFFFFFF}; - if (xdma_enable_dst_ext(0, ext_param_t1) != 0) { - printf("Error in enabling xdma extension 0 \r\n"); - err++; - } else { - printf("The xdma extension 0 is enabled \r\n"); - } - - if (xdma_disable_dst_ext(1) != 0) { - printf("Error in disabling xdma extension 1 \r\n"); - err++; - } else { - printf("The xdma extension 1 is disabled \r\n"); - } + if (snrt_cluster_idx() == 1) { // Set err value for checking + // Set err value for checking + int err = 0; + // Obtain the start address of the TCDM memory + uint8_t *tcdm_baseaddress = (uint8_t *)snrt_l1_next(); + uint8_t *tcdm_0 = tcdm_baseaddress; + uint8_t *tcdm_16 = tcdm_baseaddress + 0x4000 * sizeof(uint8_t); + uint8_t *tcdm_32 = tcdm_baseaddress + 0x8000 * sizeof(uint8_t); + uint8_t *tcdm_48 = tcdm_baseaddress + 0xc000 * sizeof(uint8_t); + uint8_t *tcdm_64 = tcdm_baseaddress + 0x10000 * sizeof(uint8_t); + uint8_t *tcdm_80 = tcdm_baseaddress + 0x14000 * sizeof(uint8_t); + uint8_t *tcdm_96 = tcdm_baseaddress + 0x18000 * sizeof(uint8_t); + uint8_t *tcdm_112 = tcdm_baseaddress + 0x1c000 * sizeof(uint8_t); + + // Using xdma core only + if (snrt_is_dm_core()) { + // The xdma core is the last compute core in the cluster + + // Test 1: Setting the 0-16KB region to 0xFF + printf("Core %d is xdma core. \r\n", snrt_cluster_core_idx()); + printf("Test 1: Setting the 0-16KB region to 0xFF \r\n"); + if (xdma_memcpy_1d(tcdm_0, tcdm_0, 0x4000 * sizeof(uint8_t)) != 0) { + printf("Error in xdma agu configuration \r\n"); + err++; + } else { + printf("The xdma agu is configured \r\n"); + } - if (xdma_disable_dst_ext(2) != 0) { - printf("Error in disabling xdma extension 2 \r\n"); - err++; - } else { - printf("The xdma extension 2 is disabled \r\n"); - } + uint32_t ext_param_t1[1] = {0xFFFFFFFF}; + if (xdma_enable_dst_ext(0, ext_param_t1) != 0) { + printf("Error in enabling xdma extension 0 \r\n"); + err++; + } else { + printf("The xdma extension 0 is enabled \r\n"); + } - if (err != 0) { - return err; - } + if (xdma_disable_dst_ext(1) != 0) { + printf("Error in disabling xdma extension 1 \r\n"); + err++; + } else { + printf("The xdma extension 1 is disabled \r\n"); + } - int task_id = xdma_start(); - printf( - "The xdma is started, setting memory region to 0xFF. The task id " - "is %d \r\n", - task_id); - xdma_wait(task_id); + if (xdma_disable_dst_ext(2) != 0) { + printf("Error in disabling xdma extension 2 \r\n"); + err++; + } else { + printf("The xdma extension 2 is disabled \r\n"); + } - printf("The xdma is finished \r\n"); - // Check the data - for (int i = 0; i < 0x4000; i++) { - if (tcdm_0[i] != 0xFF) { - printf("The memset of 0KB - 16KB is not correct \r\n"); - return -1; + if (err != 0) { + return err; } - } - printf("The memset of 0KB - 16KB is correct \r\n"); - - // Test 2: Setting the 4K-12K region back to 0. Instead of using the - // memset, this test do this by disabling all the readers. 
-        printf(
-            "Test 2: Setting the 4K-12K region back to 0 by disabling all "
-            "reader channels \r\n");
-        uint32_t sstride_src_t2[1] = {0};
-        uint32_t tstride_src_t2[1] = {64};
-        uint32_t sstride_dst_t2[1] = {8};
-        uint32_t tstride_dst_t2[1] = {64};
-        uint32_t tbound_src_t2[1] = {128};
-        uint32_t tbound_dst_t2[1] = {128};
-
-        if (xdma_memcpy_nd(tcdm_0, tcdm_0 + 0x1000 * sizeof(uint8_t),
-                           sstride_src_t2, sstride_dst_t2, 1, tstride_src_t2,
-                           tbound_src_t2, 1, tstride_dst_t2, tbound_dst_t2, 0x0,
-                           0xffffffff, 0xffffffff) != 0) {
-            printf("Error in xdma agu configuration \r\n");
-            err++;
-        } else {
-            printf("The xdma agu is configured \r\n");
-        }
-        if (xdma_disable_dst_ext(0) != 0) {
-            printf("Error in enabling xdma extension 0 \r\n");
-            err++;
-        } else {
-            printf("The xdma extension 0 is disabled \r\n");
-        }
+            int task_id = xdma_start();
+            printf(
+                "The xdma is started, setting memory region to 0xFF. The task "
+                "id "
+                "is %d \r\n",
+                task_id);
+            xdma_wait(task_id);
+
+            printf("The xdma is finished \r\n");
+            // Check the data
+            for (int i = 0; i < 0x4000; i++) {
+                if (tcdm_0[i] != 0xFF) {
+                    printf("The memset of 0KB - 16KB is not correct \r\n");
+                    return -1;
+                }
+            }
+            printf("The memset of 0KB - 16KB is correct \r\n");
+
+            // Test 2: Setting the 4K-12K region back to 0. Instead of using the
+            // memset, this test does this by disabling all the readers.
+            printf(
+                "Test 2: Setting the 4K-12K region back to 0 by disabling all "
+                "reader channels \r\n");
+            uint32_t sstride_src_t2[1] = {0};
+            uint32_t tstride_src_t2[1] = {64};
+            uint32_t sstride_dst_t2[1] = {8};
+            uint32_t tstride_dst_t2[1] = {64};
+            uint32_t tbound_src_t2[1] = {128};
+            uint32_t tbound_dst_t2[1] = {128};
+
+            if (xdma_memcpy_nd(tcdm_0, tcdm_0 + 0x1000 * sizeof(uint8_t),
+                               sstride_src_t2, sstride_dst_t2, 1,
+                               tstride_src_t2, tbound_src_t2, 1, tstride_dst_t2,
+                               tbound_dst_t2, 0x0, 0xffffffff,
+                               0xffffffff) != 0) {
+                printf("Error in xdma agu configuration \r\n");
+                err++;
+            } else {
+                printf("The xdma agu is configured \r\n");
+            }
-        if (err != 0) {
-            return err;
-        }
+            if (xdma_disable_dst_ext(0) != 0) {
+                printf("Error in disabling xdma extension 0 \r\n");
+                err++;
+            } else {
+                printf("The xdma extension 0 is disabled \r\n");
+            }
-        task_id = xdma_start();
-        printf(
-            "The xdma is started, setting memory region to 0x00. The task id "
-            "is %d \r\n",
-            task_id);
-        xdma_wait(task_id);
+            if (err != 0) {
+                return err;
+            }
-        printf("The xdma is finished \r\n");
-        // Check the data
-        for (int i = 0; i < 0x1000; i++) {
-            if (tcdm_0[i] != 0xFF) {
-                printf("Error in memset (region 0) \r\n");
-                return -1;
+            task_id = xdma_start();
+            printf(
+                "The xdma is started, setting memory region to 0x00. The task "
+                "id "
+                "is %d \r\n",
+                task_id);
The task " + "id " + "is %d \r\n", + task_id); + xdma_wait(task_id); + + printf("The xdma is finished \r\n"); + // Check the data + for (int i = 0; i < 0x1000; i++) { + if (tcdm_0[i] != 0xFF) { + printf("Error in memset (region 0) \r\n"); + return -1; + } } - } - for (int i = 0x1000; i < 0x3000; i++) { - if (tcdm_0[i] != 0x00) { - printf("The memset is incorrect (region 1) \r\n"); - return -1; + for (int i = 0x1000; i < 0x3000; i++) { + if (tcdm_0[i] != 0x00) { + printf("The memset is incorrect (region 1) \r\n"); + return -1; + } } - } - for (int i = 0x3000; i < 0x4000; i++) { - if (tcdm_0[i] != 0xFF) { - printf("The memset is incorrect (region 2) \r\n"); - return -1; + for (int i = 0x3000; i < 0x4000; i++) { + if (tcdm_0[i] != 0xFF) { + printf("The memset is incorrect (region 2) \r\n"); + return -1; + } + } + printf("The memset of 4KB - 12KB is correct \r\n"); + + // Test 3: Setting the 4-12KB region to 0x0000000000000001 (uint64_t + // 1) This test is to validate the byte mask by shielding all other + // bits, so only LSB 8 bits are set. + printf( + "Test 3: Setting the 4-12KB region to 0x0000000000000001 " + "(uint64_t " + "1) \r\n"); + uint32_t sstride_src_t3[1] = {8}; + uint32_t sstride_dst_t3[1] = {8}; + uint32_t tstride_src_t3[1] = {64}; + uint32_t tstride_dst_t3[1] = {64}; + uint32_t tbound_src_t3[1] = {128}; + uint32_t tbound_dst_t3[1] = {128}; + if (xdma_memcpy_nd(tcdm_0, tcdm_0 + 0x1000 * sizeof(uint8_t), + sstride_src_t3, sstride_dst_t3, 1, + tstride_src_t3, tbound_src_t3, 1, tstride_dst_t3, + tbound_dst_t3, 0xffffffff, 0xffffffff, + 0x1) != 0) { + printf("Error in xdma agu configuration \r\n"); + err++; + } else { + printf("The xdma agu is configured \r\n"); } - } - printf("The memset of 4KB - 12KB is correct \r\n"); - - // Test 3: Setting the 4-12KB region to 0x0000000000000001 (uint64_t 1) - // This test is to validate the byte mask by shielding all other bits, - // so only LSB 8 bits are set. - printf( - "Test 3: Setting the 4-12KB region to 0x0000000000000001 (uint64_t " - "1) \r\n"); - uint32_t sstride_src_t3[1] = {8}; - uint32_t sstride_dst_t3[1] = {8}; - uint32_t tstride_src_t3[1] = {64}; - uint32_t tstride_dst_t3[1] = {64}; - uint32_t tbound_src_t3[1] = {128}; - uint32_t tbound_dst_t3[1] = {128}; - if (xdma_memcpy_nd(tcdm_0, tcdm_0 + 0x1000 * sizeof(uint8_t), - sstride_src_t3, sstride_dst_t3, 1, tstride_src_t3, - tbound_src_t3, 1, tstride_dst_t3, tbound_dst_t3, - 0xffffffff, 0xffffffff, 0x1) != 0) { - printf("Error in xdma agu configuration \r\n"); - err++; - } else { - printf("The xdma agu is configured \r\n"); - } - - uint32_t ext_param_t3[1] = {0x1}; - if (xdma_enable_dst_ext(0, ext_param_t3) != 0) { - printf("Error in enabling xdma extension 0 \r\n"); - err++; - } else { - printf("The xdma extension 0 is disabled \r\n"); - } - if (err != 0) { - return err; - } + uint32_t ext_param_t3[1] = {0x1}; + if (xdma_enable_dst_ext(0, ext_param_t3) != 0) { + printf("Error in enabling xdma extension 0 \r\n"); + err++; + } else { + printf("The xdma extension 0 is disabled \r\n"); + } - task_id = xdma_start(); - printf( - "The xdma is started, setting memory region to 0x0000000000000001 " - "(uint64_t 1). 
-            task_id);
-        xdma_wait(task_id);
+            if (err != 0) {
+                return err;
+            }
-        printf("The xdma is finished \r\n");
-        uint64_t *result_t3 = (uint64_t *)(tcdm_0 + 0x1000 * sizeof(uint8_t));
-        for (int i = 0; i < 0x2000 / 8; i++) {
-            if (result_t3[i] != 1) {
-                printf("Error in memset (region 0) \r\n");
-                return -1;
+            task_id = xdma_start();
+            printf(
+                "The xdma is started, setting memory region to "
+                "0x0000000000000001 "
+                "(uint64_t 1). The task id is %d \r\n",
+                task_id);
+            xdma_wait(task_id);
+
+            printf("The xdma is finished \r\n");
+            uint64_t *result_t3 =
+                (uint64_t *)(tcdm_0 + 0x1000 * sizeof(uint8_t));
+            for (int i = 0; i < 0x2000 / 8; i++) {
+                if (result_t3[i] != 1) {
+                    printf("Error in memset (region 0) \r\n");
+                    return -1;
+                }
             }
+            printf("The memset of 4KB - 12KB is correct \r\n");
+        } else {
+            printf("Core %d is not xdma core. \r\n", snrt_cluster_core_idx());
         }
-        printf("The memset of 4KB - 12KB is correct \r\n");
-    } else {
-        printf("Core %d is not xdma core. \r\n", snrt_cluster_core_idx());
-    }
-    return 0;
+        return 0;
+    } else
+        return 0;
 }
diff --git a/target/tapeout/Makefile b/target/tapeout/Makefile
index 380d1cfce..3a8f237b3 100644
--- a/target/tapeout/Makefile
+++ b/target/tapeout/Makefile
@@ -50,7 +50,7 @@ CFG = $(TARGET_RTL)/cfg/lru.hjson
 $(CFG): FORCE
 	@# If the LRU config file doesn't exist, we use the default config.
 	@if [ ! -e $@ ] ; then \
-		DEFAULT_CFG="$(TARGET_RTL)/cfg/hemaia.hjson"; \
+		DEFAULT_CFG="$(TARGET_RTL)/cfg/hemaia_tapeout.hjson"; \
 		echo "Using default config file: $$DEFAULT_CFG"; \
 		cp $$DEFAULT_CFG $@; \
 	fi
diff --git a/util/sim/snax_utils.py b/util/sim/snax_utils.py
new file mode 100644
index 000000000..d10db9316
--- /dev/null
+++ b/util/sim/snax_utils.py
@@ -0,0 +1,450 @@
+#!/usr/bin/env python3
+
+# Copyright 2024 KU Leuven.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Xiaoling Yi
+
+import numpy as np
+
+
+# Function to perform 2D convolution on the input data using the specified kernel,
+# stride, and padding. It returns the output feature map.
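+# Illustrative example (shapes chosen here for clarity, not taken from a
+# testbench): an NHWC input of shape (1, 8, 8, 3) with a (16, 3, 3, 3) kernel,
+# unit stride, and no padding yields an int32 output of shape (1, 6, 6, 16).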
+def conv2d(input_data, kernel, stride=(1, 1), padding=(0, 0), mode="NHWC"):
+    if mode == "NHWC":
+        batch_size, in_height, in_width, in_channels = input_data.shape
+        out_channels, kernel_height, kernel_width, _ = kernel.shape
+        stride_h, stride_w = stride
+        pad_h, pad_w = padding
+
+        # Calculate the output feature map dimensions
+        out_height = (in_height - kernel_height + 2 * pad_h) // stride_h + 1
+        out_width = (in_width - kernel_width + 2 * pad_w) // stride_w + 1
+
+        # Add padding
+        input_data_padded = np.pad(
+            input_data,
+            ((0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)),
+            mode="constant",
+        )
+
+        # Initialize the output feature map
+        output_data = np.zeros(
+            (batch_size, out_height, out_width, out_channels), np.int32
+        )
+
+        # Perform the convolution operation
+        for b in range(batch_size):
+            for oc in range(out_channels):
+                for oh in range(out_height):
+                    for ow in range(out_width):
+                        # Calculate the input region
+                        ih_start = oh * stride_h
+                        ih_end = ih_start + kernel_height
+                        iw_start = ow * stride_w
+                        iw_end = iw_start + kernel_width
+
+                        # Slice to extract the input region
+                        input_region = input_data_padded[
+                            b, ih_start:ih_end, iw_start:iw_end, :
+                        ]
+
+                        # Slice to extract the corresponding convolution kernel
+                        conv_kernel = kernel[oc, :, :, :]
+
+                        # Perform the convolution calculation
+                        output_data[b, oh, ow, oc] = np.sum(input_region * conv_kernel)
+    else:
+        batch_size, Cin8, in_height, in_width, t = input_data.shape
+        assert t == 8
+        Cout8, Cin8, kernel_height, kernel_width, t1, t2 = kernel.shape
+        assert t1 == 8
+        assert t2 == 8
+        stride_h, stride_w = stride
+        pad_h, pad_w = padding
+
+        # Calculate the output feature map dimensions
+        out_height = (in_height - kernel_height + 2 * pad_h) // stride_h + 1
+        out_width = (in_width - kernel_width + 2 * pad_w) // stride_w + 1
+        assert out_width % 8 == 0
+
+        # Add padding
+        input_data_padded = np.pad(
+            input_data,
+            ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)),
+            mode="constant",
+        )
+
+        # Initialize the output feature map
+        output_data = np.zeros(
+            (batch_size, Cout8, out_height, out_width // 8, 8, 8), np.int32
+        )
+
+        # Perform the convolution operation
+        for b in range(batch_size):
+            for oc in range(Cout8):
+                for oc8 in range(8):
+                    for oh in range(out_height):
+                        for ow in range(out_width // 8):
+                            for ow8 in range(8):
+                                # Calculate the input region
+                                iw_start = (ow * 8 + ow8) * stride_w
+                                iw_end = iw_start + kernel_width
+
+                                ih_start = oh * stride_h
+                                ih_end = ih_start + kernel_height
+
+                                # Slice to extract the input region
+                                input_region = input_data_padded[
+                                    b, :, ih_start:ih_end, iw_start:iw_end, :
+                                ]
+
+                                # Slice to extract the corresponding convolution kernel
+                                conv_kernel = kernel[oc, :, :, :, oc8, :]
+
+                                # Perform the convolution calculation
+                                output_data[b, oc, oh, ow, ow8, oc8] = np.sum(
+                                    input_region * conv_kernel
+                                )
+
+    return output_data
+
+
+# Function to transform input data into columns for efficient convolution operations.
+# It returns the transformed input data and reshaped kernel.
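+# Illustrative example (shapes assumed for this sketch): an NC8HW8 input of
+# shape (1, 1, 10, 10, 8) with a (1, 16, 3, 3, 8, 8) kernel, unit stride, and
+# no padding produces an im2col matrix of shape (1, 8, 1, 1, 3, 3, 8, 8) and a
+# reshaped kernel of shape (576, 16), ready for a plain matrix multiplication.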
+def im2col(input_data, kernel, stride=(1, 1), padding=(0, 0), mode="NC8HW8"):
+    assert mode == "NC8HW8"
+    batch_size, in_channels_8, in_height, in_width, _ = input_data.shape
+    _, out_channels, kernel_height, kernel_width, _, _ = kernel.shape
+    stride_h, stride_w = stride
+    pad_h, pad_w = padding
+
+    # Calculate the size of the output feature map
+    out_height = (in_height + 2 * pad_h - kernel_height) // stride_h + 1
+    out_width = (in_width + 2 * pad_w - kernel_width) // stride_w + 1
+
+    # Apply zero padding to the input data
+    input_data_padded = np.pad(
+        input_data,
+        ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)),
+        mode="constant",
+    )
+
+    # Initialize the im2col matrix
+    im2col_matrix = np.zeros(
+        (
+            batch_size,
+            out_height,
+            out_width // 8,
+            in_channels_8,
+            kernel_height,
+            kernel_width,
+            # ow in 8
+            8,
+            # cin in 8
+            8,
+        )
+    )
+
+    # Perform the im2col transformation on the input data
+    for b in range(batch_size):
+        for oh in range(out_height):
+            for ow in range(out_width // 8):
+                for ow8 in range(8):
+                    for ic in range(in_channels_8):
+                        for ic8 in range(8):
+                            # Calculate the input region
+                            iw_start = (ow * 8 + ow8) * stride_w
+                            iw_end = iw_start + kernel_width
+
+                            ih_start = oh * stride_h
+                            ih_end = ih_start + kernel_height
+
+                            # Slice to extract the input region
+                            input_region = input_data_padded[
+                                b, ic, ih_start:ih_end, iw_start:iw_end, ic8
+                            ]
+
+                            im2col_matrix[b, oh, ow, ic, :, :, ow8, ic8] = input_region
+
+    im2col_kernel = kernel.reshape(out_channels, -1).T
+
+    return im2col_matrix, im2col_kernel
+
+
+# Golden model function to perform block matrix multiplication with specific parameters.
+# It returns the resulting matrix after the computation.
+def block_gemm_golden_model(
+    m, k, n, row, size, col, a, b, subtraction_a, subtraction_b, c
+):
+    # Reshape and subtract
+    a_subtracted = a.reshape(m, k, row, size) - subtraction_a  # Shape: (m, k, row, size)
+    b_subtracted = b.reshape(n, k, col, size) - subtraction_b  # Shape: (n, k, col, size)
+
+    # Initialize output array
+    d = np.zeros((m, n, row, col), dtype=np.int32)
+
+    # Compute
+    for mm in range(m):
+        for nn in range(n):
+            # Perform tensordot over axes k and size (axes 0 and 3 in original arrays)
+            # But after reshaping, axes are (k, row, size) and (k, col, size)
+            # So axes to sum over are 0 (k) and 2 (size)
+            d[mm, nn] = np.tensordot(
+                a_subtracted[mm], b_subtracted[nn], axes=([0, 2], [0, 2])
+            )
+    # Flatten d and add c
+    d = d.reshape(m * n * row * col) + c
+
+    return d
+
+
+# This function performs a tiled block General Matrix Multiply (GEMM) operation.
+#
+# This function breaks down large matrix multiplication into smaller submatrices
+# (tiles) and performs GEMM on these submatrices. The results are then accumulated
+# into a final result matrix.
+#
+# Parameters:
+# m2, k2, n2: int
+#     The number of tiles in each dimension.
+# m, k, n: int
+#     The dimensions of the submatrices for block matrix multiplication.
+# row, size, col: int
+#     Size parameters for the submatrices in the hardware gemm accelerator.
+# a, b, c: numpy.ndarray
+#     The input matrices.
+# subtraction_a, subtraction_b: int
+#     Zero-point values subtracted from the elements of a and b before the
+#     GEMM computation.
+#
+# Returns:
+# numpy.ndarray
+#     The result of the tiled GEMM operation as a flattened array.
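+# Illustrative note: with m2 = k2 = n2 = 1 the tiling below degenerates into a
+# single block_gemm_golden_model call over the full operands; with more tiles,
+# the kk2 loop accumulates partial products into the (mm2, nn2) output tile.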
+def tiled_block_gemm_golden_model(
+    m2, k2, n2, m, k, n, row, size, col, a, b, subtraction_a, subtraction_b, c
+):
+    # Create an empty array for the result with the appropriate size
+    result = np.zeros((m2 * m * row * n2 * n * col), dtype=np.int32)
+
+    # Loop over the tiles
+    for mm2 in range(m2):
+        for nn2 in range(n2):
+            for kk2 in range(k2):
+                # Create submatrices for this tile
+                sub_a = a[
+                    (mm2 * k2 + kk2)
+                    * m
+                    * k
+                    * row
+                    * size: (mm2 * k2 + kk2 + 1)
+                    * m
+                    * k
+                    * row
+                    * size
+                ]
+                sub_b = b[
+                    (nn2 * k2 + kk2)
+                    * n
+                    * k
+                    * size
+                    * col: (nn2 * k2 + kk2 + 1)
+                    * n
+                    * k
+                    * size
+                    * col
+                ]
+                sub_c = c[
+                    (mm2 * n2 + nn2)
+                    * m
+                    * row
+                    * n
+                    * col: (mm2 * n2 + nn2 + 1)
+                    * m
+                    * row
+                    * n
+                    * col
+                ]
+
+                # Perform block GEMM on the submatrices
+                sub_d = block_gemm_golden_model(
+                    m,
+                    k,
+                    n,
+                    row,
+                    size,
+                    col,
+                    sub_a,
+                    sub_b,
+                    subtraction_a,
+                    subtraction_b,
+                    sub_c,
+                )
+                # Accumulate the result into the final result matrix at the correct position
+                result[
+                    (mm2 * n2 + nn2)
+                    * m
+                    * row
+                    * n
+                    * col: (mm2 * n2 + nn2 + 1)
+                    * m
+                    * row
+                    * n
+                    * col
+                ] += sub_d
+
+    return result
+
+
+# Golden model function for reshuffling data with specified parameters. It applies
+# strided layout mapping to the input data and returns the reshuffled data array.
+def data_reshuffler_golden_model(
+    tempLoop0,
+    tempLoop1,
+    spatial_len_0,
+    spatial_len_1,
+    tempStride0,
+    tempStride1,
+    spatialStride0,
+    spatialStride1,
+    data,
+    int32=False,
+):
+    # abstract illustration: k innermost loop, m second innermost loop,
+    # K third innermost loop, M outermost loop
+
+    # total loop bounds = spatial loop bounds * temporal loop bounds
+    K = tempLoop0 * spatial_len_0
+    M = tempLoop1 * spatial_len_1
+
+    # loop bounds settings
+    matrix_size = {"K": K, "M": M, "k": spatial_len_0, "m": spatial_len_1}
+
+    # stride settings
+    strides = {
+        "M": tempStride1,
+        "K": tempStride0,
+        "m": spatialStride1,
+        "k": spatialStride0,
+    }
+
+    if int32:
+        result_array = np.zeros((matrix_size["M"] * matrix_size["K"]), np.int32)
+    else:
+        result_array = np.zeros((matrix_size["M"] * matrix_size["K"]), np.int8)
+
+    # apply strided layout mapping for the golden model of data reshuffler
+    for M in range(matrix_size["M"] // matrix_size["m"]):
+        for K in range(matrix_size["K"] // matrix_size["k"]):
+            for m in range(matrix_size["m"]):
+                for k in range(matrix_size["k"]):
+                    result_array[
+                        # output address calculation with continued increment
+                        matrix_size["K"]
+                        // matrix_size["k"]
+                        * matrix_size["k"]
+                        * matrix_size["m"]
+                        * M
+                        + matrix_size["k"] * matrix_size["m"] * K
+                        + m * matrix_size["k"]
+                        + k
+                    ] = data[
+                        # input address calculation with
+                        # strided layout mapping equation
+                        strides["M"] * M
+                        + strides["K"] * K
+                        + strides["m"] * m
+                        + strides["k"] * k
+                    ]
+
+    return result_array.ravel()
+
+
+# Golden model function for SIMD postprocessing of data. It performs operations such as
+# zero point subtraction, multiplication, right shift, double rounding, and clipping.
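+# Worked example (values chosen only for illustration): data_in = 131,
+# input_zp_i = 3, multiplier_i = 2**24, shift_i = 32, double_round_i = 1,
+# output_zp_i = 0: ((131 - 3) * 2**24) >> 31 gives 1, double rounding bumps it
+# to 2, the final >> 1 returns it to 1, and clipping leaves the result at 1.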
+def postprocessing_simd_golden_model(
+    data_in,
+    input_zp_i,
+    output_zp_i,
+    shift_i,
+    max_int_i,
+    min_int_i,
+    double_round_i,
+    multiplier_i,
+):
+
+    # Step 1: Subtract input zero point
+    var = data_in - input_zp_i
+
+    # Step 2: Multiply with the multiplier avoiding overflow
+    var = np.int64(var) * np.int64(multiplier_i)
+
+    # Step 3: Right shift
+    var = np.int32(var >> (shift_i - 1))
+
+    # Step 4: Apply double rounding if necessary
+    if double_round_i:
+        var = np.where(var >= 0, var + 1, var - 1)
+
+    # Step 5: Final right shift
+    var = var >> 1
+
+    # Step 6: Add output zero point
+    var = var + output_zp_i
+
+    # Step 7: Clip the values to be within min and max integer range
+    var = np.clip(var, min_int_i, max_int_i)
+
+    return var
+
+
+def max_pooling(
+    input_tensor,
+    pool_size_w,
+    pool_size_h,
+    stride_w,
+    stride_h,
+    padding_w,
+    padding_h,
+    mode="HWC",
+):
+
+    # if mode == "HWC", C8 is 1, C = realCin
+    # if mode != "HWC", C8 is realCin/8, C = 8
+    C8, H, W, C = input_tensor.shape
+    if mode != "HWC":
+        assert input_tensor.shape[3] == 8 and C == 8
+    elif mode == "HWC":
+        assert input_tensor.shape[0] == 1 and C8 == 1
+
+    out_width = (W + 2 * padding_w - pool_size_w) // stride_w + 1
+    out_height = (H + 2 * padding_h - pool_size_h) // stride_h + 1
+
+    input_padded = np.pad(
+        input_tensor,
+        ((0, 0), (padding_h, padding_h), (padding_w, padding_w), (0, 0)),
+        mode="constant",
+        constant_values=0,
+    )
+
+    pooled_tensor = np.zeros((C8, out_height, out_width, C), dtype=np.int8)
+
+    for c in range(C8):
+        for i in range(out_height):
+            for j in range(out_width):
+                for k in range(C):
+                    h_start = i * stride_h
+                    h_end = h_start + pool_size_h
+                    w_start = j * stride_w
+                    w_end = w_start + pool_size_w
+                    pooled_tensor[c, i, j, k] = np.max(
+                        input_padded[c, h_start:h_end, w_start:w_end, k]
+                    )
+
+    return pooled_tensor
+
+
+def align_wide_addr(addr, alignment=64):
+    if addr % alignment:
+        addr = ((addr // alignment) + 1) * alignment
+    return addr
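+
+
+# Minimal smoke test for a few of the helpers above (illustrative only; the
+# expected values follow directly from the definitions in this file). Run
+# standalone with: python3 util/sim/snax_utils.py
+if __name__ == "__main__":
+    # align_wide_addr rounds an address up to the next multiple of alignment
+    assert align_wide_addr(65) == 128
+    assert align_wide_addr(64) == 64
+
+    # max_pooling in plain HWC mode: a 4x4 single-channel map, 2x2 window,
+    # stride 2, no padding -> a (1, 2, 2, 1) map of block-wise maxima
+    x = np.arange(16, dtype=np.int8).reshape(1, 4, 4, 1)
+    y = max_pooling(x, 2, 2, 2, 2, 0, 0, mode="HWC")
+    assert y.shape == (1, 2, 2, 1)
+    assert y[0, 0, 0, 0] == 5 and y[0, 1, 1, 0] == 15
+    print("snax_utils smoke test passed")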