diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 548e8985a..db7e8ac7b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -52,10 +52,10 @@ jobs: - name: Compile SW run: | echo "Compiling SW" - make sw CFG_OVERRIDE=target/sw/cfg/hemaia.hjson -j$(nproc) + make sw CFG_OVERRIDE=target/sw/cfg/hemaia_ci.hjson -j$(nproc) - name: Compile RTL run: | - make rtl CFG_OVERRIDE=target/rtl/cfg/hemaia.hjson + make rtl CFG_OVERRIDE=target/rtl/cfg/hemaia_ci.hjson - name: Compile Verilator Binary run: | make occamy_system_vlt -j$(nproc) diff --git a/Makefile b/Makefile index 6da73bc17..de7c844cf 100644 --- a/Makefile +++ b/Makefile @@ -6,7 +6,7 @@ MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) MKFILE_DIR := $(dir $(MKFILE_PATH)) -CFG_OVERRIDE ?= target/rtl/cfg/hemaia.hjson +CFG_OVERRIDE ?= target/rtl/cfg/hemaia_tapeout.hjson CFG = $(realpath $(CFG_OVERRIDE)) clean: diff --git a/target/rtl/Makefile b/target/rtl/Makefile index 985a2faa0..80bcbcb82 100644 --- a/target/rtl/Makefile +++ b/target/rtl/Makefile @@ -58,7 +58,7 @@ CFG = $(TARGET_RTL)/cfg/lru.hjson $(CFG): FORCE @# If the LRU config file doesn't exist, we use the default config. @if [ ! -e $@ ] ; then \ - DEFAULT_CFG="$(TARGET_RTL)/cfg/hemaia.hjson"; \ + DEFAULT_CFG="$(TARGET_RTL)/cfg/hemaia_tapeout.hjson"; \ echo "Using default config file: $$DEFAULT_CFG"; \ cp $$DEFAULT_CFG $@; \ fi diff --git a/target/rtl/cfg/hemaia.hjson b/target/rtl/cfg/hemaia_ci.hjson similarity index 95% rename from target/rtl/cfg/hemaia.hjson rename to target/rtl/cfg/hemaia_ci.hjson index e802713b2..df5cb63c8 100755 --- a/target/rtl/cfg/hemaia.hjson +++ b/target/rtl/cfg/hemaia_ci.hjson @@ -108,10 +108,10 @@ cfg_base_offset: 65536 // 0x10000 }, clusters:[ + "snax_KUL_cluster", // snax_cgra_cluster "snax_KUL_cluster", - "snax_KUL_dse_cluster", "snax_hypercorex_cluster", - // "snax_dimc_cluster" + snax_dimc_cluster ], // peripherals diff --git a/target/rtl/cfg/hemaia_two_clusters.hjson b/target/rtl/cfg/hemaia_two_clusters.hjson deleted file mode 100755 index bae8873c3..000000000 --- a/target/rtl/cfg/hemaia_two_clusters.hjson +++ /dev/null @@ -1,198 +0,0 @@ -{ - bender_target: ["cv64a6_imafdc_sv39", "occamy"], - // Remote CFG, about to be removed - is_remote_quadrant: false, - remote_quadrants: [], - // Multi-chip configuration - hemaia_multichip: { - chip_id_width: 8 - } - addr_width: 48, - data_width: 64, - // XBARs - wide_xbar: { - max_slv_trans: 64, - max_mst_trans: 64, - fall_through: false, - }, - quadrant_inter_xbar_slv_id_width_no_rocache: 3, - quadrant_inter_xbar: { - max_slv_trans: 64, - max_mst_trans: 64, - fall_through: false, - }, - narrow_xbar: { - max_slv_trans: 32, - max_mst_trans: 32, - fall_through: false, - }, - cuts: { - narrow_to_quad: 3, - quad_to_narrow: 3, - wide_to_quad: 3, - quad_to_wide: 3, - narrow_to_cva6: 2, - narrow_conv_to_spm_narrow_pre: 2, - narrow_conv_to_spm_narrow: 1, - narrow_and_pcie: 3, - narrow_and_wide: 1, - wide_conv_to_spm_wide: 3, - wide_to_wide_zero_mem: 0, - wide_to_hbm: 3, - wide_and_inter: 3, - wide_and_hbi: 3, - narrow_and_hbi: 3, - pre_to_hbmx: 3, - hbmx_to_hbm: 3, - atomic_adapter_narrow: 1, - atomic_adapter_narrow_wide: 1, - // Give some flexibility in peripheral xbar placement - periph_axi_lite_narrow: 2, - periph_axi_lite: 2, - periph_axi_lite_narrow_hbm_xbar_cfg: 2, - // Non-right-side chip peripherals - periph_axi_lite_narrow_hbm_cfg: 3, - periph_axi_lite_narrow_pcie_cfg: 3, - periph_axi_lite_narrow_chip_ctrl_cfg: 3, - periph_axi_lite_narrow_hbi_narrow_cfg: 3, - 
periph_axi_lite_narrow_hbi_wide_cfg: 3, - periph_axi_lite_narrow_bootrom_cfg: 3, - periph_axi_lite_narrow_fll_system_cfg: 3, - periph_axi_lite_narrow_fll_periph_cfg: 3, - periph_axi_lite_narrow_fll_hbm2e_cfg: 3, - // Right-side or latency-invariant chip peripherals - periph_axi_lite_narrow_soc_ctrl_cfg: 1, - periph_axi_lite_narrow_uart_cfg: 1, - periph_axi_lite_narrow_i2c_cfg: 1, - periph_axi_lite_narrow_gpio_cfg: 1, - periph_axi_lite_narrow_clint_cfg: 1, - periph_axi_lite_narrow_plic_cfg: 1, - periph_axi_lite_narrow_spim_cfg: 1, - periph_axi_lite_narrow_timer_cfg: 1, - }, - txns: { - wide_and_inter: 128, - wide_to_hbm: 128, - narrow_and_wide: 16, - rmq: 4, - }, - narrow_xbar_slv_id_width: 4, - narrow_xbar_user_width: 3, // clog2(total number of clusters) - nr_s1_quadrant: 1, - s1_quadrant: { - // number of pending transactions on the narrow/wide network - narrow_trans: 32, - wide_trans: 32, - // Disable for easier flow trials. - ro_cache_cfg: { - width: 1024, - count: 128, - sets: 2, - max_trans: 32, - address_regions: 4, - } - wide_xbar: { - max_slv_trans: 32, - max_mst_trans: 32, - fall_through: false, - }, - wide_xbar_slv_id_width: 3 - narrow_xbar: { - max_slv_trans: 8, - max_mst_trans: 8, - fall_through: false, - }, - narrow_xbar_slv_id_width: 4, - narrow_xbar_user_width: 3, // clog2(total number of clusters) - cfg_base_addr: 184549376, // 0x0b000000 - cfg_base_offset: 65536 // 0x10000 - }, - clusters:[ - "snax_KUL_cluster", - "snax_KUL_dse_cluster" - ], - - // peripherals - peripherals: { - rom: { - address: 16777216, // 0x0100_0000 - length: 131072, // 128 kiB 0x2_0000 - }, - clint: { - address: 67108864, // 0x0400_0000 - length: 1048576, // 1 MiB 0x10_0000 - }, - axi_lite_peripherals: [ - { - name: "debug", - address: 0, // 0x0000_0000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "spis", // Only Master port, no slave port - } - ], - axi_lite_narrow_peripherals: [ - { - name: "soc_ctrl", - address: 33554432, // 0x0200_0000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "uart", - address: 33562624, // 0x0200_2000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "gpio", - address: 33566720, // 0x0200_3000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "i2c", - address: 33570816, // 0x0200_4000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "chip_ctrl", - address: 33574912, // 0x0200_5000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "timer", - address: 33579008, // 0x0200_6000 - length: 4096, // 4 kiB 0x1000 - }, - { - name: "spim", - address: 50331648, // 0x0300_0000 - length: 131072, // 4 kiB 0x2_0000 - }, - { - name: "plic", - address: 201326592, // 0x0C00_0000 - length: 67108864, // 64 MiB 0x400_0000 - }, - ], - }, - // non-peripheral IPs - spm_narrow: { - address: 1879048192, // 0x7000_0000 - length: 131072, // 128 kiB 0x2_0000 - }, - spm_wide: { - address: 2147483648, // 0x8000_0000 - length: 1048576, // 1 MiB 0x10_0000 - }, - wide_zero_mem: { - address: 68719476736, // 0x10_0000_0000 - length: 8589934592, // 8 GiB 0x11_0000_0000 - }, - sys_idma_cfg: { - address: 285212672, // 0x1100_0000 - length: 65536, // 64 kiB 0x1_0000 - }, - // backup boot address - backup_boot_addr: 2147483648 // 0x8000_0000 - -} diff --git a/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/datagen.py b/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/datagen.py index 9c474c708..24211b872 100755 --- a/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/datagen.py +++ b/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/datagen.py @@ -13,20 +13,11 @@ import sys import os -import 
subprocess - # Add data utility path -sys.path.append(os.path.join(os.path.dirname(__file__), - "../../../../../../../../util/sim/")) +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../../../../util/sim/")) from data_utils import format_scalar_definition, format_vector_definition # noqa E402 # Add golden model path -bender_command = subprocess.run(['bender', 'path', 'snitch_cluster'], - capture_output=True, text=True) -snax_utils_path = bender_command.stdout.strip() - -sys.path.append(snax_utils_path + "/util/sim/") - from snax_utils import ( # noqa E402 conv2d, im2col, diff --git a/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/params.hjson b/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/params.hjson index eb8db8552..e69dec035 100644 --- a/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/params.hjson +++ b/target/sim/sw/device/apps/snax/snax-gemmx-conv/data/params.hjson @@ -23,7 +23,7 @@ channel_en_C: 1, // memory space configurations - interleaved_address: 1, + interleaved_address: 0, memory_size: 128, // hardware parameters diff --git a/target/sim/sw/device/apps/snax/snax-gemmx-conv/src/snax-gemmx-conv.c b/target/sim/sw/device/apps/snax/snax-gemmx-conv/src/snax-gemmx-conv.c index 8699c3eb6..e0620d958 100644 --- a/target/sim/sw/device/apps/snax/snax-gemmx-conv/src/snax-gemmx-conv.c +++ b/target/sim/sw/device/apps/snax/snax-gemmx-conv/src/snax-gemmx-conv.c @@ -14,136 +14,141 @@ // We use several nested loops to iterate over the input data and weights, // achieving implicit im2col int main() { - // Set err value for checking - int err = 0; - - // Prepare addresses pointers in TCDM for DMA - int8_t *local_a_dma, *local_b_dma; - int32_t *local_c_dma, *local_d32_dma; - int8_t *local_d8_dma; - - // Allocate space in TCDM for DMA - local_a_dma = (int8_t *)(snrt_l1_next() + delta_physical_a); - local_b_dma = (int8_t *)(snrt_l1_next() + delta_physical_b); - local_c_dma = (int32_t *)(snrt_l1_next() + delta_physical_c); - local_d32_dma = (int32_t *)(snrt_l1_next() + delta_physical_d32); - local_d8_dma = (int8_t *)(snrt_l1_next() + delta_physical_d8); - - // Prepare addresses pointers in TCDM for streamer - int8_t *local_a, *local_b; - int32_t *local_c, *local_d32; - int8_t *local_d8; - - // Allocate space in TCDM for streamer - local_a = (int8_t *)(snrt_l1_next() + delta_local_a); - local_b = (int8_t *)(snrt_l1_next() + delta_local_b); - local_c = (int32_t *)(snrt_l1_next() + delta_local_c); - local_d32 = (int32_t *)(snrt_l1_next() + delta_local_d32); - local_d8 = (int8_t *)(snrt_l1_next() + delta_local_d8); - - // Transfer data from L3 to L1 - // Using DMA only - if (snrt_is_dm_core()) { - if (interleaved_address == 1) { - snrt_dma_start_1d(local_a, A, - Nbatch * (H + 2 * pad_h) * (W + 2 * pad_w) * Cin * - sizeof(int8_t)); - snrt_dma_start_1d(local_b, B, - Cout * Kh * Kw * Cin * sizeof(int8_t)); - } else { - snrt_dma_start_2d( - local_a_dma, A, 64 * sizeof(int8_t), 256, 64, - Nbatch * (H + 2 * pad_h) * (W + 2 * pad_w) * Cin / 64); - snrt_dma_start_2d(local_b_dma, B, 64 * sizeof(int8_t), 256, 64, - Cout * Kh * Kw * Cin / 64); - } - snrt_dma_wait_all(); - } - - // Wait for DMA to finish - snrt_cluster_hw_barrier(); - if (snrt_is_dm_core()) { - if (interleaved_address == 1) { - snrt_dma_start_1d(local_c, C, - M * N * meshRow * meshCol * sizeof(int32_t)); - } else { - snrt_dma_start_2d(local_c_dma, C, 16 * sizeof(int32_t), 256, - 16 * sizeof(int32_t), - M * N * meshRow * meshCol / 16); + if (snrt_cluster_idx() == 1) { // Set err value for checking + int err = 0; + + 
// Prepare addresses pointers in TCDM for DMA + int8_t *local_a_dma, *local_b_dma; + int32_t *local_c_dma, *local_d32_dma; + int8_t *local_d8_dma; + + // Allocate space in TCDM for DMA + local_a_dma = (int8_t *)(snrt_l1_next() + delta_physical_a); + local_b_dma = (int8_t *)(snrt_l1_next() + delta_physical_b); + local_c_dma = (int32_t *)(snrt_l1_next() + delta_physical_c); + local_d32_dma = (int32_t *)(snrt_l1_next() + delta_physical_d32); + local_d8_dma = (int8_t *)(snrt_l1_next() + delta_physical_d8); + + // Prepare addresses pointers in TCDM for streamer + int8_t *local_a, *local_b; + int32_t *local_c, *local_d32; + int8_t *local_d8; + + // Allocate space in TCDM for streamer + local_a = (int8_t *)(snrt_l1_next() + delta_local_a); + local_b = (int8_t *)(snrt_l1_next() + delta_local_b); + local_c = (int32_t *)(snrt_l1_next() + delta_local_c); + local_d32 = (int32_t *)(snrt_l1_next() + delta_local_d32); + local_d8 = (int8_t *)(snrt_l1_next() + delta_local_d8); + + // Transfer data from L3 to L1 + // Using DMA only + if (snrt_is_dm_core()) { + if (interleaved_address == 1) { + snrt_dma_start_1d(local_a, A, + Nbatch * (H + 2 * pad_h) * (W + 2 * pad_w) * + Cin * sizeof(int8_t)); + snrt_dma_start_1d(local_b, B, + Cout * Kh * Kw * Cin * sizeof(int8_t)); + } else { + snrt_dma_start_2d( + local_a_dma, A, 64 * sizeof(int8_t), 256, 64, + Nbatch * (H + 2 * pad_h) * (W + 2 * pad_w) * Cin / 64); + snrt_dma_start_2d(local_b_dma, B, 64 * sizeof(int8_t), 256, 64, + Cout * Kh * Kw * Cin / 64); + } + snrt_dma_wait_all(); } - snrt_dma_wait_all(); - } - - snrt_cluster_hw_barrier(); - - if (snrt_global_core_idx() == 0) { - // Set Streamer configuration CSR for conv2d - set_gemmx_streamer_csr( - Aslstride0, Aslstride1, Atlbound0, Atlstride0, Atlbound1, - Atlstride1, Atlbound2, Atlstride2, Atlbound3, Atlstride3, Atlbound4, - Atlstride4, Atlbound5, Atlstride5, set_addr_remap_index_A, - - Bslstride0, Bslstride1, Btlbound0, Btlstride0, Btlbound1, - Btlstride1, Btlbound2, Btlstride2, set_addr_remap_index_B, - - D8slstride0, D8slstride1, D8tlbound0, D8tlstride0, D8tlbound1, - D8tlstride1, D8tlbound2, D8tlstride2, set_addr_remap_index_D8, - - Cslstride0, Cslstride1, Ctlbound0, Ctlstride0, Ctlbound1, - Ctlstride1, Ctlbound2, Ctlstride2, set_addr_remap_index_C, - - D32slstride0, D32slstride1, D32tlbound0, D32tlstride0, D32tlbound1, - D32tlstride1, D32tlbound2, D32tlstride2, set_addr_remap_index_D32, - - delta_local_a, delta_local_b, delta_local_d8, delta_local_c, - delta_local_d32, bypassSIMD, transposed_A, transposed_B, - channel_en_C, broadcast_C); - // Set GEMMX configuration CSR - uint32_t subtraction_setting = - gen_subtraction_config(subtraction_a, subtraction_b); - - uint32_t csr0 = - gen_csr0_config(input_zp_i, output_zp_i, max_int_i, min_int_i); - uint32_t csr1 = gen_csr1_config(double_round_i); - - set_gemmx_csr( - K, N, M, subtraction_setting, csr0, csr1, shared_bitpacked_shift0, - shared_bitpacked_shift1, shared_multiplier0, shared_multiplier1, - shared_multiplier2, shared_multiplier3, shared_multiplier4, - shared_multiplier5, shared_multiplier6, shared_multiplier7, M * N, - bypassSIMD); - - // Set CSR to start Streamer for conv2d - set_gemmx_streamer_start(); - - // Set CSR to start GEMM - set_gemmx_start(); - - // Poll until Streamer and GEMM accelerator finish - wait_gemmx_and_streamer(); - - // check the result of the implicit im2col convolution - if (interleaved_address == 1) { - if (!bypassSIMD) { - err += check_gemmx_result_D8(local_d8, D8, Batch, M, N, false); + // Wait for DMA to finish + 
snrt_cluster_hw_barrier(); + if (snrt_is_dm_core()) { + if (interleaved_address == 1) { + snrt_dma_start_1d(local_c, C, + M * N * meshRow * meshCol * sizeof(int32_t)); } else { - err += - check_gemmx_result_D32(local_d32, D32, Batch, M, N, false); + snrt_dma_start_2d(local_c_dma, C, 16 * sizeof(int32_t), 256, + 16 * sizeof(int32_t), + M * N * meshRow * meshCol / 16); } - } else { - if (!bypassSIMD) { - err += - check_gemmx_result_D8(local_d8_dma, D8, Batch, M, N, true); + snrt_dma_wait_all(); + } + + snrt_cluster_hw_barrier(); + + if (snrt_cluster_core_idx() == 0) { + // Set Streamer configuration CSR for conv2d + set_gemmx_streamer_csr( + Aslstride0, Aslstride1, Atlbound0, Atlstride0, Atlbound1, + Atlstride1, Atlbound2, Atlstride2, Atlbound3, Atlstride3, + Atlbound4, Atlstride4, Atlbound5, Atlstride5, + set_addr_remap_index_A, + + Bslstride0, Bslstride1, Btlbound0, Btlstride0, Btlbound1, + Btlstride1, Btlbound2, Btlstride2, set_addr_remap_index_B, + + D8slstride0, D8slstride1, D8tlbound0, D8tlstride0, D8tlbound1, + D8tlstride1, D8tlbound2, D8tlstride2, set_addr_remap_index_D8, + + Cslstride0, Cslstride1, Ctlbound0, Ctlstride0, Ctlbound1, + Ctlstride1, Ctlbound2, Ctlstride2, set_addr_remap_index_C, + + D32slstride0, D32slstride1, D32tlbound0, D32tlstride0, + D32tlbound1, D32tlstride1, D32tlbound2, D32tlstride2, + set_addr_remap_index_D32, + + delta_local_a, delta_local_b, delta_local_d8, delta_local_c, + delta_local_d32, bypassSIMD, transposed_A, transposed_B, + channel_en_C, broadcast_C); + + // Set GEMMX configuration CSR + uint32_t subtraction_setting = + gen_subtraction_config(subtraction_a, subtraction_b); + + uint32_t csr0 = + gen_csr0_config(input_zp_i, output_zp_i, max_int_i, min_int_i); + uint32_t csr1 = gen_csr1_config(double_round_i); + + set_gemmx_csr( + K, N, M, subtraction_setting, csr0, csr1, + shared_bitpacked_shift0, shared_bitpacked_shift1, + shared_multiplier0, shared_multiplier1, shared_multiplier2, + shared_multiplier3, shared_multiplier4, shared_multiplier5, + shared_multiplier6, shared_multiplier7, M * N, bypassSIMD); + + // Set CSR to start Streamer for conv2d + set_gemmx_streamer_start(); + + // Set CSR to start GEMM + set_gemmx_start(); + + // Poll until Streamer and GEMM accelerator finish + wait_gemmx_and_streamer(); + + // check the result of the implicit im2col convolution + if (interleaved_address == 1) { + if (!bypassSIMD) { + err += + check_gemmx_result_D8(local_d8, D8, Batch, M, N, false); + } else { + err += check_gemmx_result_D32(local_d32, D32, Batch, M, N, + false); + } } else { - err += check_gemmx_result_D32(local_d32_dma, D32, Batch, M, N, - true); + if (!bypassSIMD) { + err += check_gemmx_result_D8(local_d8_dma, D8, Batch, M, N, + true); + } else { + err += check_gemmx_result_D32(local_d32_dma, D32, Batch, M, + N, true); + } } - } - printf("SNAX GEMM Conv2d: %s, Error: %d . bypassSIMD = %d .\n", - err ? "FAIL" : "PASS", err, bypassSIMD); - }; + printf("SNAX GEMM Conv2d: %s, Error: %d . bypassSIMD = %d .\r\n", + err ? 
"FAIL" : "PASS", err, bypassSIMD); + }; - return err; + return err; + } else + return 0; } diff --git a/target/sim/sw/device/apps/snax/snax-gemmx-matmul/data/datagen.py b/target/sim/sw/device/apps/snax/snax-gemmx-matmul/data/datagen.py index e525f740d..1e0d76679 100755 --- a/target/sim/sw/device/apps/snax/snax-gemmx-matmul/data/datagen.py +++ b/target/sim/sw/device/apps/snax/snax-gemmx-matmul/data/datagen.py @@ -13,19 +13,11 @@ import sys import os -import subprocess - # Add data utility path -sys.path.append(os.path.join(os.path.dirname(__file__), - "../../../../../../../../util/sim/")) +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../../../../util/sim/")) from data_utils import format_scalar_definition, format_vector_definition # noqa E402 # Add golden model path -bender_command = subprocess.run(['bender', 'path', 'snitch_cluster'], - capture_output=True, text=True) -snax_utils_path = bender_command.stdout.strip() - -sys.path.append(snax_utils_path + "/util/sim/") from snax_utils import ( # noqa E402 conv2d, im2col, diff --git a/target/sim/sw/device/apps/snax/snax-gemmx-matmul/src/snax-gemmx-matmul.c b/target/sim/sw/device/apps/snax/snax-gemmx-matmul/src/snax-gemmx-matmul.c index efc0c088d..3ac105b28 100644 --- a/target/sim/sw/device/apps/snax/snax-gemmx-matmul/src/snax-gemmx-matmul.c +++ b/target/sim/sw/device/apps/snax/snax-gemmx-matmul/src/snax-gemmx-matmul.c @@ -14,99 +14,104 @@ // We use several nested loops to iterate over the input data and weights, // achieving implicit im2col int main() { - // Set err value for checking - int err = 0; - - // Prepare addresses in TCDM - int8_t *local_a, *local_b; - int32_t *local_c, *local_d32; - int8_t *local_d8; - - // Allocate space in TCDM - local_a = (int8_t *)(snrt_l1_next() + delta_local_a); - local_b = (int8_t *)(snrt_l1_next() + delta_local_b); - local_c = (int32_t *)(snrt_l1_next() + delta_local_c); - local_d32 = (int32_t *)(snrt_l1_next() + delta_local_d32); - local_d8 = (int8_t *)(snrt_l1_next() + delta_local_d8); - - // Transfer data from L3 to L1 - // Using DMA only - if (snrt_is_dm_core()) { - snrt_dma_start_1d(local_a, A, - M * K * meshRow * tileSize * sizeof(int8_t)); - snrt_dma_start_1d(local_b, B, - N * K * tileSize * meshCol * sizeof(int8_t)); - - snrt_dma_wait_all(); - } - - // Wait for DMA to finish - snrt_cluster_hw_barrier(); - if (snrt_is_dm_core()) { - snrt_dma_start_1d(local_c, C, - M * N * meshRow * meshCol * sizeof(int32_t)); - snrt_dma_wait_all(); - } - - snrt_cluster_hw_barrier(); - - if (snrt_global_core_idx() == 0) { - // Set Streamer configuration CSR for conv2d - set_gemmx_streamer_csr( - Aslstride0, Aslstride1, Atlbound0, Atlstride0, Atlbound1, - Atlstride1, Atlbound2, Atlstride2, Atlbound3, Atlstride3, Atlbound4, - Atlstride4, Atlbound5, Atlstride5, set_addr_remap_index_A, - - Bslstride0, Bslstride1, Btlbound0, Btlstride0, Btlbound1, - Btlstride1, Btlbound2, Btlstride2, set_addr_remap_index_B, - - D8slstride0, D8slstride1, D8tlbound0, D8tlstride0, D8tlbound1, - D8tlstride1, D8tlbound2, D8tlstride2, set_addr_remap_index_D8, - - Cslstride0, Cslstride1, Ctlbound0, Ctlstride0, Ctlbound1, - Ctlstride1, Ctlbound2, Ctlstride2, set_addr_remap_index_C, - - D32slstride0, D32slstride1, D32tlbound0, D32tlstride0, D32tlbound1, - D32tlstride1, D32tlbound2, D32tlstride2, set_addr_remap_index_D32, - - delta_local_a, delta_local_b, delta_local_d8, delta_local_c, - delta_local_d32, bypassSIMD, transposed_A, transposed_B, - channel_en_C, broadcast_C); - - // Set GEMMX configuration CSR - uint32_t 
subtraction_setting = - gen_subtraction_config(subtraction_a, subtraction_b); - - uint32_t csr0 = - gen_csr0_config(input_zp_i, output_zp_i, max_int_i, min_int_i); - uint32_t csr1 = gen_csr1_config(double_round_i); - - set_gemmx_csr( - K, N, M, subtraction_setting, csr0, csr1, shared_bitpacked_shift0, - shared_bitpacked_shift1, shared_multiplier0, shared_multiplier1, - shared_multiplier2, shared_multiplier3, shared_multiplier4, - shared_multiplier5, shared_multiplier6, shared_multiplier7, M * N, - bypassSIMD); - - // Set CSR to start Streamer for conv2d - set_gemmx_streamer_start(); - - // Set CSR to start GEMM - set_gemmx_start(); - - // Poll until Streamer and GEMM accelerator finish - wait_gemmx_and_streamer(); - - // check the result of the implicit im2col convolution - if (!bypassSIMD) { - err += check_gemmx_result_D8(local_d8, D8, Batch, M, N, false); - } else { - err += check_gemmx_result_D32(local_d32, D32, Batch, M, N, false); + if (snrt_cluster_idx() == 1) { // Set err value for checking + int err = 0; + + // Prepare addresses in TCDM + int8_t *local_a, *local_b; + int32_t *local_c, *local_d32; + int8_t *local_d8; + + // Allocate space in TCDM + local_a = (int8_t *)(snrt_l1_next() + delta_local_a); + local_b = (int8_t *)(snrt_l1_next() + delta_local_b); + local_c = (int32_t *)(snrt_l1_next() + delta_local_c); + local_d32 = (int32_t *)(snrt_l1_next() + delta_local_d32); + local_d8 = (int8_t *)(snrt_l1_next() + delta_local_d8); + + // Transfer data from L3 to L1 + // Using DMA only + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_a, A, + M * K * meshRow * tileSize * sizeof(int8_t)); + snrt_dma_start_1d(local_b, B, + N * K * tileSize * meshCol * sizeof(int8_t)); + + snrt_dma_wait_all(); } - printf("SNAX GEMM Matmul: %s, Error: %d . bypassSIMD = %d .\n", - err ? 
"FAIL" : "PASS", err, bypassSIMD); - }; + // Wait for DMA to finish + snrt_cluster_hw_barrier(); + if (snrt_is_dm_core()) { + snrt_dma_start_1d(local_c, C, + M * N * meshRow * meshCol * sizeof(int32_t)); + snrt_dma_wait_all(); + } + + snrt_cluster_hw_barrier(); + + if (snrt_cluster_core_idx() == 0) { + // Set Streamer configuration CSR for conv2d + set_gemmx_streamer_csr( + Aslstride0, Aslstride1, Atlbound0, Atlstride0, Atlbound1, + Atlstride1, Atlbound2, Atlstride2, Atlbound3, Atlstride3, + Atlbound4, Atlstride4, Atlbound5, Atlstride5, + set_addr_remap_index_A, + + Bslstride0, Bslstride1, Btlbound0, Btlstride0, Btlbound1, + Btlstride1, Btlbound2, Btlstride2, set_addr_remap_index_B, + + D8slstride0, D8slstride1, D8tlbound0, D8tlstride0, D8tlbound1, + D8tlstride1, D8tlbound2, D8tlstride2, set_addr_remap_index_D8, + + Cslstride0, Cslstride1, Ctlbound0, Ctlstride0, Ctlbound1, + Ctlstride1, Ctlbound2, Ctlstride2, set_addr_remap_index_C, + + D32slstride0, D32slstride1, D32tlbound0, D32tlstride0, + D32tlbound1, D32tlstride1, D32tlbound2, D32tlstride2, + set_addr_remap_index_D32, + + delta_local_a, delta_local_b, delta_local_d8, delta_local_c, + delta_local_d32, bypassSIMD, transposed_A, transposed_B, + channel_en_C, broadcast_C); + + // Set GEMMX configuration CSR + uint32_t subtraction_setting = + gen_subtraction_config(subtraction_a, subtraction_b); + + uint32_t csr0 = + gen_csr0_config(input_zp_i, output_zp_i, max_int_i, min_int_i); + uint32_t csr1 = gen_csr1_config(double_round_i); + + set_gemmx_csr( + K, N, M, subtraction_setting, csr0, csr1, + shared_bitpacked_shift0, shared_bitpacked_shift1, + shared_multiplier0, shared_multiplier1, shared_multiplier2, + shared_multiplier3, shared_multiplier4, shared_multiplier5, + shared_multiplier6, shared_multiplier7, M * N, bypassSIMD); + + // Set CSR to start Streamer for conv2d + set_gemmx_streamer_start(); + + // Set CSR to start GEMM + set_gemmx_start(); + + // Poll until Streamer and GEMM accelerator finish + wait_gemmx_and_streamer(); + + // check the result of the implicit im2col convolution + if (!bypassSIMD) { + err += check_gemmx_result_D8(local_d8, D8, Batch, M, N, false); + } else { + err += + check_gemmx_result_D32(local_d32, D32, Batch, M, N, false); + } + + printf("SNAX GEMM Matmul: %s, Error: %d . bypassSIMD = %d .\r\n", + err ? 
"FAIL" : "PASS", err, bypassSIMD); + }; - return err; + return err; + } else + return 0; } diff --git a/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c b/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c index 4ca19a5c6..42e706d60 100644 --- a/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c +++ b/target/sim/sw/device/apps/snax/snax-test-integration/src/snax-test-integration.c @@ -22,7 +22,7 @@ int main() { if (snrt_cluster_idx() == 0) { if (snrt_is_dm_core()) { tcdm0_start_addr = (int8_t*)snrt_cluster_base_addrl(); - printf("The C0 TCDM ADDR is %p \n", tcdm0_start_addr); + printf("The C0 TCDM ADDR is %p \r\n", tcdm0_start_addr); } } snrt_global_barrier(); @@ -30,14 +30,14 @@ int main() { if (snrt_cluster_idx() == 1) { if (snrt_is_dm_core()) { tcdm1_start_addr = (int8_t*)snrt_cluster_base_addrl(); - printf("The C1 TCDM ADDR is %p \n", tcdm1_start_addr); + printf("The C1 TCDM ADDR is %p \r\n", tcdm1_start_addr); } } snrt_global_barrier(); // C0 Load the data from l3 -> l1 if (snrt_cluster_idx() == 0) { if (snrt_is_dm_core()) { - printf("[C0] Start to load data from %p\n", test_data); + printf("[C0] Start to load data from %p\r\n", test_data); snrt_dma_start_1d(tcdm0_start_addr, test_data, length_data); snrt_dma_wait_all(); } @@ -48,7 +48,7 @@ int main() { // Thenc C1 fetches data from C0 if (snrt_cluster_idx() == 1) { if (snrt_is_dm_core()) { - printf("[C1] Load data from C0 TCDM %p\n", tcdm0_start_addr); + printf("[C1] Load data from C0 TCDM %p\r\n", tcdm0_start_addr); snrt_dma_start_1d(tcdm1_start_addr, tcdm0_start_addr, length_data); snrt_dma_wait_all(); } @@ -59,12 +59,12 @@ int main() { // Start to check if (snrt_cluster_idx() == 0) { if (snrt_cluster_core_idx() == 0) { - printf("C0 Checking the results\n"); + printf("C0 Checking the results\r\n"); for (int i = 0; i < length_data; i++) { if (tcdm0_start_addr[i] != test_data[i]) { err++; - printf("C0 data is incorrect!\n"); - printf("tcdm0[%d]=%d, test_data[%d]=%d\n", i, + printf("C0 data is incorrect!\r\n"); + printf("tcdm0[%d]=%d, test_data[%d]=%d\r\n", i, tcdm0_start_addr[i], i, test_data[i]); return -1; } @@ -74,12 +74,12 @@ int main() { snrt_global_barrier(); if (snrt_cluster_idx() == 1) { if (snrt_cluster_core_idx() == 0) { - printf("C1 Checking the results\n"); + printf("C1 Checking the results\r\n"); for (int i = 0; i < length_data; i++) { if (tcdm1_start_addr[i] != test_data[i]) { err++; - printf("C1 data is incorrect!\n"); - printf("tcdm0[%d]=%d, test_data[%d]=%d\n", i, + printf("C1 data is incorrect!\r\n"); + printf("tcdm0[%d]=%d, test_data[%d]=%d\r\n", i, tcdm1_start_addr[i], i, test_data[i]); return -1; } @@ -90,7 +90,7 @@ int main() { snrt_global_barrier(); if (snrt_cluster_idx() == 0) { if (snrt_is_dm_core()) { - printf("Checking all done! No error!\n"); + printf("Checking all done! No error!\r\n"); } } diff --git a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/datagen.py b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/datagen.py index fccf81ebd..a3eb11fa7 100755 --- a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/datagen.py +++ b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/datagen.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 -# Copyright 2024 KU Leuven. +# Copyright 2023 KU Leuven. # Licensed under the Apache License, Version 2.0, see LICENSE for details. 
# SPDX-License-Identifier: Apache-2.0 # -# Fanchen Kong +# Xiaoling Yi import numpy as np import argparse @@ -12,19 +12,14 @@ import hjson import sys import os -import subprocess # Add data utility path sys.path.append(os.path.join(os.path.dirname(__file__), "../../../../../../../../util/sim/")) from data_utils import format_scalar_definition, format_vector_definition # noqa E402 -bender_command = subprocess.run(['bender', 'path', 'snitch_cluster'], - capture_output=True, text=True) -snax_utils_path = bender_command.stdout.strip() +# Add golden model path +from snax_utils import data_reshuffler_golden_model, max_pooling, im2col # noqa E402 -sys.path.append(snax_utils_path + "/util/sim/") - -from snax_utils import max_pooling # noqa E402 np.random.seed(42) @@ -32,79 +27,605 @@ def emit_header_file(**kwargs): emit_str = "#include \n\n" emit_str += "#include \n\n" - emit_str += emit_data(**kwargs) + emit_str += emit_data_reshuffler(**kwargs) return emit_str -def emit_data(**kwargs): - MIN = -128 - MAX = 127 - - data_str = "" - data_str += format_scalar_definition("int8_t", - "H", - kwargs["H"]) + "\n" - data_str += format_scalar_definition("int8_t", - "W", - kwargs["W"]) + "\n" - data_str += format_scalar_definition("int8_t", - "Cin", - kwargs["Cin"]) + "\n" - data_str += format_scalar_definition("int8_t", - "Kh", - kwargs["Kh"]) + "\n" - data_str += format_scalar_definition("int8_t", "Kw", kwargs["Kw"]) + "\n" - data_str += format_scalar_definition("int8_t", - "pad_h", kwargs["pad_h"]) + "\n" - data_str += format_scalar_definition("int8_t", - "pad_w", kwargs["pad_w"]) + "\n" - data_str += format_scalar_definition("int8_t", - "stride_h", kwargs["stride_h"]) + "\n" - data_str += format_scalar_definition("int8_t", - "stride_w", kwargs["stride_w"]) + "\n" - padded_h = kwargs["H"] + 2 * kwargs["pad_h"] - padded_w = kwargs["W"] + 2 * kwargs["pad_w"] - out_h = (kwargs["H"] + 2 * kwargs["pad_h"] - - kwargs["Kh"]) // kwargs["stride_h"] + 1 - out_w = (kwargs["W"] + 2 * kwargs["pad_w"] - - kwargs["Kw"]) // kwargs["stride_w"] + 1 - - data_str += format_scalar_definition("int8_t", "out_H", out_h) + "\n" - data_str += format_scalar_definition("int8_t", "out_W", out_w) + "\n" - data_str += format_scalar_definition("int8_t", "padded_H", padded_h) + "\n" - data_str += format_scalar_definition("int8_t", "padded_W", padded_w) + "\n" - - # Generating random input data vector - data_in = np.random.randint( - MIN, MAX, (1, kwargs["H"], kwargs["W"], kwargs["Cin"]) - ) - padded_data_in = np.pad( - data_in, - ( - (0, 0), - (kwargs["pad_h"], kwargs["pad_h"]), - (kwargs["pad_w"], kwargs["pad_w"]), - (0, 0), - ), - "constant", - ) - # Generating golden data - c_golden = max_pooling( - data_in, - kwargs["Kw"], - kwargs["Kh"], - kwargs["stride_w"], - kwargs["stride_h"], - kwargs["pad_w"], - kwargs["pad_h"], - "HWC", - ) - data_str += format_vector_definition("int8_t", - "padded_data_in", - padded_data_in.reshape(-1)) + "\n" - data_str += format_vector_definition("int8_t", - "golden_data_out", - c_golden.reshape(-1)) + "\n" +MIN = -128 +MAX = 127 + + +def emit_data_reshuffler(**kwargs): + data_str = [] + + assert ( + kwargs["ifMaxPool"] + kwargs["iftestIm2Col"] + kwargs["ifTestTransposer"] == 1 + ), "Only one kernel can be tested at a time" + + if kwargs["ifTestTransposer"] is True: + # Generating loop bounds settings + data_str += [ + format_scalar_definition("int32_t", "tempLoop0_in", kwargs["tempLoop0"]), + format_scalar_definition("int32_t", "tempLoop1_in", kwargs["tempLoop1"]), + format_scalar_definition("int32_t", 
"tempLoop2_in", 1), + format_scalar_definition("int32_t", "tempLoop3_in", 1), + format_scalar_definition("int32_t", "tempLoop4_in", 1), + format_scalar_definition("int32_t", "tempLoop0_out", kwargs["tempLoop0"]), + format_scalar_definition("int32_t", "tempLoop1_out", kwargs["tempLoop1"]), + format_scalar_definition("int32_t", "tempLoop2_out", 1), + format_scalar_definition( + "int32_t", + "input_data_len", + kwargs["tempLoop0"] * kwargs["tempLoop1"] * 8 * 8, + ), + format_scalar_definition( + "int32_t", + "output_data_len", + kwargs["tempLoop0"] * kwargs["tempLoop1"] * 8 * 8, + ), + ] + + # Generating temporal strides settings + data_str += [ + # data reshuffler input strides + format_scalar_definition( + "int32_t", "tempStride0_in", kwargs["tempStride0_in"] + ), + format_scalar_definition( + "int32_t", "tempStride1_in", kwargs["tempStride1_in"] + ), + format_scalar_definition("int32_t", "tempStride2_in", 0), + format_scalar_definition("int32_t", "tempStride3_in", 0), + format_scalar_definition("int32_t", "tempStride4_in", 0), + format_scalar_definition( + "int32_t", "spatialStride1_in", kwargs["spatialStride1_in"] + ), + # data reshuffler output strides + format_scalar_definition( + "int32_t", + "tempStride0_out", + kwargs["tempStride0_out"], + ), + format_scalar_definition( + "int32_t", "tempStride1_out", kwargs["tempStride1_out"] + ), + format_scalar_definition("int32_t", "tempStride2_out", 0), + format_scalar_definition( + "int32_t", "spatialStride1_out", kwargs["spatialStride1_out"] + ), + # Generating base address pointers + format_scalar_definition( + "int32_t", "delta_local_in", kwargs["delta_local_in"] + ), + format_scalar_definition( + "int32_t", "delta_local_out", kwargs["delta_local_out"] + ), + ] + + # Generating random input data vector + length_in = ( + kwargs["tempLoop0"] + * kwargs["tempLoop1"] + * kwargs["spatial_len_0"] + * kwargs["spatial_len_1"] + ) + + data_in = np.random.randint(MIN, MAX, length_in) + + op = kwargs["op"] + + # Generating golden data + # NOTE: using 4 loops to iterate through the + # input data and reshuffle the data. + # different from the hardware data reshuffler, + # the golden model uses the pure strided layout mapping equation, + # no 64 data granularity constraint, no need to transpose explicitly. 
+ if op == "rowmajor2tiledrowmajor": + c_golden = data_reshuffler_golden_model( + kwargs["tempLoop0"], + kwargs["tempLoop1"], + kwargs["spatial_len_0"], + kwargs["spatial_len_1"], + kwargs["tempStride0_in"], + kwargs["tempStride1_in"], + 1, + kwargs["spatialStride1_in"], + data_in, + ) + + if op == "rowmajor2tiledcolmajor": + c_golden = data_reshuffler_golden_model( + kwargs["tempLoop0"], + kwargs["tempLoop1"], + kwargs["spatial_len_0"], + kwargs["spatial_len_1"], + kwargs["tempStride0_in"], + kwargs["tempStride1_in"], + kwargs["tempLoop0"] * 8, + 1, + data_in, + ) + + if op == "tiledrowmajor2tiledcolmajor": + c_golden = data_reshuffler_golden_model( + kwargs["tempLoop0"], + kwargs["tempLoop1"], + kwargs["spatial_len_0"], + kwargs["spatial_len_1"], + kwargs["tempStride0_in"], + kwargs["tempStride1_in"], + 8, + 1, + data_in, + ) + + # Generating transpose flag for the data reshuffler hardware + if op == "rowmajor2tiledrowmajor": + transpose = 0 + elif op == "rowmajor2tiledcolmajor": + transpose = 1 + elif op == "tiledrowmajor2tiledcolmajor": + transpose = 1 + else: + print("Invalid operation") + + # set transpose or not + data_str += [ + format_scalar_definition( + "int", "TloopLen", kwargs["tempLoop0"] * kwargs["tempLoop1"] + ) + ] + data_str += [format_scalar_definition("int", "reduceLen", 1)] + data_str += [format_scalar_definition("int", "opcode", transpose)] + + # Writing testing data and golden data into data.h + data_str += [format_vector_definition("int8_t", "DataIn", data_in)] + data_str += [format_vector_definition("int8_t", "C_golden", c_golden)] + + elif kwargs["iftestIm2Col"] is True: + assert ( + kwargs["ifC8HW8datalayout"] is True + ), "Only C8HW8 data layout is supported for im2col testing" + + # Generating layer settings + Nbatch = kwargs["Nbatch"] + Cin8 = kwargs["Cin"] // 8 + H = kwargs["H"] + W = kwargs["W"] + Kh = kwargs["Kh"] + Kw = kwargs["Kw"] + stride_h, stride_w = (kwargs["stride_h"], kwargs["stride_w"]) + pad_h, pad_w = (kwargs["pad_h"], kwargs["pad_w"]) + + # make sure the output width is multiple of 8 + if W // stride_w % 8 != 0: + W = W + (stride_w * (8 - (W // stride_w) % 8)) % (stride_w * 8) + + # generate random input and kernel data + input_data = np.random.randint(-10, 10, size=(Nbatch, Cin8, H, W, 8)) + kernel = np.random.randint(-10, 10, size=(1, Cin8, Kh, Kw, 8, 8)) + + # Padding the input data + input_padding = np.pad( + input_data, + ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)), + mode="constant", + ) + + # Calculate the size of the output feature map + out_height = (H + 2 * pad_h - Kh) // stride_h + 1 + out_width = (W + 2 * pad_w - Kw) // stride_w + 1 + + assert out_width % 8 == 0, "out_width must be multiple of 8" + + tempLoop0_in = Kw + tempLoop1_in = Kh + tempLoop2_in = Cin8 + tempLoop3_in = out_width // 8 + tempLoop4_in = out_height + + spatialStride1_in = 8 * stride_w + + tempStride0_in = 8 + tempStride1_in = 8 * (W + 2 * pad_w) + tempStride2_in = 8 * (W + 2 * pad_w) * (H + 2 * pad_h) + tempStride3_in = 8 * 8 * stride_w + tempStride4_in = 8 * (W + 2 * pad_w) * stride_h + + tempLoop0_out = Cin8 * Kw * Kh + tempLoop1_out = out_width // 8 * out_height + tempLoop2_out = 1 + + spatialStride1_out = 8 + tempStride0_out = 8 * 8 + tempStride1_out = 8 * 8 * Cin8 * Kw * Kh + tempStride2_out = 0 + + assert ( + tempLoop0_in * tempLoop1_in * tempLoop2_in * tempLoop3_in * tempLoop4_in + == tempLoop0_out * tempLoop1_out * tempLoop2_out + ) + + input_data_len = input_padding.size + + explicit_im2col, _ = im2col( + input_data, kernel, 
stride=(stride_h, stride_w), padding=(pad_h, pad_w) + ) + output_data_len = explicit_im2col.size + + delta_local_in = 0 + delta_local_out = input_data_len + + data_str += [ + format_scalar_definition("int32_t", "tempLoop0_in", tempLoop0_in), + format_scalar_definition("int32_t", "tempLoop1_in", tempLoop1_in), + format_scalar_definition("int32_t", "tempLoop2_in", tempLoop2_in), + format_scalar_definition("int32_t", "tempLoop3_in", tempLoop3_in), + format_scalar_definition("int32_t", "tempLoop4_in", tempLoop4_in), + format_scalar_definition("int32_t", "tempLoop0_out", tempLoop0_out), + format_scalar_definition("int32_t", "tempLoop1_out", tempLoop1_out), + format_scalar_definition("int32_t", "tempLoop2_out", tempLoop2_out), + format_scalar_definition("int32_t", "input_data_len", input_data_len), + format_scalar_definition("int32_t", "output_data_len", output_data_len), + format_scalar_definition("int32_t", "spatialStride1_in", spatialStride1_in), + format_scalar_definition("int32_t", "tempStride0_in", tempStride0_in), + format_scalar_definition("int32_t", "tempStride1_in", tempStride1_in), + format_scalar_definition("int32_t", "tempStride2_in", tempStride2_in), + format_scalar_definition("int32_t", "tempStride3_in", tempStride3_in), + format_scalar_definition("int32_t", "tempStride4_in", tempStride4_in), + format_scalar_definition( + "int32_t", "spatialStride1_out", spatialStride1_out + ), + format_scalar_definition("int32_t", "tempStride0_out", tempStride0_out), + format_scalar_definition("int32_t", "tempStride1_out", tempStride1_out), + format_scalar_definition("int32_t", "tempStride2_out", tempStride2_out), + format_scalar_definition("int32_t", "delta_local_in", delta_local_in), + format_scalar_definition("int32_t", "delta_local_out", delta_local_out), + format_vector_definition("int8_t", "DataIn", input_padding.reshape(-1)), + format_vector_definition("int8_t", "C_golden", explicit_im2col.reshape(-1)), + ] + + TloopLen = ( + tempLoop0_in * tempLoop1_in * tempLoop2_in * tempLoop3_in * tempLoop4_in + ) + reduceLen = 1 + opcode = 0 + + data_str += [ + format_scalar_definition("int", "TloopLen", TloopLen), + format_scalar_definition("int", "reduceLen", reduceLen), + format_scalar_definition("int", "opcode", opcode), + ] + + # max pooling then + elif kwargs["ifC8HW8datalayout"] is True: + # data layout, C8HW8 + # Generating loop bounds settings + padded_input_tensor_w = kwargs["W"] + kwargs["pad_w"] * 2 + padded_input_tensor_h = kwargs["H"] + kwargs["pad_h"] * 2 + + padded_output_tensor_w = ( + kwargs["W"] + kwargs["pad_w"] * 2 - kwargs["Kw"] + ) // kwargs["stride_w"] + 1 + padded_output_tensor_h = ( + kwargs["H"] + kwargs["pad_h"] * 2 - kwargs["Kh"] + ) // kwargs["stride_h"] + 1 + + input_data_len = padded_input_tensor_w * padded_input_tensor_h * kwargs["Cin"] + output_data_len = ( + padded_output_tensor_w * padded_output_tensor_h * kwargs["Cin"] + ) + + assert padded_output_tensor_w % 8 == 0 + assert kwargs["Cin"] % 8 == 0 + + assert ( + input_data_len + output_data_len < 128 * 1024 + ), "Data size too large for 128 KB TCDM" + + data_str += [ + format_scalar_definition("int32_t", "input_data_len", input_data_len), + # input data reshuffler loop bounds settings + format_scalar_definition("int32_t", "tempLoop0_in", kwargs["Kw"]), + format_scalar_definition("int32_t", "tempLoop1_in", kwargs["Kh"]), + format_scalar_definition( + "int32_t", "tempLoop2_in", padded_output_tensor_w // 8 + ), + format_scalar_definition("int32_t", "tempLoop3_in", padded_output_tensor_h), + 
format_scalar_definition("int32_t", "tempLoop4_in", kwargs["Cin"] // 8), + ] + + assert padded_output_tensor_w % 8 == 0 + + # data reshuffler input strides + spatialStride1_in = kwargs["stride_w"] * 8 + tempStride0_in = 8 + tempStride1_in = padded_input_tensor_w * 8 + tempStride2_in = 8 * 8 * kwargs["stride_w"] + tempStride3_in = padded_input_tensor_w * 8 * kwargs["stride_h"] + tempStride4_in = padded_input_tensor_w * padded_input_tensor_h * 8 + data_str += [ + format_scalar_definition("int32_t", "delta_local_in", 0), + format_scalar_definition("int32_t", "spatialStride1_in", spatialStride1_in), + format_scalar_definition("int32_t", "tempStride0_in", tempStride0_in), + format_scalar_definition("int32_t", "tempStride1_in", tempStride1_in), + format_scalar_definition("int32_t", "tempStride2_in", tempStride2_in), + format_scalar_definition( + "int32_t", + "tempStride3_in", + tempStride3_in, + ), + format_scalar_definition( + "int32_t", + "tempStride4_in", + tempStride4_in, + ), + ] + + data_str += [ + # output data reshuffler loop bounds settings + format_scalar_definition( + "int32_t", "tempLoop0_out", padded_output_tensor_w // 8 + ), + format_scalar_definition( + "int32_t", "tempLoop1_out", padded_output_tensor_h + ), + format_scalar_definition("int32_t", "tempLoop2_out", kwargs["Cin"] // 8), + # data length setting + format_scalar_definition("int32_t", "output_data_len", output_data_len), + ] + + # data reshuffler output strides + delta_local_out = padded_input_tensor_h * padded_input_tensor_w * kwargs["Cin"] + spatialStride1_out = 8 + tempStride0_out = 8 * 8 + tempStride1_out = padded_output_tensor_w * 8 + tempStride2_out = padded_output_tensor_w * padded_output_tensor_h * 8 + data_str += [ + # Generating base address pointers + format_scalar_definition( + "int32_t", + "delta_local_out", + delta_local_out, + ), + format_scalar_definition( + "int32_t", "spatialStride1_out", spatialStride1_out + ), + format_scalar_definition( + "int32_t", + "tempStride0_out", + tempStride0_out, + ), + format_scalar_definition("int32_t", "tempStride1_out", tempStride1_out), + format_scalar_definition( + "int32_t", + "tempStride2_out", + tempStride2_out, + ), + ] + + assert delta_local_out % 8 == 0 + assert tempStride0_in % 8 == 0 + assert tempStride1_in % 8 == 0 + assert tempStride2_in % 8 == 0 + assert tempStride3_in % 8 == 0 + assert tempStride4_in % 8 == 0 + assert tempStride0_out % 8 == 0 + assert tempStride1_out % 8 == 0 + assert tempStride2_out % 8 == 0 + + # Generating random input data vector + data_in = np.random.randint( + MIN, MAX, (kwargs["Cin"] // 8, kwargs["H"], kwargs["W"], 8) + ) + + # Generating golden data + c_golden = max_pooling( + data_in, + kwargs["Kw"], + kwargs["Kh"], + kwargs["stride_w"], + kwargs["stride_h"], + kwargs["pad_w"], + kwargs["pad_h"], + "C8HW8", + ) + + padded_data_in = np.pad( + data_in, + ( + (0, 0), + (kwargs["pad_h"], kwargs["pad_h"]), + (kwargs["pad_w"], kwargs["pad_w"]), + (0, 0), + ), + "constant", + ) + + # datapath setting + # set opcode + data_str += [format_scalar_definition("int", "opcode", 2)] + # set TloopLen and reduceLen + data_str += [ + format_scalar_definition( + "int32_t", + "TloopLen", + padded_output_tensor_w + * padded_output_tensor_h + * kwargs["Cin"] + // 8 + // 8, + ), + format_scalar_definition( + "int32_t", "reduceLen", kwargs["Kw"] * kwargs["Kh"] + ), + ] + + # Writing testing data and golden data into data.h + assert padded_data_in.shape == ( + kwargs["Cin"] // 8, + padded_input_tensor_h, + padded_input_tensor_w, + 8, + ) + assert 
padded_data_in.reshape(-1).shape[0] == input_data_len + data_str += [ + format_vector_definition("int8_t", "DataIn", padded_data_in.reshape(-1)) + ] + + assert c_golden.shape == ( + kwargs["Cin"] // 8, + padded_output_tensor_h, + padded_output_tensor_w, + 8, + ) + assert c_golden.reshape(-1).shape[0] == output_data_len + + data_str += [ + format_vector_definition("int8_t", "C_golden", c_golden.reshape(-1)) + ] + + else: + # data layout HWCin + # Generating loop bounds settings + padded_input_tensor_w = kwargs["W"] + kwargs["pad_w"] * 2 + padded_input_tensor_h = kwargs["H"] + kwargs["pad_h"] * 2 + + padded_output_tensor_w = ( + kwargs["W"] + kwargs["pad_w"] * 2 - kwargs["Kw"] + ) // kwargs["stride_w"] + 1 + padded_output_tensor_h = ( + kwargs["H"] + kwargs["pad_h"] * 2 - kwargs["Kh"] + ) // kwargs["stride_h"] + 1 + + input_data_len = padded_input_tensor_w * padded_input_tensor_h * kwargs["Cin"] + output_data_len = ( + padded_output_tensor_w * padded_output_tensor_h * kwargs["Cin"] + ) + + assert padded_output_tensor_w == kwargs["W"] + assert padded_output_tensor_h == kwargs["H"] + + data_str += [ + # input data reshuffler loop bounds settings + format_scalar_definition("int32_t", "tempLoop0_in", kwargs["Kw"]), + format_scalar_definition("int32_t", "tempLoop1_in", kwargs["Kh"]), + format_scalar_definition("int32_t", "tempLoop2_in", kwargs["Cin"] // 8), + format_scalar_definition( + "int32_t", "tempLoop3_in", padded_output_tensor_w // 8 + ), + format_scalar_definition("int32_t", "tempLoop4_in", padded_output_tensor_h), + # output data reshuffler loop bounds settings + format_scalar_definition("int32_t", "tempLoop0_out", kwargs["Cin"] // 8), + format_scalar_definition( + "int32_t", "tempLoop1_out", padded_output_tensor_w // 8 + ), + format_scalar_definition( + "int32_t", "tempLoop2_out", padded_output_tensor_h + ), + # data length setting + format_scalar_definition("int32_t", "input_data_len", input_data_len), + format_scalar_definition("int32_t", "output_data_len", output_data_len), + format_scalar_definition( + "int32_t", + "TloopLen", + padded_output_tensor_w + * padded_output_tensor_h + * kwargs["Cin"] + // 8 + // 8, + ), + format_scalar_definition( + "int32_t", "reduceLen", kwargs["Kw"] * kwargs["Kh"] + ), + ] + + data_str += [ + # data reshuffler input strides + format_scalar_definition("int32_t", "spatialStride1_in", kwargs["Cin"]), + format_scalar_definition( + "int32_t", "tempStride0_in", kwargs["stride_w"] * kwargs["Cin"] + ), + format_scalar_definition( + "int32_t", "tempStride1_in", padded_input_tensor_w * kwargs["Cin"] + ), + format_scalar_definition("int32_t", "tempStride2_in", 8), + format_scalar_definition("int32_t", "tempStride3_in", 8 * kwargs["Cin"]), + format_scalar_definition( + "int32_t", "tempStride4_in", padded_input_tensor_w * kwargs["Cin"] + ), + # data reshuffler output strides + format_scalar_definition("int32_t", "spatialStride1_out", kwargs["Cin"]), + format_scalar_definition("int32_t", "tempStride0_out", 8), + format_scalar_definition("int32_t", "tempStride1_out", 8 * kwargs["Cin"]), + format_scalar_definition( + "int32_t", "tempStride2_out", padded_output_tensor_w * kwargs["Cin"] + ), + # Generating base address pointers + format_scalar_definition("int32_t", "delta_local_in", 0), + format_scalar_definition( + "int32_t", + "delta_local_out", + padded_input_tensor_h * padded_input_tensor_w * kwargs["Cin"], + ), + ] + + # Generating random input data vector + data_in = np.random.randint( + MIN, MAX, (1, kwargs["H"], kwargs["W"], kwargs["Cin"]) + ) + + # Generating 
golden data
+        c_golden = max_pooling(
+            data_in,
+            kwargs["Kw"],
+            kwargs["Kh"],
+            kwargs["stride_w"],
+            kwargs["stride_h"],
+            kwargs["pad_w"],
+            kwargs["pad_h"],
+            "HWC",
+        )
+
+        padded_data_in = np.pad(
+            data_in,
+            (
+                (0, 0),
+                (kwargs["pad_h"], kwargs["pad_h"]),
+                (kwargs["pad_w"], kwargs["pad_w"]),
+                (0, 0),
+            ),
+            "constant",
+        )
+
+        # set opcode
+        data_str += [format_scalar_definition("int", "opcode", 2)]
+
+        # Writing testing data and golden data into data.h
+        assert padded_data_in.shape == (
+            1,
+            padded_input_tensor_h,
+            padded_input_tensor_w,
+            kwargs["Cin"],
+        )
+        assert padded_data_in.reshape(-1).shape[0] == input_data_len
+        data_str += [
+            format_vector_definition("int8_t", "DataIn", padded_data_in.reshape(-1))
+        ]
+
+        assert c_golden.shape == (
+            1,
+            padded_output_tensor_h,
+            padded_output_tensor_w,
+            kwargs["Cin"],
+        )
+        assert c_golden.reshape(-1).shape[0] == output_data_len
+
+        data_str += [
+            format_vector_definition("int8_t", "C_golden", c_golden.reshape(-1))
+        ]
+
+    data_str = "\n\n".join(data_str)
     return data_str
diff --git a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/params.hjson b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/params.hjson
index 59df2f731..a608493ac 100644
--- a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/params.hjson
+++ b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/data/params.hjson
@@ -2,15 +2,38 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 //
-// Fanchen Kong
+// Xiaoling Yi
+
 {
-    H: 32,
-    W: 32,
+    ifMaxPool: true,
+    iftestIm2Col: false,
+    ifTestTransposer: false,
+
+    // parameters for maxpool
+    ifC8HW8datalayout: true,
+    Nbatch: 1,
+    H: 16,
+    W: 8,
     Cin: 8,
     Kh: 3,
     Kw: 3,
     pad_h: 1,
     pad_w: 1,
     stride_h: 1,
-    stride_w: 1
-}
\ No newline at end of file
+    stride_w: 1,
+
+    // parameters for data layout reshuffling
+    op: 'rowmajor2tiledrowmajor',
+    tempLoop0: 8,
+    tempLoop1: 8,
+    spatialStride1_in: 64,
+    tempStride0_in: 8,
+    tempStride1_in: 512,
+    spatialStride1_out: 8,
+    tempStride0_out: 64,
+    tempStride1_out: 512,
+    delta_local_in: 0,
+    delta_local_out: 4096,
+    spatial_len_0: 8,
+    spatial_len_1: 8
+}
diff --git a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/src/snax-xdma-maxpool.c b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/src/snax-xdma-maxpool.c
index 5ca5743e4..29c2ee19b 100644
--- a/target/sim/sw/device/apps/snax/snax-xdma-maxpool/src/snax-xdma-maxpool.c
+++ b/target/sim/sw/device/apps/snax/snax-xdma-maxpool/src/snax-xdma-maxpool.c
@@ -9,98 +9,95 @@
 #include "snrt.h"
 
 int main() {
-    // Set err value for checking
-    int err = 0;
-    // Obtain the start address of the TCDM memory
-    uint32_t dma_load_input_start;
-    uint32_t dma_load_input_end;
-    uint32_t *tcdm_baseaddress = (uint32_t *)snrt_l1_next();
-    // Put the input at the starting of tcdm
-    uint8_t *tcdm_in = tcdm_baseaddress;
-    // Put the output at the middle of tcdm
-    uint8_t *tcdm_out = tcdm_in + 0x10000 * sizeof(uint8_t);
+    if (snrt_cluster_idx() == 1) {
+        // Set err value for checking
+        int err = 0;
+        // Obtain the start address of the TCDM memory
+        uint32_t dma_load_input_start;
+        uint32_t dma_load_input_end;
+        uint32_t tcdm_baseaddress = snrt_cluster_base_addrl();
+        // Put the input at the starting of tcdm
+        uint8_t *tcdm_in = (uint8_t *)tcdm_baseaddress;
+        // Put the output at the middle of tcdm
+        uint8_t *tcdm_out = (uint8_t *)(tcdm_baseaddress + delta_local_out);
 
-    if (snrt_is_dm_core()) {
-        // The xdma core is the last compute core in the 
cluster - uint32_t sstride_src[1] = {8}; - uint32_t sstride_dst[1] = {8}; - uint32_t tstride_src[2] = {8, 512}; - uint32_t tbound_src[2] = {3, 3}; + if (snrt_is_dm_core()) { + // The xdma core is the last compute core in the cluster + uint32_t sstride_src[1] = {0}; + uint32_t sstride_dst[1] = {0}; + uint32_t tstride_src[5] = {0}; + uint32_t tbound_src[5] = {0}; + uint32_t tstride_dst[3] = {0}; + uint32_t tbound_dst[3] = {0}; - // First we need to transfer the input data from L3->TCDM - // Here we use the 2d iDMA transfer - dma_load_input_start = snrt_mcycle(); - snrt_dma_start_2d( - tcdm_in, padded_data_in, padded_W * Cin * sizeof(uint8_t), - 512 * sizeof(uint8_t), padded_W * Cin * sizeof(uint8_t), - padded_H * sizeof(uint8_t)); - snrt_dma_wait_all(); - dma_load_input_end = snrt_mcycle(); + // Load the CFG from data.h + sstride_src[0] = spatialStride1_in; + sstride_dst[0] = spatialStride1_out; + tstride_src[0] = tempStride0_in; + tstride_src[1] = tempStride1_in; + tstride_src[2] = tempStride2_in; + tstride_src[3] = tempStride3_in; + tstride_src[4] = tempStride4_in; + tbound_src[0] = tempLoop0_in; + tbound_src[1] = tempLoop1_in; + tbound_src[2] = tempLoop2_in; + tbound_src[3] = tempLoop3_in; + tbound_src[4] = tempLoop4_in; + tstride_dst[0] = tempStride0_out; + tstride_dst[1] = tempStride1_out; + tstride_dst[2] = tempStride2_out; + tbound_dst[0] = tempLoop0_out; + tbound_dst[1] = tempLoop1_out; + tbound_dst[2] = tempLoop2_out; - // --------------------- Configure the Ext --------------------- // + // First we need to transfer the input data from L3->TCDM + snrt_dma_start_1d(tcdm_in, DataIn, input_data_len * sizeof(int8_t)); + snrt_dma_wait_all(); - // There are three extensions in xdma - // VerilogMemset, Maxpool, Transposer - // 0 , 1 , 2 - // We want to only use Maxpool - // Hence we need to disable the 0 and 2 - // and we set the maxpool csr to 9 since we need 3x3 pooling - if (xdma_disable_dst_ext(0) != 0) { - printf("Error in disabling xdma extension 0 \r\n"); - err++; - } else { - printf("The xdma extension 0 is disabled \r\n"); - } + // --------------------- Configure the Ext --------------------- // - uint32_t ext_param_maxpool_size[1] = {9}; - if (xdma_enable_dst_ext(1, ext_param_maxpool_size) != 0) { - printf("Error in enabling xdma extension 1 \r\n"); - err++; - } else { - printf("The xdma extension 1 is enabled \r\n"); - } + if (xdma_disable_dst_ext(0) != 0) { + printf("Error in disabling xdma extension 0\r\n"); + err++; + } else { + printf("The xdma extension 0 is disabled\r\n"); + } - if (xdma_disable_dst_ext(2) != 0) { - printf("Error in disabling xdma extension 2 \r\n"); - err++; - } else { - printf("The xdma extension 2 is disabled \r\n"); - } + uint32_t ext_param_maxpool_size[1] = {reduceLen}; + if (xdma_enable_dst_ext(1, ext_param_maxpool_size) != 0) { + printf("Error in enabling xdma extension 1\r\n"); + err++; + } else { + printf("The xdma extension 1 is enabled\r\n"); + } - // --------------------- Configure the AGU --------------------- // - uint8_t *local_src_pointer; - uint8_t *local_dst_pointer; - int task_id; - for (int i = 0; i < out_H; i++) { - for (int j = 0; j < out_W / 8; j++) { - local_src_pointer = tcdm_in + j * 64 + i * 512; - local_dst_pointer = tcdm_out + j * 64 + i * 256; - if (xdma_memcpy_nd(local_src_pointer, local_dst_pointer, - sstride_src, sstride_dst, 2, tstride_src, - tbound_src, 0, NULL, NULL, 0xFFFFFFFF, - 0xFFFFFFFF, 0xFFFFFFFF) != 0) { - printf("Error in xdma agu configuration \r\n"); - err++; - } else { - printf("The xdma agu is configured 
\r\n"); - } - int task_id = xdma_start(); - xdma_wait(task_id); - printf("i = %d, j = %d is done \r\n", i, j); + if (xdma_disable_dst_ext(2) != 0) { + printf("Error in disabling xdma extension 2\r\n"); + err++; + } else { + printf("The xdma extension 2 is disabled\r\n"); } - } - // --------------------- Checking the Results --------------------- // - printf("Checking the results \r\n"); - for (int i = 0; i < out_H * out_W * Cin; i++) { - if ((int8_t)tcdm_out[i] != golden_data_out[i]) { - printf("The maxpool is incorrect! \r\n"); - printf("tcdm_out[%d]=%d, golden_data_out[%d]=%d", i, - (int8_t)tcdm_out[i], i, golden_data_out[i]); + // --------------------- Configure the AGU --------------------- // + xdma_memcpy_nd(tcdm_in, tcdm_out, sstride_src, sstride_dst, 5, + tstride_src, tbound_src, 3, tstride_dst, tbound_dst, + 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF); + int task_id = xdma_start(); + xdma_wait(task_id); + + // --------------------- Checking the Results --------------------- + // // + for (int i = 0; i < output_data_len; i++) { + if ((int8_t)tcdm_out[i] != C_golden[i]) { + printf("The maxpool is incorrect!\r\n"); + printf("tcdm_out[%d]=%d, C_golden[%d]=%d", i, + (int8_t)tcdm_out[i], i, C_golden[i]); + } } + printf("Checking is done. All values are right\r\n"); } - printf("Checking is done. All values are right \r\n"); - } - return 0; -} \ No newline at end of file + return 0; + } else + return 0; +} diff --git a/target/sim/sw/device/apps/snax/snax-xdma-memset/src/snax-xdma-memset.c b/target/sim/sw/device/apps/snax/snax-xdma-memset/src/snax-xdma-memset.c index 83d246edd..a6d33972f 100644 --- a/target/sim/sw/device/apps/snax/snax-xdma-memset/src/snax-xdma-memset.c +++ b/target/sim/sw/device/apps/snax/snax-xdma-memset/src/snax-xdma-memset.c @@ -9,192 +9,201 @@ #include "snrt.h" int main() { - // Set err value for checking - int err = 0; - - // Obtain the start address of the TCDM memory - uint8_t *tcdm_baseaddress = (uint8_t *)snrt_l1_next(); - uint8_t *tcdm_0 = tcdm_baseaddress; - uint8_t *tcdm_16 = tcdm_baseaddress + 0x4000 * sizeof(uint8_t); - uint8_t *tcdm_32 = tcdm_baseaddress + 0x8000 * sizeof(uint8_t); - uint8_t *tcdm_48 = tcdm_baseaddress + 0xc000 * sizeof(uint8_t); - uint8_t *tcdm_64 = tcdm_baseaddress + 0x10000 * sizeof(uint8_t); - uint8_t *tcdm_80 = tcdm_baseaddress + 0x14000 * sizeof(uint8_t); - uint8_t *tcdm_96 = tcdm_baseaddress + 0x18000 * sizeof(uint8_t); - uint8_t *tcdm_112 = tcdm_baseaddress + 0x1c000 * sizeof(uint8_t); - - // Using xdma core only - if (snrt_is_dm_core()) { - // The xdma core is the last compute core in the cluster - - // Test 1: Setting the 0-16KB region to 0xFF - printf("Core %d is xdma core. 
\r\n", snrt_cluster_core_idx()); - printf("Test 1: Setting the 0-16KB region to 0xFF \r\n"); - if (xdma_memcpy_1d(tcdm_0, tcdm_0, 0x4000 * sizeof(uint8_t)) != 0) { - printf("Error in xdma agu configuration \r\n"); - err++; - } else { - printf("The xdma agu is configured \r\n"); - } - - uint32_t ext_param_t1[1] = {0xFFFFFFFF}; - if (xdma_enable_dst_ext(0, ext_param_t1) != 0) { - printf("Error in enabling xdma extension 0 \r\n"); - err++; - } else { - printf("The xdma extension 0 is enabled \r\n"); - } - - if (xdma_disable_dst_ext(1) != 0) { - printf("Error in disabling xdma extension 1 \r\n"); - err++; - } else { - printf("The xdma extension 1 is disabled \r\n"); - } + if (snrt_cluster_idx() == 1) { // Set err value for checking + // Set err value for checking + int err = 0; + // Obtain the start address of the TCDM memory + uint8_t *tcdm_baseaddress = (uint8_t *)snrt_l1_next(); + uint8_t *tcdm_0 = tcdm_baseaddress; + uint8_t *tcdm_16 = tcdm_baseaddress + 0x4000 * sizeof(uint8_t); + uint8_t *tcdm_32 = tcdm_baseaddress + 0x8000 * sizeof(uint8_t); + uint8_t *tcdm_48 = tcdm_baseaddress + 0xc000 * sizeof(uint8_t); + uint8_t *tcdm_64 = tcdm_baseaddress + 0x10000 * sizeof(uint8_t); + uint8_t *tcdm_80 = tcdm_baseaddress + 0x14000 * sizeof(uint8_t); + uint8_t *tcdm_96 = tcdm_baseaddress + 0x18000 * sizeof(uint8_t); + uint8_t *tcdm_112 = tcdm_baseaddress + 0x1c000 * sizeof(uint8_t); + + // Using xdma core only + if (snrt_is_dm_core()) { + // The xdma core is the last compute core in the cluster + + // Test 1: Setting the 0-16KB region to 0xFF + printf("Core %d is xdma core. \r\n", snrt_cluster_core_idx()); + printf("Test 1: Setting the 0-16KB region to 0xFF \r\n"); + if (xdma_memcpy_1d(tcdm_0, tcdm_0, 0x4000 * sizeof(uint8_t)) != 0) { + printf("Error in xdma agu configuration \r\n"); + err++; + } else { + printf("The xdma agu is configured \r\n"); + } - if (xdma_disable_dst_ext(2) != 0) { - printf("Error in disabling xdma extension 2 \r\n"); - err++; - } else { - printf("The xdma extension 2 is disabled \r\n"); - } + uint32_t ext_param_t1[1] = {0xFFFFFFFF}; + if (xdma_enable_dst_ext(0, ext_param_t1) != 0) { + printf("Error in enabling xdma extension 0 \r\n"); + err++; + } else { + printf("The xdma extension 0 is enabled \r\n"); + } - if (err != 0) { - return err; - } + if (xdma_disable_dst_ext(1) != 0) { + printf("Error in disabling xdma extension 1 \r\n"); + err++; + } else { + printf("The xdma extension 1 is disabled \r\n"); + } - int task_id = xdma_start(); - printf( - "The xdma is started, setting memory region to 0xFF. The task id " - "is %d \r\n", - task_id); - xdma_wait(task_id); + if (xdma_disable_dst_ext(2) != 0) { + printf("Error in disabling xdma extension 2 \r\n"); + err++; + } else { + printf("The xdma extension 2 is disabled \r\n"); + } - printf("The xdma is finished \r\n"); - // Check the data - for (int i = 0; i < 0x4000; i++) { - if (tcdm_0[i] != 0xFF) { - printf("The memset of 0KB - 16KB is not correct \r\n"); - return -1; + if (err != 0) { + return err; } - } - printf("The memset of 0KB - 16KB is correct \r\n"); - - // Test 2: Setting the 4K-12K region back to 0. Instead of using the - // memset, this test do this by disabling all the readers. 
-        printf(
-            "Test 2: Setting the 4K-12K region back to 0 by disabling all "
-            "reader channels \r\n");
-        uint32_t sstride_src_t2[1] = {0};
-        uint32_t tstride_src_t2[1] = {64};
-        uint32_t sstride_dst_t2[1] = {8};
-        uint32_t tstride_dst_t2[1] = {64};
-        uint32_t tbound_src_t2[1] = {128};
-        uint32_t tbound_dst_t2[1] = {128};
-
-        if (xdma_memcpy_nd(tcdm_0, tcdm_0 + 0x1000 * sizeof(uint8_t),
-                           sstride_src_t2, sstride_dst_t2, 1, tstride_src_t2,
-                           tbound_src_t2, 1, tstride_dst_t2, tbound_dst_t2, 0x0,
-                           0xffffffff, 0xffffffff) != 0) {
-            printf("Error in xdma agu configuration \r\n");
-            err++;
-        } else {
-            printf("The xdma agu is configured \r\n");
-        }
-        if (xdma_disable_dst_ext(0) != 0) {
-            printf("Error in enabling xdma extension 0 \r\n");
-            err++;
-        } else {
-            printf("The xdma extension 0 is disabled \r\n");
-        }
+            int task_id = xdma_start();
+            printf(
+                "The xdma is started, setting memory region to 0xFF. The task "
+                "id "
+                "is %d \r\n",
+                task_id);
+            xdma_wait(task_id);
+
+            printf("The xdma is finished \r\n");
+            // Check the data
+            for (int i = 0; i < 0x4000; i++) {
+                if (tcdm_0[i] != 0xFF) {
+                    printf("The memset of 0KB - 16KB is not correct \r\n");
+                    return -1;
+                }
+            }
+            printf("The memset of 0KB - 16KB is correct \r\n");
+
+            // Test 2: Setting the 4K-12K region back to 0. Instead of using the
+            // memset, this test does this by disabling all the readers.
+            printf(
+                "Test 2: Setting the 4K-12K region back to 0 by disabling all "
+                "reader channels \r\n");
+            uint32_t sstride_src_t2[1] = {0};
+            uint32_t tstride_src_t2[1] = {64};
+            uint32_t sstride_dst_t2[1] = {8};
+            uint32_t tstride_dst_t2[1] = {64};
+            uint32_t tbound_src_t2[1] = {128};
+            uint32_t tbound_dst_t2[1] = {128};
+
+            if (xdma_memcpy_nd(tcdm_0, tcdm_0 + 0x1000 * sizeof(uint8_t),
+                               sstride_src_t2, sstride_dst_t2, 1,
+                               tstride_src_t2, tbound_src_t2, 1, tstride_dst_t2,
+                               tbound_dst_t2, 0x0, 0xffffffff,
+                               0xffffffff) != 0) {
+                printf("Error in xdma agu configuration \r\n");
+                err++;
+            } else {
+                printf("The xdma agu is configured \r\n");
+            }
-        if (err != 0) {
-            return err;
-        }
+            if (xdma_disable_dst_ext(0) != 0) {
+                printf("Error in disabling xdma extension 0 \r\n");
+                err++;
+            } else {
+                printf("The xdma extension 0 is disabled \r\n");
+            }
-        task_id = xdma_start();
-        printf(
-            "The xdma is started, setting memory region to 0x00. The task id "
-            "is %d \r\n",
-            task_id);
-        xdma_wait(task_id);
+            if (err != 0) {
+                return err;
+            }
-        printf("The xdma is finished \r\n");
-        // Check the data
-        for (int i = 0; i < 0x1000; i++) {
-            if (tcdm_0[i] != 0xFF) {
-                printf("Error in memset (region 0) \r\n");
-                return -1;
+            task_id = xdma_start();
+            printf(
+                "The xdma is started, setting memory region to 0x00. The task "
+                "id "
+                "is %d \r\n",
+                task_id);
The task " + "id " + "is %d \r\n", + task_id); + xdma_wait(task_id); + + printf("The xdma is finished \r\n"); + // Check the data + for (int i = 0; i < 0x1000; i++) { + if (tcdm_0[i] != 0xFF) { + printf("Error in memset (region 0) \r\n"); + return -1; + } } - } - for (int i = 0x1000; i < 0x3000; i++) { - if (tcdm_0[i] != 0x00) { - printf("The memset is incorrect (region 1) \r\n"); - return -1; + for (int i = 0x1000; i < 0x3000; i++) { + if (tcdm_0[i] != 0x00) { + printf("The memset is incorrect (region 1) \r\n"); + return -1; + } } - } - for (int i = 0x3000; i < 0x4000; i++) { - if (tcdm_0[i] != 0xFF) { - printf("The memset is incorrect (region 2) \r\n"); - return -1; + for (int i = 0x3000; i < 0x4000; i++) { + if (tcdm_0[i] != 0xFF) { + printf("The memset is incorrect (region 2) \r\n"); + return -1; + } + } + printf("The memset of 4KB - 12KB is correct \r\n"); + + // Test 3: Setting the 4-12KB region to 0x0000000000000001 (uint64_t + // 1) This test is to validate the byte mask by shielding all other + // bits, so only LSB 8 bits are set. + printf( + "Test 3: Setting the 4-12KB region to 0x0000000000000001 " + "(uint64_t " + "1) \r\n"); + uint32_t sstride_src_t3[1] = {8}; + uint32_t sstride_dst_t3[1] = {8}; + uint32_t tstride_src_t3[1] = {64}; + uint32_t tstride_dst_t3[1] = {64}; + uint32_t tbound_src_t3[1] = {128}; + uint32_t tbound_dst_t3[1] = {128}; + if (xdma_memcpy_nd(tcdm_0, tcdm_0 + 0x1000 * sizeof(uint8_t), + sstride_src_t3, sstride_dst_t3, 1, + tstride_src_t3, tbound_src_t3, 1, tstride_dst_t3, + tbound_dst_t3, 0xffffffff, 0xffffffff, + 0x1) != 0) { + printf("Error in xdma agu configuration \r\n"); + err++; + } else { + printf("The xdma agu is configured \r\n"); } - } - printf("The memset of 4KB - 12KB is correct \r\n"); - - // Test 3: Setting the 4-12KB region to 0x0000000000000001 (uint64_t 1) - // This test is to validate the byte mask by shielding all other bits, - // so only LSB 8 bits are set. - printf( - "Test 3: Setting the 4-12KB region to 0x0000000000000001 (uint64_t " - "1) \r\n"); - uint32_t sstride_src_t3[1] = {8}; - uint32_t sstride_dst_t3[1] = {8}; - uint32_t tstride_src_t3[1] = {64}; - uint32_t tstride_dst_t3[1] = {64}; - uint32_t tbound_src_t3[1] = {128}; - uint32_t tbound_dst_t3[1] = {128}; - if (xdma_memcpy_nd(tcdm_0, tcdm_0 + 0x1000 * sizeof(uint8_t), - sstride_src_t3, sstride_dst_t3, 1, tstride_src_t3, - tbound_src_t3, 1, tstride_dst_t3, tbound_dst_t3, - 0xffffffff, 0xffffffff, 0x1) != 0) { - printf("Error in xdma agu configuration \r\n"); - err++; - } else { - printf("The xdma agu is configured \r\n"); - } - - uint32_t ext_param_t3[1] = {0x1}; - if (xdma_enable_dst_ext(0, ext_param_t3) != 0) { - printf("Error in enabling xdma extension 0 \r\n"); - err++; - } else { - printf("The xdma extension 0 is disabled \r\n"); - } - if (err != 0) { - return err; - } + uint32_t ext_param_t3[1] = {0x1}; + if (xdma_enable_dst_ext(0, ext_param_t3) != 0) { + printf("Error in enabling xdma extension 0 \r\n"); + err++; + } else { + printf("The xdma extension 0 is disabled \r\n"); + } - task_id = xdma_start(); - printf( - "The xdma is started, setting memory region to 0x0000000000000001 " - "(uint64_t 1). 
-            task_id);
-        xdma_wait(task_id);
+            if (err != 0) {
+                return err;
+            }
-        printf("The xdma is finished \r\n");
-        uint64_t *result_t3 = (uint64_t *)(tcdm_0 + 0x1000 * sizeof(uint8_t));
-        for (int i = 0; i < 0x2000 / 8; i++) {
-            if (result_t3[i] != 1) {
-                printf("Error in memset (region 0) \r\n");
-                return -1;
+            task_id = xdma_start();
+            printf(
+                "The xdma is started, setting memory region to "
+                "0x0000000000000001 "
+                "(uint64_t 1). The task id is %d \r\n",
+                task_id);
+            xdma_wait(task_id);
+
+            printf("The xdma is finished \r\n");
+            uint64_t *result_t3 =
+                (uint64_t *)(tcdm_0 + 0x1000 * sizeof(uint8_t));
+            for (int i = 0; i < 0x2000 / 8; i++) {
+                if (result_t3[i] != 1) {
+                    printf("Error in memset (region 0) \r\n");
+                    return -1;
+                }
             }
+            printf("The memset of 4KB - 12KB is correct \r\n");
+        } else {
+            printf("Core %d is not xdma core. \r\n", snrt_cluster_core_idx());
         }
-        printf("The memset of 4KB - 12KB is correct \r\n");
-    } else {
-        printf("Core %d is not xdma core. \r\n", snrt_cluster_core_idx());
-    }
-    return 0;
+        return 0;
+    } else
+        return 0;
 }
diff --git a/target/tapeout/Makefile b/target/tapeout/Makefile
index 380d1cfce..3a8f237b3 100644
--- a/target/tapeout/Makefile
+++ b/target/tapeout/Makefile
@@ -50,7 +50,7 @@ CFG = $(TARGET_RTL)/cfg/lru.hjson
 $(CFG): FORCE
 	@# If the LRU config file doesn't exist, we use the default config.
 	@if [ ! -e $@ ] ; then \
-		DEFAULT_CFG="$(TARGET_RTL)/cfg/hemaia.hjson"; \
+		DEFAULT_CFG="$(TARGET_RTL)/cfg/hemaia_tapeout.hjson"; \
 		echo "Using default config file: $$DEFAULT_CFG"; \
 		cp $$DEFAULT_CFG $@; \
 	fi
diff --git a/util/sim/snax_utils.py b/util/sim/snax_utils.py
new file mode 100644
index 000000000..d10db9316
--- /dev/null
+++ b/util/sim/snax_utils.py
@@ -0,0 +1,450 @@
+#!/usr/bin/env python3
+
+# Copyright 2024 KU Leuven.
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Xiaoling Yi
+
+import numpy as np
+
+
+# Function to perform 2D convolution on the input data using the specified kernel,
+# stride, and padding. It returns the output feature map.
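+# Illustrative example (shapes chosen here for clarity, not taken from a
+# testbench): an NHWC input of shape (1, 8, 8, 3) with a (16, 3, 3, 3) kernel,
+# unit stride, and no padding yields an int32 output of shape (1, 6, 6, 16).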
+def conv2d(input_data, kernel, stride=(1, 1), padding=(0, 0), mode="NHWC"):
+    if mode == "NHWC":
+        batch_size, in_height, in_width, in_channels = input_data.shape
+        out_channels, kernel_height, kernel_width, _ = kernel.shape
+        stride_h, stride_w = stride
+        pad_h, pad_w = padding
+
+        # Calculate the output feature map dimensions
+        out_height = (in_height - kernel_height + 2 * pad_h) // stride_h + 1
+        out_width = (in_width - kernel_width + 2 * pad_w) // stride_w + 1
+
+        # Add padding
+        input_data_padded = np.pad(
+            input_data,
+            ((0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)),
+            mode="constant",
+        )
+
+        # Initialize the output feature map
+        output_data = np.zeros(
+            (batch_size, out_height, out_width, out_channels), np.int32
+        )
+
+        # Perform the convolution operation
+        for b in range(batch_size):
+            for oc in range(out_channels):
+                for oh in range(out_height):
+                    for ow in range(out_width):
+                        # Calculate the input region
+                        ih_start = oh * stride_h
+                        ih_end = ih_start + kernel_height
+                        iw_start = ow * stride_w
+                        iw_end = iw_start + kernel_width
+
+                        # Slice to extract the input region
+                        input_region = input_data_padded[
+                            b, ih_start:ih_end, iw_start:iw_end, :
+                        ]
+
+                        # Slice to extract the corresponding convolution kernel
+                        conv_kernel = kernel[oc, :, :, :]
+
+                        # Perform the convolution calculation
+                        output_data[b, oh, ow, oc] = np.sum(input_region * conv_kernel)
+    else:
+        batch_size, Cin8, in_height, in_width, t = input_data.shape
+        assert t == 8
+        Cout8, Cin8, kernel_height, kernel_width, t1, t2 = kernel.shape
+        assert t1 == 8
+        assert t2 == 8
+        stride_h, stride_w = stride
+        pad_h, pad_w = padding
+
+        # Calculate the output feature map dimensions
+        out_height = (in_height - kernel_height + 2 * pad_h) // stride_h + 1
+        out_width = (in_width - kernel_width + 2 * pad_w) // stride_w + 1
+        assert out_width % 8 == 0
+
+        # Add padding
+        input_data_padded = np.pad(
+            input_data,
+            ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)),
+            mode="constant",
+        )
+
+        # Initialize the output feature map
+        output_data = np.zeros(
+            (batch_size, Cout8, out_height, out_width // 8, 8, 8), np.int32
+        )
+
+        # Perform the convolution operation
+        for b in range(batch_size):
+            for oc in range(Cout8):
+                for oc8 in range(8):
+                    for oh in range(out_height):
+                        for ow in range(out_width // 8):
+                            for ow8 in range(8):
+                                # Calculate the input region
+                                iw_start = (ow * 8 + ow8) * stride_w
+                                iw_end = iw_start + kernel_width
+
+                                ih_start = oh * stride_h
+                                ih_end = ih_start + kernel_height
+
+                                # Slice to extract the input region
+                                input_region = input_data_padded[
+                                    b, :, ih_start:ih_end, iw_start:iw_end, :
+                                ]
+
+                                # Slice to extract the corresponding convolution kernel
+                                conv_kernel = kernel[oc, :, :, :, oc8, :]
+
+                                # Perform the convolution calculation
+                                output_data[b, oc, oh, ow, ow8, oc8] = np.sum(
+                                    input_region * conv_kernel
+                                )
+
+    return output_data
+
+
+# Function to transform input data into columns for efficient convolution operations.
+# It returns the transformed input data and reshaped kernel.
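+# Illustrative example (shapes assumed for this sketch): an NC8HW8 input of
+# shape (1, 1, 10, 10, 8) with a (1, 16, 3, 3, 8, 8) kernel, unit stride, and
+# no padding produces an im2col matrix of shape (1, 8, 1, 1, 3, 3, 8, 8) and a
+# reshaped kernel of shape (576, 16), ready for a plain matrix multiplication.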
+def im2col(input_data, kernel, stride=(1, 1), padding=(0, 0), mode="NC8HW8"):
+    assert mode == "NC8HW8"
+    batch_size, in_channels_8, in_height, in_width, _ = input_data.shape
+    _, out_channels, kernel_height, kernel_width, _, _ = kernel.shape
+    stride_h, stride_w = stride
+    pad_h, pad_w = padding
+
+    # Calculate the size of the output feature map
+    out_height = (in_height + 2 * pad_h - kernel_height) // stride_h + 1
+    out_width = (in_width + 2 * pad_w - kernel_width) // stride_w + 1
+
+    # Apply zero padding to the input data
+    input_data_padded = np.pad(
+        input_data,
+        ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w), (0, 0)),
+        mode="constant",
+    )
+
+    # Initialize the im2col matrix
+    im2col_matrix = np.zeros(
+        (
+            batch_size,
+            out_height,
+            out_width // 8,
+            in_channels_8,
+            kernel_height,
+            kernel_width,
+            # ow in 8
+            8,
+            # cin in 8
+            8,
+        )
+    )
+
+    # Perform the im2col transformation on the input data
+    for b in range(batch_size):
+        for oh in range(out_height):
+            for ow in range(out_width // 8):
+                for ow8 in range(8):
+                    for ic in range(in_channels_8):
+                        for ic8 in range(8):
+                            # Calculate the input region
+                            iw_start = (ow * 8 + ow8) * stride_w
+                            iw_end = iw_start + kernel_width
+
+                            ih_start = oh * stride_h
+                            ih_end = ih_start + kernel_height
+
+                            # Slice to extract the input region
+                            input_region = input_data_padded[
+                                b, ic, ih_start:ih_end, iw_start:iw_end, ic8
+                            ]
+
+                            im2col_matrix[b, oh, ow, ic, :, :, ow8, ic8] = input_region
+
+    im2col_kernel = kernel.reshape(out_channels, -1).T
+
+    return im2col_matrix, im2col_kernel
+
+
+# Golden model function to perform block matrix multiplication with specific parameters.
+# It returns the resulting matrix after the computation.
+def block_gemm_golden_model(
+    m, k, n, row, size, col, a, b, subtraction_a, subtraction_b, c
+):
+    # Reshape and subtract
+    a_subtracted = a.reshape(m, k, row, size) - subtraction_a  # Shape: (m, k, row, size)
+    b_subtracted = b.reshape(n, k, col, size) - subtraction_b  # Shape: (n, k, col, size)
+
+    # Initialize output array
+    d = np.zeros((m, n, row, col), dtype=np.int32)
+
+    # Compute
+    for mm in range(m):
+        for nn in range(n):
+            # Perform tensordot over axes k and size (axes 0 and 3 in original arrays)
+            # But after reshaping, axes are (k, row, size) and (k, col, size)
+            # So axes to sum over are 0 (k) and 2 (size)
+            d[mm, nn] = np.tensordot(
+                a_subtracted[mm], b_subtracted[nn], axes=([0, 2], [0, 2])
+            )
+    # Flatten d and add c
+    d = d.reshape(m * n * row * col) + c
+
+    return d
+
+
+# This function performs a tiled block General Matrix Multiply (GEMM) operation.
+#
+# This function breaks down large matrix multiplication into smaller submatrices
+# (tiles) and performs GEMM on these submatrices. The results are then accumulated
+# into a final result matrix.
+#
+# Parameters:
+# m2, k2, n2: int
+#     The number of tiles in each dimension.
+# m, k, n: int
+#     The dimensions of the submatrices for block matrix multiplication.
+# row, size, col: int
+#     Size parameters for the submatrices in the hardware gemm accelerator.
+# a, b, c: numpy.ndarray
+#     The input matrices.
+# subtraction_a, subtraction_b: int
+#     Zero-point values subtracted from the elements of a and b before the
+#     GEMM computation.
+#
+# Returns:
+# numpy.ndarray
+#     The result of the tiled GEMM operation as a flattened array.
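+# Illustrative note: with m2 = k2 = n2 = 1 the tiling below degenerates into a
+# single block_gemm_golden_model call over the full operands; with more tiles,
+# the kk2 loop accumulates partial products into the (mm2, nn2) output tile.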
+def tiled_block_gemm_golden_model(
+    m2, k2, n2, m, k, n, row, size, col, a, b, subtraction_a, subtraction_b, c
+):
+    # Create an empty array for the result with the appropriate size
+    result = np.zeros((m2 * m * row * n2 * n * col), dtype=np.int32)
+
+    # Loop over the tiles
+    for mm2 in range(m2):
+        for nn2 in range(n2):
+            for kk2 in range(k2):
+                # Create submatrices for this tile
+                sub_a = a[
+                    (mm2 * k2 + kk2)
+                    * m
+                    * k
+                    * row
+                    * size: (mm2 * k2 + kk2 + 1)
+                    * m
+                    * k
+                    * row
+                    * size
+                ]
+                sub_b = b[
+                    (nn2 * k2 + kk2)
+                    * n
+                    * k
+                    * size
+                    * col: (nn2 * k2 + kk2 + 1)
+                    * n
+                    * k
+                    * size
+                    * col
+                ]
+                sub_c = c[
+                    (mm2 * n2 + nn2)
+                    * m
+                    * row
+                    * n
+                    * col: (mm2 * n2 + nn2 + 1)
+                    * m
+                    * row
+                    * n
+                    * col
+                ]
+
+                # Perform block GEMM on the submatrices
+                sub_d = block_gemm_golden_model(
+                    m,
+                    k,
+                    n,
+                    row,
+                    size,
+                    col,
+                    sub_a,
+                    sub_b,
+                    subtraction_a,
+                    subtraction_b,
+                    sub_c,
+                )
+                # Accumulate the result into the final result matrix at the correct position
+                result[
+                    (mm2 * n2 + nn2)
+                    * m
+                    * row
+                    * n
+                    * col: (mm2 * n2 + nn2 + 1)
+                    * m
+                    * row
+                    * n
+                    * col
+                ] += sub_d
+
+    return result
+
+
+# Golden model function for reshuffling data with specified parameters. It applies
+# strided layout mapping to the input data and returns the reshuffled data array.
+def data_reshuffler_golden_model(
+    tempLoop0,
+    tempLoop1,
+    spatial_len_0,
+    spatial_len_1,
+    tempStride0,
+    tempStride1,
+    spatialStride0,
+    spatialStride1,
+    data,
+    int32=False,
+):
+    # abstract illustration: k innermost loop, m second innermost loop,
+    # K third innermost loop, M outermost loop
+
+    # total loop bounds = spatial loop bounds * temporal loop bounds
+    K = tempLoop0 * spatial_len_0
+    M = tempLoop1 * spatial_len_1
+
+    # loop bounds settings
+    matrix_size = {"K": K, "M": M, "k": spatial_len_0, "m": spatial_len_1}
+
+    # stride settings
+    strides = {
+        "M": tempStride1,
+        "K": tempStride0,
+        "m": spatialStride1,
+        "k": spatialStride0,
+    }
+
+    if int32:
+        result_array = np.zeros((matrix_size["M"] * matrix_size["K"]), np.int32)
+    else:
+        result_array = np.zeros((matrix_size["M"] * matrix_size["K"]), np.int8)
+
+    # apply strided layout mapping for the golden model of data reshuffler
+    for M in range(matrix_size["M"] // matrix_size["m"]):
+        for K in range(matrix_size["K"] // matrix_size["k"]):
+            for m in range(matrix_size["m"]):
+                for k in range(matrix_size["k"]):
+                    result_array[
+                        # output address calculation with continued increment
+                        matrix_size["K"]
+                        // matrix_size["k"]
+                        * matrix_size["k"]
+                        * matrix_size["m"]
+                        * M
+                        + matrix_size["k"] * matrix_size["m"] * K
+                        + m * matrix_size["k"]
+                        + k
+                    ] = data[
+                        # input address calculation with
+                        # strided layout mapping equation
+                        strides["M"] * M
+                        + strides["K"] * K
+                        + strides["m"] * m
+                        + strides["k"] * k
+                    ]
+
+    return result_array.ravel()
+
+
+# Golden model function for SIMD postprocessing of data. It performs operations such as
+# zero point subtraction, multiplication, right shift, double rounding, and clipping.
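+# Worked example (values chosen only for illustration): data_in = 131,
+# input_zp_i = 3, multiplier_i = 2**24, shift_i = 32, double_round_i = 1,
+# output_zp_i = 0: ((131 - 3) * 2**24) >> 31 gives 1, double rounding bumps it
+# to 2, the final >> 1 returns it to 1, and clipping leaves the result at 1.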
+def postprocessing_simd_golden_model(
+    data_in,
+    input_zp_i,
+    output_zp_i,
+    shift_i,
+    max_int_i,
+    min_int_i,
+    double_round_i,
+    multiplier_i,
+):
+
+    # Step 1: Subtract input zero point
+    var = data_in - input_zp_i
+
+    # Step 2: Multiply with the multiplier avoiding overflow
+    var = np.int64(var) * np.int64(multiplier_i)
+
+    # Step 3: Right shift
+    var = np.int32(var >> (shift_i - 1))
+
+    # Step 4: Apply double rounding if necessary
+    if double_round_i:
+        var = np.where(var >= 0, var + 1, var - 1)
+
+    # Step 5: Final right shift
+    var = var >> 1
+
+    # Step 6: Add output zero point
+    var = var + output_zp_i
+
+    # Step 7: Clip the values to be within min and max integer range
+    var = np.clip(var, min_int_i, max_int_i)
+
+    return var
+
+
+def max_pooling(
+    input_tensor,
+    pool_size_w,
+    pool_size_h,
+    stride_w,
+    stride_h,
+    padding_w,
+    padding_h,
+    mode="HWC",
+):
+
+    # if mode == "HWC", C8 is 1, C = realCin
+    # if mode != "HWC", C8 is realCin/8, C = 8
+    C8, H, W, C = input_tensor.shape
+    if mode != "HWC":
+        assert input_tensor.shape[3] == 8 and C == 8
+    elif mode == "HWC":
+        assert input_tensor.shape[0] == 1 and C8 == 1
+
+    out_width = (W + 2 * padding_w - pool_size_w) // stride_w + 1
+    out_height = (H + 2 * padding_h - pool_size_h) // stride_h + 1
+
+    input_padded = np.pad(
+        input_tensor,
+        ((0, 0), (padding_h, padding_h), (padding_w, padding_w), (0, 0)),
+        mode="constant",
+        constant_values=0,
+    )
+
+    pooled_tensor = np.zeros((C8, out_height, out_width, C), dtype=np.int8)
+
+    for c in range(C8):
+        for i in range(out_height):
+            for j in range(out_width):
+                for k in range(C):
+                    h_start = i * stride_h
+                    h_end = h_start + pool_size_h
+                    w_start = j * stride_w
+                    w_end = w_start + pool_size_w
+                    pooled_tensor[c, i, j, k] = np.max(
+                        input_padded[c, h_start:h_end, w_start:w_end, k]
+                    )
+
+    return pooled_tensor
+
+
+def align_wide_addr(addr, alignment=64):
+    if addr % alignment:
+        addr = ((addr // alignment) + 1) * alignment
+    return addr
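+
+
+# Minimal smoke test for a few of the helpers above (illustrative only; the
+# expected values follow directly from the definitions in this file). Run
+# standalone with: python3 util/sim/snax_utils.py
+if __name__ == "__main__":
+    # align_wide_addr rounds an address up to the next multiple of alignment
+    assert align_wide_addr(65) == 128
+    assert align_wide_addr(64) == 64
+
+    # max_pooling in plain HWC mode: a 4x4 single-channel map, 2x2 window,
+    # stride 2, no padding -> a (1, 2, 2, 1) map of block-wise maxima
+    x = np.arange(16, dtype=np.int8).reshape(1, 4, 4, 1)
+    y = max_pooling(x, 2, 2, 2, 2, 0, 0, mode="HWC")
+    assert y.shape == (1, 2, 2, 1)
+    assert y[0, 0, 0, 0] == 5 and y[0, 1, 1, 0] == 15
+    print("snax_utils smoke test passed")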