diff --git a/.github/workflows/sphinx.yml b/.github/workflows/sphinx.yml index f3e936d33..caa1b74ae 100644 --- a/.github/workflows/sphinx.yml +++ b/.github/workflows/sphinx.yml @@ -6,16 +6,16 @@ jobs: build: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Build HTML uses: ammaraskar/sphinx-action@master - name: Upload artifacts - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: html-docs path: docs/build/html/ - name: Deploy - uses: peaceiris/actions-gh-pages@v3 + uses: peaceiris/actions-gh-pages@v4 if: github.ref == 'refs/heads/main' with: github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/CHANGELOG.md b/CHANGELOG.md index e8bcbda4a..5f2a7ae0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,9 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Solve type-conversion warnings about type - Indexed loads need to wait for operand requesters ready in sequencer - Drop sequencer `pe_req_valid` in case of exception - Reworked STU exception flush engine + - Correctly flush the backend pipeline upon indexed load exceptions + - Make addrgen wait for index address before making an MMU request + - Fix typos in lane sequencer ### Added diff --git a/cheshire/sw/include/rvv_test.h b/cheshire/sw/include/rvv_test.h index 259a53b26..bf4a52ba8 100644 --- a/cheshire/sw/include/rvv_test.h +++ b/cheshire/sw/include/rvv_test.h @@ -24,21 +24,29 @@ #define _VSETVLI(vl,avl) _VSETVLI_64(vl, avl) #define _VLD(vreg,address_load) __VLD(vreg,64,address_load) #define _VST(vreg,address_store) __VST(vreg,64,address_store) +#define _VLD_IDX(vd,vs2,address_load) __VLD_IDX(vd,vs2,64,address_load) +#define _VST_IDX(vs1,vs2,address_store) __VST_IDX(vs1,vs2,64,address_store) #elif EEW == 32 #define _DTYPE __DTYPE(32) #define _VSETVLI(vl,avl) _VSETVLI_32(vl, avl) #define _VLD(vreg,address_load) __VLD(vreg,32,address_load) #define _VST(vreg,address_store) 
__VST(vreg,32,address_store) +#define _VLD_IDX(vd,vs2,address_load) __VLD_IDX(vd,vs2,32,address_load) +#define _VST_IDX(vs1,vs2,address_store) __VST_IDX(vs1,vs2,32,address_store) #elif EEW == 16 #define _DTYPE __DTYPE(16) #define _VSETVLI(vl,avl) _VSETVLI_16(vl, avl) #define _VLD(vreg,address_load) __VLD(vreg,16,address_load) #define _VST(vreg,address_store) __VST(vreg,16,address_store) +#define _VLD_IDX(vd,vs2,address_load) __VLD_IDX(vd,vs2,16,address_load) +#define _VST_IDX(vs1,vs2,address_store) __VST_IDX(vs1,vs2,16,address_store) #elif EEW == 8 #define _DTYPE __DTYPE(8) #define _VSETVLI(vl,avl) _VSETVLI_8(vl, avl) #define _VLD(vreg,address_load) __VLD(vreg,8,address_load) #define _VST(vreg,address_store) __VST(vreg,8,address_store) +#define _VLD_IDX(vd,vs2,address_load) __VLD_IDX(vd,vs2,8,address_load) +#define _VST_IDX(vs1,vs2,address_store) __VST_IDX(vs1,vs2,8,address_store) #else #error "ERROR: No EEW was defined. Please specify one in [8,16,32,64]." #endif @@ -49,6 +57,8 @@ #define __DTYPE(eew) uint##eew##_t #define __VLD(vreg,eew,address_load) asm volatile ("vle"#eew".v "#vreg", (%0)" : "+&r"(address_load)); #define __VST(vreg,eew,address_store) asm volatile ("vse"#eew".v "#vreg", (%0)" : "+&r"(address_store)); +#define __VLD_IDX(vd,vs2,eew,address_load) asm volatile ("vluxei"#eew".v "#vd", (%0), "#vs2 : "+&r"(address_load)); +#define __VST_IDX(vs1,vs2,eew,address_store) asm volatile ("vsuxei"#eew".v "#vs1", (%0), "#vs2 : "+&r"(address_store)); /////////////////////// // Reshuffle helpers // @@ -172,11 +182,14 @@ volatile uint64_t ret_cnt; #define RVV_TEST_AVL(EEW) (VLMAX / (EEW)) #endif +#ifndef _ENABLE_RVV_ +#define _ENABLE_RVV_ void enable_rvv() { // Enalbe RVV by seting MSTATUS.VS asm volatile (" li t0, %0 " :: "i"(MSTATUS_VS)); asm volatile (" csrs mstatus, t0" ); } +#endif uint64_t reset_v_state ( uint64_t avl ) { uint64_t vl_local = 0; diff --git a/cheshire/sw/include/vector_util.h b/cheshire/sw/include/vector_util.h index 02959e2c0..436296334 
100644 --- a/cheshire/sw/include/vector_util.h +++ b/cheshire/sw/include/vector_util.h @@ -33,10 +33,13 @@ void stop_timer() { timer += get_cycle_count(); } // Get the value of the timer int64_t get_timer() { return timer; } +#ifndef _ENABLE_RVV_ +#define _ENABLE_RVV_ inline void enable_rvv() { asm volatile ("li t0, %0" :: "i"(MSTATUS_VS)); asm volatile ("csrs mstatus, t0" ); } +#endif inline int similarity_check(double a, double b, double threshold) { double diff = a - b; diff --git a/cheshire/sw/src/tests/body/rvv_test_mmu_stub_idx_ld_comprehensive.c.body b/cheshire/sw/src/tests/body/rvv_test_mmu_stub_idx_ld_comprehensive.c.body new file mode 100644 index 000000000..be3b2c5aa --- /dev/null +++ b/cheshire/sw/src/tests/body/rvv_test_mmu_stub_idx_ld_comprehensive.c.body @@ -0,0 +1,303 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Matteo Perotti +// Vincenzo Maisto + +#include "regs/cheshire.h" +#include "dif/clint.h" +#include "dif/uart.h" +#include "params.h" +#include "util.h" +#include "encoding.h" +#include "rvv_test.h" + +#include "cheshire_util.h" + +#if (EXTENSIVE_TEST == 1) +#define VL_LIMIT_LOW ELMMAX + 1 +#define VL_LIMIT_HIGH 0 +#define VSTART_LIMIT_LOW vl + 1 +#define VSTART_LIMIT_HIGH 0 +#else +#define VL_LIMIT_LOW 3*ARA_NR_LANES + 1 +#define VL_LIMIT_HIGH ELMMAX - (3*ARA_NR_LANES + 1) +#define VSTART_LIMIT_LOW 2*ARA_NR_LANES + 1 +#define VSTART_LIMIT_HIGH vl - 2*ARA_NR_LANES - 1 +#endif + +#define INIT_NONZERO_VAL_V0 99 +#define INIT_NONZERO_VAL_V8 67 + +// Derived parameters +#define param_stub_ex { param_stub_ex_ctrl ? 1 : 0; } + +uint64_t stub_req_rsp_lat = param_stub_req_rsp_lat; + +_DTYPE* start_addr_vec [ELMMAX]; + +int main(void) { + cheshire_start(); + + // Clean the exception variable + RVV_TEST_CLEAN_EXCEPTION(); + + // This initialization is controlled through "defines" in the various + // derived tests. 
+ INIT_RVV_TEST_SOC_REGFILE; + VIRTUAL_MEMORY_ON; + STUB_EX_ON; + + // Vector configuration parameters and variables + uint64_t avl_original = RVV_TEST_AVL(64); + uint64_t vl, vstart_read; + vcsr_dump_t vcsr_state = {0}; + + // Helper variables and arrays + _DTYPE array_load [ELMMAX]; + _DTYPE array_load_rev [ELMMAX]; + _DTYPE array_load_idx [ELMMAX]; + _DTYPE array_store_0 [ELMMAX]; + _DTYPE array_store_1 [ELMMAX]; + _DTYPE* address_load = array_load; + _DTYPE* address_load_rev = array_load_rev; + _DTYPE* address_load_idx = array_load_idx; + _DTYPE* address_store_0 = array_store_0; + _DTYPE* address_store_1 = array_store_1; + + // Enalbe RVV + enable_rvv(); + vcsr_dump ( vcsr_state ); + + ////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////// + // START OF TESTS + ////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////////////////// + // TEST: Exception generation and non-zero vstart: vector store + ////////////////////////////////////////////////////////////////// + + // Loop through different avl, from 0 to avlmax + for (uint64_t avl = 1; (avl <= VL_LIMIT_LOW || avl >= VL_LIMIT_HIGH) && avl <= ELMMAX + 1; avl++) { + // Reset vl, vstart, reset exceptions. + RVV_TEST_INIT(vl, avl); + + // Loop over vstart values. Also test vstart > vl. + for (uint64_t vstart_val = 0; (vstart_val <= VSTART_LIMIT_LOW || vstart_val >= VSTART_LIMIT_HIGH) && vstart_val < vl; vstart_val++) { + // Reset vl, vstart, reset exceptions. 
+ RVV_TEST_INIT(vl, avl); + + // Decide latency for next STUB req-rsp + switch (param_stub_req_rsp_lat_ctrl) { + // Fixed STUB req-rsp latency + case 0: + STUB_REQ_RSP_LAT(stub_req_rsp_lat); + break; + // Random STUB req-rsp latency (minimum value should be 1) + case 1: + STUB_REQ_RSP_LAT((stub_req_rsp_lat++ % MAX_LAT_P2) + 1); + break; + default: + cheshire_end(); + return RET_CODE_WRONG_CASE; + } + + // Init memory + for (uint64_t i = 0; i < vl; i++) { + address_store_0[i] = INIT_NONZERO_VAL_ST; + address_store_1[i] = INIT_NONZERO_VAL_ST; + } + for (uint64_t i = 0; i < vl; i++) { + address_load[i] = vl + vstart_val + i + MAGIC_NUM; + } + for (uint64_t i = 0; i < vl; i++) { + address_load_rev[vl-1-i] = vl + vstart_val + i + MAGIC_NUM; + } + for (uint64_t i = 0; i < vl; i++) { + // The idx vector is a byte vector + address_load_idx[i] = (vl-1-i) * (EEW/8); + } + // Init VRF (use v0) + asm volatile("vmv.v.x v0, %0" :: "r" (INIT_NONZERO_VAL_V0)); + asm volatile("vmv.v.x v8, %0" :: "r" (INIT_NONZERO_VAL_V8)); + + // Get information about the next axi transfer + unsigned int addrgen_req = vl - vstart_val; + addrgen_req = vl - vstart_val; + for (int i = 0; i < vl; i++) + start_addr_vec[i] = address_load + (vl-1-i); + + // Load the index vector + STUB_EX_OFF; + _VLD(v24,address_load_idx) + STUB_EX_ON; + + // Setup STUB behavior + uint64_t ex_lat; + switch (param_stub_ex_ctrl) { + // No exceptions + case 0: + ex_lat = addrgen_req; + STUB_EX_OFF; + break; + // Always exceptions at every request + case 1: + ex_lat = 0; + STUB_EX_ON; + STUB_NO_EX_LAT(ex_lat); + break; + // Random exceptions + case 2: + // If ex_lat == axi_log->bursts, no exception for this transaction! 
+ ex_lat = pseudo_rand(addrgen_req); + STUB_EX_ON; + STUB_NO_EX_LAT(ex_lat); + break; + default: + cheshire_end(); + return RET_CODE_WRONG_CASE; + } + + *rf_rvv_debug_reg = vl; + *rf_rvv_debug_reg = vstart_val; + *rf_rvv_debug_reg = ex_lat; + *rf_rvv_debug_reg = start_addr_vec[vstart_val + ex_lat]; + + // Setup vstart + asm volatile("csrs vstart, %0" :: "r"(vstart_val)); + + // Load indexed + _VLD_IDX(v0,v24,address_load) + + // Get information about the next vstart + uint64_t vstart_post_ex = vstart_val + ex_lat; + + *rf_rvv_debug_reg = 0x44444444; + + // Check for illegal new vstart values + RVV_TEST_ASSERT(vstart_post_ex >= vstart_val && (vstart_post_ex < vl || (ex_lat == addrgen_req && vstart_post_ex == vl))) + + *rf_rvv_debug_reg = 0x55555555; + + // Check if we had an exception on this transaction + if (param_stub_ex_ctrl == 1 || (param_stub_ex_ctrl == 2 && ex_lat < addrgen_req)) { + // Check that the new vstart is correct + vstart_read = -1; + asm volatile("csrr %0, vstart" : "=r"(vstart_read)); + ASSERT_EQ(vstart_read, vstart_post_ex) + *rf_rvv_debug_reg = 0x66666666; + // Check the exception + RVV_TEST_ASSERT_EXCEPTION_EXTENDED(1, start_addr_vec[vstart_val + ex_lat], CAUSE_LOAD_PAGE_FAULT) + RVV_TEST_CLEAN_EXCEPTION() + + // Restart the instruction on another reg, or just load everything in v8 too. 
+ // Then, store everything from v8 + STUB_EX_OFF; + _VLD_IDX(v8, v24, address_load) + _VST(v8, address_store_1) + STUB_EX_ON; + + *rf_rvv_debug_reg = 0xffffffff; + + // Pre-body check v8 + for (uint64_t i = 0; i < vstart_val; i++) { + ASSERT_EQ(address_store_1[i], INIT_NONZERO_VAL_V8) + } + + *rf_rvv_debug_reg = 0xeeeeeeee; + + // Body check 0 + for (uint64_t i = vstart_val; i < vstart_post_ex; i++) { + *rf_rvv_debug_reg = i; + *rf_rvv_debug_reg = address_store_1[i]; + *rf_rvv_debug_reg = INIT_NONZERO_VAL_V8; + ASSERT_EQ(address_store_1[i], INIT_NONZERO_VAL_V8) + } + + *rf_rvv_debug_reg = 0xdddddddd; + + // Body check 1 + for (uint64_t i = vstart_post_ex; i < vl; i++) { + ASSERT_EQ(address_store_1[i], address_load_rev[i]) + } + + *rf_rvv_debug_reg = 0xcccccccc; + } + + // Check that vstart was reset at zero + vstart_read = -1; + + *rf_rvv_debug_reg = 0xbbbbbbbb; + + asm volatile("csrr %0, vstart" : "=r"(vstart_read)); + ASSERT_EQ(vstart_read, 0) + // Check that there was no exception + RVV_TEST_ASSERT_EXCEPTION(0) + RVV_TEST_CLEAN_EXCEPTION() + + *rf_rvv_debug_reg = 0xaaaaaaaa; + + // Store back the values of v0 + STUB_EX_OFF; + _VST(v0, address_store_0) + STUB_EX_ON; + + // Pre-body check v0 + for (uint64_t i = 0; i < vstart_val; i++) { + ASSERT_EQ(address_store_0[i], INIT_NONZERO_VAL_V0) + } + + *rf_rvv_debug_reg = 0x99999999; + + // Body check 0 + for (uint64_t i = vstart_val; i < vstart_post_ex; i++) { + ASSERT_EQ(address_store_0[i], address_load_rev[i]) + } + + *rf_rvv_debug_reg = 0x88888888; + + // Body check 1 + for (uint64_t i = vstart_post_ex; i < vl; i++) { + ASSERT_EQ(address_store_0[i], INIT_NONZERO_VAL_V0) + } + + *rf_rvv_debug_reg = 0x77777777; + + // Clean-up + RVV_TEST_CLEANUP(); + + // Jump from limit low to limit high if limit high is higher than low + if ((VSTART_LIMIT_LOW) < (VSTART_LIMIT_HIGH)) + if (vstart_val == VSTART_LIMIT_LOW) + vstart_val = VSTART_LIMIT_HIGH; + + ret_cnt++; + } + + // Jump from limit low to limit high if limit high is 
higher than low + if ((VL_LIMIT_LOW) < (VL_LIMIT_HIGH)) + if (avl == VL_LIMIT_LOW) + avl = VL_LIMIT_HIGH; + } + + // Clean-up the SoC CSRs + RESET_SOC_CSR; + + ////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////// + // END OF TESTS + ////////////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////////////// + +#if (PRINTF == 1) + printf("Test SUCCESS!\r\n"); +#endif + + cheshire_end(); + + // If we did not return before, the test passed + return RET_CODE_SUCCESS; +} diff --git a/cheshire/sw/src/tests/rvv_test_mmu_stub_idx_ld_comprehensive_page_fault_var_lat_var_ex.c b/cheshire/sw/src/tests/rvv_test_mmu_stub_idx_ld_comprehensive_page_fault_var_lat_var_ex.c new file mode 100644 index 000000000..7d6ce5445 --- /dev/null +++ b/cheshire/sw/src/tests/rvv_test_mmu_stub_idx_ld_comprehensive_page_fault_var_lat_var_ex.c @@ -0,0 +1,16 @@ +// Copyright 2023 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 +// +// Matteo Perotti +// Vincenzo Maisto + +// Tunable parameters +// param_stub_ex_ctrl. 0: no exceptions, 1: always exceptions, 2: random exceptions +#define param_stub_ex_ctrl 2 + +// param_stub_req_rsp_lat_ctrl. 
0: fixed latency (== param_stub_req_rsp_lat), 1: random latency (max == param_stub_req_rsp_lat) +#define param_stub_req_rsp_lat_ctrl 1 +#define param_stub_req_rsp_lat 10 + +#include "rvv_test_mmu_stub_idx_ld_comprehensive.c.body" diff --git a/hardware/src/ara.sv b/hardware/src/ara.sv index 8adcd4333..6a627bfbe 100644 --- a/hardware/src/ara.sv +++ b/hardware/src/ara.sv @@ -185,8 +185,8 @@ module ara import ara_pkg::*; #( logic [NrLanes-1:0] vxsat_flag; vxrm_t [NrLanes-1:0] alu_vxrm; // Flush support for store exceptions - logic stu_ex_flush_lane, stu_ex_flush_done; - logic [NrLanes-1:0] stu_ex_flush_stu; + logic lsu_ex_flush_lane, lsu_ex_flush_done; + logic [NrLanes-1:0] lsu_ex_flush_stu; ara_dispatcher #( .NrLanes (NrLanes ), @@ -214,8 +214,8 @@ module ara import ara_pkg::*; #( .fflags_ex_i (fflags_ex ), .fflags_ex_valid_i (fflags_ex_valid ), // Flush support - .stu_ex_flush_o (stu_ex_flush_lane), - .stu_ex_flush_done_i(stu_ex_flush_done), + .lsu_ex_flush_o (lsu_ex_flush_lane), + .lsu_ex_flush_done_i(lsu_ex_flush_done), // Interface with the Vector Store Unit .core_st_pending_o (core_st_pending ), .load_complete_i (load_complete ), @@ -238,7 +238,7 @@ module ara import ara_pkg::*; #( ariane_pkg::exception_t addrgen_exception; vlen_t addrgen_exception_vstart; logic addrgen_fof_exception; - logic stu_current_burst_exception; + logic lsu_current_burst_exception; logic [NrLanes-1:0] alu_vinsn_done; logic [NrLanes-1:0] mfpu_vinsn_done; // Interface with the operand requesters @@ -295,7 +295,7 @@ module ara import ara_pkg::*; #( .addrgen_exception_i (addrgen_exception ), .addrgen_exception_vstart_i(addrgen_exception_vstart), .addrgen_fof_exception_i(addrgen_fof_exception), - .stu_current_burst_exception_i(stu_current_burst_exception) + .lsu_current_burst_exception_i(lsu_current_burst_exception) ); // Scalar move support @@ -374,8 +374,8 @@ module ara import ara_pkg::*; #( .fflags_ex_o (fflags_ex[lane] ), .fflags_ex_valid_o (fflags_ex_valid[lane] ), // Support for store 
exception flush - .stu_ex_flush_i (stu_ex_flush_lane ), - .stu_ex_flush_o (stu_ex_flush_stu[lane] ), + .lsu_ex_flush_i (lsu_ex_flush_lane ), + .lsu_ex_flush_o (lsu_ex_flush_stu[lane] ), // Interface with the sequencer .pe_req_i (pe_req ), .pe_req_valid_i (pe_req_valid ), @@ -505,8 +505,8 @@ module ara import ara_pkg::*; #( .store_complete_o (store_complete ), .store_pending_o (store_pending ), // STU exception support - .stu_ex_flush_i (|stu_ex_flush_stu ), - .stu_ex_flush_done_o (stu_ex_flush_done ), + .lsu_ex_flush_i (|lsu_ex_flush_stu ), + .lsu_ex_flush_done_o (lsu_ex_flush_done ), // Interface with the sequencer .pe_req_i (pe_req ), .pe_req_valid_i (pe_req_valid ), @@ -517,7 +517,7 @@ module ara import ara_pkg::*; #( .addrgen_exception_o (addrgen_exception ), .addrgen_exception_vstart_o (addrgen_exception_vstart ), .addrgen_fof_exception_o (addrgen_fof_exception ), - .stu_current_burst_exception_o (stu_current_burst_exception), + .lsu_current_burst_exception_o (lsu_current_burst_exception), // Interface with the Mask unit .mask_i (mask ), .mask_valid_i (mask_valid ), diff --git a/hardware/src/ara_dispatcher.sv b/hardware/src/ara_dispatcher.sv index bf47b4be3..23544b671 100644 --- a/hardware/src/ara_dispatcher.sv +++ b/hardware/src/ara_dispatcher.sv @@ -41,9 +41,9 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Interface with the lanes input logic [NrLanes-1:0][4:0] fflags_ex_i, input logic [NrLanes-1:0] fflags_ex_valid_i, - // STU exception-related flush support - output logic stu_ex_flush_o, - input logic stu_ex_flush_done_i, + // LSU exception-related flush support + output logic lsu_ex_flush_o, + input logic lsu_ex_flush_done_i, // Rounding mode is shared between all lanes input logic [NrLanes-1:0] vxsat_flag_i, output vxrm_t [NrLanes-1:0] alu_vxrm_o, @@ -293,49 +293,50 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( .ara_resp_valid_o(ara_resp_valid) ); - // STU exception flush FSM + // LSU exception flush FSM // Upon 
exception, Ara should be flushed as soon as no operations older than the store are ongoing. // For this reason, we should first wait until Ara is idle. Then, we can flush. - logic stu_ex_flush_start, stu_ex_flush_done, stu_ex_flush_done_q; + // Flushes are needed after a faulty memory operation. Even loads need a flush if they access the VRF. + logic lsu_ex_flush_start, lsu_ex_flush_done, lsu_ex_flush_done_q; typedef enum logic [1:0] { - STU_FLUSH_IDLE, - STU_FLUSH, - STU_FLUSH_WAIT, - STU_FLUSH_DONE - } stu_ex_flush_fsm_e; - stu_ex_flush_fsm_e stu_ex_state_d, stu_ex_state_q; + LSU_FLUSH_IDLE, + LSU_FLUSH, + LSU_FLUSH_WAIT, + LSU_FLUSH_DONE + } lsu_ex_flush_fsm_e; + lsu_ex_flush_fsm_e lsu_ex_state_d, lsu_ex_state_q; always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin - stu_ex_state_q <= STU_FLUSH_IDLE; - stu_ex_flush_done_q <= 1'b0; + lsu_ex_state_q <= LSU_FLUSH_IDLE; + lsu_ex_flush_done_q <= 1'b0; end else begin - stu_ex_state_q <= stu_ex_state_d; - stu_ex_flush_done_q <= stu_ex_flush_done_i; + lsu_ex_state_q <= lsu_ex_state_d; + lsu_ex_flush_done_q <= lsu_ex_flush_done_i; end end - always_comb begin : i_stu_ex_flush_fsm - stu_ex_state_d = stu_ex_state_q; - stu_ex_flush_o = 1'b0; - stu_ex_flush_done = 1'b0; + always_comb begin : i_lsu_ex_flush_fsm + lsu_ex_state_d = lsu_ex_state_q; + lsu_ex_flush_o = 1'b0; + lsu_ex_flush_done = 1'b0; - case (stu_ex_state_q) - STU_FLUSH_IDLE: begin - if (stu_ex_flush_start) - stu_ex_state_d = STU_FLUSH; + case (lsu_ex_state_q) + LSU_FLUSH_IDLE: begin + if (lsu_ex_flush_start) + lsu_ex_state_d = LSU_FLUSH; end - STU_FLUSH: begin - stu_ex_flush_o = 1'b1; - stu_ex_state_d = STU_FLUSH_WAIT; + LSU_FLUSH: begin + lsu_ex_flush_o = 1'b1; + lsu_ex_state_d = LSU_FLUSH_WAIT; end - STU_FLUSH_WAIT: begin - if (stu_ex_flush_done_q) - stu_ex_state_d = STU_FLUSH_DONE; + LSU_FLUSH_WAIT: begin + if (lsu_ex_flush_done_q) + lsu_ex_state_d = LSU_FLUSH_DONE; end - STU_FLUSH_DONE: begin - stu_ex_flush_done = 1'b1; - stu_ex_state_d = 
STU_FLUSH_IDLE; + LSU_FLUSH_DONE: begin + lsu_ex_flush_done = 1'b1; + lsu_ex_state_d = LSU_FLUSH_IDLE; end endcase end @@ -384,7 +385,7 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( skip_lmul_checks = 1'b0; - stu_ex_flush_start = 1'b0; + lsu_ex_flush_start = 1'b0; null_vslideup = 1'b0; @@ -447,12 +448,12 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Wait for idle and then flush the stu-related pipes. // This operation is not IPC critical. WAIT_IDLE_FLUSH: begin - if ((stu_ex_state_q == STU_FLUSH_IDLE) && ara_idle_i) begin + if ((lsu_ex_state_q == LSU_FLUSH_IDLE) && ara_idle_i) begin // Start the flush FSM - stu_ex_flush_start = 1'b1; + lsu_ex_flush_start = 1'b1; end // Get back to normal operation once the flush is over - if (stu_ex_state_q == STU_FLUSH_DONE) begin + if (lsu_ex_state_q == LSU_FLUSH_DONE) begin state_d = NORMAL_OPERATION; end end @@ -3075,8 +3076,16 @@ module ara_dispatcher import ara_pkg::*; import rvv_pkg::*; #( // Mask exception if we had a fault-only-first with exception on // idx > 0 acc_resp_o.exception.valid = 1'b0; + // Flush if mask reg was involved in the fof operation + if (!ara_req.vm) begin + state_d = WAIT_IDLE_FLUSH; + end end else if (ara_resp.exception.valid) begin csr_vstart_d = ara_resp.exception_vstart; + // If this load has VRF source operands, flush everything + if (!ara_req.vm || ara_req.use_vs2) begin + state_d = WAIT_IDLE_FLUSH; + end end end end diff --git a/hardware/src/ara_sequencer.sv b/hardware/src/ara_sequencer.sv index 4ed6d0ebc..745b6dddd 100644 --- a/hardware/src/ara_sequencer.sv +++ b/hardware/src/ara_sequencer.sv @@ -50,7 +50,7 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i input vlen_t addrgen_exception_vstart_i, input logic addrgen_fof_exception_i, // Interface with the store unit - input logic stu_current_burst_exception_i + input logic lsu_current_burst_exception_i ); `include "common_cells/registers.svh" @@ -339,8 +339,8 @@ module 
ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // the MASKU insn to be sure that the forwarded value is the scalar one logic running_mask_insn_d, running_mask_insn_q; - logic stu_current_burst_exception_q; - `FF(stu_current_burst_exception_q, stu_current_burst_exception_i, 1'b0, clk_i, rst_ni); + logic lsu_current_burst_exception_q; + `FF(lsu_current_burst_exception_q, lsu_current_burst_exception_i, 1'b0, clk_i, rst_ni); // pe_req_ready_i comes from all the lanes // It is deasserted if the current request is stuck @@ -391,9 +391,9 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i pe_req_d = pe_req_o; pe_req_valid_d = pe_req_valid_o; - // If we are here after a faulty store, wait until the stu signals the exception on the - // current burst before aborting the request. - if (stu_current_burst_exception_q) + // If we are here after a faulty lsu op with VRF sources, + // wait until the lsu signals the exception on the current burst before aborting the request. + if (lsu_current_burst_exception_q) pe_req_valid_d = 1'b0; // We are not ready @@ -542,13 +542,13 @@ module ara_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg::i // 2) Unmasked non-indexed loads only need ack from the addrgen if (no_src_vrf(pe_req_o) && addrgen_ack_i) pe_req_valid_d = 1'b0; - // 4) In case of an exception on this burst, kill the request. + // 3) In case of an exception on this burst, kill the request. // Exceptions on this burst mean that all the valid sources have been fetched from VRF already. // Don't immediately kill when detecting the exception in the addrgen, as previous valid bursts // can still need operands to be fetched from the VRF. 
- if (stu_current_burst_exception_q) + if (lsu_current_burst_exception_q) pe_req_valid_d = 1'b0; - // 3) In the other cases, we need an ack from both addrgen and lanes, so keep up the req + // 4) In the other cases, we need an ack from both addrgen and lanes, so keep up the req // Wait for the address translation if ((is_load(pe_req_d.op) || is_store(pe_req_d.op)) && addrgen_ack_i) begin diff --git a/hardware/src/lane/lane.sv b/hardware/src/lane/lane.sv index 4ffaec129..2eb6e3c4b 100644 --- a/hardware/src/lane/lane.sv +++ b/hardware/src/lane/lane.sv @@ -49,8 +49,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( output logic [4:0] fflags_ex_o, output logic fflags_ex_valid_o, // Support for store exception flush - input logic stu_ex_flush_i, - output logic stu_ex_flush_o, + input logic lsu_ex_flush_i, + output logic lsu_ex_flush_o, // Interface with the sequencer input `STRUCT_PORT_BITS(pe_req_t_bits) pe_req_i, input logic pe_req_valid_i, @@ -223,8 +223,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( logic mask_b_cmd_pop; // Support for store exception flush - logic stu_ex_flush_op_req_d, stu_ex_flush_op_req_q; - `FF(stu_ex_flush_op_req_q, stu_ex_flush_op_req_d, 1'b0, clk_i, rst_ni); + logic lsu_ex_flush_op_req_d, lsu_ex_flush_op_req_q; + `FF(lsu_ex_flush_op_req_q, lsu_ex_flush_op_req_d, 1'b0, clk_i, rst_ni); // Additional signals to please Verilator's hierarchical verilation pe_req_t pe_req; @@ -249,8 +249,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( .pe_req_ready_o (pe_req_ready_o ), .pe_resp_o (pe_resp ), // Support for store exception flush - .stu_ex_flush_i (stu_ex_flush_i ), - .stu_ex_flush_o (stu_ex_flush_op_req_d), + .lsu_ex_flush_i (lsu_ex_flush_i ), + .lsu_ex_flush_o (lsu_ex_flush_op_req_d), // Interface with the operand requesters .operand_request_o (operand_request ), .operand_request_valid_o(operand_request_valid), @@ -306,8 +306,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( // To the slide unit (reductions) 
logic sldu_result_gnt_opqueues; // Support for store exception flush - logic stu_ex_flush_op_queues_d, stu_ex_flush_op_queues_q; - `FF(stu_ex_flush_op_queues_q, stu_ex_flush_op_queues_d, 1'b0, clk_i, rst_ni); + logic lsu_ex_flush_op_queues_d, lsu_ex_flush_op_queues_q; + `FF(lsu_ex_flush_op_queues_q, lsu_ex_flush_op_queues_d, 1'b0, clk_i, rst_ni); operand_requester #( .NrLanes (NrLanes ), @@ -326,8 +326,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( .operand_request_valid_i (operand_request_valid ), .operand_request_ready_o (operand_request_ready ), // Support for store exception flush - .stu_ex_flush_i (stu_ex_flush_op_req_q ), - .stu_ex_flush_o (stu_ex_flush_op_queues_d), + .lsu_ex_flush_i (lsu_ex_flush_op_req_q ), + .lsu_ex_flush_o (lsu_ex_flush_op_queues_d), // Interface with the VRF .vrf_req_o (vrf_req ), .vrf_addr_o (vrf_addr ), @@ -445,8 +445,8 @@ module lane import ara_pkg::*; import rvv_pkg::*; #( .operand_queue_cmd_i (operand_queue_cmd ), .operand_queue_cmd_valid_i (operand_queue_cmd_valid ), // Support for store exception flush - .stu_ex_flush_i (stu_ex_flush_op_queues_q ), - .stu_ex_flush_o (stu_ex_flush_o ), + .lsu_ex_flush_i (lsu_ex_flush_op_queues_q ), + .lsu_ex_flush_o (lsu_ex_flush_o ), // Interface with the Lane Sequencer .mask_b_cmd_pop_o (mask_b_cmd_pop ), // Interface with the VFUs diff --git a/hardware/src/lane/lane_sequencer.sv b/hardware/src/lane/lane_sequencer.sv index 84021d82c..32976cdb1 100644 --- a/hardware/src/lane/lane_sequencer.sv +++ b/hardware/src/lane/lane_sequencer.sv @@ -26,8 +26,8 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: output logic pe_req_ready_o, output pe_resp_t pe_resp_o, // Support for store exception flush - input logic stu_ex_flush_i, - output logic stu_ex_flush_o, + input logic lsu_ex_flush_i, + output logic lsu_ex_flush_o, // Interface with the operand requester output operand_request_cmd_t [NrOperandQueues-1:0] operand_request_o, output logic [NrOperandQueues-1:0] 
operand_request_valid_o, @@ -52,7 +52,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: `include "common_cells/registers.svh" // STU exception support - `FF(stu_ex_flush_o, stu_ex_flush_i, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni); //////////////////////////// // Register the request // @@ -78,7 +78,7 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: ) i_pe_req_register ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .clr_i (stu_ex_flush_o ), + .clr_i (lsu_ex_flush_o ), .testmode_i(1'b0 ), .data_i (pe_req_i ), .valid_i (pe_req_valid_i_msk), @@ -156,8 +156,12 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: end end - // Flush upon store - if (stu_ex_flush_o) operand_request_valid_d[StA] = 1'b0; + // Flush upon mem op with VRF access (st, idx ld, masked mem op) + if (lsu_ex_flush_o) begin + operand_request_valid_d[StA] = 1'b0; + operand_request_valid_d[SlideAddrGenA] = 1'b0; + operand_request_valid_d[MaskM] = 1'b0; + end end always_ff @(posedge clk_i or negedge rst_ni) begin: p_operand_request_ff @@ -294,12 +298,12 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: operand_request_valid_o[MaskM]); end VFU_LoadUnit : pe_req_ready = !(operand_request_valid_o[MaskM] || - (pe_req_i.op == VLXE && operand_request_valid_o[SlideAddrGenA])); + (pe_req.op == VLXE && operand_request_valid_o[SlideAddrGenA])); VFU_SlideUnit: pe_req_ready = !(operand_request_valid_o[SlideAddrGenA]); VFU_StoreUnit: begin pe_req_ready = !(operand_request_valid_o[StA] || operand_request_valid_o[MaskM] || - (pe_req_i.op == VSXE && operand_request_valid_o[SlideAddrGenA])); + (pe_req.op == VSXE && operand_request_valid_o[SlideAddrGenA])); end VFU_MaskUnit : begin pe_req_ready = !(operand_request_valid_o[AluA] || @@ -544,24 +548,24 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Load indexed 
operand_request[SlideAddrGenA] = '{ - id : pe_req_i.id, - vs : pe_req_i.vs2, - eew : pe_req_i.eew_vs2, - conv : pe_req_i.conversion_vs2, + id : pe_req.id, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, + conv : pe_req.conversion_vs2, target_fu: MFPU_ADDRGEN, - vl : pe_req_i.vl / NrLanes, - scale_vl : pe_req_i.scale_vl, + vl : pe_req.vl / NrLanes, + scale_vl : pe_req.scale_vl, vstart : vfu_operation_d.vstart, - vtype : pe_req_i.vtype, - hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, + vtype : pe_req.vtype, + hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, cvt_resize: CVT_SAME, default : '0 }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. - if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) + if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req.vl) operand_request[SlideAddrGenA].vl += 1; - operand_request_push[SlideAddrGenA] = pe_req_i.op == VLXE; + operand_request_push[SlideAddrGenA] = pe_req.op == VLXE; end VFU_StoreUnit : begin @@ -611,24 +615,24 @@ module lane_sequencer import ara_pkg::*; import rvv_pkg::*; import cf_math_pkg:: // Store indexed // TODO: add vstart support here operand_request[SlideAddrGenA] = '{ - id : pe_req_i.id, - vs : pe_req_i.vs2, - eew : pe_req_i.eew_vs2, - conv : pe_req_i.conversion_vs2, + id : pe_req.id, + vs : pe_req.vs2, + eew : pe_req.eew_vs2, + conv : pe_req.conversion_vs2, target_fu: MFPU_ADDRGEN, - vl : pe_req_i.vl / NrLanes, - scale_vl : pe_req_i.scale_vl, + vl : pe_req.vl / NrLanes, + scale_vl : pe_req.scale_vl, vstart : vfu_operation_d.vstart, - vtype : pe_req_i.vtype, - hazard : pe_req_i.hazard_vs2 | pe_req_i.hazard_vd, + vtype : pe_req.vtype, + hazard : pe_req.hazard_vs2 | pe_req.hazard_vd, cvt_resize: CVT_SAME, default : '0 }; // Since this request goes outside of the lane, we might need to request an // extra operand regardless of whether it is valid in this lane or not. 
- if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req_i.vl) + if (operand_request[SlideAddrGenA].vl * NrLanes != pe_req.vl) operand_request[SlideAddrGenA].vl += 1; - operand_request_push[SlideAddrGenA] = pe_req_i.op == VSXE; + operand_request_push[SlideAddrGenA] = pe_req.op == VSXE; end VFU_SlideUnit: begin diff --git a/hardware/src/lane/operand_queues_stage.sv b/hardware/src/lane/operand_queues_stage.sv index 7e320c61c..3310b2de4 100644 --- a/hardware/src/lane/operand_queues_stage.sv +++ b/hardware/src/lane/operand_queues_stage.sv @@ -25,8 +25,8 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math input operand_queue_cmd_t [NrOperandQueues-1:0] operand_queue_cmd_i, input logic [NrOperandQueues-1:0] operand_queue_cmd_valid_i, // Support for store exception flush - input logic stu_ex_flush_i, - output logic stu_ex_flush_o, + input logic lsu_ex_flush_i, + output logic lsu_ex_flush_o, // Interface with the Lane Sequencer output logic mask_b_cmd_pop_o, // Interface with the VFUs @@ -57,7 +57,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math `include "common_cells/registers.svh" // STU flush support - `FF(stu_ex_flush_o, stu_ex_flush_i, 1'b0, clk_i, rst_ni); + `FF(lsu_ex_flush_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni); /////////// // ALU // @@ -225,7 +225,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ) i_operand_queue_st_mask_a ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .flush_i (stu_ex_flush_o ), + .flush_i (lsu_ex_flush_o ), .lane_id_i (lane_id_i ), .operand_queue_cmd_i (operand_queue_cmd_i[StA] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[StA]), @@ -270,7 +270,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ) i_operand_queue_slide_addrgen_a ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .flush_i (stu_ex_flush_o ), + .flush_i (lsu_ex_flush_o ), .lane_id_i (lane_id_i ), .operand_queue_cmd_i 
(operand_queue_cmd_i[SlideAddrGenA] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[SlideAddrGenA] ), @@ -328,7 +328,7 @@ module operand_queues_stage import ara_pkg::*; import rvv_pkg::*; import cf_math ) i_operand_queue_mask_m ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .flush_i (stu_ex_flush_o ), + .flush_i (lsu_ex_flush_o ), .lane_id_i (lane_id_i ), .operand_queue_cmd_i (operand_queue_cmd_i[MaskM] ), .operand_queue_cmd_valid_i(operand_queue_cmd_valid_i[MaskM]), diff --git a/hardware/src/lane/operand_requester.sv b/hardware/src/lane/operand_requester.sv index 9b03dbe6b..a87cd1541 100644 --- a/hardware/src/lane/operand_requester.sv +++ b/hardware/src/lane/operand_requester.sv @@ -28,8 +28,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( input logic [NrOperandQueues-1:0] operand_request_valid_i, output logic [NrOperandQueues-1:0] operand_request_ready_o, // Support for store exception flush - input logic stu_ex_flush_i, - output logic stu_ex_flush_o, + input logic lsu_ex_flush_i, + output logic lsu_ex_flush_o, // Interface with the VRF output logic [NrBanks-1:0] vrf_req_o, output vaddr_t [NrBanks-1:0] vrf_addr_o, @@ -197,10 +197,10 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( always_ff @(posedge clk_i or negedge rst_ni) begin: p_vinsn_result_written_ff if (!rst_ni) begin vinsn_result_written_q <= '0; - stu_ex_flush_o <= 1'b0; + lsu_ex_flush_o <= 1'b0; end else begin vinsn_result_written_q <= vinsn_result_written_d; - stu_ex_flush_o <= stu_ex_flush_i; + lsu_ex_flush_o <= lsu_ex_flush_i; end end @@ -464,8 +464,8 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( // Always keep the hazard bits up to date with the global hazard table requester_metadata_d.hazard &= global_hazard_table_i[requester_metadata_d.id]; - // Kill all store-unit requests in case of exceptions - if (stu_ex_flush_o && (requester_index == StA)) begin : vstu_exception_idle + // Kill all store-unit, idx, and mem-masked requests in 
case of exceptions + if (lsu_ex_flush_o && (requester_index == StA || requester_index == SlideAddrGenA || requester_index == MaskM)) begin : vlsu_exception_idle // Reset state state_d = IDLE; // Don't wake up the store queue (redundant, as it will be flushed anyway) @@ -474,7 +474,7 @@ module operand_requester import ara_pkg::*; import rvv_pkg::*; #( requester_metadata_d = '0; // Flush this request lane_operand_req_transposed[requester_index][bank] = '0; - end : vstu_exception_idle + end : vlsu_exception_idle end : operand_requester always_ff @(posedge clk_i or negedge rst_ni) begin diff --git a/hardware/src/vlsu/addrgen.sv b/hardware/src/vlsu/addrgen.sv index cd085c77e..fd23ecf98 100644 --- a/hardware/src/vlsu/addrgen.sv +++ b/hardware/src/vlsu/addrgen.sv @@ -67,7 +67,9 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( input elen_t [NrLanes-1:0] addrgen_operand_i, input target_fu_e [NrLanes-1:0] addrgen_operand_target_fu_i, input logic [NrLanes-1:0] addrgen_operand_valid_i, - output logic addrgen_operand_ready_o + output logic addrgen_operand_ready_o, + // Indexed LSU exception support + input logic lsu_ex_flush_i ); localparam unsigned DataWidth = $bits(elen_t); @@ -178,12 +180,17 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( logic idx_vaddr_valid_d, idx_vaddr_valid_q; logic idx_vaddr_ready_d, idx_vaddr_ready_q; + // Exception support + // This flush should be done after the backend has been flushed, too + logic lsu_ex_flush_q; + // Break the path from the VRF to the AXI request - spill_register #( + spill_register_flushable #( .T(axi_addr_t) ) i_addrgen_idx_op_spill_reg ( .clk_i (clk_i ), .rst_ni (rst_ni ), + .flush_i(lsu_ex_flush_q ), .valid_i(idx_vaddr_valid_d), .ready_o(idx_vaddr_ready_q), .data_i (idx_final_vaddr_d), @@ -556,6 +563,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( lookahead_addr_e_q <= '0; lookahead_addr_se_q <= '0; lookahead_len_q <= '0; + lsu_ex_flush_q <= 1'b0; end else begin state_q <= state_d; pe_req_q 
<= pe_req_d; @@ -570,6 +578,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( lookahead_addr_e_q <= lookahead_addr_e_d; lookahead_addr_se_q <= lookahead_addr_se_d; lookahead_len_q <= lookahead_len_d; + lsu_ex_flush_q <= lsu_ex_flush_i; end end @@ -1007,7 +1016,7 @@ module addrgen import ara_pkg::*; import rvv_pkg::*; #( end : indexed_data // Ask the MMU for an address translation if virtual memory is enabled - if (en_ld_st_translation_i) begin : translation_req + if (en_ld_st_translation_i && ((state_q != ADDRGEN_IDX_OP) || idx_vaddr_valid_q)) begin : translation_req // Request an address translation mmu_req_d = 1'b1; mmu_vaddr_o = (state_q == ADDRGEN_IDX_OP) ? idx_final_vaddr_q : axi_addrgen_q.addr; diff --git a/hardware/src/vlsu/vldu.sv b/hardware/src/vlsu/vldu.sv index 4171d20c3..934f3c5cf 100644 --- a/hardware/src/vlsu/vldu.sv +++ b/hardware/src/vlsu/vldu.sv @@ -37,6 +37,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrVInsn-1:0] pe_vinsn_running_i, output logic pe_req_ready_o, output pe_resp_t pe_resp_o, + output logic ldu_current_burst_exception_o, // Interface with the address generator input addrgen_axi_req_t axi_addrgen_req_i, input logic axi_addrgen_req_valid_i, @@ -50,6 +51,9 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( output strb_t [NrLanes-1:0] ldu_result_be_o, input logic [NrLanes-1:0] ldu_result_gnt_i, input logic [NrLanes-1:0] ldu_result_final_gnt_i, + // LSU exception support + input logic lsu_ex_flush_i, + output logic lsu_ex_flush_done_o, // Interface with the Mask unit input strb_t [NrLanes-1:0] mask_i, input logic [NrLanes-1:0] mask_valid_i, @@ -73,13 +77,16 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( pe_req_t vinsn_issue_d, vinsn_issue_q; logic vinsn_issue_valid; + // Flush support + logic lsu_ex_flush_q; + for (genvar l = 0; l < NrLanes; l++) begin spill_register_flushable #( .T(strb_t) ) i_vldu_mask_register ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .flush_i (1'b0 ), + .flush_i 
(lsu_ex_flush_q ), .data_o (mask_q[l] ), .valid_o (mask_valid_q[l] ), .ready_i (mask_ready_d ), .valid_i (mask_valid_i[l] ), .ready_o (mask_ready_o[l] ) ); end @@ -240,10 +247,27 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // (8 * NrLanes) bytes. logic [$clog2(8*NrLanes):0] first_payload_byte_d, first_payload_byte_q; logic [$clog2(8*NrLanes):0] vrf_eff_write_bytes; + // Same thing, but for the commit (resqueue -> VRF) + // Track if this VRF write is the first one for this instruction + logic first_result_queue_read_d, first_result_queue_read_q; + logic [$clog2(8*NrLanes):0] res_queue_eff_write_bytes; + + // Signal that the current burst is having an exception + logic ldu_current_burst_exception_d; // Counter to increase the VRF write address. vlen_t seq_word_wr_offset_d, seq_word_wr_offset_q; + // Exception handling FSM + // Needed because of the result queue buffer, which can contain partial + // results upon exception. + enum logic [1:0] { + IDLE, + VALID_RESULT_QUEUE, + WAIT_RESULT_QUEUE, + HANDLE_EXCEPTION + } ldu_ex_state_d, ldu_ex_state_q; + localparam unsigned DataWidthB = DataWidth / 8; always_comb begin: p_vldu @@ -278,6 +302,16 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( mask_ready_d = 1'b0; load_complete_o = 1'b0; + first_result_queue_read_d = first_result_queue_read_q; + + ldu_ex_state_d = ldu_ex_state_q; + + // Normally write multiple of resqueue width + vrf_eff_write_bytes = (NrLanes * DataWidthB); + res_queue_eff_write_bytes = (NrLanes * DataWidthB); + + ldu_current_burst_exception_d = 1'b0; + // Inform the main sequencer if we are idle pe_req_ready_o = !vinsn_queue_full; @@ -470,6 +504,13 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( end : vrf_grant end: vrf_result_write + // How many result bytes can possibly be committed this cycle?
+ res_queue_eff_write_bytes = (NrLanes * DataWidthB); + // If vstart > 0, the first payload can contain less than (NrLanes * DataWidthB) Bytes + if (first_result_queue_read_q) begin + res_queue_eff_write_bytes = first_payload_byte_q; + end + // All lanes accepted the VRF request // Wait for all the final grants, to be sure that all the results were written back if (!(|result_queue_valid_d[result_queue_read_pnt_q]) && @@ -487,8 +528,11 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( // Decrement the counter of results waiting to be written result_queue_cnt_d -= 1; + // The next write will surely not be the first one anymore + first_result_queue_read_d = 1'b0; + // Decrement the counter of remaining vector elements waiting to be written - commit_cnt_bytes_d = commit_cnt_bytes_q - (NrLanes * DataWidthB); + commit_cnt_bytes_d = commit_cnt_bytes_q - res_queue_eff_write_bytes; if (commit_cnt_bytes_q < (NrLanes * DataWidthB)) begin : commit_cnt_bytes_overflow commit_cnt_bytes_d = '0; end : commit_cnt_bytes_overflow @@ -511,47 +555,80 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( vinsn_queue_d.commit_pnt += 1; // Update the commit counter for the next instruction - if (vinsn_queue_d.commit_cnt != '0) + if (vinsn_queue_d.commit_cnt != '0) begin + first_result_queue_read_d = 1'b1; commit_cnt_bytes_d = ( vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vl - vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vstart ) << unsigned'(vinsn_queue_q.vinsn[vinsn_queue_d.commit_pnt].vtype.vsew); + end end : vinsn_done ///////////////////////// // Handle exceptions // ///////////////////////// - // Clear instruction queue in case of exceptions from addrgen - if (vinsn_issue_valid && ((axi_addrgen_req_valid_i && axi_addrgen_req_i.is_exception) || addrgen_illegal_load_i)) begin : exception - // Signal done to sequencer - pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; + // Handle exceptions in the clean way + // We cannot just abort the instruction since results can be in the 
result queue, waiting. + unique case (ldu_ex_state_q) + IDLE: begin + // Handle the exception only if this is the last instruction committing results + if (vinsn_issue_valid && (vinsn_queue_q.commit_cnt == 1) && + ((axi_addrgen_req_valid_i && axi_addrgen_req_i.is_exception) || addrgen_illegal_load_i)) begin + ldu_ex_state_d = VALID_RESULT_QUEUE; + end + end + // Write the partial results to the VRF + VALID_RESULT_QUEUE: begin + ldu_ex_state_d = WAIT_RESULT_QUEUE; + // Send to the lanes what we had written in the resqueue before the exception. + // If this is empty, the byte-enable signals should be zero, so no write happens. + result_queue_valid_d[result_queue_write_pnt_q] |= {NrLanes{1'b1}}; + end + // Wait until the resqueue is empty + WAIT_RESULT_QUEUE: begin + if (!(|result_queue_valid_q[result_queue_read_pnt_q])) begin + ldu_ex_state_d = HANDLE_EXCEPTION; + end + end + // Handle the exception + HANDLE_EXCEPTION: begin + ldu_ex_state_d = IDLE; - // Signal done to sequencer - pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; + // Signal done to sequencer + pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; - // Signal complete load - load_complete_o = 1'b1; + // Signal complete load + load_complete_o = 1'b1; - // Ack the addrgen for this last faulty request - axi_addrgen_req_ready_o = axi_addrgen_req_valid_i; - // Reset axi state - axi_len_d = '0; - axi_r_byte_pnt_d = '0; + // Reset axi state + axi_len_d = '0; + axi_r_byte_pnt_d = '0; - // Update the commit counters and pointers - vinsn_queue_d.commit_cnt -= 1; - if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1) - vinsn_queue_d.commit_pnt = '0; - else - vinsn_queue_d.commit_pnt += 1; + + // Ack the addrgen for this last faulty request + axi_addrgen_req_ready_o = axi_addrgen_req_valid_i; + + // Abort the main sequencer -> operand-req request + ldu_current_burst_exception_d = 1'b1; + + // Increment vector instruction queue pointers and counters + vinsn_queue_d.issue_cnt -= 1; + if (vinsn_queue_q.issue_pnt == (VInsnQueueDepth-1)) begin : issue_pnt_overflow + vinsn_queue_d.issue_pnt = '0; + end :
issue_pnt_overflow + else begin : issue_pnt_increment + vinsn_queue_d.issue_pnt += 1; + end : issue_pnt_increment - // Increment vector instruction queue pointers and counters - vinsn_queue_d.issue_cnt -= 1; - if (vinsn_queue_q.issue_pnt == (VInsnQueueDepth-1)) begin : issue_pnt_overflow - vinsn_queue_d.issue_pnt = '0; - end : issue_pnt_overflow - else begin : issue_pnt_increment - vinsn_queue_d.issue_pnt += 1; - end : issue_pnt_increment - end : exception + // Update the commit counters and pointers + vinsn_queue_d.commit_cnt -= 1; + if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1) + vinsn_queue_d.commit_pnt = '0; + else + vinsn_queue_d.commit_pnt += 1; + end + default:; + endcase ////////////////////////////// // Accept new instruction // @@ -567,6 +644,7 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( issue_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew); end : issue_cnt_bytes_init if (vinsn_queue_d.commit_cnt == '0) begin : commit_cnt_bytes_init + first_result_queue_read_d = 1'b1; commit_cnt_bytes_d = (pe_req_i.vl - pe_req_i.vstart) << unsigned'(pe_req_i.vtype.vsew); end : commit_cnt_bytes_init @@ -589,29 +667,39 @@ module vldu import ara_pkg::*; import rvv_pkg::*; #( always_ff @(posedge clk_i or negedge rst_ni) begin if (!rst_ni) begin - vinsn_running_q <= '0; - issue_cnt_bytes_q <= '0; - commit_cnt_bytes_q <= '0; - axi_len_q <= '0; - axi_r_byte_pnt_q <= '0; - vrf_word_byte_pnt_q <= '0; - pe_resp_o <= '0; - result_final_gnt_q <= '0; - seq_word_wr_offset_q <= '0; - first_payload_byte_q <= '0; - vrf_word_byte_cnt_q <= '0; + vinsn_running_q <= '0; + issue_cnt_bytes_q <= '0; + commit_cnt_bytes_q <= '0; + axi_len_q <= '0; + axi_r_byte_pnt_q <= '0; + vrf_word_byte_pnt_q <= '0; + pe_resp_o <= '0; + result_final_gnt_q <= '0; + seq_word_wr_offset_q <= '0; + first_payload_byte_q <= '0; + vrf_word_byte_cnt_q <= '0; + lsu_ex_flush_q <= 1'b0; + lsu_ex_flush_done_o <= 1'b0; + ldu_current_burst_exception_o <=
1'b0; + ldu_ex_state_q <= IDLE; + first_result_queue_read_q <= 1'b0; end else begin - vinsn_running_q <= vinsn_running_d; - issue_cnt_bytes_q <= issue_cnt_bytes_d; - commit_cnt_bytes_q <= commit_cnt_bytes_d; - axi_len_q <= axi_len_d; - axi_r_byte_pnt_q <= axi_r_byte_pnt_d; - vrf_word_byte_pnt_q <= vrf_word_byte_pnt_d; - pe_resp_o <= pe_resp_d; - result_final_gnt_q <= result_final_gnt_d; - seq_word_wr_offset_q <= seq_word_wr_offset_d; - first_payload_byte_q <= first_payload_byte_d; - vrf_word_byte_cnt_q <= vrf_word_byte_cnt_d; + vinsn_running_q <= vinsn_running_d; + issue_cnt_bytes_q <= issue_cnt_bytes_d; + commit_cnt_bytes_q <= commit_cnt_bytes_d; + axi_len_q <= axi_len_d; + axi_r_byte_pnt_q <= axi_r_byte_pnt_d; + vrf_word_byte_pnt_q <= vrf_word_byte_pnt_d; + pe_resp_o <= pe_resp_d; + result_final_gnt_q <= result_final_gnt_d; + seq_word_wr_offset_q <= seq_word_wr_offset_d; + first_payload_byte_q <= first_payload_byte_d; + vrf_word_byte_cnt_q <= vrf_word_byte_cnt_d; + lsu_ex_flush_q <= lsu_ex_flush_i; + lsu_ex_flush_done_o <= lsu_ex_flush_q; + ldu_current_burst_exception_o <= ldu_current_burst_exception_d; + ldu_ex_state_q <= ldu_ex_state_d; + first_result_queue_read_q <= first_result_queue_read_d; end end diff --git a/hardware/src/vlsu/vlsu.sv b/hardware/src/vlsu/vlsu.sv index e3797f88b..c449d73a3 100644 --- a/hardware/src/vlsu/vlsu.sv +++ b/hardware/src/vlsu/vlsu.sv @@ -49,7 +49,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( output ariane_pkg::exception_t addrgen_exception_o, output vlen_t addrgen_exception_vstart_o, output logic addrgen_fof_exception_o, - output logic stu_current_burst_exception_o, + output logic lsu_current_burst_exception_o, // Interface with the lanes // Store unit operands input elen_t [NrLanes-1:0] stu_operand_i, @@ -61,8 +61,8 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrLanes-1:0] addrgen_operand_valid_i, output logic addrgen_operand_ready_o, // STU exception support - input logic stu_ex_flush_i, - 
output logic stu_ex_flush_done_o, + input logic lsu_ex_flush_i, + output logic lsu_ex_flush_done_o, // Interface with the Mask unit input strb_t [NrLanes-1:0] mask_i, input logic [NrLanes-1:0] mask_valid_i, @@ -97,11 +97,18 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( input logic [NrLanes-1:0] ldu_result_final_gnt_i ); + `include "common_cells/registers.svh" + logic load_complete, store_complete; logic addrgen_illegal_load, addrgen_illegal_store; assign load_complete_o = load_complete; assign store_complete_o = store_complete; + logic stu_current_burst_exception, ldu_current_burst_exception; + assign lsu_current_burst_exception_o = stu_current_burst_exception | ldu_current_burst_exception; + + `FF(lsu_ex_flush_done_o, lsu_ex_flush_i, 1'b0, clk_i, rst_ni); + /////////////////// // Definitions // /////////////////// @@ -184,6 +191,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .axi_addrgen_req_valid_o (axi_addrgen_req_valid ), .ldu_axi_addrgen_req_ready_i(ldu_axi_addrgen_req_ready ), .stu_axi_addrgen_req_ready_i(stu_axi_addrgen_req_ready ), + .lsu_ex_flush_i (lsu_ex_flush_i ), // CSR input .en_ld_st_translation_i, @@ -226,6 +234,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_vinsn_running_i (pe_vinsn_running_i ), .pe_req_ready_o (pe_req_ready_o[OffsetLoad]), .pe_resp_o (pe_resp_o[OffsetLoad] ), + .ldu_current_burst_exception_o (ldu_current_burst_exception), // Interface with the address generator .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ), @@ -242,7 +251,8 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .ldu_result_wdata_o (ldu_result_wdata_o ), .ldu_result_be_o (ldu_result_be_o ), .ldu_result_gnt_i (ldu_result_gnt_i ), - .ldu_result_final_gnt_i (ldu_result_final_gnt_i ) + .ldu_result_final_gnt_i (ldu_result_final_gnt_i ), + .lsu_ex_flush_i (lsu_ex_flush_i ) ); ///////////////////////// @@ -278,7 +288,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .pe_vinsn_running_i 
(pe_vinsn_running_i ), .pe_req_ready_o (pe_req_ready_o[OffsetStore]), .pe_resp_o (pe_resp_o[OffsetStore] ), - .stu_current_burst_exception_o (stu_current_burst_exception_o), + .stu_current_burst_exception_o (stu_current_burst_exception), // Interface with the address generator .axi_addrgen_req_i (axi_addrgen_req ), .axi_addrgen_req_valid_i(axi_addrgen_req_valid ), @@ -292,8 +302,7 @@ module vlsu import ara_pkg::*; import rvv_pkg::*; #( .stu_operand_i (stu_operand_i ), .stu_operand_valid_i (stu_operand_valid_i ), .stu_operand_ready_o (stu_operand_ready_o ), - .stu_ex_flush_i (stu_ex_flush_i ), - .stu_ex_flush_done_o (stu_ex_flush_done_o ) + .lsu_ex_flush_i (lsu_ex_flush_i ) ); ////////////////// diff --git a/hardware/src/vlsu/vstu.sv b/hardware/src/vlsu/vstu.sv index 56e4ff130..845c81a37 100644 --- a/hardware/src/vlsu/vstu.sv +++ b/hardware/src/vlsu/vstu.sv @@ -58,9 +58,8 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( input elen_t [NrLanes-1:0] stu_operand_i, input logic [NrLanes-1:0] stu_operand_valid_i, output logic [NrLanes-1:0] stu_operand_ready_o, - // STU exception support - input logic stu_ex_flush_i, - output logic stu_ex_flush_done_o, + // LSU exception support + input logic lsu_ex_flush_i, // Interface with the Mask unit input strb_t [NrLanes-1:0] mask_i, input logic [NrLanes-1:0] mask_valid_i, @@ -81,7 +80,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( elen_t [NrLanes-1:0] stu_operand; logic [NrLanes-1:0] stu_operand_valid; logic [NrLanes-1:0] stu_operand_ready; - logic stu_ex_flush_q; + logic lsu_ex_flush_q; for (genvar lane = 0; lane < NrLanes; lane++) begin: gen_regs fall_through_register #( @@ -89,7 +88,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( ) i_register ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .clr_i (stu_ex_flush_q ), + .clr_i (lsu_ex_flush_q ), .testmode_i(1'b0 ), .data_i (stu_operand_i[lane] ), .valid_i (stu_operand_valid_i[lane]), @@ -118,7 +117,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( ) 
i_vstu_mask_register ( .clk_i (clk_i ), .rst_ni (rst_ni ), - .flush_i (stu_ex_flush_q ), + .flush_i (lsu_ex_flush_q ), .data_o (mask_q[l] ), .valid_o (mask_valid_q[l] ), .ready_i (mask_ready_d ), @@ -473,21 +472,19 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( stu_current_burst_exception_d = 1'b1; // Mark the vector instruction as being done - // if (vinsn_queue_d.issue_pnt != vinsn_queue_d.commit_pnt) begin : instr_done - // Signal done to sequencer - store_complete_o = 1'b1; - - pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; - - // Update the commit counters and pointers - vinsn_queue_d.commit_cnt -= 1; - if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1) begin : commit_pnt_overflow - vinsn_queue_d.commit_pnt = '0; - end : commit_pnt_overflow - else begin : commit_pnt_increment - vinsn_queue_d.commit_pnt += 1; - end : commit_pnt_increment - // end : instr_done + // Signal done to sequencer + store_complete_o = 1'b1; + + pe_resp_d.vinsn_done[vinsn_commit.id] = 1'b1; + + // Update the commit counters and pointers + vinsn_queue_d.commit_cnt -= 1; + if (vinsn_queue_d.commit_pnt == VInsnQueueDepth-1) begin : commit_pnt_overflow + vinsn_queue_d.commit_pnt = '0; + end : commit_pnt_overflow + else begin : commit_pnt_increment + vinsn_queue_d.commit_pnt += 1; + end : commit_pnt_increment end : exception ////////////////////////////// @@ -537,9 +534,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( vrf_cnt_q <= '0; - stu_ex_flush_q <= 1'b0; - - stu_ex_flush_done_o <= 1'b0; + lsu_ex_flush_q <= 1'b0; stu_current_burst_exception_o <= 1'b0; end else begin @@ -556,9 +551,7 @@ module vstu import ara_pkg::*; import rvv_pkg::*; #( vrf_cnt_q <= vrf_cnt_d; - stu_ex_flush_q <= stu_ex_flush_i; - - stu_ex_flush_done_o <= stu_ex_flush_q; + lsu_ex_flush_q <= lsu_ex_flush_i; stu_current_burst_exception_o <= stu_current_burst_exception_d; end