diff --git a/kernel/arm64/amax.S b/kernel/arm64/amax.S index c02321ae0e..f535ddf273 100644 --- a/kernel/arm64/amax.S +++ b/kernel/arm64/amax.S @@ -160,62 +160,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble amax_kernel_zero + ble .Lamax_kernel_zero cmp INC_X, xzr - ble amax_kernel_zero + ble .Lamax_kernel_zero cmp INC_X, #1 - bne amax_kernel_S_BEGIN + bne .Lamax_kernel_S_BEGIN -amax_kernel_F_BEGIN: +.Lamax_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq amax_kernel_F1_INIT + beq .Lamax_kernel_F1_INIT INIT_F4 subs I, I, #1 - beq amax_kernel_F1 + beq .Lamax_kernel_F1 -amax_kernel_F4: +.Lamax_kernel_F4: KERNEL_F4 subs I, I, #1 - bne amax_kernel_F4 + bne .Lamax_kernel_F4 -amax_kernel_F1: +.Lamax_kernel_F1: ands I, N, #3 - ble amax_kernel_L999 + ble .Lamax_kernel_L999 -amax_kernel_F10: +.Lamax_kernel_F10: KERNEL_F1 subs I, I, #1 - bne amax_kernel_F10 + bne .Lamax_kernel_F10 ret -amax_kernel_F1_INIT: +.Lamax_kernel_F1_INIT: INIT_F1 subs N, N, #1 - b amax_kernel_F1 + b .Lamax_kernel_F1 -amax_kernel_S_BEGIN: +.Lamax_kernel_S_BEGIN: INIT_S subs N, N, #1 - ble amax_kernel_L999 + ble .Lamax_kernel_L999 asr I, N, #2 cmp I, xzr - ble amax_kernel_S1 + ble .Lamax_kernel_S1 -amax_kernel_S4: +.Lamax_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -223,25 +223,25 @@ amax_kernel_S4: KERNEL_S1 subs I, I, #1 - bne amax_kernel_S4 + bne .Lamax_kernel_S4 -amax_kernel_S1: +.Lamax_kernel_S1: ands I, N, #3 - ble amax_kernel_L999 + ble .Lamax_kernel_L999 -amax_kernel_S10: +.Lamax_kernel_S10: KERNEL_S1 subs I, I, #1 - bne amax_kernel_S10 + bne .Lamax_kernel_S10 -amax_kernel_L999: +.Lamax_kernel_L999: ret -amax_kernel_zero: +.Lamax_kernel_zero: fmov MAXF, REG0 ret diff --git a/kernel/arm64/asum.S b/kernel/arm64/asum.S index bee8927b17..e88eb07c2e 100644 --- a/kernel/arm64/asum.S +++ b/kernel/arm64/asum.S @@ -122,52 +122,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif cmp N, xzr - ble asum_kernel_L999 + ble .Lasum_kernel_L999 cmp INC_X, xzr - ble asum_kernel_L999 + ble .Lasum_kernel_L999 cmp INC_X, #1 - bne asum_kernel_S_BEGIN + bne .Lasum_kernel_S_BEGIN -asum_kernel_F_BEGIN: +.Lasum_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq asum_kernel_F1 + beq .Lasum_kernel_F1 -asum_kernel_F8: +.Lasum_kernel_F8: KERNEL_F8 subs I, I, #1 - bne asum_kernel_F8 + bne .Lasum_kernel_F8 KERNEL_F8_FINALIZE -asum_kernel_F1: +.Lasum_kernel_F1: ands I, N, #7 - ble asum_kernel_L999 + ble .Lasum_kernel_L999 -asum_kernel_F10: +.Lasum_kernel_F10: KERNEL_F1 subs I, I, #1 - bne asum_kernel_F10 + bne .Lasum_kernel_F10 -asum_kernel_L999: +.Lasum_kernel_L999: ret -asum_kernel_S_BEGIN: +.Lasum_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble asum_kernel_S1 + ble .Lasum_kernel_S1 -asum_kernel_S4: +.Lasum_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -175,19 +175,19 @@ asum_kernel_S4: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S4 + bne .Lasum_kernel_S4 -asum_kernel_S1: +.Lasum_kernel_S1: ands I, N, #3 - ble asum_kernel_L999 + ble .Lasum_kernel_L999 -asum_kernel_S10: +.Lasum_kernel_S10: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S10 + bne .Lasum_kernel_S10 ret diff --git a/kernel/arm64/axpy.S b/kernel/arm64/axpy.S index 554902c09c..8094351105 100644 --- a/kernel/arm64/axpy.S +++ b/kernel/arm64/axpy.S @@ -135,53 +135,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble axpy_kernel_L999 + ble .Laxpy_kernel_L999 fcmp DA, #0.0 - beq axpy_kernel_L999 + beq .Laxpy_kernel_L999 cmp INC_X, #1 - bne axpy_kernel_S_BEGIN + bne .Laxpy_kernel_S_BEGIN cmp INC_Y, #1 - bne axpy_kernel_S_BEGIN + bne .Laxpy_kernel_S_BEGIN -axpy_kernel_F_BEGIN: +.Laxpy_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq axpy_kernel_F1 + beq .Laxpy_kernel_F1 -axpy_kernel_F8: +.Laxpy_kernel_F8: KERNEL_F8 subs I, I, #1 - bne axpy_kernel_F8 + bne .Laxpy_kernel_F8 -axpy_kernel_F1: +.Laxpy_kernel_F1: ands I, N, #7 - ble axpy_kernel_L999 + ble .Laxpy_kernel_L999 -axpy_kernel_F10: +.Laxpy_kernel_F10: KERNEL_F1 subs I, I, #1 - bne axpy_kernel_F10 + bne .Laxpy_kernel_F10 mov w0, wzr ret -axpy_kernel_S_BEGIN: +.Laxpy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble axpy_kernel_S1 + ble .Laxpy_kernel_S1 -axpy_kernel_S4: +.Laxpy_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -189,21 +189,21 @@ axpy_kernel_S4: KERNEL_S1 subs I, I, #1 - bne axpy_kernel_S4 + bne .Laxpy_kernel_S4 -axpy_kernel_S1: +.Laxpy_kernel_S1: ands I, N, #3 - ble axpy_kernel_L999 + ble .Laxpy_kernel_L999 -axpy_kernel_S10: +.Laxpy_kernel_S10: KERNEL_S1 subs I, I, #1 - bne axpy_kernel_S10 + bne .Laxpy_kernel_S10 -axpy_kernel_L999: +.Laxpy_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/casum.S b/kernel/arm64/casum.S index 8f09eecfa5..7c82827a54 100644 --- a/kernel/arm64/casum.S +++ b/kernel/arm64/casum.S @@ -98,52 +98,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov s1, SUMF cmp N, xzr - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 cmp INC_X, xzr - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 cmp INC_X, #1 - bne asum_kernel_S_BEGIN + bne .Lcasum_kernel_S_BEGIN -asum_kernel_F_BEGIN: +.Lcasum_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq asum_kernel_F1 + beq .Lcasum_kernel_F1 -asum_kernel_F8: +.Lcasum_kernel_F8: KERNEL_F8 subs I, I, #1 - bne asum_kernel_F8 + bne .Lcasum_kernel_F8 KERNEL_F8_FINALIZE -asum_kernel_F1: +.Lcasum_kernel_F1: ands I, N, #7 - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 -asum_kernel_F10: +.Lcasum_kernel_F10: KERNEL_F1 subs I, I, #1 - bne asum_kernel_F10 + bne .Lcasum_kernel_F10 -asum_kernel_L999: +.Lcasum_kernel_L999: ret -asum_kernel_S_BEGIN: +.Lcasum_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble asum_kernel_S1 + ble .Lcasum_kernel_S1 -asum_kernel_S4: +.Lcasum_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -151,19 +151,19 @@ asum_kernel_S4: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S4 + bne .Lcasum_kernel_S4 -asum_kernel_S1: +.Lcasum_kernel_S1: ands I, N, #3 - ble asum_kernel_L999 + ble .Lcasum_kernel_L999 -asum_kernel_S10: +.Lcasum_kernel_S10: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S10 + bne .Lcasum_kernel_S10 ret diff --git a/kernel/arm64/cgemm_kernel_4x4.S b/kernel/arm64/cgemm_kernel_4x4.S index 7f2ddea074..bbf0c7537c 100644 --- a/kernel/arm64/cgemm_kernel_4x4.S +++ b/kernel/arm64/cgemm_kernel_4x4.S @@ -1072,11 +1072,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble cgemm_kernel_L2_BEGIN + ble .Lcgemm_kernel_L2_BEGIN /******************************************************************************/ -cgemm_kernel_L4_BEGIN: +.Lcgemm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -1084,96 +1084,96 @@ cgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array add ppA, temp, pA -cgemm_kernel_L4_M8_BEGIN: +.Lcgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L4_M4_BEGIN + ble .Lcgemm_kernel_L4_M4_BEGIN -cgemm_kernel_L4_M8_20: +.Lcgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt cgemm_kernel_L4_M8_32 + blt .Lcgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 // subtract 2 - ble cgemm_kernel_L4_M8_22a + ble .Lcgemm_kernel_L4_M8_22a .align 5 -cgemm_kernel_L4_M8_22: +.Lcgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M8_22 + bgt .Lcgemm_kernel_L4_M8_22 -cgemm_kernel_L4_M8_22a: +.Lcgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_32: +.Lcgemm_kernel_L4_M8_32: tst counterL, #1 - ble cgemm_kernel_L4_M8_40 + ble .Lcgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_40: +.Lcgemm_kernel_L4_M8_40: INIT8x4 -cgemm_kernel_L4_M8_44: +.Lcgemm_kernel_L4_M8_44: ands counterL , origK, #1 - ble cgemm_kernel_L4_M8_100 + ble .Lcgemm_kernel_L4_M8_100 -cgemm_kernel_L4_M8_46: +.Lcgemm_kernel_L4_M8_46: KERNEL8x4_SUB -cgemm_kernel_L4_M8_100: +.Lcgemm_kernel_L4_M8_100: SAVE8x4 -cgemm_kernel_L4_M8_END: +.Lcgemm_kernel_L4_M8_END: lsl temp, origK, #5 // k * 4 * 8 add pA, pA, temp add ppA, ppA, temp subs counterI, counterI, #1 - bne cgemm_kernel_L4_M8_20 + bne .Lcgemm_kernel_L4_M8_20 -cgemm_kernel_L4_M4_BEGIN: +.Lcgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #4 - ble cgemm_kernel_L4_M2_BEGIN + ble .Lcgemm_kernel_L4_M2_BEGIN -cgemm_kernel_L4_M4_20: +.Lcgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L4_M4_40 + ble .Lcgemm_kernel_L4_M4_40 -cgemm_kernel_L4_M4_22: +.Lcgemm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB @@ -1186,47 +1186,47 @@ cgemm_kernel_L4_M4_22: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_22 + bgt .Lcgemm_kernel_L4_M4_22 -cgemm_kernel_L4_M4_40: +.Lcgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M4_100 + ble .Lcgemm_kernel_L4_M4_100 -cgemm_kernel_L4_M4_42: +.Lcgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_42 + bgt .Lcgemm_kernel_L4_M4_42 -cgemm_kernel_L4_M4_100: +.Lcgemm_kernel_L4_M4_100: SAVE4x4 -cgemm_kernel_L4_M4_END: +.Lcgemm_kernel_L4_M4_END: -cgemm_kernel_L4_M2_BEGIN: +.Lcgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L4_M1_BEGIN + ble .Lcgemm_kernel_L4_M1_BEGIN -cgemm_kernel_L4_M2_20: +.Lcgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M2_40 + ble .Lcgemm_kernel_L4_M2_40 -cgemm_kernel_L4_M2_22: +.Lcgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1239,43 +1239,43 @@ cgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_22 + bgt .Lcgemm_kernel_L4_M2_22 -cgemm_kernel_L4_M2_40: +.Lcgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M2_100 + ble .Lcgemm_kernel_L4_M2_100 -cgemm_kernel_L4_M2_42: +.Lcgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_42 + bgt .Lcgemm_kernel_L4_M2_42 -cgemm_kernel_L4_M2_100: +.Lcgemm_kernel_L4_M2_100: SAVE2x4 -cgemm_kernel_L4_M2_END: +.Lcgemm_kernel_L4_M2_END: -cgemm_kernel_L4_M1_BEGIN: +.Lcgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END -cgemm_kernel_L4_M1_20: +.Lcgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M1_40 + ble .Lcgemm_kernel_L4_M1_40 -cgemm_kernel_L4_M1_22: +.Lcgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1287,45 +1287,45 @@ cgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_22 + bgt .Lcgemm_kernel_L4_M1_22 -cgemm_kernel_L4_M1_40: +.Lcgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M1_100 + ble .Lcgemm_kernel_L4_M1_100 -cgemm_kernel_L4_M1_42: +.Lcgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_42 + bgt .Lcgemm_kernel_L4_M1_42 -cgemm_kernel_L4_M1_100: +.Lcgemm_kernel_L4_M1_100: SAVE1x4 -cgemm_kernel_L4_END: +.Lcgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt cgemm_kernel_L4_BEGIN + bgt .Lcgemm_kernel_L4_BEGIN /******************************************************************************/ -cgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble cgemm_kernel_L999 // error, N was less than 4? + ble .Lcgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble cgemm_kernel_L1_BEGIN + ble .Lcgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1335,24 +1335,24 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction -cgemm_kernel_L2_M4_BEGIN: +.Lcgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble cgemm_kernel_L2_M2_BEGIN + ble .Lcgemm_kernel_L2_M2_BEGIN -cgemm_kernel_L2_M4_20: +.Lcgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M4_40 + ble .Lcgemm_kernel_L2_M4_40 .align 5 -cgemm_kernel_L2_M4_22: +.Lcgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1364,50 +1364,50 @@ cgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_22 + bgt .Lcgemm_kernel_L2_M4_22 -cgemm_kernel_L2_M4_40: +.Lcgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M4_100 + ble .Lcgemm_kernel_L2_M4_100 -cgemm_kernel_L2_M4_42: +.Lcgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_42 + bgt .Lcgemm_kernel_L2_M4_42 -cgemm_kernel_L2_M4_100: +.Lcgemm_kernel_L2_M4_100: SAVE4x2 -cgemm_kernel_L2_M4_END: +.Lcgemm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L2_M4_20 + bgt .Lcgemm_kernel_L2_M4_20 -cgemm_kernel_L2_M2_BEGIN: +.Lcgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L2_M1_BEGIN + ble .Lcgemm_kernel_L2_M1_BEGIN -cgemm_kernel_L2_M2_20: +.Lcgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M2_40 + ble .Lcgemm_kernel_L2_M2_40 -cgemm_kernel_L2_M2_22: +.Lcgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1420,43 +1420,43 @@ cgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_22 + bgt .Lcgemm_kernel_L2_M2_22 -cgemm_kernel_L2_M2_40: +.Lcgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M2_100 + ble .Lcgemm_kernel_L2_M2_100 -cgemm_kernel_L2_M2_42: +.Lcgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_42 + bgt .Lcgemm_kernel_L2_M2_42 -cgemm_kernel_L2_M2_100: +.Lcgemm_kernel_L2_M2_100: SAVE2x2 -cgemm_kernel_L2_M2_END: +.Lcgemm_kernel_L2_M2_END: -cgemm_kernel_L2_M1_BEGIN: +.Lcgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END -cgemm_kernel_L2_M1_20: +.Lcgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L2_M1_40 + ble .Lcgemm_kernel_L2_M1_40 -cgemm_kernel_L2_M1_22: +.Lcgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1468,36 +1468,36 @@ cgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_22 + bgt .Lcgemm_kernel_L2_M1_22 -cgemm_kernel_L2_M1_40: +.Lcgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M1_100 + ble .Lcgemm_kernel_L2_M1_100 -cgemm_kernel_L2_M1_42: +.Lcgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_42 + bgt .Lcgemm_kernel_L2_M1_42 -cgemm_kernel_L2_M1_100: +.Lcgemm_kernel_L2_M1_100: SAVE1x2 -cgemm_kernel_L2_END: +.Lcgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -cgemm_kernel_L1_BEGIN: +.Lcgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble cgemm_kernel_L999 // done + ble .Lcgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1507,24 +1507,24 @@ cgemm_kernel_L1_BEGIN: -cgemm_kernel_L1_M4_BEGIN: +.Lcgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble cgemm_kernel_L1_M2_BEGIN + ble .Lcgemm_kernel_L1_M2_BEGIN -cgemm_kernel_L1_M4_20: +.Lcgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M4_40 + ble .Lcgemm_kernel_L1_M4_40 .align 5 -cgemm_kernel_L1_M4_22: +.Lcgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1536,50 +1536,50 @@ cgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_22 + bgt .Lcgemm_kernel_L1_M4_22 -cgemm_kernel_L1_M4_40: +.Lcgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M4_100 + ble .Lcgemm_kernel_L1_M4_100 -cgemm_kernel_L1_M4_42: +.Lcgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_42 + bgt .Lcgemm_kernel_L1_M4_42 -cgemm_kernel_L1_M4_100: +.Lcgemm_kernel_L1_M4_100: SAVE4x1 -cgemm_kernel_L1_M4_END: +.Lcgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L1_M4_20 + bgt .Lcgemm_kernel_L1_M4_20 -cgemm_kernel_L1_M2_BEGIN: +.Lcgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L1_M1_BEGIN + ble .Lcgemm_kernel_L1_M1_BEGIN -cgemm_kernel_L1_M2_20: +.Lcgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M2_40 + ble .Lcgemm_kernel_L1_M2_40 -cgemm_kernel_L1_M2_22: +.Lcgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1592,43 +1592,43 @@ cgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_22 + bgt .Lcgemm_kernel_L1_M2_22 -cgemm_kernel_L1_M2_40: +.Lcgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M2_100 + ble .Lcgemm_kernel_L1_M2_100 -cgemm_kernel_L1_M2_42: +.Lcgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_42 + bgt .Lcgemm_kernel_L1_M2_42 -cgemm_kernel_L1_M2_100: +.Lcgemm_kernel_L1_M2_100: SAVE2x1 -cgemm_kernel_L1_M2_END: +.Lcgemm_kernel_L1_M2_END: -cgemm_kernel_L1_M1_BEGIN: +.Lcgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END -cgemm_kernel_L1_M1_20: +.Lcgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M1_40 + ble .Lcgemm_kernel_L1_M1_40 -cgemm_kernel_L1_M1_22: +.Lcgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1640,30 +1640,30 @@ cgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_22 + bgt .Lcgemm_kernel_L1_M1_22 -cgemm_kernel_L1_M1_40: +.Lcgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M1_100 + ble .Lcgemm_kernel_L1_M1_100 -cgemm_kernel_L1_M1_42: +.Lcgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_42 + bgt .Lcgemm_kernel_L1_M1_42 -cgemm_kernel_L1_M1_100: +.Lcgemm_kernel_L1_M1_100: SAVE1x1 -cgemm_kernel_L1_END: +.Lcgemm_kernel_L1_END: -cgemm_kernel_L999: +.Lcgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S index 5d1462808b..24e08a646a 100644 --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -1407,11 +1407,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble cgemm_kernel_L2_BEGIN + ble .Lcgemm_kernel_L2_BEGIN /******************************************************************************/ -cgemm_kernel_L4_BEGIN: +.Lcgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1421,21 +1421,21 @@ cgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -cgemm_kernel_L4_M8_BEGIN: +.Lcgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L4_M4_BEGIN + ble .Lcgemm_kernel_L4_M4_BEGIN .align 5 -cgemm_kernel_L4_M8_20: +.Lcgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 - blt cgemm_kernel_L4_M8_32 + blt .Lcgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -1447,10 +1447,10 @@ cgemm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble cgemm_kernel_L4_M8_22a + ble .Lcgemm_kernel_L4_M8_22a .align 5 -cgemm_kernel_L4_M8_22: +.Lcgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 @@ -1462,10 +1462,10 @@ cgemm_kernel_L4_M8_22: KERNEL8x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M8_22 + bgt .Lcgemm_kernel_L4_M8_22 .align 5 -cgemm_kernel_L4_M8_22a: +.Lcgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 @@ -1476,13 +1476,13 @@ cgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 .align 5 -cgemm_kernel_L4_M8_32: +.Lcgemm_kernel_L4_M8_32: tst counterL, #1 - ble cgemm_kernel_L4_M8_40 + ble .Lcgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -1493,116 +1493,116 @@ cgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_40: +.Lcgemm_kernel_L4_M8_40: INIT8x4 -cgemm_kernel_L4_M8_44: +.Lcgemm_kernel_L4_M8_44: ands counterL , origK, #7 - ble cgemm_kernel_L4_M8_100 + ble .Lcgemm_kernel_L4_M8_100 .align 5 -cgemm_kernel_L4_M8_46: +.Lcgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne cgemm_kernel_L4_M8_46 + bne .Lcgemm_kernel_L4_M8_46 -cgemm_kernel_L4_M8_100: +.Lcgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 -cgemm_kernel_L4_M8_END: +.Lcgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne cgemm_kernel_L4_M8_20 + bne .Lcgemm_kernel_L4_M8_20 -cgemm_kernel_L4_M4_BEGIN: +.Lcgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #4 - ble cgemm_kernel_L4_M2_BEGIN + ble .Lcgemm_kernel_L4_M2_BEGIN -cgemm_kernel_L4_M4_20: +.Lcgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt cgemm_kernel_L4_M4_32 + blt .Lcgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble cgemm_kernel_L4_M4_22a + ble .Lcgemm_kernel_L4_M4_22a .align 5 -cgemm_kernel_L4_M4_22: +.Lcgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_22 + bgt .Lcgemm_kernel_L4_M4_22 -cgemm_kernel_L4_M4_22a: +.Lcgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_32: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_32: tst counterL, #1 - ble cgemm_kernel_L4_M4_40 + ble .Lcgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_40: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_40: INIT4x4 -cgemm_kernel_L4_M4_44: +.Lcgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble cgemm_kernel_L4_M4_100 + ble .Lcgemm_kernel_L4_M4_100 -cgemm_kernel_L4_M4_46: +.Lcgemm_kernel_L4_M4_46: KERNEL4x4_SUB -cgemm_kernel_L4_M4_100: +.Lcgemm_kernel_L4_M4_100: SAVE4x4 -cgemm_kernel_L4_M4_END: +.Lcgemm_kernel_L4_M4_END: -cgemm_kernel_L4_M2_BEGIN: +.Lcgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L4_M1_BEGIN + ble .Lcgemm_kernel_L4_M1_BEGIN -cgemm_kernel_L4_M2_20: +.Lcgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M2_40 + ble .Lcgemm_kernel_L4_M2_40 -cgemm_kernel_L4_M2_22: +.Lcgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1615,43 +1615,43 @@ cgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_22 + bgt .Lcgemm_kernel_L4_M2_22 -cgemm_kernel_L4_M2_40: +.Lcgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M2_100 + ble .Lcgemm_kernel_L4_M2_100 -cgemm_kernel_L4_M2_42: +.Lcgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_42 + bgt .Lcgemm_kernel_L4_M2_42 -cgemm_kernel_L4_M2_100: +.Lcgemm_kernel_L4_M2_100: SAVE2x4 -cgemm_kernel_L4_M2_END: +.Lcgemm_kernel_L4_M2_END: -cgemm_kernel_L4_M1_BEGIN: +.Lcgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END -cgemm_kernel_L4_M1_20: +.Lcgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M1_40 + ble .Lcgemm_kernel_L4_M1_40 -cgemm_kernel_L4_M1_22: +.Lcgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1663,45 +1663,45 @@ cgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_22 + bgt .Lcgemm_kernel_L4_M1_22 -cgemm_kernel_L4_M1_40: +.Lcgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M1_100 + ble .Lcgemm_kernel_L4_M1_100 -cgemm_kernel_L4_M1_42: +.Lcgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_42 + bgt .Lcgemm_kernel_L4_M1_42 -cgemm_kernel_L4_M1_100: +.Lcgemm_kernel_L4_M1_100: SAVE1x4 -cgemm_kernel_L4_END: +.Lcgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt cgemm_kernel_L4_BEGIN + bgt .Lcgemm_kernel_L4_BEGIN /******************************************************************************/ -cgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble cgemm_kernel_L999 // error, N was less than 4? + ble .Lcgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble cgemm_kernel_L1_BEGIN + ble .Lcgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1710,14 +1710,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -cgemm_kernel_L2_M8_BEGIN: +.Lcgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L2_M4_BEGIN + ble .Lcgemm_kernel_L2_M4_BEGIN -cgemm_kernel_L2_M8_20: +.Lcgemm_kernel_L2_M8_20: INIT8x2 @@ -1725,10 +1725,10 @@ cgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M8_40 + ble .Lcgemm_kernel_L2_M8_40 .align 5 -cgemm_kernel_L2_M8_22: +.Lcgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1740,50 +1740,50 @@ cgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_22 + bgt .Lcgemm_kernel_L2_M8_22 -cgemm_kernel_L2_M8_40: +.Lcgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M8_100 + ble .Lcgemm_kernel_L2_M8_100 -cgemm_kernel_L2_M8_42: +.Lcgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_42 + bgt .Lcgemm_kernel_L2_M8_42 -cgemm_kernel_L2_M8_100: +.Lcgemm_kernel_L2_M8_100: SAVE8x2 -cgemm_kernel_L2_M8_END: +.Lcgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L2_M8_20 + bgt .Lcgemm_kernel_L2_M8_20 -cgemm_kernel_L2_M4_BEGIN: +.Lcgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L2_M2_BEGIN + ble .Lcgemm_kernel_L2_M2_BEGIN -cgemm_kernel_L2_M4_20: +.Lcgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M4_40 + ble .Lcgemm_kernel_L2_M4_40 .align 5 -cgemm_kernel_L2_M4_22: +.Lcgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1795,46 +1795,46 @@ cgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_22 + bgt .Lcgemm_kernel_L2_M4_22 -cgemm_kernel_L2_M4_40: +.Lcgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M4_100 + ble .Lcgemm_kernel_L2_M4_100 -cgemm_kernel_L2_M4_42: +.Lcgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_42 + bgt .Lcgemm_kernel_L2_M4_42 -cgemm_kernel_L2_M4_100: +.Lcgemm_kernel_L2_M4_100: SAVE4x2 -cgemm_kernel_L2_M4_END: +.Lcgemm_kernel_L2_M4_END: -cgemm_kernel_L2_M2_BEGIN: +.Lcgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L2_M1_BEGIN + ble .Lcgemm_kernel_L2_M1_BEGIN -cgemm_kernel_L2_M2_20: +.Lcgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M2_40 + ble .Lcgemm_kernel_L2_M2_40 -cgemm_kernel_L2_M2_22: +.Lcgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1847,43 +1847,43 @@ cgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_22 + bgt .Lcgemm_kernel_L2_M2_22 -cgemm_kernel_L2_M2_40: +.Lcgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M2_100 + ble .Lcgemm_kernel_L2_M2_100 -cgemm_kernel_L2_M2_42: +.Lcgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_42 + bgt .Lcgemm_kernel_L2_M2_42 -cgemm_kernel_L2_M2_100: +.Lcgemm_kernel_L2_M2_100: SAVE2x2 -cgemm_kernel_L2_M2_END: +.Lcgemm_kernel_L2_M2_END: -cgemm_kernel_L2_M1_BEGIN: +.Lcgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END -cgemm_kernel_L2_M1_20: +.Lcgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L2_M1_40 + ble .Lcgemm_kernel_L2_M1_40 -cgemm_kernel_L2_M1_22: +.Lcgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1895,36 +1895,36 @@ cgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_22 + bgt .Lcgemm_kernel_L2_M1_22 -cgemm_kernel_L2_M1_40: +.Lcgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M1_100 + ble .Lcgemm_kernel_L2_M1_100 -cgemm_kernel_L2_M1_42: +.Lcgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_42 + bgt .Lcgemm_kernel_L2_M1_42 -cgemm_kernel_L2_M1_100: +.Lcgemm_kernel_L2_M1_100: SAVE1x2 -cgemm_kernel_L2_END: +.Lcgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -cgemm_kernel_L1_BEGIN: +.Lcgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble cgemm_kernel_L999 // done + ble .Lcgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1933,24 +1933,24 @@ cgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -cgemm_kernel_L1_M8_BEGIN: +.Lcgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L1_M4_BEGIN + ble .Lcgemm_kernel_L1_M4_BEGIN -cgemm_kernel_L1_M8_20: +.Lcgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M8_40 + ble .Lcgemm_kernel_L1_M8_40 .align 5 -cgemm_kernel_L1_M8_22: +.Lcgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1962,51 +1962,51 @@ cgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_22 + bgt .Lcgemm_kernel_L1_M8_22 -cgemm_kernel_L1_M8_40: +.Lcgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M8_100 + ble .Lcgemm_kernel_L1_M8_100 -cgemm_kernel_L1_M8_42: +.Lcgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_42 + bgt .Lcgemm_kernel_L1_M8_42 -cgemm_kernel_L1_M8_100: +.Lcgemm_kernel_L1_M8_100: SAVE8x1 -cgemm_kernel_L1_M8_END: +.Lcgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L1_M8_20 + bgt .Lcgemm_kernel_L1_M8_20 -cgemm_kernel_L1_M4_BEGIN: +.Lcgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L1_M2_BEGIN + ble .Lcgemm_kernel_L1_M2_BEGIN -cgemm_kernel_L1_M4_20: +.Lcgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M4_40 + ble .Lcgemm_kernel_L1_M4_40 .align 5 -cgemm_kernel_L1_M4_22: +.Lcgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2018,47 +2018,47 @@ cgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_22 + bgt .Lcgemm_kernel_L1_M4_22 -cgemm_kernel_L1_M4_40: +.Lcgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M4_100 + ble .Lcgemm_kernel_L1_M4_100 -cgemm_kernel_L1_M4_42: +.Lcgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_42 + bgt .Lcgemm_kernel_L1_M4_42 -cgemm_kernel_L1_M4_100: +.Lcgemm_kernel_L1_M4_100: SAVE4x1 -cgemm_kernel_L1_M4_END: +.Lcgemm_kernel_L1_M4_END: -cgemm_kernel_L1_M2_BEGIN: +.Lcgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L1_M1_BEGIN + ble .Lcgemm_kernel_L1_M1_BEGIN -cgemm_kernel_L1_M2_20: +.Lcgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M2_40 + ble .Lcgemm_kernel_L1_M2_40 -cgemm_kernel_L1_M2_22: +.Lcgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2071,43 +2071,43 @@ cgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_22 + bgt .Lcgemm_kernel_L1_M2_22 -cgemm_kernel_L1_M2_40: +.Lcgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M2_100 + ble .Lcgemm_kernel_L1_M2_100 -cgemm_kernel_L1_M2_42: +.Lcgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_42 + bgt .Lcgemm_kernel_L1_M2_42 -cgemm_kernel_L1_M2_100: +.Lcgemm_kernel_L1_M2_100: SAVE2x1 -cgemm_kernel_L1_M2_END: +.Lcgemm_kernel_L1_M2_END: -cgemm_kernel_L1_M1_BEGIN: +.Lcgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END -cgemm_kernel_L1_M1_20: +.Lcgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M1_40 + ble .Lcgemm_kernel_L1_M1_40 -cgemm_kernel_L1_M1_22: +.Lcgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2119,30 +2119,30 @@ cgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_22 + bgt .Lcgemm_kernel_L1_M1_22 -cgemm_kernel_L1_M1_40: +.Lcgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M1_100 + ble .Lcgemm_kernel_L1_M1_100 -cgemm_kernel_L1_M1_42: +.Lcgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_42 + bgt .Lcgemm_kernel_L1_M1_42 -cgemm_kernel_L1_M1_100: +.Lcgemm_kernel_L1_M1_100: SAVE1x1 -cgemm_kernel_L1_END: +.Lcgemm_kernel_L1_END: -cgemm_kernel_L999: +.Lcgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S index 367cd02174..29a68ff227 100644 --- a/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S @@ -1432,11 +1432,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble cgemm_kernel_L2_BEGIN + ble .Lcgemm_kernel_L2_BEGIN /******************************************************************************/ -cgemm_kernel_L4_BEGIN: +.Lcgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1446,21 +1446,21 @@ cgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -cgemm_kernel_L4_M8_BEGIN: +.Lcgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L4_M4_BEGIN + ble .Lcgemm_kernel_L4_M4_BEGIN .align 5 -cgemm_kernel_L4_M8_20: +.Lcgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #5 // origK / 32 cmp counterL , #2 - blt cgemm_kernel_L4_M8_32 + blt .Lcgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -1470,18 +1470,18 @@ cgemm_kernel_L4_M8_20: KERNEL8x4_M1_M2_x8 subs counterL, counterL, #2 // subtract 2 - ble cgemm_kernel_L4_M8_22a + ble .Lcgemm_kernel_L4_M8_22a .align 5 -cgemm_kernel_L4_M8_22: +.Lcgemm_kernel_L4_M8_22: KERNEL8x4_M1_M2_x16 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M8_22 + bgt .Lcgemm_kernel_L4_M8_22 .align 5 -cgemm_kernel_L4_M8_22a: +.Lcgemm_kernel_L4_M8_22a: KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x4 @@ -1490,13 +1490,13 @@ cgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 .align 5 -cgemm_kernel_L4_M8_32: +.Lcgemm_kernel_L4_M8_32: tst counterL, #1 - ble cgemm_kernel_L4_M8_40 + ble .Lcgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -1506,116 +1506,116 @@ cgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b cgemm_kernel_L4_M8_44 + b .Lcgemm_kernel_L4_M8_44 -cgemm_kernel_L4_M8_40: +.Lcgemm_kernel_L4_M8_40: INIT8x4 -cgemm_kernel_L4_M8_44: +.Lcgemm_kernel_L4_M8_44: ands counterL , origK, #31 - ble cgemm_kernel_L4_M8_100 + ble .Lcgemm_kernel_L4_M8_100 .align 5 -cgemm_kernel_L4_M8_46: +.Lcgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne cgemm_kernel_L4_M8_46 + bne .Lcgemm_kernel_L4_M8_46 -cgemm_kernel_L4_M8_100: +.Lcgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 -cgemm_kernel_L4_M8_END: +.Lcgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne cgemm_kernel_L4_M8_20 + bne .Lcgemm_kernel_L4_M8_20 -cgemm_kernel_L4_M4_BEGIN: +.Lcgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #4 - ble cgemm_kernel_L4_M2_BEGIN + ble .Lcgemm_kernel_L4_M2_BEGIN -cgemm_kernel_L4_M4_20: +.Lcgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt cgemm_kernel_L4_M4_32 + blt .Lcgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble cgemm_kernel_L4_M4_22a + ble .Lcgemm_kernel_L4_M4_22a .align 5 -cgemm_kernel_L4_M4_22: +.Lcgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M4_22 + bgt .Lcgemm_kernel_L4_M4_22 -cgemm_kernel_L4_M4_22a: +.Lcgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_32: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_32: tst counterL, #1 - ble cgemm_kernel_L4_M4_40 + ble .Lcgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b cgemm_kernel_L4_M4_44 -cgemm_kernel_L4_M4_40: + b .Lcgemm_kernel_L4_M4_44 +.Lcgemm_kernel_L4_M4_40: INIT4x4 -cgemm_kernel_L4_M4_44: +.Lcgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble cgemm_kernel_L4_M4_100 + ble .Lcgemm_kernel_L4_M4_100 -cgemm_kernel_L4_M4_46: +.Lcgemm_kernel_L4_M4_46: KERNEL4x4_SUB -cgemm_kernel_L4_M4_100: +.Lcgemm_kernel_L4_M4_100: SAVE4x4 -cgemm_kernel_L4_M4_END: +.Lcgemm_kernel_L4_M4_END: -cgemm_kernel_L4_M2_BEGIN: +.Lcgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L4_M1_BEGIN + ble .Lcgemm_kernel_L4_M1_BEGIN -cgemm_kernel_L4_M2_20: +.Lcgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M2_40 + ble .Lcgemm_kernel_L4_M2_40 -cgemm_kernel_L4_M2_22: +.Lcgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1628,43 +1628,43 @@ cgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_22 + bgt .Lcgemm_kernel_L4_M2_22 -cgemm_kernel_L4_M2_40: +.Lcgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M2_100 + ble .Lcgemm_kernel_L4_M2_100 -cgemm_kernel_L4_M2_42: +.Lcgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M2_42 + bgt .Lcgemm_kernel_L4_M2_42 -cgemm_kernel_L4_M2_100: +.Lcgemm_kernel_L4_M2_100: SAVE2x4 -cgemm_kernel_L4_M2_END: +.Lcgemm_kernel_L4_M2_END: -cgemm_kernel_L4_M1_BEGIN: +.Lcgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L4_END + ble .Lcgemm_kernel_L4_END -cgemm_kernel_L4_M1_20: +.Lcgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L4_M1_40 + ble .Lcgemm_kernel_L4_M1_40 -cgemm_kernel_L4_M1_22: +.Lcgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1676,45 +1676,45 @@ cgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_22 + bgt .Lcgemm_kernel_L4_M1_22 -cgemm_kernel_L4_M1_40: +.Lcgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L4_M1_100 + ble .Lcgemm_kernel_L4_M1_100 -cgemm_kernel_L4_M1_42: +.Lcgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L4_M1_42 + bgt .Lcgemm_kernel_L4_M1_42 -cgemm_kernel_L4_M1_100: +.Lcgemm_kernel_L4_M1_100: SAVE1x4 -cgemm_kernel_L4_END: +.Lcgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt cgemm_kernel_L4_BEGIN + bgt .Lcgemm_kernel_L4_BEGIN /******************************************************************************/ -cgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble cgemm_kernel_L999 // error, N was less than 4? + ble .Lcgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble cgemm_kernel_L1_BEGIN + ble .Lcgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1723,14 +1723,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -cgemm_kernel_L2_M8_BEGIN: +.Lcgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L2_M4_BEGIN + ble .Lcgemm_kernel_L2_M4_BEGIN -cgemm_kernel_L2_M8_20: +.Lcgemm_kernel_L2_M8_20: INIT8x2 @@ -1738,10 +1738,10 @@ cgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M8_40 + ble .Lcgemm_kernel_L2_M8_40 .align 5 -cgemm_kernel_L2_M8_22: +.Lcgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1753,50 +1753,50 @@ cgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_22 + bgt .Lcgemm_kernel_L2_M8_22 -cgemm_kernel_L2_M8_40: +.Lcgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M8_100 + ble .Lcgemm_kernel_L2_M8_100 -cgemm_kernel_L2_M8_42: +.Lcgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M8_42 + bgt .Lcgemm_kernel_L2_M8_42 -cgemm_kernel_L2_M8_100: +.Lcgemm_kernel_L2_M8_100: SAVE8x2 -cgemm_kernel_L2_M8_END: +.Lcgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L2_M8_20 + bgt .Lcgemm_kernel_L2_M8_20 -cgemm_kernel_L2_M4_BEGIN: +.Lcgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L2_M2_BEGIN + ble .Lcgemm_kernel_L2_M2_BEGIN -cgemm_kernel_L2_M4_20: +.Lcgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M4_40 + ble .Lcgemm_kernel_L2_M4_40 .align 5 -cgemm_kernel_L2_M4_22: +.Lcgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1808,46 +1808,46 @@ cgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_22 + bgt .Lcgemm_kernel_L2_M4_22 -cgemm_kernel_L2_M4_40: +.Lcgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M4_100 + ble .Lcgemm_kernel_L2_M4_100 -cgemm_kernel_L2_M4_42: +.Lcgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M4_42 + bgt .Lcgemm_kernel_L2_M4_42 -cgemm_kernel_L2_M4_100: +.Lcgemm_kernel_L2_M4_100: SAVE4x2 -cgemm_kernel_L2_M4_END: +.Lcgemm_kernel_L2_M4_END: -cgemm_kernel_L2_M2_BEGIN: +.Lcgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L2_M1_BEGIN + ble .Lcgemm_kernel_L2_M1_BEGIN -cgemm_kernel_L2_M2_20: +.Lcgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble cgemm_kernel_L2_M2_40 + ble .Lcgemm_kernel_L2_M2_40 -cgemm_kernel_L2_M2_22: +.Lcgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1860,43 +1860,43 @@ cgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_22 + bgt .Lcgemm_kernel_L2_M2_22 -cgemm_kernel_L2_M2_40: +.Lcgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M2_100 + ble .Lcgemm_kernel_L2_M2_100 -cgemm_kernel_L2_M2_42: +.Lcgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M2_42 + bgt .Lcgemm_kernel_L2_M2_42 -cgemm_kernel_L2_M2_100: +.Lcgemm_kernel_L2_M2_100: SAVE2x2 -cgemm_kernel_L2_M2_END: +.Lcgemm_kernel_L2_M2_END: -cgemm_kernel_L2_M1_BEGIN: +.Lcgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L2_END + ble .Lcgemm_kernel_L2_END -cgemm_kernel_L2_M1_20: +.Lcgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble cgemm_kernel_L2_M1_40 + ble .Lcgemm_kernel_L2_M1_40 -cgemm_kernel_L2_M1_22: +.Lcgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1908,36 +1908,36 @@ cgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_22 + bgt .Lcgemm_kernel_L2_M1_22 -cgemm_kernel_L2_M1_40: +.Lcgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L2_M1_100 + ble .Lcgemm_kernel_L2_M1_100 -cgemm_kernel_L2_M1_42: +.Lcgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L2_M1_42 + bgt .Lcgemm_kernel_L2_M1_42 -cgemm_kernel_L2_M1_100: +.Lcgemm_kernel_L2_M1_100: SAVE1x2 -cgemm_kernel_L2_END: +.Lcgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -cgemm_kernel_L1_BEGIN: +.Lcgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble cgemm_kernel_L999 // done + ble .Lcgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1946,24 +1946,24 @@ cgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -cgemm_kernel_L1_M8_BEGIN: +.Lcgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble cgemm_kernel_L1_M4_BEGIN + ble .Lcgemm_kernel_L1_M4_BEGIN -cgemm_kernel_L1_M8_20: +.Lcgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M8_40 + ble .Lcgemm_kernel_L1_M8_40 .align 5 -cgemm_kernel_L1_M8_22: +.Lcgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1975,51 +1975,51 @@ cgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_22 + bgt .Lcgemm_kernel_L1_M8_22 -cgemm_kernel_L1_M8_40: +.Lcgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M8_100 + ble .Lcgemm_kernel_L1_M8_100 -cgemm_kernel_L1_M8_42: +.Lcgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M8_42 + bgt .Lcgemm_kernel_L1_M8_42 -cgemm_kernel_L1_M8_100: +.Lcgemm_kernel_L1_M8_100: SAVE8x1 -cgemm_kernel_L1_M8_END: +.Lcgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt cgemm_kernel_L1_M8_20 + bgt .Lcgemm_kernel_L1_M8_20 -cgemm_kernel_L1_M4_BEGIN: +.Lcgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble cgemm_kernel_L1_M2_BEGIN + ble .Lcgemm_kernel_L1_M2_BEGIN -cgemm_kernel_L1_M4_20: +.Lcgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M4_40 + ble .Lcgemm_kernel_L1_M4_40 .align 5 -cgemm_kernel_L1_M4_22: +.Lcgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2031,47 +2031,47 @@ cgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_22 + bgt .Lcgemm_kernel_L1_M4_22 -cgemm_kernel_L1_M4_40: +.Lcgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M4_100 + ble .Lcgemm_kernel_L1_M4_100 -cgemm_kernel_L1_M4_42: +.Lcgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M4_42 + bgt .Lcgemm_kernel_L1_M4_42 -cgemm_kernel_L1_M4_100: +.Lcgemm_kernel_L1_M4_100: SAVE4x1 -cgemm_kernel_L1_M4_END: +.Lcgemm_kernel_L1_M4_END: -cgemm_kernel_L1_M2_BEGIN: +.Lcgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble cgemm_kernel_L1_M1_BEGIN + ble .Lcgemm_kernel_L1_M1_BEGIN -cgemm_kernel_L1_M2_20: +.Lcgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M2_40 + ble .Lcgemm_kernel_L1_M2_40 -cgemm_kernel_L1_M2_22: +.Lcgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2084,43 +2084,43 @@ cgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_22 + bgt .Lcgemm_kernel_L1_M2_22 -cgemm_kernel_L1_M2_40: +.Lcgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M2_100 + ble .Lcgemm_kernel_L1_M2_100 -cgemm_kernel_L1_M2_42: +.Lcgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M2_42 + bgt .Lcgemm_kernel_L1_M2_42 -cgemm_kernel_L1_M2_100: +.Lcgemm_kernel_L1_M2_100: SAVE2x1 -cgemm_kernel_L1_M2_END: +.Lcgemm_kernel_L1_M2_END: -cgemm_kernel_L1_M1_BEGIN: +.Lcgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble cgemm_kernel_L1_END + ble .Lcgemm_kernel_L1_END -cgemm_kernel_L1_M1_20: +.Lcgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble cgemm_kernel_L1_M1_40 + ble .Lcgemm_kernel_L1_M1_40 -cgemm_kernel_L1_M1_22: +.Lcgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2132,30 +2132,30 @@ cgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_22 + bgt .Lcgemm_kernel_L1_M1_22 -cgemm_kernel_L1_M1_40: +.Lcgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble cgemm_kernel_L1_M1_100 + ble .Lcgemm_kernel_L1_M1_100 -cgemm_kernel_L1_M1_42: +.Lcgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt cgemm_kernel_L1_M1_42 + bgt .Lcgemm_kernel_L1_M1_42 -cgemm_kernel_L1_M1_100: +.Lcgemm_kernel_L1_M1_100: SAVE1x1 -cgemm_kernel_L1_END: +.Lcgemm_kernel_L1_END: -cgemm_kernel_L999: +.Lcgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/copy.S b/kernel/arm64/copy.S index 70eab96fb6..b8c6bfcd42 100644 --- a/kernel/arm64/copy.S +++ b/kernel/arm64/copy.S @@ -159,50 +159,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble copy_kernel_L999 + ble .Lcopy_kernel_L999 cmp INC_X, #1 - bne copy_kernel_S_BEGIN + bne .Lcopy_kernel_S_BEGIN cmp INC_Y, #1 - bne copy_kernel_S_BEGIN + bne .Lcopy_kernel_S_BEGIN -copy_kernel_F_BEGIN: +.Lcopy_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq copy_kernel_F1 + beq .Lcopy_kernel_F1 -copy_kernel_F4: +.Lcopy_kernel_F4: KERNEL_F4 subs I, I, #1 - bne copy_kernel_F4 + bne .Lcopy_kernel_F4 -copy_kernel_F1: +.Lcopy_kernel_F1: ands I, N, #3 - ble copy_kernel_L999 + ble .Lcopy_kernel_L999 -copy_kernel_F10: +.Lcopy_kernel_F10: KERNEL_F1 subs I, I, #1 - bne copy_kernel_F10 + bne .Lcopy_kernel_F10 mov w0, wzr ret -copy_kernel_S_BEGIN: +.Lcopy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble copy_kernel_S1 + ble .Lcopy_kernel_S1 -copy_kernel_S4: +.Lcopy_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -210,21 +210,21 @@ copy_kernel_S4: KERNEL_S1 subs I, I, #1 - bne copy_kernel_S4 + bne .Lcopy_kernel_S4 -copy_kernel_S1: +.Lcopy_kernel_S1: ands I, N, #3 - ble copy_kernel_L999 + ble .Lcopy_kernel_L999 -copy_kernel_S10: +.Lcopy_kernel_S10: KERNEL_S1 subs I, I, #1 - bne copy_kernel_S10 + bne .Lcopy_kernel_S10 -copy_kernel_L999: +.Lcopy_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/ctrmm_kernel_4x4.S b/kernel/arm64/ctrmm_kernel_4x4.S index 3de27257ac..79d33e93c0 100644 --- a/kernel/arm64/ctrmm_kernel_4x4.S +++ b/kernel/arm64/ctrmm_kernel_4x4.S @@ -785,11 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble ctrmm_kernel_L2_BEGIN + ble .Lctrmm_kernel_L2_BEGIN /******************************************************************************/ -ctrmm_kernel_L4_BEGIN: +.Lctrmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -798,14 +798,14 @@ ctrmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -ctrmm_kernel_L4_M4_BEGIN: +.Lctrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble ctrmm_kernel_L4_M2_BEGIN + ble .Lctrmm_kernel_L4_M2_BEGIN -ctrmm_kernel_L4_M4_20: +.Lctrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -826,55 +826,55 @@ ctrmm_kernel_L4_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt ctrmm_kernel_L4_M4_32 + blt .Lctrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble ctrmm_kernel_L4_M4_22a + ble .Lctrmm_kernel_L4_M4_22a .align 5 -ctrmm_kernel_L4_M4_22: +.Lctrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M4_22 + bgt .Lctrmm_kernel_L4_M4_22 -ctrmm_kernel_L4_M4_22a: +.Lctrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b ctrmm_kernel_L4_M4_44 + b .Lctrmm_kernel_L4_M4_44 -ctrmm_kernel_L4_M4_32: +.Lctrmm_kernel_L4_M4_32: tst counterL, #1 - ble ctrmm_kernel_L4_M4_40 + ble .Lctrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b ctrmm_kernel_L4_M4_44 + b .Lctrmm_kernel_L4_M4_44 -ctrmm_kernel_L4_M4_40: +.Lctrmm_kernel_L4_M4_40: INIT4x4 -ctrmm_kernel_L4_M4_44: +.Lctrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble ctrmm_kernel_L4_M4_100 + ble .Lctrmm_kernel_L4_M4_100 -ctrmm_kernel_L4_M4_46: +.Lctrmm_kernel_L4_M4_46: KERNEL4x4_SUB -ctrmm_kernel_L4_M4_100: +.Lctrmm_kernel_L4_M4_100: SAVE4x4 @@ -893,20 +893,20 @@ ctrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L4_M4_END: +.Lctrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne ctrmm_kernel_L4_M4_20 + bne .Lctrmm_kernel_L4_M4_20 -ctrmm_kernel_L4_M2_BEGIN: +.Lctrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L4_END + ble .Lctrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L4_M1_BEGIN + ble .Lctrmm_kernel_L4_M1_BEGIN -ctrmm_kernel_L4_M2_20: +.Lctrmm_kernel_L4_M2_20: INIT2x4 @@ -930,9 +930,9 @@ ctrmm_kernel_L4_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L4_M2_40 + ble .Lctrmm_kernel_L4_M2_40 -ctrmm_kernel_L4_M2_22: +.Lctrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -945,22 +945,22 @@ ctrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M2_22 + bgt .Lctrmm_kernel_L4_M2_22 -ctrmm_kernel_L4_M2_40: +.Lctrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L4_M2_100 + ble .Lctrmm_kernel_L4_M2_100 -ctrmm_kernel_L4_M2_42: +.Lctrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M2_42 + bgt .Lctrmm_kernel_L4_M2_42 -ctrmm_kernel_L4_M2_100: +.Lctrmm_kernel_L4_M2_100: SAVE2x4 @@ -980,15 +980,15 @@ ctrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L4_M2_END: +.Lctrmm_kernel_L4_M2_END: -ctrmm_kernel_L4_M1_BEGIN: +.Lctrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L4_END + ble .Lctrmm_kernel_L4_END -ctrmm_kernel_L4_M1_20: +.Lctrmm_kernel_L4_M1_20: INIT1x4 @@ -1012,9 +1012,9 @@ ctrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L4_M1_40 + ble .Lctrmm_kernel_L4_M1_40 -ctrmm_kernel_L4_M1_22: +.Lctrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1026,22 +1026,22 @@ ctrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M1_22 + bgt .Lctrmm_kernel_L4_M1_22 -ctrmm_kernel_L4_M1_40: +.Lctrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L4_M1_100 + ble .Lctrmm_kernel_L4_M1_100 -ctrmm_kernel_L4_M1_42: +.Lctrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M1_42 + bgt .Lctrmm_kernel_L4_M1_42 -ctrmm_kernel_L4_M1_100: +.Lctrmm_kernel_L4_M1_100: SAVE1x4 @@ -1061,7 +1061,7 @@ ctrmm_kernel_L4_M1_100: add tempOffset, tempOffset, #1 #endif -ctrmm_kernel_L4_END: +.Lctrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -1071,19 +1071,19 @@ ctrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt ctrmm_kernel_L4_BEGIN + bgt .Lctrmm_kernel_L4_BEGIN /******************************************************************************/ -ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble ctrmm_kernel_L999 // error, N was less than 4? + ble .Lctrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble ctrmm_kernel_L1_BEGIN + ble .Lctrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1095,14 +1095,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -ctrmm_kernel_L2_M4_BEGIN: +.Lctrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble ctrmm_kernel_L2_M2_BEGIN + ble .Lctrmm_kernel_L2_M2_BEGIN -ctrmm_kernel_L2_M4_20: +.Lctrmm_kernel_L2_M4_20: INIT4x2 @@ -1126,10 +1126,10 @@ ctrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ctrmm_kernel_L2_M4_40 + ble .Lctrmm_kernel_L2_M4_40 .align 5 -ctrmm_kernel_L2_M4_22: +.Lctrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1141,22 +1141,22 @@ ctrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M4_22 + bgt .Lctrmm_kernel_L2_M4_22 -ctrmm_kernel_L2_M4_40: +.Lctrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M4_100 + ble .Lctrmm_kernel_L2_M4_100 -ctrmm_kernel_L2_M4_42: +.Lctrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M4_42 + bgt .Lctrmm_kernel_L2_M4_42 -ctrmm_kernel_L2_M4_100: +.Lctrmm_kernel_L2_M4_100: SAVE4x2 @@ -1176,22 +1176,22 @@ ctrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L2_M4_END: +.Lctrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt ctrmm_kernel_L2_M4_20 + bgt .Lctrmm_kernel_L2_M4_20 -ctrmm_kernel_L2_M2_BEGIN: +.Lctrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L2_END + ble .Lctrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L2_M1_BEGIN + ble .Lctrmm_kernel_L2_M1_BEGIN -ctrmm_kernel_L2_M2_20: +.Lctrmm_kernel_L2_M2_20: INIT2x2 @@ -1215,9 +1215,9 @@ ctrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ctrmm_kernel_L2_M2_40 + ble .Lctrmm_kernel_L2_M2_40 -ctrmm_kernel_L2_M2_22: +.Lctrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1230,22 +1230,22 @@ ctrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M2_22 + bgt .Lctrmm_kernel_L2_M2_22 -ctrmm_kernel_L2_M2_40: +.Lctrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M2_100 + ble .Lctrmm_kernel_L2_M2_100 -ctrmm_kernel_L2_M2_42: +.Lctrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M2_42 + bgt .Lctrmm_kernel_L2_M2_42 -ctrmm_kernel_L2_M2_100: +.Lctrmm_kernel_L2_M2_100: SAVE2x2 @@ -1265,15 +1265,15 @@ ctrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L2_M2_END: +.Lctrmm_kernel_L2_M2_END: -ctrmm_kernel_L2_M1_BEGIN: +.Lctrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L2_END + ble .Lctrmm_kernel_L2_END -ctrmm_kernel_L2_M1_20: +.Lctrmm_kernel_L2_M1_20: INIT1x2 @@ -1297,9 +1297,9 @@ ctrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble ctrmm_kernel_L2_M1_40 + ble .Lctrmm_kernel_L2_M1_40 -ctrmm_kernel_L2_M1_22: +.Lctrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1311,22 +1311,22 @@ ctrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M1_22 + bgt .Lctrmm_kernel_L2_M1_22 -ctrmm_kernel_L2_M1_40: +.Lctrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M1_100 + ble .Lctrmm_kernel_L2_M1_100 -ctrmm_kernel_L2_M1_42: +.Lctrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M1_42 + bgt .Lctrmm_kernel_L2_M1_42 -ctrmm_kernel_L2_M1_100: +.Lctrmm_kernel_L2_M1_100: SAVE1x2 @@ -1346,7 +1346,7 @@ ctrmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -ctrmm_kernel_L2_END: +.Lctrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1354,11 +1354,11 @@ ctrmm_kernel_L2_END: /******************************************************************************/ -ctrmm_kernel_L1_BEGIN: +.Lctrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble ctrmm_kernel_L999 // done + ble .Lctrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1370,14 +1370,14 @@ ctrmm_kernel_L1_BEGIN: mov pA, origPA // pA = A -ctrmm_kernel_L1_M4_BEGIN: +.Lctrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble ctrmm_kernel_L1_M2_BEGIN + ble .Lctrmm_kernel_L1_M2_BEGIN -ctrmm_kernel_L1_M4_20: +.Lctrmm_kernel_L1_M4_20: INIT4x1 @@ -1401,10 +1401,10 @@ ctrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M4_40 + ble .Lctrmm_kernel_L1_M4_40 .align 5 -ctrmm_kernel_L1_M4_22: +.Lctrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1416,22 +1416,22 @@ ctrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M4_22 + bgt .Lctrmm_kernel_L1_M4_22 -ctrmm_kernel_L1_M4_40: +.Lctrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M4_100 + ble .Lctrmm_kernel_L1_M4_100 -ctrmm_kernel_L1_M4_42: +.Lctrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M4_42 + bgt .Lctrmm_kernel_L1_M4_42 -ctrmm_kernel_L1_M4_100: +.Lctrmm_kernel_L1_M4_100: SAVE4x1 @@ -1451,22 +1451,22 @@ ctrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L1_M4_END: +.Lctrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt ctrmm_kernel_L1_M4_20 + bgt .Lctrmm_kernel_L1_M4_20 -ctrmm_kernel_L1_M2_BEGIN: +.Lctrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L1_END + ble .Lctrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L1_M1_BEGIN + ble .Lctrmm_kernel_L1_M1_BEGIN -ctrmm_kernel_L1_M2_20: +.Lctrmm_kernel_L1_M2_20: INIT2x1 @@ -1490,9 +1490,9 @@ ctrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M2_40 + ble .Lctrmm_kernel_L1_M2_40 -ctrmm_kernel_L1_M2_22: +.Lctrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1505,22 +1505,22 @@ ctrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M2_22 + bgt .Lctrmm_kernel_L1_M2_22 -ctrmm_kernel_L1_M2_40: +.Lctrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M2_100 + ble .Lctrmm_kernel_L1_M2_100 -ctrmm_kernel_L1_M2_42: +.Lctrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M2_42 + bgt .Lctrmm_kernel_L1_M2_42 -ctrmm_kernel_L1_M2_100: +.Lctrmm_kernel_L1_M2_100: SAVE2x1 @@ -1540,15 +1540,15 @@ ctrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L1_M2_END: +.Lctrmm_kernel_L1_M2_END: -ctrmm_kernel_L1_M1_BEGIN: +.Lctrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L1_END + ble .Lctrmm_kernel_L1_END -ctrmm_kernel_L1_M1_20: +.Lctrmm_kernel_L1_M1_20: INIT1x1 @@ -1572,9 +1572,9 @@ ctrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M1_40 + ble .Lctrmm_kernel_L1_M1_40 -ctrmm_kernel_L1_M1_22: +.Lctrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1586,30 +1586,30 @@ ctrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M1_22 + bgt .Lctrmm_kernel_L1_M1_22 -ctrmm_kernel_L1_M1_40: +.Lctrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M1_100 + ble .Lctrmm_kernel_L1_M1_100 -ctrmm_kernel_L1_M1_42: +.Lctrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M1_42 + bgt .Lctrmm_kernel_L1_M1_42 -ctrmm_kernel_L1_M1_100: +.Lctrmm_kernel_L1_M1_100: SAVE1x1 -ctrmm_kernel_L1_END: +.Lctrmm_kernel_L1_END: -ctrmm_kernel_L999: +.Lctrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S index 680fb56c3c..5c08273975 100644 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -1405,11 +1405,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble ctrmm_kernel_L2_BEGIN + ble .Lctrmm_kernel_L2_BEGIN /******************************************************************************/ -ctrmm_kernel_L4_BEGIN: +.Lctrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1423,14 +1423,14 @@ ctrmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -ctrmm_kernel_L4_M8_BEGIN: +.Lctrmm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble ctrmm_kernel_L4_M4_BEGIN + ble .Lctrmm_kernel_L4_M4_BEGIN -ctrmm_kernel_L4_M8_20: +.Lctrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1452,7 +1452,7 @@ ctrmm_kernel_L4_M8_20: asr counterL , tempK, #3 cmp counterL , #2 - blt ctrmm_kernel_L4_M8_32 + blt .Lctrmm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -1464,10 +1464,10 @@ ctrmm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble ctrmm_kernel_L4_M8_22a + ble .Lctrmm_kernel_L4_M8_22a .align 5 -ctrmm_kernel_L4_M8_22: +.Lctrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 @@ -1479,10 +1479,10 @@ ctrmm_kernel_L4_M8_22: KERNEL8x4_M2 subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M8_22 + bgt .Lctrmm_kernel_L4_M8_22 .align 5 -ctrmm_kernel_L4_M8_22a: +.Lctrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 @@ -1493,13 +1493,13 @@ ctrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b ctrmm_kernel_L4_M8_44 + b .Lctrmm_kernel_L4_M8_44 .align 5 -ctrmm_kernel_L4_M8_32: +.Lctrmm_kernel_L4_M8_32: tst counterL, #1 - ble ctrmm_kernel_L4_M8_40 + ble .Lctrmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -1510,26 +1510,26 @@ ctrmm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b ctrmm_kernel_L4_M8_44 + b .Lctrmm_kernel_L4_M8_44 -ctrmm_kernel_L4_M8_40: +.Lctrmm_kernel_L4_M8_40: INIT8x4 -ctrmm_kernel_L4_M8_44: +.Lctrmm_kernel_L4_M8_44: ands counterL , tempK, #7 - ble ctrmm_kernel_L4_M8_100 + ble .Lctrmm_kernel_L4_M8_100 .align 5 -ctrmm_kernel_L4_M8_46: +.Lctrmm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne ctrmm_kernel_L4_M8_46 + bne .Lctrmm_kernel_L4_M8_46 -ctrmm_kernel_L4_M8_100: +.Lctrmm_kernel_L4_M8_100: SAVE8x4 @@ -1552,21 +1552,21 @@ ctrmm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] -ctrmm_kernel_L4_M8_END: +.Lctrmm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne ctrmm_kernel_L4_M8_20 + bne .Lctrmm_kernel_L4_M8_20 -ctrmm_kernel_L4_M4_BEGIN: +.Lctrmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble ctrmm_kernel_L4_END + ble .Lctrmm_kernel_L4_END tst counterI, #4 - ble ctrmm_kernel_L4_M2_BEGIN + ble .Lctrmm_kernel_L4_M2_BEGIN -ctrmm_kernel_L4_M4_20: +.Lctrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1587,46 +1587,46 @@ ctrmm_kernel_L4_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt ctrmm_kernel_L4_M4_32 + blt .Lctrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble ctrmm_kernel_L4_M4_22a + ble .Lctrmm_kernel_L4_M4_22a .align 5 -ctrmm_kernel_L4_M4_22: +.Lctrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M4_22 + bgt .Lctrmm_kernel_L4_M4_22 -ctrmm_kernel_L4_M4_22a: +.Lctrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b ctrmm_kernel_L4_M4_44 -ctrmm_kernel_L4_M4_32: + b .Lctrmm_kernel_L4_M4_44 +.Lctrmm_kernel_L4_M4_32: tst counterL, #1 - ble ctrmm_kernel_L4_M4_40 + ble .Lctrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b ctrmm_kernel_L4_M4_44 -ctrmm_kernel_L4_M4_40: + b .Lctrmm_kernel_L4_M4_44 +.Lctrmm_kernel_L4_M4_40: INIT4x4 -ctrmm_kernel_L4_M4_44: +.Lctrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble ctrmm_kernel_L4_M4_100 + ble .Lctrmm_kernel_L4_M4_100 -ctrmm_kernel_L4_M4_46: +.Lctrmm_kernel_L4_M4_46: KERNEL4x4_SUB -ctrmm_kernel_L4_M4_100: +.Lctrmm_kernel_L4_M4_100: SAVE4x4 @@ -1645,18 +1645,18 @@ ctrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L4_M4_END: +.Lctrmm_kernel_L4_M4_END: -ctrmm_kernel_L4_M2_BEGIN: +.Lctrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L4_END + ble .Lctrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L4_M1_BEGIN + ble .Lctrmm_kernel_L4_M1_BEGIN -ctrmm_kernel_L4_M2_20: +.Lctrmm_kernel_L4_M2_20: INIT2x4 @@ -1679,9 +1679,9 @@ ctrmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L4_M2_40 + ble .Lctrmm_kernel_L4_M2_40 -ctrmm_kernel_L4_M2_22: +.Lctrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1694,22 +1694,22 @@ ctrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M2_22 + bgt .Lctrmm_kernel_L4_M2_22 -ctrmm_kernel_L4_M2_40: +.Lctrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L4_M2_100 + ble .Lctrmm_kernel_L4_M2_100 -ctrmm_kernel_L4_M2_42: +.Lctrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M2_42 + bgt .Lctrmm_kernel_L4_M2_42 -ctrmm_kernel_L4_M2_100: +.Lctrmm_kernel_L4_M2_100: SAVE2x4 @@ -1729,15 +1729,15 @@ ctrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L4_M2_END: +.Lctrmm_kernel_L4_M2_END: -ctrmm_kernel_L4_M1_BEGIN: +.Lctrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L4_END + ble .Lctrmm_kernel_L4_END -ctrmm_kernel_L4_M1_20: +.Lctrmm_kernel_L4_M1_20: INIT1x4 @@ -1761,9 +1761,9 @@ ctrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L4_M1_40 + ble .Lctrmm_kernel_L4_M1_40 -ctrmm_kernel_L4_M1_22: +.Lctrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1775,22 +1775,22 @@ ctrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M1_22 + bgt .Lctrmm_kernel_L4_M1_22 -ctrmm_kernel_L4_M1_40: +.Lctrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L4_M1_100 + ble .Lctrmm_kernel_L4_M1_100 -ctrmm_kernel_L4_M1_42: +.Lctrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M1_42 + bgt .Lctrmm_kernel_L4_M1_42 -ctrmm_kernel_L4_M1_100: +.Lctrmm_kernel_L4_M1_100: SAVE1x4 @@ -1810,7 +1810,7 @@ ctrmm_kernel_L4_M1_100: add tempOffset, tempOffset, #1 #endif -ctrmm_kernel_L4_END: +.Lctrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -1820,19 +1820,19 @@ ctrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt ctrmm_kernel_L4_BEGIN + bgt .Lctrmm_kernel_L4_BEGIN /******************************************************************************/ -ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble ctrmm_kernel_L999 // error, N was less than 4? + ble .Lctrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble ctrmm_kernel_L1_BEGIN + ble .Lctrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1843,14 +1843,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction #endif mov pA, origPA // pA = A -ctrmm_kernel_L2_M8_BEGIN: +.Lctrmm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble ctrmm_kernel_L2_M4_BEGIN + ble .Lctrmm_kernel_L2_M4_BEGIN -ctrmm_kernel_L2_M8_20: +.Lctrmm_kernel_L2_M8_20: INIT8x2 @@ -1874,10 +1874,10 @@ ctrmm_kernel_L2_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ctrmm_kernel_L2_M8_40 + ble .Lctrmm_kernel_L2_M8_40 .align 5 -ctrmm_kernel_L2_M8_22: +.Lctrmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1889,22 +1889,22 @@ ctrmm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M8_22 + bgt .Lctrmm_kernel_L2_M8_22 -ctrmm_kernel_L2_M8_40: +.Lctrmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M8_100 + ble .Lctrmm_kernel_L2_M8_100 -ctrmm_kernel_L2_M8_42: +.Lctrmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M8_42 + bgt .Lctrmm_kernel_L2_M8_42 -ctrmm_kernel_L2_M8_100: +.Lctrmm_kernel_L2_M8_100: SAVE8x2 @@ -1924,21 +1924,21 @@ ctrmm_kernel_L2_M8_100: add tempOffset, tempOffset, #8 #endif -ctrmm_kernel_L2_M8_END: +.Lctrmm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt ctrmm_kernel_L2_M8_20 + bgt .Lctrmm_kernel_L2_M8_20 -ctrmm_kernel_L2_M4_BEGIN: +.Lctrmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble ctrmm_kernel_L2_END + ble .Lctrmm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble ctrmm_kernel_L2_M2_BEGIN + ble .Lctrmm_kernel_L2_M2_BEGIN -ctrmm_kernel_L2_M4_20: +.Lctrmm_kernel_L2_M4_20: INIT4x2 @@ -1962,10 +1962,10 @@ ctrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ctrmm_kernel_L2_M4_40 + ble .Lctrmm_kernel_L2_M4_40 .align 5 -ctrmm_kernel_L2_M4_22: +.Lctrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1977,22 +1977,22 @@ ctrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M4_22 + bgt .Lctrmm_kernel_L2_M4_22 -ctrmm_kernel_L2_M4_40: +.Lctrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M4_100 + ble .Lctrmm_kernel_L2_M4_100 -ctrmm_kernel_L2_M4_42: +.Lctrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M4_42 + bgt .Lctrmm_kernel_L2_M4_42 -ctrmm_kernel_L2_M4_100: +.Lctrmm_kernel_L2_M4_100: SAVE4x2 @@ -2012,19 +2012,19 @@ ctrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L2_M4_END: +.Lctrmm_kernel_L2_M4_END: -ctrmm_kernel_L2_M2_BEGIN: +.Lctrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L2_END + ble .Lctrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L2_M1_BEGIN + ble .Lctrmm_kernel_L2_M1_BEGIN -ctrmm_kernel_L2_M2_20: +.Lctrmm_kernel_L2_M2_20: INIT2x2 @@ -2048,9 +2048,9 @@ ctrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ctrmm_kernel_L2_M2_40 + ble .Lctrmm_kernel_L2_M2_40 -ctrmm_kernel_L2_M2_22: +.Lctrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -2063,22 +2063,22 @@ ctrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M2_22 + bgt .Lctrmm_kernel_L2_M2_22 -ctrmm_kernel_L2_M2_40: +.Lctrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M2_100 + ble .Lctrmm_kernel_L2_M2_100 -ctrmm_kernel_L2_M2_42: +.Lctrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M2_42 + bgt .Lctrmm_kernel_L2_M2_42 -ctrmm_kernel_L2_M2_100: +.Lctrmm_kernel_L2_M2_100: SAVE2x2 @@ -2098,15 +2098,15 @@ ctrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L2_M2_END: +.Lctrmm_kernel_L2_M2_END: -ctrmm_kernel_L2_M1_BEGIN: +.Lctrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L2_END + ble .Lctrmm_kernel_L2_END -ctrmm_kernel_L2_M1_20: +.Lctrmm_kernel_L2_M1_20: INIT1x2 @@ -2130,9 +2130,9 @@ ctrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble ctrmm_kernel_L2_M1_40 + ble .Lctrmm_kernel_L2_M1_40 -ctrmm_kernel_L2_M1_22: +.Lctrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -2144,22 +2144,22 @@ ctrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M1_22 + bgt .Lctrmm_kernel_L2_M1_22 -ctrmm_kernel_L2_M1_40: +.Lctrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L2_M1_100 + ble .Lctrmm_kernel_L2_M1_100 -ctrmm_kernel_L2_M1_42: +.Lctrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L2_M1_42 + bgt .Lctrmm_kernel_L2_M1_42 -ctrmm_kernel_L2_M1_100: +.Lctrmm_kernel_L2_M1_100: SAVE1x2 @@ -2179,7 +2179,7 @@ ctrmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -ctrmm_kernel_L2_END: +.Lctrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -2187,11 +2187,11 @@ ctrmm_kernel_L2_END: /******************************************************************************/ -ctrmm_kernel_L1_BEGIN: +.Lctrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble ctrmm_kernel_L999 // done + ble .Lctrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next @@ -2201,14 +2201,14 @@ ctrmm_kernel_L1_BEGIN: #endif mov pA, origPA // pA = A -ctrmm_kernel_L1_M8_BEGIN: +.Lctrmm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble ctrmm_kernel_L1_M4_BEGIN + ble .Lctrmm_kernel_L1_M4_BEGIN -ctrmm_kernel_L1_M8_20: +.Lctrmm_kernel_L1_M8_20: INIT8x1 @@ -2232,10 +2232,10 @@ ctrmm_kernel_L1_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M8_40 + ble .Lctrmm_kernel_L1_M8_40 .align 5 -ctrmm_kernel_L1_M8_22: +.Lctrmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -2247,22 +2247,22 @@ ctrmm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M8_22 + bgt .Lctrmm_kernel_L1_M8_22 -ctrmm_kernel_L1_M8_40: +.Lctrmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M8_100 + ble .Lctrmm_kernel_L1_M8_100 -ctrmm_kernel_L1_M8_42: +.Lctrmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M8_42 + bgt .Lctrmm_kernel_L1_M8_42 -ctrmm_kernel_L1_M8_100: +.Lctrmm_kernel_L1_M8_100: SAVE8x1 @@ -2282,21 +2282,21 @@ ctrmm_kernel_L1_M8_100: add tempOffset, tempOffset, #8 #endif -ctrmm_kernel_L1_M8_END: +.Lctrmm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt ctrmm_kernel_L1_M8_20 + bgt .Lctrmm_kernel_L1_M8_20 -ctrmm_kernel_L1_M4_BEGIN: +.Lctrmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble ctrmm_kernel_L1_END + ble .Lctrmm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble ctrmm_kernel_L1_M2_BEGIN + ble .Lctrmm_kernel_L1_M2_BEGIN -ctrmm_kernel_L1_M4_20: +.Lctrmm_kernel_L1_M4_20: INIT4x1 @@ -2319,10 +2319,10 @@ ctrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M4_40 + ble .Lctrmm_kernel_L1_M4_40 .align 5 -ctrmm_kernel_L1_M4_22: +.Lctrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2334,22 +2334,22 @@ ctrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M4_22 + bgt .Lctrmm_kernel_L1_M4_22 -ctrmm_kernel_L1_M4_40: +.Lctrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M4_100 + ble .Lctrmm_kernel_L1_M4_100 -ctrmm_kernel_L1_M4_42: +.Lctrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M4_42 + bgt .Lctrmm_kernel_L1_M4_42 -ctrmm_kernel_L1_M4_100: +.Lctrmm_kernel_L1_M4_100: SAVE4x1 @@ -2369,18 +2369,18 @@ ctrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -ctrmm_kernel_L1_M4_END: +.Lctrmm_kernel_L1_M4_END: -ctrmm_kernel_L1_M2_BEGIN: +.Lctrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ctrmm_kernel_L1_END + ble .Lctrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble ctrmm_kernel_L1_M1_BEGIN + ble .Lctrmm_kernel_L1_M1_BEGIN -ctrmm_kernel_L1_M2_20: +.Lctrmm_kernel_L1_M2_20: INIT2x1 @@ -2404,9 +2404,9 @@ ctrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M2_40 + ble .Lctrmm_kernel_L1_M2_40 -ctrmm_kernel_L1_M2_22: +.Lctrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2419,22 +2419,22 @@ ctrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M2_22 + bgt .Lctrmm_kernel_L1_M2_22 -ctrmm_kernel_L1_M2_40: +.Lctrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M2_100 + ble .Lctrmm_kernel_L1_M2_100 -ctrmm_kernel_L1_M2_42: +.Lctrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M2_42 + bgt .Lctrmm_kernel_L1_M2_42 -ctrmm_kernel_L1_M2_100: +.Lctrmm_kernel_L1_M2_100: SAVE2x1 @@ -2454,15 +2454,15 @@ ctrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -ctrmm_kernel_L1_M2_END: +.Lctrmm_kernel_L1_M2_END: -ctrmm_kernel_L1_M1_BEGIN: +.Lctrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ctrmm_kernel_L1_END + ble .Lctrmm_kernel_L1_END -ctrmm_kernel_L1_M1_20: +.Lctrmm_kernel_L1_M1_20: INIT1x1 @@ -2486,9 +2486,9 @@ ctrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ctrmm_kernel_L1_M1_40 + ble .Lctrmm_kernel_L1_M1_40 -ctrmm_kernel_L1_M1_22: +.Lctrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2500,30 +2500,30 @@ ctrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M1_22 + bgt .Lctrmm_kernel_L1_M1_22 -ctrmm_kernel_L1_M1_40: +.Lctrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ctrmm_kernel_L1_M1_100 + ble .Lctrmm_kernel_L1_M1_100 -ctrmm_kernel_L1_M1_42: +.Lctrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ctrmm_kernel_L1_M1_42 + bgt .Lctrmm_kernel_L1_M1_42 -ctrmm_kernel_L1_M1_100: +.Lctrmm_kernel_L1_M1_100: SAVE1x1 -ctrmm_kernel_L1_END: +.Lctrmm_kernel_L1_END: -ctrmm_kernel_L999: +.Lctrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/daxpy_thunderx2t99.S b/kernel/arm64/daxpy_thunderx2t99.S index 5eb2ec0c3b..b8d0af5c2d 100644 --- a/kernel/arm64/daxpy_thunderx2t99.S +++ b/kernel/arm64/daxpy_thunderx2t99.S @@ -122,53 +122,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble axpy_kernel_L999 + ble .Ldaxpy_kernel_L999 fcmp DA, #0.0 - beq axpy_kernel_L999 + beq .Ldaxpy_kernel_L999 cmp INC_X, #1 - bne axpy_kernel_S_BEGIN + bne .Ldaxpy_kernel_S_BEGIN cmp INC_Y, #1 - bne axpy_kernel_S_BEGIN + bne .Ldaxpy_kernel_S_BEGIN -axpy_kernel_F_BEGIN: +.Ldaxpy_kernel_F_BEGIN: asr I, N, #5 cmp I, xzr - beq axpy_kernel_F1 + beq .Ldaxpy_kernel_F1 .align 5 -axpy_kernel_F32: +.Ldaxpy_kernel_F32: KERNEL_F32 subs I, I, #1 - bne axpy_kernel_F32 + bne .Ldaxpy_kernel_F32 -axpy_kernel_F1: +.Ldaxpy_kernel_F1: ands I, N, #31 - ble axpy_kernel_L999 + ble .Ldaxpy_kernel_L999 -axpy_kernel_F10: +.Ldaxpy_kernel_F10: KERNEL_F1 subs I, I, #1 - bne axpy_kernel_F10 + bne .Ldaxpy_kernel_F10 - b axpy_kernel_L999 + b .Ldaxpy_kernel_L999 -axpy_kernel_S_BEGIN: +.Ldaxpy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble axpy_kernel_S1 + ble .Ldaxpy_kernel_S1 -axpy_kernel_S4: +.Ldaxpy_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -176,21 +176,21 @@ axpy_kernel_S4: KERNEL_S1 subs I, I, #1 - bne axpy_kernel_S4 + bne .Ldaxpy_kernel_S4 -axpy_kernel_S1: +.Ldaxpy_kernel_S1: ands I, N, #3 - ble axpy_kernel_L999 + ble .Ldaxpy_kernel_L999 -axpy_kernel_S10: +.Ldaxpy_kernel_S10: KERNEL_S1 subs I, I, #1 - bne axpy_kernel_S10 + bne .Ldaxpy_kernel_S10 -axpy_kernel_L999: +.Ldaxpy_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S index 44b0f7ff2b..3491670628 100644 --- a/kernel/arm64/dgemm_kernel_4x4.S +++ b/kernel/arm64/dgemm_kernel_4x4.S @@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dgemm_kernel_L2_BEGIN + ble .Ldgemm_kernel_L2_BEGIN -dgemm_kernel_L4_BEGIN: +.Ldgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN: //------------------------------------------------------------------------------ -dgemm_kernel_L4_M8_BEGIN: +.Ldgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L4_M4_BEGIN + ble .Ldgemm_kernel_L4_M4_BEGIN .align 5 -dgemm_kernel_L4_M8_20: +.Ldgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #2 // L = K / 4 cmp counterL , #2 - blt dgemm_kernel_L4_M8_32 + blt .Ldgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble dgemm_kernel_L4_M8_22a + ble .Ldgemm_kernel_L4_M8_22a .align 5 -dgemm_kernel_L4_M8_22: +.Ldgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M8_22 + bgt .Ldgemm_kernel_L4_M8_22 .align 5 -dgemm_kernel_L4_M8_22a: +.Ldgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 .align 5 -dgemm_kernel_L4_M8_32: +.Ldgemm_kernel_L4_M8_32: tst counterL, #1 - ble dgemm_kernel_L4_M8_40 + ble .Ldgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 -dgemm_kernel_L4_M8_40: +.Ldgemm_kernel_L4_M8_40: INIT8x4 -dgemm_kernel_L4_M8_44: +.Ldgemm_kernel_L4_M8_44: ands counterL , origK, #3 - ble dgemm_kernel_L4_M8_100 + ble .Ldgemm_kernel_L4_M8_100 .align 5 -dgemm_kernel_L4_M8_46: +.Ldgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne dgemm_kernel_L4_M8_46 + bne .Ldgemm_kernel_L4_M8_46 -dgemm_kernel_L4_M8_100: +.Ldgemm_kernel_L4_M8_100: lsl temp, origK, #5 prfm PLDL1KEEP, [pA, temp] prfm PLDL1KEEP, [ppA, temp] @@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100: SAVE8x4 -dgemm_kernel_L4_M8_END: +.Ldgemm_kernel_L4_M8_END: lsl temp, origK, #5 // k * 4 * 8 add pA, pA, temp add ppA, ppA, temp subs counterI, counterI, #1 - bne dgemm_kernel_L4_M8_20 + bne .Ldgemm_kernel_L4_M8_20 -dgemm_kernel_L4_M4_BEGIN: +.Ldgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #4 - ble dgemm_kernel_L4_M2_BEGIN + ble .Ldgemm_kernel_L4_M2_BEGIN -dgemm_kernel_L4_M4_20: +.Ldgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L4_M4_40 + ble .Ldgemm_kernel_L4_M4_40 -dgemm_kernel_L4_M4_22: +.Ldgemm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB @@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_22 + bgt .Ldgemm_kernel_L4_M4_22 -dgemm_kernel_L4_M4_40: +.Ldgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M4_100 + ble .Ldgemm_kernel_L4_M4_100 -dgemm_kernel_L4_M4_42: +.Ldgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_42 + bgt .Ldgemm_kernel_L4_M4_42 -dgemm_kernel_L4_M4_100: +.Ldgemm_kernel_L4_M4_100: SAVE4x4 -dgemm_kernel_L4_M4_END: +.Ldgemm_kernel_L4_M4_END: -dgemm_kernel_L4_M2_BEGIN: +.Ldgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L4_M1_BEGIN + ble .Ldgemm_kernel_L4_M1_BEGIN -dgemm_kernel_L4_M2_20: +.Ldgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M2_40 + ble .Ldgemm_kernel_L4_M2_40 -dgemm_kernel_L4_M2_22: +.Ldgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_22 + bgt .Ldgemm_kernel_L4_M2_22 -dgemm_kernel_L4_M2_40: +.Ldgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M2_100 + ble .Ldgemm_kernel_L4_M2_100 -dgemm_kernel_L4_M2_42: +.Ldgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_42 + bgt .Ldgemm_kernel_L4_M2_42 -dgemm_kernel_L4_M2_100: +.Ldgemm_kernel_L4_M2_100: SAVE2x4 -dgemm_kernel_L4_M2_END: +.Ldgemm_kernel_L4_M2_END: -dgemm_kernel_L4_M1_BEGIN: +.Ldgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END -dgemm_kernel_L4_M1_20: +.Ldgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M1_40 + ble .Ldgemm_kernel_L4_M1_40 -dgemm_kernel_L4_M1_22: +.Ldgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_22 + bgt .Ldgemm_kernel_L4_M1_22 -dgemm_kernel_L4_M1_40: +.Ldgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M1_100 + ble .Ldgemm_kernel_L4_M1_100 -dgemm_kernel_L4_M1_42: +.Ldgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_42 + bgt .Ldgemm_kernel_L4_M1_42 -dgemm_kernel_L4_M1_100: +.Ldgemm_kernel_L4_M1_100: SAVE1x4 -dgemm_kernel_L4_END: +.Ldgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt dgemm_kernel_L4_BEGIN + bgt .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ -dgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dgemm_kernel_L999 // error, N was less than 4? + ble .Ldgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dgemm_kernel_L1_BEGIN + ble .Ldgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction -dgemm_kernel_L2_M4_BEGIN: +.Ldgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble dgemm_kernel_L2_M2_BEGIN + ble .Ldgemm_kernel_L2_M2_BEGIN -dgemm_kernel_L2_M4_20: +.Ldgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M4_40 + ble .Ldgemm_kernel_L2_M4_40 .align 5 -dgemm_kernel_L2_M4_22: +.Ldgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_22 + bgt .Ldgemm_kernel_L2_M4_22 -dgemm_kernel_L2_M4_40: +.Ldgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M4_100 + ble .Ldgemm_kernel_L2_M4_100 -dgemm_kernel_L2_M4_42: +.Ldgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_42 + bgt .Ldgemm_kernel_L2_M4_42 -dgemm_kernel_L2_M4_100: +.Ldgemm_kernel_L2_M4_100: SAVE4x2 -dgemm_kernel_L2_M4_END: +.Ldgemm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L2_M4_20 + bgt .Ldgemm_kernel_L2_M4_20 -dgemm_kernel_L2_M2_BEGIN: +.Ldgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L2_M1_BEGIN + ble .Ldgemm_kernel_L2_M1_BEGIN -dgemm_kernel_L2_M2_20: +.Ldgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M2_40 + ble .Ldgemm_kernel_L2_M2_40 -dgemm_kernel_L2_M2_22: +.Ldgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_22 + bgt .Ldgemm_kernel_L2_M2_22 -dgemm_kernel_L2_M2_40: +.Ldgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M2_100 + ble .Ldgemm_kernel_L2_M2_100 -dgemm_kernel_L2_M2_42: +.Ldgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_42 + bgt .Ldgemm_kernel_L2_M2_42 -dgemm_kernel_L2_M2_100: +.Ldgemm_kernel_L2_M2_100: SAVE2x2 -dgemm_kernel_L2_M2_END: +.Ldgemm_kernel_L2_M2_END: -dgemm_kernel_L2_M1_BEGIN: +.Ldgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END -dgemm_kernel_L2_M1_20: +.Ldgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L2_M1_40 + ble .Ldgemm_kernel_L2_M1_40 -dgemm_kernel_L2_M1_22: +.Ldgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_22 + bgt .Ldgemm_kernel_L2_M1_22 -dgemm_kernel_L2_M1_40: +.Ldgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M1_100 + ble .Ldgemm_kernel_L2_M1_100 -dgemm_kernel_L2_M1_42: +.Ldgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_42 + bgt .Ldgemm_kernel_L2_M1_42 -dgemm_kernel_L2_M1_100: +.Ldgemm_kernel_L2_M1_100: SAVE1x2 -dgemm_kernel_L2_END: +.Ldgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -dgemm_kernel_L1_BEGIN: +.Ldgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dgemm_kernel_L999 // done + ble .Ldgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN: -dgemm_kernel_L1_M4_BEGIN: +.Ldgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dgemm_kernel_L1_M2_BEGIN + ble .Ldgemm_kernel_L1_M2_BEGIN -dgemm_kernel_L1_M4_20: +.Ldgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M4_40 + ble .Ldgemm_kernel_L1_M4_40 .align 5 -dgemm_kernel_L1_M4_22: +.Ldgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_22 + bgt .Ldgemm_kernel_L1_M4_22 -dgemm_kernel_L1_M4_40: +.Ldgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M4_100 + ble .Ldgemm_kernel_L1_M4_100 -dgemm_kernel_L1_M4_42: +.Ldgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_42 + bgt .Ldgemm_kernel_L1_M4_42 -dgemm_kernel_L1_M4_100: +.Ldgemm_kernel_L1_M4_100: SAVE4x1 -dgemm_kernel_L1_M4_END: +.Ldgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L1_M4_20 + bgt .Ldgemm_kernel_L1_M4_20 -dgemm_kernel_L1_M2_BEGIN: +.Ldgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L1_M1_BEGIN + ble .Ldgemm_kernel_L1_M1_BEGIN -dgemm_kernel_L1_M2_20: +.Ldgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M2_40 + ble .Ldgemm_kernel_L1_M2_40 -dgemm_kernel_L1_M2_22: +.Ldgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_22 + bgt .Ldgemm_kernel_L1_M2_22 -dgemm_kernel_L1_M2_40: +.Ldgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M2_100 + ble .Ldgemm_kernel_L1_M2_100 -dgemm_kernel_L1_M2_42: +.Ldgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_42 + bgt .Ldgemm_kernel_L1_M2_42 -dgemm_kernel_L1_M2_100: +.Ldgemm_kernel_L1_M2_100: SAVE2x1 -dgemm_kernel_L1_M2_END: +.Ldgemm_kernel_L1_M2_END: -dgemm_kernel_L1_M1_BEGIN: +.Ldgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END -dgemm_kernel_L1_M1_20: +.Ldgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M1_40 + ble .Ldgemm_kernel_L1_M1_40 -dgemm_kernel_L1_M1_22: +.Ldgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_22 + bgt .Ldgemm_kernel_L1_M1_22 -dgemm_kernel_L1_M1_40: +.Ldgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M1_100 + ble .Ldgemm_kernel_L1_M1_100 -dgemm_kernel_L1_M1_42: +.Ldgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_42 + bgt .Ldgemm_kernel_L1_M1_42 -dgemm_kernel_L1_M1_100: +.Ldgemm_kernel_L1_M1_100: SAVE1x1 -dgemm_kernel_L1_END: +.Ldgemm_kernel_L1_END: -dgemm_kernel_L999: +.Ldgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dgemm_kernel_4x8.S b/kernel/arm64/dgemm_kernel_4x8.S index b04dbb5d55..ced26b49ce 100644 --- a/kernel/arm64/dgemm_kernel_4x8.S +++ b/kernel/arm64/dgemm_kernel_4x8.S @@ -938,98 +938,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 - ble dgemm_kernel_L4_BEGIN + ble .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ -dgemm_kernel_L8_BEGIN: +.Ldgemm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 mov pA, origPA // pA = start of A array -dgemm_kernel_L8_M4_BEGIN: +.Ldgemm_kernel_L8_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dgemm_kernel_L8_M2_BEGIN + ble .Ldgemm_kernel_L8_M2_BEGIN -dgemm_kernel_L8_M4_20: +.Ldgemm_kernel_L8_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt dgemm_kernel_L8_M4_32 + blt .Ldgemm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 - ble dgemm_kernel_L8_M4_22a + ble .Ldgemm_kernel_L8_M4_22a .align 5 -dgemm_kernel_L8_M4_22: +.Ldgemm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 - bgt dgemm_kernel_L8_M4_22 + bgt .Ldgemm_kernel_L8_M4_22 -dgemm_kernel_L8_M4_22a: +.Ldgemm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E - b dgemm_kernel_L8_M4_44 + b .Ldgemm_kernel_L8_M4_44 -dgemm_kernel_L8_M4_32: +.Ldgemm_kernel_L8_M4_32: tst counterL, #1 - ble dgemm_kernel_L8_M4_40 + ble .Ldgemm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E - b dgemm_kernel_L8_M4_44 + b .Ldgemm_kernel_L8_M4_44 -dgemm_kernel_L8_M4_40: +.Ldgemm_kernel_L8_M4_40: INIT4x8 -dgemm_kernel_L8_M4_44: +.Ldgemm_kernel_L8_M4_44: ands counterL , origK, #1 - ble dgemm_kernel_L8_M4_100 + ble .Ldgemm_kernel_L8_M4_100 -dgemm_kernel_L8_M4_46: +.Ldgemm_kernel_L8_M4_46: KERNEL4x8_SUB -dgemm_kernel_L8_M4_100: +.Ldgemm_kernel_L8_M4_100: SAVE4x8 -dgemm_kernel_L8_M4_END: +.Ldgemm_kernel_L8_M4_END: subs counterI, counterI, #1 - bne dgemm_kernel_L8_M4_20 + bne .Ldgemm_kernel_L8_M4_20 -dgemm_kernel_L8_M2_BEGIN: +.Ldgemm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L8_END + ble .Ldgemm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L8_M1_BEGIN + ble .Ldgemm_kernel_L8_M1_BEGIN -dgemm_kernel_L8_M2_20: +.Ldgemm_kernel_L8_M2_20: INIT2x8 @@ -1037,9 +1037,9 @@ dgemm_kernel_L8_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L8_M2_40 + ble .Ldgemm_kernel_L8_M2_40 -dgemm_kernel_L8_M2_22: +.Ldgemm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB @@ -1052,34 +1052,34 @@ dgemm_kernel_L8_M2_22: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L8_M2_22 + bgt .Ldgemm_kernel_L8_M2_22 -dgemm_kernel_L8_M2_40: +.Ldgemm_kernel_L8_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L8_M2_100 + ble .Ldgemm_kernel_L8_M2_100 -dgemm_kernel_L8_M2_42: +.Ldgemm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L8_M2_42 + bgt .Ldgemm_kernel_L8_M2_42 -dgemm_kernel_L8_M2_100: +.Ldgemm_kernel_L8_M2_100: SAVE2x8 -dgemm_kernel_L8_M2_END: +.Ldgemm_kernel_L8_M2_END: -dgemm_kernel_L8_M1_BEGIN: +.Ldgemm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L8_END + ble .Ldgemm_kernel_L8_END -dgemm_kernel_L8_M1_20: +.Ldgemm_kernel_L8_M1_20: INIT1x8 @@ -1087,9 +1087,9 @@ dgemm_kernel_L8_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L8_M1_40 + ble .Ldgemm_kernel_L8_M1_40 -dgemm_kernel_L8_M1_22: +.Ldgemm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB @@ -1101,131 +1101,131 @@ dgemm_kernel_L8_M1_22: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L8_M1_22 + bgt .Ldgemm_kernel_L8_M1_22 -dgemm_kernel_L8_M1_40: +.Ldgemm_kernel_L8_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L8_M1_100 + ble .Ldgemm_kernel_L8_M1_100 -dgemm_kernel_L8_M1_42: +.Ldgemm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L8_M1_42 + bgt .Ldgemm_kernel_L8_M1_42 -dgemm_kernel_L8_M1_100: +.Ldgemm_kernel_L8_M1_100: SAVE1x8 -dgemm_kernel_L8_END: +.Ldgemm_kernel_L8_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 8 * 8 subs counterJ, counterJ , #1 // j-- - bgt dgemm_kernel_L8_BEGIN + bgt .Ldgemm_kernel_L8_BEGIN /******************************************************************************/ -dgemm_kernel_L4_BEGIN: +.Ldgemm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 - ble dgemm_kernel_L999 + ble .Ldgemm_kernel_L999 tst counterJ , #4 - ble dgemm_kernel_L2_BEGIN + ble .Ldgemm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 mov pA, origPA // pA = start of A array -dgemm_kernel_L4_M4_BEGIN: +.Ldgemm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dgemm_kernel_L4_M2_BEGIN + ble .Ldgemm_kernel_L4_M2_BEGIN -dgemm_kernel_L4_M4_20: +.Ldgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt dgemm_kernel_L4_M4_32 + blt .Ldgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble dgemm_kernel_L4_M4_22a + ble .Ldgemm_kernel_L4_M4_22a .align 5 -dgemm_kernel_L4_M4_22: +.Ldgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_22 + bgt .Ldgemm_kernel_L4_M4_22 -dgemm_kernel_L4_M4_22a: +.Ldgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b dgemm_kernel_L4_M4_44 + b .Ldgemm_kernel_L4_M4_44 -dgemm_kernel_L4_M4_32: +.Ldgemm_kernel_L4_M4_32: tst counterL, #1 - ble dgemm_kernel_L4_M4_40 + ble .Ldgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b dgemm_kernel_L4_M4_44 + b .Ldgemm_kernel_L4_M4_44 -dgemm_kernel_L4_M4_40: +.Ldgemm_kernel_L4_M4_40: INIT4x4 -dgemm_kernel_L4_M4_44: +.Ldgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble dgemm_kernel_L4_M4_100 + ble .Ldgemm_kernel_L4_M4_100 -dgemm_kernel_L4_M4_46: +.Ldgemm_kernel_L4_M4_46: KERNEL4x4_SUB -dgemm_kernel_L4_M4_100: +.Ldgemm_kernel_L4_M4_100: SAVE4x4 -dgemm_kernel_L4_M4_END: +.Ldgemm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne dgemm_kernel_L4_M4_20 + bne .Ldgemm_kernel_L4_M4_20 -dgemm_kernel_L4_M2_BEGIN: +.Ldgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L4_M1_BEGIN + ble .Ldgemm_kernel_L4_M1_BEGIN -dgemm_kernel_L4_M2_20: +.Ldgemm_kernel_L4_M2_20: INIT2x4 @@ -1233,9 +1233,9 @@ dgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M2_40 + ble .Ldgemm_kernel_L4_M2_40 -dgemm_kernel_L4_M2_22: +.Ldgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1248,34 +1248,34 @@ dgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_22 + bgt .Ldgemm_kernel_L4_M2_22 -dgemm_kernel_L4_M2_40: +.Ldgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M2_100 + ble .Ldgemm_kernel_L4_M2_100 -dgemm_kernel_L4_M2_42: +.Ldgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_42 + bgt .Ldgemm_kernel_L4_M2_42 -dgemm_kernel_L4_M2_100: +.Ldgemm_kernel_L4_M2_100: SAVE2x4 -dgemm_kernel_L4_M2_END: +.Ldgemm_kernel_L4_M2_END: -dgemm_kernel_L4_M1_BEGIN: +.Ldgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END -dgemm_kernel_L4_M1_20: +.Ldgemm_kernel_L4_M1_20: INIT1x4 @@ -1283,9 +1283,9 @@ dgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M1_40 + ble .Ldgemm_kernel_L4_M1_40 -dgemm_kernel_L4_M1_22: +.Ldgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1297,40 +1297,40 @@ dgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_22 + bgt .Ldgemm_kernel_L4_M1_22 -dgemm_kernel_L4_M1_40: +.Ldgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M1_100 + ble .Ldgemm_kernel_L4_M1_100 -dgemm_kernel_L4_M1_42: +.Ldgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_42 + bgt .Ldgemm_kernel_L4_M1_42 -dgemm_kernel_L4_M1_100: +.Ldgemm_kernel_L4_M1_100: SAVE1x4 -dgemm_kernel_L4_END: +.Ldgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 /******************************************************************************/ -dgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dgemm_kernel_L999 // error, N was less than 4? + ble .Ldgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dgemm_kernel_L1_BEGIN + ble .Ldgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1339,14 +1339,14 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dgemm_kernel_L2_M4_BEGIN: +.Ldgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble dgemm_kernel_L2_M2_BEGIN + ble .Ldgemm_kernel_L2_M2_BEGIN -dgemm_kernel_L2_M4_20: +.Ldgemm_kernel_L2_M4_20: INIT4x2 @@ -1354,10 +1354,10 @@ dgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M4_40 + ble .Ldgemm_kernel_L2_M4_40 .align 5 -dgemm_kernel_L2_M4_22: +.Ldgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1369,41 +1369,41 @@ dgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_22 + bgt .Ldgemm_kernel_L2_M4_22 -dgemm_kernel_L2_M4_40: +.Ldgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M4_100 + ble .Ldgemm_kernel_L2_M4_100 -dgemm_kernel_L2_M4_42: +.Ldgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_42 + bgt .Ldgemm_kernel_L2_M4_42 -dgemm_kernel_L2_M4_100: +.Ldgemm_kernel_L2_M4_100: SAVE4x2 -dgemm_kernel_L2_M4_END: +.Ldgemm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L2_M4_20 + bgt .Ldgemm_kernel_L2_M4_20 -dgemm_kernel_L2_M2_BEGIN: +.Ldgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L2_M1_BEGIN + ble .Ldgemm_kernel_L2_M1_BEGIN -dgemm_kernel_L2_M2_20: +.Ldgemm_kernel_L2_M2_20: INIT2x2 @@ -1411,9 +1411,9 @@ dgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M2_40 + ble .Ldgemm_kernel_L2_M2_40 -dgemm_kernel_L2_M2_22: +.Ldgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1426,34 +1426,34 @@ dgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_22 + bgt .Ldgemm_kernel_L2_M2_22 -dgemm_kernel_L2_M2_40: +.Ldgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M2_100 + ble .Ldgemm_kernel_L2_M2_100 -dgemm_kernel_L2_M2_42: +.Ldgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_42 + bgt .Ldgemm_kernel_L2_M2_42 -dgemm_kernel_L2_M2_100: +.Ldgemm_kernel_L2_M2_100: SAVE2x2 -dgemm_kernel_L2_M2_END: +.Ldgemm_kernel_L2_M2_END: -dgemm_kernel_L2_M1_BEGIN: +.Ldgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END -dgemm_kernel_L2_M1_20: +.Ldgemm_kernel_L2_M1_20: INIT1x2 @@ -1461,9 +1461,9 @@ dgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L2_M1_40 + ble .Ldgemm_kernel_L2_M1_40 -dgemm_kernel_L2_M1_22: +.Ldgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1475,35 +1475,35 @@ dgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_22 + bgt .Ldgemm_kernel_L2_M1_22 -dgemm_kernel_L2_M1_40: +.Ldgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M1_100 + ble .Ldgemm_kernel_L2_M1_100 -dgemm_kernel_L2_M1_42: +.Ldgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_42 + bgt .Ldgemm_kernel_L2_M1_42 -dgemm_kernel_L2_M1_100: +.Ldgemm_kernel_L2_M1_100: SAVE1x2 -dgemm_kernel_L2_END: +.Ldgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -dgemm_kernel_L1_BEGIN: +.Ldgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dgemm_kernel_L999 // done + ble .Ldgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1511,24 +1511,24 @@ dgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -dgemm_kernel_L1_M4_BEGIN: +.Ldgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dgemm_kernel_L1_M2_BEGIN + ble .Ldgemm_kernel_L1_M2_BEGIN -dgemm_kernel_L1_M4_20: +.Ldgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M4_40 + ble .Ldgemm_kernel_L1_M4_40 .align 5 -dgemm_kernel_L1_M4_22: +.Ldgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1540,41 +1540,41 @@ dgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_22 + bgt .Ldgemm_kernel_L1_M4_22 -dgemm_kernel_L1_M4_40: +.Ldgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M4_100 + ble .Ldgemm_kernel_L1_M4_100 -dgemm_kernel_L1_M4_42: +.Ldgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_42 + bgt .Ldgemm_kernel_L1_M4_42 -dgemm_kernel_L1_M4_100: +.Ldgemm_kernel_L1_M4_100: SAVE4x1 -dgemm_kernel_L1_M4_END: +.Ldgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L1_M4_20 + bgt .Ldgemm_kernel_L1_M4_20 -dgemm_kernel_L1_M2_BEGIN: +.Ldgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L1_M1_BEGIN + ble .Ldgemm_kernel_L1_M1_BEGIN -dgemm_kernel_L1_M2_20: +.Ldgemm_kernel_L1_M2_20: INIT2x1 @@ -1582,9 +1582,9 @@ dgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M2_40 + ble .Ldgemm_kernel_L1_M2_40 -dgemm_kernel_L1_M2_22: +.Ldgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1597,34 +1597,34 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_22 + bgt .Ldgemm_kernel_L1_M2_22 -dgemm_kernel_L1_M2_40: +.Ldgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M2_100 + ble .Ldgemm_kernel_L1_M2_100 -dgemm_kernel_L1_M2_42: +.Ldgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_42 + bgt .Ldgemm_kernel_L1_M2_42 -dgemm_kernel_L1_M2_100: +.Ldgemm_kernel_L1_M2_100: SAVE2x1 -dgemm_kernel_L1_M2_END: +.Ldgemm_kernel_L1_M2_END: -dgemm_kernel_L1_M1_BEGIN: +.Ldgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END -dgemm_kernel_L1_M1_20: +.Ldgemm_kernel_L1_M1_20: INIT1x1 @@ -1632,9 +1632,9 @@ dgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M1_40 + ble .Ldgemm_kernel_L1_M1_40 -dgemm_kernel_L1_M1_22: +.Ldgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1646,30 +1646,30 @@ dgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_22 + bgt .Ldgemm_kernel_L1_M1_22 -dgemm_kernel_L1_M1_40: +.Ldgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M1_100 + ble .Ldgemm_kernel_L1_M1_100 -dgemm_kernel_L1_M1_42: +.Ldgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_42 + bgt .Ldgemm_kernel_L1_M1_42 -dgemm_kernel_L1_M1_100: +.Ldgemm_kernel_L1_M1_100: SAVE1x1 -dgemm_kernel_L1_END: +.Ldgemm_kernel_L1_END: -dgemm_kernel_L999: +.Ldgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S index 3fd74fc3ba..af3aa0217d 100644 --- a/kernel/arm64/dgemm_kernel_8x4.S +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dgemm_kernel_L2_BEGIN + ble .Ldgemm_kernel_L2_BEGIN /******************************************************************************/ .align 5 -dgemm_kernel_L4_BEGIN: +.Ldgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dgemm_kernel_L4_M8_BEGIN: +.Ldgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L4_M4_BEGIN + ble .Ldgemm_kernel_L4_M4_BEGIN .align 5 -dgemm_kernel_L4_M8_20: +.Ldgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? - blt dgemm_kernel_L4_M8_32 + blt .Ldgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble dgemm_kernel_L4_M8_22a + ble .Ldgemm_kernel_L4_M8_22a .align 5 -dgemm_kernel_L4_M8_22: +.Ldgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 @@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22: KERNEL8x4_M2 subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M8_22 + bgt .Ldgemm_kernel_L4_M8_22 .align 5 -dgemm_kernel_L4_M8_22a: +.Ldgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 @@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 .align 5 -dgemm_kernel_L4_M8_32: +.Ldgemm_kernel_L4_M8_32: tst counterL, #1 - ble dgemm_kernel_L4_M8_40 + ble .Ldgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 -dgemm_kernel_L4_M8_40: +.Ldgemm_kernel_L4_M8_40: INIT8x4 -dgemm_kernel_L4_M8_44: +.Ldgemm_kernel_L4_M8_44: ands counterL , origK, #7 - ble dgemm_kernel_L4_M8_100 + ble .Ldgemm_kernel_L4_M8_100 .align 5 -dgemm_kernel_L4_M8_46: +.Ldgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne dgemm_kernel_L4_M8_46 + bne .Ldgemm_kernel_L4_M8_46 -dgemm_kernel_L4_M8_100: +.Ldgemm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE8x4 -dgemm_kernel_L4_M8_END: +.Ldgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne dgemm_kernel_L4_M8_20 + bne .Ldgemm_kernel_L4_M8_20 -dgemm_kernel_L4_M4_BEGIN: +.Ldgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #4 - ble dgemm_kernel_L4_M2_BEGIN + ble .Ldgemm_kernel_L4_M2_BEGIN -dgemm_kernel_L4_M4_20: +.Ldgemm_kernel_L4_M4_20: INIT4x4 @@ -1019,10 +1019,10 @@ dgemm_kernel_L4_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M4_40 + ble .Ldgemm_kernel_L4_M4_40 .align 5 -dgemm_kernel_L4_M4_22: +.Ldgemm_kernel_L4_M4_22: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22: prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_22 + bgt .Ldgemm_kernel_L4_M4_22 -dgemm_kernel_L4_M4_40: +.Ldgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M4_100 + ble .Ldgemm_kernel_L4_M4_100 -dgemm_kernel_L4_M4_42: +.Ldgemm_kernel_L4_M4_42: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_42 + bgt .Ldgemm_kernel_L4_M4_42 -dgemm_kernel_L4_M4_100: +.Ldgemm_kernel_L4_M4_100: SAVE4x4 -dgemm_kernel_L4_M4_END: +.Ldgemm_kernel_L4_M4_END: -dgemm_kernel_L4_M2_BEGIN: +.Ldgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L4_M1_BEGIN + ble .Ldgemm_kernel_L4_M1_BEGIN -dgemm_kernel_L4_M2_20: +.Ldgemm_kernel_L4_M2_20: INIT2x4 @@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M2_40 + ble .Ldgemm_kernel_L4_M2_40 .align 5 -dgemm_kernel_L4_M2_22: +.Ldgemm_kernel_L4_M2_22: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_22 + bgt .Ldgemm_kernel_L4_M2_22 -dgemm_kernel_L4_M2_40: +.Ldgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M2_100 + ble .Ldgemm_kernel_L4_M2_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] -dgemm_kernel_L4_M2_42: +.Ldgemm_kernel_L4_M2_42: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_42 + bgt .Ldgemm_kernel_L4_M2_42 -dgemm_kernel_L4_M2_100: +.Ldgemm_kernel_L4_M2_100: SAVE2x4 -dgemm_kernel_L4_M2_END: +.Ldgemm_kernel_L4_M2_END: -dgemm_kernel_L4_M1_BEGIN: +.Ldgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END -dgemm_kernel_L4_M1_20: +.Ldgemm_kernel_L4_M1_20: INIT1x4 @@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M1_40 + ble .Ldgemm_kernel_L4_M1_40 .align 5 -dgemm_kernel_L4_M1_22: +.Ldgemm_kernel_L4_M1_22: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] KERNEL1x4_SUB @@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_22 + bgt .Ldgemm_kernel_L4_M1_22 -dgemm_kernel_L4_M1_40: +.Ldgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M1_100 + ble .Ldgemm_kernel_L4_M1_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] -dgemm_kernel_L4_M1_42: +.Ldgemm_kernel_L4_M1_42: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_42 + bgt .Ldgemm_kernel_L4_M1_42 -dgemm_kernel_L4_M1_100: +.Ldgemm_kernel_L4_M1_100: SAVE1x4 -dgemm_kernel_L4_END: +.Ldgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt dgemm_kernel_L4_BEGIN + bgt .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ -dgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dgemm_kernel_L999 // error, N was less than 4? + ble .Ldgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dgemm_kernel_L1_BEGIN + ble .Ldgemm_kernel_L1_BEGIN mov pCRow0, pC add pCRow1, pCRow0, LDC @@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dgemm_kernel_L2_M8_BEGIN: +.Ldgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L2_M4_BEGIN + ble .Ldgemm_kernel_L2_M4_BEGIN .align 5 -dgemm_kernel_L2_M8_20: +.Ldgemm_kernel_L2_M8_20: INIT8x2 @@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M8_40 + ble .Ldgemm_kernel_L2_M8_40 .align 5 -dgemm_kernel_L2_M8_22: +.Ldgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_22 + bgt .Ldgemm_kernel_L2_M8_22 -dgemm_kernel_L2_M8_40: +.Ldgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M8_100 + ble .Ldgemm_kernel_L2_M8_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M8_42: +.Ldgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_42 + bgt .Ldgemm_kernel_L2_M8_42 -dgemm_kernel_L2_M8_100: +.Ldgemm_kernel_L2_M8_100: SAVE8x2 -dgemm_kernel_L2_M8_END: +.Ldgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L2_M8_20 + bgt .Ldgemm_kernel_L2_M8_20 -dgemm_kernel_L2_M4_BEGIN: +.Ldgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L2_M2_BEGIN + ble .Ldgemm_kernel_L2_M2_BEGIN -dgemm_kernel_L2_M4_20: +.Ldgemm_kernel_L2_M4_20: INIT4x2 @@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M4_40 + ble .Ldgemm_kernel_L2_M4_40 .align 5 -dgemm_kernel_L2_M4_22: +.Ldgemm_kernel_L2_M4_22: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x2_SUB @@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_22 + bgt .Ldgemm_kernel_L2_M4_22 -dgemm_kernel_L2_M4_40: +.Ldgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M4_100 + ble .Ldgemm_kernel_L2_M4_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M4_42: +.Ldgemm_kernel_L2_M4_42: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_42 + bgt .Ldgemm_kernel_L2_M4_42 -dgemm_kernel_L2_M4_100: +.Ldgemm_kernel_L2_M4_100: SAVE4x2 -dgemm_kernel_L2_M4_END: +.Ldgemm_kernel_L2_M4_END: -dgemm_kernel_L2_M2_BEGIN: +.Ldgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L2_M1_BEGIN + ble .Ldgemm_kernel_L2_M1_BEGIN -dgemm_kernel_L2_M2_20: +.Ldgemm_kernel_L2_M2_20: INIT2x2 @@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M2_40 + ble .Ldgemm_kernel_L2_M2_40 -dgemm_kernel_L2_M2_22: +.Ldgemm_kernel_L2_M2_22: KERNEL2x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_22 + bgt .Ldgemm_kernel_L2_M2_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M2_40: +.Ldgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M2_100 + ble .Ldgemm_kernel_L2_M2_100 -dgemm_kernel_L2_M2_42: +.Ldgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_42 + bgt .Ldgemm_kernel_L2_M2_42 -dgemm_kernel_L2_M2_100: +.Ldgemm_kernel_L2_M2_100: SAVE2x2 -dgemm_kernel_L2_M2_END: +.Ldgemm_kernel_L2_M2_END: -dgemm_kernel_L2_M1_BEGIN: +.Ldgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END -dgemm_kernel_L2_M1_20: +.Ldgemm_kernel_L2_M1_20: INIT1x2 @@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L2_M1_40 + ble .Ldgemm_kernel_L2_M1_40 -dgemm_kernel_L2_M1_22: +.Ldgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pB, #B_PRE_SIZE] @@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_22 + bgt .Ldgemm_kernel_L2_M1_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] -dgemm_kernel_L2_M1_40: +.Ldgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M1_100 + ble .Ldgemm_kernel_L2_M1_100 -dgemm_kernel_L2_M1_42: +.Ldgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_42 + bgt .Ldgemm_kernel_L2_M1_42 -dgemm_kernel_L2_M1_100: +.Ldgemm_kernel_L2_M1_100: SAVE1x2 -dgemm_kernel_L2_END: +.Ldgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -dgemm_kernel_L1_BEGIN: +.Ldgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dgemm_kernel_L999 // done + ble .Ldgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A -dgemm_kernel_L1_M8_BEGIN: +.Ldgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L1_M4_BEGIN + ble .Ldgemm_kernel_L1_M4_BEGIN .align 5 -dgemm_kernel_L1_M8_20: +.Ldgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M8_40 + ble .Ldgemm_kernel_L1_M8_40 .align 5 -dgemm_kernel_L1_M8_22: +.Ldgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_22 + bgt .Ldgemm_kernel_L1_M8_22 -dgemm_kernel_L1_M8_40: +.Ldgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M8_100 + ble .Ldgemm_kernel_L1_M8_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M8_42: +.Ldgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_42 + bgt .Ldgemm_kernel_L1_M8_42 -dgemm_kernel_L1_M8_100: +.Ldgemm_kernel_L1_M8_100: SAVE8x1 -dgemm_kernel_L1_M8_END: +.Ldgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L1_M8_20 + bgt .Ldgemm_kernel_L1_M8_20 -dgemm_kernel_L1_M4_BEGIN: +.Ldgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L1_M2_BEGIN + ble .Ldgemm_kernel_L1_M2_BEGIN -dgemm_kernel_L1_M4_20: +.Ldgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M4_40 + ble .Ldgemm_kernel_L1_M4_40 .align 5 -dgemm_kernel_L1_M4_22: +.Ldgemm_kernel_L1_M4_22: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] KERNEL4x1_SUB @@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_22 + bgt .Ldgemm_kernel_L1_M4_22 -dgemm_kernel_L1_M4_40: +.Ldgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M4_100 + ble .Ldgemm_kernel_L1_M4_100 prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M4_42: +.Ldgemm_kernel_L1_M4_42: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_42 + bgt .Ldgemm_kernel_L1_M4_42 -dgemm_kernel_L1_M4_100: +.Ldgemm_kernel_L1_M4_100: SAVE4x1 -dgemm_kernel_L1_M4_END: +.Ldgemm_kernel_L1_M4_END: -dgemm_kernel_L1_M2_BEGIN: +.Ldgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L1_M1_BEGIN + ble .Ldgemm_kernel_L1_M1_BEGIN -dgemm_kernel_L1_M2_20: +.Ldgemm_kernel_L1_M2_20: INIT2x1 @@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M2_40 + ble .Ldgemm_kernel_L1_M2_40 -dgemm_kernel_L1_M2_22: +.Ldgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_22 + bgt .Ldgemm_kernel_L1_M2_22 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M2_40: +.Ldgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M2_100 + ble .Ldgemm_kernel_L1_M2_100 -dgemm_kernel_L1_M2_42: +.Ldgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_42 + bgt .Ldgemm_kernel_L1_M2_42 -dgemm_kernel_L1_M2_100: +.Ldgemm_kernel_L1_M2_100: SAVE2x1 -dgemm_kernel_L1_M2_END: +.Ldgemm_kernel_L1_M2_END: -dgemm_kernel_L1_M1_BEGIN: +.Ldgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END -dgemm_kernel_L1_M1_20: +.Ldgemm_kernel_L1_M1_20: INIT1x1 @@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M1_40 + ble .Ldgemm_kernel_L1_M1_40 -dgemm_kernel_L1_M1_22: +.Ldgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB prfm PLDL1KEEP, [pA, #A_PRE_SIZE] @@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_22 + bgt .Ldgemm_kernel_L1_M1_22 -dgemm_kernel_L1_M1_40: +.Ldgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M1_100 + ble .Ldgemm_kernel_L1_M1_100 prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] -dgemm_kernel_L1_M1_42: +.Ldgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_42 + bgt .Ldgemm_kernel_L1_M1_42 -dgemm_kernel_L1_M1_100: +.Ldgemm_kernel_L1_M1_100: SAVE1x1 -dgemm_kernel_L1_END: +.Ldgemm_kernel_L1_END: -dgemm_kernel_L999: +.Ldgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S index 86865d825c..598db6e0cd 100644 --- a/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S +++ b/kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S @@ -962,12 +962,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dgemm_kernel_L2_BEGIN + ble .Ldgemm_kernel_L2_BEGIN /******************************************************************************/ .align 5 -dgemm_kernel_L4_BEGIN: +.Ldgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -977,21 +977,21 @@ dgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dgemm_kernel_L4_M8_BEGIN: +.Ldgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L4_M4_BEGIN + ble .Ldgemm_kernel_L4_M4_BEGIN .align 5 -dgemm_kernel_L4_M8_20: +.Ldgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #7 // L = K / 128 cmp counterL , #2 // is there at least 4 to do? - blt dgemm_kernel_L4_M8_32 + blt .Ldgemm_kernel_L4_M8_32 KERNEL8x4_I KERNEL8x4_M2 @@ -1003,18 +1003,18 @@ dgemm_kernel_L4_M8_20: KERNEL8x4_M1_M2_x1 subs counterL, counterL, #2 // subtract 2 - ble dgemm_kernel_L4_M8_22a + ble .Ldgemm_kernel_L4_M8_22a .align 5 -dgemm_kernel_L4_M8_22: +.Ldgemm_kernel_L4_M8_22: KERNEL8x4_M1_M2_x64 subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M8_22 + bgt .Ldgemm_kernel_L4_M8_22 .align 5 -dgemm_kernel_L4_M8_22a: +.Ldgemm_kernel_L4_M8_22a: KERNEL8x4_M1_M2_x32 KERNEL8x4_M1_M2_x16 @@ -1025,13 +1025,13 @@ dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 .align 5 -dgemm_kernel_L4_M8_32: +.Ldgemm_kernel_L4_M8_32: tst counterL, #1 - ble dgemm_kernel_L4_M8_40 + ble .Ldgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -1043,26 +1043,26 @@ dgemm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b dgemm_kernel_L4_M8_44 + b .Ldgemm_kernel_L4_M8_44 -dgemm_kernel_L4_M8_40: +.Ldgemm_kernel_L4_M8_40: INIT8x4 -dgemm_kernel_L4_M8_44: +.Ldgemm_kernel_L4_M8_44: ands counterL , origK, #127 - ble dgemm_kernel_L4_M8_100 + ble .Ldgemm_kernel_L4_M8_100 .align 5 -dgemm_kernel_L4_M8_46: +.Ldgemm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne dgemm_kernel_L4_M8_46 + bne .Ldgemm_kernel_L4_M8_46 -dgemm_kernel_L4_M8_100: +.Ldgemm_kernel_L4_M8_100: prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE] @@ -1073,20 +1073,20 @@ dgemm_kernel_L4_M8_100: SAVE8x4 -dgemm_kernel_L4_M8_END: +.Ldgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne dgemm_kernel_L4_M8_20 + bne .Ldgemm_kernel_L4_M8_20 -dgemm_kernel_L4_M4_BEGIN: +.Ldgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #4 - ble dgemm_kernel_L4_M2_BEGIN + ble .Ldgemm_kernel_L4_M2_BEGIN -dgemm_kernel_L4_M4_20: +.Ldgemm_kernel_L4_M4_20: INIT4x4 @@ -1094,10 +1094,10 @@ dgemm_kernel_L4_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M4_40 + ble .Ldgemm_kernel_L4_M4_40 .align 5 -dgemm_kernel_L4_M4_22: +.Ldgemm_kernel_L4_M4_22: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1118,38 +1118,38 @@ dgemm_kernel_L4_M4_22: prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_22 + bgt .Ldgemm_kernel_L4_M4_22 -dgemm_kernel_L4_M4_40: +.Ldgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M4_100 + ble .Ldgemm_kernel_L4_M4_100 -dgemm_kernel_L4_M4_42: +.Ldgemm_kernel_L4_M4_42: KERNEL4x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M4_42 + bgt .Ldgemm_kernel_L4_M4_42 -dgemm_kernel_L4_M4_100: +.Ldgemm_kernel_L4_M4_100: SAVE4x4 -dgemm_kernel_L4_M4_END: +.Ldgemm_kernel_L4_M4_END: -dgemm_kernel_L4_M2_BEGIN: +.Ldgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L4_M1_BEGIN + ble .Ldgemm_kernel_L4_M1_BEGIN -dgemm_kernel_L4_M2_20: +.Ldgemm_kernel_L4_M2_20: INIT2x4 @@ -1157,10 +1157,10 @@ dgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M2_40 + ble .Ldgemm_kernel_L4_M2_40 .align 5 -dgemm_kernel_L4_M2_22: +.Ldgemm_kernel_L4_M2_22: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1179,37 +1179,37 @@ dgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_22 + bgt .Ldgemm_kernel_L4_M2_22 -dgemm_kernel_L4_M2_40: +.Ldgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M2_100 + ble .Ldgemm_kernel_L4_M2_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] -dgemm_kernel_L4_M2_42: +.Ldgemm_kernel_L4_M2_42: KERNEL2x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M2_42 + bgt .Ldgemm_kernel_L4_M2_42 -dgemm_kernel_L4_M2_100: +.Ldgemm_kernel_L4_M2_100: SAVE2x4 -dgemm_kernel_L4_M2_END: +.Ldgemm_kernel_L4_M2_END: -dgemm_kernel_L4_M1_BEGIN: +.Ldgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L4_END + ble .Ldgemm_kernel_L4_END -dgemm_kernel_L4_M1_20: +.Ldgemm_kernel_L4_M1_20: INIT1x4 @@ -1217,10 +1217,10 @@ dgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L4_M1_40 + ble .Ldgemm_kernel_L4_M1_40 .align 5 -dgemm_kernel_L4_M1_22: +.Ldgemm_kernel_L4_M1_22: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] KERNEL1x4_SUB @@ -1238,46 +1238,46 @@ dgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_22 + bgt .Ldgemm_kernel_L4_M1_22 -dgemm_kernel_L4_M1_40: +.Ldgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L4_M1_100 + ble .Ldgemm_kernel_L4_M1_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] -dgemm_kernel_L4_M1_42: +.Ldgemm_kernel_L4_M1_42: KERNEL1x4_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L4_M1_42 + bgt .Ldgemm_kernel_L4_M1_42 -dgemm_kernel_L4_M1_100: +.Ldgemm_kernel_L4_M1_100: SAVE1x4 -dgemm_kernel_L4_END: +.Ldgemm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 subs counterJ, counterJ , #1 // j-- - bgt dgemm_kernel_L4_BEGIN + bgt .Ldgemm_kernel_L4_BEGIN /******************************************************************************/ -dgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dgemm_kernel_L999 // error, N was less than 4? + ble .Ldgemm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dgemm_kernel_L1_BEGIN + ble .Ldgemm_kernel_L1_BEGIN mov pCRow0, pC add pCRow1, pCRow0, LDC @@ -1286,15 +1286,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dgemm_kernel_L2_M8_BEGIN: +.Ldgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L2_M4_BEGIN + ble .Ldgemm_kernel_L2_M4_BEGIN .align 5 -dgemm_kernel_L2_M8_20: +.Ldgemm_kernel_L2_M8_20: INIT8x2 @@ -1302,10 +1302,10 @@ dgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M8_40 + ble .Ldgemm_kernel_L2_M8_40 .align 5 -dgemm_kernel_L2_M8_22: +.Ldgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1319,41 +1319,41 @@ dgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_22 + bgt .Ldgemm_kernel_L2_M8_22 -dgemm_kernel_L2_M8_40: +.Ldgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M8_100 + ble .Ldgemm_kernel_L2_M8_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M8_42: +.Ldgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M8_42 + bgt .Ldgemm_kernel_L2_M8_42 -dgemm_kernel_L2_M8_100: +.Ldgemm_kernel_L2_M8_100: SAVE8x2 -dgemm_kernel_L2_M8_END: +.Ldgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L2_M8_20 + bgt .Ldgemm_kernel_L2_M8_20 -dgemm_kernel_L2_M4_BEGIN: +.Ldgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L2_M2_BEGIN + ble .Ldgemm_kernel_L2_M2_BEGIN -dgemm_kernel_L2_M4_20: +.Ldgemm_kernel_L2_M4_20: INIT4x2 @@ -1361,10 +1361,10 @@ dgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M4_40 + ble .Ldgemm_kernel_L2_M4_40 .align 5 -dgemm_kernel_L2_M4_22: +.Ldgemm_kernel_L2_M4_22: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x2_SUB @@ -1382,41 +1382,41 @@ dgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_22 + bgt .Ldgemm_kernel_L2_M4_22 -dgemm_kernel_L2_M4_40: +.Ldgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M4_100 + ble .Ldgemm_kernel_L2_M4_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M4_42: +.Ldgemm_kernel_L2_M4_42: KERNEL4x2_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M4_42 + bgt .Ldgemm_kernel_L2_M4_42 -dgemm_kernel_L2_M4_100: +.Ldgemm_kernel_L2_M4_100: SAVE4x2 -dgemm_kernel_L2_M4_END: +.Ldgemm_kernel_L2_M4_END: -dgemm_kernel_L2_M2_BEGIN: +.Ldgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L2_M1_BEGIN + ble .Ldgemm_kernel_L2_M1_BEGIN -dgemm_kernel_L2_M2_20: +.Ldgemm_kernel_L2_M2_20: INIT2x2 @@ -1424,9 +1424,9 @@ dgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dgemm_kernel_L2_M2_40 + ble .Ldgemm_kernel_L2_M2_40 -dgemm_kernel_L2_M2_22: +.Ldgemm_kernel_L2_M2_22: KERNEL2x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1443,37 +1443,37 @@ dgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_22 + bgt .Ldgemm_kernel_L2_M2_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M2_40: +.Ldgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M2_100 + ble .Ldgemm_kernel_L2_M2_100 -dgemm_kernel_L2_M2_42: +.Ldgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M2_42 + bgt .Ldgemm_kernel_L2_M2_42 -dgemm_kernel_L2_M2_100: +.Ldgemm_kernel_L2_M2_100: SAVE2x2 -dgemm_kernel_L2_M2_END: +.Ldgemm_kernel_L2_M2_END: -dgemm_kernel_L2_M1_BEGIN: +.Ldgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L2_END + ble .Ldgemm_kernel_L2_END -dgemm_kernel_L2_M1_20: +.Ldgemm_kernel_L2_M1_20: INIT1x2 @@ -1481,9 +1481,9 @@ dgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dgemm_kernel_L2_M1_40 + ble .Ldgemm_kernel_L2_M1_40 -dgemm_kernel_L2_M1_22: +.Ldgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB prfm PLDL1KEEP, [pB, B_PRE_SIZE] @@ -1499,62 +1499,62 @@ dgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_22 + bgt .Ldgemm_kernel_L2_M1_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] -dgemm_kernel_L2_M1_40: +.Ldgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L2_M1_100 + ble .Ldgemm_kernel_L2_M1_100 -dgemm_kernel_L2_M1_42: +.Ldgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L2_M1_42 + bgt .Ldgemm_kernel_L2_M1_42 -dgemm_kernel_L2_M1_100: +.Ldgemm_kernel_L2_M1_100: SAVE1x2 -dgemm_kernel_L2_END: +.Ldgemm_kernel_L2_END: add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 /******************************************************************************/ -dgemm_kernel_L1_BEGIN: +.Ldgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dgemm_kernel_L999 // done + ble .Ldgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next mov pA, origPA // pA = A -dgemm_kernel_L1_M8_BEGIN: +.Ldgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dgemm_kernel_L1_M4_BEGIN + ble .Ldgemm_kernel_L1_M4_BEGIN .align 5 -dgemm_kernel_L1_M8_20: +.Ldgemm_kernel_L1_M8_20: INIT8x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M8_40 + ble .Ldgemm_kernel_L1_M8_40 .align 5 -dgemm_kernel_L1_M8_22: +.Ldgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1568,51 +1568,51 @@ dgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_22 + bgt .Ldgemm_kernel_L1_M8_22 -dgemm_kernel_L1_M8_40: +.Ldgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M8_100 + ble .Ldgemm_kernel_L1_M8_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M8_42: +.Ldgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M8_42 + bgt .Ldgemm_kernel_L1_M8_42 -dgemm_kernel_L1_M8_100: +.Ldgemm_kernel_L1_M8_100: SAVE8x1 -dgemm_kernel_L1_M8_END: +.Ldgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt dgemm_kernel_L1_M8_20 + bgt .Ldgemm_kernel_L1_M8_20 -dgemm_kernel_L1_M4_BEGIN: +.Ldgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble dgemm_kernel_L1_M2_BEGIN + ble .Ldgemm_kernel_L1_M2_BEGIN -dgemm_kernel_L1_M4_20: +.Ldgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M4_40 + ble .Ldgemm_kernel_L1_M4_40 .align 5 -dgemm_kernel_L1_M4_22: +.Ldgemm_kernel_L1_M4_22: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] KERNEL4x1_SUB @@ -1630,39 +1630,39 @@ dgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_22 + bgt .Ldgemm_kernel_L1_M4_22 -dgemm_kernel_L1_M4_40: +.Ldgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M4_100 + ble .Ldgemm_kernel_L1_M4_100 prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M4_42: +.Ldgemm_kernel_L1_M4_42: KERNEL4x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M4_42 + bgt .Ldgemm_kernel_L1_M4_42 -dgemm_kernel_L1_M4_100: +.Ldgemm_kernel_L1_M4_100: SAVE4x1 -dgemm_kernel_L1_M4_END: +.Ldgemm_kernel_L1_M4_END: -dgemm_kernel_L1_M2_BEGIN: +.Ldgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dgemm_kernel_L1_M1_BEGIN + ble .Ldgemm_kernel_L1_M1_BEGIN -dgemm_kernel_L1_M2_20: +.Ldgemm_kernel_L1_M2_20: INIT2x1 @@ -1670,9 +1670,9 @@ dgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M2_40 + ble .Ldgemm_kernel_L1_M2_40 -dgemm_kernel_L1_M2_22: +.Ldgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1689,36 +1689,36 @@ dgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_22 + bgt .Ldgemm_kernel_L1_M2_22 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M2_40: +.Ldgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M2_100 + ble .Ldgemm_kernel_L1_M2_100 -dgemm_kernel_L1_M2_42: +.Ldgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M2_42 + bgt .Ldgemm_kernel_L1_M2_42 -dgemm_kernel_L1_M2_100: +.Ldgemm_kernel_L1_M2_100: SAVE2x1 -dgemm_kernel_L1_M2_END: +.Ldgemm_kernel_L1_M2_END: -dgemm_kernel_L1_M1_BEGIN: +.Ldgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dgemm_kernel_L1_END + ble .Ldgemm_kernel_L1_END -dgemm_kernel_L1_M1_20: +.Ldgemm_kernel_L1_M1_20: INIT1x1 @@ -1726,10 +1726,10 @@ dgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dgemm_kernel_L1_M1_40 + ble .Ldgemm_kernel_L1_M1_40 -dgemm_kernel_L1_M1_22: +.Ldgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB prfm PLDL1KEEP, [pA, A_PRE_SIZE] @@ -1743,32 +1743,32 @@ dgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_22 + bgt .Ldgemm_kernel_L1_M1_22 -dgemm_kernel_L1_M1_40: +.Ldgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble dgemm_kernel_L1_M1_100 + ble .Ldgemm_kernel_L1_M1_100 prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE] -dgemm_kernel_L1_M1_42: +.Ldgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dgemm_kernel_L1_M1_42 + bgt .Ldgemm_kernel_L1_M1_42 -dgemm_kernel_L1_M1_100: +.Ldgemm_kernel_L1_M1_100: SAVE1x1 -dgemm_kernel_L1_END: +.Ldgemm_kernel_L1_END: -dgemm_kernel_L999: +.Ldgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dgemm_ncopy_4.S b/kernel/arm64/dgemm_ncopy_4.S index c98a732770..29d274d931 100644 --- a/kernel/arm64/dgemm_ncopy_4.S +++ b/kernel/arm64/dgemm_ncopy_4.S @@ -192,14 +192,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lsl LDA, LDA, #3 // LDA = LDA * SIZE -dgemm_ncopy_L4_BEGIN: +.Ldgemm_ncopy_L4_BEGIN: asr J, N, #2 // J = N / 4 cmp J, #0 - ble dgemm_ncopy_L2_BEGIN + ble .Ldgemm_ncopy_L2_BEGIN .align 5 -dgemm_ncopy_L4_M4_BEGIN: +.Ldgemm_ncopy_L4_M4_BEGIN: mov A01, A00 add A02, A01, LDA @@ -209,128 +209,128 @@ dgemm_ncopy_L4_M4_BEGIN: asr I, M, #2 // I = M / 4 cmp I, #0 - ble dgemm_ncopy_L4_M4_40 + ble .Ldgemm_ncopy_L4_M4_40 .align 5 -dgemm_ncopy_L4_M4_20: +.Ldgemm_ncopy_L4_M4_20: COPY4x4 subs I , I , #1 - bne dgemm_ncopy_L4_M4_20 + bne .Ldgemm_ncopy_L4_M4_20 -dgemm_ncopy_L4_M4_40: +.Ldgemm_ncopy_L4_M4_40: and I, M , #3 cmp I, #0 - ble dgemm_ncopy_L4_M4_END + ble .Ldgemm_ncopy_L4_M4_END .align 5 -dgemm_ncopy_L4_M4_60: +.Ldgemm_ncopy_L4_M4_60: COPY1x4 subs I , I , #1 - bne dgemm_ncopy_L4_M4_60 + bne .Ldgemm_ncopy_L4_M4_60 -dgemm_ncopy_L4_M4_END: +.Ldgemm_ncopy_L4_M4_END: subs J , J, #1 // j-- - bne dgemm_ncopy_L4_M4_BEGIN + bne .Ldgemm_ncopy_L4_M4_BEGIN /*********************************************************************************************/ -dgemm_ncopy_L2_BEGIN: +.Ldgemm_ncopy_L2_BEGIN: tst N, #3 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 tst N, #2 - ble dgemm_ncopy_L1_BEGIN + ble .Ldgemm_ncopy_L1_BEGIN -dgemm_ncopy_L2_M4_BEGIN: +.Ldgemm_ncopy_L2_M4_BEGIN: mov A01, A00 add A02, A01, LDA add A00, A02, LDA asr I, M, #2 // I = M / 4 cmp I, #0 - ble dgemm_ncopy_L2_M4_40 + ble .Ldgemm_ncopy_L2_M4_40 .align 5 -dgemm_ncopy_L2_M4_20: +.Ldgemm_ncopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne dgemm_ncopy_L2_M4_20 + bne .Ldgemm_ncopy_L2_M4_20 -dgemm_ncopy_L2_M4_40: +.Ldgemm_ncopy_L2_M4_40: and I, M , #3 cmp I, #0 - ble dgemm_ncopy_L2_M4_END + ble .Ldgemm_ncopy_L2_M4_END .align 5 -dgemm_ncopy_L2_M4_60: +.Ldgemm_ncopy_L2_M4_60: COPY1x2 subs I , I , #1 - bne dgemm_ncopy_L2_M4_60 + bne .Ldgemm_ncopy_L2_M4_60 -dgemm_ncopy_L2_M4_END: +.Ldgemm_ncopy_L2_M4_END: /*********************************************************************************************/ -dgemm_ncopy_L1_BEGIN: +.Ldgemm_ncopy_L1_BEGIN: tst N, #1 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 -dgemm_ncopy_L1_M4_BEGIN: +.Ldgemm_ncopy_L1_M4_BEGIN: mov A01, A00 asr I, M, #2 // I = M / 4 cmp I, #0 - ble dgemm_ncopy_L1_M4_40 + ble .Ldgemm_ncopy_L1_M4_40 .align 5 -dgemm_ncopy_L1_M4_20: +.Ldgemm_ncopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne dgemm_ncopy_L1_M4_20 + bne .Ldgemm_ncopy_L1_M4_20 -dgemm_ncopy_L1_M4_40: +.Ldgemm_ncopy_L1_M4_40: and I, M , #3 cmp I, #0 - ble dgemm_ncopy_L1_M4_END + ble .Ldgemm_ncopy_L1_M4_END .align 5 -dgemm_ncopy_L1_M4_60: +.Ldgemm_ncopy_L1_M4_60: COPY1x1 subs I , I , #1 - bne dgemm_ncopy_L1_M4_60 + bne .Ldgemm_ncopy_L1_M4_60 -dgemm_ncopy_L1_M4_END: +.Ldgemm_ncopy_L1_M4_END: -dgemm_ncopy_L999: +.Ldgemm_ncopy_L999: mov x0, #0 RESTORE_REGS diff --git a/kernel/arm64/dgemm_ncopy_8.S b/kernel/arm64/dgemm_ncopy_8.S index 1f237b42c2..366424830d 100644 --- a/kernel/arm64/dgemm_ncopy_8.S +++ b/kernel/arm64/dgemm_ncopy_8.S @@ -353,13 +353,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lsl LDA, LDA, #3 // LDA = LDA * SIZE -dgemm_ncopy_L8_BEGIN: +.Ldgemm_ncopy_L8_BEGIN: asr J, N, #3 // J = N / 8 cmp J, #0 - ble dgemm_ncopy_L4_BEGIN + ble .Ldgemm_ncopy_L4_BEGIN -dgemm_ncopy_L8_M8_BEGIN: +.Ldgemm_ncopy_L8_M8_BEGIN: mov A01, A00 add A02, A01, LDA @@ -374,46 +374,46 @@ dgemm_ncopy_L8_M8_BEGIN: asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L8_M8_40 + ble .Ldgemm_ncopy_L8_M8_40 -dgemm_ncopy_L8_M8_20: +.Ldgemm_ncopy_L8_M8_20: COPY8x8 subs I , I , #1 - bne dgemm_ncopy_L8_M8_20 + bne .Ldgemm_ncopy_L8_M8_20 -dgemm_ncopy_L8_M8_40: +.Ldgemm_ncopy_L8_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L8_M8_END + ble .Ldgemm_ncopy_L8_M8_END -dgemm_ncopy_L8_M8_60: +.Ldgemm_ncopy_L8_M8_60: COPY1x8 subs I , I , #1 - bne dgemm_ncopy_L8_M8_60 + bne .Ldgemm_ncopy_L8_M8_60 -dgemm_ncopy_L8_M8_END: +.Ldgemm_ncopy_L8_M8_END: subs J , J, #1 // j-- - bne dgemm_ncopy_L8_M8_BEGIN + bne .Ldgemm_ncopy_L8_M8_BEGIN /*********************************************************************************************/ -dgemm_ncopy_L4_BEGIN: +.Ldgemm_ncopy_L4_BEGIN: tst N, #7 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 tst N, #4 - ble dgemm_ncopy_L2_BEGIN + ble .Ldgemm_ncopy_L2_BEGIN -dgemm_ncopy_L4_M8_BEGIN: +.Ldgemm_ncopy_L4_M8_BEGIN: mov A01, A00 add A02, A01, LDA @@ -423,118 +423,118 @@ dgemm_ncopy_L4_M8_BEGIN: asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L4_M8_40 + ble .Ldgemm_ncopy_L4_M8_40 -dgemm_ncopy_L4_M8_20: +.Ldgemm_ncopy_L4_M8_20: COPY8x4 subs I , I , #1 - bne dgemm_ncopy_L4_M8_20 + bne .Ldgemm_ncopy_L4_M8_20 -dgemm_ncopy_L4_M8_40: +.Ldgemm_ncopy_L4_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L4_M8_END + ble .Ldgemm_ncopy_L4_M8_END -dgemm_ncopy_L4_M8_60: +.Ldgemm_ncopy_L4_M8_60: COPY1x4 subs I , I , #1 - bne dgemm_ncopy_L4_M8_60 + bne .Ldgemm_ncopy_L4_M8_60 -dgemm_ncopy_L4_M8_END: +.Ldgemm_ncopy_L4_M8_END: /*********************************************************************************************/ -dgemm_ncopy_L2_BEGIN: +.Ldgemm_ncopy_L2_BEGIN: tst N, #3 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 tst N, #2 - ble dgemm_ncopy_L1_BEGIN + ble .Ldgemm_ncopy_L1_BEGIN -dgemm_ncopy_L2_M8_BEGIN: +.Ldgemm_ncopy_L2_M8_BEGIN: mov A01, A00 add A02, A01, LDA add A00, A02, LDA asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L2_M8_40 + ble .Ldgemm_ncopy_L2_M8_40 -dgemm_ncopy_L2_M8_20: +.Ldgemm_ncopy_L2_M8_20: COPY8x2 subs I , I , #1 - bne dgemm_ncopy_L2_M8_20 + bne .Ldgemm_ncopy_L2_M8_20 -dgemm_ncopy_L2_M8_40: +.Ldgemm_ncopy_L2_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L2_M8_END + ble .Ldgemm_ncopy_L2_M8_END -dgemm_ncopy_L2_M8_60: +.Ldgemm_ncopy_L2_M8_60: COPY1x2 subs I , I , #1 - bne dgemm_ncopy_L2_M8_60 + bne .Ldgemm_ncopy_L2_M8_60 -dgemm_ncopy_L2_M8_END: +.Ldgemm_ncopy_L2_M8_END: /*********************************************************************************************/ -dgemm_ncopy_L1_BEGIN: +.Ldgemm_ncopy_L1_BEGIN: tst N, #1 - ble dgemm_ncopy_L999 + ble .Ldgemm_ncopy_L999 -dgemm_ncopy_L1_M8_BEGIN: +.Ldgemm_ncopy_L1_M8_BEGIN: mov A01, A00 asr I, M, #3 // I = M / 8 cmp I, #0 - ble dgemm_ncopy_L1_M8_40 + ble .Ldgemm_ncopy_L1_M8_40 -dgemm_ncopy_L1_M8_20: +.Ldgemm_ncopy_L1_M8_20: COPY8x1 subs I , I , #1 - bne dgemm_ncopy_L1_M8_20 + bne .Ldgemm_ncopy_L1_M8_20 -dgemm_ncopy_L1_M8_40: +.Ldgemm_ncopy_L1_M8_40: and I, M , #7 cmp I, #0 - ble dgemm_ncopy_L1_M8_END + ble .Ldgemm_ncopy_L1_M8_END -dgemm_ncopy_L1_M8_60: +.Ldgemm_ncopy_L1_M8_60: COPY1x1 subs I , I , #1 - bne dgemm_ncopy_L1_M8_60 + bne .Ldgemm_ncopy_L1_M8_60 -dgemm_ncopy_L1_M8_END: +.Ldgemm_ncopy_L1_M8_END: -dgemm_ncopy_L999: +.Ldgemm_ncopy_L999: mov x0, #0 RESTORE_REGS diff --git a/kernel/arm64/dgemm_tcopy_4.S b/kernel/arm64/dgemm_tcopy_4.S index 5b2ed43f1a..7c9135287f 100644 --- a/kernel/arm64/dgemm_tcopy_4.S +++ b/kernel/arm64/dgemm_tcopy_4.S @@ -247,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lsl M4, M, #5 // M4 = M * 4 * SIZE -dgemm_tcopy_L4_BEGIN: +.Ldgemm_tcopy_L4_BEGIN: asr J, M, #2 // J = M / 4 cmp J, #0 - ble dgemm_tcopy_L2_BEGIN + ble .Ldgemm_tcopy_L2_BEGIN .align 5 -dgemm_tcopy_L4_M4_BEGIN: +.Ldgemm_tcopy_L4_M4_BEGIN: mov A01, A add A02, A01, LDA @@ -266,51 +266,51 @@ dgemm_tcopy_L4_M4_BEGIN: asr I, N, #2 // I = N / 4 cmp I, #0 - ble dgemm_tcopy_L4_M4_40 + ble .Ldgemm_tcopy_L4_M4_40 .align 5 -dgemm_tcopy_L4_M4_20: +.Ldgemm_tcopy_L4_M4_20: COPY4x4 subs I , I , #1 - bne dgemm_tcopy_L4_M4_20 + bne .Ldgemm_tcopy_L4_M4_20 -dgemm_tcopy_L4_M4_40: +.Ldgemm_tcopy_L4_M4_40: tst N , #2 - ble dgemm_tcopy_L4_M4_60 + ble .Ldgemm_tcopy_L4_M4_60 COPY2x4 -dgemm_tcopy_L4_M4_60: +.Ldgemm_tcopy_L4_M4_60: tst N, #1 - ble dgemm_tcopy_L4_M4_END + ble .Ldgemm_tcopy_L4_M4_END COPY1x4 -dgemm_tcopy_L4_M4_END: +.Ldgemm_tcopy_L4_M4_END: subs J , J, #1 // j-- - bne dgemm_tcopy_L4_M4_BEGIN + bne .Ldgemm_tcopy_L4_M4_BEGIN /*********************************************************************************************/ -dgemm_tcopy_L2_BEGIN: +.Ldgemm_tcopy_L2_BEGIN: tst M, #3 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 tst M, #2 - ble dgemm_tcopy_L1_BEGIN + ble .Ldgemm_tcopy_L1_BEGIN -dgemm_tcopy_L2_M4_BEGIN: +.Ldgemm_tcopy_L2_M4_BEGIN: mov A01, A add A02, A01, LDA add A, A02, LDA @@ -320,80 +320,80 @@ dgemm_tcopy_L2_M4_BEGIN: asr I, N, #2 // I = N / 4 cmp I, #0 - ble dgemm_tcopy_L2_M4_40 + ble .Ldgemm_tcopy_L2_M4_40 .align 5 -dgemm_tcopy_L2_M4_20: +.Ldgemm_tcopy_L2_M4_20: COPY4x2 subs I , I , #1 - bne dgemm_tcopy_L2_M4_20 + bne .Ldgemm_tcopy_L2_M4_20 -dgemm_tcopy_L2_M4_40: +.Ldgemm_tcopy_L2_M4_40: tst N , #2 - ble dgemm_tcopy_L2_M4_60 + ble .Ldgemm_tcopy_L2_M4_60 COPY2x2 -dgemm_tcopy_L2_M4_60: +.Ldgemm_tcopy_L2_M4_60: tst N , #1 - ble dgemm_tcopy_L2_M4_END + ble .Ldgemm_tcopy_L2_M4_END COPY1x2 -dgemm_tcopy_L2_M4_END: +.Ldgemm_tcopy_L2_M4_END: /*********************************************************************************************/ -dgemm_tcopy_L1_BEGIN: +.Ldgemm_tcopy_L1_BEGIN: tst M, #1 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 -dgemm_tcopy_L1_M4_BEGIN: +.Ldgemm_tcopy_L1_M4_BEGIN: mov A01, A // A01 = A mov B01, B asr I, N, #2 // I = M / 4 cmp I, #0 - ble dgemm_tcopy_L1_M4_40 + ble .Ldgemm_tcopy_L1_M4_40 .align 5 -dgemm_tcopy_L1_M4_20: +.Ldgemm_tcopy_L1_M4_20: COPY4x1 subs I , I , #1 - bne dgemm_tcopy_L1_M4_20 + bne .Ldgemm_tcopy_L1_M4_20 -dgemm_tcopy_L1_M4_40: +.Ldgemm_tcopy_L1_M4_40: tst N , #2 - ble dgemm_tcopy_L1_M4_60 + ble .Ldgemm_tcopy_L1_M4_60 COPY2x1 -dgemm_tcopy_L1_M4_60: +.Ldgemm_tcopy_L1_M4_60: tst N , #1 - ble dgemm_tcopy_L1_M4_END + ble .Ldgemm_tcopy_L1_M4_END COPY1x1 -dgemm_tcopy_L1_M4_END: +.Ldgemm_tcopy_L1_M4_END: -dgemm_tcopy_L999: +.Ldgemm_tcopy_L999: mov x0, #0 // set return value RESTORE_REGS ret diff --git a/kernel/arm64/dgemm_tcopy_8.S b/kernel/arm64/dgemm_tcopy_8.S index 1c57e30e03..9ab51ff571 100644 --- a/kernel/arm64/dgemm_tcopy_8.S +++ b/kernel/arm64/dgemm_tcopy_8.S @@ -454,13 +454,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lsl M8, M, #6 // M8 = M * 8 * SIZE -dgemm_tcopy_L8_BEGIN: +.Ldgemm_tcopy_L8_BEGIN: asr J, M, #3 // J = M / 4 cmp J, #0 - ble dgemm_tcopy_L4_BEGIN + ble .Ldgemm_tcopy_L4_BEGIN .align 5 -dgemm_tcopy_L8_M8_BEGIN: +.Ldgemm_tcopy_L8_M8_BEGIN: mov A01, A add A02, A01, LDA @@ -477,53 +477,53 @@ dgemm_tcopy_L8_M8_BEGIN: asr I, N, #3 // I = N / 8 cmp I, #0 - ble dgemm_tcopy_L8_M8_40 + ble .Ldgemm_tcopy_L8_M8_40 .align 5 -dgemm_tcopy_L8_M8_20: +.Ldgemm_tcopy_L8_M8_20: COPY8x8 subs I , I , #1 - bne dgemm_tcopy_L8_M8_20 + bne .Ldgemm_tcopy_L8_M8_20 -dgemm_tcopy_L8_M8_40: +.Ldgemm_tcopy_L8_M8_40: tst N , #4 - ble dgemm_tcopy_L8_M8_60 + ble .Ldgemm_tcopy_L8_M8_60 COPY4x8 -dgemm_tcopy_L8_M8_60: +.Ldgemm_tcopy_L8_M8_60: tst N , #2 - ble dgemm_tcopy_L8_M8_80 + ble .Ldgemm_tcopy_L8_M8_80 COPY2x8 -dgemm_tcopy_L8_M8_80: +.Ldgemm_tcopy_L8_M8_80: tst N, #1 - ble dgemm_tcopy_L8_M8_END + ble .Ldgemm_tcopy_L8_M8_END COPY1x8 -dgemm_tcopy_L8_M8_END: +.Ldgemm_tcopy_L8_M8_END: subs J , J, #1 // j-- - bne dgemm_tcopy_L8_M8_BEGIN + bne .Ldgemm_tcopy_L8_M8_BEGIN /*********************************************************************************************/ -dgemm_tcopy_L4_BEGIN: +.Ldgemm_tcopy_L4_BEGIN: tst M, #7 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 tst M, #4 - ble dgemm_tcopy_L2_BEGIN + ble .Ldgemm_tcopy_L2_BEGIN -dgemm_tcopy_L4_M8_BEGIN: +.Ldgemm_tcopy_L4_M8_BEGIN: mov A01, A add A02, A01, LDA @@ -536,51 +536,51 @@ dgemm_tcopy_L4_M8_BEGIN: asr I, N, #3 // I = N / 8 cmp I, #0 - ble dgemm_tcopy_L4_M8_40 + ble .Ldgemm_tcopy_L4_M8_40 .align 5 -dgemm_tcopy_L4_M8_20: +.Ldgemm_tcopy_L4_M8_20: COPY8x4 subs I , I , #1 - bne dgemm_tcopy_L4_M8_20 + bne .Ldgemm_tcopy_L4_M8_20 -dgemm_tcopy_L4_M8_40: +.Ldgemm_tcopy_L4_M8_40: tst N , #4 - ble dgemm_tcopy_L4_M8_60 + ble .Ldgemm_tcopy_L4_M8_60 COPY4x4 -dgemm_tcopy_L4_M8_60: +.Ldgemm_tcopy_L4_M8_60: tst N , #2 - ble dgemm_tcopy_L4_M8_80 + ble .Ldgemm_tcopy_L4_M8_80 COPY2x4 -dgemm_tcopy_L4_M8_80: +.Ldgemm_tcopy_L4_M8_80: tst N, #1 - ble dgemm_tcopy_L4_M8_END + ble .Ldgemm_tcopy_L4_M8_END COPY1x4 -dgemm_tcopy_L4_M8_END: +.Ldgemm_tcopy_L4_M8_END: /*********************************************************************************************/ -dgemm_tcopy_L2_BEGIN: +.Ldgemm_tcopy_L2_BEGIN: tst M, #3 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 tst M, #2 - ble dgemm_tcopy_L1_BEGIN + ble .Ldgemm_tcopy_L1_BEGIN -dgemm_tcopy_L2_M8_BEGIN: +.Ldgemm_tcopy_L2_M8_BEGIN: mov A01, A add A02, A01, LDA add A, A02, LDA @@ -590,90 +590,90 @@ dgemm_tcopy_L2_M8_BEGIN: asr I, N, #3 // I = N / 8 cmp I, #0 - ble dgemm_tcopy_L2_M8_40 + ble .Ldgemm_tcopy_L2_M8_40 .align 5 -dgemm_tcopy_L2_M8_20: +.Ldgemm_tcopy_L2_M8_20: COPY8x2 subs I , I , #1 - bne dgemm_tcopy_L2_M8_20 + bne .Ldgemm_tcopy_L2_M8_20 -dgemm_tcopy_L2_M8_40: +.Ldgemm_tcopy_L2_M8_40: tst N , #4 - ble dgemm_tcopy_L2_M8_60 + ble .Ldgemm_tcopy_L2_M8_60 COPY4x2 -dgemm_tcopy_L2_M8_60: +.Ldgemm_tcopy_L2_M8_60: tst N , #2 - ble dgemm_tcopy_L2_M8_80 + ble .Ldgemm_tcopy_L2_M8_80 COPY2x2 -dgemm_tcopy_L2_M8_80: +.Ldgemm_tcopy_L2_M8_80: tst N , #1 - ble dgemm_tcopy_L2_M8_END + ble .Ldgemm_tcopy_L2_M8_END COPY1x2 -dgemm_tcopy_L2_M8_END: +.Ldgemm_tcopy_L2_M8_END: /*********************************************************************************************/ -dgemm_tcopy_L1_BEGIN: +.Ldgemm_tcopy_L1_BEGIN: tst M, #1 - ble dgemm_tcopy_L999 + ble .Ldgemm_tcopy_L999 -dgemm_tcopy_L1_M8_BEGIN: +.Ldgemm_tcopy_L1_M8_BEGIN: mov A01, A // A01 = A mov B01, B asr I, N, #3 // I = M / 8 cmp I, #0 - ble dgemm_tcopy_L1_M8_40 + ble .Ldgemm_tcopy_L1_M8_40 .align 5 -dgemm_tcopy_L1_M8_20: +.Ldgemm_tcopy_L1_M8_20: COPY8x1 subs I , I , #1 - bne dgemm_tcopy_L1_M8_20 + bne .Ldgemm_tcopy_L1_M8_20 -dgemm_tcopy_L1_M8_40: +.Ldgemm_tcopy_L1_M8_40: tst N , #4 - ble dgemm_tcopy_L1_M8_60 + ble .Ldgemm_tcopy_L1_M8_60 COPY4x1 -dgemm_tcopy_L1_M8_60: +.Ldgemm_tcopy_L1_M8_60: tst N , #2 - ble dgemm_tcopy_L1_M8_80 + ble .Ldgemm_tcopy_L1_M8_80 COPY2x1 -dgemm_tcopy_L1_M8_80: +.Ldgemm_tcopy_L1_M8_80: tst N , #1 - ble dgemm_tcopy_L1_M8_END + ble .Ldgemm_tcopy_L1_M8_END COPY1x1 -dgemm_tcopy_L1_M8_END: +.Ldgemm_tcopy_L1_M8_END: -dgemm_tcopy_L999: +.Ldgemm_tcopy_L999: mov x0, #0 // set return value RESTORE_REGS ret diff --git a/kernel/arm64/dot.S b/kernel/arm64/dot.S index 35d47790ca..a1a5bf20b4 100644 --- a/kernel/arm64/dot.S +++ b/kernel/arm64/dot.S @@ -154,51 +154,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif cmp N, xzr - ble dot_kernel_L999 + ble .Ldot_kernel_L999 cmp INC_X, #1 - bne dot_kernel_S_BEGIN + bne .Ldot_kernel_S_BEGIN cmp INC_Y, #1 - bne dot_kernel_S_BEGIN + bne .Ldot_kernel_S_BEGIN -dot_kernel_F_BEGIN: +.Ldot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq dot_kernel_F1 + beq .Ldot_kernel_F1 -dot_kernel_F4: +.Ldot_kernel_F4: KERNEL_F4 subs I, I, #1 - bne dot_kernel_F4 + bne .Ldot_kernel_F4 KERNEL_F4_FINALIZE -dot_kernel_F1: +.Ldot_kernel_F1: ands I, N, #3 - ble dot_kernel_L999 + ble .Ldot_kernel_L999 -dot_kernel_F10: +.Ldot_kernel_F10: KERNEL_F1 subs I, I, #1 - bne dot_kernel_F10 + bne .Ldot_kernel_F10 ret -dot_kernel_S_BEGIN: +.Ldot_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble dot_kernel_S1 + ble .Ldot_kernel_S1 -dot_kernel_S4: +.Ldot_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -206,21 +206,21 @@ dot_kernel_S4: KERNEL_S1 subs I, I, #1 - bne dot_kernel_S4 + bne .Ldot_kernel_S4 -dot_kernel_S1: +.Ldot_kernel_S1: ands I, N, #3 - ble dot_kernel_L999 + ble .Ldot_kernel_L999 -dot_kernel_S10: +.Ldot_kernel_S10: KERNEL_S1 subs I, I, #1 - bne dot_kernel_S10 + bne .Ldot_kernel_S10 -dot_kernel_L999: +.Ldot_kernel_L999: ret diff --git a/kernel/arm64/dtrmm_kernel_4x4.S b/kernel/arm64/dtrmm_kernel_4x4.S index 34fb8c2339..b528aeb182 100644 --- a/kernel/arm64/dtrmm_kernel_4x4.S +++ b/kernel/arm64/dtrmm_kernel_4x4.S @@ -549,11 +549,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dtrmm_kernel_L2_BEGIN + ble .Ldtrmm_kernel_L2_BEGIN /******************************************************************************/ -dtrmm_kernel_L4_BEGIN: +.Ldtrmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -563,14 +563,14 @@ dtrmm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dtrmm_kernel_L4_M4_BEGIN: +.Ldtrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L4_M2_BEGIN + ble .Ldtrmm_kernel_L4_M2_BEGIN -dtrmm_kernel_L4_M4_20: +.Ldtrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -591,57 +591,57 @@ dtrmm_kernel_L4_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt dtrmm_kernel_L4_M4_32 + blt .Ldtrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble dtrmm_kernel_L4_M4_22a + ble .Ldtrmm_kernel_L4_M4_22a .align 5 -dtrmm_kernel_L4_M4_22: +.Ldtrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_22 + bgt .Ldtrmm_kernel_L4_M4_22 -dtrmm_kernel_L4_M4_22a: +.Ldtrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_32: +.Ldtrmm_kernel_L4_M4_32: tst counterL, #1 - ble dtrmm_kernel_L4_M4_40 + ble .Ldtrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_40: +.Ldtrmm_kernel_L4_M4_40: INIT4x4 -dtrmm_kernel_L4_M4_44: +.Ldtrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble dtrmm_kernel_L4_M4_100 + ble .Ldtrmm_kernel_L4_M4_100 -dtrmm_kernel_L4_M4_46: +.Ldtrmm_kernel_L4_M4_46: KERNEL4x4_SUB -dtrmm_kernel_L4_M4_100: +.Ldtrmm_kernel_L4_M4_100: SAVE4x4 @@ -660,20 +660,20 @@ dtrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L4_M4_END: +.Ldtrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L4_M4_20 + bne .Ldtrmm_kernel_L4_M4_20 -dtrmm_kernel_L4_M2_BEGIN: +.Ldtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L4_M1_BEGIN + ble .Ldtrmm_kernel_L4_M1_BEGIN -dtrmm_kernel_L4_M2_20: +.Ldtrmm_kernel_L4_M2_20: INIT2x4 @@ -697,9 +697,9 @@ dtrmm_kernel_L4_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M2_40 + ble .Ldtrmm_kernel_L4_M2_40 -dtrmm_kernel_L4_M2_22: +.Ldtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -712,22 +712,22 @@ dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_22 + bgt .Ldtrmm_kernel_L4_M2_22 -dtrmm_kernel_L4_M2_40: +.Ldtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M2_100 + ble .Ldtrmm_kernel_L4_M2_100 -dtrmm_kernel_L4_M2_42: +.Ldtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_42 + bgt .Ldtrmm_kernel_L4_M2_42 -dtrmm_kernel_L4_M2_100: +.Ldtrmm_kernel_L4_M2_100: SAVE2x4 @@ -747,15 +747,15 @@ dtrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L4_M2_END: +.Ldtrmm_kernel_L4_M2_END: -dtrmm_kernel_L4_M1_BEGIN: +.Ldtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END -dtrmm_kernel_L4_M1_20: +.Ldtrmm_kernel_L4_M1_20: INIT1x4 @@ -779,9 +779,9 @@ dtrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M1_40 + ble .Ldtrmm_kernel_L4_M1_40 -dtrmm_kernel_L4_M1_22: +.Ldtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -793,22 +793,22 @@ dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_22 + bgt .Ldtrmm_kernel_L4_M1_22 -dtrmm_kernel_L4_M1_40: +.Ldtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M1_100 + ble .Ldtrmm_kernel_L4_M1_100 -dtrmm_kernel_L4_M1_42: +.Ldtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_42 + bgt .Ldtrmm_kernel_L4_M1_42 -dtrmm_kernel_L4_M1_100: +.Ldtrmm_kernel_L4_M1_100: SAVE1x4 @@ -828,7 +828,7 @@ dtrmm_kernel_L4_M1_100: add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L4_END: +.Ldtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -838,19 +838,19 @@ dtrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt dtrmm_kernel_L4_BEGIN + bgt .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ -dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dtrmm_kernel_L999 // error, N was less than 4? + ble .Ldtrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dtrmm_kernel_L1_BEGIN + ble .Ldtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -863,14 +863,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dtrmm_kernel_L2_M4_BEGIN: +.Ldtrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble dtrmm_kernel_L2_M2_BEGIN + ble .Ldtrmm_kernel_L2_M2_BEGIN -dtrmm_kernel_L2_M4_20: +.Ldtrmm_kernel_L2_M4_20: INIT4x2 @@ -894,10 +894,10 @@ dtrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M4_40 + ble .Ldtrmm_kernel_L2_M4_40 .align 5 -dtrmm_kernel_L2_M4_22: +.Ldtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -909,22 +909,22 @@ dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_22 + bgt .Ldtrmm_kernel_L2_M4_22 -dtrmm_kernel_L2_M4_40: +.Ldtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M4_100 + ble .Ldtrmm_kernel_L2_M4_100 -dtrmm_kernel_L2_M4_42: +.Ldtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_42 + bgt .Ldtrmm_kernel_L2_M4_42 -dtrmm_kernel_L2_M4_100: +.Ldtrmm_kernel_L2_M4_100: SAVE4x2 @@ -944,22 +944,22 @@ dtrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L2_M4_END: +.Ldtrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L2_M4_20 + bgt .Ldtrmm_kernel_L2_M4_20 -dtrmm_kernel_L2_M2_BEGIN: +.Ldtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M1_BEGIN + ble .Ldtrmm_kernel_L2_M1_BEGIN -dtrmm_kernel_L2_M2_20: +.Ldtrmm_kernel_L2_M2_20: INIT2x2 @@ -983,9 +983,9 @@ dtrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M2_40 + ble .Ldtrmm_kernel_L2_M2_40 -dtrmm_kernel_L2_M2_22: +.Ldtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -998,22 +998,22 @@ dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_22 + bgt .Ldtrmm_kernel_L2_M2_22 -dtrmm_kernel_L2_M2_40: +.Ldtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M2_100 + ble .Ldtrmm_kernel_L2_M2_100 -dtrmm_kernel_L2_M2_42: +.Ldtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_42 + bgt .Ldtrmm_kernel_L2_M2_42 -dtrmm_kernel_L2_M2_100: +.Ldtrmm_kernel_L2_M2_100: SAVE2x2 @@ -1033,15 +1033,15 @@ dtrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L2_M2_END: +.Ldtrmm_kernel_L2_M2_END: -dtrmm_kernel_L2_M1_BEGIN: +.Ldtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END -dtrmm_kernel_L2_M1_20: +.Ldtrmm_kernel_L2_M1_20: INIT1x2 @@ -1065,9 +1065,9 @@ dtrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dtrmm_kernel_L2_M1_40 + ble .Ldtrmm_kernel_L2_M1_40 -dtrmm_kernel_L2_M1_22: +.Ldtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1079,22 +1079,22 @@ dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_22 + bgt .Ldtrmm_kernel_L2_M1_22 -dtrmm_kernel_L2_M1_40: +.Ldtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M1_100 + ble .Ldtrmm_kernel_L2_M1_100 -dtrmm_kernel_L2_M1_42: +.Ldtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_42 + bgt .Ldtrmm_kernel_L2_M1_42 -dtrmm_kernel_L2_M1_100: +.Ldtrmm_kernel_L2_M1_100: SAVE1x2 @@ -1114,7 +1114,7 @@ dtrmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L2_END: +.Ldtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1122,11 +1122,11 @@ dtrmm_kernel_L2_END: /******************************************************************************/ -dtrmm_kernel_L1_BEGIN: +.Ldtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dtrmm_kernel_L999 // done + ble .Ldtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1138,14 +1138,14 @@ dtrmm_kernel_L1_BEGIN: mov pA, origPA // pA = A -dtrmm_kernel_L1_M4_BEGIN: +.Ldtrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L1_M2_BEGIN + ble .Ldtrmm_kernel_L1_M2_BEGIN -dtrmm_kernel_L1_M4_20: +.Ldtrmm_kernel_L1_M4_20: INIT4x1 @@ -1169,10 +1169,10 @@ dtrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M4_40 + ble .Ldtrmm_kernel_L1_M4_40 .align 5 -dtrmm_kernel_L1_M4_22: +.Ldtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1184,22 +1184,22 @@ dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_22 + bgt .Ldtrmm_kernel_L1_M4_22 -dtrmm_kernel_L1_M4_40: +.Ldtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M4_100 + ble .Ldtrmm_kernel_L1_M4_100 -dtrmm_kernel_L1_M4_42: +.Ldtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_42 + bgt .Ldtrmm_kernel_L1_M4_42 -dtrmm_kernel_L1_M4_100: +.Ldtrmm_kernel_L1_M4_100: SAVE4x1 @@ -1220,22 +1220,22 @@ dtrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L1_M4_END: +.Ldtrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L1_M4_20 + bgt .Ldtrmm_kernel_L1_M4_20 -dtrmm_kernel_L1_M2_BEGIN: +.Ldtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M1_BEGIN + ble .Ldtrmm_kernel_L1_M1_BEGIN -dtrmm_kernel_L1_M2_20: +.Ldtrmm_kernel_L1_M2_20: INIT2x1 @@ -1259,9 +1259,9 @@ dtrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M2_40 + ble .Ldtrmm_kernel_L1_M2_40 -dtrmm_kernel_L1_M2_22: +.Ldtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1274,22 +1274,22 @@ dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_22 + bgt .Ldtrmm_kernel_L1_M2_22 -dtrmm_kernel_L1_M2_40: +.Ldtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M2_100 + ble .Ldtrmm_kernel_L1_M2_100 -dtrmm_kernel_L1_M2_42: +.Ldtrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_42 + bgt .Ldtrmm_kernel_L1_M2_42 -dtrmm_kernel_L1_M2_100: +.Ldtrmm_kernel_L1_M2_100: SAVE2x1 @@ -1309,15 +1309,15 @@ dtrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L1_M2_END: +.Ldtrmm_kernel_L1_M2_END: -dtrmm_kernel_L1_M1_BEGIN: +.Ldtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END -dtrmm_kernel_L1_M1_20: +.Ldtrmm_kernel_L1_M1_20: INIT1x1 @@ -1341,9 +1341,9 @@ dtrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M1_40 + ble .Ldtrmm_kernel_L1_M1_40 -dtrmm_kernel_L1_M1_22: +.Ldtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1355,30 +1355,30 @@ dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_22 + bgt .Ldtrmm_kernel_L1_M1_22 -dtrmm_kernel_L1_M1_40: +.Ldtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M1_100 + ble .Ldtrmm_kernel_L1_M1_100 -dtrmm_kernel_L1_M1_42: +.Ldtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_42 + bgt .Ldtrmm_kernel_L1_M1_42 -dtrmm_kernel_L1_M1_100: +.Ldtrmm_kernel_L1_M1_100: SAVE1x1 -dtrmm_kernel_L1_END: +.Ldtrmm_kernel_L1_END: -dtrmm_kernel_L999: +.Ldtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S index 4aecf28ebd..47956dec57 100644 --- a/kernel/arm64/dtrmm_kernel_4x8.S +++ b/kernel/arm64/dtrmm_kernel_4x8.S @@ -900,11 +900,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 - ble dtrmm_kernel_L4_BEGIN + ble .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ -dtrmm_kernel_L8_BEGIN: +.Ldtrmm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 @@ -915,14 +915,14 @@ dtrmm_kernel_L8_BEGIN: mov pA, origPA // pA = start of A array -dtrmm_kernel_L8_M4_BEGIN: +.Ldtrmm_kernel_L8_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L8_M2_BEGIN + ble .Ldtrmm_kernel_L8_M2_BEGIN -dtrmm_kernel_L8_M4_20: +.Ldtrmm_kernel_L8_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -944,57 +944,57 @@ dtrmm_kernel_L8_M4_20: asr counterL, tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt dtrmm_kernel_L8_M4_32 + blt .Ldtrmm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 - ble dtrmm_kernel_L8_M4_22a + ble .Ldtrmm_kernel_L8_M4_22a .align 5 -dtrmm_kernel_L8_M4_22: +.Ldtrmm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M4_22 + bgt .Ldtrmm_kernel_L8_M4_22 -dtrmm_kernel_L8_M4_22a: +.Ldtrmm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E - b dtrmm_kernel_L8_M4_44 + b .Ldtrmm_kernel_L8_M4_44 -dtrmm_kernel_L8_M4_32: +.Ldtrmm_kernel_L8_M4_32: tst counterL, #1 - ble dtrmm_kernel_L8_M4_40 + ble .Ldtrmm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E - b dtrmm_kernel_L8_M4_44 + b .Ldtrmm_kernel_L8_M4_44 -dtrmm_kernel_L8_M4_40: +.Ldtrmm_kernel_L8_M4_40: INIT4x8 -dtrmm_kernel_L8_M4_44: +.Ldtrmm_kernel_L8_M4_44: ands counterL, tempK, #1 - ble dtrmm_kernel_L8_M4_100 + ble .Ldtrmm_kernel_L8_M4_100 -dtrmm_kernel_L8_M4_46: +.Ldtrmm_kernel_L8_M4_46: KERNEL4x8_SUB -dtrmm_kernel_L8_M4_100: +.Ldtrmm_kernel_L8_M4_100: SAVE4x8 @@ -1014,20 +1014,20 @@ dtrmm_kernel_L8_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L8_M4_END: +.Ldtrmm_kernel_L8_M4_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L8_M4_20 + bne .Ldtrmm_kernel_L8_M4_20 -dtrmm_kernel_L8_M2_BEGIN: +.Ldtrmm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L8_END + ble .Ldtrmm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L8_M1_BEGIN + ble .Ldtrmm_kernel_L8_M1_BEGIN -dtrmm_kernel_L8_M2_20: +.Ldtrmm_kernel_L8_M2_20: INIT2x8 @@ -1051,9 +1051,9 @@ dtrmm_kernel_L8_M2_20: asr counterL, tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L8_M2_40 + ble .Ldtrmm_kernel_L8_M2_40 -dtrmm_kernel_L8_M2_22: +.Ldtrmm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB @@ -1066,22 +1066,22 @@ dtrmm_kernel_L8_M2_22: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M2_22 + bgt .Ldtrmm_kernel_L8_M2_22 -dtrmm_kernel_L8_M2_40: +.Ldtrmm_kernel_L8_M2_40: ands counterL, tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L8_M2_100 + ble .Ldtrmm_kernel_L8_M2_100 -dtrmm_kernel_L8_M2_42: +.Ldtrmm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M2_42 + bgt .Ldtrmm_kernel_L8_M2_42 -dtrmm_kernel_L8_M2_100: +.Ldtrmm_kernel_L8_M2_100: SAVE2x8 @@ -1102,15 +1102,15 @@ dtrmm_kernel_L8_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L8_M2_END: +.Ldtrmm_kernel_L8_M2_END: -dtrmm_kernel_L8_M1_BEGIN: +.Ldtrmm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L8_END + ble .Ldtrmm_kernel_L8_END -dtrmm_kernel_L8_M1_20: +.Ldtrmm_kernel_L8_M1_20: INIT1x8 @@ -1134,9 +1134,9 @@ dtrmm_kernel_L8_M1_20: asr counterL, tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L8_M1_40 + ble .Ldtrmm_kernel_L8_M1_40 -dtrmm_kernel_L8_M1_22: +.Ldtrmm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB @@ -1148,22 +1148,22 @@ dtrmm_kernel_L8_M1_22: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M1_22 + bgt .Ldtrmm_kernel_L8_M1_22 -dtrmm_kernel_L8_M1_40: +.Ldtrmm_kernel_L8_M1_40: ands counterL, tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L8_M1_100 + ble .Ldtrmm_kernel_L8_M1_100 -dtrmm_kernel_L8_M1_42: +.Ldtrmm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L8_M1_42 + bgt .Ldtrmm_kernel_L8_M1_42 -dtrmm_kernel_L8_M1_100: +.Ldtrmm_kernel_L8_M1_100: SAVE1x8 @@ -1183,7 +1183,7 @@ dtrmm_kernel_L8_M1_100: add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L8_END: +.Ldtrmm_kernel_L8_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 8 * 8 @@ -1193,19 +1193,19 @@ dtrmm_kernel_L8_END: #endif subs counterJ, counterJ , #1 // j-- - bgt dtrmm_kernel_L8_BEGIN + bgt .Ldtrmm_kernel_L8_BEGIN /******************************************************************************/ -dtrmm_kernel_L4_BEGIN: +.Ldtrmm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 - ble dtrmm_kernel_L999 + ble .Ldtrmm_kernel_L999 tst counterJ , #4 - ble dtrmm_kernel_L2_BEGIN + ble .Ldtrmm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -1216,14 +1216,14 @@ dtrmm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -dtrmm_kernel_L4_M4_BEGIN: +.Ldtrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L4_M2_BEGIN + ble .Ldtrmm_kernel_L4_M2_BEGIN -dtrmm_kernel_L4_M4_20: +.Ldtrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1244,57 +1244,57 @@ dtrmm_kernel_L4_M4_20: asr counterL, tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt dtrmm_kernel_L4_M4_32 + blt .Ldtrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble dtrmm_kernel_L4_M4_22a + ble .Ldtrmm_kernel_L4_M4_22a .align 5 -dtrmm_kernel_L4_M4_22: +.Ldtrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_22 + bgt .Ldtrmm_kernel_L4_M4_22 -dtrmm_kernel_L4_M4_22a: +.Ldtrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_32: +.Ldtrmm_kernel_L4_M4_32: tst counterL, #1 - ble dtrmm_kernel_L4_M4_40 + ble .Ldtrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b dtrmm_kernel_L4_M4_44 + b .Ldtrmm_kernel_L4_M4_44 -dtrmm_kernel_L4_M4_40: +.Ldtrmm_kernel_L4_M4_40: INIT4x4 -dtrmm_kernel_L4_M4_44: +.Ldtrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble dtrmm_kernel_L4_M4_100 + ble .Ldtrmm_kernel_L4_M4_100 -dtrmm_kernel_L4_M4_46: +.Ldtrmm_kernel_L4_M4_46: KERNEL4x4_SUB -dtrmm_kernel_L4_M4_100: +.Ldtrmm_kernel_L4_M4_100: SAVE4x4 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1312,20 +1312,20 @@ dtrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L4_M4_END: +.Ldtrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L4_M4_20 + bne .Ldtrmm_kernel_L4_M4_20 -dtrmm_kernel_L4_M2_BEGIN: +.Ldtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L4_M1_BEGIN + ble .Ldtrmm_kernel_L4_M1_BEGIN -dtrmm_kernel_L4_M2_20: +.Ldtrmm_kernel_L4_M2_20: INIT2x4 @@ -1348,9 +1348,9 @@ dtrmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M2_40 + ble .Ldtrmm_kernel_L4_M2_40 -dtrmm_kernel_L4_M2_22: +.Ldtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1363,22 +1363,22 @@ dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_22 + bgt .Ldtrmm_kernel_L4_M2_22 -dtrmm_kernel_L4_M2_40: +.Ldtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M2_100 + ble .Ldtrmm_kernel_L4_M2_100 -dtrmm_kernel_L4_M2_42: +.Ldtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_42 + bgt .Ldtrmm_kernel_L4_M2_42 -dtrmm_kernel_L4_M2_100: +.Ldtrmm_kernel_L4_M2_100: SAVE2x4 @@ -1397,15 +1397,15 @@ dtrmm_kernel_L4_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L4_M2_END: +.Ldtrmm_kernel_L4_M2_END: -dtrmm_kernel_L4_M1_BEGIN: +.Ldtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END -dtrmm_kernel_L4_M1_20: +.Ldtrmm_kernel_L4_M1_20: INIT1x4 @@ -1428,9 +1428,9 @@ dtrmm_kernel_L4_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M1_40 + ble .Ldtrmm_kernel_L4_M1_40 -dtrmm_kernel_L4_M1_22: +.Ldtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1442,22 +1442,22 @@ dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_22 + bgt .Ldtrmm_kernel_L4_M1_22 -dtrmm_kernel_L4_M1_40: +.Ldtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M1_100 + ble .Ldtrmm_kernel_L4_M1_100 -dtrmm_kernel_L4_M1_42: +.Ldtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_42 + bgt .Ldtrmm_kernel_L4_M1_42 -dtrmm_kernel_L4_M1_100: +.Ldtrmm_kernel_L4_M1_100: SAVE1x4 @@ -1476,7 +1476,7 @@ dtrmm_kernel_L4_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L4_END: +.Ldtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -1486,14 +1486,14 @@ dtrmm_kernel_L4_END: /******************************************************************************/ -dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dtrmm_kernel_L999 // error, N was less than 4? + ble .Ldtrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dtrmm_kernel_L1_BEGIN + ble .Ldtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1505,14 +1505,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -dtrmm_kernel_L2_M4_BEGIN: +.Ldtrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble dtrmm_kernel_L2_M2_BEGIN + ble .Ldtrmm_kernel_L2_M2_BEGIN -dtrmm_kernel_L2_M4_20: +.Ldtrmm_kernel_L2_M4_20: INIT4x2 @@ -1535,10 +1535,10 @@ dtrmm_kernel_L2_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M4_40 + ble .Ldtrmm_kernel_L2_M4_40 .align 5 -dtrmm_kernel_L2_M4_22: +.Ldtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1550,22 +1550,22 @@ dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_22 + bgt .Ldtrmm_kernel_L2_M4_22 -dtrmm_kernel_L2_M4_40: +.Ldtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M4_100 + ble .Ldtrmm_kernel_L2_M4_100 -dtrmm_kernel_L2_M4_42: +.Ldtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_42 + bgt .Ldtrmm_kernel_L2_M4_42 -dtrmm_kernel_L2_M4_100: +.Ldtrmm_kernel_L2_M4_100: SAVE4x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1584,22 +1584,22 @@ dtrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L2_M4_END: +.Ldtrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L2_M4_20 + bgt .Ldtrmm_kernel_L2_M4_20 -dtrmm_kernel_L2_M2_BEGIN: +.Ldtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M1_BEGIN + ble .Ldtrmm_kernel_L2_M1_BEGIN -dtrmm_kernel_L2_M2_20: +.Ldtrmm_kernel_L2_M2_20: INIT2x2 @@ -1622,9 +1622,9 @@ dtrmm_kernel_L2_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M2_40 + ble .Ldtrmm_kernel_L2_M2_40 -dtrmm_kernel_L2_M2_22: +.Ldtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1637,22 +1637,22 @@ dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_22 + bgt .Ldtrmm_kernel_L2_M2_22 -dtrmm_kernel_L2_M2_40: +.Ldtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M2_100 + ble .Ldtrmm_kernel_L2_M2_100 -dtrmm_kernel_L2_M2_42: +.Ldtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_42 + bgt .Ldtrmm_kernel_L2_M2_42 -dtrmm_kernel_L2_M2_100: +.Ldtrmm_kernel_L2_M2_100: SAVE2x2 @@ -1671,15 +1671,15 @@ dtrmm_kernel_L2_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L2_M2_END: +.Ldtrmm_kernel_L2_M2_END: -dtrmm_kernel_L2_M1_BEGIN: +.Ldtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END -dtrmm_kernel_L2_M1_20: +.Ldtrmm_kernel_L2_M1_20: INIT1x2 @@ -1702,9 +1702,9 @@ dtrmm_kernel_L2_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dtrmm_kernel_L2_M1_40 + ble .Ldtrmm_kernel_L2_M1_40 -dtrmm_kernel_L2_M1_22: +.Ldtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1716,22 +1716,22 @@ dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_22 + bgt .Ldtrmm_kernel_L2_M1_22 -dtrmm_kernel_L2_M1_40: +.Ldtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M1_100 + ble .Ldtrmm_kernel_L2_M1_100 -dtrmm_kernel_L2_M1_42: +.Ldtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_42 + bgt .Ldtrmm_kernel_L2_M1_42 -dtrmm_kernel_L2_M1_100: +.Ldtrmm_kernel_L2_M1_100: SAVE1x2 @@ -1750,7 +1750,7 @@ dtrmm_kernel_L2_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L2_END: +.Ldtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1758,11 +1758,11 @@ dtrmm_kernel_L2_END: /******************************************************************************/ -dtrmm_kernel_L1_BEGIN: +.Ldtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dtrmm_kernel_L999 // done + ble .Ldtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1773,14 +1773,14 @@ dtrmm_kernel_L1_BEGIN: #endif mov pA, origPA // pA = A -dtrmm_kernel_L1_M4_BEGIN: +.Ldtrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble dtrmm_kernel_L1_M2_BEGIN + ble .Ldtrmm_kernel_L1_M2_BEGIN -dtrmm_kernel_L1_M4_20: +.Ldtrmm_kernel_L1_M4_20: INIT4x1 @@ -1802,10 +1802,10 @@ dtrmm_kernel_L1_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M4_40 + ble .Ldtrmm_kernel_L1_M4_40 .align 5 -dtrmm_kernel_L1_M4_22: +.Ldtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1817,22 +1817,22 @@ dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_22 + bgt .Ldtrmm_kernel_L1_M4_22 -dtrmm_kernel_L1_M4_40: +.Ldtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M4_100 + ble .Ldtrmm_kernel_L1_M4_100 -dtrmm_kernel_L1_M4_42: +.Ldtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_42 + bgt .Ldtrmm_kernel_L1_M4_42 -dtrmm_kernel_L1_M4_100: +.Ldtrmm_kernel_L1_M4_100: SAVE4x1 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1851,22 +1851,22 @@ dtrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L1_M4_END: +.Ldtrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L1_M4_20 + bgt .Ldtrmm_kernel_L1_M4_20 -dtrmm_kernel_L1_M2_BEGIN: +.Ldtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M1_BEGIN + ble .Ldtrmm_kernel_L1_M1_BEGIN -dtrmm_kernel_L1_M2_20: +.Ldtrmm_kernel_L1_M2_20: INIT2x1 @@ -1889,9 +1889,9 @@ dtrmm_kernel_L1_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M2_40 + ble .Ldtrmm_kernel_L1_M2_40 -dtrmm_kernel_L1_M2_22: +.Ldtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1904,22 +1904,22 @@ dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_22 + bgt .Ldtrmm_kernel_L1_M2_22 -dtrmm_kernel_L1_M2_40: +.Ldtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M2_100 + ble .Ldtrmm_kernel_L1_M2_100 -dtrmm_kernel_L1_M2_42: +.Ldtrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_42 + bgt .Ldtrmm_kernel_L1_M2_42 -dtrmm_kernel_L1_M2_100: +.Ldtrmm_kernel_L1_M2_100: SAVE2x1 @@ -1938,15 +1938,15 @@ dtrmm_kernel_L1_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L1_M2_END: +.Ldtrmm_kernel_L1_M2_END: -dtrmm_kernel_L1_M1_BEGIN: +.Ldtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END -dtrmm_kernel_L1_M1_20: +.Ldtrmm_kernel_L1_M1_20: INIT1x1 @@ -1969,9 +1969,9 @@ dtrmm_kernel_L1_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M1_40 + ble .Ldtrmm_kernel_L1_M1_40 -dtrmm_kernel_L1_M1_22: +.Ldtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1983,30 +1983,30 @@ dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_22 + bgt .Ldtrmm_kernel_L1_M1_22 -dtrmm_kernel_L1_M1_40: +.Ldtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M1_100 + ble .Ldtrmm_kernel_L1_M1_100 -dtrmm_kernel_L1_M1_42: +.Ldtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_42 + bgt .Ldtrmm_kernel_L1_M1_42 -dtrmm_kernel_L1_M1_100: +.Ldtrmm_kernel_L1_M1_100: SAVE1x1 -dtrmm_kernel_L1_END: +.Ldtrmm_kernel_L1_END: -dtrmm_kernel_L999: +.Ldtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index 2b8173715d..0ac5a5f24d 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -829,11 +829,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble dtrmm_kernel_L2_BEGIN + ble .Ldtrmm_kernel_L2_BEGIN /******************************************************************************/ -dtrmm_kernel_L4_BEGIN: +.Ldtrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -847,15 +847,15 @@ dtrmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -dtrmm_kernel_L4_M8_BEGIN: +.Ldtrmm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dtrmm_kernel_L4_M4_BEGIN + ble .Ldtrmm_kernel_L4_M4_BEGIN .align 5 -dtrmm_kernel_L4_M8_20: +.Ldtrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -877,7 +877,7 @@ dtrmm_kernel_L4_M8_20: asr counterL , tempK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? - blt dtrmm_kernel_L4_M8_32 + blt .Ldtrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K @@ -889,10 +889,10 @@ dtrmm_kernel_L4_M8_20: KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 - ble dtrmm_kernel_L4_M8_22a + ble .Ldtrmm_kernel_L4_M8_22a .align 5 -dtrmm_kernel_L4_M8_22: +.Ldtrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 @@ -904,10 +904,10 @@ dtrmm_kernel_L4_M8_22: KERNEL8x4_M2 subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M8_22 + bgt .Ldtrmm_kernel_L4_M8_22 .align 5 -dtrmm_kernel_L4_M8_22a: +.Ldtrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_M2 @@ -918,13 +918,13 @@ dtrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b dtrmm_kernel_L4_M8_44 + b .Ldtrmm_kernel_L4_M8_44 .align 5 -dtrmm_kernel_L4_M8_32: +.Ldtrmm_kernel_L4_M8_32: tst counterL, #1 - ble dtrmm_kernel_L4_M8_40 + ble .Ldtrmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_M2 @@ -935,26 +935,26 @@ dtrmm_kernel_L4_M8_32: KERNEL8x4_M1 KERNEL8x4_E - b dtrmm_kernel_L4_M8_44 + b .Ldtrmm_kernel_L4_M8_44 -dtrmm_kernel_L4_M8_40: +.Ldtrmm_kernel_L4_M8_40: INIT8x4 -dtrmm_kernel_L4_M8_44: +.Ldtrmm_kernel_L4_M8_44: ands counterL , tempK, #7 - ble dtrmm_kernel_L4_M8_100 + ble .Ldtrmm_kernel_L4_M8_100 .align 5 -dtrmm_kernel_L4_M8_46: +.Ldtrmm_kernel_L4_M8_46: KERNEL8x4_SUB subs counterL, counterL, #1 - bne dtrmm_kernel_L4_M8_46 + bne .Ldtrmm_kernel_L4_M8_46 -dtrmm_kernel_L4_M8_100: +.Ldtrmm_kernel_L4_M8_100: SAVE8x4 @@ -977,20 +977,20 @@ dtrmm_kernel_L4_M8_100: prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] -dtrmm_kernel_L4_M8_END: +.Ldtrmm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne dtrmm_kernel_L4_M8_20 + bne .Ldtrmm_kernel_L4_M8_20 -dtrmm_kernel_L4_M4_BEGIN: +.Ldtrmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #4 - ble dtrmm_kernel_L4_M2_BEGIN + ble .Ldtrmm_kernel_L4_M2_BEGIN -dtrmm_kernel_L4_M4_20: +.Ldtrmm_kernel_L4_M4_20: INIT4x4 @@ -1013,9 +1013,9 @@ dtrmm_kernel_L4_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M4_40 + ble .Ldtrmm_kernel_L4_M4_40 -dtrmm_kernel_L4_M4_22: +.Ldtrmm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB @@ -1028,22 +1028,22 @@ dtrmm_kernel_L4_M4_22: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_22 + bgt .Ldtrmm_kernel_L4_M4_22 -dtrmm_kernel_L4_M4_40: +.Ldtrmm_kernel_L4_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M4_100 + ble .Ldtrmm_kernel_L4_M4_100 -dtrmm_kernel_L4_M4_42: +.Ldtrmm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M4_42 + bgt .Ldtrmm_kernel_L4_M4_42 -dtrmm_kernel_L4_M4_100: +.Ldtrmm_kernel_L4_M4_100: SAVE4x4 @@ -1062,19 +1062,19 @@ dtrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L4_M4_END: +.Ldtrmm_kernel_L4_M4_END: -dtrmm_kernel_L4_M2_BEGIN: +.Ldtrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L4_M1_BEGIN + ble .Ldtrmm_kernel_L4_M1_BEGIN -dtrmm_kernel_L4_M2_20: +.Ldtrmm_kernel_L4_M2_20: INIT2x4 @@ -1097,9 +1097,9 @@ dtrmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M2_40 + ble .Ldtrmm_kernel_L4_M2_40 -dtrmm_kernel_L4_M2_22: +.Ldtrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1112,22 +1112,22 @@ dtrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_22 + bgt .Ldtrmm_kernel_L4_M2_22 -dtrmm_kernel_L4_M2_40: +.Ldtrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M2_100 + ble .Ldtrmm_kernel_L4_M2_100 -dtrmm_kernel_L4_M2_42: +.Ldtrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M2_42 + bgt .Ldtrmm_kernel_L4_M2_42 -dtrmm_kernel_L4_M2_100: +.Ldtrmm_kernel_L4_M2_100: SAVE2x4 @@ -1147,15 +1147,15 @@ dtrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L4_M2_END: +.Ldtrmm_kernel_L4_M2_END: -dtrmm_kernel_L4_M1_BEGIN: +.Ldtrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L4_END + ble .Ldtrmm_kernel_L4_END -dtrmm_kernel_L4_M1_20: +.Ldtrmm_kernel_L4_M1_20: INIT1x4 @@ -1179,9 +1179,9 @@ dtrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L4_M1_40 + ble .Ldtrmm_kernel_L4_M1_40 -dtrmm_kernel_L4_M1_22: +.Ldtrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1193,22 +1193,22 @@ dtrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_22 + bgt .Ldtrmm_kernel_L4_M1_22 -dtrmm_kernel_L4_M1_40: +.Ldtrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L4_M1_100 + ble .Ldtrmm_kernel_L4_M1_100 -dtrmm_kernel_L4_M1_42: +.Ldtrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L4_M1_42 + bgt .Ldtrmm_kernel_L4_M1_42 -dtrmm_kernel_L4_M1_100: +.Ldtrmm_kernel_L4_M1_100: SAVE1x4 @@ -1228,7 +1228,7 @@ dtrmm_kernel_L4_M1_100: add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L4_END: +.Ldtrmm_kernel_L4_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 4 * 8 @@ -1238,19 +1238,19 @@ dtrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt dtrmm_kernel_L4_BEGIN + bgt .Ldtrmm_kernel_L4_BEGIN /******************************************************************************/ -dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble dtrmm_kernel_L999 // error, N was less than 4? + ble .Ldtrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble dtrmm_kernel_L1_BEGIN + ble .Ldtrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1261,14 +1261,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction #endif mov pA, origPA // pA = A -dtrmm_kernel_L2_M8_BEGIN: +.Ldtrmm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dtrmm_kernel_L2_M4_BEGIN + ble .Ldtrmm_kernel_L2_M4_BEGIN -dtrmm_kernel_L2_M8_20: +.Ldtrmm_kernel_L2_M8_20: INIT8x2 @@ -1292,10 +1292,10 @@ dtrmm_kernel_L2_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M8_40 + ble .Ldtrmm_kernel_L2_M8_40 .align 5 -dtrmm_kernel_L2_M8_22: +.Ldtrmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1307,22 +1307,22 @@ dtrmm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M8_22 + bgt .Ldtrmm_kernel_L2_M8_22 -dtrmm_kernel_L2_M8_40: +.Ldtrmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M8_100 + ble .Ldtrmm_kernel_L2_M8_100 -dtrmm_kernel_L2_M8_42: +.Ldtrmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M8_42 + bgt .Ldtrmm_kernel_L2_M8_42 -dtrmm_kernel_L2_M8_100: +.Ldtrmm_kernel_L2_M8_100: SAVE8x2 @@ -1342,21 +1342,21 @@ dtrmm_kernel_L2_M8_100: add tempOffset, tempOffset, #8 #endif -dtrmm_kernel_L2_M8_END: +.Ldtrmm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L2_M8_20 + bgt .Ldtrmm_kernel_L2_M8_20 -dtrmm_kernel_L2_M4_BEGIN: +.Ldtrmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #4 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M2_BEGIN + ble .Ldtrmm_kernel_L2_M2_BEGIN -dtrmm_kernel_L2_M4_20: +.Ldtrmm_kernel_L2_M4_20: INIT4x2 @@ -1380,10 +1380,10 @@ dtrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M4_40 + ble .Ldtrmm_kernel_L2_M4_40 .align 5 -dtrmm_kernel_L2_M4_22: +.Ldtrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1395,22 +1395,22 @@ dtrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_22 + bgt .Ldtrmm_kernel_L2_M4_22 -dtrmm_kernel_L2_M4_40: +.Ldtrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M4_100 + ble .Ldtrmm_kernel_L2_M4_100 -dtrmm_kernel_L2_M4_42: +.Ldtrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M4_42 + bgt .Ldtrmm_kernel_L2_M4_42 -dtrmm_kernel_L2_M4_100: +.Ldtrmm_kernel_L2_M4_100: SAVE4x2 @@ -1430,19 +1430,19 @@ dtrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L2_M4_END: +.Ldtrmm_kernel_L2_M4_END: -dtrmm_kernel_L2_M2_BEGIN: +.Ldtrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L2_M1_BEGIN + ble .Ldtrmm_kernel_L2_M1_BEGIN -dtrmm_kernel_L2_M2_20: +.Ldtrmm_kernel_L2_M2_20: INIT2x2 @@ -1466,9 +1466,9 @@ dtrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble dtrmm_kernel_L2_M2_40 + ble .Ldtrmm_kernel_L2_M2_40 -dtrmm_kernel_L2_M2_22: +.Ldtrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1481,22 +1481,22 @@ dtrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_22 + bgt .Ldtrmm_kernel_L2_M2_22 -dtrmm_kernel_L2_M2_40: +.Ldtrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M2_100 + ble .Ldtrmm_kernel_L2_M2_100 -dtrmm_kernel_L2_M2_42: +.Ldtrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M2_42 + bgt .Ldtrmm_kernel_L2_M2_42 -dtrmm_kernel_L2_M2_100: +.Ldtrmm_kernel_L2_M2_100: SAVE2x2 @@ -1516,15 +1516,15 @@ dtrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L2_M2_END: +.Ldtrmm_kernel_L2_M2_END: -dtrmm_kernel_L2_M1_BEGIN: +.Ldtrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L2_END + ble .Ldtrmm_kernel_L2_END -dtrmm_kernel_L2_M1_20: +.Ldtrmm_kernel_L2_M1_20: INIT1x2 @@ -1548,9 +1548,9 @@ dtrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble dtrmm_kernel_L2_M1_40 + ble .Ldtrmm_kernel_L2_M1_40 -dtrmm_kernel_L2_M1_22: +.Ldtrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1562,22 +1562,22 @@ dtrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_22 + bgt .Ldtrmm_kernel_L2_M1_22 -dtrmm_kernel_L2_M1_40: +.Ldtrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L2_M1_100 + ble .Ldtrmm_kernel_L2_M1_100 -dtrmm_kernel_L2_M1_42: +.Ldtrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L2_M1_42 + bgt .Ldtrmm_kernel_L2_M1_42 -dtrmm_kernel_L2_M1_100: +.Ldtrmm_kernel_L2_M1_100: SAVE1x2 @@ -1597,7 +1597,7 @@ dtrmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -dtrmm_kernel_L2_END: +.Ldtrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1605,11 +1605,11 @@ dtrmm_kernel_L2_END: /******************************************************************************/ -dtrmm_kernel_L1_BEGIN: +.Ldtrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble dtrmm_kernel_L999 // done + ble .Ldtrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C add pC , pC , LDC // Update pC to point to next @@ -1619,14 +1619,14 @@ dtrmm_kernel_L1_BEGIN: #endif mov pA, origPA // pA = A -dtrmm_kernel_L1_M8_BEGIN: +.Ldtrmm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble dtrmm_kernel_L1_M4_BEGIN + ble .Ldtrmm_kernel_L1_M4_BEGIN -dtrmm_kernel_L1_M8_20: +.Ldtrmm_kernel_L1_M8_20: INIT8x1 @@ -1650,10 +1650,10 @@ dtrmm_kernel_L1_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M8_40 + ble .Ldtrmm_kernel_L1_M8_40 .align 5 -dtrmm_kernel_L1_M8_22: +.Ldtrmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1665,22 +1665,22 @@ dtrmm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M8_22 + bgt .Ldtrmm_kernel_L1_M8_22 -dtrmm_kernel_L1_M8_40: +.Ldtrmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M8_100 + ble .Ldtrmm_kernel_L1_M8_100 -dtrmm_kernel_L1_M8_42: +.Ldtrmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M8_42 + bgt .Ldtrmm_kernel_L1_M8_42 -dtrmm_kernel_L1_M8_100: +.Ldtrmm_kernel_L1_M8_100: SAVE8x1 @@ -1700,21 +1700,21 @@ dtrmm_kernel_L1_M8_100: add tempOffset, tempOffset, #8 #endif -dtrmm_kernel_L1_M8_END: +.Ldtrmm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt dtrmm_kernel_L1_M8_20 + bgt .Ldtrmm_kernel_L1_M8_20 -dtrmm_kernel_L1_M4_BEGIN: +.Ldtrmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #4 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M2_BEGIN + ble .Ldtrmm_kernel_L1_M2_BEGIN -dtrmm_kernel_L1_M4_20: +.Ldtrmm_kernel_L1_M4_20: INIT4x1 @@ -1737,10 +1737,10 @@ dtrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M4_40 + ble .Ldtrmm_kernel_L1_M4_40 .align 5 -dtrmm_kernel_L1_M4_22: +.Ldtrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1752,22 +1752,22 @@ dtrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_22 + bgt .Ldtrmm_kernel_L1_M4_22 -dtrmm_kernel_L1_M4_40: +.Ldtrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M4_100 + ble .Ldtrmm_kernel_L1_M4_100 -dtrmm_kernel_L1_M4_42: +.Ldtrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M4_42 + bgt .Ldtrmm_kernel_L1_M4_42 -dtrmm_kernel_L1_M4_100: +.Ldtrmm_kernel_L1_M4_100: SAVE4x1 @@ -1787,18 +1787,18 @@ dtrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -dtrmm_kernel_L1_M4_END: +.Ldtrmm_kernel_L1_M4_END: -dtrmm_kernel_L1_M2_BEGIN: +.Ldtrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble dtrmm_kernel_L1_M1_BEGIN + ble .Ldtrmm_kernel_L1_M1_BEGIN -dtrmm_kernel_L1_M2_20: +.Ldtrmm_kernel_L1_M2_20: INIT2x1 @@ -1822,9 +1822,9 @@ dtrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M2_40 + ble .Ldtrmm_kernel_L1_M2_40 -dtrmm_kernel_L1_M2_22: +.Ldtrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1837,22 +1837,22 @@ dtrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_22 + bgt .Ldtrmm_kernel_L1_M2_22 -dtrmm_kernel_L1_M2_40: +.Ldtrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M2_100 + ble .Ldtrmm_kernel_L1_M2_100 -dtrmm_kernel_L1_M2_42: +.Ldtrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M2_42 + bgt .Ldtrmm_kernel_L1_M2_42 -dtrmm_kernel_L1_M2_100: +.Ldtrmm_kernel_L1_M2_100: SAVE2x1 @@ -1872,15 +1872,15 @@ dtrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -dtrmm_kernel_L1_M2_END: +.Ldtrmm_kernel_L1_M2_END: -dtrmm_kernel_L1_M1_BEGIN: +.Ldtrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble dtrmm_kernel_L1_END + ble .Ldtrmm_kernel_L1_END -dtrmm_kernel_L1_M1_20: +.Ldtrmm_kernel_L1_M1_20: INIT1x1 @@ -1904,9 +1904,9 @@ dtrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble dtrmm_kernel_L1_M1_40 + ble .Ldtrmm_kernel_L1_M1_40 -dtrmm_kernel_L1_M1_22: +.Ldtrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1918,30 +1918,30 @@ dtrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_22 + bgt .Ldtrmm_kernel_L1_M1_22 -dtrmm_kernel_L1_M1_40: +.Ldtrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble dtrmm_kernel_L1_M1_100 + ble .Ldtrmm_kernel_L1_M1_100 -dtrmm_kernel_L1_M1_42: +.Ldtrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt dtrmm_kernel_L1_M1_42 + bgt .Ldtrmm_kernel_L1_M1_42 -dtrmm_kernel_L1_M1_100: +.Ldtrmm_kernel_L1_M1_100: SAVE1x1 -dtrmm_kernel_L1_END: +.Ldtrmm_kernel_L1_END: -dtrmm_kernel_L999: +.Ldtrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/gemv_n.S b/kernel/arm64/gemv_n.S index 162f721c38..658551f4f0 100644 --- a/kernel/arm64/gemv_n.S +++ b/kernel/arm64/gemv_n.S @@ -203,18 +203,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE_REGS cmp N, xzr - ble gemv_n_kernel_L999 + ble .Lgemv_n_kernel_L999 cmp M, xzr - ble gemv_n_kernel_L999 + ble .Lgemv_n_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_X, INC_X, #SHZ mov J, N cmp INC_Y, #1 - bne gemv_n_kernel_S_BEGIN + bne .Lgemv_n_kernel_S_BEGIN -gemv_n_kernel_F_LOOP: +.Lgemv_n_kernel_F_LOOP: ld1 TEMPV, [X], INC_X fmul TEMP, ALPHA, TEMP @@ -229,57 +229,57 @@ gemv_n_kernel_F_LOOP: mov Y_IPTR, Y mov Y_OPTR, Y -gemv_n_kernel_F32: +.Lgemv_n_kernel_F32: asr I, M, #5 cmp I, xzr - beq gemv_n_kernel_F4 + beq .Lgemv_n_kernel_F4 -gemv_n_kernel_F320: +.Lgemv_n_kernel_F320: KERNEL_F16 KERNEL_F16 subs I, I, #1 - bne gemv_n_kernel_F320 + bne .Lgemv_n_kernel_F320 -gemv_n_kernel_F4: +.Lgemv_n_kernel_F4: ands I, M, #31 asr I, I, #2 cmp I, xzr - beq gemv_n_kernel_F1 + beq .Lgemv_n_kernel_F1 -gemv_n_kernel_F40: +.Lgemv_n_kernel_F40: KERNEL_F4 subs I, I, #1 - bne gemv_n_kernel_F40 + bne .Lgemv_n_kernel_F40 -gemv_n_kernel_F1: +.Lgemv_n_kernel_F1: ands I, M, #3 - ble gemv_n_kernel_F_END + ble .Lgemv_n_kernel_F_END -gemv_n_kernel_F10: +.Lgemv_n_kernel_F10: KERNEL_F1 subs I, I, #1 - bne gemv_n_kernel_F10 + bne .Lgemv_n_kernel_F10 -gemv_n_kernel_F_END: +.Lgemv_n_kernel_F_END: add A, A, LDA subs J, J, #1 - bne gemv_n_kernel_F_LOOP + bne .Lgemv_n_kernel_F_LOOP - b gemv_n_kernel_L999 + b .Lgemv_n_kernel_L999 -gemv_n_kernel_S_BEGIN: +.Lgemv_n_kernel_S_BEGIN: INIT_S -gemv_n_kernel_S_LOOP: +.Lgemv_n_kernel_S_LOOP: ld1 TEMPV, [X], INC_X fmul TEMP, ALPHA, TEMP @@ -288,9 +288,9 @@ gemv_n_kernel_S_LOOP: asr I, M, #2 cmp I, xzr - ble gemv_n_kernel_S1 + ble .Lgemv_n_kernel_S1 -gemv_n_kernel_S4: +.Lgemv_n_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -298,27 +298,27 @@ gemv_n_kernel_S4: KERNEL_S1 subs I, I, #1 - bne gemv_n_kernel_S4 + bne .Lgemv_n_kernel_S4 -gemv_n_kernel_S1: +.Lgemv_n_kernel_S1: ands I, M, #3 - ble gemv_n_kernel_S_END + ble .Lgemv_n_kernel_S_END -gemv_n_kernel_S10: +.Lgemv_n_kernel_S10: KERNEL_S1 subs I, I, #1 - bne gemv_n_kernel_S10 + bne .Lgemv_n_kernel_S10 -gemv_n_kernel_S_END: +.Lgemv_n_kernel_S_END: add A, A, LDA subs J, J, #1 - bne gemv_n_kernel_S_LOOP + bne .Lgemv_n_kernel_S_LOOP -gemv_n_kernel_L999: +.Lgemv_n_kernel_L999: mov w0, wzr diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S index 28325f784b..b04367ab37 100644 --- a/kernel/arm64/gemv_t.S +++ b/kernel/arm64/gemv_t.S @@ -233,18 +233,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE_REGS cmp N, xzr - ble gemv_t_kernel_L999 + ble .Lgemv_t_kernel_L999 cmp M, xzr - ble gemv_t_kernel_L999 + ble .Lgemv_t_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_Y, INC_Y, #SHZ mov J, N cmp INC_X, #1 - bne gemv_t_kernel_S_BEGIN + bne .Lgemv_t_kernel_S_BEGIN -gemv_t_kernel_F_LOOP: +.Lgemv_t_kernel_F_LOOP: fmov TEMP, REG0 fmov TEMP1, REG0 @@ -254,64 +254,64 @@ gemv_t_kernel_F_LOOP: mov A_PTR, A mov X_PTR, X -gemv_t_kernel_F32: +.Lgemv_t_kernel_F32: asr I, M, #5 cmp I, xzr - beq gemv_t_kernel_F4 + beq .Lgemv_t_kernel_F4 -gemv_t_kernel_F320: +.Lgemv_t_kernel_F320: KERNEL_F32 subs I, I, #1 - bne gemv_t_kernel_F320 + bne .Lgemv_t_kernel_F320 KERNEL_F32_FINALIZE -gemv_t_kernel_F4: +.Lgemv_t_kernel_F4: ands I, M, #31 asr I, I, #2 cmp I, xzr - beq gemv_t_kernel_F1 + beq .Lgemv_t_kernel_F1 -gemv_t_kernel_F40: +.Lgemv_t_kernel_F40: KERNEL_F4 subs I, I, #1 - bne gemv_t_kernel_F40 + bne .Lgemv_t_kernel_F40 -gemv_t_kernel_F1: +.Lgemv_t_kernel_F1: KERNEL_F4_FINALIZE ands I, M, #3 - ble gemv_t_kernel_F_END + ble .Lgemv_t_kernel_F_END -gemv_t_kernel_F10: +.Lgemv_t_kernel_F10: KERNEL_F1 subs I, I, #1 - bne gemv_t_kernel_F10 + bne .Lgemv_t_kernel_F10 -gemv_t_kernel_F_END: +.Lgemv_t_kernel_F_END: ld1 TMPV1, [Y] add A, A, LDA subs J, J, #1 fmadd TMP1, ALPHA, TEMP, TMP1 st1 TMPV1, [Y], INC_Y - bne gemv_t_kernel_F_LOOP + bne .Lgemv_t_kernel_F_LOOP - b gemv_t_kernel_L999 + b .Lgemv_t_kernel_L999 -gemv_t_kernel_S_BEGIN: +.Lgemv_t_kernel_S_BEGIN: INIT_S -gemv_t_kernel_S_LOOP: +.Lgemv_t_kernel_S_LOOP: fmov TEMP, REG0 mov A_PTR, A @@ -319,9 +319,9 @@ gemv_t_kernel_S_LOOP: asr I, M, #2 cmp I, xzr - ble gemv_t_kernel_S1 + ble .Lgemv_t_kernel_S1 -gemv_t_kernel_S4: +.Lgemv_t_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -329,30 +329,30 @@ gemv_t_kernel_S4: KERNEL_S1 subs I, I, #1 - bne gemv_t_kernel_S4 + bne .Lgemv_t_kernel_S4 -gemv_t_kernel_S1: +.Lgemv_t_kernel_S1: ands I, M, #3 - ble gemv_t_kernel_S_END + ble .Lgemv_t_kernel_S_END -gemv_t_kernel_S10: +.Lgemv_t_kernel_S10: KERNEL_S1 subs I, I, #1 - bne gemv_t_kernel_S10 + bne .Lgemv_t_kernel_S10 -gemv_t_kernel_S_END: +.Lgemv_t_kernel_S_END: ld1 TMPV1, [Y] add A, A, LDA subs J, J, #1 fmadd TMP1, ALPHA, TEMP, TMP1 st1 TMPV1, [Y], INC_Y - bne gemv_t_kernel_S_LOOP + bne .Lgemv_t_kernel_S_LOOP -gemv_t_kernel_L999: +.Lgemv_t_kernel_L999: RESTORE_REGS diff --git a/kernel/arm64/iamax.S b/kernel/arm64/iamax.S index 6c0d84f988..31d0cd6466 100644 --- a/kernel/arm64/iamax.S +++ b/kernel/arm64/iamax.S @@ -230,62 +230,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble iamax_kernel_zero + ble .Liamax_kernel_zero cmp INC_X, xzr - ble iamax_kernel_zero + ble .Liamax_kernel_zero cmp INC_X, #1 - bne iamax_kernel_S_BEGIN + bne .Liamax_kernel_S_BEGIN mov x7, X -iamax_kernel_F_BEGIN: +.Liamax_kernel_F_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 asr I, N, #3 cmp I, xzr - beq iamax_kernel_F1 + beq .Liamax_kernel_F1 add Z, Z, #1 -iamax_kernel_F8: +.Liamax_kernel_F8: KERNEL_F8 subs I, I, #1 - bne iamax_kernel_F8 + bne .Liamax_kernel_F8 KERNEL_F8_FINALIZE sub Z, Z, #1 -iamax_kernel_F1: +.Liamax_kernel_F1: ands I, N, #7 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 -iamax_kernel_F10: +.Liamax_kernel_F10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_F10 + bne .Liamax_kernel_F10 - b iamax_kernel_L999 + b .Liamax_kernel_L999 -iamax_kernel_S_BEGIN: +.Liamax_kernel_S_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 asr I, N, #2 cmp I, xzr - ble iamax_kernel_S1 + ble .Liamax_kernel_S1 -iamax_kernel_S4: +.Liamax_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -293,25 +293,25 @@ iamax_kernel_S4: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S4 + bne .Liamax_kernel_S4 -iamax_kernel_S1: +.Liamax_kernel_S1: ands I, N, #3 - ble iamax_kernel_L999 + ble .Liamax_kernel_L999 -iamax_kernel_S10: +.Liamax_kernel_S10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S10 + bne .Liamax_kernel_S10 -iamax_kernel_L999: +.Liamax_kernel_L999: mov x0, INDEX ret -iamax_kernel_zero: +.Liamax_kernel_zero: mov x0, xzr ret diff --git a/kernel/arm64/izamax.S b/kernel/arm64/izamax.S index 9b252ec98c..42fa4e7111 100644 --- a/kernel/arm64/izamax.S +++ b/kernel/arm64/izamax.S @@ -276,64 +276,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble iamax_kernel_zero + ble .Lizamax_kernel_zero cmp INC_X, xzr - ble iamax_kernel_zero + ble .Lizamax_kernel_zero cmp INC_X, #1 - bne iamax_kernel_S_BEGIN + bne .Lizamax_kernel_S_BEGIN mov x7, X -iamax_kernel_F_BEGIN: +.Lizamax_kernel_F_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 asr I, N, #3 cmp I, xzr - ble iamax_kernel_F1 + ble .Lizamax_kernel_F1 add Z, Z, #1 -iamax_kernel_F8: +.Lizamax_kernel_F8: KERNEL_F8 subs I, I, #1 - bne iamax_kernel_F8 + bne .Lizamax_kernel_F8 KERNEL_F8_FINALIZE sub Z, Z, #1 -iamax_kernel_F1: +.Lizamax_kernel_F1: ands I, N, #7 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 -iamax_kernel_F10: +.Lizamax_kernel_F10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_F10 + bne .Lizamax_kernel_F10 - b iamax_kernel_L999 + b .Lizamax_kernel_L999 -iamax_kernel_S_BEGIN: +.Lizamax_kernel_S_BEGIN: INIT_S subs N, N, #1 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 asr I, N, #2 cmp I, xzr - ble iamax_kernel_S1 + ble .Lizamax_kernel_S1 -iamax_kernel_S4: +.Lizamax_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -341,26 +341,26 @@ iamax_kernel_S4: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S4 + bne .Lizamax_kernel_S4 -iamax_kernel_S1: +.Lizamax_kernel_S1: ands I, N, #3 - ble iamax_kernel_L999 + ble .Lizamax_kernel_L999 -iamax_kernel_S10: +.Lizamax_kernel_S10: KERNEL_S1 subs I, I, #1 - bne iamax_kernel_S10 + bne .Lizamax_kernel_S10 -iamax_kernel_L999: +.Lizamax_kernel_L999: mov x0, INDEX ret -iamax_kernel_zero: +.Lizamax_kernel_zero: mov x0, xzr ret diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S index 5d06c13c0a..e2cbd4def8 100644 --- a/kernel/arm64/nrm2.S +++ b/kernel/arm64/nrm2.S @@ -162,44 +162,44 @@ KERNEL_S1_NEXT: INIT cmp N, #0 - ble nrm2_kernel_L999 + ble .Lnrm2_kernel_L999 cmp INC_X, #0 - beq nrm2_kernel_L999 + beq .Lnrm2_kernel_L999 cmp INC_X, #1 - bne nrm2_kernel_S_BEGIN + bne .Lnrm2_kernel_S_BEGIN -nrm2_kernel_F_BEGIN: +.Lnrm2_kernel_F_BEGIN: asr I, N, #3 // I = N / 8 cmp I, xzr - ble nrm2_kernel_F1 + ble .Lnrm2_kernel_F1 -nrm2_kernel_F8: +.Lnrm2_kernel_F8: KERNEL_F8 subs I, I, #1 - bne nrm2_kernel_F8 + bne .Lnrm2_kernel_F8 -nrm2_kernel_F1: +.Lnrm2_kernel_F1: ands I, N, #7 - ble nrm2_kernel_L999 + ble .Lnrm2_kernel_L999 -nrm2_kernel_F10: +.Lnrm2_kernel_F10: KERNEL_F1 subs I, I, #1 - bne nrm2_kernel_F10 + bne .Lnrm2_kernel_F10 - b nrm2_kernel_L999 + b .Lnrm2_kernel_L999 -nrm2_kernel_S_BEGIN: +.Lnrm2_kernel_S_BEGIN: INIT_S @@ -207,15 +207,15 @@ nrm2_kernel_S_BEGIN: .align 5 -nrm2_kernel_S10: +.Lnrm2_kernel_S10: KERNEL_S1 subs I, I, #1 - bne nrm2_kernel_S10 + bne .Lnrm2_kernel_S10 -nrm2_kernel_L999: +.Lnrm2_kernel_L999: fsqrt SSQ, SSQ fmul SSQ, SCALE, SSQ diff --git a/kernel/arm64/rot.S b/kernel/arm64/rot.S index 5721252327..00c3085fa3 100644 --- a/kernel/arm64/rot.S +++ b/kernel/arm64/rot.S @@ -165,48 +165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble rot_kernel_L999 + ble .Lrot_kernel_L999 INIT cmp INC_X, #1 - bne rot_kernel_S_BEGIN + bne .Lrot_kernel_S_BEGIN cmp INC_Y, #1 - bne rot_kernel_S_BEGIN + bne .Lrot_kernel_S_BEGIN -rot_kernel_F_BEGIN: +.Lrot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq rot_kernel_F1 + beq .Lrot_kernel_F1 KERNEL_INIT_F4 -rot_kernel_F4: +.Lrot_kernel_F4: KERNEL_F4 subs I, I, #1 - bne rot_kernel_F4 + bne .Lrot_kernel_F4 -rot_kernel_F1: +.Lrot_kernel_F1: ands I, N, #3 - ble rot_kernel_L999 + ble .Lrot_kernel_L999 INIT_F1 -rot_kernel_F10: +.Lrot_kernel_F10: KERNEL_F1 subs I, I, #1 - bne rot_kernel_F10 + bne .Lrot_kernel_F10 mov w0, wzr ret -rot_kernel_S_BEGIN: +.Lrot_kernel_S_BEGIN: INIT_S INIT_F1 @@ -214,9 +214,9 @@ rot_kernel_S_BEGIN: asr I, N, #2 cmp I, xzr - ble rot_kernel_S1 + ble .Lrot_kernel_S1 -rot_kernel_S4: +.Lrot_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -224,22 +224,22 @@ rot_kernel_S4: KERNEL_S1 subs I, I, #1 - bne rot_kernel_S4 + bne .Lrot_kernel_S4 -rot_kernel_S1: +.Lrot_kernel_S1: ands I, N, #3 - ble rot_kernel_L999 + ble .Lrot_kernel_L999 -rot_kernel_S10: +.Lrot_kernel_S10: KERNEL_S1 subs I, I, #1 - bne rot_kernel_S10 + bne .Lrot_kernel_S10 -rot_kernel_L999: +.Lrot_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/scal.S b/kernel/arm64/scal.S index 91d469d037..09c41cdaab 100644 --- a/kernel/arm64/scal.S +++ b/kernel/arm64/scal.S @@ -166,86 +166,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble scal_kernel_L999 + ble .Lscal_kernel_L999 fcmp DA, #0.0 - beq scal_kernel_zero + beq .Lscal_kernel_zero cmp INC_X, #1 - bne scal_kernel_S_BEGIN + bne .Lscal_kernel_S_BEGIN -scal_kernel_F_BEGIN: +.Lscal_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq scal_kernel_F1 + beq .Lscal_kernel_F1 KERNEL_INIT_F8 -scal_kernel_F8: +.Lscal_kernel_F8: KERNEL_F8 subs I, I, #1 - bne scal_kernel_F8 + bne .Lscal_kernel_F8 -scal_kernel_F1: +.Lscal_kernel_F1: ands I, N, #7 - ble scal_kernel_L999 + ble .Lscal_kernel_L999 -scal_kernel_F10: +.Lscal_kernel_F10: KERNEL_F1 subs I, I, #1 - bne scal_kernel_F10 + bne .Lscal_kernel_F10 mov w0, wzr ret -scal_kernel_S_BEGIN: +.Lscal_kernel_S_BEGIN: INIT_S mov X_COPY, X asr I, N, #2 cmp I, xzr - ble scal_kernel_S1 + ble .Lscal_kernel_S1 -scal_kernel_S4: +.Lscal_kernel_S4: KERNEL_S4 subs I, I, #1 - bne scal_kernel_S4 + bne .Lscal_kernel_S4 -scal_kernel_S1: +.Lscal_kernel_S1: ands I, N, #3 - ble scal_kernel_L999 + ble .Lscal_kernel_L999 -scal_kernel_S10: +.Lscal_kernel_S10: KERNEL_S1 subs I, I, #1 - bne scal_kernel_S10 + bne .Lscal_kernel_S10 -scal_kernel_L999: +.Lscal_kernel_L999: mov w0, wzr ret -scal_kernel_zero: +.Lscal_kernel_zero: INIT_S -scal_kernel_Z1: +.Lscal_kernel_Z1: st1 DAV, [X], INC_X subs N, N, #1 - bne scal_kernel_Z1 + bne .Lscal_kernel_Z1 mov w0, wzr ret diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S index 6e3645b767..99099ea6fd 100644 --- a/kernel/arm64/sgemm_kernel_16x4.S +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -1070,7 +1070,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -sgemm_kernel_begin: +.Lsgemm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1098,11 +1098,11 @@ sgemm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble sgemm_kernel_L2_BEGIN + ble .Lsgemm_kernel_L2_BEGIN /******************************************************************************/ -sgemm_kernel_L4_BEGIN: +.Lsgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1112,21 +1112,21 @@ sgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -sgemm_kernel_L4_M16_BEGIN: +.Lsgemm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble sgemm_kernel_L4_M8_BEGIN + ble .Lsgemm_kernel_L4_M8_BEGIN .align 5 -sgemm_kernel_L4_M16_20: +.Lsgemm_kernel_L4_M16_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 - blt sgemm_kernel_L4_M16_32 + blt .Lsgemm_kernel_L4_M16_32 KERNEL16x4_I KERNEL16x4_M2 @@ -1138,10 +1138,10 @@ sgemm_kernel_L4_M16_20: KERNEL16x4_M2 subs counterL, counterL, #2 - ble sgemm_kernel_L4_M16_22a + ble .Lsgemm_kernel_L4_M16_22a .align 5 -sgemm_kernel_L4_M16_22: +.Lsgemm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1153,10 +1153,10 @@ sgemm_kernel_L4_M16_22: KERNEL16x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M16_22 + bgt .Lsgemm_kernel_L4_M16_22 .align 5 -sgemm_kernel_L4_M16_22a: +.Lsgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1167,13 +1167,13 @@ sgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 .align 5 -sgemm_kernel_L4_M16_32: +.Lsgemm_kernel_L4_M16_32: tst counterL, #1 - ble sgemm_kernel_L4_M16_40 + ble .Lsgemm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_M2 @@ -1184,187 +1184,187 @@ sgemm_kernel_L4_M16_32: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 -sgemm_kernel_L4_M16_40: +.Lsgemm_kernel_L4_M16_40: INIT16x4 -sgemm_kernel_L4_M16_44: +.Lsgemm_kernel_L4_M16_44: ands counterL , origK, #7 - ble sgemm_kernel_L4_M16_100 + ble .Lsgemm_kernel_L4_M16_100 .align 5 -sgemm_kernel_L4_M16_46: +.Lsgemm_kernel_L4_M16_46: KERNEL16x4_SUB subs counterL, counterL, #1 - bne sgemm_kernel_L4_M16_46 + bne .Lsgemm_kernel_L4_M16_46 -sgemm_kernel_L4_M16_100: +.Lsgemm_kernel_L4_M16_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE16x4 -sgemm_kernel_L4_M16_END: +.Lsgemm_kernel_L4_M16_END: subs counterI, counterI, #1 - bne sgemm_kernel_L4_M16_20 + bne .Lsgemm_kernel_L4_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L4_M8_BEGIN: +.Lsgemm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #8 - ble sgemm_kernel_L4_M4_BEGIN + ble .Lsgemm_kernel_L4_M4_BEGIN -sgemm_kernel_L4_M8_20: +.Lsgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M8_32 + blt .Lsgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M8_22a + ble .Lsgemm_kernel_L4_M8_22a .align 5 -sgemm_kernel_L4_M8_22: +.Lsgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M8_22 + bgt .Lsgemm_kernel_L4_M8_22 -sgemm_kernel_L4_M8_22a: +.Lsgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_32: +.Lsgemm_kernel_L4_M8_32: tst counterL, #1 - ble sgemm_kernel_L4_M8_40 + ble .Lsgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_40: +.Lsgemm_kernel_L4_M8_40: INIT8x4 -sgemm_kernel_L4_M8_44: +.Lsgemm_kernel_L4_M8_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M8_100 + ble .Lsgemm_kernel_L4_M8_100 -sgemm_kernel_L4_M8_46: +.Lsgemm_kernel_L4_M8_46: KERNEL8x4_SUB -sgemm_kernel_L4_M8_100: +.Lsgemm_kernel_L4_M8_100: SAVE8x4 -sgemm_kernel_L4_M8_END: +.Lsgemm_kernel_L4_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L4_M4_BEGIN: +.Lsgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #4 - ble sgemm_kernel_L4_M2_BEGIN + ble .Lsgemm_kernel_L4_M2_BEGIN -sgemm_kernel_L4_M4_20: +.Lsgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M4_32 + blt .Lsgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M4_22a + ble .Lsgemm_kernel_L4_M4_22a .align 5 -sgemm_kernel_L4_M4_22: +.Lsgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_22 + bgt .Lsgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_22a: +.Lsgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_32: +.Lsgemm_kernel_L4_M4_32: tst counterL, #1 - ble sgemm_kernel_L4_M4_40 + ble .Lsgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_40: +.Lsgemm_kernel_L4_M4_40: INIT4x4 -sgemm_kernel_L4_M4_44: +.Lsgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M4_100 + ble .Lsgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_46: +.Lsgemm_kernel_L4_M4_46: KERNEL4x4_SUB -sgemm_kernel_L4_M4_100: +.Lsgemm_kernel_L4_M4_100: SAVE4x4 -sgemm_kernel_L4_M4_END: +.Lsgemm_kernel_L4_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L4_M2_BEGIN: +.Lsgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L4_M1_BEGIN + ble .Lsgemm_kernel_L4_M1_BEGIN -sgemm_kernel_L4_M2_20: +.Lsgemm_kernel_L4_M2_20: INIT2x4 @@ -1372,9 +1372,9 @@ sgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M2_40 + ble .Lsgemm_kernel_L4_M2_40 -sgemm_kernel_L4_M2_22: +.Lsgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1387,34 +1387,34 @@ sgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_22 + bgt .Lsgemm_kernel_L4_M2_22 -sgemm_kernel_L4_M2_40: +.Lsgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M2_100 + ble .Lsgemm_kernel_L4_M2_100 -sgemm_kernel_L4_M2_42: +.Lsgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_42 + bgt .Lsgemm_kernel_L4_M2_42 -sgemm_kernel_L4_M2_100: +.Lsgemm_kernel_L4_M2_100: SAVE2x4 -sgemm_kernel_L4_M2_END: +.Lsgemm_kernel_L4_M2_END: -sgemm_kernel_L4_M1_BEGIN: +.Lsgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END -sgemm_kernel_L4_M1_20: +.Lsgemm_kernel_L4_M1_20: INIT1x4 @@ -1422,9 +1422,9 @@ sgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M1_40 + ble .Lsgemm_kernel_L4_M1_40 -sgemm_kernel_L4_M1_22: +.Lsgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1436,42 +1436,42 @@ sgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_22 + bgt .Lsgemm_kernel_L4_M1_22 -sgemm_kernel_L4_M1_40: +.Lsgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M1_100 + ble .Lsgemm_kernel_L4_M1_100 -sgemm_kernel_L4_M1_42: +.Lsgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_42 + bgt .Lsgemm_kernel_L4_M1_42 -sgemm_kernel_L4_M1_100: +.Lsgemm_kernel_L4_M1_100: SAVE1x4 -sgemm_kernel_L4_END: +.Lsgemm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 subs counterJ, counterJ , #1 // j-- - bgt sgemm_kernel_L4_BEGIN + bgt .Lsgemm_kernel_L4_BEGIN /******************************************************************************/ -sgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst counterJ , #2 - ble sgemm_kernel_L1_BEGIN + ble .Lsgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1479,14 +1479,14 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -sgemm_kernel_L2_M16_BEGIN: +.Lsgemm_kernel_L2_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI,#0 - ble sgemm_kernel_L2_M8_BEGIN + ble .Lsgemm_kernel_L2_M8_BEGIN -sgemm_kernel_L2_M16_20: +.Lsgemm_kernel_L2_M16_20: INIT16x2 @@ -1494,10 +1494,10 @@ sgemm_kernel_L2_M16_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M16_40 + ble .Lsgemm_kernel_L2_M16_40 .align 5 -sgemm_kernel_L2_M16_22: +.Lsgemm_kernel_L2_M16_22: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB @@ -1509,41 +1509,41 @@ sgemm_kernel_L2_M16_22: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M16_22 + bgt .Lsgemm_kernel_L2_M16_22 -sgemm_kernel_L2_M16_40: +.Lsgemm_kernel_L2_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M16_100 + ble .Lsgemm_kernel_L2_M16_100 -sgemm_kernel_L2_M16_42: +.Lsgemm_kernel_L2_M16_42: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M16_42 + bgt .Lsgemm_kernel_L2_M16_42 -sgemm_kernel_L2_M16_100: +.Lsgemm_kernel_L2_M16_100: SAVE16x2 -sgemm_kernel_L2_M16_END: +.Lsgemm_kernel_L2_M16_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L2_M16_20 + bgt .Lsgemm_kernel_L2_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L2_M8_BEGIN: +.Lsgemm_kernel_L2_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #8 - ble sgemm_kernel_L2_M4_BEGIN + ble .Lsgemm_kernel_L2_M4_BEGIN -sgemm_kernel_L2_M8_20: +.Lsgemm_kernel_L2_M8_20: INIT8x2 @@ -1551,10 +1551,10 @@ sgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M8_40 + ble .Lsgemm_kernel_L2_M8_40 .align 5 -sgemm_kernel_L2_M8_22: +.Lsgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1566,38 +1566,38 @@ sgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_22 + bgt .Lsgemm_kernel_L2_M8_22 -sgemm_kernel_L2_M8_40: +.Lsgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M8_100 + ble .Lsgemm_kernel_L2_M8_100 -sgemm_kernel_L2_M8_42: +.Lsgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_42 + bgt .Lsgemm_kernel_L2_M8_42 -sgemm_kernel_L2_M8_100: +.Lsgemm_kernel_L2_M8_100: SAVE8x2 -sgemm_kernel_L2_M8_END: +.Lsgemm_kernel_L2_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L2_M4_BEGIN: +.Lsgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #4 - ble sgemm_kernel_L2_M2_BEGIN + ble .Lsgemm_kernel_L2_M2_BEGIN -sgemm_kernel_L2_M4_20: +.Lsgemm_kernel_L2_M4_20: INIT4x2 @@ -1605,10 +1605,10 @@ sgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M4_40 + ble .Lsgemm_kernel_L2_M4_40 .align 5 -sgemm_kernel_L2_M4_22: +.Lsgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1620,40 +1620,40 @@ sgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_22 + bgt .Lsgemm_kernel_L2_M4_22 -sgemm_kernel_L2_M4_40: +.Lsgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M4_100 + ble .Lsgemm_kernel_L2_M4_100 -sgemm_kernel_L2_M4_42: +.Lsgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_42 + bgt .Lsgemm_kernel_L2_M4_42 -sgemm_kernel_L2_M4_100: +.Lsgemm_kernel_L2_M4_100: SAVE4x2 -sgemm_kernel_L2_M4_END: +.Lsgemm_kernel_L2_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L2_M2_BEGIN: +.Lsgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L2_M1_BEGIN + ble .Lsgemm_kernel_L2_M1_BEGIN -sgemm_kernel_L2_M2_20: +.Lsgemm_kernel_L2_M2_20: INIT2x2 @@ -1661,9 +1661,9 @@ sgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M2_40 + ble .Lsgemm_kernel_L2_M2_40 -sgemm_kernel_L2_M2_22: +.Lsgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1676,34 +1676,34 @@ sgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_22 + bgt .Lsgemm_kernel_L2_M2_22 -sgemm_kernel_L2_M2_40: +.Lsgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M2_100 + ble .Lsgemm_kernel_L2_M2_100 -sgemm_kernel_L2_M2_42: +.Lsgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_42 + bgt .Lsgemm_kernel_L2_M2_42 -sgemm_kernel_L2_M2_100: +.Lsgemm_kernel_L2_M2_100: SAVE2x2 -sgemm_kernel_L2_M2_END: +.Lsgemm_kernel_L2_M2_END: -sgemm_kernel_L2_M1_BEGIN: +.Lsgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END -sgemm_kernel_L2_M1_20: +.Lsgemm_kernel_L2_M1_20: INIT1x2 @@ -1711,9 +1711,9 @@ sgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L2_M1_40 + ble .Lsgemm_kernel_L2_M1_40 -sgemm_kernel_L2_M1_22: +.Lsgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1725,36 +1725,36 @@ sgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_22 + bgt .Lsgemm_kernel_L2_M1_22 -sgemm_kernel_L2_M1_40: +.Lsgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M1_100 + ble .Lsgemm_kernel_L2_M1_100 -sgemm_kernel_L2_M1_42: +.Lsgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_42 + bgt .Lsgemm_kernel_L2_M1_42 -sgemm_kernel_L2_M1_100: +.Lsgemm_kernel_L2_M1_100: SAVE1x2 -sgemm_kernel_L2_END: +.Lsgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ -sgemm_kernel_L1_BEGIN: +.Lsgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble sgemm_kernel_L999 // done + ble .Lsgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1762,14 +1762,14 @@ sgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -sgemm_kernel_L1_M16_BEGIN: +.Lsgemm_kernel_L1_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble sgemm_kernel_L1_M8_BEGIN + ble .Lsgemm_kernel_L1_M8_BEGIN -sgemm_kernel_L1_M16_20: +.Lsgemm_kernel_L1_M16_20: INIT16x1 @@ -1777,10 +1777,10 @@ sgemm_kernel_L1_M16_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M16_40 + ble .Lsgemm_kernel_L1_M16_40 .align 5 -sgemm_kernel_L1_M16_22: +.Lsgemm_kernel_L1_M16_22: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB @@ -1792,42 +1792,42 @@ sgemm_kernel_L1_M16_22: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M16_22 + bgt .Lsgemm_kernel_L1_M16_22 -sgemm_kernel_L1_M16_40: +.Lsgemm_kernel_L1_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M16_100 + ble .Lsgemm_kernel_L1_M16_100 -sgemm_kernel_L1_M16_42: +.Lsgemm_kernel_L1_M16_42: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M16_42 + bgt .Lsgemm_kernel_L1_M16_42 -sgemm_kernel_L1_M16_100: +.Lsgemm_kernel_L1_M16_100: SAVE16x1 -sgemm_kernel_L1_M16_END: +.Lsgemm_kernel_L1_M16_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L1_M16_20 + bgt .Lsgemm_kernel_L1_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L1_M8_BEGIN: +.Lsgemm_kernel_L1_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #8 - ble sgemm_kernel_L1_M4_BEGIN + ble .Lsgemm_kernel_L1_M4_BEGIN -sgemm_kernel_L1_M8_20: +.Lsgemm_kernel_L1_M8_20: INIT8x1 @@ -1835,10 +1835,10 @@ sgemm_kernel_L1_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M8_40 + ble .Lsgemm_kernel_L1_M8_40 .align 5 -sgemm_kernel_L1_M8_22: +.Lsgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1850,38 +1850,38 @@ sgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_22 + bgt .Lsgemm_kernel_L1_M8_22 -sgemm_kernel_L1_M8_40: +.Lsgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M8_100 + ble .Lsgemm_kernel_L1_M8_100 -sgemm_kernel_L1_M8_42: +.Lsgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_42 + bgt .Lsgemm_kernel_L1_M8_42 -sgemm_kernel_L1_M8_100: +.Lsgemm_kernel_L1_M8_100: SAVE8x1 -sgemm_kernel_L1_M8_END: +.Lsgemm_kernel_L1_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L1_M4_BEGIN: +.Lsgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #4 - ble sgemm_kernel_L1_M2_BEGIN + ble .Lsgemm_kernel_L1_M2_BEGIN -sgemm_kernel_L1_M4_20: +.Lsgemm_kernel_L1_M4_20: INIT4x1 @@ -1889,10 +1889,10 @@ sgemm_kernel_L1_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M4_40 + ble .Lsgemm_kernel_L1_M4_40 .align 5 -sgemm_kernel_L1_M4_22: +.Lsgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1904,39 +1904,39 @@ sgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_22 + bgt .Lsgemm_kernel_L1_M4_22 -sgemm_kernel_L1_M4_40: +.Lsgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M4_100 + ble .Lsgemm_kernel_L1_M4_100 -sgemm_kernel_L1_M4_42: +.Lsgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_42 + bgt .Lsgemm_kernel_L1_M4_42 -sgemm_kernel_L1_M4_100: +.Lsgemm_kernel_L1_M4_100: SAVE4x1 -sgemm_kernel_L1_M4_END: +.Lsgemm_kernel_L1_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L1_M2_BEGIN: +.Lsgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L1_M1_BEGIN + ble .Lsgemm_kernel_L1_M1_BEGIN -sgemm_kernel_L1_M2_20: +.Lsgemm_kernel_L1_M2_20: INIT2x1 @@ -1944,9 +1944,9 @@ sgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M2_40 + ble .Lsgemm_kernel_L1_M2_40 -sgemm_kernel_L1_M2_22: +.Lsgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1959,34 +1959,34 @@ sgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_22 + bgt .Lsgemm_kernel_L1_M2_22 -sgemm_kernel_L1_M2_40: +.Lsgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M2_100 + ble .Lsgemm_kernel_L1_M2_100 -sgemm_kernel_L1_M2_42: +.Lsgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_42 + bgt .Lsgemm_kernel_L1_M2_42 -sgemm_kernel_L1_M2_100: +.Lsgemm_kernel_L1_M2_100: SAVE2x1 -sgemm_kernel_L1_M2_END: +.Lsgemm_kernel_L1_M2_END: -sgemm_kernel_L1_M1_BEGIN: +.Lsgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END -sgemm_kernel_L1_M1_20: +.Lsgemm_kernel_L1_M1_20: INIT1x1 @@ -1994,9 +1994,9 @@ sgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M1_40 + ble .Lsgemm_kernel_L1_M1_40 -sgemm_kernel_L1_M1_22: +.Lsgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2008,28 +2008,28 @@ sgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_22 + bgt .Lsgemm_kernel_L1_M1_22 -sgemm_kernel_L1_M1_40: +.Lsgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M1_100 + ble .Lsgemm_kernel_L1_M1_100 -sgemm_kernel_L1_M1_42: +.Lsgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_42 + bgt .Lsgemm_kernel_L1_M1_42 -sgemm_kernel_L1_M1_100: +.Lsgemm_kernel_L1_M1_100: SAVE1x1 -sgemm_kernel_L1_END: +.Lsgemm_kernel_L1_END: -sgemm_kernel_L999: +.Lsgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S b/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S index 0ee10e130e..144d4bcd69 100644 --- a/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S +++ b/kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S @@ -1117,7 +1117,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -sgemm_kernel_begin: +.Lsgemm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1145,11 +1145,11 @@ sgemm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble sgemm_kernel_L2_BEGIN + ble .Lsgemm_kernel_L2_BEGIN /******************************************************************************/ -sgemm_kernel_L4_BEGIN: +.Lsgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1159,21 +1159,21 @@ sgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -sgemm_kernel_L4_M16_BEGIN: +.Lsgemm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble sgemm_kernel_L4_M8_BEGIN + ble .Lsgemm_kernel_L4_M8_BEGIN .align 5 -sgemm_kernel_L4_M16_20: +.Lsgemm_kernel_L4_M16_20: mov pB, origPB asr counterL , origK, #4 // L = K / 16 cmp counterL , #2 - blt sgemm_kernel_L4_M16_32 + blt .Lsgemm_kernel_L4_M16_32 KERNEL16x4_I KERNEL16x4_M2 @@ -1182,18 +1182,18 @@ sgemm_kernel_L4_M16_20: KERNEL16x4_M1_M2_x1 subs counterL, counterL, #2 - ble sgemm_kernel_L4_M16_22a + ble .Lsgemm_kernel_L4_M16_22a .align 5 -sgemm_kernel_L4_M16_22: +.Lsgemm_kernel_L4_M16_22: KERNEL16x4_M1_M2_x8 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M16_22 + bgt .Lsgemm_kernel_L4_M16_22 .align 5 -sgemm_kernel_L4_M16_22a: +.Lsgemm_kernel_L4_M16_22a: KERNEL16x4_M1_M2_x4 KERNEL16x4_M1_M2_x2 @@ -1201,13 +1201,13 @@ sgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 .align 5 -sgemm_kernel_L4_M16_32: +.Lsgemm_kernel_L4_M16_32: tst counterL, #1 - ble sgemm_kernel_L4_M16_40 + ble .Lsgemm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_M2 @@ -1216,187 +1216,187 @@ sgemm_kernel_L4_M16_32: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 -sgemm_kernel_L4_M16_40: +.Lsgemm_kernel_L4_M16_40: INIT16x4 -sgemm_kernel_L4_M16_44: +.Lsgemm_kernel_L4_M16_44: ands counterL , origK, #15 - ble sgemm_kernel_L4_M16_100 + ble .Lsgemm_kernel_L4_M16_100 .align 5 -sgemm_kernel_L4_M16_46: +.Lsgemm_kernel_L4_M16_46: KERNEL16x4_SUB subs counterL, counterL, #1 - bne sgemm_kernel_L4_M16_46 + bne .Lsgemm_kernel_L4_M16_46 -sgemm_kernel_L4_M16_100: +.Lsgemm_kernel_L4_M16_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE16x4 -sgemm_kernel_L4_M16_END: +.Lsgemm_kernel_L4_M16_END: subs counterI, counterI, #1 - bne sgemm_kernel_L4_M16_20 + bne .Lsgemm_kernel_L4_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L4_M8_BEGIN: +.Lsgemm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #8 - ble sgemm_kernel_L4_M4_BEGIN + ble .Lsgemm_kernel_L4_M4_BEGIN -sgemm_kernel_L4_M8_20: +.Lsgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M8_32 + blt .Lsgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M8_22a + ble .Lsgemm_kernel_L4_M8_22a .align 5 -sgemm_kernel_L4_M8_22: +.Lsgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M8_22 + bgt .Lsgemm_kernel_L4_M8_22 -sgemm_kernel_L4_M8_22a: +.Lsgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_32: +.Lsgemm_kernel_L4_M8_32: tst counterL, #1 - ble sgemm_kernel_L4_M8_40 + ble .Lsgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_40: +.Lsgemm_kernel_L4_M8_40: INIT8x4 -sgemm_kernel_L4_M8_44: +.Lsgemm_kernel_L4_M8_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M8_100 + ble .Lsgemm_kernel_L4_M8_100 -sgemm_kernel_L4_M8_46: +.Lsgemm_kernel_L4_M8_46: KERNEL8x4_SUB -sgemm_kernel_L4_M8_100: +.Lsgemm_kernel_L4_M8_100: SAVE8x4 -sgemm_kernel_L4_M8_END: +.Lsgemm_kernel_L4_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L4_M4_BEGIN: +.Lsgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #4 - ble sgemm_kernel_L4_M2_BEGIN + ble .Lsgemm_kernel_L4_M2_BEGIN -sgemm_kernel_L4_M4_20: +.Lsgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M4_32 + blt .Lsgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M4_22a + ble .Lsgemm_kernel_L4_M4_22a .align 5 -sgemm_kernel_L4_M4_22: +.Lsgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_22 + bgt .Lsgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_22a: +.Lsgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_32: +.Lsgemm_kernel_L4_M4_32: tst counterL, #1 - ble sgemm_kernel_L4_M4_40 + ble .Lsgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_40: +.Lsgemm_kernel_L4_M4_40: INIT4x4 -sgemm_kernel_L4_M4_44: +.Lsgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M4_100 + ble .Lsgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_46: +.Lsgemm_kernel_L4_M4_46: KERNEL4x4_SUB -sgemm_kernel_L4_M4_100: +.Lsgemm_kernel_L4_M4_100: SAVE4x4 -sgemm_kernel_L4_M4_END: +.Lsgemm_kernel_L4_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L4_M2_BEGIN: +.Lsgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L4_M1_BEGIN + ble .Lsgemm_kernel_L4_M1_BEGIN -sgemm_kernel_L4_M2_20: +.Lsgemm_kernel_L4_M2_20: INIT2x4 @@ -1404,9 +1404,9 @@ sgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M2_40 + ble .Lsgemm_kernel_L4_M2_40 -sgemm_kernel_L4_M2_22: +.Lsgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1419,34 +1419,34 @@ sgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_22 + bgt .Lsgemm_kernel_L4_M2_22 -sgemm_kernel_L4_M2_40: +.Lsgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M2_100 + ble .Lsgemm_kernel_L4_M2_100 -sgemm_kernel_L4_M2_42: +.Lsgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_42 + bgt .Lsgemm_kernel_L4_M2_42 -sgemm_kernel_L4_M2_100: +.Lsgemm_kernel_L4_M2_100: SAVE2x4 -sgemm_kernel_L4_M2_END: +.Lsgemm_kernel_L4_M2_END: -sgemm_kernel_L4_M1_BEGIN: +.Lsgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END -sgemm_kernel_L4_M1_20: +.Lsgemm_kernel_L4_M1_20: INIT1x4 @@ -1454,9 +1454,9 @@ sgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M1_40 + ble .Lsgemm_kernel_L4_M1_40 -sgemm_kernel_L4_M1_22: +.Lsgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1468,42 +1468,42 @@ sgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_22 + bgt .Lsgemm_kernel_L4_M1_22 -sgemm_kernel_L4_M1_40: +.Lsgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M1_100 + ble .Lsgemm_kernel_L4_M1_100 -sgemm_kernel_L4_M1_42: +.Lsgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_42 + bgt .Lsgemm_kernel_L4_M1_42 -sgemm_kernel_L4_M1_100: +.Lsgemm_kernel_L4_M1_100: SAVE1x4 -sgemm_kernel_L4_END: +.Lsgemm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 subs counterJ, counterJ , #1 // j-- - bgt sgemm_kernel_L4_BEGIN + bgt .Lsgemm_kernel_L4_BEGIN /******************************************************************************/ -sgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst counterJ , #2 - ble sgemm_kernel_L1_BEGIN + ble .Lsgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1511,14 +1511,14 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -sgemm_kernel_L2_M16_BEGIN: +.Lsgemm_kernel_L2_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI,#0 - ble sgemm_kernel_L2_M8_BEGIN + ble .Lsgemm_kernel_L2_M8_BEGIN -sgemm_kernel_L2_M16_20: +.Lsgemm_kernel_L2_M16_20: INIT16x2 @@ -1526,10 +1526,10 @@ sgemm_kernel_L2_M16_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M16_40 + ble .Lsgemm_kernel_L2_M16_40 .align 5 -sgemm_kernel_L2_M16_22: +.Lsgemm_kernel_L2_M16_22: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB @@ -1541,41 +1541,41 @@ sgemm_kernel_L2_M16_22: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M16_22 + bgt .Lsgemm_kernel_L2_M16_22 -sgemm_kernel_L2_M16_40: +.Lsgemm_kernel_L2_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M16_100 + ble .Lsgemm_kernel_L2_M16_100 -sgemm_kernel_L2_M16_42: +.Lsgemm_kernel_L2_M16_42: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M16_42 + bgt .Lsgemm_kernel_L2_M16_42 -sgemm_kernel_L2_M16_100: +.Lsgemm_kernel_L2_M16_100: SAVE16x2 -sgemm_kernel_L2_M16_END: +.Lsgemm_kernel_L2_M16_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L2_M16_20 + bgt .Lsgemm_kernel_L2_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L2_M8_BEGIN: +.Lsgemm_kernel_L2_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #8 - ble sgemm_kernel_L2_M4_BEGIN + ble .Lsgemm_kernel_L2_M4_BEGIN -sgemm_kernel_L2_M8_20: +.Lsgemm_kernel_L2_M8_20: INIT8x2 @@ -1583,10 +1583,10 @@ sgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M8_40 + ble .Lsgemm_kernel_L2_M8_40 .align 5 -sgemm_kernel_L2_M8_22: +.Lsgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1598,38 +1598,38 @@ sgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_22 + bgt .Lsgemm_kernel_L2_M8_22 -sgemm_kernel_L2_M8_40: +.Lsgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M8_100 + ble .Lsgemm_kernel_L2_M8_100 -sgemm_kernel_L2_M8_42: +.Lsgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_42 + bgt .Lsgemm_kernel_L2_M8_42 -sgemm_kernel_L2_M8_100: +.Lsgemm_kernel_L2_M8_100: SAVE8x2 -sgemm_kernel_L2_M8_END: +.Lsgemm_kernel_L2_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L2_M4_BEGIN: +.Lsgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #4 - ble sgemm_kernel_L2_M2_BEGIN + ble .Lsgemm_kernel_L2_M2_BEGIN -sgemm_kernel_L2_M4_20: +.Lsgemm_kernel_L2_M4_20: INIT4x2 @@ -1637,10 +1637,10 @@ sgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M4_40 + ble .Lsgemm_kernel_L2_M4_40 .align 5 -sgemm_kernel_L2_M4_22: +.Lsgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1652,40 +1652,40 @@ sgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_22 + bgt .Lsgemm_kernel_L2_M4_22 -sgemm_kernel_L2_M4_40: +.Lsgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M4_100 + ble .Lsgemm_kernel_L2_M4_100 -sgemm_kernel_L2_M4_42: +.Lsgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_42 + bgt .Lsgemm_kernel_L2_M4_42 -sgemm_kernel_L2_M4_100: +.Lsgemm_kernel_L2_M4_100: SAVE4x2 -sgemm_kernel_L2_M4_END: +.Lsgemm_kernel_L2_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L2_M2_BEGIN: +.Lsgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L2_M1_BEGIN + ble .Lsgemm_kernel_L2_M1_BEGIN -sgemm_kernel_L2_M2_20: +.Lsgemm_kernel_L2_M2_20: INIT2x2 @@ -1693,9 +1693,9 @@ sgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M2_40 + ble .Lsgemm_kernel_L2_M2_40 -sgemm_kernel_L2_M2_22: +.Lsgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1708,34 +1708,34 @@ sgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_22 + bgt .Lsgemm_kernel_L2_M2_22 -sgemm_kernel_L2_M2_40: +.Lsgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M2_100 + ble .Lsgemm_kernel_L2_M2_100 -sgemm_kernel_L2_M2_42: +.Lsgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_42 + bgt .Lsgemm_kernel_L2_M2_42 -sgemm_kernel_L2_M2_100: +.Lsgemm_kernel_L2_M2_100: SAVE2x2 -sgemm_kernel_L2_M2_END: +.Lsgemm_kernel_L2_M2_END: -sgemm_kernel_L2_M1_BEGIN: +.Lsgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END -sgemm_kernel_L2_M1_20: +.Lsgemm_kernel_L2_M1_20: INIT1x2 @@ -1743,9 +1743,9 @@ sgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L2_M1_40 + ble .Lsgemm_kernel_L2_M1_40 -sgemm_kernel_L2_M1_22: +.Lsgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1757,36 +1757,36 @@ sgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_22 + bgt .Lsgemm_kernel_L2_M1_22 -sgemm_kernel_L2_M1_40: +.Lsgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M1_100 + ble .Lsgemm_kernel_L2_M1_100 -sgemm_kernel_L2_M1_42: +.Lsgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_42 + bgt .Lsgemm_kernel_L2_M1_42 -sgemm_kernel_L2_M1_100: +.Lsgemm_kernel_L2_M1_100: SAVE1x2 -sgemm_kernel_L2_END: +.Lsgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ -sgemm_kernel_L1_BEGIN: +.Lsgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble sgemm_kernel_L999 // done + ble .Lsgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1794,14 +1794,14 @@ sgemm_kernel_L1_BEGIN: mov pA, origPA // pA = A -sgemm_kernel_L1_M16_BEGIN: +.Lsgemm_kernel_L1_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble sgemm_kernel_L1_M8_BEGIN + ble .Lsgemm_kernel_L1_M8_BEGIN -sgemm_kernel_L1_M16_20: +.Lsgemm_kernel_L1_M16_20: INIT16x1 @@ -1809,10 +1809,10 @@ sgemm_kernel_L1_M16_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M16_40 + ble .Lsgemm_kernel_L1_M16_40 .align 5 -sgemm_kernel_L1_M16_22: +.Lsgemm_kernel_L1_M16_22: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB @@ -1824,42 +1824,42 @@ sgemm_kernel_L1_M16_22: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M16_22 + bgt .Lsgemm_kernel_L1_M16_22 -sgemm_kernel_L1_M16_40: +.Lsgemm_kernel_L1_M16_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M16_100 + ble .Lsgemm_kernel_L1_M16_100 -sgemm_kernel_L1_M16_42: +.Lsgemm_kernel_L1_M16_42: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M16_42 + bgt .Lsgemm_kernel_L1_M16_42 -sgemm_kernel_L1_M16_100: +.Lsgemm_kernel_L1_M16_100: SAVE16x1 -sgemm_kernel_L1_M16_END: +.Lsgemm_kernel_L1_M16_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L1_M16_20 + bgt .Lsgemm_kernel_L1_M16_20 //------------------------------------------------------------------------------ -sgemm_kernel_L1_M8_BEGIN: +.Lsgemm_kernel_L1_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #8 - ble sgemm_kernel_L1_M4_BEGIN + ble .Lsgemm_kernel_L1_M4_BEGIN -sgemm_kernel_L1_M8_20: +.Lsgemm_kernel_L1_M8_20: INIT8x1 @@ -1867,10 +1867,10 @@ sgemm_kernel_L1_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M8_40 + ble .Lsgemm_kernel_L1_M8_40 .align 5 -sgemm_kernel_L1_M8_22: +.Lsgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -1882,38 +1882,38 @@ sgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_22 + bgt .Lsgemm_kernel_L1_M8_22 -sgemm_kernel_L1_M8_40: +.Lsgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M8_100 + ble .Lsgemm_kernel_L1_M8_100 -sgemm_kernel_L1_M8_42: +.Lsgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_42 + bgt .Lsgemm_kernel_L1_M8_42 -sgemm_kernel_L1_M8_100: +.Lsgemm_kernel_L1_M8_100: SAVE8x1 -sgemm_kernel_L1_M8_END: +.Lsgemm_kernel_L1_M8_END: //------------------------------------------------------------------------------ -sgemm_kernel_L1_M4_BEGIN: +.Lsgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #4 - ble sgemm_kernel_L1_M2_BEGIN + ble .Lsgemm_kernel_L1_M2_BEGIN -sgemm_kernel_L1_M4_20: +.Lsgemm_kernel_L1_M4_20: INIT4x1 @@ -1921,10 +1921,10 @@ sgemm_kernel_L1_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M4_40 + ble .Lsgemm_kernel_L1_M4_40 .align 5 -sgemm_kernel_L1_M4_22: +.Lsgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1936,39 +1936,39 @@ sgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_22 + bgt .Lsgemm_kernel_L1_M4_22 -sgemm_kernel_L1_M4_40: +.Lsgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M4_100 + ble .Lsgemm_kernel_L1_M4_100 -sgemm_kernel_L1_M4_42: +.Lsgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_42 + bgt .Lsgemm_kernel_L1_M4_42 -sgemm_kernel_L1_M4_100: +.Lsgemm_kernel_L1_M4_100: SAVE4x1 -sgemm_kernel_L1_M4_END: +.Lsgemm_kernel_L1_M4_END: //------------------------------------------------------------------------------ -sgemm_kernel_L1_M2_BEGIN: +.Lsgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L1_M1_BEGIN + ble .Lsgemm_kernel_L1_M1_BEGIN -sgemm_kernel_L1_M2_20: +.Lsgemm_kernel_L1_M2_20: INIT2x1 @@ -1976,9 +1976,9 @@ sgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M2_40 + ble .Lsgemm_kernel_L1_M2_40 -sgemm_kernel_L1_M2_22: +.Lsgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1991,34 +1991,34 @@ sgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_22 + bgt .Lsgemm_kernel_L1_M2_22 -sgemm_kernel_L1_M2_40: +.Lsgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M2_100 + ble .Lsgemm_kernel_L1_M2_100 -sgemm_kernel_L1_M2_42: +.Lsgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_42 + bgt .Lsgemm_kernel_L1_M2_42 -sgemm_kernel_L1_M2_100: +.Lsgemm_kernel_L1_M2_100: SAVE2x1 -sgemm_kernel_L1_M2_END: +.Lsgemm_kernel_L1_M2_END: -sgemm_kernel_L1_M1_BEGIN: +.Lsgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END -sgemm_kernel_L1_M1_20: +.Lsgemm_kernel_L1_M1_20: INIT1x1 @@ -2026,9 +2026,9 @@ sgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M1_40 + ble .Lsgemm_kernel_L1_M1_40 -sgemm_kernel_L1_M1_22: +.Lsgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2040,28 +2040,28 @@ sgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_22 + bgt .Lsgemm_kernel_L1_M1_22 -sgemm_kernel_L1_M1_40: +.Lsgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M1_100 + ble .Lsgemm_kernel_L1_M1_100 -sgemm_kernel_L1_M1_42: +.Lsgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_42 + bgt .Lsgemm_kernel_L1_M1_42 -sgemm_kernel_L1_M1_100: +.Lsgemm_kernel_L1_M1_100: SAVE1x1 -sgemm_kernel_L1_END: +.Lsgemm_kernel_L1_END: -sgemm_kernel_L999: +.Lsgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S index a5cf7baffb..76c11f1e14 100644 --- a/kernel/arm64/sgemm_kernel_4x4.S +++ b/kernel/arm64/sgemm_kernel_4x4.S @@ -892,11 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble sgemm_kernel_L2_BEGIN + ble .Lsgemm_kernel_L2_BEGIN /******************************************************************************/ -sgemm_kernel_L4_BEGIN: +.Lsgemm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -906,73 +906,73 @@ sgemm_kernel_L4_BEGIN: add pA_2, temp, pA_1 add pA_3, temp, pA_2 -sgemm_kernel_L4_M16_BEGIN: +.Lsgemm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble sgemm_kernel_L4_M8_BEGIN + ble .Lsgemm_kernel_L4_M8_BEGIN -sgemm_kernel_L4_M16_20: +.Lsgemm_kernel_L4_M16_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M16_32 + blt .Lsgemm_kernel_L4_M16_32 KERNEL16x4_I // do one in the K KERNEL16x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M16_22a + ble .Lsgemm_kernel_L4_M16_22a .align 5 -sgemm_kernel_L4_M16_22: +.Lsgemm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M16_22 + bgt .Lsgemm_kernel_L4_M16_22 -sgemm_kernel_L4_M16_22a: +.Lsgemm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 -sgemm_kernel_L4_M16_32: +.Lsgemm_kernel_L4_M16_32: tst counterL, #1 - ble sgemm_kernel_L4_M16_40 + ble .Lsgemm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_E - b sgemm_kernel_L4_M16_44 + b .Lsgemm_kernel_L4_M16_44 -sgemm_kernel_L4_M16_40: +.Lsgemm_kernel_L4_M16_40: INIT16x4 -sgemm_kernel_L4_M16_44: +.Lsgemm_kernel_L4_M16_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M16_100 + ble .Lsgemm_kernel_L4_M16_100 -sgemm_kernel_L4_M16_46: +.Lsgemm_kernel_L4_M16_46: KERNEL16x4_SUB -sgemm_kernel_L4_M16_100: +.Lsgemm_kernel_L4_M16_100: SAVE16x4 -sgemm_kernel_L4_M16_END: +.Lsgemm_kernel_L4_M16_END: lsl temp, origK, #4 // k * 4 * 4 = Four rows of A add pA_0, pA_0, temp add pA_0, pA_0, temp @@ -981,26 +981,26 @@ sgemm_kernel_L4_M16_END: add pA_2, pA_1, temp add pA_3, pA_2, temp subs counterI, counterI, #1 - bne sgemm_kernel_L4_M16_20 + bne .Lsgemm_kernel_L4_M16_20 -sgemm_kernel_L4_M8_BEGIN: +.Lsgemm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #8 - ble sgemm_kernel_L4_M4_BEGIN + ble .Lsgemm_kernel_L4_M4_BEGIN -sgemm_kernel_L4_M8_20: +.Lsgemm_kernel_L4_M8_20: INIT8x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L4_M8_40 + ble .Lsgemm_kernel_L4_M8_40 -sgemm_kernel_L4_M8_22: +.Lsgemm_kernel_L4_M8_22: KERNEL8x4_SUB KERNEL8x4_SUB @@ -1013,47 +1013,47 @@ sgemm_kernel_L4_M8_22: KERNEL8x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M8_22 + bgt .Lsgemm_kernel_L4_M8_22 -sgemm_kernel_L4_M8_40: +.Lsgemm_kernel_L4_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M8_100 + ble .Lsgemm_kernel_L4_M8_100 -sgemm_kernel_L4_M8_42: +.Lsgemm_kernel_L4_M8_42: KERNEL8x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M8_42 + bgt .Lsgemm_kernel_L4_M8_42 -sgemm_kernel_L4_M8_100: +.Lsgemm_kernel_L4_M8_100: SAVE8x4 -sgemm_kernel_L4_M8_END: +.Lsgemm_kernel_L4_M8_END: lsl temp, origK, #4 // k * 4 * 4 add pA_0, pA_0, temp -sgemm_kernel_L4_M4_BEGIN: +.Lsgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #4 - ble sgemm_kernel_L4_M2_BEGIN + ble .Lsgemm_kernel_L4_M2_BEGIN -sgemm_kernel_L4_M4_20: +.Lsgemm_kernel_L4_M4_20: INIT4x4 mov pB, origPB asr counterL, origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L4_M4_40 + ble .Lsgemm_kernel_L4_M4_40 -sgemm_kernel_L4_M4_22: +.Lsgemm_kernel_L4_M4_22: KERNEL4x4_SUB KERNEL4x4_SUB @@ -1066,47 +1066,47 @@ sgemm_kernel_L4_M4_22: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_22 + bgt .Lsgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_40: +.Lsgemm_kernel_L4_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M4_100 + ble .Lsgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_42: +.Lsgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_42 + bgt .Lsgemm_kernel_L4_M4_42 -sgemm_kernel_L4_M4_100: +.Lsgemm_kernel_L4_M4_100: SAVE4x4 -sgemm_kernel_L4_M4_END: +.Lsgemm_kernel_L4_M4_END: -sgemm_kernel_L4_M2_BEGIN: +.Lsgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L4_M1_BEGIN + ble .Lsgemm_kernel_L4_M1_BEGIN -sgemm_kernel_L4_M2_20: +.Lsgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M2_40 + ble .Lsgemm_kernel_L4_M2_40 -sgemm_kernel_L4_M2_22: +.Lsgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1119,43 +1119,43 @@ sgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_22 + bgt .Lsgemm_kernel_L4_M2_22 -sgemm_kernel_L4_M2_40: +.Lsgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M2_100 + ble .Lsgemm_kernel_L4_M2_100 -sgemm_kernel_L4_M2_42: +.Lsgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_42 + bgt .Lsgemm_kernel_L4_M2_42 -sgemm_kernel_L4_M2_100: +.Lsgemm_kernel_L4_M2_100: SAVE2x4 -sgemm_kernel_L4_M2_END: +.Lsgemm_kernel_L4_M2_END: -sgemm_kernel_L4_M1_BEGIN: +.Lsgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END -sgemm_kernel_L4_M1_20: +.Lsgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M1_40 + ble .Lsgemm_kernel_L4_M1_40 -sgemm_kernel_L4_M1_22: +.Lsgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1167,45 +1167,45 @@ sgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_22 + bgt .Lsgemm_kernel_L4_M1_22 -sgemm_kernel_L4_M1_40: +.Lsgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M1_100 + ble .Lsgemm_kernel_L4_M1_100 -sgemm_kernel_L4_M1_42: +.Lsgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_42 + bgt .Lsgemm_kernel_L4_M1_42 -sgemm_kernel_L4_M1_100: +.Lsgemm_kernel_L4_M1_100: SAVE1x4 -sgemm_kernel_L4_END: +.Lsgemm_kernel_L4_END: lsl temp, origK, #4 add origPB, origPB, temp // B = B + K * 4 * 4 subs counterJ, counterJ , #1 // j-- - bgt sgemm_kernel_L4_BEGIN + bgt .Lsgemm_kernel_L4_BEGIN /******************************************************************************/ -sgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst counterJ , #2 - ble sgemm_kernel_L1_BEGIN + ble .Lsgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1215,24 +1215,24 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction -sgemm_kernel_L2_M4_BEGIN: +.Lsgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble sgemm_kernel_L2_M2_BEGIN + ble .Lsgemm_kernel_L2_M2_BEGIN -sgemm_kernel_L2_M4_20: +.Lsgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M4_40 + ble .Lsgemm_kernel_L2_M4_40 .align 5 -sgemm_kernel_L2_M4_22: +.Lsgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1244,50 +1244,50 @@ sgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_22 + bgt .Lsgemm_kernel_L2_M4_22 -sgemm_kernel_L2_M4_40: +.Lsgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M4_100 + ble .Lsgemm_kernel_L2_M4_100 -sgemm_kernel_L2_M4_42: +.Lsgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_42 + bgt .Lsgemm_kernel_L2_M4_42 -sgemm_kernel_L2_M4_100: +.Lsgemm_kernel_L2_M4_100: SAVE4x2 -sgemm_kernel_L2_M4_END: +.Lsgemm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L2_M4_20 + bgt .Lsgemm_kernel_L2_M4_20 -sgemm_kernel_L2_M2_BEGIN: +.Lsgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L2_M1_BEGIN + ble .Lsgemm_kernel_L2_M1_BEGIN -sgemm_kernel_L2_M2_20: +.Lsgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M2_40 + ble .Lsgemm_kernel_L2_M2_40 -sgemm_kernel_L2_M2_22: +.Lsgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1300,43 +1300,43 @@ sgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_22 + bgt .Lsgemm_kernel_L2_M2_22 -sgemm_kernel_L2_M2_40: +.Lsgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M2_100 + ble .Lsgemm_kernel_L2_M2_100 -sgemm_kernel_L2_M2_42: +.Lsgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_42 + bgt .Lsgemm_kernel_L2_M2_42 -sgemm_kernel_L2_M2_100: +.Lsgemm_kernel_L2_M2_100: SAVE2x2 -sgemm_kernel_L2_M2_END: +.Lsgemm_kernel_L2_M2_END: -sgemm_kernel_L2_M1_BEGIN: +.Lsgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END -sgemm_kernel_L2_M1_20: +.Lsgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L2_M1_40 + ble .Lsgemm_kernel_L2_M1_40 -sgemm_kernel_L2_M1_22: +.Lsgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1348,36 +1348,36 @@ sgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_22 + bgt .Lsgemm_kernel_L2_M1_22 -sgemm_kernel_L2_M1_40: +.Lsgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M1_100 + ble .Lsgemm_kernel_L2_M1_100 -sgemm_kernel_L2_M1_42: +.Lsgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_42 + bgt .Lsgemm_kernel_L2_M1_42 -sgemm_kernel_L2_M1_100: +.Lsgemm_kernel_L2_M1_100: SAVE1x2 -sgemm_kernel_L2_END: +.Lsgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ -sgemm_kernel_L1_BEGIN: +.Lsgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble sgemm_kernel_L999 // done + ble .Lsgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1387,24 +1387,24 @@ sgemm_kernel_L1_BEGIN: -sgemm_kernel_L1_M4_BEGIN: +.Lsgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble sgemm_kernel_L1_M2_BEGIN + ble .Lsgemm_kernel_L1_M2_BEGIN -sgemm_kernel_L1_M4_20: +.Lsgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M4_40 + ble .Lsgemm_kernel_L1_M4_40 .align 5 -sgemm_kernel_L1_M4_22: +.Lsgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1416,50 +1416,50 @@ sgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_22 + bgt .Lsgemm_kernel_L1_M4_22 -sgemm_kernel_L1_M4_40: +.Lsgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M4_100 + ble .Lsgemm_kernel_L1_M4_100 -sgemm_kernel_L1_M4_42: +.Lsgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_42 + bgt .Lsgemm_kernel_L1_M4_42 -sgemm_kernel_L1_M4_100: +.Lsgemm_kernel_L1_M4_100: SAVE4x1 -sgemm_kernel_L1_M4_END: +.Lsgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L1_M4_20 + bgt .Lsgemm_kernel_L1_M4_20 -sgemm_kernel_L1_M2_BEGIN: +.Lsgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L1_M1_BEGIN + ble .Lsgemm_kernel_L1_M1_BEGIN -sgemm_kernel_L1_M2_20: +.Lsgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M2_40 + ble .Lsgemm_kernel_L1_M2_40 -sgemm_kernel_L1_M2_22: +.Lsgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1472,43 +1472,43 @@ sgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_22 + bgt .Lsgemm_kernel_L1_M2_22 -sgemm_kernel_L1_M2_40: +.Lsgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M2_100 + ble .Lsgemm_kernel_L1_M2_100 -sgemm_kernel_L1_M2_42: +.Lsgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_42 + bgt .Lsgemm_kernel_L1_M2_42 -sgemm_kernel_L1_M2_100: +.Lsgemm_kernel_L1_M2_100: SAVE2x1 -sgemm_kernel_L1_M2_END: +.Lsgemm_kernel_L1_M2_END: -sgemm_kernel_L1_M1_BEGIN: +.Lsgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END -sgemm_kernel_L1_M1_20: +.Lsgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M1_40 + ble .Lsgemm_kernel_L1_M1_40 -sgemm_kernel_L1_M1_22: +.Lsgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1520,30 +1520,30 @@ sgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_22 + bgt .Lsgemm_kernel_L1_M1_22 -sgemm_kernel_L1_M1_40: +.Lsgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M1_100 + ble .Lsgemm_kernel_L1_M1_100 -sgemm_kernel_L1_M1_42: +.Lsgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_42 + bgt .Lsgemm_kernel_L1_M1_42 -sgemm_kernel_L1_M1_100: +.Lsgemm_kernel_L1_M1_100: SAVE1x1 -sgemm_kernel_L1_END: +.Lsgemm_kernel_L1_END: -sgemm_kernel_L999: +.Lsgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S index bd47bed310..6ba64dd355 100644 --- a/kernel/arm64/sgemm_kernel_8x8.S +++ b/kernel/arm64/sgemm_kernel_8x8.S @@ -1263,7 +1263,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -sgemm_kernel_begin: +.Lsgemm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1291,12 +1291,12 @@ sgemm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 - ble sgemm_kernel_L4_BEGIN + ble .Lsgemm_kernel_L4_BEGIN /******************************************************************************/ /******************************************************************************/ -sgemm_kernel_L8_BEGIN: +.Lsgemm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 @@ -1304,156 +1304,156 @@ sgemm_kernel_L8_BEGIN: /******************************************************************************/ -sgemm_kernel_L8_M8_BEGIN: +.Lsgemm_kernel_L8_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble sgemm_kernel_L8_M4_BEGIN + ble .Lsgemm_kernel_L8_M4_BEGIN -sgemm_kernel_L8_M8_20: +.Lsgemm_kernel_L8_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L8_M8_32 + blt .Lsgemm_kernel_L8_M8_32 KERNEL8x8_I // do one in the K KERNEL8x8_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L8_M8_22a + ble .Lsgemm_kernel_L8_M8_22a .align 5 -sgemm_kernel_L8_M8_22: +.Lsgemm_kernel_L8_M8_22: KERNEL8x8_M1 KERNEL8x8_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M8_22 + bgt .Lsgemm_kernel_L8_M8_22 -sgemm_kernel_L8_M8_22a: +.Lsgemm_kernel_L8_M8_22a: KERNEL8x8_M1 KERNEL8x8_E - b sgemm_kernel_L8_M8_44 + b .Lsgemm_kernel_L8_M8_44 -sgemm_kernel_L8_M8_32: +.Lsgemm_kernel_L8_M8_32: tst counterL, #1 - ble sgemm_kernel_L8_M8_40 + ble .Lsgemm_kernel_L8_M8_40 KERNEL8x8_I KERNEL8x8_E - b sgemm_kernel_L8_M8_44 + b .Lsgemm_kernel_L8_M8_44 -sgemm_kernel_L8_M8_40: +.Lsgemm_kernel_L8_M8_40: INIT8x8 -sgemm_kernel_L8_M8_44: +.Lsgemm_kernel_L8_M8_44: ands counterL , origK, #1 - ble sgemm_kernel_L8_M8_100 + ble .Lsgemm_kernel_L8_M8_100 -sgemm_kernel_L8_M8_46: +.Lsgemm_kernel_L8_M8_46: KERNEL8x8_SUB -sgemm_kernel_L8_M8_100: +.Lsgemm_kernel_L8_M8_100: SAVE8x8 -sgemm_kernel_L8_M8_END: +.Lsgemm_kernel_L8_M8_END: subs counterI, counterI, #1 - bne sgemm_kernel_L8_M8_20 + bne .Lsgemm_kernel_L8_M8_20 /******************************************************************************/ -sgemm_kernel_L8_M4_BEGIN: +.Lsgemm_kernel_L8_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L8_END + ble .Lsgemm_kernel_L8_END tst counterI, #4 - ble sgemm_kernel_L8_M2_BEGIN + ble .Lsgemm_kernel_L8_M2_BEGIN -sgemm_kernel_L8_M4_20: +.Lsgemm_kernel_L8_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L8_M4_32 + blt .Lsgemm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L8_M4_22a + ble .Lsgemm_kernel_L8_M4_22a .align 5 -sgemm_kernel_L8_M4_22: +.Lsgemm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M4_22 + bgt .Lsgemm_kernel_L8_M4_22 -sgemm_kernel_L8_M4_22a: +.Lsgemm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E - b sgemm_kernel_L8_M4_44 + b .Lsgemm_kernel_L8_M4_44 -sgemm_kernel_L8_M4_32: +.Lsgemm_kernel_L8_M4_32: tst counterL, #1 - ble sgemm_kernel_L8_M4_40 + ble .Lsgemm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E - b sgemm_kernel_L8_M4_44 + b .Lsgemm_kernel_L8_M4_44 -sgemm_kernel_L8_M4_40: +.Lsgemm_kernel_L8_M4_40: INIT4x8 -sgemm_kernel_L8_M4_44: +.Lsgemm_kernel_L8_M4_44: ands counterL , origK, #1 - ble sgemm_kernel_L8_M4_100 + ble .Lsgemm_kernel_L8_M4_100 -sgemm_kernel_L8_M4_46: +.Lsgemm_kernel_L8_M4_46: KERNEL4x8_SUB -sgemm_kernel_L8_M4_100: +.Lsgemm_kernel_L8_M4_100: SAVE4x8 -sgemm_kernel_L8_M4_END: +.Lsgemm_kernel_L8_M4_END: /******************************************************************************/ -sgemm_kernel_L8_M2_BEGIN: +.Lsgemm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L8_END + ble .Lsgemm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L8_M1_BEGIN + ble .Lsgemm_kernel_L8_M1_BEGIN -sgemm_kernel_L8_M2_20: +.Lsgemm_kernel_L8_M2_20: INIT2x8 @@ -1461,9 +1461,9 @@ sgemm_kernel_L8_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L8_M2_40 + ble .Lsgemm_kernel_L8_M2_40 -sgemm_kernel_L8_M2_22: +.Lsgemm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB @@ -1476,35 +1476,35 @@ sgemm_kernel_L8_M2_22: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M2_22 + bgt .Lsgemm_kernel_L8_M2_22 -sgemm_kernel_L8_M2_40: +.Lsgemm_kernel_L8_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L8_M2_100 + ble .Lsgemm_kernel_L8_M2_100 -sgemm_kernel_L8_M2_42: +.Lsgemm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M2_42 + bgt .Lsgemm_kernel_L8_M2_42 -sgemm_kernel_L8_M2_100: +.Lsgemm_kernel_L8_M2_100: SAVE2x8 -sgemm_kernel_L8_M2_END: +.Lsgemm_kernel_L8_M2_END: /******************************************************************************/ -sgemm_kernel_L8_M1_BEGIN: +.Lsgemm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L8_END + ble .Lsgemm_kernel_L8_END -sgemm_kernel_L8_M1_20: +.Lsgemm_kernel_L8_M1_20: INIT1x8 @@ -1512,9 +1512,9 @@ sgemm_kernel_L8_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L8_M1_40 + ble .Lsgemm_kernel_L8_M1_40 -sgemm_kernel_L8_M1_22: +.Lsgemm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB @@ -1526,43 +1526,43 @@ sgemm_kernel_L8_M1_22: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M1_22 + bgt .Lsgemm_kernel_L8_M1_22 -sgemm_kernel_L8_M1_40: +.Lsgemm_kernel_L8_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L8_M1_100 + ble .Lsgemm_kernel_L8_M1_100 -sgemm_kernel_L8_M1_42: +.Lsgemm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L8_M1_42 + bgt .Lsgemm_kernel_L8_M1_42 -sgemm_kernel_L8_M1_100: +.Lsgemm_kernel_L8_M1_100: SAVE1x8 -sgemm_kernel_L8_END: +.Lsgemm_kernel_L8_END: lsl temp, origK, #5 // B = B + K * 4 * 8 add origPB, origPB, temp subs counterJ, counterJ , #1 // j-- - bgt sgemm_kernel_L8_BEGIN + bgt .Lsgemm_kernel_L8_BEGIN /******************************************************************************/ /******************************************************************************/ -sgemm_kernel_L4_BEGIN: +.Lsgemm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst counterJ , #4 - ble sgemm_kernel_L2_BEGIN + ble .Lsgemm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1572,156 +1572,156 @@ sgemm_kernel_L4_BEGIN: /******************************************************************************/ -sgemm_kernel_L4_M8_BEGIN: +.Lsgemm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble sgemm_kernel_L4_M4_BEGIN + ble .Lsgemm_kernel_L4_M4_BEGIN -sgemm_kernel_L4_M8_20: +.Lsgemm_kernel_L4_M8_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M8_32 + blt .Lsgemm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M8_22a + ble .Lsgemm_kernel_L4_M8_22a .align 5 -sgemm_kernel_L4_M8_22: +.Lsgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M8_22 + bgt .Lsgemm_kernel_L4_M8_22 -sgemm_kernel_L4_M8_22a: +.Lsgemm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_32: +.Lsgemm_kernel_L4_M8_32: tst counterL, #1 - ble sgemm_kernel_L4_M8_40 + ble .Lsgemm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b sgemm_kernel_L4_M8_44 + b .Lsgemm_kernel_L4_M8_44 -sgemm_kernel_L4_M8_40: +.Lsgemm_kernel_L4_M8_40: INIT8x4 -sgemm_kernel_L4_M8_44: +.Lsgemm_kernel_L4_M8_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M8_100 + ble .Lsgemm_kernel_L4_M8_100 -sgemm_kernel_L4_M8_46: +.Lsgemm_kernel_L4_M8_46: KERNEL8x4_SUB -sgemm_kernel_L4_M8_100: +.Lsgemm_kernel_L4_M8_100: SAVE8x4 -sgemm_kernel_L4_M8_END: +.Lsgemm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne sgemm_kernel_L4_M8_20 + bne .Lsgemm_kernel_L4_M8_20 /******************************************************************************/ -sgemm_kernel_L4_M4_BEGIN: +.Lsgemm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #4 - ble sgemm_kernel_L4_M2_BEGIN + ble .Lsgemm_kernel_L4_M2_BEGIN -sgemm_kernel_L4_M4_20: +.Lsgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt sgemm_kernel_L4_M4_32 + blt .Lsgemm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble sgemm_kernel_L4_M4_22a + ble .Lsgemm_kernel_L4_M4_22a .align 5 -sgemm_kernel_L4_M4_22: +.Lsgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M4_22 + bgt .Lsgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_22a: +.Lsgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_32: +.Lsgemm_kernel_L4_M4_32: tst counterL, #1 - ble sgemm_kernel_L4_M4_40 + ble .Lsgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b sgemm_kernel_L4_M4_44 + b .Lsgemm_kernel_L4_M4_44 -sgemm_kernel_L4_M4_40: +.Lsgemm_kernel_L4_M4_40: INIT4x4 -sgemm_kernel_L4_M4_44: +.Lsgemm_kernel_L4_M4_44: ands counterL , origK, #1 - ble sgemm_kernel_L4_M4_100 + ble .Lsgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_46: +.Lsgemm_kernel_L4_M4_46: KERNEL4x4_SUB -sgemm_kernel_L4_M4_100: +.Lsgemm_kernel_L4_M4_100: SAVE4x4 -sgemm_kernel_L4_M4_END: +.Lsgemm_kernel_L4_M4_END: /******************************************************************************/ -sgemm_kernel_L4_M2_BEGIN: +.Lsgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L4_M1_BEGIN + ble .Lsgemm_kernel_L4_M1_BEGIN -sgemm_kernel_L4_M2_20: +.Lsgemm_kernel_L4_M2_20: INIT2x4 @@ -1729,9 +1729,9 @@ sgemm_kernel_L4_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M2_40 + ble .Lsgemm_kernel_L4_M2_40 -sgemm_kernel_L4_M2_22: +.Lsgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1744,35 +1744,35 @@ sgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_22 + bgt .Lsgemm_kernel_L4_M2_22 -sgemm_kernel_L4_M2_40: +.Lsgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M2_100 + ble .Lsgemm_kernel_L4_M2_100 -sgemm_kernel_L4_M2_42: +.Lsgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M2_42 + bgt .Lsgemm_kernel_L4_M2_42 -sgemm_kernel_L4_M2_100: +.Lsgemm_kernel_L4_M2_100: SAVE2x4 -sgemm_kernel_L4_M2_END: +.Lsgemm_kernel_L4_M2_END: /******************************************************************************/ -sgemm_kernel_L4_M1_BEGIN: +.Lsgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L4_END + ble .Lsgemm_kernel_L4_END -sgemm_kernel_L4_M1_20: +.Lsgemm_kernel_L4_M1_20: INIT1x4 @@ -1780,9 +1780,9 @@ sgemm_kernel_L4_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L4_M1_40 + ble .Lsgemm_kernel_L4_M1_40 -sgemm_kernel_L4_M1_22: +.Lsgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1794,39 +1794,39 @@ sgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_22 + bgt .Lsgemm_kernel_L4_M1_22 -sgemm_kernel_L4_M1_40: +.Lsgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L4_M1_100 + ble .Lsgemm_kernel_L4_M1_100 -sgemm_kernel_L4_M1_42: +.Lsgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L4_M1_42 + bgt .Lsgemm_kernel_L4_M1_42 -sgemm_kernel_L4_M1_100: +.Lsgemm_kernel_L4_M1_100: SAVE1x4 -sgemm_kernel_L4_END: +.Lsgemm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 /******************************************************************************/ /******************************************************************************/ -sgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble sgemm_kernel_L999 + ble .Lsgemm_kernel_L999 tst counterJ , #2 - ble sgemm_kernel_L1_BEGIN + ble .Lsgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1836,14 +1836,14 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction /******************************************************************************/ -sgemm_kernel_L2_M8_BEGIN: +.Lsgemm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI,#0 - ble sgemm_kernel_L2_M4_BEGIN + ble .Lsgemm_kernel_L2_M4_BEGIN -sgemm_kernel_L2_M8_20: +.Lsgemm_kernel_L2_M8_20: INIT8x2 @@ -1851,10 +1851,10 @@ sgemm_kernel_L2_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M8_40 + ble .Lsgemm_kernel_L2_M8_40 .align 5 -sgemm_kernel_L2_M8_22: +.Lsgemm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1866,42 +1866,42 @@ sgemm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_22 + bgt .Lsgemm_kernel_L2_M8_22 -sgemm_kernel_L2_M8_40: +.Lsgemm_kernel_L2_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M8_100 + ble .Lsgemm_kernel_L2_M8_100 -sgemm_kernel_L2_M8_42: +.Lsgemm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M8_42 + bgt .Lsgemm_kernel_L2_M8_42 -sgemm_kernel_L2_M8_100: +.Lsgemm_kernel_L2_M8_100: SAVE8x2 -sgemm_kernel_L2_M8_END: +.Lsgemm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L2_M8_20 + bgt .Lsgemm_kernel_L2_M8_20 /******************************************************************************/ -sgemm_kernel_L2_M4_BEGIN: +.Lsgemm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #4 - ble sgemm_kernel_L2_M2_BEGIN + ble .Lsgemm_kernel_L2_M2_BEGIN -sgemm_kernel_L2_M4_20: +.Lsgemm_kernel_L2_M4_20: INIT4x2 @@ -1909,10 +1909,10 @@ sgemm_kernel_L2_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M4_40 + ble .Lsgemm_kernel_L2_M4_40 .align 5 -sgemm_kernel_L2_M4_22: +.Lsgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1924,39 +1924,39 @@ sgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_22 + bgt .Lsgemm_kernel_L2_M4_22 -sgemm_kernel_L2_M4_40: +.Lsgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M4_100 + ble .Lsgemm_kernel_L2_M4_100 -sgemm_kernel_L2_M4_42: +.Lsgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M4_42 + bgt .Lsgemm_kernel_L2_M4_42 -sgemm_kernel_L2_M4_100: +.Lsgemm_kernel_L2_M4_100: SAVE4x2 -sgemm_kernel_L2_M4_END: +.Lsgemm_kernel_L2_M4_END: /******************************************************************************/ -sgemm_kernel_L2_M2_BEGIN: +.Lsgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L2_M1_BEGIN + ble .Lsgemm_kernel_L2_M1_BEGIN -sgemm_kernel_L2_M2_20: +.Lsgemm_kernel_L2_M2_20: INIT2x2 @@ -1964,9 +1964,9 @@ sgemm_kernel_L2_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble sgemm_kernel_L2_M2_40 + ble .Lsgemm_kernel_L2_M2_40 -sgemm_kernel_L2_M2_22: +.Lsgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1979,35 +1979,35 @@ sgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_22 + bgt .Lsgemm_kernel_L2_M2_22 -sgemm_kernel_L2_M2_40: +.Lsgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M2_100 + ble .Lsgemm_kernel_L2_M2_100 -sgemm_kernel_L2_M2_42: +.Lsgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M2_42 + bgt .Lsgemm_kernel_L2_M2_42 -sgemm_kernel_L2_M2_100: +.Lsgemm_kernel_L2_M2_100: SAVE2x2 -sgemm_kernel_L2_M2_END: +.Lsgemm_kernel_L2_M2_END: /******************************************************************************/ -sgemm_kernel_L2_M1_BEGIN: +.Lsgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L2_END + ble .Lsgemm_kernel_L2_END -sgemm_kernel_L2_M1_20: +.Lsgemm_kernel_L2_M1_20: INIT1x2 @@ -2015,9 +2015,9 @@ sgemm_kernel_L2_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble sgemm_kernel_L2_M1_40 + ble .Lsgemm_kernel_L2_M1_40 -sgemm_kernel_L2_M1_22: +.Lsgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -2029,37 +2029,37 @@ sgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_22 + bgt .Lsgemm_kernel_L2_M1_22 -sgemm_kernel_L2_M1_40: +.Lsgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L2_M1_100 + ble .Lsgemm_kernel_L2_M1_100 -sgemm_kernel_L2_M1_42: +.Lsgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L2_M1_42 + bgt .Lsgemm_kernel_L2_M1_42 -sgemm_kernel_L2_M1_100: +.Lsgemm_kernel_L2_M1_100: SAVE1x2 -sgemm_kernel_L2_END: +.Lsgemm_kernel_L2_END: add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 /******************************************************************************/ /******************************************************************************/ -sgemm_kernel_L1_BEGIN: +.Lsgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble sgemm_kernel_L999 // done + ble .Lsgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -2069,14 +2069,14 @@ sgemm_kernel_L1_BEGIN: /******************************************************************************/ -sgemm_kernel_L1_M8_BEGIN: +.Lsgemm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 cmp counterI, #0 - ble sgemm_kernel_L1_M4_BEGIN + ble .Lsgemm_kernel_L1_M4_BEGIN -sgemm_kernel_L1_M8_20: +.Lsgemm_kernel_L1_M8_20: INIT8x1 @@ -2084,10 +2084,10 @@ sgemm_kernel_L1_M8_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M8_40 + ble .Lsgemm_kernel_L1_M8_40 .align 5 -sgemm_kernel_L1_M8_22: +.Lsgemm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -2099,42 +2099,42 @@ sgemm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_22 + bgt .Lsgemm_kernel_L1_M8_22 -sgemm_kernel_L1_M8_40: +.Lsgemm_kernel_L1_M8_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M8_100 + ble .Lsgemm_kernel_L1_M8_100 -sgemm_kernel_L1_M8_42: +.Lsgemm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M8_42 + bgt .Lsgemm_kernel_L1_M8_42 -sgemm_kernel_L1_M8_100: +.Lsgemm_kernel_L1_M8_100: SAVE8x1 -sgemm_kernel_L1_M8_END: +.Lsgemm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt sgemm_kernel_L1_M8_20 + bgt .Lsgemm_kernel_L1_M8_20 /******************************************************************************/ -sgemm_kernel_L1_M4_BEGIN: +.Lsgemm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #4 - ble sgemm_kernel_L1_M2_BEGIN + ble .Lsgemm_kernel_L1_M2_BEGIN -sgemm_kernel_L1_M4_20: +.Lsgemm_kernel_L1_M4_20: INIT4x1 @@ -2142,10 +2142,10 @@ sgemm_kernel_L1_M4_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M4_40 + ble .Lsgemm_kernel_L1_M4_40 .align 5 -sgemm_kernel_L1_M4_22: +.Lsgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2157,39 +2157,39 @@ sgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_22 + bgt .Lsgemm_kernel_L1_M4_22 -sgemm_kernel_L1_M4_40: +.Lsgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M4_100 + ble .Lsgemm_kernel_L1_M4_100 -sgemm_kernel_L1_M4_42: +.Lsgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M4_42 + bgt .Lsgemm_kernel_L1_M4_42 -sgemm_kernel_L1_M4_100: +.Lsgemm_kernel_L1_M4_100: SAVE4x1 -sgemm_kernel_L1_M4_END: +.Lsgemm_kernel_L1_M4_END: /******************************************************************************/ -sgemm_kernel_L1_M2_BEGIN: +.Lsgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble sgemm_kernel_L1_M1_BEGIN + ble .Lsgemm_kernel_L1_M1_BEGIN -sgemm_kernel_L1_M2_20: +.Lsgemm_kernel_L1_M2_20: INIT2x1 @@ -2197,9 +2197,9 @@ sgemm_kernel_L1_M2_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M2_40 + ble .Lsgemm_kernel_L1_M2_40 -sgemm_kernel_L1_M2_22: +.Lsgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2212,35 +2212,35 @@ sgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_22 + bgt .Lsgemm_kernel_L1_M2_22 -sgemm_kernel_L1_M2_40: +.Lsgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M2_100 + ble .Lsgemm_kernel_L1_M2_100 -sgemm_kernel_L1_M2_42: +.Lsgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M2_42 + bgt .Lsgemm_kernel_L1_M2_42 -sgemm_kernel_L1_M2_100: +.Lsgemm_kernel_L1_M2_100: SAVE2x1 -sgemm_kernel_L1_M2_END: +.Lsgemm_kernel_L1_M2_END: /******************************************************************************/ -sgemm_kernel_L1_M1_BEGIN: +.Lsgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble sgemm_kernel_L1_END + ble .Lsgemm_kernel_L1_END -sgemm_kernel_L1_M1_20: +.Lsgemm_kernel_L1_M1_20: INIT1x1 @@ -2248,9 +2248,9 @@ sgemm_kernel_L1_M1_20: asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble sgemm_kernel_L1_M1_40 + ble .Lsgemm_kernel_L1_M1_40 -sgemm_kernel_L1_M1_22: +.Lsgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2262,30 +2262,30 @@ sgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_22 + bgt .Lsgemm_kernel_L1_M1_22 -sgemm_kernel_L1_M1_40: +.Lsgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble sgemm_kernel_L1_M1_100 + ble .Lsgemm_kernel_L1_M1_100 -sgemm_kernel_L1_M1_42: +.Lsgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt sgemm_kernel_L1_M1_42 + bgt .Lsgemm_kernel_L1_M1_42 -sgemm_kernel_L1_M1_100: +.Lsgemm_kernel_L1_M1_100: SAVE1x1 -sgemm_kernel_L1_END: +.Lsgemm_kernel_L1_END: /******************************************************************************/ -sgemm_kernel_L999: +.Lsgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 77e05103d7..985a0a9a6d 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -1035,7 +1035,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -strmm_kernel_begin: +.Lstrmm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1066,11 +1066,11 @@ strmm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble strmm_kernel_L2_BEGIN + ble .Lstrmm_kernel_L2_BEGIN /******************************************************************************/ -strmm_kernel_L4_BEGIN: +.Lstrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1084,15 +1084,15 @@ strmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -strmm_kernel_L4_M16_BEGIN: +.Lstrmm_kernel_L4_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble strmm_kernel_L4_M8_BEGIN + ble .Lstrmm_kernel_L4_M8_BEGIN .align 5 -strmm_kernel_L4_M16_20: +.Lstrmm_kernel_L4_M16_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1114,7 +1114,7 @@ strmm_kernel_L4_M16_20: asr counterL , tempK, #3 cmp counterL , #2 - blt strmm_kernel_L4_M16_32 + blt .Lstrmm_kernel_L4_M16_32 KERNEL16x4_I KERNEL16x4_M2 @@ -1126,10 +1126,10 @@ strmm_kernel_L4_M16_20: KERNEL16x4_M2 subs counterL, counterL, #2 - ble strmm_kernel_L4_M16_22a + ble .Lstrmm_kernel_L4_M16_22a .align 5 -strmm_kernel_L4_M16_22: +.Lstrmm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1141,10 +1141,10 @@ strmm_kernel_L4_M16_22: KERNEL16x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M16_22 + bgt .Lstrmm_kernel_L4_M16_22 .align 5 -strmm_kernel_L4_M16_22a: +.Lstrmm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_M2 @@ -1155,13 +1155,13 @@ strmm_kernel_L4_M16_22a: KERNEL16x4_M1 KERNEL16x4_E - b strmm_kernel_L4_M16_44 + b .Lstrmm_kernel_L4_M16_44 .align 5 -strmm_kernel_L4_M16_32: +.Lstrmm_kernel_L4_M16_32: tst counterL, #1 - ble strmm_kernel_L4_M16_40 + ble .Lstrmm_kernel_L4_M16_40 KERNEL16x4_I KERNEL16x4_M2 @@ -1172,25 +1172,25 @@ strmm_kernel_L4_M16_32: KERNEL16x4_M1 KERNEL16x4_E - b strmm_kernel_L4_M16_44 + b .Lstrmm_kernel_L4_M16_44 -strmm_kernel_L4_M16_40: +.Lstrmm_kernel_L4_M16_40: INIT16x4 -strmm_kernel_L4_M16_44: +.Lstrmm_kernel_L4_M16_44: ands counterL , tempK, #7 - ble strmm_kernel_L4_M16_100 + ble .Lstrmm_kernel_L4_M16_100 .align 5 -strmm_kernel_L4_M16_46: +.Lstrmm_kernel_L4_M16_46: KERNEL16x4_SUB subs counterL, counterL, #1 - bne strmm_kernel_L4_M16_46 + bne .Lstrmm_kernel_L4_M16_46 -strmm_kernel_L4_M16_100: +.Lstrmm_kernel_L4_M16_100: SAVE16x4 @@ -1213,22 +1213,22 @@ strmm_kernel_L4_M16_100: prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] -strmm_kernel_L4_M16_END: +.Lstrmm_kernel_L4_M16_END: subs counterI, counterI, #1 - bne strmm_kernel_L4_M16_20 + bne .Lstrmm_kernel_L4_M16_20 //------------------------------------------------------------------------------ -strmm_kernel_L4_M8_BEGIN: +.Lstrmm_kernel_L4_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #8 - ble strmm_kernel_L4_M4_BEGIN + ble .Lstrmm_kernel_L4_M4_BEGIN -strmm_kernel_L4_M8_20: +.Lstrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1250,54 +1250,54 @@ strmm_kernel_L4_M8_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L4_M8_32 + blt .Lstrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L4_M8_22a + ble .Lstrmm_kernel_L4_M8_22a .align 5 -strmm_kernel_L4_M8_22: +.Lstrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M8_22 + bgt .Lstrmm_kernel_L4_M8_22 -strmm_kernel_L4_M8_22a: +.Lstrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b strmm_kernel_L4_M8_44 + b .Lstrmm_kernel_L4_M8_44 -strmm_kernel_L4_M8_32: +.Lstrmm_kernel_L4_M8_32: tst counterL, #1 - ble strmm_kernel_L4_M8_40 + ble .Lstrmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b strmm_kernel_L4_M8_44 + b .Lstrmm_kernel_L4_M8_44 -strmm_kernel_L4_M8_40: +.Lstrmm_kernel_L4_M8_40: INIT8x4 -strmm_kernel_L4_M8_44: +.Lstrmm_kernel_L4_M8_44: ands counterL , tempK, #1 - ble strmm_kernel_L4_M8_100 + ble .Lstrmm_kernel_L4_M8_100 -strmm_kernel_L4_M8_46: +.Lstrmm_kernel_L4_M8_46: KERNEL8x4_SUB -strmm_kernel_L4_M8_100: +.Lstrmm_kernel_L4_M8_100: SAVE8x4 @@ -1317,20 +1317,20 @@ strmm_kernel_L4_M8_100: add tempOffset, tempOffset, #8 #endif -strmm_kernel_L4_M8_END: +.Lstrmm_kernel_L4_M8_END: //------------------------------------------------------------------------------ -strmm_kernel_L4_M4_BEGIN: +.Lstrmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #4 - ble strmm_kernel_L4_M2_BEGIN + ble .Lstrmm_kernel_L4_M2_BEGIN -strmm_kernel_L4_M4_20: +.Lstrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1350,54 +1350,54 @@ strmm_kernel_L4_M4_20: #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L4_M4_32 + blt .Lstrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L4_M4_22a + ble .Lstrmm_kernel_L4_M4_22a .align 5 -strmm_kernel_L4_M4_22: +.Lstrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M4_22 + bgt .Lstrmm_kernel_L4_M4_22 -strmm_kernel_L4_M4_22a: +.Lstrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_32: +.Lstrmm_kernel_L4_M4_32: tst counterL, #1 - ble strmm_kernel_L4_M4_40 + ble .Lstrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_40: +.Lstrmm_kernel_L4_M4_40: INIT4x4 -strmm_kernel_L4_M4_44: +.Lstrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble strmm_kernel_L4_M4_100 + ble .Lstrmm_kernel_L4_M4_100 -strmm_kernel_L4_M4_46: +.Lstrmm_kernel_L4_M4_46: KERNEL4x4_SUB -strmm_kernel_L4_M4_100: +.Lstrmm_kernel_L4_M4_100: SAVE4x4 @@ -1415,20 +1415,20 @@ strmm_kernel_L4_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L4_M4_END: +.Lstrmm_kernel_L4_M4_END: //------------------------------------------------------------------------------ -strmm_kernel_L4_M2_BEGIN: +.Lstrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L4_M1_BEGIN + ble .Lstrmm_kernel_L4_M1_BEGIN -strmm_kernel_L4_M2_20: +.Lstrmm_kernel_L4_M2_20: INIT2x4 @@ -1451,9 +1451,9 @@ strmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M2_40 + ble .Lstrmm_kernel_L4_M2_40 -strmm_kernel_L4_M2_22: +.Lstrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1466,22 +1466,22 @@ strmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_22 + bgt .Lstrmm_kernel_L4_M2_22 -strmm_kernel_L4_M2_40: +.Lstrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M2_100 + ble .Lstrmm_kernel_L4_M2_100 -strmm_kernel_L4_M2_42: +.Lstrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_42 + bgt .Lstrmm_kernel_L4_M2_42 -strmm_kernel_L4_M2_100: +.Lstrmm_kernel_L4_M2_100: SAVE2x4 @@ -1500,15 +1500,15 @@ strmm_kernel_L4_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -strmm_kernel_L4_M2_END: +.Lstrmm_kernel_L4_M2_END: -strmm_kernel_L4_M1_BEGIN: +.Lstrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END -strmm_kernel_L4_M1_20: +.Lstrmm_kernel_L4_M1_20: INIT1x4 @@ -1531,9 +1531,9 @@ strmm_kernel_L4_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M1_40 + ble .Lstrmm_kernel_L4_M1_40 -strmm_kernel_L4_M1_22: +.Lstrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1545,22 +1545,22 @@ strmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_22 + bgt .Lstrmm_kernel_L4_M1_22 -strmm_kernel_L4_M1_40: +.Lstrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M1_100 + ble .Lstrmm_kernel_L4_M1_100 -strmm_kernel_L4_M1_42: +.Lstrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_42 + bgt .Lstrmm_kernel_L4_M1_42 -strmm_kernel_L4_M1_100: +.Lstrmm_kernel_L4_M1_100: SAVE1x4 @@ -1579,26 +1579,26 @@ strmm_kernel_L4_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -strmm_kernel_L4_END: +.Lstrmm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 #if !defined(LEFT) add tempOffset, tempOffset, #4 #endif subs counterJ, counterJ , #1 // j-- - bgt strmm_kernel_L4_BEGIN + bgt .Lstrmm_kernel_L4_BEGIN /******************************************************************************/ -strmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble strmm_kernel_L999 + ble .Lstrmm_kernel_L999 tst counterJ , #2 - ble strmm_kernel_L1_BEGIN + ble .Lstrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1609,14 +1609,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction #endif mov pA, origPA // pA = A -strmm_kernel_L2_M16_BEGIN: +.Lstrmm_kernel_L2_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI,#0 - ble strmm_kernel_L2_M8_BEGIN + ble .Lstrmm_kernel_L2_M8_BEGIN -strmm_kernel_L2_M16_20: +.Lstrmm_kernel_L2_M16_20: INIT16x2 @@ -1640,10 +1640,10 @@ strmm_kernel_L2_M16_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M16_40 + ble .Lstrmm_kernel_L2_M16_40 .align 5 -strmm_kernel_L2_M16_22: +.Lstrmm_kernel_L2_M16_22: KERNEL16x2_SUB KERNEL16x2_SUB KERNEL16x2_SUB @@ -1655,22 +1655,22 @@ strmm_kernel_L2_M16_22: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M16_22 + bgt .Lstrmm_kernel_L2_M16_22 -strmm_kernel_L2_M16_40: +.Lstrmm_kernel_L2_M16_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M16_100 + ble .Lstrmm_kernel_L2_M16_100 -strmm_kernel_L2_M16_42: +.Lstrmm_kernel_L2_M16_42: KERNEL16x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M16_42 + bgt .Lstrmm_kernel_L2_M16_42 -strmm_kernel_L2_M16_100: +.Lstrmm_kernel_L2_M16_100: SAVE16x2 @@ -1690,22 +1690,22 @@ strmm_kernel_L2_M16_100: add tempOffset, tempOffset, #16 #endif -strmm_kernel_L2_M16_END: +.Lstrmm_kernel_L2_M16_END: subs counterI, counterI, #1 - bgt strmm_kernel_L2_M16_20 + bgt .Lstrmm_kernel_L2_M16_20 //------------------------------------------------------------------------------ -strmm_kernel_L2_M8_BEGIN: +.Lstrmm_kernel_L2_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #8 - ble strmm_kernel_L2_M4_BEGIN + ble .Lstrmm_kernel_L2_M4_BEGIN -strmm_kernel_L2_M8_20: +.Lstrmm_kernel_L2_M8_20: INIT8x2 @@ -1729,10 +1729,10 @@ strmm_kernel_L2_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M8_40 + ble .Lstrmm_kernel_L2_M8_40 .align 5 -strmm_kernel_L2_M8_22: +.Lstrmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -1744,22 +1744,22 @@ strmm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M8_22 + bgt .Lstrmm_kernel_L2_M8_22 -strmm_kernel_L2_M8_40: +.Lstrmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M8_100 + ble .Lstrmm_kernel_L2_M8_100 -strmm_kernel_L2_M8_42: +.Lstrmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M8_42 + bgt .Lstrmm_kernel_L2_M8_42 -strmm_kernel_L2_M8_100: +.Lstrmm_kernel_L2_M8_100: SAVE8x2 @@ -1779,19 +1779,19 @@ strmm_kernel_L2_M8_100: add tempOffset, tempOffset, #8 #endif -strmm_kernel_L2_M8_END: +.Lstrmm_kernel_L2_M8_END: //------------------------------------------------------------------------------ -strmm_kernel_L2_M4_BEGIN: +.Lstrmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #4 - ble strmm_kernel_L2_M2_BEGIN + ble .Lstrmm_kernel_L2_M2_BEGIN -strmm_kernel_L2_M4_20: +.Lstrmm_kernel_L2_M4_20: INIT4x2 @@ -1814,10 +1814,10 @@ strmm_kernel_L2_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M4_40 + ble .Lstrmm_kernel_L2_M4_40 .align 5 -strmm_kernel_L2_M4_22: +.Lstrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1829,22 +1829,22 @@ strmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_22 + bgt .Lstrmm_kernel_L2_M4_22 -strmm_kernel_L2_M4_40: +.Lstrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M4_100 + ble .Lstrmm_kernel_L2_M4_100 -strmm_kernel_L2_M4_42: +.Lstrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_42 + bgt .Lstrmm_kernel_L2_M4_42 -strmm_kernel_L2_M4_100: +.Lstrmm_kernel_L2_M4_100: SAVE4x2 @@ -1863,21 +1863,21 @@ strmm_kernel_L2_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L2_M4_END: +.Lstrmm_kernel_L2_M4_END: //------------------------------------------------------------------------------ -strmm_kernel_L2_M2_BEGIN: +.Lstrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L2_M1_BEGIN + ble .Lstrmm_kernel_L2_M1_BEGIN -strmm_kernel_L2_M2_20: +.Lstrmm_kernel_L2_M2_20: INIT2x2 @@ -1900,9 +1900,9 @@ strmm_kernel_L2_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M2_40 + ble .Lstrmm_kernel_L2_M2_40 -strmm_kernel_L2_M2_22: +.Lstrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1915,22 +1915,22 @@ strmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_22 + bgt .Lstrmm_kernel_L2_M2_22 -strmm_kernel_L2_M2_40: +.Lstrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M2_100 + ble .Lstrmm_kernel_L2_M2_100 -strmm_kernel_L2_M2_42: +.Lstrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_42 + bgt .Lstrmm_kernel_L2_M2_42 -strmm_kernel_L2_M2_100: +.Lstrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1949,15 +1949,15 @@ strmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -strmm_kernel_L2_M2_END: +.Lstrmm_kernel_L2_M2_END: -strmm_kernel_L2_M1_BEGIN: +.Lstrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END -strmm_kernel_L2_M1_20: +.Lstrmm_kernel_L2_M1_20: INIT1x2 @@ -1980,9 +1980,9 @@ strmm_kernel_L2_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble strmm_kernel_L2_M1_40 + ble .Lstrmm_kernel_L2_M1_40 -strmm_kernel_L2_M1_22: +.Lstrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1994,22 +1994,22 @@ strmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_22 + bgt .Lstrmm_kernel_L2_M1_22 -strmm_kernel_L2_M1_40: +.Lstrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M1_100 + ble .Lstrmm_kernel_L2_M1_100 -strmm_kernel_L2_M1_42: +.Lstrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_42 + bgt .Lstrmm_kernel_L2_M1_42 -strmm_kernel_L2_M1_100: +.Lstrmm_kernel_L2_M1_100: SAVE1x2 @@ -2028,7 +2028,7 @@ strmm_kernel_L2_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -strmm_kernel_L2_END: +.Lstrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -2036,11 +2036,11 @@ strmm_kernel_L2_END: /******************************************************************************/ -strmm_kernel_L1_BEGIN: +.Lstrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble strmm_kernel_L999 // done + ble .Lstrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -2051,14 +2051,14 @@ strmm_kernel_L1_BEGIN: #endif mov pA, origPA // pA = A -strmm_kernel_L1_M16_BEGIN: +.Lstrmm_kernel_L1_M16_BEGIN: mov counterI, origM asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 - ble strmm_kernel_L1_M8_BEGIN + ble .Lstrmm_kernel_L1_M8_BEGIN -strmm_kernel_L1_M16_20: +.Lstrmm_kernel_L1_M16_20: INIT16x1 @@ -2082,10 +2082,10 @@ strmm_kernel_L1_M16_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M16_40 + ble .Lstrmm_kernel_L1_M16_40 .align 5 -strmm_kernel_L1_M16_22: +.Lstrmm_kernel_L1_M16_22: KERNEL16x1_SUB KERNEL16x1_SUB KERNEL16x1_SUB @@ -2097,22 +2097,22 @@ strmm_kernel_L1_M16_22: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M16_22 + bgt .Lstrmm_kernel_L1_M16_22 -strmm_kernel_L1_M16_40: +.Lstrmm_kernel_L1_M16_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M16_100 + ble .Lstrmm_kernel_L1_M16_100 -strmm_kernel_L1_M16_42: +.Lstrmm_kernel_L1_M16_42: KERNEL16x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M16_42 + bgt .Lstrmm_kernel_L1_M16_42 -strmm_kernel_L1_M16_100: +.Lstrmm_kernel_L1_M16_100: SAVE16x1 @@ -2132,23 +2132,23 @@ strmm_kernel_L1_M16_100: add tempOffset, tempOffset, #16 #endif -strmm_kernel_L1_M16_END: +.Lstrmm_kernel_L1_M16_END: subs counterI, counterI, #1 - bgt strmm_kernel_L1_M16_20 + bgt .Lstrmm_kernel_L1_M16_20 //------------------------------------------------------------------------------ -strmm_kernel_L1_M8_BEGIN: +.Lstrmm_kernel_L1_M8_BEGIN: mov counterI, origM tst counterI , #15 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #8 - ble strmm_kernel_L1_M4_BEGIN + ble .Lstrmm_kernel_L1_M4_BEGIN -strmm_kernel_L1_M8_20: +.Lstrmm_kernel_L1_M8_20: INIT8x1 @@ -2172,10 +2172,10 @@ strmm_kernel_L1_M8_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M8_40 + ble .Lstrmm_kernel_L1_M8_40 .align 5 -strmm_kernel_L1_M8_22: +.Lstrmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -2187,22 +2187,22 @@ strmm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M8_22 + bgt .Lstrmm_kernel_L1_M8_22 -strmm_kernel_L1_M8_40: +.Lstrmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M8_100 + ble .Lstrmm_kernel_L1_M8_100 -strmm_kernel_L1_M8_42: +.Lstrmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M8_42 + bgt .Lstrmm_kernel_L1_M8_42 -strmm_kernel_L1_M8_100: +.Lstrmm_kernel_L1_M8_100: SAVE8x1 @@ -2222,19 +2222,19 @@ strmm_kernel_L1_M8_100: add tempOffset, tempOffset, #8 #endif -strmm_kernel_L1_M8_END: +.Lstrmm_kernel_L1_M8_END: //------------------------------------------------------------------------------ -strmm_kernel_L1_M4_BEGIN: +.Lstrmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #4 - ble strmm_kernel_L1_M2_BEGIN + ble .Lstrmm_kernel_L1_M2_BEGIN -strmm_kernel_L1_M4_20: +.Lstrmm_kernel_L1_M4_20: INIT4x1 @@ -2257,10 +2257,10 @@ strmm_kernel_L1_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M4_40 + ble .Lstrmm_kernel_L1_M4_40 .align 5 -strmm_kernel_L1_M4_22: +.Lstrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2272,22 +2272,22 @@ strmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_22 + bgt .Lstrmm_kernel_L1_M4_22 -strmm_kernel_L1_M4_40: +.Lstrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M4_100 + ble .Lstrmm_kernel_L1_M4_100 -strmm_kernel_L1_M4_42: +.Lstrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_42 + bgt .Lstrmm_kernel_L1_M4_42 -strmm_kernel_L1_M4_100: +.Lstrmm_kernel_L1_M4_100: SAVE4x1 @@ -2306,20 +2306,20 @@ strmm_kernel_L1_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L1_M4_END: +.Lstrmm_kernel_L1_M4_END: //------------------------------------------------------------------------------ -strmm_kernel_L1_M2_BEGIN: +.Lstrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L1_M1_BEGIN + ble .Lstrmm_kernel_L1_M1_BEGIN -strmm_kernel_L1_M2_20: +.Lstrmm_kernel_L1_M2_20: INIT2x1 @@ -2342,9 +2342,9 @@ strmm_kernel_L1_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M2_40 + ble .Lstrmm_kernel_L1_M2_40 -strmm_kernel_L1_M2_22: +.Lstrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2357,22 +2357,22 @@ strmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_22 + bgt .Lstrmm_kernel_L1_M2_22 -strmm_kernel_L1_M2_40: +.Lstrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M2_100 + ble .Lstrmm_kernel_L1_M2_100 -strmm_kernel_L1_M2_42: +.Lstrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_42 + bgt .Lstrmm_kernel_L1_M2_42 -strmm_kernel_L1_M2_100: +.Lstrmm_kernel_L1_M2_100: SAVE2x1 @@ -2391,15 +2391,15 @@ strmm_kernel_L1_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -strmm_kernel_L1_M2_END: +.Lstrmm_kernel_L1_M2_END: -strmm_kernel_L1_M1_BEGIN: +.Lstrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END -strmm_kernel_L1_M1_20: +.Lstrmm_kernel_L1_M1_20: INIT1x1 @@ -2422,9 +2422,9 @@ strmm_kernel_L1_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M1_40 + ble .Lstrmm_kernel_L1_M1_40 -strmm_kernel_L1_M1_22: +.Lstrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2436,28 +2436,28 @@ strmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_22 + bgt .Lstrmm_kernel_L1_M1_22 -strmm_kernel_L1_M1_40: +.Lstrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M1_100 + ble .Lstrmm_kernel_L1_M1_100 -strmm_kernel_L1_M1_42: +.Lstrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_42 + bgt .Lstrmm_kernel_L1_M1_42 -strmm_kernel_L1_M1_100: +.Lstrmm_kernel_L1_M1_100: SAVE1x1 -strmm_kernel_L1_END: +.Lstrmm_kernel_L1_END: -strmm_kernel_L999: +.Lstrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/strmm_kernel_4x4.S b/kernel/arm64/strmm_kernel_4x4.S index eeb3e6e72c..5f7818c40a 100644 --- a/kernel/arm64/strmm_kernel_4x4.S +++ b/kernel/arm64/strmm_kernel_4x4.S @@ -507,7 +507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -strmm_kernel_begin: +.Lstrmm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -539,11 +539,11 @@ strmm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble strmm_kernel_L2_BEGIN + ble .Lstrmm_kernel_L2_BEGIN /******************************************************************************/ -strmm_kernel_L4_BEGIN: +.Lstrmm_kernel_L4_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #2 @@ -553,14 +553,14 @@ strmm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -strmm_kernel_L4_M4_BEGIN: +.Lstrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble strmm_kernel_L4_M2_BEGIN + ble .Lstrmm_kernel_L4_M2_BEGIN -strmm_kernel_L4_M4_20: +.Lstrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -581,54 +581,54 @@ strmm_kernel_L4_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L4_M4_32 + blt .Lstrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L4_M4_22a + ble .Lstrmm_kernel_L4_M4_22a .align 5 -strmm_kernel_L4_M4_22: +.Lstrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M4_22 + bgt .Lstrmm_kernel_L4_M4_22 -strmm_kernel_L4_M4_22a: +.Lstrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_32: +.Lstrmm_kernel_L4_M4_32: tst counterL, #1 - ble strmm_kernel_L4_M4_40 + ble .Lstrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_40: +.Lstrmm_kernel_L4_M4_40: INIT4x4 -strmm_kernel_L4_M4_44: +.Lstrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble strmm_kernel_L4_M4_100 + ble .Lstrmm_kernel_L4_M4_100 -strmm_kernel_L4_M4_46: +.Lstrmm_kernel_L4_M4_46: KERNEL4x4_SUB -strmm_kernel_L4_M4_100: +.Lstrmm_kernel_L4_M4_100: SAVE4x4 @@ -647,20 +647,20 @@ strmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif -strmm_kernel_L4_M4_END: +.Lstrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne strmm_kernel_L4_M4_20 + bne .Lstrmm_kernel_L4_M4_20 -strmm_kernel_L4_M2_BEGIN: +.Lstrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L4_M1_BEGIN + ble .Lstrmm_kernel_L4_M1_BEGIN -strmm_kernel_L4_M2_20: +.Lstrmm_kernel_L4_M2_20: INIT2x4 @@ -684,9 +684,9 @@ strmm_kernel_L4_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M2_40 + ble .Lstrmm_kernel_L4_M2_40 -strmm_kernel_L4_M2_22: +.Lstrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -699,22 +699,22 @@ strmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_22 + bgt .Lstrmm_kernel_L4_M2_22 -strmm_kernel_L4_M2_40: +.Lstrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M2_100 + ble .Lstrmm_kernel_L4_M2_100 -strmm_kernel_L4_M2_42: +.Lstrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_42 + bgt .Lstrmm_kernel_L4_M2_42 -strmm_kernel_L4_M2_100: +.Lstrmm_kernel_L4_M2_100: SAVE2x4 @@ -735,15 +735,15 @@ strmm_kernel_L4_M2_100: #endif -strmm_kernel_L4_M2_END: +.Lstrmm_kernel_L4_M2_END: -strmm_kernel_L4_M1_BEGIN: +.Lstrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END -strmm_kernel_L4_M1_20: +.Lstrmm_kernel_L4_M1_20: INIT1x4 @@ -767,9 +767,9 @@ strmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M1_40 + ble .Lstrmm_kernel_L4_M1_40 -strmm_kernel_L4_M1_22: +.Lstrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -781,22 +781,22 @@ strmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_22 + bgt .Lstrmm_kernel_L4_M1_22 -strmm_kernel_L4_M1_40: +.Lstrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M1_100 + ble .Lstrmm_kernel_L4_M1_100 -strmm_kernel_L4_M1_42: +.Lstrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_42 + bgt .Lstrmm_kernel_L4_M1_42 -strmm_kernel_L4_M1_100: +.Lstrmm_kernel_L4_M1_100: SAVE1x4 @@ -817,7 +817,7 @@ strmm_kernel_L4_M1_100: #endif -strmm_kernel_L4_END: +.Lstrmm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 #if !defined(LEFT) @@ -825,19 +825,19 @@ strmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt strmm_kernel_L4_BEGIN + bgt .Lstrmm_kernel_L4_BEGIN /******************************************************************************/ -strmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble strmm_kernel_L999 + ble .Lstrmm_kernel_L999 tst counterJ , #2 - ble strmm_kernel_L1_BEGIN + ble .Lstrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -849,14 +849,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -strmm_kernel_L2_M4_BEGIN: +.Lstrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble strmm_kernel_L2_M2_BEGIN + ble .Lstrmm_kernel_L2_M2_BEGIN -strmm_kernel_L2_M4_20: +.Lstrmm_kernel_L2_M4_20: INIT4x2 @@ -880,10 +880,10 @@ strmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M4_40 + ble .Lstrmm_kernel_L2_M4_40 .align 5 -strmm_kernel_L2_M4_22: +.Lstrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -895,22 +895,22 @@ strmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_22 + bgt .Lstrmm_kernel_L2_M4_22 -strmm_kernel_L2_M4_40: +.Lstrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M4_100 + ble .Lstrmm_kernel_L2_M4_100 -strmm_kernel_L2_M4_42: +.Lstrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_42 + bgt .Lstrmm_kernel_L2_M4_42 -strmm_kernel_L2_M4_100: +.Lstrmm_kernel_L2_M4_100: SAVE4x2 @@ -930,22 +930,22 @@ strmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -strmm_kernel_L2_M4_END: +.Lstrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt strmm_kernel_L2_M4_20 + bgt .Lstrmm_kernel_L2_M4_20 -strmm_kernel_L2_M2_BEGIN: +.Lstrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L2_M1_BEGIN + ble .Lstrmm_kernel_L2_M1_BEGIN -strmm_kernel_L2_M2_20: +.Lstrmm_kernel_L2_M2_20: INIT2x2 @@ -969,9 +969,9 @@ strmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M2_40 + ble .Lstrmm_kernel_L2_M2_40 -strmm_kernel_L2_M2_22: +.Lstrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -984,22 +984,22 @@ strmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_22 + bgt .Lstrmm_kernel_L2_M2_22 -strmm_kernel_L2_M2_40: +.Lstrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M2_100 + ble .Lstrmm_kernel_L2_M2_100 -strmm_kernel_L2_M2_42: +.Lstrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_42 + bgt .Lstrmm_kernel_L2_M2_42 -strmm_kernel_L2_M2_100: +.Lstrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1018,15 +1018,15 @@ strmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -strmm_kernel_L2_M2_END: +.Lstrmm_kernel_L2_M2_END: -strmm_kernel_L2_M1_BEGIN: +.Lstrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END -strmm_kernel_L2_M1_20: +.Lstrmm_kernel_L2_M1_20: INIT1x2 @@ -1050,9 +1050,9 @@ strmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble strmm_kernel_L2_M1_40 + ble .Lstrmm_kernel_L2_M1_40 -strmm_kernel_L2_M1_22: +.Lstrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1064,22 +1064,22 @@ strmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_22 + bgt .Lstrmm_kernel_L2_M1_22 -strmm_kernel_L2_M1_40: +.Lstrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M1_100 + ble .Lstrmm_kernel_L2_M1_100 -strmm_kernel_L2_M1_42: +.Lstrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_42 + bgt .Lstrmm_kernel_L2_M1_42 -strmm_kernel_L2_M1_100: +.Lstrmm_kernel_L2_M1_100: SAVE1x2 @@ -1099,7 +1099,7 @@ strmm_kernel_L2_M1_100: add tempOffset, tempOffset, #1 #endif -strmm_kernel_L2_END: +.Lstrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1107,11 +1107,11 @@ strmm_kernel_L2_END: /******************************************************************************/ -strmm_kernel_L1_BEGIN: +.Lstrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble strmm_kernel_L999 // done + ble .Lstrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1123,14 +1123,14 @@ strmm_kernel_L1_BEGIN: mov pA, origPA // pA = A -strmm_kernel_L1_M4_BEGIN: +.Lstrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble strmm_kernel_L1_M2_BEGIN + ble .Lstrmm_kernel_L1_M2_BEGIN -strmm_kernel_L1_M4_20: +.Lstrmm_kernel_L1_M4_20: INIT4x1 @@ -1154,10 +1154,10 @@ strmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M4_40 + ble .Lstrmm_kernel_L1_M4_40 .align 5 -strmm_kernel_L1_M4_22: +.Lstrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1169,22 +1169,22 @@ strmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_22 + bgt .Lstrmm_kernel_L1_M4_22 -strmm_kernel_L1_M4_40: +.Lstrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M4_100 + ble .Lstrmm_kernel_L1_M4_100 -strmm_kernel_L1_M4_42: +.Lstrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_42 + bgt .Lstrmm_kernel_L1_M4_42 -strmm_kernel_L1_M4_100: +.Lstrmm_kernel_L1_M4_100: SAVE4x1 @@ -1204,22 +1204,22 @@ strmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -strmm_kernel_L1_M4_END: +.Lstrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt strmm_kernel_L1_M4_20 + bgt .Lstrmm_kernel_L1_M4_20 -strmm_kernel_L1_M2_BEGIN: +.Lstrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L1_M1_BEGIN + ble .Lstrmm_kernel_L1_M1_BEGIN -strmm_kernel_L1_M2_20: +.Lstrmm_kernel_L1_M2_20: INIT2x1 @@ -1243,9 +1243,9 @@ strmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M2_40 + ble .Lstrmm_kernel_L1_M2_40 -strmm_kernel_L1_M2_22: +.Lstrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1258,22 +1258,22 @@ strmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_22 + bgt .Lstrmm_kernel_L1_M2_22 -strmm_kernel_L1_M2_40: +.Lstrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M2_100 + ble .Lstrmm_kernel_L1_M2_100 -strmm_kernel_L1_M2_42: +.Lstrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_42 + bgt .Lstrmm_kernel_L1_M2_42 -strmm_kernel_L1_M2_100: +.Lstrmm_kernel_L1_M2_100: SAVE2x1 @@ -1294,15 +1294,15 @@ strmm_kernel_L1_M2_100: #endif -strmm_kernel_L1_M2_END: +.Lstrmm_kernel_L1_M2_END: -strmm_kernel_L1_M1_BEGIN: +.Lstrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END -strmm_kernel_L1_M1_20: +.Lstrmm_kernel_L1_M1_20: INIT1x1 @@ -1326,9 +1326,9 @@ strmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M1_40 + ble .Lstrmm_kernel_L1_M1_40 -strmm_kernel_L1_M1_22: +.Lstrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1340,22 +1340,22 @@ strmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_22 + bgt .Lstrmm_kernel_L1_M1_22 -strmm_kernel_L1_M1_40: +.Lstrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M1_100 + ble .Lstrmm_kernel_L1_M1_100 -strmm_kernel_L1_M1_42: +.Lstrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_42 + bgt .Lstrmm_kernel_L1_M1_42 -strmm_kernel_L1_M1_100: +.Lstrmm_kernel_L1_M1_100: SAVE1x1 @@ -1377,7 +1377,7 @@ strmm_kernel_L1_M1_100: #endif #endif -strmm_kernel_L1_END: +.Lstrmm_kernel_L1_END: #if 0 #if !defined(LEFT) @@ -1385,7 +1385,7 @@ strmm_kernel_L1_END: #endif #endif -strmm_kernel_L999: +.Lstrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S index 843f0c890a..cd18e6847d 100644 --- a/kernel/arm64/strmm_kernel_8x8.S +++ b/kernel/arm64/strmm_kernel_8x8.S @@ -1257,7 +1257,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE -strmm_kernel_begin: +.Lstrmm_kernel_begin: .align 5 add sp, sp, #-(11 * 16) @@ -1288,12 +1288,12 @@ strmm_kernel_begin: mov counterJ, origN asr counterJ, counterJ, #3 // J = J / 8 cmp counterJ, #0 - ble strmm_kernel_L4_BEGIN + ble .Lstrmm_kernel_L4_BEGIN /******************************************************************************/ /******************************************************************************/ -strmm_kernel_L8_BEGIN: +.Lstrmm_kernel_L8_BEGIN: mov pCRow0, pC // pCRow0 = C add pC, pC, LDC, lsl #3 @@ -1305,14 +1305,14 @@ strmm_kernel_L8_BEGIN: /******************************************************************************/ -strmm_kernel_L8_M8_BEGIN: +.Lstrmm_kernel_L8_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble strmm_kernel_L8_M4_BEGIN + ble .Lstrmm_kernel_L8_M4_BEGIN -strmm_kernel_L8_M8_20: +.Lstrmm_kernel_L8_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1333,54 +1333,54 @@ strmm_kernel_L8_M8_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L8_M8_32 + blt .Lstrmm_kernel_L8_M8_32 KERNEL8x8_I // do one in the K KERNEL8x8_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L8_M8_22a + ble .Lstrmm_kernel_L8_M8_22a .align 5 -strmm_kernel_L8_M8_22: +.Lstrmm_kernel_L8_M8_22: KERNEL8x8_M1 KERNEL8x8_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L8_M8_22 + bgt .Lstrmm_kernel_L8_M8_22 -strmm_kernel_L8_M8_22a: +.Lstrmm_kernel_L8_M8_22a: KERNEL8x8_M1 KERNEL8x8_E - b strmm_kernel_L8_M8_44 + b .Lstrmm_kernel_L8_M8_44 -strmm_kernel_L8_M8_32: +.Lstrmm_kernel_L8_M8_32: tst counterL, #1 - ble strmm_kernel_L8_M8_40 + ble .Lstrmm_kernel_L8_M8_40 KERNEL8x8_I KERNEL8x8_E - b strmm_kernel_L8_M8_44 + b .Lstrmm_kernel_L8_M8_44 -strmm_kernel_L8_M8_40: +.Lstrmm_kernel_L8_M8_40: INIT8x8 -strmm_kernel_L8_M8_44: +.Lstrmm_kernel_L8_M8_44: ands counterL , tempK, #1 - ble strmm_kernel_L8_M8_100 + ble .Lstrmm_kernel_L8_M8_100 -strmm_kernel_L8_M8_46: +.Lstrmm_kernel_L8_M8_46: KERNEL8x8_SUB -strmm_kernel_L8_M8_100: +.Lstrmm_kernel_L8_M8_100: SAVE8x8 @@ -1399,22 +1399,22 @@ strmm_kernel_L8_M8_100: add tempOffset, tempOffset, #8 #endif -strmm_kernel_L8_M8_END: +.Lstrmm_kernel_L8_M8_END: subs counterI, counterI, #1 - bne strmm_kernel_L8_M8_20 + bne .Lstrmm_kernel_L8_M8_20 /******************************************************************************/ -strmm_kernel_L8_M4_BEGIN: +.Lstrmm_kernel_L8_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L8_END + ble .Lstrmm_kernel_L8_END tst counterI, #4 - ble strmm_kernel_L8_M2_BEGIN + ble .Lstrmm_kernel_L8_M2_BEGIN -strmm_kernel_L8_M4_20: +.Lstrmm_kernel_L8_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1436,54 +1436,54 @@ strmm_kernel_L8_M4_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L8_M4_32 + blt .Lstrmm_kernel_L8_M4_32 KERNEL4x8_I // do one in the K KERNEL4x8_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L8_M4_22a + ble .Lstrmm_kernel_L8_M4_22a .align 5 -strmm_kernel_L8_M4_22: +.Lstrmm_kernel_L8_M4_22: KERNEL4x8_M1 KERNEL4x8_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L8_M4_22 + bgt .Lstrmm_kernel_L8_M4_22 -strmm_kernel_L8_M4_22a: +.Lstrmm_kernel_L8_M4_22a: KERNEL4x8_M1 KERNEL4x8_E - b strmm_kernel_L8_M4_44 + b .Lstrmm_kernel_L8_M4_44 -strmm_kernel_L8_M4_32: +.Lstrmm_kernel_L8_M4_32: tst counterL, #1 - ble strmm_kernel_L8_M4_40 + ble .Lstrmm_kernel_L8_M4_40 KERNEL4x8_I KERNEL4x8_E - b strmm_kernel_L8_M4_44 + b .Lstrmm_kernel_L8_M4_44 -strmm_kernel_L8_M4_40: +.Lstrmm_kernel_L8_M4_40: INIT4x8 -strmm_kernel_L8_M4_44: +.Lstrmm_kernel_L8_M4_44: ands counterL , tempK, #1 - ble strmm_kernel_L8_M4_100 + ble .Lstrmm_kernel_L8_M4_100 -strmm_kernel_L8_M4_46: +.Lstrmm_kernel_L8_M4_46: KERNEL4x8_SUB -strmm_kernel_L8_M4_100: +.Lstrmm_kernel_L8_M4_100: SAVE4x8 @@ -1503,20 +1503,20 @@ strmm_kernel_L8_M4_100: add tempOffset, tempOffset, #4 #endif -strmm_kernel_L8_M4_END: +.Lstrmm_kernel_L8_M4_END: /******************************************************************************/ -strmm_kernel_L8_M2_BEGIN: +.Lstrmm_kernel_L8_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L8_END + ble .Lstrmm_kernel_L8_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L8_M1_BEGIN + ble .Lstrmm_kernel_L8_M1_BEGIN -strmm_kernel_L8_M2_20: +.Lstrmm_kernel_L8_M2_20: INIT2x8 @@ -1540,9 +1540,9 @@ strmm_kernel_L8_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L8_M2_40 + ble .Lstrmm_kernel_L8_M2_40 -strmm_kernel_L8_M2_22: +.Lstrmm_kernel_L8_M2_22: KERNEL2x8_SUB KERNEL2x8_SUB @@ -1555,22 +1555,22 @@ strmm_kernel_L8_M2_22: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L8_M2_22 + bgt .Lstrmm_kernel_L8_M2_22 -strmm_kernel_L8_M2_40: +.Lstrmm_kernel_L8_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L8_M2_100 + ble .Lstrmm_kernel_L8_M2_100 -strmm_kernel_L8_M2_42: +.Lstrmm_kernel_L8_M2_42: KERNEL2x8_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L8_M2_42 + bgt .Lstrmm_kernel_L8_M2_42 -strmm_kernel_L8_M2_100: +.Lstrmm_kernel_L8_M2_100: SAVE2x8 @@ -1590,16 +1590,16 @@ strmm_kernel_L8_M2_100: add tempOffset, tempOffset, #2 #endif -strmm_kernel_L8_M2_END: +.Lstrmm_kernel_L8_M2_END: /******************************************************************************/ -strmm_kernel_L8_M1_BEGIN: +.Lstrmm_kernel_L8_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L8_END + ble .Lstrmm_kernel_L8_END -strmm_kernel_L8_M1_20: +.Lstrmm_kernel_L8_M1_20: INIT1x8 @@ -1623,9 +1623,9 @@ strmm_kernel_L8_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L8_M1_40 + ble .Lstrmm_kernel_L8_M1_40 -strmm_kernel_L8_M1_22: +.Lstrmm_kernel_L8_M1_22: KERNEL1x8_SUB KERNEL1x8_SUB KERNEL1x8_SUB @@ -1637,22 +1637,22 @@ strmm_kernel_L8_M1_22: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L8_M1_22 + bgt .Lstrmm_kernel_L8_M1_22 -strmm_kernel_L8_M1_40: +.Lstrmm_kernel_L8_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L8_M1_100 + ble .Lstrmm_kernel_L8_M1_100 -strmm_kernel_L8_M1_42: +.Lstrmm_kernel_L8_M1_42: KERNEL1x8_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L8_M1_42 + bgt .Lstrmm_kernel_L8_M1_42 -strmm_kernel_L8_M1_100: +.Lstrmm_kernel_L8_M1_100: SAVE1x8 @@ -1672,7 +1672,7 @@ strmm_kernel_L8_M1_100: add tempOffset, tempOffset, #1 #endif -strmm_kernel_L8_END: +.Lstrmm_kernel_L8_END: lsl temp, origK, #5 // B = B + K * 4 * 8 add origPB, origPB, temp @@ -1681,19 +1681,19 @@ strmm_kernel_L8_END: #endif subs counterJ, counterJ , #1 // j-- - bgt strmm_kernel_L8_BEGIN + bgt .Lstrmm_kernel_L8_BEGIN /******************************************************************************/ /******************************************************************************/ -strmm_kernel_L4_BEGIN: +.Lstrmm_kernel_L4_BEGIN: mov counterJ , origN tst counterJ , #7 - ble strmm_kernel_L999 + ble .Lstrmm_kernel_L999 tst counterJ , #4 - ble strmm_kernel_L2_BEGIN + ble .Lstrmm_kernel_L2_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1707,14 +1707,14 @@ strmm_kernel_L4_BEGIN: /******************************************************************************/ -strmm_kernel_L4_M8_BEGIN: +.Lstrmm_kernel_L4_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI, #0 - ble strmm_kernel_L4_M4_BEGIN + ble .Lstrmm_kernel_L4_M4_BEGIN -strmm_kernel_L4_M8_20: +.Lstrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1736,54 +1736,54 @@ strmm_kernel_L4_M8_20: asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L4_M8_32 + blt .Lstrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L4_M8_22a + ble .Lstrmm_kernel_L4_M8_22a .align 5 -strmm_kernel_L4_M8_22: +.Lstrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M8_22 + bgt .Lstrmm_kernel_L4_M8_22 -strmm_kernel_L4_M8_22a: +.Lstrmm_kernel_L4_M8_22a: KERNEL8x4_M1 KERNEL8x4_E - b strmm_kernel_L4_M8_44 + b .Lstrmm_kernel_L4_M8_44 -strmm_kernel_L4_M8_32: +.Lstrmm_kernel_L4_M8_32: tst counterL, #1 - ble strmm_kernel_L4_M8_40 + ble .Lstrmm_kernel_L4_M8_40 KERNEL8x4_I KERNEL8x4_E - b strmm_kernel_L4_M8_44 + b .Lstrmm_kernel_L4_M8_44 -strmm_kernel_L4_M8_40: +.Lstrmm_kernel_L4_M8_40: INIT8x4 -strmm_kernel_L4_M8_44: +.Lstrmm_kernel_L4_M8_44: ands counterL , tempK, #1 - ble strmm_kernel_L4_M8_100 + ble .Lstrmm_kernel_L4_M8_100 -strmm_kernel_L4_M8_46: +.Lstrmm_kernel_L4_M8_46: KERNEL8x4_SUB -strmm_kernel_L4_M8_100: +.Lstrmm_kernel_L4_M8_100: SAVE8x4 @@ -1802,22 +1802,22 @@ strmm_kernel_L4_M8_100: #if defined(LEFT) add tempOffset, tempOffset, #8 #endif -strmm_kernel_L4_M8_END: +.Lstrmm_kernel_L4_M8_END: subs counterI, counterI, #1 - bne strmm_kernel_L4_M8_20 + bne .Lstrmm_kernel_L4_M8_20 /******************************************************************************/ -strmm_kernel_L4_M4_BEGIN: +.Lstrmm_kernel_L4_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #4 - ble strmm_kernel_L4_M2_BEGIN + ble .Lstrmm_kernel_L4_M2_BEGIN -strmm_kernel_L4_M4_20: +.Lstrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1837,54 +1837,54 @@ strmm_kernel_L4_M4_20: #endif asr counterL , tempK, #1 // L = K / 2 cmp counterL , #2 // is there at least 4 to do? - blt strmm_kernel_L4_M4_32 + blt .Lstrmm_kernel_L4_M4_32 KERNEL4x4_I // do one in the K KERNEL4x4_M2 // do another in the K subs counterL, counterL, #2 - ble strmm_kernel_L4_M4_22a + ble .Lstrmm_kernel_L4_M4_22a .align 5 -strmm_kernel_L4_M4_22: +.Lstrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 subs counterL, counterL, #1 - bgt strmm_kernel_L4_M4_22 + bgt .Lstrmm_kernel_L4_M4_22 -strmm_kernel_L4_M4_22a: +.Lstrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_32: +.Lstrmm_kernel_L4_M4_32: tst counterL, #1 - ble strmm_kernel_L4_M4_40 + ble .Lstrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_E - b strmm_kernel_L4_M4_44 + b .Lstrmm_kernel_L4_M4_44 -strmm_kernel_L4_M4_40: +.Lstrmm_kernel_L4_M4_40: INIT4x4 -strmm_kernel_L4_M4_44: +.Lstrmm_kernel_L4_M4_44: ands counterL , tempK, #1 - ble strmm_kernel_L4_M4_100 + ble .Lstrmm_kernel_L4_M4_100 -strmm_kernel_L4_M4_46: +.Lstrmm_kernel_L4_M4_46: KERNEL4x4_SUB -strmm_kernel_L4_M4_100: +.Lstrmm_kernel_L4_M4_100: SAVE4x4 @@ -1902,20 +1902,20 @@ strmm_kernel_L4_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L4_M4_END: +.Lstrmm_kernel_L4_M4_END: /******************************************************************************/ -strmm_kernel_L4_M2_BEGIN: +.Lstrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L4_M1_BEGIN + ble .Lstrmm_kernel_L4_M1_BEGIN -strmm_kernel_L4_M2_20: +.Lstrmm_kernel_L4_M2_20: INIT2x4 @@ -1938,9 +1938,9 @@ strmm_kernel_L4_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M2_40 + ble .Lstrmm_kernel_L4_M2_40 -strmm_kernel_L4_M2_22: +.Lstrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1953,22 +1953,22 @@ strmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_22 + bgt .Lstrmm_kernel_L4_M2_22 -strmm_kernel_L4_M2_40: +.Lstrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M2_100 + ble .Lstrmm_kernel_L4_M2_100 -strmm_kernel_L4_M2_42: +.Lstrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M2_42 + bgt .Lstrmm_kernel_L4_M2_42 -strmm_kernel_L4_M2_100: +.Lstrmm_kernel_L4_M2_100: SAVE2x4 @@ -1987,16 +1987,16 @@ strmm_kernel_L4_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -strmm_kernel_L4_M2_END: +.Lstrmm_kernel_L4_M2_END: /******************************************************************************/ -strmm_kernel_L4_M1_BEGIN: +.Lstrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L4_END + ble .Lstrmm_kernel_L4_END -strmm_kernel_L4_M1_20: +.Lstrmm_kernel_L4_M1_20: INIT1x4 @@ -2019,9 +2019,9 @@ strmm_kernel_L4_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L4_M1_40 + ble .Lstrmm_kernel_L4_M1_40 -strmm_kernel_L4_M1_22: +.Lstrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -2033,22 +2033,22 @@ strmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_22 + bgt .Lstrmm_kernel_L4_M1_22 -strmm_kernel_L4_M1_40: +.Lstrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L4_M1_100 + ble .Lstrmm_kernel_L4_M1_100 -strmm_kernel_L4_M1_42: +.Lstrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L4_M1_42 + bgt .Lstrmm_kernel_L4_M1_42 -strmm_kernel_L4_M1_100: +.Lstrmm_kernel_L4_M1_100: SAVE1x4 @@ -2067,7 +2067,7 @@ strmm_kernel_L4_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -strmm_kernel_L4_END: +.Lstrmm_kernel_L4_END: add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 #if !defined(LEFT) add tempOffset, tempOffset, #4 @@ -2076,14 +2076,14 @@ strmm_kernel_L4_END: /******************************************************************************/ /******************************************************************************/ -strmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble strmm_kernel_L999 + ble .Lstrmm_kernel_L999 tst counterJ , #2 - ble strmm_kernel_L1_BEGIN + ble .Lstrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -2096,14 +2096,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction /******************************************************************************/ -strmm_kernel_L2_M8_BEGIN: +.Lstrmm_kernel_L2_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 // counterI = counterI / 8 cmp counterI,#0 - ble strmm_kernel_L2_M4_BEGIN + ble .Lstrmm_kernel_L2_M4_BEGIN -strmm_kernel_L2_M8_20: +.Lstrmm_kernel_L2_M8_20: INIT8x2 @@ -2126,10 +2126,10 @@ strmm_kernel_L2_M8_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M8_40 + ble .Lstrmm_kernel_L2_M8_40 .align 5 -strmm_kernel_L2_M8_22: +.Lstrmm_kernel_L2_M8_22: KERNEL8x2_SUB KERNEL8x2_SUB KERNEL8x2_SUB @@ -2141,22 +2141,22 @@ strmm_kernel_L2_M8_22: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M8_22 + bgt .Lstrmm_kernel_L2_M8_22 -strmm_kernel_L2_M8_40: +.Lstrmm_kernel_L2_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M8_100 + ble .Lstrmm_kernel_L2_M8_100 -strmm_kernel_L2_M8_42: +.Lstrmm_kernel_L2_M8_42: KERNEL8x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M8_42 + bgt .Lstrmm_kernel_L2_M8_42 -strmm_kernel_L2_M8_100: +.Lstrmm_kernel_L2_M8_100: SAVE8x2 @@ -2175,23 +2175,23 @@ strmm_kernel_L2_M8_100: #if defined(LEFT) add tempOffset, tempOffset, #8 #endif -strmm_kernel_L2_M8_END: +.Lstrmm_kernel_L2_M8_END: subs counterI, counterI, #1 - bgt strmm_kernel_L2_M8_20 + bgt .Lstrmm_kernel_L2_M8_20 /******************************************************************************/ -strmm_kernel_L2_M4_BEGIN: +.Lstrmm_kernel_L2_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #4 - ble strmm_kernel_L2_M2_BEGIN + ble .Lstrmm_kernel_L2_M2_BEGIN -strmm_kernel_L2_M4_20: +.Lstrmm_kernel_L2_M4_20: INIT4x2 @@ -2214,10 +2214,10 @@ strmm_kernel_L2_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M4_40 + ble .Lstrmm_kernel_L2_M4_40 .align 5 -strmm_kernel_L2_M4_22: +.Lstrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -2229,22 +2229,22 @@ strmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_22 + bgt .Lstrmm_kernel_L2_M4_22 -strmm_kernel_L2_M4_40: +.Lstrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M4_100 + ble .Lstrmm_kernel_L2_M4_100 -strmm_kernel_L2_M4_42: +.Lstrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M4_42 + bgt .Lstrmm_kernel_L2_M4_42 -strmm_kernel_L2_M4_100: +.Lstrmm_kernel_L2_M4_100: SAVE4x2 @@ -2263,20 +2263,20 @@ strmm_kernel_L2_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L2_M4_END: +.Lstrmm_kernel_L2_M4_END: /******************************************************************************/ -strmm_kernel_L2_M2_BEGIN: +.Lstrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L2_M1_BEGIN + ble .Lstrmm_kernel_L2_M1_BEGIN -strmm_kernel_L2_M2_20: +.Lstrmm_kernel_L2_M2_20: INIT2x2 @@ -2299,9 +2299,9 @@ strmm_kernel_L2_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble strmm_kernel_L2_M2_40 + ble .Lstrmm_kernel_L2_M2_40 -strmm_kernel_L2_M2_22: +.Lstrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -2314,22 +2314,22 @@ strmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_22 + bgt .Lstrmm_kernel_L2_M2_22 -strmm_kernel_L2_M2_40: +.Lstrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M2_100 + ble .Lstrmm_kernel_L2_M2_100 -strmm_kernel_L2_M2_42: +.Lstrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M2_42 + bgt .Lstrmm_kernel_L2_M2_42 -strmm_kernel_L2_M2_100: +.Lstrmm_kernel_L2_M2_100: SAVE2x2 #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -2348,16 +2348,16 @@ strmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -strmm_kernel_L2_M2_END: +.Lstrmm_kernel_L2_M2_END: /******************************************************************************/ -strmm_kernel_L2_M1_BEGIN: +.Lstrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L2_END + ble .Lstrmm_kernel_L2_END -strmm_kernel_L2_M1_20: +.Lstrmm_kernel_L2_M1_20: INIT1x2 @@ -2380,9 +2380,9 @@ strmm_kernel_L2_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble strmm_kernel_L2_M1_40 + ble .Lstrmm_kernel_L2_M1_40 -strmm_kernel_L2_M1_22: +.Lstrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -2394,22 +2394,22 @@ strmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_22 + bgt .Lstrmm_kernel_L2_M1_22 -strmm_kernel_L2_M1_40: +.Lstrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L2_M1_100 + ble .Lstrmm_kernel_L2_M1_100 -strmm_kernel_L2_M1_42: +.Lstrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L2_M1_42 + bgt .Lstrmm_kernel_L2_M1_42 -strmm_kernel_L2_M1_100: +.Lstrmm_kernel_L2_M1_100: SAVE1x2 @@ -2428,7 +2428,7 @@ strmm_kernel_L2_M1_100: #if defined(LEFT) add tempOffset, tempOffset, #1 #endif -strmm_kernel_L2_END: +.Lstrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -2437,11 +2437,11 @@ strmm_kernel_L2_END: /******************************************************************************/ /******************************************************************************/ -strmm_kernel_L1_BEGIN: +.Lstrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble strmm_kernel_L999 // done + ble .Lstrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -2454,14 +2454,14 @@ strmm_kernel_L1_BEGIN: /******************************************************************************/ -strmm_kernel_L1_M8_BEGIN: +.Lstrmm_kernel_L1_M8_BEGIN: mov counterI, origM asr counterI, counterI, #3 cmp counterI, #0 - ble strmm_kernel_L1_M4_BEGIN + ble .Lstrmm_kernel_L1_M4_BEGIN -strmm_kernel_L1_M8_20: +.Lstrmm_kernel_L1_M8_20: INIT8x1 @@ -2484,10 +2484,10 @@ strmm_kernel_L1_M8_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M8_40 + ble .Lstrmm_kernel_L1_M8_40 .align 5 -strmm_kernel_L1_M8_22: +.Lstrmm_kernel_L1_M8_22: KERNEL8x1_SUB KERNEL8x1_SUB KERNEL8x1_SUB @@ -2499,22 +2499,22 @@ strmm_kernel_L1_M8_22: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M8_22 + bgt .Lstrmm_kernel_L1_M8_22 -strmm_kernel_L1_M8_40: +.Lstrmm_kernel_L1_M8_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M8_100 + ble .Lstrmm_kernel_L1_M8_100 -strmm_kernel_L1_M8_42: +.Lstrmm_kernel_L1_M8_42: KERNEL8x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M8_42 + bgt .Lstrmm_kernel_L1_M8_42 -strmm_kernel_L1_M8_100: +.Lstrmm_kernel_L1_M8_100: SAVE8x1 @@ -2533,23 +2533,23 @@ strmm_kernel_L1_M8_100: #if defined(LEFT) add tempOffset, tempOffset, #8 #endif -strmm_kernel_L1_M8_END: +.Lstrmm_kernel_L1_M8_END: subs counterI, counterI, #1 - bgt strmm_kernel_L1_M8_20 + bgt .Lstrmm_kernel_L1_M8_20 /******************************************************************************/ -strmm_kernel_L1_M4_BEGIN: +.Lstrmm_kernel_L1_M4_BEGIN: mov counterI, origM tst counterI , #7 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #4 - ble strmm_kernel_L1_M2_BEGIN + ble .Lstrmm_kernel_L1_M2_BEGIN -strmm_kernel_L1_M4_20: +.Lstrmm_kernel_L1_M4_20: INIT4x1 @@ -2572,10 +2572,10 @@ strmm_kernel_L1_M4_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M4_40 + ble .Lstrmm_kernel_L1_M4_40 .align 5 -strmm_kernel_L1_M4_22: +.Lstrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -2587,22 +2587,22 @@ strmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_22 + bgt .Lstrmm_kernel_L1_M4_22 -strmm_kernel_L1_M4_40: +.Lstrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M4_100 + ble .Lstrmm_kernel_L1_M4_100 -strmm_kernel_L1_M4_42: +.Lstrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M4_42 + bgt .Lstrmm_kernel_L1_M4_42 -strmm_kernel_L1_M4_100: +.Lstrmm_kernel_L1_M4_100: SAVE4x1 @@ -2621,20 +2621,20 @@ strmm_kernel_L1_M4_100: #if defined(LEFT) add tempOffset, tempOffset, #4 #endif -strmm_kernel_L1_M4_END: +.Lstrmm_kernel_L1_M4_END: /******************************************************************************/ -strmm_kernel_L1_M2_BEGIN: +.Lstrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble strmm_kernel_L1_M1_BEGIN + ble .Lstrmm_kernel_L1_M1_BEGIN -strmm_kernel_L1_M2_20: +.Lstrmm_kernel_L1_M2_20: INIT2x1 @@ -2657,9 +2657,9 @@ strmm_kernel_L1_M2_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M2_40 + ble .Lstrmm_kernel_L1_M2_40 -strmm_kernel_L1_M2_22: +.Lstrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -2672,22 +2672,22 @@ strmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_22 + bgt .Lstrmm_kernel_L1_M2_22 -strmm_kernel_L1_M2_40: +.Lstrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M2_100 + ble .Lstrmm_kernel_L1_M2_100 -strmm_kernel_L1_M2_42: +.Lstrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M2_42 + bgt .Lstrmm_kernel_L1_M2_42 -strmm_kernel_L1_M2_100: +.Lstrmm_kernel_L1_M2_100: SAVE2x1 @@ -2706,16 +2706,16 @@ strmm_kernel_L1_M2_100: #if defined(LEFT) add tempOffset, tempOffset, #2 #endif -strmm_kernel_L1_M2_END: +.Lstrmm_kernel_L1_M2_END: /******************************************************************************/ -strmm_kernel_L1_M1_BEGIN: +.Lstrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble strmm_kernel_L1_END + ble .Lstrmm_kernel_L1_END -strmm_kernel_L1_M1_20: +.Lstrmm_kernel_L1_M1_20: INIT1x1 @@ -2738,9 +2738,9 @@ strmm_kernel_L1_M1_20: #endif asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble strmm_kernel_L1_M1_40 + ble .Lstrmm_kernel_L1_M1_40 -strmm_kernel_L1_M1_22: +.Lstrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -2752,30 +2752,30 @@ strmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_22 + bgt .Lstrmm_kernel_L1_M1_22 -strmm_kernel_L1_M1_40: +.Lstrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble strmm_kernel_L1_M1_100 + ble .Lstrmm_kernel_L1_M1_100 -strmm_kernel_L1_M1_42: +.Lstrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt strmm_kernel_L1_M1_42 + bgt .Lstrmm_kernel_L1_M1_42 -strmm_kernel_L1_M1_100: +.Lstrmm_kernel_L1_M1_100: SAVE1x1 -strmm_kernel_L1_END: +.Lstrmm_kernel_L1_END: /******************************************************************************/ -strmm_kernel_L999: +.Lstrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/swap.S b/kernel/arm64/swap.S index 37ed83f2af..184e02e9cf 100644 --- a/kernel/arm64/swap.S +++ b/kernel/arm64/swap.S @@ -193,50 +193,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble swap_kernel_L999 + ble .Lswap_kernel_L999 cmp INC_X, #1 - bne swap_kernel_S_BEGIN + bne .Lswap_kernel_S_BEGIN cmp INC_Y, #1 - bne swap_kernel_S_BEGIN + bne .Lswap_kernel_S_BEGIN -swap_kernel_F_BEGIN: +.Lswap_kernel_F_BEGIN: asr I, N, #3 cmp I, xzr - beq swap_kernel_F1 + beq .Lswap_kernel_F1 -swap_kernel_F8: +.Lswap_kernel_F8: KERNEL_F8 subs I, I, #1 - bne swap_kernel_F8 + bne .Lswap_kernel_F8 -swap_kernel_F1: +.Lswap_kernel_F1: ands I, N, #7 - ble swap_kernel_L999 + ble .Lswap_kernel_L999 -swap_kernel_F10: +.Lswap_kernel_F10: KERNEL_F1 subs I, I, #1 - bne swap_kernel_F10 + bne .Lswap_kernel_F10 - b swap_kernel_L999 + b .Lswap_kernel_L999 -swap_kernel_S_BEGIN: +.Lswap_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble swap_kernel_S1 + ble .Lswap_kernel_S1 -swap_kernel_S4: +.Lswap_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -244,21 +244,21 @@ swap_kernel_S4: KERNEL_S1 subs I, I, #1 - bne swap_kernel_S4 + bne .Lswap_kernel_S4 -swap_kernel_S1: +.Lswap_kernel_S1: ands I, N, #3 - ble swap_kernel_L999 + ble .Lswap_kernel_L999 -swap_kernel_S10: +.Lswap_kernel_S10: KERNEL_S1 subs I, I, #1 - bne swap_kernel_S10 + bne .Lswap_kernel_S10 -swap_kernel_L999: +.Lswap_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/zamax.S b/kernel/arm64/zamax.S index 7db339f538..c2c0a5374d 100644 --- a/kernel/arm64/zamax.S +++ b/kernel/arm64/zamax.S @@ -184,62 +184,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble amax_kernel_zero + ble .Lzamax_kernel_zero cmp INC_X, xzr - ble amax_kernel_zero + ble .Lzamax_kernel_zero cmp INC_X, #1 - bne amax_kernel_S_BEGIN + bne .Lzamax_kernel_S_BEGIN -amax_kernel_F_BEGIN: +.Lzamax_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq amax_kernel_F1_INIT + beq .Lzamax_kernel_F1_INIT INIT_F4 subs I, I, #1 - beq amax_kernel_F1 + beq .Lzamax_kernel_F1 -amax_kernel_F4: +.Lzamax_kernel_F4: KERNEL_F4 subs I, I, #1 - bne amax_kernel_F4 + bne .Lzamax_kernel_F4 -amax_kernel_F1: +.Lzamax_kernel_F1: ands I, N, #3 - ble amax_kernel_L999 + ble .Lzamax_kernel_L999 -amax_kernel_F10: +.Lzamax_kernel_F10: KERNEL_F1 subs I, I, #1 - bne amax_kernel_F10 + bne .Lzamax_kernel_F10 ret -amax_kernel_F1_INIT: +.Lzamax_kernel_F1_INIT: INIT_F1 subs N, N, #1 - b amax_kernel_F1 + b .Lzamax_kernel_F1 -amax_kernel_S_BEGIN: +.Lzamax_kernel_S_BEGIN: INIT_S subs N, N, #1 - ble amax_kernel_L999 + ble .Lzamax_kernel_L999 asr I, N, #2 cmp I, xzr - ble amax_kernel_S1 + ble .Lzamax_kernel_S1 -amax_kernel_S4: +.Lzamax_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -247,25 +247,25 @@ amax_kernel_S4: KERNEL_S1 subs I, I, #1 - bne amax_kernel_S4 + bne .Lzamax_kernel_S4 -amax_kernel_S1: +.Lzamax_kernel_S1: ands I, N, #3 - ble amax_kernel_L999 + ble .Lzamax_kernel_L999 -amax_kernel_S10: +.Lzamax_kernel_S10: KERNEL_S1 subs I, I, #1 - bne amax_kernel_S10 + bne .Lzamax_kernel_S10 -amax_kernel_L999: +.Lzamax_kernel_L999: ret -amax_kernel_zero: +.Lzamax_kernel_zero: fmov MAXF, REG0 ret diff --git a/kernel/arm64/zasum.S b/kernel/arm64/zasum.S index bf586d3679..0d5ec952bb 100644 --- a/kernel/arm64/zasum.S +++ b/kernel/arm64/zasum.S @@ -92,52 +92,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov SUMF, REG0 cmp N, xzr - ble asum_kernel_L999 + ble .Lzasum_kernel_L999 cmp INC_X, xzr - ble asum_kernel_L999 + ble .Lzasum_kernel_L999 cmp INC_X, #1 - bne asum_kernel_S_BEGIN + bne .Lzasum_kernel_S_BEGIN -asum_kernel_F_BEGIN: +.Lzasum_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq asum_kernel_F1 + beq .Lzasum_kernel_F1 -asum_kernel_F4: +.Lzasum_kernel_F4: KERNEL_F4 subs I, I, #1 - bne asum_kernel_F4 + bne .Lzasum_kernel_F4 KERNEL_F4_FINALIZE -asum_kernel_F1: +.Lzasum_kernel_F1: ands I, N, #3 - ble asum_kernel_L999 + ble .Lzasum_kernel_L999 -asum_kernel_F10: +.Lzasum_kernel_F10: KERNEL_F1 subs I, I, #1 - bne asum_kernel_F10 + bne .Lzasum_kernel_F10 -asum_kernel_L999: +.Lzasum_kernel_L999: ret -asum_kernel_S_BEGIN: +.Lzasum_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble asum_kernel_S1 + ble .Lzasum_kernel_S1 -asum_kernel_S4: +.Lzasum_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -145,19 +145,19 @@ asum_kernel_S4: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S4 + bne .Lzasum_kernel_S4 -asum_kernel_S1: +.Lzasum_kernel_S1: ands I, N, #3 - ble asum_kernel_L999 + ble .Lzasum_kernel_L999 -asum_kernel_S10: +.Lzasum_kernel_S10: KERNEL_S1 subs I, I, #1 - bne asum_kernel_S10 + bne .Lzasum_kernel_S10 ret diff --git a/kernel/arm64/zaxpy.S b/kernel/arm64/zaxpy.S index 70c2499815..46d7b04788 100644 --- a/kernel/arm64/zaxpy.S +++ b/kernel/arm64/zaxpy.S @@ -241,62 +241,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble zaxpy_kernel_L999 + ble .Lzaxpy_kernel_L999 mov Y_COPY, Y fcmp DA_R, #0.0 bne .L1 fcmp DA_I, #0.0 - beq zaxpy_kernel_L999 + beq .Lzaxpy_kernel_L999 .L1: INIT cmp INC_X, #1 - bne zaxpy_kernel_S_BEGIN + bne .Lzaxpy_kernel_S_BEGIN cmp INC_Y, #1 - bne zaxpy_kernel_S_BEGIN + bne .Lzaxpy_kernel_S_BEGIN -zaxpy_kernel_F_BEGIN: +.Lzaxpy_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq zaxpy_kernel_F1 + beq .Lzaxpy_kernel_F1 KERNEL_INIT_F4 -zaxpy_kernel_F4: +.Lzaxpy_kernel_F4: KERNEL_F4 subs I, I, #1 - bne zaxpy_kernel_F4 + bne .Lzaxpy_kernel_F4 -zaxpy_kernel_F1: +.Lzaxpy_kernel_F1: ands I, N, #3 - ble zaxpy_kernel_L999 + ble .Lzaxpy_kernel_L999 -zaxpy_kernel_F10: +.Lzaxpy_kernel_F10: KERNEL_F1 subs I, I, #1 - bne zaxpy_kernel_F10 + bne .Lzaxpy_kernel_F10 mov w0, wzr ret -zaxpy_kernel_S_BEGIN: +.Lzaxpy_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble zaxpy_kernel_S1 + ble .Lzaxpy_kernel_S1 -zaxpy_kernel_S4: +.Lzaxpy_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -304,21 +304,21 @@ zaxpy_kernel_S4: KERNEL_S1 subs I, I, #1 - bne zaxpy_kernel_S4 + bne .Lzaxpy_kernel_S4 -zaxpy_kernel_S1: +.Lzaxpy_kernel_S1: ands I, N, #3 - ble zaxpy_kernel_L999 + ble .Lzaxpy_kernel_L999 -zaxpy_kernel_S10: +.Lzaxpy_kernel_S10: KERNEL_S1 subs I, I, #1 - bne zaxpy_kernel_S10 + bne .Lzaxpy_kernel_S10 -zaxpy_kernel_L999: +.Lzaxpy_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/zdot.S b/kernel/arm64/zdot.S index 3e8e3d7d98..044ace3b82 100644 --- a/kernel/arm64/zdot.S +++ b/kernel/arm64/zdot.S @@ -229,51 +229,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif cmp N, xzr - ble dot_kernel_L999 + ble .Lzdot_kernel_L999 cmp INC_X, #1 - bne dot_kernel_S_BEGIN + bne .Lzdot_kernel_S_BEGIN cmp INC_Y, #1 - bne dot_kernel_S_BEGIN + bne .Lzdot_kernel_S_BEGIN -dot_kernel_F_BEGIN: +.Lzdot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq dot_kernel_F1 + beq .Lzdot_kernel_F1 -dot_kernel_F4: +.Lzdot_kernel_F4: KERNEL_F4 subs I, I, #1 - bne dot_kernel_F4 + bne .Lzdot_kernel_F4 KERNEL_F4_FINALIZE -dot_kernel_F1: +.Lzdot_kernel_F1: ands I, N, #3 - ble dot_kernel_L999 + ble .Lzdot_kernel_L999 -dot_kernel_F10: +.Lzdot_kernel_F10: KERNEL_F1 subs I, I, #1 - bne dot_kernel_F10 + bne .Lzdot_kernel_F10 ret -dot_kernel_S_BEGIN: +.Lzdot_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble dot_kernel_S1 + ble .Lzdot_kernel_S1 -dot_kernel_S4: +.Lzdot_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -281,21 +281,21 @@ dot_kernel_S4: KERNEL_S1 subs I, I, #1 - bne dot_kernel_S4 + bne .Lzdot_kernel_S4 -dot_kernel_S1: +.Lzdot_kernel_S1: ands I, N, #3 - ble dot_kernel_L999 + ble .Lzdot_kernel_L999 -dot_kernel_S10: +.Lzdot_kernel_S10: KERNEL_S1 subs I, I, #1 - bne dot_kernel_S10 + bne .Lzdot_kernel_S10 -dot_kernel_L999: +.Lzdot_kernel_L999: ret diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index 08a1531cff..f8e877f3cf 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -1099,9 +1099,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble zgemm_kernel_L2_BEGIN + ble .Lzgemm_kernel_L2_BEGIN -zgemm_kernel_L4_BEGIN: +.Lzgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1111,20 +1111,20 @@ zgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -zgemm_kernel_L4_M4_BEGIN: +.Lzgemm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble zgemm_kernel_L4_M2_BEGIN + ble .Lzgemm_kernel_L4_M2_BEGIN .align 5 -zgemm_kernel_L4_M4_20: +.Lzgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 - blt zgemm_kernel_L4_M4_32 + blt .Lzgemm_kernel_L4_M4_32 KERNEL4x4_I KERNEL4x4_M2 @@ -1136,10 +1136,10 @@ zgemm_kernel_L4_M4_20: KERNEL4x4_M2 subs counterL, counterL, #2 // subtract 2 - ble zgemm_kernel_L4_M4_22a + ble .Lzgemm_kernel_L4_M4_22a .align 5 -zgemm_kernel_L4_M4_22: +.Lzgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 @@ -1151,10 +1151,10 @@ zgemm_kernel_L4_M4_22: KERNEL4x4_M2 subs counterL, counterL, #1 - bgt zgemm_kernel_L4_M4_22 + bgt .Lzgemm_kernel_L4_M4_22 .align 5 -zgemm_kernel_L4_M4_22a: +.Lzgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_M2 @@ -1165,13 +1165,13 @@ zgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b zgemm_kernel_L4_M4_44 + b .Lzgemm_kernel_L4_M4_44 .align 5 -zgemm_kernel_L4_M4_32: +.Lzgemm_kernel_L4_M4_32: tst counterL, #1 - ble zgemm_kernel_L4_M4_40 + ble .Lzgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_M2 @@ -1182,55 +1182,55 @@ zgemm_kernel_L4_M4_32: KERNEL4x4_M1 KERNEL4x4_E - b zgemm_kernel_L4_M4_44 + b .Lzgemm_kernel_L4_M4_44 -zgemm_kernel_L4_M4_40: +.Lzgemm_kernel_L4_M4_40: INIT4x4 -zgemm_kernel_L4_M4_44: +.Lzgemm_kernel_L4_M4_44: ands counterL , origK, #7 - ble zgemm_kernel_L4_M4_100 + ble .Lzgemm_kernel_L4_M4_100 .align 5 -zgemm_kernel_L4_M4_46: +.Lzgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs counterL, counterL, #1 - bne zgemm_kernel_L4_M4_46 + bne .Lzgemm_kernel_L4_M4_46 -zgemm_kernel_L4_M4_100: +.Lzgemm_kernel_L4_M4_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE4x4 -zgemm_kernel_L4_M4_END: +.Lzgemm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne zgemm_kernel_L4_M4_20 + bne .Lzgemm_kernel_L4_M4_20 -zgemm_kernel_L4_M2_BEGIN: +.Lzgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble zgemm_kernel_L4_END + ble .Lzgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble zgemm_kernel_L4_M1_BEGIN + ble .Lzgemm_kernel_L4_M1_BEGIN -zgemm_kernel_L4_M2_20: +.Lzgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble zgemm_kernel_L4_M2_40 + ble .Lzgemm_kernel_L4_M2_40 -zgemm_kernel_L4_M2_22: +.Lzgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1243,43 +1243,43 @@ zgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L4_M2_22 + bgt .Lzgemm_kernel_L4_M2_22 -zgemm_kernel_L4_M2_40: +.Lzgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L4_M2_100 + ble .Lzgemm_kernel_L4_M2_100 -zgemm_kernel_L4_M2_42: +.Lzgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L4_M2_42 + bgt .Lzgemm_kernel_L4_M2_42 -zgemm_kernel_L4_M2_100: +.Lzgemm_kernel_L4_M2_100: SAVE2x4 -zgemm_kernel_L4_M2_END: +.Lzgemm_kernel_L4_M2_END: -zgemm_kernel_L4_M1_BEGIN: +.Lzgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble zgemm_kernel_L4_END + ble .Lzgemm_kernel_L4_END -zgemm_kernel_L4_M1_20: +.Lzgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble zgemm_kernel_L4_M1_40 + ble .Lzgemm_kernel_L4_M1_40 -zgemm_kernel_L4_M1_22: +.Lzgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1291,45 +1291,45 @@ zgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L4_M1_22 + bgt .Lzgemm_kernel_L4_M1_22 -zgemm_kernel_L4_M1_40: +.Lzgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L4_M1_100 + ble .Lzgemm_kernel_L4_M1_100 -zgemm_kernel_L4_M1_42: +.Lzgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L4_M1_42 + bgt .Lzgemm_kernel_L4_M1_42 -zgemm_kernel_L4_M1_100: +.Lzgemm_kernel_L4_M1_100: SAVE1x4 -zgemm_kernel_L4_END: +.Lzgemm_kernel_L4_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 4 * 8 * 2 subs counterJ, counterJ , #1 // j-- - bgt zgemm_kernel_L4_BEGIN + bgt .Lzgemm_kernel_L4_BEGIN /******************************************************************************/ -zgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble zgemm_kernel_L999 + ble .Lzgemm_kernel_L999 tst counterJ , #2 - ble zgemm_kernel_L1_BEGIN + ble .Lzgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1339,24 +1339,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction -zgemm_kernel_L2_M4_BEGIN: +.Lzgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble zgemm_kernel_L2_M2_BEGIN + ble .Lzgemm_kernel_L2_M2_BEGIN -zgemm_kernel_L2_M4_20: +.Lzgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble zgemm_kernel_L2_M4_40 + ble .Lzgemm_kernel_L2_M4_40 .align 5 -zgemm_kernel_L2_M4_22: +.Lzgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1368,50 +1368,50 @@ zgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M4_22 + bgt .Lzgemm_kernel_L2_M4_22 -zgemm_kernel_L2_M4_40: +.Lzgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L2_M4_100 + ble .Lzgemm_kernel_L2_M4_100 -zgemm_kernel_L2_M4_42: +.Lzgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M4_42 + bgt .Lzgemm_kernel_L2_M4_42 -zgemm_kernel_L2_M4_100: +.Lzgemm_kernel_L2_M4_100: SAVE4x2 -zgemm_kernel_L2_M4_END: +.Lzgemm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt zgemm_kernel_L2_M4_20 + bgt .Lzgemm_kernel_L2_M4_20 -zgemm_kernel_L2_M2_BEGIN: +.Lzgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble zgemm_kernel_L2_END + ble .Lzgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble zgemm_kernel_L2_M1_BEGIN + ble .Lzgemm_kernel_L2_M1_BEGIN -zgemm_kernel_L2_M2_20: +.Lzgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble zgemm_kernel_L2_M2_40 + ble .Lzgemm_kernel_L2_M2_40 -zgemm_kernel_L2_M2_22: +.Lzgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1424,43 +1424,43 @@ zgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M2_22 + bgt .Lzgemm_kernel_L2_M2_22 -zgemm_kernel_L2_M2_40: +.Lzgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L2_M2_100 + ble .Lzgemm_kernel_L2_M2_100 -zgemm_kernel_L2_M2_42: +.Lzgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M2_42 + bgt .Lzgemm_kernel_L2_M2_42 -zgemm_kernel_L2_M2_100: +.Lzgemm_kernel_L2_M2_100: SAVE2x2 -zgemm_kernel_L2_M2_END: +.Lzgemm_kernel_L2_M2_END: -zgemm_kernel_L2_M1_BEGIN: +.Lzgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble zgemm_kernel_L2_END + ble .Lzgemm_kernel_L2_END -zgemm_kernel_L2_M1_20: +.Lzgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble zgemm_kernel_L2_M1_40 + ble .Lzgemm_kernel_L2_M1_40 -zgemm_kernel_L2_M1_22: +.Lzgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1472,37 +1472,37 @@ zgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M1_22 + bgt .Lzgemm_kernel_L2_M1_22 -zgemm_kernel_L2_M1_40: +.Lzgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L2_M1_100 + ble .Lzgemm_kernel_L2_M1_100 -zgemm_kernel_L2_M1_42: +.Lzgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M1_42 + bgt .Lzgemm_kernel_L2_M1_42 -zgemm_kernel_L2_M1_100: +.Lzgemm_kernel_L2_M1_100: SAVE1x2 -zgemm_kernel_L2_END: +.Lzgemm_kernel_L2_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 2 * 8 * 2 /******************************************************************************/ -zgemm_kernel_L1_BEGIN: +.Lzgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble zgemm_kernel_L999 // done + ble .Lzgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1512,24 +1512,24 @@ zgemm_kernel_L1_BEGIN: -zgemm_kernel_L1_M4_BEGIN: +.Lzgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble zgemm_kernel_L1_M2_BEGIN + ble .Lzgemm_kernel_L1_M2_BEGIN -zgemm_kernel_L1_M4_20: +.Lzgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble zgemm_kernel_L1_M4_40 + ble .Lzgemm_kernel_L1_M4_40 .align 5 -zgemm_kernel_L1_M4_22: +.Lzgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1541,50 +1541,50 @@ zgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M4_22 + bgt .Lzgemm_kernel_L1_M4_22 -zgemm_kernel_L1_M4_40: +.Lzgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L1_M4_100 + ble .Lzgemm_kernel_L1_M4_100 -zgemm_kernel_L1_M4_42: +.Lzgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M4_42 + bgt .Lzgemm_kernel_L1_M4_42 -zgemm_kernel_L1_M4_100: +.Lzgemm_kernel_L1_M4_100: SAVE4x1 -zgemm_kernel_L1_M4_END: +.Lzgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt zgemm_kernel_L1_M4_20 + bgt .Lzgemm_kernel_L1_M4_20 -zgemm_kernel_L1_M2_BEGIN: +.Lzgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble zgemm_kernel_L1_END + ble .Lzgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble zgemm_kernel_L1_M1_BEGIN + ble .Lzgemm_kernel_L1_M1_BEGIN -zgemm_kernel_L1_M2_20: +.Lzgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble zgemm_kernel_L1_M2_40 + ble .Lzgemm_kernel_L1_M2_40 -zgemm_kernel_L1_M2_22: +.Lzgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1597,43 +1597,43 @@ zgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M2_22 + bgt .Lzgemm_kernel_L1_M2_22 -zgemm_kernel_L1_M2_40: +.Lzgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L1_M2_100 + ble .Lzgemm_kernel_L1_M2_100 -zgemm_kernel_L1_M2_42: +.Lzgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M2_42 + bgt .Lzgemm_kernel_L1_M2_42 -zgemm_kernel_L1_M2_100: +.Lzgemm_kernel_L1_M2_100: SAVE2x1 -zgemm_kernel_L1_M2_END: +.Lzgemm_kernel_L1_M2_END: -zgemm_kernel_L1_M1_BEGIN: +.Lzgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble zgemm_kernel_L1_END + ble .Lzgemm_kernel_L1_END -zgemm_kernel_L1_M1_20: +.Lzgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble zgemm_kernel_L1_M1_40 + ble .Lzgemm_kernel_L1_M1_40 -zgemm_kernel_L1_M1_22: +.Lzgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1645,30 +1645,30 @@ zgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M1_22 + bgt .Lzgemm_kernel_L1_M1_22 -zgemm_kernel_L1_M1_40: +.Lzgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L1_M1_100 + ble .Lzgemm_kernel_L1_M1_100 -zgemm_kernel_L1_M1_42: +.Lzgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M1_42 + bgt .Lzgemm_kernel_L1_M1_42 -zgemm_kernel_L1_M1_100: +.Lzgemm_kernel_L1_M1_100: SAVE1x1 -zgemm_kernel_L1_END: +.Lzgemm_kernel_L1_END: -zgemm_kernel_L999: +.Lzgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S b/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S index e5b4cba9c5..8e6ff655de 100644 --- a/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S +++ b/kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S @@ -1109,9 +1109,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble zgemm_kernel_L2_BEGIN + ble .Lzgemm_kernel_L2_BEGIN -zgemm_kernel_L4_BEGIN: +.Lzgemm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1121,20 +1121,20 @@ zgemm_kernel_L4_BEGIN: mov pA, origPA // pA = start of A array -zgemm_kernel_L4_M4_BEGIN: +.Lzgemm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble zgemm_kernel_L4_M2_BEGIN + ble .Lzgemm_kernel_L4_M2_BEGIN .align 5 -zgemm_kernel_L4_M4_20: +.Lzgemm_kernel_L4_M4_20: mov pB, origPB asr counterL , origK, #3 cmp counterL , #2 - blt zgemm_kernel_L4_M4_32 + blt .Lzgemm_kernel_L4_M4_32 KERNEL4x4_I KERNEL4x4_M2 @@ -1146,10 +1146,10 @@ zgemm_kernel_L4_M4_20: KERNEL4x4_M2 subs counterL, counterL, #2 // subtract 2 - ble zgemm_kernel_L4_M4_22a + ble .Lzgemm_kernel_L4_M4_22a .align 5 -zgemm_kernel_L4_M4_22: +.Lzgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 @@ -1161,10 +1161,10 @@ zgemm_kernel_L4_M4_22: KERNEL4x4_M2 subs counterL, counterL, #1 - bgt zgemm_kernel_L4_M4_22 + bgt .Lzgemm_kernel_L4_M4_22 .align 5 -zgemm_kernel_L4_M4_22a: +.Lzgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_M2 @@ -1175,13 +1175,13 @@ zgemm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b zgemm_kernel_L4_M4_44 + b .Lzgemm_kernel_L4_M4_44 .align 5 -zgemm_kernel_L4_M4_32: +.Lzgemm_kernel_L4_M4_32: tst counterL, #1 - ble zgemm_kernel_L4_M4_40 + ble .Lzgemm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_M2 @@ -1192,55 +1192,55 @@ zgemm_kernel_L4_M4_32: KERNEL4x4_M1 KERNEL4x4_E - b zgemm_kernel_L4_M4_44 + b .Lzgemm_kernel_L4_M4_44 -zgemm_kernel_L4_M4_40: +.Lzgemm_kernel_L4_M4_40: INIT4x4 -zgemm_kernel_L4_M4_44: +.Lzgemm_kernel_L4_M4_44: ands counterL , origK, #7 - ble zgemm_kernel_L4_M4_100 + ble .Lzgemm_kernel_L4_M4_100 .align 5 -zgemm_kernel_L4_M4_46: +.Lzgemm_kernel_L4_M4_46: KERNEL4x4_SUB subs counterL, counterL, #1 - bne zgemm_kernel_L4_M4_46 + bne .Lzgemm_kernel_L4_M4_46 -zgemm_kernel_L4_M4_100: +.Lzgemm_kernel_L4_M4_100: prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] SAVE4x4 -zgemm_kernel_L4_M4_END: +.Lzgemm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne zgemm_kernel_L4_M4_20 + bne .Lzgemm_kernel_L4_M4_20 -zgemm_kernel_L4_M2_BEGIN: +.Lzgemm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble zgemm_kernel_L4_END + ble .Lzgemm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble zgemm_kernel_L4_M1_BEGIN + ble .Lzgemm_kernel_L4_M1_BEGIN -zgemm_kernel_L4_M2_20: +.Lzgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble zgemm_kernel_L4_M2_40 + ble .Lzgemm_kernel_L4_M2_40 -zgemm_kernel_L4_M2_22: +.Lzgemm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1253,43 +1253,43 @@ zgemm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L4_M2_22 + bgt .Lzgemm_kernel_L4_M2_22 -zgemm_kernel_L4_M2_40: +.Lzgemm_kernel_L4_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L4_M2_100 + ble .Lzgemm_kernel_L4_M2_100 -zgemm_kernel_L4_M2_42: +.Lzgemm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L4_M2_42 + bgt .Lzgemm_kernel_L4_M2_42 -zgemm_kernel_L4_M2_100: +.Lzgemm_kernel_L4_M2_100: SAVE2x4 -zgemm_kernel_L4_M2_END: +.Lzgemm_kernel_L4_M2_END: -zgemm_kernel_L4_M1_BEGIN: +.Lzgemm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble zgemm_kernel_L4_END + ble .Lzgemm_kernel_L4_END -zgemm_kernel_L4_M1_20: +.Lzgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble zgemm_kernel_L4_M1_40 + ble .Lzgemm_kernel_L4_M1_40 -zgemm_kernel_L4_M1_22: +.Lzgemm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1301,45 +1301,45 @@ zgemm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L4_M1_22 + bgt .Lzgemm_kernel_L4_M1_22 -zgemm_kernel_L4_M1_40: +.Lzgemm_kernel_L4_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L4_M1_100 + ble .Lzgemm_kernel_L4_M1_100 -zgemm_kernel_L4_M1_42: +.Lzgemm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L4_M1_42 + bgt .Lzgemm_kernel_L4_M1_42 -zgemm_kernel_L4_M1_100: +.Lzgemm_kernel_L4_M1_100: SAVE1x4 -zgemm_kernel_L4_END: +.Lzgemm_kernel_L4_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 4 * 8 * 2 subs counterJ, counterJ , #1 // j-- - bgt zgemm_kernel_L4_BEGIN + bgt .Lzgemm_kernel_L4_BEGIN /******************************************************************************/ -zgemm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble zgemm_kernel_L999 + ble .Lzgemm_kernel_L999 tst counterJ , #2 - ble zgemm_kernel_L1_BEGIN + ble .Lzgemm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1349,24 +1349,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction -zgemm_kernel_L2_M4_BEGIN: +.Lzgemm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble zgemm_kernel_L2_M2_BEGIN + ble .Lzgemm_kernel_L2_M2_BEGIN -zgemm_kernel_L2_M4_20: +.Lzgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble zgemm_kernel_L2_M4_40 + ble .Lzgemm_kernel_L2_M4_40 .align 5 -zgemm_kernel_L2_M4_22: +.Lzgemm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1378,50 +1378,50 @@ zgemm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M4_22 + bgt .Lzgemm_kernel_L2_M4_22 -zgemm_kernel_L2_M4_40: +.Lzgemm_kernel_L2_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L2_M4_100 + ble .Lzgemm_kernel_L2_M4_100 -zgemm_kernel_L2_M4_42: +.Lzgemm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M4_42 + bgt .Lzgemm_kernel_L2_M4_42 -zgemm_kernel_L2_M4_100: +.Lzgemm_kernel_L2_M4_100: SAVE4x2 -zgemm_kernel_L2_M4_END: +.Lzgemm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt zgemm_kernel_L2_M4_20 + bgt .Lzgemm_kernel_L2_M4_20 -zgemm_kernel_L2_M2_BEGIN: +.Lzgemm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble zgemm_kernel_L2_END + ble .Lzgemm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble zgemm_kernel_L2_M1_BEGIN + ble .Lzgemm_kernel_L2_M1_BEGIN -zgemm_kernel_L2_M2_20: +.Lzgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble zgemm_kernel_L2_M2_40 + ble .Lzgemm_kernel_L2_M2_40 -zgemm_kernel_L2_M2_22: +.Lzgemm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1434,43 +1434,43 @@ zgemm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M2_22 + bgt .Lzgemm_kernel_L2_M2_22 -zgemm_kernel_L2_M2_40: +.Lzgemm_kernel_L2_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L2_M2_100 + ble .Lzgemm_kernel_L2_M2_100 -zgemm_kernel_L2_M2_42: +.Lzgemm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M2_42 + bgt .Lzgemm_kernel_L2_M2_42 -zgemm_kernel_L2_M2_100: +.Lzgemm_kernel_L2_M2_100: SAVE2x2 -zgemm_kernel_L2_M2_END: +.Lzgemm_kernel_L2_M2_END: -zgemm_kernel_L2_M1_BEGIN: +.Lzgemm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble zgemm_kernel_L2_END + ble .Lzgemm_kernel_L2_END -zgemm_kernel_L2_M1_20: +.Lzgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble zgemm_kernel_L2_M1_40 + ble .Lzgemm_kernel_L2_M1_40 -zgemm_kernel_L2_M1_22: +.Lzgemm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1482,37 +1482,37 @@ zgemm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M1_22 + bgt .Lzgemm_kernel_L2_M1_22 -zgemm_kernel_L2_M1_40: +.Lzgemm_kernel_L2_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L2_M1_100 + ble .Lzgemm_kernel_L2_M1_100 -zgemm_kernel_L2_M1_42: +.Lzgemm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L2_M1_42 + bgt .Lzgemm_kernel_L2_M1_42 -zgemm_kernel_L2_M1_100: +.Lzgemm_kernel_L2_M1_100: SAVE1x2 -zgemm_kernel_L2_END: +.Lzgemm_kernel_L2_END: lsl temp, origK, #5 add origPB, origPB, temp // B = B + K * 2 * 8 * 2 /******************************************************************************/ -zgemm_kernel_L1_BEGIN: +.Lzgemm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble zgemm_kernel_L999 // done + ble .Lzgemm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1522,24 +1522,24 @@ zgemm_kernel_L1_BEGIN: -zgemm_kernel_L1_M4_BEGIN: +.Lzgemm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble zgemm_kernel_L1_M2_BEGIN + ble .Lzgemm_kernel_L1_M2_BEGIN -zgemm_kernel_L1_M4_20: +.Lzgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble zgemm_kernel_L1_M4_40 + ble .Lzgemm_kernel_L1_M4_40 .align 5 -zgemm_kernel_L1_M4_22: +.Lzgemm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1551,50 +1551,50 @@ zgemm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M4_22 + bgt .Lzgemm_kernel_L1_M4_22 -zgemm_kernel_L1_M4_40: +.Lzgemm_kernel_L1_M4_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L1_M4_100 + ble .Lzgemm_kernel_L1_M4_100 -zgemm_kernel_L1_M4_42: +.Lzgemm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M4_42 + bgt .Lzgemm_kernel_L1_M4_42 -zgemm_kernel_L1_M4_100: +.Lzgemm_kernel_L1_M4_100: SAVE4x1 -zgemm_kernel_L1_M4_END: +.Lzgemm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt zgemm_kernel_L1_M4_20 + bgt .Lzgemm_kernel_L1_M4_20 -zgemm_kernel_L1_M2_BEGIN: +.Lzgemm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble zgemm_kernel_L1_END + ble .Lzgemm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble zgemm_kernel_L1_M1_BEGIN + ble .Lzgemm_kernel_L1_M1_BEGIN -zgemm_kernel_L1_M2_20: +.Lzgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble zgemm_kernel_L1_M2_40 + ble .Lzgemm_kernel_L1_M2_40 -zgemm_kernel_L1_M2_22: +.Lzgemm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1607,43 +1607,43 @@ zgemm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M2_22 + bgt .Lzgemm_kernel_L1_M2_22 -zgemm_kernel_L1_M2_40: +.Lzgemm_kernel_L1_M2_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L1_M2_100 + ble .Lzgemm_kernel_L1_M2_100 -zgemm_kernel_L1_M2_42: +.Lzgemm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M2_42 + bgt .Lzgemm_kernel_L1_M2_42 -zgemm_kernel_L1_M2_100: +.Lzgemm_kernel_L1_M2_100: SAVE2x1 -zgemm_kernel_L1_M2_END: +.Lzgemm_kernel_L1_M2_END: -zgemm_kernel_L1_M1_BEGIN: +.Lzgemm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble zgemm_kernel_L1_END + ble .Lzgemm_kernel_L1_END -zgemm_kernel_L1_M1_20: +.Lzgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble zgemm_kernel_L1_M1_40 + ble .Lzgemm_kernel_L1_M1_40 -zgemm_kernel_L1_M1_22: +.Lzgemm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1655,30 +1655,30 @@ zgemm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M1_22 + bgt .Lzgemm_kernel_L1_M1_22 -zgemm_kernel_L1_M1_40: +.Lzgemm_kernel_L1_M1_40: ands counterL , origK, #7 // counterL = counterL % 8 - ble zgemm_kernel_L1_M1_100 + ble .Lzgemm_kernel_L1_M1_100 -zgemm_kernel_L1_M1_42: +.Lzgemm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt zgemm_kernel_L1_M1_42 + bgt .Lzgemm_kernel_L1_M1_42 -zgemm_kernel_L1_M1_100: +.Lzgemm_kernel_L1_M1_100: SAVE1x1 -zgemm_kernel_L1_END: +.Lzgemm_kernel_L1_END: -zgemm_kernel_L999: +.Lzgemm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)] diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S index a28d1b0cee..28afcada5a 100644 --- a/kernel/arm64/zgemv_n.S +++ b/kernel/arm64/zgemv_n.S @@ -364,9 +364,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE_REGS cmp N, xzr - ble zgemv_n_kernel_L999 + ble .Lzgemv_n_kernel_L999 cmp M, xzr - ble zgemv_n_kernel_L999 + ble .Lzgemv_n_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_X, INC_X, #SHZ @@ -375,9 +375,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. INIT cmp INC_Y, #1 - bne zgemv_n_kernel_S_BEGIN + bne .Lzgemv_n_kernel_S_BEGIN -zgemv_n_kernel_F_LOOP: +.Lzgemv_n_kernel_F_LOOP: mov A_PTR, A mov Y_IPTR, Y mov Y_OPTR, Y @@ -387,40 +387,40 @@ zgemv_n_kernel_F_LOOP: asr I, M, #2 cmp I, xzr - beq zgemv_n_kernel_F1 + beq .Lzgemv_n_kernel_F1 -zgemv_n_kernel_F4: +.Lzgemv_n_kernel_F4: KERNEL_F4 subs I, I, #1 - bne zgemv_n_kernel_F4 + bne .Lzgemv_n_kernel_F4 -zgemv_n_kernel_F1: +.Lzgemv_n_kernel_F1: ands I, M, #3 - ble zgemv_n_kernel_F_END + ble .Lzgemv_n_kernel_F_END -zgemv_n_kernel_F10: +.Lzgemv_n_kernel_F10: KERNEL_F1 subs I, I, #1 - bne zgemv_n_kernel_F10 + bne .Lzgemv_n_kernel_F10 -zgemv_n_kernel_F_END: +.Lzgemv_n_kernel_F_END: add A, A, LDA subs J, J, #1 - bne zgemv_n_kernel_F_LOOP + bne .Lzgemv_n_kernel_F_LOOP - b zgemv_n_kernel_L999 + b .Lzgemv_n_kernel_L999 -zgemv_n_kernel_S_BEGIN: +.Lzgemv_n_kernel_S_BEGIN: INIT_S -zgemv_n_kernel_S_LOOP: +.Lzgemv_n_kernel_S_LOOP: mov A_PTR, A mov Y_IPTR, Y mov Y_OPTR, Y @@ -430,9 +430,9 @@ zgemv_n_kernel_S_LOOP: asr I, M, #2 cmp I, xzr - ble zgemv_n_kernel_S1 + ble .Lzgemv_n_kernel_S1 -zgemv_n_kernel_S4: +.Lzgemv_n_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -440,27 +440,27 @@ zgemv_n_kernel_S4: KERNEL_S1 subs I, I, #1 - bne zgemv_n_kernel_S4 + bne .Lzgemv_n_kernel_S4 -zgemv_n_kernel_S1: +.Lzgemv_n_kernel_S1: ands I, M, #3 - ble zgemv_n_kernel_S_END + ble .Lzgemv_n_kernel_S_END -zgemv_n_kernel_S10: +.Lzgemv_n_kernel_S10: KERNEL_S1 subs I, I, #1 - bne zgemv_n_kernel_S10 + bne .Lzgemv_n_kernel_S10 -zgemv_n_kernel_S_END: +.Lzgemv_n_kernel_S_END: add A, A, LDA subs J, J, #1 - bne zgemv_n_kernel_S_LOOP + bne .Lzgemv_n_kernel_S_LOOP -zgemv_n_kernel_L999: +.Lzgemv_n_kernel_L999: RESTORE_REGS mov w0, wzr diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S index 79ce9bcf28..0151029c77 100644 --- a/kernel/arm64/zgemv_t.S +++ b/kernel/arm64/zgemv_t.S @@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. SAVE_REGS cmp N, xzr - ble zgemv_t_kernel_L999 + ble .Lzgemv_t_kernel_L999 cmp M, xzr - ble zgemv_t_kernel_L999 + ble .Lzgemv_t_kernel_L999 lsl LDA, LDA, #SHZ lsl INC_Y, INC_Y, #SHZ @@ -303,9 +303,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. INIT cmp INC_X, #1 - bne zgemv_t_kernel_S_BEGIN + bne .Lzgemv_t_kernel_S_BEGIN -zgemv_t_kernel_F_LOOP: +.Lzgemv_t_kernel_F_LOOP: mov A_PTR, A mov X_PTR, X @@ -314,30 +314,30 @@ zgemv_t_kernel_F_LOOP: asr I, M, #2 cmp I, xzr - beq zgemv_t_kernel_F1 + beq .Lzgemv_t_kernel_F1 -zgemv_t_kernel_F4: +.Lzgemv_t_kernel_F4: KERNEL_F4 subs I, I, #1 - bne zgemv_t_kernel_F4 + bne .Lzgemv_t_kernel_F4 KERNEL_F4_FINALIZE -zgemv_t_kernel_F1: +.Lzgemv_t_kernel_F1: ands I, M, #3 - ble zgemv_t_kernel_F_END + ble .Lzgemv_t_kernel_F_END -zgemv_t_kernel_F10: +.Lzgemv_t_kernel_F10: KERNEL_F1 subs I, I, #1 - bne zgemv_t_kernel_F10 + bne .Lzgemv_t_kernel_F10 -zgemv_t_kernel_F_END: +.Lzgemv_t_kernel_F_END: #if !defined(DOUBLE) ld1 {v4.2s}, [Y] @@ -355,15 +355,15 @@ zgemv_t_kernel_F_END: add A, A, LDA subs J, J, #1 - bne zgemv_t_kernel_F_LOOP + bne .Lzgemv_t_kernel_F_LOOP - b zgemv_t_kernel_L999 + b .Lzgemv_t_kernel_L999 -zgemv_t_kernel_S_BEGIN: +.Lzgemv_t_kernel_S_BEGIN: INIT_S -zgemv_t_kernel_S_LOOP: +.Lzgemv_t_kernel_S_LOOP: mov A_PTR, A mov X_PTR, X @@ -371,9 +371,9 @@ zgemv_t_kernel_S_LOOP: asr I, M, #2 cmp I, xzr - ble zgemv_t_kernel_S1 + ble .Lzgemv_t_kernel_S1 -zgemv_t_kernel_S4: +.Lzgemv_t_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -381,21 +381,21 @@ zgemv_t_kernel_S4: KERNEL_S1 subs I, I, #1 - bne zgemv_t_kernel_S4 + bne .Lzgemv_t_kernel_S4 -zgemv_t_kernel_S1: +.Lzgemv_t_kernel_S1: ands I, M, #3 - ble zgemv_t_kernel_S_END + ble .Lzgemv_t_kernel_S_END -zgemv_t_kernel_S10: +.Lzgemv_t_kernel_S10: KERNEL_S1 subs I, I, #1 - bne zgemv_t_kernel_S10 + bne .Lzgemv_t_kernel_S10 -zgemv_t_kernel_S_END: +.Lzgemv_t_kernel_S_END: #if !defined(DOUBLE) ld1 {v4.2s}, [Y] @@ -413,9 +413,9 @@ zgemv_t_kernel_S_END: add A, A, LDA subs J, J, #1 - bne zgemv_t_kernel_S_LOOP + bne .Lzgemv_t_kernel_S_LOOP -zgemv_t_kernel_L999: +.Lzgemv_t_kernel_L999: RESTORE_REGS mov w0, wzr ret diff --git a/kernel/arm64/znrm2.S b/kernel/arm64/znrm2.S index 1360dc9932..1c89685ea6 100644 --- a/kernel/arm64/znrm2.S +++ b/kernel/arm64/znrm2.S @@ -226,43 +226,43 @@ KERNEL_S1_END_\@: INIT cmp N, #0 - ble nrm2_kernel_L999 + ble .Lznrm2_kernel_L999 cmp INC_X, #0 - beq nrm2_kernel_L999 + beq .Lznrm2_kernel_L999 cmp INC_X, #1 - bne nrm2_kernel_S_BEGIN + bne .Lznrm2_kernel_S_BEGIN -nrm2_kernel_F_BEGIN: +.Lznrm2_kernel_F_BEGIN: asr I, N, #3 // I = N / 8 cmp I, xzr - ble nrm2_kernel_F1 + ble .Lznrm2_kernel_F1 -nrm2_kernel_F8: +.Lznrm2_kernel_F8: KERNEL_F8 subs I, I, #1 - bne nrm2_kernel_F8 + bne .Lznrm2_kernel_F8 -nrm2_kernel_F1: +.Lznrm2_kernel_F1: ands I, N, #7 - ble nrm2_kernel_L999 + ble .Lznrm2_kernel_L999 -nrm2_kernel_F10: +.Lznrm2_kernel_F10: KERNEL_F1 subs I, I, #1 - bne nrm2_kernel_F10 + bne .Lznrm2_kernel_F10 - b nrm2_kernel_L999 + b .Lznrm2_kernel_L999 -nrm2_kernel_S_BEGIN: +.Lznrm2_kernel_S_BEGIN: INIT_S @@ -270,15 +270,15 @@ nrm2_kernel_S_BEGIN: .align 5 -nrm2_kernel_S10: +.Lznrm2_kernel_S10: KERNEL_S1 subs I, I, #1 - bne nrm2_kernel_S10 + bne .Lznrm2_kernel_S10 -nrm2_kernel_L999: +.Lznrm2_kernel_L999: fsqrt SSQ, SSQ fmul SSQ, SCALE, SSQ diff --git a/kernel/arm64/zrot.S b/kernel/arm64/zrot.S index 90f138a193..b5e510ebea 100644 --- a/kernel/arm64/zrot.S +++ b/kernel/arm64/zrot.S @@ -181,54 +181,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE cmp N, xzr - ble rot_kernel_L999 + ble .Lzrot_kernel_L999 INIT cmp INC_X, #1 - bne rot_kernel_S_BEGIN + bne .Lzrot_kernel_S_BEGIN cmp INC_Y, #1 - bne rot_kernel_S_BEGIN + bne .Lzrot_kernel_S_BEGIN -rot_kernel_F_BEGIN: +.Lzrot_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq rot_kernel_F1 + beq .Lzrot_kernel_F1 KERNEL_INIT_F4 -rot_kernel_F4: +.Lzrot_kernel_F4: KERNEL_F4 subs I, I, #1 - bne rot_kernel_F4 + bne .Lzrot_kernel_F4 -rot_kernel_F1: +.Lzrot_kernel_F1: ands I, N, #3 - ble rot_kernel_L999 + ble .Lzrot_kernel_L999 -rot_kernel_F10: +.Lzrot_kernel_F10: KERNEL_F1 subs I, I, #1 - bne rot_kernel_F10 + bne .Lzrot_kernel_F10 mov w0, wzr ret -rot_kernel_S_BEGIN: +.Lzrot_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble rot_kernel_S1 + ble .Lzrot_kernel_S1 -rot_kernel_S4: +.Lzrot_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -236,21 +236,21 @@ rot_kernel_S4: KERNEL_S1 subs I, I, #1 - bne rot_kernel_S4 + bne .Lzrot_kernel_S4 -rot_kernel_S1: +.Lzrot_kernel_S1: ands I, N, #3 - ble rot_kernel_L999 + ble .Lzrot_kernel_L999 -rot_kernel_S10: +.Lzrot_kernel_S10: KERNEL_S1 subs I, I, #1 - bne rot_kernel_S10 + bne .Lzrot_kernel_S10 -rot_kernel_L999: +.Lzrot_kernel_L999: mov w0, wzr ret diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S index daaa55e9d2..929455975d 100644 --- a/kernel/arm64/zscal.S +++ b/kernel/arm64/zscal.S @@ -215,71 +215,71 @@ zscal_begin: mov X_COPY, X cmp N, xzr - ble zscal_kernel_L999 + ble .Lzscal_kernel_L999 fcmp DA_R, #0.0 - bne zscal_kernel_R_non_zero + bne .Lzscal_kernel_R_non_zero fcmp DA_I, #0.0 - beq zscal_kernel_RI_zero + beq .Lzscal_kernel_RI_zero - b zscal_kernel_R_zero + b .Lzscal_kernel_R_zero -zscal_kernel_R_non_zero: +.Lzscal_kernel_R_non_zero: fcmp DA_I, #0.0 - beq zscal_kernel_I_zero + beq .Lzscal_kernel_I_zero /******************************************************************************* * A_R != 0 && A_I != 0 *******************************************************************************/ -zscal_kernel_RI_non_zero: +.Lzscal_kernel_RI_non_zero: INIT cmp INC_X, #1 - bne zscal_kernel_S_BEGIN + bne .Lzscal_kernel_S_BEGIN -zscal_kernel_F_BEGIN: +.Lzscal_kernel_F_BEGIN: asr I, N, #2 cmp I, xzr - beq zscal_kernel_F1 + beq .Lzscal_kernel_F1 KERNEL_INIT_F4 -zscal_kernel_F4: +.Lzscal_kernel_F4: KERNEL_F4 subs I, I, #1 - bne zscal_kernel_F4 + bne .Lzscal_kernel_F4 -zscal_kernel_F1: +.Lzscal_kernel_F1: ands I, N, #3 - ble zscal_kernel_L999 + ble .Lzscal_kernel_L999 -zscal_kernel_F10: +.Lzscal_kernel_F10: KERNEL_F1 subs I, I, #1 - bne zscal_kernel_F10 + bne .Lzscal_kernel_F10 mov w0, wzr ret -zscal_kernel_S_BEGIN: +.Lzscal_kernel_S_BEGIN: INIT_S asr I, N, #2 cmp I, xzr - ble zscal_kernel_S1 + ble .Lzscal_kernel_S1 -zscal_kernel_S4: +.Lzscal_kernel_S4: KERNEL_S1 KERNEL_S1 @@ -287,21 +287,21 @@ zscal_kernel_S4: KERNEL_S1 subs I, I, #1 - bne zscal_kernel_S4 + bne .Lzscal_kernel_S4 -zscal_kernel_S1: +.Lzscal_kernel_S1: ands I, N, #3 - ble zscal_kernel_L999 + ble .Lzscal_kernel_L999 -zscal_kernel_S10: +.Lzscal_kernel_S10: KERNEL_S1 subs I, I, #1 - bne zscal_kernel_S10 + bne .Lzscal_kernel_S10 -zscal_kernel_L999: +.Lzscal_kernel_L999: mov w0, wzr ret @@ -310,7 +310,7 @@ zscal_kernel_L999: * A_R == 0 && A_I != 0 *******************************************************************************/ -zscal_kernel_R_zero: +.Lzscal_kernel_R_zero: INIT_S #if !defined(DOUBLE) @@ -323,7 +323,7 @@ zscal_kernel_R_zero: ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I #endif -zscal_kernel_R_zero_1: +.Lzscal_kernel_R_zero_1: #if !defined(DOUBLE) ld1 {v2.2s}, [X] // X1, X0 fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0 @@ -337,7 +337,7 @@ zscal_kernel_R_zero_1: #endif add X, X, INC_X subs N, N, #1 - bne zscal_kernel_R_zero_1 + bne .Lzscal_kernel_R_zero_1 mov w0, wzr ret @@ -346,7 +346,7 @@ zscal_kernel_R_zero_1: * A_R != 0 && A_I == 0 *******************************************************************************/ -zscal_kernel_I_zero: +.Lzscal_kernel_I_zero: INIT_S #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R @@ -354,7 +354,7 @@ zscal_kernel_I_zero: ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R #endif -zscal_kernel_I_zero_1: +.Lzscal_kernel_I_zero_1: #if !defined(DOUBLE) ld1 {v2.2s}, [X] // X1, X0 fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 @@ -366,7 +366,7 @@ zscal_kernel_I_zero_1: #endif add X, X, INC_X subs N, N, #1 - bne zscal_kernel_I_zero_1 + bne .Lzscal_kernel_I_zero_1 mov w0, wzr ret @@ -375,16 +375,16 @@ zscal_kernel_I_zero_1: * A_R == 0 && A_I == 0 *******************************************************************************/ -zscal_kernel_RI_zero: +.Lzscal_kernel_RI_zero: INIT_S -zscal_kernel_RI_zero_1: +.Lzscal_kernel_RI_zero_1: stp DA_R, DA_I, [X] add X, X, INC_X subs N, N, #1 - bne zscal_kernel_RI_zero_1 + bne .Lzscal_kernel_RI_zero_1 mov w0, wzr ret diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 77a7857ffe..462acfe2b5 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -1078,9 +1078,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mov counterJ, origN asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 - ble ztrmm_kernel_L2_BEGIN + ble .Lztrmm_kernel_L2_BEGIN -ztrmm_kernel_L4_BEGIN: +.Lztrmm_kernel_L4_BEGIN: mov pCRow0, pC add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC @@ -1094,15 +1094,15 @@ ztrmm_kernel_L4_BEGIN: #endif mov pA, origPA // pA = start of A array -ztrmm_kernel_L4_M4_BEGIN: +.Lztrmm_kernel_L4_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble ztrmm_kernel_L4_M2_BEGIN + ble .Lztrmm_kernel_L4_M2_BEGIN .align 5 -ztrmm_kernel_L4_M4_20: +.Lztrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1123,7 +1123,7 @@ ztrmm_kernel_L4_M4_20: asr counterL , tempK, #3 cmp counterL , #2 - blt ztrmm_kernel_L4_M4_32 + blt .Lztrmm_kernel_L4_M4_32 KERNEL4x4_I KERNEL4x4_M2 @@ -1135,10 +1135,10 @@ ztrmm_kernel_L4_M4_20: KERNEL4x4_M2 subs counterL, counterL, #2 - ble ztrmm_kernel_L4_M4_22a + ble .Lztrmm_kernel_L4_M4_22a .align 5 -ztrmm_kernel_L4_M4_22: +.Lztrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 @@ -1150,10 +1150,10 @@ ztrmm_kernel_L4_M4_22: KERNEL4x4_M2 subs counterL, counterL, #1 - bgt ztrmm_kernel_L4_M4_22 + bgt .Lztrmm_kernel_L4_M4_22 .align 5 -ztrmm_kernel_L4_M4_22a: +.Lztrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_M2 @@ -1164,13 +1164,13 @@ ztrmm_kernel_L4_M4_22a: KERNEL4x4_M1 KERNEL4x4_E - b ztrmm_kernel_L4_M4_44 + b .Lztrmm_kernel_L4_M4_44 .align 5 -ztrmm_kernel_L4_M4_32: +.Lztrmm_kernel_L4_M4_32: tst counterL, #1 - ble ztrmm_kernel_L4_M4_40 + ble .Lztrmm_kernel_L4_M4_40 KERNEL4x4_I KERNEL4x4_M2 @@ -1181,26 +1181,26 @@ ztrmm_kernel_L4_M4_32: KERNEL4x4_M1 KERNEL4x4_E - b ztrmm_kernel_L4_M4_44 + b .Lztrmm_kernel_L4_M4_44 -ztrmm_kernel_L4_M4_40: +.Lztrmm_kernel_L4_M4_40: INIT4x4 -ztrmm_kernel_L4_M4_44: +.Lztrmm_kernel_L4_M4_44: ands counterL , tempK, #7 - ble ztrmm_kernel_L4_M4_100 + ble .Lztrmm_kernel_L4_M4_100 .align 5 -ztrmm_kernel_L4_M4_46: +.Lztrmm_kernel_L4_M4_46: KERNEL4x4_SUB subs counterL, counterL, #1 - bne ztrmm_kernel_L4_M4_46 + bne .Lztrmm_kernel_L4_M4_46 -ztrmm_kernel_L4_M4_100: +.Lztrmm_kernel_L4_M4_100: SAVE4x4 @@ -1223,20 +1223,20 @@ ztrmm_kernel_L4_M4_100: prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [origPB] -ztrmm_kernel_L4_M4_END: +.Lztrmm_kernel_L4_M4_END: subs counterI, counterI, #1 - bne ztrmm_kernel_L4_M4_20 + bne .Lztrmm_kernel_L4_M4_20 -ztrmm_kernel_L4_M2_BEGIN: +.Lztrmm_kernel_L4_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ztrmm_kernel_L4_END + ble .Lztrmm_kernel_L4_END tst counterI, #2 // counterI = counterI / 2 - ble ztrmm_kernel_L4_M1_BEGIN + ble .Lztrmm_kernel_L4_M1_BEGIN -ztrmm_kernel_L4_M2_20: +.Lztrmm_kernel_L4_M2_20: INIT2x4 @@ -1260,9 +1260,9 @@ ztrmm_kernel_L4_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ztrmm_kernel_L4_M2_40 + ble .Lztrmm_kernel_L4_M2_40 -ztrmm_kernel_L4_M2_22: +.Lztrmm_kernel_L4_M2_22: KERNEL2x4_SUB KERNEL2x4_SUB @@ -1275,22 +1275,22 @@ ztrmm_kernel_L4_M2_22: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L4_M2_22 + bgt .Lztrmm_kernel_L4_M2_22 -ztrmm_kernel_L4_M2_40: +.Lztrmm_kernel_L4_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L4_M2_100 + ble .Lztrmm_kernel_L4_M2_100 -ztrmm_kernel_L4_M2_42: +.Lztrmm_kernel_L4_M2_42: KERNEL2x4_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L4_M2_42 + bgt .Lztrmm_kernel_L4_M2_42 -ztrmm_kernel_L4_M2_100: +.Lztrmm_kernel_L4_M2_100: SAVE2x4 @@ -1310,15 +1310,15 @@ ztrmm_kernel_L4_M2_100: add tempOffset, tempOffset, #2 #endif -ztrmm_kernel_L4_M2_END: +.Lztrmm_kernel_L4_M2_END: -ztrmm_kernel_L4_M1_BEGIN: +.Lztrmm_kernel_L4_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ztrmm_kernel_L4_END + ble .Lztrmm_kernel_L4_END -ztrmm_kernel_L4_M1_20: +.Lztrmm_kernel_L4_M1_20: INIT1x4 @@ -1342,9 +1342,9 @@ ztrmm_kernel_L4_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ztrmm_kernel_L4_M1_40 + ble .Lztrmm_kernel_L4_M1_40 -ztrmm_kernel_L4_M1_22: +.Lztrmm_kernel_L4_M1_22: KERNEL1x4_SUB KERNEL1x4_SUB KERNEL1x4_SUB @@ -1356,22 +1356,22 @@ ztrmm_kernel_L4_M1_22: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L4_M1_22 + bgt .Lztrmm_kernel_L4_M1_22 -ztrmm_kernel_L4_M1_40: +.Lztrmm_kernel_L4_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L4_M1_100 + ble .Lztrmm_kernel_L4_M1_100 -ztrmm_kernel_L4_M1_42: +.Lztrmm_kernel_L4_M1_42: KERNEL1x4_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L4_M1_42 + bgt .Lztrmm_kernel_L4_M1_42 -ztrmm_kernel_L4_M1_100: +.Lztrmm_kernel_L4_M1_100: SAVE1x4 @@ -1392,7 +1392,7 @@ ztrmm_kernel_L4_M1_100: #endif -ztrmm_kernel_L4_END: +.Lztrmm_kernel_L4_END: lsl temp, origK, #6 add origPB, origPB, temp // B = B + K * 4 * 8 * 2 @@ -1402,19 +1402,19 @@ ztrmm_kernel_L4_END: #endif subs counterJ, counterJ , #1 // j-- - bgt ztrmm_kernel_L4_BEGIN + bgt .Lztrmm_kernel_L4_BEGIN /******************************************************************************/ -ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction +.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble ztrmm_kernel_L999 // error, N was less than 4? + ble .Lztrmm_kernel_L999 // error, N was less than 4? tst counterJ , #2 - ble ztrmm_kernel_L1_BEGIN + ble .Lztrmm_kernel_L1_BEGIN mov pCRow0, pC // pCRow0 = pC @@ -1426,14 +1426,14 @@ ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction mov pA, origPA // pA = A -ztrmm_kernel_L2_M4_BEGIN: +.Lztrmm_kernel_L2_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 - ble ztrmm_kernel_L2_M2_BEGIN + ble .Lztrmm_kernel_L2_M2_BEGIN -ztrmm_kernel_L2_M4_20: +.Lztrmm_kernel_L2_M4_20: INIT4x2 @@ -1457,10 +1457,10 @@ ztrmm_kernel_L2_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ztrmm_kernel_L2_M4_40 + ble .Lztrmm_kernel_L2_M4_40 .align 5 -ztrmm_kernel_L2_M4_22: +.Lztrmm_kernel_L2_M4_22: KERNEL4x2_SUB KERNEL4x2_SUB KERNEL4x2_SUB @@ -1472,22 +1472,22 @@ ztrmm_kernel_L2_M4_22: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M4_22 + bgt .Lztrmm_kernel_L2_M4_22 -ztrmm_kernel_L2_M4_40: +.Lztrmm_kernel_L2_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L2_M4_100 + ble .Lztrmm_kernel_L2_M4_100 -ztrmm_kernel_L2_M4_42: +.Lztrmm_kernel_L2_M4_42: KERNEL4x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M4_42 + bgt .Lztrmm_kernel_L2_M4_42 -ztrmm_kernel_L2_M4_100: +.Lztrmm_kernel_L2_M4_100: SAVE4x2 @@ -1507,22 +1507,22 @@ ztrmm_kernel_L2_M4_100: add tempOffset, tempOffset, #4 #endif -ztrmm_kernel_L2_M4_END: +.Lztrmm_kernel_L2_M4_END: subs counterI, counterI, #1 - bgt ztrmm_kernel_L2_M4_20 + bgt .Lztrmm_kernel_L2_M4_20 -ztrmm_kernel_L2_M2_BEGIN: +.Lztrmm_kernel_L2_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ztrmm_kernel_L2_END + ble .Lztrmm_kernel_L2_END tst counterI, #2 // counterI = counterI / 2 - ble ztrmm_kernel_L2_M1_BEGIN + ble .Lztrmm_kernel_L2_M1_BEGIN -ztrmm_kernel_L2_M2_20: +.Lztrmm_kernel_L2_M2_20: INIT2x2 @@ -1546,9 +1546,9 @@ ztrmm_kernel_L2_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL,#0 - ble ztrmm_kernel_L2_M2_40 + ble .Lztrmm_kernel_L2_M2_40 -ztrmm_kernel_L2_M2_22: +.Lztrmm_kernel_L2_M2_22: KERNEL2x2_SUB KERNEL2x2_SUB @@ -1561,22 +1561,22 @@ ztrmm_kernel_L2_M2_22: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M2_22 + bgt .Lztrmm_kernel_L2_M2_22 -ztrmm_kernel_L2_M2_40: +.Lztrmm_kernel_L2_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L2_M2_100 + ble .Lztrmm_kernel_L2_M2_100 -ztrmm_kernel_L2_M2_42: +.Lztrmm_kernel_L2_M2_42: KERNEL2x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M2_42 + bgt .Lztrmm_kernel_L2_M2_42 -ztrmm_kernel_L2_M2_100: +.Lztrmm_kernel_L2_M2_100: SAVE2x2 @@ -1596,15 +1596,15 @@ ztrmm_kernel_L2_M2_100: add tempOffset, tempOffset, #2 #endif -ztrmm_kernel_L2_M2_END: +.Lztrmm_kernel_L2_M2_END: -ztrmm_kernel_L2_M1_BEGIN: +.Lztrmm_kernel_L2_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ztrmm_kernel_L2_END + ble .Lztrmm_kernel_L2_END -ztrmm_kernel_L2_M1_20: +.Lztrmm_kernel_L2_M1_20: INIT1x2 @@ -1628,9 +1628,9 @@ ztrmm_kernel_L2_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL, #0 - ble ztrmm_kernel_L2_M1_40 + ble .Lztrmm_kernel_L2_M1_40 -ztrmm_kernel_L2_M1_22: +.Lztrmm_kernel_L2_M1_22: KERNEL1x2_SUB KERNEL1x2_SUB KERNEL1x2_SUB @@ -1642,22 +1642,22 @@ ztrmm_kernel_L2_M1_22: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M1_22 + bgt .Lztrmm_kernel_L2_M1_22 -ztrmm_kernel_L2_M1_40: +.Lztrmm_kernel_L2_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L2_M1_100 + ble .Lztrmm_kernel_L2_M1_100 -ztrmm_kernel_L2_M1_42: +.Lztrmm_kernel_L2_M1_42: KERNEL1x2_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L2_M1_42 + bgt .Lztrmm_kernel_L2_M1_42 -ztrmm_kernel_L2_M1_100: +.Lztrmm_kernel_L2_M1_100: SAVE1x2 @@ -1678,7 +1678,7 @@ ztrmm_kernel_L2_M1_100: #endif -ztrmm_kernel_L2_END: +.Lztrmm_kernel_L2_END: #if !defined(LEFT) add tempOffset, tempOffset, #2 #endif @@ -1688,11 +1688,11 @@ ztrmm_kernel_L2_END: /******************************************************************************/ -ztrmm_kernel_L1_BEGIN: +.Lztrmm_kernel_L1_BEGIN: mov counterJ , origN tst counterJ , #1 - ble ztrmm_kernel_L999 // done + ble .Lztrmm_kernel_L999 // done mov pCRow0, pC // pCRow0 = C @@ -1706,14 +1706,14 @@ ztrmm_kernel_L1_BEGIN: -ztrmm_kernel_L1_M4_BEGIN: +.Lztrmm_kernel_L1_M4_BEGIN: mov counterI, origM asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 - ble ztrmm_kernel_L1_M2_BEGIN + ble .Lztrmm_kernel_L1_M2_BEGIN -ztrmm_kernel_L1_M4_20: +.Lztrmm_kernel_L1_M4_20: INIT4x1 @@ -1737,10 +1737,10 @@ ztrmm_kernel_L1_M4_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ztrmm_kernel_L1_M4_40 + ble .Lztrmm_kernel_L1_M4_40 .align 5 -ztrmm_kernel_L1_M4_22: +.Lztrmm_kernel_L1_M4_22: KERNEL4x1_SUB KERNEL4x1_SUB KERNEL4x1_SUB @@ -1752,22 +1752,22 @@ ztrmm_kernel_L1_M4_22: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M4_22 + bgt .Lztrmm_kernel_L1_M4_22 -ztrmm_kernel_L1_M4_40: +.Lztrmm_kernel_L1_M4_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L1_M4_100 + ble .Lztrmm_kernel_L1_M4_100 -ztrmm_kernel_L1_M4_42: +.Lztrmm_kernel_L1_M4_42: KERNEL4x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M4_42 + bgt .Lztrmm_kernel_L1_M4_42 -ztrmm_kernel_L1_M4_100: +.Lztrmm_kernel_L1_M4_100: SAVE4x1 @@ -1787,22 +1787,22 @@ ztrmm_kernel_L1_M4_100: add tempOffset, tempOffset, #4 #endif -ztrmm_kernel_L1_M4_END: +.Lztrmm_kernel_L1_M4_END: subs counterI, counterI, #1 - bgt ztrmm_kernel_L1_M4_20 + bgt .Lztrmm_kernel_L1_M4_20 -ztrmm_kernel_L1_M2_BEGIN: +.Lztrmm_kernel_L1_M2_BEGIN: mov counterI, origM tst counterI , #3 - ble ztrmm_kernel_L1_END + ble .Lztrmm_kernel_L1_END tst counterI, #2 // counterI = counterI / 2 - ble ztrmm_kernel_L1_M1_BEGIN + ble .Lztrmm_kernel_L1_M1_BEGIN -ztrmm_kernel_L1_M2_20: +.Lztrmm_kernel_L1_M2_20: INIT2x1 @@ -1826,9 +1826,9 @@ ztrmm_kernel_L1_M2_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ztrmm_kernel_L1_M2_40 + ble .Lztrmm_kernel_L1_M2_40 -ztrmm_kernel_L1_M2_22: +.Lztrmm_kernel_L1_M2_22: KERNEL2x1_SUB KERNEL2x1_SUB @@ -1841,22 +1841,22 @@ ztrmm_kernel_L1_M2_22: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M2_22 + bgt .Lztrmm_kernel_L1_M2_22 -ztrmm_kernel_L1_M2_40: +.Lztrmm_kernel_L1_M2_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L1_M2_100 + ble .Lztrmm_kernel_L1_M2_100 -ztrmm_kernel_L1_M2_42: +.Lztrmm_kernel_L1_M2_42: KERNEL2x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M2_42 + bgt .Lztrmm_kernel_L1_M2_42 -ztrmm_kernel_L1_M2_100: +.Lztrmm_kernel_L1_M2_100: SAVE2x1 @@ -1876,15 +1876,15 @@ ztrmm_kernel_L1_M2_100: add tempOffset, tempOffset, #2 #endif -ztrmm_kernel_L1_M2_END: +.Lztrmm_kernel_L1_M2_END: -ztrmm_kernel_L1_M1_BEGIN: +.Lztrmm_kernel_L1_M1_BEGIN: tst counterI, #1 // counterI = counterI % 2 - ble ztrmm_kernel_L1_END + ble .Lztrmm_kernel_L1_END -ztrmm_kernel_L1_M1_20: +.Lztrmm_kernel_L1_M1_20: INIT1x1 @@ -1908,9 +1908,9 @@ ztrmm_kernel_L1_M1_20: asr counterL , tempK, #3 // counterL = counterL / 8 cmp counterL , #0 - ble ztrmm_kernel_L1_M1_40 + ble .Lztrmm_kernel_L1_M1_40 -ztrmm_kernel_L1_M1_22: +.Lztrmm_kernel_L1_M1_22: KERNEL1x1_SUB KERNEL1x1_SUB KERNEL1x1_SUB @@ -1922,30 +1922,30 @@ ztrmm_kernel_L1_M1_22: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M1_22 + bgt .Lztrmm_kernel_L1_M1_22 -ztrmm_kernel_L1_M1_40: +.Lztrmm_kernel_L1_M1_40: ands counterL , tempK, #7 // counterL = counterL % 8 - ble ztrmm_kernel_L1_M1_100 + ble .Lztrmm_kernel_L1_M1_100 -ztrmm_kernel_L1_M1_42: +.Lztrmm_kernel_L1_M1_42: KERNEL1x1_SUB subs counterL, counterL, #1 - bgt ztrmm_kernel_L1_M1_42 + bgt .Lztrmm_kernel_L1_M1_42 -ztrmm_kernel_L1_M1_100: +.Lztrmm_kernel_L1_M1_100: SAVE1x1 -ztrmm_kernel_L1_END: +.Lztrmm_kernel_L1_END: -ztrmm_kernel_L999: +.Lztrmm_kernel_L999: mov x0, #0 // set return value ldp d8, d9, [sp, #(0 * 16)] ldp d10, d11, [sp, #(1 * 16)]