diff --git a/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c b/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c index dba0e88b98..15ee67a5be 100644 --- a/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c +++ b/kernels/x86_64/knl/1m/bli_packm_opt_24x8.c @@ -1,6 +1,6 @@ /* - BLIS + BLIS An object-based framework for developing high-performance BLAS-like libraries. @@ -105,16 +105,22 @@ extern int32_t offsets[24]; void bli_dpackm_8xk_opt ( conj_t conja, - dim_t n, + dim_t n_, void* restrict kappa_, - void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp + void* restrict a_, inc_t inca_, inc_t lda_, + void* restrict p_, inc_t ldp_ ) { + (void)conja; + const int32_t * offsetPtr = &offsets[0]; double* a = (double*)a_; double* p = (double*)p_; double* kappa = (double*)kappa_; + const int64_t n = n_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; __asm__ volatile ( @@ -291,16 +297,22 @@ void bli_dpackm_8xk_opt void bli_dpackm_24xk_opt ( conj_t conja, - dim_t n, + dim_t n_, void* restrict kappa_, - void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp + void* restrict a_, inc_t inca_, inc_t lda_, + void* restrict p_, inc_t ldp_ ) { + (void)conja; + const int32_t * offsetPtr = &offsets[0]; double* a = (double*)a_; double* p = (double*)p_; double* kappa = (double*)kappa_; + const int64_t n = n_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; __asm__ volatile ( diff --git a/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c b/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c index eeab3c71dd..181c5deabe 100644 --- a/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c +++ b/kernels/x86_64/knl/1m/bli_packm_opt_30x8.c @@ -1,6 +1,6 @@ /* - BLIS + BLIS An object-based framework for developing high-performance BLAS-like libraries. @@ -133,16 +133,22 @@ extern int32_t offsets[32]; void bli_dpackm_30xk_opt ( conj_t conja, - dim_t n, + dim_t n_, void* restrict kappa_, - void* restrict a_, inc_t inca, inc_t lda, - void* restrict p_, inc_t ldp + void* restrict a_, inc_t inca_, inc_t lda_, + void* restrict p_, inc_t ldp_ ) { + (void)conja; + const int32_t * offsetPtr = &offsets[0]; double* a = (double*)a_; double* p = (double*)p_; double* kappa = (double*)kappa_; + const int64_t n = n_; + const int64_t inca = inca_; + const int64_t lda = lda_; + const int64_t ldp = ldp_; __asm__ volatile ( diff --git a/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c b/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c index 492e2009e3..71e6d93271 100644 --- a/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c +++ b/kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c @@ -1,6 +1,6 @@ /* - BLIS + BLIS An object-based framework for developing high-performance BLAS-like libraries. @@ -181,22 +181,26 @@ extern int32_t offsets[24]; //#define MONITORS //#define LOOPMON void bli_dgemm_opt_24x8( - dim_t k, + dim_t k_, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, - double* restrict c, inc_t rs_c, inc_t cs_c, + double* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* data, cntx_t* restrict cntx ) { + (void)data; + (void)cntx; + const double * a_next = bli_auxinfo_next_a( data ); const double * b_next = bli_auxinfo_next_b( data ); const int32_t * offsetPtr = &offsets[0]; - - uint64_t k64 = k; + const int64_t k = k_; + const int64_t rs_c = rs_c_; + const int64_t cs_c = cs_c_; #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; @@ -204,7 +208,7 @@ void bli_dgemm_opt_24x8( #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif - + __asm__ volatile ( #ifdef MONITORS @@ -223,22 +227,22 @@ void bli_dgemm_opt_24x8( VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr)) VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI)) #if SCATTER_PREFETCH_C - VMOVAPS(ZMM(17), ZMM(8)) - VMOVAPS(ZMM(18), ZMM(8)) + VMOVAPS(ZMM(17), ZMM(8)) + VMOVAPS(ZMM(18), ZMM(8)) VMOVAPS(ZMM(19), ZMM(8)) VBROADCASTSS(ZMM(5), VAR(rs_c)) - VMOVAPS(ZMM(20), ZMM(8)) + VMOVAPS(ZMM(20), ZMM(8)) VMOVAPS(ZMM(21), ZMM(8)) VPMULLD(ZMM(2), ZMM(4), ZMM(5)) VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(YMM(3), MEM(RDI,64)) VMOVAPS(ZMM(23), ZMM(8)) VPMULLD(YMM(3), YMM(3), YMM(5)) #else - VMOVAPS(ZMM(17), ZMM(8)) + VMOVAPS(ZMM(17), ZMM(8)) VMOVAPS(ZMM(18), ZMM(8)) LEA(R13, MEM(R12,R12,2)) VMOVAPS(ZMM(19), ZMM(8)) LEA(R14, MEM(R12,R12,4)) VMOVAPS(ZMM(20), ZMM(8)) LEA(R15, MEM(R13,R12,4)) VMOVAPS(ZMM(21), ZMM(8)) VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(ZMM(23), ZMM(8)) -#endif +#endif VMOVAPS(ZMM(24), ZMM(8)) VPSLLD(ZMM(4), ZMM(4), IMM(3)) VMOVAPS(ZMM(25), ZMM(8)) MOV(R8, IMM(4*24*8)) //offset for 4 iterations VMOVAPS(ZMM(26), ZMM(8)) LEA(R9, MEM(R8,R8,2)) //*3 @@ -670,7 +674,7 @@ void bli_dgemm_opt_24x8( [both] "=m" (both) #endif : // input operands - [k] "m" (k64), + [k] "m" (k), [a] "m" (a), [b] "m" (b), [alpha] "m" (alpha), diff --git a/kernels/x86_64/knl/3/bli_sgemm_opt_30x16_knc.c b/kernels/x86_64/knl/3/bli_sgemm_opt_30x16_knc.c index acc7b341d9..889fd8d199 100644 --- a/kernels/x86_64/knl/3/bli_sgemm_opt_30x16_knc.c +++ b/kernels/x86_64/knl/3/bli_sgemm_opt_30x16_knc.c @@ -1,6 +1,6 @@ /* - BLIS + BLIS An object-based framework for developing high-performance BLAS-like libraries. @@ -166,20 +166,26 @@ int32_t offsets[32] __attribute__((aligned(0x1000))) = { 0, 1, 2, 3, 4, 5, //#define MONITORS //#define LOOPMON void bli_sgemm_opt_30x16_knc( - dim_t k, + dim_t k_, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, - float* restrict c, inc_t rs_c, inc_t cs_c, + float* restrict c, inc_t rs_c_, inc_t cs_c_, auxinfo_t* data, cntx_t* restrict cntx ) { + (void)data; + (void)cntx; + const float * a_next = bli_auxinfo_next_a( data ); const float * b_next = bli_auxinfo_next_b( data ); const int32_t * offsetPtr = &offsets[0]; + const int64_t k = k_; + const int64_t rs_c = rs_c_; + const int64_t cs_c = cs_c_; #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h;