Skip to content

Commit

Permalink
Cast dim_t and inc_t parameters to 64-bit in KNL microkernels.
Browse files (browse the repository at this point in the history)
  • Loading branch information
devinamatthews committed Feb 20, 2017
1 parent c362afc commit 7d42fc0
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 26 deletions.
26 changes: 19 additions & 7 deletions kernels/x86_64/knl/1m/bli_packm_opt_24x8.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
BLIS
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Expand Down Expand Up @@ -105,16 +105,22 @@ extern int32_t offsets[24];
void bli_dpackm_8xk_opt
(
conj_t conja,
dim_t n,
dim_t n_,
void* restrict kappa_,
void* restrict a_, inc_t inca, inc_t lda,
void* restrict p_, inc_t ldp
void* restrict a_, inc_t inca_, inc_t lda_,
void* restrict p_, inc_t ldp_
)
{
(void)conja;

const int32_t * offsetPtr = &offsets[0];
double* a = (double*)a_;
double* p = (double*)p_;
double* kappa = (double*)kappa_;
const int64_t n = n_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;

__asm__ volatile
(
Expand Down Expand Up @@ -291,16 +297,22 @@ void bli_dpackm_8xk_opt
void bli_dpackm_24xk_opt
(
conj_t conja,
dim_t n,
dim_t n_,
void* restrict kappa_,
void* restrict a_, inc_t inca, inc_t lda,
void* restrict p_, inc_t ldp
void* restrict a_, inc_t inca_, inc_t lda_,
void* restrict p_, inc_t ldp_
)
{
(void)conja;

const int32_t * offsetPtr = &offsets[0];
double* a = (double*)a_;
double* p = (double*)p_;
double* kappa = (double*)kappa_;
const int64_t n = n_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;

__asm__ volatile
(
Expand Down
14 changes: 10 additions & 4 deletions kernels/x86_64/knl/1m/bli_packm_opt_30x8.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
BLIS
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Expand Down Expand Up @@ -133,16 +133,22 @@ extern int32_t offsets[32];
void bli_dpackm_30xk_opt
(
conj_t conja,
dim_t n,
dim_t n_,
void* restrict kappa_,
void* restrict a_, inc_t inca, inc_t lda,
void* restrict p_, inc_t ldp
void* restrict a_, inc_t inca_, inc_t lda_,
void* restrict p_, inc_t ldp_
)
{
(void)conja;

const int32_t * offsetPtr = &offsets[0];
double* a = (double*)a_;
double* p = (double*)p_;
double* kappa = (double*)kappa_;
const int64_t n = n_;
const int64_t inca = inca_;
const int64_t lda = lda_;
const int64_t ldp = ldp_;

__asm__ volatile
(
Expand Down
28 changes: 16 additions & 12 deletions kernels/x86_64/knl/3/bli_dgemm_opt_24x8.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
BLIS
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Expand Down Expand Up @@ -181,30 +181,34 @@ extern int32_t offsets[24];
//#define MONITORS
//#define LOOPMON
void bli_dgemm_opt_24x8(
dim_t k,
dim_t k_,
double* restrict alpha,
double* restrict a,
double* restrict b,
double* restrict beta,
double* restrict c, inc_t rs_c, inc_t cs_c,
double* restrict c, inc_t rs_c_, inc_t cs_c_,
auxinfo_t* data,
cntx_t* restrict cntx
)
{
(void)data;
(void)cntx;

const double * a_next = bli_auxinfo_next_a( data );
const double * b_next = bli_auxinfo_next_b( data );

const int32_t * offsetPtr = &offsets[0];

uint64_t k64 = k;
const int64_t k = k_;
const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_;

#ifdef MONITORS
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
#endif
#ifdef LOOPMON
int tlooph, tloopl, blooph, bloopl;
#endif

__asm__ volatile
(
#ifdef MONITORS
Expand All @@ -223,22 +227,22 @@ void bli_dgemm_opt_24x8(
VMOVAPS(ZMM(15), ZMM(8)) MOV(RDI, VAR(offsetPtr))
VMOVAPS(ZMM(16), ZMM(8)) VMOVAPS(ZMM(4), MEM(RDI))
#if SCATTER_PREFETCH_C
VMOVAPS(ZMM(17), ZMM(8))
VMOVAPS(ZMM(18), ZMM(8))
VMOVAPS(ZMM(17), ZMM(8))
VMOVAPS(ZMM(18), ZMM(8))
VMOVAPS(ZMM(19), ZMM(8)) VBROADCASTSS(ZMM(5), VAR(rs_c))
VMOVAPS(ZMM(20), ZMM(8))
VMOVAPS(ZMM(20), ZMM(8))
VMOVAPS(ZMM(21), ZMM(8)) VPMULLD(ZMM(2), ZMM(4), ZMM(5))
VMOVAPS(ZMM(22), ZMM(8)) VMOVAPS(YMM(3), MEM(RDI,64))
VMOVAPS(ZMM(23), ZMM(8)) VPMULLD(YMM(3), YMM(3), YMM(5))
#else
VMOVAPS(ZMM(17), ZMM(8))
VMOVAPS(ZMM(17), ZMM(8))
VMOVAPS(ZMM(18), ZMM(8)) LEA(R13, MEM(R12,R12,2))
VMOVAPS(ZMM(19), ZMM(8)) LEA(R14, MEM(R12,R12,4))
VMOVAPS(ZMM(20), ZMM(8)) LEA(R15, MEM(R13,R12,4))
VMOVAPS(ZMM(21), ZMM(8))
VMOVAPS(ZMM(22), ZMM(8))
VMOVAPS(ZMM(23), ZMM(8))
#endif
#endif
VMOVAPS(ZMM(24), ZMM(8)) VPSLLD(ZMM(4), ZMM(4), IMM(3))
VMOVAPS(ZMM(25), ZMM(8)) MOV(R8, IMM(4*24*8)) //offset for 4 iterations
VMOVAPS(ZMM(26), ZMM(8)) LEA(R9, MEM(R8,R8,2)) //*3
Expand Down Expand Up @@ -670,7 +674,7 @@ void bli_dgemm_opt_24x8(
[both] "=m" (both)
#endif
: // input operands
[k] "m" (k64),
[k] "m" (k),
[a] "m" (a),
[b] "m" (b),
[alpha] "m" (alpha),
Expand Down
12 changes: 9 additions & 3 deletions kernels/x86_64/knl/3/bli_sgemm_opt_30x16_knc.c
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
BLIS
BLIS
An object-based framework for developing high-performance BLAS-like
libraries.
Expand Down Expand Up @@ -166,20 +166,26 @@ int32_t offsets[32] __attribute__((aligned(0x1000))) = { 0, 1, 2, 3, 4, 5,
//#define MONITORS
//#define LOOPMON
void bli_sgemm_opt_30x16_knc(
dim_t k,
dim_t k_,
float* restrict alpha,
float* restrict a,
float* restrict b,
float* restrict beta,
float* restrict c, inc_t rs_c, inc_t cs_c,
float* restrict c, inc_t rs_c_, inc_t cs_c_,
auxinfo_t* data,
cntx_t* restrict cntx
)
{
(void)data;
(void)cntx;

const float * a_next = bli_auxinfo_next_a( data );
const float * b_next = bli_auxinfo_next_b( data );

const int32_t * offsetPtr = &offsets[0];
const int64_t k = k_;
const int64_t rs_c = rs_c_;
const int64_t cs_c = cs_c_;

#ifdef MONITORS
int toph, topl, both, botl, midl, midh, mid2l, mid2h;
Expand Down

0 comments on commit 7d42fc0

Please sign in to comment.