From 8ac87c1cb63fd8518c7a99d6b06fb47524f2153b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 16 Oct 2017 23:27:51 +0200 Subject: [PATCH 1/4] Implement DSDOT with unchanged sdot microkernels --- kernel/x86_64/sdot.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 389252f8b5..f786d18953 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -68,7 +68,11 @@ static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) #endif +#if defined (DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif { BLASLONG i=0; BLASLONG ix=0,iy=0; @@ -91,12 +95,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) i = n1; while(i < n) { - +#if defined(DSDOT) + dot += (double)y[i] * (double)x[i] ; +#else dot += y[i] * x[i] ; +#endif i++ ; } +#if defined(DSDOT) + dot+=(double)mydot; +#else dot+=mydot; +#endif return(dot); @@ -106,8 +117,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) while(i < n1) { - +#if defined (DSDOT) + dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x]; +#else dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; +#endif ix += inc_x*2 ; iy += inc_y*2 ; i+=2 ; @@ -116,8 +130,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) while(i < n) { - +#if defined (DSDOT) + dot += (double)y[iy] * (double)x[ix] ; +#else dot += y[iy] * x[ix] ; +#endif ix += inc_x ; iy += inc_y ; i++ ; From 28c3fa8950045d658e2c9b604a061927ffb9dc61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 16 Oct 2017 23:29:03 +0200 Subject: [PATCH 2/4] Add dsdot --- kernel/x86_64/KERNEL.HASWELL | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index f2e1374d32..848de38dfc 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -24,6 +24,8 @@ DDOTKERNEL = ddot.c CDOTKERNEL = cdot.c ZDOTKERNEL = zdot.c +DSDOTKERNEL = sdot.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c From 5e3e91d0fc5562782ddac9c01d6765cb24f171a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Oct 2017 18:18:51 +0200 Subject: [PATCH 3/4] Split the microkernel workload into chunks of 32 floats for dsdot mode to limit loss of precision --- kernel/x86_64/sdot.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index f786d18953..b6f3c21afe 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -78,7 +78,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) BLASLONG ix=0,iy=0; double dot = 0.0 ; +#if defined (DSDOT) + double mydot = 0.0; + FLOAT asmdot = 0.0; +#else FLOAT mydot=0.0; +#endif BLASLONG n1; if ( n <= 0 ) return(dot); @@ -89,9 +94,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) n1 = n & (BLASLONG)(-32); if ( n1 ) +#if defined(DSDOT) + { + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG n2 = 32; + while (i Date: Wed, 25 Oct 2017 16:45:41 +0200 Subject: [PATCH 4/4] Eliminate loop code when called as/from dsdot --- kernel/x86_64/sdot_microk_haswell-2.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c index 4051f9c1ba..3248c408cf 100644 --- a/kernel/x86_64/sdot_microk_haswell-2.c +++ b/kernel/x86_64/sdot_microk_haswell-2.c @@ -53,9 +53,11 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vfmadd231ps 64(%3,%0,4), %%ymm14, %%ymm6 \n\t" // 2 * y "vfmadd231ps 96(%3,%0,4), %%ymm15, %%ymm7 \n\t" // 2 * y +#ifndef DSDOT "addq $32 , %0 \n\t" "subq $32 , %1 \n\t" "jnz 1b \n\t" +#endif "vextractf128 $1 , %%ymm4 , %%xmm12 \n\t" "vextractf128 $1 , %%ymm5 , %%xmm13 \n\t"