From 547d8b13b4b3a4beded9cf183db4be47252f7c94 Mon Sep 17 00:00:00 2001
From: Reese Baird <reese.baird@intel.com>
Date: Mon, 28 Aug 2017 09:50:07 -0700
Subject: [PATCH] openblas: add upstream fixes from OpenBLAS pull requests
 https://github.com/xianyi/OpenBLAS/pull/1262,
 https://github.com/xianyi/OpenBLAS/pull/1236 and
 https://github.com/xianyi/OpenBLAS/pull/1247

---
 .../serial-libs/openblas/SOURCES/1236.patch   | 221 ++++++++++++++++++
 .../serial-libs/openblas/SOURCES/1247.patch   | 144 ++++++++++++
 .../serial-libs/openblas/SOURCES/1262.patch   | 197 ++++++++++++++++
 .../serial-libs/openblas/SPECS/openblas.spec  |   9 +
 4 files changed, 571 insertions(+)
 create mode 100644 components/serial-libs/openblas/SOURCES/1236.patch
 create mode 100644 components/serial-libs/openblas/SOURCES/1247.patch
 create mode 100644 components/serial-libs/openblas/SOURCES/1262.patch

diff --git a/components/serial-libs/openblas/SOURCES/1236.patch b/components/serial-libs/openblas/SOURCES/1236.patch
new file mode 100644
index 0000000000..f605429031
--- /dev/null
+++ b/components/serial-libs/openblas/SOURCES/1236.patch
@@ -0,0 +1,221 @@
+From 6497aae57c77253b2d717b01f5ec17e137954395 Mon Sep 17 00:00:00 2001
+From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
+Date: Wed, 12 Jul 2017 20:43:09 +0200
+Subject: [PATCH] Use cpuid 4 with subleafs to query L1 cache size on Intel
+ processors
+
+---
+ cpuid_x86.c | 117 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------
+ 1 file changed, 102 insertions(+), 15 deletions(-)
+
+diff --git a/cpuid_x86.c b/cpuid_x86.c
+index ab2ecdcaf..73b4df6b3 100644
+--- a/cpuid_x86.c
++++ b/cpuid_x86.c
+@@ -71,12 +71,23 @@ void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx)
+   *edx = cpuInfo[3];
+ }
+ 
++void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, int *edx)
++{
++  int cpuInfo[4] = {-1};
++  __cpuidex(cpuInfo, op, count);
++  *eax = cpuInfo[0];
++  *ebx = cpuInfo[1];
++  *ecx = cpuInfo[2];
++  *edx = cpuInfo[3];
++}
++
+ #else
+ 
+ #ifndef CPUIDEMU
+ 
+ #if defined(__APPLE__) && defined(__i386__)
+ void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx);
++void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, int *edx);
+ #else
+ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
+ #if defined(__i386__) && defined(__PIC__)
+@@ -90,6 +101,19 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
+     ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc");
+ #endif
+ }
++
++static C_INLINE void cpuid_count(int op, int count ,int *eax, int *ebx, int *ecx, int *edx){
++#if defined(__i386__) && defined(__PIC__)
++  __asm__ __volatile__
++    ("mov %%ebx, %%edi;"
++     "cpuid;"
++     "xchgl %%ebx, %%edi;"
++     : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "2" (count) : "cc");
++#else
++  __asm__ __volatile__
++    ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "2" (count) : "cc");
++#endif
++}
+ #endif
+ 
+ #else
+@@ -312,9 +336,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
+   cpuid(0, &cpuid_level, &ebx, &ecx, &edx);
+ 
+   if (cpuid_level > 1) {
+-
++    int numcalls =0 ;
+     cpuid(2, &eax, &ebx, &ecx, &edx);
+-
++    numcalls = BITMASK(eax, 0, 0xff); //FIXME some systems may require repeated calls to read all entries
+     info[ 0] = BITMASK(eax,  8, 0xff);
+     info[ 1] = BITMASK(eax, 16, 0xff);
+     info[ 2] = BITMASK(eax, 24, 0xff);
+@@ -335,7 +359,6 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
+     info[14] = BITMASK(edx, 24, 0xff);
+ 
+     for (i = 0; i < 15; i++){
+-
+       switch (info[i]){
+ 
+    /* This table is from http://www.sandpile.org/ia32/cpuid.htm */
+@@ -637,12 +660,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
+    LD1.linesize    = 64;
+    break;
+       case 0x63 :
+-  DTB.size        = 2048;
+-  DTB.associative = 4;
+-  DTB.linesize    = 32;
+-  LDTB.size       = 4096;
+-  LDTB.associative= 4;
+-  LDTB.linesize   = 32;
++   DTB.size        = 2048;
++   DTB.associative = 4;
++   DTB.linesize    = 32;
++   LDTB.size       = 4096;
++   LDTB.associative= 4;
++   LDTB.linesize   = 32;
++   break;
+       case 0x66 :
+    LD1.size        = 8;
+    LD1.associative = 4;
+@@ -675,12 +699,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
+    LC1.associative = 8;
+    break;
+       case 0x76 :
+-  ITB.size        = 2048;
+-  ITB.associative = 0;
+-  ITB.linesize    = 8;
+-  LITB.size       = 4096;
+-  LITB.associative= 0;
+-  LITB.linesize   = 8;
++   ITB.size        = 2048;
++   ITB.associative = 0;
++   ITB.linesize    = 8;
++   LITB.size       = 4096;
++   LITB.associative= 0;
++   LITB.linesize   = 8;
++   break;
+       case 0x77 :
+    LC1.size        = 16;
+    LC1.associative = 4;
+@@ -891,6 +916,68 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
+   }
+ 
+   if (get_vendor() == VENDOR_INTEL) {
++      if(LD1.size<=0 || LC1.size<=0){
++   //If we didn't detect L1 correctly before,
++   int count;
++   for (count=0;count <4;count++) {
++   cpuid_count(4, count, &eax, &ebx, &ecx, &edx);
++        switch (eax &0x1f) {
++        case 0:
++          continue;
++          case 1:
++          case 3:
++          {
++            switch ((eax >>5) &0x07)
++            {
++            case 1:
++            {
++//            fprintf(stderr,"L1 data cache...\n");
++            int sets = ecx+1;
++            int lines = (ebx & 0x0fff) +1;
++            ebx>>=12;
++            int part = (ebx&0x03ff)+1;
++            ebx >>=10;
++            int assoc = (ebx&0x03ff)+1;
++            LD1.size = (assoc*part*lines*sets)/1024;
++            LD1.associative = assoc;
++            LD1.linesize= lines;
++            break;
++            }
++            default: 
++              break;
++           }
++          break;
++          }
++         case 2:
++          {
++            switch ((eax >>5) &0x07)
++            {
++            case 1:
++            {
++//            fprintf(stderr,"L1 instruction cache...\n");
++            int sets = ecx+1;
++            int lines = (ebx & 0x0fff) +1;
++            ebx>>=12;
++            int part = (ebx&0x03ff)+1;
++            ebx >>=10;
++            int assoc = (ebx&0x03ff)+1;
++            LC1.size = (assoc*part*lines*sets)/1024;
++            LC1.associative = assoc;
++            LC1.linesize= lines;
++            break;
++            }
++            default: 
++              break;
++           }
++          break;
++          
++          }
++          default:
++          break;
++        }
++      }
++    }
++
+     cpuid(0x80000000, &cpuid_level, &ebx, &ecx, &edx);
+     if (cpuid_level >= 0x80000006) {
+       if(L2.size<=0){
+
+From 00774b1105ad5dbfe0e6be671096d51ad4a97b2e Mon Sep 17 00:00:00 2001
+From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
+Date: Wed, 12 Jul 2017 21:56:23 +0200
+Subject: [PATCH] Add dummy implementation of cpuid_count for the CPUIDEMU case
+
+---
+ cpuid_x86.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/cpuid_x86.c b/cpuid_x86.c
+index 73b4df6b3..103128a33 100644
+--- a/cpuid_x86.c
++++ b/cpuid_x86.c
+@@ -157,6 +157,10 @@ void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *
+   *edx = idlist[current].d;
+ }
+ 
++void cpuid_count (unsigned int op, unsigned int count, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) {
++  return cpuid (op, eax, ebx, ecx, edx);
++}
++
+ #endif
+ 
+ #endif // _MSC_VER
+@@ -977,7 +981,6 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
+         }
+       }
+     }
+-
+     cpuid(0x80000000, &cpuid_level, &ebx, &ecx, &edx);
+     if (cpuid_level >= 0x80000006) {
+       if(L2.size<=0){
+
diff --git a/components/serial-libs/openblas/SOURCES/1247.patch b/components/serial-libs/openblas/SOURCES/1247.patch
new file mode 100644
index 0000000000..c44c4326e2
--- /dev/null
+++ b/components/serial-libs/openblas/SOURCES/1247.patch
@@ -0,0 +1,144 @@
+From 88a35ff457f55e527e0e8a503a0dc61976c1846d Mon Sep 17 00:00:00 2001
+From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
+Date: Tue, 25 Jul 2017 08:39:35 +0200
+Subject: [PATCH] Revert #1246, "honor cgroup/cpuset limits" for now
+
+Unsafe usage of the __GLIBC_PREREQ macro lead to build breakage on non-glibc systems
+---
+ driver/others/init.c   | 49 +++++--------------------------------------------
+ driver/others/memory.c | 37 -------------------------------------
+ 2 files changed, 5 insertions(+), 81 deletions(-)
+
+diff --git a/driver/others/init.c b/driver/others/init.c
+index 4c75d72e4..3e6176967 100644
+--- a/driver/others/init.c
++++ b/driver/others/init.c
+@@ -778,11 +778,11 @@ static int initialized = 0;
+ void gotoblas_affinity_init(void) {
+ 
+   int cpu, num_avail;
+-#ifndef USE_OPENMP	
++#ifndef USE_OPENMP
+   cpu_set_t cpu_mask;
+ #endif
+   int i;
+-	
++
+   if (initialized) return;
+ 
+   initialized = 1;
+@@ -826,54 +826,15 @@ void gotoblas_affinity_init(void) {
+   common -> shmid = pshmid;
+ 
+   if (common -> magic != SH_MAGIC) {
+-    cpu_set_t *cpusetp;
+-    int nums;
+-    int ret;
+-
+ #ifdef DEBUG
+     fprintf(stderr, "Shared Memory Initialization.\n");
+ #endif
+ 
+     //returns the number of processors which are currently online
+-
+-    nums = sysconf(_SC_NPROCESSORS_CONF);
+-     
+-#if !defined(__GLIBC_PREREQ) || !__GLIBC_PREREQ(2, 3)
+-    common->num_procs = nums;
+-#elif __GLIBC_PREREQ(2, 7)
+-    cpusetp = CPU_ALLOC(nums);
+-    if (cpusetp == NULL) {
+-        common->num_procs = nums;
+-    } else {
+-        size_t size;
+-        size = CPU_ALLOC_SIZE(nums);
+-        ret = sched_getaffinity(0,size,cpusetp);
+-        if (ret!=0) 
+-            common->num_procs = nums;
+-        else
+-            common->num_procs = CPU_COUNT_S(size,cpusetp);
+-    }
+-    CPU_FREE(cpusetp);
+-#else
+-    ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
+-    if (ret!=0) {
+-        common->num_procs = nums;
+-    } else {
+-#if !__GLIBC_PREREQ(2, 6)  
+-    int i;
+-    int n = 0;
+-    for (i=0;i<nums;i++)
+-        if (CPU_ISSET(i,cpusetp)) n++;
+-    common->num_procs = n;
+-    }
+-#else
+-    common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
+-#endif
+-
+-#endif 
++    common -> num_procs = sysconf(_SC_NPROCESSORS_CONF);;
+ 
+     if(common -> num_procs > MAX_CPUS) {
+-      fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
++      fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
+       exit(1);
+     }
+ 
+@@ -886,7 +847,7 @@ void gotoblas_affinity_init(void) {
+     if (common -> num_nodes > 1) numa_mapping();
+ 
+     common -> final_num_procs = 0;
+-    for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1;   //Make the max cpu number.
++    for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1;   //Make the max cpu number. 
+ 
+     for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] =  0;
+ 
+diff --git a/driver/others/memory.c b/driver/others/memory.c
+index 38d063715..916950315 100644
+--- a/driver/others/memory.c
++++ b/driver/others/memory.c
+@@ -175,44 +175,7 @@ int get_num_procs(void);
+ #else
+ int get_num_procs(void) {
+   static int nums = 0;
+-cpu_set_t *cpusetp;
+-size_t size;
+-int ret;
+-int i,n;
+-
+   if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
+-#if !defined(OS_LINUX)
+-     return nums;
+-#endif
+-     
+-#if !defined(__GLIBC_PREREQ)
+-   return nums;
+-#endif   
+-#if !__GLIBC_PREREQ(2, 3)
+-   return nums;
+-#endif   
+-
+-#if !__GLIBC_PREREQ(2, 7)
+-  ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
+-  if (ret!=0) return nums;
+-  n=0;
+-#if !__GLIBC_PREREQ(2, 6)  
+-  for (i=0;i<nums;i++)
+-     if (CPU_ISSET(i,cpusetp)) n++;
+-  nums=n;   
+-#else
+-  nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
+-#endif
+-  return nums;
+-#endif
+-
+-  cpusetp = CPU_ALLOC(nums);
+-  if (cpusetp == NULL) return nums;
+-  size = CPU_ALLOC_SIZE(nums);
+-  ret = sched_getaffinity(0,size,cpusetp);
+-  if (ret!=0) return nums;
+-  nums = CPU_COUNT_S(size,cpusetp);
+-  CPU_FREE(cpusetp);
+   return nums;
+ }
+ #endif
diff --git a/components/serial-libs/openblas/SOURCES/1262.patch b/components/serial-libs/openblas/SOURCES/1262.patch
new file mode 100644
index 0000000000..aa6c90d2b6
--- /dev/null
+++ b/components/serial-libs/openblas/SOURCES/1262.patch
@@ -0,0 +1,197 @@
+From c4e5ba1bfe8c7c4e263d5c14f4034e657347b591 Mon Sep 17 00:00:00 2001
+From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
+Date: Wed, 2 Aug 2017 00:37:58 +0200
+Subject: [PATCH 1/2] Make sure that range_n of last thread never exceeds the
+ actual data size when splitting the workload
+
+---
+ driver/level2/gbmv_thread.c | 2 ++
+ driver/level2/sbmv_thread.c | 3 +++
+ driver/level2/spmv_thread.c | 2 ++
+ driver/level2/symv_thread.c | 4 +++-
+ driver/level2/tbmv_thread.c | 3 +++
+ driver/level2/tpmv_thread.c | 4 +++-
+ driver/level2/trmv_thread.c | 4 +++-
+ 7 files changed, 19 insertions(+), 3 deletions(-)
+
+diff --git a/driver/level2/gbmv_thread.c b/driver/level2/gbmv_thread.c
+index e86b565f8..9d374676e 100644
+--- a/driver/level2/gbmv_thread.c
++++ b/driver/level2/gbmv_thread.c
+@@ -230,8 +230,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
+ 
+ #ifndef TRANSA
+     range_m[num_cpu] = num_cpu * ((m + 15) & ~15);
++    if (range_m[num_cpu] > m) range_m[num_cpu] = m;
+ #else
+     range_m[num_cpu] = num_cpu * ((n + 15) & ~15);
++    if (range_m[num_cpu] > n) range_m[num_cpu] = n;
+ #endif
+ 
+     queue[num_cpu].mode    = mode;
+diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c
+index 5718c0ec9..ce841ee0e 100644
+--- a/driver/level2/sbmv_thread.c
++++ b/driver/level2/sbmv_thread.c
+@@ -246,6 +246,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
+ 
+       range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
+       range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
++      if (range_n[num_cpu] > n) range_n[num_cpu] = n;
+ 
+       queue[num_cpu].mode    = mode;
+       queue[num_cpu].routine = sbmv_kernel;
+@@ -285,6 +286,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
+ 
+       range_m[num_cpu + 1] = range_m[num_cpu] + width;
+       range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
++      if (range_n[num_cpu] > n) range_n[num_cpu] = n;
+ 
+       queue[num_cpu].mode    = mode;
+       queue[num_cpu].routine = sbmv_kernel;
+@@ -316,6 +318,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
+       range_m[num_cpu + 1] = range_m[num_cpu] + width;
+ 
+       range_n[num_cpu] = num_cpu * ((n + 15) & ~15);
++      if (range_n[num_cpu] > n) range_n[num_cpu] = n;
+ 
+       queue[num_cpu].mode    = mode;
+       queue[num_cpu].routine = sbmv_kernel;
+diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c
+index 035300841..0b4087430 100644
+--- a/driver/level2/spmv_thread.c
++++ b/driver/level2/spmv_thread.c
+@@ -246,6 +246,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
+ 
+     range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
+     range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
++    if (range_n[num_cpu] > m) range_n[num_cpu] = m;
+ 
+     queue[num_cpu].mode    = mode;
+     queue[num_cpu].routine = spmv_kernel;
+@@ -285,6 +286,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
+ 
+     range_m[num_cpu + 1] = range_m[num_cpu] + width;
+     range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
++    if (range_n[num_cpu] > m) range_n[num_cpu] = m;
+ 
+     queue[num_cpu].mode    = mode;
+     queue[num_cpu].routine = spmv_kernel;
+diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c
+index 6580178f1..8d4cd249c 100644
+--- a/driver/level2/symv_thread.c
++++ b/driver/level2/symv_thread.c
+@@ -177,7 +177,8 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
+ 
+     range_m[num_cpu + 1] = range_m[num_cpu] + width;
+     range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
+-
++    if (range_n[num_cpu] > m) range_n[num_cpu] = m;
++    
+     queue[MAX_CPU_NUMBER - num_cpu - 1].mode    = mode;
+     queue[MAX_CPU_NUMBER - num_cpu - 1].routine = symv_kernel;
+     queue[MAX_CPU_NUMBER - num_cpu - 1].args    = &args;
+@@ -225,6 +226,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
+ 
+     range_m[num_cpu + 1] = range_m[num_cpu] + width;
+     range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
++    if (range_n[num_cpu] > m) range_n[num_cpu] = m;
+ 
+     queue[num_cpu].mode    = mode;
+     queue[num_cpu].routine = symv_kernel;
+diff --git a/driver/level2/tbmv_thread.c b/driver/level2/tbmv_thread.c
+index 226a922e9..aaf4958e2 100644
+--- a/driver/level2/tbmv_thread.c
++++ b/driver/level2/tbmv_thread.c
+@@ -288,6 +288,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
+ 
+       range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
+       range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
++      if (range_n[num_cpu] > n) range_n[num_cpu] = n;
+ 
+       queue[num_cpu].mode    = mode;
+       queue[num_cpu].routine = trmv_kernel;
+@@ -327,6 +328,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
+ 
+       range_m[num_cpu + 1] = range_m[num_cpu] + width;
+       range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
++      if (range_n[num_cpu] > n) range_n[num_cpu] = n;
+ 
+       queue[num_cpu].mode    = mode;
+       queue[num_cpu].routine = trmv_kernel;
+@@ -356,6 +358,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
+ 
+       range_m[num_cpu + 1] = range_m[num_cpu] + width;
+       range_n[num_cpu] = num_cpu * (((n + 15) & ~15) + 16);
++      if (range_n[num_cpu] > n) range_n[num_cpu] = n;
+ 
+       queue[num_cpu].mode    = mode;
+       queue[num_cpu].routine = trmv_kernel;
+diff --git a/driver/level2/tpmv_thread.c b/driver/level2/tpmv_thread.c
+index c91b52775..79438ba29 100644
+--- a/driver/level2/tpmv_thread.c
++++ b/driver/level2/tpmv_thread.c
+@@ -307,7 +307,8 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
+ 
+     range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
+     range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
+-
++    if (range_n[num_cpu] > m) range_n[num_cpu] = m;
++    
+     queue[num_cpu].mode    = mode;
+     queue[num_cpu].routine = tpmv_kernel;
+     queue[num_cpu].args    = &args;
+@@ -346,6 +347,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
+ 
+     range_m[num_cpu + 1] = range_m[num_cpu] + width;
+     range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
++    if (range_n[num_cpu] > m) range_n[num_cpu] = m;
+ 
+     queue[num_cpu].mode    = mode;
+     queue[num_cpu].routine = tpmv_kernel;
+diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c
+index 0a155366c..8b931a0e8 100644
+--- a/driver/level2/trmv_thread.c
++++ b/driver/level2/trmv_thread.c
+@@ -346,7 +346,8 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
+ 
+     range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
+     range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
+-
++    if (range_n[num_cpu] > m) range_n[num_cpu] = m;
++    
+     queue[num_cpu].mode    = mode;
+     queue[num_cpu].routine = trmv_kernel;
+     queue[num_cpu].args    = &args;
+@@ -385,6 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
+ 
+     range_m[num_cpu + 1] = range_m[num_cpu] + width;
+     range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
++    if (range_n[num_cpu] > m) range_n[num_cpu] = m;
+ 
+     queue[num_cpu].mode    = mode;
+     queue[num_cpu].routine = trmv_kernel;
+
+From 0ba64cee60c90f2533b918bc026283f5d5288a89 Mon Sep 17 00:00:00 2001
+From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
+Date: Wed, 2 Aug 2017 12:03:54 +0200
+Subject: [PATCH 2/2] Update trmv_thread.c
+
+---
+ driver/level2/trmv_thread.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c
+index 8b931a0e8..24b881a93 100644
+--- a/driver/level2/trmv_thread.c
++++ b/driver/level2/trmv_thread.c
+@@ -347,7 +347,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
+     range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
+     range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
+     if (range_n[num_cpu] > m) range_n[num_cpu] = m;
+-    
++
+     queue[num_cpu].mode    = mode;
+     queue[num_cpu].routine = trmv_kernel;
+     queue[num_cpu].args    = &args;
+
diff --git a/components/serial-libs/openblas/SPECS/openblas.spec b/components/serial-libs/openblas/SPECS/openblas.spec
index d771566ac3..1ecfc4d9e7 100644
--- a/components/serial-libs/openblas/SPECS/openblas.spec
+++ b/components/serial-libs/openblas/SPECS/openblas.spec
@@ -52,6 +52,12 @@ Patch2:         openblas-noexecstack.patch
 Patch3:         openblas-gemv.patch
 # PATCH-FIX-UPSTREADM fix-arm64-cpuid-return.patch
 Patch4:         fix-arm64-cpuid-return.patch
+# Upstream fix: keep range_n of the last thread within the actual data size (https://github.com/xianyi/OpenBLAS/pull/1262)
+Patch5:         1262.patch
+# Upstream fix: use cpuid 4 with subleafs to query the L1 cache size on Intel processors (https://github.com/xianyi/OpenBLAS/pull/1236)
+Patch6:         1236.patch
+# Upstream fix: revert "honor cgroup/cpuset limits" (#1246), which broke builds on non-glibc systems (https://github.com/xianyi/OpenBLAS/pull/1247)
+Patch7:         1247.patch
 ExclusiveArch:  %ix86 ia64 ppc ppc64 x86_64 aarch64
 
 %description
@@ -69,6 +75,9 @@ OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
 # karl.w.schulz@intel.com (9/19/16) - disabling patch3 for v0.2.19
 #%patch3 -p1
 %patch4 -p1
+%patch5 -p1
+%patch6 -p1
+%patch7 -p1
 
 %build
 # OpenHPC compiler/mpi designation