From 1e139c49f63f4c9acfa9aab925231fbd0b72fb8c Mon Sep 17 00:00:00 2001
From: Algy <algy@ubuntu.ubuntu-domain>
Date: Fri, 30 Aug 2019 11:31:14 +0900
Subject: [PATCH] Add num_threads option

---
 arch/arm/neon.h        |  3 +-
 arch/x64/avx2.h        |  3 +-
 cca.cpp                |  7 +++--
 cfast_slic.pxd         |  1 +
 cfast_slic.pyx         |  3 ++
 cielab.h               |  7 +++--
 context.cpp            | 26 ++++++-----------
 context.h              |  2 +-
 fast-slic.cpp          |  3 +-
 fast_slic/base_slic.py |  4 ++-
 lsc.cpp                | 19 +++++++------
 parallel.cpp           | 63 ++++++++++++++++++++++++++++++++++++++++++
 parallel.h             | 16 +++++++++++
 preemptive.h           |  3 +-
 setup.py               |  2 +-
 15 files changed, 122 insertions(+), 40 deletions(-)
 create mode 100644 parallel.cpp
 create mode 100644 parallel.h
diff --git a/arch/arm/neon.h b/arch/arm/neon.h
index 07880c5..6936a3a 100644
--- a/arch/arm/neon.h
+++ b/arch/arm/neon.h
@@ -2,6 +2,7 @@
 #include <cassert>
 #include "../../context.h"
 #include "../../lsc.h"
+#include "../../parallel.h"
 
 inline void get_assignment_value_vec(
         const Cluster* cluster,
@@ -244,7 +245,7 @@ namespace fslic {
         }
 
     	virtual void normalize_features(float * __restrict numers[10], float* __restrict weights, int size) {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < size; i += 4) {
                 float32x4_t reciprocal_w = vrecpeq_f32(vld1q_f32(&weights[i]));
                 vst1q_f32(&numers[0][i], vmulq_f32(vld1q_f32(&numers[0][i]), reciprocal_w));
diff --git a/arch/x64/avx2.h b/arch/x64/avx2.h
index 4ad5e49..74fcbaf 100644
--- a/arch/x64/avx2.h
+++ b/arch/x64/avx2.h
@@ -1,6 +1,7 @@
 #include <immintrin.h>
 #include "../../context.h"
 #include "../../lsc.h"
+#include "../../parallel.h"
 
 
 inline __m256 _mm256_set_ps1(float v) {
@@ -280,7 +281,7 @@ namespace fslic {
         }
 
     	void normalize_features(float * __restrict img_feats[10], float* __restrict weights, int size) {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < size; i += 8) {
                 __m256 reciprocal_w = _mm256_rcp_ps(_mm256_loadu_ps(&weights[i]));
                 _mm256_storeu_ps(&img_feats[0][i], _mm256_mul_ps(_mm256_loadu_ps(&img_feats[0][i]), reciprocal_w));
diff --git a/cca.cpp b/cca.cpp
index a94a487..7493c43 100644
--- a/cca.cpp
+++ b/cca.cpp
@@ -13,6 +13,7 @@
 #include <queue>
 #include <deque>
 #include "timer.h"
+#include "parallel.h"
 
 typedef std::chrono::high_resolution_clock Clock;
 
@@ -33,7 +34,7 @@ namespace cca {
         DisjointSet cc_set(H * W);
 
         std::vector<int> seam_ys;
-        #pragma omp parallel
+        #pragma omp parallel num_threads(fsparallel::nth())
         {
             bool is_first = true;
             int seam = 0;
@@ -104,7 +105,7 @@ namespace cca {
         std::unique_ptr<ComponentSet> result { new ComponentSet(size) };
         std::vector<std::vector<tree_node_t>> rootset;
         std::vector<int> root_offsets;
-        #pragma omp parallel
+        #pragma omp parallel num_threads(fsparallel::nth())
         {
             #pragma omp single
             {
@@ -256,7 +257,7 @@ namespace cca {
 
         {
             fstimer::Scope s("output");
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < H * W; i++) {
                 out[i] = substitute[cc_set->component_assignment[i]];
             }
diff --git a/cfast_slic.pxd b/cfast_slic.pxd
index 0f5dd71..a83ac02 100644
--- a/cfast_slic.pxd
+++ b/cfast_slic.pxd
@@ -113,6 +113,7 @@ cdef class NodeConnectivity:
 cdef class SlicModel:
     cdef Cluster* _c_clusters
     cdef readonly int num_components
+    cdef public int num_threads
     cdef public object initialized
     cdef public object arch_name
     cdef public object real_dist
diff --git a/cfast_slic.pyx b/cfast_slic.pyx
index cc616b9..a2b0cb8 100644
--- a/cfast_slic.pyx
+++ b/cfast_slic.pyx
@@ -27,6 +27,7 @@ cdef class SlicModel:
             raise ValueError("num_components should be a non-negative integer")
 
         self.num_components = num_components
+        self.num_threads = -1
         self.arch_name = arch_name
         self.real_dist = real_dist
         self.real_dist_type = "standard"
@@ -172,6 +173,7 @@ cdef class SlicModel:
                 c_clusters,
             )
             try:
+                context.num_threads = self.num_threads
                 context.compactness = compactness
                 context.min_size_factor = min_size_factor
                 context.subsample_stride_config = subsample_stride
@@ -227,6 +229,7 @@ cdef class SlicModel:
                 raise RuntimeError("No such real_dist_type " + repr(self.real_dist_type))
 
             try:
+                context_real_dist.num_threads = self.num_threads
                 context_real_dist.compactness = compactness
                 context_real_dist.min_size_factor = min_size_factor
                 context_real_dist.subsample_stride_config = subsample_stride
diff --git a/cielab.h b/cielab.h
index 9ab5dd0..7811dac 100644
--- a/cielab.h
+++ b/cielab.h
@@ -4,6 +4,7 @@
 #include <cmath>
 #include <cstdint>
 #include <vector>
+#include "parallel.h"
 /*
 def get_xyz_nonlin_tbl(a):
     v = a / 255.
@@ -335,7 +336,7 @@ static FastCIELabCvt fast_cielab_cvt;
 
 static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool scale_L = false) {
     if (scale_L) {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int s = 0; s < size; s += 4) {
             fast_cielab_cvt.convert<true>(
                 aligned_quad_image[s],
@@ -347,7 +348,7 @@ static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int s
             );
         }
     } else {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int s = 0; s < size; s += 4) {
             fast_cielab_cvt.convert<false>(
                 aligned_quad_image[s],
@@ -364,7 +365,7 @@ static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int s
 
 #if 0
 static void rgb_to_cielab_orig(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool parallel) {
-    #pragma omp parallel for if(parallel)
+    #pragma omp parallel for if(parallel) num_threads(fsparallel::nth())
     for (int s = 0; s < size; s += 4) {
         float r = _srgb_gamma_tbl[aligned_quad_image[s]],
             g = _srgb_gamma_tbl[aligned_quad_image[s+1]],
diff --git a/context.cpp b/context.cpp
index 83a2633..67508b8 100644
--- a/context.cpp
+++ b/context.cpp
@@ -2,17 +2,10 @@
 #include "cca.h"
 #include "cielab.h"
 #include "timer.h"
+#include "parallel.h"
 
 #include <limits>
 
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#ifndef _OPENMP
-#define omp_get_num_threads() 1
-#endif
-
 namespace fslic {
     template<typename DistType>
     BaseContext<DistType>::~BaseContext() {
@@ -105,20 +98,17 @@ namespace fslic {
 
     template<typename DistType>
     bool BaseContext<DistType>::parallelism_supported() {
-#if defined(_OPENMP)
-    return true;
-#else
-    return false;
-#endif
+        return fsparallel::parallelism_supported();
     }
 
     template<typename DistType>
     void BaseContext<DistType>::iterate(uint16_t *assignment, int max_iter) {
         {
+            fsparallel::Scope parallel_scope(num_threads);
             fstimer::Scope s("iterate");
             {
                 fstimer::Scope s("write_to_buffer");
-                #pragma omp parallel
+                #pragma omp parallel num_threads(fsparallel::nth())
                 {
                     #pragma omp for
                     for (int i = 0; i < H; i++) {
@@ -180,7 +170,7 @@ namespace fslic {
             }
             {
                 fstimer::Scope s("write_back");
-                #pragma omp parallel for
+                #pragma omp parallel for num_threads(fsparallel::nth())
                 for (int i = 0; i < H; i++) {
                     for (int j = 0; j < W; j++) {
                         assignment[W * i + j] = this->assignment.get(i, j);
@@ -197,7 +187,7 @@ namespace fslic {
 
     template<typename DistType>
     void BaseContext<DistType>::assign() {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int i = 0; i < H; i++) {
             for (int j = 0; j < W; j++) {
                 min_dists.get(i, j) = std::numeric_limits<DistType>::max();
@@ -226,7 +216,7 @@ namespace fslic {
                     grid_indices.push_back(i * cell_W + j);
                 }
             }
-            #pragma omp parallel
+            #pragma omp parallel num_threads(fsparallel::nth())
             {
                 std::vector<const Cluster*> target_clusters;
                 #pragma omp for
@@ -312,7 +302,7 @@ namespace fslic {
             cluster_updatable[k] = preemptive_grid.is_updatable_cluster(clusters[k]);
         }
 
-        #pragma omp parallel
+        #pragma omp parallel num_threads(fsparallel::nth())
         {
             std::vector<uint32_t> local_acc_vec(K * 5, 0); // sum of [y, x, r, g, b] in cluster
             std::vector<uint32_t> local_num_cluster_members(K, 0);
diff --git a/context.h b/context.h
index bd2227c..f0cdd70 100644
--- a/context.h
+++ b/context.h
@@ -23,7 +23,7 @@ namespace fslic {
     class BaseContext {
     public:
         int16_t subsample_stride_config = 3;
-        int num_threads = 0;
+        int num_threads = -1;
         float compactness = 20;
         float min_size_factor = 0.1;
         bool convert_to_lab = false;
diff --git a/fast-slic.cpp b/fast-slic.cpp
index 5849199..1dd0caa 100644
--- a/fast-slic.cpp
+++ b/fast-slic.cpp
@@ -1,6 +1,7 @@
 #include <utility>
 #include "fast-slic.h"
 #include "context.h"
+#include "parallel.h"
 
 extern "C" {
     static uint32_t symmetric_int_hash(uint32_t x, uint32_t y) {
@@ -93,7 +94,7 @@ extern "C" {
         conn->num_neighbors = new int[K];
         conn->neighbors = new uint32_t*[K];
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int i = 0; i < K; i++) {
             const Cluster* cluster = clusters + i;
             int cell_center_x = cluster->x / S, cell_center_y = cluster->y / S;
diff --git a/fast_slic/base_slic.py b/fast_slic/base_slic.py
index 88e03ed..df0cd13 100644
--- a/fast_slic/base_slic.py
+++ b/fast_slic/base_slic.py
@@ -12,7 +12,8 @@ def __init__(self,
                  convert_to_lab=True,
                  preemptive=False,
                  preemptive_thres=0.05,
-                 manhattan_spatial_dist=True):
+                 manhattan_spatial_dist=True,
+                 num_threads=-1):
         self.compactness = compactness
         self.subsample_stride = subsample_stride
         self.min_size_factor = min_size_factor
@@ -23,6 +24,7 @@ def __init__(self,
         self._slic_model.preemptive = preemptive
         self._slic_model.preemptive_thres = preemptive_thres
         self._slic_model.manhattan_spatial_dist = manhattan_spatial_dist
+        self._slic_model.num_threads = num_threads
 
     @property
     def convert_to_lab(self):
diff --git a/lsc.cpp b/lsc.cpp
index 2c86960..8fa1356 100644
--- a/lsc.cpp
+++ b/lsc.cpp
@@ -3,6 +3,7 @@
 #include <cmath>
 #include "lsc.h"
 #include "cielab.h"
+#include "parallel.h"
 
 //map pixels into ten dimensional feature space
 
@@ -54,7 +55,7 @@ namespace fslic {
         #ifdef FAST_SLIC_TIMER
         auto t1 = Clock::now();
         #endif
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int i = 0; i < H; i++) {
             const uint8_t* image_row = quad_image.get_row(i);
             for (int j = 0; j < W; j++) {
@@ -106,7 +107,7 @@ namespace fslic {
             const uint8_t* __restrict L = &image_planes[0][0];
             const uint8_t* __restrict A = &image_planes[1][0];
             const uint8_t* __restrict B = &image_planes[2][0];
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < len; i++) {
                 image_features[0][i] = L_cosine_map[L[i]];
     			image_features[1][i] = L_sine_map[L[i]];
@@ -117,7 +118,7 @@ namespace fslic {
             }
             // x1, x2, y1, y2
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int y = 0; y < H; y++) {
                 std::copy(
                     width_cosine_map.begin(),
@@ -131,7 +132,7 @@ namespace fslic {
                 );
             }
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int y = 0; y < H; y++) {
                 std::fill_n(&image_features[8][y * W], W, height_cosine_map[y]);
                 std::fill_n(&image_features[9][y * W], W, height_sine_map[y]);
@@ -144,7 +145,7 @@ namespace fslic {
 	    float sum_features[10];
         std::fill_n(sum_features, 10, 0);
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int ix_feat = 0; ix_feat < 10; ix_feat++) {
                 float sum = 0;
                 for (int i = 0; i < len; i++) {
@@ -159,7 +160,7 @@ namespace fslic {
         #endif
 
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < len; i++) {
                 float w = 0;
                 for (int ix_feat = 0; ix_feat < 10; ix_feat++) {
@@ -187,7 +188,7 @@ namespace fslic {
             std::fill_n(feat, K, 0);
         }
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int k = 0; k < K; k++) {
             const Cluster* cluster = &clusters[k];
             int cluster_y = cluster->y, cluster_x = cluster->x;
@@ -257,7 +258,7 @@ namespace fslic {
             wsums[k] = cluster_updatable[k]? 0.0f : 1.0f;
         }
 
-        #pragma omp parallel
+        #pragma omp parallel num_threads(fsparallel::nth())
         {
             float* __restrict local_feats[10];
             float* __restrict local_wsums = new float[K];
@@ -327,7 +328,7 @@ namespace fslic {
 
 
 	void ContextLSC::normalize_features(float *__restrict numers[10], float* __restrict weights, int size) {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int i = 0; i < size; i++) {
             for (int ix_feat = 0; ix_feat < 10; ix_feat++) {
                 numers[ix_feat][i] /= weights[i];
diff --git a/parallel.cpp b/parallel.cpp
new file mode 100644
index 0000000..5c19d43
--- /dev/null
+++ b/parallel.cpp
@@ -0,0 +1,63 @@
+#include <cstdlib>
+#include <string>
+#include <stdexcept>
+#ifdef _OPENMP
+#define PARALLELISM_SUPPORTED true
+#include <omp.h>
+#else
+#define PARALLELISM_SUPPORTED false
+#define omp_get_max_threads() 1
+#endif
+#include "parallel.h"
+
+namespace fsparallel {
+    thread_local int num_threads = -1; // <0: defer to the environment, 0: OpenMP maximum, >0: exact count
+    // Returns true when this file was compiled with OpenMP support (_OPENMP defined), false otherwise.
+    bool parallelism_supported() {
+        return PARALLELISM_SUPPORTED;
+    }
+    // Reads environment variable `name` as an int; returns -1 when it is unset or cannot be converted.
+    static int parse_env(const char *name) {
+        if (const char * env_p = std::getenv(name)) {
+            try {
+                return std::stoi(std::string(env_p));
+            } catch (const std::logic_error &) { // invalid_argument and out_of_range both derive from it
+                return -1;
+            }
+        }
+        return -1;
+    }
+    // Thread count for OpenMP num_threads(): >0 setting wins, 0 means omp_get_max_threads(), <0 asks the environment.
+    int nth() {
+        if (num_threads < 0) {
+            int n;
+            n = parse_env("FSLIC_NUM_THREADS"); // library-specific variable takes precedence
+            if (n > 0) {
+                return n;
+            } else if (n == 0) {
+                return omp_get_max_threads();
+            }
+            // FSLIC_NUM_THREADS is unset, negative or unparsable: fall back to OMP_NUM_THREADS.
+            n = parse_env("OMP_NUM_THREADS");
+            if (n > 0) {
+                return n;
+            } else if (n == 0) {
+                return omp_get_max_threads();
+            }
+            return omp_get_max_threads(); // neither variable gave a usable value
+        } else if (num_threads == 0) {
+            return omp_get_max_threads();
+        } else {
+            return num_threads;
+        }
+    }
+    // Installs n as the calling thread's setting and remembers the previous value for the destructor.
+    Scope::Scope(int n) {
+        old_val = num_threads;
+        num_threads = n;
+    }
+    // Puts back the setting that was active on this thread when the Scope was constructed.
+    Scope::~Scope() {
+        num_threads = old_val;
+    }
+};
diff --git a/parallel.h b/parallel.h
new file mode 100644
index 0000000..1f1e3b2
--- /dev/null
+++ b/parallel.h
@@ -0,0 +1,16 @@
+#ifndef FSLIC_PARALLEL_H
+#define FSLIC_PARALLEL_H
+
+namespace fsparallel {
+    bool parallelism_supported(); // true when the library was built with OpenMP
+    int nth(); // thread count to pass to OpenMP num_threads(); honours the innermost live Scope on this thread
+    class Scope { // sets the calling thread's thread-count setting for its lifetime, then restores the old one
+        int old_val; // setting in effect before construction (class members are private by default)
+    public:
+        explicit Scope(int n); // n < 0: use FSLIC_NUM_THREADS / OMP_NUM_THREADS, n == 0: OpenMP maximum, n > 0: exactly n
+        Scope(const Scope&) = delete; Scope& operator=(const Scope&) = delete; // a copy would restore old_val twice
+        ~Scope();
+    };
+};
+
+#endif
diff --git a/preemptive.h b/preemptive.h
index 18ae32c..81d10a2 100644
--- a/preemptive.h
+++ b/preemptive.h
@@ -3,6 +3,7 @@
 #include <cstdint>
 #include "fast-slic-common.h"
 #include "simd-helper.hpp"
+#include "parallel.h"
 
 struct PreemptiveTile {
     int sy, sx, ey, ex;
@@ -88,7 +89,7 @@ class PreemptiveGrid {
         if (!enabled) return;
         std::fill(num_changes.begin(), num_changes.end(), 0);
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int ci = 0; ci < CH; ci++) {
             for (int cj = 0; cj < CW; cj++) {
                 if (!is_active[CW * ci + cj]) continue;
diff --git a/setup.py b/setup.py
index 41d3c0a..ac96928 100755
--- a/setup.py
+++ b/setup.py
@@ -122,7 +122,7 @@ def _check_neon():
             Extension(
                 "cfast_slic",
                 include_dirs=[np.get_include()],
-                sources=["timer.cpp", "fast-slic.cpp", "cca.cpp", "context.cpp", "context-impl.cpp", "lsc.cpp", "lsc-builder.cpp", "cfast_slic.pyx"],
+                sources=["timer.cpp", "parallel.cpp", "fast-slic.cpp", "cca.cpp", "context.cpp", "context-impl.cpp", "lsc.cpp", "lsc-builder.cpp", "cfast_slic.pyx"],
                 extra_compile_args=extra_compile_args,
                 extra_link_args=extra_link_args,
                 language="c++",