Add num_threads option

Algy · Aug 30, 2019 · 1e139c4
1 parent 9b120d7
commit 1e139c4
Show file tree

Hide file tree

Showing 15 changed files with 122 additions and 40 deletions.
diff --git a/arch/arm/neon.h b/arch/arm/neon.h
@@ -2,6 +2,7 @@
 #include <cassert>
 #include "../../context.h"
 #include "../../lsc.h"
+#include "../../parallel.h"
 
 inline void get_assignment_value_vec(
         const Cluster* cluster,
@@ -244,7 +245,7 @@ namespace fslic {
         }
 
     	virtual void normalize_features(float * __restrict numers[10], float* __restrict weights, int size) {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < size; i += 4) {
                 float32x4_t reciprocal_w = vrecpeq_f32(vld1q_f32(&weights[i]));
                 vst1q_f32(&numers[0][i], vmulq_f32(vld1q_f32(&numers[0][i]), reciprocal_w));

diff --git a/arch/x64/avx2.h b/arch/x64/avx2.h
@@ -1,6 +1,7 @@
 #include <immintrin.h>
 #include "../../context.h"
 #include "../../lsc.h"
+#include "../../parallel.h"
 
 
 inline __m256 _mm256_set_ps1(float v) {
@@ -280,7 +281,7 @@ namespace fslic {
         }
 
     	void normalize_features(float * __restrict img_feats[10], float* __restrict weights, int size) {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < size; i += 8) {
                 __m256 reciprocal_w = _mm256_rcp_ps(_mm256_loadu_ps(&weights[i]));
                 _mm256_storeu_ps(&img_feats[0][i], _mm256_mul_ps(_mm256_loadu_ps(&img_feats[0][i]), reciprocal_w));

diff --git a/cca.cpp b/cca.cpp
@@ -13,6 +13,7 @@
 #include <queue>
 #include <deque>
 #include "timer.h"
+#include "parallel.h"
 
 typedef std::chrono::high_resolution_clock Clock;
 
@@ -33,7 +34,7 @@ namespace cca {
         DisjointSet cc_set(H * W);
 
         std::vector<int> seam_ys;
-        #pragma omp parallel
+        #pragma omp parallel num_threads(fsparallel::nth())
         {
             bool is_first = true;
             int seam = 0;
@@ -104,7 +105,7 @@ namespace cca {
         std::unique_ptr<ComponentSet> result { new ComponentSet(size) };
         std::vector<std::vector<tree_node_t>> rootset;
         std::vector<int> root_offsets;
-        #pragma omp parallel
+        #pragma omp parallel num_threads(fsparallel::nth())
         {
             #pragma omp single
             {
@@ -256,7 +257,7 @@ namespace cca {
 
         {
             fstimer::Scope s("output");
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < H * W; i++) {
                 out[i] = substitute[cc_set->component_assignment[i]];
             }

diff --git a/cfast_slic.pxd b/cfast_slic.pxd
@@ -113,6 +113,7 @@ cdef class NodeConnectivity:
 cdef class SlicModel:
     cdef Cluster* _c_clusters
     cdef readonly int num_components
+    cdef public int num_threads
     cdef public object initialized
     cdef public object arch_name
     cdef public object real_dist

diff --git a/cfast_slic.pyx b/cfast_slic.pyx
@@ -27,6 +27,7 @@ cdef class SlicModel:
             raise ValueError("num_components should be a non-negative integer")
 
         self.num_components = num_components
+        self.num_threads = -1
         self.arch_name = arch_name
         self.real_dist = real_dist
         self.real_dist_type = "standard"
@@ -172,6 +173,7 @@ cdef class SlicModel:
                 c_clusters,
             )
             try:
+                context.num_threads = self.num_threads
                 context.compactness = compactness
                 context.min_size_factor = min_size_factor
                 context.subsample_stride_config = subsample_stride
@@ -227,6 +229,7 @@ cdef class SlicModel:
                 raise RuntimeError("No such real_dist_type " + repr(self.real_dist_type))
 
             try:
+                context_real_dist.num_threads = self.num_threads
                 context_real_dist.compactness = compactness
                 context_real_dist.min_size_factor = min_size_factor
                 context_real_dist.subsample_stride_config = subsample_stride

diff --git a/cielab.h b/cielab.h
@@ -4,6 +4,7 @@
 #include <cmath>
 #include <cstdint>
 #include <vector>
+#include "parallel.h"
 /*
 def get_xyz_nonlin_tbl(a):
     v = a / 255.
@@ -335,7 +336,7 @@ static FastCIELabCvt fast_cielab_cvt;
 
 static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool scale_L = false) {
     if (scale_L) {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int s = 0; s < size; s += 4) {
             fast_cielab_cvt.convert<true>(
                 aligned_quad_image[s],
@@ -347,7 +348,7 @@ static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int s
             );
         }
     } else {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int s = 0; s < size; s += 4) {
             fast_cielab_cvt.convert<false>(
                 aligned_quad_image[s],
@@ -364,7 +365,7 @@ static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int s
 
 #if 0
 static void rgb_to_cielab_orig(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool parallel) {
-    #pragma omp parallel for if(parallel)
+    #pragma omp parallel for if(parallel) num_threads(fsparallel::nth())
     for (int s = 0; s < size; s += 4) {
         float r = _srgb_gamma_tbl[aligned_quad_image[s]],
             g = _srgb_gamma_tbl[aligned_quad_image[s+1]],

diff --git a/context.cpp b/context.cpp
@@ -2,17 +2,10 @@
 #include "cca.h"
 #include "cielab.h"
 #include "timer.h"
+#include "parallel.h"
 
 #include <limits>
 
-#ifdef _OPENMP
-#include <omp.h>
-#endif
-
-#ifndef _OPENMP
-#define omp_get_num_threads() 1
-#endif
-
 namespace fslic {
     template<typename DistType>
     BaseContext<DistType>::~BaseContext() {
@@ -105,20 +98,17 @@ namespace fslic {
 
     template<typename DistType>
     bool BaseContext<DistType>::parallelism_supported() {
-#if defined(_OPENMP)
-    return true;
-#else
-    return false;
-#endif
+        return fsparallel::parallelism_supported();
     }
 
     template<typename DistType>
     void BaseContext<DistType>::iterate(uint16_t *assignment, int max_iter) {
         {
+            fsparallel::Scope parallel_scope(num_threads);
             fstimer::Scope s("iterate");
             {
                 fstimer::Scope s("write_to_buffer");
-                #pragma omp parallel
+                #pragma omp parallel num_threads(fsparallel::nth())
                 {
                     #pragma omp for
                     for (int i = 0; i < H; i++) {
@@ -180,7 +170,7 @@ namespace fslic {
             }
             {
                 fstimer::Scope s("write_back");
-                #pragma omp parallel for
+                #pragma omp parallel for num_threads(fsparallel::nth())
                 for (int i = 0; i < H; i++) {
                     for (int j = 0; j < W; j++) {
                         assignment[W * i + j] = this->assignment.get(i, j);
@@ -197,7 +187,7 @@ namespace fslic {
 
     template<typename DistType>
     void BaseContext<DistType>::assign() {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int i = 0; i < H; i++) {
             for (int j = 0; j < W; j++) {
                 min_dists.get(i, j) = std::numeric_limits<DistType>::max();
@@ -226,7 +216,7 @@ namespace fslic {
                     grid_indices.push_back(i * cell_W + j);
                 }
             }
-            #pragma omp parallel
+            #pragma omp parallel num_threads(fsparallel::nth())
             {
                 std::vector<const Cluster*> target_clusters;
                 #pragma omp for
@@ -312,7 +302,7 @@ namespace fslic {
             cluster_updatable[k] = preemptive_grid.is_updatable_cluster(clusters[k]);
         }
 
-        #pragma omp parallel
+        #pragma omp parallel num_threads(fsparallel::nth())
         {
             std::vector<uint32_t> local_acc_vec(K * 5, 0); // sum of [y, x, r, g, b] in cluster
             std::vector<uint32_t> local_num_cluster_members(K, 0);

diff --git a/context.h b/context.h
@@ -23,7 +23,7 @@ namespace fslic {
     class BaseContext {
     public:
         int16_t subsample_stride_config = 3;
-        int num_threads = 0;
+        int num_threads = -1;
         float compactness = 20;
         float min_size_factor = 0.1;
         bool convert_to_lab = false;

diff --git a/fast-slic.cpp b/fast-slic.cpp
@@ -1,6 +1,7 @@
 #include <utility>
 #include "fast-slic.h"
 #include "context.h"
+#include "parallel.h"
 
 extern "C" {
     static uint32_t symmetric_int_hash(uint32_t x, uint32_t y) {
@@ -93,7 +94,7 @@ extern "C" {
         conn->num_neighbors = new int[K];
         conn->neighbors = new uint32_t*[K];
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int i = 0; i < K; i++) {
             const Cluster* cluster = clusters + i;
             int cell_center_x = cluster->x / S, cell_center_y = cluster->y / S;

diff --git a/fast_slic/base_slic.py b/fast_slic/base_slic.py
@@ -12,7 +12,8 @@ def __init__(self,
                  convert_to_lab=True,
                  preemptive=False,
                  preemptive_thres=0.05,
-                 manhattan_spatial_dist=True):
+                 manhattan_spatial_dist=True,
+                 num_threads=-1):
         self.compactness = compactness
         self.subsample_stride = subsample_stride
         self.min_size_factor = min_size_factor
@@ -23,6 +24,7 @@ def __init__(self,
         self._slic_model.preemptive = preemptive
         self._slic_model.preemptive_thres = preemptive_thres
         self._slic_model.manhattan_spatial_dist = manhattan_spatial_dist
+        self._slic_model.num_threads = num_threads
 
     @property
     def convert_to_lab(self):

diff --git a/lsc.cpp b/lsc.cpp
@@ -3,6 +3,7 @@
 #include <cmath>
 #include "lsc.h"
 #include "cielab.h"
+#include "parallel.h"
 
 //map pixels into ten dimensional feature space
 
@@ -54,7 +55,7 @@ namespace fslic {
         #ifdef FAST_SLIC_TIMER
         auto t1 = Clock::now();
         #endif
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int i = 0; i < H; i++) {
             const uint8_t* image_row = quad_image.get_row(i);
             for (int j = 0; j < W; j++) {
@@ -106,7 +107,7 @@ namespace fslic {
             const uint8_t* __restrict L = &image_planes[0][0];
             const uint8_t* __restrict A = &image_planes[1][0];
             const uint8_t* __restrict B = &image_planes[2][0];
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < len; i++) {
                 image_features[0][i] = L_cosine_map[L[i]];
     			image_features[1][i] = L_sine_map[L[i]];
@@ -117,7 +118,7 @@ namespace fslic {
             }
             // x1, x2, y1, y2
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int y = 0; y < H; y++) {
                 std::copy(
                     width_cosine_map.begin(),
@@ -131,7 +132,7 @@ namespace fslic {
                 );
             }
 
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int y = 0; y < H; y++) {
                 std::fill_n(&image_features[8][y * W], W, height_cosine_map[y]);
                 std::fill_n(&image_features[9][y * W], W, height_sine_map[y]);
@@ -144,7 +145,7 @@ namespace fslic {
 	    float sum_features[10];
         std::fill_n(sum_features, 10, 0);
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int ix_feat = 0; ix_feat < 10; ix_feat++) {
                 float sum = 0;
                 for (int i = 0; i < len; i++) {
@@ -159,7 +160,7 @@ namespace fslic {
         #endif
 
         {
-            #pragma omp parallel for
+            #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < len; i++) {
                 float w = 0;
                 for (int ix_feat = 0; ix_feat < 10; ix_feat++) {
@@ -187,7 +188,7 @@ namespace fslic {
             std::fill_n(feat, K, 0);
         }
 
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int k = 0; k < K; k++) {
             const Cluster* cluster = &clusters[k];
             int cluster_y = cluster->y, cluster_x = cluster->x;
@@ -257,7 +258,7 @@ namespace fslic {
             wsums[k] = cluster_updatable[k]? 0.0f : 1.0f;
         }
 
-        #pragma omp parallel
+        #pragma omp parallel num_threads(fsparallel::nth())
         {
             float* __restrict local_feats[10];
             float* __restrict local_wsums = new float[K];
@@ -327,7 +328,7 @@ namespace fslic {
 
 
 	void ContextLSC::normalize_features(float *__restrict numers[10], float* __restrict weights, int size) {
-        #pragma omp parallel for
+        #pragma omp parallel for num_threads(fsparallel::nth())
         for (int i = 0; i < size; i++) {
             for (int ix_feat = 0; ix_feat < 10; ix_feat++) {
                 numers[ix_feat][i] /= weights[i];