From 1e139c49f63f4c9acfa9aab925231fbd0b72fb8c Mon Sep 17 00:00:00 2001 From: Algy Date: Fri, 30 Aug 2019 11:31:14 +0900 Subject: [PATCH] Add num_threads option --- arch/arm/neon.h | 3 +- arch/x64/avx2.h | 3 +- cca.cpp | 7 +++-- cfast_slic.pxd | 1 + cfast_slic.pyx | 3 ++ cielab.h | 7 +++-- context.cpp | 26 ++++++----------- context.h | 2 +- fast-slic.cpp | 3 +- fast_slic/base_slic.py | 4 ++- lsc.cpp | 19 +++++++------ parallel.cpp | 63 ++++++++++++++++++++++++++++++++++++++++++ parallel.h | 16 +++++++++++ preemptive.h | 3 +- setup.py | 2 +- 15 files changed, 122 insertions(+), 40 deletions(-) create mode 100644 parallel.cpp create mode 100644 parallel.h diff --git a/arch/arm/neon.h b/arch/arm/neon.h index 07880c5..6936a3a 100644 --- a/arch/arm/neon.h +++ b/arch/arm/neon.h @@ -2,6 +2,7 @@ #include #include "../../context.h" #include "../../lsc.h" +#include "../../parallel.h" inline void get_assignment_value_vec( const Cluster* cluster, @@ -244,7 +245,7 @@ namespace fslic { } virtual void normalize_features(float * __restrict numers[10], float* __restrict weights, int size) { - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int i = 0; i < size; i += 4) { float32x4_t reciprocal_w = vrecpeq_f32(vld1q_f32(&weights[i])); vst1q_f32(&numers[0][i], vmulq_f32(vld1q_f32(&numers[0][i]), reciprocal_w)); diff --git a/arch/x64/avx2.h b/arch/x64/avx2.h index 4ad5e49..74fcbaf 100644 --- a/arch/x64/avx2.h +++ b/arch/x64/avx2.h @@ -1,6 +1,7 @@ #include #include "../../context.h" #include "../../lsc.h" +#include "../../parallel.h" inline __m256 _mm256_set_ps1(float v) { @@ -280,7 +281,7 @@ namespace fslic { } void normalize_features(float * __restrict img_feats[10], float* __restrict weights, int size) { - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int i = 0; i < size; i += 8) { __m256 reciprocal_w = _mm256_rcp_ps(_mm256_loadu_ps(&weights[i])); _mm256_storeu_ps(&img_feats[0][i], 
_mm256_mul_ps(_mm256_loadu_ps(&img_feats[0][i]), reciprocal_w)); diff --git a/cca.cpp b/cca.cpp index a94a487..7493c43 100644 --- a/cca.cpp +++ b/cca.cpp @@ -13,6 +13,7 @@ #include #include #include "timer.h" +#include "parallel.h" typedef std::chrono::high_resolution_clock Clock; @@ -33,7 +34,7 @@ namespace cca { DisjointSet cc_set(H * W); std::vector seam_ys; - #pragma omp parallel + #pragma omp parallel num_threads(fsparallel::nth()) { bool is_first = true; int seam = 0; @@ -104,7 +105,7 @@ namespace cca { std::unique_ptr result { new ComponentSet(size) }; std::vector> rootset; std::vector root_offsets; - #pragma omp parallel + #pragma omp parallel num_threads(fsparallel::nth()) { #pragma omp single { @@ -256,7 +257,7 @@ namespace cca { { fstimer::Scope s("output"); - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int i = 0; i < H * W; i++) { out[i] = substitute[cc_set->component_assignment[i]]; } diff --git a/cfast_slic.pxd b/cfast_slic.pxd index 0f5dd71..a83ac02 100644 --- a/cfast_slic.pxd +++ b/cfast_slic.pxd @@ -113,6 +113,7 @@ cdef class NodeConnectivity: cdef class SlicModel: cdef Cluster* _c_clusters cdef readonly int num_components + cdef public int num_threads cdef public object initialized cdef public object arch_name cdef public object real_dist diff --git a/cfast_slic.pyx b/cfast_slic.pyx index cc616b9..a2b0cb8 100644 --- a/cfast_slic.pyx +++ b/cfast_slic.pyx @@ -27,6 +27,7 @@ cdef class SlicModel: raise ValueError("num_components should be a non-negative integer") self.num_components = num_components + self.num_threads = -1 self.arch_name = arch_name self.real_dist = real_dist self.real_dist_type = "standard" @@ -172,6 +173,7 @@ cdef class SlicModel: c_clusters, ) try: + context.num_threads = self.num_threads context.compactness = compactness context.min_size_factor = min_size_factor context.subsample_stride_config = subsample_stride @@ -227,6 +229,7 @@ cdef class SlicModel: raise RuntimeError("No such 
real_dist_type " + repr(self.real_dist_type)) try: + context_real_dist.num_threads = self.num_threads context_real_dist.compactness = compactness context_real_dist.min_size_factor = min_size_factor context_real_dist.subsample_stride_config = subsample_stride diff --git a/cielab.h b/cielab.h index 9ab5dd0..7811dac 100644 --- a/cielab.h +++ b/cielab.h @@ -4,6 +4,7 @@ #include #include #include +#include "parallel.h" /* def get_xyz_nonlin_tbl(a): v = a / 255. @@ -335,7 +336,7 @@ static FastCIELabCvt fast_cielab_cvt; static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool scale_L = false) { if (scale_L) { - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int s = 0; s < size; s += 4) { fast_cielab_cvt.convert( aligned_quad_image[s], @@ -347,7 +348,7 @@ static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int s ); } } else { - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int s = 0; s < size; s += 4) { fast_cielab_cvt.convert( aligned_quad_image[s], @@ -364,7 +365,7 @@ static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int s #if 0 static void rgb_to_cielab_orig(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool parallel) { - #pragma omp parallel for if(parallel) + #pragma omp parallel for if(parallel) num_threads(fsparallel::nth()) for (int s = 0; s < size; s += 4) { float r = _srgb_gamma_tbl[aligned_quad_image[s]], g = _srgb_gamma_tbl[aligned_quad_image[s+1]], diff --git a/context.cpp b/context.cpp index 83a2633..67508b8 100644 --- a/context.cpp +++ b/context.cpp @@ -2,17 +2,10 @@ #include "cca.h" #include "cielab.h" #include "timer.h" +#include "parallel.h" #include -#ifdef _OPENMP -#include -#endif - -#ifndef _OPENMP -#define omp_get_num_threads() 1 -#endif - namespace fslic { template BaseContext::~BaseContext() { @@ -105,20 +98,17 @@ namespace fslic { template bool 
BaseContext::parallelism_supported() { -#if defined(_OPENMP) - return true; -#else - return false; -#endif + return fsparallel::parallelism_supported(); } template void BaseContext::iterate(uint16_t *assignment, int max_iter) { { + fsparallel::Scope parallel_scope(num_threads); fstimer::Scope s("iterate"); { fstimer::Scope s("write_to_buffer"); - #pragma omp parallel + #pragma omp parallel num_threads(fsparallel::nth()) { #pragma omp for for (int i = 0; i < H; i++) { @@ -180,7 +170,7 @@ namespace fslic { } { fstimer::Scope s("write_back"); - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int i = 0; i < H; i++) { for (int j = 0; j < W; j++) { assignment[W * i + j] = this->assignment.get(i, j); @@ -197,7 +187,7 @@ namespace fslic { template void BaseContext::assign() { - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int i = 0; i < H; i++) { for (int j = 0; j < W; j++) { min_dists.get(i, j) = std::numeric_limits::max(); @@ -226,7 +216,7 @@ namespace fslic { grid_indices.push_back(i * cell_W + j); } } - #pragma omp parallel + #pragma omp parallel num_threads(fsparallel::nth()) { std::vector target_clusters; #pragma omp for @@ -312,7 +302,7 @@ namespace fslic { cluster_updatable[k] = preemptive_grid.is_updatable_cluster(clusters[k]); } - #pragma omp parallel + #pragma omp parallel num_threads(fsparallel::nth()) { std::vector local_acc_vec(K * 5, 0); // sum of [y, x, r, g, b] in cluster std::vector local_num_cluster_members(K, 0); diff --git a/context.h b/context.h index bd2227c..f0cdd70 100644 --- a/context.h +++ b/context.h @@ -23,7 +23,7 @@ namespace fslic { class BaseContext { public: int16_t subsample_stride_config = 3; - int num_threads = 0; + int num_threads = -1; float compactness = 20; float min_size_factor = 0.1; bool convert_to_lab = false; diff --git a/fast-slic.cpp b/fast-slic.cpp index 5849199..1dd0caa 100644 --- a/fast-slic.cpp +++ b/fast-slic.cpp @@ -1,6 +1,7 @@ #include 
#include "fast-slic.h" #include "context.h" +#include "parallel.h" extern "C" { static uint32_t symmetric_int_hash(uint32_t x, uint32_t y) { @@ -93,7 +94,7 @@ extern "C" { conn->num_neighbors = new int[K]; conn->neighbors = new uint32_t*[K]; - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int i = 0; i < K; i++) { const Cluster* cluster = clusters + i; int cell_center_x = cluster->x / S, cell_center_y = cluster->y / S; diff --git a/fast_slic/base_slic.py b/fast_slic/base_slic.py index 88e03ed..df0cd13 100644 --- a/fast_slic/base_slic.py +++ b/fast_slic/base_slic.py @@ -12,7 +12,8 @@ def __init__(self, convert_to_lab=True, preemptive=False, preemptive_thres=0.05, - manhattan_spatial_dist=True): + manhattan_spatial_dist=True, + num_threads=-1): self.compactness = compactness self.subsample_stride = subsample_stride self.min_size_factor = min_size_factor @@ -23,6 +24,7 @@ def __init__(self, self._slic_model.preemptive = preemptive self._slic_model.preemptive_thres = preemptive_thres self._slic_model.manhattan_spatial_dist = manhattan_spatial_dist + self._slic_model.num_threads = num_threads @property def convert_to_lab(self): diff --git a/lsc.cpp b/lsc.cpp index 2c86960..8fa1356 100644 --- a/lsc.cpp +++ b/lsc.cpp @@ -3,6 +3,7 @@ #include #include "lsc.h" #include "cielab.h" +#include "parallel.h" //map pixels into ten dimensional feature space @@ -54,7 +55,7 @@ namespace fslic { #ifdef FAST_SLIC_TIMER auto t1 = Clock::now(); #endif - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int i = 0; i < H; i++) { const uint8_t* image_row = quad_image.get_row(i); for (int j = 0; j < W; j++) { @@ -106,7 +107,7 @@ namespace fslic { const uint8_t* __restrict L = &image_planes[0][0]; const uint8_t* __restrict A = &image_planes[1][0]; const uint8_t* __restrict B = &image_planes[2][0]; - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int i = 0; i < len; i++) { 
image_features[0][i] = L_cosine_map[L[i]]; image_features[1][i] = L_sine_map[L[i]]; @@ -117,7 +118,7 @@ namespace fslic { } // x1, x2, y1, y2 - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int y = 0; y < H; y++) { std::copy( width_cosine_map.begin(), @@ -131,7 +132,7 @@ namespace fslic { ); } - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int y = 0; y < H; y++) { std::fill_n(&image_features[8][y * W], W, height_cosine_map[y]); std::fill_n(&image_features[9][y * W], W, height_sine_map[y]); @@ -144,7 +145,7 @@ namespace fslic { float sum_features[10]; std::fill_n(sum_features, 10, 0); { - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int ix_feat = 0; ix_feat < 10; ix_feat++) { float sum = 0; for (int i = 0; i < len; i++) { @@ -159,7 +160,7 @@ namespace fslic { #endif { - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int i = 0; i < len; i++) { float w = 0; for (int ix_feat = 0; ix_feat < 10; ix_feat++) { @@ -187,7 +188,7 @@ namespace fslic { std::fill_n(feat, K, 0); } - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int k = 0; k < K; k++) { const Cluster* cluster = &clusters[k]; int cluster_y = cluster->y, cluster_x = cluster->x; @@ -257,7 +258,7 @@ namespace fslic { wsums[k] = cluster_updatable[k]? 
0.0f : 1.0f; } - #pragma omp parallel + #pragma omp parallel num_threads(fsparallel::nth()) { float* __restrict local_feats[10]; float* __restrict local_wsums = new float[K]; @@ -327,7 +328,7 @@ namespace fslic { void ContextLSC::normalize_features(float *__restrict numers[10], float* __restrict weights, int size) { - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int i = 0; i < size; i++) { for (int ix_feat = 0; ix_feat < 10; ix_feat++) { numers[ix_feat][i] /= weights[i];
diff --git a/parallel.cpp b/parallel.cpp
new file mode 100644
index 0000000..5c19d43
--- /dev/null
+++ b/parallel.cpp
@@ -0,0 +1,75 @@
+#include <cstdlib>
+#include <string>
+#include <stdexcept>
+#ifdef _OPENMP
+#define PARALLELISM_SUPPORTED true
+#include <omp.h>
+#else
+#define PARALLELISM_SUPPORTED false
+// Without OpenMP, report a single available thread.
+#define omp_get_max_threads() 1
+#endif
+#include "parallel.h"
+
+namespace fsparallel {
+    // Thread count requested by the innermost live Scope on this thread:
+    // negative = consult the environment, 0 = OpenMP maximum, positive = exact.
+    thread_local int num_threads = -1;
+
+    // Reports whether this file was compiled with OpenMP (_OPENMP defined).
+    bool parallelism_supported() {
+        return PARALLELISM_SUPPORTED;
+    }
+
+    // Reads the environment variable `name` as an int. Returns -1 when the
+    // variable is unset or std::stoi cannot convert its value.
+    static int parse_env(const char *name) {
+        if (const char *env_p = std::getenv(name)) {
+            try {
+                return std::stoi(std::string(env_p));
+            } catch (const std::invalid_argument &) {
+                return -1;
+            } catch (const std::out_of_range &) {
+                // std::stoi also throws when the value does not fit in an int;
+                // treat that like an unset variable instead of propagating.
+                return -1;
+            }
+        }
+        return -1;
+    }
+
+    // Thread count to pass to the num_threads(...) OpenMP clauses.
+    // A positive Scope value is used as-is and 0 selects the OpenMP maximum.
+    // With a negative value, FSLIC_NUM_THREADS and then OMP_NUM_THREADS are
+    // consulted: a positive setting is returned, 0 selects the OpenMP maximum,
+    // and anything else moves on, ending at omp_get_max_threads().
+    int nth() {
+        if (num_threads > 0) {
+            return num_threads;
+        }
+        if (num_threads < 0) {
+            const char *const env_names[] = {"FSLIC_NUM_THREADS", "OMP_NUM_THREADS"};
+            for (const char *env_name : env_names) {
+                const int n = parse_env(env_name);
+                if (n > 0) {
+                    return n;
+                }
+                if (n == 0) {
+                    break;
+                }
+            }
+        }
+        return omp_get_max_threads();
+    }
+
+    // Installs n as this thread's requested thread count and remembers the
+    // previous value so the destructor can put it back.
+    Scope::Scope(int n) : old_val(num_threads) {
+        num_threads = n;
+    }
+
+    // Restores the value that was in effect before this Scope was created.
+    Scope::~Scope() {
+        num_threads = old_val;
+    }
+} // namespace fsparallel
diff --git a/parallel.h b/parallel.h
new file mode 100644
index 0000000..1f1e3b2
--- /dev/null
+++ b/parallel.h
@@ -0,0 +1,26 @@
+#ifndef FSLIC_PARALLEL_H
+#define FSLIC_PARALLEL_H
+
+namespace fsparallel {
+    // Whether the library was built with OpenMP support.
+    bool parallelism_supported();
+
+    // Thread count to use in num_threads(...) clauses on the calling thread.
+    int nth();
+
+    // Sets the calling thread's requested thread count for the lifetime of the
+    // object (negative = consult environment, 0 = OpenMP maximum) and restores
+    // the previous value on destruction.
+    class Scope {
+    private:
+        int old_val;
+    public:
+        explicit Scope(int n);
+        ~Scope();
+        // Copying would restore the saved value more than once.
+        Scope(const Scope&) = delete;
+        Scope& operator=(const Scope&) = delete;
+    };
+} // namespace fsparallel
+
+#endif
diff --git a/preemptive.h b/preemptive.h index 18ae32c..81d10a2 100644 --- a/preemptive.h +++ b/preemptive.h @@ -3,6 +3,7 @@ #include #include "fast-slic-common.h" #include "simd-helper.hpp" +#include "parallel.h" struct PreemptiveTile { int sy, sx, ey, ex; @@ -88,7 +89,7 @@ class PreemptiveGrid { if (!enabled) return; std::fill(num_changes.begin(), num_changes.end(), 0); - #pragma omp parallel for + #pragma omp parallel for num_threads(fsparallel::nth()) for (int ci = 0; ci < CH; ci++) { for (int cj = 0; cj < CW; cj++) { if (!is_active[CW * ci + cj]) continue; diff --git a/setup.py b/setup.py index 41d3c0a..ac96928 100755 --- a/setup.py +++ b/setup.py @@ -122,7 +122,7 @@ def _check_neon(): Extension( "cfast_slic", include_dirs=[np.get_include()], - sources=["timer.cpp", "fast-slic.cpp", "cca.cpp", "context.cpp", "context-impl.cpp", "lsc.cpp", "lsc-builder.cpp", "cfast_slic.pyx"], + sources=["timer.cpp", "parallel.cpp", "fast-slic.cpp", "cca.cpp", "context.cpp", "context-impl.cpp", "lsc.cpp", "lsc-builder.cpp", "cfast_slic.pyx"], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, language="c++",