Skip to content

Commit

Permalink
Add num_threads option
Browse files Browse the repository at this point in the history
  • Loading branch information
Algy committed Aug 30, 2019
1 parent 9b120d7 commit 1e139c4
Show file tree
Hide file tree
Showing 15 changed files with 122 additions and 40 deletions.
3 changes: 2 additions & 1 deletion arch/arm/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#include <cassert>
#include "../../context.h"
#include "../../lsc.h"
#include "../../parallel.h"

inline void get_assignment_value_vec(
const Cluster* cluster,
Expand Down Expand Up @@ -244,7 +245,7 @@ namespace fslic {
}

virtual void normalize_features(float * __restrict numers[10], float* __restrict weights, int size) {
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < size; i += 4) {
float32x4_t reciprocal_w = vrecpeq_f32(vld1q_f32(&weights[i]));
vst1q_f32(&numers[0][i], vmulq_f32(vld1q_f32(&numers[0][i]), reciprocal_w));
Expand Down
3 changes: 2 additions & 1 deletion arch/x64/avx2.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <immintrin.h>
#include "../../context.h"
#include "../../lsc.h"
#include "../../parallel.h"


inline __m256 _mm256_set_ps1(float v) {
Expand Down Expand Up @@ -280,7 +281,7 @@ namespace fslic {
}

void normalize_features(float * __restrict img_feats[10], float* __restrict weights, int size) {
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < size; i += 8) {
__m256 reciprocal_w = _mm256_rcp_ps(_mm256_loadu_ps(&weights[i]));
_mm256_storeu_ps(&img_feats[0][i], _mm256_mul_ps(_mm256_loadu_ps(&img_feats[0][i]), reciprocal_w));
Expand Down
7 changes: 4 additions & 3 deletions cca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <queue>
#include <deque>
#include "timer.h"
#include "parallel.h"

typedef std::chrono::high_resolution_clock Clock;

Expand All @@ -33,7 +34,7 @@ namespace cca {
DisjointSet cc_set(H * W);

std::vector<int> seam_ys;
#pragma omp parallel
#pragma omp parallel num_threads(fsparallel::nth())
{
bool is_first = true;
int seam = 0;
Expand Down Expand Up @@ -104,7 +105,7 @@ namespace cca {
std::unique_ptr<ComponentSet> result { new ComponentSet(size) };
std::vector<std::vector<tree_node_t>> rootset;
std::vector<int> root_offsets;
#pragma omp parallel
#pragma omp parallel num_threads(fsparallel::nth())
{
#pragma omp single
{
Expand Down Expand Up @@ -256,7 +257,7 @@ namespace cca {

{
fstimer::Scope s("output");
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < H * W; i++) {
out[i] = substitute[cc_set->component_assignment[i]];
}
Expand Down
1 change: 1 addition & 0 deletions cfast_slic.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ cdef class NodeConnectivity:
cdef class SlicModel:
cdef Cluster* _c_clusters
cdef readonly int num_components
cdef public int num_threads
cdef public object initialized
cdef public object arch_name
cdef public object real_dist
Expand Down
3 changes: 3 additions & 0 deletions cfast_slic.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ cdef class SlicModel:
raise ValueError("num_components should be a non-negative integer")

self.num_components = num_components
self.num_threads = -1
self.arch_name = arch_name
self.real_dist = real_dist
self.real_dist_type = "standard"
Expand Down Expand Up @@ -172,6 +173,7 @@ cdef class SlicModel:
c_clusters,
)
try:
context.num_threads = self.num_threads
context.compactness = compactness
context.min_size_factor = min_size_factor
context.subsample_stride_config = subsample_stride
Expand Down Expand Up @@ -227,6 +229,7 @@ cdef class SlicModel:
raise RuntimeError("No such real_dist_type " + repr(self.real_dist_type))

try:
context_real_dist.num_threads = self.num_threads
context_real_dist.compactness = compactness
context_real_dist.min_size_factor = min_size_factor
context_real_dist.subsample_stride_config = subsample_stride
Expand Down
7 changes: 4 additions & 3 deletions cielab.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cmath>
#include <cstdint>
#include <vector>
#include "parallel.h"
/*
def get_xyz_nonlin_tbl(a):
v = a / 255.
Expand Down Expand Up @@ -335,7 +336,7 @@ static FastCIELabCvt fast_cielab_cvt;

static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool scale_L = false) {
if (scale_L) {
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int s = 0; s < size; s += 4) {
fast_cielab_cvt.convert<true>(
aligned_quad_image[s],
Expand All @@ -347,7 +348,7 @@ static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int s
);
}
} else {
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int s = 0; s < size; s += 4) {
fast_cielab_cvt.convert<false>(
aligned_quad_image[s],
Expand All @@ -364,7 +365,7 @@ static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int s

#if 0
static void rgb_to_cielab_orig(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool parallel) {
#pragma omp parallel for if(parallel)
#pragma omp parallel for if(parallel) num_threads(fsparallel::nth())
for (int s = 0; s < size; s += 4) {
float r = _srgb_gamma_tbl[aligned_quad_image[s]],
g = _srgb_gamma_tbl[aligned_quad_image[s+1]],
Expand Down
26 changes: 8 additions & 18 deletions context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,10 @@
#include "cca.h"
#include "cielab.h"
#include "timer.h"
#include "parallel.h"

#include <limits>

#ifdef _OPENMP
#include <omp.h>
#endif

#ifndef _OPENMP
#define omp_get_num_threads() 1
#endif

namespace fslic {
template<typename DistType>
BaseContext<DistType>::~BaseContext() {
Expand Down Expand Up @@ -105,20 +98,17 @@ namespace fslic {

// Reports whether parallel execution is available in this build.
// As written, the result is decided at compile time: true when `_OPENMP` is
// defined, false otherwise.
//
// NOTE(review): both preprocessor branches below return, so the final
// `fsparallel::parallelism_supported()` call can never execute in this form.
// This text looks like a diff rendering whose +/- markers were lost, with the
// `#if`/`#else` branch being the removed implementation and the fsparallel
// delegation its replacement — confirm against the actual context.cpp before
// relying on either version.
template<typename DistType>
bool BaseContext<DistType>::parallelism_supported() {
#if defined(_OPENMP)
return true;
#else
return false;
#endif
// Unreachable as written (see the review note above); presumably the intended
// post-change body — TODO confirm what parallel.h defines for this helper.
return fsparallel::parallelism_supported();
}

template<typename DistType>
void BaseContext<DistType>::iterate(uint16_t *assignment, int max_iter) {
{
fsparallel::Scope parallel_scope(num_threads);
fstimer::Scope s("iterate");
{
fstimer::Scope s("write_to_buffer");
#pragma omp parallel
#pragma omp parallel num_threads(fsparallel::nth())
{
#pragma omp for
for (int i = 0; i < H; i++) {
Expand Down Expand Up @@ -180,7 +170,7 @@ namespace fslic {
}
{
fstimer::Scope s("write_back");
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
assignment[W * i + j] = this->assignment.get(i, j);
Expand All @@ -197,7 +187,7 @@ namespace fslic {

template<typename DistType>
void BaseContext<DistType>::assign() {
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
min_dists.get(i, j) = std::numeric_limits<DistType>::max();
Expand Down Expand Up @@ -226,7 +216,7 @@ namespace fslic {
grid_indices.push_back(i * cell_W + j);
}
}
#pragma omp parallel
#pragma omp parallel num_threads(fsparallel::nth())
{
std::vector<const Cluster*> target_clusters;
#pragma omp for
Expand Down Expand Up @@ -312,7 +302,7 @@ namespace fslic {
cluster_updatable[k] = preemptive_grid.is_updatable_cluster(clusters[k]);
}

#pragma omp parallel
#pragma omp parallel num_threads(fsparallel::nth())
{
std::vector<uint32_t> local_acc_vec(K * 5, 0); // sum of [y, x, r, g, b] in cluster
std::vector<uint32_t> local_num_cluster_members(K, 0);
Expand Down
2 changes: 1 addition & 1 deletion context.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ namespace fslic {
class BaseContext {
public:
int16_t subsample_stride_config = 3;
int num_threads = 0;
int num_threads = -1;
float compactness = 20;
float min_size_factor = 0.1;
bool convert_to_lab = false;
Expand Down
3 changes: 2 additions & 1 deletion fast-slic.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include <utility>
#include "fast-slic.h"
#include "context.h"
#include "parallel.h"

extern "C" {
static uint32_t symmetric_int_hash(uint32_t x, uint32_t y) {
Expand Down Expand Up @@ -93,7 +94,7 @@ extern "C" {
conn->num_neighbors = new int[K];
conn->neighbors = new uint32_t*[K];

#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < K; i++) {
const Cluster* cluster = clusters + i;
int cell_center_x = cluster->x / S, cell_center_y = cluster->y / S;
Expand Down
4 changes: 3 additions & 1 deletion fast_slic/base_slic.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ def __init__(self,
convert_to_lab=True,
preemptive=False,
preemptive_thres=0.05,
manhattan_spatial_dist=True):
manhattan_spatial_dist=True,
num_threads=-1):
self.compactness = compactness
self.subsample_stride = subsample_stride
self.min_size_factor = min_size_factor
Expand All @@ -23,6 +24,7 @@ def __init__(self,
self._slic_model.preemptive = preemptive
self._slic_model.preemptive_thres = preemptive_thres
self._slic_model.manhattan_spatial_dist = manhattan_spatial_dist
self._slic_model.num_threads = num_threads

@property
def convert_to_lab(self):
Expand Down
19 changes: 10 additions & 9 deletions lsc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <cmath>
#include "lsc.h"
#include "cielab.h"
#include "parallel.h"

//map pixels into ten dimensional feature space

Expand Down Expand Up @@ -54,7 +55,7 @@ namespace fslic {
#ifdef FAST_SLIC_TIMER
auto t1 = Clock::now();
#endif
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < H; i++) {
const uint8_t* image_row = quad_image.get_row(i);
for (int j = 0; j < W; j++) {
Expand Down Expand Up @@ -106,7 +107,7 @@ namespace fslic {
const uint8_t* __restrict L = &image_planes[0][0];
const uint8_t* __restrict A = &image_planes[1][0];
const uint8_t* __restrict B = &image_planes[2][0];
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < len; i++) {
image_features[0][i] = L_cosine_map[L[i]];
image_features[1][i] = L_sine_map[L[i]];
Expand All @@ -117,7 +118,7 @@ namespace fslic {
}
// x1, x2, y1, y2

#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int y = 0; y < H; y++) {
std::copy(
width_cosine_map.begin(),
Expand All @@ -131,7 +132,7 @@ namespace fslic {
);
}

#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int y = 0; y < H; y++) {
std::fill_n(&image_features[8][y * W], W, height_cosine_map[y]);
std::fill_n(&image_features[9][y * W], W, height_sine_map[y]);
Expand All @@ -144,7 +145,7 @@ namespace fslic {
float sum_features[10];
std::fill_n(sum_features, 10, 0);
{
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int ix_feat = 0; ix_feat < 10; ix_feat++) {
float sum = 0;
for (int i = 0; i < len; i++) {
Expand All @@ -159,7 +160,7 @@ namespace fslic {
#endif

{
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < len; i++) {
float w = 0;
for (int ix_feat = 0; ix_feat < 10; ix_feat++) {
Expand Down Expand Up @@ -187,7 +188,7 @@ namespace fslic {
std::fill_n(feat, K, 0);
}

#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int k = 0; k < K; k++) {
const Cluster* cluster = &clusters[k];
int cluster_y = cluster->y, cluster_x = cluster->x;
Expand Down Expand Up @@ -257,7 +258,7 @@ namespace fslic {
wsums[k] = cluster_updatable[k]? 0.0f : 1.0f;
}

#pragma omp parallel
#pragma omp parallel num_threads(fsparallel::nth())
{
float* __restrict local_feats[10];
float* __restrict local_wsums = new float[K];
Expand Down Expand Up @@ -327,7 +328,7 @@ namespace fslic {


void ContextLSC::normalize_features(float *__restrict numers[10], float* __restrict weights, int size) {
#pragma omp parallel for
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < size; i++) {
for (int ix_feat = 0; ix_feat < 10; ix_feat++) {
numers[ix_feat][i] /= weights[i];
Expand Down
Loading

0 comments on commit 1e139c4

Please sign in to comment.