Skip to content

Commit

Permalink
Add timer facility
Browse files Browse the repository at this point in the history
  • Loading branch information
Algy committed Aug 29, 2019
1 parent 37aa36d commit 72f4e1c
Show file tree
Hide file tree
Showing 8 changed files with 269 additions and 136 deletions.
95 changes: 53 additions & 42 deletions cca.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <ctime>
#include <queue>
#include <deque>
#include "timer.h"

typedef std::chrono::high_resolution_clock Clock;

Expand Down Expand Up @@ -189,66 +190,76 @@ namespace cca {
}
};

// auto t0 = Clock::now();
DisjointSet disjoint_set = assign_disjoint_set(out, H, W);
// auto t1 = Clock::now();
std::unique_ptr<ComponentSet> cc_set { disjoint_set.flatten() };
fstimer::Scope s("cca");
std::unique_ptr<ComponentSet> cc_set;
{
fstimer::begin("build_disjoint_set");
DisjointSet disjoint_set = assign_disjoint_set(out, H, W);
fstimer::end();
fstimer::begin("flatten");
cc_set = std::move(disjoint_set.flatten());
fstimer::end();
}

int num_components = cc_set->num_components;

std::vector<label_no_t> substitute(num_components, 0xFFFF);
std::vector<component_no_t> comps;
comps.reserve(max_label_size);

for (component_no_t component_no = 0; component_no < num_components; component_no++) {
if (cc_set->num_component_members[component_no] >= min_threshold) {
comps.push_back(component_no);
{
fstimer::Scope s("threshold_by_area");
for (component_no_t component_no = 0; component_no < num_components; component_no++) {
if (cc_set->num_component_members[component_no] >= min_threshold) {
comps.push_back(component_no);
}
}
}

areacmpcls areacmp(cc_set->num_component_members);
leader_index_cmpcls leadercmp(cc_set->component_leaders);

if ((std::size_t)max_label_size < comps.size()) {
std::partial_sort(comps.begin(), comps.begin() + max_label_size, comps.end(), areacmp);
comps.erase(comps.begin() + max_label_size, comps.end());
{
fstimer::Scope s("sort");
if ((std::size_t)max_label_size < comps.size()) {
std::partial_sort(comps.begin(), comps.begin() + max_label_size, comps.end(), areacmp);
comps.erase(comps.begin() + max_label_size, comps.end());
}
std::sort(comps.begin(), comps.end(), leadercmp);
}
std::sort(comps.begin(), comps.end(), leadercmp);
label_no_t next_label = 0;

for (component_no_t component_no : comps) {
substitute[component_no] = next_label++;
}
if (num_components > 0 && substitute[0] == 0xFFFF) substitute[0] = 0;

for (component_no_t component_no = 0; component_no < num_components; component_no++) {
if (substitute[component_no] != 0xFFFF) continue;
int leader_index = cc_set->component_leaders[component_no];
label_no_t subs_label = 0xFFFF;
if (leader_index % W > 0) {
subs_label = substitute[cc_set->component_assignment[leader_index - 1]];
} else {
subs_label = substitute[cc_set->component_assignment[leader_index - W]];
}
if (subs_label == 0xFFFF) {
subs_label = 0;
// std::cerr << "leader_y " << leader_index << "\n";
}
substitute[component_no] = subs_label;
}

{
fstimer::Scope s("substitute");
for (component_no_t component_no : comps) {
substitute[component_no] = next_label++;
}
if (num_components > 0 && substitute[0] == 0xFFFF) substitute[0] = 0;

#pragma omp parallel for
for (int i = 0; i < H * W; i++) {
out[i] = substitute[cc_set->component_assignment[i]];
for (component_no_t component_no = 0; component_no < num_components; component_no++) {
if (substitute[component_no] != 0xFFFF) continue;
int leader_index = cc_set->component_leaders[component_no];
label_no_t subs_label = 0xFFFF;
if (leader_index % W > 0) {
subs_label = substitute[cc_set->component_assignment[leader_index - 1]];
} else {
subs_label = substitute[cc_set->component_assignment[leader_index - W]];
}
if (subs_label == 0xFFFF) {
subs_label = 0;
// std::cerr << "leader_y " << leader_index << "\n";
}
substitute[component_no] = subs_label;
}
}

// auto t3 = Clock::now();
// auto t6 = Clock::now();

// std::cerr << " disjoint: " << micro(t1 -t0) << "us" << std::endl;
// std::cerr << " flatten: " << micro(t3 - t1) << "us" << std::endl;
// std::cerr << " unlabel_comp: " << micro(t4 - t3) << "us" << std::endl;
// std::cerr << " adj: " << micro(t5 - t4) << "us" << std::endl;
// std::cerr << " writeback_comp: " << micro(t6 - t5) << "us" << std::endl;
{
fstimer::Scope s("output");
#pragma omp parallel for
for (int i = 0; i < H * W; i++) {
out[i] = substitute[cc_set->component_assignment[i]];
}
}
}
};
4 changes: 4 additions & 0 deletions cfast_slic.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from libc.stdint cimport uint8_t, uint32_t, uint16_t, int16_t
from libcpp cimport bool
from libcpp.string cimport string

cdef extern from "fast-slic-common.h":
ctypedef struct Cluster:
Expand Down Expand Up @@ -46,6 +47,7 @@ cdef extern from "context.h" namespace "fslic":
void initialize_state() nogil
bool parallelism_supported() nogil
void iterate(uint16_t *assignment, int max_iter) nogil except +
string get_timing_report();

cdef cppclass ContextRealDist:
int16_t subsample_stride_config
Expand All @@ -64,6 +66,7 @@ cdef extern from "context.h" namespace "fslic":
void initialize_state() nogil
bool parallelism_supported() nogil
void iterate(uint16_t *assignment, int max_iter) nogil except +
string get_timing_report();

cdef cppclass ContextRealDistL2(ContextRealDist):
ContextRealDistL2(int H, int W, int K, const uint8_t* image, Cluster *clusters) except +
Expand Down Expand Up @@ -118,6 +121,7 @@ cdef class SlicModel:
cdef public object preemptive
cdef public float preemptive_thres
cdef public object manhattan_spatial_dist
cdef public object last_timing_report

cpdef void initialize(self, const uint8_t [:, :, ::1] image)
cpdef iterate(self, const uint8_t [:, :, ::1] image, int max_iter, float compactness, float min_size_factor, uint8_t subsample_stride)
Expand Down
2 changes: 2 additions & 0 deletions cfast_slic.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ cdef class SlicModel:
max_iter,
)
finally:
self.last_timing_report = context.get_timing_report().decode("utf-8")
del context
else:
if self.real_dist_type == 'standard':
Expand Down Expand Up @@ -240,6 +241,7 @@ cdef class SlicModel:
max_iter,
)
finally:
self.last_timing_report = context_real_dist.get_timing_report().decode("utf-8")
del context_real_dist
result = assignments.astype(np.int16)
result[result == 0xFFFF] = -1
Expand Down
145 changes: 64 additions & 81 deletions context.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "context.h"
#include "cca.h"
#include "cielab.h"
#include "timer.h"

#include <limits>

Expand All @@ -12,8 +13,6 @@
#define omp_get_num_threads() 1
#endif

// #define FAST_SLIC_TIMER

namespace fslic {
template<typename DistType>
BaseContext<DistType>::~BaseContext() {
Expand Down Expand Up @@ -136,100 +135,84 @@ namespace fslic {
template<typename DistType>
void BaseContext<DistType>::iterate(uint16_t *assignment, int max_iter) {
{
# ifdef FAST_SLIC_TIMER
auto t0 = Clock::now();
# endif
#pragma omp parallel
fstimer::Scope s("iterate");
{
#pragma omp for
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
for (int k = 0; k < 3; k++) {
quad_image.get(i, 4 * j + k) = image[i * W * 3 + 3 * j + k];
fstimer::Scope s("write_to_buffer");
#pragma omp parallel
{
#pragma omp for
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
for (int k = 0; k < 3; k++) {
quad_image.get(i, 4 * j + k) = image[i * W * 3 + 3 * j + k];
}
}
}
}

#pragma omp for
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
this->assignment.get(i, j) = 0xFFFF;
#pragma omp for
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
this->assignment.get(i, j) = 0xFFFF;
}
}
}
}

# ifdef FAST_SLIC_TIMER
auto t1 = Clock::now();
std::cerr << "Copy Image&initialize label map: " << std::chrono::duration_cast<std::chrono::microseconds>(t1 - t0).count() << "us\n";
# endif
}
{
fstimer::Scope s("cielab_conversion");
if (convert_to_lab) {
rgb_to_lab(&quad_image.get(0, 0), quad_image.contiguous_memory_size());
}
}

if (convert_to_lab) {
rgb_to_lab(&quad_image.get(0, 0), quad_image.contiguous_memory_size());
}
subsample_rem = 0;
subsample_stride = my_min<int>(subsample_stride_config, (int)(2 * S + 1));
{
fstimer::Scope s("before_iteration");
before_iteration();
}
preemptive_grid.initialize(preemptive, preemptive_thres, subsample_stride);

subsample_rem = 0;
subsample_stride = my_min<int>(subsample_stride_config, (int)(2 * S + 1));
# ifdef FAST_SLIC_TIMER
auto ts = Clock::now();
# endif
before_iteration();
# ifdef FAST_SLIC_TIMER
auto tt = Clock::now();
std::cerr << "before_iteration " << std::chrono::duration_cast<std::chrono::microseconds>(tt-ts).count() << "us\n";
# endif
preemptive_grid.initialize(preemptive, preemptive_thres, subsample_stride);

for (int i = 0; i < max_iter; i++) {
# ifdef FAST_SLIC_TIMER
auto t1 = Clock::now();
# endif
assign();
# ifdef FAST_SLIC_TIMER
auto t2 = Clock::now();
# endif
preemptive_grid.set_old_assignment(this->assignment);
update();
preemptive_grid.set_new_assignment(this->assignment);
# ifdef FAST_SLIC_TIMER
auto t21 = Clock::now();
# endif
after_update();
# ifdef FAST_SLIC_TIMER
auto t3 = Clock::now();
std::cerr << "assignment " << std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count() << "us\n";
std::cerr << "update "<< std::chrono::duration_cast<std::chrono::microseconds>(t3-t2).count() << "us (post " << std::chrono::duration_cast<std::chrono::microseconds>(t3 - t21).count() << "us)\n";
# endif
subsample_rem = (subsample_rem + 1) % subsample_stride;
}
preemptive_grid.finalize();
for (int i = 0; i < max_iter; i++) {
{
fstimer::Scope s("assign");
assign();
}

full_assign();
{
fstimer::Scope s("update");
preemptive_grid.set_old_assignment(this->assignment);
update();
preemptive_grid.set_new_assignment(this->assignment);
}

{
# ifdef FAST_SLIC_TIMER
auto t1 = Clock::now();
# endif

#pragma omp parallel for
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
assignment[W * i + j] = this->assignment.get(i, j);
{
fstimer::Scope s("after_update");
after_update();
}
subsample_rem = (subsample_rem + 1) % subsample_stride;
}
preemptive_grid.finalize();

{
fstimer::Scope s("full_assign");
full_assign();
}
{
fstimer::Scope s("write_back");
#pragma omp parallel for
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
assignment[W * i + j] = this->assignment.get(i, j);
}
}
}
{
fstimer::Scope s("enforce_connectivity");
enforce_connectivity(assignment);
}
# ifdef FAST_SLIC_TIMER
auto t2 = Clock::now();
std::cerr << "Write back assignment"<< std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count() << "us \n";
# endif
}
# ifdef FAST_SLIC_TIMER
auto t1 = Clock::now();
# endif
enforce_connectivity(assignment);
# ifdef FAST_SLIC_TIMER
auto t2 = Clock::now();
std::cerr << "enforce connectivity "<< std::chrono::duration_cast<std::chrono::microseconds>(t2-t1).count() << "us \n";
# endif
last_timing_report = fstimer::get_report();
}

template<typename DistType>
Expand Down
Loading

0 comments on commit 72f4e1c

Please sign in to comment.