From 72f4e1ccad46348bcce0cd04b253b87063dee7b3 Mon Sep 17 00:00:00 2001 From: Algy Date: Thu, 29 Aug 2019 22:53:29 +0900 Subject: [PATCH] Add timer facility --- cca.cpp | 95 ++++++++++++++++++-------------- cfast_slic.pxd | 4 ++ cfast_slic.pyx | 2 + context.cpp | 145 ++++++++++++++++++++++--------------------------- context.h | 25 +++++---- setup.py | 2 +- timer.cpp | 54 ++++++++++++++++++ timer.h | 78 ++++++++++++++++++++++++++ 8 files changed, 269 insertions(+), 136 deletions(-) create mode 100644 timer.cpp create mode 100644 timer.h diff --git a/cca.cpp b/cca.cpp index 5a82e74..a94a487 100644 --- a/cca.cpp +++ b/cca.cpp @@ -12,6 +12,7 @@ #include #include #include +#include "timer.h" typedef std::chrono::high_resolution_clock Clock; @@ -189,66 +190,76 @@ namespace cca { } }; - // auto t0 = Clock::now(); - DisjointSet disjoint_set = assign_disjoint_set(out, H, W); - // auto t1 = Clock::now(); - std::unique_ptr cc_set { disjoint_set.flatten() }; + fstimer::Scope s("cca"); + std::unique_ptr cc_set; + { + fstimer::begin("build_disjoint_set"); + DisjointSet disjoint_set = assign_disjoint_set(out, H, W); + fstimer::end(); + fstimer::begin("flatten"); + cc_set = std::move(disjoint_set.flatten()); + fstimer::end(); + } int num_components = cc_set->num_components; + std::vector substitute(num_components, 0xFFFF); std::vector comps; comps.reserve(max_label_size); - for (component_no_t component_no = 0; component_no < num_components; component_no++) { - if (cc_set->num_component_members[component_no] >= min_threshold) { - comps.push_back(component_no); + { + fstimer::Scope s("threshold_by_area"); + for (component_no_t component_no = 0; component_no < num_components; component_no++) { + if (cc_set->num_component_members[component_no] >= min_threshold) { + comps.push_back(component_no); + } } } areacmpcls areacmp(cc_set->num_component_members); leader_index_cmpcls leadercmp(cc_set->component_leaders); - if ((std::size_t)max_label_size < comps.size()) { - std::partial_sort(comps.begin(), comps.begin() + max_label_size, comps.end(), areacmp); - comps.erase(comps.begin() + max_label_size, comps.end()); + { + fstimer::Scope s("sort"); + if ((std::size_t)max_label_size < comps.size()) { + std::partial_sort(comps.begin(), comps.begin() + max_label_size, comps.end(), areacmp); + comps.erase(comps.begin() + max_label_size, comps.end()); + } + std::sort(comps.begin(), comps.end(), leadercmp); } - std::sort(comps.begin(), comps.end(), leadercmp); label_no_t next_label = 0; - for (component_no_t component_no : comps) { - substitute[component_no] = next_label++; - } - if (num_components > 0 && substitute[0] == 0xFFFF) substitute[0] = 0; - - for (component_no_t component_no = 0; component_no < num_components; component_no++) { - if (substitute[component_no] != 0xFFFF) continue; - int leader_index = cc_set->component_leaders[component_no]; - label_no_t subs_label = 0xFFFF; - if (leader_index % W > 0) { - subs_label = substitute[cc_set->component_assignment[leader_index - 1]]; - } else { - subs_label = substitute[cc_set->component_assignment[leader_index - W]]; - } - if (subs_label == 0xFFFF) { - subs_label = 0; - // std::cerr << "leader_y " << leader_index << "\n"; - } - substitute[component_no] = subs_label; - } - + { + fstimer::Scope s("substitute"); + for (component_no_t component_no : comps) { + substitute[component_no] = next_label++; + } + if (num_components > 0 && substitute[0] == 0xFFFF) substitute[0] = 0; - #pragma omp parallel for - for (int i = 0; i < H * W; i++) { - out[i] = substitute[cc_set->component_assignment[i]]; + for (component_no_t component_no = 0; component_no < num_components; component_no++) { + if (substitute[component_no] != 0xFFFF) continue; + int leader_index = cc_set->component_leaders[component_no]; + label_no_t subs_label = 0xFFFF; + if (leader_index % W > 0) { + subs_label = substitute[cc_set->component_assignment[leader_index - 1]]; + } else { + subs_label = substitute[cc_set->component_assignment[leader_index - W]]; + } + if (subs_label == 0xFFFF) { + subs_label = 0; + // std::cerr << "leader_y " << leader_index << "\n"; + } + substitute[component_no] = subs_label; + } } - // auto t3 = Clock::now(); - // auto t6 = Clock::now(); - // std::cerr << " disjoint: " << micro(t1 -t0) << "us" << std::endl; - // std::cerr << " flatten: " << micro(t3 - t1) << "us" << std::endl; - // std::cerr << " unlabel_comp: " << micro(t4 - t3) << "us" << std::endl; - // std::cerr << " adj: " << micro(t5 - t4) << "us" << std::endl; - // std::cerr << " writeback_comp: " << micro(t6 - t5) << "us" << std::endl; + { + fstimer::Scope s("output"); + #pragma omp parallel for + for (int i = 0; i < H * W; i++) { + out[i] = substitute[cc_set->component_assignment[i]]; + } + } } }; diff --git a/cfast_slic.pxd b/cfast_slic.pxd index 7efdf39..0f5dd71 100644 --- a/cfast_slic.pxd +++ b/cfast_slic.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport uint8_t, uint32_t, uint16_t, int16_t from libcpp cimport bool +from libcpp.string cimport string cdef extern from "fast-slic-common.h": ctypedef struct Cluster: @@ -46,6 +47,7 @@ cdef extern from "context.h" namespace "fslic": void initialize_state() nogil bool parallelism_supported() nogil void iterate(uint16_t *assignment, int max_iter) nogil except + + string get_timing_report(); cdef cppclass ContextRealDist: int16_t subsample_stride_config @@ -64,6 +66,7 @@ cdef extern from "context.h" namespace "fslic": void initialize_state() nogil bool parallelism_supported() nogil void iterate(uint16_t *assignment, int max_iter) nogil except + + string get_timing_report(); cdef cppclass ContextRealDistL2(ContextRealDist): ContextRealDistL2(int H, int W, int K, const uint8_t* image, Cluster *clusters) except + @@ -118,6 +121,7 @@ cdef class SlicModel: cdef public object preemptive cdef public float preemptive_thres cdef public object manhattan_spatial_dist + cdef public object last_timing_report cpdef void initialize(self, const uint8_t [:, :, ::1] image) cpdef iterate(self, const uint8_t [:, :, ::1] image, int max_iter, float compactness, float min_size_factor, uint8_t subsample_stride) diff --git a/cfast_slic.pyx b/cfast_slic.pyx index 960b306..cc616b9 100644 --- a/cfast_slic.pyx +++ b/cfast_slic.pyx @@ -186,6 +186,7 @@ cdef class SlicModel: max_iter, ) finally: + self.last_timing_report = context.get_timing_report().decode("utf-8") del context else: if self.real_dist_type == 'standard': @@ -240,6 +241,7 @@ cdef class SlicModel: max_iter, ) finally: + self.last_timing_report = context_real_dist.get_timing_report().decode("utf-8") del context_real_dist result = assignments.astype(np.int16) result[result == 0xFFFF] = -1 diff --git a/context.cpp b/context.cpp index e3b3640..c2d98fb 100644 --- a/context.cpp +++ b/context.cpp @@ -1,6 +1,7 @@ #include "context.h" #include "cca.h" #include "cielab.h" +#include "timer.h" #include @@ -12,8 +13,6 @@ #define omp_get_num_threads() 1 #endif -// #define FAST_SLIC_TIMER - namespace fslic { template BaseContext::~BaseContext() { @@ -136,100 +135,84 @@ namespace fslic { template void BaseContext::iterate(uint16_t *assignment, int max_iter) { { -# ifdef FAST_SLIC_TIMER - auto t0 = Clock::now(); -# endif - #pragma omp parallel + fstimer::Scope s("iterate"); { - #pragma omp for - for (int i = 0; i < H; i++) { - for (int j = 0; j < W; j++) { - for (int k = 0; k < 3; k++) { - quad_image.get(i, 4 * j + k) = image[i * W * 3 + 3 * j + k]; + fstimer::Scope s("write_to_buffer"); + #pragma omp parallel + { + #pragma omp for + for (int i = 0; i < H; i++) { + for (int j = 0; j < W; j++) { + for (int k = 0; k < 3; k++) { + quad_image.get(i, 4 * j + k) = image[i * W * 3 + 3 * j + k]; + } } } - } - #pragma omp for - for (int i = 0; i < H; i++) { - for (int j = 0; j < W; j++) { - this->assignment.get(i, j) = 0xFFFF; + #pragma omp for + for (int i = 0; i < H; i++) { + for (int j = 0; j < W; j++) { + this->assignment.get(i, j) = 0xFFFF; + } } } } -# ifdef FAST_SLIC_TIMER - auto t1 = Clock::now(); - std::cerr << "Copy Image&initialize label map: " << std::chrono::duration_cast(t1 - t0).count() << "us\n"; -# endif - } + { + fstimer::Scope s("cielab_conversion"); + if (convert_to_lab) { + rgb_to_lab(&quad_image.get(0, 0), quad_image.contiguous_memory_size()); + } + } - if (convert_to_lab) { - rgb_to_lab(&quad_image.get(0, 0), quad_image.contiguous_memory_size()); - } + subsample_rem = 0; + subsample_stride = my_min(subsample_stride_config, (int)(2 * S + 1)); + { + fstimer::Scope s("before_iteration"); + before_iteration(); + } + preemptive_grid.initialize(preemptive, preemptive_thres, subsample_stride); - subsample_rem = 0; - subsample_stride = my_min(subsample_stride_config, (int)(2 * S + 1)); -# ifdef FAST_SLIC_TIMER - auto ts = Clock::now(); -# endif - before_iteration(); -# ifdef FAST_SLIC_TIMER - auto tt = Clock::now(); - std::cerr << "before_iteration " << std::chrono::duration_cast(tt-ts).count() << "us\n"; -# endif - preemptive_grid.initialize(preemptive, preemptive_thres, subsample_stride); - - for (int i = 0; i < max_iter; i++) { -# ifdef FAST_SLIC_TIMER - auto t1 = Clock::now(); -# endif - assign(); -# ifdef FAST_SLIC_TIMER - auto t2 = Clock::now(); -# endif - preemptive_grid.set_old_assignment(this->assignment); - update(); - preemptive_grid.set_new_assignment(this->assignment); -# ifdef FAST_SLIC_TIMER - auto t21 = Clock::now(); -# endif - after_update(); -# ifdef FAST_SLIC_TIMER - auto t3 = Clock::now(); - std::cerr << "assignment " << std::chrono::duration_cast(t2-t1).count() << "us\n"; - std::cerr << "update "<< std::chrono::duration_cast(t3-t2).count() << "us (post " << std::chrono::duration_cast(t3 - t21).count() << "us)\n"; -# endif - subsample_rem = (subsample_rem + 1) % subsample_stride; - } - preemptive_grid.finalize(); + for (int i = 0; i < max_iter; i++) { + { + fstimer::Scope s("assign"); + assign(); + } - full_assign(); + { + fstimer::Scope s("update"); + preemptive_grid.set_old_assignment(this->assignment); + update(); + preemptive_grid.set_new_assignment(this->assignment); + } - { -# ifdef FAST_SLIC_TIMER - auto t1 = Clock::now(); -# endif - - #pragma omp parallel for - for (int i = 0; i < H; i++) { - for (int j = 0; j < W; j++) { - assignment[W * i + j] = this->assignment.get(i, j); + { + fstimer::Scope s("after_update"); + after_update(); } + subsample_rem = (subsample_rem + 1) % subsample_stride; + } + preemptive_grid.finalize(); + + { + fstimer::Scope s("full_assign"); + full_assign(); + } + { + fstimer::Scope s("write_back"); + #pragma omp parallel for + for (int i = 0; i < H; i++) { + for (int j = 0; j < W; j++) { + assignment[W * i + j] = this->assignment.get(i, j); + } + } + } + { + fstimer::Scope s("enforce_connectivity"); + enforce_connectivity(assignment); } -# ifdef FAST_SLIC_TIMER - auto t2 = Clock::now(); - std::cerr << "Write back assignment"<< std::chrono::duration_cast(t2-t1).count() << "us \n"; -# endif } -# ifdef FAST_SLIC_TIMER - auto t1 = Clock::now(); -# endif - enforce_connectivity(assignment); -# ifdef FAST_SLIC_TIMER - auto t2 = Clock::now(); - std::cerr << "enforce connectivity "<< std::chrono::duration_cast(t2-t1).count() << "us \n"; -# endif + last_timing_report = fstimer::get_report(); } template diff --git a/context.h b/context.h index 384c690..bd2227c 100644 --- a/context.h +++ b/context.h @@ -47,6 +47,8 @@ namespace fslic { simd_helper::AlignedArray spatial_dist_patch; PreemptiveGrid preemptive_grid; + public: + std::string last_timing_report; public: BaseContext(int H, int W, int K, const uint8_t* image, Cluster *clusters) : H(H), W(W), K(K), image(image), clusters(clusters), S(sqrt(H * W / K)), @@ -56,30 +58,29 @@ namespace fslic { spatial_dist_patch(2 * S + 1, 2 * S + 1), preemptive_grid(H, W, S) {}; virtual ~BaseContext(); - public: - template - inline T fit_to_stride(T value) { - T plus_rem = subsample_rem - value % subsample_stride; - if (plus_rem < 0) plus_rem += subsample_stride; - return value + plus_rem; - } - - inline bool valid_subsample_row(int i) { - return i % subsample_stride == subsample_rem; - } - public: void initialize_clusters(); void initialize_state(); void enforce_connectivity(uint16_t *assignment); bool parallelism_supported(); void iterate(uint16_t *assignment, int max_iter); + std::string get_timing_report() { return last_timing_report; }; private: void prepare_spatial(); void assign(); void update(); protected: void full_assign(); + template + inline T fit_to_stride(T value) { + T plus_rem = subsample_rem - value % subsample_stride; + if (plus_rem < 0) plus_rem += subsample_stride; + return value + plus_rem; + } + + inline bool valid_subsample_row(int i) { + return i % subsample_stride == subsample_rem; + } protected: virtual void before_iteration() {}; virtual void after_update() {}; diff --git a/setup.py b/setup.py index a4e8871..41d3c0a 100755 --- a/setup.py +++ b/setup.py @@ -122,7 +122,7 @@ def _check_neon(): Extension( "cfast_slic", include_dirs=[np.get_include()], - sources=["fast-slic.cpp", "cca.cpp", "context.cpp", "context-impl.cpp", "lsc.cpp", "lsc-builder.cpp", "cfast_slic.pyx"], + sources=["timer.cpp", "fast-slic.cpp", "cca.cpp", "context.cpp", "context-impl.cpp", "lsc.cpp", "lsc-builder.cpp", "cfast_slic.pyx"], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, language="c++", diff --git a/timer.cpp b/timer.cpp new file mode 100644 index 0000000..52f67d8 --- /dev/null +++ b/timer.cpp @@ -0,0 +1,54 @@ +#include "timer.h" + +namespace fstimer { + void TimerSection::dump_json(std::stringstream &stream) { + stream << "{\"name\": \"" << name << "\", \"duration\": " << get_micros() + << ", \"children\": "; + stream << "["; + bool is_first = true; + for (const auto & child : children) { + if (is_first) { + is_first = false; + } else { + stream << ","; + } + child->dump_json(stream); + } + stream << "]}"; + } + + void Timer::begin(const std::string &name) { + std::unique_ptr section { new TimerSection(name) }; + section->has_started(); + stack.push_back(std::move(section)); + } + + void Timer::end() { + if (stack.empty()) return; + std::unique_ptr section { std::move(stack.back()) }; + stack.pop_back(); + section->has_finished(); + if (stack.empty()) { + last = std::move(section); + } else { + stack.back()->add_child(std::move(section)); + } + } + + std::string Timer::get_report() { + if (last == nullptr) return std::string(""); + std::stringstream stream; + last->dump_json(stream); + return stream.str(); + } + + thread_local Timer thread_local_timer; + Timer& local_timer() { return thread_local_timer; }; + void begin(const std::string& name) { thread_local_timer.begin(name); }; + void end() { thread_local_timer.end(); }; + std::string get_report() { return thread_local_timer.get_report(); }; + + Scope::Scope(Timer &timer, const std::string &name) : timer(timer) { timer.begin(name); }; + Scope::Scope(const std::string &name) : Scope(thread_local_timer, name) {}; + Scope::~Scope() { timer.end(); }; +}; diff --git a/timer.h b/timer.h new file mode 100644 index 0000000..e846e8b --- /dev/null +++ b/timer.h @@ -0,0 +1,78 @@ +#ifndef FAST_SLIC_TIMER_H +#define FAST_SLIC_TIMER_H +#include +#include +#include +#include +#include +#include + +namespace fstimer { + typedef std::chrono::high_resolution_clock Clock; + class TimerSection { + private: + std::string name; + std::chrono::time_point start_time; + std::chrono::time_point end_time; + std::vector> children; + public: + TimerSection(const std::string& name) : name(name) {}; + void has_started() { start_time = Clock::now(); }; + void has_finished() { + end_time = Clock::now(); + }; + + void add_child(std::unique_ptr child) { + children.push_back(std::move(child)); + } + + template + long long get_duration() const { + return std::chrono::duration_cast(end_time - start_time).count(); + } + + long long get_nanos() const { + return get_duration(); + }; + + long long get_micros() const { + return get_duration(); + }; + + long long get_millis() const { + return get_duration(); + }; + + std::vector>& get_children() { + return children; + } + + void dump_json(std::stringstream &stream); + }; + + class Timer { + private: + std::vector> stack; + std::unique_ptr last; + public: + Timer() {}; + void begin(const std::string& name); + void end(); + std::string get_report(); + }; + void begin(const std::string& name); + void end(); + std::string get_report(); + Timer& local_timer(); + + class Scope { + private: + Timer &timer; + public: + Scope(Timer &timer, const std::string &name); + Scope(const std::string &name); + ~Scope(); + }; +}; + +#endif