Store the working Lab image as 16-bit fixed point instead of 8-bit.

The quad image buffer now holds uint16_t channels. rgb_to_cielab() reads the
packed RGB input directly, keeps `output_shift` (3) extra low-order bits in each
Lab channel, and reports that shift through its `shift_out` argument. The
context stores it in `color_shift` (0 when no Lab conversion is requested) and
multiplies the spatial-distance coefficient by (1 << color_shift), which is why
set_spatial_patch() is now called after the conversion rather than from
initialize_state(). The NEON and AVX2 assignment kernels and ContextLSC are
updated for the wider element type, and the per-class rgb_to_lab() hooks are
removed.

Review fixes folded into this revision of the patch:
  * neon.h: use vabdq_u16 (not vabdq_u8) for the uint16x8_t absolute differences.
  * cielab.h: close the outer row loop before assigning shift_out, so the
    assignment happens once after the parallel loop and the function is closed.
  * context.cpp: use `omp parallel for` on the RGB copy loop; a bare
    `omp parallel` makes every thread execute the whole loop.
  * lsc.cpp: free float_memory_pool (not uint16_memory_pool) before
    reallocating the float pool.

NOTE(review): this text was recovered from a whitespace-collapsed copy of the
patch. Indentation is reconstructed, blank context lines are missing, and text
that was inside angle brackets (template arguments, #include <...> names) was
already absent; hunk line counts are therefore not guaranteed to match.
Regenerate the patch from the working tree before applying it.

diff --git a/src/arch/arm/neon.h b/src/arch/arm/neon.h
index 6936a3a..2fa9e21 100644
--- a/src/arch/arm/neon.h
+++ b/src/arch/arm/neon.h
@@ -6,22 +6,23 @@
 inline void get_assignment_value_vec(
     const Cluster* cluster,
-    const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
+    const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
     const uint16_t* min_dist_row, const uint16_t* assignment_row,
-    uint16x8_t cluster_number_vec, uint8x16_t cluster_color_vec,
+    uint16x8_t cluster_number_vec, uint16x8_t cluster_color_vec,
     uint16x8_t& new_min_dist, uint16x8_t& new_assignment
 ) {
     uint16x8_t spatial_dist_vec = vld1q_u16(spatial_dist_patch_row);
-    uint8x16_t image_segment = vld1q_u8(img_quad_row);
-    uint8x16_t image_segment_2 = vld1q_u8(img_quad_row + 16);
+    uint16x8_t image_segment_1 = vld1q_u16(img_quad_row);
+    uint16x8_t image_segment_2 = vld1q_u16(img_quad_row + 8);
+    uint16x8_t image_segment_3 = vld1q_u16(img_quad_row + 16);
+    uint16x8_t image_segment_4 = vld1q_u16(img_quad_row + 24);
-    uint8x16_t abs_segment = vabdq_u8(image_segment, cluster_color_vec);
-    uint8x16_t abs_segment_2 = vabdq_u8(image_segment_2, cluster_color_vec);
+    uint16x8_t abs_segment_1 = vabdq_u16(image_segment_1, cluster_color_vec);
+    uint16x8_t abs_segment_2 = vabdq_u16(image_segment_2, cluster_color_vec);
+    uint16x8_t abs_segment_3 = vabdq_u16(image_segment_3, cluster_color_vec);
+    uint16x8_t abs_segment_4 = vabdq_u16(image_segment_4, cluster_color_vec);
-    uint32x4_t sad_segment = vpaddlq_u16(vpaddlq_u8(abs_segment));
-    uint32x4_t sad_segment_2 = vpaddlq_u16(vpaddlq_u8(abs_segment_2));
-
-    uint16x8_t color_dist_vec = vcombine_u16(vmovn_u32(sad_segment), vmovn_u32(sad_segment_2));
+    uint16x8_t color_dist_vec = vpaddq_u16(vpaddq_u16(abs_segment_1, abs_segment_2), vpaddq_u16(abs_segment_3, abs_segment_4));
     uint16x8_t dist_vec = vaddq_u16(color_dist_vec, spatial_dist_vec);
     uint16x8_t old_assignment = vld1q_u16(assignment_row);
@@ -60,28 +61,20 @@ namespace fslic {
                 cluster_number
             };
-            uint8x16_t cluster_color_vec = {
-                (uint8_t)cluster->r,
-                (uint8_t)cluster->g,
-                (uint8_t)cluster->b,
-                0,
-                (uint8_t)cluster->r,
-                (uint8_t)cluster->g,
-                (uint8_t)cluster->b,
-                0,
-                (uint8_t)cluster->r,
-                (uint8_t)cluster->g,
-                (uint8_t)cluster->b,
+            uint16x8_t cluster_color_vec = {
+                (uint16_t)cluster->r,
+                (uint16_t)cluster->g,
+                (uint16_t)cluster->b,
                 0,
-                (uint8_t)cluster->r,
-                (uint8_t)cluster->g,
-                (uint8_t)cluster->b,
+                (uint16_t)cluster->r,
+                (uint16_t)cluster->g,
+                (uint16_t)cluster->b,
                 0
             };
             int16_t patch_height = spatial_dist_patch.get_height();
             for (int16_t i = fit_to_stride(y_lo) - y_lo; i < patch_height; i += subsample_stride) {
                 const uint16_t* spatial_dist_patch_base_row = spatial_dist_patch.get_row(i);
-                const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
+                const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
                 uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
                 uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);
@@ -89,7 +82,7 @@ namespace fslic {
                     uint16x8_t new_min_dist, new_assignment; \
                     uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
                     uint16_t* assignment_row = assignment_base_row + j; /* unaligned */ \
-                    const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
+                    const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
                     const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
                     get_assignment_value_vec( \
                         cluster, \
diff --git a/src/arch/x64/avx2.h b/src/arch/x64/avx2.h
index 74fcbaf..444600b 100644
--- a/src/arch/x64/avx2.h
+++ b/src/arch/x64/avx2.h
@@ -10,7 +10,7 @@ inline __m256 _mm256_set_ps1(float v) {
 inline void get_assignment_value_vec(
     const Cluster* cluster,
-    const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
+    const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
     const uint16_t* min_dist_row, const uint16_t* assignment_row,
     __m128i cluster_number_vec, __m256i cluster_color_vec,
     __m128i order_swap_mask,
@@ -23,8 +23,8 @@ inline void get_assignment_value_vec(
     __m128i spatial_dist_vec = _mm_load_si128((__m128i *)spatial_dist_patch_row);
-    __m256i image_segment = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)img_quad_row));
-    __m256i image_segment_2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(img_quad_row + 16)));
+    __m256i image_segment = _mm256_loadu_si256((__m256i*)img_quad_row);
+    __m256i image_segment_2 = _mm256_loadu_si256((__m256i*)(img_quad_row + 16));
     // [R1, G1, B1, A1, R2, G2, B2, A2, R3, G3, B3, A3, R3, G3, B3, A3]
     __m256i abd_segment = _mm256_abs_epi16(_mm256_subs_epi16(image_segment, cluster_color_vec));
@@ -114,7 +114,7 @@ namespace fslic {
                 assert((long long)spatial_dist_patch_base_row % 32 == 0);
 #endif
                 // not aligned
-                const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
+                const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
                 uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
                 uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);
@@ -122,7 +122,7 @@ namespace fslic {
                     __m128i new_assignment__narrow, new_min_dist__narrow; \
                     uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
                     uint16_t* assignment_row = assignment_base_row + j; /* unaligned */ \
-                    const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
+                    const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
                     const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
                     get_assignment_value_vec( \
                         cluster, \
diff --git a/src/cielab.h b/src/cielab.h
index 2794507..419999e 100644
--- a/src/cielab.h
+++ b/src/cielab.h
@@ -5,6 +5,7 @@
 #include
 #include
 #include "parallel.h"
+#include "simd-helper.hpp"
 /*
 def get_xyz_nonlin_tbl(a):
     v = a / 255.
@@ -278,6 +279,7 @@ static float _srgb_gamma_tbl[256] = {
 #define srgb_shift 13
 #define srgb_max (1 << srgb_shift)
 #define lab_shift 16
+#define output_shift 3
 class FastCIELabCvt {
 public:
@@ -301,8 +303,7 @@ class FastCIELabCvt {
     }
-    template
-    inline void convert(uint8_t R, uint8_t G, uint8_t B, uint8_t& l, uint8_t& a, uint8_t& b) {
+    inline void convert(uint8_t R, uint8_t G, uint8_t B, uint16_t& l, uint16_t& a, uint16_t& b) {
         int sr = srgb_gamma_tbl[R], sg = srgb_gamma_tbl[G], sb = srgb_gamma_tbl[B];
         int xr = (Cb[0] * sr + Cb[1] * sg + Cb[2] * sb) >> lab_shift;
@@ -315,13 +316,9 @@ class FastCIELabCvt {
         int ciea = 500 * (fx - fy) + (128 << srgb_shift); // to positive integer
         int cieb = 200 * (fy - fz) + (128 << srgb_shift); // to positive integer
-        if (scale_lab) {
-            ciel = ciel * 255 / 100;
-        }
-
-        l = (uint8_t)((unsigned)ciel >> srgb_shift);
-        a = (uint8_t)((unsigned)ciea >> srgb_shift);
-        b = (uint8_t)((unsigned)cieb >> srgb_shift);
+        l = (uint16_t)((unsigned)ciel >> (srgb_shift - output_shift));
+        a = (uint16_t)((unsigned)ciea >> (srgb_shift - output_shift));
+        b = (uint16_t)((unsigned)cieb >> (srgb_shift - output_shift));
     }
 private:
@@ -334,33 +331,23 @@ class FastCIELabCvt {
 static FastCIELabCvt fast_cielab_cvt;
-static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool scale_L = false) {
-    if (scale_L) {
-        #pragma omp parallel for num_threads(fsparallel::nth())
-        for (int s = 0; s < size; s += 4) {
-            fast_cielab_cvt.convert(
-                aligned_quad_image[s],
-                aligned_quad_image[s+1],
-                aligned_quad_image[s+2],
-                out[s],
-                out[s+1],
-                out[s+2]
-            );
-        }
-    } else {
-        #pragma omp parallel for num_threads(fsparallel::nth())
-        for (int s = 0; s < size; s += 4) {
-            fast_cielab_cvt.convert(
-                aligned_quad_image[s],
-                aligned_quad_image[s+1],
-                aligned_quad_image[s+2],
-                out[s],
-                out[s+1],
-                out[s+2]
+static void rgb_to_cielab(const uint8_t* image, int H, int W, simd_helper::AlignedArray &arr, int &shift_out) {
+    #pragma omp parallel for num_threads(fsparallel::nth())
+    for (int i = 0; i < H; i++) {
+        for (int j = 0; j < W; j++) {
+            int index = W * i + j;
+            fast_cielab_cvt.convert(
+                image[3 * index],
+                image[3 * index + 1],
+                image[3 * index + 2],
+                arr.get(i, 4 * j),
+                arr.get(i, 4 * j + 1),
+                arr.get(i, 4 * j + 2)
             );
         }
-    }
+    }
+    shift_out = output_shift;
 }
 static void rgb_to_cielab_orig(const uint8_t* image, float *out, int size) {
diff --git a/src/context.cpp b/src/context.cpp
index 715ac1f..658d333 100644
--- a/src/context.cpp
+++ b/src/context.cpp
@@ -22,6 +22,7 @@ namespace fslic {
     template
     void BaseContext::set_spatial_patch() {
         float coef = 1.0f / ((float)S / compactness);
+        coef *= (1 << color_shift);
         int16_t S_2 = 2 * S;
         if (manhattan_spatial_dist) {
             for (int16_t i = 0; i <= S_2; i++) {
@@ -95,7 +96,6 @@ namespace fslic {
     template
     void BaseContext::initialize_state() {
-        set_spatial_patch();
     }
     template
@@ -109,10 +109,11 @@ namespace fslic {
         fsparallel::Scope parallel_scope(num_threads);
         fstimer::Scope s("iterate");
         {
-            fstimer::Scope s("write_to_buffer");
-            #pragma omp parallel num_threads(fsparallel::nth())
-            {
-                #pragma omp for
+            fstimer::Scope s("cielab_conversion");
+            if (convert_to_lab) {
+                rgb_to_cielab(image, H, W, quad_image, color_shift);
+            } else {
+                #pragma omp parallel for num_threads(fsparallel::nth())
                 for (int i = 0; i < H; i++) {
                     for (int j = 0; j < W; j++) {
                         for (int k = 0; k < 3; k++) {
@@ -120,21 +121,18 @@ namespace fslic {
                         }
                     }
                 }
-
-                #pragma omp for
-                for (int i = 0; i < H; i++) {
-                    for (int j = 0; j < W; j++) {
-                        this->assignment.get(i, j) = 0xFFFF;
-                    }
-                }
+                color_shift = 0;
             }
         }
-
         {
-            fstimer::Scope s("cielab_conversion");
-            if (convert_to_lab) {
-                rgb_to_lab(&quad_image.get(0, 0), quad_image.contiguous_memory_size());
+            fstimer::Scope s("write_to_buffer");
+            #pragma omp parallel for num_threads(fsparallel::nth())
+            for (int i = 0; i < H; i++) {
+                for (int j = 0; j < W; j++) {
+                    this->assignment.get(i, j) = 0xFFFF;
+                }
             }
+            set_spatial_patch();
         }
         subsample_rem = 0;
@@ -261,7 +259,7 @@ namespace fslic {
             for (int i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
                 if (!valid_subsample_row(i)) continue;
-                const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
+                const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
                 uint16_t* __restrict assignment_row = assignment.get_row(i, cluster_x - S);
                 DistType* __restrict min_dist_row = min_dists.get_row(i, cluster_x - S);
                 const DistType* __restrict patch_row = spatial_dist_patch.get_row(i_off);
@@ -289,11 +287,6 @@ namespace fslic {
         delete [] dist_row;
     }
-    template
-    void BaseContext::rgb_to_lab(uint8_t *quad_image, int size) {
-        rgb_to_cielab(quad_image, quad_image, size, false);
-    }
-
     template
     void BaseContext::update() {
@@ -401,7 +394,7 @@ namespace fslic {
             for (int16_t i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
                 if (!valid_subsample_row(i)) continue;
-                const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
+                const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
                 uint16_t* __restrict assignment_row = assignment.get_row(i, cluster_x - S);
                 float* __restrict min_dist_row = min_dists.get_row(i, cluster_x - S);
                 const float* __restrict patch_row = spatial_dist_patch.get_row(i_off);
@@ -431,6 +424,7 @@ namespace fslic {
     void ContextRealDistL2::set_spatial_patch() {
         float coef = 1.0f / ((float)S / compactness);
+        coef *= (1 << color_shift);
         int16_t S_2 = 2 * S;
         for (int16_t i = 0; i <= S_2; i++) {
             for (int16_t j = 0; j <= S_2; j++) {
@@ -474,6 +468,7 @@ namespace fslic {
     template
     void ContextRealDistNoQ::assign_clusters_proto(const Cluster** target_clusters, int size) {
         float coef = 1.0f / ((float)S / compactness);
+        coef *= (1 << color_shift);
         for (int cidx = 0; cidx < size; cidx++) {
             const Cluster* cluster = target_clusters[cidx];
diff --git a/src/context.h b/src/context.h
index b78d09d..e5a39f6 100644
--- a/src/context.h
+++ b/src/context.h
@@ -40,8 +40,11 @@ namespace fslic {
     protected:
         int16_t subsample_rem;
         int16_t subsample_stride;
+
+    protected:
+        int color_shift;
     protected:
-        simd_helper::AlignedArray quad_image;
+        simd_helper::AlignedArray quad_image;
         simd_helper::AlignedArray assignment;
         simd_helper::AlignedArray min_dists;
         simd_helper::AlignedArray spatial_dist_patch;
@@ -86,7 +89,6 @@ namespace fslic {
         virtual void after_update() {};
         virtual void set_spatial_patch();
         virtual void assign_clusters(const Cluster **target_clusters, int size);
-        virtual void rgb_to_lab(uint8_t* quad_image, int size);
         virtual bool centroid_quantization_enabled();
     };
diff --git a/src/lsc.cpp b/src/lsc.cpp
index 6b8ea5a..e8b360f 100644
--- a/src/lsc.cpp
+++ b/src/lsc.cpp
@@ -15,7 +15,7 @@ namespace fslic {
     }
     ContextLSC::~ContextLSC() {
-        if (uint8_memory_pool) delete [] uint8_memory_pool;
+        if (uint16_memory_pool) delete [] uint16_memory_pool;
         if (float_memory_pool) delete [] float_memory_pool;
     }
@@ -34,14 +34,14 @@ namespace fslic {
         {
             fstimer::Scope s("image_alloc");
-            if (uint8_memory_pool) delete [] uint8_memory_pool;
-            uint8_memory_pool = new uint8_t[3 * aligned_len];
-            if (float_memory_pool) delete [] uint8_memory_pool;
+            if (uint16_memory_pool) delete [] uint16_memory_pool;
+            uint16_memory_pool = new uint16_t[3 * aligned_len];
+            if (float_memory_pool) delete [] float_memory_pool;
             float_memory_pool = new float[11 * aligned_len + 10 * aligned_K];
-            image_planes[0] = &uint8_memory_pool[0];
-            image_planes[1] = &uint8_memory_pool[aligned_len];
-            image_planes[2] = &uint8_memory_pool[2 * aligned_len];
+            image_planes[0] = &uint16_memory_pool[0];
+            image_planes[1] = &uint16_memory_pool[aligned_len];
+            image_planes[2] = &uint16_memory_pool[2 * aligned_len];
             for (int i = 0; i < 10; i++) {
                 image_features[i] = &float_memory_pool[i * aligned_len];
                 centroid_features[i] = &float_memory_pool[11 * aligned_len + i * aligned_K];
@@ -53,7 +53,7 @@ namespace fslic {
             fstimer::Scope s("image_copy");
             #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < H; i++) {
-                const uint8_t* image_row = quad_image.get_row(i);
+                const uint16_t* image_row = quad_image.get_row(i);
                 for (int j = 0; j < W; j++) {
                     int index = i * W + j;
                     image_planes[0][index] = image_row[4 * j];
@@ -67,27 +67,10 @@ namespace fslic {
             fstimer::Scope s("feature_map");
             // l1, l2, a1, a2, b1, b2
-            float color_sine_map[256];
-            float color_cosine_map[256];
-            float L_sine_map[256];
-            float L_cosine_map[256];
             std::vector width_cosine_map(W);
             std::vector width_sine_map(W);
             std::vector height_cosine_map(H);
             std::vector height_sine_map(H);
-            for (int X = 0; X < 256; X++) {
-                float theta = halfPI * (X / 255.0f);
-                float cosine = cos(theta), sine = sin(theta);
-                color_cosine_map[X] = C_color * cosine * 2.55f;
-                color_sine_map[X] = C_color * sine * 2.55f;
-            }
-
-            for (int X = 0; X < 256; X++) {
-                float theta = halfPI * (X / 255.0f);
-                L_cosine_map[X] = C_color * cos(theta);
-                L_sine_map[X] = C_color * sin(theta);
-            }
-
             for (int i = 0; i < H; i++) {
                 float theta = i * (halfPI / S);
                 height_cosine_map[i] = C_spatial * cos(theta);
@@ -100,17 +83,33 @@ namespace fslic {
                 width_sine_map[i] = C_spatial * sin(theta);
             }
-            const uint8_t* __restrict L = &image_planes[0][0];
-            const uint8_t* __restrict A = &image_planes[1][0];
-            const uint8_t* __restrict B = &image_planes[2][0];
+            const uint16_t* __restrict L = &image_planes[0][0];
+            const uint16_t* __restrict A = &image_planes[1][0];
+            const uint16_t* __restrict B = &image_planes[2][0];
             #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < len; i++) {
-                image_features[0][i] = L_cosine_map[L[i]];
-                image_features[1][i] = L_sine_map[L[i]];
-                image_features[2][i] = color_cosine_map[A[i]];
-                image_features[3][i] = color_sine_map[A[i]];
-                image_features[4][i] = color_cosine_map[B[i]];
-                image_features[5][i] = color_sine_map[B[i]];
+                {
+                    float X = L[i] / (float)(1 << color_shift);
+                    float theta = halfPI * (X / 100.0f);
+                    float cosine = cos(theta), sine = sin(theta);
+                    image_features[0][i] = C_color * cosine;
+                    image_features[1][i] = C_color * sine;
+                }
+
+                {
+                    float X = A[i] / (float)(1 << color_shift);
+                    float theta = halfPI * (X / 255.0f);
+                    float cosine = cos(theta), sine = sin(theta);
+                    image_features[2][i] = C_color * cosine;
+                    image_features[3][i] = C_color * sine;
+                }
+                {
+                    float X = B[i] / (float)(1 << color_shift);
+                    float theta = halfPI * (X / 255.0f);
+                    float cosine = cos(theta), sine = sin(theta);
+                    image_features[4][i] = C_color * cosine;
+                    image_features[5][i] = C_color * sine;
+                }
             }
             // x1, x2, y1, y2
@@ -306,10 +305,6 @@ namespace fslic {
         delete [] wsums;
     }
-    void ContextLSC::rgb_to_lab(uint8_t *quad_image, int size) {
-        rgb_to_cielab(quad_image, quad_image, size, true);
-    }
-
     void ContextLSC::normalize_features(float *__restrict numers[10], float* __restrict weights, int size) {
         #pragma omp parallel for num_threads(fsparallel::nth())
diff --git a/src/lsc.h b/src/lsc.h
index 29572ad..bbe8135 100644
--- a/src/lsc.h
+++ b/src/lsc.h
@@ -7,8 +7,8 @@ namespace fslic {
     protected:
         float C_color = 20;
         float* float_memory_pool = nullptr;
-        uint8_t* uint8_memory_pool = nullptr;
-        uint8_t* __restrict image_planes[3]; // L, a, b plane (H x W)
+        uint16_t* uint16_memory_pool = nullptr;
+        uint16_t* __restrict image_planes[3]; // L, a, b plane (H x W)
         float* __restrict image_features[10]; // l1, l2, a1, a2, b1, b2, x1, x2, y1, y2
         float* __restrict image_weights;
         float* __restrict centroid_features[10]; // l1, l2, a1, a2, b1, b2, x1, x2, y1, y2
@@ -20,7 +20,6 @@ namespace fslic {
         virtual void after_update();
         virtual void assign_clusters(const Cluster **target_clusters, int size);
         virtual void normalize_features(float *__restrict numers[10], float* __restrict weights, int size);
-        virtual void rgb_to_lab(uint8_t* quad_image, int size);
     private:
         void map_image_into_feature_space();
         void map_centroids_into_feature_space();