Color shift

Algy · Aug 31, 2019 · 26c4f4e · 26c4f4e
1 parent e1a877c
commit 26c4f4e
Show file tree

Hide file tree

Showing 7 changed files with 101 additions and 131 deletions.
diff --git a/src/arch/arm/neon.h b/src/arch/arm/neon.h
@@ -6,22 +6,23 @@
 
 inline void get_assignment_value_vec(
         const Cluster* cluster,
-        const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
+        const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
         const uint16_t* min_dist_row, const uint16_t* assignment_row,
-        uint16x8_t cluster_number_vec, uint8x16_t cluster_color_vec,
+        uint16x8_t cluster_number_vec, uint16x8_t cluster_color_vec,
         uint16x8_t& new_min_dist, uint16x8_t& new_assignment
         ) {
     uint16x8_t spatial_dist_vec = vld1q_u16(spatial_dist_patch_row);
-    uint8x16_t image_segment = vld1q_u8(img_quad_row);
-    uint8x16_t image_segment_2 = vld1q_u8(img_quad_row + 16);
+    uint16x8_t image_segment_1 = vld1q_u16(img_quad_row); // 4 loads x 8 u16 lanes = 8 pixels x 4 channels (2 pixels per vector)
+    uint16x8_t image_segment_2 = vld1q_u16(img_quad_row + 8);
+    uint16x8_t image_segment_3 = vld1q_u16(img_quad_row + 16);
+    uint16x8_t image_segment_4 = vld1q_u16(img_quad_row + 24);
 
-    uint8x16_t abs_segment = vabdq_u8(image_segment, cluster_color_vec);
-    uint8x16_t abs_segment_2 = vabdq_u8(image_segment_2, cluster_color_vec);
+    uint16x8_t abs_segment_1 = vabdq_u16(image_segment_1, cluster_color_vec); // u16 lanes need vabdq_u16; the u8 form takes uint8x16_t operands and would diff byte-wise
+    uint16x8_t abs_segment_2 = vabdq_u16(image_segment_2, cluster_color_vec);
+    uint16x8_t abs_segment_3 = vabdq_u16(image_segment_3, cluster_color_vec);
+    uint16x8_t abs_segment_4 = vabdq_u16(image_segment_4, cluster_color_vec);
 
-    uint32x4_t sad_segment = vpaddlq_u16(vpaddlq_u8(abs_segment));
-    uint32x4_t sad_segment_2 = vpaddlq_u16(vpaddlq_u8(abs_segment_2));
-
-    uint16x8_t color_dist_vec = vcombine_u16(vmovn_u32(sad_segment), vmovn_u32(sad_segment_2));
+    uint16x8_t color_dist_vec = vpaddq_u16(vpaddq_u16(abs_segment_1, abs_segment_2), vpaddq_u16(abs_segment_3, abs_segment_4)); // two pairwise-add levels fold each pixel's 4 channel diffs into one lane
 
     uint16x8_t dist_vec = vaddq_u16(color_dist_vec, spatial_dist_vec);
     uint16x8_t old_assignment = vld1q_u16(assignment_row);
@@ -60,36 +61,28 @@ namespace fslic {
     				cluster_number
     			};
 
-    			uint8x16_t cluster_color_vec = {
-    				(uint8_t)cluster->r,
-    				(uint8_t)cluster->g,
-    				(uint8_t)cluster->b,
-    				0,
-    				(uint8_t)cluster->r,
-    				(uint8_t)cluster->g,
-    				(uint8_t)cluster->b,
-    				0,
-    				(uint8_t)cluster->r,
-    				(uint8_t)cluster->g,
-    				(uint8_t)cluster->b,
+    			uint16x8_t cluster_color_vec = {
+    				(uint16_t)cluster->r,
+    				(uint16_t)cluster->g,
+    				(uint16_t)cluster->b,
     				0,
-    				(uint8_t)cluster->r,
-    				(uint8_t)cluster->g,
-    				(uint8_t)cluster->b,
+    				(uint16_t)cluster->r,
+    				(uint16_t)cluster->g,
+    				(uint16_t)cluster->b,
     				0
     			};
                 int16_t patch_height = spatial_dist_patch.get_height();
     			for (int16_t i = fit_to_stride(y_lo) - y_lo; i < patch_height; i += subsample_stride) {
     				const uint16_t* spatial_dist_patch_base_row = spatial_dist_patch.get_row(i);
-                    const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
+                    const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
                     uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
                     uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);
 
     	#define ASSIGNMENT_VALUE_GETTER_BODY \
     		uint16x8_t new_min_dist, new_assignment; \
     		uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
     		uint16_t* assignment_row = assignment_base_row + j;  /* unaligned */ \
-    		const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
+    		const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
     		const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
     		get_assignment_value_vec( \
     			cluster, \

diff --git a/src/arch/x64/avx2.h b/src/arch/x64/avx2.h
@@ -10,7 +10,7 @@ inline __m256 _mm256_set_ps1(float v) {
 
 inline void get_assignment_value_vec(
         const Cluster* cluster,
-        const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
+        const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
         const uint16_t* min_dist_row, const uint16_t* assignment_row,
         __m128i cluster_number_vec, __m256i cluster_color_vec,
         __m128i order_swap_mask,
@@ -23,8 +23,8 @@ inline void get_assignment_value_vec(
 
     __m128i spatial_dist_vec = _mm_load_si128((__m128i *)spatial_dist_patch_row);
 
-    __m256i image_segment = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)img_quad_row));
-    __m256i image_segment_2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(img_quad_row + 16)));
+    __m256i image_segment = _mm256_loadu_si256((__m256i*)img_quad_row);
+    __m256i image_segment_2 = _mm256_loadu_si256((__m256i*)(img_quad_row + 16));
 
     // [R1, G1, B1, A1, R2, G2, B2, A2, R3, G3, B3, A3, R3, G3, B3, A3]
     __m256i abd_segment = _mm256_abs_epi16(_mm256_subs_epi16(image_segment, cluster_color_vec));
@@ -114,15 +114,15 @@ namespace fslic {
                     assert((long long)spatial_dist_patch_base_row % 32 == 0);
         #endif
                     // not aligned
-                    const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
+                    const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
                     uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
                     uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);
 
         #define ASSIGNMENT_VALUE_GETTER_BODY \
             __m128i new_assignment__narrow, new_min_dist__narrow; \
             uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
             uint16_t* assignment_row = assignment_base_row + j;  /* unaligned */ \
-            const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
+            const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
             const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
             get_assignment_value_vec( \
                 cluster, \

diff --git a/src/cielab.h b/src/cielab.h
@@ -5,6 +5,7 @@
 #include <cstdint>
 #include <vector>
 #include "parallel.h"
+#include "simd-helper.hpp"
 /*
 def get_xyz_nonlin_tbl(a):
     v = a / 255.
@@ -278,6 +279,7 @@ static float  _srgb_gamma_tbl[256] = {
 #define srgb_shift 13
 #define srgb_max (1 << srgb_shift)
 #define lab_shift 16
+#define output_shift 3
 
 class FastCIELabCvt {
 public:
@@ -301,8 +303,7 @@ class FastCIELabCvt {
     }
 
 
-    template <bool scale_lab = false>
-    inline void convert(uint8_t R, uint8_t G, uint8_t B, uint8_t& l, uint8_t& a, uint8_t& b) {
+    inline void convert(uint8_t R, uint8_t G, uint8_t B, uint16_t& l, uint16_t& a, uint16_t& b) {
         int sr = srgb_gamma_tbl[R], sg = srgb_gamma_tbl[G], sb = srgb_gamma_tbl[B];
 
         int xr = (Cb[0] * sr + Cb[1] * sg + Cb[2] * sb) >> lab_shift;
@@ -315,13 +316,9 @@ class FastCIELabCvt {
         int ciea = 500 * (fx - fy) + (128 << srgb_shift); // to positive integer
         int cieb = 200 * (fy - fz) + (128 << srgb_shift); // to positive integer
 
-        if (scale_lab) {
-            ciel = ciel * 255 / 100;
-        }
-
-        l = (uint8_t)((unsigned)ciel >> srgb_shift);
-        a = (uint8_t)((unsigned)ciea >> srgb_shift);
-        b = (uint8_t)((unsigned)cieb >> srgb_shift);
+        l = (uint16_t)((unsigned)ciel >> (srgb_shift - output_shift));
+        a = (uint16_t)((unsigned)ciea >> (srgb_shift - output_shift));
+        b = (uint16_t)((unsigned)cieb >> (srgb_shift - output_shift));
     }
 
 private:
@@ -334,33 +331,22 @@ class FastCIELabCvt {
 
 static FastCIELabCvt fast_cielab_cvt;
 
-static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool scale_L = false) {
-    if (scale_L) {
-        #pragma omp parallel for num_threads(fsparallel::nth())
-        for (int s = 0; s < size; s += 4) {
-            fast_cielab_cvt.convert<true>(
-                aligned_quad_image[s],
-                aligned_quad_image[s+1],
-                aligned_quad_image[s+2],
-                out[s],
-                out[s+1],
-                out[s+2]
-            );
-        }
-    } else {
-        #pragma omp parallel for num_threads(fsparallel::nth())
-        for (int s = 0; s < size; s += 4) {
-            fast_cielab_cvt.convert<false>(
-                aligned_quad_image[s],
-                aligned_quad_image[s+1],
-                aligned_quad_image[s+2],
-                out[s],
-                out[s+1],
-                out[s+2]
+static void rgb_to_cielab(const uint8_t* image, int H, int W, simd_helper::AlignedArray<uint16_t> &arr, int &shift_out) { // converts H x W packed 3-byte pixels into arr (4 u16 slots per pixel); shift_out receives output_shift
+    #pragma omp parallel for num_threads(fsparallel::nth())
+    for (int i = 0; i < H; i++) { // rows are the unit of work split across threads
+        for (int j = 0; j < W; j++) {
+            int index = W * i + j; // pixel index into the source image, which is read with a stride of 3 bytes
+            fast_cielab_cvt.convert(
+                image[3 * index],
+                image[3 * index + 1],
+                image[3 * index + 2],
+                arr.get(i, 4 * j), // destination uses a stride of 4 per pixel; slot 4 * j + 3 is not written here
+                arr.get(i, 4 * j + 1),
+                arr.get(i, 4 * j + 2)
             );
         }
-
     }
+    shift_out = output_shift; // convert() keeps output_shift extra low bits, so callers must scale other distance terms by 1 << shift_out
 }
 
 static void rgb_to_cielab_orig(const uint8_t* image, float *out, int size) {

diff --git a/src/context.cpp b/src/context.cpp
@@ -22,6 +22,7 @@ namespace fslic {
     template<typename DistType>
     void BaseContext<DistType>::set_spatial_patch() {
         float coef = 1.0f / ((float)S / compactness);
+        coef *= (1 << color_shift);
         int16_t S_2 = 2 * S;
         if (manhattan_spatial_dist) {
             for (int16_t i = 0; i <= S_2; i++) {
@@ -95,7 +96,6 @@ namespace fslic {
 
     template<typename DistType>
     void BaseContext<DistType>::initialize_state() {
-        set_spatial_patch();
     }
 
     template<typename DistType>
@@ -109,32 +109,30 @@ namespace fslic {
             fsparallel::Scope parallel_scope(num_threads);
             fstimer::Scope s("iterate");
             {
-                fstimer::Scope s("write_to_buffer");
-                #pragma omp parallel num_threads(fsparallel::nth())
-                {
-                    #pragma omp for
+                fstimer::Scope s("cielab_conversion");
+                if (convert_to_lab) {
+                    rgb_to_cielab(image, H, W, quad_image, color_shift); // fills quad_image and sets color_shift to the converter's output shift
+                } else { // raw copy path: needs a worksharing `parallel for`, a bare `parallel` makes every thread run the whole loop
+                    #pragma omp parallel for num_threads(fsparallel::nth())
                     for (int i = 0; i < H; i++) {
                         for (int j = 0; j < W; j++) {
                             for (int k = 0; k < 3; k++) {
                                 quad_image.get(i, 4 * j + k) = image[i * W * 3 + 3 * j + k];
                             }
                         }
                     }
-
-                    #pragma omp for
-                    for (int i = 0; i < H; i++) {
-                        for (int j = 0; j < W; j++) {
-                            this->assignment.get(i, j) = 0xFFFF;
-                        }
-                    }
+                    color_shift = 0; // copied 8-bit values are unscaled, so the spatial coefficient gets no extra factor
                 }
             }
-
             {
-                fstimer::Scope s("cielab_conversion");
-                if (convert_to_lab) {
-                    rgb_to_lab(&quad_image.get(0, 0), quad_image.contiguous_memory_size());
+                fstimer::Scope s("write_to_buffer");
+                #pragma omp parallel for num_threads(fsparallel::nth())
+                for (int i = 0; i < H; i++) {
+                    for (int j = 0; j < W; j++) {
+                        this->assignment.get(i, j) = 0xFFFF;
+                    }
                 }
+                set_spatial_patch();
             }
 
             subsample_rem = 0;
@@ -261,7 +259,7 @@ namespace fslic {
 
             for (int i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
                 if (!valid_subsample_row(i)) continue;
-                const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
+                const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
                 uint16_t* __restrict  assignment_row = assignment.get_row(i, cluster_x - S);
                 DistType* __restrict min_dist_row = min_dists.get_row(i, cluster_x - S);
                 const DistType* __restrict patch_row = spatial_dist_patch.get_row(i_off);
@@ -289,11 +287,6 @@ namespace fslic {
         delete [] dist_row;
     }
 
-    template<typename DistType>
-    void BaseContext<DistType>::rgb_to_lab(uint8_t *quad_image, int size) {
-        rgb_to_cielab(quad_image, quad_image, size, false);
-    }
-
 
     template<typename DistType>
     void BaseContext<DistType>::update() {
@@ -401,7 +394,7 @@ namespace fslic {
 
             for (int16_t i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
                 if (!valid_subsample_row(i)) continue;
-                const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
+                const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
                 uint16_t* __restrict assignment_row = assignment.get_row(i, cluster_x - S);
                 float* __restrict  min_dist_row = min_dists.get_row(i, cluster_x - S);
                 const float* __restrict patch_row = spatial_dist_patch.get_row(i_off);
@@ -431,6 +424,7 @@ namespace fslic {
 
     void ContextRealDistL2::set_spatial_patch() {
         float coef = 1.0f / ((float)S / compactness);
+        coef *= (1 << color_shift);
         int16_t S_2 = 2 * S;
         for (int16_t i = 0; i <= S_2; i++) {
             for (int16_t j = 0; j <= S_2; j++) {
@@ -474,6 +468,7 @@ namespace fslic {
     template<bool use_manhattan, bool use_float_color>
     void ContextRealDistNoQ::assign_clusters_proto(const Cluster** target_clusters, int size) {
         float coef = 1.0f / ((float)S / compactness);
+        coef *= (1 << color_shift);
 
         for (int cidx = 0; cidx < size; cidx++) {
             const Cluster* cluster = target_clusters[cidx];

diff --git a/src/context.h b/src/context.h
@@ -40,8 +40,11 @@ namespace fslic {
     protected:
         int16_t subsample_rem;
         int16_t subsample_stride;
+
+    protected:
+        int color_shift = 0; // left-shift applied to stored color values; defaults to 0 so `1 << color_shift` is defined before the first conversion sets it
     protected:
-        simd_helper::AlignedArray<uint8_t> quad_image;
+        simd_helper::AlignedArray<uint16_t> quad_image;
         simd_helper::AlignedArray<uint16_t> assignment;
         simd_helper::AlignedArray<DistType> min_dists;
         simd_helper::AlignedArray<DistType> spatial_dist_patch;
@@ -86,7 +89,6 @@ namespace fslic {
         virtual void after_update() {};
         virtual void set_spatial_patch();
         virtual void assign_clusters(const Cluster **target_clusters, int size);
-        virtual void rgb_to_lab(uint8_t* quad_image, int size);
         virtual bool centroid_quantization_enabled();
     };