shrink image size

Algy · Nov 21, 2019 · 7dbaca7 · 7dbaca7
1 parent 5dd2b76
commit 7dbaca7
Show file tree

Hide file tree

Showing 7 changed files with 79 additions and 74 deletions.
diff --git a/src/arch/arm/neon.h b/src/arch/arm/neon.h
@@ -6,28 +6,22 @@
 
 inline void get_assignment_value_vec(
         const Cluster* cluster,
-        const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
+        const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
         const uint16_t* min_dist_row, const uint16_t* assignment_row,
-        uint16x8_t cluster_number_vec, uint16x8_t cluster_color_vec,
+        uint16x8_t cluster_number_vec, uint8x16_t cluster_color_vec,
         uint16x8_t& new_min_dist, uint16x8_t& new_assignment
         ) {
     uint16x8_t spatial_dist_vec = vld1q_u16(spatial_dist_patch_row);
-    uint16x8_t image_segment_1 = vld1q_u16(img_quad_row);
-    uint16x8_t image_segment_2 = vld1q_u16(img_quad_row + 8);
-    uint16x8_t image_segment_3 = vld1q_u16(img_quad_row + 16);
-    uint16x8_t image_segment_4 = vld1q_u16(img_quad_row + 24);
+    uint8x16_t image_segment = vld1q_u8(img_quad_row);
+    uint8x16_t image_segment_2 = vld1q_u8(img_quad_row + 16);
 
-    uint16x8_t abs_segment_1 = vabdq_u16(image_segment_1, cluster_color_vec);
-    uint16x8_t abs_segment_2 = vabdq_u16(image_segment_2, cluster_color_vec);
-    uint16x8_t abs_segment_3 = vabdq_u16(image_segment_3, cluster_color_vec);
-    uint16x8_t abs_segment_4 = vabdq_u16(image_segment_4, cluster_color_vec);
+    uint8x16_t abs_segment = vabdq_u8(image_segment, cluster_color_vec);
+    uint8x16_t abs_segment_2 = vabdq_u8(image_segment_2, cluster_color_vec);
 
-    uint16x4_t f_1 = vmovn_u32(vpaddlq_u16(abs_segment_1));
-    uint16x4_t f_2 = vmovn_u32(vpaddlq_u16(abs_segment_2));
-    uint16x4_t f_3 = vmovn_u32(vpaddlq_u16(abs_segment_3));
-    uint16x4_t f_4 = vmovn_u32(vpaddlq_u16(abs_segment_4));
+    uint32x4_t sad_segment = vpaddlq_u16(vpaddlq_u8(abs_segment));
+    uint32x4_t sad_segment_2 = vpaddlq_u16(vpaddlq_u8(abs_segment_2));
 
-    uint16x8_t color_dist_vec = vcombine_u16(vpadd_u16(f_1, f_2), vpadd_u16(f_3, f_4));
+    uint16x8_t color_dist_vec = vcombine_u16(vmovn_u32(sad_segment), vmovn_u32(sad_segment_2));
 
     uint16x8_t dist_vec = vaddq_u16(color_dist_vec, spatial_dist_vec);
     uint16x8_t old_assignment = vld1q_u16(assignment_row);
@@ -66,28 +60,36 @@ namespace fslic {
     				cluster_number
     			};
 
-    			uint16x8_t cluster_color_vec = {
-    				(uint16_t)cluster->r,
-    				(uint16_t)cluster->g,
-    				(uint16_t)cluster->b,
+    			uint8x16_t cluster_color_vec = {
+    				(uint8_t)cluster->r,
+    				(uint8_t)cluster->g,
+    				(uint8_t)cluster->b,
     				0,
-    				(uint16_t)cluster->r,
-    				(uint16_t)cluster->g,
-    				(uint16_t)cluster->b,
+    				(uint8_t)cluster->r,
+    				(uint8_t)cluster->g,
+    				(uint8_t)cluster->b,
+    				0,
+    				(uint8_t)cluster->r,
+    				(uint8_t)cluster->g,
+    				(uint8_t)cluster->b,
+    				0,
+    				(uint8_t)cluster->r,
+    				(uint8_t)cluster->g,
+    				(uint8_t)cluster->b,
     				0
     			};
                 int16_t patch_height = spatial_dist_patch.get_height();
     			for (int16_t i = fit_to_stride(y_lo) - y_lo; i < patch_height; i += subsample_stride) {
     				const uint16_t* spatial_dist_patch_base_row = spatial_dist_patch.get_row(i);
-                    const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
+                    const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
                     uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
                     uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);
 
     	#define ASSIGNMENT_VALUE_GETTER_BODY \
     		uint16x8_t new_min_dist, new_assignment; \
     		uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
     		uint16_t* assignment_row = assignment_base_row + j;  /* unaligned */ \
-    		const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
+    		const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
     		const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
     		get_assignment_value_vec( \
     			cluster, \

diff --git a/src/arch/x64/avx2.h b/src/arch/x64/avx2.h
@@ -10,7 +10,7 @@ inline __m256 _mm256_set_ps1(float v) {
 
 inline void get_assignment_value_vec(
         const Cluster* cluster,
-        const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
+        const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
         const uint16_t* min_dist_row, const uint16_t* assignment_row,
         __m128i cluster_number_vec, __m256i cluster_color_vec,
         __m128i order_swap_mask,
@@ -23,8 +23,8 @@ inline void get_assignment_value_vec(
 
     __m128i spatial_dist_vec = _mm_load_si128((__m128i *)spatial_dist_patch_row);
 
-    __m256i image_segment = _mm256_loadu_si256((__m256i*)img_quad_row);
-    __m256i image_segment_2 = _mm256_loadu_si256((__m256i*)(img_quad_row + 16));
+    __m256i image_segment = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)img_quad_row));
+    __m256i image_segment_2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(img_quad_row + 16)));
 
     // [R1, G1, B1, A1, R2, G2, B2, A2, R3, G3, B3, A3, R3, G3, B3, A3]
     __m256i abd_segment = _mm256_abs_epi16(_mm256_subs_epi16(image_segment, cluster_color_vec));
@@ -114,15 +114,15 @@ namespace fslic {
                     assert((long long)spatial_dist_patch_base_row % 32 == 0);
         #endif
                     // not aligned
-                    const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
+                    const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
                     uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
                     uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);
 
         #define ASSIGNMENT_VALUE_GETTER_BODY \
             __m128i new_assignment__narrow, new_min_dist__narrow; \
             uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
             uint16_t* assignment_row = assignment_base_row + j;  /* unaligned */ \
-            const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
+            const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
             const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
             get_assignment_value_vec( \
                 cluster, \

diff --git a/src/cielab.h b/src/cielab.h
@@ -6,6 +6,8 @@
 #include <vector>
 #include "parallel.h"
 #include "simd-helper.hpp"
+#include "fast-slic-common.h"
+
 /*
 def get_xyz_nonlin_tbl(a):
     v = a / 255.
@@ -279,7 +281,7 @@ static float  _srgb_gamma_tbl[256] = {
 #define srgb_shift 13
 #define srgb_max (1 << srgb_shift)
 #define lab_shift 16
-#define output_shift 3
+#define output_shift 1
 
 class FastCIELabCvt {
 public:
@@ -303,7 +305,7 @@ class FastCIELabCvt {
     }
 
 
-    inline void convert(uint8_t R, uint8_t G, uint8_t B, uint16_t& l, uint16_t& a, uint16_t& b) {
+    inline void convert(uint8_t R, uint8_t G, uint8_t B, uint8_t& l, uint8_t& a, uint8_t& b) {
         int sr = srgb_gamma_tbl[R], sg = srgb_gamma_tbl[G], sb = srgb_gamma_tbl[B];
 
         int xr = (Cb[0] * sr + Cb[1] * sg + Cb[2] * sb) >> lab_shift;
@@ -316,9 +318,10 @@ class FastCIELabCvt {
         int ciea = 500 * (fx - fy) + (128 << srgb_shift); // to positive integer
         int cieb = 200 * (fy - fz) + (128 << srgb_shift); // to positive integer
 
-        l = (uint16_t)((unsigned)ciel >> (srgb_shift - output_shift));
-        a = (uint16_t)((unsigned)ciea >> (srgb_shift - output_shift));
-        b = (uint16_t)((unsigned)cieb >> (srgb_shift - output_shift));
+
+        l = clamp<int>((unsigned)ciel >> (srgb_shift - output_shift), 0, 255);
+        a = clamp<int>(((unsigned)ciea >> (srgb_shift - output_shift)) - (64 << output_shift), 0, 255);
+        b = clamp<int>(((unsigned)cieb >> (srgb_shift - output_shift)) - (64 << output_shift), 0, 255);
     }
 
 private:
@@ -331,7 +334,7 @@ class FastCIELabCvt {
 
 static FastCIELabCvt fast_cielab_cvt;
 
-static void rgb_to_cielab(const uint8_t* image, int H, int W, simd_helper::AlignedArray<uint16_t> &arr, int &shift_out) {
+static void rgb_to_cielab(const uint8_t* image, int H, int W, simd_helper::AlignedArray<uint8_t> &arr, int &shift_out) {
     #pragma omp parallel for num_threads(fsparallel::nth())
     for (int i = 0; i < H; i++) {
         for (int j = 0; j < W; j++) {

diff --git a/src/context.cpp b/src/context.cpp
@@ -269,7 +269,7 @@ namespace fslic {
 
             for (int i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
                 if (!valid_subsample_row(i)) continue;
-                const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
+                const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
                 uint16_t* __restrict  assignment_row = assignment.get_row(i, cluster_x - S);
                 DistType* __restrict min_dist_row = min_dists.get_row(i, cluster_x - S);
                 const DistType* __restrict patch_row = spatial_dist_patch.get_row(i_off);
@@ -404,7 +404,7 @@ namespace fslic {
 
             for (int16_t i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
                 if (!valid_subsample_row(i)) continue;
-                const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
+                const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
                 uint16_t* __restrict assignment_row = assignment.get_row(i, cluster_x - S);
                 float* __restrict  min_dist_row = min_dists.get_row(i, cluster_x - S);
                 const float* __restrict patch_row = spatial_dist_patch.get_row(i_off);

diff --git a/src/context.h b/src/context.h
@@ -46,7 +46,7 @@ namespace fslic {
     protected:
         int color_shift;
     protected:
-        simd_helper::AlignedArray<uint16_t> quad_image;
+        simd_helper::AlignedArray<uint8_t> quad_image;
         simd_helper::AlignedArray<uint16_t> assignment;
         simd_helper::AlignedArray<DistType> min_dists;
         simd_helper::AlignedArray<DistType> spatial_dist_patch;

diff --git a/src/lsc.cpp b/src/lsc.cpp
@@ -15,7 +15,7 @@ namespace fslic {
     }
 
     ContextLSC::~ContextLSC() {
-        if (uint16_memory_pool) delete [] uint16_memory_pool;
+        if (uint8_memory_pool) delete [] uint8_memory_pool; // releases the pool that backs image_planes[0..2]
         if (float_memory_pool) delete [] float_memory_pool;
     }
 
@@ -34,14 +34,14 @@ namespace fslic {
         {
             fstimer::Scope s("image_alloc");
 
-            if (uint16_memory_pool) delete [] uint16_memory_pool;
-            uint16_memory_pool = new uint16_t[3 * aligned_len];
-            if (float_memory_pool) delete [] uint16_memory_pool;
+            if (uint8_memory_pool) delete [] uint8_memory_pool; // drop the previous L/a/b plane pool before reallocating
+            uint8_memory_pool = new uint8_t[3 * aligned_len]; // three planes of aligned_len bytes each
+            if (float_memory_pool) delete [] float_memory_pool; // fix: free the OLD float pool; this used to delete the uint8 pool allocated on the line above
             float_memory_pool = new float[11 * aligned_len + 10 * aligned_K];
 
-            image_planes[0] = &uint16_memory_pool[0];
-            image_planes[1] = &uint16_memory_pool[aligned_len];
-            image_planes[2] = &uint16_memory_pool[2 * aligned_len];
+            image_planes[0] = &uint8_memory_pool[0];
+            image_planes[1] = &uint8_memory_pool[aligned_len];
+            image_planes[2] = &uint8_memory_pool[2 * aligned_len];
             for (int i = 0; i < 10; i++) {
                 image_features[i] = &float_memory_pool[i * aligned_len];
                 centroid_features[i] = &float_memory_pool[11 * aligned_len + i * aligned_K];
@@ -53,7 +53,7 @@ namespace fslic {
             fstimer::Scope s("image_copy");
             #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < H; i++) {
-                const uint16_t* image_row = quad_image.get_row(i);
+                const uint8_t* image_row = quad_image.get_row(i);
                 for (int j = 0; j < W; j++) {
                     int index = i * W + j;
                     image_planes[0][index] = image_row[4 * j];
@@ -67,10 +67,27 @@ namespace fslic {
             fstimer::Scope s("feature_map");
 
             // l1, l2, a1, a2, b1, b2
+            float color_sine_map[256];
+            float color_cosine_map[256];
+            float L_sine_map[256];
+            float L_cosine_map[256];
             std::vector<float> width_cosine_map(W);
             std::vector<float> width_sine_map(W);
             std::vector<float> height_cosine_map(H);
             std::vector<float> height_sine_map(H);
+            for (int X = 0; X < 256; X++) {
+                float theta = halfPI * (X / 255.0f);
+                float cosine = cos(theta), sine = sin(theta);
+                color_cosine_map[X] = C_color * cosine * 2.55f;
+    			color_sine_map[X] = C_color * sine * 2.55f;
+            }
+
+            for (int X = 0; X < 256; X++) {
+                float theta = halfPI * (X / 255.0f);
+                L_cosine_map[X] = C_color * cos(theta);
+                L_sine_map[X] = C_color * sin(theta);
+            }
+
             for (int i = 0; i < H; i++) {
                 float theta = i * (halfPI / S);
                 height_cosine_map[i] = C_spatial * cos(theta);
@@ -83,33 +100,17 @@ namespace fslic {
                 width_sine_map[i] = C_spatial * sin(theta);
             }
 
-            const uint16_t* __restrict L = &image_planes[0][0];
-            const uint16_t* __restrict A = &image_planes[1][0];
-            const uint16_t* __restrict B = &image_planes[2][0];
+            const uint8_t* __restrict L = &image_planes[0][0];
+            const uint8_t* __restrict A = &image_planes[1][0];
+            const uint8_t* __restrict B = &image_planes[2][0];
             #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < len; i++) {
-                {
-                    float X = L[i] / (float)(1 << color_shift);
-                    float theta = halfPI * (X / 100.0f);
-                    float cosine = cos(theta), sine = sin(theta);
-                    image_features[0][i] = C_color * cosine;
-        			image_features[1][i] = C_color * sine;
-                }
-
-                {
-                    float X = A[i] / (float)(1 << color_shift);
-                    float theta = halfPI * (X / 255.0f);
-                    float cosine = cos(theta), sine = sin(theta);
-                    image_features[2][i] = C_color * cosine;
-                    image_features[3][i] = C_color * sine;
-                }
-                {
-                    float X = B[i] / (float)(1 << color_shift);
-                    float theta = halfPI * (X / 255.0f);
-                    float cosine = cos(theta), sine = sin(theta);
-                    image_features[4][i] = C_color * cosine;
-                    image_features[5][i] = C_color * sine;
-                }
+                image_features[0][i] = L_cosine_map[L[i]];
+    			image_features[1][i] = L_sine_map[L[i]];
+    			image_features[2][i] = color_cosine_map[A[i]];
+    			image_features[3][i] = color_sine_map[A[i]];
+    			image_features[4][i] = color_cosine_map[B[i]];
+    			image_features[5][i] = color_sine_map[B[i]];
             }
             // x1, x2, y1, y2
 
@@ -305,7 +306,6 @@ namespace fslic {
         delete [] wsums;
     }
 
-
 	void ContextLSC::normalize_features(float *__restrict numers[10], float* __restrict weights, int size) {
         #pragma omp parallel for num_threads(fsparallel::nth())
         for (int i = 0; i < size; i++) {

diff --git a/src/lsc.h b/src/lsc.h
@@ -7,8 +7,8 @@ namespace fslic {
 	protected:
 		float C_color = 20;
 		float* float_memory_pool = nullptr;
-		uint16_t* uint16_memory_pool = nullptr;
-	    uint16_t* __restrict image_planes[3]; // L, a, b plane (H x W)
+		uint8_t* uint8_memory_pool = nullptr;
+	    uint8_t* __restrict image_planes[3]; // L, a, b plane (H x W)
 	    float* __restrict image_features[10]; // l1, l2, a1, a2, b1, b2, x1, x2, y1, y2
 	    float* __restrict image_weights;
 	    float* __restrict centroid_features[10]; // l1, l2, a1, a2, b1, b2, x1, x2, y1, y2