diff --git a/src/arch/arm/neon.h b/src/arch/arm/neon.h
index 6936a3a..2fa9e21 100644
--- a/src/arch/arm/neon.h
+++ b/src/arch/arm/neon.h
@@ -6,22 +6,23 @@
 
 inline void get_assignment_value_vec(
         const Cluster* cluster,
-        const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
+        const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
         const uint16_t* min_dist_row, const uint16_t* assignment_row,
-        uint16x8_t cluster_number_vec, uint8x16_t cluster_color_vec,
+        uint16x8_t cluster_number_vec, uint16x8_t cluster_color_vec,
         uint16x8_t& new_min_dist, uint16x8_t& new_assignment
         ) {
     uint16x8_t spatial_dist_vec = vld1q_u16(spatial_dist_patch_row);
-    uint8x16_t image_segment = vld1q_u8(img_quad_row);
-    uint8x16_t image_segment_2 = vld1q_u8(img_quad_row + 16);
+    uint16x8_t image_segment_1 = vld1q_u16(img_quad_row);
+    uint16x8_t image_segment_2 = vld1q_u16(img_quad_row + 8);
+    uint16x8_t image_segment_3 = vld1q_u16(img_quad_row + 16);
+    uint16x8_t image_segment_4 = vld1q_u16(img_quad_row + 24);
 
-    uint8x16_t abs_segment = vabdq_u8(image_segment, cluster_color_vec);
-    uint8x16_t abs_segment_2 = vabdq_u8(image_segment_2, cluster_color_vec);
+    uint16x8_t abs_segment_1 = vabdq_u16(image_segment_1, cluster_color_vec); // 16-bit lanes: |pixel - cluster color| per channel
+    uint16x8_t abs_segment_2 = vabdq_u16(image_segment_2, cluster_color_vec);
+    uint16x8_t abs_segment_3 = vabdq_u16(image_segment_3, cluster_color_vec);
+    uint16x8_t abs_segment_4 = vabdq_u16(image_segment_4, cluster_color_vec);
 
-    uint32x4_t sad_segment = vpaddlq_u16(vpaddlq_u8(abs_segment));
-    uint32x4_t sad_segment_2 = vpaddlq_u16(vpaddlq_u8(abs_segment_2));
-
-    uint16x8_t color_dist_vec = vcombine_u16(vmovn_u32(sad_segment), vmovn_u32(sad_segment_2));
+    uint16x8_t color_dist_vec = vpaddq_u16(vpaddq_u16(abs_segment_1, abs_segment_2), vpaddq_u16(abs_segment_3, abs_segment_4));
 
     uint16x8_t dist_vec = vaddq_u16(color_dist_vec, spatial_dist_vec);
     uint16x8_t old_assignment = vld1q_u16(assignment_row);
@@ -60,28 +61,20 @@ namespace fslic {
     				cluster_number
     			};
 
-    			uint8x16_t cluster_color_vec = {
-    				(uint8_t)cluster->r,
-    				(uint8_t)cluster->g,
-    				(uint8_t)cluster->b,
-    				0,
-    				(uint8_t)cluster->r,
-    				(uint8_t)cluster->g,
-    				(uint8_t)cluster->b,
-    				0,
-    				(uint8_t)cluster->r,
-    				(uint8_t)cluster->g,
-    				(uint8_t)cluster->b,
+    			uint16x8_t cluster_color_vec = {
+    				(uint16_t)cluster->r,
+    				(uint16_t)cluster->g,
+    				(uint16_t)cluster->b,
     				0,
-    				(uint8_t)cluster->r,
-    				(uint8_t)cluster->g,
-    				(uint8_t)cluster->b,
+    				(uint16_t)cluster->r,
+    				(uint16_t)cluster->g,
+    				(uint16_t)cluster->b,
     				0
     			};
                 int16_t patch_height = spatial_dist_patch.get_height();
     			for (int16_t i = fit_to_stride(y_lo) - y_lo; i < patch_height; i += subsample_stride) {
     				const uint16_t* spatial_dist_patch_base_row = spatial_dist_patch.get_row(i);
-                    const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
+                    const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
                     uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
                     uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);
 
@@ -89,7 +82,7 @@ namespace fslic {
     		uint16x8_t new_min_dist, new_assignment; \
     		uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
     		uint16_t* assignment_row = assignment_base_row + j;  /* unaligned */ \
-    		const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
+    		const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
     		const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
     		get_assignment_value_vec( \
     			cluster, \
diff --git a/src/arch/x64/avx2.h b/src/arch/x64/avx2.h
index 74fcbaf..444600b 100644
--- a/src/arch/x64/avx2.h
+++ b/src/arch/x64/avx2.h
@@ -10,7 +10,7 @@ inline __m256 _mm256_set_ps1(float v) {
 
 inline void get_assignment_value_vec(
         const Cluster* cluster,
-        const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
+        const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
         const uint16_t* min_dist_row, const uint16_t* assignment_row,
         __m128i cluster_number_vec, __m256i cluster_color_vec,
         __m128i order_swap_mask,
@@ -23,8 +23,8 @@ inline void get_assignment_value_vec(
 
     __m128i spatial_dist_vec = _mm_load_si128((__m128i *)spatial_dist_patch_row);
 
-    __m256i image_segment = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)img_quad_row));
-    __m256i image_segment_2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(img_quad_row + 16)));
+    __m256i image_segment = _mm256_loadu_si256((__m256i*)img_quad_row);
+    __m256i image_segment_2 = _mm256_loadu_si256((__m256i*)(img_quad_row + 16));
 
     // [R1, G1, B1, A1, R2, G2, B2, A2, R3, G3, B3, A3, R3, G3, B3, A3]
     __m256i abd_segment = _mm256_abs_epi16(_mm256_subs_epi16(image_segment, cluster_color_vec));
@@ -114,7 +114,7 @@ namespace fslic {
                     assert((long long)spatial_dist_patch_base_row % 32 == 0);
         #endif
                     // not aligned
-                    const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
+                    const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
                     uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
                     uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);
 
@@ -122,7 +122,7 @@ namespace fslic {
             __m128i new_assignment__narrow, new_min_dist__narrow; \
             uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
             uint16_t* assignment_row = assignment_base_row + j;  /* unaligned */ \
-            const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
+            const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
             const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
             get_assignment_value_vec( \
                 cluster, \
diff --git a/src/cielab.h b/src/cielab.h
index 2794507..419999e 100644
--- a/src/cielab.h
+++ b/src/cielab.h
@@ -5,6 +5,7 @@
 #include <cstdint>
 #include <vector>
 #include "parallel.h"
+#include "simd-helper.hpp"
 /*
 def get_xyz_nonlin_tbl(a):
     v = a / 255.
@@ -278,6 +279,7 @@ static float  _srgb_gamma_tbl[256] = {
 #define srgb_shift 13
 #define srgb_max (1 << srgb_shift)
 #define lab_shift 16
+#define output_shift 3
 
 class FastCIELabCvt {
 public:
@@ -301,8 +303,7 @@ class FastCIELabCvt {
     }
 
 
-    template <bool scale_lab = false>
-    inline void convert(uint8_t R, uint8_t G, uint8_t B, uint8_t& l, uint8_t& a, uint8_t& b) {
+    inline void convert(uint8_t R, uint8_t G, uint8_t B, uint16_t& l, uint16_t& a, uint16_t& b) {
         int sr = srgb_gamma_tbl[R], sg = srgb_gamma_tbl[G], sb = srgb_gamma_tbl[B];
 
         int xr = (Cb[0] * sr + Cb[1] * sg + Cb[2] * sb) >> lab_shift;
@@ -315,13 +316,9 @@ class FastCIELabCvt {
         int ciea = 500 * (fx - fy) + (128 << srgb_shift); // to positive integer
         int cieb = 200 * (fy - fz) + (128 << srgb_shift); // to positive integer
 
-        if (scale_lab) {
-            ciel = ciel * 255 / 100;
-        }
-
-        l = (uint8_t)((unsigned)ciel >> srgb_shift);
-        a = (uint8_t)((unsigned)ciea >> srgb_shift);
-        b = (uint8_t)((unsigned)cieb >> srgb_shift);
+        l = (uint16_t)((unsigned)ciel >> (srgb_shift - output_shift));
+        a = (uint16_t)((unsigned)ciea >> (srgb_shift - output_shift));
+        b = (uint16_t)((unsigned)cieb >> (srgb_shift - output_shift));
     }
 
 private:
@@ -334,33 +331,22 @@ class FastCIELabCvt {
 
 static FastCIELabCvt fast_cielab_cvt;
 
-static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool scale_L = false) {
-    if (scale_L) {
-        #pragma omp parallel for num_threads(fsparallel::nth())
-        for (int s = 0; s < size; s += 4) {
-            fast_cielab_cvt.convert<true>(
-                aligned_quad_image[s],
-                aligned_quad_image[s+1],
-                aligned_quad_image[s+2],
-                out[s],
-                out[s+1],
-                out[s+2]
-            );
-        }
-    } else {
-        #pragma omp parallel for num_threads(fsparallel::nth())
-        for (int s = 0; s < size; s += 4) {
-            fast_cielab_cvt.convert<false>(
-                aligned_quad_image[s],
-                aligned_quad_image[s+1],
-                aligned_quad_image[s+2],
-                out[s],
-                out[s+1],
-                out[s+2]
+static void rgb_to_cielab(const uint8_t* image, int H, int W, simd_helper::AlignedArray<uint16_t> &arr, int &shift_out) {
+    #pragma omp parallel for num_threads(fsparallel::nth())
+    for (int i = 0; i < H; i++) {
+        for (int j = 0; j < W; j++) {
+            int index = W * i + j;
+            fast_cielab_cvt.convert(
+                image[3 * index],
+                image[3 * index + 1],
+                image[3 * index + 2],
+                arr.get(i, 4 * j),
+                arr.get(i, 4 * j + 1),
+                arr.get(i, 4 * j + 2)
             );
         }
-
     }
+    shift_out = output_shift;
 }
 
 static void rgb_to_cielab_orig(const uint8_t* image, float *out, int size) {
diff --git a/src/context.cpp b/src/context.cpp
index 715ac1f..658d333 100644
--- a/src/context.cpp
+++ b/src/context.cpp
@@ -22,6 +22,7 @@ namespace fslic {
     template<typename DistType>
     void BaseContext<DistType>::set_spatial_patch() {
         float coef = 1.0f / ((float)S / compactness);
+        coef *= (1 << color_shift);
         int16_t S_2 = 2 * S;
         if (manhattan_spatial_dist) {
             for (int16_t i = 0; i <= S_2; i++) {
@@ -95,7 +96,6 @@ namespace fslic {
 
     template<typename DistType>
     void BaseContext<DistType>::initialize_state() {
-        set_spatial_patch();
     }
 
     template<typename DistType>
@@ -109,10 +109,11 @@ namespace fslic {
             fsparallel::Scope parallel_scope(num_threads);
             fstimer::Scope s("iterate");
             {
-                fstimer::Scope s("write_to_buffer");
-                #pragma omp parallel num_threads(fsparallel::nth())
-                {
-                    #pragma omp for
+                fstimer::Scope s("cielab_conversion");
+                if (convert_to_lab) {
+                    rgb_to_cielab(image, H, W, quad_image, color_shift);
+                } else {
+                    #pragma omp parallel for num_threads(fsparallel::nth())
                     for (int i = 0; i < H; i++) {
                         for (int j = 0; j < W; j++) {
                             for (int k = 0; k < 3; k++) {
@@ -120,21 +121,18 @@ namespace fslic {
                             }
                         }
                     }
-
-                    #pragma omp for
-                    for (int i = 0; i < H; i++) {
-                        for (int j = 0; j < W; j++) {
-                            this->assignment.get(i, j) = 0xFFFF;
-                        }
-                    }
+                    color_shift = 0;
                 }
             }
-
             {
-                fstimer::Scope s("cielab_conversion");
-                if (convert_to_lab) {
-                    rgb_to_lab(&quad_image.get(0, 0), quad_image.contiguous_memory_size());
+                fstimer::Scope s("write_to_buffer");
+                #pragma omp parallel for num_threads(fsparallel::nth())
+                for (int i = 0; i < H; i++) {
+                    for (int j = 0; j < W; j++) {
+                        this->assignment.get(i, j) = 0xFFFF;
+                    }
                 }
+                set_spatial_patch();
             }
 
             subsample_rem = 0;
@@ -261,7 +259,7 @@ namespace fslic {
 
             for (int i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
                 if (!valid_subsample_row(i)) continue;
-                const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
+                const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
                 uint16_t* __restrict  assignment_row = assignment.get_row(i, cluster_x - S);
                 DistType* __restrict min_dist_row = min_dists.get_row(i, cluster_x - S);
                 const DistType* __restrict patch_row = spatial_dist_patch.get_row(i_off);
@@ -289,11 +287,6 @@ namespace fslic {
         delete [] dist_row;
     }
 
-    template<typename DistType>
-    void BaseContext<DistType>::rgb_to_lab(uint8_t *quad_image, int size) {
-        rgb_to_cielab(quad_image, quad_image, size, false);
-    }
-
 
     template<typename DistType>
     void BaseContext<DistType>::update() {
@@ -401,7 +394,7 @@ namespace fslic {
 
             for (int16_t i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
                 if (!valid_subsample_row(i)) continue;
-                const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
+                const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
                 uint16_t* __restrict assignment_row = assignment.get_row(i, cluster_x - S);
                 float* __restrict  min_dist_row = min_dists.get_row(i, cluster_x - S);
                 const float* __restrict patch_row = spatial_dist_patch.get_row(i_off);
@@ -431,6 +424,7 @@ namespace fslic {
 
     void ContextRealDistL2::set_spatial_patch() {
         float coef = 1.0f / ((float)S / compactness);
+        coef *= (1 << color_shift);
         int16_t S_2 = 2 * S;
         for (int16_t i = 0; i <= S_2; i++) {
             for (int16_t j = 0; j <= S_2; j++) {
@@ -474,6 +468,7 @@ namespace fslic {
     template<bool use_manhattan, bool use_float_color>
     void ContextRealDistNoQ::assign_clusters_proto(const Cluster** target_clusters, int size) {
         float coef = 1.0f / ((float)S / compactness);
+        coef *= (1 << color_shift);
 
         for (int cidx = 0; cidx < size; cidx++) {
             const Cluster* cluster = target_clusters[cidx];
diff --git a/src/context.h b/src/context.h
index b78d09d..e5a39f6 100644
--- a/src/context.h
+++ b/src/context.h
@@ -40,8 +40,11 @@ namespace fslic {
     protected:
         int16_t subsample_rem;
         int16_t subsample_stride;
+
+    protected:
+        int color_shift = 0; // color fixed-point shift; defaults to 0 (unscaled) until assigned
     protected:
-        simd_helper::AlignedArray<uint8_t> quad_image;
+        simd_helper::AlignedArray<uint16_t> quad_image;
         simd_helper::AlignedArray<uint16_t> assignment;
         simd_helper::AlignedArray<DistType> min_dists;
         simd_helper::AlignedArray<DistType> spatial_dist_patch;
@@ -86,7 +89,6 @@ namespace fslic {
         virtual void after_update() {};
         virtual void set_spatial_patch();
         virtual void assign_clusters(const Cluster **target_clusters, int size);
-        virtual void rgb_to_lab(uint8_t* quad_image, int size);
         virtual bool centroid_quantization_enabled();
     };
 
diff --git a/src/lsc.cpp b/src/lsc.cpp
index 6b8ea5a..e8b360f 100644
--- a/src/lsc.cpp
+++ b/src/lsc.cpp
@@ -15,7 +15,7 @@ namespace fslic {
     }
 
     ContextLSC::~ContextLSC() {
-        if (uint8_memory_pool) delete [] uint8_memory_pool;
+        if (uint16_memory_pool) delete [] uint16_memory_pool;
         if (float_memory_pool) delete [] float_memory_pool;
     }
 
@@ -34,14 +34,14 @@ namespace fslic {
         {
             fstimer::Scope s("image_alloc");
 
-            if (uint8_memory_pool) delete [] uint8_memory_pool;
-            uint8_memory_pool = new uint8_t[3 * aligned_len];
-            if (float_memory_pool) delete [] uint8_memory_pool;
+            if (uint16_memory_pool) delete [] uint16_memory_pool;
+            uint16_memory_pool = new uint16_t[3 * aligned_len];
+            if (float_memory_pool) delete [] float_memory_pool;
             float_memory_pool = new float[11 * aligned_len + 10 * aligned_K];
 
-            image_planes[0] = &uint8_memory_pool[0];
-            image_planes[1] = &uint8_memory_pool[aligned_len];
-            image_planes[2] = &uint8_memory_pool[2 * aligned_len];
+            image_planes[0] = &uint16_memory_pool[0];
+            image_planes[1] = &uint16_memory_pool[aligned_len];
+            image_planes[2] = &uint16_memory_pool[2 * aligned_len];
             for (int i = 0; i < 10; i++) {
                 image_features[i] = &float_memory_pool[i * aligned_len];
                 centroid_features[i] = &float_memory_pool[11 * aligned_len + i * aligned_K];
@@ -53,7 +53,7 @@ namespace fslic {
             fstimer::Scope s("image_copy");
             #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < H; i++) {
-                const uint8_t* image_row = quad_image.get_row(i);
+                const uint16_t* image_row = quad_image.get_row(i);
                 for (int j = 0; j < W; j++) {
                     int index = i * W + j;
                     image_planes[0][index] = image_row[4 * j];
@@ -67,27 +67,10 @@ namespace fslic {
             fstimer::Scope s("feature_map");
 
             // l1, l2, a1, a2, b1, b2
-            float color_sine_map[256];
-            float color_cosine_map[256];
-            float L_sine_map[256];
-            float L_cosine_map[256];
             std::vector<float> width_cosine_map(W);
             std::vector<float> width_sine_map(W);
             std::vector<float> height_cosine_map(H);
             std::vector<float> height_sine_map(H);
-            for (int X = 0; X < 256; X++) {
-                float theta = halfPI * (X / 255.0f);
-                float cosine = cos(theta), sine = sin(theta);
-                color_cosine_map[X] = C_color * cosine * 2.55f;
-    			color_sine_map[X] = C_color * sine * 2.55f;
-            }
-
-            for (int X = 0; X < 256; X++) {
-                float theta = halfPI * (X / 255.0f);
-                L_cosine_map[X] = C_color * cos(theta);
-                L_sine_map[X] = C_color * sin(theta);
-            }
-
             for (int i = 0; i < H; i++) {
                 float theta = i * (halfPI / S);
                 height_cosine_map[i] = C_spatial * cos(theta);
@@ -100,17 +83,33 @@ namespace fslic {
                 width_sine_map[i] = C_spatial * sin(theta);
             }
 
-            const uint8_t* __restrict L = &image_planes[0][0];
-            const uint8_t* __restrict A = &image_planes[1][0];
-            const uint8_t* __restrict B = &image_planes[2][0];
+            const uint16_t* __restrict L = &image_planes[0][0];
+            const uint16_t* __restrict A = &image_planes[1][0];
+            const uint16_t* __restrict B = &image_planes[2][0];
             #pragma omp parallel for num_threads(fsparallel::nth())
             for (int i = 0; i < len; i++) {
-                image_features[0][i] = L_cosine_map[L[i]];
-    			image_features[1][i] = L_sine_map[L[i]];
-    			image_features[2][i] = color_cosine_map[A[i]];
-    			image_features[3][i] = color_sine_map[A[i]];
-    			image_features[4][i] = color_cosine_map[B[i]];
-    			image_features[5][i] = color_sine_map[B[i]];
+                {
+                    float X = L[i] / (float)(1 << color_shift);
+                    float theta = halfPI * (X / 100.0f);
+                    float cosine = cos(theta), sine = sin(theta);
+                    image_features[0][i] = C_color * cosine;
+        			image_features[1][i] = C_color * sine;
+                }
+
+                {
+                    float X = A[i] / (float)(1 << color_shift);
+                    float theta = halfPI * (X / 255.0f);
+                    float cosine = cos(theta), sine = sin(theta);
+                    image_features[2][i] = C_color * cosine * 2.55f; // keep the 2.55 a/b weight the removed lookup tables applied
+                    image_features[3][i] = C_color * sine * 2.55f;
+                }
+                {
+                    float X = B[i] / (float)(1 << color_shift);
+                    float theta = halfPI * (X / 255.0f);
+                    float cosine = cos(theta), sine = sin(theta);
+                    image_features[4][i] = C_color * cosine * 2.55f;
+                    image_features[5][i] = C_color * sine * 2.55f;
+                }
             }
             // x1, x2, y1, y2
 
@@ -306,10 +305,6 @@ namespace fslic {
         delete [] wsums;
     }
 
-    void ContextLSC::rgb_to_lab(uint8_t *quad_image, int size) {
-        rgb_to_cielab(quad_image, quad_image, size, true);
-    }
-
 
 	void ContextLSC::normalize_features(float *__restrict numers[10], float* __restrict weights, int size) {
         #pragma omp parallel for num_threads(fsparallel::nth())
diff --git a/src/lsc.h b/src/lsc.h
index 29572ad..bbe8135 100644
--- a/src/lsc.h
+++ b/src/lsc.h
@@ -7,8 +7,8 @@ namespace fslic {
 	protected:
 		float C_color = 20;
 		float* float_memory_pool = nullptr;
-		uint8_t* uint8_memory_pool = nullptr;
-	    uint8_t* __restrict image_planes[3]; // L, a, b plane (H x W)
+		uint16_t* uint16_memory_pool = nullptr;
+	    uint16_t* __restrict image_planes[3]; // L, a, b plane (H x W)
 	    float* __restrict image_features[10]; // l1, l2, a1, a2, b1, b2, x1, x2, y1, y2
 	    float* __restrict image_weights;
 	    float* __restrict centroid_features[10]; // l1, l2, a1, a2, b1, b2, x1, x2, y1, y2
@@ -20,7 +20,6 @@ namespace fslic {
 		virtual void after_update();
 	    virtual void assign_clusters(const Cluster **target_clusters, int size);
 		virtual void normalize_features(float *__restrict numers[10], float* __restrict weights, int size);
-        virtual void rgb_to_lab(uint8_t* quad_image, int size);
 	private:
 	    void map_image_into_feature_space();
 	    void map_centroids_into_feature_space();