Skip to content

Commit

Permalink
shrink image size
Browse files Browse the repository at this point in the history
  • Loading branch information
Algy committed Nov 21, 2019
1 parent 5dd2b76 commit 7dbaca7
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 74 deletions.
50 changes: 26 additions & 24 deletions src/arch/arm/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,22 @@

inline void get_assignment_value_vec(
const Cluster* cluster,
const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
const uint16_t* min_dist_row, const uint16_t* assignment_row,
uint16x8_t cluster_number_vec, uint16x8_t cluster_color_vec,
uint16x8_t cluster_number_vec, uint8x16_t cluster_color_vec,
uint16x8_t& new_min_dist, uint16x8_t& new_assignment
) {
uint16x8_t spatial_dist_vec = vld1q_u16(spatial_dist_patch_row);
uint16x8_t image_segment_1 = vld1q_u16(img_quad_row);
uint16x8_t image_segment_2 = vld1q_u16(img_quad_row + 8);
uint16x8_t image_segment_3 = vld1q_u16(img_quad_row + 16);
uint16x8_t image_segment_4 = vld1q_u16(img_quad_row + 24);
uint8x16_t image_segment = vld1q_u8(img_quad_row);
uint8x16_t image_segment_2 = vld1q_u8(img_quad_row + 16);

uint16x8_t abs_segment_1 = vabdq_u16(image_segment_1, cluster_color_vec);
uint16x8_t abs_segment_2 = vabdq_u16(image_segment_2, cluster_color_vec);
uint16x8_t abs_segment_3 = vabdq_u16(image_segment_3, cluster_color_vec);
uint16x8_t abs_segment_4 = vabdq_u16(image_segment_4, cluster_color_vec);
uint8x16_t abs_segment = vabdq_u8(image_segment, cluster_color_vec);
uint8x16_t abs_segment_2 = vabdq_u8(image_segment_2, cluster_color_vec);

uint16x4_t f_1 = vmovn_u32(vpaddlq_u16(abs_segment_1));
uint16x4_t f_2 = vmovn_u32(vpaddlq_u16(abs_segment_2));
uint16x4_t f_3 = vmovn_u32(vpaddlq_u16(abs_segment_3));
uint16x4_t f_4 = vmovn_u32(vpaddlq_u16(abs_segment_4));
uint32x4_t sad_segment = vpaddlq_u16(vpaddlq_u8(abs_segment));
uint32x4_t sad_segment_2 = vpaddlq_u16(vpaddlq_u8(abs_segment_2));

uint16x8_t color_dist_vec = vcombine_u16(vpadd_u16(f_1, f_2), vpadd_u16(f_3, f_4));
uint16x8_t color_dist_vec = vcombine_u16(vmovn_u32(sad_segment), vmovn_u32(sad_segment_2));

uint16x8_t dist_vec = vaddq_u16(color_dist_vec, spatial_dist_vec);
uint16x8_t old_assignment = vld1q_u16(assignment_row);
Expand Down Expand Up @@ -66,28 +60,36 @@ namespace fslic {
cluster_number
};

uint16x8_t cluster_color_vec = {
(uint16_t)cluster->r,
(uint16_t)cluster->g,
(uint16_t)cluster->b,
uint8x16_t cluster_color_vec = {
(uint8_t)cluster->r,
(uint8_t)cluster->g,
(uint8_t)cluster->b,
0,
(uint16_t)cluster->r,
(uint16_t)cluster->g,
(uint16_t)cluster->b,
(uint8_t)cluster->r,
(uint8_t)cluster->g,
(uint8_t)cluster->b,
0,
(uint8_t)cluster->r,
(uint8_t)cluster->g,
(uint8_t)cluster->b,
0,
(uint8_t)cluster->r,
(uint8_t)cluster->g,
(uint8_t)cluster->b,
0
};
int16_t patch_height = spatial_dist_patch.get_height();
for (int16_t i = fit_to_stride(y_lo) - y_lo; i < patch_height; i += subsample_stride) {
const uint16_t* spatial_dist_patch_base_row = spatial_dist_patch.get_row(i);
const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);

#define ASSIGNMENT_VALUE_GETTER_BODY \
uint16x8_t new_min_dist, new_assignment; \
uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
uint16_t* assignment_row = assignment_base_row + j; /* unaligned */ \
const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
get_assignment_value_vec( \
cluster, \
Expand Down
10 changes: 5 additions & 5 deletions src/arch/x64/avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ inline __m256 _mm256_set_ps1(float v) {

inline void get_assignment_value_vec(
const Cluster* cluster,
const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
const uint16_t* min_dist_row, const uint16_t* assignment_row,
__m128i cluster_number_vec, __m256i cluster_color_vec,
__m128i order_swap_mask,
Expand All @@ -23,8 +23,8 @@ inline void get_assignment_value_vec(

__m128i spatial_dist_vec = _mm_load_si128((__m128i *)spatial_dist_patch_row);

__m256i image_segment = _mm256_loadu_si256((__m256i*)img_quad_row);
__m256i image_segment_2 = _mm256_loadu_si256((__m256i*)(img_quad_row + 16));
__m256i image_segment = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)img_quad_row));
__m256i image_segment_2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(img_quad_row + 16)));

// [R1, G1, B1, A1, R2, G2, B2, A2, R3, G3, B3, A3, R4, G4, B4, A4]
__m256i abd_segment = _mm256_abs_epi16(_mm256_subs_epi16(image_segment, cluster_color_vec));
Expand Down Expand Up @@ -114,15 +114,15 @@ namespace fslic {
assert((long long)spatial_dist_patch_base_row % 32 == 0);
#endif
// not aligned
const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);

#define ASSIGNMENT_VALUE_GETTER_BODY \
__m128i new_assignment__narrow, new_min_dist__narrow; \
uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
uint16_t* assignment_row = assignment_base_row + j; /* unaligned */ \
const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
get_assignment_value_vec( \
cluster, \
Expand Down
15 changes: 9 additions & 6 deletions src/cielab.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#include <vector>
#include "parallel.h"
#include "simd-helper.hpp"
#include "fast-slic-common.h"

/*
def get_xyz_nonlin_tbl(a):
v = a / 255.
Expand Down Expand Up @@ -279,7 +281,7 @@ static float _srgb_gamma_tbl[256] = {
#define srgb_shift 13
#define srgb_max (1 << srgb_shift)
#define lab_shift 16
#define output_shift 3
#define output_shift 1

class FastCIELabCvt {
public:
Expand All @@ -303,7 +305,7 @@ class FastCIELabCvt {
}


inline void convert(uint8_t R, uint8_t G, uint8_t B, uint16_t& l, uint16_t& a, uint16_t& b) {
inline void convert(uint8_t R, uint8_t G, uint8_t B, uint8_t& l, uint8_t& a, uint8_t& b) {
int sr = srgb_gamma_tbl[R], sg = srgb_gamma_tbl[G], sb = srgb_gamma_tbl[B];

int xr = (Cb[0] * sr + Cb[1] * sg + Cb[2] * sb) >> lab_shift;
Expand All @@ -316,9 +318,10 @@ class FastCIELabCvt {
int ciea = 500 * (fx - fy) + (128 << srgb_shift); // to positive integer
int cieb = 200 * (fy - fz) + (128 << srgb_shift); // to positive integer

l = (uint16_t)((unsigned)ciel >> (srgb_shift - output_shift));
a = (uint16_t)((unsigned)ciea >> (srgb_shift - output_shift));
b = (uint16_t)((unsigned)cieb >> (srgb_shift - output_shift));

l = clamp<int>((unsigned)ciel >> (srgb_shift - output_shift), 0, 255);
a = clamp<int>(((unsigned)ciea >> (srgb_shift - output_shift)) - (64 << output_shift), 0, 255);
b = clamp<int>(((unsigned)cieb >> (srgb_shift - output_shift)) - (64 << output_shift), 0, 255);
}

private:
Expand All @@ -331,7 +334,7 @@ class FastCIELabCvt {

static FastCIELabCvt fast_cielab_cvt;

static void rgb_to_cielab(const uint8_t* image, int H, int W, simd_helper::AlignedArray<uint16_t> &arr, int &shift_out) {
static void rgb_to_cielab(const uint8_t* image, int H, int W, simd_helper::AlignedArray<uint8_t> &arr, int &shift_out) {
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
Expand Down
4 changes: 2 additions & 2 deletions src/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,7 @@ namespace fslic {

for (int i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
if (!valid_subsample_row(i)) continue;
const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
uint16_t* __restrict assignment_row = assignment.get_row(i, cluster_x - S);
DistType* __restrict min_dist_row = min_dists.get_row(i, cluster_x - S);
const DistType* __restrict patch_row = spatial_dist_patch.get_row(i_off);
Expand Down Expand Up @@ -404,7 +404,7 @@ namespace fslic {

for (int16_t i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
if (!valid_subsample_row(i)) continue;
const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
uint16_t* __restrict assignment_row = assignment.get_row(i, cluster_x - S);
float* __restrict min_dist_row = min_dists.get_row(i, cluster_x - S);
const float* __restrict patch_row = spatial_dist_patch.get_row(i_off);
Expand Down
2 changes: 1 addition & 1 deletion src/context.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ namespace fslic {
protected:
int color_shift;
protected:
simd_helper::AlignedArray<uint16_t> quad_image;
simd_helper::AlignedArray<uint8_t> quad_image;
simd_helper::AlignedArray<uint16_t> assignment;
simd_helper::AlignedArray<DistType> min_dists;
simd_helper::AlignedArray<DistType> spatial_dist_patch;
Expand Down
68 changes: 34 additions & 34 deletions src/lsc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ namespace fslic {
}

ContextLSC::~ContextLSC() {
if (uint16_memory_pool) delete [] uint16_memory_pool;
if (uint8_memory_pool) delete [] uint8_memory_pool;
if (float_memory_pool) delete [] float_memory_pool;
}

Expand All @@ -34,14 +34,14 @@ namespace fslic {
{
fstimer::Scope s("image_alloc");

if (uint16_memory_pool) delete [] uint16_memory_pool;
uint16_memory_pool = new uint16_t[3 * aligned_len];
if (float_memory_pool) delete [] uint16_memory_pool;
if (uint8_memory_pool) delete [] uint8_memory_pool;
uint8_memory_pool = new uint8_t[3 * aligned_len];
if (float_memory_pool) delete [] uint8_memory_pool;
float_memory_pool = new float[11 * aligned_len + 10 * aligned_K];

image_planes[0] = &uint16_memory_pool[0];
image_planes[1] = &uint16_memory_pool[aligned_len];
image_planes[2] = &uint16_memory_pool[2 * aligned_len];
image_planes[0] = &uint8_memory_pool[0];
image_planes[1] = &uint8_memory_pool[aligned_len];
image_planes[2] = &uint8_memory_pool[2 * aligned_len];
for (int i = 0; i < 10; i++) {
image_features[i] = &float_memory_pool[i * aligned_len];
centroid_features[i] = &float_memory_pool[11 * aligned_len + i * aligned_K];
Expand All @@ -53,7 +53,7 @@ namespace fslic {
fstimer::Scope s("image_copy");
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < H; i++) {
const uint16_t* image_row = quad_image.get_row(i);
const uint8_t* image_row = quad_image.get_row(i);
for (int j = 0; j < W; j++) {
int index = i * W + j;
image_planes[0][index] = image_row[4 * j];
Expand All @@ -67,10 +67,27 @@ namespace fslic {
fstimer::Scope s("feature_map");

// l1, l2, a1, a2, b1, b2
float color_sine_map[256];
float color_cosine_map[256];
float L_sine_map[256];
float L_cosine_map[256];
std::vector<float> width_cosine_map(W);
std::vector<float> width_sine_map(W);
std::vector<float> height_cosine_map(H);
std::vector<float> height_sine_map(H);
for (int X = 0; X < 256; X++) {
float theta = halfPI * (X / 255.0f);
float cosine = cos(theta), sine = sin(theta);
color_cosine_map[X] = C_color * cosine * 2.55f;
color_sine_map[X] = C_color * sine * 2.55f;
}

for (int X = 0; X < 256; X++) {
float theta = halfPI * (X / 255.0f);
L_cosine_map[X] = C_color * cos(theta);
L_sine_map[X] = C_color * sin(theta);
}

for (int i = 0; i < H; i++) {
float theta = i * (halfPI / S);
height_cosine_map[i] = C_spatial * cos(theta);
Expand All @@ -83,33 +100,17 @@ namespace fslic {
width_sine_map[i] = C_spatial * sin(theta);
}

const uint16_t* __restrict L = &image_planes[0][0];
const uint16_t* __restrict A = &image_planes[1][0];
const uint16_t* __restrict B = &image_planes[2][0];
const uint8_t* __restrict L = &image_planes[0][0];
const uint8_t* __restrict A = &image_planes[1][0];
const uint8_t* __restrict B = &image_planes[2][0];
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < len; i++) {
{
float X = L[i] / (float)(1 << color_shift);
float theta = halfPI * (X / 100.0f);
float cosine = cos(theta), sine = sin(theta);
image_features[0][i] = C_color * cosine;
image_features[1][i] = C_color * sine;
}

{
float X = A[i] / (float)(1 << color_shift);
float theta = halfPI * (X / 255.0f);
float cosine = cos(theta), sine = sin(theta);
image_features[2][i] = C_color * cosine;
image_features[3][i] = C_color * sine;
}
{
float X = B[i] / (float)(1 << color_shift);
float theta = halfPI * (X / 255.0f);
float cosine = cos(theta), sine = sin(theta);
image_features[4][i] = C_color * cosine;
image_features[5][i] = C_color * sine;
}
image_features[0][i] = L_cosine_map[L[i]];
image_features[1][i] = L_sine_map[L[i]];
image_features[2][i] = color_cosine_map[A[i]];
image_features[3][i] = color_sine_map[A[i]];
image_features[4][i] = color_cosine_map[B[i]];
image_features[5][i] = color_sine_map[B[i]];
}
// x1, x2, y1, y2

Expand Down Expand Up @@ -305,7 +306,6 @@ namespace fslic {
delete [] wsums;
}


void ContextLSC::normalize_features(float *__restrict numers[10], float* __restrict weights, int size) {
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < size; i++) {
Expand Down
4 changes: 2 additions & 2 deletions src/lsc.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ namespace fslic {
protected:
float C_color = 20;
float* float_memory_pool = nullptr;
uint16_t* uint16_memory_pool = nullptr;
uint16_t* __restrict image_planes[3]; // L, a, b plane (H x W)
uint8_t* uint8_memory_pool = nullptr;
uint8_t* __restrict image_planes[3]; // L, a, b plane (H x W)
float* __restrict image_features[10]; // l1, l2, a1, a2, b1, b2, x1, x2, y1, y2
float* __restrict image_weights;
float* __restrict centroid_features[10]; // l1, l2, a1, a2, b1, b2, x1, x2, y1, y2
Expand Down

0 comments on commit 7dbaca7

Please sign in to comment.