Skip to content

Commit

Permalink
Color shift
Browse files Browse the repository at this point in the history
  • Loading branch information
Algy committed Aug 31, 2019
1 parent e1a877c commit 26c4f4e
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 131 deletions.
47 changes: 20 additions & 27 deletions src/arch/arm/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,22 +6,23 @@

inline void get_assignment_value_vec(
const Cluster* cluster,
const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
const uint16_t* min_dist_row, const uint16_t* assignment_row,
uint16x8_t cluster_number_vec, uint8x16_t cluster_color_vec,
uint16x8_t cluster_number_vec, uint16x8_t cluster_color_vec,
uint16x8_t& new_min_dist, uint16x8_t& new_assignment
) {
uint16x8_t spatial_dist_vec = vld1q_u16(spatial_dist_patch_row);
uint8x16_t image_segment = vld1q_u8(img_quad_row);
uint8x16_t image_segment_2 = vld1q_u8(img_quad_row + 16);
uint16x8_t image_segment_1 = vld1q_u16(img_quad_row);
uint16x8_t image_segment_2 = vld1q_u16(img_quad_row + 8);
uint16x8_t image_segment_3 = vld1q_u16(img_quad_row + 16);
uint16x8_t image_segment_4 = vld1q_u16(img_quad_row + 24);

uint8x16_t abs_segment = vabdq_u8(image_segment, cluster_color_vec);
uint8x16_t abs_segment_2 = vabdq_u8(image_segment_2, cluster_color_vec);
uint16x8_t abs_segment_1 = vabdq_u8(image_segment_1, cluster_color_vec);
uint16x8_t abs_segment_2 = vabdq_u8(image_segment_2, cluster_color_vec);
uint16x8_t abs_segment_3 = vabdq_u8(image_segment_3, cluster_color_vec);
uint16x8_t abs_segment_4 = vabdq_u8(image_segment_4, cluster_color_vec);

uint32x4_t sad_segment = vpaddlq_u16(vpaddlq_u8(abs_segment));
uint32x4_t sad_segment_2 = vpaddlq_u16(vpaddlq_u8(abs_segment_2));

uint16x8_t color_dist_vec = vcombine_u16(vmovn_u32(sad_segment), vmovn_u32(sad_segment_2));
uint16x8_t color_dist_vec = vpaddq_u16(vpaddq_u16(abs_segment_1, abs_segment_2), vpaddq_u16(abs_segment_3, abs_segment_4));

uint16x8_t dist_vec = vaddq_u16(color_dist_vec, spatial_dist_vec);
uint16x8_t old_assignment = vld1q_u16(assignment_row);
Expand Down Expand Up @@ -60,36 +61,28 @@ namespace fslic {
cluster_number
};

uint8x16_t cluster_color_vec = {
(uint8_t)cluster->r,
(uint8_t)cluster->g,
(uint8_t)cluster->b,
0,
(uint8_t)cluster->r,
(uint8_t)cluster->g,
(uint8_t)cluster->b,
0,
(uint8_t)cluster->r,
(uint8_t)cluster->g,
(uint8_t)cluster->b,
uint16x8_t cluster_color_vec = {
(uint16_t)cluster->r,
(uint16_t)cluster->g,
(uint16_t)cluster->b,
0,
(uint8_t)cluster->r,
(uint8_t)cluster->g,
(uint8_t)cluster->b,
(uint16_t)cluster->r,
(uint16_t)cluster->g,
(uint16_t)cluster->b,
0
};
int16_t patch_height = spatial_dist_patch.get_height();
for (int16_t i = fit_to_stride(y_lo) - y_lo; i < patch_height; i += subsample_stride) {
const uint16_t* spatial_dist_patch_base_row = spatial_dist_patch.get_row(i);
const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);

#define ASSIGNMENT_VALUE_GETTER_BODY \
uint16x8_t new_min_dist, new_assignment; \
uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
uint16_t* assignment_row = assignment_base_row + j; /* unaligned */ \
const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
get_assignment_value_vec( \
cluster, \
Expand Down
10 changes: 5 additions & 5 deletions src/arch/x64/avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ inline __m256 _mm256_set_ps1(float v) {

inline void get_assignment_value_vec(
const Cluster* cluster,
const uint8_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
const uint16_t* img_quad_row, const uint16_t* spatial_dist_patch_row,
const uint16_t* min_dist_row, const uint16_t* assignment_row,
__m128i cluster_number_vec, __m256i cluster_color_vec,
__m128i order_swap_mask,
Expand All @@ -23,8 +23,8 @@ inline void get_assignment_value_vec(

__m128i spatial_dist_vec = _mm_load_si128((__m128i *)spatial_dist_patch_row);

__m256i image_segment = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)img_quad_row));
__m256i image_segment_2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i*)(img_quad_row + 16)));
__m256i image_segment = _mm256_loadu_si256((__m256i*)img_quad_row);
__m256i image_segment_2 = _mm256_loadu_si256((__m256i*)(img_quad_row + 16));

// [R1, G1, B1, A1, R2, G2, B2, A2, R3, G3, B3, A3, R4, G4, B4, A4]
__m256i abd_segment = _mm256_abs_epi16(_mm256_subs_epi16(image_segment, cluster_color_vec));
Expand Down Expand Up @@ -114,15 +114,15 @@ namespace fslic {
assert((long long)spatial_dist_patch_base_row % 32 == 0);
#endif
// not aligned
const uint8_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
const uint16_t *img_quad_base_row = quad_image.get_row(y_lo + i, 4 * x_lo);
uint16_t* assignment_base_row = assignment.get_row(i + y_lo, x_lo);
uint16_t* min_dist_base_row = min_dists.get_row(i + y_lo, x_lo);

#define ASSIGNMENT_VALUE_GETTER_BODY \
__m128i new_assignment__narrow, new_min_dist__narrow; \
uint16_t* min_dist_row = min_dist_base_row + j; /* unaligned */ \
uint16_t* assignment_row = assignment_base_row + j; /* unaligned */ \
const uint8_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
const uint16_t* img_quad_row = img_quad_base_row + 4 * j; /*Image rows are not aligned due to x_lo*/ \
const uint16_t* spatial_dist_patch_row = (uint16_t *)HINT_ALIGNED_AS(spatial_dist_patch_base_row + j, 16); /* Spatial distance patch is aligned */ \
get_assignment_value_vec( \
cluster, \
Expand Down
52 changes: 19 additions & 33 deletions src/cielab.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <cstdint>
#include <vector>
#include "parallel.h"
#include "simd-helper.hpp"
/*
def get_xyz_nonlin_tbl(a):
v = a / 255.
Expand Down Expand Up @@ -278,6 +279,7 @@ static float _srgb_gamma_tbl[256] = {
// Shift amounts used by FastCIELabCvt::convert().
// srgb_shift: convert() offsets a/b by (128 << srgb_shift) and right-shifts
// its L/a/b results based on this value before storing them.
#define srgb_shift 13
// srgb_max: 1 << srgb_shift.
// NOTE(review): presumably the fixed-point representation of 1.0 -- confirm.
#define srgb_max (1 << srgb_shift)
// lab_shift: right shift applied after the Cb[] weighted sum in convert().
#define lab_shift 16
// output_shift: convert() shifts right by (srgb_shift - output_shift), so the
// stored values keep this many extra low-order bits; rgb_to_cielab() reports
// the value to its caller through shift_out.
#define output_shift 3

class FastCIELabCvt {
public:
Expand All @@ -301,8 +303,7 @@ class FastCIELabCvt {
}


template <bool scale_lab = false>
inline void convert(uint8_t R, uint8_t G, uint8_t B, uint8_t& l, uint8_t& a, uint8_t& b) {
inline void convert(uint8_t R, uint8_t G, uint8_t B, uint16_t& l, uint16_t& a, uint16_t& b) {
int sr = srgb_gamma_tbl[R], sg = srgb_gamma_tbl[G], sb = srgb_gamma_tbl[B];

int xr = (Cb[0] * sr + Cb[1] * sg + Cb[2] * sb) >> lab_shift;
Expand All @@ -315,13 +316,9 @@ class FastCIELabCvt {
int ciea = 500 * (fx - fy) + (128 << srgb_shift); // to positive integer
int cieb = 200 * (fy - fz) + (128 << srgb_shift); // to positive integer

if (scale_lab) {
ciel = ciel * 255 / 100;
}

l = (uint8_t)((unsigned)ciel >> srgb_shift);
a = (uint8_t)((unsigned)ciea >> srgb_shift);
b = (uint8_t)((unsigned)cieb >> srgb_shift);
l = (uint16_t)((unsigned)ciel >> (srgb_shift - output_shift));
a = (uint16_t)((unsigned)ciea >> (srgb_shift - output_shift));
b = (uint16_t)((unsigned)cieb >> (srgb_shift - output_shift));
}

private:
Expand All @@ -334,33 +331,22 @@ class FastCIELabCvt {

static FastCIELabCvt fast_cielab_cvt;

static void rgb_to_cielab(const uint8_t* aligned_quad_image, uint8_t *out, int size, bool scale_L = false) {
if (scale_L) {
#pragma omp parallel for num_threads(fsparallel::nth())
for (int s = 0; s < size; s += 4) {
fast_cielab_cvt.convert<true>(
aligned_quad_image[s],
aligned_quad_image[s+1],
aligned_quad_image[s+2],
out[s],
out[s+1],
out[s+2]
);
}
} else {
#pragma omp parallel for num_threads(fsparallel::nth())
for (int s = 0; s < size; s += 4) {
fast_cielab_cvt.convert<false>(
aligned_quad_image[s],
aligned_quad_image[s+1],
aligned_quad_image[s+2],
out[s],
out[s+1],
out[s+2]
static void rgb_to_cielab(const uint8_t* image, int H, int W, simd_helper::AlignedArray<uint16_t> &arr, int &shift_out) {
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
int index = W * i + j;
fast_cielab_cvt.convert(
image[3 * index],
image[3 * index + 1],
image[3 * index + 2],
arr.get(i, 4 * j),
arr.get(i, 4 * j + 1),
arr.get(i, 4 * j + 2)
);
}

}
shift_out = output_shift;
}

static void rgb_to_cielab_orig(const uint8_t* image, float *out, int size) {
Expand Down
41 changes: 18 additions & 23 deletions src/context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ namespace fslic {
template<typename DistType>
void BaseContext<DistType>::set_spatial_patch() {
float coef = 1.0f / ((float)S / compactness);
coef *= (1 << color_shift);
int16_t S_2 = 2 * S;
if (manhattan_spatial_dist) {
for (int16_t i = 0; i <= S_2; i++) {
Expand Down Expand Up @@ -95,7 +96,6 @@ namespace fslic {

// Initializes context state. The only step visible in this body is rebuilding
// the spatial distance patch via set_spatial_patch().
template<typename DistType>
void BaseContext<DistType>::initialize_state() {
set_spatial_patch();
}

template<typename DistType>
Expand All @@ -109,32 +109,30 @@ namespace fslic {
fsparallel::Scope parallel_scope(num_threads);
fstimer::Scope s("iterate");
{
fstimer::Scope s("write_to_buffer");
#pragma omp parallel num_threads(fsparallel::nth())
{
#pragma omp for
fstimer::Scope s("cielab_conversion");
if (convert_to_lab) {
rgb_to_cielab(image, H, W, quad_image, color_shift);
} else {
#pragma omp parallel num_threads(fsparallel::nth())
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
for (int k = 0; k < 3; k++) {
quad_image.get(i, 4 * j + k) = image[i * W * 3 + 3 * j + k];
}
}
}

#pragma omp for
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
this->assignment.get(i, j) = 0xFFFF;
}
}
color_shift = 0;
}
}

{
fstimer::Scope s("cielab_conversion");
if (convert_to_lab) {
rgb_to_lab(&quad_image.get(0, 0), quad_image.contiguous_memory_size());
fstimer::Scope s("write_to_buffer");
#pragma omp parallel for num_threads(fsparallel::nth())
for (int i = 0; i < H; i++) {
for (int j = 0; j < W; j++) {
this->assignment.get(i, j) = 0xFFFF;
}
}
set_spatial_patch();
}

subsample_rem = 0;
Expand Down Expand Up @@ -261,7 +259,7 @@ namespace fslic {

for (int i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
if (!valid_subsample_row(i)) continue;
const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
uint16_t* __restrict assignment_row = assignment.get_row(i, cluster_x - S);
DistType* __restrict min_dist_row = min_dists.get_row(i, cluster_x - S);
const DistType* __restrict patch_row = spatial_dist_patch.get_row(i_off);
Expand Down Expand Up @@ -289,11 +287,6 @@ namespace fslic {
delete [] dist_row;
}

// Thin wrapper around rgb_to_cielab(): passes the same buffer as both source
// and destination, so the conversion overwrites the input, and passes false
// for the final (scale_L) argument.
// NOTE(review): `size` looks like an element count of the buffer rather than
// a pixel count -- confirm against callers.
template<typename DistType>
void BaseContext<DistType>::rgb_to_lab(uint8_t *quad_image, int size) {
rgb_to_cielab(quad_image, quad_image, size, false);
}


template<typename DistType>
void BaseContext<DistType>::update() {
Expand Down Expand Up @@ -401,7 +394,7 @@ namespace fslic {

for (int16_t i_off = 0, i = cluster_y - S; i_off <= S_2; i_off++, i++) {
if (!valid_subsample_row(i)) continue;
const uint8_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
const uint16_t* __restrict image_row = quad_image.get_row(i, 4 * (cluster_x - S));
uint16_t* __restrict assignment_row = assignment.get_row(i, cluster_x - S);
float* __restrict min_dist_row = min_dists.get_row(i, cluster_x - S);
const float* __restrict patch_row = spatial_dist_patch.get_row(i_off);
Expand Down Expand Up @@ -431,6 +424,7 @@ namespace fslic {

void ContextRealDistL2::set_spatial_patch() {
float coef = 1.0f / ((float)S / compactness);
coef *= (1 << color_shift);
int16_t S_2 = 2 * S;
for (int16_t i = 0; i <= S_2; i++) {
for (int16_t j = 0; j <= S_2; j++) {
Expand Down Expand Up @@ -474,6 +468,7 @@ namespace fslic {
template<bool use_manhattan, bool use_float_color>
void ContextRealDistNoQ::assign_clusters_proto(const Cluster** target_clusters, int size) {
float coef = 1.0f / ((float)S / compactness);
coef *= (1 << color_shift);

for (int cidx = 0; cidx < size; cidx++) {
const Cluster* cluster = target_clusters[cidx];
Expand Down
6 changes: 4 additions & 2 deletions src/context.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,11 @@ namespace fslic {
protected:
int16_t subsample_rem;
int16_t subsample_stride;

protected:
int color_shift;
protected:
simd_helper::AlignedArray<uint8_t> quad_image;
simd_helper::AlignedArray<uint16_t> quad_image;
simd_helper::AlignedArray<uint16_t> assignment;
simd_helper::AlignedArray<DistType> min_dists;
simd_helper::AlignedArray<DistType> spatial_dist_patch;
Expand Down Expand Up @@ -86,7 +89,6 @@ namespace fslic {
virtual void after_update() {};
virtual void set_spatial_patch();
virtual void assign_clusters(const Cluster **target_clusters, int size);
virtual void rgb_to_lab(uint8_t* quad_image, int size);
virtual bool centroid_quantization_enabled();
};

Expand Down
Loading

0 comments on commit 26c4f4e

Please sign in to comment.