From 258b641f87d5b70655b4e741de821dd73807b0d6 Mon Sep 17 00:00:00 2001
From: Algy
Date: Wed, 11 Dec 2019 22:33:09 +0900
Subject: [PATCH] Optimize unaligned tail

---
Notes (ignored by git am; not part of the commit message):

The tail branch that handles the last
(patch_width - patch_width_multiple8) elements of a row no longer spills
the result vectors into temporary 8-element arrays and copies them in a
loop. When at least four elements remain, the low four lanes are stored
directly and the high four lanes become the source for the rest; the
remaining 1-3 elements are then written by a switch on the remainder.

NEON: the four-lane stores use vst1_u16. vget_low_u16 yields a
uint16x4_t, whereas vst1q_u16 expects a uint16x8_t and would write
eight lanes.

 src/arch/arm/neon.h | 50 +++++++++++++++++++++++++++++++++++----------
 src/arch/x64/avx2.h | 35 +++++++++++++++++++++++++------
 2 files changed, 68 insertions(+), 17 deletions(-)

diff --git a/src/arch/arm/neon.h b/src/arch/arm/neon.h
index 6936a3a..61d5091 100644
--- a/src/arch/arm/neon.h
+++ b/src/arch/arm/neon.h
@@ -110,17 +110,45 @@ namespace fslic {
                         vst1q_u16(assignment_row, new_assignment);
                     }
 
-                    if (0 < patch_width - patch_width_multiple8) {
-                        uint16_t new_min_dists[8], new_assignments[8];
-                        int j = patch_width_multiple8;
-                        ASSIGNMENT_VALUE_GETTER_BODY
-                        vst1q_u16(new_min_dists, new_min_dist);
-                        vst1q_u16(new_assignments, new_assignment);
-                        for (int delta = 0; delta < patch_width - patch_width_multiple8; delta++) {
-                            min_dist_row[delta] = new_min_dists[delta];
-                            assignment_row[delta] = new_assignments[delta];
-                        }
-                    }
+                    if (0 < patch_width - patch_width_multiple8) {
+                        int j = patch_width_multiple8;
+                        int rem = patch_width - patch_width_multiple8;
+                        ASSIGNMENT_VALUE_GETTER_BODY
+
+                        uint16x4_t dist_4, assignment_4;
+                        if (rem >= 4) {
+                            vst1_u16(&min_dist_base_row[j], vget_low_u16(new_min_dist__narrow));
+                            vst1_u16(&assignment_base_row[j], vget_low_u16(new_assignment__narrow));
+                            rem -= 4;
+                            j += 4;
+                            dist_4 = vget_high_u16(new_min_dist__narrow);
+                            assignment_4 = vget_high_u16(new_assignment__narrow);
+                        } else {
+                            dist_4 = vget_low_u16(new_min_dist__narrow);
+                            assignment_4 = vget_low_u16(new_assignment__narrow);
+                        }
+
+                        switch (rem) {
+                        case 3:
+                            min_dist_base_row[j] = dist_4[0];
+                            assignment_base_row[j] = assignment_4[0];
+                            min_dist_base_row[j+1] = dist_4[1];
+                            assignment_base_row[j+1] = assignment_4[1];
+                            min_dist_base_row[j+2] = dist_4[2];
+                            assignment_base_row[j+2] = assignment_4[2];
+                            break;
+                        case 2:
+                            min_dist_base_row[j] = dist_4[0];
+                            assignment_base_row[j] = assignment_4[0];
+                            min_dist_base_row[j+1] = dist_4[1];
+                            assignment_base_row[j+1] = assignment_4[1];
+                            break;
+                        case 1:
+                            min_dist_base_row[j] = dist_4[0];
+                            assignment_base_row[j] = assignment_4[0];
+                            break;
+                        }
+                    }
                 }
             }
         }
diff --git a/src/arch/x64/avx2.h b/src/arch/x64/avx2.h
index 74fcbaf..f465217 100644
--- a/src/arch/x64/avx2.h
+++ b/src/arch/x64/avx2.h
@@ -145,15 +145,38 @@ namespace fslic {
                     }
 
                     if (0 < patch_width - patch_width_multiple8) {
-                        uint16_t new_min_dists[8], new_assignments[8];
                         int j = patch_width_multiple8;
+                        int rem = patch_width - patch_width_multiple8;
                         ASSIGNMENT_VALUE_GETTER_BODY
 
-                        _mm_storeu_si128((__m128i*)new_min_dists, new_min_dist__narrow);
-                        _mm_storeu_si128((__m128i*)new_assignments, new_assignment__narrow);
-                        for (int delta = 0; delta < patch_width - patch_width_multiple8; delta++) {
-                            min_dist_row[delta] = new_min_dists[delta];
-                            assignment_row[delta] = new_assignments[delta];
+                        uint64_t dist_4, assignment_4;
+                        if (rem >= 4) {
+                            *(uint64_t *)&min_dist_base_row[j] = _mm_extract_epi64(new_min_dist__narrow, 0);
+                            *(uint64_t *)&assignment_base_row[j] = _mm_extract_epi64(new_assignment__narrow, 0);
+                            rem -= 4;
+                            j += 4;
+                            dist_4 = _mm_extract_epi64(new_min_dist__narrow, 1);
+                            assignment_4 = _mm_extract_epi64(new_assignment__narrow, 1);
+                        } else {
+                            dist_4 = _mm_extract_epi64(new_min_dist__narrow, 0);
+                            assignment_4 = _mm_extract_epi64(new_assignment__narrow, 0);
+                        }
+
+                        switch (rem) {
+                        case 3:
+                            *(uint32_t *)&min_dist_base_row[j] = (uint32_t)dist_4;
+                            *(uint32_t *)&assignment_base_row[j] = (uint32_t)assignment_4;
+                            *(uint16_t *)&min_dist_base_row[j + 2] = (uint16_t)(dist_4 >> 32);
+                            *(uint16_t *)&assignment_base_row[j + 2] = (uint16_t)(assignment_4 >> 32);
+                            break;
+                        case 2:
+                            *(uint32_t *)&min_dist_base_row[j] = (uint32_t)dist_4;
+                            *(uint32_t *)&assignment_base_row[j] = (uint32_t)assignment_4;
+                            break;
+                        case 1:
+                            *(uint16_t *)&min_dist_base_row[j] = (uint16_t)dist_4;
+                            *(uint16_t *)&assignment_base_row[j] = (uint16_t)assignment_4;
+                            break;
                         }
                     }
                 }