Skip to content

Commit

Permalink
Optimize unaligned tail
Browse files Browse the repository at this point in the history
  • Loading branch information
Algy committed Dec 11, 2019
1 parent 128c7fd commit 258b641
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 17 deletions.
50 changes: 39 additions & 11 deletions src/arch/arm/neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,17 +110,45 @@ namespace fslic {
vst1q_u16(assignment_row, new_assignment);
}

// Tail handling via scratch arrays: runs only when patch_width is larger than
// patch_width_multiple8, i.e. when some columns remain past that index.
// NOTE(review): the hunk header reports 11 deletions for this file, which
// matches these 11 lines -- this appears to be the pre-change code that the
// commit removes, rendered here without '-' markers.
if (0 < patch_width - patch_width_multiple8) {
// Scratch buffers that receive all 8 lanes of each result vector.
uint16_t new_min_dists[8], new_assignments[8];
int j = patch_width_multiple8;
// Macro defined outside this view; presumably reads `j` and produces
// new_min_dist / new_assignment -- TODO confirm against its definition.
ASSIGNMENT_VALUE_GETTER_BODY
vst1q_u16(new_min_dists, new_min_dist);
vst1q_u16(new_assignments, new_assignment);
// Copy only the leftover lanes from the scratch buffers to the output rows.
for (int delta = 0; delta < patch_width - patch_width_multiple8; delta++) {
min_dist_row[delta] = new_min_dists[delta];
assignment_row[delta] = new_assignments[delta];
}
}
// Tail handling without a scratch array: writes the `rem` columns that remain
// past patch_width_multiple8 straight into the output rows.
//   - if at least 4 columns remain, the low half of each result vector is
//     stored with one 64-bit store and the high half becomes the source for
//     the remaining 0..3 columns;
//   - otherwise the low half is the source for the 1..3 remaining columns.
// NOTE(review): assumes rem < 8 (patch_width_multiple8 being patch_width
// rounded down to a multiple of 8) -- confirm against the caller.
if (0 < patch_width - patch_width_multiple8) {
    int j = patch_width_multiple8;
    int rem = patch_width - patch_width_multiple8;
    // Macro defined outside this view; presumably reads `j` and produces
    // new_min_dist__narrow / new_assignment__narrow -- TODO confirm.
    ASSIGNMENT_VALUE_GETTER_BODY

    uint16x4_t dist_4, assignment_4;
    if (rem >= 4) {
        // vget_low_u16 yields a uint16x4_t, so the matching store is the
        // 64-bit vst1_u16. vst1q_u16 expects a uint16x8_t and would also
        // write 8 lanes, i.e. past the end of the tail.
        vst1_u16(&min_dist_base_row[j], vget_low_u16(new_min_dist__narrow));
        vst1_u16(&assignment_base_row[j], vget_low_u16(new_assignment__narrow));
        rem -= 4;
        j += 4;
        dist_4 = vget_high_u16(new_min_dist__narrow);
        assignment_4 = vget_high_u16(new_assignment__narrow);
    } else {
        dist_4 = vget_low_u16(new_min_dist__narrow);
        assignment_4 = vget_low_u16(new_assignment__narrow);
    }

    // Store the last 0..3 columns lane by lane; rem == 0 writes nothing.
    switch (rem) {
    case 3:
        min_dist_base_row[j] = dist_4[0];
        assignment_base_row[j] = assignment_4[0];
        min_dist_base_row[j+1] = dist_4[1];
        assignment_base_row[j+1] = assignment_4[1];
        min_dist_base_row[j+2] = dist_4[2];
        assignment_base_row[j+2] = assignment_4[2];
        break;
    case 2:
        min_dist_base_row[j] = dist_4[0];
        assignment_base_row[j] = assignment_4[0];
        min_dist_base_row[j+1] = dist_4[1];
        assignment_base_row[j+1] = assignment_4[1];
        break;
    case 1:
        min_dist_base_row[j] = dist_4[0];
        assignment_base_row[j] = assignment_4[0];
        break;
    }
}
}
}
}
Expand Down
35 changes: 29 additions & 6 deletions src/arch/x64/avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,15 +145,38 @@ namespace fslic {
}

// Tail handling for the columns that remain past patch_width_multiple8.
// NOTE(review): this hunk is shown without '+'/'-' markers. The hunk header
// reports 6 deletions, which matches the scratch-array lines below (the two
// array declarations, the two _mm_storeu_si128 stores, and the three lines of
// the `for` copy loop); they appear to be the pre-change code interleaved
// with the new direct-store code. As rendered, the `for` loop has no closing
// brace of its own before the new code starts -- do not read this listing as
// the final source.
if (0 < patch_width - patch_width_multiple8) {
// (pre-change) scratch buffers receiving all 8 lanes of each result vector.
uint16_t new_min_dists[8], new_assignments[8];
int j = patch_width_multiple8;
int rem = patch_width - patch_width_multiple8;
// Macro defined outside this view; presumably reads `j` and produces
// new_min_dist__narrow / new_assignment__narrow -- TODO confirm.
ASSIGNMENT_VALUE_GETTER_BODY
// (pre-change) unaligned 128-bit stores of both result vectors to scratch.
_mm_storeu_si128((__m128i*)new_min_dists, new_min_dist__narrow);
_mm_storeu_si128((__m128i*)new_assignments, new_assignment__narrow);

// (pre-change) copy only the leftover lanes from scratch to the output rows.
for (int delta = 0; delta < patch_width - patch_width_multiple8; delta++) {
min_dist_row[delta] = new_min_dists[delta];
assignment_row[delta] = new_assignments[delta];
// (post-change) dist_4 / assignment_4 hold the 64-bit half of each result
// vector that still has to be written out piecewise by the switch below.
uint64_t dist_4, assignment_4;
if (rem >= 4) {
// Write the low 64 bits of each vector with a single store, then continue
// with the high 64 bits for the remaining rem - 4 columns.
// NOTE(review): these stores go through a uint64_t* cast of the row pointer;
// if the rows are uint16_t arrays this is type punning with no alignment
// guarantee -- consider _mm_storel_epi64 or memcpy. Confirm the element type.
*(uint64_t *)&min_dist_base_row[j] = _mm_extract_epi64(new_min_dist__narrow, 0);
*(uint64_t *)&assignment_base_row[j] = _mm_extract_epi64(new_assignment__narrow, 0);
rem -= 4;
j += 4;
dist_4 = _mm_extract_epi64(new_min_dist__narrow, 1);
assignment_4 = _mm_extract_epi64(new_assignment__narrow, 1);
} else {
// Fewer than 4 columns remain: the low 64 bits are the piecewise source.
dist_4 = _mm_extract_epi64(new_min_dist__narrow, 0);
assignment_4 = _mm_extract_epi64(new_assignment__narrow, 0);
}

// Write the last 0..3 columns; rem == 0 matches no case and writes nothing.
switch (rem) {
case 3:
// Low 32 bits with one store, then bits 32..47 as a single 16-bit store.
*(uint32_t *)&min_dist_base_row[j] = (uint32_t)dist_4;
*(uint32_t *)&assignment_base_row[j] = (uint32_t)assignment_4;
*(uint16_t *)&min_dist_base_row[j + 2] = (uint16_t)(dist_4 >> 32);
*(uint16_t *)&assignment_base_row[j + 2] = (uint16_t)(assignment_4 >> 32);
break;
case 2:
// Low 32 bits with one store.
*(uint32_t *)&min_dist_base_row[j] = (uint32_t)dist_4;
*(uint32_t *)&assignment_base_row[j] = (uint32_t)assignment_4;
break;
case 1:
// Low 16 bits only.
*(uint16_t *)&min_dist_base_row[j] = (uint16_t)dist_4;
*(uint16_t *)&assignment_base_row[j] = (uint16_t)assignment_4;
break;
}
}
}
Expand Down

0 comments on commit 258b641

Please sign in to comment.