Skip to content

Commit

Permalink
aarch64: Improve scalar mode popcount expansion by using SVE [PR113860]
Browse files Browse the repository at this point in the history
This is similar to the recent improvements to the Advanced SIMD popcount
expansion by using SVE. We can utilize SVE to generate more efficient code for
scalar mode popcount too.

Changes since v1:
* v2: Add a new VNx1BI mode and a new test case for V1DI.
* v3: Abandon VNx1BI changes and add a new variant of aarch64_ptrue_reg.

	PR target/113860

gcc/ChangeLog:

	* config/aarch64/aarch64-protos.h (aarch64_ptrue_reg): New function.
	* config/aarch64/aarch64-simd.md (popcount<mode>2): Update pattern to
	also support V1DI mode.
	* config/aarch64/aarch64.cc (aarch64_ptrue_reg): New function.
	* config/aarch64/aarch64.md (popcount<mode>2): Add TARGET_SVE support.
	* config/aarch64/iterators.md (VDQHSD_V1DI): New mode iterator.
	(SVE_VDQ_I): Add V1DI.
	(bitsize): Likewise.
	(VPRED): Likewise.
	(VEC_POP_MODE): New mode attribute.
	(vec_pop_mode): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/popcnt-sve.c: Update test.
	* gcc.target/aarch64/popcnt11.c: New test.
	* gcc.target/aarch64/popcnt12.c: New test.

Signed-off-by: Pengxuan Zheng <quic_pzheng@quicinc.com>
  • Loading branch information
pzhengqc committed Oct 23, 2024
1 parent 774ad67 commit 9ffcf1f
Show file tree
Hide file tree
Showing 8 changed files with 139 additions and 11 deletions.
1 change: 1 addition & 0 deletions gcc/config/aarch64/aarch64-protos.h
Original file line number Diff line number Diff line change
Expand Up @@ -917,6 +917,7 @@ rtx aarch64_expand_sve_dupq (rtx, machine_mode, rtx);
void aarch64_expand_mov_immediate (rtx, rtx);
rtx aarch64_stack_protect_canary_mem (machine_mode, rtx, aarch64_salt_type);
rtx aarch64_ptrue_reg (machine_mode);
/* Overload taking a predicate mode and a count VL: the returned predicate
   is true only for the leading VL bits (see the definition in
   aarch64.cc).  */
rtx aarch64_ptrue_reg (machine_mode, unsigned int);
rtx aarch64_pfalse_reg (machine_mode);
bool aarch64_sve_same_pred_for_ptest_p (rtx *, rtx *);
void aarch64_emit_sve_pred_move (rtx, rtx, rtx);
Expand Down
15 changes: 12 additions & 3 deletions gcc/config/aarch64/aarch64-simd.md
Original file line number Diff line number Diff line change
Expand Up @@ -3516,19 +3516,28 @@
)

(define_expand "popcount<mode>2"
[(set (match_operand:VDQHSD 0 "register_operand")
(popcount:VDQHSD (match_operand:VDQHSD 1 "register_operand")))]
[(set (match_operand:VDQHSD_V1DI 0 "register_operand")
(popcount:VDQHSD_V1DI
(match_operand:VDQHSD_V1DI 1 "register_operand")))]
"TARGET_SIMD"
{
if (TARGET_SVE)
{
rtx p = aarch64_ptrue_reg (<VPRED>mode);
rtx p = aarch64_ptrue_reg (<VPRED>mode, <bitsize> == 64 ? 8 : 16);
emit_insn (gen_aarch64_pred_popcount<mode> (operands[0],
p,
operands[1]));
DONE;
}

if (<MODE>mode == V1DImode)
{
rtx out = gen_reg_rtx (DImode);
emit_insn (gen_popcountdi2 (out, gen_lowpart (DImode, operands[1])));
emit_move_insn (operands[0], gen_lowpart (<MODE>mode, out));
DONE;
}

/* Generate a byte popcount. */
machine_mode mode = <bitsize> == 64 ? V8QImode : V16QImode;
machine_mode mode2 = <bitsize> == 64 ? V2SImode : V4SImode;
Expand Down
21 changes: 21 additions & 0 deletions gcc/config/aarch64/aarch64.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3630,6 +3630,27 @@ aarch64_ptrue_reg (machine_mode mode)
return gen_lowpart (mode, reg);
}

/* Return a predicate register of mode MODE whose leading VL bits are set
   and whose following bits are clear.  MODE must be an SVE predicate mode;
   this is asserted.

   The constant is always built in VNx16BImode (one bit per byte lane) and
   then viewed in MODE via gen_lowpart, so VL counts byte-granular
   predicate bits, e.g. 8 for a 64-bit vector and 16 for a 128-bit one.

   The builder is given VL patterns of two elements each: VL ones are
   pushed first, then VL zeros.  NOTE(review): the trailing zeros are
   presumably what the builder's encoding extends over the rest of the
   predicate -- confirm against vector-builder.h.  */

rtx
aarch64_ptrue_reg (machine_mode mode, unsigned int vl)
{
  gcc_assert (aarch64_sve_pred_mode_p (mode));

  rtx_vector_builder builder (VNx16BImode, vl, 2);

  /* VL is unsigned, so the induction variables must be unsigned too;
     a signed counter trips -Wsign-compare, which is fatal when the
     compiler is built with -Werror.  */
  for (unsigned int i = 0; i < vl; i++)
    builder.quick_push (CONST1_RTX (BImode));

  for (unsigned int i = 0; i < vl; i++)
    builder.quick_push (CONST0_RTX (BImode));

  rtx const_vec = builder.build ();
  rtx reg = force_reg (VNx16BImode, const_vec);
  return gen_lowpart (mode, reg);
}

/* Return an all-false predicate register of mode MODE. */

rtx
Expand Down
9 changes: 9 additions & 0 deletions gcc/config/aarch64/aarch64.md
Original file line number Diff line number Diff line change
Expand Up @@ -5345,6 +5345,15 @@
(popcount:ALLI (match_operand:ALLI 1 "register_operand")))]
"TARGET_CSSC ? GET_MODE_BITSIZE (<MODE>mode) >= 32 : TARGET_SIMD"
{
if (!TARGET_CSSC && TARGET_SVE && <MODE>mode != QImode)
{
rtx tmp = gen_reg_rtx (<VEC_POP_MODE>mode);
rtx op1 = gen_lowpart (<VEC_POP_MODE>mode, operands[1]);
emit_insn (gen_popcount<vec_pop_mode>2 (tmp, op1));
emit_move_insn (operands[0], gen_lowpart (<MODE>mode, tmp));
DONE;
}

if (!TARGET_CSSC)
{
rtx v = gen_reg_rtx (V8QImode);
Expand Down
16 changes: 13 additions & 3 deletions gcc/config/aarch64/iterators.md
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,8 @@
;; Advanced SIMD modes for H, S and D types.
(define_mode_iterator VDQHSD [V4HI V8HI V2SI V4SI V2DI])

(define_mode_iterator VDQHSD_V1DI [VDQHSD V1DI])

;; Advanced SIMD and scalar integer modes for H and S.
(define_mode_iterator VSDQ_HSI [V4HI V8HI V2SI V4SI HI SI])

Expand Down Expand Up @@ -559,7 +561,7 @@
(define_mode_iterator SVE_I_SIMD_DI [SVE_I V2DI])

;; All SVE and Advanced SIMD integer vector modes.
(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I])
(define_mode_iterator SVE_VDQ_I [SVE_I VDQ_I V1DI])

;; SVE integer vector modes whose elements are 16 bits or wider.
(define_mode_iterator SVE_HSDI [VNx8HI VNx4HI VNx2HI
Expand Down Expand Up @@ -1235,7 +1237,7 @@
(define_mode_attr bitsize [(V8QI "64") (V16QI "128")
(V4HI "64") (V8HI "128")
(V2SI "64") (V4SI "128")
(V2DI "128")])
(V1DI "64") (V2DI "128")])

;; Map a floating point or integer mode to the appropriate register name prefix
(define_mode_attr s [(HF "h") (SF "s") (DF "d") (SI "s") (DI "d")])
Expand Down Expand Up @@ -2297,7 +2299,7 @@
(VNx8DI "VNx2BI") (VNx8DF "VNx2BI")
(V8QI "VNx8BI") (V16QI "VNx16BI")
(V4HI "VNx4BI") (V8HI "VNx8BI") (V2SI "VNx2BI")
(V4SI "VNx4BI") (V2DI "VNx2BI")])
(V4SI "VNx4BI") (V2DI "VNx2BI") (V1DI "VNx2BI")])

;; ...and again in lower case.
(define_mode_attr vpred [(VNx16QI "vnx16bi") (VNx8QI "vnx8bi")
Expand Down Expand Up @@ -2331,6 +2333,14 @@
(VNx4SI "VNx8SI") (VNx4SF "VNx8SF")
(VNx2DI "VNx4DI") (VNx2DF "VNx4DF")])

;; Map each scalar integer mode to the Advanced SIMD vector mode in which
;; its popcount is computed (same element size as the scalar mode).
(define_mode_attr VEC_POP_MODE [(QI "V8QI") (HI "V4HI")
(SI "V2SI") (DI "V1DI")])

;; Lower-case spelling of the scalar-to-vector popcount mode mapping, for
;; use when forming pattern names such as popcount<vec_pop_mode>2.
(define_mode_attr vec_pop_mode [(QI "v8qi") (HI "v4hi")
(SI "v2si") (DI "v1di")])

;; On AArch64 the By element instruction doesn't have a 2S variant.
;; However because the instruction always selects a pair of values
;; The normal 3SAME instruction can be used here instead.
Expand Down
10 changes: 5 additions & 5 deletions gcc/testsuite/gcc.target/aarch64/popcnt-sve.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

/*
** f_v4hi:
** ptrue (p[0-7]).b, all
** ptrue (p[0-7]).b, vl8
** ldr d([0-9]+), \[x0\]
** cnt z\2.h, \1/m, z\2.h
** str d\2, \[x1\]
Expand All @@ -21,7 +21,7 @@ f_v4hi (unsigned short *__restrict b, unsigned short *__restrict d)

/*
** f_v8hi:
** ptrue (p[0-7]).b, all
** ptrue (p[0-7]).b, vl16
** ldr q([0-9]+), \[x0\]
** cnt z\2.h, \1/m, z\2.h
** str q\2, \[x1\]
Expand All @@ -42,7 +42,7 @@ f_v8hi (unsigned short *__restrict b, unsigned short *__restrict d)

/*
** f_v2si:
** ptrue (p[0-7]).b, all
** ptrue (p[0-7]).b, vl8
** ldr d([0-9]+), \[x0\]
** cnt z\2.s, \1/m, z\2.s
** str d\2, \[x1\]
Expand All @@ -57,7 +57,7 @@ f_v2si (unsigned int *__restrict b, unsigned int *__restrict d)

/*
** f_v4si:
** ptrue (p[0-7]).b, all
** ptrue (p[0-7]).b, vl16
** ldr q([0-9]+), \[x0\]
** cnt z\2.s, \1/m, z\2.s
** str q\2, \[x1\]
Expand All @@ -74,7 +74,7 @@ f_v4si (unsigned int *__restrict b, unsigned int *__restrict d)

/*
** f_v2di:
** ptrue (p[0-7]).b, all
** ptrue (p[0-7]).b, vl16
** ldr q([0-9]+), \[x0\]
** cnt z\2.d, \1/m, z\2.d
** str q\2, \[x1\]
Expand Down
58 changes: 58 additions & 0 deletions gcc/testsuite/gcc.target/aarch64/popcnt11.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/* { dg-do compile } */
/* { dg-options "-O2 -march=armv8.2-a+sve" } */
/* { dg-final { check-function-bodies "**" "" "" } } */

/*
** f_qi:
** ldr b([0-9]+), \[x0\]
** cnt v\1.8b, v\1.8b
** smov w0, v\1.b\[0\]
** ret
*/
/* 8-bit case: return the population count of the byte at a[0], using the
   type-generic __builtin_popcountg.  The expected body above uses the
   Advanced SIMD CNT form and no PTRUE.  */
unsigned
f_qi (unsigned char *a)
{
return __builtin_popcountg (a[0]);
}

/*
** f_hi:
** ldr h([0-9]+), \[x0\]
** ptrue (p[0-7]).b, vl8
** cnt z\1.h, \2/m, z\1.h
** smov w0, v\1.h\[0\]
** ret
*/
/* 16-bit case: return the population count of the halfword at a[0], using
   the type-generic __builtin_popcountg.  The expected body above uses a
   predicated CNT on .h elements under a vl8 PTRUE.  */
unsigned
f_hi (unsigned short *a)
{
return __builtin_popcountg (a[0]);
}

/*
** f_si:
** ldr s([0-9]+), \[x0\]
** ptrue (p[0-7]).b, vl8
** cnt z\1.s, \2/m, z\1.s
** umov x0, v\1.d\[0\]
** ret
*/
/* 32-bit case: return the population count of the word at a[0], using the
   type-generic __builtin_popcountg.  The expected body above uses a
   predicated CNT on .s elements under a vl8 PTRUE.  */
unsigned
f_si (unsigned int *a)
{
return __builtin_popcountg (a[0]);
}

/*
** f_di:
** ldr d([0-9]+), \[x0\]
** ptrue (p[0-7])\.b, vl8
** cnt z\1\.d, \2/m, z\1\.d
** fmov x0, d\1
** ret
*/
/* unsigned long case: return the population count of a[0], using the
   type-generic __builtin_popcountg.  The expected body above uses a
   predicated CNT on .d elements under a vl8 PTRUE.  */
unsigned
f_di (unsigned long *a)
{
return __builtin_popcountg (a[0]);
}
20 changes: 20 additions & 0 deletions gcc/testsuite/gcc.target/aarch64/popcnt12.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/* { dg-do compile } */
/* { dg-options "-O2 -fgimple" } */
/* { dg-final { check-function-bodies "**" "" "" } } */

#pragma GCC target "+nosve"

/*
** foo:
** cnt (v[0-9]+\.8b), v0\.8b
** addv b0, \1
** ret
*/
/* GIMPLE front-end test: apply .POPCOUNT directly to a __Uint64x1_t value
   and return the result.  SVE is disabled for this file by the
   "+nosve" target pragma, and the expected body above is the CNT + ADDV
   sequence.  */
__Uint64x1_t __GIMPLE
foo (__Uint64x1_t x)
{
__Uint64x1_t z;

z = .POPCOUNT (x);
return z;
}

0 comments on commit 9ffcf1f

Please sign in to comment.