AArch64: Remove AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
This patch removes the AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS tunable and the
use_new_vector_costs entry in aarch64-tuning-flags.def and makes the
AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS paths in the backend the
default. To that end, the function aarch64_use_new_vector_costs_p and its uses
were removed. To prevent costing vec_to_scalar operations with 0, as
described in
https://gcc.gnu.org/pipermail/gcc-patches/2024-October/665481.html,
we adjusted vectorizable_store such that the variable n_adjacent_stores
also covers vec_to_scalar operations. This way, vec_to_scalar operations
are costed as a group rather than individually.
As suggested by Richard Sandiford, the "known_ne" in the multi-lane check
was replaced by "maybe_ne" in order to treat nunits == 1+1X as a vector
rather than a scalar.
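
For context, with a variable-length vector type the number of lanes is a
poly_int such as 1+1X, which equals 1 only when the runtime term X is zero:
known_ne (nunits, 1U) is therefore false, while maybe_ne (nunits, 1U) is
true, so the maybe_ne check classifies such a type as a vector. The following
self-contained toy model (an illustration of the poly_int semantics only, not
GCC's poly-int.h implementation) shows the difference between the two
predicates:

#include <stdbool.h>
#include <stdio.h>

/* Toy model of a poly_int-style value c0 + c1 * X, where X is the
   runtime vector-length parameter (a non-negative integer).  */
struct poly { long c0, c1; };

/* known_ne: the value differs from N for *every* possible X.  */
static bool known_ne (struct poly p, long n)
{
  if (p.c1 == 0)
    return p.c0 != n;
  long d = n - p.c0;
  return !(d >= 0 && d % p.c1 == 0);	/* equal for some X >= 0?  */
}

/* maybe_ne: the value differs from N for at least one possible X.  */
static bool maybe_ne (struct poly p, long n)
{
  if (p.c1 == 0)
    return p.c0 != n;
  return true;	/* a value that grows with X cannot equal N for all X.  */
}

int main (void)
{
  struct poly nunits = { 1, 1 };	/* nunits == 1+1X  */
  printf ("known_ne (1+1X, 1) = %d\n", known_ne (nunits, 1));	/* 0 */
  printf ("maybe_ne (1+1X, 1) = %d\n", maybe_ne (nunits, 1));	/* 1 */
  return 0;
}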

Two tests were adjusted due to changes in codegen. In both cases, the
old code unrolled the vectorized loop once (processing two vectors per
iteration), whereas the new code does not:
Example from gcc.target/aarch64/sve/strided_load_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):
f_int64_t_32:
        cbz     w3, .L92
        mov     x4, 0
        uxtw    x3, w3
+       cntd    x5
+       whilelo p7.d, xzr, x3
+       mov     z29.s, w5
        mov     z31.s, w2
-       whilelo p6.d, xzr, x3
-       mov     x2, x3
-       index   z30.s, #0, #1
-       uqdecd  x2
-       ptrue   p5.b, all
-       whilelo p7.d, xzr, x2
+       index   z30.d, #0, #1
+       ptrue   p6.b, all
        .p2align 3,,7
 .L94:
-       ld1d    z27.d, p7/z, [x0, #1, mul vl]
-       ld1d    z28.d, p6/z, [x0]
-       movprfx z29, z31
-       mul     z29.s, p5/m, z29.s, z30.s
-       incw    x4
-       uunpklo z0.d, z29.s
-       uunpkhi z29.d, z29.s
-       ld1d    z25.d, p6/z, [x1, z0.d, lsl 3]
-       ld1d    z26.d, p7/z, [x1, z29.d, lsl 3]
-       add     z25.d, z28.d, z25.d
+       ld1d    z27.d, p7/z, [x0, x4, lsl 3]
+       movprfx z28, z31
+       mul     z28.s, p6/m, z28.s, z30.s
+       ld1d    z26.d, p7/z, [x1, z28.d, uxtw 3]
        add     z26.d, z27.d, z26.d
-       st1d    z26.d, p7, [x0, #1, mul vl]
-       whilelo p7.d, x4, x2
-       st1d    z25.d, p6, [x0]
-       incw    z30.s
-       incb    x0, all, mul #2
-       whilelo p6.d, x4, x3
+       st1d    z26.d, p7, [x0, x4, lsl 3]
+       add     z30.s, z30.s, z29.s
+       incd    x4
+       whilelo p7.d, x4, x3
        b.any   .L94
 .L92:
        ret
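
For reference, the loop being vectorized in this test is essentially the
following (a paraphrase of the macro-generated test function; the exact
signature and attributes in the testcase may differ):

#include <stdint.h>

void
f_int64_t_32 (int64_t *restrict dest, int64_t *restrict src,
              int32_t stride, int32_t n)
{
  /* Contiguous accesses to dest, gather loads from src.  */
  for (int32_t i = 0; i < n; ++i)
    dest[i] += src[i * stride];
}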

Example from gcc.target/aarch64/sve/strided_store_2.c (compiled with
-O2 -ftree-vectorize -march=armv8.2-a+sve -mtune=generic -moverride=tune=none):
f_int64_t_32:
        cbz     w3, .L84
-       addvl   x5, x1, #1
        mov     x4, 0
        uxtw    x3, w3
-       mov     z31.s, w2
+       cntd    x5
        whilelo p7.d, xzr, x3
-       mov     x2, x3
-       index   z30.s, #0, #1
-       uqdecd  x2
-       ptrue   p5.b, all
-       whilelo p6.d, xzr, x2
+       mov     z29.s, w5
+       mov     z31.s, w2
+       index   z30.d, #0, #1
+       ptrue   p6.b, all
        .p2align 3,,7
 .L86:
-       ld1d    z28.d, p7/z, [x1, x4, lsl 3]
-       ld1d    z27.d, p6/z, [x5, x4, lsl 3]
-       movprfx z29, z30
-       mul     z29.s, p5/m, z29.s, z31.s
-       add     z28.d, z28.d, #1
-       uunpklo z26.d, z29.s
-       st1d    z28.d, p7, [x0, z26.d, lsl 3]
-       incw    x4
-       uunpkhi z29.d, z29.s
+       ld1d    z27.d, p7/z, [x1, x4, lsl 3]
+       movprfx z28, z30
+       mul     z28.s, p6/m, z28.s, z31.s
        add     z27.d, z27.d, #1
-       whilelo p6.d, x4, x2
-       st1d    z27.d, p7, [x0, z29.d, lsl 3]
-       incw    z30.s
+       st1d    z27.d, p7, [x0, z28.d, uxtw 3]
+       incd    x4
+       add     z30.s, z30.s, z29.s
        whilelo p7.d, x4, x3
        b.any   .L86
 .L84:
	ret
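
Again for reference, the loop in this test is essentially the following
(paraphrased from the macro-generated test function):

#include <stdint.h>

void
f_int64_t_32 (int64_t *restrict dest, int64_t *restrict src,
              int32_t stride, int32_t n)
{
  /* Contiguous loads from src, scatter stores to dest.  */
  for (int32_t i = 0; i < n; ++i)
    dest[i * stride] = src[i] + 1;
}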

The patch was bootstrapped and tested on aarch64-linux-gnu, no
regression.
OK for mainline?

Signed-off-by: Jennifer Schmitz <jschmitz@nvidia.com>

gcc/
	* tree-vect-stmts.cc (vectorizable_store): Extend the use of
	n_adjacent_stores to also cover vec_to_scalar operations.
	* config/aarch64/aarch64-tuning-flags.def: Remove
	use_new_vector_costs as tuning option.
	* config/aarch64/aarch64.cc (aarch64_use_new_vector_costs_p):
	Remove.
	(aarch64_vector_costs::add_stmt_cost): Remove use of
	aarch64_use_new_vector_costs_p.
	(aarch64_vector_costs::finish_cost): Remove use of
	aarch64_use_new_vector_costs_p.
	* config/aarch64/tuning_models/cortexx925.h: Remove
	AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS.
	* config/aarch64/tuning_models/fujitsu_monaka.h: Likewise.
	* config/aarch64/tuning_models/generic_armv8_a.h: Likewise.
	* config/aarch64/tuning_models/generic_armv9_a.h: Likewise.
	* config/aarch64/tuning_models/neoverse512tvb.h: Likewise.
	* config/aarch64/tuning_models/neoversen2.h: Likewise.
	* config/aarch64/tuning_models/neoversen3.h: Likewise.
	* config/aarch64/tuning_models/neoversev1.h: Likewise.
	* config/aarch64/tuning_models/neoversev2.h: Likewise.
	* config/aarch64/tuning_models/neoversev3.h: Likewise.
	* config/aarch64/tuning_models/neoversev3ae.h: Likewise.

gcc/testsuite/
	* gcc.target/aarch64/sve/strided_load_2.c: Adjust expected outcome.
	* gcc.target/aarch64/sve/strided_store_2.c: Likewise.
Jennifer Schmitz committed Jan 7, 2025
1 parent e53277d commit 70035b6
Showing 16 changed files with 27 additions and 50 deletions.
2 changes: 0 additions & 2 deletions gcc/config/aarch64/aarch64-tuning-flags.def
@@ -38,8 +38,6 @@ AARCH64_EXTRA_TUNING_OPTION ("cheap_shift_extend", CHEAP_SHIFT_EXTEND)

AARCH64_EXTRA_TUNING_OPTION ("cse_sve_vl_constants", CSE_SVE_VL_CONSTANTS)

AARCH64_EXTRA_TUNING_OPTION ("use_new_vector_costs", USE_NEW_VECTOR_COSTS)

AARCH64_EXTRA_TUNING_OPTION ("matched_vector_throughput", MATCHED_VECTOR_THROUGHPUT)

AARCH64_EXTRA_TUNING_OPTION ("avoid_cross_loop_fma", AVOID_CROSS_LOOP_FMA)
20 changes: 4 additions & 16 deletions gcc/config/aarch64/aarch64.cc
@@ -16627,16 +16627,6 @@ aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
return new aarch64_vector_costs (vinfo, costing_for_scalar);
}

-/* Return true if the current CPU should use the new costs defined
-in GCC 11. This should be removed for GCC 12 and above, with the
-costs applying to all CPUs instead. */
-static bool
-aarch64_use_new_vector_costs_p ()
-{
-return (aarch64_tune_params.extra_tuning_flags
-& AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
-}
-
/* Return the appropriate SIMD costs for vectors of type VECTYPE. */
static const simd_vec_cost *
aarch64_simd_vec_costs (tree vectype)
@@ -17555,7 +17545,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,

/* Do one-time initialization based on the vinfo. */
loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
-if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
+if (!m_analyzed_vinfo)
{
if (loop_vinfo)
analyze_loop_vinfo (loop_vinfo);
@@ -17573,7 +17563,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,

/* Try to get a more accurate cost by looking at STMT_INFO instead
of just looking at KIND. */
-if (stmt_info && aarch64_use_new_vector_costs_p ())
+if (stmt_info)
{
/* If we scalarize a strided store, the vectorizer costs one
vec_to_scalar for each element. However, we can store the first
@@ -17638,7 +17628,7 @@ aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
else
m_num_last_promote_demote = 0;

-if (stmt_info && aarch64_use_new_vector_costs_p ())
+if (stmt_info)
{
/* Account for any extra "embedded" costs that apply additively
to the base cost calculated above. */
@@ -17999,9 +17989,7 @@ aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)

auto *scalar_costs
= static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
-if (loop_vinfo
-&& m_vec_flags
-&& aarch64_use_new_vector_costs_p ())
+if (loop_vinfo && m_vec_flags)
{
m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
m_costs[vect_body]);
1 change: 0 additions & 1 deletion gcc/config/aarch64/tuning_models/cortexx925.h
@@ -221,7 +221,6 @@ static const struct tune_params cortexx925_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
&generic_armv9a_prefetch_tune,
1 change: 0 additions & 1 deletion gcc/config/aarch64/tuning_models/fujitsu_monaka.h
@@ -55,7 +55,6 @@ static const struct tune_params fujitsu_monaka_tunings =
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
-| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
1 change: 0 additions & 1 deletion gcc/config/aarch64/tuning_models/generic_armv8_a.h
@@ -183,7 +183,6 @@ static const struct tune_params generic_armv8_a_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
1 change: 0 additions & 1 deletion gcc/config/aarch64/tuning_models/generic_armv9_a.h
@@ -251,7 +251,6 @@ static const struct tune_params generic_armv9_a_tunings =
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
-| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_armv9a_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
1 change: 0 additions & 1 deletion gcc/config/aarch64/tuning_models/neoverse512tvb.h
@@ -156,7 +156,6 @@ static const struct tune_params neoverse512tvb_tunings =
0, /* max_case_values. */
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_armv9a_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
1 change: 0 additions & 1 deletion gcc/config/aarch64/tuning_models/neoversen2.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversen2_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
&generic_armv9a_prefetch_tune,
1 change: 0 additions & 1 deletion gcc/config/aarch64/tuning_models/neoversen3.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversen3_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
&generic_armv9a_prefetch_tune,
AARCH64_LDP_STP_POLICY_ALWAYS, /* ldp_policy_model. */
1 change: 0 additions & 1 deletion gcc/config/aarch64/tuning_models/neoversev1.h
@@ -228,7 +228,6 @@ static const struct tune_params neoversev1_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
&generic_armv9a_prefetch_tune,
1 change: 0 additions & 1 deletion gcc/config/aarch64/tuning_models/neoversev2.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev2_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW
| AARCH64_EXTRA_TUNE_FULLY_PIPELINED_FMA), /* tune_flags. */
1 change: 0 additions & 1 deletion gcc/config/aarch64/tuning_models/neoversev3.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev3_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
&generic_armv9a_prefetch_tune,
1 change: 0 additions & 1 deletion gcc/config/aarch64/tuning_models/neoversev3ae.h
@@ -219,7 +219,6 @@ static const struct tune_params neoversev3ae_tunings =
tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
(AARCH64_EXTRA_TUNE_BASE
| AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
-| AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
| AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
| AARCH64_EXTRA_TUNE_AVOID_PRED_RMW), /* tune_flags. */
&generic_armv9a_prefetch_tune,
2 changes: 1 addition & 1 deletion gcc/testsuite/gcc.target/aarch64/sve/strided_load_2.c
@@ -15,4 +15,4 @@
so we vectorize the offset calculation. This means that the
64-bit version needs two copies. */
/* { dg-final { scan-assembler-times {\tld1w\tz[0-9]+\.s, p[0-7]/z, \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tld1d\tz[0-9]+\.d, p[0-7]/z, \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
2 changes: 1 addition & 1 deletion gcc/testsuite/gcc.target/aarch64/sve/strided_store_2.c
@@ -15,4 +15,4 @@
so we vectorize the offset calculation. This means that the
64-bit version needs two copies. */
/* { dg-final { scan-assembler-times {\tst1w\tz[0-9]+\.s, p[0-7], \[x[0-9]+, z[0-9]+.s, uxtw 2\]\n} 3 } } */
-/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 15 } } */
+/* { dg-final { scan-assembler-times {\tst1d\tz[0-9]+\.d, p[0-7], \[x[0-9]+, z[0-9]+.d, lsl 3\]\n} 9 } } */
40 changes: 21 additions & 19 deletions gcc/tree-vect-stmts.cc
@@ -8834,22 +8834,7 @@ vectorizable_store (vec_info *vinfo,
{
if (costing_p)
{
-/* Only need vector extracting when there are more
-than one stores. */
-if (nstores > 1)
-inside_cost
-+= record_stmt_cost (cost_vec, 1, vec_to_scalar,
-stmt_info, slp_node,
-0, vect_body);
-/* Take a single lane vector type store as scalar
-store to avoid ICE like 110776. */
-if (VECTOR_TYPE_P (ltype)
-&& known_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
-n_adjacent_stores++;
-else
-inside_cost
-+= record_stmt_cost (cost_vec, 1, scalar_store,
-stmt_info, 0, vect_body);
+n_adjacent_stores++;
continue;
}
tree newref, newoff;
@@ -8905,9 +8890,26 @@
if (costing_p)
{
if (n_adjacent_stores > 0)
-vect_get_store_cost (vinfo, stmt_info, slp_node, n_adjacent_stores,
-alignment_support_scheme, misalignment,
-&inside_cost, cost_vec);
+{
+/* Take a single lane vector type store as scalar
+store to avoid ICE like 110776. */
+if (VECTOR_TYPE_P (ltype)
+&& maybe_ne (TYPE_VECTOR_SUBPARTS (ltype), 1U))
+vect_get_store_cost (vinfo, stmt_info, slp_node,
+n_adjacent_stores, alignment_support_scheme,
+misalignment, &inside_cost, cost_vec);
+else
+inside_cost
++= record_stmt_cost (cost_vec, n_adjacent_stores,
+scalar_store, stmt_info, 0, vect_body);
+/* Only need vector extracting when there are more
+than one stores. */
+if (nstores > 1)
+inside_cost
++= record_stmt_cost (cost_vec, n_adjacent_stores,
+vec_to_scalar, stmt_info, slp_node,
+0, vect_body);
+}
if (dump_enabled_p ())
dump_printf_loc (MSG_NOTE, vect_location,
"vect_model_store_cost: inside_cost = %d, "
