Skip to content

Commit

Permalink
Zen5 tuning part 3: scheduler tweaks
Browse files Browse the repository at this point in the history
this patch adds support for new fussion in znver5 documented in the
optimization manual:

   The Zen5 microarchitecture adds support to fuse reg-reg MOV Instructions
   with certain ALU instructions. The following conditions need to be met for
   fusion to happen:
     - The MOV should be reg-reg mov with Opcode 0x89 or 0x8B
     - The MOV is followed by an ALU instruction where the MOV and ALU destination register match.
     - The ALU instruction may source only registers or immediate data. There cannot be any memory source.
     - The ALU instruction sources either the source or dest of MOV instruction.
     - If ALU instruction has 2 reg sources, they should be different.
     - The following ALU instructions can fuse with an older qualified MOV instruction:
       ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR
       (I assume OP is OR)

I also increased issue rate from 4 to 6.  Theoretically znver5 can do more, but
with our model we can't realy use it.
Increasing issue rate to 8 leads to infinite loop in scheduler.

Finally, I also enabled fuse_alu_and_branch since it is supported by
znver5 (I think by earlier zens too).

New fussion pattern moves quite few instructions around in common code:
@@ -2210,13 +2210,13 @@
        .cfi_offset 3, -32
        leaq    63(%rsi), %rbx
        movq    %rbx, %rbp
+       shrq    $6, %rbp
+       salq    $3, %rbp
        subq    $16, %rsp
        .cfi_def_cfa_offset 48
        movq    %rdi, %r12
-       shrq    $6, %rbp
-       movq    %rsi, 8(%rsp)
-       salq    $3, %rbp
        movq    %rbp, %rdi
+       movq    %rsi, 8(%rsp)
        call    _Znwm
        movq    8(%rsp), %rsi
        movl    $0, 8(%r12)
@@ -2224,8 +2224,8 @@
        movq    %rax, (%r12)
        movq    %rbp, 32(%r12)
        testq   %rsi, %rsi
-       movq    %rsi, %rdx
        cmovns  %rsi, %rbx
+       movq    %rsi, %rdx
        sarq    $63, %rdx
        shrq    $58, %rdx
        sarq    $6, %rbx
which should help decoder bandwidth and perhaps also cache, though I was not
able to measure off-noise effect on SPEC.

gcc/ChangeLog:

	* config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune.
	* config/i386/x86-tune-sched.cc (ix86_issue_rate): Updat for znver5.
	(ix86_adjust_cost): Add TODO about znver5 memory latency.
	(ix86_fuse_mov_alu_p): New.
	(ix86_macro_fusion_pair_p): Use it.
	* config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add ZNVER5.
	(X86_TUNE_FUSE_MOV_AND_ALU): New tune;
  • Loading branch information
Jan Hubicka committed Sep 3, 2024
1 parent dee3c5c commit e2125a6
Show file tree
Hide file tree
Showing 3 changed files with 77 additions and 3 deletions.
2 changes: 2 additions & 0 deletions gcc/config/i386/i386.h
Original file line number Diff line number Diff line change
Expand Up @@ -430,6 +430,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST];
ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS]
#define TARGET_FUSE_ALU_AND_BRANCH \
ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH]
#define TARGET_FUSE_MOV_AND_ALU \
ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU]
#define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU]
#define TARGET_AVOID_LEA_FOR_ADDR \
ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR]
Expand Down
67 changes: 66 additions & 1 deletion gcc/config/i386/x86-tune-sched.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,6 @@ ix86_issue_rate (void)
case PROCESSOR_ZNVER2:
case PROCESSOR_ZNVER3:
case PROCESSOR_ZNVER4:
case PROCESSOR_ZNVER5:
case PROCESSOR_CORE2:
case PROCESSOR_NEHALEM:
case PROCESSOR_SANDYBRIDGE:
Expand All @@ -91,6 +90,13 @@ ix86_issue_rate (void)
return 5;

case PROCESSOR_SAPPHIRERAPIDS:
/* For znver5 decoder can handle 4 or 8 instructions per cycle,
op cache 12 instruction/cycle, dispatch 8 instructions
integer rename 8 instructions and Fp 6 instructions.
The scheduler, without understanding out of order nature of the CPU
is unlikely going to be able to fill all of these. */
case PROCESSOR_ZNVER5:
return 6;

default:
Expand Down Expand Up @@ -434,6 +440,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
enum attr_unit unit = get_attr_unit (insn);
int loadcost;

/* TODO: On znver5 complex addressing modes have
greater latency. */
if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
loadcost = 4;
else
Expand Down Expand Up @@ -563,13 +571,70 @@ ix86_macro_fusion_p ()
return TARGET_FUSE_CMP_AND_BRANCH;
}

static bool
ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
{
/* Validate mov:
- It should be reg-reg move with opcode 0x89 or 0x8B. */
rtx set1 = PATTERN (mov);
if (GET_CODE (set1) != SET
|| !GENERAL_REG_P (SET_SRC (set1))
|| !GENERAL_REG_P (SET_DEST (set1)))
return false;
rtx reg = SET_DEST (set1);
/* - it should have 0x89 or 0x8B opcode. */
if (!INTEGRAL_MODE_P (GET_MODE (reg))
|| GET_MODE_SIZE (GET_MODE (reg)) < 2
|| GET_MODE_SIZE (GET_MODE (reg)) > 8)
return false;
/* Validate ALU. */
if (GET_CODE (PATTERN (alu)) != PARALLEL)
return false;
rtx set2 = XVECEXP (PATTERN (alu), 0, 0);
if (GET_CODE (set2) != SET)
return false;
/* Match one of:
ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR
We also may add insn attribute to handle some of sporadic
case we output those with different RTX expressions. */

if (GET_CODE (SET_SRC (set2)) != PLUS
&& GET_CODE (SET_SRC (set2)) != MINUS
&& GET_CODE (SET_SRC (set2)) != XOR
&& GET_CODE (SET_SRC (set2)) != AND
&& GET_CODE (SET_SRC (set2)) != IOR
&& GET_CODE (SET_SRC (set2)) != NOT
&& GET_CODE (SET_SRC (set2)) != ASHIFT
&& GET_CODE (SET_SRC (set2)) != ASHIFTRT
&& GET_CODE (SET_SRC (set2)) != LSHIFTRT)
return false;
rtx op0 = XEXP (SET_SRC (set2), 0);
rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL;
/* One of operands should be register. */
if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
std::swap (op0, op1);
if (!REG_P (op0) || REGNO (op1) != REGNO (reg))
return false;
if (op1
&& !REG_P (op1)
&& !x86_64_immediate_operand (op1, VOIDmode))
return false;
/* Only one of two paramters must be move destination. */
if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg))
return false;
return true;
}

/* Check whether current microarchitecture support macro fusion
for insn pair "CONDGEN + CONDJMP". Refer to
"Intel Architectures Optimization Reference Manual". */

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
if (TARGET_FUSE_MOV_AND_ALU
&& ix86_fuse_mov_alu_p (condgen, condjmp))
return true;
rtx src, dest;
enum rtx_code ccode;
rtx compare_set = NULL_RTX, test_if, cond;
Expand Down
11 changes: 9 additions & 2 deletions gcc/config/i386/x86-tune.def
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,17 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags",

/* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional
jump instruction when the alu instruction produces the CCFLAG consumed by
the conditional jump instruction. */
the conditional jump instruction.

TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND,
There is also limitation for immediate and displacement supported. */
DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch",
m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC)
m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5)

/* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov
and the destination is used by alu. alu must be one of
ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. */
DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5)

/*****************************************************************************/
/* Function prologue, epilogue and function calling sequences. */
Expand Down

0 comments on commit e2125a6

Please sign in to comment.