From e2125a600552bc6e0329e3f1224eea14804db8d3 Mon Sep 17 00:00:00 2001 From: Jan Hubicka Date: Tue, 3 Sep 2024 16:26:16 +0200 Subject: [PATCH] Zen5 tuning part 3: scheduler tweaks this patch adds support for new fussion in znver5 documented in the optimization manual: The Zen5 microarchitecture adds support to fuse reg-reg MOV Instructions with certain ALU instructions. The following conditions need to be met for fusion to happen: - The MOV should be reg-reg mov with Opcode 0x89 or 0x8B - The MOV is followed by an ALU instruction where the MOV and ALU destination register match. - The ALU instruction may source only registers or immediate data. There cannot be any memory source. - The ALU instruction sources either the source or dest of MOV instruction. - If ALU instruction has 2 reg sources, they should be different. - The following ALU instructions can fuse with an older qualified MOV instruction: ADD ADC AND XOR OP SUB SBB INC DEC NOT SAL / SHL SHR SAR (I assume OP is OR) I also increased issue rate from 4 to 6. Theoretically znver5 can do more, but with our model we can't realy use it. Increasing issue rate to 8 leads to infinite loop in scheduler. Finally, I also enabled fuse_alu_and_branch since it is supported by znver5 (I think by earlier zens too). New fussion pattern moves quite few instructions around in common code: @@ -2210,13 +2210,13 @@ .cfi_offset 3, -32 leaq 63(%rsi), %rbx movq %rbx, %rbp + shrq $6, %rbp + salq $3, %rbp subq $16, %rsp .cfi_def_cfa_offset 48 movq %rdi, %r12 - shrq $6, %rbp - movq %rsi, 8(%rsp) - salq $3, %rbp movq %rbp, %rdi + movq %rsi, 8(%rsp) call _Znwm movq 8(%rsp), %rsi movl $0, 8(%r12) @@ -2224,8 +2224,8 @@ movq %rax, (%r12) movq %rbp, 32(%r12) testq %rsi, %rsi - movq %rsi, %rdx cmovns %rsi, %rbx + movq %rsi, %rdx sarq $63, %rdx shrq $58, %rdx sarq $6, %rbx which should help decoder bandwidth and perhaps also cache, though I was not able to measure off-noise effect on SPEC. gcc/ChangeLog: * config/i386/i386.h (TARGET_FUSE_MOV_AND_ALU): New tune. * config/i386/x86-tune-sched.cc (ix86_issue_rate): Updat for znver5. (ix86_adjust_cost): Add TODO about znver5 memory latency. (ix86_fuse_mov_alu_p): New. (ix86_macro_fusion_pair_p): Use it. * config/i386/x86-tune.def (X86_TUNE_FUSE_ALU_AND_BRANCH): Add ZNVER5. (X86_TUNE_FUSE_MOV_AND_ALU): New tune; --- gcc/config/i386/i386.h | 2 + gcc/config/i386/x86-tune-sched.cc | 67 ++++++++++++++++++++++++++++++- gcc/config/i386/x86-tune.def | 11 ++++- 3 files changed, 77 insertions(+), 3 deletions(-) diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h index eabb3248ea008..c1ec92ffb1509 100644 --- a/gcc/config/i386/i386.h +++ b/gcc/config/i386/i386.h @@ -430,6 +430,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_LAST]; ix86_tune_features[X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS] #define TARGET_FUSE_ALU_AND_BRANCH \ ix86_tune_features[X86_TUNE_FUSE_ALU_AND_BRANCH] +#define TARGET_FUSE_MOV_AND_ALU \ + ix86_tune_features[X86_TUNE_FUSE_MOV_AND_ALU] #define TARGET_OPT_AGU ix86_tune_features[X86_TUNE_OPT_AGU] #define TARGET_AVOID_LEA_FOR_ADDR \ ix86_tune_features[X86_TUNE_AVOID_LEA_FOR_ADDR] diff --git a/gcc/config/i386/x86-tune-sched.cc b/gcc/config/i386/x86-tune-sched.cc index d77298b0e34dc..c6d5426ae8d38 100644 --- a/gcc/config/i386/x86-tune-sched.cc +++ b/gcc/config/i386/x86-tune-sched.cc @@ -67,7 +67,6 @@ ix86_issue_rate (void) case PROCESSOR_ZNVER2: case PROCESSOR_ZNVER3: case PROCESSOR_ZNVER4: - case PROCESSOR_ZNVER5: case PROCESSOR_CORE2: case PROCESSOR_NEHALEM: case PROCESSOR_SANDYBRIDGE: @@ -91,6 +90,13 @@ ix86_issue_rate (void) return 5; case PROCESSOR_SAPPHIRERAPIDS: + /* For znver5 decoder can handle 4 or 8 instructions per cycle, + op cache 12 instruction/cycle, dispatch 8 instructions + integer rename 8 instructions and Fp 6 instructions. + + The scheduler, without understanding out of order nature of the CPU + is unlikely going to be able to fill all of these. */ + case PROCESSOR_ZNVER5: return 6; default: @@ -434,6 +440,8 @@ ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost, enum attr_unit unit = get_attr_unit (insn); int loadcost; + /* TODO: On znver5 complex addressing modes have + greater latency. */ if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN) loadcost = 4; else @@ -563,6 +571,60 @@ ix86_macro_fusion_p () return TARGET_FUSE_CMP_AND_BRANCH; } +static bool +ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu) +{ + /* Validate mov: + - It should be reg-reg move with opcode 0x89 or 0x8B. */ + rtx set1 = PATTERN (mov); + if (GET_CODE (set1) != SET + || !GENERAL_REG_P (SET_SRC (set1)) + || !GENERAL_REG_P (SET_DEST (set1))) + return false; + rtx reg = SET_DEST (set1); + /* - it should have 0x89 or 0x8B opcode. */ + if (!INTEGRAL_MODE_P (GET_MODE (reg)) + || GET_MODE_SIZE (GET_MODE (reg)) < 2 + || GET_MODE_SIZE (GET_MODE (reg)) > 8) + return false; + /* Validate ALU. */ + if (GET_CODE (PATTERN (alu)) != PARALLEL) + return false; + rtx set2 = XVECEXP (PATTERN (alu), 0, 0); + if (GET_CODE (set2) != SET) + return false; + /* Match one of: + ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR + We also may add insn attribute to handle some of sporadic + case we output those with different RTX expressions. */ + + if (GET_CODE (SET_SRC (set2)) != PLUS + && GET_CODE (SET_SRC (set2)) != MINUS + && GET_CODE (SET_SRC (set2)) != XOR + && GET_CODE (SET_SRC (set2)) != AND + && GET_CODE (SET_SRC (set2)) != IOR + && GET_CODE (SET_SRC (set2)) != NOT + && GET_CODE (SET_SRC (set2)) != ASHIFT + && GET_CODE (SET_SRC (set2)) != ASHIFTRT + && GET_CODE (SET_SRC (set2)) != LSHIFTRT) + return false; + rtx op0 = XEXP (SET_SRC (set2), 0); + rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL; + /* One of operands should be register. */ + if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg))) + std::swap (op0, op1); + if (!REG_P (op0) || REGNO (op1) != REGNO (reg)) + return false; + if (op1 + && !REG_P (op1) + && !x86_64_immediate_operand (op1, VOIDmode)) + return false; + /* Only one of two paramters must be move destination. */ + if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg)) + return false; + return true; +} + /* Check whether current microarchitecture support macro fusion for insn pair "CONDGEN + CONDJMP". Refer to "Intel Architectures Optimization Reference Manual". */ @@ -570,6 +632,9 @@ ix86_macro_fusion_p () bool ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp) { + if (TARGET_FUSE_MOV_AND_ALU + && ix86_fuse_mov_alu_p (condgen, condjmp)) + return true; rtx src, dest; enum rtx_code ccode; rtx compare_set = NULL_RTX, test_if, cond; diff --git a/gcc/config/i386/x86-tune.def b/gcc/config/i386/x86-tune.def index ed26136faee52..d7e2ad7fd250b 100644 --- a/gcc/config/i386/x86-tune.def +++ b/gcc/config/i386/x86-tune.def @@ -143,10 +143,17 @@ DEF_TUNE (X86_TUNE_FUSE_CMP_AND_BRANCH_SOFLAGS, "fuse_cmp_and_branch_soflags", /* X86_TUNE_FUSE_ALU_AND_BRANCH: Fuse alu with a subsequent conditional jump instruction when the alu instruction produces the CCFLAG consumed by - the conditional jump instruction. */ + the conditional jump instruction. + + TODO: znver5 supports fusing with SUB, ADD, INC, DEC, OR, AND, + There is also limitation for immediate and displacement supported. */ DEF_TUNE (X86_TUNE_FUSE_ALU_AND_BRANCH, "fuse_alu_and_branch", - m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC) + m_SANDYBRIDGE | m_CORE_AVX2 | m_ZHAOXIN | m_GENERIC | m_ZNVER5) +/* X86_TUNE_FUSE_MOV_AND_ALU: mov and alu in case mov is reg-reg mov + and the destination is used by alu. alu must be one of + ADD, ADC, AND, XOR, OR, SUB, SBB, INC, DEC, NOT, SAL, SHL, SHR, SAR. */ +DEF_TUNE (X86_TUNE_FUSE_MOV_AND_ALU, "fuse_mov_and_alu", m_ZNVER5) /*****************************************************************************/ /* Function prologue, epilogue and function calling sequences. */