From 2f4f5f3c22b01c28d74e11e7f2cad1019ba82c4f Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 21 Jan 2015 13:56:42 -0800
Subject: [PATCH] toplev: Add support for Top Down 2.9

Update toplev to use the Top Down methodology v2.9.
Thanks to Ahmad and Benny.

Many improvements:
- Many fixes to SMT support. SMT is now supported on Haswell.
- Many bug fixes to metrics
- Initial Haswell Server support
- Add a Sandy Bridge EP model
- Lots of new metrics and nodes: e.g. Core IPC, MUX confidence,
  BPTB (Branch per Taken Branch), SMT_2T utilization,
  IFetch_Line_utilization
- Initial Broadwell model support
- Improve sample event support. Sample events are now enabled by default.

--per-socket and --per-core are not supported anymore with SMT.
---
 bdw_client_ratios.py | 1642 ++++++++++++++++++++++++++++++++++++++++++
 cpumap.sh            |    2 +
 hsw_client_ratios.py | 1269 ++++++++++++++++++++++----------
 ivb_client_ratios.py |  788 +++++++++++++++-----
 ivb_server_ratios.py |  696 +++++++++++++-----
 jkt_server_ratios.py |  789 ++++++++++++++++++++
 snb_client_ratios.py |  562 +++++++++++----
 tl-tester            |   19 +-
 toplev.py            |  109 ++-
 9 files changed, 4923 insertions(+), 953 deletions(-)
 create mode 100644 bdw_client_ratios.py
 create mode 100644 jkt_server_ratios.py

diff --git a/bdw_client_ratios.py b/bdw_client_ratios.py
new file mode 100644
index 00000000..c547637f
--- /dev/null
+++ b/bdw_client_ratios.py
@@ -0,0 +1,1642 @@
+
+#
+# auto generated TopDown 2.9 description for Intel 5th gen Core / Core M (code named Broadwell)
+# Please see http://ark.intel.com for more details on these CPUs.
+#
+# References:
+# http://halobates.de/blog/p/262
+# https://sites.google.com/site/analysismethods/yasin-pubs
+#
+
+smt_enabled = False
+
+# Constants
+
+Pipeline_Width = 4
+L2_Store_Latency = 9
+Mem_L3_Weight = 7
+Mem_STLB_Hit_Cost = 7
+Mem_SFB_Cost = 13
+Mem_4K_Alias_Cost = 7
+Mem_XSNP_HitM_Cost = 60
+MEM_XSNP_Hit_Cost = 43
+MEM_XSNP_None_Cost = 29
+Mem_Local_DRAM_Cost = 200
+Mem_Remote_DRAM_Cost = 310
+Mem_Remote_HitM_Cost = 200
+Mem_Remote_Fwd_Cost = 180
+MS_Switches_Cost = 2
+OneMillion = 1000000
+Energy_Unit = 61
+
+# Aux. 
formulas + + +def Recovery_Cycles(EV, level): + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + +def Execute_Cycles(EV, level): + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + +def L1D_Miss_Cycles(EV, level): + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + +def ITLB_Miss_Cycles(EV, level): + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + +def Cycles_0_Ports_Utilized(EV, level): + EV("ARITH.FPU_DIV_ACTIVE", level) + EV("UOPS_EXECUTED.CORE:i1:c1", level) + EV("CYCLE_ACTIVITY.STALLS_TOTAL", level) + EV("RS_EVENTS.EMPTY_CYCLES", level) + return (EV("UOPS_EXECUTED.CORE:i1:c1", level)) / 2 if smt_enabled else(EV("CYCLE_ACTIVITY.STALLS_TOTAL", level) - EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ARITH.FPU_DIV_ACTIVE", level)) + +def Cycles_1_Port_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level)) + +def Cycles_2_Ports_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c3", level) + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level)) + +def Cycles_3m_Ports_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c3", level) + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c3", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + +def ORO_Demand_DRD_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + +def ORO_Demand_DRD_C6(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Few_Uops_Executed_Threshold(EV, level): + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level)> 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + +def Backend_Bound_At_EXE(EV, level): + return (EV("CYCLE_ACTIVITY.STALLS_TOTAL", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level)) / CLKS(EV, 
level) + +def Mem_L3_Hit_Fraction(EV, level): + return EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level)) + +def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + +def Mispred_Clears_Fraction(EV, level): + return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) + +def Retire_Uop_Fraction(EV, level): + return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + +def SLOTS(EV, level): + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) +def IPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) +def CPI(EV, level): + return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + +# Uops Per Instruction +def UPI(EV, level): + return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + +# Instruction per taken branch +def IPTB(EV, level): + return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. +def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Fraction of Uops delivered by the DSB (decoded instructions cache) +def DSB_Coverage(EV, level): + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + +# Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed) +def ILP(EV, level): + return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(EV, level) + +# Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) +def MLP(EV, level): + return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + +# Actual Average Latency for L1 data-cache miss demand loads +def Load_Miss_Real_Latency(EV, level): + return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level)) + +# Average Frequency Utilization relative nominal frequency +def Turbo_Utilization(EV, level): + return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) + +# Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store +def Page_Walks_Use(EV, level): + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + +# Per-thread actual clocks +def CLKS(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD", level) + +# Core actual clocks +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, 
level): + return DurationTimeInSeconds(EV, level) + +# Event groups + + +class Frontend_Bound: + name = "Frontend_Bound" + domain = "Slots" + area = "FE" + desc = """ +This category reflects slots where the Frontend of the processor undersupplies +its Backend. Frontend denotes the first portion of pipeline responsible to +fetch micro-ops which the Backend can execute. Within the Frontend, a branch +predictor predicts the next address to fetch, cache-lines are fetched from +memory, parsed into instructions, and lastly decoded into micro-ops. The +purpose of the Frontend cluster is to deliver uops to Backend whenever the +latter can accept them. For example, stalls due to instruction-cache misses +would be categorized under Frontend Bound.""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.2) + except ZeroDivisionError: + #print "Frontend_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Frontend_Latency: + name = "Frontend_Latency" + domain = "Slots" + area = "FE" + desc = """ +This metric represents slots fraction CPU was stalled due to Frontend latency +issues. For example, instruction-cache misses, iTLB misses or fetch stalls +after a branch misprediction are categorized under Frontend Latency. In such +cases the Frontend eventually delivers no uops for some period.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) + self.thresh = (self.val > 0.15) and self.parent.thresh + except ZeroDivisionError: + #print "Frontend_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + +class ITLB_Misses: + name = "ITLB_Misses" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to instruction TLB +misses. Using large code pages may be considered here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "ITLB_Misses zero division" + self.val = 0 + self.thresh = False + return self.val + +class DSB_Switches: + name = "DSB_Switches" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to switches from +DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "DSB_Switches zero division" + self.val = 0 + self.thresh = False + return self.val + +class LCP: + name = "LCP" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to Length Changing +Prefixes (LCPs). 
Using proper compiler flags or Intel Compiler by default will +certainly avoid this.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "LCP zero division" + self.val = 0 + self.thresh = False + return self.val + +class MS_Switches: + name = "MS_Switches" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to switches of uop +delivery to the Microcode Sequencer (MS). Commonly used instructions are +optimized for delivery by the DSB or MITE pipelines. The MS is designated to +deliver long uop flows required by CISC instructions like CPUID, or uncommon +conditions like Floating Point Assists when dealing with Denormals.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "MS_Switches zero division" + self.val = 0 + self.thresh = False + return self.val + +class Frontend_Bandwidth: + name = "Frontend_Bandwidth" + domain = "Slots" + area = "FE" + desc = """ +This metric represents slots fraction CPU was stalled due to Frontend +bandwidth issues. For example, inefficiencies at the instruction decoders, or +code restrictions for caching in the DSB (decoded uops cache) are categorized +under Frontend Bandwidth. In such cases, the Frontend typically delivers non- +optimal amount of uops to the Backend.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) + self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh + except ZeroDivisionError: + #print "Frontend_Bandwidth zero division" + self.val = 0 + self.thresh = False + return self.val + +class MITE: + name = "MITE" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "MITE zero division" + self.val = 0 + self.thresh = False + return self.val + +class DSB: + name = "DSB" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.3) and self.parent.thresh + except ZeroDivisionError: + #print "DSB zero division" + self.val = 0 + self.thresh = False + return self.val + +class LSD: + name = "LSD" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. 
However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "LSD zero division" + self.val = 0 + self.thresh = False + return self.val + +class Bad_Speculation: + name = "Bad_Speculation" + domain = "Slots" + area = "BAD" + desc = """ +This category reflects slots wasted due to incorrect speculations, which +include slots used to allocate uops that do not eventually get retired and +slots for which allocation was blocked due to recovery from earlier incorrect +speculation. For example, wasted work due to miss-predicted branches are +categorized under Bad Speculation category""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.1) + except ZeroDivisionError: + #print "Bad_Speculation zero division" + self.val = 0 + self.thresh = False + return self.val + +class Branch_Mispredicts: + name = "Branch_Mispredicts" + domain = "Slots" + area = "BAD" + desc = """ +This metric represents slots fraction CPU was impacted by Branch +Misprediction. These slots are either wasted by uops fetched from an +incorrectly speculated program path, or stalls the Backend of the machine +needs to recover its state from a speculative path.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "Branch_Mispredicts zero division" + self.val = 0 + self.thresh = False + return self.val + +class Machine_Clears: + name = "Machine_Clears" + domain = "Slots" + area = "BAD" + desc = """ +This metric represents slots fraction CPU was impacted by Machine Clears. +These slots are either wasted by uops fetched prior to the clear, or stalls +the Backend of the machine needs to recover its state after the clear. For +example, this can happen due to memory ordering Nukes (e.g. Memory +Disambiguation) or Self-Modifying-Code (SMC) nukes.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "Machine_Clears zero division" + self.val = 0 + self.thresh = False + return self.val + +class Backend_Bound: + name = "Backend_Bound" + domain = "Slots" + area = "BE" + desc = """ +This category reflects slots where no uops are being delivered due to a lack +of required resources for accepting more uops in the Backend of the pipeline. +Backend describes the portion of the pipeline where the out-of-order scheduler +dispatches ready uops into their respective execution units, and once +completed these uops get retired according to program order. 
For example, +stalls due to data-cache misses or stalls due to the divider unit being +overloaded are both categorized under Backend Bound.""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) + self.thresh = (self.val > 0.2) + except ZeroDivisionError: + #print "Backend_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Memory_Bound: + name = "Memory_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how much Memory subsystem was a bottleneck. Memory +Bound measures cycle fraction where pipeline is likely stalled due to demand +load or store instructions. This accounts mainly for non-completed in-flight +memory demand loads which coincides with execution starvation. in addition to +less common cases where stores could imply backpressure on the pipeline.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = (EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Memory_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class L1_Bound: + name = "L1_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was stalled without missing the L1 data +cache. The L1 cache typically has the shortest latency. However, in certain +cases like loads blocked on older stores, a load might suffer a high latency +even though it is being satisfied by the L1. There are no fill-buffers +allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event +as it accounts for any non-completed load.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_MISS", 3)) / CLKS(EV, 3 ) + self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh + except ZeroDivisionError: + #print "L1_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class DTLB_Load: + name = "DTLB_Load" + domain = "Clocks" + area = "BE/Mem" + desc = "" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "DTLB_Load zero division" + self.val = 0 + self.thresh = False + return self.val + +class Store_Fwd_Blk: + name = "Store_Fwd_Blk" + domain = "Clocks" + area = "BE/Mem" + desc = "" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "Store_Fwd_Blk zero division" + self.val = 0 + self.thresh = False + return self.val + +class Split_Loads: + name = "Split_Loads" + domain = "Clocks" + area = "BE/Mem" + desc = "" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Load_Miss_Real_Latency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "Split_Loads zero division" + self.val = 0 + self.thresh = False + return self.val + +class G4K_Aliasing: + name = "4K_Aliasing" + domain = "Clocks" + area = "BE/Mem" + desc = "" 
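+    # Note: 4K aliasing is a false dependency where a load is delayed because an
+    # earlier store matches its address in bits 0-11 (the offset within a 4 KB
+    # page) but differs in the upper bits. Each LD_BLOCKS_PARTIAL.ADDRESS_ALIAS
+    # event is charged an estimated Mem_4K_Alias_Cost cycles in the formula below.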
+ level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "G4K_Aliasing zero division" + self.val = 0 + self.thresh = False + return self.val + +class L2_Bound: + name = "L2_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was stalled on L2 cache. Avoiding cache +misses (i.e. L1 misses/L2 hits) will improve the latency and increase +performance.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_MISS", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_MISS", 3)) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.03) and self.parent.thresh + except ZeroDivisionError: + #print "L2_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class L3_Bound: + name = "L3_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was stalled on L3 cache or contended with +a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve +the latency and increase performance.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_MISS", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "L3_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Contested_Accesses: + name = "Contested_Accesses" + domain = "Clocks" + area = "BE/Mem" + desc = "" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "Contested_Accesses zero division" + self.val = 0 + self.thresh = False + return self.val + +class Data_Sharing: + name = "Data_Sharing" + domain = "Clocks" + area = "BE/Mem" + desc = "" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "Data_Sharing zero division" + self.val = 0 + self.thresh = False + return self.val + +class L3_Latency: + name = "L3_Latency" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric is a rough aggregate estimate of cycles fraction where CPU +accessed L3 cache for all load requests, while there was no contention/sharing +with a sibling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will +improve the latency and increase performance.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", 4) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "L3_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + +class SQ_Full: + name = "SQ_Full" + domain = "CoreClocks" + area = "BE/Mem" + desc = """ +This metric measures fraction of cycles where the Super Queue (SQ) was full +taking into account all request-types and both hardware SMT threads. 
The Super +Queue is used for requests to access the L2 cache or to go out to the Uncore.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "SQ_Full zero division" + self.val = 0 + self.thresh = False + return self.val + +class MEM_Bound: + name = "MEM_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was stalled on main memory (DRAM). +Caching will improve the latency and increase performance.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_MISS", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "MEM_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class MEM_Bandwidth: + name = "MEM_Bandwidth" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was likely stalled due to approaching +bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be +considered in such case.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "MEM_Bandwidth zero division" + self.val = 0 + self.thresh = False + return self.val + +class MEM_Latency: + name = "MEM_Latency" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was likely stalled due to latency from +main memory (DRAM). Data layout re-structuring or using Software Prefetches +(also through the compiler) may be considered in such case.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "MEM_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + +class Stores_Bound: + name = "Stores_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was stalled due to store operations. +even though memory store accesses do not typically stall out-of-order CPUs; +there are few cases where stores can lead to actual stalls. This metric will +be flagged should any of these cases be a bottleneck.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = self.Memory_Bound.compute(EV) -(EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 3) / CLKS(EV, 3)) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Stores_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Split_Stores: + name = "Split_Stores" + domain = "CoreClocks" + area = "BE/Mem" + desc = """ +This metric represents rate of split store accesses. 
Consider aligning your +data to the 64-byte cache line granularity.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Split_Stores zero division" + self.val = 0 + self.thresh = False + return self.val + +class DTLB_Store: + name = "DTLB_Store" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents cycles fraction spent handling first-level data TLB +store misses. As with ordinary data caching, focus on improving data locality +and reducing working-set size to reduce DTLB overhead. Additionally, consider +using profile-guided optimization (PGO) to collocate frequently-used data on +the same page. Try using larger page sizes for large amounts of frequently- +used data.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "DTLB_Store zero division" + self.val = 0 + self.thresh = False + return self.val + +class Core_Bound: + name = "Core_Bound" + domain = "Clocks" + area = "BE/Core" + desc = """ +This metric represents how much Core non-memory issues were of a bottleneck. +Shortage in hardware compute resources, or dependencies software's +instructions are both categorized under Core Bound. Hence it may indicate the +machine ran out of an OOO resources, certain execution units are overloaded or +dependencies in program's data- or instruction-flow are limiting the +performance (e.g. FP-chained long-latency arithmetic operations). Tip: +consider Port Saturation analysis as next step.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "Core_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Divider: + name = "Divider" + domain = "CoreClocks" + area = "BE/Core" + desc = "" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "Divider zero division" + self.val = 0 + self.thresh = False + return self.val + +class Ports_Utilization: + name = "Ports_Utilization" + domain = "Clocks" + area = "BE/Core" + desc = """ +This metric represents cycles fraction application was stalled due to Core +computation issues (non divider-related). For example, heavy data-dependency +between nearby instructions will manifest in this category. Ditto if +instruction-mix used by the application overloads specific hardware execution +unit. 
Hint: Loop Vectorization -most compilers feature auto-Vectorization +options today- reduces pressure on the execution ports as multiple elements +are calculated with same uop.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "Ports_Utilization zero division" + self.val = 0 + self.thresh = False + return self.val + +class G0_Ports_Utilized: + name = "0_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed no uops on any +execution port.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G0_Ports_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G1_Port_Utilized: + name = "1_Port_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction where the CPU executed total of 1 +uop per cycle on all execution ports. This can be due to heavy data-dependency +among software instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and L1_Bound, this +metric can point to L1 data-cache latency bottleneck that may not necessarily +manifest with complete execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be helpful. Tip: consider +'Core Ports Saturation' analysis-type as next step.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G1_Port_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G2_Ports_Utilized: + name = "2_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed total of 2 uops per +cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- +type as next step. Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the execution ports as +multiple elements are calculated with same uop.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G2_Ports_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G3m_Ports_Utilized: + name = "3m_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed total of 3 or more +uops per cycle on all execution ports. 
Tip: consider 'Core Port Saturation' +analysis-type as next step""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G3m_Ports_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_0: + name = "Port_0" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 0 (SNB+: ALU; HSW+:ALU and 2nd branch)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_0 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_1: + name = "Port_1" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 1 (ALU)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_1 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_2: + name = "Port_2" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 2 (Loads and Store-address)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_2 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_3: + name = "Port_3" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 3 (Loads and Store-address)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_3 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_4: + name = "Port_4" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 4 (Store-data)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_4 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_5: + name = "Port_5" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 5 (SNB+: Branches and ALU; HSW+: ALU)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_5 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Retiring: + name = "Retiring" + domain = "Slots" + area = "RET" + desc = """ +This category reflects slots utilized by useful work i.e. allocated uops that +eventually get retired. Ideally, all pipeline slots would be attributed to the +Retiring category. 
Retiring of 100% would indicate the maximum 4 uops retired +per cycle has been achieved. Maximizing Retiring typically increases the +Instruction-Per-Cycle metric. Note that a high Retiring value does not +necessary mean there is no room for more performance. For example, Microcode +assists are categorized under Retiring. They hurt performance and can often be +avoided. A high Retiring value for non-vectorized code may be a good hint for +programmer to consider vectorizing his code. Doing so essentially lets more +computations be done without significantly increasing number of instructions +thus improving the performance.""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh + except ZeroDivisionError: + #print "Retiring zero division" + self.val = 0 + self.thresh = False + return self.val + +class Base: + name = "Base" + domain = "Slots" + area = "RET" + desc = """ +This metric represents slots fraction where the CPU was retiring uops not +originated from the microcode-sequencer. This correlates with total number of +instructions used by the program. A uops-per-instruction ratio of 1 should be +expected. While this is the most desirable of the top 4 categories, high +values may still indicate areas for improvement. If possible focus on +techniques that reduce instruction count or result in more efficient +instructions generation such as vectorization.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) + self.thresh = (self.val > 0.6) and self.parent.thresh + except ZeroDivisionError: + #print "Base zero division" + self.val = 0 + self.thresh = False + return self.val + +class Microcode_Sequencer: + name = "Microcode_Sequencer" + domain = "Slots" + area = "RET" + desc = """ +This metric represents slots fraction CPU was retiring uops fetched by the +Microcode Sequencer (MS) ROM. 
The MS is used for CISC instructions not fully +decoded by the default decoders (like repeat move strings), or by microcode +assists used to address some operation modes (like in Floating Point assists).""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) + self.thresh = (self.val > 0.05) + except ZeroDivisionError: + #print "Microcode_Sequencer zero division" + self.val = 0 + self.thresh = False + return self.val + +class Metric_IPC: + name = "IPC" + desc = """ +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = IPC(EV, 0) + except ZeroDivisionError: + print "IPC zero division" + self.val = 0 + +class Metric_CPI: + name = "CPI" + desc = """ +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = CPI(EV, 0) + except ZeroDivisionError: + print "CPI zero division" + self.val = 0 + +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + +class Metric_UPI: + name = "UPI" + desc = """ +Uops Per Instruction""" + domain = "Metric" + maxval = 2 + + def compute(self, EV): + try: + self.val = UPI(EV, 0) + except ZeroDivisionError: + print "UPI zero division" + self.val = 0 + +class Metric_IPTB: + name = "IPTB" + desc = """ +Instruction per taken branch""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = IPTB(EV, 0) + except ZeroDivisionError: + print "IPTB zero division" + self.val = 0 + +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. 
Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = BPTB(EV, 0) + except ZeroDivisionError: + print "BPTB zero division" + self.val = 0 + +class Metric_DSB_Coverage: + name = "DSB_Coverage" + desc = """ +Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 + + def compute(self, EV): + try: + self.val = DSB_Coverage(EV, 0) + except ZeroDivisionError: + print "DSB_Coverage zero division" + self.val = 0 + +class Metric_ILP: + name = "ILP" + desc = """ +Instruction-Level-Parallelism (average number of uops executed when there is +at least 1 uop executed)""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = ILP(EV, 0) + except ZeroDivisionError: + print "ILP zero division" + self.val = 0 + +class Metric_MLP: + name = "MLP" + desc = """ +Memory-Level-Parallelism (average number of L1 miss demand load when there is +at least 1 such miss)""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = MLP(EV, 0) + except ZeroDivisionError: + print "MLP zero division" + self.val = 0 + +class Metric_Load_Miss_Real_Latency: + name = "Load_Miss_Real_Latency" + desc = """ +Actual Average Latency for L1 data-cache miss demand loads""" + domain = "Metric" + maxval = 1000 + + def compute(self, EV): + try: + self.val = Load_Miss_Real_Latency(EV, 0) + except ZeroDivisionError: + print "Load_Miss_Real_Latency zero division" + self.val = 0 + +class Metric_Turbo_Utilization: + name = "Turbo_Utilization" + desc = """ +Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = Turbo_Utilization(EV, 0) + except ZeroDivisionError: + print "Turbo_Utilization zero division" + self.val = 0 + +class Metric_Page_Walks_Use: + name = "Page_Walks_Use" + desc = """ +Fraction of cycles where the core's Page Walker is busy serving +iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = Page_Walks_Use(EV, 0) + except ZeroDivisionError: + print "Page_Walks_Use zero division" + self.val = 0 + +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + +class Metric_CLKS: + name = "CLKS" + desc = """ +Per-thread actual clocks""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = CLKS(EV, 0) + except ZeroDivisionError: + print "CLKS zero division" + self.val = 0 + +class Metric_CORE_CLKS: + name = "CORE_CLKS" + desc = """ +Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = Time(EV, 0) + except ZeroDivisionError: + print "Time zero division" + self.val = 0 + +# Schedule + + +class Setup: + def __init__(self, r): + o = dict() + n = Frontend_Bound() ; r.run(n) ; o["Frontend_Bound"] = n + n = Frontend_Latency() ; r.run(n) ; o["Frontend_Latency"] = n + n = ITLB_Misses() ; r.run(n) ; o["ITLB_Misses"] = n + n = DSB_Switches() ; r.run(n) ; o["DSB_Switches"] = n + n = LCP() ; r.run(n) ; 
o["LCP"] = n + n = MS_Switches() ; r.run(n) ; o["MS_Switches"] = n + n = Frontend_Bandwidth() ; r.run(n) ; o["Frontend_Bandwidth"] = n + n = MITE() ; r.run(n) ; o["MITE"] = n + n = DSB() ; r.run(n) ; o["DSB"] = n + n = LSD() ; r.run(n) ; o["LSD"] = n + n = Bad_Speculation() ; r.run(n) ; o["Bad_Speculation"] = n + n = Branch_Mispredicts() ; r.run(n) ; o["Branch_Mispredicts"] = n + n = Machine_Clears() ; r.run(n) ; o["Machine_Clears"] = n + n = Backend_Bound() ; r.run(n) ; o["Backend_Bound"] = n + n = Memory_Bound() ; r.run(n) ; o["Memory_Bound"] = n + n = L1_Bound() ; r.run(n) ; o["L1_Bound"] = n + n = DTLB_Load() ; r.run(n) ; o["DTLB_Load"] = n + n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n + n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n + n = L2_Bound() ; r.run(n) ; o["L2_Bound"] = n + n = L3_Bound() ; r.run(n) ; o["L3_Bound"] = n + n = Contested_Accesses() ; r.run(n) ; o["Contested_Accesses"] = n + n = Data_Sharing() ; r.run(n) ; o["Data_Sharing"] = n + n = L3_Latency() ; r.run(n) ; o["L3_Latency"] = n + n = SQ_Full() ; r.run(n) ; o["SQ_Full"] = n + n = MEM_Bound() ; r.run(n) ; o["MEM_Bound"] = n + n = MEM_Bandwidth() ; r.run(n) ; o["MEM_Bandwidth"] = n + n = MEM_Latency() ; r.run(n) ; o["MEM_Latency"] = n + n = Stores_Bound() ; r.run(n) ; o["Stores_Bound"] = n + n = Split_Stores() ; r.run(n) ; o["Split_Stores"] = n + n = DTLB_Store() ; r.run(n) ; o["DTLB_Store"] = n + n = Core_Bound() ; r.run(n) ; o["Core_Bound"] = n + n = Divider() ; r.run(n) ; o["Divider"] = n + n = Ports_Utilization() ; r.run(n) ; o["Ports_Utilization"] = n + n = G0_Ports_Utilized() ; r.run(n) ; o["G0_Ports_Utilized"] = n + n = G1_Port_Utilized() ; r.run(n) ; o["G1_Port_Utilized"] = n + n = G2_Ports_Utilized() ; r.run(n) ; o["G2_Ports_Utilized"] = n + n = G3m_Ports_Utilized() ; r.run(n) ; o["G3m_Ports_Utilized"] = n + n = Port_0() ; r.run(n) ; o["Port_0"] = n + n = Port_1() ; r.run(n) ; o["Port_1"] = n + n = Port_2() ; r.run(n) ; o["Port_2"] = n + n = Port_3() ; r.run(n) ; o["Port_3"] = n + n = Port_4() ; r.run(n) ; o["Port_4"] = n + n = Port_5() ; r.run(n) ; o["Port_5"] = n + n = Retiring() ; r.run(n) ; o["Retiring"] = n + n = Base() ; r.run(n) ; o["Base"] = n + n = Microcode_Sequencer() ; r.run(n) ; o["Microcode_Sequencer"] = n + + # parents + + o["Frontend_Latency"].parent = o["Frontend_Bound"] + o["ITLB_Misses"].parent = o["Frontend_Latency"] + o["DSB_Switches"].parent = o["Frontend_Latency"] + o["LCP"].parent = o["Frontend_Latency"] + o["MS_Switches"].parent = o["Frontend_Latency"] + o["Frontend_Bandwidth"].parent = o["Frontend_Bound"] + o["MITE"].parent = o["Frontend_Bandwidth"] + o["DSB"].parent = o["Frontend_Bandwidth"] + o["LSD"].parent = o["Frontend_Bandwidth"] + o["Branch_Mispredicts"].parent = o["Bad_Speculation"] + o["Machine_Clears"].parent = o["Bad_Speculation"] + o["Memory_Bound"].parent = o["Backend_Bound"] + o["L1_Bound"].parent = o["Memory_Bound"] + o["DTLB_Load"].parent = o["L1_Bound"] + o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["Split_Loads"].parent = o["L1_Bound"] + o["G4K_Aliasing"].parent = o["L1_Bound"] + o["L2_Bound"].parent = o["Memory_Bound"] + o["L3_Bound"].parent = o["Memory_Bound"] + o["Contested_Accesses"].parent = o["L3_Bound"] + o["Data_Sharing"].parent = o["L3_Bound"] + o["L3_Latency"].parent = o["L3_Bound"] + o["SQ_Full"].parent = o["L3_Bound"] + o["MEM_Bound"].parent = o["Memory_Bound"] + o["MEM_Bandwidth"].parent = o["MEM_Bound"] + o["MEM_Latency"].parent = o["MEM_Bound"] + o["Stores_Bound"].parent = 
o["Memory_Bound"] + o["Split_Stores"].parent = o["Stores_Bound"] + o["DTLB_Store"].parent = o["Stores_Bound"] + o["Core_Bound"].parent = o["Backend_Bound"] + o["Divider"].parent = o["Core_Bound"] + o["Ports_Utilization"].parent = o["Core_Bound"] + o["G0_Ports_Utilized"].parent = o["Ports_Utilization"] + o["G1_Port_Utilized"].parent = o["Ports_Utilization"] + o["G2_Ports_Utilized"].parent = o["Ports_Utilization"] + o["G3m_Ports_Utilized"].parent = o["Ports_Utilization"] + o["Port_0"].parent = o["G3m_Ports_Utilized"] + o["Port_1"].parent = o["G3m_Ports_Utilized"] + o["Port_2"].parent = o["G3m_Ports_Utilized"] + o["Port_3"].parent = o["G3m_Ports_Utilized"] + o["Port_4"].parent = o["G3m_Ports_Utilized"] + o["Port_5"].parent = o["G3m_Ports_Utilized"] + o["Base"].parent = o["Retiring"] + o["Microcode_Sequencer"].parent = o["Retiring"] + + # references between groups + + o["Frontend_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["Frontend_Bandwidth"].Frontend_Latency = o["Frontend_Latency"] + o["Branch_Mispredicts"].Bad_Speculation = o["Bad_Speculation"] + o["Machine_Clears"].Bad_Speculation = o["Bad_Speculation"] + o["Machine_Clears"].Branch_Mispredicts = o["Branch_Mispredicts"] + o["Backend_Bound"].Frontend_Bound = o["Frontend_Bound"] + o["Backend_Bound"].Bad_Speculation = o["Bad_Speculation"] + o["Backend_Bound"].Retiring = o["Retiring"] + o["L1_Bound"].DTLB_Load = o["DTLB_Load"] + o["Stores_Bound"].Memory_Bound = o["Memory_Bound"] + o["Core_Bound"].Memory_Bound = o["Memory_Bound"] + o["Ports_Utilization"].Core_Bound = o["Core_Bound"] + o["Ports_Utilization"].Divider = o["Divider"] + o["Retiring"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Base"].Retiring = o["Retiring"] + o["Base"].Microcode_Sequencer = o["Microcode_Sequencer"] + + # siblings cross-tree + + o["Frontend_Bound"].sibling = None + o["Frontend_Latency"].sibling = None + o["ITLB_Misses"].sibling = None + o["DSB_Switches"].sibling = None + o["LCP"].sibling = None + o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["Frontend_Bandwidth"].sibling = None + o["MITE"].sibling = None + o["DSB"].sibling = None + o["LSD"].sibling = None + o["Bad_Speculation"].sibling = None + o["Branch_Mispredicts"].sibling = None + o["Machine_Clears"].sibling = None + o["Backend_Bound"].sibling = None + o["Memory_Bound"].sibling = None + o["L1_Bound"].sibling = o["G1_Port_Utilized"] + o["DTLB_Load"].sibling = None + o["Store_Fwd_Blk"].sibling = None + o["Split_Loads"].sibling = None + o["G4K_Aliasing"].sibling = None + o["L2_Bound"].sibling = None + o["L3_Bound"].sibling = None + o["Contested_Accesses"].sibling = None + o["Data_Sharing"].sibling = None + o["L3_Latency"].sibling = None + o["SQ_Full"].sibling = None + o["MEM_Bound"].sibling = None + o["MEM_Bandwidth"].sibling = None + o["MEM_Latency"].sibling = None + o["Stores_Bound"].sibling = None + o["Split_Stores"].sibling = o["Port_4"] + o["DTLB_Store"].sibling = None + o["Core_Bound"].sibling = None + o["Divider"].sibling = None + o["Ports_Utilization"].sibling = None + o["G0_Ports_Utilized"].sibling = None + o["G1_Port_Utilized"].sibling = o["L1_Bound"] + o["G2_Ports_Utilized"].sibling = None + o["G3m_Ports_Utilized"].sibling = None + o["Port_0"].sibling = None + o["Port_1"].sibling = None + o["Port_2"].sibling = None + o["Port_3"].sibling = None + o["Port_4"].sibling = o["Split_Stores"] + o["Port_5"].sibling = None + o["Retiring"].sibling = None + o["Base"].sibling = None + o["Microcode_Sequencer"].sibling = o["MS_Switches"] + + # sampling events + + o["Frontend_Bound"].sample = 
[] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["DSB_Switches"].sample = [] + o["LCP"].sample = [] + o["MS_Switches"].sample = [] + o["Frontend_Bandwidth"].sample = [] + o["MITE"].sample = [] + o["DSB"].sample = [] + o["LSD"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] + o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] + o["Backend_Bound"].sample = [] + o["Memory_Bound"].sample = [] + o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] + o["Store_Fwd_Blk"].sample = [] + o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] + o["G4K_Aliasing"].sample = [] + o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] + o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp'] + o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + o["SQ_Full"].sample = [] + o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] + o["MEM_Bandwidth"].sample = [] + o["MEM_Latency"].sample = [] + o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] + o["Core_Bound"].sample = [] + o["Divider"].sample = ['ARITH.FPU_DIV_ACTIVE'] + o["Ports_Utilization"].sample = [] + o["G0_Ports_Utilized"].sample = [] + o["G1_Port_Utilized"].sample = [] + o["G2_Ports_Utilized"].sample = [] + o["G3m_Ports_Utilized"].sample = [] + o["Port_0"].sample = [] + o["Port_1"].sample = [] + o["Port_2"].sample = [] + o["Port_3"].sample = [] + o["Port_4"].sample = [] + o["Port_5"].sample = [] + o["Retiring"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] + + # user visible metrics + + n = Metric_IPC() ; r.metric(n) + n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) + n = Metric_UPI() ; r.metric(n) + n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) + n = Metric_DSB_Coverage() ; r.metric(n) + n = Metric_ILP() ; r.metric(n) + n = Metric_MLP() ; r.metric(n) + n = Metric_Load_Miss_Real_Latency() ; r.metric(n) + n = Metric_Turbo_Utilization() ; r.metric(n) + n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) + n = Metric_CLKS() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/cpumap.sh b/cpumap.sh index e07bb4f3..5ecdc115 100644 --- a/cpumap.sh +++ b/cpumap.sh @@ -12,3 +12,5 @@ cpus[ivb]=GenuineIntel-6-3A cpus[hsw]=GenuineIntel-6-45 cpus[slm]=GenuineIntel-6-37 cpus[bnl]=GenuineIntel-6-35 +cpus[bdw]=GenuineIntel-6-3D +cpus[hsx]=GenuineIntel-6-3F diff --git a/hsw_client_ratios.py b/hsw_client_ratios.py index 0ae53359..f9f9c594 100644 --- a/hsw_client_ratios.py +++ b/hsw_client_ratios.py @@ -1,70 +1,190 @@ # -# auto generated TopDown description for Intel 4th gen Core (code named Haswell) +# auto generated TopDown 2.9 description for Intel 4rd gen Core (code named Haswell) # Please see http://ark.intel.com for more details on these CPUs. 
# +# References: +# http://halobates.de/blog/p/262 +# https://sites.google.com/site/analysismethods/yasin-pubs +# +smt_enabled = False # Constants -PipelineWidth = 4 -MEM_L3_WEIGHT = 7 -MEM_STLB_HIT_COST = 7 -MEM_SFB_COST = 13 -MEM_4KALIAS_COST = 7 -MEM_XSNP_HITM_COST = 60 -MEM_XSNP_HIT_COST = 43 -MEM_XSNP_NONE_COST = 29 -MS_SWITCHES_COST = 3 +Pipeline_Width = 4 +L2_Store_Latency = 9 +Mem_L3_Weight = 7 +Mem_STLB_Hit_Cost = 7 +Mem_SFB_Cost = 13 +Mem_4K_Alias_Cost = 7 +Mem_XSNP_HitM_Cost = 60 +MEM_XSNP_Hit_Cost = 43 +MEM_XSNP_None_Cost = 29 +Mem_Local_DRAM_Cost = 200 +Mem_Remote_DRAM_Cost = 310 +Mem_Remote_HitM_Cost = 200 +Mem_Remote_Fwd_Cost = 180 +MS_Switches_Cost = 2 +OneMillion = 1000000 +Energy_Unit = 61 # Aux. formulas -def CLKS(EV, level): - return EV("CPU_CLK_UNHALTED.THREAD", level) -def FewUopsExecutedThreshold(EV, level): - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level); EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level) > 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) -def BackendBoundAtEXE_stalls(EV, level): - return ( EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOPS_EXEC", level) - FewUopsExecutedThreshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level) ) -def BackendBoundAtEXE(EV, level): - return BackendBoundAtEXE_stalls(EV, level) / CLKS(EV, level) -def MemL3HitFraction(EV, level): - return EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) + MEM_L3_WEIGHT * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level) ) -def MispredClearsFraction(EV, level): - return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level) ) -def AvgRsEmptyPeriodClears(EV, level): - return ( EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ICACHE.IFETCH_STALL", level))/ EV("RS_EVENTS.EMPTY_END", level) -def RetireUopFraction(EV, level): + +def Recovery_Cycles(EV, level): + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + +def Execute_Cycles(EV, level): + EV("UOPS_EXECUTED.CORE:c1", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CORE:c1", level) + +def L1D_Miss_Cycles(EV, level): + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + +def ITLB_Miss_Cycles(EV, level): + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + +def Cycles_0_Ports_Utilized(EV, level): + return STALLS_TOTAL(EV, level) + +def Cycles_1_Port_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c1", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level)) + +def Cycles_2_Ports_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c3", level) + return (EV("UOPS_EXECUTED.CORE:c2", level) - 
EV("UOPS_EXECUTED.CORE:c3", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level)) + +def Cycles_3m_Ports_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c3", level) + return (EV("UOPS_EXECUTED.CORE:c3", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CORE:c3", level) + +def STALLS_MEM_ANY(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", level)) , level ) + +def STALLS_TOTAL(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level)) , level ) + +def ORO_Demand_DRD_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + +def ORO_Demand_DRD_C6(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Cycles_False_Sharing_Client(EV, level): + return Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", level) + EV("OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE", level)) + +def Few_Uops_Executed_Threshold(EV, level): + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c3", level) + return EV("UOPS_EXECUTED.CORE:c3", level) if(IPC(EV, level)> 1.25)else EV("UOPS_EXECUTED.CORE:c2", level) + +def Backend_Bound_At_EXE(EV, level): + return (STALLS_TOTAL(EV, level) + EV("UOPS_EXECUTED.CORE:c1", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level)) / CLKS(EV, level) + +def Mem_L3_Hit_Fraction(EV, level): + return EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level)) + +def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + +def Mispred_Clears_Fraction(EV, level): + return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) + +def Retire_Uop_Fraction(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + def SLOTS(EV, level): - return PipelineWidth * CLKS(EV, level) -# Instructions Per Cycle + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) def IPC(EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) +def CPI(EV, level): + return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + # Uops Per Instruction def UPI(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + # Instruction per taken branch -def InstPerTakenBranch(EV, level): +def IPTB(EV, level): return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. 
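To make the two taken-branch ratios concrete (IPTB above, BPTB defined next), here is a small worked example with made-up counter values; IPTB is the average number of instructions between taken branches, and BPTB is how many branch instructions that span contains.

# Made-up counts, for illustration only.
inst_retired   = 1000000.0   # INST_RETIRED.ANY
all_branches   =  200000.0   # BR_INST_RETIRED.ALL_BRANCHES
taken_branches =   50000.0   # BR_INST_RETIRED.NEAR_TAKEN

iptb = inst_retired / taken_branches   # 20.0 instructions per taken branch
bptb = all_branches / taken_branches   # 4.0 branches per taken branch, i.e.
                                       # roughly 3 of every 4 branches fall through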
+def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + # Fraction of Uops delivered by the DSB (decoded instructions cache) -def DSBCoverage(EV, level): - return ( EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level))/(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level) ) -# Memory-Level-Parallelism (avg L1 miss demand load when there is at least 1 such miss) +def DSB_Coverage(EV, level): + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + +# Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed) +def ILP(EV, level): + EV("UOPS_EXECUTED.CORE", level) + return (EV("UOPS_EXECUTED.CORE", level) / 2 / Execute_Cycles(EV, level)) if smt_enabled else EV("UOPS_EXECUTED.CORE", level) / Execute_Cycles(EV, level) + +# Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) def MLP(EV, level): - return EV("L1D_PEND_MISS.PENDING", level) / EV("L1D_PEND_MISS.PENDING_CYCLES", level) -# Average L1 miss demand load latency -def L1dMissLatency(EV, level): - return EV("L1D_PEND_MISS.PENDING", level) / EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + +# Actual Average Latency for L1 data-cache miss demand loads +def Load_Miss_Real_Latency(EV, level): + return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level)) + # Average Frequency Utilization relative nominal frequency -def TurboUtilization(EV, level): +def Turbo_Utilization(EV, level): return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) +# Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store +def Page_Walks_Use(EV, level): + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + +# Per-thread actual clocks +def CLKS(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD", level) + +# Core actual clocks +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, level): + return DurationTimeInSeconds(EV, level) + # Event groups -class FrontendBound: - name = "FrontendBound" +class Frontend_Bound: + name = "Frontend_Bound" domain = "Slots" area = "FE" desc = """ @@ -77,108 +197,72 @@ class FrontendBound: latter can accept them. 
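The MLP and Load_Miss_Real_Latency helpers above use the same occupancy counter in two ways: dividing L1D_PEND_MISS.PENDING by miss-pending cycles gives the average number of L1 misses in flight, while dividing it by the number of completed misses (L1 misses plus line-fill-buffer hits) gives an average miss latency in cycles. A toy example with made-up counts:

# Made-up counts, for illustration only (single thread, so the plain
# PENDING_CYCLES count applies, as in L1D_Miss_Cycles with smt_enabled False).
pending        = 3.0e7   # L1D_PEND_MISS.PENDING
pending_cycles = 1.0e7   # L1D_PEND_MISS.PENDING_CYCLES
l1_miss        = 8.0e5   # MEM_LOAD_UOPS_RETIRED.L1_MISS
hit_lfb        = 2.0e5   # MEM_LOAD_UOPS_RETIRED.HIT_LFB

mlp         = pending / pending_cycles        # 3.0 misses in flight on average
avg_latency = pending / (l1_miss + hit_lfb)   # 30.0 cycles per completed miss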
For example, stalls due to instruction-cache misses would be categorized under Frontend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1)/ SLOTS(EV, 1 ) + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "FrontendBound zero division" + #print "Frontend_Bound zero division" self.val = 0 self.thresh = False return self.val -class FrontendLatency: - name = "Frontend Latency" +class Frontend_Latency: + name = "Frontend_Latency" domain = "Slots" area = "FE" desc = """ This metric represents slots fraction CPU was stalled due to Frontend latency issues. For example, instruction-cache misses, iTLB misses or fetch stalls -after a branch missprediction are categorized under Frontend Latency. In such +after a branch misprediction are categorized under Frontend Latency. In such cases the Frontend eventually delivers no uops for some period.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = PipelineWidth * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2)/ SLOTS(EV, 2 ) + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: - #print "FrontendLatency zero division" - self.val = 0 - self.thresh = False - return self.val - -class ICacheMisses: - name = "ICache Misses" - domain = "Clocks" - area = "FE" - desc = """ -This metric represents cycles fraction CPU was stalled due to instruction -cache misses. Using compiler's Profile-Guided Optimization (PGO) can reduce -i-cache misses through improved hot code layout.""" - level = 3 - def compute(self, EV): - try: - self.val = ( EV("ICACHE.IFETCH_STALL", 3)- EV("ITLB_MISSES.WALK_DURATION", 3)) / CLKS(EV, 3 ) - self.thresh = (self.val > 0.05) and self.parent.thresh - except ZeroDivisionError: - #print "ICacheMisses zero division" + #print "Frontend_Latency zero division" self.val = 0 self.thresh = False return self.val -class ITLBmisses: - name = "ITLB misses" +class ITLB_Misses: + name = "ITLB_Misses" domain = "Clocks" area = "FE" desc = """ This metric represents cycles fraction CPU was stalled due to instruction TLB misses. Using large code pages may be considered here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ITLB_MISSES.WALK_DURATION", 3)/ CLKS(EV, 3 ) + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ITLBmisses zero division" + #print "ITLB_Misses zero division" self.val = 0 self.thresh = False return self.val -class BranchResteers: - name = "Branch Resteers" - domain = "Clocks" - area = "FE" - desc = """ -This metric represents cycles fraction CPU was stalled due to Branch Resteers. -Following all sorts of miss-predicted branches, this measure the delays of -fetch instructions from corrected path caused by the Frontend of the machine. 
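Each node's compute() above is driven by a single EV callback: given an event name (or a lambda for derived min/max events) and the node's nesting level, it returns a count, and the top-level formulas then divide by SLOTS. The dict-backed stand-in below is only a sketch for trying a formula by hand; toplev's real EV is wired to perf output, and the helper name and numbers here are assumptions, not part of this patch.

# Hypothetical stand-in for the EV callback the compute() methods expect.
def make_ev(counts):
    def EV(name, level):
        if callable(name):             # derived events arrive as lambdas
            return name(EV, level)
        return float(counts[name])     # plain counter lookup
    return EV

ev = make_ev({
    "IDQ_UOPS_NOT_DELIVERED.CORE": 1.0e9,
    "CPU_CLK_UNHALTED.THREAD": 2.0e9,
    "CPU_CLK_UNHALTED.THREAD:amt1": 2.0e9,  # touched by CORE_CLKS even with SMT off
})
# With smt_enabled False, SLOTS = Pipeline_Width * CLKS = 4 * 2e9 = 8e9 slots,
# so Frontend_Bound comes out as 1e9 / 8e9 = 0.125 for these numbers.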
-For example, branchy code with lots of (taken) branches and/or branch miss- -predictions might get categorized under Branch Resteers.""" - level = 3 - def compute(self, EV): - try: - self.val = ( EV("BR_MISP_RETIRED.ALL_BRANCHES", 3)+ EV("MACHINE_CLEARS.COUNT", 3)+ EV("BACLEARS.ANY", 3)) * AvgRsEmptyPeriodClears(EV, 3)/ CLKS(EV, 3 ) - self.thresh = (self.val > 0.05) and self.parent.thresh - except ZeroDivisionError: - #print "BranchResteers zero division" - self.val = 0 - self.thresh = False - return self.val - -class DSBswitches: - name = "DSB switches" +class DSB_Switches: + name = "DSB_Switches" domain = "Clocks" area = "FE" desc = """ This metric represents cycles fraction CPU was stalled due to switches from DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3)/ CLKS(EV, 3 ) + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DSBswitches zero division" + #print "DSB_Switches zero division" self.val = 0 self.thresh = False return self.val @@ -192,9 +276,10 @@ class LCP: Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ILD_STALL.LCP", 3)/ CLKS(EV, 3 ) + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "LCP zero division" @@ -202,8 +287,8 @@ def compute(self, EV): self.thresh = False return self.val -class MSswitches: - name = "MS switches" +class MS_Switches: + name = "MS_Switches" domain = "Clocks" area = "FE" desc = """ @@ -213,18 +298,19 @@ class MSswitches: deliver long uop flows required by CISC instructions like CPUID, or uncommon conditions like Floating Point Assists when dealing with Denormals.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = MS_SWITCHES_COST * EV("IDQ.MS_SWITCHES", 3)/ CLKS(EV, 3 ) + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MSswitches zero division" + #print "MS_Switches zero division" self.val = 0 self.thresh = False return self.val -class FrontendBandwidth: - name = "Frontend Bandwidth" +class Frontend_Bandwidth: + name = "Frontend_Bandwidth" domain = "Slots" area = "FE" desc = """ @@ -234,28 +320,30 @@ class FrontendBandwidth: under Frontend Bandwidth. In such cases, the Frontend typically delivers non- optimal amount of uops to the Backend.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.FrontendBound.compute(EV)- self.FrontendLatency.compute(EV ) + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: - #print "FrontendBandwidth zero division" + #print "Frontend_Bandwidth zero division" self.val = 0 self.thresh = False return self.val class MITE: name = "MITE" - domain = "Clocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -the MITE fetch pipeline. 
For example, inefficiencies in the instruction -decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CLKS(EV, 3 ) + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MITE zero division" @@ -265,17 +353,18 @@ def compute(self, EV): class DSB: name = "DSB" - domain = "Clocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -DSB (decoded uop cache) fetch pipeline. For example, inefficient utlilization -of the DSB cache structure or bank conflict when reading from it, are -categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CLKS(EV, 3 ) + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: #print "DSB zero division" @@ -283,8 +372,30 @@ def compute(self, EV): self.thresh = False return self.val -class BadSpeculation: - name = "BadSpeculation" +class LSD: + name = "LSD" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "LSD zero division" + self.val = 0 + self.thresh = False + return self.val + +class Bad_Speculation: + name = "Bad_Speculation" domain = "Slots" area = "BAD" desc = """ @@ -294,38 +405,40 @@ class BadSpeculation: speculation. For example, wasted work due to miss-predicted branches are categorized under Bad Speculation category""" level = 1 + htoff = False def compute(self, EV): try: - self.val = ( EV("UOPS_ISSUED.ANY", 1)- EV("UOPS_RETIRED.RETIRE_SLOTS", 1)+ PipelineWidth * EV("INT_MISC.RECOVERY_CYCLES", 1)) / SLOTS(EV, 1 ) + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: - #print "BadSpeculation zero division" + #print "Bad_Speculation zero division" self.val = 0 self.thresh = False return self.val -class BranchMispredicts: - name = "Branch Mispredicts" +class Branch_Mispredicts: + name = "Branch_Mispredicts" domain = "Slots" area = "BAD" desc = """ This metric represents slots fraction CPU was impacted by Branch -Missprediction. 
These slots are either wasted by uops fetched from an +Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path, or stalls the Backend of the machine needs to recover its state from a speculative path.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = MispredClearsFraction(EV, 2)* self.BadSpeculation.compute(EV ) + self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "BranchMispredicts zero division" + #print "Branch_Mispredicts zero division" self.val = 0 self.thresh = False return self.val -class MachineClears: - name = "Machine Clears" +class Machine_Clears: + name = "Machine_Clears" domain = "Slots" area = "BAD" desc = """ @@ -335,12 +448,13 @@ class MachineClears: example, this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.BadSpeculation.compute(EV)- self.BranchMispredicts.compute(EV ) + self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MachineClears zero division" + #print "Machine_Clears zero division" self.val = 0 self.thresh = False return self.val @@ -358,9 +472,10 @@ class Backend_Bound: stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = 1 -(self.FrontendBound.compute(EV)+ self.BadSpeculation.compute(EV)+ self.Retiring.compute(EV)) + self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Backend_Bound zero division" @@ -368,8 +483,8 @@ def compute(self, EV): self.thresh = False return self.val -class MemoryBound: - name = "MemoryBound" +class Memory_Bound: + name = "Memory_Bound" domain = "Clocks" area = "BE/Mem" desc = """ @@ -379,18 +494,19 @@ class MemoryBound: memory demand loads which coincides with execution starvation. 
in addition to less common cases where stores could imply backpressure on the pipeline.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = ( EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", 2)+ EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) + self.val = (STALLS_MEM_ANY(EV, 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "MemoryBound zero division" + #print "Memory_Bound zero division" self.val = 0 self.thresh = False return self.val -class L1Bound: - name = "L1 Bound" +class L1_Bound: + name = "L1_Bound" domain = "Clocks" area = "BE/Mem" desc = """ @@ -401,82 +517,87 @@ class L1Bound: allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event as it accounts for any non-completed load.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", 3)- EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) - self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Overhead.thresh + self.val = (STALLS_MEM_ANY(EV, 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) + self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh except ZeroDivisionError: - #print "L1Bound zero division" + #print "L1_Bound zero division" self.val = 0 self.thresh = False return self.val -class DTLB_Overhead: - name = "DTLB_Overhead" +class DTLB_Load: + name = "DTLB_Load" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( MEM_STLB_HIT_COST * EV("DTLB_LOAD_MISSES.STLB_HIT", 4)+ EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Overhead zero division" + #print "DTLB_Load zero division" self.val = 0 self.thresh = False return self.val -class LoadsBlockedbyStoreForwarding: - name = "Loads Blocked by Store Forwarding" +class Store_Fwd_Blk: + name = "Store_Fwd_Blk" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_SFB_COST * EV("LD_BLOCKS.STORE_FORWARD", 4)/ CLKS(EV, 4 ) + self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "LoadsBlockedbyStoreForwarding zero division" + #print "Store_Fwd_Blk zero division" self.val = 0 self.thresh = False return self.val -class SplitLoads: - name = "Split Loads" +class Split_Loads: + name = "Split_Loads" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = L1dMissLatency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4)/ CLKS(EV, 4 ) + self.val = Load_Miss_Real_Latency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "SplitLoads zero division" + #print "Split_Loads zero division" self.val = 0 self.thresh = False return self.val -class G4KAliasing: - name = "4K Aliasing" +class G4K_Aliasing: + name = "4K_Aliasing" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_4KALIAS_COST * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4)/ CLKS(EV, 4 ) + self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) self.thresh = 
self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "G4KAliasing zero division" + #print "G4K_Aliasing zero division" self.val = 0 self.thresh = False return self.val -class L2Bound: - name = "L2 Bound" +class L2_Bound: + name = "L2_Bound" domain = "Clocks" area = "BE/Mem" desc = """ @@ -484,18 +605,19 @@ class L2Bound: misses (i.e. L1 misses/L2 hits) will improve the latency and increase performance.""" level = 3 + htoff = True def compute(self, EV): try: - self.val = ( EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)- EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) + self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.03) and self.parent.thresh except ZeroDivisionError: - #print "L2Bound zero division" + #print "L2_Bound zero division" self.val = 0 self.thresh = False return self.val -class L3Bound: - name = "L3 Bound" +class L3_Bound: + name = "L3_Bound" domain = "Clocks" area = "BE/Mem" desc = """ @@ -503,88 +625,113 @@ class L3Bound: a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve the latency and increase performance.""" level = 3 + htoff = True def compute(self, EV): try: - self.val = MemL3HitFraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3Bound zero division" + #print "L3_Bound zero division" self.val = 0 self.thresh = False return self.val -class ContestedAccesses: - name = "Contested Accesses" +class Contested_Accesses: + name = "Contested_Accesses" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_HITM_COST *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", 4)+ EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) + self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "ContestedAccesses zero division" + #print "Contested_Accesses zero division" self.val = 0 self.thresh = False return self.val -class DataSharing: - name = "Data Sharing" +class Data_Sharing: + name = "Data_Sharing" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_HIT_COST * EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "DataSharing zero division" + #print "Data_Sharing zero division" self.val = 0 self.thresh = False return self.val -class L3Latency: - name = "L3 Latency" +class L3_Latency: + name = "L3_Latency" domain = "Clocks" area = "BE/Mem" desc = """ This metric is a rough aggregate estimate of cycles fraction where CPU accessed L3 cache for all load requests, while there was no contention/sharing -with a sibiling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will +with a sibling core. Avoiding cache misses (i.e. 
L2 misses/L3 hits) will improve the latency and increase performance.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_NONE_COST * EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3Latency zero division" + #print "L3_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + +class SQ_Full: + name = "SQ_Full" + domain = "CoreClocks" + area = "BE/Mem" + desc = """ +This metric measures fraction of cycles where the Super Queue (SQ) was full +taking into account all request-types and both hardware SMT threads. The Super +Queue is used for requests to access the L2 cache or to go out to the Uncore.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "SQ_Full zero division" self.val = 0 self.thresh = False return self.val -class DRAMBound: - name = "DRAM Bound" +class MEM_Bound: + name = "MEM_Bound" domain = "Clocks" area = "BE/Mem" desc = """ This metric represents how often CPU was stalled on main memory (DRAM). Caching will improve the latency and increase performance.""" level = 3 + htoff = True def compute(self, EV): try: - self.val = ( 1 - MemL3HitFraction(EV, 3)) * EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "DRAMBound zero division" + #print "MEM_Bound zero division" self.val = 0 self.thresh = False return self.val -class MEMBandwidth: - name = "MEM Bandwidth" +class MEM_Bandwidth: + name = "MEM_Bandwidth" domain = "Clocks" area = "BE/Mem" desc = """ @@ -592,91 +739,100 @@ class MEMBandwidth: bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:cmask=6", 4)/ CLKS(EV, 4 ) + self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEMBandwidth zero division" + #print "MEM_Bandwidth zero division" self.val = 0 self.thresh = False return self.val -class MEMLatency: - name = "MEM Latency" +class MEM_Latency: + name = "MEM_Latency" domain = "Clocks" area = "BE/Mem" desc = """ This metric represents how often CPU was likely stalled due to latency from -main memory (DRAM). Data layout restructing or using Software Prefetches +main memory (DRAM). 
Data layout re-structuring or using Software Prefetches (also through the compiler) may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", 4)- EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:cmask=6", 4)) / CLKS(EV, 4 ) + self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEMLatency zero division" + #print "MEM_Latency zero division" self.val = 0 self.thresh = False return self.val -class StoresBound: - name = "Stores Bound" +class Stores_Bound: + name = "Stores_Bound" domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents how often CPU was stalled due to store operations. +even though memory store accesses do not typically stall out-of-order CPUs; +there are few cases where stores can lead to actual stalls. This metric will +be flagged should any of these cases be a bottleneck.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.MemoryBound.compute(EV)-(EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", 3)/ CLKS(EV, 3)) + self.val = self.Memory_Bound.compute(EV) - STALLS_MEM_ANY(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "StoresBound zero division" + #print "Stores_Bound zero division" self.val = 0 self.thresh = False return self.val -class FalseSharing: - name = "False Sharing" +class False_Sharing: + name = "False_Sharing" domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents how often CPU was stalled due to False Sharing. False +Sharing is a multithreading hiccup, where multiple threads contend on +different data-elements mapped into the same cache line. It can be easily +avoided by padding to make threads access different lines.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_HITM_COST *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", 4)+ EV("OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE", 4)) / CLKS(EV, 4 ) + self.val = Cycles_False_Sharing_Client(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "FalseSharing zero division" + #print "False_Sharing zero division" self.val = 0 self.thresh = False return self.val -class SplitStores: - name = "Split Stores" - domain = "Stores" +class Split_Stores: + name = "Split_Stores" + domain = "CoreClocks" area = "BE/Mem" desc = """ This metric represents rate of split store accesses. 
Consider aligning your data to the 64-byte cache line granularity.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4)/ EV("MEM_UOPS_RETIRED.ALL_STORES", 4 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "SplitStores zero division" + #print "Split_Stores zero division" self.val = 0 self.thresh = False return self.val -class DTLBStoreOverhead: - name = "DTLB Store Overhead" +class DTLB_Store: + name = "DTLB_Store" domain = "Clocks" area = "BE/Mem" desc = """ @@ -687,33 +843,164 @@ class DTLBStoreOverhead: the same page. Try using larger page sizes for large amounts of frequently- used data.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( MEM_STLB_HIT_COST * EV("DTLB_STORE_MISSES.STLB_HIT", 4)+ EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DTLBStoreOverhead zero division" + #print "DTLB_Store zero division" self.val = 0 self.thresh = False return self.val -class CoreBound: - name = "CoreBound" +class Core_Bound: + name = "Core_Bound" domain = "Clocks" area = "BE/Core" desc = """ -This metric represents how much Core non-memory issues were a bottleneck. -This may indicate that we ran out of OOO resources or are saturating certain -execution units (e.g. the use of FP-chained long-latency arithmetic -operations) which can limit performance. Tip: consider Port Saturation -analysis as next step""" +This metric represents how much Core non-memory issues were of a bottleneck. +Shortage in hardware compute resources, or dependencies software's +instructions are both categorized under Core Bound. Hence it may indicate the +machine ran out of an OOO resources, certain execution units are overloaded or +dependencies in program's data- or instruction-flow are limiting the +performance (e.g. FP-chained long-latency arithmetic operations). Tip: +consider Port Saturation analysis as next step.""" level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "Core_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Divider: + name = "Divider" + domain = "CoreClocks" + area = "BE/Core" + desc = "" + level = 3 + htoff = False def compute(self, EV): try: - self.val = BackendBoundAtEXE(EV, 2)- self.MemoryBound.compute(EV ) + self.val = 10 * EV("ARITH.DIVIDER_UOPS", 3) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "CoreBound zero division" + #print "Divider zero division" + self.val = 0 + self.thresh = False + return self.val + +class Ports_Utilization: + name = "Ports_Utilization" + domain = "Clocks" + area = "BE/Core" + desc = """ +This metric represents cycles fraction application was stalled due to Core +computation issues (non divider-related). For example, heavy data-dependency +between nearby instructions will manifest in this category. Ditto if +instruction-mix used by the application overloads specific hardware execution +unit. 
Hint: Loop Vectorization -most compilers feature auto-Vectorization +options today- reduces pressure on the execution ports as multiple elements +are calculated with same uop.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "Ports_Utilization zero division" + self.val = 0 + self.thresh = False + return self.val + +class G0_Ports_Utilized: + name = "0_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed no uops on any +execution port.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G0_Ports_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G1_Port_Utilized: + name = "1_Port_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction where the CPU executed total of 1 +uop per cycle on all execution ports. This can be due to heavy data-dependency +among software instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and L1_Bound, this +metric can point to L1 data-cache latency bottleneck that may not necessarily +manifest with complete execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be helpful. Tip: consider +'Core Ports Saturation' analysis-type as next step.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G1_Port_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G2_Ports_Utilized: + name = "2_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed total of 2 uops per +cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- +type as next step. Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the execution ports as +multiple elements are calculated with same uop.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G2_Ports_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G3m_Ports_Utilized: + name = "3m_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed total of 3 or more +uops per cycle on all execution ports. 
Tip: consider 'Core Port Saturation' +analysis-type as next step""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G3m_Ports_Utilized zero division" self.val = 0 self.thresh = False return self.val @@ -735,40 +1022,43 @@ class Retiring: computations be done without significantly increasing number of instructions thus improving the performance.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1)/ SLOTS(EV, 1 ) - self.thresh = (self.val > 0.7) | self.MicroSequencer.thresh + self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: #print "Retiring zero division" self.val = 0 self.thresh = False return self.val -class BASE: - name = "BASE" +class Base: + name = "Base" domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops not originated -from the microcode-sequencer. This correlates with total number of +This metric represents slots fraction where the CPU was retiring uops not +originated from the microcode-sequencer. This correlates with total number of instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. A high Retiring value for non-vectorized code is typically a good -hint for programmer to pursue vectorizing his code, which can reduce -instructions hence this bucket.""" +expected. While this is the most desirable of the top 4 categories, high +values may still indicate areas for improvement. If possible focus on +techniques that reduce instruction count or result in more efficient +instructions generation such as vectorization.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Retiring.compute(EV)- self.MicroSequencer.compute(EV ) + self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: - #print "BASE zero division" + #print "Base zero division" self.val = 0 self.thresh = False return self.val -class MicroSequencer: - name = "MicroSequencer" +class Microcode_Sequencer: + name = "Microcode_Sequencer" domain = "Slots" area = "RET" desc = """ @@ -777,12 +1067,13 @@ class MicroSequencer: decoded by the default decoders (like repeat move strings), or by microcode assists used to address some operation modes (like in Floating Point assists).""" level = 2 + htoff = False def compute(self, EV): try: - self.val = RetireUopFraction(EV, 2)* EV("IDQ.MS_UOPS", 2)/ SLOTS(EV, 2 ) + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: - #print "MicroSequencer zero division" + #print "Microcode_Sequencer zero division" self.val = 0 self.thresh = False return self.val @@ -790,7 +1081,9 @@ def compute(self, EV): class Metric_IPC: name = "IPC" desc = """ -Instructions Per Cycle""" +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 def compute(self, EV): try: @@ -799,10 +1092,40 @@ def compute(self, EV): print "IPC zero division" self.val = 0 +class Metric_CPI: + name = "CPI" + desc = """ +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = CPI(EV, 0) + except ZeroDivisionError: + print "CPI zero division" + 
self.val = 0 + +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + class Metric_UPI: name = "UPI" desc = """ Uops Per Instruction""" + domain = "Metric" + maxval = 2 def compute(self, EV): try: @@ -811,35 +1134,71 @@ def compute(self, EV): print "UPI zero division" self.val = 0 -class Metric_InstPerTakenBranch: - name = "InstPerTakenBranch" +class Metric_IPTB: + name = "IPTB" desc = """ Instruction per taken branch""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = IPTB(EV, 0) + except ZeroDivisionError: + print "IPTB zero division" + self.val = 0 + +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: - self.val = InstPerTakenBranch(EV, 0) + self.val = BPTB(EV, 0) except ZeroDivisionError: - print "InstPerTakenBranch zero division" + print "BPTB zero division" self.val = 0 -class Metric_DSBCoverage: - name = "DSBCoverage" +class Metric_DSB_Coverage: + name = "DSB_Coverage" desc = """ Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 + + def compute(self, EV): + try: + self.val = DSB_Coverage(EV, 0) + except ZeroDivisionError: + print "DSB_Coverage zero division" + self.val = 0 + +class Metric_ILP: + name = "ILP" + desc = """ +Instruction-Level-Parallelism (average number of uops executed when there is +at least 1 uop executed)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: - self.val = DSBCoverage(EV, 0) + self.val = ILP(EV, 0) except ZeroDivisionError: - print "DSBCoverage zero division" + print "ILP zero division" self.val = 0 class Metric_MLP: name = "MLP" desc = """ -Memory-Level-Parallelism (avg L1 miss demand load when there is at least 1 -such miss)""" +Memory-Level-Parallelism (average number of L1 miss demand load when there is +at least 1 such miss)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -848,28 +1207,103 @@ def compute(self, EV): print "MLP zero division" self.val = 0 -class Metric_L1dMissLatency: - name = "L1dMissLatency" +class Metric_Load_Miss_Real_Latency: + name = "Load_Miss_Real_Latency" desc = """ -Average L1 miss demand load latency""" +Actual Average Latency for L1 data-cache miss demand loads""" + domain = "Metric" + maxval = 1000 def compute(self, EV): try: - self.val = L1dMissLatency(EV, 0) + self.val = Load_Miss_Real_Latency(EV, 0) except ZeroDivisionError: - print "L1dMissLatency zero division" + print "Load_Miss_Real_Latency zero division" self.val = 0 -class Metric_TurboUtilization: - name = "TurboUtilization" +class Metric_Turbo_Utilization: + name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = Turbo_Utilization(EV, 0) + except ZeroDivisionError: + print "Turbo_Utilization zero division" + self.val = 0 + +class Metric_Page_Walks_Use: + name = "Page_Walks_Use" + desc = """ +Fraction of cycles where the core's Page Walker is busy serving +iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = Page_Walks_Use(EV, 0) + except ZeroDivisionError: + print "Page_Walks_Use zero division" + 
self.val = 0 + +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + +class Metric_CLKS: + name = "CLKS" + desc = """ +Per-thread actual clocks""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = CLKS(EV, 0) + except ZeroDivisionError: + print "CLKS zero division" + self.val = 0 + +class Metric_CORE_CLKS: + name = "CORE_CLKS" + desc = """ +Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 def compute(self, EV): try: - self.val = TurboUtilization(EV, 0) + self.val = Time(EV, 0) except ZeroDivisionError: - print "TurboUtilization zero division" + print "Time zero division" self.val = 0 # Schedule @@ -878,183 +1312,218 @@ def compute(self, EV): class Setup: def __init__(self, r): o = dict() - n = FrontendBound() ; r.run(n) ; o["FrontendBound"] = n - n = FrontendLatency() ; r.run(n) ; o["FrontendLatency"] = n - n = ICacheMisses() ; r.run(n) ; o["ICacheMisses"] = n - n = ITLBmisses() ; r.run(n) ; o["ITLBmisses"] = n - n = BranchResteers() ; r.run(n) ; o["BranchResteers"] = n - n = DSBswitches() ; r.run(n) ; o["DSBswitches"] = n + n = Frontend_Bound() ; r.run(n) ; o["Frontend_Bound"] = n + n = Frontend_Latency() ; r.run(n) ; o["Frontend_Latency"] = n + n = ITLB_Misses() ; r.run(n) ; o["ITLB_Misses"] = n + n = DSB_Switches() ; r.run(n) ; o["DSB_Switches"] = n n = LCP() ; r.run(n) ; o["LCP"] = n - n = MSswitches() ; r.run(n) ; o["MSswitches"] = n - n = FrontendBandwidth() ; r.run(n) ; o["FrontendBandwidth"] = n + n = MS_Switches() ; r.run(n) ; o["MS_Switches"] = n + n = Frontend_Bandwidth() ; r.run(n) ; o["Frontend_Bandwidth"] = n n = MITE() ; r.run(n) ; o["MITE"] = n n = DSB() ; r.run(n) ; o["DSB"] = n - n = BadSpeculation() ; r.run(n) ; o["BadSpeculation"] = n - n = BranchMispredicts() ; r.run(n) ; o["BranchMispredicts"] = n - n = MachineClears() ; r.run(n) ; o["MachineClears"] = n + n = LSD() ; r.run(n) ; o["LSD"] = n + n = Bad_Speculation() ; r.run(n) ; o["Bad_Speculation"] = n + n = Branch_Mispredicts() ; r.run(n) ; o["Branch_Mispredicts"] = n + n = Machine_Clears() ; r.run(n) ; o["Machine_Clears"] = n n = Backend_Bound() ; r.run(n) ; o["Backend_Bound"] = n - n = MemoryBound() ; r.run(n) ; o["MemoryBound"] = n - n = L1Bound() ; r.run(n) ; o["L1Bound"] = n - n = DTLB_Overhead() ; r.run(n) ; o["DTLB_Overhead"] = n - n = LoadsBlockedbyStoreForwarding() ; r.run(n) ; o["LoadsBlockedbyStoreForwarding"] = n - n = SplitLoads() ; r.run(n) ; o["SplitLoads"] = n - n = G4KAliasing() ; r.run(n) ; o["G4KAliasing"] = n - n = L2Bound() ; r.run(n) ; o["L2Bound"] = n - n = L3Bound() ; r.run(n) ; o["L3Bound"] = n - n = ContestedAccesses() ; r.run(n) ; o["ContestedAccesses"] = n - n = DataSharing() ; r.run(n) ; o["DataSharing"] = n - n = L3Latency() ; r.run(n) ; o["L3Latency"] = n - n = DRAMBound() ; r.run(n) ; o["DRAMBound"] = n - n = MEMBandwidth() ; r.run(n) ; o["MEMBandwidth"] = n - n = MEMLatency() ; r.run(n) ; o["MEMLatency"] = n - n = StoresBound() ; r.run(n) ; o["StoresBound"] = n - n = FalseSharing() ; r.run(n) ; o["FalseSharing"] = n - n = SplitStores() ; r.run(n) ; o["SplitStores"] = n - n = 
DTLBStoreOverhead() ; r.run(n) ; o["DTLBStoreOverhead"] = n - n = CoreBound() ; r.run(n) ; o["CoreBound"] = n + n = Memory_Bound() ; r.run(n) ; o["Memory_Bound"] = n + n = L1_Bound() ; r.run(n) ; o["L1_Bound"] = n + n = DTLB_Load() ; r.run(n) ; o["DTLB_Load"] = n + n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n + n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n + n = L2_Bound() ; r.run(n) ; o["L2_Bound"] = n + n = L3_Bound() ; r.run(n) ; o["L3_Bound"] = n + n = Contested_Accesses() ; r.run(n) ; o["Contested_Accesses"] = n + n = Data_Sharing() ; r.run(n) ; o["Data_Sharing"] = n + n = L3_Latency() ; r.run(n) ; o["L3_Latency"] = n + n = SQ_Full() ; r.run(n) ; o["SQ_Full"] = n + n = MEM_Bound() ; r.run(n) ; o["MEM_Bound"] = n + n = MEM_Bandwidth() ; r.run(n) ; o["MEM_Bandwidth"] = n + n = MEM_Latency() ; r.run(n) ; o["MEM_Latency"] = n + n = Stores_Bound() ; r.run(n) ; o["Stores_Bound"] = n + n = False_Sharing() ; r.run(n) ; o["False_Sharing"] = n + n = Split_Stores() ; r.run(n) ; o["Split_Stores"] = n + n = DTLB_Store() ; r.run(n) ; o["DTLB_Store"] = n + n = Core_Bound() ; r.run(n) ; o["Core_Bound"] = n + n = Divider() ; r.run(n) ; o["Divider"] = n + n = Ports_Utilization() ; r.run(n) ; o["Ports_Utilization"] = n + n = G0_Ports_Utilized() ; r.run(n) ; o["G0_Ports_Utilized"] = n + n = G1_Port_Utilized() ; r.run(n) ; o["G1_Port_Utilized"] = n + n = G2_Ports_Utilized() ; r.run(n) ; o["G2_Ports_Utilized"] = n + n = G3m_Ports_Utilized() ; r.run(n) ; o["G3m_Ports_Utilized"] = n n = Retiring() ; r.run(n) ; o["Retiring"] = n - n = BASE() ; r.run(n) ; o["BASE"] = n - n = MicroSequencer() ; r.run(n) ; o["MicroSequencer"] = n + n = Base() ; r.run(n) ; o["Base"] = n + n = Microcode_Sequencer() ; r.run(n) ; o["Microcode_Sequencer"] = n # parents - o["FrontendLatency"].parent = o["FrontendBound"] - o["ICacheMisses"].parent = o["FrontendLatency"] - o["ITLBmisses"].parent = o["FrontendLatency"] - o["BranchResteers"].parent = o["FrontendLatency"] - o["DSBswitches"].parent = o["FrontendLatency"] - o["LCP"].parent = o["FrontendLatency"] - o["MSswitches"].parent = o["FrontendLatency"] - o["FrontendBandwidth"].parent = o["FrontendBound"] - o["MITE"].parent = o["FrontendBandwidth"] - o["DSB"].parent = o["FrontendBandwidth"] - o["BranchMispredicts"].parent = o["BadSpeculation"] - o["MachineClears"].parent = o["BadSpeculation"] - o["MemoryBound"].parent = o["Backend_Bound"] - o["L1Bound"].parent = o["MemoryBound"] - o["DTLB_Overhead"].parent = o["L1Bound"] - o["LoadsBlockedbyStoreForwarding"].parent = o["L1Bound"] - o["SplitLoads"].parent = o["L1Bound"] - o["G4KAliasing"].parent = o["L1Bound"] - o["L2Bound"].parent = o["MemoryBound"] - o["L3Bound"].parent = o["MemoryBound"] - o["ContestedAccesses"].parent = o["L3Bound"] - o["DataSharing"].parent = o["L3Bound"] - o["L3Latency"].parent = o["L3Bound"] - o["DRAMBound"].parent = o["MemoryBound"] - o["MEMBandwidth"].parent = o["DRAMBound"] - o["MEMLatency"].parent = o["DRAMBound"] - o["StoresBound"].parent = o["MemoryBound"] - o["FalseSharing"].parent = o["StoresBound"] - o["SplitStores"].parent = o["StoresBound"] - o["DTLBStoreOverhead"].parent = o["StoresBound"] - o["CoreBound"].parent = o["Backend_Bound"] - o["BASE"].parent = o["Retiring"] - o["MicroSequencer"].parent = o["Retiring"] + o["Frontend_Latency"].parent = o["Frontend_Bound"] + o["ITLB_Misses"].parent = o["Frontend_Latency"] + o["DSB_Switches"].parent = o["Frontend_Latency"] + o["LCP"].parent = o["Frontend_Latency"] + o["MS_Switches"].parent 
= o["Frontend_Latency"] + o["Frontend_Bandwidth"].parent = o["Frontend_Bound"] + o["MITE"].parent = o["Frontend_Bandwidth"] + o["DSB"].parent = o["Frontend_Bandwidth"] + o["LSD"].parent = o["Frontend_Bandwidth"] + o["Branch_Mispredicts"].parent = o["Bad_Speculation"] + o["Machine_Clears"].parent = o["Bad_Speculation"] + o["Memory_Bound"].parent = o["Backend_Bound"] + o["L1_Bound"].parent = o["Memory_Bound"] + o["DTLB_Load"].parent = o["L1_Bound"] + o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["Split_Loads"].parent = o["L1_Bound"] + o["G4K_Aliasing"].parent = o["L1_Bound"] + o["L2_Bound"].parent = o["Memory_Bound"] + o["L3_Bound"].parent = o["Memory_Bound"] + o["Contested_Accesses"].parent = o["L3_Bound"] + o["Data_Sharing"].parent = o["L3_Bound"] + o["L3_Latency"].parent = o["L3_Bound"] + o["SQ_Full"].parent = o["L3_Bound"] + o["MEM_Bound"].parent = o["Memory_Bound"] + o["MEM_Bandwidth"].parent = o["MEM_Bound"] + o["MEM_Latency"].parent = o["MEM_Bound"] + o["Stores_Bound"].parent = o["Memory_Bound"] + o["False_Sharing"].parent = o["Stores_Bound"] + o["Split_Stores"].parent = o["Stores_Bound"] + o["DTLB_Store"].parent = o["Stores_Bound"] + o["Core_Bound"].parent = o["Backend_Bound"] + o["Divider"].parent = o["Core_Bound"] + o["Ports_Utilization"].parent = o["Core_Bound"] + o["G0_Ports_Utilized"].parent = o["Ports_Utilization"] + o["G1_Port_Utilized"].parent = o["Ports_Utilization"] + o["G2_Ports_Utilized"].parent = o["Ports_Utilization"] + o["G3m_Ports_Utilized"].parent = o["Ports_Utilization"] + o["Base"].parent = o["Retiring"] + o["Microcode_Sequencer"].parent = o["Retiring"] # references between groups - o["FrontendBandwidth"].FrontendBound = o["FrontendBound"] - o["FrontendBandwidth"].FrontendLatency = o["FrontendLatency"] - o["BranchMispredicts"].BadSpeculation = o["BadSpeculation"] - o["MachineClears"].BadSpeculation = o["BadSpeculation"] - o["MachineClears"].BranchMispredicts = o["BranchMispredicts"] - o["Backend_Bound"].FrontendBound = o["FrontendBound"] - o["Backend_Bound"].BadSpeculation = o["BadSpeculation"] + o["Frontend_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["Frontend_Bandwidth"].Frontend_Latency = o["Frontend_Latency"] + o["Branch_Mispredicts"].Bad_Speculation = o["Bad_Speculation"] + o["Machine_Clears"].Bad_Speculation = o["Bad_Speculation"] + o["Machine_Clears"].Branch_Mispredicts = o["Branch_Mispredicts"] + o["Backend_Bound"].Frontend_Bound = o["Frontend_Bound"] + o["Backend_Bound"].Bad_Speculation = o["Bad_Speculation"] o["Backend_Bound"].Retiring = o["Retiring"] - o["L1Bound"].DTLB_Overhead = o["DTLB_Overhead"] - o["StoresBound"].MemoryBound = o["MemoryBound"] - o["CoreBound"].MemoryBound = o["MemoryBound"] - o["Retiring"].MicroSequencer = o["MicroSequencer"] - o["BASE"].Retiring = o["Retiring"] - o["BASE"].MicroSequencer = o["MicroSequencer"] + o["L1_Bound"].DTLB_Load = o["DTLB_Load"] + o["Stores_Bound"].Memory_Bound = o["Memory_Bound"] + o["Core_Bound"].Memory_Bound = o["Memory_Bound"] + o["Ports_Utilization"].Core_Bound = o["Core_Bound"] + o["Ports_Utilization"].Divider = o["Divider"] + o["Retiring"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Base"].Retiring = o["Retiring"] + o["Base"].Microcode_Sequencer = o["Microcode_Sequencer"] # siblings cross-tree - o["FrontendBound"].sibling = None - o["FrontendLatency"].sibling = None - o["ICacheMisses"].sibling = None - o["ITLBmisses"].sibling = None - o["BranchResteers"].sibling = o["BadSpeculation"] - o["DSBswitches"].sibling = None + o["Frontend_Bound"].sibling = None + o["Frontend_Latency"].sibling 
= None + o["ITLB_Misses"].sibling = None + o["DSB_Switches"].sibling = None o["LCP"].sibling = None - o["MSswitches"].sibling = o["MicroSequencer"] - o["FrontendBandwidth"].sibling = None + o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["Frontend_Bandwidth"].sibling = None o["MITE"].sibling = None o["DSB"].sibling = None - o["BadSpeculation"].sibling = o["BranchResteers"] - o["BranchMispredicts"].sibling = None - o["MachineClears"].sibling = None + o["LSD"].sibling = None + o["Bad_Speculation"].sibling = None + o["Branch_Mispredicts"].sibling = None + o["Machine_Clears"].sibling = None o["Backend_Bound"].sibling = None - o["MemoryBound"].sibling = None - o["L1Bound"].sibling = None - o["DTLB_Overhead"].sibling = None - o["LoadsBlockedbyStoreForwarding"].sibling = None - o["SplitLoads"].sibling = None - o["G4KAliasing"].sibling = None - o["L2Bound"].sibling = None - o["L3Bound"].sibling = None - o["ContestedAccesses"].sibling = None - o["DataSharing"].sibling = None - o["L3Latency"].sibling = None - o["DRAMBound"].sibling = None - o["MEMBandwidth"].sibling = None - o["MEMLatency"].sibling = None - o["StoresBound"].sibling = None - o["FalseSharing"].sibling = None - o["SplitStores"].sibling = None - o["DTLBStoreOverhead"].sibling = None - o["CoreBound"].sibling = None + o["Memory_Bound"].sibling = None + o["L1_Bound"].sibling = o["G1_Port_Utilized"] + o["DTLB_Load"].sibling = None + o["Store_Fwd_Blk"].sibling = None + o["Split_Loads"].sibling = None + o["G4K_Aliasing"].sibling = None + o["L2_Bound"].sibling = None + o["L3_Bound"].sibling = None + o["Contested_Accesses"].sibling = None + o["Data_Sharing"].sibling = None + o["L3_Latency"].sibling = None + o["SQ_Full"].sibling = None + o["MEM_Bound"].sibling = None + o["MEM_Bandwidth"].sibling = None + o["MEM_Latency"].sibling = None + o["Stores_Bound"].sibling = None + o["False_Sharing"].sibling = None + o["Split_Stores"].sibling = None + o["DTLB_Store"].sibling = None + o["Core_Bound"].sibling = None + o["Divider"].sibling = None + o["Ports_Utilization"].sibling = None + o["G0_Ports_Utilized"].sibling = None + o["G1_Port_Utilized"].sibling = o["L1_Bound"] + o["G2_Ports_Utilized"].sibling = None + o["G3m_Ports_Utilized"].sibling = None o["Retiring"].sibling = None - o["BASE"].sibling = None - o["MicroSequencer"].sibling = o["MSswitches"] + o["Base"].sibling = None + o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events (experimential) + # sampling events - o["FrontendBound"].sample = [] - o["FrontendLatency"].sample = [] - o["ICacheMisses"].sample = [] - o["ITLBmisses"].sample = [] - o["BranchResteers"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES_PS'] - o["DSBswitches"].sample = [] + o["Frontend_Bound"].sample = [] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["DSB_Switches"].sample = [] o["LCP"].sample = [] - o["MSswitches"].sample = [] - o["FrontendBandwidth"].sample = [] + o["MS_Switches"].sample = [] + o["Frontend_Bandwidth"].sample = [] o["MITE"].sample = [] o["DSB"].sample = [] - o["BadSpeculation"].sample = [] - o["BranchMispredicts"].sample = [] - o["MachineClears"].sample = [] + o["LSD"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] + o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] o["Backend_Bound"].sample = [] - o["MemoryBound"].sample = [] - o["L1Bound"].sample = [] - o["DTLB_Overhead"].sample = [] - 
o["LoadsBlockedbyStoreForwarding"].sample = [] - o["SplitLoads"].sample = [] - o["G4KAliasing"].sample = [] - o["L2Bound"].sample = [] - o["L3Bound"].sample = [] - o["ContestedAccesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS_PS'] - o["DataSharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT_PS'] - o["L3Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT_PS'] - o["DRAMBound"].sample = [] - o["MEMBandwidth"].sample = [] - o["MEMLatency"].sample = [] - o["StoresBound"].sample = [] - o["FalseSharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS'] - o["SplitStores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES_PS', 'MEM_UOPS_RETIRED.ALL_STORES_PS'] - o["DTLBStoreOverhead"].sample = [] - o["CoreBound"].sample = [] + o["Memory_Bound"].sample = [] + o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] + o["Store_Fwd_Blk"].sample = [] + o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] + o["G4K_Aliasing"].sample = [] + o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] + o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp'] + o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + o["SQ_Full"].sample = [] + o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] + o["MEM_Bandwidth"].sample = [] + o["MEM_Latency"].sample = [] + o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + o["False_Sharing"].sample = [' MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE:request=DEMAND_RFO:response=L3_HIT.SNOOP_HITM'] + o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] + o["Core_Bound"].sample = [] + o["Divider"].sample = ['ARITH.DIVIDER_UOPS'] + o["Ports_Utilization"].sample = [] + o["G0_Ports_Utilized"].sample = [] + o["G1_Port_Utilized"].sample = [] + o["G2_Ports_Utilized"].sample = [] + o["G3m_Ports_Utilized"].sample = [] o["Retiring"].sample = [] - o["BASE"].sample = [] - o["MicroSequencer"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] # user visible metrics n = Metric_IPC() ; r.metric(n) + n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) n = Metric_UPI() ; r.metric(n) - n = Metric_InstPerTakenBranch() ; r.metric(n) - n = Metric_DSBCoverage() ; r.metric(n) + n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) + n = Metric_DSB_Coverage() ; r.metric(n) + n = Metric_ILP() ; r.metric(n) n = Metric_MLP() ; r.metric(n) - n = Metric_L1dMissLatency() ; r.metric(n) - n = Metric_TurboUtilization() ; r.metric(n) + n = Metric_Load_Miss_Real_Latency() ; r.metric(n) + n = Metric_Turbo_Utilization() ; r.metric(n) + n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) + n = Metric_CLKS() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/ivb_client_ratios.py b/ivb_client_ratios.py index ce4ceeb5..5a868462 100644 --- a/ivb_client_ratios.py +++ b/ivb_client_ratios.py @@ -1,14 +1,19 @@ # -# auto generated TopDown description for Ivy Bridge +# auto generated TopDown 2.9 description for Intel 3rd gen 
Core (code named IvyBridge) # Please see http://ark.intel.com for more details on these CPUs. # +# References: +# http://halobates.de/blog/p/262 +# https://sites.google.com/site/analysismethods/yasin-pubs +# smt_enabled = False # Constants Pipeline_Width = 4 +L2_Store_Latency = 9 Mem_L3_Weight = 7 Mem_STLB_Hit_Cost = 7 Mem_SFB_Cost = 13 @@ -16,89 +21,186 @@ Mem_XSNP_HitM_Cost = 60 MEM_XSNP_Hit_Cost = 43 MEM_XSNP_None_Cost = 29 +Mem_Local_DRAM_Cost = 200 +Mem_Remote_DRAM_Cost = 310 +Mem_Remote_HitM_Cost = 200 +Mem_Remote_Fwd_Cost = 180 MS_Switches_Cost = 3 OneMillion = 1000000 +Energy_Unit = 15.6 # Aux. formulas + # Floating Point Operations Count def FLOP_Count(EV, level): - return ( 1 *(EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level))+ 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level))+ 8 * EV("SIMD_FP_256.PACKED_SINGLE", level) ) + return (1 *(EV("FP_COMP_OPS_EXE.X87", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level)) + 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level)) + 8 * EV("SIMD_FP_256.PACKED_SINGLE", level)) + def Recovery_Cycles(EV, level): - return ( EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2)if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + def Execute_Cycles(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c1", level) / 2)if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + def L1D_Miss_Cycles(EV, level): - return ( EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2)if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + def ITLB_Miss_Cycles(EV, level): - return ( Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level) ) + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + def Cycles_0_Ports_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:i1", level))/ 2 if smt_enabled else(STALLS_TOTAL(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ARITH.FPU_DIV_ACTIVE", level) ) + EV("ARITH.FPU_DIV_ACTIVE", level) + EV("UOPS_EXECUTED.CORE:i1:c1", level) + EV("RS_EVENTS.EMPTY_CYCLES", level) + return (EV("UOPS_EXECUTED.CORE:i1:c1", level)) / 2 if smt_enabled else(STALLS_TOTAL(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ARITH.FPU_DIV_ACTIVE", level)) + def Cycles_1_Port_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level))/ 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", 
level) - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) ) + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level)) + def Cycles_2_Ports_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level))/ 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) ) + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c3", level) + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level)) + def Cycles_3m_Ports_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c3", level) / 2)if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CORE:c3", level) + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c3", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + def STALLS_MEM_ANY(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", level)) , level ) + def STALLS_TOTAL(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level)) , level ) + def ORO_Demand_DRD_C1(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + def ORO_Demand_DRD_C6(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def ORO_Demand_RFO_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Cycles_False_Sharing_Client(EV, level): + return Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", level) + EV("OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE", level)) + def Few_Uops_Executed_Threshold(EV, level): - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) - return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level) > 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level)> 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + def Backend_Bound_At_EXE(EV, level): - return ( 
STALLS_TOTAL(EV, level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level))/ CLKS(EV, level) + return (STALLS_TOTAL(EV, level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level)) / CLKS(EV, level) + def Mem_L3_Hit_Fraction(EV, level): - return EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level) ) + return EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level)) + +def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + def Mispred_Clears_Fraction(EV, level): - return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level) ) + return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) + def Avg_RS_Empty_Period_Clears(EV, level): - return ( EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ICACHE.IFETCH_STALL", level))/ EV("RS_EVENTS.EMPTY_END", level) + return (EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ICACHE.IFETCH_STALL", level)) / EV("RS_EVENTS.EMPTY_END", level) + def Retire_Uop_Fraction(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + def SLOTS(EV, level): - return Pipeline_Width * CLKS1(EV, level) -# Instructions Per Cycle (per physical core) + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) def IPC(EV, level): - return EV("INST_RETIRED.ANY", level) / CLKS1(EV, level) + return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) def CPI(EV, level): return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + # Uops Per Instruction def UPI(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + # Instruction per taken branch def IPTB(EV, level): return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. 
+def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + # Fraction of Uops delivered by the DSB (decoded instructions cache) def DSB_Coverage(EV, level): - return ( EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level))/(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level) ) + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + # Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed) def ILP(EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(EV, level) + # Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) def MLP(EV, level): return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + # Actual Average Latency for L1 data-cache miss demand loads def Load_Miss_Real_Latency(EV, level): - return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level) ) + return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level)) + +# Giga Floating Point Operations Per Second +def GFLOPs(EV, level): + return FLOP_Count(EV, level) / OneMillion / DurationTimeInSeconds(EV, level) / 1000 + # Average Frequency Utilization relative nominal frequency def Turbo_Utilization(EV, level): return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) + # Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store def Page_Walks_Use(EV, level): - return ( EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level))/ CLKS1(EV, level) + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + # Per-thread actual clocks def CLKS(EV, level): return EV("CPU_CLK_UNHALTED.THREAD", level) + # Core actual clocks -def CLKS1(EV, level): - return ( EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2)if smt_enabled else CLKS(EV, level) +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, level): + return DurationTimeInSeconds(EV, level) # Event groups @@ -117,9 +219,10 @@ class Frontend_Bound: latter can accept them. For example, stalls due to instruction-cache misses would be categorized under Frontend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1)/ SLOTS(EV, 1 ) + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Frontend_Bound zero division" @@ -137,9 +240,10 @@ class Frontend_Latency: after a branch misprediction are categorized under Frontend Latency. 
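The helpers above that end in an smt_enabled conditional (CORE_CLKS, Recovery_Cycles, Execute_Cycles, SQ_Full_Cycles and the cycles-N-ports helpers) all follow one pattern: with both hardware threads active, an any-thread (:amt1) or core-scope count is halved to approximate one physical core, otherwise the plain per-thread count is used. A minimal standalone sketch of that switch, with a toy event table and invented values rather than real perf output:

# Toy illustration of the smt_enabled switch used by CORE_CLKS and friends.
# 'events' stands in for real counter values; the numbers are invented.
events = {
    "CPU_CLK_UNHALTED.THREAD": 1000.0,       # clocks seen by this logical thread
    "CPU_CLK_UNHALTED.THREAD:amt1": 2200.0,  # any-thread flavour of the same clock
}

def core_clks(ev, smt_enabled):
    # Mirrors CORE_CLKS above: halve the any-thread count when SMT is on,
    # otherwise the per-thread clock already stands in for core clocks.
    if smt_enabled:
        return ev["CPU_CLK_UNHALTED.THREAD:amt1"] / 2
    return ev["CPU_CLK_UNHALTED.THREAD"]

print("SMT off: %.1f core clocks" % core_clks(events, smt_enabled=False))  # 1000.0
print("SMT on:  %.1f core clocks" % core_clks(events, smt_enabled=True))   # 1100.0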
In such cases the Frontend eventually delivers no uops for some period.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2)/ SLOTS(EV, 2 ) + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: #print "Frontend_Latency zero division" @@ -156,9 +260,10 @@ class ICache_Misses: cache misses. Using compiler's Profile-Guided Optimization (PGO) can reduce i-cache misses through improved hot code layout.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ICACHE.IFETCH_STALL", 3)/ CLKS(EV, 3)- self.ITLB_Misses.compute(EV ) + self.val = EV("ICACHE.IFETCH_STALL", 3) / CLKS(EV, 3) - self.ITLB_Misses.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "ICache_Misses zero division" @@ -174,9 +279,10 @@ class ITLB_Misses: This metric represents cycles fraction CPU was stalled due to instruction TLB misses. Using large code pages may be considered here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ITLB_Miss_Cycles(EV, 3)/ CLKS(EV, 3 ) + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "ITLB_Misses zero division" @@ -195,9 +301,10 @@ class Branch_Resteers: For example, branchy code with lots of (taken) branches and/or branch miss- predictions might get categorized under Branch Resteers.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = Avg_RS_Empty_Period_Clears(EV, 3)*(EV("BR_MISP_RETIRED.ALL_BRANCHES", 3)+ EV("MACHINE_CLEARS.COUNT", 3)+ EV("BACLEARS.ANY", 3)) / CLKS(EV, 3 ) + self.val = Avg_RS_Empty_Period_Clears(EV, 3)*(EV("BR_MISP_RETIRED.ALL_BRANCHES", 3) + EV("MACHINE_CLEARS.COUNT", 3) + EV("BACLEARS.ANY", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "Branch_Resteers zero division" @@ -213,9 +320,10 @@ class DSB_Switches: This metric represents cycles fraction CPU was stalled due to switches from DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3)/ CLKS(EV, 3 ) + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "DSB_Switches zero division" @@ -232,9 +340,10 @@ class LCP: Prefixes (LCPs). 
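Every node class in these generated files follows the same compute() shape that ICache_Misses, ITLB_Misses and DSB_Switches show above: turn a handful of events into a fraction of clocks or slots, guard the division, and raise thresh only when the node's own value passes its cutoff and its parent already triggered. A compressed standalone sketch of that pattern (toy parent and invented counts, not code from the patch):

# Minimal stand-in for the node pattern used throughout these files:
# value = events / clocks, threshold gated on the parent's threshold.
class ToyParent:
    thresh = True              # pretend the parent already crossed its cutoff

class ToyDSBSwitches:
    level = 3
    parent = ToyParent()
    def compute(self, ev):
        try:
            # same shape as DSB_Switches above: penalty cycles over thread clocks
            self.val = ev["DSB2MITE_SWITCHES.PENALTY_CYCLES"] / ev["CPU_CLK_UNHALTED.THREAD"]
            self.thresh = (self.val > 0.05) and self.parent.thresh
        except ZeroDivisionError:
            self.val = 0
            self.thresh = False
        return self.val

node = ToyDSBSwitches()
node.compute({"DSB2MITE_SWITCHES.PENALTY_CYCLES": 80.0, "CPU_CLK_UNHALTED.THREAD": 1000.0})
print("DSB_Switches %.2f thresh=%s" % (node.val, node.thresh))  # 0.08, flagged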
Using proper compiler flags or Intel Compiler by default will certainly avoid this.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ILD_STALL.LCP", 3)/ CLKS(EV, 3 ) + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "LCP zero division" @@ -253,9 +362,10 @@ class MS_Switches: deliver long uop flows required by CISC instructions like CPUID, or uncommon conditions like Floating Point Assists when dealing with Denormals.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3)/ CLKS(EV, 3 ) + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "MS_Switches zero division" @@ -274,9 +384,10 @@ class Frontend_Bandwidth: under Frontend Bandwidth. In such cases, the Frontend typically delivers non- optimal amount of uops to the Backend.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Frontend_Bound.compute(EV)- self.Frontend_Latency.compute(EV ) + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: #print "Frontend_Bandwidth zero division" @@ -286,16 +397,17 @@ def compute(self, EV): class MITE: name = "MITE" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -the MITE fetch pipeline. For example, inefficiencies in the instruction -decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MITE zero division" @@ -305,17 +417,18 @@ def compute(self, EV): class DSB: name = "DSB" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -DSB (decoded uop cache) fetch pipeline. For example, inefficient utilization -of the DSB cache structure or bank conflict when reading from it, are -categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: #print "DSB zero division" @@ -325,18 +438,19 @@ def compute(self, EV): class LSD: name = "LSD" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -LSD (Loop Stream Detector) unit. 
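MITE, DSB and LSD all use the same "active but not full" measure shown above: cycles in which the unit delivered any uops, minus cycles in which it delivered the full 4 uops, normalized to core clocks. A short standalone illustration with invented counts:

# Invented counts for one interval; mirrors the MITE/DSB/LSD formulas above.
core_clks = 1000.0
dsb_any_uops_cycles = 600.0   # IDQ.ALL_DSB_CYCLES_ANY_UOPS
dsb_4_uops_cycles   = 450.0   # IDQ.ALL_DSB_CYCLES_4_UOPS

# Cycles where the DSB delivered something, but fewer than the 4-wide maximum:
dsb_bandwidth_loss = (dsb_any_uops_cycles - dsb_4_uops_cycles) / core_clks
print("DSB below full width on %.2f of core clocks" % dsb_bandwidth_loss)
# 0.15 -- the DSB node above only flags this once it exceeds 0.3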
LSD typically does well sustaining Uop -supply. However, in some rare cases, optimal uop-delivery could not be reached -for small loops whose size (in terms of number of uops) does not suit well the -LSD structure.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("LSD.CYCLES_ACTIVE", 3)- EV("LSD.CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "LSD zero division" @@ -355,9 +469,10 @@ class Bad_Speculation: speculation. For example, wasted work due to miss-predicted branches are categorized under Bad Speculation category""" level = 1 + htoff = False def compute(self, EV): try: - self.val = ( EV("UOPS_ISSUED.ANY", 1)- EV("UOPS_RETIRED.RETIRE_SLOTS", 1)+ Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: #print "Bad_Speculation zero division" @@ -375,6 +490,7 @@ class Branch_Mispredicts: incorrectly speculated program path, or stalls the Backend of the machine needs to recover its state from a speculative path.""" level = 2 + htoff = False def compute(self, EV): try: self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) @@ -396,9 +512,10 @@ class Machine_Clears: example, this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Bad_Speculation.compute(EV)- self.Branch_Mispredicts.compute(EV ) + self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "Machine_Clears zero division" @@ -419,9 +536,10 @@ class Backend_Bound: stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = 1 -(self.Frontend_Bound.compute(EV)+ self.Bad_Speculation.compute(EV)+ self.Retiring.compute(EV)) + self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Backend_Bound zero division" @@ -440,9 +558,10 @@ class Memory_Bound: memory demand loads which coincides with execution starvation. 
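Backend_Bound above is not measured directly; it is the remainder of the level-1 identity, so Frontend_Bound, Bad_Speculation, Retiring and Backend_Bound always sum to 1 over the issue slots. A standalone worked example with invented counter values (single-thread case, so core clocks equal thread clocks):

# Invented counts illustrating the level-1 Top-Down identity used above.
PIPELINE_WIDTH = 4

clks = 1000.0                     # CPU_CLK_UNHALTED.THREAD
slots = PIPELINE_WIDTH * clks     # SLOTS: 4 issue slots per cycle
fe_undelivered = 600.0            # IDQ_UOPS_NOT_DELIVERED.CORE
uops_issued = 3000.0              # UOPS_ISSUED.ANY
uops_retired = 2600.0             # UOPS_RETIRED.RETIRE_SLOTS
recovery_cycles = 50.0            # INT_MISC.RECOVERY_CYCLES

frontend_bound = fe_undelivered / slots
bad_speculation = (uops_issued - uops_retired +
                   PIPELINE_WIDTH * recovery_cycles) / slots
retiring = uops_retired / slots
backend_bound = 1 - (frontend_bound + bad_speculation + retiring)

print("FE %.2f  BadSpec %.2f  RET %.2f  BE %.2f" %
      (frontend_bound, bad_speculation, retiring, backend_bound))
# FE 0.15  BadSpec 0.15  RET 0.65  BE 0.05 -- the four sum to 1 by construction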
in addition to less common cases where stores could imply backpressure on the pipeline.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = ( STALLS_MEM_ANY(EV, 2)+ EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) + self.val = (STALLS_MEM_ANY(EV, 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Memory_Bound zero division" @@ -462,9 +581,10 @@ class L1_Bound: allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event as it accounts for any non-completed load.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( STALLS_MEM_ANY(EV, 3)- EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) + self.val = (STALLS_MEM_ANY(EV, 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh except ZeroDivisionError: #print "L1_Bound zero division" @@ -478,9 +598,10 @@ class DTLB_Load: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4)+ EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "DTLB_Load zero division" @@ -494,9 +615,10 @@ class Store_Fwd_Blk: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4)/ CLKS(EV, 4 ) + self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Store_Fwd_Blk zero division" @@ -504,15 +626,36 @@ def compute(self, EV): self.thresh = False return self.val +class Lock_Latency: + name = "Lock_Latency" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents cycles fraction the CPU spent handling cache misses due +to lock operations. Due to the microarchitecture handling of locks, they are +classified as L1_Bound regardless of what memory source satsified them.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Mem_Lock_St_Fraction(EV, 4)* ORO_Demand_RFO_C1(EV, 4) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Lock_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + class Split_Loads: name = "Split_Loads" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Load_Miss_Real_Latency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4)/ CLKS(EV, 4 ) + self.val = 13 * EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Split_Loads zero division" @@ -526,9 +669,10 @@ class G4K_Aliasing: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4)/ CLKS(EV, 4 ) + self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "G4K_Aliasing zero division" @@ -545,9 +689,10 @@ class L2_Bound: misses (i.e. 
L1 misses/L2 hits) will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)- EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) + self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.03) and self.parent.thresh except ZeroDivisionError: #print "L2_Bound zero division" @@ -564,9 +709,10 @@ class L3_Bound: a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "L3_Bound zero division" @@ -580,9 +726,10 @@ class Contested_Accesses: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4)+ EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) + self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Contested_Accesses zero division" @@ -596,9 +743,10 @@ class Data_Sharing: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Data_Sharing zero division" @@ -616,9 +764,10 @@ class L3_Latency: with a sibling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve the latency and increase performance.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "L3_Latency zero division" @@ -626,6 +775,26 @@ def compute(self, EV): self.thresh = False return self.val +class SQ_Full: + name = "SQ_Full" + domain = "CoreClocks" + area = "BE/Mem" + desc = """ +This metric measures fraction of cycles where the Super Queue (SQ) was full +taking into account all request-types and both hardware SMT threads. The Super +Queue is used for requests to access the L2 cache or to go out to the Uncore.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "SQ_Full zero division" + self.val = 0 + self.thresh = False + return self.val + class MEM_Bound: name = "MEM_Bound" domain = "Clocks" @@ -634,9 +803,10 @@ class MEM_Bound: This metric represents how often CPU was stalled on main memory (DRAM). 
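L3_Bound above and the MEM_Bound node whose description starts here split the same CYCLE_ACTIVITY.STALLS_L2_PENDING cycles using Mem_L3_Hit_Fraction, which weights each L3 miss Mem_L3_Weight (7) times heavier than an L3 hit, presumably to reflect the much larger cost of going out to DRAM. A standalone sketch of the split with invented counts:

# Invented counts; mirrors Mem_L3_Hit_Fraction, L3_Bound and MEM_Bound above.
MEM_L3_WEIGHT = 7

clks = 1000.0
stalls_l2_pending = 300.0   # CYCLE_ACTIVITY.STALLS_L2_PENDING
l3_hits = 90.0              # MEM_LOAD_UOPS_RETIRED.LLC_HIT
l3_misses = 10.0            # MEM_LOAD_UOPS_RETIRED.LLC_MISS

l3_hit_fraction = l3_hits / (l3_hits + MEM_L3_WEIGHT * l3_misses)
l3_bound = l3_hit_fraction * stalls_l2_pending / clks
mem_bound = (1 - l3_hit_fraction) * stalls_l2_pending / clks

print("L3 hit fraction %.4f" % l3_hit_fraction)                 # 0.5625
print("L3_Bound %.5f  MEM_Bound %.5f" % (l3_bound, mem_bound))
# the two pieces add back up to stalls_l2_pending / clks == 0.3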
Caching will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( 1 - Mem_L3_Hit_Fraction(EV, 3)) * EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Bound zero division" @@ -653,9 +823,10 @@ class MEM_Bandwidth: bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ORO_Demand_DRD_C6(EV, 4)/ CLKS(EV, 4 ) + self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Bandwidth zero division" @@ -672,9 +843,10 @@ class MEM_Latency: main memory (DRAM). Data layout re-structuring or using Software Prefetches (also through the compiler) may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( ORO_Demand_DRD_C1(EV, 4)- ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) + self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Latency zero division" @@ -687,12 +859,15 @@ class Stores_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents how often CPU was stalled due to store operations. +even though memory store accesses do not typically stall out-of-order CPUs; +there are few cases where stores can lead to actual stalls. This metric will +be flagged should any of these cases be a bottleneck.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.Memory_Bound.compute(EV)- STALLS_MEM_ANY(EV, 3)/ CLKS(EV, 3 ) + self.val = self.Memory_Bound.compute(EV) - STALLS_MEM_ANY(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Stores_Bound zero division" @@ -700,17 +875,39 @@ def compute(self, EV): self.thresh = False return self.val +class Store_Latency: + name = "Store_Latency" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents cycles fraction the CPU spent handling long-latency +store misses (missing 2nd level cache).""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (Store_L2_Hit_Cycles(EV, 4) +(1 - Mem_Lock_St_Fraction(EV, 4))* ORO_Demand_RFO_C1(EV, 4)) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Store_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + class False_Sharing: name = "False_Sharing" domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents how often CPU was stalled due to False Sharing. False +Sharing is a multithreading hiccup, where multiple threads contend on +different data-elements mapped into the same cache line. 
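MEM_Bandwidth and MEM_Latency above partition the cycles that have demand data reads outstanding by occupancy: the :c6 flavour counts cycles with at least six reads in flight (used here as the bandwidth-pressure signal), and cycles with at least one but fewer than six in flight are charged to latency. A standalone sketch with invented counts (both helpers are already capped at CLKS by the min() wrappers above):

# Invented counts; mirrors the MEM_Bandwidth / MEM_Latency split above.
clks = 1000.0
oro_drd_ge1 = 400.0   # cycles with >= 1 outstanding demand data read
oro_drd_ge6 = 150.0   # cycles with >= 6 outstanding demand data reads (:c6)

mem_bandwidth = oro_drd_ge6 / clks                  # heavily loaded memory cycles
mem_latency = (oro_drd_ge1 - oro_drd_ge6) / clks    # waiting, but not saturated

print("MEM_Bandwidth %.2f  MEM_Latency %.2f" % (mem_bandwidth, mem_latency))
# 0.15 and 0.25 of thread clocks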
It can be easily +avoided by padding to make threads access different lines.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4)+ EV("OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_Other_CORE", 4)) / CLKS(EV, 4 ) + self.val = Cycles_False_Sharing_Client(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "False_Sharing zero division" @@ -720,16 +917,17 @@ def compute(self, EV): class Split_Stores: name = "Split_Stores" - domain = "Stores" + domain = "CoreClocks" area = "BE/Mem" desc = """ This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4)/ EV("MEM_UOPS_RETIRED.ALL_STORES", 4 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Split_Stores zero division" self.val = 0 @@ -748,9 +946,10 @@ class DTLB_Store: the same page. Try using larger page sizes for large amounts of frequently- used data.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4)+ EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "DTLB_Store zero division" @@ -771,9 +970,10 @@ class Core_Bound: performance (e.g. FP-chained long-latency arithmetic operations). 
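DTLB_Load earlier and DTLB_Store above share one cost model: every second-level-TLB hit is charged a flat Mem_STLB_Hit_Cost (7 cycles in this file), and full page walks contribute their measured walk duration directly, all relative to thread clocks. A standalone sketch with invented counts:

# Invented counts; mirrors the DTLB_Load / DTLB_Store cost model above.
MEM_STLB_HIT_COST = 7

clks = 1000.0
stlb_hits = 20.0        # DTLB_*_MISSES.STLB_HIT: missed the first-level TLB, hit the STLB
walk_duration = 60.0    # DTLB_*_MISSES.WALK_DURATION: cycles spent in page walks

dtlb_cost = (MEM_STLB_HIT_COST * stlb_hits + walk_duration) / clks
print("estimated DTLB overhead: %.2f of clocks" % dtlb_cost)   # 0.20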
Tip: consider Port Saturation analysis as next step.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Backend_Bound_At_EXE(EV, 2)- self.Memory_Bound.compute(EV ) + self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Core_Bound zero division" @@ -783,13 +983,14 @@ def compute(self, EV): class Divider: name = "Divider" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = "" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ARITH.FPU_DIV_ACTIVE", 3)/ CLKS1(EV, 3 ) + self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Divider zero division" @@ -810,9 +1011,10 @@ class Ports_Utilization: options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.Core_Bound.compute(EV)- self.Divider.compute(EV ) + self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Ports_Utilization zero division" @@ -822,15 +1024,16 @@ def compute(self, EV): class G0_Ports_Utilized: name = "0_Ports_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed no uops on any execution -port.""" +This metric represents Core cycles fraction CPU executed no uops on any +execution port.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_0_Ports_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G0_Ports_Utilized zero division" @@ -840,20 +1043,22 @@ def compute(self, EV): class G1_Port_Utilized: name = "1_Port_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 1 uop per cycle -on all execution ports. This can be due to heavy data-dependency among -instructions. In some cases with high 1_Port_Utilized and L1_Bound it can -point to L1 data-cache latency bottleneck that may not necessarily manifest -with complete execution starvation (due to the short L1 latency e.g. walking -linked list) - looking at the assembly can be helpful. Tip: consider 'Core -Port Saturation' analysis-type as next step.""" +This metric represents Core cycles fraction where the CPU executed total of 1 +uop per cycle on all execution ports. This can be due to heavy data-dependency +among software instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and L1_Bound, this +metric can point to L1 data-cache latency bottleneck that may not necessarily +manifest with complete execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be helpful. 
Tip: consider +'Core Ports Saturation' analysis-type as next step.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_1_Port_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G1_Port_Utilized zero division" @@ -863,18 +1068,19 @@ def compute(self, EV): class G2_Ports_Utilized: name = "2_Ports_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 2 uops per cycle -on all execution ports. Tip: consider 'Core Port Saturation' analysis-type as -next step. Loop Vectorization -most compilers feature auto-Vectorization -options today- reduces pressure on the execution ports as multiple elements -are calculated with same uop.""" +This metric represents Core cycles fraction CPU executed total of 2 uops per +cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- +type as next step. Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the execution ports as +multiple elements are calculated with same uop.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_2_Ports_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G2_Ports_Utilized zero division" @@ -884,16 +1090,17 @@ def compute(self, EV): class G3m_Ports_Utilized: name = "3m_Ports_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 3 or more uops -per cycle on all execution ports. Tip: consider 'Core Port Saturation' +This metric represents Core cycles fraction CPU executed total of 3 or more +uops per cycle on all execution ports. 
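The 1/2/3m ports-utilized nodes above, together with the Cycles_*_Ports_Utilized helpers defined earlier, partition execution cycles from cumulative cmask counts: CYCLES_GE_N counts cycles with at least N uops executed, so differences of adjacent thresholds give cycles with exactly N. A standalone sketch of the non-SMT branch with invented counts:

# Invented cumulative counts; mirrors the non-SMT formulas used above.
core_clks = 1000.0
ge1 = 700.0   # UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC
ge2 = 500.0   # UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC
ge3 = 300.0   # UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC

exactly_1 = (ge1 - ge2) / core_clks   # Cycles_1_Port_Utilized / CORE_CLKS
exactly_2 = (ge2 - ge3) / core_clks   # Cycles_2_Ports_Utilized / CORE_CLKS
three_plus = ge3 / core_clks          # Cycles_3m_Ports_Utilized / CORE_CLKS

print("1 port %.1f  2 ports %.1f  3+ ports %.1f" % (exactly_1, exactly_2, three_plus))
# 0.2, 0.2 and 0.3 of core clocks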
Tip: consider 'Core Port Saturation' analysis-type as next step""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_3m_Ports_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G3m_Ports_Utilized zero division" @@ -903,16 +1110,17 @@ def compute(self, EV): class Port_0: name = "Port_0" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution -port 0 (ALU)""" +port 0 (SNB+: ALU; HSW+:ALU and 2nd branch)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_0 zero division" self.val = 0 @@ -921,16 +1129,17 @@ def compute(self, EV): class Port_1: name = "Port_1" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 1 (ALU)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_1 zero division" self.val = 0 @@ -939,16 +1148,17 @@ def compute(self, EV): class Port_2: name = "Port_2" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 2 (Loads and Store-address)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_2 zero division" self.val = 0 @@ -957,16 +1167,17 @@ def compute(self, EV): class Port_3: name = "Port_3" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 3 (Loads and Store-address)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_3 zero division" self.val = 0 @@ -975,16 +1186,17 @@ def compute(self, EV): class Port_4: name = "Port_4" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 4 (Store-data)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_4 zero division" self.val = 0 @@ -993,16 +1205,17 @@ def compute(self, EV): class Port_5: name = "Port_5" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This 
metric represents Core cycles fraction CPU dispatched uops on execution -port 5 (Branches and ALU)""" +port 5 (SNB+: Branches and ALU; HSW+: ALU)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_5 zero division" self.val = 0 @@ -1026,9 +1239,10 @@ class Retiring: computations be done without significantly increasing number of instructions thus improving the performance.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1)/ SLOTS(EV, 1 ) + self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: #print "Retiring zero division" @@ -1041,16 +1255,18 @@ class Base: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops not originated -from the microcode-sequencer. This correlates with total number of +This metric represents slots fraction where the CPU was retiring uops not +originated from the microcode-sequencer. This correlates with total number of instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. A high Retiring value for non-vectorized code is typically a good -hint for programmer to pursue vectorizing his code, which can reduce -instructions hence this bucket.""" +expected. While this is the most desirable of the top 4 categories, high +values may still indicate areas for improvement. If possible focus on +techniques that reduce instruction count or result in more efficient +instructions generation such as vectorization.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Retiring.compute(EV)- self.Microcode_Sequencer.compute(EV ) + self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: #print "Base zero division" @@ -1058,18 +1274,38 @@ def compute(self, EV): self.thresh = False return self.val +class FP_Arith: + name = "FP_Arith" + domain = "Uops" + area = "RET" + desc = """ +This metric represents overall arithmetic floating-point (FP) uops fraction +the CPU has executed.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = self.FP_x87.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "FP_Arith zero division" + self.val = 0 + self.thresh = False + return self.val + class FP_x87: name = "FP_x87" domain = "Uops" area = "RET" desc = """ -This metric represents floating-point (FP) x87 uops fraction the CPU has -executed. Tip: consider compiler flags to generate newer AVX (or SSE) +This metric is an approxmiation of floating-point (FP) x87 (arithmetic) uops +fraction. 
Tip: consider compiler flags to generate newer AVX (or SSE) instruction sets, which typically perform better and feature vectors.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("FP_COMP_OPS_EXE.X87", 4)/ EV("UOPS_EXECUTED.THREAD", 4 ) + self.val = EV("FP_COMP_OPS_EXE.X87", 4) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "FP_x87 zero division" @@ -1077,6 +1313,65 @@ def compute(self, EV): self.thresh = False return self.val +class FP_Scalar: + name = "FP_Scalar" + domain = "Uops" + area = "RET" + desc = """ +This metric represents arithmetic floating-point (FP) scalar uops fraction the +CPU has executed. Tip: investigate what limits (compiler) generation of vector +code.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", 4) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "FP_Scalar zero division" + self.val = 0 + self.thresh = False + return self.val + +class FP_Vector: + name = "FP_Vector" + domain = "Uops" + area = "RET" + desc = """ +This metric represents arithmetic floating-point (FP) vector uops fraction the +CPU has executed. Tip: check if vector width is expected""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", 4) + EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "FP_Vector zero division" + self.val = 0 + self.thresh = False + return self.val + +class Other: + name = "Other" + domain = "Uops" + area = "RET" + desc = """ +This metric represents non-floating-point (FP) uop fraction the CPU has +executed. 
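FP_Arith above is simply the sum of its three children, each expressed as a share of UOPS_EXECUTED.THREAD, and the Other node described here takes whatever remains (1 - FP_Arith). A standalone sketch with invented uop counts:

# Invented uop counts; mirrors the FP_x87 / FP_Scalar / FP_Vector / Other split above.
uops_executed = 10000.0   # UOPS_EXECUTED.THREAD
x87_uops = 100.0          # FP_COMP_OPS_EXE.X87
scalar_uops = 1400.0      # FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + SSE_SCALAR_DOUBLE
vector_uops = 2500.0      # FP_COMP_OPS_EXE.SSE_PACKED_* + SIMD_FP_256.PACKED_*

fp_x87 = x87_uops / uops_executed
fp_scalar = scalar_uops / uops_executed
fp_vector = vector_uops / uops_executed
fp_arith = fp_x87 + fp_scalar + fp_vector   # as FP_Arith.compute sums its children
other = 1 - fp_arith                        # as Other.compute does below

print("FP_Arith %.2f  Other %.2f" % (fp_arith, other))   # 0.40 and 0.60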
If you application has no FP operations, this will likely be biggest +fraction.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = 1 - self.FP_Arith.compute(EV ) + self.thresh = (self.val > 0.3) and self.parent.thresh + except ZeroDivisionError: + #print "Other zero division" + self.val = 0 + self.thresh = False + return self.val + class Microcode_Sequencer: name = "Microcode_Sequencer" domain = "Slots" @@ -1087,9 +1382,10 @@ class Microcode_Sequencer: decoded by the default decoders (like repeat move strings), or by microcode assists used to address some operation modes (like in Floating Point assists).""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2)/ SLOTS(EV, 2 ) + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: #print "Microcode_Sequencer zero division" @@ -1100,7 +1396,9 @@ def compute(self, EV): class Metric_IPC: name = "IPC" desc = """ -Instructions Per Cycle (per physical core)""" +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 def compute(self, EV): try: @@ -1112,7 +1410,9 @@ def compute(self, EV): class Metric_CPI: name = "CPI" desc = """ -""" +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: @@ -1121,10 +1421,26 @@ def compute(self, EV): print "CPI zero division" self.val = 0 +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + class Metric_UPI: name = "UPI" desc = """ Uops Per Instruction""" + domain = "Metric" + maxval = 2 def compute(self, EV): try: @@ -1137,6 +1453,8 @@ class Metric_IPTB: name = "IPTB" desc = """ Instruction per taken branch""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: @@ -1145,10 +1463,27 @@ def compute(self, EV): print "IPTB zero division" self.val = 0 +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. 
Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = BPTB(EV, 0) + except ZeroDivisionError: + print "BPTB zero division" + self.val = 0 + class Metric_DSB_Coverage: name = "DSB_Coverage" desc = """ Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 def compute(self, EV): try: @@ -1162,6 +1497,8 @@ class Metric_ILP: desc = """ Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1175,6 +1512,8 @@ class Metric_MLP: desc = """ Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1187,6 +1526,8 @@ class Metric_Load_Miss_Real_Latency: name = "Load_Miss_Real_Latency" desc = """ Actual Average Latency for L1 data-cache miss demand loads""" + domain = "Metric" + maxval = 1000 def compute(self, EV): try: @@ -1195,10 +1536,26 @@ def compute(self, EV): print "Load_Miss_Real_Latency zero division" self.val = 0 +class Metric_GFLOPs: + name = "GFLOPs" + desc = """ +Giga Floating Point Operations Per Second""" + domain = "Metric" + maxval = 100 + + def compute(self, EV): + try: + self.val = GFLOPs(EV, 0) + except ZeroDivisionError: + print "GFLOPs zero division" + self.val = 0 + class Metric_Turbo_Utilization: name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1212,6 +1569,8 @@ class Metric_Page_Walks_Use: desc = """ Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 def compute(self, EV): try: @@ -1220,10 +1579,26 @@ def compute(self, EV): print "Page_Walks_Use zero division" self.val = 0 +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + class Metric_CLKS: name = "CLKS" desc = """ Per-thread actual clocks""" + domain = "Count" + maxval = 0 def compute(self, EV): try: @@ -1232,16 +1607,32 @@ def compute(self, EV): print "CLKS zero division" self.val = 0 -class Metric_CLKS1: - name = "CLKS1" +class Metric_CORE_CLKS: + name = "CORE_CLKS" desc = """ Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 def compute(self, EV): try: - self.val = CLKS1(EV, 0) + self.val = Time(EV, 0) except ZeroDivisionError: - print "CLKS1 zero division" + print "Time zero division" self.val = 0 # Schedule @@ -1270,6 +1661,7 @@ def __init__(self, r): n = L1_Bound() ; r.run(n) ; o["L1_Bound"] = n n = DTLB_Load() ; r.run(n) ; o["DTLB_Load"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n n = L2_Bound() ; r.run(n) ; o["L2_Bound"] = n @@ -1277,10 +1669,12 @@ def __init__(self, r): n = Contested_Accesses() ; r.run(n) ; o["Contested_Accesses"] = 
n n = Data_Sharing() ; r.run(n) ; o["Data_Sharing"] = n n = L3_Latency() ; r.run(n) ; o["L3_Latency"] = n + n = SQ_Full() ; r.run(n) ; o["SQ_Full"] = n n = MEM_Bound() ; r.run(n) ; o["MEM_Bound"] = n n = MEM_Bandwidth() ; r.run(n) ; o["MEM_Bandwidth"] = n n = MEM_Latency() ; r.run(n) ; o["MEM_Latency"] = n n = Stores_Bound() ; r.run(n) ; o["Stores_Bound"] = n + n = Store_Latency() ; r.run(n) ; o["Store_Latency"] = n n = False_Sharing() ; r.run(n) ; o["False_Sharing"] = n n = Split_Stores() ; r.run(n) ; o["Split_Stores"] = n n = DTLB_Store() ; r.run(n) ; o["DTLB_Store"] = n @@ -1299,7 +1693,11 @@ def __init__(self, r): n = Port_5() ; r.run(n) ; o["Port_5"] = n n = Retiring() ; r.run(n) ; o["Retiring"] = n n = Base() ; r.run(n) ; o["Base"] = n + n = FP_Arith() ; r.run(n) ; o["FP_Arith"] = n n = FP_x87() ; r.run(n) ; o["FP_x87"] = n + n = FP_Scalar() ; r.run(n) ; o["FP_Scalar"] = n + n = FP_Vector() ; r.run(n) ; o["FP_Vector"] = n + n = Other() ; r.run(n) ; o["Other"] = n n = Microcode_Sequencer() ; r.run(n) ; o["Microcode_Sequencer"] = n # parents @@ -1321,6 +1719,7 @@ def __init__(self, r): o["L1_Bound"].parent = o["Memory_Bound"] o["DTLB_Load"].parent = o["L1_Bound"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] o["L2_Bound"].parent = o["Memory_Bound"] @@ -1328,10 +1727,12 @@ def __init__(self, r): o["Contested_Accesses"].parent = o["L3_Bound"] o["Data_Sharing"].parent = o["L3_Bound"] o["L3_Latency"].parent = o["L3_Bound"] + o["SQ_Full"].parent = o["L3_Bound"] o["MEM_Bound"].parent = o["Memory_Bound"] o["MEM_Bandwidth"].parent = o["MEM_Bound"] o["MEM_Latency"].parent = o["MEM_Bound"] o["Stores_Bound"].parent = o["Memory_Bound"] + o["Store_Latency"].parent = o["Stores_Bound"] o["False_Sharing"].parent = o["Stores_Bound"] o["Split_Stores"].parent = o["Stores_Bound"] o["DTLB_Store"].parent = o["Stores_Bound"] @@ -1349,7 +1750,11 @@ def __init__(self, r): o["Port_4"].parent = o["G3m_Ports_Utilized"] o["Port_5"].parent = o["G3m_Ports_Utilized"] o["Base"].parent = o["Retiring"] - o["FP_x87"].parent = o["Base"] + o["FP_Arith"].parent = o["Base"] + o["FP_x87"].parent = o["FP_Arith"] + o["FP_Scalar"].parent = o["FP_Arith"] + o["FP_Vector"].parent = o["FP_Arith"] + o["Other"].parent = o["Base"] o["Microcode_Sequencer"].parent = o["Retiring"] # references between groups @@ -1371,6 +1776,10 @@ def __init__(self, r): o["Retiring"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Base"].Retiring = o["Retiring"] o["Base"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["FP_Arith"].FP_x87 = o["FP_x87"] + o["FP_Arith"].FP_Scalar = o["FP_Scalar"] + o["FP_Arith"].FP_Vector = o["FP_Vector"] + o["Other"].FP_Arith = o["FP_Arith"] # siblings cross-tree @@ -1378,22 +1787,23 @@ def __init__(self, r): o["Frontend_Latency"].sibling = None o["ICache_Misses"].sibling = None o["ITLB_Misses"].sibling = None - o["Branch_Resteers"].sibling = o["Bad_Speculation"] + o["Branch_Resteers"].sibling = o["Bad_Speculation"] o["DSB_Switches"].sibling = None o["LCP"].sibling = None - o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["MS_Switches"].sibling = o["Microcode_Sequencer"] o["Frontend_Bandwidth"].sibling = None o["MITE"].sibling = None o["DSB"].sibling = None o["LSD"].sibling = None - o["Bad_Speculation"].sibling = o["Branch_Resteers"] + o["Bad_Speculation"].sibling = o["Branch_Resteers"] o["Branch_Mispredicts"].sibling = None o["Machine_Clears"].sibling = None o["Backend_Bound"].sibling = None 
o["Memory_Bound"].sibling = None - o["L1_Bound"].sibling = None + o["L1_Bound"].sibling = o["G1_Port_Utilized"] o["DTLB_Load"].sibling = None o["Store_Fwd_Blk"].sibling = None + o["Lock_Latency"].sibling = o["Store_Latency"] o["Split_Loads"].sibling = None o["G4K_Aliasing"].sibling = None o["L2_Bound"].sibling = None @@ -1401,38 +1811,44 @@ def __init__(self, r): o["Contested_Accesses"].sibling = None o["Data_Sharing"].sibling = None o["L3_Latency"].sibling = None + o["SQ_Full"].sibling = None o["MEM_Bound"].sibling = None o["MEM_Bandwidth"].sibling = None o["MEM_Latency"].sibling = None o["Stores_Bound"].sibling = None + o["Store_Latency"].sibling = o["Lock_Latency"] o["False_Sharing"].sibling = None - o["Split_Stores"].sibling = None + o["Split_Stores"].sibling = o["Port_4"] o["DTLB_Store"].sibling = None o["Core_Bound"].sibling = None o["Divider"].sibling = None o["Ports_Utilization"].sibling = None o["G0_Ports_Utilized"].sibling = None - o["G1_Port_Utilized"].sibling = None + o["G1_Port_Utilized"].sibling = o["L1_Bound"] o["G2_Ports_Utilized"].sibling = None o["G3m_Ports_Utilized"].sibling = None o["Port_0"].sibling = None o["Port_1"].sibling = None o["Port_2"].sibling = None o["Port_3"].sibling = None - o["Port_4"].sibling = None + o["Port_4"].sibling = o["Split_Stores"] o["Port_5"].sibling = None o["Retiring"].sibling = None o["Base"].sibling = None + o["FP_Arith"].sibling = None o["FP_x87"].sibling = None - o["Microcode_Sequencer"].sibling = o["MS_Switches"] + o["FP_Scalar"].sibling = None + o["FP_Vector"].sibling = None + o["Other"].sibling = None + o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events (experimental) + # sampling events o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = [] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] o["ICache_Misses"].sample = [] - o["ITLB_Misses"].sample = [] - o["Branch_Resteers"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES_PS'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["Branch_Resteers"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] o["DSB_Switches"].sample = [] o["LCP"].sample = [] o["MS_Switches"].sample = [] @@ -1440,30 +1856,33 @@ def __init__(self, r): o["MITE"].sample = [] o["DSB"].sample = [] o["LSD"].sample = [] - o["Bad_Speculation"].sample = [] - o["Branch_Mispredicts"].sample = [] - o["Machine_Clears"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] + o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] o["Backend_Bound"].sample = [] o["Memory_Bound"].sample = [] - o["L1_Bound"].sample = [] - o["DTLB_Load"].sample = [] + o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] o["Store_Fwd_Blk"].sample = [] - o["Split_Loads"].sample = [] + o["Lock_Latency"].sample = ['MEM_UOPS_RETIRED.LOCK_LOADS:pp'] + o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] o["G4K_Aliasing"].sample = [] - o["L2_Bound"].sample = [] - o["L3_Bound"].sample = [] - o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS_PS'] - o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT_PS'] - o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT_PS'] - o["MEM_Bound"].sample = [] + o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + 
o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] + o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp'] + o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + o["SQ_Full"].sample = [] + o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] o["MEM_Bandwidth"].sample = [] o["MEM_Latency"].sample = [] - o["Stores_Bound"].sample = [] - o["False_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS'] - o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES_PS', 'MEM_UOPS_RETIRED.ALL_STORES_PS'] - o["DTLB_Store"].sample = [] + o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + o["Store_Latency"].sample = [] + o["False_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_Other_CORE_0'] + o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] o["Core_Bound"].sample = [] - o["Divider"].sample = [] + o["Divider"].sample = ['ARITH.FPU_DIV_ACTIVE'] o["Ports_Utilization"].sample = [] o["G0_Ports_Utilized"].sample = [] o["G1_Port_Utilized"].sample = [] @@ -1476,21 +1895,30 @@ def __init__(self, r): o["Port_4"].sample = [] o["Port_5"].sample = [] o["Retiring"].sample = [] - o["Base"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] + o["FP_Arith"].sample = [] o["FP_x87"].sample = [] - o["Microcode_Sequencer"].sample = [] + o["FP_Scalar"].sample = [] + o["FP_Vector"].sample = [] + o["Other"].sample = [] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] # user visible metrics n = Metric_IPC() ; r.metric(n) n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) n = Metric_UPI() ; r.metric(n) n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) n = Metric_DSB_Coverage() ; r.metric(n) n = Metric_ILP() ; r.metric(n) n = Metric_MLP() ; r.metric(n) n = Metric_Load_Miss_Real_Latency() ; r.metric(n) + n = Metric_GFLOPs() ; r.metric(n) n = Metric_Turbo_Utilization() ; r.metric(n) n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) n = Metric_CLKS() ; r.metric(n) - n = Metric_CLKS1() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/ivb_server_ratios.py b/ivb_server_ratios.py index d665f974..16a3bb11 100644 --- a/ivb_server_ratios.py +++ b/ivb_server_ratios.py @@ -1,14 +1,19 @@ # -# auto generated TopDown description for Intel Xeon E5 v2 (code named IvyBridge EP) +# auto generated TopDown 2.9 description for Intel Xeon E5 v2 (code named IvyBridge EP) # Please see http://ark.intel.com for more details on these CPUs. # +# References: +# http://halobates.de/blog/p/262 +# https://sites.google.com/site/analysismethods/yasin-pubs +# smt_enabled = False # Constants Pipeline_Width = 4 +L2_Store_Latency = 9 Mem_L3_Weight = 7 Mem_STLB_Hit_Cost = 7 Mem_SFB_Cost = 13 @@ -22,85 +27,180 @@ Mem_Remote_Fwd_Cost = 180 MS_Switches_Cost = 3 OneMillion = 1000000 +Energy_Unit = 15.6 # Aux. 
formulas + # Floating Point Operations Count def FLOP_Count(EV, level): - return ( 1 *(EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level))+ 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level))+ 8 * EV("SIMD_FP_256.PACKED_SINGLE", level) ) + return (1 *(EV("FP_COMP_OPS_EXE.X87", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level)) + 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level)) + 8 * EV("SIMD_FP_256.PACKED_SINGLE", level)) + def Recovery_Cycles(EV, level): - return ( EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2)if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + def Execute_Cycles(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c1", level) / 2)if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + def L1D_Miss_Cycles(EV, level): - return ( EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2)if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + def ITLB_Miss_Cycles(EV, level): - return ( Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level) ) + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + +def Cycles_0_Ports_Utilized(EV, level): + EV("ARITH.FPU_DIV_ACTIVE", level) + EV("UOPS_EXECUTED.CORE:i1:c1", level) + EV("RS_EVENTS.EMPTY_CYCLES", level) + return (EV("UOPS_EXECUTED.CORE:i1:c1", level)) / 2 if smt_enabled else(STALLS_TOTAL(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ARITH.FPU_DIV_ACTIVE", level)) + def Cycles_1_Port_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level))/ 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) ) + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level)) + def Cycles_2_Ports_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level))/ 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) ) + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c3", level) + 
EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level)) + def Cycles_3m_Ports_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c3", level) / 2)if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CORE:c3", level) + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c3", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + def STALLS_MEM_ANY(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", level)) , level ) + def STALLS_TOTAL(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level)) , level ) + def ORO_Demand_DRD_C1(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + def ORO_Demand_DRD_C6(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def ORO_Demand_RFO_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Cycles_False_Sharing_Client(EV, level): + return Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", level) + EV("OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE", level)) + def Few_Uops_Executed_Threshold(EV, level): - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) - return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level) > 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level)> 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + def Backend_Bound_At_EXE(EV, level): - return ( STALLS_TOTAL(EV, level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level))/ CLKS(EV, level) + return (STALLS_TOTAL(EV, level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level)) / CLKS(EV, level) + def Mem_L3_Hit_Fraction(EV, level): - return EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level) ) + return EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level)) + 
+def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + def Mispred_Clears_Fraction(EV, level): - return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level) ) + return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) + def Avg_RS_Empty_Period_Clears(EV, level): - return ( EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ICACHE.IFETCH_STALL", level))/ EV("RS_EVENTS.EMPTY_END", level) + return (EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ICACHE.IFETCH_STALL", level)) / EV("RS_EVENTS.EMPTY_END", level) + def Retire_Uop_Fraction(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + def SLOTS(EV, level): - return Pipeline_Width * CLKS1(EV, level) -# Instructions Per Cycle (per physical core) + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) def IPC(EV, level): - return EV("INST_RETIRED.ANY", level) / CLKS1(EV, level) + return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) def CPI(EV, level): return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + # Uops Per Instruction def UPI(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + # Instruction per taken branch def IPTB(EV, level): return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. 
+def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + # Fraction of Uops delivered by the DSB (decoded instructions cache) def DSB_Coverage(EV, level): - return ( EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level))/(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level) ) + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + # Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed) def ILP(EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(EV, level) + # Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) def MLP(EV, level): return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + # Actual Average Latency for L1 data-cache miss demand loads def Load_Miss_Real_Latency(EV, level): - return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level) ) + return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level)) + +# Giga Floating Point Operations Per Second +def GFLOPs(EV, level): + return FLOP_Count(EV, level) / OneMillion / DurationTimeInSeconds(EV, level) / 1000 + # Average Frequency Utilization relative nominal frequency def Turbo_Utilization(EV, level): return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) + # Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store def Page_Walks_Use(EV, level): - return ( EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level))/ CLKS1(EV, level) + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + # Per-thread actual clocks def CLKS(EV, level): return EV("CPU_CLK_UNHALTED.THREAD", level) + # Core actual clocks -def CLKS1(EV, level): - return ( EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2)if smt_enabled else CLKS(EV, level) +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, level): + return DurationTimeInSeconds(EV, level) # Event groups @@ -119,9 +219,10 @@ class Frontend_Bound: latter can accept them. For example, stalls due to instruction-cache misses would be categorized under Frontend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1)/ SLOTS(EV, 1 ) + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Frontend_Bound zero division" @@ -139,9 +240,10 @@ class Frontend_Latency: after a branch misprediction are categorized under Frontend Latency. 
In such cases the Frontend eventually delivers no uops for some period.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2)/ SLOTS(EV, 2 ) + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: #print "Frontend_Latency zero division" @@ -158,9 +260,10 @@ class ICache_Misses: cache misses. Using compiler's Profile-Guided Optimization (PGO) can reduce i-cache misses through improved hot code layout.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ICACHE.IFETCH_STALL", 3)/ CLKS(EV, 3)- self.ITLB_Misses.compute(EV ) + self.val = EV("ICACHE.IFETCH_STALL", 3) / CLKS(EV, 3) - self.ITLB_Misses.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "ICache_Misses zero division" @@ -176,9 +279,10 @@ class ITLB_Misses: This metric represents cycles fraction CPU was stalled due to instruction TLB misses. Using large code pages may be considered here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ITLB_Miss_Cycles(EV, 3)/ CLKS(EV, 3 ) + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "ITLB_Misses zero division" @@ -197,9 +301,10 @@ class Branch_Resteers: For example, branchy code with lots of (taken) branches and/or branch miss- predictions might get categorized under Branch Resteers.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = Avg_RS_Empty_Period_Clears(EV, 3)*(EV("BR_MISP_RETIRED.ALL_BRANCHES", 3)+ EV("MACHINE_CLEARS.COUNT", 3)+ EV("BACLEARS.ANY", 3)) / CLKS(EV, 3 ) + self.val = Avg_RS_Empty_Period_Clears(EV, 3)*(EV("BR_MISP_RETIRED.ALL_BRANCHES", 3) + EV("MACHINE_CLEARS.COUNT", 3) + EV("BACLEARS.ANY", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "Branch_Resteers zero division" @@ -215,9 +320,10 @@ class DSB_Switches: This metric represents cycles fraction CPU was stalled due to switches from DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3)/ CLKS(EV, 3 ) + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "DSB_Switches zero division" @@ -234,9 +340,10 @@ class LCP: Prefixes (LCPs). 
Using proper compiler flags or Intel Compiler by default will certainly avoid this.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ILD_STALL.LCP", 3)/ CLKS(EV, 3 ) + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "LCP zero division" @@ -255,9 +362,10 @@ class MS_Switches: deliver long uop flows required by CISC instructions like CPUID, or uncommon conditions like Floating Point Assists when dealing with Denormals.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3)/ CLKS(EV, 3 ) + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "MS_Switches zero division" @@ -276,9 +384,10 @@ class Frontend_Bandwidth: under Frontend Bandwidth. In such cases, the Frontend typically delivers non- optimal amount of uops to the Backend.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Frontend_Bound.compute(EV)- self.Frontend_Latency.compute(EV ) + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: #print "Frontend_Bandwidth zero division" @@ -288,16 +397,17 @@ def compute(self, EV): class MITE: name = "MITE" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -the MITE fetch pipeline. For example, inefficiencies in the instruction -decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MITE zero division" @@ -307,17 +417,18 @@ def compute(self, EV): class DSB: name = "DSB" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -DSB (decoded uop cache) fetch pipeline. For example, inefficient utilization -of the DSB cache structure or bank conflict when reading from it, are -categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: #print "DSB zero division" @@ -327,18 +438,19 @@ def compute(self, EV): class LSD: name = "LSD" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -LSD (Loop Stream Detector) unit. 
LSD typically does well sustaining Uop -supply. However, in some rare cases, optimal uop-delivery could not be reached -for small loops whose size (in terms of number of uops) does not suit well the -LSD structure.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("LSD.CYCLES_ACTIVE", 3)- EV("LSD.CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "LSD zero division" @@ -357,9 +469,10 @@ class Bad_Speculation: speculation. For example, wasted work due to miss-predicted branches are categorized under Bad Speculation category""" level = 1 + htoff = False def compute(self, EV): try: - self.val = ( EV("UOPS_ISSUED.ANY", 1)- EV("UOPS_RETIRED.RETIRE_SLOTS", 1)+ Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: #print "Bad_Speculation zero division" @@ -377,6 +490,7 @@ class Branch_Mispredicts: incorrectly speculated program path, or stalls the Backend of the machine needs to recover its state from a speculative path.""" level = 2 + htoff = False def compute(self, EV): try: self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) @@ -398,9 +512,10 @@ class Machine_Clears: example, this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Bad_Speculation.compute(EV)- self.Branch_Mispredicts.compute(EV ) + self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "Machine_Clears zero division" @@ -421,9 +536,10 @@ class Backend_Bound: stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = 1 -(self.Frontend_Bound.compute(EV)+ self.Bad_Speculation.compute(EV)+ self.Retiring.compute(EV)) + self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Backend_Bound zero division" @@ -442,9 +558,10 @@ class Memory_Bound: memory demand loads which coincides with execution starvation. 
in addition to less common cases where stores could imply backpressure on the pipeline.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = ( STALLS_MEM_ANY(EV, 2)+ EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) + self.val = (STALLS_MEM_ANY(EV, 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Memory_Bound zero division" @@ -464,9 +581,10 @@ class L1_Bound: allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event as it accounts for any non-completed load.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( STALLS_MEM_ANY(EV, 3)- EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) + self.val = (STALLS_MEM_ANY(EV, 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh except ZeroDivisionError: #print "L1_Bound zero division" @@ -480,9 +598,10 @@ class DTLB_Load: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4)+ EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "DTLB_Load zero division" @@ -496,9 +615,10 @@ class Store_Fwd_Blk: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4)/ CLKS(EV, 4 ) + self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Store_Fwd_Blk zero division" @@ -506,15 +626,36 @@ def compute(self, EV): self.thresh = False return self.val +class Lock_Latency: + name = "Lock_Latency" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents cycles fraction the CPU spent handling cache misses due +to lock operations. Due to the microarchitecture handling of locks, they are +classified as L1_Bound regardless of what memory source satsified them.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Mem_Lock_St_Fraction(EV, 4)* ORO_Demand_RFO_C1(EV, 4) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Lock_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + class Split_Loads: name = "Split_Loads" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Load_Miss_Real_Latency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4)/ CLKS(EV, 4 ) + self.val = 13 * EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Split_Loads zero division" @@ -528,9 +669,10 @@ class G4K_Aliasing: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4)/ CLKS(EV, 4 ) + self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "G4K_Aliasing zero division" @@ -547,9 +689,10 @@ class L2_Bound: misses (i.e. 
L1 misses/L2 hits) will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)- EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) + self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.03) and self.parent.thresh except ZeroDivisionError: #print "L2_Bound zero division" @@ -566,9 +709,10 @@ class L3_Bound: a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "L3_Bound zero division" @@ -582,9 +726,10 @@ class Contested_Accesses: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4)+ EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) + self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Contested_Accesses zero division" @@ -598,9 +743,10 @@ class Data_Sharing: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Data_Sharing zero division" @@ -618,9 +764,10 @@ class L3_Latency: with a sibling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve the latency and increase performance.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "L3_Latency zero division" @@ -628,6 +775,26 @@ def compute(self, EV): self.thresh = False return self.val +class SQ_Full: + name = "SQ_Full" + domain = "CoreClocks" + area = "BE/Mem" + desc = """ +This metric measures fraction of cycles where the Super Queue (SQ) was full +taking into account all request-types and both hardware SMT threads. The Super +Queue is used for requests to access the L2 cache or to go out to the Uncore.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "SQ_Full zero division" + self.val = 0 + self.thresh = False + return self.val + class MEM_Bound: name = "MEM_Bound" domain = "Clocks" @@ -636,9 +803,10 @@ class MEM_Bound: This metric represents how often CPU was stalled on main memory (DRAM). 
Caching will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( 1 - Mem_L3_Hit_Fraction(EV, 3)) * EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Bound zero division" @@ -655,9 +823,10 @@ class MEM_Bandwidth: bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ORO_Demand_DRD_C6(EV, 4)/ CLKS(EV, 4 ) + self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Bandwidth zero division" @@ -674,9 +843,10 @@ class MEM_Latency: main memory (DRAM). Data layout re-structuring or using Software Prefetches (also through the compiler) may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( ORO_Demand_DRD_C1(EV, 4)- ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) + self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Latency zero division" @@ -692,9 +862,10 @@ class Local_DRAM: This metric represents how often CPU was likely stalled due to loads from local memory. Caching will improve the latency and increase performance.""" level = 5 + htoff = False def compute(self, EV): try: - self.val = Mem_Local_DRAM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM", 5)/ CLKS(EV, 5 ) + self.val = Mem_Local_DRAM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM", 5) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Local_DRAM zero division" @@ -710,9 +881,10 @@ class Remote_DRAM: This metric represents how often CPU was likely stalled due to loads from remote memory. This is caused often due to non-optimal NUMA allocations.""" level = 5 + htoff = False def compute(self, EV): try: - self.val = Mem_Remote_DRAM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM", 5)/ CLKS(EV, 5 ) + self.val = Mem_Remote_DRAM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM", 5) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Remote_DRAM zero division" @@ -729,9 +901,10 @@ class Remote_Cache: remote cache in other sockets. This is caused often due to non-optimal NUMA allocations.""" level = 5 + htoff = False def compute(self, EV): try: - self.val = ( Mem_Remote_HitM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM", 5)+ Mem_Remote_Fwd_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD", 5)) / CLKS(EV, 5 ) + self.val = (Mem_Remote_HitM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM", 5) + Mem_Remote_Fwd_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD", 5)) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Remote_Cache zero division" @@ -744,12 +917,15 @@ class Stores_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents how often CPU was stalled due to store operations. 
+even though memory store accesses do not typically stall out-of-order CPUs; +there are few cases where stores can lead to actual stalls. This metric will +be flagged should any of these cases be a bottleneck.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.Memory_Bound.compute(EV)- STALLS_MEM_ANY(EV, 3)/ CLKS(EV, 3 ) + self.val = self.Memory_Bound.compute(EV) - STALLS_MEM_ANY(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Stores_Bound zero division" @@ -757,36 +933,38 @@ def compute(self, EV): self.thresh = False return self.val -class False_Sharing: - name = "False_Sharing" +class Store_Latency: + name = "Store_Latency" domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents cycles fraction the CPU spent handling long-latency +store misses (missing 2nd level cache).""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4)+ EV("OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_Other_CORE", 4)) / CLKS(EV, 4 ) + self.val = (Store_L2_Hit_Cycles(EV, 4) +(1 - Mem_Lock_St_Fraction(EV, 4))* ORO_Demand_RFO_C1(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "False_Sharing zero division" + #print "Store_Latency zero division" self.val = 0 self.thresh = False return self.val class Split_Stores: name = "Split_Stores" - domain = "Stores" + domain = "CoreClocks" area = "BE/Mem" desc = """ This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4)/ EV("MEM_UOPS_RETIRED.ALL_STORES", 4 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Split_Stores zero division" self.val = 0 @@ -805,9 +983,10 @@ class DTLB_Store: the same page. Try using larger page sizes for large amounts of frequently- used data.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4)+ EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "DTLB_Store zero division" @@ -828,9 +1007,10 @@ class Core_Bound: performance (e.g. FP-chained long-latency arithmetic operations). 
Tip: consider Port Saturation analysis as next step.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Backend_Bound_At_EXE(EV, 2)- self.Memory_Bound.compute(EV ) + self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Core_Bound zero division" @@ -840,13 +1020,14 @@ def compute(self, EV): class Divider: name = "Divider" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = "" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ARITH.FPU_DIV_ACTIVE", 3)/ CLKS1(EV, 3 ) + self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Divider zero division" @@ -867,9 +1048,10 @@ class Ports_Utilization: options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.Core_Bound.compute(EV)- self.Divider.compute(EV ) + self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Ports_Utilization zero division" @@ -879,15 +1061,16 @@ def compute(self, EV): class G0_Ports_Utilized: name = "0_Ports_Utilized" - domain = "Clocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed no uops on any execution -port.""" +This metric represents Core cycles fraction CPU executed no uops on any +execution port.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( STALLS_TOTAL(EV, 4)- EV("RS_EVENTS.EMPTY_CYCLES", 4)- EV("ARITH.FPU_DIV_ACTIVE", 4)) / CLKS(EV, 4 ) + self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G0_Ports_Utilized zero division" @@ -897,20 +1080,22 @@ def compute(self, EV): class G1_Port_Utilized: name = "1_Port_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 1 uop per cycle -on all execution ports. This can be due to heavy data-dependency among -instructions. In some cases with high 1_Port_Utilized and L1_Bound it can -point to L1 data-cache latency bottleneck that may not necessarily manifest -with complete execution starvation (due to the short L1 latency e.g. walking -linked list) - looking at the assembly can be helpful. Tip: consider 'Core -Port Saturation' analysis-type as next step.""" +This metric represents Core cycles fraction where the CPU executed total of 1 +uop per cycle on all execution ports. This can be due to heavy data-dependency +among software instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and L1_Bound, this +metric can point to L1 data-cache latency bottleneck that may not necessarily +manifest with complete execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be helpful. 
Tip: consider +'Core Ports Saturation' analysis-type as next step.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_1_Port_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G1_Port_Utilized zero division" @@ -920,18 +1105,19 @@ def compute(self, EV): class G2_Ports_Utilized: name = "2_Ports_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 2 uops per cycle -on all execution ports. Tip: consider 'Core Port Saturation' analysis-type as -next step. Loop Vectorization -most compilers feature auto-Vectorization -options today- reduces pressure on the execution ports as multiple elements -are calculated with same uop.""" +This metric represents Core cycles fraction CPU executed total of 2 uops per +cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- +type as next step. Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the execution ports as +multiple elements are calculated with same uop.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_2_Ports_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G2_Ports_Utilized zero division" @@ -941,16 +1127,17 @@ def compute(self, EV): class G3m_Ports_Utilized: name = "3m_Ports_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 3 or more uops -per cycle on all execution ports. Tip: consider 'Core Port Saturation' +This metric represents Core cycles fraction CPU executed total of 3 or more +uops per cycle on all execution ports. 
Tip: consider 'Core Port Saturation' analysis-type as next step""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_3m_Ports_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G3m_Ports_Utilized zero division" @@ -960,16 +1147,17 @@ def compute(self, EV): class Port_0: name = "Port_0" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution -port 0 (ALU)""" +port 0 (SNB+: ALU; HSW+:ALU and 2nd branch)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_0 zero division" self.val = 0 @@ -978,16 +1166,17 @@ def compute(self, EV): class Port_1: name = "Port_1" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 1 (ALU)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_1 zero division" self.val = 0 @@ -996,16 +1185,17 @@ def compute(self, EV): class Port_2: name = "Port_2" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 2 (Loads and Store-address)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_2 zero division" self.val = 0 @@ -1014,16 +1204,17 @@ def compute(self, EV): class Port_3: name = "Port_3" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 3 (Loads and Store-address)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_3 zero division" self.val = 0 @@ -1032,16 +1223,17 @@ def compute(self, EV): class Port_4: name = "Port_4" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 4 (Store-data)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_4 zero division" self.val = 0 @@ -1050,16 +1242,17 @@ def compute(self, EV): class Port_5: name = "Port_5" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ 
This metric represents Core cycles fraction CPU dispatched uops on execution -port 5 (Branches and ALU)""" +port 5 (SNB+: Branches and ALU; HSW+: ALU)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_5 zero division" self.val = 0 @@ -1083,9 +1276,10 @@ class Retiring: computations be done without significantly increasing number of instructions thus improving the performance.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1)/ SLOTS(EV, 1 ) + self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: #print "Retiring zero division" @@ -1098,16 +1292,18 @@ class Base: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops not originated -from the microcode-sequencer. This correlates with total number of +This metric represents slots fraction where the CPU was retiring uops not +originated from the microcode-sequencer. This correlates with total number of instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. A high Retiring value for non-vectorized code is typically a good -hint for programmer to pursue vectorizing his code, which can reduce -instructions hence this bucket.""" +expected. While this is the most desirable of the top 4 categories, high +values may still indicate areas for improvement. If possible focus on +techniques that reduce instruction count or result in more efficient +instructions generation such as vectorization.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Retiring.compute(EV)- self.Microcode_Sequencer.compute(EV ) + self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: #print "Base zero division" @@ -1123,9 +1319,10 @@ class FP_Arith: This metric represents overall arithmetic floating-point (FP) uops fraction the CPU has executed.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.FP_x87.compute(EV)+ self.FP_Scalar.compute(EV)+ self.FP_Vector.compute(EV ) + self.val = self.FP_x87.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "FP_Arith zero division" @@ -1138,13 +1335,14 @@ class FP_x87: domain = "Uops" area = "RET" desc = """ -This metric represents floating-point (FP) x87 uops fraction the CPU has -executed. Tip: consider compiler flags to generate newer AVX (or SSE) +This metric is an approxmiation of floating-point (FP) x87 (arithmetic) uops +fraction. Tip: consider compiler flags to generate newer AVX (or SSE) instruction sets, which typically perform better and feature vectors.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("FP_COMP_OPS_EXE.X87", 4)/ EV("UOPS_EXECUTED.THREAD", 4 ) + self.val = EV("FP_COMP_OPS_EXE.X87", 4) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "FP_x87 zero division" @@ -1161,9 +1359,10 @@ class FP_Scalar: CPU has executed. 
Tip: investigate what limits (compiler) generation of vector code.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", 4)+ EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) + self.val = (EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", 4) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "FP_Scalar zero division" @@ -1179,9 +1378,10 @@ class FP_Vector: This metric represents arithmetic floating-point (FP) vector uops fraction the CPU has executed. Tip: check if vector width is expected""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", 4)+ EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", 4)+ EV("SIMD_FP_256.PACKED_SINGLE", 4)+ EV("SIMD_FP_256.PACKED_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) + self.val = (EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", 4) + EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "FP_Vector zero division" @@ -1198,6 +1398,7 @@ class Other: executed. If you application has no FP operations, this will likely be biggest fraction.""" level = 3 + htoff = False def compute(self, EV): try: self.val = 1 - self.FP_Arith.compute(EV ) @@ -1218,9 +1419,10 @@ class Microcode_Sequencer: decoded by the default decoders (like repeat move strings), or by microcode assists used to address some operation modes (like in Floating Point assists).""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2)/ SLOTS(EV, 2 ) + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: #print "Microcode_Sequencer zero division" @@ -1231,7 +1433,9 @@ def compute(self, EV): class Metric_IPC: name = "IPC" desc = """ -Instructions Per Cycle (per physical core)""" +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 def compute(self, EV): try: @@ -1243,7 +1447,9 @@ def compute(self, EV): class Metric_CPI: name = "CPI" desc = """ -""" +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: @@ -1252,10 +1458,26 @@ def compute(self, EV): print "CPI zero division" self.val = 0 +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + class Metric_UPI: name = "UPI" desc = """ Uops Per Instruction""" + domain = "Metric" + maxval = 2 def compute(self, EV): try: @@ -1268,6 +1490,8 @@ class Metric_IPTB: name = "IPTB" desc = """ Instruction per taken branch""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: @@ -1276,10 +1500,27 @@ def compute(self, EV): print "IPTB zero division" self.val = 0 +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. 
Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = BPTB(EV, 0) + except ZeroDivisionError: + print "BPTB zero division" + self.val = 0 + class Metric_DSB_Coverage: name = "DSB_Coverage" desc = """ Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 def compute(self, EV): try: @@ -1293,6 +1534,8 @@ class Metric_ILP: desc = """ Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1306,6 +1549,8 @@ class Metric_MLP: desc = """ Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1318,6 +1563,8 @@ class Metric_Load_Miss_Real_Latency: name = "Load_Miss_Real_Latency" desc = """ Actual Average Latency for L1 data-cache miss demand loads""" + domain = "Metric" + maxval = 1000 def compute(self, EV): try: @@ -1326,10 +1573,26 @@ def compute(self, EV): print "Load_Miss_Real_Latency zero division" self.val = 0 +class Metric_GFLOPs: + name = "GFLOPs" + desc = """ +Giga Floating Point Operations Per Second""" + domain = "Metric" + maxval = 100 + + def compute(self, EV): + try: + self.val = GFLOPs(EV, 0) + except ZeroDivisionError: + print "GFLOPs zero division" + self.val = 0 + class Metric_Turbo_Utilization: name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1343,6 +1606,8 @@ class Metric_Page_Walks_Use: desc = """ Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 def compute(self, EV): try: @@ -1351,10 +1616,26 @@ def compute(self, EV): print "Page_Walks_Use zero division" self.val = 0 +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + class Metric_CLKS: name = "CLKS" desc = """ Per-thread actual clocks""" + domain = "Count" + maxval = 0 def compute(self, EV): try: @@ -1363,16 +1644,32 @@ def compute(self, EV): print "CLKS zero division" self.val = 0 -class Metric_CLKS1: - name = "CLKS1" +class Metric_CORE_CLKS: + name = "CORE_CLKS" desc = """ Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 def compute(self, EV): try: - self.val = CLKS1(EV, 0) + self.val = Time(EV, 0) except ZeroDivisionError: - print "CLKS1 zero division" + print "Time zero division" self.val = 0 # Schedule @@ -1401,6 +1698,7 @@ def __init__(self, r): n = L1_Bound() ; r.run(n) ; o["L1_Bound"] = n n = DTLB_Load() ; r.run(n) ; o["DTLB_Load"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n n = L2_Bound() ; r.run(n) ; o["L2_Bound"] = n @@ -1408,6 +1706,7 @@ def __init__(self, r): n = Contested_Accesses() ; r.run(n) ; o["Contested_Accesses"] = n 
n = Data_Sharing() ; r.run(n) ; o["Data_Sharing"] = n n = L3_Latency() ; r.run(n) ; o["L3_Latency"] = n + n = SQ_Full() ; r.run(n) ; o["SQ_Full"] = n n = MEM_Bound() ; r.run(n) ; o["MEM_Bound"] = n n = MEM_Bandwidth() ; r.run(n) ; o["MEM_Bandwidth"] = n n = MEM_Latency() ; r.run(n) ; o["MEM_Latency"] = n @@ -1415,7 +1714,7 @@ def __init__(self, r): n = Remote_DRAM() ; r.run(n) ; o["Remote_DRAM"] = n n = Remote_Cache() ; r.run(n) ; o["Remote_Cache"] = n n = Stores_Bound() ; r.run(n) ; o["Stores_Bound"] = n - n = False_Sharing() ; r.run(n) ; o["False_Sharing"] = n + n = Store_Latency() ; r.run(n) ; o["Store_Latency"] = n n = Split_Stores() ; r.run(n) ; o["Split_Stores"] = n n = DTLB_Store() ; r.run(n) ; o["DTLB_Store"] = n n = Core_Bound() ; r.run(n) ; o["Core_Bound"] = n @@ -1459,6 +1758,7 @@ def __init__(self, r): o["L1_Bound"].parent = o["Memory_Bound"] o["DTLB_Load"].parent = o["L1_Bound"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] o["L2_Bound"].parent = o["Memory_Bound"] @@ -1466,6 +1766,7 @@ def __init__(self, r): o["Contested_Accesses"].parent = o["L3_Bound"] o["Data_Sharing"].parent = o["L3_Bound"] o["L3_Latency"].parent = o["L3_Bound"] + o["SQ_Full"].parent = o["L3_Bound"] o["MEM_Bound"].parent = o["Memory_Bound"] o["MEM_Bandwidth"].parent = o["MEM_Bound"] o["MEM_Latency"].parent = o["MEM_Bound"] @@ -1473,7 +1774,7 @@ def __init__(self, r): o["Remote_DRAM"].parent = o["MEM_Latency"] o["Remote_Cache"].parent = o["MEM_Latency"] o["Stores_Bound"].parent = o["Memory_Bound"] - o["False_Sharing"].parent = o["Stores_Bound"] + o["Store_Latency"].parent = o["Stores_Bound"] o["Split_Stores"].parent = o["Stores_Bound"] o["DTLB_Store"].parent = o["Stores_Bound"] o["Core_Bound"].parent = o["Backend_Bound"] @@ -1527,22 +1828,23 @@ def __init__(self, r): o["Frontend_Latency"].sibling = None o["ICache_Misses"].sibling = None o["ITLB_Misses"].sibling = None - o["Branch_Resteers"].sibling = o["Bad_Speculation"] + o["Branch_Resteers"].sibling = o["Bad_Speculation"] o["DSB_Switches"].sibling = None o["LCP"].sibling = None - o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["MS_Switches"].sibling = o["Microcode_Sequencer"] o["Frontend_Bandwidth"].sibling = None o["MITE"].sibling = None o["DSB"].sibling = None o["LSD"].sibling = None - o["Bad_Speculation"].sibling = o["Branch_Resteers"] + o["Bad_Speculation"].sibling = o["Branch_Resteers"] o["Branch_Mispredicts"].sibling = None o["Machine_Clears"].sibling = None o["Backend_Bound"].sibling = None o["Memory_Bound"].sibling = None - o["L1_Bound"].sibling = None + o["L1_Bound"].sibling = o["G1_Port_Utilized"] o["DTLB_Load"].sibling = None o["Store_Fwd_Blk"].sibling = None + o["Lock_Latency"].sibling = o["Store_Latency"] o["Split_Loads"].sibling = None o["G4K_Aliasing"].sibling = None o["L2_Bound"].sibling = None @@ -1550,6 +1852,7 @@ def __init__(self, r): o["Contested_Accesses"].sibling = None o["Data_Sharing"].sibling = None o["L3_Latency"].sibling = None + o["SQ_Full"].sibling = None o["MEM_Bound"].sibling = None o["MEM_Bandwidth"].sibling = None o["MEM_Latency"].sibling = None @@ -1557,21 +1860,21 @@ def __init__(self, r): o["Remote_DRAM"].sibling = None o["Remote_Cache"].sibling = None o["Stores_Bound"].sibling = None - o["False_Sharing"].sibling = None - o["Split_Stores"].sibling = None + o["Store_Latency"].sibling = o["Lock_Latency"] + o["Split_Stores"].sibling = o["Port_4"] o["DTLB_Store"].sibling = None 
o["Core_Bound"].sibling = None o["Divider"].sibling = None o["Ports_Utilization"].sibling = None o["G0_Ports_Utilized"].sibling = None - o["G1_Port_Utilized"].sibling = None + o["G1_Port_Utilized"].sibling = o["L1_Bound"] o["G2_Ports_Utilized"].sibling = None o["G3m_Ports_Utilized"].sibling = None o["Port_0"].sibling = None o["Port_1"].sibling = None o["Port_2"].sibling = None o["Port_3"].sibling = None - o["Port_4"].sibling = None + o["Port_4"].sibling = o["Split_Stores"] o["Port_5"].sibling = None o["Retiring"].sibling = None o["Base"].sibling = None @@ -1580,15 +1883,15 @@ def __init__(self, r): o["FP_Scalar"].sibling = None o["FP_Vector"].sibling = None o["Other"].sibling = None - o["Microcode_Sequencer"].sibling = o["MS_Switches"] + o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events (experimential) + # sampling events o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = [] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] o["ICache_Misses"].sample = [] - o["ITLB_Misses"].sample = [] - o["Branch_Resteers"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES_PS'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["Branch_Resteers"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] o["DSB_Switches"].sample = [] o["LCP"].sample = [] o["MS_Switches"].sample = [] @@ -1596,33 +1899,35 @@ def __init__(self, r): o["MITE"].sample = [] o["DSB"].sample = [] o["LSD"].sample = [] - o["Bad_Speculation"].sample = [] - o["Branch_Mispredicts"].sample = [] - o["Machine_Clears"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] + o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] o["Backend_Bound"].sample = [] o["Memory_Bound"].sample = [] - o["L1_Bound"].sample = [] - o["DTLB_Load"].sample = [] + o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] o["Store_Fwd_Blk"].sample = [] - o["Split_Loads"].sample = [] + o["Lock_Latency"].sample = ['MEM_UOPS_RETIRED.LOCK_LOADS:pp'] + o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] o["G4K_Aliasing"].sample = [] - o["L2_Bound"].sample = [] - o["L3_Bound"].sample = [] - o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS_PS'] - o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT_PS'] - o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT_PS'] - o["MEM_Bound"].sample = [] + o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] + o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp'] + o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + o["SQ_Full"].sample = [] + o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] o["MEM_Bandwidth"].sample = [] o["MEM_Latency"].sample = [] - o["Local_DRAM"].sample = [] - o["Remote_DRAM"].sample = [] - o["Remote_Cache"].sample = [] - o["Stores_Bound"].sample = [] - o["False_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS'] - o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES_PS', 'MEM_UOPS_RETIRED.ALL_STORES_PS'] - o["DTLB_Store"].sample = [] + o["Local_DRAM"].sample = ['MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM:pp'] + o["Remote_DRAM"].sample = 
['MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM:pp'] + o["Remote_Cache"].sample = ['MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM:pp', 'MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD:pp'] + o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + o["Store_Latency"].sample = [] + o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] o["Core_Bound"].sample = [] - o["Divider"].sample = [] + o["Divider"].sample = ['ARITH.FPU_DIV_ACTIVE'] o["Ports_Utilization"].sample = [] o["G0_Ports_Utilized"].sample = [] o["G1_Port_Utilized"].sample = [] @@ -1635,25 +1940,30 @@ def __init__(self, r): o["Port_4"].sample = [] o["Port_5"].sample = [] o["Retiring"].sample = [] - o["Base"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] o["FP_Arith"].sample = [] o["FP_x87"].sample = [] o["FP_Scalar"].sample = [] o["FP_Vector"].sample = [] o["Other"].sample = [] - o["Microcode_Sequencer"].sample = [] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] # user visible metrics n = Metric_IPC() ; r.metric(n) n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) n = Metric_UPI() ; r.metric(n) n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) n = Metric_DSB_Coverage() ; r.metric(n) n = Metric_ILP() ; r.metric(n) n = Metric_MLP() ; r.metric(n) n = Metric_Load_Miss_Real_Latency() ; r.metric(n) + n = Metric_GFLOPs() ; r.metric(n) n = Metric_Turbo_Utilization() ; r.metric(n) n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) n = Metric_CLKS() ; r.metric(n) - n = Metric_CLKS1() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/jkt_server_ratios.py b/jkt_server_ratios.py new file mode 100644 index 00000000..2a54ae24 --- /dev/null +++ b/jkt_server_ratios.py @@ -0,0 +1,789 @@ + +# +# auto generated TopDown 2.9 description for Intel Xeon E5 (code named SandyBridge EP) +# Please see http://ark.intel.com for more details on these CPUs. +# +# References: +# http://halobates.de/blog/p/262 +# https://sites.google.com/site/analysismethods/yasin-pubs +# + +smt_enabled = False + +# Constants + +Pipeline_Width = 4 +L2_Store_Latency = 9 +Mem_L3_Weight = 7 +Mem_STLB_Hit_Cost = 7 +Mem_SFB_Cost = 13 +Mem_4K_Alias_Cost = 7 +Mem_XSNP_HitM_Cost = 60 +MEM_XSNP_Hit_Cost = 43 +MEM_XSNP_None_Cost = 29 +Mem_Local_DRAM_Cost = 200 +Mem_Remote_DRAM_Cost = 310 +Mem_Remote_HitM_Cost = 200 +Mem_Remote_Fwd_Cost = 180 +MS_Switches_Cost = 3 +OneMillion = 1000000 +Energy_Unit = 15.6 + +# Aux. 
formulas + + +# Floating Point Operations Count +def FLOP_Count(EV, level): + return (1 *(EV("FP_COMP_OPS_EXE.X87", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level)) + 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level)) + 8 * EV("SIMD_FP_256.PACKED_SINGLE", level)) + +def Recovery_Cycles(EV, level): + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + +def L1D_Miss_Cycles(EV, level): + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + +def ITLB_Miss_Cycles(EV, level): + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + +def ORO_Demand_DRD_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + +def ORO_Demand_DRD_C6(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def ORO_Demand_RFO_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Mem_L3_Hit_Fraction(EV, level): + return EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level)) + +def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + +def Retire_Uop_Fraction(EV, level): + return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + +def SLOTS(EV, level): + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) +def IPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) +def CPI(EV, level): + return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + +# Uops Per Instruction +def UPI(EV, level): + return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + +# Instruction per taken branch +def IPTB(EV, level): + return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. 
+def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Fraction of Uops delivered by the DSB (decoded instructions cache) +def DSB_Coverage(EV, level): + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + +# Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) +def MLP(EV, level): + return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + +# Giga Floating Point Operations Per Second +def GFLOPs(EV, level): + return FLOP_Count(EV, level) / OneMillion / DurationTimeInSeconds(EV, level) / 1000 + +# Average Frequency Utilization relative nominal frequency +def Turbo_Utilization(EV, level): + return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) + +# Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store +def Page_Walks_Use(EV, level): + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + +# Per-thread actual clocks +def CLKS(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD", level) + +# Core actual clocks +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, level): + return DurationTimeInSeconds(EV, level) + +# Event groups + + +class Frontend_Bound: + name = "Frontend_Bound" + domain = "Slots" + area = "FE" + desc = """ +This category reflects slots where the Frontend of the processor undersupplies +its Backend. Frontend denotes the first portion of pipeline responsible to +fetch micro-ops which the Backend can execute. Within the Frontend, a branch +predictor predicts the next address to fetch, cache-lines are fetched from +memory, parsed into instructions, and lastly decoded into micro-ops. The +purpose of the Frontend cluster is to deliver uops to Backend whenever the +latter can accept them. For example, stalls due to instruction-cache misses +would be categorized under Frontend Bound.""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.2) + except ZeroDivisionError: + #print "Frontend_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Frontend_Latency: + name = "Frontend_Latency" + domain = "Slots" + area = "FE" + desc = """ +This metric represents slots fraction CPU was stalled due to Frontend latency +issues. For example, instruction-cache misses, iTLB misses or fetch stalls +after a branch misprediction are categorized under Frontend Latency. 
In such +cases the Frontend eventually delivers no uops for some period.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) + self.thresh = (self.val > 0.15) and self.parent.thresh + except ZeroDivisionError: + #print "Frontend_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + +class ITLB_Misses: + name = "ITLB_Misses" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to instruction TLB +misses. Using large code pages may be considered here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "ITLB_Misses zero division" + self.val = 0 + self.thresh = False + return self.val + +class DSB_Switches: + name = "DSB_Switches" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to switches from +DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "DSB_Switches zero division" + self.val = 0 + self.thresh = False + return self.val + +class LCP: + name = "LCP" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to Length Changing +Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will +certainly avoid this.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "LCP zero division" + self.val = 0 + self.thresh = False + return self.val + +class MS_Switches: + name = "MS_Switches" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to switches of uop +delivery to the Microcode Sequencer (MS). Commonly used instructions are +optimized for delivery by the DSB or MITE pipelines. The MS is designated to +deliver long uop flows required by CISC instructions like CPUID, or uncommon +conditions like Floating Point Assists when dealing with Denormals.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "MS_Switches zero division" + self.val = 0 + self.thresh = False + return self.val + +class Frontend_Bandwidth: + name = "Frontend_Bandwidth" + domain = "Slots" + area = "FE" + desc = """ +This metric represents slots fraction CPU was stalled due to Frontend +bandwidth issues. For example, inefficiencies at the instruction decoders, or +code restrictions for caching in the DSB (decoded uops cache) are categorized +under Frontend Bandwidth. 
In such cases, the Frontend typically delivers non- +optimal amount of uops to the Backend.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) + self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh + except ZeroDivisionError: + #print "Frontend_Bandwidth zero division" + self.val = 0 + self.thresh = False + return self.val + +class MITE: + name = "MITE" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "MITE zero division" + self.val = 0 + self.thresh = False + return self.val + +class DSB: + name = "DSB" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.3) and self.parent.thresh + except ZeroDivisionError: + #print "DSB zero division" + self.val = 0 + self.thresh = False + return self.val + +class LSD: + name = "LSD" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "LSD zero division" + self.val = 0 + self.thresh = False + return self.val + +class Bad_Speculation: + name = "Bad_Speculation" + domain = "Slots" + area = "BAD" + desc = """ +This category reflects slots wasted due to incorrect speculations, which +include slots used to allocate uops that do not eventually get retired and +slots for which allocation was blocked due to recovery from earlier incorrect +speculation. For example, wasted work due to miss-predicted branches are +categorized under Bad Speculation category""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.1) + except ZeroDivisionError: + #print "Bad_Speculation zero division" + self.val = 0 + self.thresh = False + return self.val + +class Backend_Bound: + name = "Backend_Bound" + domain = "Slots" + area = "BE" + desc = """ +This category reflects slots where no uops are being delivered due to a lack +of required resources for accepting more uops in the Backend of the pipeline. 
+Backend describes the portion of the pipeline where the out-of-order scheduler
+dispatches ready uops into their respective execution units, and once
+completed these uops get retired according to program order. For example,
+stalls due to data-cache misses or stalls due to the divider unit being
+overloaded are both categorized under Backend Bound."""
+    level = 1
+    htoff = False
+    def compute(self, EV):
+        try:
+            self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV))
+            self.thresh = (self.val > 0.2)
+        except ZeroDivisionError:
+            #print "Backend_Bound zero division"
+            self.val = 0
+            self.thresh = False
+        return self.val
+
+class Retiring:
+    name = "Retiring"
+    domain = "Slots"
+    area = "RET"
+    desc = """
+This category reflects slots utilized by useful work i.e. allocated uops that
+eventually get retired. Ideally, all pipeline slots would be attributed to the
+Retiring category. Retiring of 100% would indicate the maximum 4 uops retired
+per cycle has been achieved. Maximizing Retiring typically increases the
+Instruction-Per-Cycle metric. Note that a high Retiring value does not
+necessarily mean there is no room for more performance. For example, Microcode
+assists are categorized under Retiring. They hurt performance and can often be
+avoided. A high Retiring value for non-vectorized code may be a good hint for
+programmer to consider vectorizing his code. Doing so essentially lets more
+computations be done without significantly increasing number of instructions
+thus improving the performance."""
+    level = 1
+    htoff = False
+    def compute(self, EV):
+        try:
+            self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 )
+            self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh
+        except ZeroDivisionError:
+            #print "Retiring zero division"
+            self.val = 0
+            self.thresh = False
+        return self.val
+
+class Base:
+    name = "Base"
+    domain = "Slots"
+    area = "RET"
+    desc = """
+This metric represents slots fraction where the CPU was retiring uops not
+originated from the microcode-sequencer. This correlates with total number of
+instructions used by the program. A uops-per-instruction ratio of 1 should be
+expected. While this is the most desirable of the top 4 categories, high
+values may still indicate areas for improvement. If possible focus on
+techniques that reduce instruction count or result in more efficient
+instructions generation such as vectorization."""
+    level = 2
+    htoff = False
+    def compute(self, EV):
+        try:
+            self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV )
+            self.thresh = (self.val > 0.6) and self.parent.thresh
+        except ZeroDivisionError:
+            #print "Base zero division"
+            self.val = 0
+            self.thresh = False
+        return self.val
+
+class Microcode_Sequencer:
+    name = "Microcode_Sequencer"
+    domain = "Slots"
+    area = "RET"
+    desc = """
+This metric represents slots fraction CPU was retiring uops fetched by the
+Microcode Sequencer (MS) ROM.
The MS is used for CISC instructions not fully +decoded by the default decoders (like repeat move strings), or by microcode +assists used to address some operation modes (like in Floating Point assists).""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) + self.thresh = (self.val > 0.05) + except ZeroDivisionError: + #print "Microcode_Sequencer zero division" + self.val = 0 + self.thresh = False + return self.val + +class Metric_IPC: + name = "IPC" + desc = """ +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = IPC(EV, 0) + except ZeroDivisionError: + print "IPC zero division" + self.val = 0 + +class Metric_CPI: + name = "CPI" + desc = """ +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = CPI(EV, 0) + except ZeroDivisionError: + print "CPI zero division" + self.val = 0 + +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + +class Metric_UPI: + name = "UPI" + desc = """ +Uops Per Instruction""" + domain = "Metric" + maxval = 2 + + def compute(self, EV): + try: + self.val = UPI(EV, 0) + except ZeroDivisionError: + print "UPI zero division" + self.val = 0 + +class Metric_IPTB: + name = "IPTB" + desc = """ +Instruction per taken branch""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = IPTB(EV, 0) + except ZeroDivisionError: + print "IPTB zero division" + self.val = 0 + +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. 
Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = BPTB(EV, 0) + except ZeroDivisionError: + print "BPTB zero division" + self.val = 0 + +class Metric_DSB_Coverage: + name = "DSB_Coverage" + desc = """ +Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 + + def compute(self, EV): + try: + self.val = DSB_Coverage(EV, 0) + except ZeroDivisionError: + print "DSB_Coverage zero division" + self.val = 0 + +class Metric_MLP: + name = "MLP" + desc = """ +Memory-Level-Parallelism (average number of L1 miss demand load when there is +at least 1 such miss)""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = MLP(EV, 0) + except ZeroDivisionError: + print "MLP zero division" + self.val = 0 + +class Metric_GFLOPs: + name = "GFLOPs" + desc = """ +Giga Floating Point Operations Per Second""" + domain = "Metric" + maxval = 100 + + def compute(self, EV): + try: + self.val = GFLOPs(EV, 0) + except ZeroDivisionError: + print "GFLOPs zero division" + self.val = 0 + +class Metric_Turbo_Utilization: + name = "Turbo_Utilization" + desc = """ +Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = Turbo_Utilization(EV, 0) + except ZeroDivisionError: + print "Turbo_Utilization zero division" + self.val = 0 + +class Metric_Page_Walks_Use: + name = "Page_Walks_Use" + desc = """ +Fraction of cycles where the core's Page Walker is busy serving +iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = Page_Walks_Use(EV, 0) + except ZeroDivisionError: + print "Page_Walks_Use zero division" + self.val = 0 + +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + +class Metric_CLKS: + name = "CLKS" + desc = """ +Per-thread actual clocks""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = CLKS(EV, 0) + except ZeroDivisionError: + print "CLKS zero division" + self.val = 0 + +class Metric_CORE_CLKS: + name = "CORE_CLKS" + desc = """ +Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = Time(EV, 0) + except ZeroDivisionError: + print "Time zero division" + self.val = 0 + +# Schedule + + +class Setup: + def __init__(self, r): + o = dict() + n = Frontend_Bound() ; r.run(n) ; o["Frontend_Bound"] = n + n = Frontend_Latency() ; r.run(n) ; o["Frontend_Latency"] = n + n = ITLB_Misses() ; r.run(n) ; o["ITLB_Misses"] = n + n = DSB_Switches() ; r.run(n) ; o["DSB_Switches"] = n + n = LCP() ; r.run(n) ; o["LCP"] = n + n = MS_Switches() ; r.run(n) ; o["MS_Switches"] = n + n = Frontend_Bandwidth() ; r.run(n) ; o["Frontend_Bandwidth"] = n + n = MITE() ; r.run(n) ; o["MITE"] = n + n = DSB() ; r.run(n) ; o["DSB"] = n + n = LSD() ; r.run(n) ; o["LSD"] = n + n = Bad_Speculation() ; r.run(n) ; o["Bad_Speculation"] = n + n = Backend_Bound() ; r.run(n) ; o["Backend_Bound"] = n + n = Retiring() ; r.run(n) ; 
o["Retiring"] = n + n = Base() ; r.run(n) ; o["Base"] = n + n = Microcode_Sequencer() ; r.run(n) ; o["Microcode_Sequencer"] = n + + # parents + + o["Frontend_Latency"].parent = o["Frontend_Bound"] + o["ITLB_Misses"].parent = o["Frontend_Latency"] + o["DSB_Switches"].parent = o["Frontend_Latency"] + o["LCP"].parent = o["Frontend_Latency"] + o["MS_Switches"].parent = o["Frontend_Latency"] + o["Frontend_Bandwidth"].parent = o["Frontend_Bound"] + o["MITE"].parent = o["Frontend_Bandwidth"] + o["DSB"].parent = o["Frontend_Bandwidth"] + o["LSD"].parent = o["Frontend_Bandwidth"] + o["Base"].parent = o["Retiring"] + o["Microcode_Sequencer"].parent = o["Retiring"] + + # references between groups + + o["Frontend_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["Frontend_Bandwidth"].Frontend_Latency = o["Frontend_Latency"] + o["Backend_Bound"].Frontend_Bound = o["Frontend_Bound"] + o["Backend_Bound"].Bad_Speculation = o["Bad_Speculation"] + o["Backend_Bound"].Retiring = o["Retiring"] + o["Retiring"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Base"].Retiring = o["Retiring"] + o["Base"].Microcode_Sequencer = o["Microcode_Sequencer"] + + # siblings cross-tree + + o["Frontend_Bound"].sibling = None + o["Frontend_Latency"].sibling = None + o["ITLB_Misses"].sibling = None + o["DSB_Switches"].sibling = None + o["LCP"].sibling = None + o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["Frontend_Bandwidth"].sibling = None + o["MITE"].sibling = None + o["DSB"].sibling = None + o["LSD"].sibling = None + o["Bad_Speculation"].sibling = None + o["Backend_Bound"].sibling = None + o["Retiring"].sibling = None + o["Base"].sibling = None + o["Microcode_Sequencer"].sibling = o["MS_Switches"] + + # sampling events + + o["Frontend_Bound"].sample = [] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["DSB_Switches"].sample = [] + o["LCP"].sample = [] + o["MS_Switches"].sample = [] + o["Frontend_Bandwidth"].sample = [] + o["MITE"].sample = [] + o["DSB"].sample = [] + o["LSD"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] + o["Backend_Bound"].sample = [] + o["Retiring"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] + + # user visible metrics + + n = Metric_IPC() ; r.metric(n) + n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) + n = Metric_UPI() ; r.metric(n) + n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) + n = Metric_DSB_Coverage() ; r.metric(n) + n = Metric_MLP() ; r.metric(n) + n = Metric_GFLOPs() ; r.metric(n) + n = Metric_Turbo_Utilization() ; r.metric(n) + n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) + n = Metric_CLKS() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/snb_client_ratios.py b/snb_client_ratios.py index 875e347f..2c6747bb 100644 --- a/snb_client_ratios.py +++ b/snb_client_ratios.py @@ -1,57 +1,151 @@ # -# auto generated TopDown description for Intel 2nd gen Core (code named SandyBridge) +# auto generated TopDown 2.9 description for Intel 2nd gen Core (code named SandyBridge) # Please see http://ark.intel.com for more details on these CPUs. 
# +# References: +# http://halobates.de/blog/p/262 +# https://sites.google.com/site/analysismethods/yasin-pubs +# +smt_enabled = False # Constants -PipelineWidth = 4 -MEM_L3_WEIGHT = 7 -MEM_STLB_HIT_COST = 7 -MEM_SFB_COST = 13 -MEM_4KALIAS_COST = 7 -MEM_XSNP_HITM_COST = 60 -MEM_XSNP_HIT_COST = 43 -MEM_XSNP_NONE_COST = 29 -MS_SWITCHES_COST = 3 +Pipeline_Width = 4 +L2_Store_Latency = 9 +Mem_L3_Weight = 7 +Mem_STLB_Hit_Cost = 7 +Mem_SFB_Cost = 13 +Mem_4K_Alias_Cost = 7 +Mem_XSNP_HitM_Cost = 60 +MEM_XSNP_Hit_Cost = 43 +MEM_XSNP_None_Cost = 29 +Mem_Local_DRAM_Cost = 200 +Mem_Remote_DRAM_Cost = 310 +Mem_Remote_HitM_Cost = 200 +Mem_Remote_Fwd_Cost = 180 +MS_Switches_Cost = 3 +OneMillion = 1000000 +Energy_Unit = 15.6 # Aux. formulas -def CLKS(EV, level): - return EV("CPU_CLK_UNHALTED.THREAD", level) + # Floating Point Operations Count -def FLOP_count(EV, level): - return ( 1 *(EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level))+ 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level))+ 8 * EV("SIMD_FP_256.PACKED_SINGLE", level) ) -def RetireUopFraction(EV, level): +def FLOP_Count(EV, level): + return (1 *(EV("FP_COMP_OPS_EXE.X87", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level)) + 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level)) + 8 * EV("SIMD_FP_256.PACKED_SINGLE", level)) + +def Recovery_Cycles(EV, level): + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + +def L1D_Miss_Cycles(EV, level): + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + +def ITLB_Miss_Cycles(EV, level): + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + +def ORO_Demand_DRD_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + +def ORO_Demand_DRD_C6(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def ORO_Demand_RFO_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Cycles_False_Sharing_Client(EV, level): + return Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", level) + EV("OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE", level)) + +def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + +def Retire_Uop_Fraction(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + def SLOTS(EV, level): - return PipelineWidth * CLKS(EV, level) -# Instructions 
Per Cycle + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) def IPC(EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) +def CPI(EV, level): + return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + # Uops Per Instruction def UPI(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + # Instruction per taken branch -def InstPerTakenBranch(EV, level): +def IPTB(EV, level): return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. +def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + # Fraction of Uops delivered by the DSB (decoded instructions cache) -def DSBCoverage(EV, level): - return ( EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level))/(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level) ) -# Memory-Level-Parallelism (avg L1 miss demand load when there is at least 1 such miss) +def DSB_Coverage(EV, level): + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + +# Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) def MLP(EV, level): - return EV("L1D_PEND_MISS.PENDING", level) / EV("L1D_PEND_MISS.PENDING_CYCLES", level) + return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + +# Giga Floating Point Operations Per Second +def GFLOPs(EV, level): + return FLOP_Count(EV, level) / OneMillion / DurationTimeInSeconds(EV, level) / 1000 + # Average Frequency Utilization relative nominal frequency -def TurboUtilization(EV, level): +def Turbo_Utilization(EV, level): return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) +# Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store +def Page_Walks_Use(EV, level): + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + +# Per-thread actual clocks +def CLKS(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD", level) + +# Core actual clocks +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, level): + return DurationTimeInSeconds(EV, level) + # Event groups -class FrontendBound: - name = "FrontendBound" +class Frontend_Bound: + name = "Frontend_Bound" domain = "Slots" area = "FE" desc = """ @@ -64,68 +158,72 @@ class FrontendBound: latter can accept them. 
For example, stalls due to instruction-cache misses would be categorized under Frontend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1)/ SLOTS(EV, 1 ) + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "FrontendBound zero division" + #print "Frontend_Bound zero division" self.val = 0 self.thresh = False return self.val -class FrontendLatency: - name = "Frontend Latency" +class Frontend_Latency: + name = "Frontend_Latency" domain = "Slots" area = "FE" desc = """ This metric represents slots fraction CPU was stalled due to Frontend latency issues. For example, instruction-cache misses, iTLB misses or fetch stalls -after a branch missprediction are categorized under Frontend Latency. In such +after a branch misprediction are categorized under Frontend Latency. In such cases the Frontend eventually delivers no uops for some period.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = PipelineWidth * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2)/ SLOTS(EV, 2 ) + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: - #print "FrontendLatency zero division" + #print "Frontend_Latency zero division" self.val = 0 self.thresh = False return self.val -class ITLBmisses: - name = "ITLB misses" +class ITLB_Misses: + name = "ITLB_Misses" domain = "Clocks" area = "FE" desc = """ This metric represents cycles fraction CPU was stalled due to instruction TLB misses. Using large code pages may be considered here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ITLB_MISSES.WALK_DURATION", 3)/ CLKS(EV, 3 ) + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ITLBmisses zero division" + #print "ITLB_Misses zero division" self.val = 0 self.thresh = False return self.val -class DSBswitches: - name = "DSB switches" +class DSB_Switches: + name = "DSB_Switches" domain = "Clocks" area = "FE" desc = """ This metric represents cycles fraction CPU was stalled due to switches from DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3)/ CLKS(EV, 3 ) + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DSBswitches zero division" + #print "DSB_Switches zero division" self.val = 0 self.thresh = False return self.val @@ -139,9 +237,10 @@ class LCP: Prefixes (LCPs). 
Using proper compiler flags or Intel Compiler by default will certainly avoid this.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ILD_STALL.LCP", 3)/ CLKS(EV, 3 ) + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "LCP zero division" @@ -149,8 +248,8 @@ def compute(self, EV): self.thresh = False return self.val -class MSswitches: - name = "MS switches" +class MS_Switches: + name = "MS_Switches" domain = "Clocks" area = "FE" desc = """ @@ -160,18 +259,19 @@ class MSswitches: deliver long uop flows required by CISC instructions like CPUID, or uncommon conditions like Floating Point Assists when dealing with Denormals.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = MS_SWITCHES_COST * EV("IDQ.MS_SWITCHES", 3)/ CLKS(EV, 3 ) + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MSswitches zero division" + #print "MS_Switches zero division" self.val = 0 self.thresh = False return self.val -class FrontendBandwidth: - name = "Frontend Bandwidth" +class Frontend_Bandwidth: + name = "Frontend_Bandwidth" domain = "Slots" area = "FE" desc = """ @@ -181,28 +281,30 @@ class FrontendBandwidth: under Frontend Bandwidth. In such cases, the Frontend typically delivers non- optimal amount of uops to the Backend.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.FrontendBound.compute(EV)- self.FrontendLatency.compute(EV ) + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: - #print "FrontendBandwidth zero division" + #print "Frontend_Bandwidth zero division" self.val = 0 self.thresh = False return self.val class MITE: name = "MITE" - domain = "Clocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -the MITE fetch pipeline. For example, inefficiencies in the instruction -decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CLKS(EV, 3 ) + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MITE zero division" @@ -212,17 +314,18 @@ def compute(self, EV): class DSB: name = "DSB" - domain = "Clocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -DSB (decoded uop cache) fetch pipeline. For example, inefficient utlilization -of the DSB cache structure or bank conflict when reading from it, are -categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. 
For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CLKS(EV, 3 ) + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: #print "DSB zero division" @@ -232,18 +335,19 @@ def compute(self, EV): class LSD: name = "LSD" - domain = "Clocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop -supply. However, in some rare cases, optimal uop-delivery could not be reached -for small loops whose size (in terms of number of uops) does not suit well the -LSD structure.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("LSD.CYCLES_ACTIVE", 3)- EV("LSD.CYCLES_4_UOPS", 3)) / CLKS(EV, 3 ) + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "LSD zero division" @@ -251,8 +355,8 @@ def compute(self, EV): self.thresh = False return self.val -class BadSpeculation: - name = "BadSpeculation" +class Bad_Speculation: + name = "Bad_Speculation" domain = "Slots" area = "BAD" desc = """ @@ -262,12 +366,13 @@ class BadSpeculation: speculation. 
For example, wasted work due to miss-predicted branches are categorized under Bad Speculation category""" level = 1 + htoff = False def compute(self, EV): try: - self.val = ( EV("UOPS_ISSUED.ANY", 1)- EV("UOPS_RETIRED.RETIRE_SLOTS", 1)+ PipelineWidth * EV("INT_MISC.RECOVERY_CYCLES", 1)) / SLOTS(EV, 1 ) + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: - #print "BadSpeculation zero division" + #print "Bad_Speculation zero division" self.val = 0 self.thresh = False return self.val @@ -285,9 +390,10 @@ class Backend_Bound: stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = 1 -(self.FrontendBound.compute(EV)+ self.BadSpeculation.compute(EV)+ self.Retiring.compute(EV)) + self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Backend_Bound zero division" @@ -312,40 +418,43 @@ class Retiring: computations be done without significantly increasing number of instructions thus improving the performance.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1)/ SLOTS(EV, 1 ) - self.thresh = (self.val > 0.7) | self.MicroSequencer.thresh + self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: #print "Retiring zero division" self.val = 0 self.thresh = False return self.val -class BASE: - name = "BASE" +class Base: + name = "Base" domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops not originated -from the microcode-sequencer. This correlates with total number of +This metric represents slots fraction where the CPU was retiring uops not +originated from the microcode-sequencer. This correlates with total number of instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. A high Retiring value for non-vectorized code is typically a good -hint for programmer to pursue vectorizing his code, which can reduce -instructions hence this bucket.""" +expected. While this is the most desirable of the top 4 categories, high +values may still indicate areas for improvement. 
If possible focus on +techniques that reduce instruction count or result in more efficient +instructions generation such as vectorization.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Retiring.compute(EV)- self.MicroSequencer.compute(EV ) + self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: - #print "BASE zero division" + #print "Base zero division" self.val = 0 self.thresh = False return self.val -class MicroSequencer: - name = "MicroSequencer" +class Microcode_Sequencer: + name = "Microcode_Sequencer" domain = "Slots" area = "RET" desc = """ @@ -354,12 +463,13 @@ class MicroSequencer: decoded by the default decoders (like repeat move strings), or by microcode assists used to address some operation modes (like in Floating Point assists).""" level = 2 + htoff = False def compute(self, EV): try: - self.val = RetireUopFraction(EV, 2)* EV("IDQ.MS_UOPS", 2)/ SLOTS(EV, 2 ) + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: - #print "MicroSequencer zero division" + #print "Microcode_Sequencer zero division" self.val = 0 self.thresh = False return self.val @@ -367,7 +477,9 @@ def compute(self, EV): class Metric_IPC: name = "IPC" desc = """ -Instructions Per Cycle""" +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 def compute(self, EV): try: @@ -376,10 +488,40 @@ def compute(self, EV): print "IPC zero division" self.val = 0 +class Metric_CPI: + name = "CPI" + desc = """ +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = CPI(EV, 0) + except ZeroDivisionError: + print "CPI zero division" + self.val = 0 + +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + class Metric_UPI: name = "UPI" desc = """ Uops Per Instruction""" + domain = "Metric" + maxval = 2 def compute(self, EV): try: @@ -388,35 +530,56 @@ def compute(self, EV): print "UPI zero division" self.val = 0 -class Metric_InstPerTakenBranch: - name = "InstPerTakenBranch" +class Metric_IPTB: + name = "IPTB" desc = """ Instruction per taken branch""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: - self.val = InstPerTakenBranch(EV, 0) + self.val = IPTB(EV, 0) except ZeroDivisionError: - print "InstPerTakenBranch zero division" + print "IPTB zero division" self.val = 0 -class Metric_DSBCoverage: - name = "DSBCoverage" +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. 
Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = BPTB(EV, 0) + except ZeroDivisionError: + print "BPTB zero division" + self.val = 0 + +class Metric_DSB_Coverage: + name = "DSB_Coverage" desc = """ Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 def compute(self, EV): try: - self.val = DSBCoverage(EV, 0) + self.val = DSB_Coverage(EV, 0) except ZeroDivisionError: - print "DSBCoverage zero division" + print "DSB_Coverage zero division" self.val = 0 class Metric_MLP: name = "MLP" desc = """ -Memory-Level-Parallelism (avg L1 miss demand load when there is at least 1 -such miss)""" +Memory-Level-Parallelism (average number of L1 miss demand load when there is +at least 1 such miss)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -425,16 +588,103 @@ def compute(self, EV): print "MLP zero division" self.val = 0 -class Metric_TurboUtilization: - name = "TurboUtilization" +class Metric_GFLOPs: + name = "GFLOPs" + desc = """ +Giga Floating Point Operations Per Second""" + domain = "Metric" + maxval = 100 + + def compute(self, EV): + try: + self.val = GFLOPs(EV, 0) + except ZeroDivisionError: + print "GFLOPs zero division" + self.val = 0 + +class Metric_Turbo_Utilization: + name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = Turbo_Utilization(EV, 0) + except ZeroDivisionError: + print "Turbo_Utilization zero division" + self.val = 0 + +class Metric_Page_Walks_Use: + name = "Page_Walks_Use" + desc = """ +Fraction of cycles where the core's Page Walker is busy serving +iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = Page_Walks_Use(EV, 0) + except ZeroDivisionError: + print "Page_Walks_Use zero division" + self.val = 0 + +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + +class Metric_CLKS: + name = "CLKS" + desc = """ +Per-thread actual clocks""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = CLKS(EV, 0) + except ZeroDivisionError: + print "CLKS zero division" + self.val = 0 + +class Metric_CORE_CLKS: + name = "CORE_CLKS" + desc = """ +Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 def compute(self, EV): try: - self.val = TurboUtilization(EV, 0) + self.val = Time(EV, 0) except ZeroDivisionError: - print "TurboUtilization zero division" + print "Time zero division" self.val = 0 # Schedule @@ -443,89 +693,97 @@ def compute(self, EV): class Setup: def __init__(self, r): o = dict() - n = FrontendBound() ; r.run(n) ; o["FrontendBound"] = n - n = FrontendLatency() ; r.run(n) ; o["FrontendLatency"] = n - n = ITLBmisses() ; r.run(n) ; o["ITLBmisses"] = n - n = DSBswitches() ; r.run(n) ; o["DSBswitches"] = n + n = Frontend_Bound() ; r.run(n) ; o["Frontend_Bound"] = n + n = Frontend_Latency() ; r.run(n) ; o["Frontend_Latency"] = n + n = ITLB_Misses() ; r.run(n) 
; o["ITLB_Misses"] = n + n = DSB_Switches() ; r.run(n) ; o["DSB_Switches"] = n n = LCP() ; r.run(n) ; o["LCP"] = n - n = MSswitches() ; r.run(n) ; o["MSswitches"] = n - n = FrontendBandwidth() ; r.run(n) ; o["FrontendBandwidth"] = n + n = MS_Switches() ; r.run(n) ; o["MS_Switches"] = n + n = Frontend_Bandwidth() ; r.run(n) ; o["Frontend_Bandwidth"] = n n = MITE() ; r.run(n) ; o["MITE"] = n n = DSB() ; r.run(n) ; o["DSB"] = n n = LSD() ; r.run(n) ; o["LSD"] = n - n = BadSpeculation() ; r.run(n) ; o["BadSpeculation"] = n + n = Bad_Speculation() ; r.run(n) ; o["Bad_Speculation"] = n n = Backend_Bound() ; r.run(n) ; o["Backend_Bound"] = n n = Retiring() ; r.run(n) ; o["Retiring"] = n - n = BASE() ; r.run(n) ; o["BASE"] = n - n = MicroSequencer() ; r.run(n) ; o["MicroSequencer"] = n + n = Base() ; r.run(n) ; o["Base"] = n + n = Microcode_Sequencer() ; r.run(n) ; o["Microcode_Sequencer"] = n # parents - o["FrontendLatency"].parent = o["FrontendBound"] - o["ITLBmisses"].parent = o["FrontendLatency"] - o["DSBswitches"].parent = o["FrontendLatency"] - o["LCP"].parent = o["FrontendLatency"] - o["MSswitches"].parent = o["FrontendLatency"] - o["FrontendBandwidth"].parent = o["FrontendBound"] - o["MITE"].parent = o["FrontendBandwidth"] - o["DSB"].parent = o["FrontendBandwidth"] - o["LSD"].parent = o["FrontendBandwidth"] - o["BASE"].parent = o["Retiring"] - o["MicroSequencer"].parent = o["Retiring"] + o["Frontend_Latency"].parent = o["Frontend_Bound"] + o["ITLB_Misses"].parent = o["Frontend_Latency"] + o["DSB_Switches"].parent = o["Frontend_Latency"] + o["LCP"].parent = o["Frontend_Latency"] + o["MS_Switches"].parent = o["Frontend_Latency"] + o["Frontend_Bandwidth"].parent = o["Frontend_Bound"] + o["MITE"].parent = o["Frontend_Bandwidth"] + o["DSB"].parent = o["Frontend_Bandwidth"] + o["LSD"].parent = o["Frontend_Bandwidth"] + o["Base"].parent = o["Retiring"] + o["Microcode_Sequencer"].parent = o["Retiring"] # references between groups - o["FrontendBandwidth"].FrontendBound = o["FrontendBound"] - o["FrontendBandwidth"].FrontendLatency = o["FrontendLatency"] - o["Backend_Bound"].FrontendBound = o["FrontendBound"] - o["Backend_Bound"].BadSpeculation = o["BadSpeculation"] + o["Frontend_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["Frontend_Bandwidth"].Frontend_Latency = o["Frontend_Latency"] + o["Backend_Bound"].Frontend_Bound = o["Frontend_Bound"] + o["Backend_Bound"].Bad_Speculation = o["Bad_Speculation"] o["Backend_Bound"].Retiring = o["Retiring"] - o["Retiring"].MicroSequencer = o["MicroSequencer"] - o["BASE"].Retiring = o["Retiring"] - o["BASE"].MicroSequencer = o["MicroSequencer"] + o["Retiring"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Base"].Retiring = o["Retiring"] + o["Base"].Microcode_Sequencer = o["Microcode_Sequencer"] # siblings cross-tree - o["FrontendBound"].sibling = None - o["FrontendLatency"].sibling = None - o["ITLBmisses"].sibling = None - o["DSBswitches"].sibling = None + o["Frontend_Bound"].sibling = None + o["Frontend_Latency"].sibling = None + o["ITLB_Misses"].sibling = None + o["DSB_Switches"].sibling = None o["LCP"].sibling = None - o["MSswitches"].sibling = o["MicroSequencer"] - o["FrontendBandwidth"].sibling = None + o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["Frontend_Bandwidth"].sibling = None o["MITE"].sibling = None o["DSB"].sibling = None o["LSD"].sibling = None - #o["BadSpeculation"].sibling = o["BranchResteers"] - o["BadSpeculation"].sibling = None + o["Bad_Speculation"].sibling = None o["Backend_Bound"].sibling = None 
o["Retiring"].sibling = None - o["BASE"].sibling = None - o["MicroSequencer"].sibling = o["MSswitches"] + o["Base"].sibling = None + o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events (experimential) + # sampling events - o["FrontendBound"].sample = [] - o["FrontendLatency"].sample = [] - o["ITLBmisses"].sample = [] - o["DSBswitches"].sample = [] + o["Frontend_Bound"].sample = [] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["DSB_Switches"].sample = [] o["LCP"].sample = [] - o["MSswitches"].sample = [] - o["FrontendBandwidth"].sample = [] + o["MS_Switches"].sample = [] + o["Frontend_Bandwidth"].sample = [] o["MITE"].sample = [] o["DSB"].sample = [] o["LSD"].sample = [] - o["BadSpeculation"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] o["Backend_Bound"].sample = [] o["Retiring"].sample = [] - o["BASE"].sample = [] - o["MicroSequencer"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] # user visible metrics n = Metric_IPC() ; r.metric(n) + n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) n = Metric_UPI() ; r.metric(n) - n = Metric_InstPerTakenBranch() ; r.metric(n) - n = Metric_DSBCoverage() ; r.metric(n) + n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) + n = Metric_DSB_Coverage() ; r.metric(n) n = Metric_MLP() ; r.metric(n) - n = Metric_TurboUtilization() ; r.metric(n) + n = Metric_GFLOPs() ; r.metric(n) + n = Metric_Turbo_Utilization() ; r.metric(n) + n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) + n = Metric_CLKS() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/tl-tester b/tl-tester index b75c520e..62bb7b51 100755 --- a/tl-tester +++ b/tl-tester @@ -33,10 +33,15 @@ EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py -d -l4 $LOAD EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py -v -d -l4 $LOAD EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py -x, -v -d -l4 $LOAD EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py --metrics -x, -v -d -l4 $LOAD -EVENTMAP=${cpus[ivt]} FORCECPU=ivt $WRAP ./toplev.py -d --all --sample --kernel $LOAD -EVENTMAP=${cpus[snb]} FORCECPU=snb $WRAP ./toplev.py -d --all --sample $LOAD -EVENTMAP=${cpus[jkt]} FORCECPU=jkt $WRAP ./toplev.py -d --all --sample $LOAD -EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --all --sample $LOAD +EVENTMAP=${cpus[ivt]} FORCECPU=ivt $WRAP ./toplev.py -d --all --kernel $LOAD +EVENTMAP=${cpus[snb]} FORCECPU=snb $WRAP ./toplev.py -d --all $LOAD +EVENTMAP=${cpus[jkt]} FORCECPU=jkt $WRAP ./toplev.py -d --all -l5 $LOAD +EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --all $LOAD +EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --all -l5 $LOAD +# fixme: need event list with missing events +#EVENTMAP=${cpus[hsx]} FORCECPU=hsx $WRAP ./toplev.py -d --all -l5 $LOAD +#EVENTMAP=${cpus[hsx]} FORCECPU=hsx $WRAP ./toplev.py -d --all $LOAD +EVENTMAP=${cpus[ivt]} FORCECPU=ivt $WRAP ./toplev.py -d --all $LOAD EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --metrics -l4 $LOAD EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --metrics --no-multiplex -l4 $LOAD EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --power -l4 $LOAD @@ -55,9 +60,9 @@ EVENTMAP=${cpus[slm]} FORCECPU=slm $WRAP ./toplev.py -d --all $LOAD # need new perf # test other perf output formats -EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP 
./toplev.py -d -l4 -I 1000 -a --per-core sleep 1 -EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d -l4 -I 1000 -a --per-socket sleep 1 -EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d -l4 -I 1000 -a -A sleep 1 +EVENTMAP=${cpus[snb]} FORCECPU=snb $WRAP ./toplev.py -d -l4 -I 1000 -a --per-core sleep 1 +EVENTMAP=${cpus[snb]} FORCECPU=snb $WRAP ./toplev.py -d -l4 -I 1000 -a --per-socket sleep 1 +EVENTMAP=${cpus[snb]} FORCECPU=snb $WRAP ./toplev.py -d -l4 -I 1000 -a -A sleep 1 trap "" ERR 0 diff --git a/toplev.py b/toplev.py index 182dd55c..ac5dac60 100755 --- a/toplev.py +++ b/toplev.py @@ -18,7 +18,7 @@ # Handles a variety of perf versions, but older ones have various limitations. import sys, os, re, itertools, textwrap, platform, pty, subprocess -import exceptions, argparse, time +import exceptions, argparse, time, types from collections import defaultdict, Counter #sys.path.append("../pmu-tools") import ocperf @@ -31,9 +31,10 @@ ("hsw", (60, 70, 69 )), ("hsx", (63, )), ("slm", (55, 77)), + ("bdw", (61, )), ) -tsx_cpus = ("hsw", "hsx") +tsx_cpus = ("hsw", "hsx", "bdw") ingroup_events = frozenset(["cycles", "instructions", "ref-cycles", "cpu/event=0x3c,umask=0x00,any=1/", @@ -164,8 +165,6 @@ def event_group(evlist): type=int, default=1) p.add_argument('--detailed', '-d', help=argparse.SUPPRESS, action='store_true') p.add_argument('--metrics', '-m', help="Print extra metrics", action='store_true') -p.add_argument('--sample', '-S', help="Suggest commands to sample for bottlenecks (experimental)", - action='store_true') p.add_argument('--raw', help="Print raw values", action='store_true') p.add_argument('--sw', help="Measure perf Linux metrics", action='store_true') p.add_argument('--cpu', '-C', help=argparse.SUPPRESS) @@ -321,9 +320,11 @@ def __init__(self): self.has_tsx = False self.freq = 0.0 self.siblings = {} + self.threads = 0 forced_cpu = self.force_cpu() self.force_counters() cores = Counter() + sockets = Counter() self.coreids = defaultdict(list) self.cputocore = {} with open("/proc/cpuinfo", "r") as f: @@ -349,14 +350,16 @@ def __init__(self): self.freq = float(m.group(1)) elif (n[0], n[1]) == ("physical", "id"): physid = int(n[3]) + sockets[physid] += 1 elif (n[0], n[1]) == ("core", "id"): coreid = int(n[3]) key = (physid, coreid,) cores[key] += 1 - if cores[key] > 1: + self.threads = max(self.threads, cores[key]) + if self.threads > 1: self.ht = True - self.coreids[coreid].append(cpunum) - self.cputocore[cpunum] = coreid + self.coreids[key].append(cpunum) + self.cputocore[cpunum] = key elif n[0] == "flags": ok += 1 self.has_tsx = "rtm" in n @@ -372,6 +375,7 @@ def __init__(self): self.counters = 4 else: self.counters = 8 + self.sockets = len(sockets.keys()) cpu = CPU() @@ -490,9 +494,27 @@ def set_interval(env, d): if args.raw: print "interval-ns val", env['interval-ns'] +def key_to_coreid(k): + x = cpu.cputocore[int(k)] + return x[0] * 1000 + x[1] + +def core_fmt(core): + if cpu.sockets > 1: + return "S%d-C%d" % (core / 1000, core % 1000,) + return "C%d" % (core % 1000,) + def print_keys(runner, res, rev, out, interval, env): - for j in sorted(res.keys()): - runner.print_res(res[j], rev[j], out, interval, j, env) + if need_any: + # collect counts from all threads of cores as lists + # this way the model can access all threads individually + keys = sorted(res.keys(), key = key_to_coreid) + for core, citer in itertools.groupby(keys, key_to_coreid): + cpus = list(citer) + r = list(itertools.izip(*[res[j] for j in cpus])) + runner.print_res(r, rev[cpus[0]], out, interval, 
core_fmt(core), env) + else: + for j in sorted(res.keys()): + runner.print_res(res[j], rev[j], out, interval, j, env) def execute_no_multiplex(runner, out, rest): if args.interval: # XXX @@ -503,6 +525,8 @@ def execute_no_multiplex(runner, out, rest): rev = defaultdict(list) env = dict() for g in groups: + if len(g) == 0: + continue print "RUN #%d of %d" % (n, len(groups)) ret, res, rev, interval = do_execute(runner, g, out, rest, res, rev, env) n += 1 @@ -511,7 +535,9 @@ def execute_no_multiplex(runner, out, rest): def execute(runner, out, rest): env = dict() - ret, res, rev, interval = do_execute(runner, ",".join(runner.evgroups), out, rest, + print "evgroups", runner.evgroups + ret, res, rev, interval = do_execute(runner, ",".join(filter(lambda x: len(x) > 0, runner.evgroups)), + out, rest, defaultdict(list), defaultdict(list), env) @@ -599,6 +625,8 @@ def do_execute(runner, evstr, out, rest, res, rev, env): return ret, res, rev, interval def ev_append(ev, level, obj): + if isinstance(ev, types.LambdaType): + return ev(lambda ev, level: ev_append(ev, level, obj), level) if ev in nonperf_events: return 99 if not (ev, level) in obj.evlevels: @@ -610,7 +638,7 @@ def ev_append(ev, level, obj): return 99 def canon_event(e): - m = re.match(r"(.*):(.*)", e) + m = re.match(r"(.*?):(.*)", e) if m: e = m.group(1) if e.upper() in fixed_counters: @@ -629,13 +657,33 @@ def event_rmap(e): n = fixes[n.upper()].lower() return n -def lookup_res(res, rev, ev, obj, env, level): +def lookup_res(res, rev, ev, obj, env, level, cpuoff = -1): if ev in env: return env[ev] + # + # when the model passed in a lambda run the function for each logical cpu + # (by resolving its EVs to only that CPU) + # and then sum up. This is needed for the workarounds to make various + # per thread counters at least as big as unhalted cycles. + # + # otherwise we always sum up. + # + if isinstance(ev, types.LambdaType): + n = 0 + for off in range(cpu.threads): # XXX + n += ev(lambda ev, level: lookup_res(res, rev, ev, obj, env, level, off), level) + return n + index = obj.res_map[(ev, level)] rev = event_rmap(rev[index]) assert (rev == canon_event(ev) or (ev in event_fixes and canon_event(event_fixes[ev]) == rev)) + + if isinstance(res[index], types.TupleType): + if cpuoff == -1: + return sum(res[index]) + else: + return res[index][cpuoff] return res[index] def add_key(k, x, y): @@ -741,7 +789,6 @@ def split_groups(self, objl, evlev): self.add(objl, raw_events(get_names(evl)), evl) def add(self, objl, evnum, evlev): - assert evlev # does not fit into a group. if len(set(evnum) - add_filter(ingroup_events)) > cpu.counters: self.split_groups(objl, evlev) @@ -777,7 +824,7 @@ def schedule(self): # try to fit each objects events into groups # that fit into the available CPU counters for obj in solist: - if obj.evnum[0] in outgroup_events: + if len(obj.evnum) == 0 or obj.evnum[0] in outgroup_events: self.add([obj], obj.evnum, obj.evlevels) continue # try adding another object to the current group @@ -827,20 +874,25 @@ def print_res(self, res, rev, out, timestamp, title, env): val = obj.val if not obj.thresh and not dont_hide: val = 0.0 + disclaimer = "" + if 'htoff' in obj.__dict__ and obj.htoff and obj.thresh and cpu.ht: + disclaimer = """ +Warning: Hyper Threading may lead to incorrect measurements for this node. 
+Suggest to re-measure with HT off.""" desc = obj.desc[1:].replace("\n", "\n\t") if obj.metric: out.metric(obj.area if 'area' in obj.__class__.__dict__ else None, obj.name, val, timestamp, - desc, + desc + disclaimer, title, obj.unit if 'unit' in obj.__class__.__dict__ else "metric") else: out.p(obj.area if 'area' in obj.__class__.__dict__ else None, full_name(obj), val, timestamp, "below" if not obj.thresh else "above", - desc, + desc + disclaimer, title, - sample_desc(obj.sample) if args.sample and obj.sample else "") + sample_desc(obj.sample) if obj.sample else "") def sysctl(name): try: @@ -879,12 +931,27 @@ def ht_warning(): ivb_server_ratios.smt_enabled = cpu.ht need_any = cpu.ht ivb_server_ratios.Setup(runner) -elif cpu.cpu == "snb" and detailed_model: +elif cpu.cpu == "snb": import snb_client_ratios snb_client_ratios.Setup(runner) -elif cpu.cpu == "hsw" and detailed_model: +elif cpu.cpu == "jkt": + import jkt_server_ratios + jkt_server_ratios.Setup(runner) +elif cpu.cpu == "hsw": import hsw_client_ratios + hsw_client_ratios.smt_enabled = cpu.ht + need_any = cpu.ht hsw_client_ratios.Setup(runner) +elif cpu.cpu == "hsx": + import hsx_server_ratios + hsx_server_ratios.smt_enabled = cpu.ht + need_any = cpu.ht + hsx_server_ratios.Setup(runner) +elif cpu.cpu == "bdw": + import bdw_client_ratios + bdw_client_ratios.smt_enabled = cpu.ht + need_any = cpu.ht + bdw_client_ratios.Setup(runner) elif cpu.cpu == "slm": import slm_ratios slm_ratios.Setup(runner) @@ -892,8 +959,6 @@ def ht_warning(): ht_warning() if detailed_model: print >>sys.stderr, "Sorry, no detailed model for your CPU. Only Level 1 supported." - if cpu.cpu == "jkt": - print >>sys.stderr, "Consider using FORCECPU=snb" import simple_ratios simple_ratios.Setup(runner) @@ -928,6 +993,8 @@ def setup_with_metrics(p, runner): print "Running in HyperThreading mode. Will measure complete system." if "--per-socket" in rest: sys.exit("Hyper Threading more not compatible with --per-socket") + if "--per-core" in rest: + sys.exit("Hyper Threading more not compatible with --per-core") if args.cpu: print >>sys.stderr, "Warning: --cpu/-C mode with HyperThread must specify all core thread pairs!" if not (os.geteuid() == 0 or sysctl("kernel.perf_event_paranoid") == -1):