From 2f4f5f3c22b01c28d74e11e7f2cad1019ba82c4f Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Wed, 21 Jan 2015 13:56:42 -0800
Subject: [PATCH] toplev: Add support for Top Down 2.9

Update toplev to use the Top Down methodology v2.9.
Thanks to Ahmad and Benny.

Many improvements:
- Many fixes to SMT support. SMT is now supported on Haswell.
- Many bug fixes to metrics
- Initial Haswell Server support
- Add a Sandy Bridge EP model
- Lots of new metrics and nodes: e.g. Core IPC, MUX confidence,
  BPTB (Branch per Taken Branch), SMT_2T utilization,
  IFetch_Line_utilization
- Initial Broadwell model support
- Improve sample event support. Sample events are now enabled by default.

--per-socket and --per-core are not supported anymore with SMT.
---
 bdw_client_ratios.py | 1642 ++++++++++++++++++++++++++++++++++++++++++
 cpumap.sh            |    2 +
 hsw_client_ratios.py | 1269 ++++++++++++++++++++++----------
 ivb_client_ratios.py |  788 +++++++++++++++-----
 ivb_server_ratios.py |  696 +++++++++++++-----
 jkt_server_ratios.py |  789 ++++++++++++++++++++
 snb_client_ratios.py |  562 +++++++++++----
 tl-tester            |   19 +-
 toplev.py            |  109 ++-
 9 files changed, 4923 insertions(+), 953 deletions(-)
 create mode 100644 bdw_client_ratios.py
 create mode 100644 jkt_server_ratios.py

diff --git a/bdw_client_ratios.py b/bdw_client_ratios.py
new file mode 100644
index 00000000..c547637f
--- /dev/null
+++ b/bdw_client_ratios.py
@@ -0,0 +1,1642 @@
+
+#
+# auto generated TopDown 2.9 description for Intel 5th gen Core / Core M (code named Broadwell)
+# Please see http://ark.intel.com for more details on these CPUs.
+#
+# References:
+# http://halobates.de/blog/p/262
+# https://sites.google.com/site/analysismethods/yasin-pubs
+#
+
+smt_enabled = False
+
+# Constants
+
+Pipeline_Width = 4
+L2_Store_Latency = 9
+Mem_L3_Weight = 7
+Mem_STLB_Hit_Cost = 7
+Mem_SFB_Cost = 13
+Mem_4K_Alias_Cost = 7
+Mem_XSNP_HitM_Cost = 60
+MEM_XSNP_Hit_Cost = 43
+MEM_XSNP_None_Cost = 29
+Mem_Local_DRAM_Cost = 200
+Mem_Remote_DRAM_Cost = 310
+Mem_Remote_HitM_Cost = 200
+Mem_Remote_Fwd_Cost = 180
+MS_Switches_Cost = 2
+OneMillion = 1000000
+Energy_Unit = 61
+
+# Aux. 
formulas + + +def Recovery_Cycles(EV, level): + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + +def Execute_Cycles(EV, level): + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + +def L1D_Miss_Cycles(EV, level): + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + +def ITLB_Miss_Cycles(EV, level): + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + +def Cycles_0_Ports_Utilized(EV, level): + EV("ARITH.FPU_DIV_ACTIVE", level) + EV("UOPS_EXECUTED.CORE:i1:c1", level) + EV("CYCLE_ACTIVITY.STALLS_TOTAL", level) + EV("RS_EVENTS.EMPTY_CYCLES", level) + return (EV("UOPS_EXECUTED.CORE:i1:c1", level)) / 2 if smt_enabled else(EV("CYCLE_ACTIVITY.STALLS_TOTAL", level) - EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ARITH.FPU_DIV_ACTIVE", level)) + +def Cycles_1_Port_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level)) + +def Cycles_2_Ports_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c3", level) + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level)) + +def Cycles_3m_Ports_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c3", level) + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c3", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + +def ORO_Demand_DRD_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + +def ORO_Demand_DRD_C6(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Few_Uops_Executed_Threshold(EV, level): + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level)> 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + +def Backend_Bound_At_EXE(EV, level): + return (EV("CYCLE_ACTIVITY.STALLS_TOTAL", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level)) / CLKS(EV, 
level) + +def Mem_L3_Hit_Fraction(EV, level): + return EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level)) + +def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + +def Mispred_Clears_Fraction(EV, level): + return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) + +def Retire_Uop_Fraction(EV, level): + return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + +def SLOTS(EV, level): + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) +def IPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) +def CPI(EV, level): + return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + +# Uops Per Instruction +def UPI(EV, level): + return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + +# Instruction per taken branch +def IPTB(EV, level): + return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. +def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Fraction of Uops delivered by the DSB (decoded instructions cache) +def DSB_Coverage(EV, level): + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + +# Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed) +def ILP(EV, level): + return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(EV, level) + +# Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) +def MLP(EV, level): + return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + +# Actual Average Latency for L1 data-cache miss demand loads +def Load_Miss_Real_Latency(EV, level): + return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level)) + +# Average Frequency Utilization relative nominal frequency +def Turbo_Utilization(EV, level): + return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) + +# Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store +def Page_Walks_Use(EV, level): + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + +# Per-thread actual clocks +def CLKS(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD", level) + +# Core actual clocks +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, 
level): + return DurationTimeInSeconds(EV, level) + +# Event groups + + +class Frontend_Bound: + name = "Frontend_Bound" + domain = "Slots" + area = "FE" + desc = """ +This category reflects slots where the Frontend of the processor undersupplies +its Backend. Frontend denotes the first portion of pipeline responsible to +fetch micro-ops which the Backend can execute. Within the Frontend, a branch +predictor predicts the next address to fetch, cache-lines are fetched from +memory, parsed into instructions, and lastly decoded into micro-ops. The +purpose of the Frontend cluster is to deliver uops to Backend whenever the +latter can accept them. For example, stalls due to instruction-cache misses +would be categorized under Frontend Bound.""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.2) + except ZeroDivisionError: + #print "Frontend_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Frontend_Latency: + name = "Frontend_Latency" + domain = "Slots" + area = "FE" + desc = """ +This metric represents slots fraction CPU was stalled due to Frontend latency +issues. For example, instruction-cache misses, iTLB misses or fetch stalls +after a branch misprediction are categorized under Frontend Latency. In such +cases the Frontend eventually delivers no uops for some period.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) + self.thresh = (self.val > 0.15) and self.parent.thresh + except ZeroDivisionError: + #print "Frontend_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + +class ITLB_Misses: + name = "ITLB_Misses" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to instruction TLB +misses. Using large code pages may be considered here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "ITLB_Misses zero division" + self.val = 0 + self.thresh = False + return self.val + +class DSB_Switches: + name = "DSB_Switches" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to switches from +DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "DSB_Switches zero division" + self.val = 0 + self.thresh = False + return self.val + +class LCP: + name = "LCP" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to Length Changing +Prefixes (LCPs). 
Using proper compiler flags or Intel Compiler by default will +certainly avoid this.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "LCP zero division" + self.val = 0 + self.thresh = False + return self.val + +class MS_Switches: + name = "MS_Switches" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to switches of uop +delivery to the Microcode Sequencer (MS). Commonly used instructions are +optimized for delivery by the DSB or MITE pipelines. The MS is designated to +deliver long uop flows required by CISC instructions like CPUID, or uncommon +conditions like Floating Point Assists when dealing with Denormals.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "MS_Switches zero division" + self.val = 0 + self.thresh = False + return self.val + +class Frontend_Bandwidth: + name = "Frontend_Bandwidth" + domain = "Slots" + area = "FE" + desc = """ +This metric represents slots fraction CPU was stalled due to Frontend +bandwidth issues. For example, inefficiencies at the instruction decoders, or +code restrictions for caching in the DSB (decoded uops cache) are categorized +under Frontend Bandwidth. In such cases, the Frontend typically delivers non- +optimal amount of uops to the Backend.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) + self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh + except ZeroDivisionError: + #print "Frontend_Bandwidth zero division" + self.val = 0 + self.thresh = False + return self.val + +class MITE: + name = "MITE" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "MITE zero division" + self.val = 0 + self.thresh = False + return self.val + +class DSB: + name = "DSB" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.3) and self.parent.thresh + except ZeroDivisionError: + #print "DSB zero division" + self.val = 0 + self.thresh = False + return self.val + +class LSD: + name = "LSD" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. 
However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "LSD zero division" + self.val = 0 + self.thresh = False + return self.val + +class Bad_Speculation: + name = "Bad_Speculation" + domain = "Slots" + area = "BAD" + desc = """ +This category reflects slots wasted due to incorrect speculations, which +include slots used to allocate uops that do not eventually get retired and +slots for which allocation was blocked due to recovery from earlier incorrect +speculation. For example, wasted work due to miss-predicted branches are +categorized under Bad Speculation category""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.1) + except ZeroDivisionError: + #print "Bad_Speculation zero division" + self.val = 0 + self.thresh = False + return self.val + +class Branch_Mispredicts: + name = "Branch_Mispredicts" + domain = "Slots" + area = "BAD" + desc = """ +This metric represents slots fraction CPU was impacted by Branch +Misprediction. These slots are either wasted by uops fetched from an +incorrectly speculated program path, or stalls the Backend of the machine +needs to recover its state from a speculative path.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "Branch_Mispredicts zero division" + self.val = 0 + self.thresh = False + return self.val + +class Machine_Clears: + name = "Machine_Clears" + domain = "Slots" + area = "BAD" + desc = """ +This metric represents slots fraction CPU was impacted by Machine Clears. +These slots are either wasted by uops fetched prior to the clear, or stalls +the Backend of the machine needs to recover its state after the clear. For +example, this can happen due to memory ordering Nukes (e.g. Memory +Disambiguation) or Self-Modifying-Code (SMC) nukes.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "Machine_Clears zero division" + self.val = 0 + self.thresh = False + return self.val + +class Backend_Bound: + name = "Backend_Bound" + domain = "Slots" + area = "BE" + desc = """ +This category reflects slots where no uops are being delivered due to a lack +of required resources for accepting more uops in the Backend of the pipeline. +Backend describes the portion of the pipeline where the out-of-order scheduler +dispatches ready uops into their respective execution units, and once +completed these uops get retired according to program order. 
For example, +stalls due to data-cache misses or stalls due to the divider unit being +overloaded are both categorized under Backend Bound.""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) + self.thresh = (self.val > 0.2) + except ZeroDivisionError: + #print "Backend_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Memory_Bound: + name = "Memory_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how much Memory subsystem was a bottleneck. Memory +Bound measures cycle fraction where pipeline is likely stalled due to demand +load or store instructions. This accounts mainly for non-completed in-flight +memory demand loads which coincides with execution starvation. in addition to +less common cases where stores could imply backpressure on the pipeline.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = (EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Memory_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class L1_Bound: + name = "L1_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was stalled without missing the L1 data +cache. The L1 cache typically has the shortest latency. However, in certain +cases like loads blocked on older stores, a load might suffer a high latency +even though it is being satisfied by the L1. There are no fill-buffers +allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event +as it accounts for any non-completed load.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_MISS", 3)) / CLKS(EV, 3 ) + self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh + except ZeroDivisionError: + #print "L1_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class DTLB_Load: + name = "DTLB_Load" + domain = "Clocks" + area = "BE/Mem" + desc = "" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "DTLB_Load zero division" + self.val = 0 + self.thresh = False + return self.val + +class Store_Fwd_Blk: + name = "Store_Fwd_Blk" + domain = "Clocks" + area = "BE/Mem" + desc = "" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "Store_Fwd_Blk zero division" + self.val = 0 + self.thresh = False + return self.val + +class Split_Loads: + name = "Split_Loads" + domain = "Clocks" + area = "BE/Mem" + desc = "" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Load_Miss_Real_Latency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "Split_Loads zero division" + self.val = 0 + self.thresh = False + return self.val + +class G4K_Aliasing: + name = "4K_Aliasing" + domain = "Clocks" + area = "BE/Mem" + desc = "" 
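+    # Note: 4K aliasing is a false dependency where a load is delayed because an
+    # earlier store matches its address in bits 0-11 (the offset within a 4 KB
+    # page) but differs in the upper bits. Each LD_BLOCKS_PARTIAL.ADDRESS_ALIAS
+    # event is charged an estimated Mem_4K_Alias_Cost cycles in the formula below.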
+ level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "G4K_Aliasing zero division" + self.val = 0 + self.thresh = False + return self.val + +class L2_Bound: + name = "L2_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was stalled on L2 cache. Avoiding cache +misses (i.e. L1 misses/L2 hits) will improve the latency and increase +performance.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_MISS", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_MISS", 3)) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.03) and self.parent.thresh + except ZeroDivisionError: + #print "L2_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class L3_Bound: + name = "L3_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was stalled on L3 cache or contended with +a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve +the latency and increase performance.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_MISS", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "L3_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Contested_Accesses: + name = "Contested_Accesses" + domain = "Clocks" + area = "BE/Mem" + desc = "" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "Contested_Accesses zero division" + self.val = 0 + self.thresh = False + return self.val + +class Data_Sharing: + name = "Data_Sharing" + domain = "Clocks" + area = "BE/Mem" + desc = "" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "Data_Sharing zero division" + self.val = 0 + self.thresh = False + return self.val + +class L3_Latency: + name = "L3_Latency" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric is a rough aggregate estimate of cycles fraction where CPU +accessed L3 cache for all load requests, while there was no contention/sharing +with a sibling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will +improve the latency and increase performance.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", 4) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "L3_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + +class SQ_Full: + name = "SQ_Full" + domain = "CoreClocks" + area = "BE/Mem" + desc = """ +This metric measures fraction of cycles where the Super Queue (SQ) was full +taking into account all request-types and both hardware SMT threads. 
The Super +Queue is used for requests to access the L2 cache or to go out to the Uncore.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "SQ_Full zero division" + self.val = 0 + self.thresh = False + return self.val + +class MEM_Bound: + name = "MEM_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was stalled on main memory (DRAM). +Caching will improve the latency and increase performance.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_MISS", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "MEM_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class MEM_Bandwidth: + name = "MEM_Bandwidth" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was likely stalled due to approaching +bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be +considered in such case.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "MEM_Bandwidth zero division" + self.val = 0 + self.thresh = False + return self.val + +class MEM_Latency: + name = "MEM_Latency" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was likely stalled due to latency from +main memory (DRAM). Data layout re-structuring or using Software Prefetches +(also through the compiler) may be considered in such case.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "MEM_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + +class Stores_Bound: + name = "Stores_Bound" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents how often CPU was stalled due to store operations. +even though memory store accesses do not typically stall out-of-order CPUs; +there are few cases where stores can lead to actual stalls. This metric will +be flagged should any of these cases be a bottleneck.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = self.Memory_Bound.compute(EV) -(EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 3) / CLKS(EV, 3)) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Stores_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Split_Stores: + name = "Split_Stores" + domain = "CoreClocks" + area = "BE/Mem" + desc = """ +This metric represents rate of split store accesses. 
Consider aligning your +data to the 64-byte cache line granularity.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Split_Stores zero division" + self.val = 0 + self.thresh = False + return self.val + +class DTLB_Store: + name = "DTLB_Store" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents cycles fraction spent handling first-level data TLB +store misses. As with ordinary data caching, focus on improving data locality +and reducing working-set size to reduce DTLB overhead. Additionally, consider +using profile-guided optimization (PGO) to collocate frequently-used data on +the same page. Try using larger page sizes for large amounts of frequently- +used data.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "DTLB_Store zero division" + self.val = 0 + self.thresh = False + return self.val + +class Core_Bound: + name = "Core_Bound" + domain = "Clocks" + area = "BE/Core" + desc = """ +This metric represents how much Core non-memory issues were of a bottleneck. +Shortage in hardware compute resources, or dependencies software's +instructions are both categorized under Core Bound. Hence it may indicate the +machine ran out of an OOO resources, certain execution units are overloaded or +dependencies in program's data- or instruction-flow are limiting the +performance (e.g. FP-chained long-latency arithmetic operations). Tip: +consider Port Saturation analysis as next step.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "Core_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Divider: + name = "Divider" + domain = "CoreClocks" + area = "BE/Core" + desc = "" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "Divider zero division" + self.val = 0 + self.thresh = False + return self.val + +class Ports_Utilization: + name = "Ports_Utilization" + domain = "Clocks" + area = "BE/Core" + desc = """ +This metric represents cycles fraction application was stalled due to Core +computation issues (non divider-related). For example, heavy data-dependency +between nearby instructions will manifest in this category. Ditto if +instruction-mix used by the application overloads specific hardware execution +unit. 
Hint: Loop Vectorization -most compilers feature auto-Vectorization +options today- reduces pressure on the execution ports as multiple elements +are calculated with same uop.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "Ports_Utilization zero division" + self.val = 0 + self.thresh = False + return self.val + +class G0_Ports_Utilized: + name = "0_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed no uops on any +execution port.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G0_Ports_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G1_Port_Utilized: + name = "1_Port_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction where the CPU executed total of 1 +uop per cycle on all execution ports. This can be due to heavy data-dependency +among software instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and L1_Bound, this +metric can point to L1 data-cache latency bottleneck that may not necessarily +manifest with complete execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be helpful. Tip: consider +'Core Ports Saturation' analysis-type as next step.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G1_Port_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G2_Ports_Utilized: + name = "2_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed total of 2 uops per +cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- +type as next step. Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the execution ports as +multiple elements are calculated with same uop.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G2_Ports_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G3m_Ports_Utilized: + name = "3m_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed total of 3 or more +uops per cycle on all execution ports. 
Tip: consider 'Core Port Saturation' +analysis-type as next step""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G3m_Ports_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_0: + name = "Port_0" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 0 (SNB+: ALU; HSW+:ALU and 2nd branch)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_0 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_1: + name = "Port_1" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 1 (ALU)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_1 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_2: + name = "Port_2" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 2 (Loads and Store-address)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_2 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_3: + name = "Port_3" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 3 (Loads and Store-address)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_3 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_4: + name = "Port_4" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 4 (Store-data)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_4 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Port_5: + name = "Port_5" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU dispatched uops on execution +port 5 (SNB+: Branches and ALU; HSW+: ALU)""" + level = 5 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) + except ZeroDivisionError: + #print "Port_5 zero division" + self.val = 0 + self.thresh = False + return self.val + +class Retiring: + name = "Retiring" + domain = "Slots" + area = "RET" + desc = """ +This category reflects slots utilized by useful work i.e. allocated uops that +eventually get retired. Ideally, all pipeline slots would be attributed to the +Retiring category. 
Retiring of 100% would indicate the maximum 4 uops retired +per cycle has been achieved. Maximizing Retiring typically increases the +Instruction-Per-Cycle metric. Note that a high Retiring value does not +necessary mean there is no room for more performance. For example, Microcode +assists are categorized under Retiring. They hurt performance and can often be +avoided. A high Retiring value for non-vectorized code may be a good hint for +programmer to consider vectorizing his code. Doing so essentially lets more +computations be done without significantly increasing number of instructions +thus improving the performance.""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh + except ZeroDivisionError: + #print "Retiring zero division" + self.val = 0 + self.thresh = False + return self.val + +class Base: + name = "Base" + domain = "Slots" + area = "RET" + desc = """ +This metric represents slots fraction where the CPU was retiring uops not +originated from the microcode-sequencer. This correlates with total number of +instructions used by the program. A uops-per-instruction ratio of 1 should be +expected. While this is the most desirable of the top 4 categories, high +values may still indicate areas for improvement. If possible focus on +techniques that reduce instruction count or result in more efficient +instructions generation such as vectorization.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) + self.thresh = (self.val > 0.6) and self.parent.thresh + except ZeroDivisionError: + #print "Base zero division" + self.val = 0 + self.thresh = False + return self.val + +class Microcode_Sequencer: + name = "Microcode_Sequencer" + domain = "Slots" + area = "RET" + desc = """ +This metric represents slots fraction CPU was retiring uops fetched by the +Microcode Sequencer (MS) ROM. 
The MS is used for CISC instructions not fully +decoded by the default decoders (like repeat move strings), or by microcode +assists used to address some operation modes (like in Floating Point assists).""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) + self.thresh = (self.val > 0.05) + except ZeroDivisionError: + #print "Microcode_Sequencer zero division" + self.val = 0 + self.thresh = False + return self.val + +class Metric_IPC: + name = "IPC" + desc = """ +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = IPC(EV, 0) + except ZeroDivisionError: + print "IPC zero division" + self.val = 0 + +class Metric_CPI: + name = "CPI" + desc = """ +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = CPI(EV, 0) + except ZeroDivisionError: + print "CPI zero division" + self.val = 0 + +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + +class Metric_UPI: + name = "UPI" + desc = """ +Uops Per Instruction""" + domain = "Metric" + maxval = 2 + + def compute(self, EV): + try: + self.val = UPI(EV, 0) + except ZeroDivisionError: + print "UPI zero division" + self.val = 0 + +class Metric_IPTB: + name = "IPTB" + desc = """ +Instruction per taken branch""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = IPTB(EV, 0) + except ZeroDivisionError: + print "IPTB zero division" + self.val = 0 + +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. 
Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = BPTB(EV, 0) + except ZeroDivisionError: + print "BPTB zero division" + self.val = 0 + +class Metric_DSB_Coverage: + name = "DSB_Coverage" + desc = """ +Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 + + def compute(self, EV): + try: + self.val = DSB_Coverage(EV, 0) + except ZeroDivisionError: + print "DSB_Coverage zero division" + self.val = 0 + +class Metric_ILP: + name = "ILP" + desc = """ +Instruction-Level-Parallelism (average number of uops executed when there is +at least 1 uop executed)""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = ILP(EV, 0) + except ZeroDivisionError: + print "ILP zero division" + self.val = 0 + +class Metric_MLP: + name = "MLP" + desc = """ +Memory-Level-Parallelism (average number of L1 miss demand load when there is +at least 1 such miss)""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = MLP(EV, 0) + except ZeroDivisionError: + print "MLP zero division" + self.val = 0 + +class Metric_Load_Miss_Real_Latency: + name = "Load_Miss_Real_Latency" + desc = """ +Actual Average Latency for L1 data-cache miss demand loads""" + domain = "Metric" + maxval = 1000 + + def compute(self, EV): + try: + self.val = Load_Miss_Real_Latency(EV, 0) + except ZeroDivisionError: + print "Load_Miss_Real_Latency zero division" + self.val = 0 + +class Metric_Turbo_Utilization: + name = "Turbo_Utilization" + desc = """ +Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = Turbo_Utilization(EV, 0) + except ZeroDivisionError: + print "Turbo_Utilization zero division" + self.val = 0 + +class Metric_Page_Walks_Use: + name = "Page_Walks_Use" + desc = """ +Fraction of cycles where the core's Page Walker is busy serving +iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = Page_Walks_Use(EV, 0) + except ZeroDivisionError: + print "Page_Walks_Use zero division" + self.val = 0 + +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + +class Metric_CLKS: + name = "CLKS" + desc = """ +Per-thread actual clocks""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = CLKS(EV, 0) + except ZeroDivisionError: + print "CLKS zero division" + self.val = 0 + +class Metric_CORE_CLKS: + name = "CORE_CLKS" + desc = """ +Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = Time(EV, 0) + except ZeroDivisionError: + print "Time zero division" + self.val = 0 + +# Schedule + + +class Setup: + def __init__(self, r): + o = dict() + n = Frontend_Bound() ; r.run(n) ; o["Frontend_Bound"] = n + n = Frontend_Latency() ; r.run(n) ; o["Frontend_Latency"] = n + n = ITLB_Misses() ; r.run(n) ; o["ITLB_Misses"] = n + n = DSB_Switches() ; r.run(n) ; o["DSB_Switches"] = n + n = LCP() ; r.run(n) ; 
o["LCP"] = n + n = MS_Switches() ; r.run(n) ; o["MS_Switches"] = n + n = Frontend_Bandwidth() ; r.run(n) ; o["Frontend_Bandwidth"] = n + n = MITE() ; r.run(n) ; o["MITE"] = n + n = DSB() ; r.run(n) ; o["DSB"] = n + n = LSD() ; r.run(n) ; o["LSD"] = n + n = Bad_Speculation() ; r.run(n) ; o["Bad_Speculation"] = n + n = Branch_Mispredicts() ; r.run(n) ; o["Branch_Mispredicts"] = n + n = Machine_Clears() ; r.run(n) ; o["Machine_Clears"] = n + n = Backend_Bound() ; r.run(n) ; o["Backend_Bound"] = n + n = Memory_Bound() ; r.run(n) ; o["Memory_Bound"] = n + n = L1_Bound() ; r.run(n) ; o["L1_Bound"] = n + n = DTLB_Load() ; r.run(n) ; o["DTLB_Load"] = n + n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n + n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n + n = L2_Bound() ; r.run(n) ; o["L2_Bound"] = n + n = L3_Bound() ; r.run(n) ; o["L3_Bound"] = n + n = Contested_Accesses() ; r.run(n) ; o["Contested_Accesses"] = n + n = Data_Sharing() ; r.run(n) ; o["Data_Sharing"] = n + n = L3_Latency() ; r.run(n) ; o["L3_Latency"] = n + n = SQ_Full() ; r.run(n) ; o["SQ_Full"] = n + n = MEM_Bound() ; r.run(n) ; o["MEM_Bound"] = n + n = MEM_Bandwidth() ; r.run(n) ; o["MEM_Bandwidth"] = n + n = MEM_Latency() ; r.run(n) ; o["MEM_Latency"] = n + n = Stores_Bound() ; r.run(n) ; o["Stores_Bound"] = n + n = Split_Stores() ; r.run(n) ; o["Split_Stores"] = n + n = DTLB_Store() ; r.run(n) ; o["DTLB_Store"] = n + n = Core_Bound() ; r.run(n) ; o["Core_Bound"] = n + n = Divider() ; r.run(n) ; o["Divider"] = n + n = Ports_Utilization() ; r.run(n) ; o["Ports_Utilization"] = n + n = G0_Ports_Utilized() ; r.run(n) ; o["G0_Ports_Utilized"] = n + n = G1_Port_Utilized() ; r.run(n) ; o["G1_Port_Utilized"] = n + n = G2_Ports_Utilized() ; r.run(n) ; o["G2_Ports_Utilized"] = n + n = G3m_Ports_Utilized() ; r.run(n) ; o["G3m_Ports_Utilized"] = n + n = Port_0() ; r.run(n) ; o["Port_0"] = n + n = Port_1() ; r.run(n) ; o["Port_1"] = n + n = Port_2() ; r.run(n) ; o["Port_2"] = n + n = Port_3() ; r.run(n) ; o["Port_3"] = n + n = Port_4() ; r.run(n) ; o["Port_4"] = n + n = Port_5() ; r.run(n) ; o["Port_5"] = n + n = Retiring() ; r.run(n) ; o["Retiring"] = n + n = Base() ; r.run(n) ; o["Base"] = n + n = Microcode_Sequencer() ; r.run(n) ; o["Microcode_Sequencer"] = n + + # parents + + o["Frontend_Latency"].parent = o["Frontend_Bound"] + o["ITLB_Misses"].parent = o["Frontend_Latency"] + o["DSB_Switches"].parent = o["Frontend_Latency"] + o["LCP"].parent = o["Frontend_Latency"] + o["MS_Switches"].parent = o["Frontend_Latency"] + o["Frontend_Bandwidth"].parent = o["Frontend_Bound"] + o["MITE"].parent = o["Frontend_Bandwidth"] + o["DSB"].parent = o["Frontend_Bandwidth"] + o["LSD"].parent = o["Frontend_Bandwidth"] + o["Branch_Mispredicts"].parent = o["Bad_Speculation"] + o["Machine_Clears"].parent = o["Bad_Speculation"] + o["Memory_Bound"].parent = o["Backend_Bound"] + o["L1_Bound"].parent = o["Memory_Bound"] + o["DTLB_Load"].parent = o["L1_Bound"] + o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["Split_Loads"].parent = o["L1_Bound"] + o["G4K_Aliasing"].parent = o["L1_Bound"] + o["L2_Bound"].parent = o["Memory_Bound"] + o["L3_Bound"].parent = o["Memory_Bound"] + o["Contested_Accesses"].parent = o["L3_Bound"] + o["Data_Sharing"].parent = o["L3_Bound"] + o["L3_Latency"].parent = o["L3_Bound"] + o["SQ_Full"].parent = o["L3_Bound"] + o["MEM_Bound"].parent = o["Memory_Bound"] + o["MEM_Bandwidth"].parent = o["MEM_Bound"] + o["MEM_Latency"].parent = o["MEM_Bound"] + o["Stores_Bound"].parent = 
o["Memory_Bound"] + o["Split_Stores"].parent = o["Stores_Bound"] + o["DTLB_Store"].parent = o["Stores_Bound"] + o["Core_Bound"].parent = o["Backend_Bound"] + o["Divider"].parent = o["Core_Bound"] + o["Ports_Utilization"].parent = o["Core_Bound"] + o["G0_Ports_Utilized"].parent = o["Ports_Utilization"] + o["G1_Port_Utilized"].parent = o["Ports_Utilization"] + o["G2_Ports_Utilized"].parent = o["Ports_Utilization"] + o["G3m_Ports_Utilized"].parent = o["Ports_Utilization"] + o["Port_0"].parent = o["G3m_Ports_Utilized"] + o["Port_1"].parent = o["G3m_Ports_Utilized"] + o["Port_2"].parent = o["G3m_Ports_Utilized"] + o["Port_3"].parent = o["G3m_Ports_Utilized"] + o["Port_4"].parent = o["G3m_Ports_Utilized"] + o["Port_5"].parent = o["G3m_Ports_Utilized"] + o["Base"].parent = o["Retiring"] + o["Microcode_Sequencer"].parent = o["Retiring"] + + # references between groups + + o["Frontend_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["Frontend_Bandwidth"].Frontend_Latency = o["Frontend_Latency"] + o["Branch_Mispredicts"].Bad_Speculation = o["Bad_Speculation"] + o["Machine_Clears"].Bad_Speculation = o["Bad_Speculation"] + o["Machine_Clears"].Branch_Mispredicts = o["Branch_Mispredicts"] + o["Backend_Bound"].Frontend_Bound = o["Frontend_Bound"] + o["Backend_Bound"].Bad_Speculation = o["Bad_Speculation"] + o["Backend_Bound"].Retiring = o["Retiring"] + o["L1_Bound"].DTLB_Load = o["DTLB_Load"] + o["Stores_Bound"].Memory_Bound = o["Memory_Bound"] + o["Core_Bound"].Memory_Bound = o["Memory_Bound"] + o["Ports_Utilization"].Core_Bound = o["Core_Bound"] + o["Ports_Utilization"].Divider = o["Divider"] + o["Retiring"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Base"].Retiring = o["Retiring"] + o["Base"].Microcode_Sequencer = o["Microcode_Sequencer"] + + # siblings cross-tree + + o["Frontend_Bound"].sibling = None + o["Frontend_Latency"].sibling = None + o["ITLB_Misses"].sibling = None + o["DSB_Switches"].sibling = None + o["LCP"].sibling = None + o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["Frontend_Bandwidth"].sibling = None + o["MITE"].sibling = None + o["DSB"].sibling = None + o["LSD"].sibling = None + o["Bad_Speculation"].sibling = None + o["Branch_Mispredicts"].sibling = None + o["Machine_Clears"].sibling = None + o["Backend_Bound"].sibling = None + o["Memory_Bound"].sibling = None + o["L1_Bound"].sibling = o["G1_Port_Utilized"] + o["DTLB_Load"].sibling = None + o["Store_Fwd_Blk"].sibling = None + o["Split_Loads"].sibling = None + o["G4K_Aliasing"].sibling = None + o["L2_Bound"].sibling = None + o["L3_Bound"].sibling = None + o["Contested_Accesses"].sibling = None + o["Data_Sharing"].sibling = None + o["L3_Latency"].sibling = None + o["SQ_Full"].sibling = None + o["MEM_Bound"].sibling = None + o["MEM_Bandwidth"].sibling = None + o["MEM_Latency"].sibling = None + o["Stores_Bound"].sibling = None + o["Split_Stores"].sibling = o["Port_4"] + o["DTLB_Store"].sibling = None + o["Core_Bound"].sibling = None + o["Divider"].sibling = None + o["Ports_Utilization"].sibling = None + o["G0_Ports_Utilized"].sibling = None + o["G1_Port_Utilized"].sibling = o["L1_Bound"] + o["G2_Ports_Utilized"].sibling = None + o["G3m_Ports_Utilized"].sibling = None + o["Port_0"].sibling = None + o["Port_1"].sibling = None + o["Port_2"].sibling = None + o["Port_3"].sibling = None + o["Port_4"].sibling = o["Split_Stores"] + o["Port_5"].sibling = None + o["Retiring"].sibling = None + o["Base"].sibling = None + o["Microcode_Sequencer"].sibling = o["MS_Switches"] + + # sampling events + + o["Frontend_Bound"].sample = 
[] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["DSB_Switches"].sample = [] + o["LCP"].sample = [] + o["MS_Switches"].sample = [] + o["Frontend_Bandwidth"].sample = [] + o["MITE"].sample = [] + o["DSB"].sample = [] + o["LSD"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] + o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] + o["Backend_Bound"].sample = [] + o["Memory_Bound"].sample = [] + o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] + o["Store_Fwd_Blk"].sample = [] + o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] + o["G4K_Aliasing"].sample = [] + o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] + o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp'] + o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + o["SQ_Full"].sample = [] + o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] + o["MEM_Bandwidth"].sample = [] + o["MEM_Latency"].sample = [] + o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] + o["Core_Bound"].sample = [] + o["Divider"].sample = ['ARITH.FPU_DIV_ACTIVE'] + o["Ports_Utilization"].sample = [] + o["G0_Ports_Utilized"].sample = [] + o["G1_Port_Utilized"].sample = [] + o["G2_Ports_Utilized"].sample = [] + o["G3m_Ports_Utilized"].sample = [] + o["Port_0"].sample = [] + o["Port_1"].sample = [] + o["Port_2"].sample = [] + o["Port_3"].sample = [] + o["Port_4"].sample = [] + o["Port_5"].sample = [] + o["Retiring"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] + + # user visible metrics + + n = Metric_IPC() ; r.metric(n) + n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) + n = Metric_UPI() ; r.metric(n) + n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) + n = Metric_DSB_Coverage() ; r.metric(n) + n = Metric_ILP() ; r.metric(n) + n = Metric_MLP() ; r.metric(n) + n = Metric_Load_Miss_Real_Latency() ; r.metric(n) + n = Metric_Turbo_Utilization() ; r.metric(n) + n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) + n = Metric_CLKS() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/cpumap.sh b/cpumap.sh index e07bb4f3..5ecdc115 100644 --- a/cpumap.sh +++ b/cpumap.sh @@ -12,3 +12,5 @@ cpus[ivb]=GenuineIntel-6-3A cpus[hsw]=GenuineIntel-6-45 cpus[slm]=GenuineIntel-6-37 cpus[bnl]=GenuineIntel-6-35 +cpus[bdw]=GenuineIntel-6-3D +cpus[hsx]=GenuineIntel-6-3F diff --git a/hsw_client_ratios.py b/hsw_client_ratios.py index 0ae53359..f9f9c594 100644 --- a/hsw_client_ratios.py +++ b/hsw_client_ratios.py @@ -1,70 +1,190 @@ # -# auto generated TopDown description for Intel 4th gen Core (code named Haswell) +# auto generated TopDown 2.9 description for Intel 4rd gen Core (code named Haswell) # Please see http://ark.intel.com for more details on these CPUs. 
# +# References: +# http://halobates.de/blog/p/262 +# https://sites.google.com/site/analysismethods/yasin-pubs +# +smt_enabled = False # Constants -PipelineWidth = 4 -MEM_L3_WEIGHT = 7 -MEM_STLB_HIT_COST = 7 -MEM_SFB_COST = 13 -MEM_4KALIAS_COST = 7 -MEM_XSNP_HITM_COST = 60 -MEM_XSNP_HIT_COST = 43 -MEM_XSNP_NONE_COST = 29 -MS_SWITCHES_COST = 3 +Pipeline_Width = 4 +L2_Store_Latency = 9 +Mem_L3_Weight = 7 +Mem_STLB_Hit_Cost = 7 +Mem_SFB_Cost = 13 +Mem_4K_Alias_Cost = 7 +Mem_XSNP_HitM_Cost = 60 +MEM_XSNP_Hit_Cost = 43 +MEM_XSNP_None_Cost = 29 +Mem_Local_DRAM_Cost = 200 +Mem_Remote_DRAM_Cost = 310 +Mem_Remote_HitM_Cost = 200 +Mem_Remote_Fwd_Cost = 180 +MS_Switches_Cost = 2 +OneMillion = 1000000 +Energy_Unit = 61 # Aux. formulas -def CLKS(EV, level): - return EV("CPU_CLK_UNHALTED.THREAD", level) -def FewUopsExecutedThreshold(EV, level): - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level); EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level) > 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) -def BackendBoundAtEXE_stalls(EV, level): - return ( EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOPS_EXEC", level) - FewUopsExecutedThreshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level) ) -def BackendBoundAtEXE(EV, level): - return BackendBoundAtEXE_stalls(EV, level) / CLKS(EV, level) -def MemL3HitFraction(EV, level): - return EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) + MEM_L3_WEIGHT * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level) ) -def MispredClearsFraction(EV, level): - return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level) ) -def AvgRsEmptyPeriodClears(EV, level): - return ( EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ICACHE.IFETCH_STALL", level))/ EV("RS_EVENTS.EMPTY_END", level) -def RetireUopFraction(EV, level): + +def Recovery_Cycles(EV, level): + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + +def Execute_Cycles(EV, level): + EV("UOPS_EXECUTED.CORE:c1", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CORE:c1", level) + +def L1D_Miss_Cycles(EV, level): + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + +def ITLB_Miss_Cycles(EV, level): + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + +def Cycles_0_Ports_Utilized(EV, level): + return STALLS_TOTAL(EV, level) + +def Cycles_1_Port_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c1", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level)) + +def Cycles_2_Ports_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c3", level) + return (EV("UOPS_EXECUTED.CORE:c2", level) - 
EV("UOPS_EXECUTED.CORE:c3", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level)) + +def Cycles_3m_Ports_Utilized(EV, level): + EV("UOPS_EXECUTED.CORE:c3", level) + return (EV("UOPS_EXECUTED.CORE:c3", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CORE:c3", level) + +def STALLS_MEM_ANY(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", level)) , level ) + +def STALLS_TOTAL(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level)) , level ) + +def ORO_Demand_DRD_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + +def ORO_Demand_DRD_C6(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Cycles_False_Sharing_Client(EV, level): + return Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", level) + EV("OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE", level)) + +def Few_Uops_Executed_Threshold(EV, level): + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c3", level) + return EV("UOPS_EXECUTED.CORE:c3", level) if(IPC(EV, level)> 1.25)else EV("UOPS_EXECUTED.CORE:c2", level) + +def Backend_Bound_At_EXE(EV, level): + return (STALLS_TOTAL(EV, level) + EV("UOPS_EXECUTED.CORE:c1", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level)) / CLKS(EV, level) + +def Mem_L3_Hit_Fraction(EV, level): + return EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.L3_MISS", level)) + +def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + +def Mispred_Clears_Fraction(EV, level): + return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) + +def Retire_Uop_Fraction(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + def SLOTS(EV, level): - return PipelineWidth * CLKS(EV, level) -# Instructions Per Cycle + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) def IPC(EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) +def CPI(EV, level): + return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + # Uops Per Instruction def UPI(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + # Instruction per taken branch -def InstPerTakenBranch(EV, level): +def IPTB(EV, level): return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. 
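To make the two taken-branch ratios concrete (IPTB above, BPTB defined next), here is a small worked example with made-up counter values; IPTB is the average number of instructions between taken branches, and BPTB is how many branch instructions that span contains.

# Made-up counts, for illustration only.
inst_retired   = 1000000.0   # INST_RETIRED.ANY
all_branches   =  200000.0   # BR_INST_RETIRED.ALL_BRANCHES
taken_branches =   50000.0   # BR_INST_RETIRED.NEAR_TAKEN

iptb = inst_retired / taken_branches   # 20.0 instructions per taken branch
bptb = all_branches / taken_branches   # 4.0 branches per taken branch, i.e.
                                       # roughly 3 of every 4 branches fall through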
+def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + # Fraction of Uops delivered by the DSB (decoded instructions cache) -def DSBCoverage(EV, level): - return ( EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level))/(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level) ) -# Memory-Level-Parallelism (avg L1 miss demand load when there is at least 1 such miss) +def DSB_Coverage(EV, level): + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + +# Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed) +def ILP(EV, level): + EV("UOPS_EXECUTED.CORE", level) + return (EV("UOPS_EXECUTED.CORE", level) / 2 / Execute_Cycles(EV, level)) if smt_enabled else EV("UOPS_EXECUTED.CORE", level) / Execute_Cycles(EV, level) + +# Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) def MLP(EV, level): - return EV("L1D_PEND_MISS.PENDING", level) / EV("L1D_PEND_MISS.PENDING_CYCLES", level) -# Average L1 miss demand load latency -def L1dMissLatency(EV, level): - return EV("L1D_PEND_MISS.PENDING", level) / EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + +# Actual Average Latency for L1 data-cache miss demand loads +def Load_Miss_Real_Latency(EV, level): + return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level)) + # Average Frequency Utilization relative nominal frequency -def TurboUtilization(EV, level): +def Turbo_Utilization(EV, level): return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) +# Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store +def Page_Walks_Use(EV, level): + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + +# Per-thread actual clocks +def CLKS(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD", level) + +# Core actual clocks +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, level): + return DurationTimeInSeconds(EV, level) + # Event groups -class FrontendBound: - name = "FrontendBound" +class Frontend_Bound: + name = "Frontend_Bound" domain = "Slots" area = "FE" desc = """ @@ -77,108 +197,72 @@ class FrontendBound: latter can accept them. 
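The MLP and Load_Miss_Real_Latency helpers above use the same occupancy counter in two ways: dividing L1D_PEND_MISS.PENDING by miss-pending cycles gives the average number of L1 misses in flight, while dividing it by the number of completed misses (L1 misses plus line-fill-buffer hits) gives an average miss latency in cycles. A toy example with made-up counts:

# Made-up counts, for illustration only (single thread, so the plain
# PENDING_CYCLES count applies, as in L1D_Miss_Cycles with smt_enabled False).
pending        = 3.0e7   # L1D_PEND_MISS.PENDING
pending_cycles = 1.0e7   # L1D_PEND_MISS.PENDING_CYCLES
l1_miss        = 8.0e5   # MEM_LOAD_UOPS_RETIRED.L1_MISS
hit_lfb        = 2.0e5   # MEM_LOAD_UOPS_RETIRED.HIT_LFB

mlp         = pending / pending_cycles        # 3.0 misses in flight on average
avg_latency = pending / (l1_miss + hit_lfb)   # 30.0 cycles per completed miss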
For example, stalls due to instruction-cache misses would be categorized under Frontend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1)/ SLOTS(EV, 1 ) + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "FrontendBound zero division" + #print "Frontend_Bound zero division" self.val = 0 self.thresh = False return self.val -class FrontendLatency: - name = "Frontend Latency" +class Frontend_Latency: + name = "Frontend_Latency" domain = "Slots" area = "FE" desc = """ This metric represents slots fraction CPU was stalled due to Frontend latency issues. For example, instruction-cache misses, iTLB misses or fetch stalls -after a branch missprediction are categorized under Frontend Latency. In such +after a branch misprediction are categorized under Frontend Latency. In such cases the Frontend eventually delivers no uops for some period.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = PipelineWidth * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2)/ SLOTS(EV, 2 ) + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: - #print "FrontendLatency zero division" - self.val = 0 - self.thresh = False - return self.val - -class ICacheMisses: - name = "ICache Misses" - domain = "Clocks" - area = "FE" - desc = """ -This metric represents cycles fraction CPU was stalled due to instruction -cache misses. Using compiler's Profile-Guided Optimization (PGO) can reduce -i-cache misses through improved hot code layout.""" - level = 3 - def compute(self, EV): - try: - self.val = ( EV("ICACHE.IFETCH_STALL", 3)- EV("ITLB_MISSES.WALK_DURATION", 3)) / CLKS(EV, 3 ) - self.thresh = (self.val > 0.05) and self.parent.thresh - except ZeroDivisionError: - #print "ICacheMisses zero division" + #print "Frontend_Latency zero division" self.val = 0 self.thresh = False return self.val -class ITLBmisses: - name = "ITLB misses" +class ITLB_Misses: + name = "ITLB_Misses" domain = "Clocks" area = "FE" desc = """ This metric represents cycles fraction CPU was stalled due to instruction TLB misses. Using large code pages may be considered here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ITLB_MISSES.WALK_DURATION", 3)/ CLKS(EV, 3 ) + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ITLBmisses zero division" + #print "ITLB_Misses zero division" self.val = 0 self.thresh = False return self.val -class BranchResteers: - name = "Branch Resteers" - domain = "Clocks" - area = "FE" - desc = """ -This metric represents cycles fraction CPU was stalled due to Branch Resteers. -Following all sorts of miss-predicted branches, this measure the delays of -fetch instructions from corrected path caused by the Frontend of the machine. 
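Each node's compute() above is driven by a single EV callback: given an event name (or a lambda for derived min/max events) and the node's nesting level, it returns a count, and the top-level formulas then divide by SLOTS. The dict-backed stand-in below is only a sketch for trying a formula by hand; toplev's real EV is wired to perf output, and the helper name and numbers here are assumptions, not part of this patch.

# Hypothetical stand-in for the EV callback the compute() methods expect.
def make_ev(counts):
    def EV(name, level):
        if callable(name):             # derived events arrive as lambdas
            return name(EV, level)
        return float(counts[name])     # plain counter lookup
    return EV

ev = make_ev({
    "IDQ_UOPS_NOT_DELIVERED.CORE": 1.0e9,
    "CPU_CLK_UNHALTED.THREAD": 2.0e9,
    "CPU_CLK_UNHALTED.THREAD:amt1": 2.0e9,  # touched by CORE_CLKS even with SMT off
})
# With smt_enabled False, SLOTS = Pipeline_Width * CLKS = 4 * 2e9 = 8e9 slots,
# so Frontend_Bound comes out as 1e9 / 8e9 = 0.125 for these numbers.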
-For example, branchy code with lots of (taken) branches and/or branch miss- -predictions might get categorized under Branch Resteers.""" - level = 3 - def compute(self, EV): - try: - self.val = ( EV("BR_MISP_RETIRED.ALL_BRANCHES", 3)+ EV("MACHINE_CLEARS.COUNT", 3)+ EV("BACLEARS.ANY", 3)) * AvgRsEmptyPeriodClears(EV, 3)/ CLKS(EV, 3 ) - self.thresh = (self.val > 0.05) and self.parent.thresh - except ZeroDivisionError: - #print "BranchResteers zero division" - self.val = 0 - self.thresh = False - return self.val - -class DSBswitches: - name = "DSB switches" +class DSB_Switches: + name = "DSB_Switches" domain = "Clocks" area = "FE" desc = """ This metric represents cycles fraction CPU was stalled due to switches from DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3)/ CLKS(EV, 3 ) + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DSBswitches zero division" + #print "DSB_Switches zero division" self.val = 0 self.thresh = False return self.val @@ -192,9 +276,10 @@ class LCP: Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will certainly avoid this.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ILD_STALL.LCP", 3)/ CLKS(EV, 3 ) + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "LCP zero division" @@ -202,8 +287,8 @@ def compute(self, EV): self.thresh = False return self.val -class MSswitches: - name = "MS switches" +class MS_Switches: + name = "MS_Switches" domain = "Clocks" area = "FE" desc = """ @@ -213,18 +298,19 @@ class MSswitches: deliver long uop flows required by CISC instructions like CPUID, or uncommon conditions like Floating Point Assists when dealing with Denormals.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = MS_SWITCHES_COST * EV("IDQ.MS_SWITCHES", 3)/ CLKS(EV, 3 ) + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MSswitches zero division" + #print "MS_Switches zero division" self.val = 0 self.thresh = False return self.val -class FrontendBandwidth: - name = "Frontend Bandwidth" +class Frontend_Bandwidth: + name = "Frontend_Bandwidth" domain = "Slots" area = "FE" desc = """ @@ -234,28 +320,30 @@ class FrontendBandwidth: under Frontend Bandwidth. In such cases, the Frontend typically delivers non- optimal amount of uops to the Backend.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.FrontendBound.compute(EV)- self.FrontendLatency.compute(EV ) + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: - #print "FrontendBandwidth zero division" + #print "Frontend_Bandwidth zero division" self.val = 0 self.thresh = False return self.val class MITE: name = "MITE" - domain = "Clocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -the MITE fetch pipeline. 
For example, inefficiencies in the instruction -decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CLKS(EV, 3 ) + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MITE zero division" @@ -265,17 +353,18 @@ def compute(self, EV): class DSB: name = "DSB" - domain = "Clocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -DSB (decoded uop cache) fetch pipeline. For example, inefficient utlilization -of the DSB cache structure or bank conflict when reading from it, are -categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CLKS(EV, 3 ) + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: #print "DSB zero division" @@ -283,8 +372,30 @@ def compute(self, EV): self.thresh = False return self.val -class BadSpeculation: - name = "BadSpeculation" +class LSD: + name = "LSD" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "LSD zero division" + self.val = 0 + self.thresh = False + return self.val + +class Bad_Speculation: + name = "Bad_Speculation" domain = "Slots" area = "BAD" desc = """ @@ -294,38 +405,40 @@ class BadSpeculation: speculation. For example, wasted work due to miss-predicted branches are categorized under Bad Speculation category""" level = 1 + htoff = False def compute(self, EV): try: - self.val = ( EV("UOPS_ISSUED.ANY", 1)- EV("UOPS_RETIRED.RETIRE_SLOTS", 1)+ PipelineWidth * EV("INT_MISC.RECOVERY_CYCLES", 1)) / SLOTS(EV, 1 ) + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: - #print "BadSpeculation zero division" + #print "Bad_Speculation zero division" self.val = 0 self.thresh = False return self.val -class BranchMispredicts: - name = "Branch Mispredicts" +class Branch_Mispredicts: + name = "Branch_Mispredicts" domain = "Slots" area = "BAD" desc = """ This metric represents slots fraction CPU was impacted by Branch -Missprediction. 
These slots are either wasted by uops fetched from an +Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path, or stalls the Backend of the machine needs to recover its state from a speculative path.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = MispredClearsFraction(EV, 2)* self.BadSpeculation.compute(EV ) + self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "BranchMispredicts zero division" + #print "Branch_Mispredicts zero division" self.val = 0 self.thresh = False return self.val -class MachineClears: - name = "Machine Clears" +class Machine_Clears: + name = "Machine_Clears" domain = "Slots" area = "BAD" desc = """ @@ -335,12 +448,13 @@ class MachineClears: example, this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.BadSpeculation.compute(EV)- self.BranchMispredicts.compute(EV ) + self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MachineClears zero division" + #print "Machine_Clears zero division" self.val = 0 self.thresh = False return self.val @@ -358,9 +472,10 @@ class Backend_Bound: stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = 1 -(self.FrontendBound.compute(EV)+ self.BadSpeculation.compute(EV)+ self.Retiring.compute(EV)) + self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Backend_Bound zero division" @@ -368,8 +483,8 @@ def compute(self, EV): self.thresh = False return self.val -class MemoryBound: - name = "MemoryBound" +class Memory_Bound: + name = "Memory_Bound" domain = "Clocks" area = "BE/Mem" desc = """ @@ -379,18 +494,19 @@ class MemoryBound: memory demand loads which coincides with execution starvation. 
in addition to less common cases where stores could imply backpressure on the pipeline.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = ( EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", 2)+ EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) + self.val = (STALLS_MEM_ANY(EV, 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "MemoryBound zero division" + #print "Memory_Bound zero division" self.val = 0 self.thresh = False return self.val -class L1Bound: - name = "L1 Bound" +class L1_Bound: + name = "L1_Bound" domain = "Clocks" area = "BE/Mem" desc = """ @@ -401,82 +517,87 @@ class L1Bound: allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event as it accounts for any non-completed load.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", 3)- EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) - self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Overhead.thresh + self.val = (STALLS_MEM_ANY(EV, 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) + self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh except ZeroDivisionError: - #print "L1Bound zero division" + #print "L1_Bound zero division" self.val = 0 self.thresh = False return self.val -class DTLB_Overhead: - name = "DTLB_Overhead" +class DTLB_Load: + name = "DTLB_Load" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( MEM_STLB_HIT_COST * EV("DTLB_LOAD_MISSES.STLB_HIT", 4)+ EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Overhead zero division" + #print "DTLB_Load zero division" self.val = 0 self.thresh = False return self.val -class LoadsBlockedbyStoreForwarding: - name = "Loads Blocked by Store Forwarding" +class Store_Fwd_Blk: + name = "Store_Fwd_Blk" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_SFB_COST * EV("LD_BLOCKS.STORE_FORWARD", 4)/ CLKS(EV, 4 ) + self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "LoadsBlockedbyStoreForwarding zero division" + #print "Store_Fwd_Blk zero division" self.val = 0 self.thresh = False return self.val -class SplitLoads: - name = "Split Loads" +class Split_Loads: + name = "Split_Loads" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = L1dMissLatency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4)/ CLKS(EV, 4 ) + self.val = Load_Miss_Real_Latency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "SplitLoads zero division" + #print "Split_Loads zero division" self.val = 0 self.thresh = False return self.val -class G4KAliasing: - name = "4K Aliasing" +class G4K_Aliasing: + name = "4K_Aliasing" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_4KALIAS_COST * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4)/ CLKS(EV, 4 ) + self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) self.thresh = 
self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "G4KAliasing zero division" + #print "G4K_Aliasing zero division" self.val = 0 self.thresh = False return self.val -class L2Bound: - name = "L2 Bound" +class L2_Bound: + name = "L2_Bound" domain = "Clocks" area = "BE/Mem" desc = """ @@ -484,18 +605,19 @@ class L2Bound: misses (i.e. L1 misses/L2 hits) will improve the latency and increase performance.""" level = 3 + htoff = True def compute(self, EV): try: - self.val = ( EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)- EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) + self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.03) and self.parent.thresh except ZeroDivisionError: - #print "L2Bound zero division" + #print "L2_Bound zero division" self.val = 0 self.thresh = False return self.val -class L3Bound: - name = "L3 Bound" +class L3_Bound: + name = "L3_Bound" domain = "Clocks" area = "BE/Mem" desc = """ @@ -503,88 +625,113 @@ class L3Bound: a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve the latency and increase performance.""" level = 3 + htoff = True def compute(self, EV): try: - self.val = MemL3HitFraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3Bound zero division" + #print "L3_Bound zero division" self.val = 0 self.thresh = False return self.val -class ContestedAccesses: - name = "Contested Accesses" +class Contested_Accesses: + name = "Contested_Accesses" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_HITM_COST *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", 4)+ EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) + self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "ContestedAccesses zero division" + #print "Contested_Accesses zero division" self.val = 0 self.thresh = False return self.val -class DataSharing: - name = "Data Sharing" +class Data_Sharing: + name = "Data_Sharing" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_HIT_COST * EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "DataSharing zero division" + #print "Data_Sharing zero division" self.val = 0 self.thresh = False return self.val -class L3Latency: - name = "L3 Latency" +class L3_Latency: + name = "L3_Latency" domain = "Clocks" area = "BE/Mem" desc = """ This metric is a rough aggregate estimate of cycles fraction where CPU accessed L3 cache for all load requests, while there was no contention/sharing -with a sibiling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will +with a sibling core. Avoiding cache misses (i.e. 
L2 misses/L3 hits) will improve the latency and increase performance.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_NONE_COST * EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3Latency zero division" + #print "L3_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + +class SQ_Full: + name = "SQ_Full" + domain = "CoreClocks" + area = "BE/Mem" + desc = """ +This metric measures fraction of cycles where the Super Queue (SQ) was full +taking into account all request-types and both hardware SMT threads. The Super +Queue is used for requests to access the L2 cache or to go out to the Uncore.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "SQ_Full zero division" self.val = 0 self.thresh = False return self.val -class DRAMBound: - name = "DRAM Bound" +class MEM_Bound: + name = "MEM_Bound" domain = "Clocks" area = "BE/Mem" desc = """ This metric represents how often CPU was stalled on main memory (DRAM). Caching will improve the latency and increase performance.""" level = 3 + htoff = True def compute(self, EV): try: - self.val = ( 1 - MemL3HitFraction(EV, 3)) * EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "DRAMBound zero division" + #print "MEM_Bound zero division" self.val = 0 self.thresh = False return self.val -class MEMBandwidth: - name = "MEM Bandwidth" +class MEM_Bandwidth: + name = "MEM_Bandwidth" domain = "Clocks" area = "BE/Mem" desc = """ @@ -592,91 +739,100 @@ class MEMBandwidth: bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:cmask=6", 4)/ CLKS(EV, 4 ) + self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEMBandwidth zero division" + #print "MEM_Bandwidth zero division" self.val = 0 self.thresh = False return self.val -class MEMLatency: - name = "MEM Latency" +class MEM_Latency: + name = "MEM_Latency" domain = "Clocks" area = "BE/Mem" desc = """ This metric represents how often CPU was likely stalled due to latency from -main memory (DRAM). Data layout restructing or using Software Prefetches +main memory (DRAM). 
Data layout re-structuring or using Software Prefetches (also through the compiler) may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", 4)- EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:cmask=6", 4)) / CLKS(EV, 4 ) + self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEMLatency zero division" + #print "MEM_Latency zero division" self.val = 0 self.thresh = False return self.val -class StoresBound: - name = "Stores Bound" +class Stores_Bound: + name = "Stores_Bound" domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents how often CPU was stalled due to store operations. +even though memory store accesses do not typically stall out-of-order CPUs; +there are few cases where stores can lead to actual stalls. This metric will +be flagged should any of these cases be a bottleneck.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.MemoryBound.compute(EV)-(EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", 3)/ CLKS(EV, 3)) + self.val = self.Memory_Bound.compute(EV) - STALLS_MEM_ANY(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "StoresBound zero division" + #print "Stores_Bound zero division" self.val = 0 self.thresh = False return self.val -class FalseSharing: - name = "False Sharing" +class False_Sharing: + name = "False_Sharing" domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents how often CPU was stalled due to False Sharing. False +Sharing is a multithreading hiccup, where multiple threads contend on +different data-elements mapped into the same cache line. It can be easily +avoided by padding to make threads access different lines.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_HITM_COST *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", 4)+ EV("OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE", 4)) / CLKS(EV, 4 ) + self.val = Cycles_False_Sharing_Client(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "FalseSharing zero division" + #print "False_Sharing zero division" self.val = 0 self.thresh = False return self.val -class SplitStores: - name = "Split Stores" - domain = "Stores" +class Split_Stores: + name = "Split_Stores" + domain = "CoreClocks" area = "BE/Mem" desc = """ This metric represents rate of split store accesses. 
Consider aligning your data to the 64-byte cache line granularity.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4)/ EV("MEM_UOPS_RETIRED.ALL_STORES", 4 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "SplitStores zero division" + #print "Split_Stores zero division" self.val = 0 self.thresh = False return self.val -class DTLBStoreOverhead: - name = "DTLB Store Overhead" +class DTLB_Store: + name = "DTLB_Store" domain = "Clocks" area = "BE/Mem" desc = """ @@ -687,33 +843,164 @@ class DTLBStoreOverhead: the same page. Try using larger page sizes for large amounts of frequently- used data.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( MEM_STLB_HIT_COST * EV("DTLB_STORE_MISSES.STLB_HIT", 4)+ EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DTLBStoreOverhead zero division" + #print "DTLB_Store zero division" self.val = 0 self.thresh = False return self.val -class CoreBound: - name = "CoreBound" +class Core_Bound: + name = "Core_Bound" domain = "Clocks" area = "BE/Core" desc = """ -This metric represents how much Core non-memory issues were a bottleneck. -This may indicate that we ran out of OOO resources or are saturating certain -execution units (e.g. the use of FP-chained long-latency arithmetic -operations) which can limit performance. Tip: consider Port Saturation -analysis as next step""" +This metric represents how much Core non-memory issues were of a bottleneck. +Shortage in hardware compute resources, or dependencies software's +instructions are both categorized under Core Bound. Hence it may indicate the +machine ran out of an OOO resources, certain execution units are overloaded or +dependencies in program's data- or instruction-flow are limiting the +performance (e.g. FP-chained long-latency arithmetic operations). Tip: +consider Port Saturation analysis as next step.""" level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "Core_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Divider: + name = "Divider" + domain = "CoreClocks" + area = "BE/Core" + desc = "" + level = 3 + htoff = False def compute(self, EV): try: - self.val = BackendBoundAtEXE(EV, 2)- self.MemoryBound.compute(EV ) + self.val = 10 * EV("ARITH.DIVIDER_UOPS", 3) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "CoreBound zero division" + #print "Divider zero division" + self.val = 0 + self.thresh = False + return self.val + +class Ports_Utilization: + name = "Ports_Utilization" + domain = "Clocks" + area = "BE/Core" + desc = """ +This metric represents cycles fraction application was stalled due to Core +computation issues (non divider-related). For example, heavy data-dependency +between nearby instructions will manifest in this category. Ditto if +instruction-mix used by the application overloads specific hardware execution +unit. 
Hint: Loop Vectorization -most compilers feature auto-Vectorization +options today- reduces pressure on the execution ports as multiple elements +are calculated with same uop.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "Ports_Utilization zero division" + self.val = 0 + self.thresh = False + return self.val + +class G0_Ports_Utilized: + name = "0_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed no uops on any +execution port.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G0_Ports_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G1_Port_Utilized: + name = "1_Port_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction where the CPU executed total of 1 +uop per cycle on all execution ports. This can be due to heavy data-dependency +among software instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and L1_Bound, this +metric can point to L1 data-cache latency bottleneck that may not necessarily +manifest with complete execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be helpful. Tip: consider +'Core Ports Saturation' analysis-type as next step.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G1_Port_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G2_Ports_Utilized: + name = "2_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed total of 2 uops per +cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- +type as next step. Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the execution ports as +multiple elements are calculated with same uop.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G2_Ports_Utilized zero division" + self.val = 0 + self.thresh = False + return self.val + +class G3m_Ports_Utilized: + name = "3m_Ports_Utilized" + domain = "CoreClocks" + area = "BE/Core" + desc = """ +This metric represents Core cycles fraction CPU executed total of 3 or more +uops per cycle on all execution ports. 
Tip: consider 'Core Port Saturation' +analysis-type as next step""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "G3m_Ports_Utilized zero division" self.val = 0 self.thresh = False return self.val @@ -735,40 +1022,43 @@ class Retiring: computations be done without significantly increasing number of instructions thus improving the performance.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1)/ SLOTS(EV, 1 ) - self.thresh = (self.val > 0.7) | self.MicroSequencer.thresh + self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: #print "Retiring zero division" self.val = 0 self.thresh = False return self.val -class BASE: - name = "BASE" +class Base: + name = "Base" domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops not originated -from the microcode-sequencer. This correlates with total number of +This metric represents slots fraction where the CPU was retiring uops not +originated from the microcode-sequencer. This correlates with total number of instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. A high Retiring value for non-vectorized code is typically a good -hint for programmer to pursue vectorizing his code, which can reduce -instructions hence this bucket.""" +expected. While this is the most desirable of the top 4 categories, high +values may still indicate areas for improvement. If possible focus on +techniques that reduce instruction count or result in more efficient +instructions generation such as vectorization.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Retiring.compute(EV)- self.MicroSequencer.compute(EV ) + self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: - #print "BASE zero division" + #print "Base zero division" self.val = 0 self.thresh = False return self.val -class MicroSequencer: - name = "MicroSequencer" +class Microcode_Sequencer: + name = "Microcode_Sequencer" domain = "Slots" area = "RET" desc = """ @@ -777,12 +1067,13 @@ class MicroSequencer: decoded by the default decoders (like repeat move strings), or by microcode assists used to address some operation modes (like in Floating Point assists).""" level = 2 + htoff = False def compute(self, EV): try: - self.val = RetireUopFraction(EV, 2)* EV("IDQ.MS_UOPS", 2)/ SLOTS(EV, 2 ) + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: - #print "MicroSequencer zero division" + #print "Microcode_Sequencer zero division" self.val = 0 self.thresh = False return self.val @@ -790,7 +1081,9 @@ def compute(self, EV): class Metric_IPC: name = "IPC" desc = """ -Instructions Per Cycle""" +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 def compute(self, EV): try: @@ -799,10 +1092,40 @@ def compute(self, EV): print "IPC zero division" self.val = 0 +class Metric_CPI: + name = "CPI" + desc = """ +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = CPI(EV, 0) + except ZeroDivisionError: + print "CPI zero division" + 
self.val = 0 + +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + class Metric_UPI: name = "UPI" desc = """ Uops Per Instruction""" + domain = "Metric" + maxval = 2 def compute(self, EV): try: @@ -811,35 +1134,71 @@ def compute(self, EV): print "UPI zero division" self.val = 0 -class Metric_InstPerTakenBranch: - name = "InstPerTakenBranch" +class Metric_IPTB: + name = "IPTB" desc = """ Instruction per taken branch""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = IPTB(EV, 0) + except ZeroDivisionError: + print "IPTB zero division" + self.val = 0 + +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: - self.val = InstPerTakenBranch(EV, 0) + self.val = BPTB(EV, 0) except ZeroDivisionError: - print "InstPerTakenBranch zero division" + print "BPTB zero division" self.val = 0 -class Metric_DSBCoverage: - name = "DSBCoverage" +class Metric_DSB_Coverage: + name = "DSB_Coverage" desc = """ Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 + + def compute(self, EV): + try: + self.val = DSB_Coverage(EV, 0) + except ZeroDivisionError: + print "DSB_Coverage zero division" + self.val = 0 + +class Metric_ILP: + name = "ILP" + desc = """ +Instruction-Level-Parallelism (average number of uops executed when there is +at least 1 uop executed)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: - self.val = DSBCoverage(EV, 0) + self.val = ILP(EV, 0) except ZeroDivisionError: - print "DSBCoverage zero division" + print "ILP zero division" self.val = 0 class Metric_MLP: name = "MLP" desc = """ -Memory-Level-Parallelism (avg L1 miss demand load when there is at least 1 -such miss)""" +Memory-Level-Parallelism (average number of L1 miss demand load when there is +at least 1 such miss)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -848,28 +1207,103 @@ def compute(self, EV): print "MLP zero division" self.val = 0 -class Metric_L1dMissLatency: - name = "L1dMissLatency" +class Metric_Load_Miss_Real_Latency: + name = "Load_Miss_Real_Latency" desc = """ -Average L1 miss demand load latency""" +Actual Average Latency for L1 data-cache miss demand loads""" + domain = "Metric" + maxval = 1000 def compute(self, EV): try: - self.val = L1dMissLatency(EV, 0) + self.val = Load_Miss_Real_Latency(EV, 0) except ZeroDivisionError: - print "L1dMissLatency zero division" + print "Load_Miss_Real_Latency zero division" self.val = 0 -class Metric_TurboUtilization: - name = "TurboUtilization" +class Metric_Turbo_Utilization: + name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = Turbo_Utilization(EV, 0) + except ZeroDivisionError: + print "Turbo_Utilization zero division" + self.val = 0 + +class Metric_Page_Walks_Use: + name = "Page_Walks_Use" + desc = """ +Fraction of cycles where the core's Page Walker is busy serving +iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = Page_Walks_Use(EV, 0) + except ZeroDivisionError: + print "Page_Walks_Use zero division" + 
self.val = 0 + +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + +class Metric_CLKS: + name = "CLKS" + desc = """ +Per-thread actual clocks""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = CLKS(EV, 0) + except ZeroDivisionError: + print "CLKS zero division" + self.val = 0 + +class Metric_CORE_CLKS: + name = "CORE_CLKS" + desc = """ +Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 def compute(self, EV): try: - self.val = TurboUtilization(EV, 0) + self.val = Time(EV, 0) except ZeroDivisionError: - print "TurboUtilization zero division" + print "Time zero division" self.val = 0 # Schedule @@ -878,183 +1312,218 @@ def compute(self, EV): class Setup: def __init__(self, r): o = dict() - n = FrontendBound() ; r.run(n) ; o["FrontendBound"] = n - n = FrontendLatency() ; r.run(n) ; o["FrontendLatency"] = n - n = ICacheMisses() ; r.run(n) ; o["ICacheMisses"] = n - n = ITLBmisses() ; r.run(n) ; o["ITLBmisses"] = n - n = BranchResteers() ; r.run(n) ; o["BranchResteers"] = n - n = DSBswitches() ; r.run(n) ; o["DSBswitches"] = n + n = Frontend_Bound() ; r.run(n) ; o["Frontend_Bound"] = n + n = Frontend_Latency() ; r.run(n) ; o["Frontend_Latency"] = n + n = ITLB_Misses() ; r.run(n) ; o["ITLB_Misses"] = n + n = DSB_Switches() ; r.run(n) ; o["DSB_Switches"] = n n = LCP() ; r.run(n) ; o["LCP"] = n - n = MSswitches() ; r.run(n) ; o["MSswitches"] = n - n = FrontendBandwidth() ; r.run(n) ; o["FrontendBandwidth"] = n + n = MS_Switches() ; r.run(n) ; o["MS_Switches"] = n + n = Frontend_Bandwidth() ; r.run(n) ; o["Frontend_Bandwidth"] = n n = MITE() ; r.run(n) ; o["MITE"] = n n = DSB() ; r.run(n) ; o["DSB"] = n - n = BadSpeculation() ; r.run(n) ; o["BadSpeculation"] = n - n = BranchMispredicts() ; r.run(n) ; o["BranchMispredicts"] = n - n = MachineClears() ; r.run(n) ; o["MachineClears"] = n + n = LSD() ; r.run(n) ; o["LSD"] = n + n = Bad_Speculation() ; r.run(n) ; o["Bad_Speculation"] = n + n = Branch_Mispredicts() ; r.run(n) ; o["Branch_Mispredicts"] = n + n = Machine_Clears() ; r.run(n) ; o["Machine_Clears"] = n n = Backend_Bound() ; r.run(n) ; o["Backend_Bound"] = n - n = MemoryBound() ; r.run(n) ; o["MemoryBound"] = n - n = L1Bound() ; r.run(n) ; o["L1Bound"] = n - n = DTLB_Overhead() ; r.run(n) ; o["DTLB_Overhead"] = n - n = LoadsBlockedbyStoreForwarding() ; r.run(n) ; o["LoadsBlockedbyStoreForwarding"] = n - n = SplitLoads() ; r.run(n) ; o["SplitLoads"] = n - n = G4KAliasing() ; r.run(n) ; o["G4KAliasing"] = n - n = L2Bound() ; r.run(n) ; o["L2Bound"] = n - n = L3Bound() ; r.run(n) ; o["L3Bound"] = n - n = ContestedAccesses() ; r.run(n) ; o["ContestedAccesses"] = n - n = DataSharing() ; r.run(n) ; o["DataSharing"] = n - n = L3Latency() ; r.run(n) ; o["L3Latency"] = n - n = DRAMBound() ; r.run(n) ; o["DRAMBound"] = n - n = MEMBandwidth() ; r.run(n) ; o["MEMBandwidth"] = n - n = MEMLatency() ; r.run(n) ; o["MEMLatency"] = n - n = StoresBound() ; r.run(n) ; o["StoresBound"] = n - n = FalseSharing() ; r.run(n) ; o["FalseSharing"] = n - n = SplitStores() ; r.run(n) ; o["SplitStores"] = n - n = 
DTLBStoreOverhead() ; r.run(n) ; o["DTLBStoreOverhead"] = n - n = CoreBound() ; r.run(n) ; o["CoreBound"] = n + n = Memory_Bound() ; r.run(n) ; o["Memory_Bound"] = n + n = L1_Bound() ; r.run(n) ; o["L1_Bound"] = n + n = DTLB_Load() ; r.run(n) ; o["DTLB_Load"] = n + n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n + n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n + n = L2_Bound() ; r.run(n) ; o["L2_Bound"] = n + n = L3_Bound() ; r.run(n) ; o["L3_Bound"] = n + n = Contested_Accesses() ; r.run(n) ; o["Contested_Accesses"] = n + n = Data_Sharing() ; r.run(n) ; o["Data_Sharing"] = n + n = L3_Latency() ; r.run(n) ; o["L3_Latency"] = n + n = SQ_Full() ; r.run(n) ; o["SQ_Full"] = n + n = MEM_Bound() ; r.run(n) ; o["MEM_Bound"] = n + n = MEM_Bandwidth() ; r.run(n) ; o["MEM_Bandwidth"] = n + n = MEM_Latency() ; r.run(n) ; o["MEM_Latency"] = n + n = Stores_Bound() ; r.run(n) ; o["Stores_Bound"] = n + n = False_Sharing() ; r.run(n) ; o["False_Sharing"] = n + n = Split_Stores() ; r.run(n) ; o["Split_Stores"] = n + n = DTLB_Store() ; r.run(n) ; o["DTLB_Store"] = n + n = Core_Bound() ; r.run(n) ; o["Core_Bound"] = n + n = Divider() ; r.run(n) ; o["Divider"] = n + n = Ports_Utilization() ; r.run(n) ; o["Ports_Utilization"] = n + n = G0_Ports_Utilized() ; r.run(n) ; o["G0_Ports_Utilized"] = n + n = G1_Port_Utilized() ; r.run(n) ; o["G1_Port_Utilized"] = n + n = G2_Ports_Utilized() ; r.run(n) ; o["G2_Ports_Utilized"] = n + n = G3m_Ports_Utilized() ; r.run(n) ; o["G3m_Ports_Utilized"] = n n = Retiring() ; r.run(n) ; o["Retiring"] = n - n = BASE() ; r.run(n) ; o["BASE"] = n - n = MicroSequencer() ; r.run(n) ; o["MicroSequencer"] = n + n = Base() ; r.run(n) ; o["Base"] = n + n = Microcode_Sequencer() ; r.run(n) ; o["Microcode_Sequencer"] = n # parents - o["FrontendLatency"].parent = o["FrontendBound"] - o["ICacheMisses"].parent = o["FrontendLatency"] - o["ITLBmisses"].parent = o["FrontendLatency"] - o["BranchResteers"].parent = o["FrontendLatency"] - o["DSBswitches"].parent = o["FrontendLatency"] - o["LCP"].parent = o["FrontendLatency"] - o["MSswitches"].parent = o["FrontendLatency"] - o["FrontendBandwidth"].parent = o["FrontendBound"] - o["MITE"].parent = o["FrontendBandwidth"] - o["DSB"].parent = o["FrontendBandwidth"] - o["BranchMispredicts"].parent = o["BadSpeculation"] - o["MachineClears"].parent = o["BadSpeculation"] - o["MemoryBound"].parent = o["Backend_Bound"] - o["L1Bound"].parent = o["MemoryBound"] - o["DTLB_Overhead"].parent = o["L1Bound"] - o["LoadsBlockedbyStoreForwarding"].parent = o["L1Bound"] - o["SplitLoads"].parent = o["L1Bound"] - o["G4KAliasing"].parent = o["L1Bound"] - o["L2Bound"].parent = o["MemoryBound"] - o["L3Bound"].parent = o["MemoryBound"] - o["ContestedAccesses"].parent = o["L3Bound"] - o["DataSharing"].parent = o["L3Bound"] - o["L3Latency"].parent = o["L3Bound"] - o["DRAMBound"].parent = o["MemoryBound"] - o["MEMBandwidth"].parent = o["DRAMBound"] - o["MEMLatency"].parent = o["DRAMBound"] - o["StoresBound"].parent = o["MemoryBound"] - o["FalseSharing"].parent = o["StoresBound"] - o["SplitStores"].parent = o["StoresBound"] - o["DTLBStoreOverhead"].parent = o["StoresBound"] - o["CoreBound"].parent = o["Backend_Bound"] - o["BASE"].parent = o["Retiring"] - o["MicroSequencer"].parent = o["Retiring"] + o["Frontend_Latency"].parent = o["Frontend_Bound"] + o["ITLB_Misses"].parent = o["Frontend_Latency"] + o["DSB_Switches"].parent = o["Frontend_Latency"] + o["LCP"].parent = o["Frontend_Latency"] + o["MS_Switches"].parent 
= o["Frontend_Latency"] + o["Frontend_Bandwidth"].parent = o["Frontend_Bound"] + o["MITE"].parent = o["Frontend_Bandwidth"] + o["DSB"].parent = o["Frontend_Bandwidth"] + o["LSD"].parent = o["Frontend_Bandwidth"] + o["Branch_Mispredicts"].parent = o["Bad_Speculation"] + o["Machine_Clears"].parent = o["Bad_Speculation"] + o["Memory_Bound"].parent = o["Backend_Bound"] + o["L1_Bound"].parent = o["Memory_Bound"] + o["DTLB_Load"].parent = o["L1_Bound"] + o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["Split_Loads"].parent = o["L1_Bound"] + o["G4K_Aliasing"].parent = o["L1_Bound"] + o["L2_Bound"].parent = o["Memory_Bound"] + o["L3_Bound"].parent = o["Memory_Bound"] + o["Contested_Accesses"].parent = o["L3_Bound"] + o["Data_Sharing"].parent = o["L3_Bound"] + o["L3_Latency"].parent = o["L3_Bound"] + o["SQ_Full"].parent = o["L3_Bound"] + o["MEM_Bound"].parent = o["Memory_Bound"] + o["MEM_Bandwidth"].parent = o["MEM_Bound"] + o["MEM_Latency"].parent = o["MEM_Bound"] + o["Stores_Bound"].parent = o["Memory_Bound"] + o["False_Sharing"].parent = o["Stores_Bound"] + o["Split_Stores"].parent = o["Stores_Bound"] + o["DTLB_Store"].parent = o["Stores_Bound"] + o["Core_Bound"].parent = o["Backend_Bound"] + o["Divider"].parent = o["Core_Bound"] + o["Ports_Utilization"].parent = o["Core_Bound"] + o["G0_Ports_Utilized"].parent = o["Ports_Utilization"] + o["G1_Port_Utilized"].parent = o["Ports_Utilization"] + o["G2_Ports_Utilized"].parent = o["Ports_Utilization"] + o["G3m_Ports_Utilized"].parent = o["Ports_Utilization"] + o["Base"].parent = o["Retiring"] + o["Microcode_Sequencer"].parent = o["Retiring"] # references between groups - o["FrontendBandwidth"].FrontendBound = o["FrontendBound"] - o["FrontendBandwidth"].FrontendLatency = o["FrontendLatency"] - o["BranchMispredicts"].BadSpeculation = o["BadSpeculation"] - o["MachineClears"].BadSpeculation = o["BadSpeculation"] - o["MachineClears"].BranchMispredicts = o["BranchMispredicts"] - o["Backend_Bound"].FrontendBound = o["FrontendBound"] - o["Backend_Bound"].BadSpeculation = o["BadSpeculation"] + o["Frontend_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["Frontend_Bandwidth"].Frontend_Latency = o["Frontend_Latency"] + o["Branch_Mispredicts"].Bad_Speculation = o["Bad_Speculation"] + o["Machine_Clears"].Bad_Speculation = o["Bad_Speculation"] + o["Machine_Clears"].Branch_Mispredicts = o["Branch_Mispredicts"] + o["Backend_Bound"].Frontend_Bound = o["Frontend_Bound"] + o["Backend_Bound"].Bad_Speculation = o["Bad_Speculation"] o["Backend_Bound"].Retiring = o["Retiring"] - o["L1Bound"].DTLB_Overhead = o["DTLB_Overhead"] - o["StoresBound"].MemoryBound = o["MemoryBound"] - o["CoreBound"].MemoryBound = o["MemoryBound"] - o["Retiring"].MicroSequencer = o["MicroSequencer"] - o["BASE"].Retiring = o["Retiring"] - o["BASE"].MicroSequencer = o["MicroSequencer"] + o["L1_Bound"].DTLB_Load = o["DTLB_Load"] + o["Stores_Bound"].Memory_Bound = o["Memory_Bound"] + o["Core_Bound"].Memory_Bound = o["Memory_Bound"] + o["Ports_Utilization"].Core_Bound = o["Core_Bound"] + o["Ports_Utilization"].Divider = o["Divider"] + o["Retiring"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Base"].Retiring = o["Retiring"] + o["Base"].Microcode_Sequencer = o["Microcode_Sequencer"] # siblings cross-tree - o["FrontendBound"].sibling = None - o["FrontendLatency"].sibling = None - o["ICacheMisses"].sibling = None - o["ITLBmisses"].sibling = None - o["BranchResteers"].sibling = o["BadSpeculation"] - o["DSBswitches"].sibling = None + o["Frontend_Bound"].sibling = None + o["Frontend_Latency"].sibling 
= None + o["ITLB_Misses"].sibling = None + o["DSB_Switches"].sibling = None o["LCP"].sibling = None - o["MSswitches"].sibling = o["MicroSequencer"] - o["FrontendBandwidth"].sibling = None + o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["Frontend_Bandwidth"].sibling = None o["MITE"].sibling = None o["DSB"].sibling = None - o["BadSpeculation"].sibling = o["BranchResteers"] - o["BranchMispredicts"].sibling = None - o["MachineClears"].sibling = None + o["LSD"].sibling = None + o["Bad_Speculation"].sibling = None + o["Branch_Mispredicts"].sibling = None + o["Machine_Clears"].sibling = None o["Backend_Bound"].sibling = None - o["MemoryBound"].sibling = None - o["L1Bound"].sibling = None - o["DTLB_Overhead"].sibling = None - o["LoadsBlockedbyStoreForwarding"].sibling = None - o["SplitLoads"].sibling = None - o["G4KAliasing"].sibling = None - o["L2Bound"].sibling = None - o["L3Bound"].sibling = None - o["ContestedAccesses"].sibling = None - o["DataSharing"].sibling = None - o["L3Latency"].sibling = None - o["DRAMBound"].sibling = None - o["MEMBandwidth"].sibling = None - o["MEMLatency"].sibling = None - o["StoresBound"].sibling = None - o["FalseSharing"].sibling = None - o["SplitStores"].sibling = None - o["DTLBStoreOverhead"].sibling = None - o["CoreBound"].sibling = None + o["Memory_Bound"].sibling = None + o["L1_Bound"].sibling = o["G1_Port_Utilized"] + o["DTLB_Load"].sibling = None + o["Store_Fwd_Blk"].sibling = None + o["Split_Loads"].sibling = None + o["G4K_Aliasing"].sibling = None + o["L2_Bound"].sibling = None + o["L3_Bound"].sibling = None + o["Contested_Accesses"].sibling = None + o["Data_Sharing"].sibling = None + o["L3_Latency"].sibling = None + o["SQ_Full"].sibling = None + o["MEM_Bound"].sibling = None + o["MEM_Bandwidth"].sibling = None + o["MEM_Latency"].sibling = None + o["Stores_Bound"].sibling = None + o["False_Sharing"].sibling = None + o["Split_Stores"].sibling = None + o["DTLB_Store"].sibling = None + o["Core_Bound"].sibling = None + o["Divider"].sibling = None + o["Ports_Utilization"].sibling = None + o["G0_Ports_Utilized"].sibling = None + o["G1_Port_Utilized"].sibling = o["L1_Bound"] + o["G2_Ports_Utilized"].sibling = None + o["G3m_Ports_Utilized"].sibling = None o["Retiring"].sibling = None - o["BASE"].sibling = None - o["MicroSequencer"].sibling = o["MSswitches"] + o["Base"].sibling = None + o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events (experimential) + # sampling events - o["FrontendBound"].sample = [] - o["FrontendLatency"].sample = [] - o["ICacheMisses"].sample = [] - o["ITLBmisses"].sample = [] - o["BranchResteers"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES_PS'] - o["DSBswitches"].sample = [] + o["Frontend_Bound"].sample = [] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["DSB_Switches"].sample = [] o["LCP"].sample = [] - o["MSswitches"].sample = [] - o["FrontendBandwidth"].sample = [] + o["MS_Switches"].sample = [] + o["Frontend_Bandwidth"].sample = [] o["MITE"].sample = [] o["DSB"].sample = [] - o["BadSpeculation"].sample = [] - o["BranchMispredicts"].sample = [] - o["MachineClears"].sample = [] + o["LSD"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] + o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] o["Backend_Bound"].sample = [] - o["MemoryBound"].sample = [] - o["L1Bound"].sample = [] - o["DTLB_Overhead"].sample = [] - 
o["LoadsBlockedbyStoreForwarding"].sample = [] - o["SplitLoads"].sample = [] - o["G4KAliasing"].sample = [] - o["L2Bound"].sample = [] - o["L3Bound"].sample = [] - o["ContestedAccesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS_PS'] - o["DataSharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT_PS'] - o["L3Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT_PS'] - o["DRAMBound"].sample = [] - o["MEMBandwidth"].sample = [] - o["MEMLatency"].sample = [] - o["StoresBound"].sample = [] - o["FalseSharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS'] - o["SplitStores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES_PS', 'MEM_UOPS_RETIRED.ALL_STORES_PS'] - o["DTLBStoreOverhead"].sample = [] - o["CoreBound"].sample = [] + o["Memory_Bound"].sample = [] + o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] + o["Store_Fwd_Blk"].sample = [] + o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] + o["G4K_Aliasing"].sample = [] + o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] + o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp'] + o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + o["SQ_Full"].sample = [] + o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] + o["MEM_Bandwidth"].sample = [] + o["MEM_Latency"].sample = [] + o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + o["False_Sharing"].sample = [' MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE:request=DEMAND_RFO:response=L3_HIT.SNOOP_HITM'] + o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] + o["Core_Bound"].sample = [] + o["Divider"].sample = ['ARITH.DIVIDER_UOPS'] + o["Ports_Utilization"].sample = [] + o["G0_Ports_Utilized"].sample = [] + o["G1_Port_Utilized"].sample = [] + o["G2_Ports_Utilized"].sample = [] + o["G3m_Ports_Utilized"].sample = [] o["Retiring"].sample = [] - o["BASE"].sample = [] - o["MicroSequencer"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] # user visible metrics n = Metric_IPC() ; r.metric(n) + n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) n = Metric_UPI() ; r.metric(n) - n = Metric_InstPerTakenBranch() ; r.metric(n) - n = Metric_DSBCoverage() ; r.metric(n) + n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) + n = Metric_DSB_Coverage() ; r.metric(n) + n = Metric_ILP() ; r.metric(n) n = Metric_MLP() ; r.metric(n) - n = Metric_L1dMissLatency() ; r.metric(n) - n = Metric_TurboUtilization() ; r.metric(n) + n = Metric_Load_Miss_Real_Latency() ; r.metric(n) + n = Metric_Turbo_Utilization() ; r.metric(n) + n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) + n = Metric_CLKS() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/ivb_client_ratios.py b/ivb_client_ratios.py index ce4ceeb5..5a868462 100644 --- a/ivb_client_ratios.py +++ b/ivb_client_ratios.py @@ -1,14 +1,19 @@ # -# auto generated TopDown description for Ivy Bridge +# auto generated TopDown 2.9 description for Intel 3rd gen 
Core (code named IvyBridge) # Please see http://ark.intel.com for more details on these CPUs. # +# References: +# http://halobates.de/blog/p/262 +# https://sites.google.com/site/analysismethods/yasin-pubs +# smt_enabled = False # Constants Pipeline_Width = 4 +L2_Store_Latency = 9 Mem_L3_Weight = 7 Mem_STLB_Hit_Cost = 7 Mem_SFB_Cost = 13 @@ -16,89 +21,186 @@ Mem_XSNP_HitM_Cost = 60 MEM_XSNP_Hit_Cost = 43 MEM_XSNP_None_Cost = 29 +Mem_Local_DRAM_Cost = 200 +Mem_Remote_DRAM_Cost = 310 +Mem_Remote_HitM_Cost = 200 +Mem_Remote_Fwd_Cost = 180 MS_Switches_Cost = 3 OneMillion = 1000000 +Energy_Unit = 15.6 # Aux. formulas + # Floating Point Operations Count def FLOP_Count(EV, level): - return ( 1 *(EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level))+ 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level))+ 8 * EV("SIMD_FP_256.PACKED_SINGLE", level) ) + return (1 *(EV("FP_COMP_OPS_EXE.X87", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level)) + 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level)) + 8 * EV("SIMD_FP_256.PACKED_SINGLE", level)) + def Recovery_Cycles(EV, level): - return ( EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2)if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + def Execute_Cycles(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c1", level) / 2)if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + def L1D_Miss_Cycles(EV, level): - return ( EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2)if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + def ITLB_Miss_Cycles(EV, level): - return ( Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level) ) + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + def Cycles_0_Ports_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:i1", level))/ 2 if smt_enabled else(STALLS_TOTAL(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ARITH.FPU_DIV_ACTIVE", level) ) + EV("ARITH.FPU_DIV_ACTIVE", level) + EV("UOPS_EXECUTED.CORE:i1:c1", level) + EV("RS_EVENTS.EMPTY_CYCLES", level) + return (EV("UOPS_EXECUTED.CORE:i1:c1", level)) / 2 if smt_enabled else(STALLS_TOTAL(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ARITH.FPU_DIV_ACTIVE", level)) + def Cycles_1_Port_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level))/ 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", 
level) - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) ) + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level)) + def Cycles_2_Ports_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level))/ 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) ) + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c3", level) + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level)) + def Cycles_3m_Ports_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c3", level) / 2)if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CORE:c3", level) + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c3", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + def STALLS_MEM_ANY(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", level)) , level ) + def STALLS_TOTAL(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level)) , level ) + def ORO_Demand_DRD_C1(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + def ORO_Demand_DRD_C6(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def ORO_Demand_RFO_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Cycles_False_Sharing_Client(EV, level): + return Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", level) + EV("OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE", level)) + def Few_Uops_Executed_Threshold(EV, level): - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) - return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level) > 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level)> 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + def Backend_Bound_At_EXE(EV, level): - return ( 
STALLS_TOTAL(EV, level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level))/ CLKS(EV, level) + return (STALLS_TOTAL(EV, level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level)) / CLKS(EV, level) + def Mem_L3_Hit_Fraction(EV, level): - return EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level) ) + return EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level)) + +def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + def Mispred_Clears_Fraction(EV, level): - return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level) ) + return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) + def Avg_RS_Empty_Period_Clears(EV, level): - return ( EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ICACHE.IFETCH_STALL", level))/ EV("RS_EVENTS.EMPTY_END", level) + return (EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ICACHE.IFETCH_STALL", level)) / EV("RS_EVENTS.EMPTY_END", level) + def Retire_Uop_Fraction(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + def SLOTS(EV, level): - return Pipeline_Width * CLKS1(EV, level) -# Instructions Per Cycle (per physical core) + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) def IPC(EV, level): - return EV("INST_RETIRED.ANY", level) / CLKS1(EV, level) + return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) def CPI(EV, level): return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + # Uops Per Instruction def UPI(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + # Instruction per taken branch def IPTB(EV, level): return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. 
+def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + # Fraction of Uops delivered by the DSB (decoded instructions cache) def DSB_Coverage(EV, level): - return ( EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level))/(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level) ) + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + # Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed) def ILP(EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(EV, level) + # Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) def MLP(EV, level): return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + # Actual Average Latency for L1 data-cache miss demand loads def Load_Miss_Real_Latency(EV, level): - return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level) ) + return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level)) + +# Giga Floating Point Operations Per Second +def GFLOPs(EV, level): + return FLOP_Count(EV, level) / OneMillion / DurationTimeInSeconds(EV, level) / 1000 + # Average Frequency Utilization relative nominal frequency def Turbo_Utilization(EV, level): return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) + # Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store def Page_Walks_Use(EV, level): - return ( EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level))/ CLKS1(EV, level) + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + # Per-thread actual clocks def CLKS(EV, level): return EV("CPU_CLK_UNHALTED.THREAD", level) + # Core actual clocks -def CLKS1(EV, level): - return ( EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2)if smt_enabled else CLKS(EV, level) +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, level): + return DurationTimeInSeconds(EV, level) # Event groups @@ -117,9 +219,10 @@ class Frontend_Bound: latter can accept them. For example, stalls due to instruction-cache misses would be categorized under Frontend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1)/ SLOTS(EV, 1 ) + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Frontend_Bound zero division" @@ -137,9 +240,10 @@ class Frontend_Latency: after a branch misprediction are categorized under Frontend Latency. 
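The helpers above that end in an smt_enabled conditional (CORE_CLKS, Recovery_Cycles, Execute_Cycles, SQ_Full_Cycles and the cycles-N-ports helpers) all follow one pattern: with both hardware threads active, an any-thread (:amt1) or core-scope count is halved to approximate one physical core, otherwise the plain per-thread count is used. A minimal standalone sketch of that switch, with a toy event table and invented values rather than real perf output:

# Toy illustration of the smt_enabled switch used by CORE_CLKS and friends.
# 'events' stands in for real counter values; the numbers are invented.
events = {
    "CPU_CLK_UNHALTED.THREAD": 1000.0,       # clocks seen by this logical thread
    "CPU_CLK_UNHALTED.THREAD:amt1": 2200.0,  # any-thread flavour of the same clock
}

def core_clks(ev, smt_enabled):
    # Mirrors CORE_CLKS above: halve the any-thread count when SMT is on,
    # otherwise the per-thread clock already stands in for core clocks.
    if smt_enabled:
        return ev["CPU_CLK_UNHALTED.THREAD:amt1"] / 2
    return ev["CPU_CLK_UNHALTED.THREAD"]

print("SMT off: %.1f core clocks" % core_clks(events, smt_enabled=False))  # 1000.0
print("SMT on:  %.1f core clocks" % core_clks(events, smt_enabled=True))   # 1100.0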
In such cases the Frontend eventually delivers no uops for some period.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2)/ SLOTS(EV, 2 ) + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: #print "Frontend_Latency zero division" @@ -156,9 +260,10 @@ class ICache_Misses: cache misses. Using compiler's Profile-Guided Optimization (PGO) can reduce i-cache misses through improved hot code layout.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ICACHE.IFETCH_STALL", 3)/ CLKS(EV, 3)- self.ITLB_Misses.compute(EV ) + self.val = EV("ICACHE.IFETCH_STALL", 3) / CLKS(EV, 3) - self.ITLB_Misses.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "ICache_Misses zero division" @@ -174,9 +279,10 @@ class ITLB_Misses: This metric represents cycles fraction CPU was stalled due to instruction TLB misses. Using large code pages may be considered here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ITLB_Miss_Cycles(EV, 3)/ CLKS(EV, 3 ) + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "ITLB_Misses zero division" @@ -195,9 +301,10 @@ class Branch_Resteers: For example, branchy code with lots of (taken) branches and/or branch miss- predictions might get categorized under Branch Resteers.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = Avg_RS_Empty_Period_Clears(EV, 3)*(EV("BR_MISP_RETIRED.ALL_BRANCHES", 3)+ EV("MACHINE_CLEARS.COUNT", 3)+ EV("BACLEARS.ANY", 3)) / CLKS(EV, 3 ) + self.val = Avg_RS_Empty_Period_Clears(EV, 3)*(EV("BR_MISP_RETIRED.ALL_BRANCHES", 3) + EV("MACHINE_CLEARS.COUNT", 3) + EV("BACLEARS.ANY", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "Branch_Resteers zero division" @@ -213,9 +320,10 @@ class DSB_Switches: This metric represents cycles fraction CPU was stalled due to switches from DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3)/ CLKS(EV, 3 ) + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "DSB_Switches zero division" @@ -232,9 +340,10 @@ class LCP: Prefixes (LCPs). 
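Every node class in these generated files follows the same compute() shape that ICache_Misses, ITLB_Misses and DSB_Switches show above: turn a handful of events into a fraction of clocks or slots, guard the division, and raise thresh only when the node's own value passes its cutoff and its parent already triggered. A compressed standalone sketch of that pattern (toy parent and invented counts, not code from the patch):

# Minimal stand-in for the node pattern used throughout these files:
# value = events / clocks, threshold gated on the parent's threshold.
class ToyParent:
    thresh = True              # pretend the parent already crossed its cutoff

class ToyDSBSwitches:
    level = 3
    parent = ToyParent()
    def compute(self, ev):
        try:
            # same shape as DSB_Switches above: penalty cycles over thread clocks
            self.val = ev["DSB2MITE_SWITCHES.PENALTY_CYCLES"] / ev["CPU_CLK_UNHALTED.THREAD"]
            self.thresh = (self.val > 0.05) and self.parent.thresh
        except ZeroDivisionError:
            self.val = 0
            self.thresh = False
        return self.val

node = ToyDSBSwitches()
node.compute({"DSB2MITE_SWITCHES.PENALTY_CYCLES": 80.0, "CPU_CLK_UNHALTED.THREAD": 1000.0})
print("DSB_Switches %.2f thresh=%s" % (node.val, node.thresh))  # 0.08, flagged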
Using proper compiler flags or Intel Compiler by default will certainly avoid this.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ILD_STALL.LCP", 3)/ CLKS(EV, 3 ) + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "LCP zero division" @@ -253,9 +362,10 @@ class MS_Switches: deliver long uop flows required by CISC instructions like CPUID, or uncommon conditions like Floating Point Assists when dealing with Denormals.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3)/ CLKS(EV, 3 ) + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "MS_Switches zero division" @@ -274,9 +384,10 @@ class Frontend_Bandwidth: under Frontend Bandwidth. In such cases, the Frontend typically delivers non- optimal amount of uops to the Backend.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Frontend_Bound.compute(EV)- self.Frontend_Latency.compute(EV ) + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: #print "Frontend_Bandwidth zero division" @@ -286,16 +397,17 @@ def compute(self, EV): class MITE: name = "MITE" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -the MITE fetch pipeline. For example, inefficiencies in the instruction -decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MITE zero division" @@ -305,17 +417,18 @@ def compute(self, EV): class DSB: name = "DSB" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -DSB (decoded uop cache) fetch pipeline. For example, inefficient utilization -of the DSB cache structure or bank conflict when reading from it, are -categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: #print "DSB zero division" @@ -325,18 +438,19 @@ def compute(self, EV): class LSD: name = "LSD" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -LSD (Loop Stream Detector) unit. 
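MITE, DSB and LSD all use the same "active but not full" measure shown above: cycles in which the unit delivered any uops, minus cycles in which it delivered the full 4 uops, normalized to core clocks. A short standalone illustration with invented counts:

# Invented counts for one interval; mirrors the MITE/DSB/LSD formulas above.
core_clks = 1000.0
dsb_any_uops_cycles = 600.0   # IDQ.ALL_DSB_CYCLES_ANY_UOPS
dsb_4_uops_cycles   = 450.0   # IDQ.ALL_DSB_CYCLES_4_UOPS

# Cycles where the DSB delivered something, but fewer than the 4-wide maximum:
dsb_bandwidth_loss = (dsb_any_uops_cycles - dsb_4_uops_cycles) / core_clks
print("DSB below full width on %.2f of core clocks" % dsb_bandwidth_loss)
# 0.15 -- the DSB node above only flags this once it exceeds 0.3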
LSD typically does well sustaining Uop -supply. However, in some rare cases, optimal uop-delivery could not be reached -for small loops whose size (in terms of number of uops) does not suit well the -LSD structure.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("LSD.CYCLES_ACTIVE", 3)- EV("LSD.CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "LSD zero division" @@ -355,9 +469,10 @@ class Bad_Speculation: speculation. For example, wasted work due to miss-predicted branches are categorized under Bad Speculation category""" level = 1 + htoff = False def compute(self, EV): try: - self.val = ( EV("UOPS_ISSUED.ANY", 1)- EV("UOPS_RETIRED.RETIRE_SLOTS", 1)+ Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: #print "Bad_Speculation zero division" @@ -375,6 +490,7 @@ class Branch_Mispredicts: incorrectly speculated program path, or stalls the Backend of the machine needs to recover its state from a speculative path.""" level = 2 + htoff = False def compute(self, EV): try: self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) @@ -396,9 +512,10 @@ class Machine_Clears: example, this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Bad_Speculation.compute(EV)- self.Branch_Mispredicts.compute(EV ) + self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "Machine_Clears zero division" @@ -419,9 +536,10 @@ class Backend_Bound: stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = 1 -(self.Frontend_Bound.compute(EV)+ self.Bad_Speculation.compute(EV)+ self.Retiring.compute(EV)) + self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Backend_Bound zero division" @@ -440,9 +558,10 @@ class Memory_Bound: memory demand loads which coincides with execution starvation. 
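Backend_Bound above is not measured directly; it is the remainder of the level-1 identity, so Frontend_Bound, Bad_Speculation, Retiring and Backend_Bound always sum to 1 over the issue slots. A standalone worked example with invented counter values (single-thread case, so core clocks equal thread clocks):

# Invented counts illustrating the level-1 Top-Down identity used above.
PIPELINE_WIDTH = 4

clks = 1000.0                     # CPU_CLK_UNHALTED.THREAD
slots = PIPELINE_WIDTH * clks     # SLOTS: 4 issue slots per cycle
fe_undelivered = 600.0            # IDQ_UOPS_NOT_DELIVERED.CORE
uops_issued = 3000.0              # UOPS_ISSUED.ANY
uops_retired = 2600.0             # UOPS_RETIRED.RETIRE_SLOTS
recovery_cycles = 50.0            # INT_MISC.RECOVERY_CYCLES

frontend_bound = fe_undelivered / slots
bad_speculation = (uops_issued - uops_retired +
                   PIPELINE_WIDTH * recovery_cycles) / slots
retiring = uops_retired / slots
backend_bound = 1 - (frontend_bound + bad_speculation + retiring)

print("FE %.2f  BadSpec %.2f  RET %.2f  BE %.2f" %
      (frontend_bound, bad_speculation, retiring, backend_bound))
# FE 0.15  BadSpec 0.15  RET 0.65  BE 0.05 -- the four sum to 1 by construction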
in addition to less common cases where stores could imply backpressure on the pipeline.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = ( STALLS_MEM_ANY(EV, 2)+ EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) + self.val = (STALLS_MEM_ANY(EV, 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Memory_Bound zero division" @@ -462,9 +581,10 @@ class L1_Bound: allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event as it accounts for any non-completed load.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( STALLS_MEM_ANY(EV, 3)- EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) + self.val = (STALLS_MEM_ANY(EV, 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh except ZeroDivisionError: #print "L1_Bound zero division" @@ -478,9 +598,10 @@ class DTLB_Load: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4)+ EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "DTLB_Load zero division" @@ -494,9 +615,10 @@ class Store_Fwd_Blk: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4)/ CLKS(EV, 4 ) + self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Store_Fwd_Blk zero division" @@ -504,15 +626,36 @@ def compute(self, EV): self.thresh = False return self.val +class Lock_Latency: + name = "Lock_Latency" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents cycles fraction the CPU spent handling cache misses due +to lock operations. Due to the microarchitecture handling of locks, they are +classified as L1_Bound regardless of what memory source satsified them.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Mem_Lock_St_Fraction(EV, 4)* ORO_Demand_RFO_C1(EV, 4) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Lock_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + class Split_Loads: name = "Split_Loads" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Load_Miss_Real_Latency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4)/ CLKS(EV, 4 ) + self.val = 13 * EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Split_Loads zero division" @@ -526,9 +669,10 @@ class G4K_Aliasing: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4)/ CLKS(EV, 4 ) + self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "G4K_Aliasing zero division" @@ -545,9 +689,10 @@ class L2_Bound: misses (i.e. 
L1 misses/L2 hits) will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)- EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) + self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.03) and self.parent.thresh except ZeroDivisionError: #print "L2_Bound zero division" @@ -564,9 +709,10 @@ class L3_Bound: a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "L3_Bound zero division" @@ -580,9 +726,10 @@ class Contested_Accesses: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4)+ EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) + self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Contested_Accesses zero division" @@ -596,9 +743,10 @@ class Data_Sharing: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Data_Sharing zero division" @@ -616,9 +764,10 @@ class L3_Latency: with a sibling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve the latency and increase performance.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "L3_Latency zero division" @@ -626,6 +775,26 @@ def compute(self, EV): self.thresh = False return self.val +class SQ_Full: + name = "SQ_Full" + domain = "CoreClocks" + area = "BE/Mem" + desc = """ +This metric measures fraction of cycles where the Super Queue (SQ) was full +taking into account all request-types and both hardware SMT threads. The Super +Queue is used for requests to access the L2 cache or to go out to the Uncore.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "SQ_Full zero division" + self.val = 0 + self.thresh = False + return self.val + class MEM_Bound: name = "MEM_Bound" domain = "Clocks" @@ -634,9 +803,10 @@ class MEM_Bound: This metric represents how often CPU was stalled on main memory (DRAM). 
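L3_Bound above and the MEM_Bound node whose description starts here split the same CYCLE_ACTIVITY.STALLS_L2_PENDING cycles using Mem_L3_Hit_Fraction, which weights each L3 miss Mem_L3_Weight (7) times heavier than an L3 hit, presumably to reflect the much larger cost of going out to DRAM. A standalone sketch of the split with invented counts:

# Invented counts; mirrors Mem_L3_Hit_Fraction, L3_Bound and MEM_Bound above.
MEM_L3_WEIGHT = 7

clks = 1000.0
stalls_l2_pending = 300.0   # CYCLE_ACTIVITY.STALLS_L2_PENDING
l3_hits = 90.0              # MEM_LOAD_UOPS_RETIRED.LLC_HIT
l3_misses = 10.0            # MEM_LOAD_UOPS_RETIRED.LLC_MISS

l3_hit_fraction = l3_hits / (l3_hits + MEM_L3_WEIGHT * l3_misses)
l3_bound = l3_hit_fraction * stalls_l2_pending / clks
mem_bound = (1 - l3_hit_fraction) * stalls_l2_pending / clks

print("L3 hit fraction %.4f" % l3_hit_fraction)                 # 0.5625
print("L3_Bound %.5f  MEM_Bound %.5f" % (l3_bound, mem_bound))
# the two pieces add back up to stalls_l2_pending / clks == 0.3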
Caching will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( 1 - Mem_L3_Hit_Fraction(EV, 3)) * EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Bound zero division" @@ -653,9 +823,10 @@ class MEM_Bandwidth: bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ORO_Demand_DRD_C6(EV, 4)/ CLKS(EV, 4 ) + self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Bandwidth zero division" @@ -672,9 +843,10 @@ class MEM_Latency: main memory (DRAM). Data layout re-structuring or using Software Prefetches (also through the compiler) may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( ORO_Demand_DRD_C1(EV, 4)- ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) + self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Latency zero division" @@ -687,12 +859,15 @@ class Stores_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents how often CPU was stalled due to store operations. +even though memory store accesses do not typically stall out-of-order CPUs; +there are few cases where stores can lead to actual stalls. This metric will +be flagged should any of these cases be a bottleneck.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.Memory_Bound.compute(EV)- STALLS_MEM_ANY(EV, 3)/ CLKS(EV, 3 ) + self.val = self.Memory_Bound.compute(EV) - STALLS_MEM_ANY(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Stores_Bound zero division" @@ -700,17 +875,39 @@ def compute(self, EV): self.thresh = False return self.val +class Store_Latency: + name = "Store_Latency" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents cycles fraction the CPU spent handling long-latency +store misses (missing 2nd level cache).""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (Store_L2_Hit_Cycles(EV, 4) +(1 - Mem_Lock_St_Fraction(EV, 4))* ORO_Demand_RFO_C1(EV, 4)) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Store_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + class False_Sharing: name = "False_Sharing" domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents how often CPU was stalled due to False Sharing. False +Sharing is a multithreading hiccup, where multiple threads contend on +different data-elements mapped into the same cache line. 
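MEM_Bandwidth and MEM_Latency above partition the cycles that have demand data reads outstanding by occupancy: the :c6 flavour counts cycles with at least six reads in flight (used here as the bandwidth-pressure signal), and cycles with at least one but fewer than six in flight are charged to latency. A standalone sketch with invented counts (both helpers are already capped at CLKS by the min() wrappers above):

# Invented counts; mirrors the MEM_Bandwidth / MEM_Latency split above.
clks = 1000.0
oro_drd_ge1 = 400.0   # cycles with >= 1 outstanding demand data read
oro_drd_ge6 = 150.0   # cycles with >= 6 outstanding demand data reads (:c6)

mem_bandwidth = oro_drd_ge6 / clks                  # heavily loaded memory cycles
mem_latency = (oro_drd_ge1 - oro_drd_ge6) / clks    # waiting, but not saturated

print("MEM_Bandwidth %.2f  MEM_Latency %.2f" % (mem_bandwidth, mem_latency))
# 0.15 and 0.25 of thread clocks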
It can be easily +avoided by padding to make threads access different lines.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4)+ EV("OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_Other_CORE", 4)) / CLKS(EV, 4 ) + self.val = Cycles_False_Sharing_Client(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "False_Sharing zero division" @@ -720,16 +917,17 @@ def compute(self, EV): class Split_Stores: name = "Split_Stores" - domain = "Stores" + domain = "CoreClocks" area = "BE/Mem" desc = """ This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4)/ EV("MEM_UOPS_RETIRED.ALL_STORES", 4 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Split_Stores zero division" self.val = 0 @@ -748,9 +946,10 @@ class DTLB_Store: the same page. Try using larger page sizes for large amounts of frequently- used data.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4)+ EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "DTLB_Store zero division" @@ -771,9 +970,10 @@ class Core_Bound: performance (e.g. FP-chained long-latency arithmetic operations). 
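DTLB_Load earlier and DTLB_Store above share one cost model: every second-level-TLB hit is charged a flat Mem_STLB_Hit_Cost (7 cycles in this file), and full page walks contribute their measured walk duration directly, all relative to thread clocks. A standalone sketch with invented counts:

# Invented counts; mirrors the DTLB_Load / DTLB_Store cost model above.
MEM_STLB_HIT_COST = 7

clks = 1000.0
stlb_hits = 20.0        # DTLB_*_MISSES.STLB_HIT: missed the first-level TLB, hit the STLB
walk_duration = 60.0    # DTLB_*_MISSES.WALK_DURATION: cycles spent in page walks

dtlb_cost = (MEM_STLB_HIT_COST * stlb_hits + walk_duration) / clks
print("estimated DTLB overhead: %.2f of clocks" % dtlb_cost)   # 0.20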
Tip: consider Port Saturation analysis as next step.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Backend_Bound_At_EXE(EV, 2)- self.Memory_Bound.compute(EV ) + self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Core_Bound zero division" @@ -783,13 +983,14 @@ def compute(self, EV): class Divider: name = "Divider" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = "" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ARITH.FPU_DIV_ACTIVE", 3)/ CLKS1(EV, 3 ) + self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Divider zero division" @@ -810,9 +1011,10 @@ class Ports_Utilization: options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.Core_Bound.compute(EV)- self.Divider.compute(EV ) + self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Ports_Utilization zero division" @@ -822,15 +1024,16 @@ def compute(self, EV): class G0_Ports_Utilized: name = "0_Ports_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed no uops on any execution -port.""" +This metric represents Core cycles fraction CPU executed no uops on any +execution port.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_0_Ports_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G0_Ports_Utilized zero division" @@ -840,20 +1043,22 @@ def compute(self, EV): class G1_Port_Utilized: name = "1_Port_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 1 uop per cycle -on all execution ports. This can be due to heavy data-dependency among -instructions. In some cases with high 1_Port_Utilized and L1_Bound it can -point to L1 data-cache latency bottleneck that may not necessarily manifest -with complete execution starvation (due to the short L1 latency e.g. walking -linked list) - looking at the assembly can be helpful. Tip: consider 'Core -Port Saturation' analysis-type as next step.""" +This metric represents Core cycles fraction where the CPU executed total of 1 +uop per cycle on all execution ports. This can be due to heavy data-dependency +among software instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and L1_Bound, this +metric can point to L1 data-cache latency bottleneck that may not necessarily +manifest with complete execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be helpful. 
Tip: consider +'Core Ports Saturation' analysis-type as next step.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_1_Port_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G1_Port_Utilized zero division" @@ -863,18 +1068,19 @@ def compute(self, EV): class G2_Ports_Utilized: name = "2_Ports_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 2 uops per cycle -on all execution ports. Tip: consider 'Core Port Saturation' analysis-type as -next step. Loop Vectorization -most compilers feature auto-Vectorization -options today- reduces pressure on the execution ports as multiple elements -are calculated with same uop.""" +This metric represents Core cycles fraction CPU executed total of 2 uops per +cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- +type as next step. Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the execution ports as +multiple elements are calculated with same uop.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_2_Ports_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G2_Ports_Utilized zero division" @@ -884,16 +1090,17 @@ def compute(self, EV): class G3m_Ports_Utilized: name = "3m_Ports_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 3 or more uops -per cycle on all execution ports. Tip: consider 'Core Port Saturation' +This metric represents Core cycles fraction CPU executed total of 3 or more +uops per cycle on all execution ports. 
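The 1/2/3m ports-utilized nodes above, together with the Cycles_*_Ports_Utilized helpers defined earlier, partition execution cycles from cumulative cmask counts: CYCLES_GE_N counts cycles with at least N uops executed, so differences of adjacent thresholds give cycles with exactly N. A standalone sketch of the non-SMT branch with invented counts:

# Invented cumulative counts; mirrors the non-SMT formulas used above.
core_clks = 1000.0
ge1 = 700.0   # UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC
ge2 = 500.0   # UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC
ge3 = 300.0   # UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC

exactly_1 = (ge1 - ge2) / core_clks   # Cycles_1_Port_Utilized / CORE_CLKS
exactly_2 = (ge2 - ge3) / core_clks   # Cycles_2_Ports_Utilized / CORE_CLKS
three_plus = ge3 / core_clks          # Cycles_3m_Ports_Utilized / CORE_CLKS

print("1 port %.1f  2 ports %.1f  3+ ports %.1f" % (exactly_1, exactly_2, three_plus))
# 0.2, 0.2 and 0.3 of core clocks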
Tip: consider 'Core Port Saturation' analysis-type as next step""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_3m_Ports_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G3m_Ports_Utilized zero division" @@ -903,16 +1110,17 @@ def compute(self, EV): class Port_0: name = "Port_0" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution -port 0 (ALU)""" +port 0 (SNB+: ALU; HSW+:ALU and 2nd branch)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_0 zero division" self.val = 0 @@ -921,16 +1129,17 @@ def compute(self, EV): class Port_1: name = "Port_1" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 1 (ALU)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_1 zero division" self.val = 0 @@ -939,16 +1148,17 @@ def compute(self, EV): class Port_2: name = "Port_2" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 2 (Loads and Store-address)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_2 zero division" self.val = 0 @@ -957,16 +1167,17 @@ def compute(self, EV): class Port_3: name = "Port_3" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 3 (Loads and Store-address)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_3 zero division" self.val = 0 @@ -975,16 +1186,17 @@ def compute(self, EV): class Port_4: name = "Port_4" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 4 (Store-data)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_4 zero division" self.val = 0 @@ -993,16 +1205,17 @@ def compute(self, EV): class Port_5: name = "Port_5" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This 
metric represents Core cycles fraction CPU dispatched uops on execution -port 5 (Branches and ALU)""" +port 5 (SNB+: Branches and ALU; HSW+: ALU)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_5 zero division" self.val = 0 @@ -1026,9 +1239,10 @@ class Retiring: computations be done without significantly increasing number of instructions thus improving the performance.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1)/ SLOTS(EV, 1 ) + self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: #print "Retiring zero division" @@ -1041,16 +1255,18 @@ class Base: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops not originated -from the microcode-sequencer. This correlates with total number of +This metric represents slots fraction where the CPU was retiring uops not +originated from the microcode-sequencer. This correlates with total number of instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. A high Retiring value for non-vectorized code is typically a good -hint for programmer to pursue vectorizing his code, which can reduce -instructions hence this bucket.""" +expected. While this is the most desirable of the top 4 categories, high +values may still indicate areas for improvement. If possible focus on +techniques that reduce instruction count or result in more efficient +instructions generation such as vectorization.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Retiring.compute(EV)- self.Microcode_Sequencer.compute(EV ) + self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: #print "Base zero division" @@ -1058,18 +1274,38 @@ def compute(self, EV): self.thresh = False return self.val +class FP_Arith: + name = "FP_Arith" + domain = "Uops" + area = "RET" + desc = """ +This metric represents overall arithmetic floating-point (FP) uops fraction +the CPU has executed.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = self.FP_x87.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "FP_Arith zero division" + self.val = 0 + self.thresh = False + return self.val + class FP_x87: name = "FP_x87" domain = "Uops" area = "RET" desc = """ -This metric represents floating-point (FP) x87 uops fraction the CPU has -executed. Tip: consider compiler flags to generate newer AVX (or SSE) +This metric is an approxmiation of floating-point (FP) x87 (arithmetic) uops +fraction. 
Tip: consider compiler flags to generate newer AVX (or SSE) instruction sets, which typically perform better and feature vectors.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("FP_COMP_OPS_EXE.X87", 4)/ EV("UOPS_EXECUTED.THREAD", 4 ) + self.val = EV("FP_COMP_OPS_EXE.X87", 4) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "FP_x87 zero division" @@ -1077,6 +1313,65 @@ def compute(self, EV): self.thresh = False return self.val +class FP_Scalar: + name = "FP_Scalar" + domain = "Uops" + area = "RET" + desc = """ +This metric represents arithmetic floating-point (FP) scalar uops fraction the +CPU has executed. Tip: investigate what limits (compiler) generation of vector +code.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", 4) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "FP_Scalar zero division" + self.val = 0 + self.thresh = False + return self.val + +class FP_Vector: + name = "FP_Vector" + domain = "Uops" + area = "RET" + desc = """ +This metric represents arithmetic floating-point (FP) vector uops fraction the +CPU has executed. Tip: check if vector width is expected""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = (EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", 4) + EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "FP_Vector zero division" + self.val = 0 + self.thresh = False + return self.val + +class Other: + name = "Other" + domain = "Uops" + area = "RET" + desc = """ +This metric represents non-floating-point (FP) uop fraction the CPU has +executed. 
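FP_Arith above is simply the sum of its three children, each expressed as a share of UOPS_EXECUTED.THREAD, and the Other node described here takes whatever remains (1 - FP_Arith). A standalone sketch with invented uop counts:

# Invented uop counts; mirrors the FP_x87 / FP_Scalar / FP_Vector / Other split above.
uops_executed = 10000.0   # UOPS_EXECUTED.THREAD
x87_uops = 100.0          # FP_COMP_OPS_EXE.X87
scalar_uops = 1400.0      # FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE + SSE_SCALAR_DOUBLE
vector_uops = 2500.0      # FP_COMP_OPS_EXE.SSE_PACKED_* + SIMD_FP_256.PACKED_*

fp_x87 = x87_uops / uops_executed
fp_scalar = scalar_uops / uops_executed
fp_vector = vector_uops / uops_executed
fp_arith = fp_x87 + fp_scalar + fp_vector   # as FP_Arith.compute sums its children
other = 1 - fp_arith                        # as Other.compute does below

print("FP_Arith %.2f  Other %.2f" % (fp_arith, other))   # 0.40 and 0.60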
If you application has no FP operations, this will likely be biggest +fraction.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = 1 - self.FP_Arith.compute(EV ) + self.thresh = (self.val > 0.3) and self.parent.thresh + except ZeroDivisionError: + #print "Other zero division" + self.val = 0 + self.thresh = False + return self.val + class Microcode_Sequencer: name = "Microcode_Sequencer" domain = "Slots" @@ -1087,9 +1382,10 @@ class Microcode_Sequencer: decoded by the default decoders (like repeat move strings), or by microcode assists used to address some operation modes (like in Floating Point assists).""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2)/ SLOTS(EV, 2 ) + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: #print "Microcode_Sequencer zero division" @@ -1100,7 +1396,9 @@ def compute(self, EV): class Metric_IPC: name = "IPC" desc = """ -Instructions Per Cycle (per physical core)""" +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 def compute(self, EV): try: @@ -1112,7 +1410,9 @@ def compute(self, EV): class Metric_CPI: name = "CPI" desc = """ -""" +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: @@ -1121,10 +1421,26 @@ def compute(self, EV): print "CPI zero division" self.val = 0 +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + class Metric_UPI: name = "UPI" desc = """ Uops Per Instruction""" + domain = "Metric" + maxval = 2 def compute(self, EV): try: @@ -1137,6 +1453,8 @@ class Metric_IPTB: name = "IPTB" desc = """ Instruction per taken branch""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: @@ -1145,10 +1463,27 @@ def compute(self, EV): print "IPTB zero division" self.val = 0 +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. 
Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = BPTB(EV, 0) + except ZeroDivisionError: + print "BPTB zero division" + self.val = 0 + class Metric_DSB_Coverage: name = "DSB_Coverage" desc = """ Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 def compute(self, EV): try: @@ -1162,6 +1497,8 @@ class Metric_ILP: desc = """ Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1175,6 +1512,8 @@ class Metric_MLP: desc = """ Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1187,6 +1526,8 @@ class Metric_Load_Miss_Real_Latency: name = "Load_Miss_Real_Latency" desc = """ Actual Average Latency for L1 data-cache miss demand loads""" + domain = "Metric" + maxval = 1000 def compute(self, EV): try: @@ -1195,10 +1536,26 @@ def compute(self, EV): print "Load_Miss_Real_Latency zero division" self.val = 0 +class Metric_GFLOPs: + name = "GFLOPs" + desc = """ +Giga Floating Point Operations Per Second""" + domain = "Metric" + maxval = 100 + + def compute(self, EV): + try: + self.val = GFLOPs(EV, 0) + except ZeroDivisionError: + print "GFLOPs zero division" + self.val = 0 + class Metric_Turbo_Utilization: name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1212,6 +1569,8 @@ class Metric_Page_Walks_Use: desc = """ Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 def compute(self, EV): try: @@ -1220,10 +1579,26 @@ def compute(self, EV): print "Page_Walks_Use zero division" self.val = 0 +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + class Metric_CLKS: name = "CLKS" desc = """ Per-thread actual clocks""" + domain = "Count" + maxval = 0 def compute(self, EV): try: @@ -1232,16 +1607,32 @@ def compute(self, EV): print "CLKS zero division" self.val = 0 -class Metric_CLKS1: - name = "CLKS1" +class Metric_CORE_CLKS: + name = "CORE_CLKS" desc = """ Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 def compute(self, EV): try: - self.val = CLKS1(EV, 0) + self.val = Time(EV, 0) except ZeroDivisionError: - print "CLKS1 zero division" + print "Time zero division" self.val = 0 # Schedule @@ -1270,6 +1661,7 @@ def __init__(self, r): n = L1_Bound() ; r.run(n) ; o["L1_Bound"] = n n = DTLB_Load() ; r.run(n) ; o["DTLB_Load"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n n = L2_Bound() ; r.run(n) ; o["L2_Bound"] = n @@ -1277,10 +1669,12 @@ def __init__(self, r): n = Contested_Accesses() ; r.run(n) ; o["Contested_Accesses"] = 
n n = Data_Sharing() ; r.run(n) ; o["Data_Sharing"] = n n = L3_Latency() ; r.run(n) ; o["L3_Latency"] = n + n = SQ_Full() ; r.run(n) ; o["SQ_Full"] = n n = MEM_Bound() ; r.run(n) ; o["MEM_Bound"] = n n = MEM_Bandwidth() ; r.run(n) ; o["MEM_Bandwidth"] = n n = MEM_Latency() ; r.run(n) ; o["MEM_Latency"] = n n = Stores_Bound() ; r.run(n) ; o["Stores_Bound"] = n + n = Store_Latency() ; r.run(n) ; o["Store_Latency"] = n n = False_Sharing() ; r.run(n) ; o["False_Sharing"] = n n = Split_Stores() ; r.run(n) ; o["Split_Stores"] = n n = DTLB_Store() ; r.run(n) ; o["DTLB_Store"] = n @@ -1299,7 +1693,11 @@ def __init__(self, r): n = Port_5() ; r.run(n) ; o["Port_5"] = n n = Retiring() ; r.run(n) ; o["Retiring"] = n n = Base() ; r.run(n) ; o["Base"] = n + n = FP_Arith() ; r.run(n) ; o["FP_Arith"] = n n = FP_x87() ; r.run(n) ; o["FP_x87"] = n + n = FP_Scalar() ; r.run(n) ; o["FP_Scalar"] = n + n = FP_Vector() ; r.run(n) ; o["FP_Vector"] = n + n = Other() ; r.run(n) ; o["Other"] = n n = Microcode_Sequencer() ; r.run(n) ; o["Microcode_Sequencer"] = n # parents @@ -1321,6 +1719,7 @@ def __init__(self, r): o["L1_Bound"].parent = o["Memory_Bound"] o["DTLB_Load"].parent = o["L1_Bound"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] o["L2_Bound"].parent = o["Memory_Bound"] @@ -1328,10 +1727,12 @@ def __init__(self, r): o["Contested_Accesses"].parent = o["L3_Bound"] o["Data_Sharing"].parent = o["L3_Bound"] o["L3_Latency"].parent = o["L3_Bound"] + o["SQ_Full"].parent = o["L3_Bound"] o["MEM_Bound"].parent = o["Memory_Bound"] o["MEM_Bandwidth"].parent = o["MEM_Bound"] o["MEM_Latency"].parent = o["MEM_Bound"] o["Stores_Bound"].parent = o["Memory_Bound"] + o["Store_Latency"].parent = o["Stores_Bound"] o["False_Sharing"].parent = o["Stores_Bound"] o["Split_Stores"].parent = o["Stores_Bound"] o["DTLB_Store"].parent = o["Stores_Bound"] @@ -1349,7 +1750,11 @@ def __init__(self, r): o["Port_4"].parent = o["G3m_Ports_Utilized"] o["Port_5"].parent = o["G3m_Ports_Utilized"] o["Base"].parent = o["Retiring"] - o["FP_x87"].parent = o["Base"] + o["FP_Arith"].parent = o["Base"] + o["FP_x87"].parent = o["FP_Arith"] + o["FP_Scalar"].parent = o["FP_Arith"] + o["FP_Vector"].parent = o["FP_Arith"] + o["Other"].parent = o["Base"] o["Microcode_Sequencer"].parent = o["Retiring"] # references between groups @@ -1371,6 +1776,10 @@ def __init__(self, r): o["Retiring"].Microcode_Sequencer = o["Microcode_Sequencer"] o["Base"].Retiring = o["Retiring"] o["Base"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["FP_Arith"].FP_x87 = o["FP_x87"] + o["FP_Arith"].FP_Scalar = o["FP_Scalar"] + o["FP_Arith"].FP_Vector = o["FP_Vector"] + o["Other"].FP_Arith = o["FP_Arith"] # siblings cross-tree @@ -1378,22 +1787,23 @@ def __init__(self, r): o["Frontend_Latency"].sibling = None o["ICache_Misses"].sibling = None o["ITLB_Misses"].sibling = None - o["Branch_Resteers"].sibling = o["Bad_Speculation"] + o["Branch_Resteers"].sibling = o["Bad_Speculation"] o["DSB_Switches"].sibling = None o["LCP"].sibling = None - o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["MS_Switches"].sibling = o["Microcode_Sequencer"] o["Frontend_Bandwidth"].sibling = None o["MITE"].sibling = None o["DSB"].sibling = None o["LSD"].sibling = None - o["Bad_Speculation"].sibling = o["Branch_Resteers"] + o["Bad_Speculation"].sibling = o["Branch_Resteers"] o["Branch_Mispredicts"].sibling = None o["Machine_Clears"].sibling = None o["Backend_Bound"].sibling = None 
o["Memory_Bound"].sibling = None - o["L1_Bound"].sibling = None + o["L1_Bound"].sibling = o["G1_Port_Utilized"] o["DTLB_Load"].sibling = None o["Store_Fwd_Blk"].sibling = None + o["Lock_Latency"].sibling = o["Store_Latency"] o["Split_Loads"].sibling = None o["G4K_Aliasing"].sibling = None o["L2_Bound"].sibling = None @@ -1401,38 +1811,44 @@ def __init__(self, r): o["Contested_Accesses"].sibling = None o["Data_Sharing"].sibling = None o["L3_Latency"].sibling = None + o["SQ_Full"].sibling = None o["MEM_Bound"].sibling = None o["MEM_Bandwidth"].sibling = None o["MEM_Latency"].sibling = None o["Stores_Bound"].sibling = None + o["Store_Latency"].sibling = o["Lock_Latency"] o["False_Sharing"].sibling = None - o["Split_Stores"].sibling = None + o["Split_Stores"].sibling = o["Port_4"] o["DTLB_Store"].sibling = None o["Core_Bound"].sibling = None o["Divider"].sibling = None o["Ports_Utilization"].sibling = None o["G0_Ports_Utilized"].sibling = None - o["G1_Port_Utilized"].sibling = None + o["G1_Port_Utilized"].sibling = o["L1_Bound"] o["G2_Ports_Utilized"].sibling = None o["G3m_Ports_Utilized"].sibling = None o["Port_0"].sibling = None o["Port_1"].sibling = None o["Port_2"].sibling = None o["Port_3"].sibling = None - o["Port_4"].sibling = None + o["Port_4"].sibling = o["Split_Stores"] o["Port_5"].sibling = None o["Retiring"].sibling = None o["Base"].sibling = None + o["FP_Arith"].sibling = None o["FP_x87"].sibling = None - o["Microcode_Sequencer"].sibling = o["MS_Switches"] + o["FP_Scalar"].sibling = None + o["FP_Vector"].sibling = None + o["Other"].sibling = None + o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events (experimental) + # sampling events o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = [] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] o["ICache_Misses"].sample = [] - o["ITLB_Misses"].sample = [] - o["Branch_Resteers"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES_PS'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["Branch_Resteers"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] o["DSB_Switches"].sample = [] o["LCP"].sample = [] o["MS_Switches"].sample = [] @@ -1440,30 +1856,33 @@ def __init__(self, r): o["MITE"].sample = [] o["DSB"].sample = [] o["LSD"].sample = [] - o["Bad_Speculation"].sample = [] - o["Branch_Mispredicts"].sample = [] - o["Machine_Clears"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] + o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] o["Backend_Bound"].sample = [] o["Memory_Bound"].sample = [] - o["L1_Bound"].sample = [] - o["DTLB_Load"].sample = [] + o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] o["Store_Fwd_Blk"].sample = [] - o["Split_Loads"].sample = [] + o["Lock_Latency"].sample = ['MEM_UOPS_RETIRED.LOCK_LOADS:pp'] + o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] o["G4K_Aliasing"].sample = [] - o["L2_Bound"].sample = [] - o["L3_Bound"].sample = [] - o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS_PS'] - o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT_PS'] - o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT_PS'] - o["MEM_Bound"].sample = [] + o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + 
o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] + o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp'] + o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + o["SQ_Full"].sample = [] + o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] o["MEM_Bandwidth"].sample = [] o["MEM_Latency"].sample = [] - o["Stores_Bound"].sample = [] - o["False_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS'] - o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES_PS', 'MEM_UOPS_RETIRED.ALL_STORES_PS'] - o["DTLB_Store"].sample = [] + o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + o["Store_Latency"].sample = [] + o["False_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_Other_CORE_0'] + o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] o["Core_Bound"].sample = [] - o["Divider"].sample = [] + o["Divider"].sample = ['ARITH.FPU_DIV_ACTIVE'] o["Ports_Utilization"].sample = [] o["G0_Ports_Utilized"].sample = [] o["G1_Port_Utilized"].sample = [] @@ -1476,21 +1895,30 @@ def __init__(self, r): o["Port_4"].sample = [] o["Port_5"].sample = [] o["Retiring"].sample = [] - o["Base"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] + o["FP_Arith"].sample = [] o["FP_x87"].sample = [] - o["Microcode_Sequencer"].sample = [] + o["FP_Scalar"].sample = [] + o["FP_Vector"].sample = [] + o["Other"].sample = [] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] # user visible metrics n = Metric_IPC() ; r.metric(n) n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) n = Metric_UPI() ; r.metric(n) n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) n = Metric_DSB_Coverage() ; r.metric(n) n = Metric_ILP() ; r.metric(n) n = Metric_MLP() ; r.metric(n) n = Metric_Load_Miss_Real_Latency() ; r.metric(n) + n = Metric_GFLOPs() ; r.metric(n) n = Metric_Turbo_Utilization() ; r.metric(n) n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) n = Metric_CLKS() ; r.metric(n) - n = Metric_CLKS1() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/ivb_server_ratios.py b/ivb_server_ratios.py index d665f974..16a3bb11 100644 --- a/ivb_server_ratios.py +++ b/ivb_server_ratios.py @@ -1,14 +1,19 @@ # -# auto generated TopDown description for Intel Xeon E5 v2 (code named IvyBridge EP) +# auto generated TopDown 2.9 description for Intel Xeon E5 v2 (code named IvyBridge EP) # Please see http://ark.intel.com for more details on these CPUs. # +# References: +# http://halobates.de/blog/p/262 +# https://sites.google.com/site/analysismethods/yasin-pubs +# smt_enabled = False # Constants Pipeline_Width = 4 +L2_Store_Latency = 9 Mem_L3_Weight = 7 Mem_STLB_Hit_Cost = 7 Mem_SFB_Cost = 13 @@ -22,85 +27,180 @@ Mem_Remote_Fwd_Cost = 180 MS_Switches_Cost = 3 OneMillion = 1000000 +Energy_Unit = 15.6 # Aux. 
formulas + # Floating Point Operations Count def FLOP_Count(EV, level): - return ( 1 *(EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level))+ 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level))+ 8 * EV("SIMD_FP_256.PACKED_SINGLE", level) ) + return (1 *(EV("FP_COMP_OPS_EXE.X87", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level)) + 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level)) + 8 * EV("SIMD_FP_256.PACKED_SINGLE", level)) + def Recovery_Cycles(EV, level): - return ( EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2)if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + def Execute_Cycles(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c1", level) / 2)if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + def L1D_Miss_Cycles(EV, level): - return ( EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2)if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + def ITLB_Miss_Cycles(EV, level): - return ( Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level) ) + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + +def Cycles_0_Ports_Utilized(EV, level): + EV("ARITH.FPU_DIV_ACTIVE", level) + EV("UOPS_EXECUTED.CORE:i1:c1", level) + EV("RS_EVENTS.EMPTY_CYCLES", level) + return (EV("UOPS_EXECUTED.CORE:i1:c1", level)) / 2 if smt_enabled else(STALLS_TOTAL(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ARITH.FPU_DIV_ACTIVE", level)) + def Cycles_1_Port_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level))/ 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) ) + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c1", level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c1", level) - EV("UOPS_EXECUTED.CORE:c2", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level)) + def Cycles_2_Ports_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level))/ 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) ) + EV("UOPS_EXECUTED.CORE:c2", level) + EV("UOPS_EXECUTED.CORE:c3", level) + 
EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c2", level) - EV("UOPS_EXECUTED.CORE:c3", level)) / 2 if smt_enabled else(EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) - EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level)) + def Cycles_3m_Ports_Utilized(EV, level): - return ( EV("UOPS_EXECUTED.CORE:c3", level) / 2)if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CORE:c3", level) + EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + return (EV("UOPS_EXECUTED.CORE:c3", level) / 2) if smt_enabled else EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) + def STALLS_MEM_ANY(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.STALLS_LDM_PENDING", level)) , level ) + def STALLS_TOTAL(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("CYCLE_ACTIVITY.CYCLES_NO_EXECUTE", level)) , level ) + def ORO_Demand_DRD_C1(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + def ORO_Demand_DRD_C6(EV, level): - return min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level) ) + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def ORO_Demand_RFO_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Cycles_False_Sharing_Client(EV, level): + return Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", level) + EV("OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE", level)) + def Few_Uops_Executed_Threshold(EV, level): - EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) - return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level) > 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + return EV("UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC", level) if(IPC(EV, level)> 1.25)else EV("UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC", level) + def Backend_Bound_At_EXE(EV, level): - return ( STALLS_TOTAL(EV, level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level))/ CLKS(EV, level) + return (STALLS_TOTAL(EV, level) + EV("UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC", level) - Few_Uops_Executed_Threshold(EV, level) - EV("RS_EVENTS.EMPTY_CYCLES", level) + EV("RESOURCE_STALLS.SB", level)) / CLKS(EV, level) + def Mem_L3_Hit_Fraction(EV, level): - return EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level) ) + return EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level)) + 
+def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + def Mispred_Clears_Fraction(EV, level): - return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level) ) + return EV("BR_MISP_RETIRED.ALL_BRANCHES", level) /(EV("BR_MISP_RETIRED.ALL_BRANCHES", level) + EV("MACHINE_CLEARS.COUNT", level)) + def Avg_RS_Empty_Period_Clears(EV, level): - return ( EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ICACHE.IFETCH_STALL", level))/ EV("RS_EVENTS.EMPTY_END", level) + return (EV("RS_EVENTS.EMPTY_CYCLES", level) - EV("ICACHE.IFETCH_STALL", level)) / EV("RS_EVENTS.EMPTY_END", level) + def Retire_Uop_Fraction(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + def SLOTS(EV, level): - return Pipeline_Width * CLKS1(EV, level) -# Instructions Per Cycle (per physical core) + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) def IPC(EV, level): - return EV("INST_RETIRED.ANY", level) / CLKS1(EV, level) + return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) def CPI(EV, level): return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + # Uops Per Instruction def UPI(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + # Instruction per taken branch def IPTB(EV, level): return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. 
+def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + # Fraction of Uops delivered by the DSB (decoded instructions cache) def DSB_Coverage(EV, level): - return ( EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level))/(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level) ) + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + # Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed) def ILP(EV, level): return EV("UOPS_EXECUTED.THREAD", level) / Execute_Cycles(EV, level) + # Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) def MLP(EV, level): return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + # Actual Average Latency for L1 data-cache miss demand loads def Load_Miss_Real_Latency(EV, level): - return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level) ) + return EV("L1D_PEND_MISS.PENDING", level) /(EV("MEM_LOAD_UOPS_RETIRED.L1_MISS", level) + EV("MEM_LOAD_UOPS_RETIRED.HIT_LFB", level)) + +# Giga Floating Point Operations Per Second +def GFLOPs(EV, level): + return FLOP_Count(EV, level) / OneMillion / DurationTimeInSeconds(EV, level) / 1000 + # Average Frequency Utilization relative nominal frequency def Turbo_Utilization(EV, level): return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) + # Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store def Page_Walks_Use(EV, level): - return ( EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level))/ CLKS1(EV, level) + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + # Per-thread actual clocks def CLKS(EV, level): return EV("CPU_CLK_UNHALTED.THREAD", level) + # Core actual clocks -def CLKS1(EV, level): - return ( EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2)if smt_enabled else CLKS(EV, level) +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, level): + return DurationTimeInSeconds(EV, level) # Event groups @@ -119,9 +219,10 @@ class Frontend_Bound: latter can accept them. For example, stalls due to instruction-cache misses would be categorized under Frontend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1)/ SLOTS(EV, 1 ) + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Frontend_Bound zero division" @@ -139,9 +240,10 @@ class Frontend_Latency: after a branch misprediction are categorized under Frontend Latency. 
In such cases the Frontend eventually delivers no uops for some period.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2)/ SLOTS(EV, 2 ) + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: #print "Frontend_Latency zero division" @@ -158,9 +260,10 @@ class ICache_Misses: cache misses. Using compiler's Profile-Guided Optimization (PGO) can reduce i-cache misses through improved hot code layout.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ICACHE.IFETCH_STALL", 3)/ CLKS(EV, 3)- self.ITLB_Misses.compute(EV ) + self.val = EV("ICACHE.IFETCH_STALL", 3) / CLKS(EV, 3) - self.ITLB_Misses.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "ICache_Misses zero division" @@ -176,9 +279,10 @@ class ITLB_Misses: This metric represents cycles fraction CPU was stalled due to instruction TLB misses. Using large code pages may be considered here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ITLB_Miss_Cycles(EV, 3)/ CLKS(EV, 3 ) + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "ITLB_Misses zero division" @@ -197,9 +301,10 @@ class Branch_Resteers: For example, branchy code with lots of (taken) branches and/or branch miss- predictions might get categorized under Branch Resteers.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = Avg_RS_Empty_Period_Clears(EV, 3)*(EV("BR_MISP_RETIRED.ALL_BRANCHES", 3)+ EV("MACHINE_CLEARS.COUNT", 3)+ EV("BACLEARS.ANY", 3)) / CLKS(EV, 3 ) + self.val = Avg_RS_Empty_Period_Clears(EV, 3)*(EV("BR_MISP_RETIRED.ALL_BRANCHES", 3) + EV("MACHINE_CLEARS.COUNT", 3) + EV("BACLEARS.ANY", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "Branch_Resteers zero division" @@ -215,9 +320,10 @@ class DSB_Switches: This metric represents cycles fraction CPU was stalled due to switches from DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3)/ CLKS(EV, 3 ) + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "DSB_Switches zero division" @@ -234,9 +340,10 @@ class LCP: Prefixes (LCPs). 
Using proper compiler flags or Intel Compiler by default will certainly avoid this.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ILD_STALL.LCP", 3)/ CLKS(EV, 3 ) + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "LCP zero division" @@ -255,9 +362,10 @@ class MS_Switches: deliver long uop flows required by CISC instructions like CPUID, or uncommon conditions like Floating Point Assists when dealing with Denormals.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3)/ CLKS(EV, 3 ) + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "MS_Switches zero division" @@ -276,9 +384,10 @@ class Frontend_Bandwidth: under Frontend Bandwidth. In such cases, the Frontend typically delivers non- optimal amount of uops to the Backend.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Frontend_Bound.compute(EV)- self.Frontend_Latency.compute(EV ) + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: #print "Frontend_Bandwidth zero division" @@ -288,16 +397,17 @@ def compute(self, EV): class MITE: name = "MITE" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -the MITE fetch pipeline. For example, inefficiencies in the instruction -decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MITE zero division" @@ -307,17 +417,18 @@ def compute(self, EV): class DSB: name = "DSB" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -DSB (decoded uop cache) fetch pipeline. For example, inefficient utilization -of the DSB cache structure or bank conflict when reading from it, are -categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: #print "DSB zero division" @@ -327,18 +438,19 @@ def compute(self, EV): class LSD: name = "LSD" - domain = "CClocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -LSD (Loop Stream Detector) unit. 
LSD typically does well sustaining Uop -supply. However, in some rare cases, optimal uop-delivery could not be reached -for small loops whose size (in terms of number of uops) does not suit well the -LSD structure.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("LSD.CYCLES_ACTIVE", 3)- EV("LSD.CYCLES_4_UOPS", 3)) / CLKS1(EV, 3 ) + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "LSD zero division" @@ -357,9 +469,10 @@ class Bad_Speculation: speculation. For example, wasted work due to miss-predicted branches are categorized under Bad Speculation category""" level = 1 + htoff = False def compute(self, EV): try: - self.val = ( EV("UOPS_ISSUED.ANY", 1)- EV("UOPS_RETIRED.RETIRE_SLOTS", 1)+ Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: #print "Bad_Speculation zero division" @@ -377,6 +490,7 @@ class Branch_Mispredicts: incorrectly speculated program path, or stalls the Backend of the machine needs to recover its state from a speculative path.""" level = 2 + htoff = False def compute(self, EV): try: self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) @@ -398,9 +512,10 @@ class Machine_Clears: example, this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Bad_Speculation.compute(EV)- self.Branch_Mispredicts.compute(EV ) + self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "Machine_Clears zero division" @@ -421,9 +536,10 @@ class Backend_Bound: stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = 1 -(self.Frontend_Bound.compute(EV)+ self.Bad_Speculation.compute(EV)+ self.Retiring.compute(EV)) + self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Backend_Bound zero division" @@ -442,9 +558,10 @@ class Memory_Bound: memory demand loads which coincides with execution starvation. 
in addition to less common cases where stores could imply backpressure on the pipeline.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = ( STALLS_MEM_ANY(EV, 2)+ EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) + self.val = (STALLS_MEM_ANY(EV, 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Memory_Bound zero division" @@ -464,9 +581,10 @@ class L1_Bound: allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event as it accounts for any non-completed load.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( STALLS_MEM_ANY(EV, 3)- EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) + self.val = (STALLS_MEM_ANY(EV, 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh except ZeroDivisionError: #print "L1_Bound zero division" @@ -480,9 +598,10 @@ class DTLB_Load: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4)+ EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "DTLB_Load zero division" @@ -496,9 +615,10 @@ class Store_Fwd_Blk: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4)/ CLKS(EV, 4 ) + self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Store_Fwd_Blk zero division" @@ -506,15 +626,36 @@ def compute(self, EV): self.thresh = False return self.val +class Lock_Latency: + name = "Lock_Latency" + domain = "Clocks" + area = "BE/Mem" + desc = """ +This metric represents cycles fraction the CPU spent handling cache misses due +to lock operations. Due to the microarchitecture handling of locks, they are +classified as L1_Bound regardless of what memory source satsified them.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = Mem_Lock_St_Fraction(EV, 4)* ORO_Demand_RFO_C1(EV, 4) / CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh + except ZeroDivisionError: + #print "Lock_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + class Split_Loads: name = "Split_Loads" domain = "Clocks" area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Load_Miss_Real_Latency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4)/ CLKS(EV, 4 ) + self.val = 13 * EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Split_Loads zero division" @@ -528,9 +669,10 @@ class G4K_Aliasing: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4)/ CLKS(EV, 4 ) + self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "G4K_Aliasing zero division" @@ -547,9 +689,10 @@ class L2_Bound: misses (i.e. 
L1 misses/L2 hits) will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)- EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) + self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.03) and self.parent.thresh except ZeroDivisionError: #print "L2_Bound zero division" @@ -566,9 +709,10 @@ class L3_Bound: a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "L3_Bound zero division" @@ -582,9 +726,10 @@ class Contested_Accesses: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4)+ EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) + self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Contested_Accesses zero division" @@ -598,9 +743,10 @@ class Data_Sharing: area = "BE/Mem" desc = "" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: #print "Data_Sharing zero division" @@ -618,9 +764,10 @@ class L3_Latency: with a sibling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve the latency and increase performance.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", 4)/ CLKS(EV, 4 ) + self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "L3_Latency zero division" @@ -628,6 +775,26 @@ def compute(self, EV): self.thresh = False return self.val +class SQ_Full: + name = "SQ_Full" + domain = "CoreClocks" + area = "BE/Mem" + desc = """ +This metric measures fraction of cycles where the Super Queue (SQ) was full +taking into account all request-types and both hardware SMT threads. The Super +Queue is used for requests to access the L2 cache or to go out to the Uncore.""" + level = 4 + htoff = False + def compute(self, EV): + try: + self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) + self.thresh = self.val > 0.0 and self.parent.thresh + except ZeroDivisionError: + #print "SQ_Full zero division" + self.val = 0 + self.thresh = False + return self.val + class MEM_Bound: name = "MEM_Bound" domain = "Clocks" @@ -636,9 +803,10 @@ class MEM_Bound: This metric represents how often CPU was stalled on main memory (DRAM). 
Caching will improve the latency and increase performance.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( 1 - Mem_L3_Hit_Fraction(EV, 3)) * EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)/ CLKS(EV, 3 ) + self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Bound zero division" @@ -655,9 +823,10 @@ class MEM_Bandwidth: bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ORO_Demand_DRD_C6(EV, 4)/ CLKS(EV, 4 ) + self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Bandwidth zero division" @@ -674,9 +843,10 @@ class MEM_Latency: main memory (DRAM). Data layout re-structuring or using Software Prefetches (also through the compiler) may be considered in such case.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( ORO_Demand_DRD_C1(EV, 4)- ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) + self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MEM_Latency zero division" @@ -692,9 +862,10 @@ class Local_DRAM: This metric represents how often CPU was likely stalled due to loads from local memory. Caching will improve the latency and increase performance.""" level = 5 + htoff = False def compute(self, EV): try: - self.val = Mem_Local_DRAM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM", 5)/ CLKS(EV, 5 ) + self.val = Mem_Local_DRAM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM", 5) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Local_DRAM zero division" @@ -710,9 +881,10 @@ class Remote_DRAM: This metric represents how often CPU was likely stalled due to loads from remote memory. This is caused often due to non-optimal NUMA allocations.""" level = 5 + htoff = False def compute(self, EV): try: - self.val = Mem_Remote_DRAM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM", 5)/ CLKS(EV, 5 ) + self.val = Mem_Remote_DRAM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM", 5) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Remote_DRAM zero division" @@ -729,9 +901,10 @@ class Remote_Cache: remote cache in other sockets. This is caused often due to non-optimal NUMA allocations.""" level = 5 + htoff = False def compute(self, EV): try: - self.val = ( Mem_Remote_HitM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM", 5)+ Mem_Remote_Fwd_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD", 5)) / CLKS(EV, 5 ) + self.val = (Mem_Remote_HitM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM", 5) + Mem_Remote_Fwd_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD", 5)) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Remote_Cache zero division" @@ -744,12 +917,15 @@ class Stores_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents how often CPU was stalled due to store operations. 
+even though memory store accesses do not typically stall out-of-order CPUs; +there are few cases where stores can lead to actual stalls. This metric will +be flagged should any of these cases be a bottleneck.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.Memory_Bound.compute(EV)- STALLS_MEM_ANY(EV, 3)/ CLKS(EV, 3 ) + self.val = self.Memory_Bound.compute(EV) - STALLS_MEM_ANY(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Stores_Bound zero division" @@ -757,36 +933,38 @@ def compute(self, EV): self.thresh = False return self.val -class False_Sharing: - name = "False_Sharing" +class Store_Latency: + name = "Store_Latency" domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on due to store operations. -Tip: consider False Sharing analysis as next step""" +This metric represents cycles fraction the CPU spent handling long-latency +store misses (missing 2nd level cache).""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4)+ EV("OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_Other_CORE", 4)) / CLKS(EV, 4 ) + self.val = (Store_L2_Hit_Cycles(EV, 4) +(1 - Mem_Lock_St_Fraction(EV, 4))* ORO_Demand_RFO_C1(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "False_Sharing zero division" + #print "Store_Latency zero division" self.val = 0 self.thresh = False return self.val class Split_Stores: name = "Split_Stores" - domain = "Stores" + domain = "CoreClocks" area = "BE/Mem" desc = """ This metric represents rate of split store accesses. Consider aligning your data to the 64-byte cache line granularity.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4)/ EV("MEM_UOPS_RETIRED.ALL_STORES", 4 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) + self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "Split_Stores zero division" self.val = 0 @@ -805,9 +983,10 @@ class DTLB_Store: the same page. Try using larger page sizes for large amounts of frequently- used data.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4)+ EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) + self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "DTLB_Store zero division" @@ -828,9 +1007,10 @@ class Core_Bound: performance (e.g. FP-chained long-latency arithmetic operations). 
Tip: consider Port Saturation analysis as next step.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Backend_Bound_At_EXE(EV, 2)- self.Memory_Bound.compute(EV ) + self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Core_Bound zero division" @@ -840,13 +1020,14 @@ def compute(self, EV): class Divider: name = "Divider" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = "" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ARITH.FPU_DIV_ACTIVE", 3)/ CLKS1(EV, 3 ) + self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Divider zero division" @@ -867,9 +1048,10 @@ class Ports_Utilization: options today- reduces pressure on the execution ports as multiple elements are calculated with same uop.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.Core_Bound.compute(EV)- self.Divider.compute(EV ) + self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "Ports_Utilization zero division" @@ -879,15 +1061,16 @@ def compute(self, EV): class G0_Ports_Utilized: name = "0_Ports_Utilized" - domain = "Clocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed no uops on any execution -port.""" +This metric represents Core cycles fraction CPU executed no uops on any +execution port.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( STALLS_TOTAL(EV, 4)- EV("RS_EVENTS.EMPTY_CYCLES", 4)- EV("ARITH.FPU_DIV_ACTIVE", 4)) / CLKS(EV, 4 ) + self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G0_Ports_Utilized zero division" @@ -897,20 +1080,22 @@ def compute(self, EV): class G1_Port_Utilized: name = "1_Port_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 1 uop per cycle -on all execution ports. This can be due to heavy data-dependency among -instructions. In some cases with high 1_Port_Utilized and L1_Bound it can -point to L1 data-cache latency bottleneck that may not necessarily manifest -with complete execution starvation (due to the short L1 latency e.g. walking -linked list) - looking at the assembly can be helpful. Tip: consider 'Core -Port Saturation' analysis-type as next step.""" +This metric represents Core cycles fraction where the CPU executed total of 1 +uop per cycle on all execution ports. This can be due to heavy data-dependency +among software instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and L1_Bound, this +metric can point to L1 data-cache latency bottleneck that may not necessarily +manifest with complete execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be helpful. 
Tip: consider +'Core Ports Saturation' analysis-type as next step.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_1_Port_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G1_Port_Utilized zero division" @@ -920,18 +1105,19 @@ def compute(self, EV): class G2_Ports_Utilized: name = "2_Ports_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 2 uops per cycle -on all execution ports. Tip: consider 'Core Port Saturation' analysis-type as -next step. Loop Vectorization -most compilers feature auto-Vectorization -options today- reduces pressure on the execution ports as multiple elements -are calculated with same uop.""" +This metric represents Core cycles fraction CPU executed total of 2 uops per +cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- +type as next step. Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the execution ports as +multiple elements are calculated with same uop.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_2_Ports_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G2_Ports_Utilized zero division" @@ -941,16 +1127,17 @@ def compute(self, EV): class G3m_Ports_Utilized: name = "3m_Ports_Utilized" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents cycles fraction CPU executed total of 3 or more uops -per cycle on all execution ports. Tip: consider 'Core Port Saturation' +This metric represents Core cycles fraction CPU executed total of 3 or more +uops per cycle on all execution ports. 
Tip: consider 'Core Port Saturation' analysis-type as next step""" level = 4 + htoff = False def compute(self, EV): try: - self.val = Cycles_3m_Ports_Utilized(EV, 4)/ CLKS1(EV, 4 ) + self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "G3m_Ports_Utilized zero division" @@ -960,16 +1147,17 @@ def compute(self, EV): class Port_0: name = "Port_0" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution -port 0 (ALU)""" +port 0 (SNB+: ALU; HSW+:ALU and 2nd branch)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_0 zero division" self.val = 0 @@ -978,16 +1166,17 @@ def compute(self, EV): class Port_1: name = "Port_1" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 1 (ALU)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_1 zero division" self.val = 0 @@ -996,16 +1185,17 @@ def compute(self, EV): class Port_2: name = "Port_2" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 2 (Loads and Store-address)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_2 zero division" self.val = 0 @@ -1014,16 +1204,17 @@ def compute(self, EV): class Port_3: name = "Port_3" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 3 (Loads and Store-address)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_3 zero division" self.val = 0 @@ -1032,16 +1223,17 @@ def compute(self, EV): class Port_4: name = "Port_4" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ This metric represents Core cycles fraction CPU dispatched uops on execution port 4 (Store-data)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_4 zero division" self.val = 0 @@ -1050,16 +1242,17 @@ def compute(self, EV): class Port_5: name = "Port_5" - domain = "CClocks" + domain = "CoreClocks" area = "BE/Core" desc = """ 
This metric represents Core cycles fraction CPU dispatched uops on execution -port 5 (Branches and ALU)""" +port 5 (SNB+: Branches and ALU; HSW+: ALU)""" level = 5 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5)/ CLKS1(EV, 5 ) - self.thresh = self.val > 0.0 and self.parent.thresh + self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5) / CORE_CLKS(EV, 5 ) + self.thresh = (self.val > 0.5) except ZeroDivisionError: #print "Port_5 zero division" self.val = 0 @@ -1083,9 +1276,10 @@ class Retiring: computations be done without significantly increasing number of instructions thus improving the performance.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1)/ SLOTS(EV, 1 ) + self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: #print "Retiring zero division" @@ -1098,16 +1292,18 @@ class Base: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops not originated -from the microcode-sequencer. This correlates with total number of +This metric represents slots fraction where the CPU was retiring uops not +originated from the microcode-sequencer. This correlates with total number of instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. A high Retiring value for non-vectorized code is typically a good -hint for programmer to pursue vectorizing his code, which can reduce -instructions hence this bucket.""" +expected. While this is the most desirable of the top 4 categories, high +values may still indicate areas for improvement. If possible focus on +techniques that reduce instruction count or result in more efficient +instructions generation such as vectorization.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Retiring.compute(EV)- self.Microcode_Sequencer.compute(EV ) + self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: #print "Base zero division" @@ -1123,9 +1319,10 @@ class FP_Arith: This metric represents overall arithmetic floating-point (FP) uops fraction the CPU has executed.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = self.FP_x87.compute(EV)+ self.FP_Scalar.compute(EV)+ self.FP_Vector.compute(EV ) + self.val = self.FP_x87.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "FP_Arith zero division" @@ -1138,13 +1335,14 @@ class FP_x87: domain = "Uops" area = "RET" desc = """ -This metric represents floating-point (FP) x87 uops fraction the CPU has -executed. Tip: consider compiler flags to generate newer AVX (or SSE) +This metric is an approxmiation of floating-point (FP) x87 (arithmetic) uops +fraction. Tip: consider compiler flags to generate newer AVX (or SSE) instruction sets, which typically perform better and feature vectors.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = EV("FP_COMP_OPS_EXE.X87", 4)/ EV("UOPS_EXECUTED.THREAD", 4 ) + self.val = EV("FP_COMP_OPS_EXE.X87", 4) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "FP_x87 zero division" @@ -1161,9 +1359,10 @@ class FP_Scalar: CPU has executed. 
Tip: investigate what limits (compiler) generation of vector code.""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", 4)+ EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) + self.val = (EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", 4) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "FP_Scalar zero division" @@ -1179,9 +1378,10 @@ class FP_Vector: This metric represents arithmetic floating-point (FP) vector uops fraction the CPU has executed. Tip: check if vector width is expected""" level = 4 + htoff = False def compute(self, EV): try: - self.val = ( EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", 4)+ EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", 4)+ EV("SIMD_FP_256.PACKED_SINGLE", 4)+ EV("SIMD_FP_256.PACKED_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) + self.val = (EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", 4) + EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: #print "FP_Vector zero division" @@ -1198,6 +1398,7 @@ class Other: executed. If you application has no FP operations, this will likely be biggest fraction.""" level = 3 + htoff = False def compute(self, EV): try: self.val = 1 - self.FP_Arith.compute(EV ) @@ -1218,9 +1419,10 @@ class Microcode_Sequencer: decoded by the default decoders (like repeat move strings), or by microcode assists used to address some operation modes (like in Floating Point assists).""" level = 2 + htoff = False def compute(self, EV): try: - self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2)/ SLOTS(EV, 2 ) + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: #print "Microcode_Sequencer zero division" @@ -1231,7 +1433,9 @@ def compute(self, EV): class Metric_IPC: name = "IPC" desc = """ -Instructions Per Cycle (per physical core)""" +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 def compute(self, EV): try: @@ -1243,7 +1447,9 @@ def compute(self, EV): class Metric_CPI: name = "CPI" desc = """ -""" +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: @@ -1252,10 +1458,26 @@ def compute(self, EV): print "CPI zero division" self.val = 0 +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + class Metric_UPI: name = "UPI" desc = """ Uops Per Instruction""" + domain = "Metric" + maxval = 2 def compute(self, EV): try: @@ -1268,6 +1490,8 @@ class Metric_IPTB: name = "IPTB" desc = """ Instruction per taken branch""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: @@ -1276,10 +1500,27 @@ def compute(self, EV): print "IPTB zero division" self.val = 0 +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. 
Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = BPTB(EV, 0) + except ZeroDivisionError: + print "BPTB zero division" + self.val = 0 + class Metric_DSB_Coverage: name = "DSB_Coverage" desc = """ Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 def compute(self, EV): try: @@ -1293,6 +1534,8 @@ class Metric_ILP: desc = """ Instruction-Level-Parallelism (average number of uops executed when there is at least 1 uop executed)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1306,6 +1549,8 @@ class Metric_MLP: desc = """ Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1318,6 +1563,8 @@ class Metric_Load_Miss_Real_Latency: name = "Load_Miss_Real_Latency" desc = """ Actual Average Latency for L1 data-cache miss demand loads""" + domain = "Metric" + maxval = 1000 def compute(self, EV): try: @@ -1326,10 +1573,26 @@ def compute(self, EV): print "Load_Miss_Real_Latency zero division" self.val = 0 +class Metric_GFLOPs: + name = "GFLOPs" + desc = """ +Giga Floating Point Operations Per Second""" + domain = "Metric" + maxval = 100 + + def compute(self, EV): + try: + self.val = GFLOPs(EV, 0) + except ZeroDivisionError: + print "GFLOPs zero division" + self.val = 0 + class Metric_Turbo_Utilization: name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -1343,6 +1606,8 @@ class Metric_Page_Walks_Use: desc = """ Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 def compute(self, EV): try: @@ -1351,10 +1616,26 @@ def compute(self, EV): print "Page_Walks_Use zero division" self.val = 0 +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + class Metric_CLKS: name = "CLKS" desc = """ Per-thread actual clocks""" + domain = "Count" + maxval = 0 def compute(self, EV): try: @@ -1363,16 +1644,32 @@ def compute(self, EV): print "CLKS zero division" self.val = 0 -class Metric_CLKS1: - name = "CLKS1" +class Metric_CORE_CLKS: + name = "CORE_CLKS" desc = """ Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 def compute(self, EV): try: - self.val = CLKS1(EV, 0) + self.val = Time(EV, 0) except ZeroDivisionError: - print "CLKS1 zero division" + print "Time zero division" self.val = 0 # Schedule @@ -1401,6 +1698,7 @@ def __init__(self, r): n = L1_Bound() ; r.run(n) ; o["L1_Bound"] = n n = DTLB_Load() ; r.run(n) ; o["DTLB_Load"] = n n = Store_Fwd_Blk() ; r.run(n) ; o["Store_Fwd_Blk"] = n + n = Lock_Latency() ; r.run(n) ; o["Lock_Latency"] = n n = Split_Loads() ; r.run(n) ; o["Split_Loads"] = n n = G4K_Aliasing() ; r.run(n) ; o["G4K_Aliasing"] = n n = L2_Bound() ; r.run(n) ; o["L2_Bound"] = n @@ -1408,6 +1706,7 @@ def __init__(self, r): n = Contested_Accesses() ; r.run(n) ; o["Contested_Accesses"] = n 
n = Data_Sharing() ; r.run(n) ; o["Data_Sharing"] = n n = L3_Latency() ; r.run(n) ; o["L3_Latency"] = n + n = SQ_Full() ; r.run(n) ; o["SQ_Full"] = n n = MEM_Bound() ; r.run(n) ; o["MEM_Bound"] = n n = MEM_Bandwidth() ; r.run(n) ; o["MEM_Bandwidth"] = n n = MEM_Latency() ; r.run(n) ; o["MEM_Latency"] = n @@ -1415,7 +1714,7 @@ def __init__(self, r): n = Remote_DRAM() ; r.run(n) ; o["Remote_DRAM"] = n n = Remote_Cache() ; r.run(n) ; o["Remote_Cache"] = n n = Stores_Bound() ; r.run(n) ; o["Stores_Bound"] = n - n = False_Sharing() ; r.run(n) ; o["False_Sharing"] = n + n = Store_Latency() ; r.run(n) ; o["Store_Latency"] = n n = Split_Stores() ; r.run(n) ; o["Split_Stores"] = n n = DTLB_Store() ; r.run(n) ; o["DTLB_Store"] = n n = Core_Bound() ; r.run(n) ; o["Core_Bound"] = n @@ -1459,6 +1758,7 @@ def __init__(self, r): o["L1_Bound"].parent = o["Memory_Bound"] o["DTLB_Load"].parent = o["L1_Bound"] o["Store_Fwd_Blk"].parent = o["L1_Bound"] + o["Lock_Latency"].parent = o["L1_Bound"] o["Split_Loads"].parent = o["L1_Bound"] o["G4K_Aliasing"].parent = o["L1_Bound"] o["L2_Bound"].parent = o["Memory_Bound"] @@ -1466,6 +1766,7 @@ def __init__(self, r): o["Contested_Accesses"].parent = o["L3_Bound"] o["Data_Sharing"].parent = o["L3_Bound"] o["L3_Latency"].parent = o["L3_Bound"] + o["SQ_Full"].parent = o["L3_Bound"] o["MEM_Bound"].parent = o["Memory_Bound"] o["MEM_Bandwidth"].parent = o["MEM_Bound"] o["MEM_Latency"].parent = o["MEM_Bound"] @@ -1473,7 +1774,7 @@ def __init__(self, r): o["Remote_DRAM"].parent = o["MEM_Latency"] o["Remote_Cache"].parent = o["MEM_Latency"] o["Stores_Bound"].parent = o["Memory_Bound"] - o["False_Sharing"].parent = o["Stores_Bound"] + o["Store_Latency"].parent = o["Stores_Bound"] o["Split_Stores"].parent = o["Stores_Bound"] o["DTLB_Store"].parent = o["Stores_Bound"] o["Core_Bound"].parent = o["Backend_Bound"] @@ -1527,22 +1828,23 @@ def __init__(self, r): o["Frontend_Latency"].sibling = None o["ICache_Misses"].sibling = None o["ITLB_Misses"].sibling = None - o["Branch_Resteers"].sibling = o["Bad_Speculation"] + o["Branch_Resteers"].sibling = o["Bad_Speculation"] o["DSB_Switches"].sibling = None o["LCP"].sibling = None - o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["MS_Switches"].sibling = o["Microcode_Sequencer"] o["Frontend_Bandwidth"].sibling = None o["MITE"].sibling = None o["DSB"].sibling = None o["LSD"].sibling = None - o["Bad_Speculation"].sibling = o["Branch_Resteers"] + o["Bad_Speculation"].sibling = o["Branch_Resteers"] o["Branch_Mispredicts"].sibling = None o["Machine_Clears"].sibling = None o["Backend_Bound"].sibling = None o["Memory_Bound"].sibling = None - o["L1_Bound"].sibling = None + o["L1_Bound"].sibling = o["G1_Port_Utilized"] o["DTLB_Load"].sibling = None o["Store_Fwd_Blk"].sibling = None + o["Lock_Latency"].sibling = o["Store_Latency"] o["Split_Loads"].sibling = None o["G4K_Aliasing"].sibling = None o["L2_Bound"].sibling = None @@ -1550,6 +1852,7 @@ def __init__(self, r): o["Contested_Accesses"].sibling = None o["Data_Sharing"].sibling = None o["L3_Latency"].sibling = None + o["SQ_Full"].sibling = None o["MEM_Bound"].sibling = None o["MEM_Bandwidth"].sibling = None o["MEM_Latency"].sibling = None @@ -1557,21 +1860,21 @@ def __init__(self, r): o["Remote_DRAM"].sibling = None o["Remote_Cache"].sibling = None o["Stores_Bound"].sibling = None - o["False_Sharing"].sibling = None - o["Split_Stores"].sibling = None + o["Store_Latency"].sibling = o["Lock_Latency"] + o["Split_Stores"].sibling = o["Port_4"] o["DTLB_Store"].sibling = None 
o["Core_Bound"].sibling = None o["Divider"].sibling = None o["Ports_Utilization"].sibling = None o["G0_Ports_Utilized"].sibling = None - o["G1_Port_Utilized"].sibling = None + o["G1_Port_Utilized"].sibling = o["L1_Bound"] o["G2_Ports_Utilized"].sibling = None o["G3m_Ports_Utilized"].sibling = None o["Port_0"].sibling = None o["Port_1"].sibling = None o["Port_2"].sibling = None o["Port_3"].sibling = None - o["Port_4"].sibling = None + o["Port_4"].sibling = o["Split_Stores"] o["Port_5"].sibling = None o["Retiring"].sibling = None o["Base"].sibling = None @@ -1580,15 +1883,15 @@ def __init__(self, r): o["FP_Scalar"].sibling = None o["FP_Vector"].sibling = None o["Other"].sibling = None - o["Microcode_Sequencer"].sibling = o["MS_Switches"] + o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events (experimential) + # sampling events o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = [] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] o["ICache_Misses"].sample = [] - o["ITLB_Misses"].sample = [] - o["Branch_Resteers"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES_PS'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["Branch_Resteers"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] o["DSB_Switches"].sample = [] o["LCP"].sample = [] o["MS_Switches"].sample = [] @@ -1596,33 +1899,35 @@ def __init__(self, r): o["MITE"].sample = [] o["DSB"].sample = [] o["LSD"].sample = [] - o["Bad_Speculation"].sample = [] - o["Branch_Mispredicts"].sample = [] - o["Machine_Clears"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] + o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] o["Backend_Bound"].sample = [] o["Memory_Bound"].sample = [] - o["L1_Bound"].sample = [] - o["DTLB_Load"].sample = [] + o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] o["Store_Fwd_Blk"].sample = [] - o["Split_Loads"].sample = [] + o["Lock_Latency"].sample = ['MEM_UOPS_RETIRED.LOCK_LOADS:pp'] + o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] o["G4K_Aliasing"].sample = [] - o["L2_Bound"].sample = [] - o["L3_Bound"].sample = [] - o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS_PS'] - o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT_PS'] - o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT_PS'] - o["MEM_Bound"].sample = [] + o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] + o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp'] + o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + o["SQ_Full"].sample = [] + o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] o["MEM_Bandwidth"].sample = [] o["MEM_Latency"].sample = [] - o["Local_DRAM"].sample = [] - o["Remote_DRAM"].sample = [] - o["Remote_Cache"].sample = [] - o["Stores_Bound"].sample = [] - o["False_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM_PS'] - o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES_PS', 'MEM_UOPS_RETIRED.ALL_STORES_PS'] - o["DTLB_Store"].sample = [] + o["Local_DRAM"].sample = ['MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM:pp'] + o["Remote_DRAM"].sample = 
['MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM:pp'] + o["Remote_Cache"].sample = ['MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM:pp', 'MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD:pp'] + o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + o["Store_Latency"].sample = [] + o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] o["Core_Bound"].sample = [] - o["Divider"].sample = [] + o["Divider"].sample = ['ARITH.FPU_DIV_ACTIVE'] o["Ports_Utilization"].sample = [] o["G0_Ports_Utilized"].sample = [] o["G1_Port_Utilized"].sample = [] @@ -1635,25 +1940,30 @@ def __init__(self, r): o["Port_4"].sample = [] o["Port_5"].sample = [] o["Retiring"].sample = [] - o["Base"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] o["FP_Arith"].sample = [] o["FP_x87"].sample = [] o["FP_Scalar"].sample = [] o["FP_Vector"].sample = [] o["Other"].sample = [] - o["Microcode_Sequencer"].sample = [] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] # user visible metrics n = Metric_IPC() ; r.metric(n) n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) n = Metric_UPI() ; r.metric(n) n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) n = Metric_DSB_Coverage() ; r.metric(n) n = Metric_ILP() ; r.metric(n) n = Metric_MLP() ; r.metric(n) n = Metric_Load_Miss_Real_Latency() ; r.metric(n) + n = Metric_GFLOPs() ; r.metric(n) n = Metric_Turbo_Utilization() ; r.metric(n) n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) n = Metric_CLKS() ; r.metric(n) - n = Metric_CLKS1() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/jkt_server_ratios.py b/jkt_server_ratios.py new file mode 100644 index 00000000..2a54ae24 --- /dev/null +++ b/jkt_server_ratios.py @@ -0,0 +1,789 @@ + +# +# auto generated TopDown 2.9 description for Intel Xeon E5 (code named SandyBridge EP) +# Please see http://ark.intel.com for more details on these CPUs. +# +# References: +# http://halobates.de/blog/p/262 +# https://sites.google.com/site/analysismethods/yasin-pubs +# + +smt_enabled = False + +# Constants + +Pipeline_Width = 4 +L2_Store_Latency = 9 +Mem_L3_Weight = 7 +Mem_STLB_Hit_Cost = 7 +Mem_SFB_Cost = 13 +Mem_4K_Alias_Cost = 7 +Mem_XSNP_HitM_Cost = 60 +MEM_XSNP_Hit_Cost = 43 +MEM_XSNP_None_Cost = 29 +Mem_Local_DRAM_Cost = 200 +Mem_Remote_DRAM_Cost = 310 +Mem_Remote_HitM_Cost = 200 +Mem_Remote_Fwd_Cost = 180 +MS_Switches_Cost = 3 +OneMillion = 1000000 +Energy_Unit = 15.6 + +# Aux. 
formulas + + +# Floating Point Operations Count +def FLOP_Count(EV, level): + return (1 *(EV("FP_COMP_OPS_EXE.X87", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level)) + 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level)) + 8 * EV("SIMD_FP_256.PACKED_SINGLE", level)) + +def Recovery_Cycles(EV, level): + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + +def L1D_Miss_Cycles(EV, level): + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + +def ITLB_Miss_Cycles(EV, level): + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + +def ORO_Demand_DRD_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + +def ORO_Demand_DRD_C6(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def ORO_Demand_RFO_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Mem_L3_Hit_Fraction(EV, level): + return EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) /(EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", level) + Mem_L3_Weight * EV("MEM_LOAD_UOPS_RETIRED.LLC_MISS", level)) + +def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + +def Retire_Uop_Fraction(EV, level): + return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + +def SLOTS(EV, level): + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) +def IPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) +def CPI(EV, level): + return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + +# Uops Per Instruction +def UPI(EV, level): + return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + +# Instruction per taken branch +def IPTB(EV, level): + return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. 
+def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Fraction of Uops delivered by the DSB (decoded instructions cache) +def DSB_Coverage(EV, level): + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + +# Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) +def MLP(EV, level): + return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + +# Giga Floating Point Operations Per Second +def GFLOPs(EV, level): + return FLOP_Count(EV, level) / OneMillion / DurationTimeInSeconds(EV, level) / 1000 + +# Average Frequency Utilization relative nominal frequency +def Turbo_Utilization(EV, level): + return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) + +# Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store +def Page_Walks_Use(EV, level): + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + +# Per-thread actual clocks +def CLKS(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD", level) + +# Core actual clocks +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, level): + return DurationTimeInSeconds(EV, level) + +# Event groups + + +class Frontend_Bound: + name = "Frontend_Bound" + domain = "Slots" + area = "FE" + desc = """ +This category reflects slots where the Frontend of the processor undersupplies +its Backend. Frontend denotes the first portion of pipeline responsible to +fetch micro-ops which the Backend can execute. Within the Frontend, a branch +predictor predicts the next address to fetch, cache-lines are fetched from +memory, parsed into instructions, and lastly decoded into micro-ops. The +purpose of the Frontend cluster is to deliver uops to Backend whenever the +latter can accept them. For example, stalls due to instruction-cache misses +would be categorized under Frontend Bound.""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.2) + except ZeroDivisionError: + #print "Frontend_Bound zero division" + self.val = 0 + self.thresh = False + return self.val + +class Frontend_Latency: + name = "Frontend_Latency" + domain = "Slots" + area = "FE" + desc = """ +This metric represents slots fraction CPU was stalled due to Frontend latency +issues. For example, instruction-cache misses, iTLB misses or fetch stalls +after a branch misprediction are categorized under Frontend Latency. 
In such +cases the Frontend eventually delivers no uops for some period.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) + self.thresh = (self.val > 0.15) and self.parent.thresh + except ZeroDivisionError: + #print "Frontend_Latency zero division" + self.val = 0 + self.thresh = False + return self.val + +class ITLB_Misses: + name = "ITLB_Misses" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to instruction TLB +misses. Using large code pages may be considered here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "ITLB_Misses zero division" + self.val = 0 + self.thresh = False + return self.val + +class DSB_Switches: + name = "DSB_Switches" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to switches from +DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "DSB_Switches zero division" + self.val = 0 + self.thresh = False + return self.val + +class LCP: + name = "LCP" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to Length Changing +Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will +certainly avoid this.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "LCP zero division" + self.val = 0 + self.thresh = False + return self.val + +class MS_Switches: + name = "MS_Switches" + domain = "Clocks" + area = "FE" + desc = """ +This metric represents cycles fraction CPU was stalled due to switches of uop +delivery to the Microcode Sequencer (MS). Commonly used instructions are +optimized for delivery by the DSB or MITE pipelines. The MS is designated to +deliver long uop flows required by CISC instructions like CPUID, or uncommon +conditions like Floating Point Assists when dealing with Denormals.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) + self.thresh = (self.val > 0.05) and self.parent.thresh + except ZeroDivisionError: + #print "MS_Switches zero division" + self.val = 0 + self.thresh = False + return self.val + +class Frontend_Bandwidth: + name = "Frontend_Bandwidth" + domain = "Slots" + area = "FE" + desc = """ +This metric represents slots fraction CPU was stalled due to Frontend +bandwidth issues. For example, inefficiencies at the instruction decoders, or +code restrictions for caching in the DSB (decoded uops cache) are categorized +under Frontend Bandwidth. 
In such cases, the Frontend typically delivers non- +optimal amount of uops to the Backend.""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) + self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh + except ZeroDivisionError: + #print "Frontend_Bandwidth zero division" + self.val = 0 + self.thresh = False + return self.val + +class MITE: + name = "MITE" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "MITE zero division" + self.val = 0 + self.thresh = False + return self.val + +class DSB: + name = "DSB" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.3) and self.parent.thresh + except ZeroDivisionError: + #print "DSB zero division" + self.val = 0 + self.thresh = False + return self.val + +class LSD: + name = "LSD" + domain = "CoreClocks" + area = "FE" + desc = """ +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" + level = 3 + htoff = False + def compute(self, EV): + try: + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) + self.thresh = (self.val > 0.1) and self.parent.thresh + except ZeroDivisionError: + #print "LSD zero division" + self.val = 0 + self.thresh = False + return self.val + +class Bad_Speculation: + name = "Bad_Speculation" + domain = "Slots" + area = "BAD" + desc = """ +This category reflects slots wasted due to incorrect speculations, which +include slots used to allocate uops that do not eventually get retired and +slots for which allocation was blocked due to recovery from earlier incorrect +speculation. For example, wasted work due to miss-predicted branches are +categorized under Bad Speculation category""" + level = 1 + htoff = False + def compute(self, EV): + try: + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.1) + except ZeroDivisionError: + #print "Bad_Speculation zero division" + self.val = 0 + self.thresh = False + return self.val + +class Backend_Bound: + name = "Backend_Bound" + domain = "Slots" + area = "BE" + desc = """ +This category reflects slots where no uops are being delivered due to a lack +of required resources for accepting more uops in the Backend of the pipeline. 
+Backend describes the portion of the pipeline where the out-of-order scheduler
+dispatches ready uops into their respective execution units, and once
+completed these uops get retired according to program order. For example,
+stalls due to data-cache misses or stalls due to the divider unit being
+overloaded are both categorized under Backend Bound."""
+    level = 1
+    htoff = False
+    def compute(self, EV):
+        try:
+            self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV))
+            self.thresh = (self.val > 0.2)
+        except ZeroDivisionError:
+            #print "Backend_Bound zero division"
+            self.val = 0
+            self.thresh = False
+        return self.val
+
+class Retiring:
+    name = "Retiring"
+    domain = "Slots"
+    area = "RET"
+    desc = """
+This category reflects slots utilized by useful work i.e. allocated uops that
+eventually get retired. Ideally, all pipeline slots would be attributed to the
+Retiring category. Retiring of 100% would indicate the maximum 4 uops retired
+per cycle has been achieved. Maximizing Retiring typically increases the
+Instruction-Per-Cycle metric. Note that a high Retiring value does not
+necessarily mean there is no room for more performance. For example, Microcode
+assists are categorized under Retiring. They hurt performance and can often be
+avoided. A high Retiring value for non-vectorized code may be a good hint for
+programmer to consider vectorizing his code. Doing so essentially lets more
+computations be done without significantly increasing number of instructions
+thus improving the performance."""
+    level = 1
+    htoff = False
+    def compute(self, EV):
+        try:
+            self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 )
+            self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh
+        except ZeroDivisionError:
+            #print "Retiring zero division"
+            self.val = 0
+            self.thresh = False
+        return self.val
+
+class Base:
+    name = "Base"
+    domain = "Slots"
+    area = "RET"
+    desc = """
+This metric represents slots fraction where the CPU was retiring uops not
+originated from the microcode-sequencer. This correlates with total number of
+instructions used by the program. A uops-per-instruction ratio of 1 should be
+expected. While this is the most desirable of the top 4 categories, high
+values may still indicate areas for improvement. If possible focus on
+techniques that reduce instruction count or result in more efficient
+instructions generation such as vectorization."""
+    level = 2
+    htoff = False
+    def compute(self, EV):
+        try:
+            self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV )
+            self.thresh = (self.val > 0.6) and self.parent.thresh
+        except ZeroDivisionError:
+            #print "Base zero division"
+            self.val = 0
+            self.thresh = False
+        return self.val
+
+class Microcode_Sequencer:
+    name = "Microcode_Sequencer"
+    domain = "Slots"
+    area = "RET"
+    desc = """
+This metric represents slots fraction CPU was retiring uops fetched by the
+Microcode Sequencer (MS) ROM.
The MS is used for CISC instructions not fully +decoded by the default decoders (like repeat move strings), or by microcode +assists used to address some operation modes (like in Floating Point assists).""" + level = 2 + htoff = False + def compute(self, EV): + try: + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) + self.thresh = (self.val > 0.05) + except ZeroDivisionError: + #print "Microcode_Sequencer zero division" + self.val = 0 + self.thresh = False + return self.val + +class Metric_IPC: + name = "IPC" + desc = """ +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = IPC(EV, 0) + except ZeroDivisionError: + print "IPC zero division" + self.val = 0 + +class Metric_CPI: + name = "CPI" + desc = """ +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = CPI(EV, 0) + except ZeroDivisionError: + print "CPI zero division" + self.val = 0 + +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + +class Metric_UPI: + name = "UPI" + desc = """ +Uops Per Instruction""" + domain = "Metric" + maxval = 2 + + def compute(self, EV): + try: + self.val = UPI(EV, 0) + except ZeroDivisionError: + print "UPI zero division" + self.val = 0 + +class Metric_IPTB: + name = "IPTB" + desc = """ +Instruction per taken branch""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = IPTB(EV, 0) + except ZeroDivisionError: + print "IPTB zero division" + self.val = 0 + +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. 
Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = BPTB(EV, 0) + except ZeroDivisionError: + print "BPTB zero division" + self.val = 0 + +class Metric_DSB_Coverage: + name = "DSB_Coverage" + desc = """ +Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 + + def compute(self, EV): + try: + self.val = DSB_Coverage(EV, 0) + except ZeroDivisionError: + print "DSB_Coverage zero division" + self.val = 0 + +class Metric_MLP: + name = "MLP" + desc = """ +Memory-Level-Parallelism (average number of L1 miss demand load when there is +at least 1 such miss)""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = MLP(EV, 0) + except ZeroDivisionError: + print "MLP zero division" + self.val = 0 + +class Metric_GFLOPs: + name = "GFLOPs" + desc = """ +Giga Floating Point Operations Per Second""" + domain = "Metric" + maxval = 100 + + def compute(self, EV): + try: + self.val = GFLOPs(EV, 0) + except ZeroDivisionError: + print "GFLOPs zero division" + self.val = 0 + +class Metric_Turbo_Utilization: + name = "Turbo_Utilization" + desc = """ +Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = Turbo_Utilization(EV, 0) + except ZeroDivisionError: + print "Turbo_Utilization zero division" + self.val = 0 + +class Metric_Page_Walks_Use: + name = "Page_Walks_Use" + desc = """ +Fraction of cycles where the core's Page Walker is busy serving +iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = Page_Walks_Use(EV, 0) + except ZeroDivisionError: + print "Page_Walks_Use zero division" + self.val = 0 + +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + +class Metric_CLKS: + name = "CLKS" + desc = """ +Per-thread actual clocks""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = CLKS(EV, 0) + except ZeroDivisionError: + print "CLKS zero division" + self.val = 0 + +class Metric_CORE_CLKS: + name = "CORE_CLKS" + desc = """ +Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = Time(EV, 0) + except ZeroDivisionError: + print "Time zero division" + self.val = 0 + +# Schedule + + +class Setup: + def __init__(self, r): + o = dict() + n = Frontend_Bound() ; r.run(n) ; o["Frontend_Bound"] = n + n = Frontend_Latency() ; r.run(n) ; o["Frontend_Latency"] = n + n = ITLB_Misses() ; r.run(n) ; o["ITLB_Misses"] = n + n = DSB_Switches() ; r.run(n) ; o["DSB_Switches"] = n + n = LCP() ; r.run(n) ; o["LCP"] = n + n = MS_Switches() ; r.run(n) ; o["MS_Switches"] = n + n = Frontend_Bandwidth() ; r.run(n) ; o["Frontend_Bandwidth"] = n + n = MITE() ; r.run(n) ; o["MITE"] = n + n = DSB() ; r.run(n) ; o["DSB"] = n + n = LSD() ; r.run(n) ; o["LSD"] = n + n = Bad_Speculation() ; r.run(n) ; o["Bad_Speculation"] = n + n = Backend_Bound() ; r.run(n) ; o["Backend_Bound"] = n + n = Retiring() ; r.run(n) ; 
o["Retiring"] = n + n = Base() ; r.run(n) ; o["Base"] = n + n = Microcode_Sequencer() ; r.run(n) ; o["Microcode_Sequencer"] = n + + # parents + + o["Frontend_Latency"].parent = o["Frontend_Bound"] + o["ITLB_Misses"].parent = o["Frontend_Latency"] + o["DSB_Switches"].parent = o["Frontend_Latency"] + o["LCP"].parent = o["Frontend_Latency"] + o["MS_Switches"].parent = o["Frontend_Latency"] + o["Frontend_Bandwidth"].parent = o["Frontend_Bound"] + o["MITE"].parent = o["Frontend_Bandwidth"] + o["DSB"].parent = o["Frontend_Bandwidth"] + o["LSD"].parent = o["Frontend_Bandwidth"] + o["Base"].parent = o["Retiring"] + o["Microcode_Sequencer"].parent = o["Retiring"] + + # references between groups + + o["Frontend_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["Frontend_Bandwidth"].Frontend_Latency = o["Frontend_Latency"] + o["Backend_Bound"].Frontend_Bound = o["Frontend_Bound"] + o["Backend_Bound"].Bad_Speculation = o["Bad_Speculation"] + o["Backend_Bound"].Retiring = o["Retiring"] + o["Retiring"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Base"].Retiring = o["Retiring"] + o["Base"].Microcode_Sequencer = o["Microcode_Sequencer"] + + # siblings cross-tree + + o["Frontend_Bound"].sibling = None + o["Frontend_Latency"].sibling = None + o["ITLB_Misses"].sibling = None + o["DSB_Switches"].sibling = None + o["LCP"].sibling = None + o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["Frontend_Bandwidth"].sibling = None + o["MITE"].sibling = None + o["DSB"].sibling = None + o["LSD"].sibling = None + o["Bad_Speculation"].sibling = None + o["Backend_Bound"].sibling = None + o["Retiring"].sibling = None + o["Base"].sibling = None + o["Microcode_Sequencer"].sibling = o["MS_Switches"] + + # sampling events + + o["Frontend_Bound"].sample = [] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["DSB_Switches"].sample = [] + o["LCP"].sample = [] + o["MS_Switches"].sample = [] + o["Frontend_Bandwidth"].sample = [] + o["MITE"].sample = [] + o["DSB"].sample = [] + o["LSD"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] + o["Backend_Bound"].sample = [] + o["Retiring"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] + + # user visible metrics + + n = Metric_IPC() ; r.metric(n) + n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) + n = Metric_UPI() ; r.metric(n) + n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) + n = Metric_DSB_Coverage() ; r.metric(n) + n = Metric_MLP() ; r.metric(n) + n = Metric_GFLOPs() ; r.metric(n) + n = Metric_Turbo_Utilization() ; r.metric(n) + n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) + n = Metric_CLKS() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/snb_client_ratios.py b/snb_client_ratios.py index 875e347f..2c6747bb 100644 --- a/snb_client_ratios.py +++ b/snb_client_ratios.py @@ -1,57 +1,151 @@ # -# auto generated TopDown description for Intel 2nd gen Core (code named SandyBridge) +# auto generated TopDown 2.9 description for Intel 2nd gen Core (code named SandyBridge) # Please see http://ark.intel.com for more details on these CPUs. 
# +# References: +# http://halobates.de/blog/p/262 +# https://sites.google.com/site/analysismethods/yasin-pubs +# +smt_enabled = False # Constants -PipelineWidth = 4 -MEM_L3_WEIGHT = 7 -MEM_STLB_HIT_COST = 7 -MEM_SFB_COST = 13 -MEM_4KALIAS_COST = 7 -MEM_XSNP_HITM_COST = 60 -MEM_XSNP_HIT_COST = 43 -MEM_XSNP_NONE_COST = 29 -MS_SWITCHES_COST = 3 +Pipeline_Width = 4 +L2_Store_Latency = 9 +Mem_L3_Weight = 7 +Mem_STLB_Hit_Cost = 7 +Mem_SFB_Cost = 13 +Mem_4K_Alias_Cost = 7 +Mem_XSNP_HitM_Cost = 60 +MEM_XSNP_Hit_Cost = 43 +MEM_XSNP_None_Cost = 29 +Mem_Local_DRAM_Cost = 200 +Mem_Remote_DRAM_Cost = 310 +Mem_Remote_HitM_Cost = 200 +Mem_Remote_Fwd_Cost = 180 +MS_Switches_Cost = 3 +OneMillion = 1000000 +Energy_Unit = 15.6 # Aux. formulas -def CLKS(EV, level): - return EV("CPU_CLK_UNHALTED.THREAD", level) + # Floating Point Operations Count -def FLOP_count(EV, level): - return ( 1 *(EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level))+ 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level))+ 8 * EV("SIMD_FP_256.PACKED_SINGLE", level) ) -def RetireUopFraction(EV, level): +def FLOP_Count(EV, level): + return (1 *(EV("FP_COMP_OPS_EXE.X87", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", level) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", level)) + 2 * EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", level) + 4 *(EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", level) + EV("SIMD_FP_256.PACKED_DOUBLE", level)) + 8 * EV("SIMD_FP_256.PACKED_SINGLE", level)) + +def Recovery_Cycles(EV, level): + EV("INT_MISC.RECOVERY_CYCLES", level) + EV("INT_MISC.RECOVERY_CYCLES:amt1", level) + return (EV("INT_MISC.RECOVERY_CYCLES:amt1", level) / 2) if smt_enabled else EV("INT_MISC.RECOVERY_CYCLES", level) + +def L1D_Miss_Cycles(EV, level): + EV("L1D_PEND_MISS.PENDING_CYCLES", level) + EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) + return (EV("L1D_PEND_MISS.PENDING_CYCLES:amt1", level) / 2) if smt_enabled else EV("L1D_PEND_MISS.PENDING_CYCLES", level) + +def SQ_Full_Cycles(EV, level): + EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + return (EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) / 2) if smt_enabled else EV("OFFCORE_REQUESTS_BUFFER.SQ_FULL", level) + +def ITLB_Miss_Cycles(EV, level): + return (Mem_STLB_Hit_Cost * EV("ITLB_MISSES.STLB_HIT", level) + EV("ITLB_MISSES.WALK_DURATION", level)) + +def ORO_Demand_DRD_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", level)) , level ) + +def ORO_Demand_DRD_C6(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD:c6", level)) , level ) + +def ORO_Demand_RFO_C1(EV, level): + return EV(lambda EV , level : min(EV("CPU_CLK_UNHALTED.THREAD", level) , EV("OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", level)) , level ) + +def Store_L2_Hit_Cycles(EV, level): + return 0 + +def Cycles_False_Sharing_Client(EV, level): + return Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", level) + EV("OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_OTHER_CORE", level)) + +def Mem_Lock_St_Fraction(EV, level): + return EV("MEM_UOPS_RETIRED.LOCK_LOADS", level) / EV("MEM_UOPS_RETIRED.ALL_STORES", level) + +def Retire_Uop_Fraction(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("UOPS_ISSUED.ANY", level) + def SLOTS(EV, level): - return PipelineWidth * CLKS(EV, level) -# Instructions 
Per Cycle + return Pipeline_Width * CORE_CLKS(EV, level) + +def DurationTimeInSeconds(EV, level): + return EV("interval-ns", 0)* 1000000000 if EV("interval-ns", 0)* 1000000000 > 0 else(EV("interval-ns", 0)* 1000000 / 1000 ) + +# Instructions Per Cycle (per logical thread) def IPC(EV, level): return EV("INST_RETIRED.ANY", level) / CLKS(EV, level) + +# Cycles Per Instruction (threaded) +def CPI(EV, level): + return 1 / IPC(EV, level) + +# Instructions Per Cycle (per physical core) +def CoreIPC(EV, level): + return EV("INST_RETIRED.ANY", level) / CORE_CLKS(EV, level) + # Uops Per Instruction def UPI(EV, level): return EV("UOPS_RETIRED.RETIRE_SLOTS", level) / EV("INST_RETIRED.ANY", level) + # Instruction per taken branch -def InstPerTakenBranch(EV, level): +def IPTB(EV, level): return EV("INST_RETIRED.ANY", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + +# Branch instructions per taken branch. Can be used to approximate PGO-likelihood for non-loopy codes. +def BPTB(EV, level): + return EV("BR_INST_RETIRED.ALL_BRANCHES", level) / EV("BR_INST_RETIRED.NEAR_TAKEN", level) + # Fraction of Uops delivered by the DSB (decoded instructions cache) -def DSBCoverage(EV, level): - return ( EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level))/(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level) ) -# Memory-Level-Parallelism (avg L1 miss demand load when there is at least 1 such miss) +def DSB_Coverage(EV, level): + return (EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level)) /(EV("IDQ.DSB_UOPS", level) + EV("LSD.UOPS", level) + EV("IDQ.MITE_UOPS", level) + EV("IDQ.MS_UOPS", level)) + +# Memory-Level-Parallelism (average number of L1 miss demand load when there is at least 1 such miss) def MLP(EV, level): - return EV("L1D_PEND_MISS.PENDING", level) / EV("L1D_PEND_MISS.PENDING_CYCLES", level) + return EV("L1D_PEND_MISS.PENDING", level) / L1D_Miss_Cycles(EV, level) + +# Giga Floating Point Operations Per Second +def GFLOPs(EV, level): + return FLOP_Count(EV, level) / OneMillion / DurationTimeInSeconds(EV, level) / 1000 + # Average Frequency Utilization relative nominal frequency -def TurboUtilization(EV, level): +def Turbo_Utilization(EV, level): return CLKS(EV, level) / EV("CPU_CLK_UNHALTED.REF_TSC", level) +# Fraction of cycles where the core's Page Walker is busy serving iTLB/Load/Store +def Page_Walks_Use(EV, level): + return (EV("ITLB_MISSES.WALK_DURATION", level) + EV("DTLB_LOAD_MISSES.WALK_DURATION", level) + EV("DTLB_STORE_MISSES.WALK_DURATION", level)) / CORE_CLKS(EV, level) + +# PerfMon Event Multiplexing accuracy indicator +def MUX(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD_P", level) / EV("CPU_CLK_UNHALTED.THREAD", level) + +# Per-thread actual clocks +def CLKS(EV, level): + return EV("CPU_CLK_UNHALTED.THREAD", level) + +# Core actual clocks +def CORE_CLKS(EV, level): + EV("CPU_CLK_UNHALTED.THREAD:amt1", level) + return (EV("CPU_CLK_UNHALTED.THREAD:amt1", level) / 2) if smt_enabled else CLKS(EV, level) + +# Run duration time in seconds +def Time(EV, level): + return DurationTimeInSeconds(EV, level) + # Event groups -class FrontendBound: - name = "FrontendBound" +class Frontend_Bound: + name = "Frontend_Bound" domain = "Slots" area = "FE" desc = """ @@ -64,68 +158,72 @@ class FrontendBound: latter can accept them. 
For example, stalls due to instruction-cache misses would be categorized under Frontend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1)/ SLOTS(EV, 1 ) + self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "FrontendBound zero division" + #print "Frontend_Bound zero division" self.val = 0 self.thresh = False return self.val -class FrontendLatency: - name = "Frontend Latency" +class Frontend_Latency: + name = "Frontend_Latency" domain = "Slots" area = "FE" desc = """ This metric represents slots fraction CPU was stalled due to Frontend latency issues. For example, instruction-cache misses, iTLB misses or fetch stalls -after a branch missprediction are categorized under Frontend Latency. In such +after a branch misprediction are categorized under Frontend Latency. In such cases the Frontend eventually delivers no uops for some period.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = PipelineWidth * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2)/ SLOTS(EV, 2 ) + self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: - #print "FrontendLatency zero division" + #print "Frontend_Latency zero division" self.val = 0 self.thresh = False return self.val -class ITLBmisses: - name = "ITLB misses" +class ITLB_Misses: + name = "ITLB_Misses" domain = "Clocks" area = "FE" desc = """ This metric represents cycles fraction CPU was stalled due to instruction TLB misses. Using large code pages may be considered here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ITLB_MISSES.WALK_DURATION", 3)/ CLKS(EV, 3 ) + self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ITLBmisses zero division" + #print "ITLB_Misses zero division" self.val = 0 self.thresh = False return self.val -class DSBswitches: - name = "DSB switches" +class DSB_Switches: + name = "DSB_Switches" domain = "Clocks" area = "FE" desc = """ This metric represents cycles fraction CPU was stalled due to switches from DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3)/ CLKS(EV, 3 ) + self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DSBswitches zero division" + #print "DSB_Switches zero division" self.val = 0 self.thresh = False return self.val @@ -139,9 +237,10 @@ class LCP: Prefixes (LCPs). 
Using proper compiler flags or Intel Compiler by default will certainly avoid this.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = EV("ILD_STALL.LCP", 3)/ CLKS(EV, 3 ) + self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: #print "LCP zero division" @@ -149,8 +248,8 @@ def compute(self, EV): self.thresh = False return self.val -class MSswitches: - name = "MS switches" +class MS_Switches: + name = "MS_Switches" domain = "Clocks" area = "FE" desc = """ @@ -160,18 +259,19 @@ class MSswitches: deliver long uop flows required by CISC instructions like CPUID, or uncommon conditions like Floating Point Assists when dealing with Denormals.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = MS_SWITCHES_COST * EV("IDQ.MS_SWITCHES", 3)/ CLKS(EV, 3 ) + self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MSswitches zero division" + #print "MS_Switches zero division" self.val = 0 self.thresh = False return self.val -class FrontendBandwidth: - name = "Frontend Bandwidth" +class Frontend_Bandwidth: + name = "Frontend_Bandwidth" domain = "Slots" area = "FE" desc = """ @@ -181,28 +281,30 @@ class FrontendBandwidth: under Frontend Bandwidth. In such cases, the Frontend typically delivers non- optimal amount of uops to the Backend.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.FrontendBound.compute(EV)- self.FrontendLatency.compute(EV ) + self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: - #print "FrontendBandwidth zero division" + #print "Frontend_Bandwidth zero division" self.val = 0 self.thresh = False return self.val class MITE: name = "MITE" - domain = "Clocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -the MITE fetch pipeline. For example, inefficiencies in the instruction -decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to the MITE fetch pipeline. For example, inefficiencies in the +instruction decoders are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CLKS(EV, 3 ) + self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "MITE zero division" @@ -212,17 +314,18 @@ def compute(self, EV): class DSB: name = "DSB" - domain = "Clocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -DSB (decoded uop cache) fetch pipeline. For example, inefficient utlilization -of the DSB cache structure or bank conflict when reading from it, are -categorized here.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to DSB (decoded uop cache) fetch pipeline. 
For example, inefficient +utilization of the DSB cache structure or bank conflict when reading from it, +are categorized here.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3)- EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CLKS(EV, 3 ) + self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: #print "DSB zero division" @@ -232,18 +335,19 @@ def compute(self, EV): class LSD: name = "LSD" - domain = "Clocks" + domain = "CoreClocks" area = "FE" desc = """ -This metric represents cycles fraction in which CPU was likely limited due to -LSD (Loop Stream Detector) unit. LSD typically does well sustaining Uop -supply. However, in some rare cases, optimal uop-delivery could not be reached -for small loops whose size (in terms of number of uops) does not suit well the -LSD structure.""" +This metric represents Core cycles fraction in which CPU was likely limited +due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining +Uop supply. However, in some rare cases, optimal uop-delivery could not be +reached for small loops whose size (in terms of number of uops) does not suit +well the LSD structure.""" level = 3 + htoff = False def compute(self, EV): try: - self.val = ( EV("LSD.CYCLES_ACTIVE", 3)- EV("LSD.CYCLES_4_UOPS", 3)) / CLKS(EV, 3 ) + self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: #print "LSD zero division" @@ -251,8 +355,8 @@ def compute(self, EV): self.thresh = False return self.val -class BadSpeculation: - name = "BadSpeculation" +class Bad_Speculation: + name = "Bad_Speculation" domain = "Slots" area = "BAD" desc = """ @@ -262,12 +366,13 @@ class BadSpeculation: speculation. 
For example, wasted work due to miss-predicted branches are categorized under Bad Speculation category""" level = 1 + htoff = False def compute(self, EV): try: - self.val = ( EV("UOPS_ISSUED.ANY", 1)- EV("UOPS_RETIRED.RETIRE_SLOTS", 1)+ PipelineWidth * EV("INT_MISC.RECOVERY_CYCLES", 1)) / SLOTS(EV, 1 ) + self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: - #print "BadSpeculation zero division" + #print "Bad_Speculation zero division" self.val = 0 self.thresh = False return self.val @@ -285,9 +390,10 @@ class Backend_Bound: stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = 1 -(self.FrontendBound.compute(EV)+ self.BadSpeculation.compute(EV)+ self.Retiring.compute(EV)) + self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: #print "Backend_Bound zero division" @@ -312,40 +418,43 @@ class Retiring: computations be done without significantly increasing number of instructions thus improving the performance.""" level = 1 + htoff = False def compute(self, EV): try: - self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1)/ SLOTS(EV, 1 ) - self.thresh = (self.val > 0.7) | self.MicroSequencer.thresh + self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) + self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: #print "Retiring zero division" self.val = 0 self.thresh = False return self.val -class BASE: - name = "BASE" +class Base: + name = "Base" domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops not originated -from the microcode-sequencer. This correlates with total number of +This metric represents slots fraction where the CPU was retiring uops not +originated from the microcode-sequencer. This correlates with total number of instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. A high Retiring value for non-vectorized code is typically a good -hint for programmer to pursue vectorizing his code, which can reduce -instructions hence this bucket.""" +expected. While this is the most desirable of the top 4 categories, high +values may still indicate areas for improvement. 
If possible focus on +techniques that reduce instruction count or result in more efficient +instructions generation such as vectorization.""" level = 2 + htoff = False def compute(self, EV): try: - self.val = self.Retiring.compute(EV)- self.MicroSequencer.compute(EV ) + self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: - #print "BASE zero division" + #print "Base zero division" self.val = 0 self.thresh = False return self.val -class MicroSequencer: - name = "MicroSequencer" +class Microcode_Sequencer: + name = "Microcode_Sequencer" domain = "Slots" area = "RET" desc = """ @@ -354,12 +463,13 @@ class MicroSequencer: decoded by the default decoders (like repeat move strings), or by microcode assists used to address some operation modes (like in Floating Point assists).""" level = 2 + htoff = False def compute(self, EV): try: - self.val = RetireUopFraction(EV, 2)* EV("IDQ.MS_UOPS", 2)/ SLOTS(EV, 2 ) + self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: - #print "MicroSequencer zero division" + #print "Microcode_Sequencer zero division" self.val = 0 self.thresh = False return self.val @@ -367,7 +477,9 @@ def compute(self, EV): class Metric_IPC: name = "IPC" desc = """ -Instructions Per Cycle""" +Instructions Per Cycle (per logical thread)""" + domain = "Metric" + maxval = 5 def compute(self, EV): try: @@ -376,10 +488,40 @@ def compute(self, EV): print "IPC zero division" self.val = 0 +class Metric_CPI: + name = "CPI" + desc = """ +Cycles Per Instruction (threaded)""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = CPI(EV, 0) + except ZeroDivisionError: + print "CPI zero division" + self.val = 0 + +class Metric_CoreIPC: + name = "CoreIPC" + desc = """ +Instructions Per Cycle (per physical core)""" + domain = "Metric" + maxval = 5 + + def compute(self, EV): + try: + self.val = CoreIPC(EV, 0) + except ZeroDivisionError: + print "CoreIPC zero division" + self.val = 0 + class Metric_UPI: name = "UPI" desc = """ Uops Per Instruction""" + domain = "Metric" + maxval = 2 def compute(self, EV): try: @@ -388,35 +530,56 @@ def compute(self, EV): print "UPI zero division" self.val = 0 -class Metric_InstPerTakenBranch: - name = "InstPerTakenBranch" +class Metric_IPTB: + name = "IPTB" desc = """ Instruction per taken branch""" + domain = "Metric" + maxval = 0 def compute(self, EV): try: - self.val = InstPerTakenBranch(EV, 0) + self.val = IPTB(EV, 0) except ZeroDivisionError: - print "InstPerTakenBranch zero division" + print "IPTB zero division" self.val = 0 -class Metric_DSBCoverage: - name = "DSBCoverage" +class Metric_BPTB: + name = "BPTB" + desc = """ +Branch instructions per taken branch. 
Can be used to approximate PGO- +likelihood for non-loopy codes.""" + domain = "Metric" + maxval = 0 + + def compute(self, EV): + try: + self.val = BPTB(EV, 0) + except ZeroDivisionError: + print "BPTB zero division" + self.val = 0 + +class Metric_DSB_Coverage: + name = "DSB_Coverage" desc = """ Fraction of Uops delivered by the DSB (decoded instructions cache)""" + domain = "Metric" + maxval = 1 def compute(self, EV): try: - self.val = DSBCoverage(EV, 0) + self.val = DSB_Coverage(EV, 0) except ZeroDivisionError: - print "DSBCoverage zero division" + print "DSB_Coverage zero division" self.val = 0 class Metric_MLP: name = "MLP" desc = """ -Memory-Level-Parallelism (avg L1 miss demand load when there is at least 1 -such miss)""" +Memory-Level-Parallelism (average number of L1 miss demand load when there is +at least 1 such miss)""" + domain = "Metric" + maxval = 10 def compute(self, EV): try: @@ -425,16 +588,103 @@ def compute(self, EV): print "MLP zero division" self.val = 0 -class Metric_TurboUtilization: - name = "TurboUtilization" +class Metric_GFLOPs: + name = "GFLOPs" + desc = """ +Giga Floating Point Operations Per Second""" + domain = "Metric" + maxval = 100 + + def compute(self, EV): + try: + self.val = GFLOPs(EV, 0) + except ZeroDivisionError: + print "GFLOPs zero division" + self.val = 0 + +class Metric_Turbo_Utilization: + name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" + domain = "Metric" + maxval = 10 + + def compute(self, EV): + try: + self.val = Turbo_Utilization(EV, 0) + except ZeroDivisionError: + print "Turbo_Utilization zero division" + self.val = 0 + +class Metric_Page_Walks_Use: + name = "Page_Walks_Use" + desc = """ +Fraction of cycles where the core's Page Walker is busy serving +iTLB/Load/Store""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = Page_Walks_Use(EV, 0) + except ZeroDivisionError: + print "Page_Walks_Use zero division" + self.val = 0 + +class Metric_MUX: + name = "MUX" + desc = """ +PerfMon Event Multiplexing accuracy indicator""" + domain = "Clocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = MUX(EV, 0) + except ZeroDivisionError: + print "MUX zero division" + self.val = 0 + +class Metric_CLKS: + name = "CLKS" + desc = """ +Per-thread actual clocks""" + domain = "Count" + maxval = 0 + + def compute(self, EV): + try: + self.val = CLKS(EV, 0) + except ZeroDivisionError: + print "CLKS zero division" + self.val = 0 + +class Metric_CORE_CLKS: + name = "CORE_CLKS" + desc = """ +Core actual clocks""" + domain = "CoreClocks" + maxval = 0 + + def compute(self, EV): + try: + self.val = CORE_CLKS(EV, 0) + except ZeroDivisionError: + print "CORE_CLKS zero division" + self.val = 0 + +class Metric_Time: + name = "Time" + desc = """ +Run duration time in seconds""" + domain = "Count" + maxval = 0 def compute(self, EV): try: - self.val = TurboUtilization(EV, 0) + self.val = Time(EV, 0) except ZeroDivisionError: - print "TurboUtilization zero division" + print "Time zero division" self.val = 0 # Schedule @@ -443,89 +693,97 @@ def compute(self, EV): class Setup: def __init__(self, r): o = dict() - n = FrontendBound() ; r.run(n) ; o["FrontendBound"] = n - n = FrontendLatency() ; r.run(n) ; o["FrontendLatency"] = n - n = ITLBmisses() ; r.run(n) ; o["ITLBmisses"] = n - n = DSBswitches() ; r.run(n) ; o["DSBswitches"] = n + n = Frontend_Bound() ; r.run(n) ; o["Frontend_Bound"] = n + n = Frontend_Latency() ; r.run(n) ; o["Frontend_Latency"] = n + n = ITLB_Misses() ; r.run(n) 
; o["ITLB_Misses"] = n + n = DSB_Switches() ; r.run(n) ; o["DSB_Switches"] = n n = LCP() ; r.run(n) ; o["LCP"] = n - n = MSswitches() ; r.run(n) ; o["MSswitches"] = n - n = FrontendBandwidth() ; r.run(n) ; o["FrontendBandwidth"] = n + n = MS_Switches() ; r.run(n) ; o["MS_Switches"] = n + n = Frontend_Bandwidth() ; r.run(n) ; o["Frontend_Bandwidth"] = n n = MITE() ; r.run(n) ; o["MITE"] = n n = DSB() ; r.run(n) ; o["DSB"] = n n = LSD() ; r.run(n) ; o["LSD"] = n - n = BadSpeculation() ; r.run(n) ; o["BadSpeculation"] = n + n = Bad_Speculation() ; r.run(n) ; o["Bad_Speculation"] = n n = Backend_Bound() ; r.run(n) ; o["Backend_Bound"] = n n = Retiring() ; r.run(n) ; o["Retiring"] = n - n = BASE() ; r.run(n) ; o["BASE"] = n - n = MicroSequencer() ; r.run(n) ; o["MicroSequencer"] = n + n = Base() ; r.run(n) ; o["Base"] = n + n = Microcode_Sequencer() ; r.run(n) ; o["Microcode_Sequencer"] = n # parents - o["FrontendLatency"].parent = o["FrontendBound"] - o["ITLBmisses"].parent = o["FrontendLatency"] - o["DSBswitches"].parent = o["FrontendLatency"] - o["LCP"].parent = o["FrontendLatency"] - o["MSswitches"].parent = o["FrontendLatency"] - o["FrontendBandwidth"].parent = o["FrontendBound"] - o["MITE"].parent = o["FrontendBandwidth"] - o["DSB"].parent = o["FrontendBandwidth"] - o["LSD"].parent = o["FrontendBandwidth"] - o["BASE"].parent = o["Retiring"] - o["MicroSequencer"].parent = o["Retiring"] + o["Frontend_Latency"].parent = o["Frontend_Bound"] + o["ITLB_Misses"].parent = o["Frontend_Latency"] + o["DSB_Switches"].parent = o["Frontend_Latency"] + o["LCP"].parent = o["Frontend_Latency"] + o["MS_Switches"].parent = o["Frontend_Latency"] + o["Frontend_Bandwidth"].parent = o["Frontend_Bound"] + o["MITE"].parent = o["Frontend_Bandwidth"] + o["DSB"].parent = o["Frontend_Bandwidth"] + o["LSD"].parent = o["Frontend_Bandwidth"] + o["Base"].parent = o["Retiring"] + o["Microcode_Sequencer"].parent = o["Retiring"] # references between groups - o["FrontendBandwidth"].FrontendBound = o["FrontendBound"] - o["FrontendBandwidth"].FrontendLatency = o["FrontendLatency"] - o["Backend_Bound"].FrontendBound = o["FrontendBound"] - o["Backend_Bound"].BadSpeculation = o["BadSpeculation"] + o["Frontend_Bandwidth"].Frontend_Bound = o["Frontend_Bound"] + o["Frontend_Bandwidth"].Frontend_Latency = o["Frontend_Latency"] + o["Backend_Bound"].Frontend_Bound = o["Frontend_Bound"] + o["Backend_Bound"].Bad_Speculation = o["Bad_Speculation"] o["Backend_Bound"].Retiring = o["Retiring"] - o["Retiring"].MicroSequencer = o["MicroSequencer"] - o["BASE"].Retiring = o["Retiring"] - o["BASE"].MicroSequencer = o["MicroSequencer"] + o["Retiring"].Microcode_Sequencer = o["Microcode_Sequencer"] + o["Base"].Retiring = o["Retiring"] + o["Base"].Microcode_Sequencer = o["Microcode_Sequencer"] # siblings cross-tree - o["FrontendBound"].sibling = None - o["FrontendLatency"].sibling = None - o["ITLBmisses"].sibling = None - o["DSBswitches"].sibling = None + o["Frontend_Bound"].sibling = None + o["Frontend_Latency"].sibling = None + o["ITLB_Misses"].sibling = None + o["DSB_Switches"].sibling = None o["LCP"].sibling = None - o["MSswitches"].sibling = o["MicroSequencer"] - o["FrontendBandwidth"].sibling = None + o["MS_Switches"].sibling = o["Microcode_Sequencer"] + o["Frontend_Bandwidth"].sibling = None o["MITE"].sibling = None o["DSB"].sibling = None o["LSD"].sibling = None - #o["BadSpeculation"].sibling = o["BranchResteers"] - o["BadSpeculation"].sibling = None + o["Bad_Speculation"].sibling = None o["Backend_Bound"].sibling = None 
o["Retiring"].sibling = None - o["BASE"].sibling = None - o["MicroSequencer"].sibling = o["MSswitches"] + o["Base"].sibling = None + o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events (experimential) + # sampling events - o["FrontendBound"].sample = [] - o["FrontendLatency"].sample = [] - o["ITLBmisses"].sample = [] - o["DSBswitches"].sample = [] + o["Frontend_Bound"].sample = [] + o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] + o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] + o["DSB_Switches"].sample = [] o["LCP"].sample = [] - o["MSswitches"].sample = [] - o["FrontendBandwidth"].sample = [] + o["MS_Switches"].sample = [] + o["Frontend_Bandwidth"].sample = [] o["MITE"].sample = [] o["DSB"].sample = [] o["LSD"].sample = [] - o["BadSpeculation"].sample = [] + o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] o["Backend_Bound"].sample = [] o["Retiring"].sample = [] - o["BASE"].sample = [] - o["MicroSequencer"].sample = [] + o["Base"].sample = ['INST_RETIRED.PREC_DIST'] + o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] # user visible metrics n = Metric_IPC() ; r.metric(n) + n = Metric_CPI() ; r.metric(n) + n = Metric_CoreIPC() ; r.metric(n) n = Metric_UPI() ; r.metric(n) - n = Metric_InstPerTakenBranch() ; r.metric(n) - n = Metric_DSBCoverage() ; r.metric(n) + n = Metric_IPTB() ; r.metric(n) + n = Metric_BPTB() ; r.metric(n) + n = Metric_DSB_Coverage() ; r.metric(n) n = Metric_MLP() ; r.metric(n) - n = Metric_TurboUtilization() ; r.metric(n) + n = Metric_GFLOPs() ; r.metric(n) + n = Metric_Turbo_Utilization() ; r.metric(n) + n = Metric_Page_Walks_Use() ; r.metric(n) + n = Metric_MUX() ; r.metric(n) + n = Metric_CLKS() ; r.metric(n) + n = Metric_CORE_CLKS() ; r.metric(n) + n = Metric_Time() ; r.metric(n) diff --git a/tl-tester b/tl-tester index b75c520e..62bb7b51 100755 --- a/tl-tester +++ b/tl-tester @@ -33,10 +33,15 @@ EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py -d -l4 $LOAD EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py -v -d -l4 $LOAD EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py -x, -v -d -l4 $LOAD EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py --metrics -x, -v -d -l4 $LOAD -EVENTMAP=${cpus[ivt]} FORCECPU=ivt $WRAP ./toplev.py -d --all --sample --kernel $LOAD -EVENTMAP=${cpus[snb]} FORCECPU=snb $WRAP ./toplev.py -d --all --sample $LOAD -EVENTMAP=${cpus[jkt]} FORCECPU=jkt $WRAP ./toplev.py -d --all --sample $LOAD -EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --all --sample $LOAD +EVENTMAP=${cpus[ivt]} FORCECPU=ivt $WRAP ./toplev.py -d --all --kernel $LOAD +EVENTMAP=${cpus[snb]} FORCECPU=snb $WRAP ./toplev.py -d --all $LOAD +EVENTMAP=${cpus[jkt]} FORCECPU=jkt $WRAP ./toplev.py -d --all -l5 $LOAD +EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --all $LOAD +EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --all -l5 $LOAD +# fixme: need event list with missing events +#EVENTMAP=${cpus[hsx]} FORCECPU=hsx $WRAP ./toplev.py -d --all -l5 $LOAD +#EVENTMAP=${cpus[hsx]} FORCECPU=hsx $WRAP ./toplev.py -d --all $LOAD +EVENTMAP=${cpus[ivt]} FORCECPU=ivt $WRAP ./toplev.py -d --all $LOAD EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --metrics -l4 $LOAD EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --metrics --no-multiplex -l4 $LOAD EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d --power -l4 $LOAD @@ -55,9 +60,9 @@ EVENTMAP=${cpus[slm]} FORCECPU=slm $WRAP ./toplev.py -d --all $LOAD # need new perf # test other perf output formats -EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP 
./toplev.py -d -l4 -I 1000 -a --per-core sleep 1 -EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d -l4 -I 1000 -a --per-socket sleep 1 -EVENTMAP=${cpus[hsw]} FORCECPU=hsw $WRAP ./toplev.py -d -l4 -I 1000 -a -A sleep 1 +EVENTMAP=${cpus[snb]} FORCECPU=snb $WRAP ./toplev.py -d -l4 -I 1000 -a --per-core sleep 1 +EVENTMAP=${cpus[snb]} FORCECPU=snb $WRAP ./toplev.py -d -l4 -I 1000 -a --per-socket sleep 1 +EVENTMAP=${cpus[snb]} FORCECPU=snb $WRAP ./toplev.py -d -l4 -I 1000 -a -A sleep 1 trap "" ERR 0 diff --git a/toplev.py b/toplev.py index 182dd55c..ac5dac60 100755 --- a/toplev.py +++ b/toplev.py @@ -18,7 +18,7 @@ # Handles a variety of perf versions, but older ones have various limitations. import sys, os, re, itertools, textwrap, platform, pty, subprocess -import exceptions, argparse, time +import exceptions, argparse, time, types from collections import defaultdict, Counter #sys.path.append("../pmu-tools") import ocperf @@ -31,9 +31,10 @@ ("hsw", (60, 70, 69 )), ("hsx", (63, )), ("slm", (55, 77)), + ("bdw", (61, )), ) -tsx_cpus = ("hsw", "hsx") +tsx_cpus = ("hsw", "hsx", "bdw") ingroup_events = frozenset(["cycles", "instructions", "ref-cycles", "cpu/event=0x3c,umask=0x00,any=1/", @@ -164,8 +165,6 @@ def event_group(evlist): type=int, default=1) p.add_argument('--detailed', '-d', help=argparse.SUPPRESS, action='store_true') p.add_argument('--metrics', '-m', help="Print extra metrics", action='store_true') -p.add_argument('--sample', '-S', help="Suggest commands to sample for bottlenecks (experimental)", - action='store_true') p.add_argument('--raw', help="Print raw values", action='store_true') p.add_argument('--sw', help="Measure perf Linux metrics", action='store_true') p.add_argument('--cpu', '-C', help=argparse.SUPPRESS) @@ -321,9 +320,11 @@ def __init__(self): self.has_tsx = False self.freq = 0.0 self.siblings = {} + self.threads = 0 forced_cpu = self.force_cpu() self.force_counters() cores = Counter() + sockets = Counter() self.coreids = defaultdict(list) self.cputocore = {} with open("/proc/cpuinfo", "r") as f: @@ -349,14 +350,16 @@ def __init__(self): self.freq = float(m.group(1)) elif (n[0], n[1]) == ("physical", "id"): physid = int(n[3]) + sockets[physid] += 1 elif (n[0], n[1]) == ("core", "id"): coreid = int(n[3]) key = (physid, coreid,) cores[key] += 1 - if cores[key] > 1: + self.threads = max(self.threads, cores[key]) + if self.threads > 1: self.ht = True - self.coreids[coreid].append(cpunum) - self.cputocore[cpunum] = coreid + self.coreids[key].append(cpunum) + self.cputocore[cpunum] = key elif n[0] == "flags": ok += 1 self.has_tsx = "rtm" in n @@ -372,6 +375,7 @@ def __init__(self): self.counters = 4 else: self.counters = 8 + self.sockets = len(sockets.keys()) cpu = CPU() @@ -490,9 +494,27 @@ def set_interval(env, d): if args.raw: print "interval-ns val", env['interval-ns'] +def key_to_coreid(k): + x = cpu.cputocore[int(k)] + return x[0] * 1000 + x[1] + +def core_fmt(core): + if cpu.sockets > 1: + return "S%d-C%d" % (core / 1000, core % 1000,) + return "C%d" % (core % 1000,) + def print_keys(runner, res, rev, out, interval, env): - for j in sorted(res.keys()): - runner.print_res(res[j], rev[j], out, interval, j, env) + if need_any: + # collect counts from all threads of cores as lists + # this way the model can access all threads individually + keys = sorted(res.keys(), key = key_to_coreid) + for core, citer in itertools.groupby(keys, key_to_coreid): + cpus = list(citer) + r = list(itertools.izip(*[res[j] for j in cpus])) + runner.print_res(r, rev[cpus[0]], out, interval, 
core_fmt(core), env) + else: + for j in sorted(res.keys()): + runner.print_res(res[j], rev[j], out, interval, j, env) def execute_no_multiplex(runner, out, rest): if args.interval: # XXX @@ -503,6 +525,8 @@ def execute_no_multiplex(runner, out, rest): rev = defaultdict(list) env = dict() for g in groups: + if len(g) == 0: + continue print "RUN #%d of %d" % (n, len(groups)) ret, res, rev, interval = do_execute(runner, g, out, rest, res, rev, env) n += 1 @@ -511,7 +535,9 @@ def execute_no_multiplex(runner, out, rest): def execute(runner, out, rest): env = dict() - ret, res, rev, interval = do_execute(runner, ",".join(runner.evgroups), out, rest, + print "evgroups", runner.evgroups + ret, res, rev, interval = do_execute(runner, ",".join(filter(lambda x: len(x) > 0, runner.evgroups)), + out, rest, defaultdict(list), defaultdict(list), env) @@ -599,6 +625,8 @@ def do_execute(runner, evstr, out, rest, res, rev, env): return ret, res, rev, interval def ev_append(ev, level, obj): + if isinstance(ev, types.LambdaType): + return ev(lambda ev, level: ev_append(ev, level, obj), level) if ev in nonperf_events: return 99 if not (ev, level) in obj.evlevels: @@ -610,7 +638,7 @@ def ev_append(ev, level, obj): return 99 def canon_event(e): - m = re.match(r"(.*):(.*)", e) + m = re.match(r"(.*?):(.*)", e) if m: e = m.group(1) if e.upper() in fixed_counters: @@ -629,13 +657,33 @@ def event_rmap(e): n = fixes[n.upper()].lower() return n -def lookup_res(res, rev, ev, obj, env, level): +def lookup_res(res, rev, ev, obj, env, level, cpuoff = -1): if ev in env: return env[ev] + # + # when the model passed in a lambda run the function for each logical cpu + # (by resolving its EVs to only that CPU) + # and then sum up. This is needed for the workarounds to make various + # per thread counters at least as big as unhalted cycles. + # + # otherwise we always sum up. + # + if isinstance(ev, types.LambdaType): + n = 0 + for off in range(cpu.threads): # XXX + n += ev(lambda ev, level: lookup_res(res, rev, ev, obj, env, level, off), level) + return n + index = obj.res_map[(ev, level)] rev = event_rmap(rev[index]) assert (rev == canon_event(ev) or (ev in event_fixes and canon_event(event_fixes[ev]) == rev)) + + if isinstance(res[index], types.TupleType): + if cpuoff == -1: + return sum(res[index]) + else: + return res[index][cpuoff] return res[index] def add_key(k, x, y): @@ -741,7 +789,6 @@ def split_groups(self, objl, evlev): self.add(objl, raw_events(get_names(evl)), evl) def add(self, objl, evnum, evlev): - assert evlev # does not fit into a group. if len(set(evnum) - add_filter(ingroup_events)) > cpu.counters: self.split_groups(objl, evlev) @@ -777,7 +824,7 @@ def schedule(self): # try to fit each objects events into groups # that fit into the available CPU counters for obj in solist: - if obj.evnum[0] in outgroup_events: + if len(obj.evnum) == 0 or obj.evnum[0] in outgroup_events: self.add([obj], obj.evnum, obj.evlevels) continue # try adding another object to the current group @@ -827,20 +874,25 @@ def print_res(self, res, rev, out, timestamp, title, env): val = obj.val if not obj.thresh and not dont_hide: val = 0.0 + disclaimer = "" + if 'htoff' in obj.__dict__ and obj.htoff and obj.thresh and cpu.ht: + disclaimer = """ +Warning: Hyper Threading may lead to incorrect measurements for this node. 
+Suggest to re-measure with HT off.""" desc = obj.desc[1:].replace("\n", "\n\t") if obj.metric: out.metric(obj.area if 'area' in obj.__class__.__dict__ else None, obj.name, val, timestamp, - desc, + desc + disclaimer, title, obj.unit if 'unit' in obj.__class__.__dict__ else "metric") else: out.p(obj.area if 'area' in obj.__class__.__dict__ else None, full_name(obj), val, timestamp, "below" if not obj.thresh else "above", - desc, + desc + disclaimer, title, - sample_desc(obj.sample) if args.sample and obj.sample else "") + sample_desc(obj.sample) if obj.sample else "") def sysctl(name): try: @@ -879,12 +931,27 @@ def ht_warning(): ivb_server_ratios.smt_enabled = cpu.ht need_any = cpu.ht ivb_server_ratios.Setup(runner) -elif cpu.cpu == "snb" and detailed_model: +elif cpu.cpu == "snb": import snb_client_ratios snb_client_ratios.Setup(runner) -elif cpu.cpu == "hsw" and detailed_model: +elif cpu.cpu == "jkt": + import jkt_server_ratios + jkt_server_ratios.Setup(runner) +elif cpu.cpu == "hsw": import hsw_client_ratios + hsw_client_ratios.smt_enabled = cpu.ht + need_any = cpu.ht hsw_client_ratios.Setup(runner) +elif cpu.cpu == "hsx": + import hsx_server_ratios + hsx_server_ratios.smt_enabled = cpu.ht + need_any = cpu.ht + hsx_server_ratios.Setup(runner) +elif cpu.cpu == "bdw": + import bdw_client_ratios + bdw_client_ratios.smt_enabled = cpu.ht + need_any = cpu.ht + bdw_client_ratios.Setup(runner) elif cpu.cpu == "slm": import slm_ratios slm_ratios.Setup(runner) @@ -892,8 +959,6 @@ def ht_warning(): ht_warning() if detailed_model: print >>sys.stderr, "Sorry, no detailed model for your CPU. Only Level 1 supported." - if cpu.cpu == "jkt": - print >>sys.stderr, "Consider using FORCECPU=snb" import simple_ratios simple_ratios.Setup(runner) @@ -928,6 +993,8 @@ def setup_with_metrics(p, runner): print "Running in HyperThreading mode. Will measure complete system." if "--per-socket" in rest: sys.exit("Hyper Threading more not compatible with --per-socket") + if "--per-core" in rest: + sys.exit("Hyper Threading more not compatible with --per-core") if args.cpu: print >>sys.stderr, "Warning: --cpu/-C mode with HyperThread must specify all core thread pairs!" if not (os.geteuid() == 0 or sysctl("kernel.perf_event_paranoid") == -1):