diff --git a/bdw_client_ratios.py b/bdw_client_ratios.py index d0a1b510..1f2fce16 100644 --- a/bdw_client_ratios.py +++ b/bdw_client_ratios.py @@ -8,6 +8,10 @@ # https://sites.google.com/site/analysismethods/yasin-pubs # +# Helpers + +print_error = lambda msg: False + smt_enabled = False # Constants @@ -166,22 +170,28 @@ class Frontend_Bound: domain = "Slots" area = "FE" desc = """ -This category reflects slots where the Frontend of the processor undersupplies -its Backend. Frontend denotes the first portion of pipeline responsible to -fetch micro-ops which the Backend can execute. Within the Frontend, a branch -predictor predicts the next address to fetch, cache-lines are fetched from -memory, parsed into instructions, and lastly decoded into micro-ops. The -purpose of the Frontend cluster is to deliver uops to Backend whenever the -latter can accept them. For example, stalls due to instruction-cache misses -would be categorized under Frontend Bound.""" +This category reflects slots where the Frontend of the +processor undersupplies its Backend. Frontend denotes the +first portion of pipeline responsible to fetch micro-ops +which the Backend can execute. Within the Frontend, a branch +predictor predicts the next address to fetch, cache-lines +are fetched from memory, parsed into instructions, and +lastly decoded into micro-ops. The purpose of the Frontend +cluster is to deliver uops to Backend whenever the latter +can accept them. For example, stalls due to instruction- +cache misses would be categorized under Frontend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Frontend_Bound zero division" + print_error("Frontend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -191,18 +201,24 @@ class Frontend_Latency: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend latency -issues. For example, instruction-cache misses, iTLB misses or fetch stalls -after a branch misprediction are categorized under Frontend Latency. In such -cases the Frontend eventually delivers no uops for some period.""" +This metric represents slots fraction CPU was stalled due to +Frontend latency issues. For example, instruction-cache +misses, iTLB misses or fetch stalls after a branch +misprediction are categorized under Frontend Latency. In +such cases the Frontend eventually delivers no uops for some +period.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Latency zero division" + print_error("Frontend_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -212,16 +228,21 @@ class ITLB_Misses: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to instruction TLB -misses. Using large code pages may be considered here.""" +This metric represents cycles fraction CPU was stalled due +to instruction TLB misses. 
Using large code pages may be +considered here.""" level = 3 htoff = False + sample = ['ITLB_MISSES.WALK_COMPLETED'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ITLB_Misses zero division" + print_error("ITLB_Misses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -231,16 +252,21 @@ class DSB_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches from -DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" +This metric represents cycles fraction CPU was stalled due +to switches from DSB to MITE pipelines. Optimizing for +better DSB hit rate may be considered.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DSB_Switches zero division" + print_error("DSB_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -250,17 +276,22 @@ class LCP: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to Length Changing -Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will -certainly avoid this.""" +This metric represents cycles fraction CPU was stalled due +to Length Changing Prefixes (LCPs). Using proper compiler +flags or Intel Compiler by default will certainly avoid +this.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "LCP zero division" + print_error("LCP zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -270,19 +301,25 @@ class MS_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches of uop -delivery to the Microcode Sequencer (MS). Commonly used instructions are -optimized for delivery by the DSB or MITE pipelines. The MS is designated to -deliver long uop flows required by CISC instructions like CPUID, or uncommon -conditions like Floating Point Assists when dealing with Denormals.""" +This metric represents cycles fraction CPU was stalled due +to switches of uop delivery to the Microcode Sequencer (MS). +Commonly used instructions are optimized for delivery by the +DSB or MITE pipelines. The MS is designated to deliver long +uop flows required by CISC instructions like CPUID, or +uncommon conditions like Floating Point Assists when dealing +with Denormals.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MS_Switches zero division" + print_error("MS_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -292,19 +329,24 @@ class Frontend_Bandwidth: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend -bandwidth issues. 
For example, inefficiencies at the instruction decoders, or -code restrictions for caching in the DSB (decoded uops cache) are categorized -under Frontend Bandwidth. In such cases, the Frontend typically delivers non- -optimal amount of uops to the Backend.""" +This metric represents slots fraction CPU was stalled due to +Frontend bandwidth issues. For example, inefficiencies at +the instruction decoders, or code restrictions for caching +in the DSB (decoded uops cache) are categorized under +Frontend Bandwidth. In such cases, the Frontend typically +delivers non-optimal amount of uops to the Backend.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Bandwidth zero division" + print_error("Frontend_Bandwidth zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -314,17 +356,22 @@ class MITE: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to the MITE fetch pipeline. For example, inefficiencies in the -instruction decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to the MITE fetch pipeline. For example, +inefficiencies in the instruction decoders are categorized +here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MITE zero division" + print_error("MITE zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -334,18 +381,23 @@ class DSB: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to DSB (decoded uop cache) fetch pipeline. For example, inefficient -utilization of the DSB cache structure or bank conflict when reading from it, -are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to DSB (decoded uop cache) fetch +pipeline. For example, inefficient utilization of the DSB +cache structure or bank conflict when reading from it, are +categorized here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: - #print "DSB zero division" + print_error("DSB zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -355,19 +407,24 @@ class LSD: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining -Uop supply. However, in some rare cases, optimal uop-delivery could not be -reached for small loops whose size (in terms of number of uops) does not suit -well the LSD structure.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to LSD (Loop Stream Detector) unit. LSD +typically does well sustaining Uop supply. 
However, in some +rare cases, optimal uop-delivery could not be reached for +small loops whose size (in terms of number of uops) does not +suit well the LSD structure.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "LSD zero division" + print_error("LSD zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -377,19 +434,24 @@ class Bad_Speculation: domain = "Slots" area = "BAD" desc = """ -This category reflects slots wasted due to incorrect speculations, which -include slots used to allocate uops that do not eventually get retired and -slots for which allocation was blocked due to recovery from earlier incorrect -speculation. For example, wasted work due to miss-predicted branches are -categorized under Bad Speculation category""" +This category reflects slots wasted due to incorrect +speculations, which include slots used to allocate uops that +do not eventually get retired and slots for which allocation +was blocked due to recovery from earlier incorrect +speculation. For example, wasted work due to miss-predicted +branches are categorized under Bad Speculation category""" level = 1 htoff = False + sample = ['INT_MISC.RECOVERY_CYCLES'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: - #print "Bad_Speculation zero division" + print_error("Bad_Speculation zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -399,18 +461,23 @@ class Branch_Mispredicts: domain = "Slots" area = "BAD" desc = """ -This metric represents slots fraction CPU was impacted by Branch -Misprediction. These slots are either wasted by uops fetched from an -incorrectly speculated program path, or stalls the Backend of the machine -needs to recover its state from a speculative path.""" +This metric represents slots fraction CPU was impacted by +Branch Misprediction. These slots are either wasted by uops +fetched from an incorrectly speculated program path, or +stalls the Backend of the machine needs to recover its state +from a speculative path.""" level = 2 htoff = False + sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Branch_Mispredicts zero division" + print_error("Branch_Mispredicts zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -420,19 +487,24 @@ class Machine_Clears: domain = "Slots" area = "BAD" desc = """ -This metric represents slots fraction CPU was impacted by Machine Clears. -These slots are either wasted by uops fetched prior to the clear, or stalls -the Backend of the machine needs to recover its state after the clear. For -example, this can happen due to memory ordering Nukes (e.g. Memory -Disambiguation) or Self-Modifying-Code (SMC) nukes.""" +This metric represents slots fraction CPU was impacted by +Machine Clears. 
These slots are either wasted by uops +fetched prior to the clear, or stalls the Backend of the +machine needs to recover its state after the clear. For +example, this can happen due to memory ordering Nukes (e.g. +Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.""" level = 2 htoff = False + sample = ['MACHINE_CLEARS.COUNT'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Machine_Clears zero division" + print_error("Machine_Clears zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -442,21 +514,27 @@ class Backend_Bound: domain = "Slots" area = "BE" desc = """ -This category reflects slots where no uops are being delivered due to a lack -of required resources for accepting more uops in the Backend of the pipeline. -Backend describes the portion of the pipeline where the out-of-order scheduler -dispatches ready uops into their respective execution units, and once -completed these uops get retired according to program order. For example, -stalls due to data-cache misses or stalls due to the divider unit being -overloaded are both categorized under Backend Bound.""" +This category reflects slots where no uops are being +delivered due to a lack of required resources for accepting +more uops in the Backend of the pipeline. Backend describes +the portion of the pipeline where the out-of-order scheduler +dispatches ready uops into their respective execution units, +and once completed these uops get retired according to +program order. For example, stalls due to data-cache misses +or stalls due to the divider unit being overloaded are both +categorized under Backend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Backend_Bound zero division" + print_error("Backend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -466,19 +544,25 @@ class Memory_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how much Memory subsystem was a bottleneck. Memory -Bound measures cycle fraction where pipeline is likely stalled due to demand -load or store instructions. This accounts mainly for non-completed in-flight -memory demand loads which coincides with execution starvation. in addition to -less common cases where stores could imply backpressure on the pipeline.""" +This metric represents how much Memory subsystem was a +bottleneck. Memory Bound measures cycle fraction where +pipeline is likely stalled due to demand load or store +instructions. This accounts mainly for non-completed in- +flight memory demand loads which coincides with execution +starvation. 
in addition to less common cases where stores +could imply backpressure on the pipeline.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Memory_Bound zero division" + print_error("Memory_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -488,20 +572,26 @@ class L1_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled without missing the L1 data -cache. The L1 cache typically has the shortest latency. However, in certain -cases like loads blocked on older stores, a load might suffer a high latency -even though it is being satisfied by the L1. There are no fill-buffers -allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event -as it accounts for any non-completed load.""" +This metric represents how often CPU was stalled without +missing the L1 data cache. The L1 cache typically has the +shortest latency. However, in certain cases like loads +blocked on older stores, a load might suffer a high latency +even though it is being satisfied by the L1. There are no +fill-buffers allocated for L1 hits so instead we use the +load matrix (LDM) stalls sub-event as it accounts for any +non-completed load.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_MISS", 3)) / CLKS(EV, 3 ) self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh except ZeroDivisionError: - #print "L1_Bound zero division" + print_error("L1_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -510,15 +600,21 @@ class DTLB_Load: name = "DTLB_Load" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Loads were waiting for page table walks. Consider making the +working set more compact or using large pages.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Load zero division" + print_error("DTLB_Load zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -527,15 +623,24 @@ class Store_Fwd_Blk: name = "Store_Fwd_Blk" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Stores were blocked on store-forwarding between depending +operations. This typically occurs when an output of a +computation is accessed with a different sized data type. 
+Review the rules for store forwarding in the optimization +guide.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Store_Fwd_Blk zero division" + print_error("Store_Fwd_Blk zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -544,15 +649,21 @@ class Split_Loads: name = "Split_Loads" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Loads were crossing 64 byte cache lines. Consider naturally +aligning data.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Load_Miss_Real_Latency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Split_Loads zero division" + print_error("Split_Loads zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -561,15 +672,22 @@ class G4K_Aliasing: name = "4K_Aliasing" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Memory accesses were aliased by nearby others with a 4K +offset. Reorganize the data to avoid this. See the +optimization manual for more details.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "G4K_Aliasing zero division" + print_error("G4K_Aliasing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -579,17 +697,21 @@ class L2_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on L2 cache. Avoiding cache -misses (i.e. L1 misses/L2 hits) will improve the latency and increase -performance.""" +This metric represents how often CPU was stalled on L2 +cache. Avoiding cache misses (i.e. L1 misses/L2 hits) will +improve the latency and increase performance.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_MISS", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_MISS", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.03) and self.parent.thresh except ZeroDivisionError: - #print "L2_Bound zero division" + print_error("L2_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -599,17 +721,22 @@ class L3_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on L3 cache or contended with -a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve -the latency and increase performance.""" +This metric represents how often CPU was stalled on L3 cache +or contended with a sibling Core. Avoiding cache misses +(i.e. 
L2 misses/L3 hits) will improve the latency and +increase performance.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_MISS", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3_Bound zero division" + print_error("L3_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -618,15 +745,21 @@ class Contested_Accesses: name = "Contested_Accesses" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +64 byte cache lines were bouncing between cores. Avoid false +sharing, unnecessary writes, and localize data.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Contested_Accesses zero division" + print_error("Contested_Accesses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -638,12 +771,16 @@ class Data_Sharing: desc = "" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Data_Sharing zero division" + print_error("Data_Sharing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -653,18 +790,23 @@ class L3_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric is a rough aggregate estimate of cycles fraction where CPU -accessed L3 cache for all load requests, while there was no contention/sharing -with a sibling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will -improve the latency and increase performance.""" +This metric is a rough aggregate estimate of cycles fraction +where CPU accessed L3 cache for all load requests, while +there was no contention/sharing with a sibling core. +Avoiding cache misses (i.e. L2 misses/L3 hits) will improve +the latency and increase performance.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3_Latency zero division" + print_error("L3_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -674,17 +816,22 @@ class SQ_Full: domain = "CoreClocks" area = "BE/Mem" desc = """ -This metric measures fraction of cycles where the Super Queue (SQ) was full -taking into account all request-types and both hardware SMT threads. The Super -Queue is used for requests to access the L2 cache or to go out to the Uncore.""" +This metric measures fraction of cycles where the Super +Queue (SQ) was full taking into account all request-types +and both hardware SMT threads. 
The Super Queue is used for +requests to access the L2 cache or to go out to the Uncore.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "SQ_Full zero division" + print_error("SQ_Full zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -694,16 +841,21 @@ class MEM_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on main memory (DRAM). -Caching will improve the latency and increase performance.""" +This metric represents how often CPU was stalled on main +memory (DRAM). Caching will improve the latency and +increase performance.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_MISS", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Bound zero division" + print_error("MEM_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -713,17 +865,21 @@ class MEM_Bandwidth: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to approaching -bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be -considered in such case.""" +This metric represents how often CPU was likely stalled due +to approaching bandwidth limits of main memory (DRAM). NUMA +in multi-socket system may be considered in such case.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Bandwidth zero division" + print_error("MEM_Bandwidth zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -733,17 +889,22 @@ class MEM_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to latency from -main memory (DRAM). Data layout re-structuring or using Software Prefetches -(also through the compiler) may be considered in such case.""" +This metric represents how often CPU was likely stalled due +to latency from main memory (DRAM). Data layout re- +structuring or using Software Prefetches (also through the +compiler) may be considered in such case.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Latency zero division" + print_error("MEM_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -753,18 +914,23 @@ class Stores_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled due to store operations. -even though memory store accesses do not typically stall out-of-order CPUs; -there are few cases where stores can lead to actual stalls. This metric will -be flagged should any of these cases be a bottleneck.""" +This metric represents how often CPU was stalled due to +store operations. 
even though memory store accesses do not +typically stall out-of-order CPUs; there are few cases where +stores can lead to actual stalls. This metric will be +flagged should any of these cases be a bottleneck.""" level = 3 htoff = False + sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Memory_Bound.compute(EV) -(EV("CYCLE_ACTIVITY.STALLS_MEM_ANY", 3) / CLKS(EV, 3)) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Stores_Bound zero division" + print_error("Stores_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -774,16 +940,21 @@ class Split_Stores: domain = "CoreClocks" area = "BE/Mem" desc = """ -This metric represents rate of split store accesses. Consider aligning your -data to the 64-byte cache line granularity.""" +This metric represents rate of split store accesses. +Consider aligning your data to the 64-byte cache line +granularity.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Split_Stores zero division" + print_error("Split_Stores zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -793,20 +964,25 @@ class DTLB_Store: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents cycles fraction spent handling first-level data TLB -store misses. As with ordinary data caching, focus on improving data locality -and reducing working-set size to reduce DTLB overhead. Additionally, consider -using profile-guided optimization (PGO) to collocate frequently-used data on -the same page. Try using larger page sizes for large amounts of frequently- -used data.""" +This metric represents cycles fraction spent handling first- +level data TLB store misses. As with ordinary data caching, +focus on improving data locality and reducing working-set +size to reduce DTLB overhead. Additionally, consider using +profile-guided optimization (PGO) to collocate frequently- +used data on the same page. Try using larger page sizes for +large amounts of frequently-used data.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Store zero division" + print_error("DTLB_Store zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -816,21 +992,27 @@ class Core_Bound: domain = "Clocks" area = "BE/Core" desc = """ -This metric represents how much Core non-memory issues were of a bottleneck. -Shortage in hardware compute resources, or dependencies software's -instructions are both categorized under Core Bound. Hence it may indicate the -machine ran out of an OOO resources, certain execution units are overloaded or -dependencies in program's data- or instruction-flow are limiting the -performance (e.g. FP-chained long-latency arithmetic operations). Tip: -consider Port Saturation analysis as next step.""" +This metric represents how much Core non-memory issues were +of a bottleneck. 
Shortage in hardware compute resources, or +dependencies software's instructions are both categorized +under Core Bound. Hence it may indicate the machine ran out +of an OOO resources, certain execution units are overloaded +or dependencies in program's data- or instruction-flow are +limiting the performance (e.g. FP-chained long-latency +arithmetic operations). Tip: consider Port Saturation +analysis as next step.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Core_Bound zero division" + print_error("Core_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -839,15 +1021,22 @@ class Divider: name = "Divider" domain = "CoreClocks" area = "BE/Core" - desc = "" + desc = """ +Time waiting for divisions by variables. Change the dividend +to be constant or use profile feedback to let the compiler +do that.""" level = 3 htoff = False + sample = ['ARITH.FPU_DIV_ACTIVE'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Divider zero division" + print_error("Divider zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -857,21 +1046,27 @@ class Ports_Utilization: domain = "Clocks" area = "BE/Core" desc = """ -This metric represents cycles fraction application was stalled due to Core -computation issues (non divider-related). For example, heavy data-dependency -between nearby instructions will manifest in this category. Ditto if -instruction-mix used by the application overloads specific hardware execution -unit. Hint: Loop Vectorization -most compilers feature auto-Vectorization -options today- reduces pressure on the execution ports as multiple elements -are calculated with same uop.""" +This metric represents cycles fraction application was +stalled due to Core computation issues (non divider- +related). For example, heavy data-dependency between nearby +instructions will manifest in this category. Ditto if +instruction-mix used by the application overloads specific +hardware execution unit. 
Hint: Loop Vectorization -most +compilers feature auto-Vectorization options today- reduces +pressure on the execution ports as multiple elements are +calculated with same uop.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Ports_Utilization zero division" + print_error("Ports_Utilization zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -881,16 +1076,20 @@ class G0_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed no uops on any -execution port.""" +This metric represents Core cycles fraction CPU executed no +uops on any execution port.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G0_Ports_Utilized zero division" + print_error("G0_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -900,22 +1099,29 @@ class G1_Port_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction where the CPU executed total of 1 -uop per cycle on all execution ports. This can be due to heavy data-dependency -among software instructions, or over oversubscribing a particular hardware -resource. In some other cases with high 1_Port_Utilized and L1_Bound, this -metric can point to L1 data-cache latency bottleneck that may not necessarily -manifest with complete execution starvation (due to the short L1 latency e.g. -walking a linked list) - looking at the assembly can be helpful. Tip: consider -'Core Ports Saturation' analysis-type as next step.""" +This metric represents Core cycles fraction where the CPU +executed total of 1 uop per cycle on all execution ports. +This can be due to heavy data-dependency among software +instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and +L1_Bound, this metric can point to L1 data-cache latency +bottleneck that may not necessarily manifest with complete +execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be +helpful. Tip: consider 'Core Ports Saturation' analysis-type +as next step.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G1_Port_Utilized zero division" + print_error("G1_Port_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -925,19 +1131,25 @@ class G2_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed total of 2 uops per -cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- -type as next step. Loop Vectorization -most compilers feature auto- -Vectorization options today- reduces pressure on the execution ports as -multiple elements are calculated with same uop.""" +This metric represents Core cycles fraction CPU executed +total of 2 uops per cycle on all execution ports. 
Tip: +consider 'Core Port Saturation' analysis-type as next step. +Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the +execution ports as multiple elements are calculated with +same uop.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G2_Ports_Utilized zero division" + print_error("G2_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -947,17 +1159,22 @@ class G3m_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed total of 3 or more -uops per cycle on all execution ports. Tip: consider 'Core Port Saturation' -analysis-type as next step""" +This metric represents Core cycles fraction CPU executed +total of 3 or more uops per cycle on all execution ports. +Tip: consider 'Core Port Saturation' analysis-type as next +step""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G3m_Ports_Utilized zero division" + print_error("G3m_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -967,16 +1184,21 @@ class Port_0: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 0 (SNB+: ALU; HSW+:ALU and 2nd branch)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 0 (SNB+: ALU; HSW+:ALU and 2nd +branch)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_0 zero division" + print_error("Port_0 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -986,16 +1208,20 @@ class Port_1: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 1 (ALU)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 1 (ALU)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_1 zero division" + print_error("Port_1 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1005,16 +1231,20 @@ class Port_2: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 2 (Loads and Store-address)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 2 (Loads and Store-address)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_2 zero division" + print_error("Port_2 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1024,16 
+1254,20 @@ class Port_3: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 3 (Loads and Store-address)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 3 (Loads and Store-address)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_3 zero division" + print_error("Port_3 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1043,16 +1277,20 @@ class Port_4: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 4 (Store-data)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 4 (Store-data)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_4 zero division" + print_error("Port_4 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1062,16 +1300,20 @@ class Port_5: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 5 (SNB+: Branches and ALU; HSW+: ALU)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 5 (SNB+: Branches and ALU; HSW+: ALU)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_5 zero division" + print_error("Port_5 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1081,25 +1323,32 @@ class Retiring: domain = "Slots" area = "RET" desc = """ -This category reflects slots utilized by useful work i.e. allocated uops that -eventually get retired. Ideally, all pipeline slots would be attributed to the -Retiring category. Retiring of 100% would indicate the maximum 4 uops retired -per cycle has been achieved. Maximizing Retiring typically increases the -Instruction-Per-Cycle metric. Note that a high Retiring value does not -necessary mean there is no room for more performance. For example, Microcode -assists are categorized under Retiring. They hurt performance and can often be -avoided. A high Retiring value for non-vectorized code may be a good hint for -programmer to consider vectorizing his code. Doing so essentially lets more -computations be done without significantly increasing number of instructions -thus improving the performance.""" +This category reflects slots utilized by useful work i.e. +allocated uops that eventually get retired. Ideally, all +pipeline slots would be attributed to the Retiring category. +Retiring of 100% would indicate the maximum 4 uops retired +per cycle has been achieved. Maximizing Retiring typically +increases the Instruction-Per-Cycle metric. Note that a high +Retiring value does not necessary mean there is no room for +more performance. For example, Microcode assists are +categorized under Retiring. They hurt performance and can +often be avoided. 
A high Retiring value for non-vectorized +code may be a good hint for programmer to consider +vectorizing his code. Doing so essentially lets more +computations be done without significantly increasing number +of instructions thus improving the performance.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: - #print "Retiring zero division" + print_error("Retiring zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1109,21 +1358,27 @@ class Base: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction where the CPU was retiring uops not -originated from the microcode-sequencer. This correlates with total number of -instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. While this is the most desirable of the top 4 categories, high -values may still indicate areas for improvement. If possible focus on -techniques that reduce instruction count or result in more efficient -instructions generation such as vectorization.""" +This metric represents slots fraction where the CPU was +retiring uops not originated from the microcode-sequencer. +This correlates with total number of instructions used by +the program. A uops-per-instruction ratio of 1 should be +expected. While this is the most desirable of the top 4 +categories, high values may still indicate areas for +improvement. If possible focus on techniques that reduce +instruction count or result in more efficient instructions +generation such as vectorization.""" level = 2 htoff = False + sample = ['INST_RETIRED.PREC_DIST'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: - #print "Base zero division" + print_error("Base zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1133,18 +1388,24 @@ class Microcode_Sequencer: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops fetched by the -Microcode Sequencer (MS) ROM. The MS is used for CISC instructions not fully -decoded by the default decoders (like repeat move strings), or by microcode -assists used to address some operation modes (like in Floating Point assists).""" +This metric represents slots fraction CPU was retiring uops +fetched by the Microcode Sequencer (MS) ROM. 
The MS is used +for CISC instructions not fully decoded by the default +decoders (like repeat move strings), or by microcode assists +used to address some operation modes (like in Floating Point +assists).""" level = 2 htoff = False + sample = ['IDQ.MS_UOPS'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: - #print "Microcode_Sequencer zero division" + print_error("Microcode_Sequencer zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1155,12 +1416,14 @@ class Metric_IPC: Instructions Per Cycle (per logical thread)""" domain = "Metric" maxval = 5 + errcount = 0 def compute(self, EV): try: self.val = IPC(EV, 0) except ZeroDivisionError: - print "IPC zero division" + print_error("IPC zero division") + self.errcount += 1 self.val = 0 class Metric_CPI: @@ -1169,12 +1432,14 @@ class Metric_CPI: Cycles Per Instruction (threaded)""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CPI(EV, 0) except ZeroDivisionError: - print "CPI zero division" + print_error("CPI zero division") + self.errcount += 1 self.val = 0 class Metric_CoreIPC: @@ -1182,13 +1447,15 @@ class Metric_CoreIPC: desc = """ Instructions Per Cycle (per physical core)""" domain = "CoreMetric" - maxval = 5 + maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = CoreIPC(EV, 0) except ZeroDivisionError: - print "CoreIPC zero division" + print_error("CoreIPC zero division") + self.errcount += 1 self.val = 0 class Metric_UPI: @@ -1197,12 +1464,14 @@ class Metric_UPI: Uops Per Instruction""" domain = "Metric" maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = UPI(EV, 0) except ZeroDivisionError: - print "UPI zero division" + print_error("UPI zero division") + self.errcount += 1 self.val = 0 class Metric_IPTB: @@ -1211,71 +1480,82 @@ class Metric_IPTB: Instruction per taken branch""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = IPTB(EV, 0) except ZeroDivisionError: - print "IPTB zero division" + print_error("IPTB zero division") + self.errcount += 1 self.val = 0 class Metric_BPTB: name = "BPTB" desc = """ -Branch instructions per taken branch. Can be used to approximate PGO- -likelihood for non-loopy codes.""" +Branch instructions per taken branch. 
Can be used to +approximate PGO-likelihood for non-loopy codes.""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = BPTB(EV, 0) except ZeroDivisionError: - print "BPTB zero division" + print_error("BPTB zero division") + self.errcount += 1 self.val = 0 class Metric_DSB_Coverage: name = "DSB_Coverage" desc = """ -Fraction of Uops delivered by the DSB (decoded instructions cache)""" +Fraction of Uops delivered by the DSB (decoded instructions +cache)""" domain = "Metric" maxval = 1 + errcount = 0 def compute(self, EV): try: self.val = DSB_Coverage(EV, 0) except ZeroDivisionError: - print "DSB_Coverage zero division" + print_error("DSB_Coverage zero division") + self.errcount += 1 self.val = 0 class Metric_ILP: name = "ILP" desc = """ -Instruction-Level-Parallelism (average number of uops executed when there is -at least 1 uop executed)""" +Instruction-Level-Parallelism (average number of uops +executed when there is at least 1 uop executed)""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = ILP(EV, 0) except ZeroDivisionError: - print "ILP zero division" + print_error("ILP zero division") + self.errcount += 1 self.val = 0 class Metric_MLP: name = "MLP" desc = """ -Memory-Level-Parallelism (average number of L1 miss demand load when there is -at least 1 such miss)""" +Memory-Level-Parallelism (average number of L1 miss demand +load when there is at least 1 such miss)""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = MLP(EV, 0) except ZeroDivisionError: - print "MLP zero division" + print_error("MLP zero division") + self.errcount += 1 self.val = 0 class Metric_Load_Miss_Real_Latency: @@ -1284,41 +1564,47 @@ class Metric_Load_Miss_Real_Latency: Actual Average Latency for L1 data-cache miss demand loads""" domain = "Metric" maxval = 1000 + errcount = 0 def compute(self, EV): try: self.val = Load_Miss_Real_Latency(EV, 0) except ZeroDivisionError: - print "Load_Miss_Real_Latency zero division" + print_error("Load_Miss_Real_Latency zero division") + self.errcount += 1 self.val = 0 class Metric_Turbo_Utilization: name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" - domain = "CoreMetric" + domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = Turbo_Utilization(EV, 0) except ZeroDivisionError: - print "Turbo_Utilization zero division" + print_error("Turbo_Utilization zero division") + self.errcount += 1 self.val = 0 class Metric_Page_Walks_Use: name = "Page_Walks_Use" desc = """ -Fraction of cycles where the core's Page Walker is busy serving -iTLB/Load/Store""" +Fraction of cycles where the core's Page Walker is busy +serving iTLB/Load/Store""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = Page_Walks_Use(EV, 0) except ZeroDivisionError: - print "Page_Walks_Use zero division" + print_error("Page_Walks_Use zero division") + self.errcount += 1 self.val = 0 class Metric_MUX: @@ -1327,12 +1613,14 @@ class Metric_MUX: PerfMon Event Multiplexing accuracy indicator""" domain = "Clocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = MUX(EV, 0) except ZeroDivisionError: - print "MUX zero division" + print_error("MUX zero division") + self.errcount += 1 self.val = 0 class Metric_CLKS: @@ -1341,12 +1629,14 @@ class Metric_CLKS: Per-thread actual clocks""" domain = "Count" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CLKS(EV, 0) except ZeroDivisionError: 
- print "CLKS zero division" + print_error("CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_CORE_CLKS: @@ -1355,12 +1645,14 @@ class Metric_CORE_CLKS: Core actual clocks""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CORE_CLKS(EV, 0) except ZeroDivisionError: - print "CORE_CLKS zero division" + print_error("CORE_CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_Time: @@ -1369,12 +1661,14 @@ class Metric_Time: Run duration time in seconds""" domain = "Count" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = Time(EV, 0) except ZeroDivisionError: - print "Time zero division" + print_error("Time zero division") + self.errcount += 1 self.val = 0 # Schedule @@ -1500,106 +1794,13 @@ def __init__(self, r): # siblings cross-tree - o["Frontend_Bound"].sibling = None - o["Frontend_Latency"].sibling = None - o["ITLB_Misses"].sibling = None - o["DSB_Switches"].sibling = None - o["LCP"].sibling = None o["MS_Switches"].sibling = o["Microcode_Sequencer"] - o["Frontend_Bandwidth"].sibling = None - o["MITE"].sibling = None - o["DSB"].sibling = None - o["LSD"].sibling = None - o["Bad_Speculation"].sibling = None - o["Branch_Mispredicts"].sibling = None - o["Machine_Clears"].sibling = None - o["Backend_Bound"].sibling = None - o["Memory_Bound"].sibling = None o["L1_Bound"].sibling = o["G1_Port_Utilized"] - o["DTLB_Load"].sibling = None - o["Store_Fwd_Blk"].sibling = None - o["Split_Loads"].sibling = None - o["G4K_Aliasing"].sibling = None - o["L2_Bound"].sibling = None - o["L3_Bound"].sibling = None - o["Contested_Accesses"].sibling = None - o["Data_Sharing"].sibling = None - o["L3_Latency"].sibling = None - o["SQ_Full"].sibling = None - o["MEM_Bound"].sibling = None - o["MEM_Bandwidth"].sibling = None - o["MEM_Latency"].sibling = None - o["Stores_Bound"].sibling = None o["Split_Stores"].sibling = o["Port_4"] - o["DTLB_Store"].sibling = None - o["Core_Bound"].sibling = None - o["Divider"].sibling = None - o["Ports_Utilization"].sibling = None - o["G0_Ports_Utilized"].sibling = None o["G1_Port_Utilized"].sibling = o["L1_Bound"] - o["G2_Ports_Utilized"].sibling = None - o["G3m_Ports_Utilized"].sibling = None - o["Port_0"].sibling = None - o["Port_1"].sibling = None - o["Port_2"].sibling = None - o["Port_3"].sibling = None o["Port_4"].sibling = o["Split_Stores"] - o["Port_5"].sibling = None - o["Retiring"].sibling = None - o["Base"].sibling = None o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events - - o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] - o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] - o["DSB_Switches"].sample = [] - o["LCP"].sample = [] - o["MS_Switches"].sample = [] - o["Frontend_Bandwidth"].sample = [] - o["MITE"].sample = [] - o["DSB"].sample = [] - o["LSD"].sample = [] - o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] - o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] - o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] - o["Backend_Bound"].sample = [] - o["Memory_Bound"].sample = [] - o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] - o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] - o["Store_Fwd_Blk"].sample = [] - o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] - o["G4K_Aliasing"].sample = [] - o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] - o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] - 
o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] - o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp'] - o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] - o["SQ_Full"].sample = [] - o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] - o["MEM_Bandwidth"].sample = [] - o["MEM_Latency"].sample = [] - o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] - o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] - o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] - o["Core_Bound"].sample = [] - o["Divider"].sample = ['ARITH.FPU_DIV_ACTIVE'] - o["Ports_Utilization"].sample = [] - o["G0_Ports_Utilized"].sample = [] - o["G1_Port_Utilized"].sample = [] - o["G2_Ports_Utilized"].sample = [] - o["G3m_Ports_Utilized"].sample = [] - o["Port_0"].sample = [] - o["Port_1"].sample = [] - o["Port_2"].sample = [] - o["Port_3"].sample = [] - o["Port_4"].sample = [] - o["Port_5"].sample = [] - o["Retiring"].sample = [] - o["Base"].sample = ['INST_RETIRED.PREC_DIST'] - o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] - # user visible metrics n = Metric_IPC() ; r.metric(n) diff --git a/hsw_client_ratios.py b/hsw_client_ratios.py index 5fcf0418..d9d17536 100644 --- a/hsw_client_ratios.py +++ b/hsw_client_ratios.py @@ -8,6 +8,10 @@ # https://sites.google.com/site/analysismethods/yasin-pubs # +# Helpers + +print_error = lambda msg: False + smt_enabled = False # Constants @@ -175,22 +179,28 @@ class Frontend_Bound: domain = "Slots" area = "FE" desc = """ -This category reflects slots where the Frontend of the processor undersupplies -its Backend. Frontend denotes the first portion of pipeline responsible to -fetch micro-ops which the Backend can execute. Within the Frontend, a branch -predictor predicts the next address to fetch, cache-lines are fetched from -memory, parsed into instructions, and lastly decoded into micro-ops. The -purpose of the Frontend cluster is to deliver uops to Backend whenever the -latter can accept them. For example, stalls due to instruction-cache misses -would be categorized under Frontend Bound.""" +This category reflects slots where the Frontend of the +processor undersupplies its Backend. Frontend denotes the +first portion of pipeline responsible to fetch micro-ops +which the Backend can execute. Within the Frontend, a branch +predictor predicts the next address to fetch, cache-lines +are fetched from memory, parsed into instructions, and +lastly decoded into micro-ops. The purpose of the Frontend +cluster is to deliver uops to Backend whenever the latter +can accept them. For example, stalls due to instruction- +cache misses would be categorized under Frontend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Frontend_Bound zero division" + print_error("Frontend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -200,18 +210,24 @@ class Frontend_Latency: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend latency -issues. For example, instruction-cache misses, iTLB misses or fetch stalls -after a branch misprediction are categorized under Frontend Latency. 
In such -cases the Frontend eventually delivers no uops for some period.""" +This metric represents slots fraction CPU was stalled due to +Frontend latency issues. For example, instruction-cache +misses, iTLB misses or fetch stalls after a branch +misprediction are categorized under Frontend Latency. In +such cases the Frontend eventually delivers no uops for some +period.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Latency zero division" + print_error("Frontend_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -221,16 +237,21 @@ class ITLB_Misses: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to instruction TLB -misses. Using large code pages may be considered here.""" +This metric represents cycles fraction CPU was stalled due +to instruction TLB misses. Using large code pages may be +considered here.""" level = 3 htoff = False + sample = ['ITLB_MISSES.WALK_COMPLETED'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ITLB_Misses zero division" + print_error("ITLB_Misses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -240,16 +261,21 @@ class DSB_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches from -DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" +This metric represents cycles fraction CPU was stalled due +to switches from DSB to MITE pipelines. Optimizing for +better DSB hit rate may be considered.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DSB_Switches zero division" + print_error("DSB_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -259,17 +285,22 @@ class LCP: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to Length Changing -Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will -certainly avoid this.""" +This metric represents cycles fraction CPU was stalled due +to Length Changing Prefixes (LCPs). Using proper compiler +flags or Intel Compiler by default will certainly avoid +this.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "LCP zero division" + print_error("LCP zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -279,19 +310,25 @@ class MS_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches of uop -delivery to the Microcode Sequencer (MS). Commonly used instructions are -optimized for delivery by the DSB or MITE pipelines. 
The MS is designated to -deliver long uop flows required by CISC instructions like CPUID, or uncommon -conditions like Floating Point Assists when dealing with Denormals.""" +This metric represents cycles fraction CPU was stalled due +to switches of uop delivery to the Microcode Sequencer (MS). +Commonly used instructions are optimized for delivery by the +DSB or MITE pipelines. The MS is designated to deliver long +uop flows required by CISC instructions like CPUID, or +uncommon conditions like Floating Point Assists when dealing +with Denormals.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MS_Switches zero division" + print_error("MS_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -301,19 +338,24 @@ class Frontend_Bandwidth: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend -bandwidth issues. For example, inefficiencies at the instruction decoders, or -code restrictions for caching in the DSB (decoded uops cache) are categorized -under Frontend Bandwidth. In such cases, the Frontend typically delivers non- -optimal amount of uops to the Backend.""" +This metric represents slots fraction CPU was stalled due to +Frontend bandwidth issues. For example, inefficiencies at +the instruction decoders, or code restrictions for caching +in the DSB (decoded uops cache) are categorized under +Frontend Bandwidth. In such cases, the Frontend typically +delivers non-optimal amount of uops to the Backend.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Bandwidth zero division" + print_error("Frontend_Bandwidth zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -323,17 +365,22 @@ class MITE: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to the MITE fetch pipeline. For example, inefficiencies in the -instruction decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to the MITE fetch pipeline. For example, +inefficiencies in the instruction decoders are categorized +here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MITE zero division" + print_error("MITE zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -343,18 +390,23 @@ class DSB: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to DSB (decoded uop cache) fetch pipeline. For example, inefficient -utilization of the DSB cache structure or bank conflict when reading from it, -are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to DSB (decoded uop cache) fetch +pipeline. 
For example, inefficient utilization of the DSB +cache structure or bank conflict when reading from it, are +categorized here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: - #print "DSB zero division" + print_error("DSB zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -364,19 +416,24 @@ class LSD: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining -Uop supply. However, in some rare cases, optimal uop-delivery could not be -reached for small loops whose size (in terms of number of uops) does not suit -well the LSD structure.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to LSD (Loop Stream Detector) unit. LSD +typically does well sustaining Uop supply. However, in some +rare cases, optimal uop-delivery could not be reached for +small loops whose size (in terms of number of uops) does not +suit well the LSD structure.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "LSD zero division" + print_error("LSD zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -386,19 +443,24 @@ class Bad_Speculation: domain = "Slots" area = "BAD" desc = """ -This category reflects slots wasted due to incorrect speculations, which -include slots used to allocate uops that do not eventually get retired and -slots for which allocation was blocked due to recovery from earlier incorrect -speculation. For example, wasted work due to miss-predicted branches are -categorized under Bad Speculation category""" +This category reflects slots wasted due to incorrect +speculations, which include slots used to allocate uops that +do not eventually get retired and slots for which allocation +was blocked due to recovery from earlier incorrect +speculation. For example, wasted work due to miss-predicted +branches are categorized under Bad Speculation category""" level = 1 htoff = False + sample = ['INT_MISC.RECOVERY_CYCLES'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: - #print "Bad_Speculation zero division" + print_error("Bad_Speculation zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -408,18 +470,23 @@ class Branch_Mispredicts: domain = "Slots" area = "BAD" desc = """ -This metric represents slots fraction CPU was impacted by Branch -Misprediction. These slots are either wasted by uops fetched from an -incorrectly speculated program path, or stalls the Backend of the machine -needs to recover its state from a speculative path.""" +This metric represents slots fraction CPU was impacted by +Branch Misprediction. 
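(The Bad_Speculation and Branch_Mispredicts hunks above, together with Machine_Clears just below, share one piece of Slots-domain arithmetic: wasted slots are issued-but-never-retired uops plus Pipeline_Width recovery cycles, the mispredict share is split off by Mispred_Clears_Fraction, and Machine_Clears keeps the remainder. A small numeric sketch with made-up counts and stand-ins for the helpers; Slots = width * core clocks is an assumption consistent with the "maximum 4 uops retired per cycle" wording in the Retiring hunk further down.)

pipeline_width = 4                   # per the Retiring description
core_clks = 500000.0                 # stand-in for CORE_CLKS(EV, 1)
slots = pipeline_width * core_clks   # assumption: Slots = width * core clocks

uops_issued = 1200000.0              # UOPS_ISSUED.ANY
uops_retired = 1000000.0             # UOPS_RETIRED.RETIRE_SLOTS
recovery_cycles = 20000.0            # stand-in for Recovery_Cycles(EV, 1)

bad_speculation = (uops_issued - uops_retired
                   + pipeline_width * recovery_cycles) / slots   # 0.14

mispred_fraction = 0.9               # stand-in for Mispred_Clears_Fraction(EV, 2)
branch_mispredicts = mispred_fraction * bad_speculation          # 0.126
machine_clears = bad_speculation - branch_mispredicts            # 0.014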
These slots are either wasted by uops +fetched from an incorrectly speculated program path, or +stalls the Backend of the machine needs to recover its state +from a speculative path.""" level = 2 htoff = False + sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Branch_Mispredicts zero division" + print_error("Branch_Mispredicts zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -429,19 +496,24 @@ class Machine_Clears: domain = "Slots" area = "BAD" desc = """ -This metric represents slots fraction CPU was impacted by Machine Clears. -These slots are either wasted by uops fetched prior to the clear, or stalls -the Backend of the machine needs to recover its state after the clear. For -example, this can happen due to memory ordering Nukes (e.g. Memory -Disambiguation) or Self-Modifying-Code (SMC) nukes.""" +This metric represents slots fraction CPU was impacted by +Machine Clears. These slots are either wasted by uops +fetched prior to the clear, or stalls the Backend of the +machine needs to recover its state after the clear. For +example, this can happen due to memory ordering Nukes (e.g. +Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.""" level = 2 htoff = False + sample = ['MACHINE_CLEARS.COUNT'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Machine_Clears zero division" + print_error("Machine_Clears zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -451,21 +523,27 @@ class Backend_Bound: domain = "Slots" area = "BE" desc = """ -This category reflects slots where no uops are being delivered due to a lack -of required resources for accepting more uops in the Backend of the pipeline. -Backend describes the portion of the pipeline where the out-of-order scheduler -dispatches ready uops into their respective execution units, and once -completed these uops get retired according to program order. For example, -stalls due to data-cache misses or stalls due to the divider unit being -overloaded are both categorized under Backend Bound.""" +This category reflects slots where no uops are being +delivered due to a lack of required resources for accepting +more uops in the Backend of the pipeline. Backend describes +the portion of the pipeline where the out-of-order scheduler +dispatches ready uops into their respective execution units, +and once completed these uops get retired according to +program order. For example, stalls due to data-cache misses +or stalls due to the divider unit being overloaded are both +categorized under Backend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Backend_Bound zero division" + print_error("Backend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -475,19 +553,25 @@ class Memory_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how much Memory subsystem was a bottleneck. 
Memory -Bound measures cycle fraction where pipeline is likely stalled due to demand -load or store instructions. This accounts mainly for non-completed in-flight -memory demand loads which coincides with execution starvation. in addition to -less common cases where stores could imply backpressure on the pipeline.""" +This metric represents how much Memory subsystem was a +bottleneck. Memory Bound measures cycle fraction where +pipeline is likely stalled due to demand load or store +instructions. This accounts mainly for non-completed in- +flight memory demand loads which coincides with execution +starvation. in addition to less common cases where stores +could imply backpressure on the pipeline.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (STALLS_MEM_ANY(EV, 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Memory_Bound zero division" + print_error("Memory_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -497,20 +581,26 @@ class L1_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled without missing the L1 data -cache. The L1 cache typically has the shortest latency. However, in certain -cases like loads blocked on older stores, a load might suffer a high latency -even though it is being satisfied by the L1. There are no fill-buffers -allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event -as it accounts for any non-completed load.""" +This metric represents how often CPU was stalled without +missing the L1 data cache. The L1 cache typically has the +shortest latency. However, in certain cases like loads +blocked on older stores, a load might suffer a high latency +even though it is being satisfied by the L1. There are no +fill-buffers allocated for L1 hits so instead we use the +load matrix (LDM) stalls sub-event as it accounts for any +non-completed load.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (STALLS_MEM_ANY(EV, 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh except ZeroDivisionError: - #print "L1_Bound zero division" + print_error("L1_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -519,15 +609,21 @@ class DTLB_Load: name = "DTLB_Load" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Loads were waiting for page table walks. Consider making the +working set more compact or using large pages.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Load zero division" + print_error("DTLB_Load zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -536,15 +632,24 @@ class Store_Fwd_Blk: name = "Store_Fwd_Blk" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Stores were blocked on store-forwarding between depending +operations. 
This typically occurs when an output of a +computation is accessed with a different sized data type. +Review the rules for store forwarding in the optimization +guide.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Store_Fwd_Blk zero division" + print_error("Store_Fwd_Blk zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -553,15 +658,21 @@ class Split_Loads: name = "Split_Loads" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Loads were crossing 64 byte cache lines. Consider naturally +aligning data.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Load_Miss_Real_Latency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Split_Loads zero division" + print_error("Split_Loads zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -570,15 +681,22 @@ class G4K_Aliasing: name = "4K_Aliasing" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Memory accesses were aliased by nearby others with a 4K +offset. Reorganize the data to avoid this. See the +optimization manual for more details.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "G4K_Aliasing zero division" + print_error("G4K_Aliasing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -588,17 +706,21 @@ class L2_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on L2 cache. Avoiding cache -misses (i.e. L1 misses/L2 hits) will improve the latency and increase -performance.""" +This metric represents how often CPU was stalled on L2 +cache. Avoiding cache misses (i.e. L1 misses/L2 hits) will +improve the latency and increase performance.""" level = 3 htoff = True + sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.03) and self.parent.thresh except ZeroDivisionError: - #print "L2_Bound zero division" + print_error("L2_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -608,17 +730,22 @@ class L3_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on L3 cache or contended with -a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve -the latency and increase performance.""" +This metric represents how often CPU was stalled on L3 cache +or contended with a sibling Core. Avoiding cache misses +(i.e. 
L2 misses/L3 hits) will improve the latency and +increase performance.""" level = 3 htoff = True + sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3_Bound zero division" + print_error("L3_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -627,15 +754,21 @@ class Contested_Accesses: name = "Contested_Accesses" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +64 byte cache lines were bouncing between cores. Avoid false +sharing, unnecessary writes, and localize data.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Contested_Accesses zero division" + print_error("Contested_Accesses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -647,12 +780,16 @@ class Data_Sharing: desc = "" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Data_Sharing zero division" + print_error("Data_Sharing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -662,18 +799,23 @@ class L3_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric is a rough aggregate estimate of cycles fraction where CPU -accessed L3 cache for all load requests, while there was no contention/sharing -with a sibling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will -improve the latency and increase performance.""" +This metric is a rough aggregate estimate of cycles fraction +where CPU accessed L3 cache for all load requests, while +there was no contention/sharing with a sibling core. +Avoiding cache misses (i.e. L2 misses/L3 hits) will improve +the latency and increase performance.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3_Latency zero division" + print_error("L3_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -683,17 +825,22 @@ class SQ_Full: domain = "CoreClocks" area = "BE/Mem" desc = """ -This metric measures fraction of cycles where the Super Queue (SQ) was full -taking into account all request-types and both hardware SMT threads. The Super -Queue is used for requests to access the L2 cache or to go out to the Uncore.""" +This metric measures fraction of cycles where the Super +Queue (SQ) was full taking into account all request-types +and both hardware SMT threads. 
The Super Queue is used for +requests to access the L2 cache or to go out to the Uncore.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "SQ_Full zero division" + print_error("SQ_Full zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -703,16 +850,21 @@ class MEM_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on main memory (DRAM). -Caching will improve the latency and increase performance.""" +This metric represents how often CPU was stalled on main +memory (DRAM). Caching will improve the latency and +increase performance.""" level = 3 htoff = True + sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Bound zero division" + print_error("MEM_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -722,17 +874,21 @@ class MEM_Bandwidth: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to approaching -bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be -considered in such case.""" +This metric represents how often CPU was likely stalled due +to approaching bandwidth limits of main memory (DRAM). NUMA +in multi-socket system may be considered in such case.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Bandwidth zero division" + print_error("MEM_Bandwidth zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -742,17 +898,22 @@ class MEM_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to latency from -main memory (DRAM). Data layout re-structuring or using Software Prefetches -(also through the compiler) may be considered in such case.""" +This metric represents how often CPU was likely stalled due +to latency from main memory (DRAM). Data layout re- +structuring or using Software Prefetches (also through the +compiler) may be considered in such case.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Latency zero division" + print_error("MEM_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -762,18 +923,23 @@ class Stores_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled due to store operations. -even though memory store accesses do not typically stall out-of-order CPUs; -there are few cases where stores can lead to actual stalls. This metric will -be flagged should any of these cases be a bottleneck.""" +This metric represents how often CPU was stalled due to +store operations. 
even though memory store accesses do not +typically stall out-of-order CPUs; there are few cases where +stores can lead to actual stalls. This metric will be +flagged should any of these cases be a bottleneck.""" level = 3 htoff = False + sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Memory_Bound.compute(EV) - STALLS_MEM_ANY(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Stores_Bound zero division" + print_error("Stores_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -783,18 +949,23 @@ class False_Sharing: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled due to False Sharing. False -Sharing is a multithreading hiccup, where multiple threads contend on -different data-elements mapped into the same cache line. It can be easily -avoided by padding to make threads access different lines.""" +This metric represents how often CPU was stalled due to +False Sharing. False Sharing is a multithreading hiccup, +where multiple threads contend on different data-elements +mapped into the same cache line. It can be easily avoided by +padding to make threads access different lines.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_False_Sharing_Client(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "False_Sharing zero division" + print_error("False_Sharing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -804,16 +975,21 @@ class Split_Stores: domain = "CoreClocks" area = "BE/Mem" desc = """ -This metric represents rate of split store accesses. Consider aligning your -data to the 64-byte cache line granularity.""" +This metric represents rate of split store accesses. +Consider aligning your data to the 64-byte cache line +granularity.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Split_Stores zero division" + print_error("Split_Stores zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -823,20 +999,25 @@ class DTLB_Store: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents cycles fraction spent handling first-level data TLB -store misses. As with ordinary data caching, focus on improving data locality -and reducing working-set size to reduce DTLB overhead. Additionally, consider -using profile-guided optimization (PGO) to collocate frequently-used data on -the same page. Try using larger page sizes for large amounts of frequently- -used data.""" +This metric represents cycles fraction spent handling first- +level data TLB store misses. As with ordinary data caching, +focus on improving data locality and reducing working-set +size to reduce DTLB overhead. Additionally, consider using +profile-guided optimization (PGO) to collocate frequently- +used data on the same page. 
Try using larger page sizes for +large amounts of frequently-used data.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Store zero division" + print_error("DTLB_Store zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -846,21 +1027,27 @@ class Core_Bound: domain = "Clocks" area = "BE/Core" desc = """ -This metric represents how much Core non-memory issues were of a bottleneck. -Shortage in hardware compute resources, or dependencies software's -instructions are both categorized under Core Bound. Hence it may indicate the -machine ran out of an OOO resources, certain execution units are overloaded or -dependencies in program's data- or instruction-flow are limiting the -performance (e.g. FP-chained long-latency arithmetic operations). Tip: -consider Port Saturation analysis as next step.""" +This metric represents how much Core non-memory issues were +of a bottleneck. Shortage in hardware compute resources, or +dependencies software's instructions are both categorized +under Core Bound. Hence it may indicate the machine ran out +of an OOO resources, certain execution units are overloaded +or dependencies in program's data- or instruction-flow are +limiting the performance (e.g. FP-chained long-latency +arithmetic operations). Tip: consider Port Saturation +analysis as next step.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Core_Bound zero division" + print_error("Core_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -869,15 +1056,22 @@ class Divider: name = "Divider" domain = "CoreClocks" area = "BE/Core" - desc = "" + desc = """ +Time waiting for divisions by variables. Change the dividend +to be constant or use profile feedback to let the compiler +do that.""" level = 3 htoff = False + sample = ['ARITH.DIVIDER_UOPS'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 10 * EV("ARITH.DIVIDER_UOPS", 3) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Divider zero division" + print_error("Divider zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -887,21 +1081,27 @@ class Ports_Utilization: domain = "Clocks" area = "BE/Core" desc = """ -This metric represents cycles fraction application was stalled due to Core -computation issues (non divider-related). For example, heavy data-dependency -between nearby instructions will manifest in this category. Ditto if -instruction-mix used by the application overloads specific hardware execution -unit. Hint: Loop Vectorization -most compilers feature auto-Vectorization -options today- reduces pressure on the execution ports as multiple elements -are calculated with same uop.""" +This metric represents cycles fraction application was +stalled due to Core computation issues (non divider- +related). For example, heavy data-dependency between nearby +instructions will manifest in this category. 
Ditto if +instruction-mix used by the application overloads specific +hardware execution unit. Hint: Loop Vectorization -most +compilers feature auto-Vectorization options today- reduces +pressure on the execution ports as multiple elements are +calculated with same uop.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Ports_Utilization zero division" + print_error("Ports_Utilization zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -911,16 +1111,20 @@ class G0_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed no uops on any -execution port.""" +This metric represents Core cycles fraction CPU executed no +uops on any execution port.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G0_Ports_Utilized zero division" + print_error("G0_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -930,22 +1134,29 @@ class G1_Port_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction where the CPU executed total of 1 -uop per cycle on all execution ports. This can be due to heavy data-dependency -among software instructions, or over oversubscribing a particular hardware -resource. In some other cases with high 1_Port_Utilized and L1_Bound, this -metric can point to L1 data-cache latency bottleneck that may not necessarily -manifest with complete execution starvation (due to the short L1 latency e.g. -walking a linked list) - looking at the assembly can be helpful. Tip: consider -'Core Ports Saturation' analysis-type as next step.""" +This metric represents Core cycles fraction where the CPU +executed total of 1 uop per cycle on all execution ports. +This can be due to heavy data-dependency among software +instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and +L1_Bound, this metric can point to L1 data-cache latency +bottleneck that may not necessarily manifest with complete +execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be +helpful. Tip: consider 'Core Ports Saturation' analysis-type +as next step.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G1_Port_Utilized zero division" + print_error("G1_Port_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -955,19 +1166,25 @@ class G2_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed total of 2 uops per -cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- -type as next step. 
Loop Vectorization -most compilers feature auto- -Vectorization options today- reduces pressure on the execution ports as -multiple elements are calculated with same uop.""" +This metric represents Core cycles fraction CPU executed +total of 2 uops per cycle on all execution ports. Tip: +consider 'Core Port Saturation' analysis-type as next step. +Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the +execution ports as multiple elements are calculated with +same uop.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G2_Ports_Utilized zero division" + print_error("G2_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -977,17 +1194,22 @@ class G3m_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed total of 3 or more -uops per cycle on all execution ports. Tip: consider 'Core Port Saturation' -analysis-type as next step""" +This metric represents Core cycles fraction CPU executed +total of 3 or more uops per cycle on all execution ports. +Tip: consider 'Core Port Saturation' analysis-type as next +step""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G3m_Ports_Utilized zero division" + print_error("G3m_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -997,25 +1219,32 @@ class Retiring: domain = "Slots" area = "RET" desc = """ -This category reflects slots utilized by useful work i.e. allocated uops that -eventually get retired. Ideally, all pipeline slots would be attributed to the -Retiring category. Retiring of 100% would indicate the maximum 4 uops retired -per cycle has been achieved. Maximizing Retiring typically increases the -Instruction-Per-Cycle metric. Note that a high Retiring value does not -necessary mean there is no room for more performance. For example, Microcode -assists are categorized under Retiring. They hurt performance and can often be -avoided. A high Retiring value for non-vectorized code may be a good hint for -programmer to consider vectorizing his code. Doing so essentially lets more -computations be done without significantly increasing number of instructions -thus improving the performance.""" +This category reflects slots utilized by useful work i.e. +allocated uops that eventually get retired. Ideally, all +pipeline slots would be attributed to the Retiring category. +Retiring of 100% would indicate the maximum 4 uops retired +per cycle has been achieved. Maximizing Retiring typically +increases the Instruction-Per-Cycle metric. Note that a high +Retiring value does not necessary mean there is no room for +more performance. For example, Microcode assists are +categorized under Retiring. They hurt performance and can +often be avoided. A high Retiring value for non-vectorized +code may be a good hint for programmer to consider +vectorizing his code. 
Doing so essentially lets more +computations be done without significantly increasing number +of instructions thus improving the performance.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: - #print "Retiring zero division" + print_error("Retiring zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1025,21 +1254,27 @@ class Base: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction where the CPU was retiring uops not -originated from the microcode-sequencer. This correlates with total number of -instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. While this is the most desirable of the top 4 categories, high -values may still indicate areas for improvement. If possible focus on -techniques that reduce instruction count or result in more efficient -instructions generation such as vectorization.""" +This metric represents slots fraction where the CPU was +retiring uops not originated from the microcode-sequencer. +This correlates with total number of instructions used by +the program. A uops-per-instruction ratio of 1 should be +expected. While this is the most desirable of the top 4 +categories, high values may still indicate areas for +improvement. If possible focus on techniques that reduce +instruction count or result in more efficient instructions +generation such as vectorization.""" level = 2 htoff = False + sample = ['INST_RETIRED.PREC_DIST'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: - #print "Base zero division" + print_error("Base zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1049,18 +1284,24 @@ class Microcode_Sequencer: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops fetched by the -Microcode Sequencer (MS) ROM. The MS is used for CISC instructions not fully -decoded by the default decoders (like repeat move strings), or by microcode -assists used to address some operation modes (like in Floating Point assists).""" +This metric represents slots fraction CPU was retiring uops +fetched by the Microcode Sequencer (MS) ROM. 
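(A detail visible in the Retiring, Base and Microcode_Sequencer hunks, and in every level-2+ node above: a child's thresh only fires when its own value passes the cutoff and its parent's thresh already fired, so flags propagate down the tree, while Retiring additionally ORs in the Microcode_Sequencer flag. A toy illustration of that gating with made-up fractions; cutoffs are the ones in the hunks.)

retiring_val = 0.55               # UOPS_RETIRED.RETIRE_SLOTS / SLOTS (made up)
microcode_sequencer_val = 0.08    # Retire_Uop_Fraction * IDQ.MS_UOPS / SLOTS (made up)

microcode_sequencer_thresh = microcode_sequencer_val > 0.05          # True
retiring_thresh = (retiring_val > 0.7) | microcode_sequencer_thresh  # flagged via the MS branch
base_val = retiring_val - microcode_sequencer_val                    # 0.47
base_thresh = (base_val > 0.6) and retiring_thresh                   # False: own cutoff not met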
The MS is used +for CISC instructions not fully decoded by the default +decoders (like repeat move strings), or by microcode assists +used to address some operation modes (like in Floating Point +assists).""" level = 2 htoff = False + sample = ['IDQ.MS_UOPS'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: - #print "Microcode_Sequencer zero division" + print_error("Microcode_Sequencer zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1071,12 +1312,14 @@ class Metric_IPC: Instructions Per Cycle (per logical thread)""" domain = "Metric" maxval = 5 + errcount = 0 def compute(self, EV): try: self.val = IPC(EV, 0) except ZeroDivisionError: - print "IPC zero division" + print_error("IPC zero division") + self.errcount += 1 self.val = 0 class Metric_CPI: @@ -1085,12 +1328,14 @@ class Metric_CPI: Cycles Per Instruction (threaded)""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CPI(EV, 0) except ZeroDivisionError: - print "CPI zero division" + print_error("CPI zero division") + self.errcount += 1 self.val = 0 class Metric_CoreIPC: @@ -1098,13 +1343,15 @@ class Metric_CoreIPC: desc = """ Instructions Per Cycle (per physical core)""" domain = "CoreMetric" - maxval = 5 + maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = CoreIPC(EV, 0) except ZeroDivisionError: - print "CoreIPC zero division" + print_error("CoreIPC zero division") + self.errcount += 1 self.val = 0 class Metric_UPI: @@ -1113,12 +1360,14 @@ class Metric_UPI: Uops Per Instruction""" domain = "Metric" maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = UPI(EV, 0) except ZeroDivisionError: - print "UPI zero division" + print_error("UPI zero division") + self.errcount += 1 self.val = 0 class Metric_IPTB: @@ -1127,71 +1376,82 @@ class Metric_IPTB: Instruction per taken branch""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = IPTB(EV, 0) except ZeroDivisionError: - print "IPTB zero division" + print_error("IPTB zero division") + self.errcount += 1 self.val = 0 class Metric_BPTB: name = "BPTB" desc = """ -Branch instructions per taken branch. Can be used to approximate PGO- -likelihood for non-loopy codes.""" +Branch instructions per taken branch. 
Can be used to +approximate PGO-likelihood for non-loopy codes.""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = BPTB(EV, 0) except ZeroDivisionError: - print "BPTB zero division" + print_error("BPTB zero division") + self.errcount += 1 self.val = 0 class Metric_DSB_Coverage: name = "DSB_Coverage" desc = """ -Fraction of Uops delivered by the DSB (decoded instructions cache)""" +Fraction of Uops delivered by the DSB (decoded instructions +cache)""" domain = "Metric" maxval = 1 + errcount = 0 def compute(self, EV): try: self.val = DSB_Coverage(EV, 0) except ZeroDivisionError: - print "DSB_Coverage zero division" + print_error("DSB_Coverage zero division") + self.errcount += 1 self.val = 0 class Metric_ILP: name = "ILP" desc = """ -Instruction-Level-Parallelism (average number of uops executed when there is -at least 1 uop executed)""" +Instruction-Level-Parallelism (average number of uops +executed when there is at least 1 uop executed)""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = ILP(EV, 0) except ZeroDivisionError: - print "ILP zero division" + print_error("ILP zero division") + self.errcount += 1 self.val = 0 class Metric_MLP: name = "MLP" desc = """ -Memory-Level-Parallelism (average number of L1 miss demand load when there is -at least 1 such miss)""" +Memory-Level-Parallelism (average number of L1 miss demand +load when there is at least 1 such miss)""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = MLP(EV, 0) except ZeroDivisionError: - print "MLP zero division" + print_error("MLP zero division") + self.errcount += 1 self.val = 0 class Metric_Load_Miss_Real_Latency: @@ -1200,41 +1460,47 @@ class Metric_Load_Miss_Real_Latency: Actual Average Latency for L1 data-cache miss demand loads""" domain = "Metric" maxval = 1000 + errcount = 0 def compute(self, EV): try: self.val = Load_Miss_Real_Latency(EV, 0) except ZeroDivisionError: - print "Load_Miss_Real_Latency zero division" + print_error("Load_Miss_Real_Latency zero division") + self.errcount += 1 self.val = 0 class Metric_Turbo_Utilization: name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" - domain = "CoreMetric" + domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = Turbo_Utilization(EV, 0) except ZeroDivisionError: - print "Turbo_Utilization zero division" + print_error("Turbo_Utilization zero division") + self.errcount += 1 self.val = 0 class Metric_Page_Walks_Use: name = "Page_Walks_Use" desc = """ -Fraction of cycles where the core's Page Walker is busy serving -iTLB/Load/Store""" +Fraction of cycles where the core's Page Walker is busy +serving iTLB/Load/Store""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = Page_Walks_Use(EV, 0) except ZeroDivisionError: - print "Page_Walks_Use zero division" + print_error("Page_Walks_Use zero division") + self.errcount += 1 self.val = 0 class Metric_MUX: @@ -1243,12 +1509,14 @@ class Metric_MUX: PerfMon Event Multiplexing accuracy indicator""" domain = "Clocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = MUX(EV, 0) except ZeroDivisionError: - print "MUX zero division" + print_error("MUX zero division") + self.errcount += 1 self.val = 0 class Metric_CLKS: @@ -1257,12 +1525,14 @@ class Metric_CLKS: Per-thread actual clocks""" domain = "Count" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CLKS(EV, 0) except ZeroDivisionError: 
- print "CLKS zero division" + print_error("CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_CORE_CLKS: @@ -1271,12 +1541,14 @@ class Metric_CORE_CLKS: Core actual clocks""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CORE_CLKS(EV, 0) except ZeroDivisionError: - print "CORE_CLKS zero division" + print_error("CORE_CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_Time: @@ -1285,12 +1557,14 @@ class Metric_Time: Run duration time in seconds""" domain = "Count" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = Time(EV, 0) except ZeroDivisionError: - print "Time zero division" + print_error("Time zero division") + self.errcount += 1 self.val = 0 # Schedule @@ -1406,96 +1680,11 @@ def __init__(self, r): # siblings cross-tree - o["Frontend_Bound"].sibling = None - o["Frontend_Latency"].sibling = None - o["ITLB_Misses"].sibling = None - o["DSB_Switches"].sibling = None - o["LCP"].sibling = None o["MS_Switches"].sibling = o["Microcode_Sequencer"] - o["Frontend_Bandwidth"].sibling = None - o["MITE"].sibling = None - o["DSB"].sibling = None - o["LSD"].sibling = None - o["Bad_Speculation"].sibling = None - o["Branch_Mispredicts"].sibling = None - o["Machine_Clears"].sibling = None - o["Backend_Bound"].sibling = None - o["Memory_Bound"].sibling = None o["L1_Bound"].sibling = o["G1_Port_Utilized"] - o["DTLB_Load"].sibling = None - o["Store_Fwd_Blk"].sibling = None - o["Split_Loads"].sibling = None - o["G4K_Aliasing"].sibling = None - o["L2_Bound"].sibling = None - o["L3_Bound"].sibling = None - o["Contested_Accesses"].sibling = None - o["Data_Sharing"].sibling = None - o["L3_Latency"].sibling = None - o["SQ_Full"].sibling = None - o["MEM_Bound"].sibling = None - o["MEM_Bandwidth"].sibling = None - o["MEM_Latency"].sibling = None - o["Stores_Bound"].sibling = None - o["False_Sharing"].sibling = None - o["Split_Stores"].sibling = None - o["DTLB_Store"].sibling = None - o["Core_Bound"].sibling = None - o["Divider"].sibling = None - o["Ports_Utilization"].sibling = None - o["G0_Ports_Utilized"].sibling = None o["G1_Port_Utilized"].sibling = o["L1_Bound"] - o["G2_Ports_Utilized"].sibling = None - o["G3m_Ports_Utilized"].sibling = None - o["Retiring"].sibling = None - o["Base"].sibling = None o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events - - o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] - o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] - o["DSB_Switches"].sample = [] - o["LCP"].sample = [] - o["MS_Switches"].sample = [] - o["Frontend_Bandwidth"].sample = [] - o["MITE"].sample = [] - o["DSB"].sample = [] - o["LSD"].sample = [] - o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] - o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] - o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] - o["Backend_Bound"].sample = [] - o["Memory_Bound"].sample = [] - o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] - o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] - o["Store_Fwd_Blk"].sample = [] - o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] - o["G4K_Aliasing"].sample = [] - o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] - o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] - o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] - o["Data_Sharing"].sample = 
['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp'] - o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] - o["SQ_Full"].sample = [] - o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] - o["MEM_Bandwidth"].sample = [] - o["MEM_Latency"].sample = [] - o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] - o["False_Sharing"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE'] - o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] - o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] - o["Core_Bound"].sample = [] - o["Divider"].sample = ['ARITH.DIVIDER_UOPS'] - o["Ports_Utilization"].sample = [] - o["G0_Ports_Utilized"].sample = [] - o["G1_Port_Utilized"].sample = [] - o["G2_Ports_Utilized"].sample = [] - o["G3m_Ports_Utilized"].sample = [] - o["Retiring"].sample = [] - o["Base"].sample = ['INST_RETIRED.PREC_DIST'] - o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] - # user visible metrics n = Metric_IPC() ; r.metric(n) diff --git a/hsx_server_ratios.py b/hsx_server_ratios.py index a5cb5528..a202ea84 100644 --- a/hsx_server_ratios.py +++ b/hsx_server_ratios.py @@ -8,6 +8,10 @@ # https://sites.google.com/site/analysismethods/yasin-pubs # +# Helpers + +print_error = lambda msg: False + smt_enabled = False # Constants @@ -172,22 +176,28 @@ class Frontend_Bound: domain = "Slots" area = "FE" desc = """ -This category reflects slots where the Frontend of the processor undersupplies -its Backend. Frontend denotes the first portion of pipeline responsible to -fetch micro-ops which the Backend can execute. Within the Frontend, a branch -predictor predicts the next address to fetch, cache-lines are fetched from -memory, parsed into instructions, and lastly decoded into micro-ops. The -purpose of the Frontend cluster is to deliver uops to Backend whenever the -latter can accept them. For example, stalls due to instruction-cache misses -would be categorized under Frontend Bound.""" +This category reflects slots where the Frontend of the +processor undersupplies its Backend. Frontend denotes the +first portion of pipeline responsible to fetch micro-ops +which the Backend can execute. Within the Frontend, a branch +predictor predicts the next address to fetch, cache-lines +are fetched from memory, parsed into instructions, and +lastly decoded into micro-ops. The purpose of the Frontend +cluster is to deliver uops to Backend whenever the latter +can accept them. For example, stalls due to instruction- +cache misses would be categorized under Frontend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Frontend_Bound zero division" + print_error("Frontend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -197,18 +207,24 @@ class Frontend_Latency: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend latency -issues. For example, instruction-cache misses, iTLB misses or fetch stalls -after a branch misprediction are categorized under Frontend Latency. In such -cases the Frontend eventually delivers no uops for some period.""" +This metric represents slots fraction CPU was stalled due to +Frontend latency issues. 
For example, instruction-cache +misses, iTLB misses or fetch stalls after a branch +misprediction are categorized under Frontend Latency. In +such cases the Frontend eventually delivers no uops for some +period.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Latency zero division" + print_error("Frontend_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -218,16 +234,21 @@ class ITLB_Misses: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to instruction TLB -misses. Using large code pages may be considered here.""" +This metric represents cycles fraction CPU was stalled due +to instruction TLB misses. Using large code pages may be +considered here.""" level = 3 htoff = False + sample = ['ITLB_MISSES.WALK_COMPLETED'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ITLB_Misses zero division" + print_error("ITLB_Misses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -237,16 +258,21 @@ class DSB_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches from -DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" +This metric represents cycles fraction CPU was stalled due +to switches from DSB to MITE pipelines. Optimizing for +better DSB hit rate may be considered.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DSB_Switches zero division" + print_error("DSB_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -256,17 +282,22 @@ class LCP: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to Length Changing -Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will -certainly avoid this.""" +This metric represents cycles fraction CPU was stalled due +to Length Changing Prefixes (LCPs). Using proper compiler +flags or Intel Compiler by default will certainly avoid +this.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "LCP zero division" + print_error("LCP zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -276,19 +307,25 @@ class MS_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches of uop -delivery to the Microcode Sequencer (MS). Commonly used instructions are -optimized for delivery by the DSB or MITE pipelines. 
The MS is designated to -deliver long uop flows required by CISC instructions like CPUID, or uncommon -conditions like Floating Point Assists when dealing with Denormals.""" +This metric represents cycles fraction CPU was stalled due +to switches of uop delivery to the Microcode Sequencer (MS). +Commonly used instructions are optimized for delivery by the +DSB or MITE pipelines. The MS is designated to deliver long +uop flows required by CISC instructions like CPUID, or +uncommon conditions like Floating Point Assists when dealing +with Denormals.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MS_Switches zero division" + print_error("MS_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -298,19 +335,24 @@ class Frontend_Bandwidth: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend -bandwidth issues. For example, inefficiencies at the instruction decoders, or -code restrictions for caching in the DSB (decoded uops cache) are categorized -under Frontend Bandwidth. In such cases, the Frontend typically delivers non- -optimal amount of uops to the Backend.""" +This metric represents slots fraction CPU was stalled due to +Frontend bandwidth issues. For example, inefficiencies at +the instruction decoders, or code restrictions for caching +in the DSB (decoded uops cache) are categorized under +Frontend Bandwidth. In such cases, the Frontend typically +delivers non-optimal amount of uops to the Backend.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Bandwidth zero division" + print_error("Frontend_Bandwidth zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -320,17 +362,22 @@ class MITE: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to the MITE fetch pipeline. For example, inefficiencies in the -instruction decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to the MITE fetch pipeline. For example, +inefficiencies in the instruction decoders are categorized +here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MITE zero division" + print_error("MITE zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -340,18 +387,23 @@ class DSB: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to DSB (decoded uop cache) fetch pipeline. For example, inefficient -utilization of the DSB cache structure or bank conflict when reading from it, -are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to DSB (decoded uop cache) fetch +pipeline. 
For example, inefficient utilization of the DSB +cache structure or bank conflict when reading from it, are +categorized here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: - #print "DSB zero division" + print_error("DSB zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -361,19 +413,24 @@ class LSD: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining -Uop supply. However, in some rare cases, optimal uop-delivery could not be -reached for small loops whose size (in terms of number of uops) does not suit -well the LSD structure.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to LSD (Loop Stream Detector) unit. LSD +typically does well sustaining Uop supply. However, in some +rare cases, optimal uop-delivery could not be reached for +small loops whose size (in terms of number of uops) does not +suit well the LSD structure.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "LSD zero division" + print_error("LSD zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -383,19 +440,24 @@ class Bad_Speculation: domain = "Slots" area = "BAD" desc = """ -This category reflects slots wasted due to incorrect speculations, which -include slots used to allocate uops that do not eventually get retired and -slots for which allocation was blocked due to recovery from earlier incorrect -speculation. For example, wasted work due to miss-predicted branches are -categorized under Bad Speculation category""" +This category reflects slots wasted due to incorrect +speculations, which include slots used to allocate uops that +do not eventually get retired and slots for which allocation +was blocked due to recovery from earlier incorrect +speculation. For example, wasted work due to miss-predicted +branches are categorized under Bad Speculation category""" level = 1 htoff = False + sample = ['INT_MISC.RECOVERY_CYCLES'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: - #print "Bad_Speculation zero division" + print_error("Bad_Speculation zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -405,18 +467,23 @@ class Branch_Mispredicts: domain = "Slots" area = "BAD" desc = """ -This metric represents slots fraction CPU was impacted by Branch -Misprediction. These slots are either wasted by uops fetched from an -incorrectly speculated program path, or stalls the Backend of the machine -needs to recover its state from a speculative path.""" +This metric represents slots fraction CPU was impacted by +Branch Misprediction. 
These slots are either wasted by uops +fetched from an incorrectly speculated program path, or +stalls the Backend of the machine needs to recover its state +from a speculative path.""" level = 2 htoff = False + sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Branch_Mispredicts zero division" + print_error("Branch_Mispredicts zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -426,19 +493,24 @@ class Machine_Clears: domain = "Slots" area = "BAD" desc = """ -This metric represents slots fraction CPU was impacted by Machine Clears. -These slots are either wasted by uops fetched prior to the clear, or stalls -the Backend of the machine needs to recover its state after the clear. For -example, this can happen due to memory ordering Nukes (e.g. Memory -Disambiguation) or Self-Modifying-Code (SMC) nukes.""" +This metric represents slots fraction CPU was impacted by +Machine Clears. These slots are either wasted by uops +fetched prior to the clear, or stalls the Backend of the +machine needs to recover its state after the clear. For +example, this can happen due to memory ordering Nukes (e.g. +Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.""" level = 2 htoff = False + sample = ['MACHINE_CLEARS.COUNT'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Machine_Clears zero division" + print_error("Machine_Clears zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -448,21 +520,27 @@ class Backend_Bound: domain = "Slots" area = "BE" desc = """ -This category reflects slots where no uops are being delivered due to a lack -of required resources for accepting more uops in the Backend of the pipeline. -Backend describes the portion of the pipeline where the out-of-order scheduler -dispatches ready uops into their respective execution units, and once -completed these uops get retired according to program order. For example, -stalls due to data-cache misses or stalls due to the divider unit being -overloaded are both categorized under Backend Bound.""" +This category reflects slots where no uops are being +delivered due to a lack of required resources for accepting +more uops in the Backend of the pipeline. Backend describes +the portion of the pipeline where the out-of-order scheduler +dispatches ready uops into their respective execution units, +and once completed these uops get retired according to +program order. For example, stalls due to data-cache misses +or stalls due to the divider unit being overloaded are both +categorized under Backend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Backend_Bound zero division" + print_error("Backend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -472,19 +550,25 @@ class Memory_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how much Memory subsystem was a bottleneck. 
Memory -Bound measures cycle fraction where pipeline is likely stalled due to demand -load or store instructions. This accounts mainly for non-completed in-flight -memory demand loads which coincides with execution starvation. in addition to -less common cases where stores could imply backpressure on the pipeline.""" +This metric represents how much Memory subsystem was a +bottleneck. Memory Bound measures cycle fraction where +pipeline is likely stalled due to demand load or store +instructions. This accounts mainly for non-completed in- +flight memory demand loads which coincides with execution +starvation. in addition to less common cases where stores +could imply backpressure on the pipeline.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (STALLS_MEM_ANY(EV, 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Memory_Bound zero division" + print_error("Memory_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -494,20 +578,26 @@ class L1_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled without missing the L1 data -cache. The L1 cache typically has the shortest latency. However, in certain -cases like loads blocked on older stores, a load might suffer a high latency -even though it is being satisfied by the L1. There are no fill-buffers -allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event -as it accounts for any non-completed load.""" +This metric represents how often CPU was stalled without +missing the L1 data cache. The L1 cache typically has the +shortest latency. However, in certain cases like loads +blocked on older stores, a load might suffer a high latency +even though it is being satisfied by the L1. There are no +fill-buffers allocated for L1 hits so instead we use the +load matrix (LDM) stalls sub-event as it accounts for any +non-completed load.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (STALLS_MEM_ANY(EV, 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh except ZeroDivisionError: - #print "L1_Bound zero division" + print_error("L1_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -516,15 +606,21 @@ class DTLB_Load: name = "DTLB_Load" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Loads were waiting for page table walks. Consider making the +working set more compact or using large pages.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Load zero division" + print_error("DTLB_Load zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -533,15 +629,24 @@ class Store_Fwd_Blk: name = "Store_Fwd_Blk" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Stores were blocked on store-forwarding between depending +operations. 
This typically occurs when an output of a +computation is accessed with a different sized data type. +Review the rules for store forwarding in the optimization +guide.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Store_Fwd_Blk zero division" + print_error("Store_Fwd_Blk zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -550,15 +655,21 @@ class Split_Loads: name = "Split_Loads" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Loads were crossing 64 byte cache lines. Consider naturally +aligning data.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Load_Miss_Real_Latency(EV, 4)* EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Split_Loads zero division" + print_error("Split_Loads zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -567,15 +678,22 @@ class G4K_Aliasing: name = "4K_Aliasing" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Memory accesses were aliased by nearby others with a 4K +offset. Reorganize the data to avoid this. See the +optimization manual for more details.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "G4K_Aliasing zero division" + print_error("G4K_Aliasing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -585,17 +703,21 @@ class L2_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on L2 cache. Avoiding cache -misses (i.e. L1 misses/L2 hits) will improve the latency and increase -performance.""" +This metric represents how often CPU was stalled on L2 +cache. Avoiding cache misses (i.e. L1 misses/L2 hits) will +improve the latency and increase performance.""" level = 3 htoff = True + sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.03) and self.parent.thresh except ZeroDivisionError: - #print "L2_Bound zero division" + print_error("L2_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -605,17 +727,22 @@ class L3_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on L3 cache or contended with -a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve -the latency and increase performance.""" +This metric represents how often CPU was stalled on L3 cache +or contended with a sibling Core. Avoiding cache misses +(i.e. 
L2 misses/L3 hits) will improve the latency and +increase performance.""" level = 3 htoff = True + sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3_Bound zero division" + print_error("L3_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -624,15 +751,21 @@ class Contested_Accesses: name = "Contested_Accesses" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +64 byte cache lines were bouncing between cores. Avoid false +sharing, unnecessary writes, and localize data.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Contested_Accesses zero division" + print_error("Contested_Accesses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -644,12 +777,16 @@ class Data_Sharing: desc = "" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Data_Sharing zero division" + print_error("Data_Sharing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -659,18 +796,23 @@ class L3_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric is a rough aggregate estimate of cycles fraction where CPU -accessed L3 cache for all load requests, while there was no contention/sharing -with a sibling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will -improve the latency and increase performance.""" +This metric is a rough aggregate estimate of cycles fraction +where CPU accessed L3 cache for all load requests, while +there was no contention/sharing with a sibling core. +Avoiding cache misses (i.e. L2 misses/L3 hits) will improve +the latency and increase performance.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.L3_HIT", 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3_Latency zero division" + print_error("L3_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -680,17 +822,22 @@ class SQ_Full: domain = "CoreClocks" area = "BE/Mem" desc = """ -This metric measures fraction of cycles where the Super Queue (SQ) was full -taking into account all request-types and both hardware SMT threads. The Super -Queue is used for requests to access the L2 cache or to go out to the Uncore.""" +This metric measures fraction of cycles where the Super +Queue (SQ) was full taking into account all request-types +and both hardware SMT threads. 
The Super Queue is used for +requests to access the L2 cache or to go out to the Uncore.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "SQ_Full zero division" + print_error("SQ_Full zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -700,16 +847,21 @@ class MEM_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on main memory (DRAM). -Caching will improve the latency and increase performance.""" +This metric represents how often CPU was stalled on main +memory (DRAM). Caching will improve the latency and +increase performance.""" level = 3 htoff = True + sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Bound zero division" + print_error("MEM_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -719,17 +871,21 @@ class MEM_Bandwidth: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to approaching -bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be -considered in such case.""" +This metric represents how often CPU was likely stalled due +to approaching bandwidth limits of main memory (DRAM). NUMA +in multi-socket system may be considered in such case.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Bandwidth zero division" + print_error("MEM_Bandwidth zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -739,17 +895,22 @@ class MEM_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to latency from -main memory (DRAM). Data layout re-structuring or using Software Prefetches -(also through the compiler) may be considered in such case.""" +This metric represents how often CPU was likely stalled due +to latency from main memory (DRAM). Data layout re- +structuring or using Software Prefetches (also through the +compiler) may be considered in such case.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Latency zero division" + print_error("MEM_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -759,16 +920,21 @@ class Local_DRAM: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to loads from -local memory. Caching will improve the latency and increase performance.""" +This metric represents how often CPU was likely stalled due +to loads from local memory. 
Caching will improve the latency +and increase performance.""" level = 5 htoff = False + sample = ['MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_Local_DRAM_Cost * EV("MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM", 5) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Local_DRAM zero division" + print_error("Local_DRAM zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -778,16 +944,21 @@ class Remote_DRAM: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to loads from -remote memory. This is caused often due to non-optimal NUMA allocations.""" +This metric represents how often CPU was likely stalled due +to loads from remote memory. This is caused often due to +non-optimal NUMA allocations.""" level = 5 htoff = False + sample = ['MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_Remote_DRAM_Cost * EV("MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM", 5) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Remote_DRAM zero division" + print_error("Remote_DRAM zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -797,17 +968,21 @@ class Remote_Cache: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to loads from -remote cache in other sockets. This is caused often due to non-optimal NUMA -allocations.""" +This metric represents how often CPU was likely stalled due +to loads from remote cache in other sockets. This is caused +often due to non-optimal NUMA allocations.""" level = 5 htoff = False + sample = ['MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM:pp', 'MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_Remote_HitM_Cost * EV("MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM", 5) + Mem_Remote_Fwd_Cost * EV("MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD", 5)) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Remote_Cache zero division" + print_error("Remote_Cache zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -817,18 +992,23 @@ class Stores_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled due to store operations. -even though memory store accesses do not typically stall out-of-order CPUs; -there are few cases where stores can lead to actual stalls. This metric will -be flagged should any of these cases be a bottleneck.""" +This metric represents how often CPU was stalled due to +store operations. even though memory store accesses do not +typically stall out-of-order CPUs; there are few cases where +stores can lead to actual stalls. 
This metric will be +flagged should any of these cases be a bottleneck.""" level = 3 htoff = False + sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Memory_Bound.compute(EV) - STALLS_MEM_ANY(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Stores_Bound zero division" + print_error("Stores_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -838,16 +1018,21 @@ class Split_Stores: domain = "CoreClocks" area = "BE/Mem" desc = """ -This metric represents rate of split store accesses. Consider aligning your -data to the 64-byte cache line granularity.""" +This metric represents rate of split store accesses. +Consider aligning your data to the 64-byte cache line +granularity.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Split_Stores zero division" + print_error("Split_Stores zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -857,20 +1042,25 @@ class DTLB_Store: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents cycles fraction spent handling first-level data TLB -store misses. As with ordinary data caching, focus on improving data locality -and reducing working-set size to reduce DTLB overhead. Additionally, consider -using profile-guided optimization (PGO) to collocate frequently-used data on -the same page. Try using larger page sizes for large amounts of frequently- -used data.""" +This metric represents cycles fraction spent handling first- +level data TLB store misses. As with ordinary data caching, +focus on improving data locality and reducing working-set +size to reduce DTLB overhead. Additionally, consider using +profile-guided optimization (PGO) to collocate frequently- +used data on the same page. Try using larger page sizes for +large amounts of frequently-used data.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Store zero division" + print_error("DTLB_Store zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -880,21 +1070,27 @@ class Core_Bound: domain = "Clocks" area = "BE/Core" desc = """ -This metric represents how much Core non-memory issues were of a bottleneck. -Shortage in hardware compute resources, or dependencies software's -instructions are both categorized under Core Bound. Hence it may indicate the -machine ran out of an OOO resources, certain execution units are overloaded or -dependencies in program's data- or instruction-flow are limiting the -performance (e.g. FP-chained long-latency arithmetic operations). Tip: -consider Port Saturation analysis as next step.""" +This metric represents how much Core non-memory issues were +of a bottleneck. Shortage in hardware compute resources, or +dependencies software's instructions are both categorized +under Core Bound. 
Hence it may indicate the machine ran out +of an OOO resources, certain execution units are overloaded +or dependencies in program's data- or instruction-flow are +limiting the performance (e.g. FP-chained long-latency +arithmetic operations). Tip: consider Port Saturation +analysis as next step.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Core_Bound zero division" + print_error("Core_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -903,15 +1099,22 @@ class Divider: name = "Divider" domain = "CoreClocks" area = "BE/Core" - desc = "" + desc = """ +Time waiting for divisions by variables. Change the dividend +to be constant or use profile feedback to let the compiler +do that.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 10 * EV("ARITH.DIVIDER_UOPS", 3) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Divider zero division" + print_error("Divider zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -921,21 +1124,27 @@ class Ports_Utilization: domain = "Clocks" area = "BE/Core" desc = """ -This metric represents cycles fraction application was stalled due to Core -computation issues (non divider-related). For example, heavy data-dependency -between nearby instructions will manifest in this category. Ditto if -instruction-mix used by the application overloads specific hardware execution -unit. Hint: Loop Vectorization -most compilers feature auto-Vectorization -options today- reduces pressure on the execution ports as multiple elements -are calculated with same uop.""" +This metric represents cycles fraction application was +stalled due to Core computation issues (non divider- +related). For example, heavy data-dependency between nearby +instructions will manifest in this category. Ditto if +instruction-mix used by the application overloads specific +hardware execution unit. 
Hint: Loop Vectorization -most +compilers feature auto-Vectorization options today- reduces +pressure on the execution ports as multiple elements are +calculated with same uop.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Ports_Utilization zero division" + print_error("Ports_Utilization zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -945,16 +1154,20 @@ class G0_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed no uops on any -execution port.""" +This metric represents Core cycles fraction CPU executed no +uops on any execution port.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G0_Ports_Utilized zero division" + print_error("G0_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -964,22 +1177,29 @@ class G1_Port_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction where the CPU executed total of 1 -uop per cycle on all execution ports. This can be due to heavy data-dependency -among software instructions, or over oversubscribing a particular hardware -resource. In some other cases with high 1_Port_Utilized and L1_Bound, this -metric can point to L1 data-cache latency bottleneck that may not necessarily -manifest with complete execution starvation (due to the short L1 latency e.g. -walking a linked list) - looking at the assembly can be helpful. Tip: consider -'Core Ports Saturation' analysis-type as next step.""" +This metric represents Core cycles fraction where the CPU +executed total of 1 uop per cycle on all execution ports. +This can be due to heavy data-dependency among software +instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and +L1_Bound, this metric can point to L1 data-cache latency +bottleneck that may not necessarily manifest with complete +execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be +helpful. Tip: consider 'Core Ports Saturation' analysis-type +as next step.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G1_Port_Utilized zero division" + print_error("G1_Port_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -989,19 +1209,25 @@ class G2_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed total of 2 uops per -cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- -type as next step. Loop Vectorization -most compilers feature auto- -Vectorization options today- reduces pressure on the execution ports as -multiple elements are calculated with same uop.""" +This metric represents Core cycles fraction CPU executed +total of 2 uops per cycle on all execution ports. 
Tip: +consider 'Core Port Saturation' analysis-type as next step. +Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the +execution ports as multiple elements are calculated with +same uop.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G2_Ports_Utilized zero division" + print_error("G2_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1011,17 +1237,22 @@ class G3m_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed total of 3 or more -uops per cycle on all execution ports. Tip: consider 'Core Port Saturation' -analysis-type as next step""" +This metric represents Core cycles fraction CPU executed +total of 3 or more uops per cycle on all execution ports. +Tip: consider 'Core Port Saturation' analysis-type as next +step""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G3m_Ports_Utilized zero division" + print_error("G3m_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1031,25 +1262,32 @@ class Retiring: domain = "Slots" area = "RET" desc = """ -This category reflects slots utilized by useful work i.e. allocated uops that -eventually get retired. Ideally, all pipeline slots would be attributed to the -Retiring category. Retiring of 100% would indicate the maximum 4 uops retired -per cycle has been achieved. Maximizing Retiring typically increases the -Instruction-Per-Cycle metric. Note that a high Retiring value does not -necessary mean there is no room for more performance. For example, Microcode -assists are categorized under Retiring. They hurt performance and can often be -avoided. A high Retiring value for non-vectorized code may be a good hint for -programmer to consider vectorizing his code. Doing so essentially lets more -computations be done without significantly increasing number of instructions -thus improving the performance.""" +This category reflects slots utilized by useful work i.e. +allocated uops that eventually get retired. Ideally, all +pipeline slots would be attributed to the Retiring category. +Retiring of 100% would indicate the maximum 4 uops retired +per cycle has been achieved. Maximizing Retiring typically +increases the Instruction-Per-Cycle metric. Note that a high +Retiring value does not necessary mean there is no room for +more performance. For example, Microcode assists are +categorized under Retiring. They hurt performance and can +often be avoided. A high Retiring value for non-vectorized +code may be a good hint for programmer to consider +vectorizing his code. 
Doing so essentially lets more +computations be done without significantly increasing number +of instructions thus improving the performance.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: - #print "Retiring zero division" + print_error("Retiring zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1059,21 +1297,27 @@ class Base: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction where the CPU was retiring uops not -originated from the microcode-sequencer. This correlates with total number of -instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. While this is the most desirable of the top 4 categories, high -values may still indicate areas for improvement. If possible focus on -techniques that reduce instruction count or result in more efficient -instructions generation such as vectorization.""" +This metric represents slots fraction where the CPU was +retiring uops not originated from the microcode-sequencer. +This correlates with total number of instructions used by +the program. A uops-per-instruction ratio of 1 should be +expected. While this is the most desirable of the top 4 +categories, high values may still indicate areas for +improvement. If possible focus on techniques that reduce +instruction count or result in more efficient instructions +generation such as vectorization.""" level = 2 htoff = False + sample = ['INST_RETIRED.PREC_DIST'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: - #print "Base zero division" + print_error("Base zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1083,18 +1327,24 @@ class Microcode_Sequencer: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops fetched by the -Microcode Sequencer (MS) ROM. The MS is used for CISC instructions not fully -decoded by the default decoders (like repeat move strings), or by microcode -assists used to address some operation modes (like in Floating Point assists).""" +This metric represents slots fraction CPU was retiring uops +fetched by the Microcode Sequencer (MS) ROM. 
The MS is used +for CISC instructions not fully decoded by the default +decoders (like repeat move strings), or by microcode assists +used to address some operation modes (like in Floating Point +assists).""" level = 2 htoff = False + sample = ['IDQ.MS_UOPS'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: - #print "Microcode_Sequencer zero division" + print_error("Microcode_Sequencer zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1105,12 +1355,14 @@ class Metric_IPC: Instructions Per Cycle (per logical thread)""" domain = "Metric" maxval = 5 + errcount = 0 def compute(self, EV): try: self.val = IPC(EV, 0) except ZeroDivisionError: - print "IPC zero division" + print_error("IPC zero division") + self.errcount += 1 self.val = 0 class Metric_CPI: @@ -1119,12 +1371,14 @@ class Metric_CPI: Cycles Per Instruction (threaded)""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CPI(EV, 0) except ZeroDivisionError: - print "CPI zero division" + print_error("CPI zero division") + self.errcount += 1 self.val = 0 class Metric_CoreIPC: @@ -1132,13 +1386,15 @@ class Metric_CoreIPC: desc = """ Instructions Per Cycle (per physical core)""" domain = "CoreMetric" - maxval = 5 + maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = CoreIPC(EV, 0) except ZeroDivisionError: - print "CoreIPC zero division" + print_error("CoreIPC zero division") + self.errcount += 1 self.val = 0 class Metric_UPI: @@ -1147,12 +1403,14 @@ class Metric_UPI: Uops Per Instruction""" domain = "Metric" maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = UPI(EV, 0) except ZeroDivisionError: - print "UPI zero division" + print_error("UPI zero division") + self.errcount += 1 self.val = 0 class Metric_IPTB: @@ -1161,71 +1419,82 @@ class Metric_IPTB: Instruction per taken branch""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = IPTB(EV, 0) except ZeroDivisionError: - print "IPTB zero division" + print_error("IPTB zero division") + self.errcount += 1 self.val = 0 class Metric_BPTB: name = "BPTB" desc = """ -Branch instructions per taken branch. Can be used to approximate PGO- -likelihood for non-loopy codes.""" +Branch instructions per taken branch. 
Can be used to +approximate PGO-likelihood for non-loopy codes.""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = BPTB(EV, 0) except ZeroDivisionError: - print "BPTB zero division" + print_error("BPTB zero division") + self.errcount += 1 self.val = 0 class Metric_DSB_Coverage: name = "DSB_Coverage" desc = """ -Fraction of Uops delivered by the DSB (decoded instructions cache)""" +Fraction of Uops delivered by the DSB (decoded instructions +cache)""" domain = "Metric" maxval = 1 + errcount = 0 def compute(self, EV): try: self.val = DSB_Coverage(EV, 0) except ZeroDivisionError: - print "DSB_Coverage zero division" + print_error("DSB_Coverage zero division") + self.errcount += 1 self.val = 0 class Metric_ILP: name = "ILP" desc = """ -Instruction-Level-Parallelism (average number of uops executed when there is -at least 1 uop executed)""" +Instruction-Level-Parallelism (average number of uops +executed when there is at least 1 uop executed)""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = ILP(EV, 0) except ZeroDivisionError: - print "ILP zero division" + print_error("ILP zero division") + self.errcount += 1 self.val = 0 class Metric_MLP: name = "MLP" desc = """ -Memory-Level-Parallelism (average number of L1 miss demand load when there is -at least 1 such miss)""" +Memory-Level-Parallelism (average number of L1 miss demand +load when there is at least 1 such miss)""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = MLP(EV, 0) except ZeroDivisionError: - print "MLP zero division" + print_error("MLP zero division") + self.errcount += 1 self.val = 0 class Metric_Load_Miss_Real_Latency: @@ -1234,41 +1503,47 @@ class Metric_Load_Miss_Real_Latency: Actual Average Latency for L1 data-cache miss demand loads""" domain = "Metric" maxval = 1000 + errcount = 0 def compute(self, EV): try: self.val = Load_Miss_Real_Latency(EV, 0) except ZeroDivisionError: - print "Load_Miss_Real_Latency zero division" + print_error("Load_Miss_Real_Latency zero division") + self.errcount += 1 self.val = 0 class Metric_Turbo_Utilization: name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" - domain = "CoreMetric" + domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = Turbo_Utilization(EV, 0) except ZeroDivisionError: - print "Turbo_Utilization zero division" + print_error("Turbo_Utilization zero division") + self.errcount += 1 self.val = 0 class Metric_Page_Walks_Use: name = "Page_Walks_Use" desc = """ -Fraction of cycles where the core's Page Walker is busy serving -iTLB/Load/Store""" +Fraction of cycles where the core's Page Walker is busy +serving iTLB/Load/Store""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = Page_Walks_Use(EV, 0) except ZeroDivisionError: - print "Page_Walks_Use zero division" + print_error("Page_Walks_Use zero division") + self.errcount += 1 self.val = 0 class Metric_MUX: @@ -1277,12 +1552,14 @@ class Metric_MUX: PerfMon Event Multiplexing accuracy indicator""" domain = "Clocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = MUX(EV, 0) except ZeroDivisionError: - print "MUX zero division" + print_error("MUX zero division") + self.errcount += 1 self.val = 0 class Metric_CLKS: @@ -1291,12 +1568,14 @@ class Metric_CLKS: Per-thread actual clocks""" domain = "Count" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CLKS(EV, 0) except ZeroDivisionError: 
- print "CLKS zero division" + print_error("CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_CORE_CLKS: @@ -1305,12 +1584,14 @@ class Metric_CORE_CLKS: Core actual clocks""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CORE_CLKS(EV, 0) except ZeroDivisionError: - print "CORE_CLKS zero division" + print_error("CORE_CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_Time: @@ -1319,12 +1600,14 @@ class Metric_Time: Run duration time in seconds""" domain = "Count" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = Time(EV, 0) except ZeroDivisionError: - print "Time zero division" + print_error("Time zero division") + self.errcount += 1 self.val = 0 # Schedule @@ -1444,100 +1727,11 @@ def __init__(self, r): # siblings cross-tree - o["Frontend_Bound"].sibling = None - o["Frontend_Latency"].sibling = None - o["ITLB_Misses"].sibling = None - o["DSB_Switches"].sibling = None - o["LCP"].sibling = None o["MS_Switches"].sibling = o["Microcode_Sequencer"] - o["Frontend_Bandwidth"].sibling = None - o["MITE"].sibling = None - o["DSB"].sibling = None - o["LSD"].sibling = None - o["Bad_Speculation"].sibling = None - o["Branch_Mispredicts"].sibling = None - o["Machine_Clears"].sibling = None - o["Backend_Bound"].sibling = None - o["Memory_Bound"].sibling = None o["L1_Bound"].sibling = o["G1_Port_Utilized"] - o["DTLB_Load"].sibling = None - o["Store_Fwd_Blk"].sibling = None - o["Split_Loads"].sibling = None - o["G4K_Aliasing"].sibling = None - o["L2_Bound"].sibling = None - o["L3_Bound"].sibling = None - o["Contested_Accesses"].sibling = None - o["Data_Sharing"].sibling = None - o["L3_Latency"].sibling = None - o["SQ_Full"].sibling = None - o["MEM_Bound"].sibling = None - o["MEM_Bandwidth"].sibling = None - o["MEM_Latency"].sibling = None - o["Local_DRAM"].sibling = None - o["Remote_DRAM"].sibling = None - o["Remote_Cache"].sibling = None - o["Stores_Bound"].sibling = None - o["Split_Stores"].sibling = None - o["DTLB_Store"].sibling = None - o["Core_Bound"].sibling = None - o["Divider"].sibling = None - o["Ports_Utilization"].sibling = None - o["G0_Ports_Utilized"].sibling = None o["G1_Port_Utilized"].sibling = o["L1_Bound"] - o["G2_Ports_Utilized"].sibling = None - o["G3m_Ports_Utilized"].sibling = None - o["Retiring"].sibling = None - o["Base"].sibling = None o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events - - o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] - o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] - o["DSB_Switches"].sample = [] - o["LCP"].sample = [] - o["MS_Switches"].sample = [] - o["Frontend_Bandwidth"].sample = [] - o["MITE"].sample = [] - o["DSB"].sample = [] - o["LSD"].sample = [] - o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] - o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] - o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] - o["Backend_Bound"].sample = [] - o["Memory_Bound"].sample = [] - o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] - o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] - o["Store_Fwd_Blk"].sample = [] - o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] - o["G4K_Aliasing"].sample = [] - o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] - o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] - o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT:pp', 
'MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS:pp'] - o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp'] - o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_HIT:pp'] - o["SQ_Full"].sample = [] - o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] - o["MEM_Bandwidth"].sample = [] - o["MEM_Latency"].sample = [] - o["Local_DRAM"].sample = ['MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM:pp'] - o["Remote_DRAM"].sample = ['MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM:pp'] - o["Remote_Cache"].sample = ['MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM:pp', 'MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD:pp'] - o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] - o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] - o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] - o["Core_Bound"].sample = [] - o["Divider"].sample = [] - o["Ports_Utilization"].sample = [] - o["G0_Ports_Utilized"].sample = [] - o["G1_Port_Utilized"].sample = [] - o["G2_Ports_Utilized"].sample = [] - o["G3m_Ports_Utilized"].sample = [] - o["Retiring"].sample = [] - o["Base"].sample = ['INST_RETIRED.PREC_DIST'] - o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] - # user visible metrics n = Metric_IPC() ; r.metric(n) diff --git a/ivb_client_ratios.py b/ivb_client_ratios.py index 7373a4af..f978950f 100644 --- a/ivb_client_ratios.py +++ b/ivb_client_ratios.py @@ -8,6 +8,10 @@ # https://sites.google.com/site/analysismethods/yasin-pubs # +# Helpers + +print_error = lambda msg: False + smt_enabled = False # Constants @@ -189,22 +193,28 @@ class Frontend_Bound: domain = "Slots" area = "FE" desc = """ -This category reflects slots where the Frontend of the processor undersupplies -its Backend. Frontend denotes the first portion of pipeline responsible to -fetch micro-ops which the Backend can execute. Within the Frontend, a branch -predictor predicts the next address to fetch, cache-lines are fetched from -memory, parsed into instructions, and lastly decoded into micro-ops. The -purpose of the Frontend cluster is to deliver uops to Backend whenever the -latter can accept them. For example, stalls due to instruction-cache misses -would be categorized under Frontend Bound.""" +This category reflects slots where the Frontend of the +processor undersupplies its Backend. Frontend denotes the +first portion of pipeline responsible to fetch micro-ops +which the Backend can execute. Within the Frontend, a branch +predictor predicts the next address to fetch, cache-lines +are fetched from memory, parsed into instructions, and +lastly decoded into micro-ops. The purpose of the Frontend +cluster is to deliver uops to Backend whenever the latter +can accept them. For example, stalls due to instruction- +cache misses would be categorized under Frontend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Frontend_Bound zero division" + print_error("Frontend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -214,18 +224,24 @@ class Frontend_Latency: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend latency -issues. For example, instruction-cache misses, iTLB misses or fetch stalls -after a branch misprediction are categorized under Frontend Latency. 
In such -cases the Frontend eventually delivers no uops for some period.""" +This metric represents slots fraction CPU was stalled due to +Frontend latency issues. For example, instruction-cache +misses, iTLB misses or fetch stalls after a branch +misprediction are categorized under Frontend Latency. In +such cases the Frontend eventually delivers no uops for some +period.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Latency zero division" + print_error("Frontend_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -235,17 +251,22 @@ class ICache_Misses: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to instruction -cache misses. Using compiler's Profile-Guided Optimization (PGO) can reduce -i-cache misses through improved hot code layout.""" +This metric represents cycles fraction CPU was stalled due +to instruction cache misses. Using compiler's Profile-Guided +Optimization (PGO) can reduce i-cache misses through +improved hot code layout.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("ICACHE.IFETCH_STALL", 3) / CLKS(EV, 3) - self.ITLB_Misses.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ICache_Misses zero division" + print_error("ICache_Misses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -255,16 +276,21 @@ class ITLB_Misses: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to instruction TLB -misses. Using large code pages may be considered here.""" +This metric represents cycles fraction CPU was stalled due +to instruction TLB misses. Using large code pages may be +considered here.""" level = 3 htoff = False + sample = ['ITLB_MISSES.WALK_COMPLETED'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ITLB_Misses zero division" + print_error("ITLB_Misses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -274,19 +300,25 @@ class Branch_Resteers: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to Branch Resteers. -Following all sorts of miss-predicted branches, this measure the delays of -fetch instructions from corrected path caused by the Frontend of the machine. -For example, branchy code with lots of (taken) branches and/or branch miss- -predictions might get categorized under Branch Resteers.""" +This metric represents cycles fraction CPU was stalled due +to Branch Resteers. Following all sorts of miss-predicted +branches, this measure the delays of fetch instructions from +corrected path caused by the Frontend of the machine. 
For +example, branchy code with lots of (taken) branches and/or +branch miss-predictions might get categorized under Branch +Resteers.""" level = 3 htoff = False + sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Avg_RS_Empty_Period_Clears(EV, 3)*(EV("BR_MISP_RETIRED.ALL_BRANCHES", 3) + EV("MACHINE_CLEARS.COUNT", 3) + EV("BACLEARS.ANY", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Branch_Resteers zero division" + print_error("Branch_Resteers zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -296,16 +328,21 @@ class DSB_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches from -DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" +This metric represents cycles fraction CPU was stalled due +to switches from DSB to MITE pipelines. Optimizing for +better DSB hit rate may be considered.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DSB_Switches zero division" + print_error("DSB_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -315,17 +352,22 @@ class LCP: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to Length Changing -Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will -certainly avoid this.""" +This metric represents cycles fraction CPU was stalled due +to Length Changing Prefixes (LCPs). Using proper compiler +flags or Intel Compiler by default will certainly avoid +this.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "LCP zero division" + print_error("LCP zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -335,19 +377,25 @@ class MS_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches of uop -delivery to the Microcode Sequencer (MS). Commonly used instructions are -optimized for delivery by the DSB or MITE pipelines. The MS is designated to -deliver long uop flows required by CISC instructions like CPUID, or uncommon -conditions like Floating Point Assists when dealing with Denormals.""" +This metric represents cycles fraction CPU was stalled due +to switches of uop delivery to the Microcode Sequencer (MS). +Commonly used instructions are optimized for delivery by the +DSB or MITE pipelines. 
The MS is designated to deliver long +uop flows required by CISC instructions like CPUID, or +uncommon conditions like Floating Point Assists when dealing +with Denormals.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MS_Switches zero division" + print_error("MS_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -357,19 +405,24 @@ class Frontend_Bandwidth: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend -bandwidth issues. For example, inefficiencies at the instruction decoders, or -code restrictions for caching in the DSB (decoded uops cache) are categorized -under Frontend Bandwidth. In such cases, the Frontend typically delivers non- -optimal amount of uops to the Backend.""" +This metric represents slots fraction CPU was stalled due to +Frontend bandwidth issues. For example, inefficiencies at +the instruction decoders, or code restrictions for caching +in the DSB (decoded uops cache) are categorized under +Frontend Bandwidth. In such cases, the Frontend typically +delivers non-optimal amount of uops to the Backend.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Bandwidth zero division" + print_error("Frontend_Bandwidth zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -379,17 +432,22 @@ class MITE: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to the MITE fetch pipeline. For example, inefficiencies in the -instruction decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to the MITE fetch pipeline. For example, +inefficiencies in the instruction decoders are categorized +here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MITE zero division" + print_error("MITE zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -399,18 +457,23 @@ class DSB: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to DSB (decoded uop cache) fetch pipeline. For example, inefficient -utilization of the DSB cache structure or bank conflict when reading from it, -are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to DSB (decoded uop cache) fetch +pipeline. 
For example, inefficient utilization of the DSB +cache structure or bank conflict when reading from it, are +categorized here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: - #print "DSB zero division" + print_error("DSB zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -420,19 +483,24 @@ class LSD: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining -Uop supply. However, in some rare cases, optimal uop-delivery could not be -reached for small loops whose size (in terms of number of uops) does not suit -well the LSD structure.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to LSD (Loop Stream Detector) unit. LSD +typically does well sustaining Uop supply. However, in some +rare cases, optimal uop-delivery could not be reached for +small loops whose size (in terms of number of uops) does not +suit well the LSD structure.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "LSD zero division" + print_error("LSD zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -442,19 +510,24 @@ class Bad_Speculation: domain = "Slots" area = "BAD" desc = """ -This category reflects slots wasted due to incorrect speculations, which -include slots used to allocate uops that do not eventually get retired and -slots for which allocation was blocked due to recovery from earlier incorrect -speculation. For example, wasted work due to miss-predicted branches are -categorized under Bad Speculation category""" +This category reflects slots wasted due to incorrect +speculations, which include slots used to allocate uops that +do not eventually get retired and slots for which allocation +was blocked due to recovery from earlier incorrect +speculation. For example, wasted work due to miss-predicted +branches are categorized under Bad Speculation category""" level = 1 htoff = False + sample = ['INT_MISC.RECOVERY_CYCLES'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: - #print "Bad_Speculation zero division" + print_error("Bad_Speculation zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -464,18 +537,23 @@ class Branch_Mispredicts: domain = "Slots" area = "BAD" desc = """ -This metric represents slots fraction CPU was impacted by Branch -Misprediction. These slots are either wasted by uops fetched from an -incorrectly speculated program path, or stalls the Backend of the machine -needs to recover its state from a speculative path.""" +This metric represents slots fraction CPU was impacted by +Branch Misprediction. 
These slots are either wasted by uops +fetched from an incorrectly speculated program path, or +stalls the Backend of the machine needs to recover its state +from a speculative path.""" level = 2 htoff = False + sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Branch_Mispredicts zero division" + print_error("Branch_Mispredicts zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -485,19 +563,24 @@ class Machine_Clears: domain = "Slots" area = "BAD" desc = """ -This metric represents slots fraction CPU was impacted by Machine Clears. -These slots are either wasted by uops fetched prior to the clear, or stalls -the Backend of the machine needs to recover its state after the clear. For -example, this can happen due to memory ordering Nukes (e.g. Memory -Disambiguation) or Self-Modifying-Code (SMC) nukes.""" +This metric represents slots fraction CPU was impacted by +Machine Clears. These slots are either wasted by uops +fetched prior to the clear, or stalls the Backend of the +machine needs to recover its state after the clear. For +example, this can happen due to memory ordering Nukes (e.g. +Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.""" level = 2 htoff = False + sample = ['MACHINE_CLEARS.COUNT'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Machine_Clears zero division" + print_error("Machine_Clears zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -507,21 +590,27 @@ class Backend_Bound: domain = "Slots" area = "BE" desc = """ -This category reflects slots where no uops are being delivered due to a lack -of required resources for accepting more uops in the Backend of the pipeline. -Backend describes the portion of the pipeline where the out-of-order scheduler -dispatches ready uops into their respective execution units, and once -completed these uops get retired according to program order. For example, -stalls due to data-cache misses or stalls due to the divider unit being -overloaded are both categorized under Backend Bound.""" +This category reflects slots where no uops are being +delivered due to a lack of required resources for accepting +more uops in the Backend of the pipeline. Backend describes +the portion of the pipeline where the out-of-order scheduler +dispatches ready uops into their respective execution units, +and once completed these uops get retired according to +program order. For example, stalls due to data-cache misses +or stalls due to the divider unit being overloaded are both +categorized under Backend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Backend_Bound zero division" + print_error("Backend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -531,19 +620,25 @@ class Memory_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how much Memory subsystem was a bottleneck. 
Memory -Bound measures cycle fraction where pipeline is likely stalled due to demand -load or store instructions. This accounts mainly for non-completed in-flight -memory demand loads which coincides with execution starvation. in addition to -less common cases where stores could imply backpressure on the pipeline.""" +This metric represents how much Memory subsystem was a +bottleneck. Memory Bound measures cycle fraction where +pipeline is likely stalled due to demand load or store +instructions. This accounts mainly for non-completed in- +flight memory demand loads which coincides with execution +starvation. in addition to less common cases where stores +could imply backpressure on the pipeline.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (STALLS_MEM_ANY(EV, 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Memory_Bound zero division" + print_error("Memory_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -553,20 +648,26 @@ class L1_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled without missing the L1 data -cache. The L1 cache typically has the shortest latency. However, in certain -cases like loads blocked on older stores, a load might suffer a high latency -even though it is being satisfied by the L1. There are no fill-buffers -allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event -as it accounts for any non-completed load.""" +This metric represents how often CPU was stalled without +missing the L1 data cache. The L1 cache typically has the +shortest latency. However, in certain cases like loads +blocked on older stores, a load might suffer a high latency +even though it is being satisfied by the L1. There are no +fill-buffers allocated for L1 hits so instead we use the +load matrix (LDM) stalls sub-event as it accounts for any +non-completed load.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (STALLS_MEM_ANY(EV, 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh except ZeroDivisionError: - #print "L1_Bound zero division" + print_error("L1_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -575,15 +676,21 @@ class DTLB_Load: name = "DTLB_Load" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Loads were waiting for page table walks. Consider making the +working set more compact or using large pages.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Load zero division" + print_error("DTLB_Load zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -592,15 +699,24 @@ class Store_Fwd_Blk: name = "Store_Fwd_Blk" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Stores were blocked on store-forwarding between depending +operations. 
This typically occurs when an output of a +computation is accessed with a different sized data type. +Review the rules for store forwarding in the optimization +guide.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Store_Fwd_Blk zero division" + print_error("Store_Fwd_Blk zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -610,17 +726,22 @@ class Lock_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents cycles fraction the CPU spent handling cache misses due -to lock operations. Due to the microarchitecture handling of locks, they are -classified as L1_Bound regardless of what memory source satsified them.""" +This metric represents cycles fraction the CPU spent +handling cache misses due to lock operations. Due to the +microarchitecture handling of locks, they are classified as +L1_Bound regardless of what memory source satsified them.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.LOCK_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_Lock_St_Fraction(EV, 4)* ORO_Demand_RFO_C1(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Lock_Latency zero division" + print_error("Lock_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -629,15 +750,21 @@ class Split_Loads: name = "Split_Loads" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Loads were crossing 64 byte cache lines. Consider naturally +aligning data.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 13 * EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Split_Loads zero division" + print_error("Split_Loads zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -646,15 +773,22 @@ class G4K_Aliasing: name = "4K_Aliasing" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Memory accesses were aliased by nearby others with a 4K +offset. Reorganize the data to avoid this. See the +optimization manual for more details.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "G4K_Aliasing zero division" + print_error("G4K_Aliasing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -664,17 +798,21 @@ class L2_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on L2 cache. Avoiding cache -misses (i.e. L1 misses/L2 hits) will improve the latency and increase -performance.""" +This metric represents how often CPU was stalled on L2 +cache. Avoiding cache misses (i.e. 
L1 misses/L2 hits) will +improve the latency and increase performance.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.03) and self.parent.thresh except ZeroDivisionError: - #print "L2_Bound zero division" + print_error("L2_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -684,17 +822,22 @@ class L3_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on L3 cache or contended with -a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve -the latency and increase performance.""" +This metric represents how often CPU was stalled on L3 cache +or contended with a sibling Core. Avoiding cache misses +(i.e. L2 misses/L3 hits) will improve the latency and +increase performance.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3_Bound zero division" + print_error("L3_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -703,15 +846,21 @@ class Contested_Accesses: name = "Contested_Accesses" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +64 byte cache lines were bouncing between cores. Avoid false +sharing, unnecessary writes, and localize data.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Contested_Accesses zero division" + print_error("Contested_Accesses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -723,12 +872,16 @@ class Data_Sharing: desc = "" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Data_Sharing zero division" + print_error("Data_Sharing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -738,18 +891,23 @@ class L3_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric is a rough aggregate estimate of cycles fraction where CPU -accessed L3 cache for all load requests, while there was no contention/sharing -with a sibling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will -improve the latency and increase performance.""" +This metric is a rough aggregate estimate of cycles fraction +where CPU accessed L3 cache for all load requests, while +there was no contention/sharing with a sibling core. +Avoiding cache misses (i.e. 
L2 misses/L3 hits) will improve +the latency and increase performance.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3_Latency zero division" + print_error("L3_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -759,17 +917,22 @@ class SQ_Full: domain = "CoreClocks" area = "BE/Mem" desc = """ -This metric measures fraction of cycles where the Super Queue (SQ) was full -taking into account all request-types and both hardware SMT threads. The Super -Queue is used for requests to access the L2 cache or to go out to the Uncore.""" +This metric measures fraction of cycles where the Super +Queue (SQ) was full taking into account all request-types +and both hardware SMT threads. The Super Queue is used for +requests to access the L2 cache or to go out to the Uncore.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "SQ_Full zero division" + print_error("SQ_Full zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -779,16 +942,21 @@ class MEM_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on main memory (DRAM). -Caching will improve the latency and increase performance.""" +This metric represents how often CPU was stalled on main +memory (DRAM). Caching will improve the latency and +increase performance.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.LLC_MISS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Bound zero division" + print_error("MEM_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -798,17 +966,21 @@ class MEM_Bandwidth: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to approaching -bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be -considered in such case.""" +This metric represents how often CPU was likely stalled due +to approaching bandwidth limits of main memory (DRAM). NUMA +in multi-socket system may be considered in such case.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Bandwidth zero division" + print_error("MEM_Bandwidth zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -818,17 +990,22 @@ class MEM_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to latency from -main memory (DRAM). Data layout re-structuring or using Software Prefetches -(also through the compiler) may be considered in such case.""" +This metric represents how often CPU was likely stalled due +to latency from main memory (DRAM). 
Data layout re- +structuring or using Software Prefetches (also through the +compiler) may be considered in such case.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Latency zero division" + print_error("MEM_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -838,18 +1015,23 @@ class Stores_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled due to store operations. -even though memory store accesses do not typically stall out-of-order CPUs; -there are few cases where stores can lead to actual stalls. This metric will -be flagged should any of these cases be a bottleneck.""" +This metric represents how often CPU was stalled due to +store operations. even though memory store accesses do not +typically stall out-of-order CPUs; there are few cases where +stores can lead to actual stalls. This metric will be +flagged should any of these cases be a bottleneck.""" level = 3 htoff = False + sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Memory_Bound.compute(EV) - STALLS_MEM_ANY(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Stores_Bound zero division" + print_error("Stores_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -859,16 +1041,21 @@ class Store_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents cycles fraction the CPU spent handling long-latency -store misses (missing 2nd level cache).""" +This metric represents cycles fraction the CPU spent +handling long-latency store misses (missing 2nd level +cache).""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Store_L2_Hit_Cycles(EV, 4) +(1 - Mem_Lock_St_Fraction(EV, 4))* ORO_Demand_RFO_C1(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Store_Latency zero division" + print_error("Store_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -878,18 +1065,23 @@ class False_Sharing: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled due to False Sharing. False -Sharing is a multithreading hiccup, where multiple threads contend on -different data-elements mapped into the same cache line. It can be easily -avoided by padding to make threads access different lines.""" +This metric represents how often CPU was stalled due to +False Sharing. False Sharing is a multithreading hiccup, +where multiple threads contend on different data-elements +mapped into the same cache line. 
It can be easily avoided by +padding to make threads access different lines.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_Other_CORE_0'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_False_Sharing_Client(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "False_Sharing zero division" + print_error("False_Sharing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -899,16 +1091,21 @@ class Split_Stores: domain = "CoreClocks" area = "BE/Mem" desc = """ -This metric represents rate of split store accesses. Consider aligning your -data to the 64-byte cache line granularity.""" +This metric represents rate of split store accesses. +Consider aligning your data to the 64-byte cache line +granularity.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Split_Stores zero division" + print_error("Split_Stores zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -918,20 +1115,25 @@ class DTLB_Store: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents cycles fraction spent handling first-level data TLB -store misses. As with ordinary data caching, focus on improving data locality -and reducing working-set size to reduce DTLB overhead. Additionally, consider -using profile-guided optimization (PGO) to collocate frequently-used data on -the same page. Try using larger page sizes for large amounts of frequently- -used data.""" +This metric represents cycles fraction spent handling first- +level data TLB store misses. As with ordinary data caching, +focus on improving data locality and reducing working-set +size to reduce DTLB overhead. Additionally, consider using +profile-guided optimization (PGO) to collocate frequently- +used data on the same page. Try using larger page sizes for +large amounts of frequently-used data.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Store zero division" + print_error("DTLB_Store zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -941,21 +1143,27 @@ class Core_Bound: domain = "Clocks" area = "BE/Core" desc = """ -This metric represents how much Core non-memory issues were of a bottleneck. -Shortage in hardware compute resources, or dependencies software's -instructions are both categorized under Core Bound. Hence it may indicate the -machine ran out of an OOO resources, certain execution units are overloaded or -dependencies in program's data- or instruction-flow are limiting the -performance (e.g. FP-chained long-latency arithmetic operations). Tip: -consider Port Saturation analysis as next step.""" +This metric represents how much Core non-memory issues were +of a bottleneck. Shortage in hardware compute resources, or +dependencies software's instructions are both categorized +under Core Bound. 
Hence it may indicate the machine ran out +of an OOO resources, certain execution units are overloaded +or dependencies in program's data- or instruction-flow are +limiting the performance (e.g. FP-chained long-latency +arithmetic operations). Tip: consider Port Saturation +analysis as next step.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Core_Bound zero division" + print_error("Core_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -964,15 +1172,22 @@ class Divider: name = "Divider" domain = "CoreClocks" area = "BE/Core" - desc = "" + desc = """ +Time waiting for divisions by variables. Change the dividend +to be constant or use profile feedback to let the compiler +do that.""" level = 3 htoff = False + sample = ['ARITH.FPU_DIV_ACTIVE'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Divider zero division" + print_error("Divider zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -982,21 +1197,27 @@ class Ports_Utilization: domain = "Clocks" area = "BE/Core" desc = """ -This metric represents cycles fraction application was stalled due to Core -computation issues (non divider-related). For example, heavy data-dependency -between nearby instructions will manifest in this category. Ditto if -instruction-mix used by the application overloads specific hardware execution -unit. Hint: Loop Vectorization -most compilers feature auto-Vectorization -options today- reduces pressure on the execution ports as multiple elements -are calculated with same uop.""" +This metric represents cycles fraction application was +stalled due to Core computation issues (non divider- +related). For example, heavy data-dependency between nearby +instructions will manifest in this category. Ditto if +instruction-mix used by the application overloads specific +hardware execution unit. 
Hint: Loop Vectorization -most +compilers feature auto-Vectorization options today- reduces +pressure on the execution ports as multiple elements are +calculated with same uop.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Ports_Utilization zero division" + print_error("Ports_Utilization zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1006,16 +1227,20 @@ class G0_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed no uops on any -execution port.""" +This metric represents Core cycles fraction CPU executed no +uops on any execution port.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G0_Ports_Utilized zero division" + print_error("G0_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1025,22 +1250,29 @@ class G1_Port_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction where the CPU executed total of 1 -uop per cycle on all execution ports. This can be due to heavy data-dependency -among software instructions, or over oversubscribing a particular hardware -resource. In some other cases with high 1_Port_Utilized and L1_Bound, this -metric can point to L1 data-cache latency bottleneck that may not necessarily -manifest with complete execution starvation (due to the short L1 latency e.g. -walking a linked list) - looking at the assembly can be helpful. Tip: consider -'Core Ports Saturation' analysis-type as next step.""" +This metric represents Core cycles fraction where the CPU +executed total of 1 uop per cycle on all execution ports. +This can be due to heavy data-dependency among software +instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and +L1_Bound, this metric can point to L1 data-cache latency +bottleneck that may not necessarily manifest with complete +execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be +helpful. Tip: consider 'Core Ports Saturation' analysis-type +as next step.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G1_Port_Utilized zero division" + print_error("G1_Port_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1050,19 +1282,25 @@ class G2_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed total of 2 uops per -cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- -type as next step. Loop Vectorization -most compilers feature auto- -Vectorization options today- reduces pressure on the execution ports as -multiple elements are calculated with same uop.""" +This metric represents Core cycles fraction CPU executed +total of 2 uops per cycle on all execution ports. 
Tip: +consider 'Core Port Saturation' analysis-type as next step. +Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the +execution ports as multiple elements are calculated with +same uop.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G2_Ports_Utilized zero division" + print_error("G2_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1072,17 +1310,22 @@ class G3m_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed total of 3 or more -uops per cycle on all execution ports. Tip: consider 'Core Port Saturation' -analysis-type as next step""" +This metric represents Core cycles fraction CPU executed +total of 3 or more uops per cycle on all execution ports. +Tip: consider 'Core Port Saturation' analysis-type as next +step""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G3m_Ports_Utilized zero division" + print_error("G3m_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1092,16 +1335,21 @@ class Port_0: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 0 (SNB+: ALU; HSW+:ALU and 2nd branch)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 0 (SNB+: ALU; HSW+:ALU and 2nd +branch)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_0 zero division" + print_error("Port_0 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1111,16 +1359,20 @@ class Port_1: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 1 (ALU)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 1 (ALU)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_1 zero division" + print_error("Port_1 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1130,16 +1382,20 @@ class Port_2: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 2 (Loads and Store-address)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 2 (Loads and Store-address)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_2 zero division" + print_error("Port_2 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ 
-1149,16 +1405,20 @@ class Port_3: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 3 (Loads and Store-address)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 3 (Loads and Store-address)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_3 zero division" + print_error("Port_3 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1168,16 +1428,20 @@ class Port_4: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 4 (Store-data)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 4 (Store-data)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_4 zero division" + print_error("Port_4 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1187,16 +1451,20 @@ class Port_5: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 5 (SNB+: Branches and ALU; HSW+: ALU)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 5 (SNB+: Branches and ALU; HSW+: ALU)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_5 zero division" + print_error("Port_5 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1206,25 +1474,32 @@ class Retiring: domain = "Slots" area = "RET" desc = """ -This category reflects slots utilized by useful work i.e. allocated uops that -eventually get retired. Ideally, all pipeline slots would be attributed to the -Retiring category. Retiring of 100% would indicate the maximum 4 uops retired -per cycle has been achieved. Maximizing Retiring typically increases the -Instruction-Per-Cycle metric. Note that a high Retiring value does not -necessary mean there is no room for more performance. For example, Microcode -assists are categorized under Retiring. They hurt performance and can often be -avoided. A high Retiring value for non-vectorized code may be a good hint for -programmer to consider vectorizing his code. Doing so essentially lets more -computations be done without significantly increasing number of instructions -thus improving the performance.""" +This category reflects slots utilized by useful work i.e. +allocated uops that eventually get retired. Ideally, all +pipeline slots would be attributed to the Retiring category. +Retiring of 100% would indicate the maximum 4 uops retired +per cycle has been achieved. Maximizing Retiring typically +increases the Instruction-Per-Cycle metric. Note that a high +Retiring value does not necessary mean there is no room for +more performance. For example, Microcode assists are +categorized under Retiring. They hurt performance and can +often be avoided. 
A high Retiring value for non-vectorized +code may be a good hint for programmer to consider +vectorizing his code. Doing so essentially lets more +computations be done without significantly increasing number +of instructions thus improving the performance.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: - #print "Retiring zero division" + print_error("Retiring zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1234,21 +1509,27 @@ class Base: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction where the CPU was retiring uops not -originated from the microcode-sequencer. This correlates with total number of -instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. While this is the most desirable of the top 4 categories, high -values may still indicate areas for improvement. If possible focus on -techniques that reduce instruction count or result in more efficient -instructions generation such as vectorization.""" +This metric represents slots fraction where the CPU was +retiring uops not originated from the microcode-sequencer. +This correlates with total number of instructions used by +the program. A uops-per-instruction ratio of 1 should be +expected. While this is the most desirable of the top 4 +categories, high values may still indicate areas for +improvement. If possible focus on techniques that reduce +instruction count or result in more efficient instructions +generation such as vectorization.""" level = 2 htoff = False + sample = ['INST_RETIRED.PREC_DIST'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: - #print "Base zero division" + print_error("Base zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1258,16 +1539,20 @@ class FP_Arith: domain = "Uops" area = "RET" desc = """ -This metric represents overall arithmetic floating-point (FP) uops fraction -the CPU has executed.""" +This metric represents overall arithmetic floating-point +(FP) uops fraction the CPU has executed.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.FP_x87.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "FP_Arith zero division" + print_error("FP_Arith zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1277,17 +1562,22 @@ class FP_x87: domain = "Uops" area = "RET" desc = """ -This metric is an approxmiation of floating-point (FP) x87 (arithmetic) uops -fraction. Tip: consider compiler flags to generate newer AVX (or SSE) -instruction sets, which typically perform better and feature vectors.""" +This metric is an approxmiation of floating-point (FP) x87 +(arithmetic) uops fraction. 
Tip: consider compiler flags to +generate newer AVX (or SSE) instruction sets, which +typically perform better and feature vectors.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("FP_COMP_OPS_EXE.X87", 4) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "FP_x87 zero division" + print_error("FP_x87 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1297,17 +1587,21 @@ class FP_Scalar: domain = "Uops" area = "RET" desc = """ -This metric represents arithmetic floating-point (FP) scalar uops fraction the -CPU has executed. Tip: investigate what limits (compiler) generation of vector -code.""" +This metric represents arithmetic floating-point (FP) scalar +uops fraction the CPU has executed. Tip: investigate what +limits (compiler) generation of vector code.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", 4) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "FP_Scalar zero division" + print_error("FP_Scalar zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1317,16 +1611,21 @@ class FP_Vector: domain = "Uops" area = "RET" desc = """ -This metric represents arithmetic floating-point (FP) vector uops fraction the -CPU has executed. Tip: check if vector width is expected""" +This metric represents arithmetic floating-point (FP) vector +uops fraction the CPU has executed. Tip: check if vector +width is expected""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", 4) + EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "FP_Vector zero division" + print_error("FP_Vector zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1336,17 +1635,21 @@ class Other: domain = "Uops" area = "RET" desc = """ -This metric represents non-floating-point (FP) uop fraction the CPU has -executed. If you application has no FP operations, this will likely be biggest -fraction.""" +This metric represents non-floating-point (FP) uop fraction +the CPU has executed. If you application has no FP +operations, this will likely be biggest fraction.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 1 - self.FP_Arith.compute(EV ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: - #print "Other zero division" + print_error("Other zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1356,18 +1659,24 @@ class Microcode_Sequencer: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops fetched by the -Microcode Sequencer (MS) ROM. 
The MS is used for CISC instructions not fully -decoded by the default decoders (like repeat move strings), or by microcode -assists used to address some operation modes (like in Floating Point assists).""" +This metric represents slots fraction CPU was retiring uops +fetched by the Microcode Sequencer (MS) ROM. The MS is used +for CISC instructions not fully decoded by the default +decoders (like repeat move strings), or by microcode assists +used to address some operation modes (like in Floating Point +assists).""" level = 2 htoff = False + sample = ['IDQ.MS_UOPS'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: - #print "Microcode_Sequencer zero division" + print_error("Microcode_Sequencer zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1378,12 +1687,14 @@ class Metric_IPC: Instructions Per Cycle (per logical thread)""" domain = "Metric" maxval = 5 + errcount = 0 def compute(self, EV): try: self.val = IPC(EV, 0) except ZeroDivisionError: - print "IPC zero division" + print_error("IPC zero division") + self.errcount += 1 self.val = 0 class Metric_CPI: @@ -1392,12 +1703,14 @@ class Metric_CPI: Cycles Per Instruction (threaded)""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CPI(EV, 0) except ZeroDivisionError: - print "CPI zero division" + print_error("CPI zero division") + self.errcount += 1 self.val = 0 class Metric_CoreIPC: @@ -1405,13 +1718,15 @@ class Metric_CoreIPC: desc = """ Instructions Per Cycle (per physical core)""" domain = "CoreMetric" - maxval = 5 + maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = CoreIPC(EV, 0) except ZeroDivisionError: - print "CoreIPC zero division" + print_error("CoreIPC zero division") + self.errcount += 1 self.val = 0 class Metric_UPI: @@ -1420,12 +1735,14 @@ class Metric_UPI: Uops Per Instruction""" domain = "Metric" maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = UPI(EV, 0) except ZeroDivisionError: - print "UPI zero division" + print_error("UPI zero division") + self.errcount += 1 self.val = 0 class Metric_IPTB: @@ -1434,71 +1751,82 @@ class Metric_IPTB: Instruction per taken branch""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = IPTB(EV, 0) except ZeroDivisionError: - print "IPTB zero division" + print_error("IPTB zero division") + self.errcount += 1 self.val = 0 class Metric_BPTB: name = "BPTB" desc = """ -Branch instructions per taken branch. Can be used to approximate PGO- -likelihood for non-loopy codes.""" +Branch instructions per taken branch. 
Can be used to +approximate PGO-likelihood for non-loopy codes.""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = BPTB(EV, 0) except ZeroDivisionError: - print "BPTB zero division" + print_error("BPTB zero division") + self.errcount += 1 self.val = 0 class Metric_DSB_Coverage: name = "DSB_Coverage" desc = """ -Fraction of Uops delivered by the DSB (decoded instructions cache)""" +Fraction of Uops delivered by the DSB (decoded instructions +cache)""" domain = "Metric" maxval = 1 + errcount = 0 def compute(self, EV): try: self.val = DSB_Coverage(EV, 0) except ZeroDivisionError: - print "DSB_Coverage zero division" + print_error("DSB_Coverage zero division") + self.errcount += 1 self.val = 0 class Metric_ILP: name = "ILP" desc = """ -Instruction-Level-Parallelism (average number of uops executed when there is -at least 1 uop executed)""" +Instruction-Level-Parallelism (average number of uops +executed when there is at least 1 uop executed)""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = ILP(EV, 0) except ZeroDivisionError: - print "ILP zero division" + print_error("ILP zero division") + self.errcount += 1 self.val = 0 class Metric_MLP: name = "MLP" desc = """ -Memory-Level-Parallelism (average number of L1 miss demand load when there is -at least 1 such miss)""" +Memory-Level-Parallelism (average number of L1 miss demand +load when there is at least 1 such miss)""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = MLP(EV, 0) except ZeroDivisionError: - print "MLP zero division" + print_error("MLP zero division") + self.errcount += 1 self.val = 0 class Metric_Load_Miss_Real_Latency: @@ -1507,12 +1835,14 @@ class Metric_Load_Miss_Real_Latency: Actual Average Latency for L1 data-cache miss demand loads""" domain = "Metric" maxval = 1000 + errcount = 0 def compute(self, EV): try: self.val = Load_Miss_Real_Latency(EV, 0) except ZeroDivisionError: - print "Load_Miss_Real_Latency zero division" + print_error("Load_Miss_Real_Latency zero division") + self.errcount += 1 self.val = 0 class Metric_GFLOPs: @@ -1521,41 +1851,47 @@ class Metric_GFLOPs: Giga Floating Point Operations Per Second""" domain = "Metric" maxval = 100 + errcount = 0 def compute(self, EV): try: self.val = GFLOPs(EV, 0) except ZeroDivisionError: - print "GFLOPs zero division" + print_error("GFLOPs zero division") + self.errcount += 1 self.val = 0 class Metric_Turbo_Utilization: name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" - domain = "CoreMetric" + domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = Turbo_Utilization(EV, 0) except ZeroDivisionError: - print "Turbo_Utilization zero division" + print_error("Turbo_Utilization zero division") + self.errcount += 1 self.val = 0 class Metric_Page_Walks_Use: name = "Page_Walks_Use" desc = """ -Fraction of cycles where the core's Page Walker is busy serving -iTLB/Load/Store""" +Fraction of cycles where the core's Page Walker is busy +serving iTLB/Load/Store""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = Page_Walks_Use(EV, 0) except ZeroDivisionError: - print "Page_Walks_Use zero division" + print_error("Page_Walks_Use zero division") + self.errcount += 1 self.val = 0 class Metric_MUX: @@ -1564,12 +1900,14 @@ class Metric_MUX: PerfMon Event Multiplexing accuracy indicator""" domain = "Clocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = 
MUX(EV, 0) except ZeroDivisionError: - print "MUX zero division" + print_error("MUX zero division") + self.errcount += 1 self.val = 0 class Metric_CLKS: @@ -1578,12 +1916,14 @@ class Metric_CLKS: Per-thread actual clocks""" domain = "Count" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CLKS(EV, 0) except ZeroDivisionError: - print "CLKS zero division" + print_error("CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_CORE_CLKS: @@ -1592,12 +1932,14 @@ class Metric_CORE_CLKS: Core actual clocks""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CORE_CLKS(EV, 0) except ZeroDivisionError: - print "CORE_CLKS zero division" + print_error("CORE_CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_Time: @@ -1606,12 +1948,14 @@ class Metric_Time: Run duration time in seconds""" domain = "Count" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = Time(EV, 0) except ZeroDivisionError: - print "Time zero division" + print_error("Time zero division") + self.errcount += 1 self.val = 0 # Schedule @@ -1762,126 +2106,17 @@ def __init__(self, r): # siblings cross-tree - o["Frontend_Bound"].sibling = None - o["Frontend_Latency"].sibling = None - o["ICache_Misses"].sibling = None - o["ITLB_Misses"].sibling = None o["Branch_Resteers"].sibling = o["Bad_Speculation"] - o["DSB_Switches"].sibling = None - o["LCP"].sibling = None o["MS_Switches"].sibling = o["Microcode_Sequencer"] - o["Frontend_Bandwidth"].sibling = None - o["MITE"].sibling = None - o["DSB"].sibling = None - o["LSD"].sibling = None o["Bad_Speculation"].sibling = o["Branch_Resteers"] - o["Branch_Mispredicts"].sibling = None - o["Machine_Clears"].sibling = None - o["Backend_Bound"].sibling = None - o["Memory_Bound"].sibling = None o["L1_Bound"].sibling = o["G1_Port_Utilized"] - o["DTLB_Load"].sibling = None - o["Store_Fwd_Blk"].sibling = None o["Lock_Latency"].sibling = o["Store_Latency"] - o["Split_Loads"].sibling = None - o["G4K_Aliasing"].sibling = None - o["L2_Bound"].sibling = None - o["L3_Bound"].sibling = None - o["Contested_Accesses"].sibling = None - o["Data_Sharing"].sibling = None - o["L3_Latency"].sibling = None - o["SQ_Full"].sibling = None - o["MEM_Bound"].sibling = None - o["MEM_Bandwidth"].sibling = None - o["MEM_Latency"].sibling = None - o["Stores_Bound"].sibling = None o["Store_Latency"].sibling = o["Lock_Latency"] - o["False_Sharing"].sibling = None o["Split_Stores"].sibling = o["Port_4"] - o["DTLB_Store"].sibling = None - o["Core_Bound"].sibling = None - o["Divider"].sibling = None - o["Ports_Utilization"].sibling = None - o["G0_Ports_Utilized"].sibling = None o["G1_Port_Utilized"].sibling = o["L1_Bound"] - o["G2_Ports_Utilized"].sibling = None - o["G3m_Ports_Utilized"].sibling = None - o["Port_0"].sibling = None - o["Port_1"].sibling = None - o["Port_2"].sibling = None - o["Port_3"].sibling = None o["Port_4"].sibling = o["Split_Stores"] - o["Port_5"].sibling = None - o["Retiring"].sibling = None - o["Base"].sibling = None - o["FP_Arith"].sibling = None - o["FP_x87"].sibling = None - o["FP_Scalar"].sibling = None - o["FP_Vector"].sibling = None - o["Other"].sibling = None o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events - - o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] - o["ICache_Misses"].sample = [] - o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] - o["Branch_Resteers"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] - o["DSB_Switches"].sample = [] - 
o["LCP"].sample = [] - o["MS_Switches"].sample = [] - o["Frontend_Bandwidth"].sample = [] - o["MITE"].sample = [] - o["DSB"].sample = [] - o["LSD"].sample = [] - o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] - o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] - o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] - o["Backend_Bound"].sample = [] - o["Memory_Bound"].sample = [] - o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] - o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] - o["Store_Fwd_Blk"].sample = [] - o["Lock_Latency"].sample = ['MEM_UOPS_RETIRED.LOCK_LOADS:pp'] - o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] - o["G4K_Aliasing"].sample = [] - o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] - o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] - o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] - o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp'] - o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] - o["SQ_Full"].sample = [] - o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_MISS:pp'] - o["MEM_Bandwidth"].sample = [] - o["MEM_Latency"].sample = [] - o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] - o["Store_Latency"].sample = [] - o["False_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_Other_CORE_0'] - o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] - o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] - o["Core_Bound"].sample = [] - o["Divider"].sample = ['ARITH.FPU_DIV_ACTIVE'] - o["Ports_Utilization"].sample = [] - o["G0_Ports_Utilized"].sample = [] - o["G1_Port_Utilized"].sample = [] - o["G2_Ports_Utilized"].sample = [] - o["G3m_Ports_Utilized"].sample = [] - o["Port_0"].sample = [] - o["Port_1"].sample = [] - o["Port_2"].sample = [] - o["Port_3"].sample = [] - o["Port_4"].sample = [] - o["Port_5"].sample = [] - o["Retiring"].sample = [] - o["Base"].sample = ['INST_RETIRED.PREC_DIST'] - o["FP_Arith"].sample = [] - o["FP_x87"].sample = [] - o["FP_Scalar"].sample = [] - o["FP_Vector"].sample = [] - o["Other"].sample = [] - o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] - # user visible metrics n = Metric_IPC() ; r.metric(n) diff --git a/ivb_server_ratios.py b/ivb_server_ratios.py index 348d1eec..b428e5fb 100644 --- a/ivb_server_ratios.py +++ b/ivb_server_ratios.py @@ -8,6 +8,10 @@ # https://sites.google.com/site/analysismethods/yasin-pubs # +# Helpers + +print_error = lambda msg: False + smt_enabled = False # Constants @@ -189,22 +193,28 @@ class Frontend_Bound: domain = "Slots" area = "FE" desc = """ -This category reflects slots where the Frontend of the processor undersupplies -its Backend. Frontend denotes the first portion of pipeline responsible to -fetch micro-ops which the Backend can execute. Within the Frontend, a branch -predictor predicts the next address to fetch, cache-lines are fetched from -memory, parsed into instructions, and lastly decoded into micro-ops. The -purpose of the Frontend cluster is to deliver uops to Backend whenever the -latter can accept them. For example, stalls due to instruction-cache misses -would be categorized under Frontend Bound.""" +This category reflects slots where the Frontend of the +processor undersupplies its Backend. 
Frontend denotes the +first portion of pipeline responsible to fetch micro-ops +which the Backend can execute. Within the Frontend, a branch +predictor predicts the next address to fetch, cache-lines +are fetched from memory, parsed into instructions, and +lastly decoded into micro-ops. The purpose of the Frontend +cluster is to deliver uops to Backend whenever the latter +can accept them. For example, stalls due to instruction- +cache misses would be categorized under Frontend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Frontend_Bound zero division" + print_error("Frontend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -214,18 +224,24 @@ class Frontend_Latency: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend latency -issues. For example, instruction-cache misses, iTLB misses or fetch stalls -after a branch misprediction are categorized under Frontend Latency. In such -cases the Frontend eventually delivers no uops for some period.""" +This metric represents slots fraction CPU was stalled due to +Frontend latency issues. For example, instruction-cache +misses, iTLB misses or fetch stalls after a branch +misprediction are categorized under Frontend Latency. In +such cases the Frontend eventually delivers no uops for some +period.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Latency zero division" + print_error("Frontend_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -235,17 +251,22 @@ class ICache_Misses: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to instruction -cache misses. Using compiler's Profile-Guided Optimization (PGO) can reduce -i-cache misses through improved hot code layout.""" +This metric represents cycles fraction CPU was stalled due +to instruction cache misses. Using compiler's Profile-Guided +Optimization (PGO) can reduce i-cache misses through +improved hot code layout.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("ICACHE.IFETCH_STALL", 3) / CLKS(EV, 3) - self.ITLB_Misses.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ICache_Misses zero division" + print_error("ICache_Misses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -255,16 +276,21 @@ class ITLB_Misses: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to instruction TLB -misses. Using large code pages may be considered here.""" +This metric represents cycles fraction CPU was stalled due +to instruction TLB misses. 
Using large code pages may be +considered here.""" level = 3 htoff = False + sample = ['ITLB_MISSES.WALK_COMPLETED'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ITLB_Misses zero division" + print_error("ITLB_Misses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -274,19 +300,25 @@ class Branch_Resteers: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to Branch Resteers. -Following all sorts of miss-predicted branches, this measure the delays of -fetch instructions from corrected path caused by the Frontend of the machine. -For example, branchy code with lots of (taken) branches and/or branch miss- -predictions might get categorized under Branch Resteers.""" +This metric represents cycles fraction CPU was stalled due +to Branch Resteers. Following all sorts of miss-predicted +branches, this measure the delays of fetch instructions from +corrected path caused by the Frontend of the machine. For +example, branchy code with lots of (taken) branches and/or +branch miss-predictions might get categorized under Branch +Resteers.""" level = 3 htoff = False + sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Avg_RS_Empty_Period_Clears(EV, 3)*(EV("BR_MISP_RETIRED.ALL_BRANCHES", 3) + EV("MACHINE_CLEARS.COUNT", 3) + EV("BACLEARS.ANY", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Branch_Resteers zero division" + print_error("Branch_Resteers zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -296,16 +328,21 @@ class DSB_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches from -DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" +This metric represents cycles fraction CPU was stalled due +to switches from DSB to MITE pipelines. Optimizing for +better DSB hit rate may be considered.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DSB_Switches zero division" + print_error("DSB_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -315,17 +352,22 @@ class LCP: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to Length Changing -Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will -certainly avoid this.""" +This metric represents cycles fraction CPU was stalled due +to Length Changing Prefixes (LCPs). 
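# Illustrative sketch only (not part of this patch): the
# "and self.parent.thresh" pattern used by the level-2/3 nodes above.
# A child is only flagged when its own value crosses the cut *and* the
# parent category was already flagged, so the tree is pruned top-down.
# The class and instances below are hypothetical stand-ins.
class _Node(object):
    def __init__(self, val, cut, parent=None):
        self.val = val
        self.parent = parent
        parent_ok = parent.thresh if parent is not None else True
        self.thresh = (val > cut) and parent_ok

frontend = _Node(0.35, 0.2)                      # level 1: flagged on its own
lcp_like = _Node(0.08, 0.05, parent=frontend)    # child: own cut crossed and parent flagged
assert frontend.thresh and lcp_like.thresh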
Using proper compiler +flags or Intel Compiler by default will certainly avoid +this.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "LCP zero division" + print_error("LCP zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -335,19 +377,25 @@ class MS_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches of uop -delivery to the Microcode Sequencer (MS). Commonly used instructions are -optimized for delivery by the DSB or MITE pipelines. The MS is designated to -deliver long uop flows required by CISC instructions like CPUID, or uncommon -conditions like Floating Point Assists when dealing with Denormals.""" +This metric represents cycles fraction CPU was stalled due +to switches of uop delivery to the Microcode Sequencer (MS). +Commonly used instructions are optimized for delivery by the +DSB or MITE pipelines. The MS is designated to deliver long +uop flows required by CISC instructions like CPUID, or +uncommon conditions like Floating Point Assists when dealing +with Denormals.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MS_Switches zero division" + print_error("MS_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -357,19 +405,24 @@ class Frontend_Bandwidth: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend -bandwidth issues. For example, inefficiencies at the instruction decoders, or -code restrictions for caching in the DSB (decoded uops cache) are categorized -under Frontend Bandwidth. In such cases, the Frontend typically delivers non- -optimal amount of uops to the Backend.""" +This metric represents slots fraction CPU was stalled due to +Frontend bandwidth issues. For example, inefficiencies at +the instruction decoders, or code restrictions for caching +in the DSB (decoded uops cache) are categorized under +Frontend Bandwidth. In such cases, the Frontend typically +delivers non-optimal amount of uops to the Backend.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Bandwidth zero division" + print_error("Frontend_Bandwidth zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -379,17 +432,22 @@ class MITE: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to the MITE fetch pipeline. For example, inefficiencies in the -instruction decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to the MITE fetch pipeline. 
For example, +inefficiencies in the instruction decoders are categorized +here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MITE zero division" + print_error("MITE zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -399,18 +457,23 @@ class DSB: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to DSB (decoded uop cache) fetch pipeline. For example, inefficient -utilization of the DSB cache structure or bank conflict when reading from it, -are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to DSB (decoded uop cache) fetch +pipeline. For example, inefficient utilization of the DSB +cache structure or bank conflict when reading from it, are +categorized here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: - #print "DSB zero division" + print_error("DSB zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -420,19 +483,24 @@ class LSD: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining -Uop supply. However, in some rare cases, optimal uop-delivery could not be -reached for small loops whose size (in terms of number of uops) does not suit -well the LSD structure.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to LSD (Loop Stream Detector) unit. LSD +typically does well sustaining Uop supply. However, in some +rare cases, optimal uop-delivery could not be reached for +small loops whose size (in terms of number of uops) does not +suit well the LSD structure.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "LSD zero division" + print_error("LSD zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -442,19 +510,24 @@ class Bad_Speculation: domain = "Slots" area = "BAD" desc = """ -This category reflects slots wasted due to incorrect speculations, which -include slots used to allocate uops that do not eventually get retired and -slots for which allocation was blocked due to recovery from earlier incorrect -speculation. For example, wasted work due to miss-predicted branches are -categorized under Bad Speculation category""" +This category reflects slots wasted due to incorrect +speculations, which include slots used to allocate uops that +do not eventually get retired and slots for which allocation +was blocked due to recovery from earlier incorrect +speculation. 
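# Illustrative sketch only (not part of this patch): the Bad_Speculation
# slot accounting above, written out with made-up counts. Recovery
# cycles are assumed to come from INT_MISC.RECOVERY_CYCLES (the sampling
# event listed for that node); SLOTS is again taken as 4 * core clocks.
Pipeline_Width = 4
core_clks = 1000000.0
uops_issued_any = 3200000.0
uops_retired_retire_slots = 3000000.0
recovery_cycles = 20000.0            # hypothetical INT_MISC.RECOVERY_CYCLES

slots = Pipeline_Width * core_clks
bad_speculation = (uops_issued_any - uops_retired_retire_slots
                   + Pipeline_Width * recovery_cycles) / slots
# issued-but-never-retired uops plus slots lost while recovering from clears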
For example, wasted work due to miss-predicted +branches are categorized under Bad Speculation category""" level = 1 htoff = False + sample = ['INT_MISC.RECOVERY_CYCLES'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: - #print "Bad_Speculation zero division" + print_error("Bad_Speculation zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -464,18 +537,23 @@ class Branch_Mispredicts: domain = "Slots" area = "BAD" desc = """ -This metric represents slots fraction CPU was impacted by Branch -Misprediction. These slots are either wasted by uops fetched from an -incorrectly speculated program path, or stalls the Backend of the machine -needs to recover its state from a speculative path.""" +This metric represents slots fraction CPU was impacted by +Branch Misprediction. These slots are either wasted by uops +fetched from an incorrectly speculated program path, or +stalls the Backend of the machine needs to recover its state +from a speculative path.""" level = 2 htoff = False + sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mispred_Clears_Fraction(EV, 2)* self.Bad_Speculation.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Branch_Mispredicts zero division" + print_error("Branch_Mispredicts zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -485,19 +563,24 @@ class Machine_Clears: domain = "Slots" area = "BAD" desc = """ -This metric represents slots fraction CPU was impacted by Machine Clears. -These slots are either wasted by uops fetched prior to the clear, or stalls -the Backend of the machine needs to recover its state after the clear. For -example, this can happen due to memory ordering Nukes (e.g. Memory -Disambiguation) or Self-Modifying-Code (SMC) nukes.""" +This metric represents slots fraction CPU was impacted by +Machine Clears. These slots are either wasted by uops +fetched prior to the clear, or stalls the Backend of the +machine needs to recover its state after the clear. For +example, this can happen due to memory ordering Nukes (e.g. +Memory Disambiguation) or Self-Modifying-Code (SMC) nukes.""" level = 2 htoff = False + sample = ['MACHINE_CLEARS.COUNT'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Bad_Speculation.compute(EV) - self.Branch_Mispredicts.compute(EV ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "Machine_Clears zero division" + print_error("Machine_Clears zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -507,21 +590,27 @@ class Backend_Bound: domain = "Slots" area = "BE" desc = """ -This category reflects slots where no uops are being delivered due to a lack -of required resources for accepting more uops in the Backend of the pipeline. -Backend describes the portion of the pipeline where the out-of-order scheduler -dispatches ready uops into their respective execution units, and once -completed these uops get retired according to program order. 
For example, -stalls due to data-cache misses or stalls due to the divider unit being -overloaded are both categorized under Backend Bound.""" +This category reflects slots where no uops are being +delivered due to a lack of required resources for accepting +more uops in the Backend of the pipeline. Backend describes +the portion of the pipeline where the out-of-order scheduler +dispatches ready uops into their respective execution units, +and once completed these uops get retired according to +program order. For example, stalls due to data-cache misses +or stalls due to the divider unit being overloaded are both +categorized under Backend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Backend_Bound zero division" + print_error("Backend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -531,19 +620,25 @@ class Memory_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how much Memory subsystem was a bottleneck. Memory -Bound measures cycle fraction where pipeline is likely stalled due to demand -load or store instructions. This accounts mainly for non-completed in-flight -memory demand loads which coincides with execution starvation. in addition to -less common cases where stores could imply backpressure on the pipeline.""" +This metric represents how much Memory subsystem was a +bottleneck. Memory Bound measures cycle fraction where +pipeline is likely stalled due to demand load or store +instructions. This accounts mainly for non-completed in- +flight memory demand loads which coincides with execution +starvation. in addition to less common cases where stores +could imply backpressure on the pipeline.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (STALLS_MEM_ANY(EV, 2) + EV("RESOURCE_STALLS.SB", 2)) / CLKS(EV, 2 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Memory_Bound zero division" + print_error("Memory_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -553,20 +648,26 @@ class L1_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled without missing the L1 data -cache. The L1 cache typically has the shortest latency. However, in certain -cases like loads blocked on older stores, a load might suffer a high latency -even though it is being satisfied by the L1. There are no fill-buffers -allocated for L1 hits so instead we use the load matrix (LDM) stalls sub-event -as it accounts for any non-completed load.""" +This metric represents how often CPU was stalled without +missing the L1 data cache. The L1 cache typically has the +shortest latency. However, in certain cases like loads +blocked on older stores, a load might suffer a high latency +even though it is being satisfied by the L1. 
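# Illustrative sketch only (not part of this patch): how the L1/L2/L3/DRAM
# breakdown above slices the CYCLE_ACTIVITY stall counters. STALLS_MEM_ANY
# is assumed to be the "stalled with a demand load in flight" counter used
# by these nodes; the hit fraction and counts are invented.
clks = 1000000.0
stalls_mem_any = 400000.0            # any demand load outstanding
stalls_l1d_pending = 300000.0        # load missed L1 and still pending
stalls_l2_pending = 180000.0         # load missed L2 and still pending
l3_hit_fraction = 0.7                # share of L2 misses served by L3

l1_bound = (stalls_mem_any - stalls_l1d_pending) / clks
l2_bound = (stalls_l1d_pending - stalls_l2_pending) / clks
l3_bound = l3_hit_fraction * stalls_l2_pending / clks
mem_bound = (1.0 - l3_hit_fraction) * stalls_l2_pending / clks
# each level keeps only the stall time the next level down cannot explain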
There are no +fill-buffers allocated for L1 hits so instead we use the +load matrix (LDM) stalls sub-event as it accounts for any +non-completed load.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (STALLS_MEM_ANY(EV, 3) - EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = ((self.val > 0.07) and self.parent.thresh) | self.DTLB_Load.thresh except ZeroDivisionError: - #print "L1_Bound zero division" + print_error("L1_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -575,15 +676,21 @@ class DTLB_Load: name = "DTLB_Load" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Loads were waiting for page table walks. Consider making the +working set more compact or using large pages.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_LOAD_MISSES.STLB_HIT", 4) + EV("DTLB_LOAD_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Load zero division" + print_error("DTLB_Load zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -592,15 +699,24 @@ class Store_Fwd_Blk: name = "Store_Fwd_Blk" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Stores were blocked on store-forwarding between depending +operations. This typically occurs when an output of a +computation is accessed with a different sized data type. +Review the rules for store forwarding in the optimization +guide.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_SFB_Cost * EV("LD_BLOCKS.STORE_FORWARD", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Store_Fwd_Blk zero division" + print_error("Store_Fwd_Blk zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -610,17 +726,22 @@ class Lock_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents cycles fraction the CPU spent handling cache misses due -to lock operations. Due to the microarchitecture handling of locks, they are -classified as L1_Bound regardless of what memory source satsified them.""" +This metric represents cycles fraction the CPU spent +handling cache misses due to lock operations. Due to the +microarchitecture handling of locks, they are classified as +L1_Bound regardless of what memory source satsified them.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.LOCK_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_Lock_St_Fraction(EV, 4)* ORO_Demand_RFO_C1(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Lock_Latency zero division" + print_error("Lock_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -629,15 +750,21 @@ class Split_Loads: name = "Split_Loads" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Loads were crossing 64 byte cache lines. 
Consider naturally +aligning data.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 13 * EV("LD_BLOCKS.NO_SR", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Split_Loads zero division" + print_error("Split_Loads zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -646,15 +773,22 @@ class G4K_Aliasing: name = "4K_Aliasing" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +Memory accesses were aliased by nearby others with a 4K +offset. Reorganize the data to avoid this. See the +optimization manual for more details.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_4K_Alias_Cost * EV("LD_BLOCKS_PARTIAL.ADDRESS_ALIAS", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "G4K_Aliasing zero division" + print_error("G4K_Aliasing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -664,17 +798,21 @@ class L2_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on L2 cache. Avoiding cache -misses (i.e. L1 misses/L2 hits) will improve the latency and increase -performance.""" +This metric represents how often CPU was stalled on L2 +cache. Avoiding cache misses (i.e. L1 misses/L2 hits) will +improve the latency and increase performance.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("CYCLE_ACTIVITY.STALLS_L1D_PENDING", 3) - EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3)) / CLKS(EV, 3 ) self.thresh = (self.val > 0.03) and self.parent.thresh except ZeroDivisionError: - #print "L2_Bound zero division" + print_error("L2_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -684,17 +822,22 @@ class L3_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on L3 cache or contended with -a sibling Core. Avoiding cache misses (i.e. L2 misses/L3 hits) will improve -the latency and increase performance.""" +This metric represents how often CPU was stalled on L3 cache +or contended with a sibling Core. Avoiding cache misses +(i.e. L2 misses/L3 hits) will improve the latency and +increase performance.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_L3_Hit_Fraction(EV, 3)* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3_Bound zero division" + print_error("L3_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -703,15 +846,21 @@ class Contested_Accesses: name = "Contested_Accesses" domain = "Clocks" area = "BE/Mem" - desc = "" + desc = """ +64 byte cache lines were bouncing between cores. 
Avoid false +sharing, unnecessary writes, and localize data.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_XSNP_HitM_Cost *(EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM", 4) + EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS", 4)) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Contested_Accesses zero division" + print_error("Contested_Accesses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -723,12 +872,16 @@ class Data_Sharing: desc = "" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MEM_XSNP_Hit_Cost * EV("MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT", 4) / CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "Data_Sharing zero division" + print_error("Data_Sharing zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -738,18 +891,23 @@ class L3_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric is a rough aggregate estimate of cycles fraction where CPU -accessed L3 cache for all load requests, while there was no contention/sharing -with a sibling core. Avoiding cache misses (i.e. L2 misses/L3 hits) will -improve the latency and increase performance.""" +This metric is a rough aggregate estimate of cycles fraction +where CPU accessed L3 cache for all load requests, while +there was no contention/sharing with a sibling core. +Avoiding cache misses (i.e. L2 misses/L3 hits) will improve +the latency and increase performance.""" level = 4 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MEM_XSNP_None_Cost * EV("MEM_LOAD_UOPS_RETIRED.LLC_HIT", 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "L3_Latency zero division" + print_error("L3_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -759,17 +917,22 @@ class SQ_Full: domain = "CoreClocks" area = "BE/Mem" desc = """ -This metric measures fraction of cycles where the Super Queue (SQ) was full -taking into account all request-types and both hardware SMT threads. The Super -Queue is used for requests to access the L2 cache or to go out to the Uncore.""" +This metric measures fraction of cycles where the Super +Queue (SQ) was full taking into account all request-types +and both hardware SMT threads. The Super Queue is used for +requests to access the L2 cache or to go out to the Uncore.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = SQ_Full_Cycles(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = self.val > 0.0 and self.parent.thresh except ZeroDivisionError: - #print "SQ_Full zero division" + print_error("SQ_Full zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -779,16 +942,21 @@ class MEM_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled on main memory (DRAM). -Caching will improve the latency and increase performance.""" +This metric represents how often CPU was stalled on main +memory (DRAM). 
Caching will improve the latency and +increase performance.""" level = 3 htoff = False + sample = ['MEM_LOAD_UOPS_RETIRED.LLC_MISS:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (1 - Mem_L3_Hit_Fraction(EV, 3))* EV("CYCLE_ACTIVITY.STALLS_L2_PENDING", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Bound zero division" + print_error("MEM_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -798,17 +966,21 @@ class MEM_Bandwidth: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to approaching -bandwidth limits of main memory (DRAM). NUMA in multi-socket system may be -considered in such case.""" +This metric represents how often CPU was likely stalled due +to approaching bandwidth limits of main memory (DRAM). NUMA +in multi-socket system may be considered in such case.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = ORO_Demand_DRD_C6(EV, 4) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Bandwidth zero division" + print_error("MEM_Bandwidth zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -818,17 +990,22 @@ class MEM_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to latency from -main memory (DRAM). Data layout re-structuring or using Software Prefetches -(also through the compiler) may be considered in such case.""" +This metric represents how often CPU was likely stalled due +to latency from main memory (DRAM). Data layout re- +structuring or using Software Prefetches (also through the +compiler) may be considered in such case.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (ORO_Demand_DRD_C1(EV, 4) - ORO_Demand_DRD_C6(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MEM_Latency zero division" + print_error("MEM_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -838,16 +1015,21 @@ class Local_DRAM: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to loads from -local memory. Caching will improve the latency and increase performance.""" +This metric represents how often CPU was likely stalled due +to loads from local memory. Caching will improve the latency +and increase performance.""" level = 5 htoff = False + sample = ['MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_Local_DRAM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM", 5) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Local_DRAM zero division" + print_error("Local_DRAM zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -857,16 +1039,21 @@ class Remote_DRAM: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to loads from -remote memory. This is caused often due to non-optimal NUMA allocations.""" +This metric represents how often CPU was likely stalled due +to loads from remote memory. 
This is caused often due to +non-optimal NUMA allocations.""" level = 5 htoff = False + sample = ['MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Mem_Remote_DRAM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM", 5) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Remote_DRAM zero division" + print_error("Remote_DRAM zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -876,17 +1063,21 @@ class Remote_Cache: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was likely stalled due to loads from -remote cache in other sockets. This is caused often due to non-optimal NUMA -allocations.""" +This metric represents how often CPU was likely stalled due +to loads from remote cache in other sockets. This is caused +often due to non-optimal NUMA allocations.""" level = 5 htoff = False + sample = ['MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM:pp', 'MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_Remote_HitM_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM", 5) + Mem_Remote_Fwd_Cost * EV("MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD", 5)) / CLKS(EV, 5 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Remote_Cache zero division" + print_error("Remote_Cache zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -896,18 +1087,23 @@ class Stores_Bound: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents how often CPU was stalled due to store operations. -even though memory store accesses do not typically stall out-of-order CPUs; -there are few cases where stores can lead to actual stalls. This metric will -be flagged should any of these cases be a bottleneck.""" +This metric represents how often CPU was stalled due to +store operations. even though memory store accesses do not +typically stall out-of-order CPUs; there are few cases where +stores can lead to actual stalls. 
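# Illustrative sketch only (not part of this patch): the NUMA split above
# weights local vs. remote DRAM load counts by per-source latency
# constants (Mem_Local_DRAM_Cost and friends in this file). The cost and
# count values below are invented; they only show how a modest remote-miss
# count can dominate once the higher remote latency is applied.
clks = 1000000.0
local_dram_cost, remote_dram_cost = 200.0, 310.0   # hypothetical cycle costs
local_dram_loads = 800.0
remote_dram_loads = 600.0

local_dram_frac = local_dram_cost * local_dram_loads / clks
remote_dram_frac = remote_dram_cost * remote_dram_loads / clks
numa_penalty_dominates = remote_dram_frac > local_dram_frac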
This metric will be +flagged should any of these cases be a bottleneck.""" level = 3 htoff = False + sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Memory_Bound.compute(EV) - STALLS_MEM_ANY(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Stores_Bound zero division" + print_error("Stores_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -917,16 +1113,21 @@ class Store_Latency: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents cycles fraction the CPU spent handling long-latency -store misses (missing 2nd level cache).""" +This metric represents cycles fraction the CPU spent +handling long-latency store misses (missing 2nd level +cache).""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Store_L2_Hit_Cycles(EV, 4) +(1 - Mem_Lock_St_Fraction(EV, 4))* ORO_Demand_RFO_C1(EV, 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Store_Latency zero division" + print_error("Store_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -936,16 +1137,21 @@ class Split_Stores: domain = "CoreClocks" area = "BE/Mem" desc = """ -This metric represents rate of split store accesses. Consider aligning your -data to the 64-byte cache line granularity.""" +This metric represents rate of split store accesses. +Consider aligning your data to the 64-byte cache line +granularity.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("MEM_UOPS_RETIRED.SPLIT_STORES", 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "Split_Stores zero division" + print_error("Split_Stores zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -955,20 +1161,25 @@ class DTLB_Store: domain = "Clocks" area = "BE/Mem" desc = """ -This metric represents cycles fraction spent handling first-level data TLB -store misses. As with ordinary data caching, focus on improving data locality -and reducing working-set size to reduce DTLB overhead. Additionally, consider -using profile-guided optimization (PGO) to collocate frequently-used data on -the same page. Try using larger page sizes for large amounts of frequently- -used data.""" +This metric represents cycles fraction spent handling first- +level data TLB store misses. As with ordinary data caching, +focus on improving data locality and reducing working-set +size to reduce DTLB overhead. Additionally, consider using +profile-guided optimization (PGO) to collocate frequently- +used data on the same page. 
Try using larger page sizes for +large amounts of frequently-used data.""" level = 4 htoff = False + sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (Mem_STLB_Hit_Cost * EV("DTLB_STORE_MISSES.STLB_HIT", 4) + EV("DTLB_STORE_MISSES.WALK_DURATION", 4)) / CLKS(EV, 4 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DTLB_Store zero division" + print_error("DTLB_Store zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -978,21 +1189,27 @@ class Core_Bound: domain = "Clocks" area = "BE/Core" desc = """ -This metric represents how much Core non-memory issues were of a bottleneck. -Shortage in hardware compute resources, or dependencies software's -instructions are both categorized under Core Bound. Hence it may indicate the -machine ran out of an OOO resources, certain execution units are overloaded or -dependencies in program's data- or instruction-flow are limiting the -performance (e.g. FP-chained long-latency arithmetic operations). Tip: -consider Port Saturation analysis as next step.""" +This metric represents how much Core non-memory issues were +of a bottleneck. Shortage in hardware compute resources, or +dependencies software's instructions are both categorized +under Core Bound. Hence it may indicate the machine ran out +of an OOO resources, certain execution units are overloaded +or dependencies in program's data- or instruction-flow are +limiting the performance (e.g. FP-chained long-latency +arithmetic operations). Tip: consider Port Saturation +analysis as next step.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Backend_Bound_At_EXE(EV, 2) - self.Memory_Bound.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Core_Bound zero division" + print_error("Core_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1001,15 +1218,22 @@ class Divider: name = "Divider" domain = "CoreClocks" area = "BE/Core" - desc = "" + desc = """ +Time waiting for divisions by variables. Change the dividend +to be constant or use profile feedback to let the compiler +do that.""" level = 3 htoff = False + sample = ['ARITH.FPU_DIV_ACTIVE'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("ARITH.FPU_DIV_ACTIVE", 3) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Divider zero division" + print_error("Divider zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1019,21 +1243,27 @@ class Ports_Utilization: domain = "Clocks" area = "BE/Core" desc = """ -This metric represents cycles fraction application was stalled due to Core -computation issues (non divider-related). For example, heavy data-dependency -between nearby instructions will manifest in this category. Ditto if -instruction-mix used by the application overloads specific hardware execution -unit. Hint: Loop Vectorization -most compilers feature auto-Vectorization -options today- reduces pressure on the execution ports as multiple elements -are calculated with same uop.""" +This metric represents cycles fraction application was +stalled due to Core computation issues (non divider- +related). For example, heavy data-dependency between nearby +instructions will manifest in this category. 
Ditto if +instruction-mix used by the application overloads specific +hardware execution unit. Hint: Loop Vectorization -most +compilers feature auto-Vectorization options today- reduces +pressure on the execution ports as multiple elements are +calculated with same uop.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Core_Bound.compute(EV) - self.Divider.compute(EV ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "Ports_Utilization zero division" + print_error("Ports_Utilization zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1043,16 +1273,20 @@ class G0_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed no uops on any -execution port.""" +This metric represents Core cycles fraction CPU executed no +uops on any execution port.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_0_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G0_Ports_Utilized zero division" + print_error("G0_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1062,22 +1296,29 @@ class G1_Port_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction where the CPU executed total of 1 -uop per cycle on all execution ports. This can be due to heavy data-dependency -among software instructions, or over oversubscribing a particular hardware -resource. In some other cases with high 1_Port_Utilized and L1_Bound, this -metric can point to L1 data-cache latency bottleneck that may not necessarily -manifest with complete execution starvation (due to the short L1 latency e.g. -walking a linked list) - looking at the assembly can be helpful. Tip: consider -'Core Ports Saturation' analysis-type as next step.""" +This metric represents Core cycles fraction where the CPU +executed total of 1 uop per cycle on all execution ports. +This can be due to heavy data-dependency among software +instructions, or over oversubscribing a particular hardware +resource. In some other cases with high 1_Port_Utilized and +L1_Bound, this metric can point to L1 data-cache latency +bottleneck that may not necessarily manifest with complete +execution starvation (due to the short L1 latency e.g. +walking a linked list) - looking at the assembly can be +helpful. Tip: consider 'Core Ports Saturation' analysis-type +as next step.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_1_Port_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G1_Port_Utilized zero division" + print_error("G1_Port_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1087,19 +1328,25 @@ class G2_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed total of 2 uops per -cycle on all execution ports. Tip: consider 'Core Port Saturation' analysis- -type as next step. 
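# Illustrative sketch only (not part of this patch): how the Core-Bound
# side above is peeled apart. Backend_Bound_At_EXE and the per-port-count
# helpers are taken as given (they are defined elsewhere in this file);
# the fractions below are made up and only show the subtraction chain.
backend_bound_at_exe = 0.45     # fraction of cycles stalled at execute
memory_bound = 0.25             # portion explained by the memory nodes
divider = 0.02                  # ARITH.FPU_DIV_ACTIVE / core clocks

core_bound = backend_bound_at_exe - memory_bound
ports_utilization = core_bound - divider
# what remains is then bucketed by how many execution ports were busy per
# cycle (the G0/G1/G2/G3m nodes): 0, 1, 2, or 3+ uops dispatched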
Loop Vectorization -most compilers feature auto- -Vectorization options today- reduces pressure on the execution ports as -multiple elements are calculated with same uop.""" +This metric represents Core cycles fraction CPU executed +total of 2 uops per cycle on all execution ports. Tip: +consider 'Core Port Saturation' analysis-type as next step. +Loop Vectorization -most compilers feature auto- +Vectorization options today- reduces pressure on the +execution ports as multiple elements are calculated with +same uop.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_2_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G2_Ports_Utilized zero division" + print_error("G2_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1109,17 +1356,22 @@ class G3m_Ports_Utilized: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU executed total of 3 or more -uops per cycle on all execution ports. Tip: consider 'Core Port Saturation' -analysis-type as next step""" +This metric represents Core cycles fraction CPU executed +total of 3 or more uops per cycle on all execution ports. +Tip: consider 'Core Port Saturation' analysis-type as next +step""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Cycles_3m_Ports_Utilized(EV, 4) / CORE_CLKS(EV, 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "G3m_Ports_Utilized zero division" + print_error("G3m_Ports_Utilized zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1129,16 +1381,21 @@ class Port_0: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 0 (SNB+: ALU; HSW+:ALU and 2nd branch)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 0 (SNB+: ALU; HSW+:ALU and 2nd +branch)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_0", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_0 zero division" + print_error("Port_0 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1148,16 +1405,20 @@ class Port_1: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 1 (ALU)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 1 (ALU)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_1", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_1 zero division" + print_error("Port_1 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1167,16 +1428,20 @@ class Port_2: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 2 (Loads and Store-address)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 2 (Loads and Store-address)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, 
EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_2", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_2 zero division" + print_error("Port_2 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1186,16 +1451,20 @@ class Port_3: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 3 (Loads and Store-address)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 3 (Loads and Store-address)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_3", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_3 zero division" + print_error("Port_3 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1205,16 +1474,20 @@ class Port_4: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 4 (Store-data)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 4 (Store-data)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_4", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_4 zero division" + print_error("Port_4 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1224,16 +1497,20 @@ class Port_5: domain = "CoreClocks" area = "BE/Core" desc = """ -This metric represents Core cycles fraction CPU dispatched uops on execution -port 5 (SNB+: Branches and ALU; HSW+: ALU)""" +This metric represents Core cycles fraction CPU dispatched +uops on execution port 5 (SNB+: Branches and ALU; HSW+: ALU)""" level = 5 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_DISPATCHED_PORT.PORT_5", 5) / CORE_CLKS(EV, 5 ) self.thresh = (self.val > 0.5) except ZeroDivisionError: - #print "Port_5 zero division" + print_error("Port_5 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1243,25 +1520,32 @@ class Retiring: domain = "Slots" area = "RET" desc = """ -This category reflects slots utilized by useful work i.e. allocated uops that -eventually get retired. Ideally, all pipeline slots would be attributed to the -Retiring category. Retiring of 100% would indicate the maximum 4 uops retired -per cycle has been achieved. Maximizing Retiring typically increases the -Instruction-Per-Cycle metric. Note that a high Retiring value does not -necessary mean there is no room for more performance. For example, Microcode -assists are categorized under Retiring. They hurt performance and can often be -avoided. A high Retiring value for non-vectorized code may be a good hint for -programmer to consider vectorizing his code. Doing so essentially lets more -computations be done without significantly increasing number of instructions -thus improving the performance.""" +This category reflects slots utilized by useful work i.e. +allocated uops that eventually get retired. Ideally, all +pipeline slots would be attributed to the Retiring category. +Retiring of 100% would indicate the maximum 4 uops retired +per cycle has been achieved. Maximizing Retiring typically +increases the Instruction-Per-Cycle metric. 
Note that a high +Retiring value does not necessary mean there is no room for +more performance. For example, Microcode assists are +categorized under Retiring. They hurt performance and can +often be avoided. A high Retiring value for non-vectorized +code may be a good hint for programmer to consider +vectorizing his code. Doing so essentially lets more +computations be done without significantly increasing number +of instructions thus improving the performance.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: - #print "Retiring zero division" + print_error("Retiring zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1271,21 +1555,27 @@ class Base: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction where the CPU was retiring uops not -originated from the microcode-sequencer. This correlates with total number of -instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. While this is the most desirable of the top 4 categories, high -values may still indicate areas for improvement. If possible focus on -techniques that reduce instruction count or result in more efficient -instructions generation such as vectorization.""" +This metric represents slots fraction where the CPU was +retiring uops not originated from the microcode-sequencer. +This correlates with total number of instructions used by +the program. A uops-per-instruction ratio of 1 should be +expected. While this is the most desirable of the top 4 +categories, high values may still indicate areas for +improvement. If possible focus on techniques that reduce +instruction count or result in more efficient instructions +generation such as vectorization.""" level = 2 htoff = False + sample = ['INST_RETIRED.PREC_DIST'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: - #print "Base zero division" + print_error("Base zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1295,16 +1585,20 @@ class FP_Arith: domain = "Uops" area = "RET" desc = """ -This metric represents overall arithmetic floating-point (FP) uops fraction -the CPU has executed.""" +This metric represents overall arithmetic floating-point +(FP) uops fraction the CPU has executed.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.FP_x87.compute(EV) + self.FP_Scalar.compute(EV) + self.FP_Vector.compute(EV ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "FP_Arith zero division" + print_error("FP_Arith zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1314,17 +1608,22 @@ class FP_x87: domain = "Uops" area = "RET" desc = """ -This metric is an approxmiation of floating-point (FP) x87 (arithmetic) uops -fraction. Tip: consider compiler flags to generate newer AVX (or SSE) -instruction sets, which typically perform better and feature vectors.""" +This metric is an approxmiation of floating-point (FP) x87 +(arithmetic) uops fraction. 
Tip: consider compiler flags to +generate newer AVX (or SSE) instruction sets, which +typically perform better and feature vectors.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("FP_COMP_OPS_EXE.X87", 4) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "FP_x87 zero division" + print_error("FP_x87 zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1334,17 +1633,21 @@ class FP_Scalar: domain = "Uops" area = "RET" desc = """ -This metric represents arithmetic floating-point (FP) scalar uops fraction the -CPU has executed. Tip: investigate what limits (compiler) generation of vector -code.""" +This metric represents arithmetic floating-point (FP) scalar +uops fraction the CPU has executed. Tip: investigate what +limits (compiler) generation of vector code.""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("FP_COMP_OPS_EXE.SSE_SCALAR_SINGLE", 4) + EV("FP_COMP_OPS_EXE.SSE_SCALAR_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "FP_Scalar zero division" + print_error("FP_Scalar zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1354,16 +1657,21 @@ class FP_Vector: domain = "Uops" area = "RET" desc = """ -This metric represents arithmetic floating-point (FP) vector uops fraction the -CPU has executed. Tip: check if vector width is expected""" +This metric represents arithmetic floating-point (FP) vector +uops fraction the CPU has executed. Tip: check if vector +width is expected""" level = 4 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("FP_COMP_OPS_EXE.SSE_PACKED_DOUBLE", 4) + EV("FP_COMP_OPS_EXE.SSE_PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_SINGLE", 4) + EV("SIMD_FP_256.PACKED_DOUBLE", 4)) / EV("UOPS_EXECUTED.THREAD", 4 ) self.thresh = (self.val > 0.2) and self.parent.thresh except ZeroDivisionError: - #print "FP_Vector zero division" + print_error("FP_Vector zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1373,17 +1681,21 @@ class Other: domain = "Uops" area = "RET" desc = """ -This metric represents non-floating-point (FP) uop fraction the CPU has -executed. If you application has no FP operations, this will likely be biggest -fraction.""" +This metric represents non-floating-point (FP) uop fraction +the CPU has executed. If you application has no FP +operations, this will likely be biggest fraction.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 1 - self.FP_Arith.compute(EV ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: - #print "Other zero division" + print_error("Other zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1393,18 +1705,24 @@ class Microcode_Sequencer: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops fetched by the -Microcode Sequencer (MS) ROM. 
The MS is used for CISC instructions not fully -decoded by the default decoders (like repeat move strings), or by microcode -assists used to address some operation modes (like in Floating Point assists).""" +This metric represents slots fraction CPU was retiring uops +fetched by the Microcode Sequencer (MS) ROM. The MS is used +for CISC instructions not fully decoded by the default +decoders (like repeat move strings), or by microcode assists +used to address some operation modes (like in Floating Point +assists).""" level = 2 htoff = False + sample = ['IDQ.MS_UOPS'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: - #print "Microcode_Sequencer zero division" + print_error("Microcode_Sequencer zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -1415,12 +1733,14 @@ class Metric_IPC: Instructions Per Cycle (per logical thread)""" domain = "Metric" maxval = 5 + errcount = 0 def compute(self, EV): try: self.val = IPC(EV, 0) except ZeroDivisionError: - print "IPC zero division" + print_error("IPC zero division") + self.errcount += 1 self.val = 0 class Metric_CPI: @@ -1429,12 +1749,14 @@ class Metric_CPI: Cycles Per Instruction (threaded)""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CPI(EV, 0) except ZeroDivisionError: - print "CPI zero division" + print_error("CPI zero division") + self.errcount += 1 self.val = 0 class Metric_CoreIPC: @@ -1442,13 +1764,15 @@ class Metric_CoreIPC: desc = """ Instructions Per Cycle (per physical core)""" domain = "CoreMetric" - maxval = 5 + maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = CoreIPC(EV, 0) except ZeroDivisionError: - print "CoreIPC zero division" + print_error("CoreIPC zero division") + self.errcount += 1 self.val = 0 class Metric_UPI: @@ -1457,12 +1781,14 @@ class Metric_UPI: Uops Per Instruction""" domain = "Metric" maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = UPI(EV, 0) except ZeroDivisionError: - print "UPI zero division" + print_error("UPI zero division") + self.errcount += 1 self.val = 0 class Metric_IPTB: @@ -1471,71 +1797,82 @@ class Metric_IPTB: Instruction per taken branch""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = IPTB(EV, 0) except ZeroDivisionError: - print "IPTB zero division" + print_error("IPTB zero division") + self.errcount += 1 self.val = 0 class Metric_BPTB: name = "BPTB" desc = """ -Branch instructions per taken branch. Can be used to approximate PGO- -likelihood for non-loopy codes.""" +Branch instructions per taken branch. 
Can be used to +approximate PGO-likelihood for non-loopy codes.""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = BPTB(EV, 0) except ZeroDivisionError: - print "BPTB zero division" + print_error("BPTB zero division") + self.errcount += 1 self.val = 0 class Metric_DSB_Coverage: name = "DSB_Coverage" desc = """ -Fraction of Uops delivered by the DSB (decoded instructions cache)""" +Fraction of Uops delivered by the DSB (decoded instructions +cache)""" domain = "Metric" maxval = 1 + errcount = 0 def compute(self, EV): try: self.val = DSB_Coverage(EV, 0) except ZeroDivisionError: - print "DSB_Coverage zero division" + print_error("DSB_Coverage zero division") + self.errcount += 1 self.val = 0 class Metric_ILP: name = "ILP" desc = """ -Instruction-Level-Parallelism (average number of uops executed when there is -at least 1 uop executed)""" +Instruction-Level-Parallelism (average number of uops +executed when there is at least 1 uop executed)""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = ILP(EV, 0) except ZeroDivisionError: - print "ILP zero division" + print_error("ILP zero division") + self.errcount += 1 self.val = 0 class Metric_MLP: name = "MLP" desc = """ -Memory-Level-Parallelism (average number of L1 miss demand load when there is -at least 1 such miss)""" +Memory-Level-Parallelism (average number of L1 miss demand +load when there is at least 1 such miss)""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = MLP(EV, 0) except ZeroDivisionError: - print "MLP zero division" + print_error("MLP zero division") + self.errcount += 1 self.val = 0 class Metric_Load_Miss_Real_Latency: @@ -1544,12 +1881,14 @@ class Metric_Load_Miss_Real_Latency: Actual Average Latency for L1 data-cache miss demand loads""" domain = "Metric" maxval = 1000 + errcount = 0 def compute(self, EV): try: self.val = Load_Miss_Real_Latency(EV, 0) except ZeroDivisionError: - print "Load_Miss_Real_Latency zero division" + print_error("Load_Miss_Real_Latency zero division") + self.errcount += 1 self.val = 0 class Metric_GFLOPs: @@ -1558,12 +1897,14 @@ class Metric_GFLOPs: Giga Floating Point Operations Per Second""" domain = "Metric" maxval = 100 + errcount = 0 def compute(self, EV): try: self.val = GFLOPs(EV, 0) except ZeroDivisionError: - print "GFLOPs zero division" + print_error("GFLOPs zero division") + self.errcount += 1 self.val = 0 class Metric_Turbo_Utilization: @@ -1572,27 +1913,31 @@ class Metric_Turbo_Utilization: Average Frequency Utilization relative nominal frequency""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = Turbo_Utilization(EV, 0) except ZeroDivisionError: - print "Turbo_Utilization zero division" + print_error("Turbo_Utilization zero division") + self.errcount += 1 self.val = 0 class Metric_Page_Walks_Use: name = "Page_Walks_Use" desc = """ -Fraction of cycles where the core's Page Walker is busy serving -iTLB/Load/Store""" +Fraction of cycles where the core's Page Walker is busy +serving iTLB/Load/Store""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = Page_Walks_Use(EV, 0) except ZeroDivisionError: - print "Page_Walks_Use zero division" + print_error("Page_Walks_Use zero division") + self.errcount += 1 self.val = 0 class Metric_MUX: @@ -1601,12 +1946,14 @@ class Metric_MUX: PerfMon Event Multiplexing accuracy indicator""" domain = "Clocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = MUX(EV, 0) 
except ZeroDivisionError: - print "MUX zero division" + print_error("MUX zero division") + self.errcount += 1 self.val = 0 class Metric_CLKS: @@ -1615,12 +1962,14 @@ class Metric_CLKS: Per-thread actual clocks""" domain = "Count" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CLKS(EV, 0) except ZeroDivisionError: - print "CLKS zero division" + print_error("CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_CORE_CLKS: @@ -1629,12 +1978,14 @@ class Metric_CORE_CLKS: Core actual clocks""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CORE_CLKS(EV, 0) except ZeroDivisionError: - print "CORE_CLKS zero division" + print_error("CORE_CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_Time: @@ -1643,12 +1994,14 @@ class Metric_Time: Run duration time in seconds""" domain = "Count" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = Time(EV, 0) except ZeroDivisionError: - print "Time zero division" + print_error("Time zero division") + self.errcount += 1 self.val = 0 # Schedule @@ -1803,130 +2156,17 @@ def __init__(self, r): # siblings cross-tree - o["Frontend_Bound"].sibling = None - o["Frontend_Latency"].sibling = None - o["ICache_Misses"].sibling = None - o["ITLB_Misses"].sibling = None o["Branch_Resteers"].sibling = o["Bad_Speculation"] - o["DSB_Switches"].sibling = None - o["LCP"].sibling = None o["MS_Switches"].sibling = o["Microcode_Sequencer"] - o["Frontend_Bandwidth"].sibling = None - o["MITE"].sibling = None - o["DSB"].sibling = None - o["LSD"].sibling = None o["Bad_Speculation"].sibling = o["Branch_Resteers"] - o["Branch_Mispredicts"].sibling = None - o["Machine_Clears"].sibling = None - o["Backend_Bound"].sibling = None - o["Memory_Bound"].sibling = None o["L1_Bound"].sibling = o["G1_Port_Utilized"] - o["DTLB_Load"].sibling = None - o["Store_Fwd_Blk"].sibling = None o["Lock_Latency"].sibling = o["Store_Latency"] - o["Split_Loads"].sibling = None - o["G4K_Aliasing"].sibling = None - o["L2_Bound"].sibling = None - o["L3_Bound"].sibling = None - o["Contested_Accesses"].sibling = None - o["Data_Sharing"].sibling = None - o["L3_Latency"].sibling = None - o["SQ_Full"].sibling = None - o["MEM_Bound"].sibling = None - o["MEM_Bandwidth"].sibling = None - o["MEM_Latency"].sibling = None - o["Local_DRAM"].sibling = None - o["Remote_DRAM"].sibling = None - o["Remote_Cache"].sibling = None - o["Stores_Bound"].sibling = None o["Store_Latency"].sibling = o["Lock_Latency"] o["Split_Stores"].sibling = o["Port_4"] - o["DTLB_Store"].sibling = None - o["Core_Bound"].sibling = None - o["Divider"].sibling = None - o["Ports_Utilization"].sibling = None - o["G0_Ports_Utilized"].sibling = None o["G1_Port_Utilized"].sibling = o["L1_Bound"] - o["G2_Ports_Utilized"].sibling = None - o["G3m_Ports_Utilized"].sibling = None - o["Port_0"].sibling = None - o["Port_1"].sibling = None - o["Port_2"].sibling = None - o["Port_3"].sibling = None o["Port_4"].sibling = o["Split_Stores"] - o["Port_5"].sibling = None - o["Retiring"].sibling = None - o["Base"].sibling = None - o["FP_Arith"].sibling = None - o["FP_x87"].sibling = None - o["FP_Scalar"].sibling = None - o["FP_Vector"].sibling = None - o["Other"].sibling = None o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events - - o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] - o["ICache_Misses"].sample = [] - o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] - o["Branch_Resteers"].sample = 
['BR_MISP_RETIRED.ALL_BRANCHES:pp'] - o["DSB_Switches"].sample = [] - o["LCP"].sample = [] - o["MS_Switches"].sample = [] - o["Frontend_Bandwidth"].sample = [] - o["MITE"].sample = [] - o["DSB"].sample = [] - o["LSD"].sample = [] - o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] - o["Branch_Mispredicts"].sample = ['BR_MISP_RETIRED.ALL_BRANCHES:pp'] - o["Machine_Clears"].sample = ['MACHINE_CLEARS.COUNT'] - o["Backend_Bound"].sample = [] - o["Memory_Bound"].sample = [] - o["L1_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L1_HIT:pp', 'MEM_LOAD_UOPS_RETIRED.HIT_LFB:pp'] - o["DTLB_Load"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_LOADS:pp'] - o["Store_Fwd_Blk"].sample = [] - o["Lock_Latency"].sample = ['MEM_UOPS_RETIRED.LOCK_LOADS:pp'] - o["Split_Loads"].sample = ['MEM_UOPS_RETIRED.SPLIT_LOADS:pp'] - o["G4K_Aliasing"].sample = [] - o["L2_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L2_HIT:pp'] - o["L3_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] - o["Contested_Accesses"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT:pp', 'MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS:pp'] - o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp'] - o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] - o["SQ_Full"].sample = [] - o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_MISS:pp'] - o["MEM_Bandwidth"].sample = [] - o["MEM_Latency"].sample = [] - o["Local_DRAM"].sample = ['MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM:pp'] - o["Remote_DRAM"].sample = ['MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM:pp'] - o["Remote_Cache"].sample = ['MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM:pp', 'MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD:pp'] - o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] - o["Store_Latency"].sample = [] - o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] - o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] - o["Core_Bound"].sample = [] - o["Divider"].sample = ['ARITH.FPU_DIV_ACTIVE'] - o["Ports_Utilization"].sample = [] - o["G0_Ports_Utilized"].sample = [] - o["G1_Port_Utilized"].sample = [] - o["G2_Ports_Utilized"].sample = [] - o["G3m_Ports_Utilized"].sample = [] - o["Port_0"].sample = [] - o["Port_1"].sample = [] - o["Port_2"].sample = [] - o["Port_3"].sample = [] - o["Port_4"].sample = [] - o["Port_5"].sample = [] - o["Retiring"].sample = [] - o["Base"].sample = ['INST_RETIRED.PREC_DIST'] - o["FP_Arith"].sample = [] - o["FP_x87"].sample = [] - o["FP_Scalar"].sample = [] - o["FP_Vector"].sample = [] - o["Other"].sample = [] - o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] - # user visible metrics n = Metric_IPC() ; r.metric(n) diff --git a/jkt_server_ratios.py b/jkt_server_ratios.py index 02bf569d..3e595f0d 100644 --- a/jkt_server_ratios.py +++ b/jkt_server_ratios.py @@ -8,6 +8,10 @@ # https://sites.google.com/site/analysismethods/yasin-pubs # +# Helpers + +print_error = lambda msg: False + smt_enabled = False # Constants @@ -143,22 +147,28 @@ class Frontend_Bound: domain = "Slots" area = "FE" desc = """ -This category reflects slots where the Frontend of the processor undersupplies -its Backend. Frontend denotes the first portion of pipeline responsible to -fetch micro-ops which the Backend can execute. Within the Frontend, a branch -predictor predicts the next address to fetch, cache-lines are fetched from -memory, parsed into instructions, and lastly decoded into micro-ops. The -purpose of the Frontend cluster is to deliver uops to Backend whenever the -latter can accept them. 
For example, stalls due to instruction-cache misses -would be categorized under Frontend Bound.""" +This category reflects slots where the Frontend of the +processor undersupplies its Backend. Frontend denotes the +first portion of pipeline responsible to fetch micro-ops +which the Backend can execute. Within the Frontend, a branch +predictor predicts the next address to fetch, cache-lines +are fetched from memory, parsed into instructions, and +lastly decoded into micro-ops. The purpose of the Frontend +cluster is to deliver uops to Backend whenever the latter +can accept them. For example, stalls due to instruction- +cache misses would be categorized under Frontend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Frontend_Bound zero division" + print_error("Frontend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -168,18 +178,24 @@ class Frontend_Latency: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend latency -issues. For example, instruction-cache misses, iTLB misses or fetch stalls -after a branch misprediction are categorized under Frontend Latency. In such -cases the Frontend eventually delivers no uops for some period.""" +This metric represents slots fraction CPU was stalled due to +Frontend latency issues. For example, instruction-cache +misses, iTLB misses or fetch stalls after a branch +misprediction are categorized under Frontend Latency. In +such cases the Frontend eventually delivers no uops for some +period.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.15) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Latency zero division" + print_error("Frontend_Latency zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -189,16 +205,21 @@ class ITLB_Misses: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to instruction TLB -misses. Using large code pages may be considered here.""" +This metric represents cycles fraction CPU was stalled due +to instruction TLB misses. Using large code pages may be +considered here.""" level = 3 htoff = False + sample = ['ITLB_MISSES.WALK_COMPLETED'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "ITLB_Misses zero division" + print_error("ITLB_Misses zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -208,16 +229,21 @@ class DSB_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches from -DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered.""" +This metric represents cycles fraction CPU was stalled due +to switches from DSB to MITE pipelines. 
Optimizing for +better DSB hit rate may be considered.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "DSB_Switches zero division" + print_error("DSB_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -227,17 +253,22 @@ class LCP: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to Length Changing -Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will -certainly avoid this.""" +This metric represents cycles fraction CPU was stalled due +to Length Changing Prefixes (LCPs). Using proper compiler +flags or Intel Compiler by default will certainly avoid +this.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "LCP zero division" + print_error("LCP zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -247,19 +278,25 @@ class MS_Switches: domain = "Clocks" area = "FE" desc = """ -This metric represents cycles fraction CPU was stalled due to switches of uop -delivery to the Microcode Sequencer (MS). Commonly used instructions are -optimized for delivery by the DSB or MITE pipelines. The MS is designated to -deliver long uop flows required by CISC instructions like CPUID, or uncommon -conditions like Floating Point Assists when dealing with Denormals.""" +This metric represents cycles fraction CPU was stalled due +to switches of uop delivery to the Microcode Sequencer (MS). +Commonly used instructions are optimized for delivery by the +DSB or MITE pipelines. The MS is designated to deliver long +uop flows required by CISC instructions like CPUID, or +uncommon conditions like Floating Point Assists when dealing +with Denormals.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 ) self.thresh = (self.val > 0.05) and self.parent.thresh except ZeroDivisionError: - #print "MS_Switches zero division" + print_error("MS_Switches zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -269,19 +306,24 @@ class Frontend_Bandwidth: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend -bandwidth issues. For example, inefficiencies at the instruction decoders, or -code restrictions for caching in the DSB (decoded uops cache) are categorized -under Frontend Bandwidth. In such cases, the Frontend typically delivers non- -optimal amount of uops to the Backend.""" +This metric represents slots fraction CPU was stalled due to +Frontend bandwidth issues. For example, inefficiencies at +the instruction decoders, or code restrictions for caching +in the DSB (decoded uops cache) are categorized under +Frontend Bandwidth. 
In such cases, the Frontend typically +delivers non-optimal amount of uops to the Backend.""" level = 2 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV ) self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh except ZeroDivisionError: - #print "Frontend_Bandwidth zero division" + print_error("Frontend_Bandwidth zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -291,17 +333,22 @@ class MITE: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to the MITE fetch pipeline. For example, inefficiencies in the -instruction decoders are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to the MITE fetch pipeline. For example, +inefficiencies in the instruction decoders are categorized +here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "MITE zero division" + print_error("MITE zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -311,18 +358,23 @@ class DSB: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to DSB (decoded uop cache) fetch pipeline. For example, inefficient -utilization of the DSB cache structure or bank conflict when reading from it, -are categorized here.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to DSB (decoded uop cache) fetch +pipeline. For example, inefficient utilization of the DSB +cache structure or bank conflict when reading from it, are +categorized here.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.3) and self.parent.thresh except ZeroDivisionError: - #print "DSB zero division" + print_error("DSB zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -332,19 +384,24 @@ class LSD: domain = "CoreClocks" area = "FE" desc = """ -This metric represents Core cycles fraction in which CPU was likely limited -due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining -Uop supply. However, in some rare cases, optimal uop-delivery could not be -reached for small loops whose size (in terms of number of uops) does not suit -well the LSD structure.""" +This metric represents Core cycles fraction in which CPU was +likely limited due to LSD (Loop Stream Detector) unit. LSD +typically does well sustaining Uop supply. 
However, in some +rare cases, optimal uop-delivery could not be reached for +small loops whose size (in terms of number of uops) does not +suit well the LSD structure.""" level = 3 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 ) self.thresh = (self.val > 0.1) and self.parent.thresh except ZeroDivisionError: - #print "LSD zero division" + print_error("LSD zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -354,19 +411,24 @@ class Bad_Speculation: domain = "Slots" area = "BAD" desc = """ -This category reflects slots wasted due to incorrect speculations, which -include slots used to allocate uops that do not eventually get retired and -slots for which allocation was blocked due to recovery from earlier incorrect -speculation. For example, wasted work due to miss-predicted branches are -categorized under Bad Speculation category""" +This category reflects slots wasted due to incorrect +speculations, which include slots used to allocate uops that +do not eventually get retired and slots for which allocation +was blocked due to recovery from earlier incorrect +speculation. For example, wasted work due to miss-predicted +branches are categorized under Bad Speculation category""" level = 1 htoff = False + sample = ['INT_MISC.RECOVERY_CYCLES'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.1) except ZeroDivisionError: - #print "Bad_Speculation zero division" + print_error("Bad_Speculation zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -376,21 +438,27 @@ class Backend_Bound: domain = "Slots" area = "BE" desc = """ -This category reflects slots where no uops are being delivered due to a lack -of required resources for accepting more uops in the Backend of the pipeline. -Backend describes the portion of the pipeline where the out-of-order scheduler -dispatches ready uops into their respective execution units, and once -completed these uops get retired according to program order. For example, -stalls due to data-cache misses or stalls due to the divider unit being -overloaded are both categorized under Backend Bound.""" +This category reflects slots where no uops are being +delivered due to a lack of required resources for accepting +more uops in the Backend of the pipeline. Backend describes +the portion of the pipeline where the out-of-order scheduler +dispatches ready uops into their respective execution units, +and once completed these uops get retired according to +program order. For example, stalls due to data-cache misses +or stalls due to the divider unit being overloaded are both +categorized under Backend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV)) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Backend_Bound zero division" + print_error("Backend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -400,25 +468,32 @@ class Retiring: domain = "Slots" area = "RET" desc = """ -This category reflects slots utilized by useful work i.e. allocated uops that -eventually get retired. 
Ideally, all pipeline slots would be attributed to the -Retiring category. Retiring of 100% would indicate the maximum 4 uops retired -per cycle has been achieved. Maximizing Retiring typically increases the -Instruction-Per-Cycle metric. Note that a high Retiring value does not -necessary mean there is no room for more performance. For example, Microcode -assists are categorized under Retiring. They hurt performance and can often be -avoided. A high Retiring value for non-vectorized code may be a good hint for -programmer to consider vectorizing his code. Doing so essentially lets more -computations be done without significantly increasing number of instructions -thus improving the performance.""" +This category reflects slots utilized by useful work i.e. +allocated uops that eventually get retired. Ideally, all +pipeline slots would be attributed to the Retiring category. +Retiring of 100% would indicate the maximum 4 uops retired +per cycle has been achieved. Maximizing Retiring typically +increases the Instruction-Per-Cycle metric. Note that a high +Retiring value does not necessary mean there is no room for +more performance. For example, Microcode assists are +categorized under Retiring. They hurt performance and can +often be avoided. A high Retiring value for non-vectorized +code may be a good hint for programmer to consider +vectorizing his code. Doing so essentially lets more +computations be done without significantly increasing number +of instructions thus improving the performance.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh except ZeroDivisionError: - #print "Retiring zero division" + print_error("Retiring zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -428,21 +503,27 @@ class Base: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction where the CPU was retiring uops not -originated from the microcode-sequencer. This correlates with total number of -instructions used by the program. A uops-per-instruction ratio of 1 should be -expected. While this is the most desirable of the top 4 categories, high -values may still indicate areas for improvement. If possible focus on -techniques that reduce instruction count or result in more efficient -instructions generation such as vectorization.""" +This metric represents slots fraction where the CPU was +retiring uops not originated from the microcode-sequencer. +This correlates with total number of instructions used by +the program. A uops-per-instruction ratio of 1 should be +expected. While this is the most desirable of the top 4 +categories, high values may still indicate areas for +improvement. 
If possible focus on techniques that reduce +instruction count or result in more efficient instructions +generation such as vectorization.""" level = 2 htoff = False + sample = ['INST_RETIRED.PREC_DIST'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV ) self.thresh = (self.val > 0.6) and self.parent.thresh except ZeroDivisionError: - #print "Base zero division" + print_error("Base zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -452,18 +533,24 @@ class Microcode_Sequencer: domain = "Slots" area = "RET" desc = """ -This metric represents slots fraction CPU was retiring uops fetched by the -Microcode Sequencer (MS) ROM. The MS is used for CISC instructions not fully -decoded by the default decoders (like repeat move strings), or by microcode -assists used to address some operation modes (like in Floating Point assists).""" +This metric represents slots fraction CPU was retiring uops +fetched by the Microcode Sequencer (MS) ROM. The MS is used +for CISC instructions not fully decoded by the default +decoders (like repeat move strings), or by microcode assists +used to address some operation modes (like in Floating Point +assists).""" level = 2 htoff = False + sample = ['IDQ.MS_UOPS'] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 ) self.thresh = (self.val > 0.05) except ZeroDivisionError: - #print "Microcode_Sequencer zero division" + print_error("Microcode_Sequencer zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -474,12 +561,14 @@ class Metric_IPC: Instructions Per Cycle (per logical thread)""" domain = "Metric" maxval = 5 + errcount = 0 def compute(self, EV): try: self.val = IPC(EV, 0) except ZeroDivisionError: - print "IPC zero division" + print_error("IPC zero division") + self.errcount += 1 self.val = 0 class Metric_CPI: @@ -488,12 +577,14 @@ class Metric_CPI: Cycles Per Instruction (threaded)""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CPI(EV, 0) except ZeroDivisionError: - print "CPI zero division" + print_error("CPI zero division") + self.errcount += 1 self.val = 0 class Metric_CoreIPC: @@ -501,13 +592,15 @@ class Metric_CoreIPC: desc = """ Instructions Per Cycle (per physical core)""" domain = "CoreMetric" - maxval = 5 + maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = CoreIPC(EV, 0) except ZeroDivisionError: - print "CoreIPC zero division" + print_error("CoreIPC zero division") + self.errcount += 1 self.val = 0 class Metric_UPI: @@ -516,12 +609,14 @@ class Metric_UPI: Uops Per Instruction""" domain = "Metric" maxval = 2 + errcount = 0 def compute(self, EV): try: self.val = UPI(EV, 0) except ZeroDivisionError: - print "UPI zero division" + print_error("UPI zero division") + self.errcount += 1 self.val = 0 class Metric_IPTB: @@ -530,56 +625,65 @@ class Metric_IPTB: Instruction per taken branch""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = IPTB(EV, 0) except ZeroDivisionError: - print "IPTB zero division" + print_error("IPTB zero division") + self.errcount += 1 self.val = 0 class Metric_BPTB: name = "BPTB" desc = """ -Branch instructions per taken branch. Can be used to approximate PGO- -likelihood for non-loopy codes.""" +Branch instructions per taken branch. 
Can be used to +approximate PGO-likelihood for non-loopy codes.""" domain = "Metric" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = BPTB(EV, 0) except ZeroDivisionError: - print "BPTB zero division" + print_error("BPTB zero division") + self.errcount += 1 self.val = 0 class Metric_DSB_Coverage: name = "DSB_Coverage" desc = """ -Fraction of Uops delivered by the DSB (decoded instructions cache)""" +Fraction of Uops delivered by the DSB (decoded instructions +cache)""" domain = "Metric" maxval = 1 + errcount = 0 def compute(self, EV): try: self.val = DSB_Coverage(EV, 0) except ZeroDivisionError: - print "DSB_Coverage zero division" + print_error("DSB_Coverage zero division") + self.errcount += 1 self.val = 0 class Metric_MLP: name = "MLP" desc = """ -Memory-Level-Parallelism (average number of L1 miss demand load when there is -at least 1 such miss)""" +Memory-Level-Parallelism (average number of L1 miss demand +load when there is at least 1 such miss)""" domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = MLP(EV, 0) except ZeroDivisionError: - print "MLP zero division" + print_error("MLP zero division") + self.errcount += 1 self.val = 0 class Metric_GFLOPs: @@ -588,41 +692,47 @@ class Metric_GFLOPs: Giga Floating Point Operations Per Second""" domain = "Metric" maxval = 100 + errcount = 0 def compute(self, EV): try: self.val = GFLOPs(EV, 0) except ZeroDivisionError: - print "GFLOPs zero division" + print_error("GFLOPs zero division") + self.errcount += 1 self.val = 0 class Metric_Turbo_Utilization: name = "Turbo_Utilization" desc = """ Average Frequency Utilization relative nominal frequency""" - domain = "CoreMetric" + domain = "Metric" maxval = 10 + errcount = 0 def compute(self, EV): try: self.val = Turbo_Utilization(EV, 0) except ZeroDivisionError: - print "Turbo_Utilization zero division" + print_error("Turbo_Utilization zero division") + self.errcount += 1 self.val = 0 class Metric_Page_Walks_Use: name = "Page_Walks_Use" desc = """ -Fraction of cycles where the core's Page Walker is busy serving -iTLB/Load/Store""" +Fraction of cycles where the core's Page Walker is busy +serving iTLB/Load/Store""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = Page_Walks_Use(EV, 0) except ZeroDivisionError: - print "Page_Walks_Use zero division" + print_error("Page_Walks_Use zero division") + self.errcount += 1 self.val = 0 class Metric_MUX: @@ -631,12 +741,14 @@ class Metric_MUX: PerfMon Event Multiplexing accuracy indicator""" domain = "Clocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = MUX(EV, 0) except ZeroDivisionError: - print "MUX zero division" + print_error("MUX zero division") + self.errcount += 1 self.val = 0 class Metric_CLKS: @@ -645,12 +757,14 @@ class Metric_CLKS: Per-thread actual clocks""" domain = "Count" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CLKS(EV, 0) except ZeroDivisionError: - print "CLKS zero division" + print_error("CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_CORE_CLKS: @@ -659,12 +773,14 @@ class Metric_CORE_CLKS: Core actual clocks""" domain = "CoreClocks" maxval = 0 + errcount = 0 def compute(self, EV): try: self.val = CORE_CLKS(EV, 0) except ZeroDivisionError: - print "CORE_CLKS zero division" + print_error("CORE_CLKS zero division") + self.errcount += 1 self.val = 0 class Metric_Time: @@ -673,12 +789,14 @@ class Metric_Time: Run duration time in seconds""" domain = "Count" maxval = 0 + errcount = 0 def 
compute(self, EV): try: self.val = Time(EV, 0) except ZeroDivisionError: - print "Time zero division" + print_error("Time zero division") + self.errcount += 1 self.val = 0 # Schedule @@ -730,40 +848,9 @@ def __init__(self, r): # siblings cross-tree - o["Frontend_Bound"].sibling = None - o["Frontend_Latency"].sibling = None - o["ITLB_Misses"].sibling = None - o["DSB_Switches"].sibling = None - o["LCP"].sibling = None o["MS_Switches"].sibling = o["Microcode_Sequencer"] - o["Frontend_Bandwidth"].sibling = None - o["MITE"].sibling = None - o["DSB"].sibling = None - o["LSD"].sibling = None - o["Bad_Speculation"].sibling = None - o["Backend_Bound"].sibling = None - o["Retiring"].sibling = None - o["Base"].sibling = None o["Microcode_Sequencer"].sibling = o["MS_Switches"] - # sampling events - - o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = [] - o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] - o["DSB_Switches"].sample = [] - o["LCP"].sample = [] - o["MS_Switches"].sample = [] - o["Frontend_Bandwidth"].sample = [] - o["MITE"].sample = [] - o["DSB"].sample = [] - o["LSD"].sample = [] - o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES'] - o["Backend_Bound"].sample = [] - o["Retiring"].sample = [] - o["Base"].sample = ['INST_RETIRED.PREC_DIST'] - o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS'] - # user visible metrics n = Metric_IPC() ; r.metric(n) diff --git a/snb_client_ratios.py b/snb_client_ratios.py index 5c1a58f5..c1436c32 100644 --- a/snb_client_ratios.py +++ b/snb_client_ratios.py @@ -8,6 +8,10 @@ # https://sites.google.com/site/analysismethods/yasin-pubs # +# Helpers + +print_error = lambda msg: False + smt_enabled = False # Constants @@ -143,22 +147,28 @@ class Frontend_Bound: domain = "Slots" area = "FE" desc = """ -This category reflects slots where the Frontend of the processor undersupplies -its Backend. Frontend denotes the first portion of pipeline responsible to -fetch micro-ops which the Backend can execute. Within the Frontend, a branch -predictor predicts the next address to fetch, cache-lines are fetched from -memory, parsed into instructions, and lastly decoded into micro-ops. The -purpose of the Frontend cluster is to deliver uops to Backend whenever the -latter can accept them. For example, stalls due to instruction-cache misses -would be categorized under Frontend Bound.""" +This category reflects slots where the Frontend of the +processor undersupplies its Backend. Frontend denotes the +first portion of pipeline responsible to fetch micro-ops +which the Backend can execute. Within the Frontend, a branch +predictor predicts the next address to fetch, cache-lines +are fetched from memory, parsed into instructions, and +lastly decoded into micro-ops. The purpose of the Frontend +cluster is to deliver uops to Backend whenever the latter +can accept them. For example, stalls due to instruction- +cache misses would be categorized under Frontend Bound.""" level = 1 htoff = False + sample = [] + errcount = 0 + sibling = None def compute(self, EV): try: self.val = EV("IDQ_UOPS_NOT_DELIVERED.CORE", 1) / SLOTS(EV, 1 ) self.thresh = (self.val > 0.2) except ZeroDivisionError: - #print "Frontend_Bound zero division" + print_error("Frontend_Bound zero division") + self.errcount += 1 self.val = 0 self.thresh = False return self.val @@ -168,18 +178,24 @@ class Frontend_Latency: domain = "Slots" area = "FE" desc = """ -This metric represents slots fraction CPU was stalled due to Frontend latency -issues. 
For example, instruction-cache misses, iTLB misses or fetch stalls
-after a branch misprediction are categorized under Frontend Latency. In such
-cases the Frontend eventually delivers no uops for some period."""
+This metric represents slots fraction CPU was stalled due to
+Frontend latency issues. For example, instruction-cache
+misses, iTLB misses or fetch stalls after a branch
+misprediction are categorized under Frontend Latency. In
+such cases the Frontend eventually delivers no uops for some
+period."""
     level = 2
     htoff = False
+    sample = []
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = Pipeline_Width * EV("IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", 2) / SLOTS(EV, 2 )
             self.thresh = (self.val > 0.15) and self.parent.thresh
         except ZeroDivisionError:
-            #print "Frontend_Latency zero division"
+            print_error("Frontend_Latency zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -189,16 +205,21 @@ class ITLB_Misses:
     domain = "Clocks"
     area = "FE"
     desc = """
-This metric represents cycles fraction CPU was stalled due to instruction TLB
-misses. Using large code pages may be considered here."""
+This metric represents cycles fraction CPU was stalled due
+to instruction TLB misses. Using large code pages may be
+considered here."""
     level = 3
     htoff = False
+    sample = ['ITLB_MISSES.WALK_COMPLETED']
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = ITLB_Miss_Cycles(EV, 3) / CLKS(EV, 3 )
             self.thresh = (self.val > 0.05) and self.parent.thresh
         except ZeroDivisionError:
-            #print "ITLB_Misses zero division"
+            print_error("ITLB_Misses zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -208,16 +229,21 @@ class DSB_Switches:
     domain = "Clocks"
     area = "FE"
     desc = """
-This metric represents cycles fraction CPU was stalled due to switches from
-DSB to MITE pipelines. Optimizing for better DSB hit rate may be considered."""
+This metric represents cycles fraction CPU was stalled due
+to switches from DSB to MITE pipelines. Optimizing for
+better DSB hit rate may be considered."""
     level = 3
     htoff = False
+    sample = []
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = EV("DSB2MITE_SWITCHES.PENALTY_CYCLES", 3) / CLKS(EV, 3 )
             self.thresh = (self.val > 0.05) and self.parent.thresh
         except ZeroDivisionError:
-            #print "DSB_Switches zero division"
+            print_error("DSB_Switches zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -227,17 +253,22 @@ class LCP:
     domain = "Clocks"
     area = "FE"
     desc = """
-This metric represents cycles fraction CPU was stalled due to Length Changing
-Prefixes (LCPs). Using proper compiler flags or Intel Compiler by default will
-certainly avoid this."""
+This metric represents cycles fraction CPU was stalled due
+to Length Changing Prefixes (LCPs). Using proper compiler
+flags or Intel Compiler by default will certainly avoid
+this."""
     level = 3
     htoff = False
+    sample = []
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = EV("ILD_STALL.LCP", 3) / CLKS(EV, 3 )
             self.thresh = (self.val > 0.05) and self.parent.thresh
         except ZeroDivisionError:
-            #print "LCP zero division"
+            print_error("LCP zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -247,19 +278,25 @@ class MS_Switches:
     domain = "Clocks"
     area = "FE"
     desc = """
-This metric represents cycles fraction CPU was stalled due to switches of uop
-delivery to the Microcode Sequencer (MS). Commonly used instructions are
-optimized for delivery by the DSB or MITE pipelines. The MS is designated to
-deliver long uop flows required by CISC instructions like CPUID, or uncommon
-conditions like Floating Point Assists when dealing with Denormals."""
+This metric represents cycles fraction CPU was stalled due
+to switches of uop delivery to the Microcode Sequencer (MS).
+Commonly used instructions are optimized for delivery by the
+DSB or MITE pipelines. The MS is designated to deliver long
+uop flows required by CISC instructions like CPUID, or
+uncommon conditions like Floating Point Assists when dealing
+with Denormals."""
     level = 3
     htoff = False
+    sample = []
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = MS_Switches_Cost * EV("IDQ.MS_SWITCHES", 3) / CLKS(EV, 3 )
             self.thresh = (self.val > 0.05) and self.parent.thresh
         except ZeroDivisionError:
-            #print "MS_Switches zero division"
+            print_error("MS_Switches zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -269,19 +306,24 @@ class Frontend_Bandwidth:
     domain = "Slots"
     area = "FE"
     desc = """
-This metric represents slots fraction CPU was stalled due to Frontend
-bandwidth issues. For example, inefficiencies at the instruction decoders, or
-code restrictions for caching in the DSB (decoded uops cache) are categorized
-under Frontend Bandwidth. In such cases, the Frontend typically delivers non-
-optimal amount of uops to the Backend."""
+This metric represents slots fraction CPU was stalled due to
+Frontend bandwidth issues. For example, inefficiencies at
+the instruction decoders, or code restrictions for caching
+in the DSB (decoded uops cache) are categorized under
+Frontend Bandwidth. In such cases, the Frontend typically
+delivers a non-optimal amount of uops to the Backend."""
     level = 2
     htoff = False
+    sample = []
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = self.Frontend_Bound.compute(EV) - self.Frontend_Latency.compute(EV )
             self.thresh = (self.val > 0.1) & (IPC(EV, 2) > 2.0) and self.parent.thresh
         except ZeroDivisionError:
-            #print "Frontend_Bandwidth zero division"
+            print_error("Frontend_Bandwidth zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -291,17 +333,22 @@ class MITE:
     domain = "CoreClocks"
     area = "FE"
     desc = """
-This metric represents Core cycles fraction in which CPU was likely limited
-due to the MITE fetch pipeline. For example, inefficiencies in the
-instruction decoders are categorized here."""
+This metric represents Core cycles fraction in which CPU was
+likely limited due to the MITE fetch pipeline. For example,
+inefficiencies in the instruction decoders are categorized
+here."""
     level = 3
     htoff = False
+    sample = []
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = (EV("IDQ.ALL_MITE_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_MITE_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 )
             self.thresh = (self.val > 0.1) and self.parent.thresh
         except ZeroDivisionError:
-            #print "MITE zero division"
+            print_error("MITE zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -311,18 +358,23 @@ class DSB:
     domain = "CoreClocks"
     area = "FE"
     desc = """
-This metric represents Core cycles fraction in which CPU was likely limited
-due to DSB (decoded uop cache) fetch pipeline. For example, inefficient
-utilization of the DSB cache structure or bank conflict when reading from it,
-are categorized here."""
+This metric represents Core cycles fraction in which CPU was
+likely limited due to DSB (decoded uop cache) fetch
+pipeline. For example, inefficient utilization of the DSB
+cache structure or bank conflict when reading from it, are
+categorized here."""
     level = 3
     htoff = False
+    sample = []
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = (EV("IDQ.ALL_DSB_CYCLES_ANY_UOPS", 3) - EV("IDQ.ALL_DSB_CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 )
             self.thresh = (self.val > 0.3) and self.parent.thresh
         except ZeroDivisionError:
-            #print "DSB zero division"
+            print_error("DSB zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -332,19 +384,24 @@ class LSD:
     domain = "CoreClocks"
     area = "FE"
     desc = """
-This metric represents Core cycles fraction in which CPU was likely limited
-due to LSD (Loop Stream Detector) unit. LSD typically does well sustaining
-Uop supply. However, in some rare cases, optimal uop-delivery could not be
-reached for small loops whose size (in terms of number of uops) does not suit
-well the LSD structure."""
+This metric represents Core cycles fraction in which CPU was
+likely limited due to LSD (Loop Stream Detector) unit. LSD
+typically does well sustaining Uop supply. However, in some
+rare cases, optimal uop-delivery could not be reached for
+small loops whose size (in terms of number of uops) does not
+suit well the LSD structure."""
     level = 3
     htoff = False
+    sample = []
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = (EV("LSD.CYCLES_ACTIVE", 3) - EV("LSD.CYCLES_4_UOPS", 3)) / CORE_CLKS(EV, 3 )
             self.thresh = (self.val > 0.1) and self.parent.thresh
         except ZeroDivisionError:
-            #print "LSD zero division"
+            print_error("LSD zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -354,19 +411,24 @@ class Bad_Speculation:
     domain = "Slots"
     area = "BAD"
     desc = """
-This category reflects slots wasted due to incorrect speculations, which
-include slots used to allocate uops that do not eventually get retired and
-slots for which allocation was blocked due to recovery from earlier incorrect
-speculation. For example, wasted work due to miss-predicted branches are
-categorized under Bad Speculation category"""
+This category reflects slots wasted due to incorrect
+speculations, which include slots used to allocate uops that
+do not eventually get retired and slots for which allocation
+was blocked due to recovery from earlier incorrect
+speculation. For example, wasted work due to mispredicted
+branches are categorized under Bad Speculation category"""
     level = 1
     htoff = False
+    sample = ['INT_MISC.RECOVERY_CYCLES']
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = (EV("UOPS_ISSUED.ANY", 1) - EV("UOPS_RETIRED.RETIRE_SLOTS", 1) + Pipeline_Width * Recovery_Cycles(EV, 1)) / SLOTS(EV, 1 )
             self.thresh = (self.val > 0.1)
         except ZeroDivisionError:
-            #print "Bad_Speculation zero division"
+            print_error("Bad_Speculation zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -376,21 +438,27 @@ class Backend_Bound:
     domain = "Slots"
     area = "BE"
     desc = """
-This category reflects slots where no uops are being delivered due to a lack
-of required resources for accepting more uops in the Backend of the pipeline.
-Backend describes the portion of the pipeline where the out-of-order scheduler
-dispatches ready uops into their respective execution units, and once
-completed these uops get retired according to program order. For example,
-stalls due to data-cache misses or stalls due to the divider unit being
-overloaded are both categorized under Backend Bound."""
+This category reflects slots where no uops are being
+delivered due to a lack of required resources for accepting
+more uops in the Backend of the pipeline. Backend describes
+the portion of the pipeline where the out-of-order scheduler
+dispatches ready uops into their respective execution units,
+and once completed these uops get retired according to
+program order. For example, stalls due to data-cache misses
+or stalls due to the divider unit being overloaded are both
+categorized under Backend Bound."""
     level = 1
     htoff = False
+    sample = []
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = 1 -(self.Frontend_Bound.compute(EV) + self.Bad_Speculation.compute(EV) + self.Retiring.compute(EV))
             self.thresh = (self.val > 0.2)
         except ZeroDivisionError:
-            #print "Backend_Bound zero division"
+            print_error("Backend_Bound zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -400,25 +468,32 @@ class Retiring:
     domain = "Slots"
     area = "RET"
     desc = """
-This category reflects slots utilized by useful work i.e. allocated uops that
-eventually get retired. Ideally, all pipeline slots would be attributed to the
-Retiring category. Retiring of 100% would indicate the maximum 4 uops retired
-per cycle has been achieved. Maximizing Retiring typically increases the
-Instruction-Per-Cycle metric. Note that a high Retiring value does not
-necessary mean there is no room for more performance. For example, Microcode
-assists are categorized under Retiring. They hurt performance and can often be
-avoided. A high Retiring value for non-vectorized code may be a good hint for
-programmer to consider vectorizing his code. Doing so essentially lets more
-computations be done without significantly increasing number of instructions
-thus improving the performance."""
+This category reflects slots utilized by useful work i.e.
+allocated uops that eventually get retired. Ideally, all
+pipeline slots would be attributed to the Retiring category.
+Retiring of 100% would indicate the maximum 4 uops retired
+per cycle has been achieved. Maximizing Retiring typically
+increases the Instruction-Per-Cycle metric. Note that a high
+Retiring value does not necessarily mean there is no room for
+more performance. For example, Microcode assists are
+categorized under Retiring. They hurt performance and can
+often be avoided. A high Retiring value for non-vectorized
+code may be a good hint for programmer to consider
+vectorizing his code. Doing so essentially lets more
+computations be done without significantly increasing number
+of instructions thus improving the performance."""
     level = 1
     htoff = False
+    sample = []
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = EV("UOPS_RETIRED.RETIRE_SLOTS", 1) / SLOTS(EV, 1 )
             self.thresh = (self.val > 0.7) | self.Microcode_Sequencer.thresh
         except ZeroDivisionError:
-            #print "Retiring zero division"
+            print_error("Retiring zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -428,21 +503,27 @@ class Base:
     domain = "Slots"
     area = "RET"
     desc = """
-This metric represents slots fraction where the CPU was retiring uops not
-originated from the microcode-sequencer. This correlates with total number of
-instructions used by the program. A uops-per-instruction ratio of 1 should be
-expected. While this is the most desirable of the top 4 categories, high
-values may still indicate areas for improvement. If possible focus on
-techniques that reduce instruction count or result in more efficient
-instructions generation such as vectorization."""
+This metric represents slots fraction where the CPU was
+retiring uops not originated from the microcode-sequencer.
+This correlates with total number of instructions used by
+the program. A uops-per-instruction ratio of 1 should be
+expected. While this is the most desirable of the top 4
+categories, high values may still indicate areas for
+improvement. If possible focus on techniques that reduce
+instruction count or result in more efficient instructions
+generation such as vectorization."""
     level = 2
     htoff = False
+    sample = ['INST_RETIRED.PREC_DIST']
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = self.Retiring.compute(EV) - self.Microcode_Sequencer.compute(EV )
             self.thresh = (self.val > 0.6) and self.parent.thresh
         except ZeroDivisionError:
-            #print "Base zero division"
+            print_error("Base zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -452,18 +533,24 @@ class Microcode_Sequencer:
     domain = "Slots"
     area = "RET"
     desc = """
-This metric represents slots fraction CPU was retiring uops fetched by the
-Microcode Sequencer (MS) ROM. The MS is used for CISC instructions not fully
-decoded by the default decoders (like repeat move strings), or by microcode
-assists used to address some operation modes (like in Floating Point assists)."""
+This metric represents slots fraction CPU was retiring uops
+fetched by the Microcode Sequencer (MS) ROM. The MS is used
+for CISC instructions not fully decoded by the default
+decoders (like repeat move strings), or by microcode assists
+used to address some operation modes (like in Floating Point
+assists)."""
     level = 2
     htoff = False
+    sample = ['IDQ.MS_UOPS']
+    errcount = 0
+    sibling = None
     def compute(self, EV):
         try:
             self.val = Retire_Uop_Fraction(EV, 2)* EV("IDQ.MS_UOPS", 2) / SLOTS(EV, 2 )
             self.thresh = (self.val > 0.05)
         except ZeroDivisionError:
-            #print "Microcode_Sequencer zero division"
+            print_error("Microcode_Sequencer zero division")
+            self.errcount += 1
             self.val = 0
             self.thresh = False
         return self.val
@@ -474,12 +561,14 @@ class Metric_IPC:
 Instructions Per Cycle (per logical thread)"""
     domain = "Metric"
     maxval = 5
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = IPC(EV, 0)
         except ZeroDivisionError:
-            print "IPC zero division"
+            print_error("IPC zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_CPI:
@@ -488,12 +577,14 @@ class Metric_CPI:
 Cycles Per Instruction (threaded)"""
     domain = "Metric"
     maxval = 0
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = CPI(EV, 0)
         except ZeroDivisionError:
-            print "CPI zero division"
+            print_error("CPI zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_CoreIPC:
@@ -501,13 +592,15 @@ class Metric_CoreIPC:
     desc = """
 Instructions Per Cycle (per physical core)"""
     domain = "CoreMetric"
-    maxval = 5
+    maxval = 2
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = CoreIPC(EV, 0)
         except ZeroDivisionError:
-            print "CoreIPC zero division"
+            print_error("CoreIPC zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_UPI:
@@ -516,12 +609,14 @@ class Metric_UPI:
 Uops Per Instruction"""
     domain = "Metric"
     maxval = 2
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = UPI(EV, 0)
         except ZeroDivisionError:
-            print "UPI zero division"
+            print_error("UPI zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_IPTB:
@@ -530,56 +625,65 @@ class Metric_IPTB:
 Instruction per taken branch"""
     domain = "Metric"
     maxval = 0
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = IPTB(EV, 0)
         except ZeroDivisionError:
-            print "IPTB zero division"
+            print_error("IPTB zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_BPTB:
     name = "BPTB"
     desc = """
-Branch instructions per taken branch. Can be used to approximate PGO-
-likelihood for non-loopy codes."""
+Branch instructions per taken branch. Can be used to
+approximate PGO-likelihood for non-loopy codes."""
     domain = "Metric"
     maxval = 0
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = BPTB(EV, 0)
         except ZeroDivisionError:
-            print "BPTB zero division"
+            print_error("BPTB zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_DSB_Coverage:
     name = "DSB_Coverage"
     desc = """
-Fraction of Uops delivered by the DSB (decoded instructions cache)"""
+Fraction of Uops delivered by the DSB (decoded instructions
+cache)"""
     domain = "Metric"
     maxval = 1
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = DSB_Coverage(EV, 0)
         except ZeroDivisionError:
-            print "DSB_Coverage zero division"
+            print_error("DSB_Coverage zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_MLP:
     name = "MLP"
     desc = """
-Memory-Level-Parallelism (average number of L1 miss demand load when there is
-at least 1 such miss)"""
+Memory-Level-Parallelism (average number of L1 miss demand
+load when there is at least 1 such miss)"""
     domain = "Metric"
     maxval = 10
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = MLP(EV, 0)
         except ZeroDivisionError:
-            print "MLP zero division"
+            print_error("MLP zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_GFLOPs:
@@ -588,41 +692,47 @@ class Metric_GFLOPs:
 Giga Floating Point Operations Per Second"""
     domain = "Metric"
     maxval = 100
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = GFLOPs(EV, 0)
         except ZeroDivisionError:
-            print "GFLOPs zero division"
+            print_error("GFLOPs zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_Turbo_Utilization:
     name = "Turbo_Utilization"
     desc = """
 Average Frequency Utilization relative nominal frequency"""
-    domain = "CoreMetric"
+    domain = "Metric"
     maxval = 10
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = Turbo_Utilization(EV, 0)
         except ZeroDivisionError:
-            print "Turbo_Utilization zero division"
+            print_error("Turbo_Utilization zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_Page_Walks_Use:
     name = "Page_Walks_Use"
     desc = """
-Fraction of cycles where the core's Page Walker is busy serving
-iTLB/Load/Store"""
+Fraction of cycles where the core's Page Walker is busy
+serving iTLB/Load/Store"""
     domain = "CoreClocks"
     maxval = 0
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = Page_Walks_Use(EV, 0)
         except ZeroDivisionError:
-            print "Page_Walks_Use zero division"
+            print_error("Page_Walks_Use zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_MUX:
@@ -631,12 +741,14 @@ class Metric_MUX:
 PerfMon Event Multiplexing accuracy indicator"""
     domain = "Clocks"
     maxval = 0
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = MUX(EV, 0)
         except ZeroDivisionError:
-            print "MUX zero division"
+            print_error("MUX zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_CLKS:
@@ -645,12 +757,14 @@ class Metric_CLKS:
 Per-thread actual clocks"""
     domain = "Count"
     maxval = 0
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = CLKS(EV, 0)
         except ZeroDivisionError:
-            print "CLKS zero division"
+            print_error("CLKS zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_CORE_CLKS:
@@ -659,12 +773,14 @@ class Metric_CORE_CLKS:
 Core actual clocks"""
     domain = "CoreClocks"
     maxval = 0
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = CORE_CLKS(EV, 0)
         except ZeroDivisionError:
-            print "CORE_CLKS zero division"
+            print_error("CORE_CLKS zero division")
+            self.errcount += 1
             self.val = 0

 class Metric_Time:
@@ -673,12 +789,14 @@ class Metric_Time:
 Run duration time in seconds"""
     domain = "Count"
     maxval = 0
+    errcount = 0

     def compute(self, EV):
         try:
             self.val = Time(EV, 0)
         except ZeroDivisionError:
-            print "Time zero division"
+            print_error("Time zero division")
+            self.errcount += 1
             self.val = 0

 # Schedule
@@ -730,40 +848,9 @@ def __init__(self, r):

         # siblings cross-tree

-        o["Frontend_Bound"].sibling = None
-        o["Frontend_Latency"].sibling = None
-        o["ITLB_Misses"].sibling = None
-        o["DSB_Switches"].sibling = None
-        o["LCP"].sibling = None
         o["MS_Switches"].sibling = o["Microcode_Sequencer"]
-        o["Frontend_Bandwidth"].sibling = None
-        o["MITE"].sibling = None
-        o["DSB"].sibling = None
-        o["LSD"].sibling = None
-        o["Bad_Speculation"].sibling = None
-        o["Backend_Bound"].sibling = None
-        o["Retiring"].sibling = None
-        o["Base"].sibling = None
         o["Microcode_Sequencer"].sibling = o["MS_Switches"]

-        # sampling events
-
-        o["Frontend_Bound"].sample = []
-        o["Frontend_Latency"].sample = []
-        o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED']
-        o["DSB_Switches"].sample = []
-        o["LCP"].sample = []
-        o["MS_Switches"].sample = []
-        o["Frontend_Bandwidth"].sample = []
-        o["MITE"].sample = []
-        o["DSB"].sample = []
-        o["LSD"].sample = []
-        o["Bad_Speculation"].sample = ['INT_MISC.RECOVERY_CYCLES']
-        o["Backend_Bound"].sample = []
-        o["Retiring"].sample = []
-        o["Base"].sample = ['INST_RETIRED.PREC_DIST']
-        o["Microcode_Sequencer"].sample = ['IDQ.MS_UOPS']
-
         # user visible metrics

         n = Metric_IPC() ; r.metric(n)
diff --git a/tl-tester b/tl-tester
index 48fd770d..88bbbe8c 100755
--- a/tl-tester
+++ b/tl-tester
@@ -44,7 +44,7 @@ EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py --no-desc -d -l4 $LOAD
 EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py --no-desc -v -d -l4 $LOAD
 EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py --no-desc -x, -v -d -l4 $LOAD
 EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py --no-desc --metrics -x, -v -d -l4 $LOAD
-EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py -g --raw -v --stats -d $ALL --kernel $LOAD | tee log
+EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py -g --raw -v --debug --stats -d $ALL --kernel $LOAD | tee log
 EVENTMAP=${cpus[ivb]} FORCECPU=ivb $WRAP ./toplev.py --no-desc --stats -d $ALL --kernel --no-multiplex $LOAD | tee log
 grep :k log
 grep /k log
diff --git a/toplev.py b/toplev.py
index 4bb7a5ad..4b0e48c0 100755
--- a/toplev.py
+++ b/toplev.py
@@ -633,20 +633,34 @@ def core_fmt(core):
 def thread_fmt(j):
     return core_fmt(key_to_coreid(j)) + ("-T%d" % cpu.cputothread[int(j)])

-def referenced_check(res, referenced, already_warned):
-    if referenced in already_warned:
-        return
-    already_warned.append(referenced)
-
-    # sanity check: did we reference all results?
-    if len(res.keys()) > 0:
-        r = res[res.keys()[0]]
-        if len(referenced) != len(r):
-            print >>sys.stderr, "warning: %d results not referenced:" % (len(r) - len(referenced)),
-            print >>sys.stderr, " ".join(["%d" % x for x in sorted(set(range(len(r))) - referenced)])
+class ComputeStat:
+    def __init__(self):
+        self.referenced = set()
+        self.already_warned = set()
+        self.errcount = 0
+        self.errors = set()
+
+    def referenced_check(self, res):
+        referenced = self.referenced
+        referenced = referenced - self.already_warned
+        if not referenced:
+            return
+        self.already_warned |= referenced
+
+        # sanity check: did we reference all results?
+        if len(res.keys()) > 0:
+            r = res[res.keys()[0]]
+            if len(referenced) != len(r):
+                print >>sys.stderr, "warning: %d results not referenced:" % (len(r) - len(referenced)),
+                print >>sys.stderr, " ".join(["%d" % x for x in sorted(set(range(len(r))) - referenced)])
+
+    def compute_errors(self):
+        if self.errcount > 0:
+            print >>sys.stderr, "warning: %d division by zero errors" % self.errcount
+            print >>sys.stderr, " ".join(self.errors)

 def print_keys(runner, res, rev, out, interval, env):
-    referenced = set()
+    stat = runner.stat
     if smt_mode:
         # collect counts from all threads of cores as lists
         # this way the model can access all threads individually
@@ -655,18 +669,15 @@ def print_keys(runner, res, rev, out, interval, env):
         for core, citer in itertools.groupby(core_keys, key_to_coreid):
             cpus = list(citer)
             r = list(itertools.izip(*[res[j] for j in cpus]))
-            runner.print_res(r, rev[cpus[0]], out, interval, core_fmt(core), env, Runner.SMT_yes,
-                             referenced)
+            runner.print_res(r, rev[cpus[0]], out, interval, core_fmt(core), env, Runner.SMT_yes, stat)
         # print the non SMT nodes
         for j in sorted(res.keys()):
-            runner.print_res(res[j], rev[j], out, interval, thread_fmt(j), env, Runner.SMT_no,
-                             referenced)
+            runner.print_res(res[j], rev[j], out, interval, thread_fmt(j), env, Runner.SMT_no, stat)
     else:
         for j in sorted(res.keys()):
-            runner.print_res(res[j], rev[j], out, interval, j, env, Runner.SMT_dontcare,
-                             referenced)
+            runner.print_res(res[j], rev[j], out, interval, j, env, Runner.SMT_dontcare, stat)

-    referenced_check(res, referenced, runner.already_warned)
+    stat.referenced_check(res)

 def is_outgroup(x):
     return set(x) - outgroup_events == set()
@@ -957,8 +968,8 @@ def __init__(self, max_level):
         self.olist = []
         self.max_level = max_level
         self.missed = 0
-        self.already_warned = []
         self.sample_obj = set()
+        self.stat = ComputeStat()

     def do_run(self, obj):
         obj.res = None
@@ -1094,7 +1105,7 @@ def schedule(self):
                 len(self.olist),
                 self.missed)

-    def print_res(self, res, rev, out, timestamp, title, env, smt, referenced):
+    def print_res(self, res, rev, out, timestamp, title, env, smt, stat):
         if len(res) == 0:
             print "Nothing measured?"
             return
@@ -1104,7 +1115,10 @@ def print_res(self, res, rev, out, timestamp, title, env, smt, stat):
             out.set_hdr(full_name(obj), obj.area if has(obj, 'area') else None)
             if obj.res_map:
                 obj.compute(lambda e, level:
-                            lookup_res(res, rev, e, obj, env, level, referenced))
+                            lookup_res(res, rev, e, obj, env, level, stat.referenced))
+                if has(obj, 'errcount') and obj.errcount > 0:
+                    stat.errors.add(obj.name)
+                    stat.errcount += obj.errcount
             elif obj.name != "Time":
                 print >>sys.stderr, "%s not measured" % (obj.__class__.__name__,)
         out.logf.flush()
@@ -1202,36 +1216,51 @@ def ht_warning():

 runner = Runner(args.level)

+pe = lambda x: None
+if args.debug:
+    pe = lambda x: sys.stdout.write(x + "\n")
+
 if cpu.cpu == "ivb":
     import ivb_client_ratios
     ivb_client_ratios.smt_enabled = cpu.ht
     smt_mode = cpu.ht
+    ivb_client_ratios.print_error = pe
     ivb_client_ratios.Setup(runner)
 elif cpu.cpu == "ivt":
     import ivb_server_ratios
     ivb_server_ratios.smt_enabled = cpu.ht
     smt_mode = cpu.ht
+    ivb_server_ratios.print_error = pe
     ivb_server_ratios.Setup(runner)
 elif cpu.cpu == "snb":
     import snb_client_ratios
+    snb_client_ratios.smt_enabled = cpu.ht
+    smt_mode = cpu.ht
+    snb_client_ratios.print_error = pe
     snb_client_ratios.Setup(runner)
 elif cpu.cpu == "jkt":
     import jkt_server_ratios
+    jkt_server_ratios.smt_enabled = cpu.ht
+    smt_mode = cpu.ht
+    jkt_server_ratios.print_error = pe
     jkt_server_ratios.Setup(runner)
 elif cpu.cpu == "hsw":
     import hsw_client_ratios
     hsw_client_ratios.smt_enabled = cpu.ht
     smt_mode = cpu.ht
+    hsw_client_ratios.print_error = pe
     hsw_client_ratios.Setup(runner)
 elif cpu.cpu == "hsx":
     import hsx_server_ratios
     hsx_server_ratios.smt_enabled = cpu.ht
     smt_mode = cpu.ht
+    hsx_server_ratios.print_error = pe
     hsx_server_ratios.Setup(runner)
 elif cpu.cpu == "bdw":
     import bdw_client_ratios
     bdw_client_ratios.smt_enabled = cpu.ht
     smt_mode = cpu.ht
+    bdw_client_ratios.print_error = pe
     bdw_client_ratios.Setup(runner)
 elif cpu.cpu == "slm":
     import slm_ratios
@@ -1304,6 +1333,7 @@ def setup_with_metrics(p, runner):
     ret = execute_no_multiplex(runner, out, rest)
 else:
     ret = execute(runner, out, rest)
+runner.stat.compute_errors()
 if args.show_sample or args.run_sample:
     print_sample(runner.sample_obj, rest)
 sys.exit(ret)
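
Note, for readers of the patch rather than the code: the error-accounting pattern repeated in every node and metric class above can be seen in isolation in the short standalone sketch below. It is illustrative only; DemoNode, DemoStat and the dummy ratio are hypothetical names invented for this note, not classes from the patch. Each node swallows ZeroDivisionError, reports it through an injectable print_error hook (silent unless debugging is requested), bumps its own errcount, and an aggregator in the style of ComputeStat prints one summary at the end of the run.

import sys

# Injectable hook: silent by default; a driver may swap in a real printer
# (for example only when a --debug style flag is given).
print_error = lambda msg: False

class DemoNode(object):
    # Hypothetical stand-in for one metric node (not part of the patch).
    name = "DemoNode"
    errcount = 0

    def compute(self, num, den):
        try:
            self.val = num / float(den)
        except ZeroDivisionError:
            print_error("DemoNode zero division")  # per-node message, normally suppressed
            self.errcount += 1                     # remember this node's value is unreliable
            self.val = 0
        return self.val

class DemoStat(object):
    # Hypothetical aggregator mirroring the ComputeStat error accounting.
    def __init__(self):
        self.errcount = 0
        self.errors = set()

    def account(self, obj):
        if obj.errcount > 0:
            self.errors.add(obj.name)
            self.errcount += obj.errcount

    def compute_errors(self):
        if self.errcount > 0:
            sys.stderr.write("warning: %d division by zero errors\n" % self.errcount)
            sys.stderr.write(" ".join(sorted(self.errors)) + "\n")

stat = DemoStat()
node = DemoNode()
node.compute(1, 0)     # zero denominator: swallowed, counted, value forced to 0
stat.account(node)
stat.compute_errors()  # one consolidated warning at the end of the run

Keeping the default hook a no-op preserves quiet output for ordinary runs, while the final count still flags which ratios were computed from empty or zero event counts.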