diff --git a/README.md b/README.md index f97877f3..44eb248e 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ analysis on Intel CPUs on top of [Linux perf](https://perf.wiki.kernel.org/index # Recent new features: +* toplev can now automatically sample workloads with --run-sample * Added cputop utility to easily enable/disable hyper threading * toplev updated to TopDown 2.9: - Many fixes to SMT support. SMT now supported on Haswell. diff --git a/hsw_client_ratios.py b/hsw_client_ratios.py index fb4fe8ce..5fcf0418 100644 --- a/hsw_client_ratios.py +++ b/hsw_client_ratios.py @@ -1482,7 +1482,7 @@ def __init__(self, r): o["MEM_Bandwidth"].sample = [] o["MEM_Latency"].sample = [] o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] - o["False_Sharing"].sample = [' MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE:request=DEMAND_RFO:response=L3_HIT.SNOOP_HITM'] + o["False_Sharing"].sample = ['MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE'] o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] o["Core_Bound"].sample = [] diff --git a/hsx_server_ratios.py b/hsx_server_ratios.py index 4a354a5a..a5cb5528 100644 --- a/hsx_server_ratios.py +++ b/hsx_server_ratios.py @@ -1528,7 +1528,7 @@ def __init__(self, r): o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] o["Core_Bound"].sample = [] - o["Divider"].sample = ['ARITH.FPU_DIV_ACTIVE'] + o["Divider"].sample = [] o["Ports_Utilization"].sample = [] o["G0_Ports_Utilized"].sample = [] o["G1_Port_Utilized"].sample = [] diff --git a/ivb_client_ratios.py b/ivb_client_ratios.py index 2881660d..7373a4af 100644 --- a/ivb_client_ratios.py +++ b/ivb_client_ratios.py @@ -1852,12 +1852,12 @@ def __init__(self, r): o["Data_Sharing"].sample = 
['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp'] o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] o["SQ_Full"].sample = [] - o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] + o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_MISS:pp'] o["MEM_Bandwidth"].sample = [] o["MEM_Latency"].sample = [] o["Stores_Bound"].sample = ['MEM_UOPS_RETIRED.ALL_STORES:pp'] o["Store_Latency"].sample = [] - o["False_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_Other_CORE_0'] + o["False_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp', 'OFFCORE_RESPONSE.DEMAND_RFO.LLC_HIT.HITM_Other_CORE_0'] o["Split_Stores"].sample = ['MEM_UOPS_RETIRED.SPLIT_STORES:pp'] o["DTLB_Store"].sample = ['MEM_UOPS_RETIRED.STLB_MISS_STORES:pp'] o["Core_Bound"].sample = [] diff --git a/ivb_server_ratios.py b/ivb_server_ratios.py index 881f2baa..348d1eec 100644 --- a/ivb_server_ratios.py +++ b/ivb_server_ratios.py @@ -1895,7 +1895,7 @@ def __init__(self, r): o["Data_Sharing"].sample = ['MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM:pp'] o["L3_Latency"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_HIT:pp'] o["SQ_Full"].sample = [] - o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.L3_MISS:pp'] + o["MEM_Bound"].sample = ['MEM_LOAD_UOPS_RETIRED.LLC_MISS:pp'] o["MEM_Bandwidth"].sample = [] o["MEM_Latency"].sample = [] o["Local_DRAM"].sample = ['MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM:pp'] diff --git a/jkt_server_ratios.py b/jkt_server_ratios.py index 9d03e131..02bf569d 100644 --- a/jkt_server_ratios.py +++ b/jkt_server_ratios.py @@ -749,7 +749,7 @@ def __init__(self, r): # sampling events o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] + o["Frontend_Latency"].sample = [] o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] o["DSB_Switches"].sample = [] o["LCP"].sample = [] diff --git a/ocperf.py b/ocperf.py index c1c05e62..b45017c2 
100755 --- a/ocperf.py +++ b/ocperf.py @@ -121,7 +121,7 @@ def __init__(self, name, val, desc): self.msrval = 0 self.desc = desc - def output_newstyle(self, newextra="", noname=False, period=False): + def output_newstyle(self, newextra="", noname=False, period=False, name=""): """Format an perf event for output and return as perf event string. Always uses new style (cpu/.../).""" val = self.val @@ -129,13 +129,16 @@ def output_newstyle(self, newextra="", noname=False, period=False): if newextra: extra += "," + newextra e = "event=0x%x,umask=0x%x%s" % (val & 0xff, (val >> 8) & 0xff, extra) - if version.has_name and not noname: - e += ",name=%s" % (self.name.replace(".", "_"),) + if version.has_name: + if name: + e += ",name=" + name + elif not noname: + e += ",name=%s" % (self.name.replace(".", "_"),) if period and self.period: e += ",period=%d" % self.period return e - def output(self, use_raw=False, flags="", noname=False, period=False): + def output(self, use_raw=False, flags="", noname=False, period=False, name=""): """Format an event for output and return as perf event string. use_raw when true return old style perf string (rXXX). 
Otherwise chose between old and new style based on the @@ -165,7 +168,7 @@ def output(self, use_raw=False, flags="", noname=False, period=False): if extra: ename += ":" + extra else: - ename = "cpu/%s/" % (self.output_newstyle(newextra=",".join(newe), noname=noname, period=period)) + extra + ename = "cpu/%s/" % (self.output_newstyle(newextra=",".join(newe), noname=noname, period=period, name=name)) + extra return ename box_to_perf = { diff --git a/snb_client_ratios.py b/snb_client_ratios.py index b5acb576..5c1a58f5 100644 --- a/snb_client_ratios.py +++ b/snb_client_ratios.py @@ -749,7 +749,7 @@ def __init__(self, r): # sampling events o["Frontend_Bound"].sample = [] - o["Frontend_Latency"].sample = ['RS_EVENTS.EMPTY_END'] + o["Frontend_Latency"].sample = [] o["ITLB_Misses"].sample = ['ITLB_MISSES.WALK_COMPLETED'] o["DSB_Switches"].sample = [] o["LCP"].sample = [] diff --git a/tl-tester b/tl-tester index 5b7e2578..cdbbafb9 100755 --- a/tl-tester +++ b/tl-tester @@ -102,6 +102,13 @@ EVENTMAP=${cpus[snb]} FORCEHT=0 FORCECPU=snb $WRAP ./toplev.py -d -l4 -I 1000 -a EVENTMAP=${cpus[snb]} FORCEHT=0 FORCECPU=snb $WRAP ./toplev.py -d -l4 -I 1000 -a --per-socket sleep 1 EVENTMAP=${cpus[snb]} FORCEHT=0 FORCECPU=snb $WRAP ./toplev.py -d --no-desc -l4 -I 1000 -a -A sleep 1 +$WRAP ./toplev.py -o /dev/null --no-desc -v -l5 --run-sample $LOAD +for cpu in $ALLCPUS ; do +EVENTMAP=${cpus[$cpu]} FORCECPU=$cpu $WRAP ./toplev.py -o /dev/null --no-desc -v --all --show-sample $LOAD >&log +cat log +grep "not found" log && exit 1 +done + trap "" ERR 0 echo diff --git a/toplev.py b/toplev.py index 79098d34..93b33b10 100755 --- a/toplev.py +++ b/toplev.py @@ -45,6 +45,17 @@ "cpu/event=0x0,umask=0x3,any=1/" : 2, } +# handle kernels that don't support all events +unsup_pebs = { + "BR_MISP_RETIRED.ALL_BRANCHES:pp": (("hsw",), (3, 18)), + "MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM:pp": (("hsw",), (3, 18)), + "MEM_LOAD_UOPS_RETIRED.L3_MISS:pp": (("hsw",), (3, 18)), +} + +unsup_events = { + 
"OFFCORE_RESPONSE.DEMAND_RFO.L3_HIT.HITM_OTHER_CORE": (("hsw",), (3, 18)), +} + ingroup_events = frozenset(fixed_to_num.keys()) outgroup_events = set() @@ -83,6 +94,14 @@ def __init__(self): sys.exit("perf binary is too old. please upgrade") self.supports_power = works(perf + " list | grep -q power/") +def unsup_event(e, table): + """Return True when table lists event e for the current CPU and the running kernel is older than the (major, minor) version recorded there.""" + if e in table: + cpus, min_kernel = table[e] + # Compare (major, minor) as one tuple; testing each field on its own misjudges e.g. 3.19 against (4, 2). + return cpu.cpu in cpus and tuple(kernel_version[:2]) < tuple(min_kernel) + return False + def needed_limited_counter(evlist, limit_table, limit_set): limited_only = set(evlist) & set(limit_set) assigned = Counter([limit_table[x] for x in limited_only]).values() @@ -191,8 +210,6 @@ def event_group(evlist): formatter_class=argparse.RawDescriptionHelpFormatter) p.add_argument('--verbose', '-v', help='Print all results even when below threshold', action='store_true') -p.add_argument('--force', help='Force potentially broken configurations', - action='store_true') p.add_argument('--kernel', help='Only measure kernel code', action='store_true') p.add_argument('--user', help='Only measure user code', action='store_true') p.add_argument('--print-group', '-g', help='Print event group assignments', @@ -221,6 +238,8 @@ def event_group(evlist): p.add_argument('--no-multiplex', help='Do not multiplex, but run the workload multiple times as needed. 
Requires reproducible workloads.', action='store_true') +p.add_argument('--show-sample', help='Show command line to rerun workload with sampling', action='store_true') +p.add_argument('--run-sample', help='Automatically rerun workload with sampling', action='store_true') p.add_argument('--stats', help='Show statistics on what events counted', action='store_true') p.add_argument('--power', help='Display power metrics', action='store_true') p.add_argument('--version', help=argparse.SUPPRESS, action='store_true') @@ -262,7 +281,6 @@ def event_group(evlist): detailed_model = (args.level > 1) or args.detailed csv_mode = args.csv interval_mode = args.interval -force = args.force ring_filter = "" if args.kernel: ring_filter = 'k' @@ -496,7 +514,7 @@ def add_filter(s): s = [x + separator(x) + ring_filter for x in s] return s -def raw_event(i): +def raw_event(i, name="", period=False): if i.count(".") > 0: if i in fixed_counters: return fixed_counters[i] @@ -506,10 +524,8 @@ def raw_event(i): e = emap.getevent(event_fixes[i]) if e is None: print >>sys.stderr, "%s not found" % (i,) - if not force: - sys.exit(1) - return "cycles" # XXX - i = e.output(noname=True) + return None + i = e.output(noname=True, name=name, period=period) emap.update_event(e.output(noname=True), e) if e.counter != cpu.standard_counters: # for now only use the first counter only to simplify @@ -914,6 +930,7 @@ def __init__(self, max_level): self.max_level = max_level self.missed = 0 self.already_warned = [] + self.sample_obj = set() def do_run(self, obj): obj.res = None @@ -1092,6 +1109,36 @@ def print_res(self, res, rev, out, timestamp, title, env, smt, referenced): desc + disclaimer, title, sample_desc(obj.sample) if obj.sample else "") + if obj.thresh or args.verbose: + self.sample_obj.add(obj) + +def remove_pp(s): + """Return s without a trailing ":pp" modifier.""" + return s[:-3] if s.endswith(":pp") else s + +def print_sample(sample_obj, rest): + """Print a perf record command line for the sample events of the objects in sample_obj, followed by rest (minus "-A"), and run it when --run-sample was given. Does nothing when there are no sample events; warns and returns when none of them is usable."""
+ samples = [(s, obj.name) for obj in sample_obj for s in obj.sample] + if not samples: + return + nsamp = [x for x in samples if not unsup_event(x[0], unsup_events)] + nsamp = [(remove_pp(x[0]), x[1]) if unsup_event(x[0], unsup_pebs) else x + for x in nsamp] + if nsamp != samples: + missing = [x[0] for x in set(samples) - set(nsamp)] + print >>sys.stderr, "warning: update kernel to handle sample events:" + print >>sys.stderr, "\n".join(missing) + sl = [raw_event(s[0], s[1], period=True) for s in nsamp] + sample = ",".join([x for x in sl if x]) + if not sample: + print >>sys.stderr, "warning: no sample events usable on this system" + return + print "Sampling:" + sperf = [perf, "record", "-g", "-e", sample] + [x for x in rest if x != "-A"] + print " ".join(sperf) + if args.run_sample: + os.system(" ".join(sperf)) + print "Run `" + perf + " report' to show the sampling results" def sysctl(name): try: @@ -1222,4 +1269,6 @@ def setup_with_metrics(p, runner): ret = execute_no_multiplex(runner, out, rest) else: ret = execute(runner, out, rest) +if args.show_sample or args.run_sample: + print_sample(runner.sample_obj, rest) sys.exit(ret)