From 4f9a03f5ca658e5e643a3147b287877b5c3da9a0 Mon Sep 17 00:00:00 2001 From: Carlos Date: Thu, 23 May 2024 11:12:44 +0200 Subject: [PATCH] migration: ubench fixes (#46) * migration: ubench fixes * migration: more fixes --- tasks/migration/README.md | 4 +-- tasks/migration/oracle.py | 29 +++++++++++----------- tasks/migration/plot.py | 22 +++++++++++++---- tasks/migration/run.py | 52 +++++++++++++++++++++++++-------------- 4 files changed, 67 insertions(+), 40 deletions(-) diff --git a/tasks/migration/README.md b/tasks/migration/README.md index ef9842f..aaed8b1 100644 --- a/tasks/migration/README.md +++ b/tasks/migration/README.md @@ -6,8 +6,8 @@ applications to benefit from dynamic changes in the compute environment. First, provision the cluster: ```bash -(faasm-exp-pase) inv cluster.provision --vm Standard_D8_v5 --nodes 3 --name ${CLUSTER_NAME} -(faasm-exp-base) inv cluster.credentials --name ${CLUSTER_NAME} +inv cluster.provision --vm Standard_D8_v5 --nodes 3 --name ${CLUSTER_NAME} +inv cluster.credentials --name ${CLUSTER_NAME} ``` Second, deploy the cluster diff --git a/tasks/migration/oracle.py b/tasks/migration/oracle.py index 9557cc5..a4d8657 100644 --- a/tasks/migration/oracle.py +++ b/tasks/migration/oracle.py @@ -2,7 +2,7 @@ from glob import glob from invoke import task from math import ceil -from matplotlib.pyplot import savefig, subplots +from matplotlib.pyplot import subplots from os import makedirs from os.path import basename, join from random import sample @@ -24,6 +24,7 @@ get_lammps_data_file, get_lammps_migration_params, ) +from tasks.util.plot import save_plot from time import sleep @@ -54,14 +55,14 @@ def calculate_cross_vm_links(part): @task() -def run(ctx, workload="network", nprocs=None): +def run(ctx, workload="very-network", nprocs=None): """ Experiment to measure the benefits of migration in isolation """ # Work out the number of processes to run with - num_procs = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16] + num_procs = [2, 3, 4, 5, 6, 7, 8] # , 9, 10, 11, 12, 13, 14, 15, 16] num_cpus_per_vm = 8 - num_vms = 16 + num_vms = 8 # 16 if nprocs is not None: num_procs = [int(nprocs)] @@ -160,12 +161,9 @@ def do_write_csv_line(csv_name, part, xvm_links, actual_time): @task -def plot(ctx): +def plot(ctx, workload="very-network"): plots_dir = join(PLOTS_ROOT, "migration") makedirs(plots_dir, exist_ok=True) - out_file = join( - plots_dir, "migration_oracle_{}.pdf".format(LAMMPS_SIM_WORKLOAD) - ) results_dir = join(PROJ_ROOT, "results", "migration") result_dict = {} @@ -173,7 +171,7 @@ def plot(ctx): for csv in glob( join( results_dir, - "migration_oracle_{}_*.csv".format(LAMMPS_SIM_WORKLOAD), + "migration_oracle_{}_*.csv".format(workload), ) ): num_procs = csv.split("_")[-1].split(".")[0] @@ -193,13 +191,14 @@ def plot(ctx): float(line.split(",")[-1]) ) - print(result_dict) num_plots = len(result_dict) num_cols = 4 num_rows = ceil(num_plots / num_cols) fig, axes = subplots(nrows=num_rows, ncols=num_cols) fig.suptitle( - "Correlation between execution time (Y) and x-VM links (X)\n(wload: compute)" + "Correlation between execution time (Y) and x-VM links (X)\n(wload: {})".format( + workload + ) ) def do_plot(ax, results, num_procs): @@ -213,6 +212,8 @@ def do_plot(ax, results, num_procs): axes[int(i / 4)][int(i % 4)], result_dict[num_procs], num_procs ) - fig.tight_layout() - savefig(out_file, format="pdf") # , bbox_inches="tight") - print("Plot saved to: {}".format(out_file)) + save_plot( + fig, + join(PLOTS_ROOT, "migration"), + "migration_oracle_{}".format(workload), + ) diff --git a/tasks/migration/plot.py b/tasks/migration/plot.py index a35332d..55578cd 100644 --- a/tasks/migration/plot.py +++ b/tasks/migration/plot.py @@ -6,10 +6,16 @@ from pandas import read_csv from tasks.util.env import PROJ_ROOT from tasks.util.migration import MIGRATION_PLOTS_DIR -from tasks.util.plot import save_plot +from tasks.util.plot import UBENCH_PLOT_COLORS, save_plot -ALL_WORKLOADS = ["compute", "network"] +ALL_WORKLOADS = [ + "all-to-all", + "compute", + "network", + "og-network", + "very-network", +] def _read_results(): @@ -43,8 +49,11 @@ def plot(ctx): """ migration_results = _read_results() - do_plot("compute", migration_results) - do_plot("network", migration_results) + do_plot("all-to-all", migration_results) + # do_plot("compute", migration_results) + # do_plot("network", migration_results) + do_plot("very-network", migration_results) + # do_plot("og-network", migration_results) def do_plot(workload, migration_results): @@ -63,11 +72,14 @@ def do_plot(workload, migration_results): ) ) + color_idx = list(migration_results.keys()).index(workload) + ax.bar( xticks, ys, width, label=workload, + color=UBENCH_PLOT_COLORS[color_idx], edgecolor="black", ) @@ -86,7 +98,7 @@ def do_plot(workload, migration_results): 1.5, "{:.1f}".format(ys[0]), rotation="vertical", - fontsize=6, + fontsize=8, bbox={ "boxstyle": "Square, pad=0.2", "edgecolor": "black", diff --git a/tasks/migration/run.py b/tasks/migration/run.py index 94cec3e..131753d 100644 --- a/tasks/migration/run.py +++ b/tasks/migration/run.py @@ -4,7 +4,11 @@ from os import makedirs from os.path import basename, join from tasks.migration.util import generate_host_list -from tasks.util.env import RESULTS_DIR +from tasks.util.env import ( + MPI_MIGRATE_FAASM_FUNC, + MPI_MIGRATE_FAASM_USER, + RESULTS_DIR, +) from tasks.util.faasm import ( get_faasm_exec_time_from_json, post_async_msg_and_get_result_json, @@ -55,16 +59,22 @@ def run(ctx, w, check_in=None, repeats=1, num_cores_per_vm=8): check_array = [int(check_in)] for workload in w: - if workload not in LAMMPS_SIM_WORKLOAD_CONFIGS: + if ( + workload != "all-to-all" + and workload not in LAMMPS_SIM_WORKLOAD_CONFIGS + ): print( "Unrecognised workload config ({}) must be one in: {}".format( workload, LAMMPS_SIM_WORKLOAD.keys() ) ) - workload_config = LAMMPS_SIM_WORKLOAD_CONFIGS[workload] - data_file = basename( - get_lammps_data_file(workload_config["data_file"])["data"][0] - ) + raise RuntimeError("Unrecognised workload: {}".format(workload)) + + if workload != "all-to-all": + workload_config = LAMMPS_SIM_WORKLOAD_CONFIGS[workload] + data_file = basename( + get_lammps_data_file(workload_config["data_file"])["data"][0] + ) csv_name = "migration_{}.csv".format(workload) _init_csv_file(csv_name) @@ -75,14 +85,12 @@ def run(ctx, w, check_in=None, repeats=1, num_cores_per_vm=8): # Print progress print( - "Running migration micro-benchmark (wload:" + "Running migration micro-benchmark (wload: " + "{} - check-at: {} - repeat: {}/{})".format( workload, check, run_num + 1, repeats ) ) - """ - TODO: do we want to keep the all-to-all baseline? if workload == "all-to-all": num_loops = 100000 user = MPI_MIGRATE_FAASM_USER @@ -90,22 +98,28 @@ def run(ctx, w, check_in=None, repeats=1, num_cores_per_vm=8): cmdline = "{} {}".format( check if check != 0 else 5, num_loops ) - """ + input_data = None + else: + user = LAMMPS_FAASM_USER + func = LAMMPS_FAASM_MIGRATION_NET_FUNC + cmdline = "-in faasm://lammps-data/{}".format(data_file) + input_data = get_lammps_migration_params( + check_every=check if check != 0 else 5, + num_loops=5, + num_net_loops=workload_config["num_net_loops"], + chunk_size=workload_config["chunk_size"], + ) - # Run LAMMPS - cmdline = "-in faasm://lammps-data/{}".format(data_file) msg = { - "user": LAMMPS_FAASM_USER, - "function": LAMMPS_FAASM_MIGRATION_NET_FUNC, + "user": user, + "function": func, "cmdline": cmdline, "mpi_world_size": int(num_cores_per_vm), - "input_data": get_lammps_migration_params( - num_loops=5, - num_net_loops=workload_config["num_net_loops"], - chunk_size=workload_config["chunk_size"], - ), } + if input_data is not None: + msg["input_data"] = input_data + if check == 0: # Setting a check fraction of 0 means we don't # under-schedule. We use it as a baseline