Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

migration: ubench fixes #46

Merged
merged 2 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions tasks/migration/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ applications to benefit from dynamic changes in the compute environment.
First, provision the cluster:

```bash
(faasm-exp-pase) inv cluster.provision --vm Standard_D8_v5 --nodes 3 --name ${CLUSTER_NAME}
(faasm-exp-base) inv cluster.credentials --name ${CLUSTER_NAME}
inv cluster.provision --vm Standard_D8_v5 --nodes 3 --name ${CLUSTER_NAME}
inv cluster.credentials --name ${CLUSTER_NAME}
```

Second, deploy the cluster
Expand Down
29 changes: 15 additions & 14 deletions tasks/migration/oracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from glob import glob
from invoke import task
from math import ceil
from matplotlib.pyplot import savefig, subplots
from matplotlib.pyplot import subplots
from os import makedirs
from os.path import basename, join
from random import sample
Expand All @@ -24,6 +24,7 @@
get_lammps_data_file,
get_lammps_migration_params,
)
from tasks.util.plot import save_plot
from time import sleep


Expand Down Expand Up @@ -54,14 +55,14 @@ def calculate_cross_vm_links(part):


@task()
def run(ctx, workload="network", nprocs=None):
def run(ctx, workload="very-network", nprocs=None):
"""
Experiment to measure the benefits of migration in isolation
"""
# Work out the number of processes to run with
num_procs = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
num_procs = [2, 3, 4, 5, 6, 7, 8] # , 9, 10, 11, 12, 13, 14, 15, 16]
num_cpus_per_vm = 8
num_vms = 16
num_vms = 8 # 16
if nprocs is not None:
num_procs = [int(nprocs)]

Expand Down Expand Up @@ -160,20 +161,17 @@ def do_write_csv_line(csv_name, part, xvm_links, actual_time):


@task
def plot(ctx):
def plot(ctx, workload="very-network"):
plots_dir = join(PLOTS_ROOT, "migration")
makedirs(plots_dir, exist_ok=True)
out_file = join(
plots_dir, "migration_oracle_{}.pdf".format(LAMMPS_SIM_WORKLOAD)
)

results_dir = join(PROJ_ROOT, "results", "migration")
result_dict = {}

for csv in glob(
join(
results_dir,
"migration_oracle_{}_*.csv".format(LAMMPS_SIM_WORKLOAD),
"migration_oracle_{}_*.csv".format(workload),
)
):
num_procs = csv.split("_")[-1].split(".")[0]
Expand All @@ -193,13 +191,14 @@ def plot(ctx):
float(line.split(",")[-1])
)

print(result_dict)
num_plots = len(result_dict)
num_cols = 4
num_rows = ceil(num_plots / num_cols)
fig, axes = subplots(nrows=num_rows, ncols=num_cols)
fig.suptitle(
"Correlation between execution time (Y) and x-VM links (X)\n(wload: compute)"
"Correlation between execution time (Y) and x-VM links (X)\n(wload: {})".format(
workload
)
)

def do_plot(ax, results, num_procs):
Expand All @@ -213,6 +212,8 @@ def do_plot(ax, results, num_procs):
axes[int(i / 4)][int(i % 4)], result_dict[num_procs], num_procs
)

fig.tight_layout()
savefig(out_file, format="pdf") # , bbox_inches="tight")
print("Plot saved to: {}".format(out_file))
save_plot(
fig,
join(PLOTS_ROOT, "migration"),
"migration_oracle_{}".format(workload),
)
22 changes: 17 additions & 5 deletions tasks/migration/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,16 @@
from pandas import read_csv
from tasks.util.env import PROJ_ROOT
from tasks.util.migration import MIGRATION_PLOTS_DIR
from tasks.util.plot import save_plot
from tasks.util.plot import UBENCH_PLOT_COLORS, save_plot


ALL_WORKLOADS = ["compute", "network"]
ALL_WORKLOADS = [
"all-to-all",
"compute",
"network",
"og-network",
"very-network",
]


def _read_results():
Expand Down Expand Up @@ -43,8 +49,11 @@ def plot(ctx):
"""
migration_results = _read_results()

do_plot("compute", migration_results)
do_plot("network", migration_results)
do_plot("all-to-all", migration_results)
# do_plot("compute", migration_results)
# do_plot("network", migration_results)
do_plot("very-network", migration_results)
# do_plot("og-network", migration_results)


def do_plot(workload, migration_results):
Expand All @@ -63,11 +72,14 @@ def do_plot(workload, migration_results):
)
)

color_idx = list(migration_results.keys()).index(workload)

ax.bar(
xticks,
ys,
width,
label=workload,
color=UBENCH_PLOT_COLORS[color_idx],
edgecolor="black",
)

Expand All @@ -86,7 +98,7 @@ def do_plot(workload, migration_results):
1.5,
"{:.1f}".format(ys[0]),
rotation="vertical",
fontsize=6,
fontsize=8,
bbox={
"boxstyle": "Square, pad=0.2",
"edgecolor": "black",
Expand Down
52 changes: 33 additions & 19 deletions tasks/migration/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,11 @@
from os import makedirs
from os.path import basename, join
from tasks.migration.util import generate_host_list
from tasks.util.env import RESULTS_DIR
from tasks.util.env import (
MPI_MIGRATE_FAASM_FUNC,
MPI_MIGRATE_FAASM_USER,
RESULTS_DIR,
)
from tasks.util.faasm import (
get_faasm_exec_time_from_json,
post_async_msg_and_get_result_json,
Expand Down Expand Up @@ -55,16 +59,22 @@ def run(ctx, w, check_in=None, repeats=1, num_cores_per_vm=8):
check_array = [int(check_in)]

for workload in w:
if workload not in LAMMPS_SIM_WORKLOAD_CONFIGS:
if (
workload != "all-to-all"
and workload not in LAMMPS_SIM_WORKLOAD_CONFIGS
):
print(
"Unrecognised workload config ({}) must be one in: {}".format(
workload, LAMMPS_SIM_WORKLOAD.keys()
)
)
workload_config = LAMMPS_SIM_WORKLOAD_CONFIGS[workload]
data_file = basename(
get_lammps_data_file(workload_config["data_file"])["data"][0]
)
raise RuntimeError("Unrecognised workload: {}".format(workload))

if workload != "all-to-all":
workload_config = LAMMPS_SIM_WORKLOAD_CONFIGS[workload]
data_file = basename(
get_lammps_data_file(workload_config["data_file"])["data"][0]
)

csv_name = "migration_{}.csv".format(workload)
_init_csv_file(csv_name)
Expand All @@ -75,37 +85,41 @@ def run(ctx, w, check_in=None, repeats=1, num_cores_per_vm=8):

# Print progress
print(
"Running migration micro-benchmark (wload:"
"Running migration micro-benchmark (wload: "
+ "{} - check-at: {} - repeat: {}/{})".format(
workload, check, run_num + 1, repeats
)
)

"""
TODO: do we want to keep the all-to-all baseline?
if workload == "all-to-all":
num_loops = 100000
user = MPI_MIGRATE_FAASM_USER
func = MPI_MIGRATE_FAASM_FUNC
cmdline = "{} {}".format(
check if check != 0 else 5, num_loops
)
"""
input_data = None
else:
user = LAMMPS_FAASM_USER
func = LAMMPS_FAASM_MIGRATION_NET_FUNC
cmdline = "-in faasm://lammps-data/{}".format(data_file)
input_data = get_lammps_migration_params(
check_every=check if check != 0 else 5,
num_loops=5,
num_net_loops=workload_config["num_net_loops"],
chunk_size=workload_config["chunk_size"],
)

# Run LAMMPS
cmdline = "-in faasm://lammps-data/{}".format(data_file)
msg = {
"user": LAMMPS_FAASM_USER,
"function": LAMMPS_FAASM_MIGRATION_NET_FUNC,
"user": user,
"function": func,
"cmdline": cmdline,
"mpi_world_size": int(num_cores_per_vm),
"input_data": get_lammps_migration_params(
num_loops=5,
num_net_loops=workload_config["num_net_loops"],
chunk_size=workload_config["chunk_size"],
),
}

if input_data is not None:
msg["input_data"] = input_data

if check == 0:
# Setting a check fraction of 0 means we don't
# under-schedule. We use it as a baseline
Expand Down
Loading