Skip to content

Commit

Permalink
Enabled long-running benchmarks (with watchdog enforcing deletion) vi… (
Browse files Browse the repository at this point in the history
#313)

* Enabled long-running benchmarks (with watchdog enforcing deletion) via timeout_seconds property on benchmark definition

* Fixes per flake8 review
  • Loading branch information
filipecosta90 authored Mar 14, 2022
1 parent 5c11f2c commit 03f688d
Show file tree
Hide file tree
Showing 11 changed files with 264 additions and 35 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "redisbench-admin"
version = "0.7.11"
version = "0.7.12"
description = "Redis benchmark run helper. A wrapper around Redis and Redis Modules benchmark tools ( ftsb_redisearch, memtier_benchmark, redis-benchmark, aibench, etc... )."
authors = ["filipecosta90 <filipecosta.90@gmail.com>","Redis Performance Group <performance@redis.com>"]
readme = "README.md"
Expand Down
2 changes: 2 additions & 0 deletions redisbench_admin/run_remote/remote_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def remote_env_setup(
tf_github_sha,
tf_setup_name_sufix,
tf_triggering_env,
tf_timeout_secs=7200,
):
server_plaintext_port = 6379
db_ssh_port = args.db_ssh_port
Expand Down Expand Up @@ -65,6 +66,7 @@ def remote_env_setup(
tf_github_sha,
tf_setup_name_sufix,
tf_triggering_env,
tf_timeout_secs,
)
return (
client_public_ip,
Expand Down
18 changes: 18 additions & 0 deletions redisbench_admin/run_remote/run_remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
from redisbench_admin.utils.benchmark_config import (
prepare_benchmark_definitions,
get_metadata_tags,
process_benchmark_definitions_remote_timeouts,
)
from redisbench_admin.utils.redisgraph_benchmark_go import setup_remote_benchmark_agent
from redisbench_admin.utils.remote import (
Expand All @@ -59,6 +60,7 @@
check_ec2_env,
get_project_ts_tags,
push_data_to_redistimeseries,
fetch_remote_id_from_config,
)

from redisbench_admin.utils.utils import (
Expand Down Expand Up @@ -180,6 +182,17 @@ def run_remote_command_logic(args, project_name, project_version):
)
rts.ping()

remote_envs_timeout = process_benchmark_definitions_remote_timeouts(
benchmark_definitions
)

for remote_id, termination_timeout_secs in remote_envs_timeout.items():
logging.info(
"Using a timeout of {} seconds for remote setup: {}".format(
termination_timeout_secs, remote_id
)
)

# we have a map of test-type, dataset-name, topology, test-name
benchmark_runs_plan = define_benchmark_plan(benchmark_definitions, default_specs)

Expand Down Expand Up @@ -250,6 +263,10 @@ def run_remote_command_logic(args, project_name, project_version):
)
)
if "remote" in benchmark_config:
remote_id = fetch_remote_id_from_config(
benchmark_config["remote"]
)
tf_timeout_secs = remote_envs_timeout[remote_id]
client_artifacts = []
client_artifacts_map = {}
temporary_dir = get_tmp_folder_rnd()
Expand All @@ -274,6 +291,7 @@ def run_remote_command_logic(args, project_name, project_version):
tf_github_sha,
tf_setup_name_sufix,
tf_triggering_env,
tf_timeout_secs,
)

# after we've created the env, even on error we should always teardown
Expand Down
2 changes: 2 additions & 0 deletions redisbench_admin/run_remote/terraform.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def terraform_spin_or_reuse_env(
tf_github_sha,
tf_setup_name_sufix,
tf_triggering_env,
tf_timeout_secs=7200,
):
(
remote_setup,
Expand Down Expand Up @@ -62,6 +63,7 @@ def terraform_spin_or_reuse_env(
tf_github_org,
tf_github_repo,
tf_triggering_env,
tf_timeout_secs,
)
remote_envs[remote_id] = tf
else:
Expand Down
27 changes: 26 additions & 1 deletion redisbench_admin/utils/benchmark_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
import yaml
from jsonpath_ng import parse

from redisbench_admin.utils.remote import validate_result_expectations
from redisbench_admin.utils.remote import (
validate_result_expectations,
fetch_remote_id_from_config,
)


def parse_exporter_metrics_definition(
Expand Down Expand Up @@ -79,6 +82,21 @@ def prepare_benchmark_definitions(args):
)


def process_benchmark_definitions_remote_timeouts(benchmark_definitions):
    """Sum per-remote-setup termination timeouts across all benchmark definitions.

    For every benchmark that declares a "remote" section, its individual
    timeout is added to the running total of its remote setup id, so each
    distinct remote environment gets a time budget covering every test that
    reuses it. Returns a dict mapping remote setup id -> total timeout (secs).
    """
    timeouts_by_remote = {}
    for benchmark_config in benchmark_definitions.values():
        if "remote" not in benchmark_config:
            continue
        remote_id = fetch_remote_id_from_config(benchmark_config["remote"])
        per_test_timeout = get_termination_timeout_secs(benchmark_config)
        timeouts_by_remote[remote_id] = (
            timeouts_by_remote.get(remote_id, 0) + per_test_timeout
        )
    return timeouts_by_remote


def get_defaults(defaults_filename):
default_metrics = []
exporter_timemetric_path = None
Expand Down Expand Up @@ -302,6 +320,13 @@ def get_metadata_tags(benchmark_config):
return metadata_tags


def get_termination_timeout_secs(benchmark_config):
    """Return the watchdog termination timeout (in seconds) for one benchmark.

    Reads the optional "timeout_seconds" key of the benchmark definition and
    coerces it to int; falls back to 600 seconds when the key is absent.
    """
    if "timeout_seconds" not in benchmark_config:
        return 600
    return int(benchmark_config["timeout_seconds"])


def extract_benchmark_type_from_config(
benchmark_config,
config_key="clientconfig",
Expand Down
12 changes: 12 additions & 0 deletions redisbench_admin/utils/remote.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ def setup_remote_environment(
tf_github_org,
tf_github_repo,
tf_triggering_env,
tf_timeout_secs=7200,
):
# key = "benchmarks/infrastructure/tf-oss-redisgraph-standalone-r5.tfstate"
_, _, _ = tf.init(
Expand Down Expand Up @@ -287,6 +288,7 @@ def setup_remote_environment(
"github_org": tf_github_org,
"github_repo": tf_github_repo,
"triggering_env": tf_triggering_env,
"timeout_secs": tf_timeout_secs,
},
)
return retrieve_tf_connection_vars(return_code, tf)
Expand Down Expand Up @@ -493,6 +495,16 @@ def get_run_full_filename(
return benchmark_output_filename


def fetch_remote_id_from_config(
    remote_setup_config,
):
    """Return the setup name declared in a benchmark "remote" config section.

    Scans the list of property mappings and yields the value of the last
    entry carrying a "setup" key, or None when no such entry exists —
    matching a forward scan where later entries overwrite earlier ones.
    """
    for prop in reversed(list(remote_setup_config)):
        if "setup" in prop:
            return prop["setup"]
    return None


def fetch_remote_setup_from_config(
remote_setup_config,
repo="https://github.com/RedisLabsModules/testing-infrastructure.git",
Expand Down
101 changes: 68 additions & 33 deletions redisbench_admin/watchdog/watchdog.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,19 @@ def get_ci_ec2_instances_by_state(ec2_client, ci_machines_prefix, requested_stat
return count, state_instances


def get_vname_timeout_secs(instance):
    """Extract the VM name and optional timeout from an EC2 instance's tags.

    Returns a (vm_name, timeout_secs) tuple: vm_name comes from the "Name"
    tag (empty string when missing) and timeout_secs is the integer value of
    the "timeout_secs" tag, or None when that tag is absent. When a tag key
    repeats, the last occurrence wins (same as a forward scan).
    """
    tags = {tag_dict["Key"]: tag_dict["Value"] for tag_dict in instance["Tags"]}
    vm_name = tags.get("Name", "")
    raw_timeout = tags.get("timeout_secs")
    timeout_secs = None if raw_timeout is None else int(raw_timeout)
    return vm_name, timeout_secs


def watchdog_dangling_ec2_instances(
ec2_client, terminate_after_secs, ci_machines_prefix, dry_run
):
Expand All @@ -55,42 +68,64 @@ def watchdog_dangling_ec2_instances(
instance_id = instance["InstanceId"]
state = instance["State"]["Name"]
if state != "terminated":
for tag_dict in instance["Tags"]:
key = tag_dict["Key"]
key_v = tag_dict["Value"]
if key == "Name":
if ci_machines_prefix in key_v:
total_instances = total_instances + 1
elapsed = current_datetime - launch_time
will_terminate = False
if elapsed.total_seconds() > terminate_after_secs:
will_terminate = True

logging.info(
"Temporary machine {} {}. terminate? {}".format(
key_v, elapsed, will_terminate
)
)
if will_terminate:
logging.warning(
"Requesting to terminate instance with id {} given it ".format(
instance_id
)
+ "surpassed the maximum allowed ci duration"
)
response = ec2_client.terminate_instances(
InstanceIds=[
instance_id,
]
)
logging.info(
"Request to terminate instance with id {} reply: {}".format(
instance_id, response
)
)
vm_name, timeout_secs = get_vname_timeout_secs(instance)
if timeout_secs is None:
timeout_secs = terminate_after_secs
total_instances = termination_check(
ci_machines_prefix,
current_datetime,
ec2_client,
instance_id,
launch_time,
timeout_secs,
total_instances,
vm_name,
)
logging.info("Detected a total of {} ci.bechmark VMs".format(total_instances))


def termination_check(
    ci_machines_prefix,
    current_datetime,
    ec2_client,
    instance_id,
    launch_time,
    terminate_after_secs,
    total_instances,
    vm_name,
):
    """Count a CI instance and terminate it once it outlives its timeout.

    Instances whose name does not contain ci_machines_prefix are ignored.
    Otherwise the instance is counted and, when it has been running for more
    than terminate_after_secs, a termination request is issued via ec2_client.

    Returns the updated running total of CI instances seen so far.
    """
    # Guard clause: non-CI machines are not counted and never terminated.
    if ci_machines_prefix not in vm_name:
        return total_instances
    total_instances += 1
    elapsed = current_datetime - launch_time
    will_terminate = elapsed.total_seconds() > terminate_after_secs
    logging.info(
        "Temporary machine {} {}. terminate? {}".format(
            vm_name, elapsed, will_terminate
        )
    )
    if will_terminate:
        logging.warning(
            "Requesting to terminate instance with id {} given it ".format(
                instance_id
            )
            + "surpassed the maximum allowed ci duration"
        )
        response = ec2_client.terminate_instances(
            InstanceIds=[
                instance_id,
            ]
        )
        logging.info(
            "Request to terminate instance with id {} reply: {}".format(
                instance_id, response
            )
        )
    return total_instances


def watchdog_command_logic(args, project_name, project_version):
logging.info(
"Using: {project_name} {project_version}".format(
Expand Down
79 changes: 79 additions & 0 deletions tests/test_benchmark_config.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,18 @@
import argparse
import json

import yaml

from redisbench_admin.run_remote.args import create_run_remote_arguments
from redisbench_admin.utils.benchmark_config import (
results_dict_kpi_check,
check_required_modules,
extract_redis_dbconfig_parameters,
extract_benchmark_type_from_config,
get_metadata_tags,
get_termination_timeout_secs,
prepare_benchmark_definitions,
process_benchmark_definitions_remote_timeouts,
)


Expand Down Expand Up @@ -152,3 +157,77 @@ def test_get_metadata_tags():
benchmark_config = yaml.safe_load(yml_file)
metadata_tags = get_metadata_tags(benchmark_config)
assert metadata_tags == {"includes_targets": "true", "test_type": "query"}


def test_get_termination_timeout_secs():
    # (config file, expected timeout): default is 600s, explicit override wins
    cases = [
        ("./tests/test_data/vecsim-memtier.yml", 600),
        ("./tests/test_data/vecsim-memtier-timeout.yml", 1200),
    ]
    for config_path, expected_timeout in cases:
        with open(config_path, "r") as yml_file:
            benchmark_config = yaml.safe_load(yml_file)
        assert get_termination_timeout_secs(benchmark_config) == expected_timeout


def test_prepare_benchmark_definitions():
    # build the run-remote CLI namespace pointing at the sample definitions
    base_parser = argparse.ArgumentParser(
        description="test",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser = create_run_remote_arguments(base_parser)
    cli_args = [
        "--github_actor",
        "gh.user",
        "--module_path",
        "mymodule.so",
        "--test-glob",
        "./tests/test_data/benchmark_definitions/*.yml",
    ]
    args = parser.parse_args(args=cli_args)
    (
        result,
        benchmark_definitions,
        default_metrics,
        exporter_timemetric_path,
        default_specs,
        clusterconfig,
    ) = prepare_benchmark_definitions(args)
    assert result == True
    # both sample definition files should have been loaded
    assert len(benchmark_definitions) == 2


def test_process_benchmark_definitions_remote_timeouts():
    # build the run-remote CLI namespace pointing at the sample definitions
    base_parser = argparse.ArgumentParser(
        description="test",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser = create_run_remote_arguments(base_parser)
    cli_args = [
        "--github_actor",
        "gh.user",
        "--module_path",
        "mymodule.so",
        "--test-glob",
        "./tests/test_data/benchmark_definitions/*.yml",
    ]
    args = parser.parse_args(args=cli_args)
    (
        result,
        benchmark_definitions,
        default_metrics,
        exporter_timemetric_path,
        default_specs,
        clusterconfig,
    ) = prepare_benchmark_definitions(args)
    assert result == True
    assert len(benchmark_definitions) == 2
    remote_envs_timeout = process_benchmark_definitions_remote_timeouts(
        benchmark_definitions
    )
    # both tests target the same remote setup, so timeouts accumulate into one entry
    assert len(remote_envs_timeout) == 1
    (total_timeout,) = remote_envs_timeout.values()
    # we have the default timeout + the one specified
    assert total_timeout == 600 + 1200
Loading

0 comments on commit 03f688d

Please sign in to comment.