Skip to content

Commit

Permalink
[SARC-332] Implémenter les alertes : un cluster ne répond pas depuis …
Browse files Browse the repository at this point in the history
…X temps (#134)

* [SARC-332] Implémenter les alertes : un cluster ne répond pas depuis X temps

* Change date parsing

* Set parsed date timezone to MTL

* Parse date only with MTL timezone

---------

Co-authored-by: Bruno Carrez <bruno.carrez@mila.quebec>
  • Loading branch information
notoraptor and nurbal authored Oct 6, 2024
1 parent 8a49625 commit ec45280
Show file tree
Hide file tree
Showing 11 changed files with 124 additions and 7 deletions.
46 changes: 46 additions & 0 deletions sarc/alerts/usage_alerts/cluster_response.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import logging
from datetime import datetime, time, timedelta

from sarc.client.job import get_available_clusters
from sarc.config import MTL

logger = logging.getLogger(__name__)


def check_cluster_response(time_interval: timedelta = timedelta(days=7)):
    """
    Warn about clusters whose latest scraping is too old.

    Every cluster returned by `get_available_clusters()` is inspected:
    its `end_date` is compared against the current time (in the `MTL`
    timezone) minus `time_interval`, and a warning is logged when the
    `end_date` falls before that limit. A warning is also logged for any
    cluster that has no `end_date`, since nothing can be checked for it.

    Parameters
    ----------
    time_interval: timedelta
        Length of the window, ending at the current time, in which a
        scraping is expected for each cluster. Default is 7 days.
    """
    # Reference instant and the earliest acceptable scraping date.
    current_date = datetime.now(tz=MTL)
    oldest_allowed_date = current_date - time_interval

    for cluster in get_available_clusters():
        # Without an end date there is nothing to compare: report and move on.
        if cluster.end_date is None:
            logger.warning(
                f"[{cluster.cluster_name}] no end_date available, cannot check last scraping"
            )
            continue

        # `end_date` is assumed to be stored as a "YYYY-MM-DD" string holding
        # the latest scraping date. It is turned into midnight of that day in
        # the `MTL` timezone (the original author notes this mirrors how the
        # start/end parameters of `get_jobs()` are parsed).
        parsed_day = datetime.strptime(cluster.end_date, "%Y-%m-%d")
        midnight = datetime.combine(parsed_day, time.min)
        cluster_end_date = midnight.replace(tzinfo=MTL)

        # Latest scraping happened before the allowed window: report it.
        if cluster_end_date < oldest_allowed_date:
            logger.warning(
                f"[{cluster.cluster_name}] no scraping since {cluster_end_date}, "
                f"oldest required: {oldest_allowed_date}, "
                f"current time: {current_date}"
            )
22 changes: 15 additions & 7 deletions tests/functional/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from .allocations.factory import create_allocations
from .diskusage.factory import create_diskusages
from .jobs.factory import create_jobs, create_users
from .jobs.factory import create_cluster_entries, create_jobs, create_users


@pytest.fixture
Expand Down Expand Up @@ -51,22 +51,29 @@ def fill_db(db, with_users=False, with_clusters=False, job_patch=None):
if with_clusters:
# Fill collection `clusters`.
cluster_names = {job["cluster_name"] for job in db.jobs.find({})}
db.clusters.insert_many(
{"cluster_name": cluster_name, "start_date": None, "end_date": None}
for cluster_name in cluster_names
)
db.clusters.insert_many(create_cluster_entries(db))


def create_db_configuration_fixture(
db_name, empty=False, with_users=False, job_patch=None, scope="function"
db_name,
empty=False,
with_users=False,
with_clusters=False,
job_patch=None,
scope="function",
):
@pytest.fixture(scope=scope)
def fixture(standard_config_object):
cfg = custom_db_config(standard_config_object, db_name)
db = cfg.mongo.database_instance
clear_db(db)
if not empty:
fill_db(db, with_users=with_users, job_patch=job_patch)
fill_db(
db,
with_users=with_users,
with_clusters=with_clusters,
job_patch=job_patch,
)
yield

return fixture
Expand Down Expand Up @@ -119,6 +126,7 @@ def fixture(client_config_object):
read_only_db_with_users_config_object = create_db_configuration_fixture(
db_name="sarc-read-only-with-users-test",
with_users=True,
with_clusters=True,
scope="session",
)

Expand Down
20 changes: 20 additions & 0 deletions tests/functional/jobs/factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,26 @@ def create_jobs(job_factory: JobFactory | None = None, job_patch: dict | None =
return job_factory.jobs


def create_cluster_entries(db):
    """Build the documents used to fill collection `clusters`.

    The distinct `cluster_name` values found in `db.jobs` are sorted, and
    one entry is produced per name. The entry at position `i` gets an
    `end_date` equal to the module-level `end_time` minus `i` days and a
    `start_date` one day before that, both formatted as "YYYY-MM-DD".

    Returns a list of dicts with keys `cluster_name`, `start_date` and
    `end_date`, in sorted cluster-name order.
    """
    day_format = "%Y-%m-%d"

    def _entry(rank, name):
        # Each successive cluster ends one day earlier than the previous one.
        last_day = end_time - timedelta(days=rank)
        first_day = last_day - timedelta(days=1)
        return {
            "cluster_name": name,
            "start_date": first_day.strftime(day_format),
            "end_date": last_day.strftime(day_format),
        }

    # Sorting makes the name -> date assignment deterministic.
    names = sorted({job["cluster_name"] for job in db.jobs.find({})})
    return [_entry(rank, name) for rank, name in enumerate(names)]


json_raw = {
"metadata": {
"plugin": {"type": "openapi/dbv0.0.37", "name": "Slurm OpenAPI DB v0.0.37"},
Expand Down
29 changes: 29 additions & 0 deletions tests/functional/usage_alerts/test_alert_cluster_response.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import re
from datetime import timedelta

import pytest

from sarc.alerts.usage_alerts.cluster_response import check_cluster_response
from tests.functional.jobs.test_func_load_job_series import MOCK_TIME

# Keyword arguments for `check_cluster_response`, keyed by test id.
# "default" passes nothing, so the function's own default (7 days) applies;
# the other cases pass an explicit `time_interval` of the given number of days.
PARAMETERS = {
    "default": {},
    **{
        f"{days}-days": {"time_interval": timedelta(days=days)}
        for days in (365, 283, 282, 281, 280, 279)
    },
}


@pytest.mark.freeze_time(MOCK_TIME)
@pytest.mark.usefixtures("read_only_db_with_users", "tzlocal_is_mtl")
@pytest.mark.parametrize("params", PARAMETERS.values(), ids=PARAMETERS.keys())
def test_check_cluster_response(params, caplog, file_regression):
    """Run `check_cluster_response` and compare its warnings to the stored output.

    The captured log text is stripped of the "WARNING <logger>:<file>:<line>"
    prefix before the regression check, so only the messages are compared.
    """
    check_cluster_response(**params)

    # Prefix added by the log capture; the line number part matches any digits.
    log_prefix = r"WARNING +sarc\.alerts\.usage_alerts\.cluster_response:cluster_response.py:[0-9]+ +"
    captured = caplog.text
    file_regression.check(re.sub(log_prefix, "", captured))
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[fromage] no scraping since 2023-02-15 00:00:00-05:00, oldest required: 2023-02-15 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
[mila] no scraping since 2023-02-14 00:00:00-05:00, oldest required: 2023-02-15 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
[patate] no scraping since 2023-02-13 00:00:00-05:00, oldest required: 2023-02-15 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
[raisin] no scraping since 2023-02-12 00:00:00-05:00, oldest required: 2023-02-15 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[mila] no scraping since 2023-02-14 00:00:00-05:00, oldest required: 2023-02-14 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
[patate] no scraping since 2023-02-13 00:00:00-05:00, oldest required: 2023-02-14 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
[raisin] no scraping since 2023-02-12 00:00:00-05:00, oldest required: 2023-02-14 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[patate] no scraping since 2023-02-13 00:00:00-05:00, oldest required: 2023-02-13 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
[raisin] no scraping since 2023-02-12 00:00:00-05:00, oldest required: 2023-02-13 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
[raisin] no scraping since 2023-02-12 00:00:00-05:00, oldest required: 2023-02-12 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[fromage] no scraping since 2023-02-15 00:00:00-05:00, oldest required: 2023-11-14 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
[mila] no scraping since 2023-02-14 00:00:00-05:00, oldest required: 2023-11-14 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
[patate] no scraping since 2023-02-13 00:00:00-05:00, oldest required: 2023-11-14 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00
[raisin] no scraping since 2023-02-12 00:00:00-05:00, oldest required: 2023-11-14 19:00:00-05:00, current time: 2023-11-21 19:00:00-05:00

0 comments on commit ec45280

Please sign in to comment.