From daa312d7eefa0347560d48514c1c82cd4878d318 Mon Sep 17 00:00:00 2001 From: chris meyers Date: Tue, 14 Apr 2020 13:41:36 -0400 Subject: [PATCH 1/4] log file for wsbroadcast --- awx/settings/defaults.py | 12 ++++++++++++ awx/settings/production.py | 1 + 2 files changed, 13 insertions(+) diff --git a/awx/settings/defaults.py b/awx/settings/defaults.py index e024c4191de9..dc12447d0514 100644 --- a/awx/settings/defaults.py +++ b/awx/settings/defaults.py @@ -1043,6 +1043,15 @@ def IS_TESTING(argv=None): 'backupCount': 5, 'formatter':'dispatcher', }, + 'wsbroadcast': { + # don't define a level here, it's set by settings.LOG_AGGREGATOR_LEVEL + 'class': 'logging.handlers.RotatingFileHandler', + 'filters': ['require_debug_false', 'dynamic_level_filter'], + 'filename': os.path.join(LOG_ROOT, 'wsbroadcast.log'), + 'maxBytes': 1024 * 1024 * 5, # 5 MB + 'backupCount': 5, + 'formatter':'simple', + }, 'celery.beat': { 'class':'logging.StreamHandler', 'level': 'ERROR' @@ -1130,6 +1139,9 @@ def IS_TESTING(argv=None): 'awx.main.dispatch': { 'handlers': ['dispatcher'], }, + 'awx.main.wsbroadcast': { + 'handlers': ['wsbroadcast'], + }, 'awx.isolated.manager.playbooks': { 'handlers': ['management_playbooks'], 'propagate': False diff --git a/awx/settings/production.py b/awx/settings/production.py index cda057c08758..fb3971976771 100644 --- a/awx/settings/production.py +++ b/awx/settings/production.py @@ -55,6 +55,7 @@ LOGGING['handlers']['tower_warnings']['filename'] = '/var/log/tower/tower.log' # noqa LOGGING['handlers']['callback_receiver']['filename'] = '/var/log/tower/callback_receiver.log' # noqa LOGGING['handlers']['dispatcher']['filename'] = '/var/log/tower/dispatcher.log' # noqa +LOGGING['handlers']['wsbroadcast']['filename'] = '/var/log/tower/wsbroadcast.log' # noqa LOGGING['handlers']['task_system']['filename'] = '/var/log/tower/task_system.log' # noqa LOGGING['handlers']['management_playbooks']['filename'] = '/var/log/tower/management_playbooks.log' # noqa 
LOGGING['handlers']['system_tracking_migrations']['filename'] = '/var/log/tower/tower_system_tracking_migrations.log' # noqa From 9cabf3ef4df50514ee78ddb4c24d358a390e98bf Mon Sep 17 00:00:00 2001 From: chris meyers Date: Tue, 14 Apr 2020 16:54:00 -0400 Subject: [PATCH 2/4] do not include iso nodes in wsbroadcast status --- awx/main/management/commands/run_wsbroadcast.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/awx/main/management/commands/run_wsbroadcast.py b/awx/main/management/commands/run_wsbroadcast.py index 5b94bed1e0a7..2e60d945e8d2 100644 --- a/awx/main/management/commands/run_wsbroadcast.py +++ b/awx/main/management/commands/run_wsbroadcast.py @@ -7,6 +7,7 @@ from datetime import datetime as dt from django.core.management.base import BaseCommand +from django.db.models import Q from awx.main.analytics.broadcast_websocket import ( BroadcastWebsocketStatsManager, @@ -67,7 +68,7 @@ def get_connection_status(cls, me, hostnames, data): connection_started = data.get(f'{prefix}_connection_start', 'Error') if connection_started != 'Error': connection_started = datetime.datetime.fromtimestamp(connection_started) - connection_duration = (dt.now() - connection_started).total_seconds() + connection_duration = int((dt.now() - connection_started).total_seconds()) connection_state = f'\033[{connection_color}m{connection_state}\033[0m' @@ -81,8 +82,8 @@ def get_connection_stats(cls, me, hostnames, data): for h in hostnames: h = safe_name(h) prefix = f'awx_{h}' - messages_total = data.get(f'{prefix}_messages_received', 'N/A') - messages_per_minute = data.get(f'{prefix}_messages_received_per_minute', 'N/A') + messages_total = data.get(f'{prefix}_messages_received', '0') + messages_per_minute = data.get(f'{prefix}_messages_received_per_minute', '0') host_stats.append((h, str(int(messages_total)), str(int(messages_per_minute)))) @@ -101,7 +102,7 @@ def handle(self, *arg, **options): else: data[family.name] = family.samples[0].value me = 
Instance.objects.me() - hostnames = [i.hostname for i in Instance.objects.exclude(hostname=me.hostname)] + hostnames = [i.hostname for i in Instance.objects.exclude(Q(hostname=me.hostname) | Q(rampart_groups__controller__isnull=False))] host_stats = Command.get_connection_status(me, hostnames, data) lines = Command._format_lines(host_stats) From 63f56d33aa1910903151c79cd231ee3fdae2107d Mon Sep 17 00:00:00 2001 From: chris meyers Date: Tue, 14 Apr 2020 16:59:34 -0400 Subject: [PATCH 3/4] show user unsafe name * We log stats using a safe hostname because of Prometheus requirements. However, when we display the hostname to users, we should use the Instance hostname. This change outputs the Instance.hostname instead of the safe Prometheus name. --- awx/main/management/commands/run_wsbroadcast.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/awx/main/management/commands/run_wsbroadcast.py b/awx/main/management/commands/run_wsbroadcast.py index 2e60d945e8d2..62f571921df1 100644 --- a/awx/main/management/commands/run_wsbroadcast.py +++ b/awx/main/management/commands/run_wsbroadcast.py @@ -56,8 +56,8 @@ def get_connection_status(cls, me, hostnames, data): host_stats = [('hostname', 'state', 'start time', 'duration (sec)')] for h in hostnames: connection_color = '91' # red - h = safe_name(h) - prefix = f'awx_{h}' + h_safe = safe_name(h) + prefix = f'awx_{h_safe}' connection_state = data.get(f'{prefix}_connection', 'N/A') connection_started = 'N/A' connection_duration = 'N/A' @@ -80,8 +80,8 @@ def get_connection_status(cls, me, hostnames, data): def get_connection_stats(cls, me, hostnames, data): host_stats = [('hostname', 'total', 'per minute')] for h in hostnames: - h = safe_name(h) - prefix = f'awx_{h}' + h_safe = safe_name(h) + prefix = f'awx_{h_safe}' messages_total = data.get(f'{prefix}_messages_received', '0') messages_per_minute = data.get(f'{prefix}_messages_received_per_minute', '0') From 1acca459efc261dd4288c16a621cfc9d3b6316f6 Mon Sep 17 
00:00:00 2001 From: chris meyers Date: Wed, 15 Apr 2020 13:28:13 -0400 Subject: [PATCH 4/4] nice error message when redis is down * awx_manage run_wsbroadcast --status now prints a nice error message if someone failed to start awx services (i.e. redis) --- awx/main/management/commands/run_wsbroadcast.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/awx/main/management/commands/run_wsbroadcast.py b/awx/main/management/commands/run_wsbroadcast.py index 62f571921df1..31bfac87c0b3 100644 --- a/awx/main/management/commands/run_wsbroadcast.py +++ b/awx/main/management/commands/run_wsbroadcast.py @@ -4,6 +4,7 @@ import asyncio import datetime import re +import redis from datetime import datetime as dt from django.core.management.base import BaseCommand @@ -91,7 +92,12 @@ def get_connection_stats(cls, me, hostnames, data): def handle(self, *arg, **options): if options.get('status'): - stats_all = BroadcastWebsocketStatsManager.get_stats_sync() + try: + stats_all = BroadcastWebsocketStatsManager.get_stats_sync() + except redis.exceptions.ConnectionError as e: + print(f"Unable to get Broadcast Websocket Status. Failed to connect to redis {e}") + return + data = {} for family in stats_all: if family.type == 'gauge' and len(family.samples) > 1: