From 06947843986ae2688b22b90a296037f3772506eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Cervi=C3=B1o?= Date: Mon, 30 Sep 2019 16:10:15 +0200 Subject: [PATCH] Add more metrics to ROV (#1465) --- erizo/src/erizo/thread/ThreadPool.cpp | 15 ++++++ erizo/src/erizo/thread/ThreadPool.h | 3 ++ erizo/src/erizo/thread/Worker.cpp | 48 ++++++++++++++++++ erizo/src/erizo/thread/Worker.h | 20 ++++++++ erizoAPI/ThreadPool.cc | 22 ++++++++ erizoAPI/ThreadPool.h | 3 ++ erizo_controller/ROV/rovMetricsGatherer.js | 50 ++++++++++++++++++- erizo_controller/erizoJS/erizoJSController.js | 40 +++++++++++++++ erizo_controller/erizoJS/models/Client.js | 4 ++ 9 files changed, 204 insertions(+), 1 deletion(-) diff --git a/erizo/src/erizo/thread/ThreadPool.cpp b/erizo/src/erizo/thread/ThreadPool.cpp index 92b1b67b41..4336e8006c 100644 --- a/erizo/src/erizo/thread/ThreadPool.cpp +++ b/erizo/src/erizo/thread/ThreadPool.cpp @@ -6,6 +6,7 @@ constexpr int kNumThreadsPerScheduler = 2; using erizo::ThreadPool; using erizo::Worker; +using erizo::DurationDistribution; ThreadPool::ThreadPool(unsigned int num_workers) : workers_{}, scheduler_{std::make_shared(kNumThreadsPerScheduler)} { @@ -46,3 +47,17 @@ void ThreadPool::close() { } scheduler_->stop(true); } + +DurationDistribution ThreadPool::getDurationDistribution() { + DurationDistribution total_durations; + for (auto worker : workers_) { + total_durations += worker->getDurationDistribution(); + } + return total_durations; +} + +void ThreadPool::resetStats() { + for (auto worker : workers_) { + worker->resetStats(); + } +} diff --git a/erizo/src/erizo/thread/ThreadPool.h b/erizo/src/erizo/thread/ThreadPool.h index 9dd24602ea..9cec72845c 100644 --- a/erizo/src/erizo/thread/ThreadPool.h +++ b/erizo/src/erizo/thread/ThreadPool.h @@ -18,6 +18,9 @@ class ThreadPool { void start(); void close(); + void resetStats(); + DurationDistribution getDurationDistribution(); + private: std::vector> workers_; std::shared_ptr scheduler_; diff --git a/erizo/src/erizo/thread/Worker.cpp b/erizo/src/erizo/thread/Worker.cpp index b74c56db46..3aa1751ed6 100644 --- a/erizo/src/erizo/thread/Worker.cpp +++ b/erizo/src/erizo/thread/Worker.cpp @@ -9,6 +9,7 @@ #include "lib/ClockUtils.h" using erizo::Worker; +using erizo::DurationDistribution; using erizo::SimulatedWorker; using erizo::ScheduledTaskReference; @@ -22,6 +23,30 @@ void ScheduledTaskReference::cancel() { cancelled = true; } +DurationDistribution::DurationDistribution() + : duration_0_10_ms{0}, + duration_10_50_ms{0}, + duration_50_100_ms{0}, + duration_100_1000_ms{0}, + duration_1000_ms{0} {} + +void DurationDistribution::reset() { + duration_0_10_ms = 0; + duration_10_50_ms = 0; + duration_50_100_ms = 0; + duration_100_1000_ms = 0; + duration_1000_ms = 0; +} + +DurationDistribution& DurationDistribution::operator+=(const DurationDistribution& durations) { + duration_0_10_ms += durations.duration_0_10_ms; + duration_10_50_ms += durations.duration_10_50_ms; + duration_50_100_ms += durations.duration_50_100_ms; + duration_100_1000_ms += durations.duration_100_1000_ms; + duration_1000_ms += durations.duration_1000_ms; + return *this; +} + Worker::Worker(std::weak_ptr scheduler, std::shared_ptr the_clock) : scheduler_{scheduler}, clock_{the_clock}, @@ -106,11 +131,34 @@ std::function Worker::safeTask(std::function weak_this = shared_from_this(); return [f, weak_this] { if (auto this_ptr = weak_this.lock()) { + time_point start = this_ptr->clock_->now(); f(this_ptr); + time_point end = this_ptr->clock_->now(); + this_ptr->addToStats(end - start); } }; } +void Worker::addToStats(duration task_duration) { + if (task_duration <= std::chrono::milliseconds(10)) { + durations_.duration_0_10_ms++; + } else if (task_duration <= std::chrono::milliseconds(50)) { + durations_.duration_10_50_ms++; + } else if (task_duration <= std::chrono::milliseconds(100)) { + durations_.duration_50_100_ms++; + } else if (task_duration <= std::chrono::milliseconds(1000)) { + durations_.duration_100_1000_ms++; + } else { + durations_.duration_1000_ms++; + } +} + +void Worker::resetStats() { + task(safeTask([](std::shared_ptr worker) { + worker->durations_.reset(); + })); +} + SimulatedWorker::SimulatedWorker(std::shared_ptr the_clock) : Worker(std::make_shared(1), the_clock), clock_{the_clock} { } diff --git a/erizo/src/erizo/thread/Worker.h b/erizo/src/erizo/thread/Worker.h index 3793688f32..7bc015bc49 100644 --- a/erizo/src/erizo/thread/Worker.h +++ b/erizo/src/erizo/thread/Worker.h @@ -26,6 +26,21 @@ class ScheduledTaskReference { std::atomic cancelled; }; +class DurationDistribution { + public: + DurationDistribution(); + ~DurationDistribution() {} + void reset(); + DurationDistribution& operator+=(const DurationDistribution& buf); + + public: + uint duration_0_10_ms; + uint duration_10_50_ms; + uint duration_50_100_ms; + uint duration_100_1000_ms; + uint duration_1000_ms; +}; + class Worker : public std::enable_shared_from_this { public: typedef std::unique_ptr asio_worker; @@ -48,9 +63,13 @@ class Worker : public std::enable_shared_from_this { virtual void scheduleEvery(ScheduledTask f, duration period); + void resetStats(); + DurationDistribution getDurationDistribution() { return durations_; } + private: void scheduleEvery(ScheduledTask f, duration period, duration next_delay); std::function safeTask(std::function)> f); + void addToStats(duration task_duration); protected: int next_scheduled_ = 0; @@ -63,6 +82,7 @@ class Worker : public std::enable_shared_from_this { boost::thread_group group_; std::atomic closed_; boost::thread::id thread_id_; + DurationDistribution durations_; }; class SimulatedWorker : public Worker { diff --git a/erizoAPI/ThreadPool.cc b/erizoAPI/ThreadPool.cc index ddfdcfc94d..0015ccb97b 100644 --- a/erizoAPI/ThreadPool.cc +++ b/erizoAPI/ThreadPool.cc @@ -11,6 +11,8 @@ using v8::FunctionTemplate; using v8::HandleScope; using v8::Exception; +using erizo::DurationDistribution; + Nan::Persistent ThreadPool::constructor; ThreadPool::ThreadPool() { @@ -28,6 +30,8 @@ NAN_MODULE_INIT(ThreadPool::Init) { // Prototype Nan::SetPrototypeMethod(tpl, "close", close); Nan::SetPrototypeMethod(tpl, "start", start); + Nan::SetPrototypeMethod(tpl, "getDurationDistribution", getDurationDistribution); + Nan::SetPrototypeMethod(tpl, "resetStats", resetStats); constructor.Reset(tpl->GetFunction()); Nan::Set(target, Nan::New("ThreadPool").ToLocalChecked(), Nan::GetFunction(tpl).ToLocalChecked()); @@ -58,3 +62,21 @@ NAN_METHOD(ThreadPool::start) { obj->me->start(); } + +NAN_METHOD(ThreadPool::getDurationDistribution) { + ThreadPool* obj = Nan::ObjectWrap::Unwrap(info.Holder()); + DurationDistribution duration_distribution = obj->me->getDurationDistribution(); + v8::Local array = Nan::New(5); + Nan::Set(array, 0, Nan::New(duration_distribution.duration_0_10_ms)); + Nan::Set(array, 1, Nan::New(duration_distribution.duration_10_50_ms)); + Nan::Set(array, 2, Nan::New(duration_distribution.duration_50_100_ms)); + Nan::Set(array, 3, Nan::New(duration_distribution.duration_100_1000_ms)); + Nan::Set(array, 4, Nan::New(duration_distribution.duration_1000_ms)); + + info.GetReturnValue().Set(array); +} + +NAN_METHOD(ThreadPool::resetStats) { + ThreadPool* obj = Nan::ObjectWrap::Unwrap(info.Holder()); + obj->me->resetStats(); +} diff --git a/erizoAPI/ThreadPool.h b/erizoAPI/ThreadPool.h index 4be04de80e..fd1c07e314 100644 --- a/erizoAPI/ThreadPool.h +++ b/erizoAPI/ThreadPool.h @@ -35,6 +35,9 @@ class ThreadPool : public Nan::ObjectWrap { */ static NAN_METHOD(start); + static NAN_METHOD(getDurationDistribution); + static NAN_METHOD(resetStats); + static Nan::Persistent constructor; }; diff --git a/erizo_controller/ROV/rovMetricsGatherer.js b/erizo_controller/ROV/rovMetricsGatherer.js index 20ba4a51db..ab28c572de 100644 --- a/erizo_controller/ROV/rovMetricsGatherer.js +++ b/erizo_controller/ROV/rovMetricsGatherer.js @@ -8,6 +8,17 @@ class RovMetricsGatherer { totalPublishers: new promClient.Gauge({ name: this.getNameWithPrefix('total_publishers'), help: 'total active publishers' }), totalSubscribers: new promClient.Gauge({ name: this.getNameWithPrefix('total_subscribers'), help: 'total active subscribers' }), activeErizoJsProcesses: new promClient.Gauge({ name: this.getNameWithPrefix('active_erizojs_processes'), help: 'active processes' }), + totalConnectionsFailed: new promClient.Gauge({ name: this.getNameWithPrefix('total_connections_failed'), help: 'connections failed' }), + taskDuration0To10ms: new promClient.Gauge({ name: this.getNameWithPrefix('task_duration_0_to_10_ms'), help: 'tasks lasted less than 10 ms' }), + taskDuration10To50ms: new promClient.Gauge({ name: this.getNameWithPrefix('task_duration_10_to_50_ms'), help: 'tasks lasted between 10 and 50 ms' }), + taskDuration50To100ms: new promClient.Gauge({ name: this.getNameWithPrefix('task_duration_50_to_100_ms'), help: 'tasks lasted between 50 and 100 ms' }), + taskDuration100To1000ms: new promClient.Gauge({ name: this.getNameWithPrefix('task_duration_100_to_1000_ms'), help: 'tasks lasted between 100 ms and 1 s' }), + taskDurationMoreThan1000ms: new promClient.Gauge({ name: this.getNameWithPrefix('task_duration_1000_ms'), help: 'tasks lasted more than 1 s' }), + connectionQualityHigh: new promClient.Gauge({ name: this.getNameWithPrefix('connection_quality_high'), help: 'connections with high quality' }), + connectionQualityMedium: new promClient.Gauge({ name: this.getNameWithPrefix('connection_quality_medium'), help: 'connections with medium quality' }), + connectionQualityLow: new promClient.Gauge({ name: this.getNameWithPrefix('connection_quality_low'), help: 'connections with low quality' }), + totalPublishersInErizoJS: new promClient.Gauge({ name: this.getNameWithPrefix('total_publishers_erizojs'), help: 'total active publishers in erizo js' }), + totalSubscribersInErizoJS: new promClient.Gauge({ name: this.getNameWithPrefix('total_subscribers_erizojs'), help: 'total active subscribers in erizo js' }), }; this.log = logger; } @@ -80,12 +91,49 @@ class RovMetricsGatherer { return Promise.resolve(); } + getErizoJSMetrics() { + this.log.debug('Getting total connections failed'); + return this.rovClient.runInComponentList('console.log(JSON.stringify(context.getAndResetMetrics()))', this.rovClient.components.erizoJS) + .then((results) => { + let totalConnectionsFailed = 0; + let taskDurationDistribution = Array(5).fill(0); + let connectionLevels = Array(10).fill(0); + let publishers = 0; + let subscribers = 0; + results.forEach((result) => { + const parsedResult = JSON.parse(result); + totalConnectionsFailed += parsedResult.connectionsFailed; + taskDurationDistribution = + taskDurationDistribution.map((a, i) => a + parsedResult.durationDistribution[i]); + connectionLevels = connectionLevels.map((a, i) => a + parsedResult.connectionLevels[i]); + publishers += parsedResult.publishers; + subscribers += parsedResult.subscribers; + }); + this.log.debug(`Total connections failed: ${totalConnectionsFailed}`); + this.prometheusMetrics.totalConnectionsFailed.set(totalConnectionsFailed); + this.prometheusMetrics.taskDuration0To10ms.set(taskDurationDistribution[0]); + this.prometheusMetrics.taskDuration10To50ms.set(taskDurationDistribution[1]); + this.prometheusMetrics.taskDuration50To100ms.set(taskDurationDistribution[2]); + this.prometheusMetrics.taskDuration100To1000ms.set(taskDurationDistribution[3]); + this.prometheusMetrics.taskDurationMoreThan1000ms.set(taskDurationDistribution[4]); + + this.prometheusMetrics.connectionQualityHigh.set(connectionLevels[2]); + this.prometheusMetrics.connectionQualityMedium.set(connectionLevels[1]); + this.prometheusMetrics.connectionQualityLow.set(connectionLevels[0]); + + this.prometheusMetrics.totalPublishersInErizoJS.set(publishers); + this.prometheusMetrics.totalSubscribersInErizoJS.set(subscribers); + return Promise.resolve(); + }); + } + gatherMetrics() { return this.rovClient.updateComponentsList() .then(() => this.getTotalRooms()) .then(() => this.getTotalClients()) .then(() => this.getTotalPublishersAndSubscribers()) - .then(() => this.getActiveProcesses()); + .then(() => this.getActiveProcesses()) + .then(() => this.getErizoJSMetrics()); } } diff --git a/erizo_controller/erizoJS/erizoJSController.js b/erizo_controller/erizoJS/erizoJSController.js index d5bfb83c3d..79f4bf0be5 100644 --- a/erizo_controller/erizoJS/erizoJSController.js +++ b/erizo_controller/erizoJS/erizoJSController.js @@ -54,9 +54,16 @@ exports.ErizoJSController = (erizoJSId, threadPool, ioThreadPool) => { } }; + const initMetrics = () => { + that.metrics = { + connectionsFailed: 0, + }; + }; + that.publishers = publishers; that.ioThreadPool = io; + initMetrics(); const forEachPublisher = (action) => { const publisherStreamIds = Object.keys(publishers); @@ -81,6 +88,10 @@ exports.ErizoJSController = (erizoJSId, threadPool, ioThreadPool) => { connectionId, connectionEvent, newStatus) => { const rpcID = `erizoController_${erizoControllerId}`; amqper.callRpc(rpcID, 'connectionStatusEvent', [clientId, connectionId, newStatus, connectionEvent]); + + if (connectionEvent.type === 'failed') { + that.metrics.connectionsFailed += 1; + } }; const getOrCreateClient = (erizoControllerId, clientId, singlePC = false) => { @@ -631,5 +642,34 @@ exports.ErizoJSController = (erizoJSId, threadPool, ioThreadPool) => { } }; + that.getAndResetMetrics = () => { + const metrics = Object.assign({}, that.metrics); + metrics.totalConnections = 0; + metrics.connectionLevels = Array(10).fill(0); + metrics.publishers = Object.keys(that.publishers).length; + let subscribers = 0; + Object.keys(that.publishers).forEach((streamId, publisher) => { + subscribers += publisher.numSubscribers; + }); + metrics.subscribers = subscribers; + + metrics.durationDistribution = threadPool.getDurationDistribution(); + threadPool.resetStats(); + + clients.forEach((client) => { + const connections = client.getConnections(); + metrics.totalConnections += connections.length; + + connections.forEach((connection) => { + const level = connection.qualityLevel; + if (level >= 0 && level < metrics.connectionLevels.length) { + metrics.connectionLevels[level] += 1; + } + }); + }); + initMetrics(); + return metrics; + }; + return that; }; diff --git a/erizo_controller/erizoJS/models/Client.js b/erizo_controller/erizoJS/models/Client.js index 06308ff323..b7e0247f38 100644 --- a/erizo_controller/erizoJS/models/Client.js +++ b/erizo_controller/erizoJS/models/Client.js @@ -54,6 +54,10 @@ class Client extends EventEmitter { log.debug(`Client connections list size after add : ${this.connections.size}`); } + getConnections() { + return Array.from(this.connections.values()); + } + forceCloseConnection(id) { const connection = this.connections.get(id); if (connection !== undefined) {