From ac0045bee35a355e240989a0bfed956f32e17e0d Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Thu, 29 Jun 2017 18:37:23 -0400 Subject: [PATCH 01/12] [cassandra_check] Add cassandra_check --- .travis.yml | 3 + cassandra_check/CHANGELOG.md | 8 ++ cassandra_check/README.md | 29 ++++++++ cassandra_check/check.py | 58 +++++++++++++++ cassandra_check/ci/cassandra_check.rake | 99 +++++++++++++++++++++++++ cassandra_check/conf.yaml.example | 15 ++++ cassandra_check/manifest.json | 12 +++ cassandra_check/metadata.csv | 2 + cassandra_check/requirements.txt | 2 + cassandra_check/test_cassandra_check.py | 43 +++++++++++ circle.yml | 1 + 11 files changed, 272 insertions(+) create mode 100644 cassandra_check/CHANGELOG.md create mode 100644 cassandra_check/README.md create mode 100644 cassandra_check/check.py create mode 100644 cassandra_check/ci/cassandra_check.rake create mode 100644 cassandra_check/conf.yaml.example create mode 100644 cassandra_check/manifest.json create mode 100644 cassandra_check/metadata.csv create mode 100644 cassandra_check/requirements.txt create mode 100644 cassandra_check/test_cassandra_check.py diff --git a/.travis.yml b/.travis.yml index 14978f58ade89..12b3351917136 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,6 +42,9 @@ env: - TRAVIS_FLAVOR=apache FLAVOR_VERSION=2.4.12 - TRAVIS_FLAVOR=cassandra FLAVOR_VERSION=2.0.17 - TRAVIS_FLAVOR=cassandra FLAVOR_VERSION=2.1.14 + - TRAVIS_FLAVOR=cassandra_check FLAVOR_VERSION=2.0.17 + - TRAVIS_FLAVOR=cassandra_check FLAVOR_VERSION=2.1.14 + - TRAVIS_FLAVOR=cassandra_check FLAVOR_VERSION=3.10 - TRAVIS_FLAVOR=couch FLAVOR_VERSION=1.6.1 - TRAVIS_FLAVOR=consul FLAVOR_VERSION=v0.6.4 - TRAVIS_FLAVOR=consul FLAVOR_VERSION=0.7.2 diff --git a/cassandra_check/CHANGELOG.md b/cassandra_check/CHANGELOG.md new file mode 100644 index 0000000000000..b009f3a6dffde --- /dev/null +++ b/cassandra_check/CHANGELOG.md @@ -0,0 +1,8 @@ +# CHANGELOG - Cassandra_check + +0.1.0/ Unreleased +================== + +### Changes + +* 
[FEATURE] adds cassandra_check integration. diff --git a/cassandra_check/README.md b/cassandra_check/README.md new file mode 100644 index 0000000000000..8023c6c3c538b --- /dev/null +++ b/cassandra_check/README.md @@ -0,0 +1,29 @@ +# Cassandra Check + +## Overview + +Get metrics from cassandra databases that are not available through the [jmx integration](https://github.com/DataDog/integrations-core/tree/master/cassandra) + +## Installation + +Install the `dd-check-cassandra_check` package manually or with your favorite configuration manager + +## Configuration + +Edit the `cassandra_check.yaml` file to point to your server and port and set the keyspaces to monitor + +## Validation + +When you run `datadog-agent info` you should see something like the following: + + Checks + ====== + + cassandra_check + ----------- + - instance #0 [OK] + - Collected 39 metrics, 0 events & 7 service checks + +## Compatibility + +The cassandra_check check is compatible with all major platforms diff --git a/cassandra_check/check.py b/cassandra_check/check.py new file mode 100644 index 0000000000000..75d82ff89602b --- /dev/null +++ b/cassandra_check/check.py @@ -0,0 +1,58 @@ +# (C) Datadog, Inc. 
2010-2016 +# All rights reserved +# Licensed under Simplified BSD License (see LICENSE) + +# 3rd party +from cassandra.cluster import Cluster, NoHostAvailable +from cassandra.auth import PlainTextAuthProvider + +# project +from checks import AgentCheck + +EVENT_TYPE = SOURCE_TYPE_NAME = 'cassandra_check' +DEFAULT_NODE_IP = 'localhost' +DEFAULT_NODE_PORT = 9042 + + +class CassandraCheck(AgentCheck): + + def __init__(self, name, init_config, agentConfig, instances=None): + AgentCheck.__init__(self, name, init_config, agentConfig, instances) + + def check(self, instance): + # Get the node IP address to connect Cassandra + node_ip = instance.get("node_ip", DEFAULT_NODE_IP) + node_port = instance.get("node_port", DEFAULT_NODE_PORT) + keyspaces = instance.get("keyspaces", []) + tags = instance.get("tags", []) + connect_timeout = instance.get("connect_timeout", 5) + + username = instance.get("username", "") + password = instance.get("password", "") + auth_provider = PlainTextAuthProvider(username, password) + + # Try to connect to the node + cluster = Cluster([node_ip], port=node_port, auth_provider=auth_provider, connect_timeout=connect_timeout) + try: + cluster.connect(wait_for_all_pools=True) + if keyspaces: + for keyspace in keyspaces: + token_map = cluster.metadata.token_map + down_replicas = 0 + for token in token_map.ring: + replicas = token_map.get_replicas(keyspace, token) + down_replicas = max(down_replicas, len([r for r in replicas if not r.is_up])) + + self.gauge("cassandra.replication_failures", down_replicas, + tags=["keyspace:%s" % keyspace, "cluster:%s" % cluster.metadata.cluster_name] + tags) + + + except NoHostAvailable as e: + self.log.error('Could not connect to node %s:%s : %s' % (node_ip, node_port, e)) + node_status = AgentCheck.CRITICAL + else: + node_status = AgentCheck.OK + finally: + cluster.shutdown() + + self.service_check('cassandra.can_connect', node_status, tags=tags) diff --git a/cassandra_check/ci/cassandra_check.rake 
b/cassandra_check/ci/cassandra_check.rake new file mode 100644 index 0000000000000..d73342d2de4cf --- /dev/null +++ b/cassandra_check/ci/cassandra_check.rake @@ -0,0 +1,99 @@ +require 'ci/common' + +def cassandra_check_version + ENV['FLAVOR_VERSION'] || '2.1.14' # '2.0.17' +end + +def cassandra_check_rootdir + "#{ENV['INTEGRATIONS_DIR']}/cassandra_check_#{cassandra_check_version}" +end + +container_name = 'dd-test-cassandra' +container_name2 = 'dd-test-cassandra2' + +namespace :ci do + namespace :cassandra_check do |flavor| + task before_install: ['ci:common:before_install'] do + sh %(docker kill #{container_name} 2>/dev/null || true) + sh %(docker rm #{container_name} 2>/dev/null || true) + sh %(docker kill #{container_name2} 2>/dev/null || true) + sh %(docker rm #{container_name2} 2>/dev/null || true) + end + + task :install do + Rake::Task['ci:common:install'].invoke('cassandra_check') + sh %(docker create --expose 9042 --expose 7000 --expose 7001 --expose 9160 \ + -p 9042:9042 -p 7000:7000 -p 7001:7001 -p 9160:9160 --name #{container_name} cassandra:#{cassandra_check_version}) + sh %(docker start #{container_name}) + sh %(docker create --name #{container_name2} \ + -e CASSANDRA_SEEDS="$(docker inspect --format='{{ .NetworkSettings.IPAddress }}' #{container_name})" cassandra:#{cassandra_check_version}) + sh %(docker start #{container_name2}) + end + + task before_script: ['ci:common:before_script'] do + # Wait.for container_port + count = 0 + logs = `docker logs #{container_name} 2>&1` + logs2 = `docker logs #{container_name2} 2>&1` + puts 'Waiting for Cassandra to come up' + until count == 20 || ((logs.include?('Listening for thrift clients') || logs.include?('Starting listening for CQL clients')) && \ + (logs2.include?('Listening for thrift clients') || logs2.include?('Starting listening for CQL clients'))) + sleep_for 4 + logs = `docker logs #{container_name} 2>&1` + logs2 = `docker logs #{container_name2} 2>&1` + count += 1 + end + if 
(logs.include?('Listening for thrift clients') || logs.include?('Starting listening for CQL clients')) && \ + (logs2.include?('Listening for thrift clients') || logs2.include?('Starting listening for CQL clients')) + puts 'Cassandra is up!' + else + puts 'Logs of container 1' + sh %(docker logs #{container_name} 2>&1) + puts 'Logs of container 2' + sh %(docker logs #{container_name2} 2>&1) + raise + end + end + + task script: ['ci:common:script'] do + this_provides = [ + 'cassandra_check' + ] + Rake::Task['ci:common:run_tests'].invoke(this_provides) + end + + task before_cache: ['ci:common:before_cache'] + + task cleanup: ['ci:common:cleanup'] do + sh %(docker kill #{container_name} 2>/dev/null || true) + sh %(docker rm #{container_name} 2>/dev/null || true) + sh %(docker kill #{container_name2} 2>/dev/null || true) + sh %(docker rm #{container_name2} 2>/dev/null || true) + end + + task :execute do + exception = nil + begin + %w(before_install install before_script).each do |u| + Rake::Task["#{flavor.scope.path}:#{u}"].invoke + end + if !ENV['SKIP_TEST'] + Rake::Task["#{flavor.scope.path}:script"].invoke + else + puts 'Skipping tests'.yellow + end + Rake::Task["#{flavor.scope.path}:before_cache"].invoke + rescue => e + exception = e + puts "Failed task: #{e.class} #{e.message}".red + end + if ENV['SKIP_CLEANUP'] + puts 'Skipping cleanup, disposable environments are great'.yellow + else + puts 'Cleaning up' + Rake::Task["#{flavor.scope.path}:cleanup"].invoke + end + raise exception if exception + end + end +end diff --git a/cassandra_check/conf.yaml.example b/cassandra_check/conf.yaml.example new file mode 100644 index 0000000000000..9e0a176ea9bd6 --- /dev/null +++ b/cassandra_check/conf.yaml.example @@ -0,0 +1,15 @@ +init_config: + +instances: + # Configuration options: + # keyspaces: a list of keyspaces to monitor + # node_ip: the IP of the cassandra node to connect to. 
The rest of the nodes in the cluster will be auto discovered by the cassandra driver + # For more information, see https://datastax.github.io/python-driver/api/index.html + # Default to localhost. + # node_port: the port cassandra is listening for connections. + # Default to 9042 + # username/password: a set of credentials to connect to the cassandra cluster. + # tags: optional, a list of tags to be sent with the metrics + # connect_timeout: timeout, in seconds, for creating new connections. Default to 5. + + - keyspaces: ["foo"] diff --git a/cassandra_check/manifest.json b/cassandra_check/manifest.json new file mode 100644 index 0000000000000..31589db3e0149 --- /dev/null +++ b/cassandra_check/manifest.json @@ -0,0 +1,12 @@ +{ + "maintainer": "help@datadoghq.com", + "manifest_version": "0.1.0", + "max_agent_version": "6.0.0", + "min_agent_version": "5.6.3", + "name": "cassandra_check", + "short_description": "cassandra_check description.", + "guid": "00e4a8bd-8ec2-4bb4-b725-6aaa91618d13", + "support": "contrib", + "supported_os": ["linux","mac_os","windows"], + "version": "0.1.0" +} diff --git a/cassandra_check/metadata.csv b/cassandra_check/metadata.csv new file mode 100644 index 0000000000000..b6ff24519cc50 --- /dev/null +++ b/cassandra_check/metadata.csv @@ -0,0 +1,2 @@ +metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name +cassandra.replication_failures,gauge,,,,Number of replica nodes down per keyspace,-1,cassandra_check,replication failures \ No newline at end of file diff --git a/cassandra_check/requirements.txt b/cassandra_check/requirements.txt new file mode 100644 index 0000000000000..aa5ecbcbb4faa --- /dev/null +++ b/cassandra_check/requirements.txt @@ -0,0 +1,2 @@ +# integration pip requirements +cassandra-driver==3.10.0 \ No newline at end of file diff --git a/cassandra_check/test_cassandra_check.py b/cassandra_check/test_cassandra_check.py new file mode 100644 index 0000000000000..d8d3ee273abb3 --- 
/dev/null +++ b/cassandra_check/test_cassandra_check.py @@ -0,0 +1,43 @@ +# (C) Datadog, Inc. 2010-2016 +# All rights reserved +# Licensed under Simplified BSD License (see LICENSE) + +# stdlib +from nose.plugins.attrib import attr + +# 3p +from cassandra.cluster import Cluster + +# project +from tests.checks.common import AgentCheckTest + + +@attr(requires='cassandra_check') +class TestCassandraCheck(AgentCheckTest): + """Basic Test for cassandra_check integration.""" + CHECK_NAME = 'cassandra_check' + + def test_check(self): + + # Create a keyspace with replication factor 2 + cluster = Cluster(connect_timeout=1) + session = cluster.connect() + session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 2}") + cluster.shutdown() + + config = { + 'instances': [ + {'host': '127.0.0.1', + 'port': 9042, + 'keyspaces': ['test'], + 'tags': ['foo','bar'], + 'connect_timeout': 1} + ] + } + + self.run_check(config) + # We should have the value 1 since the driver won't be able to connect to one of the container (port not exposed) + self.assertMetric('cassandra.replication_failures', value=1, tags=['keyspace:test', 'cluster:Test Cluster', 'foo', 'bar']) + self.assertServiceCheckOK('cassandra.can_connect', tags=['foo', 'bar']) + # Raises when COVERAGE=true and coverage < 100% + self.coverage_report() diff --git a/circle.yml b/circle.yml index 0da58bf889fa4..e4bcf0e878e99 100644 --- a/circle.yml +++ b/circle.yml @@ -80,6 +80,7 @@ test: - rake ci:run[kafka] - rake ci:run[docker_daemon] - rake ci:run[kubernetes] + - rake ci:run[cassandra_check] - bundle exec rake requirements post: - if [[ $(docker ps -a -q) ]]; then docker stop $(docker ps -a -q); fi From 49395f406cea0c3df90fdb0ad681097c9f0903bd Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Wed, 5 Jul 2017 10:40:40 -0400 Subject: [PATCH 02/12] Rework tests --- cassandra_check/ci/cassandra_check.rake | 8 ++--- cassandra_check/test_cassandra_check.py | 44 
+++++++++++++++++-------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/cassandra_check/ci/cassandra_check.rake b/cassandra_check/ci/cassandra_check.rake index d73342d2de4cf..56cc03a70edcc 100644 --- a/cassandra_check/ci/cassandra_check.rake +++ b/cassandra_check/ci/cassandra_check.rake @@ -36,15 +36,15 @@ namespace :ci do logs = `docker logs #{container_name} 2>&1` logs2 = `docker logs #{container_name2} 2>&1` puts 'Waiting for Cassandra to come up' - until count == 20 || ((logs.include?('Listening for thrift clients') || logs.include?('Starting listening for CQL clients')) && \ - (logs2.include?('Listening for thrift clients') || logs2.include?('Starting listening for CQL clients'))) + until count == 20 || ((logs.include?('Listening for thrift clients') || logs.include?("Created default superuser role 'cassandra'")) && \ + (logs2.include?('Listening for thrift clients') || logs2.include?('Not starting RPC server as requested'))) sleep_for 4 logs = `docker logs #{container_name} 2>&1` logs2 = `docker logs #{container_name2} 2>&1` count += 1 end - if (logs.include?('Listening for thrift clients') || logs.include?('Starting listening for CQL clients')) && \ - (logs2.include?('Listening for thrift clients') || logs2.include?('Starting listening for CQL clients')) + if (logs.include?('Listening for thrift clients') || logs.include?("Created default superuser role 'cassandra'")) && \ + (logs2.include?('Listening for thrift clients') || logs2.include?('Not starting RPC server as requested')) puts 'Cassandra is up!' 
else puts 'Logs of container 1' diff --git a/cassandra_check/test_cassandra_check.py b/cassandra_check/test_cassandra_check.py index d8d3ee273abb3..3dbf33ad217ee 100644 --- a/cassandra_check/test_cassandra_check.py +++ b/cassandra_check/test_cassandra_check.py @@ -4,40 +4,56 @@ # stdlib from nose.plugins.attrib import attr +from mock import patch # 3p from cassandra.cluster import Cluster +from cassandra.metadata import TokenMap # project from tests.checks.common import AgentCheckTest +class MockHost: + def __init__(self, up): + self.is_up = up + +def mock_get_replicas(self, keyspace, token): + return [MockHost(True), MockHost(False)] @attr(requires='cassandra_check') class TestCassandraCheck(AgentCheckTest): """Basic Test for cassandra_check integration.""" CHECK_NAME = 'cassandra_check' - def test_check(self): + config = { + 'instances': [ + {'host': '127.0.0.1', + 'port': 9042, + 'keyspaces': ['test'], + 'tags': ['foo','bar'], + 'connect_timeout': 1} + ] + } + def test_check(self): # Create a keyspace with replication factor 2 cluster = Cluster(connect_timeout=1) session = cluster.connect() session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 2}") cluster.shutdown() + # Run check with both, we should get the value 0 (on mac this will fail since + # there is no docker0 bridge so the connection to the second container cannot be made) + self.run_check(self.config) + self.assertMetric('cassandra.replication_failures', value=0, tags=['keyspace:test', 'cluster:Test Cluster', 'foo', 'bar']) + self.assertServiceCheckOK('cassandra.can_connect', tags=['foo', 'bar']) - config = { - 'instances': [ - {'host': '127.0.0.1', - 'port': 9042, - 'keyspaces': ['test'], - 'tags': ['foo','bar'], - 'connect_timeout': 1} - ] - } - - self.run_check(config) - # We should have the value 1 since the driver won't be able to connect to one of the container (port not exposed) + self.coverage_report() + + 
@patch.object(TokenMap, 'get_replicas', mock_get_replicas) + def test_1_replica_down(self): + # We should have the value 1 since the driver won't be able to connect to the second container + self.run_check(self.config) self.assertMetric('cassandra.replication_failures', value=1, tags=['keyspace:test', 'cluster:Test Cluster', 'foo', 'bar']) self.assertServiceCheckOK('cassandra.can_connect', tags=['foo', 'bar']) - # Raises when COVERAGE=true and coverage < 100% + self.coverage_report() From 639923fba42799fcb43374ec123bfd5d0b73cd80 Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Thu, 6 Jul 2017 17:33:20 -0400 Subject: [PATCH 03/12] disable pylint checks --- cassandra_check/check.py | 4 ++-- cassandra_check/requirements.txt | 2 +- cassandra_check/test_cassandra_check.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cassandra_check/check.py b/cassandra_check/check.py index 75d82ff89602b..5968bcc5448c6 100644 --- a/cassandra_check/check.py +++ b/cassandra_check/check.py @@ -3,8 +3,8 @@ # Licensed under Simplified BSD License (see LICENSE) # 3rd party -from cassandra.cluster import Cluster, NoHostAvailable -from cassandra.auth import PlainTextAuthProvider +from cassandra.cluster import Cluster, NoHostAvailable # pylint: disable=E0611 +from cassandra.auth import PlainTextAuthProvider # pylint: disable=E0611 # project from checks import AgentCheck diff --git a/cassandra_check/requirements.txt b/cassandra_check/requirements.txt index aa5ecbcbb4faa..87e927dc1219b 100644 --- a/cassandra_check/requirements.txt +++ b/cassandra_check/requirements.txt @@ -1,2 +1,2 @@ # integration pip requirements -cassandra-driver==3.10.0 \ No newline at end of file +cassandra-driver==3.10.0 diff --git a/cassandra_check/test_cassandra_check.py b/cassandra_check/test_cassandra_check.py index 3dbf33ad217ee..f2a8463440615 100644 --- a/cassandra_check/test_cassandra_check.py +++ b/cassandra_check/test_cassandra_check.py @@ -7,8 +7,8 @@ from mock import patch # 3p -from 
cassandra.cluster import Cluster -from cassandra.metadata import TokenMap +from cassandra.cluster import Cluster # pylint: disable=E0611 +from cassandra.metadata import TokenMap # pylint: disable=E0611 # project from tests.checks.common import AgentCheckTest From e2c69b6b95381b54d2b337a251572e5c7648125e Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Thu, 6 Jul 2017 18:12:03 -0400 Subject: [PATCH 04/12] Some linting --- cassandra_check/check.py | 3 +-- cassandra_check/test_cassandra_check.py | 23 +++++++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/cassandra_check/check.py b/cassandra_check/check.py index 5968bcc5448c6..fdb3720e8b261 100644 --- a/cassandra_check/check.py +++ b/cassandra_check/check.py @@ -44,8 +44,7 @@ def check(self, instance): down_replicas = max(down_replicas, len([r for r in replicas if not r.is_up])) self.gauge("cassandra.replication_failures", down_replicas, - tags=["keyspace:%s" % keyspace, "cluster:%s" % cluster.metadata.cluster_name] + tags) - + tags=["keyspace:%s" % keyspace, "cluster:%s" % cluster.metadata.cluster_name] + tags) except NoHostAvailable as e: self.log.error('Could not connect to node %s:%s : %s' % (node_ip, node_port, e)) diff --git a/cassandra_check/test_cassandra_check.py b/cassandra_check/test_cassandra_check.py index f2a8463440615..7bef7fc9ec31f 100644 --- a/cassandra_check/test_cassandra_check.py +++ b/cassandra_check/test_cassandra_check.py @@ -27,11 +27,13 @@ class TestCassandraCheck(AgentCheckTest): config = { 'instances': [ - {'host': '127.0.0.1', - 'port': 9042, - 'keyspaces': ['test'], - 'tags': ['foo','bar'], - 'connect_timeout': 1} + { + 'host': '127.0.0.1', + 'port': 9042, + 'keyspaces': ['test'], + 'tags': ['foo', 'bar'], + 'connect_timeout': 1 + } ] } @@ -39,12 +41,15 @@ def test_check(self): # Create a keyspace with replication factor 2 cluster = Cluster(connect_timeout=1) session = cluster.connect() - session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH 
REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 2}") + session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH " + "REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 2}") cluster.shutdown() # Run check with both, we should get the value 0 (on mac this will fail since # there is no docker0 bridge so the connection to the second container cannot be made) self.run_check(self.config) - self.assertMetric('cassandra.replication_failures', value=0, tags=['keyspace:test', 'cluster:Test Cluster', 'foo', 'bar']) + self.assertMetric('cassandra.replication_failures', + value=0, + tags=['keyspace:test', 'cluster:Test Cluster', 'foo', 'bar']) self.assertServiceCheckOK('cassandra.can_connect', tags=['foo', 'bar']) self.coverage_report() @@ -53,7 +58,9 @@ def test_check(self): def test_1_replica_down(self): # We should have the value 1 since the driver won't be able to connect to the second container self.run_check(self.config) - self.assertMetric('cassandra.replication_failures', value=1, tags=['keyspace:test', 'cluster:Test Cluster', 'foo', 'bar']) + self.assertMetric('cassandra.replication_failures', + value=1, + tags=['keyspace:test', 'cluster:Test Cluster', 'foo', 'bar']) self.assertServiceCheckOK('cassandra.can_connect', tags=['foo', 'bar']) self.coverage_report() From edd33ac0bc18a87783599d6c83b832ebcdbbabb6 Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Tue, 18 Jul 2017 18:34:51 -0400 Subject: [PATCH 05/12] Rework check. 
Use nodetool instead of python driver --- cassandra_check/check.py | 95 ++++++++++++--------- cassandra_check/ci/cassandra_check.rake | 48 +---------- cassandra_check/ci/fixtures/nodetool_output | 15 ++++ cassandra_check/conf.yaml.example | 13 ++- cassandra_check/metadata.csv | 3 +- cassandra_check/requirements.txt | 1 - cassandra_check/test_cassandra_check.py | 56 ++++-------- 7 files changed, 101 insertions(+), 130 deletions(-) create mode 100644 cassandra_check/ci/fixtures/nodetool_output diff --git a/cassandra_check/check.py b/cassandra_check/check.py index fdb3720e8b261..745b99b5033ba 100644 --- a/cassandra_check/check.py +++ b/cassandra_check/check.py @@ -2,56 +2,75 @@ # All rights reserved # Licensed under Simplified BSD License (see LICENSE) -# 3rd party -from cassandra.cluster import Cluster, NoHostAvailable # pylint: disable=E0611 -from cassandra.auth import PlainTextAuthProvider # pylint: disable=E0611 +# stdlib +import re # project from checks import AgentCheck +from utils.subprocess_output import get_subprocess_output +from collections import defaultdict EVENT_TYPE = SOURCE_TYPE_NAME = 'cassandra_check' -DEFAULT_NODE_IP = 'localhost' -DEFAULT_NODE_PORT = 9042 - +DEFAULT_HOST = 'localhost' +DEFAULT_PORT = '7199' class CassandraCheck(AgentCheck): + datacenter_name_re = re.compile('^Datacenter: (.*)') + host_status_re = re.compile('^(?P<status>[UD])[NLJM].* (?P<owns>(\d+\.\d+%)|\?).*') + def __init__(self, name, init_config, agentConfig, instances=None): AgentCheck.__init__(self, name, init_config, agentConfig, instances) def check(self, instance): - # Get the node IP address to connect Cassandra - node_ip = instance.get("node_ip", DEFAULT_NODE_IP) - node_port = instance.get("node_port", DEFAULT_NODE_PORT) + nodetool_path = instance.get("nodetool", "/usr/bin/nodetool") + host = instance.get("host", DEFAULT_HOST) + port = instance.get("port", DEFAULT_PORT) keyspaces = instance.get("keyspaces", []) - tags = instance.get("tags", []) - connect_timeout = 
instance.get("connect_timeout", 5) - username = instance.get("username", "") password = instance.get("password", "") - auth_provider = PlainTextAuthProvider(username, password) - - # Try to connect to the node - cluster = Cluster([node_ip], port=node_port, auth_provider=auth_provider, connect_timeout=connect_timeout) - try: - cluster.connect(wait_for_all_pools=True) - if keyspaces: - for keyspace in keyspaces: - token_map = cluster.metadata.token_map - down_replicas = 0 - for token in token_map.ring: - replicas = token_map.get_replicas(keyspace, token) - down_replicas = max(down_replicas, len([r for r in replicas if not r.is_up])) - - self.gauge("cassandra.replication_failures", down_replicas, - tags=["keyspace:%s" % keyspace, "cluster:%s" % cluster.metadata.cluster_name] + tags) - - except NoHostAvailable as e: - self.log.error('Could not connect to node %s:%s : %s' % (node_ip, node_port, e)) - node_status = AgentCheck.CRITICAL - else: - node_status = AgentCheck.OK - finally: - cluster.shutdown() - - self.service_check('cassandra.can_connect', node_status, tags=tags) + tags = instance.get("tags", []) + + for keyspace in keyspaces: + # Build the nodetool command + cmd = [nodetool_path, '-h', host, '-p', port] + if username and password: + cmd += ['-u', username, '-pw', password] + cmd += ['status', '--', keyspace] + + # Execute the command + out, err, _ = get_subprocess_output(cmd, self.log, False) + if err or 'Error:' in out: + self.log.error('Error executing nodetool status: %s', err or out) + percent_up_by_dc, percent_total_by_dc = self._process_nodetool_output(out) + for datacenter, percent_up in percent_up_by_dc.items(): + self.gauge('cassandra.replication_availability', percent_up, + tags=tags + ['keyspace:%s' % keyspace, 'datacenter:%s' % datacenter]) + for datacenter, percent_total in percent_total_by_dc.items(): + self.gauge('cassandra.replication_factor', int(round(percent_total / 100)), + tags=tags + ['keyspace:%s' % keyspace, 'datacenter:%s' % 
datacenter]) + + def _process_nodetool_output(self, output): + percent_up_by_datacenter = defaultdict(float) + percent_total_by_datacenter = defaultdict(float) + for line in output.splitlines(): + # Output of nodetool + # Datacenter: dc1 + # =============== + # Status=Up/Down + # |/ State=Normal/Leaving/Joining/Moving + # -- Address Load Tokens Owns (effective) Host ID Rack + # UN 172.21.0.3 184.8 KB 256 38.4% 7501ef03-eb63-4db0-95e6-20bfeb7cdd87 RAC1 + # UN 172.21.0.4 223.34 KB 256 39.5% e521a2a4-39d3-4311-a195-667bf56450f4 RAC1 + match = self.datacenter_name_re.search(line) + if match: + datacenter_name = match.group(1) + match = self.host_status_re.search(line) + if match: + host_status = match.group('status') + host_owns = match.group('owns') + if host_status == 'U' and host_owns != '?': + percent_up_by_datacenter[datacenter_name] += float(host_owns[:-1]) + percent_total_by_datacenter[datacenter_name] += float(host_owns[:-1]) + + return percent_up_by_datacenter, percent_total_by_datacenter diff --git a/cassandra_check/ci/cassandra_check.rake b/cassandra_check/ci/cassandra_check.rake index 56cc03a70edcc..7bf960bbff835 100644 --- a/cassandra_check/ci/cassandra_check.rake +++ b/cassandra_check/ci/cassandra_check.rake @@ -8,52 +8,15 @@ def cassandra_check_rootdir "#{ENV['INTEGRATIONS_DIR']}/cassandra_check_#{cassandra_check_version}" end -container_name = 'dd-test-cassandra' -container_name2 = 'dd-test-cassandra2' - namespace :ci do namespace :cassandra_check do |flavor| - task before_install: ['ci:common:before_install'] do - sh %(docker kill #{container_name} 2>/dev/null || true) - sh %(docker rm #{container_name} 2>/dev/null || true) - sh %(docker kill #{container_name2} 2>/dev/null || true) - sh %(docker rm #{container_name2} 2>/dev/null || true) - end + task before_install: ['ci:common:before_install'] task :install do Rake::Task['ci:common:install'].invoke('cassandra_check') - sh %(docker create --expose 9042 --expose 7000 --expose 7001 --expose 9160 \ - -p 
9042:9042 -p 7000:7000 -p 7001:7001 -p 9160:9160 --name #{container_name} cassandra:#{cassandra_check_version}) - sh %(docker start #{container_name}) - sh %(docker create --name #{container_name2} \ - -e CASSANDRA_SEEDS="$(docker inspect --format='{{ .NetworkSettings.IPAddress }}' #{container_name})" cassandra:#{cassandra_check_version}) - sh %(docker start #{container_name2}) end - task before_script: ['ci:common:before_script'] do - # Wait.for container_port - count = 0 - logs = `docker logs #{container_name} 2>&1` - logs2 = `docker logs #{container_name2} 2>&1` - puts 'Waiting for Cassandra to come up' - until count == 20 || ((logs.include?('Listening for thrift clients') || logs.include?("Created default superuser role 'cassandra'")) && \ - (logs2.include?('Listening for thrift clients') || logs2.include?('Not starting RPC server as requested'))) - sleep_for 4 - logs = `docker logs #{container_name} 2>&1` - logs2 = `docker logs #{container_name2} 2>&1` - count += 1 - end - if (logs.include?('Listening for thrift clients') || logs.include?("Created default superuser role 'cassandra'")) && \ - (logs2.include?('Listening for thrift clients') || logs2.include?('Not starting RPC server as requested')) - puts 'Cassandra is up!' 
- else - puts 'Logs of container 1' - sh %(docker logs #{container_name} 2>&1) - puts 'Logs of container 2' - sh %(docker logs #{container_name2} 2>&1) - raise - end - end + task before_script: ['ci:common:before_script'] task script: ['ci:common:script'] do this_provides = [ @@ -64,12 +27,7 @@ namespace :ci do task before_cache: ['ci:common:before_cache'] - task cleanup: ['ci:common:cleanup'] do - sh %(docker kill #{container_name} 2>/dev/null || true) - sh %(docker rm #{container_name} 2>/dev/null || true) - sh %(docker kill #{container_name2} 2>/dev/null || true) - sh %(docker rm #{container_name2} 2>/dev/null || true) - end + task cleanup: ['ci:common:cleanup'] task :execute do exception = nil diff --git a/cassandra_check/ci/fixtures/nodetool_output b/cassandra_check/ci/fixtures/nodetool_output new file mode 100644 index 0000000000000..bc671693cd216 --- /dev/null +++ b/cassandra_check/ci/fixtures/nodetool_output @@ -0,0 +1,15 @@ +Datacenter: dc1 +=============== +Status=Up/Down +|/ State=Normal/Leaving/Joining/Moving +-- Address Load Tokens Owns (effective) Host ID Rack +DN 172.21.0.6 178.43 KB 256 35.4% f86d2d7a-e5c7-4c46-b36e-df08c565171a rack1 +UN 172.21.0.3 184.8 KB 256 31.0% 7501ef03-eb63-4db0-95e6-20bfeb7cdd87 RAC1 +UN 172.21.0.2 182.05 KB 256 33.5% fa859fcc-5e76-44ce-9609-1f314bdf21c1 RAC1 +Datacenter: dc2 +=============== +Status=Up/Down +|/ State=Normal/Leaving/Joining/Moving +-- Address Load Tokens Owns (effective) Host ID Rack +UN 172.21.0.5 216.75 KB 256 100.0% 2250363b-7453-48f2-b6cb-ef79cad0612b RAC1 +UN 172.21.0.4 223.34 KB 256 100.0% e521a2a4-39d3-4311-a195-667bf56450f4 RAC1 \ No newline at end of file diff --git a/cassandra_check/conf.yaml.example b/cassandra_check/conf.yaml.example index 9e0a176ea9bd6..096cd527c6115 100644 --- a/cassandra_check/conf.yaml.example +++ b/cassandra_check/conf.yaml.example @@ -3,13 +3,12 @@ init_config: instances: # Configuration options: # keyspaces: a list of keyspaces to monitor - # node_ip: the IP of the 
cassandra node to connect to. The rest of the nodes in the cluster will be auto discovered by the cassandra driver - # For more information, see https://datastax.github.io/python-driver/api/index.html + # host: host that nodetool will connect to. # Default to localhost. - # node_port: the port cassandra is listening for connections. - # Default to 9042 - # username/password: a set of credentials to connect to the cassandra cluster. - # tags: optional, a list of tags to be sent with the metrics - # connect_timeout: timeout, in seconds, for creating new connections. Default to 5. + # port: the port JMX is listening for connections. + # Default to 7199 + # username/password: a set of credentials to connect to the host. These are the credentials for the JMX server. + # For the check to work, this user must have read/write access so that nodetool can execute the `status` command + # tags: optional, a list of additional tags to be sent with the metrics - keyspaces: ["foo"] diff --git a/cassandra_check/metadata.csv b/cassandra_check/metadata.csv index b6ff24519cc50..7d853027d5ab0 100644 --- a/cassandra_check/metadata.csv +++ b/cassandra_check/metadata.csv @@ -1,2 +1,3 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name -cassandra.replication_failures,gauge,,,,Number of replica nodes down per keyspace,-1,cassandra_check,replication failures \ No newline at end of file +cassandra.replication_availability,gauge,,,,Percentage of data available per keyspace times replication factor,+1,cassandra_check,available data +cassandra.replication_factor,gauge,,,,Replication factor per keyspace,0,cassandra_check,replication factor \ No newline at end of file diff --git a/cassandra_check/requirements.txt b/cassandra_check/requirements.txt index 87e927dc1219b..f89ecf55da555 100644 --- a/cassandra_check/requirements.txt +++ b/cassandra_check/requirements.txt @@ -1,2 +1 @@ # integration pip requirements -cassandra-driver==3.10.0 diff 
--git a/cassandra_check/test_cassandra_check.py b/cassandra_check/test_cassandra_check.py index 7bef7fc9ec31f..17dabb6ce872a 100644 --- a/cassandra_check/test_cassandra_check.py +++ b/cassandra_check/test_cassandra_check.py @@ -5,20 +5,12 @@ # stdlib from nose.plugins.attrib import attr from mock import patch - -# 3p -from cassandra.cluster import Cluster # pylint: disable=E0611 -from cassandra.metadata import TokenMap # pylint: disable=E0611 +from os.path import join, dirname # project -from tests.checks.common import AgentCheckTest - -class MockHost: - def __init__(self, up): - self.is_up = up +from tests.checks.common import AgentCheckTest, Fixtures -def mock_get_replicas(self, keyspace, token): - return [MockHost(True), MockHost(False)] +FIXTURE_DIR = join(dirname(__file__), 'ci') @attr(requires='cassandra_check') class TestCassandraCheck(AgentCheckTest): @@ -28,39 +20,27 @@ class TestCassandraCheck(AgentCheckTest): config = { 'instances': [ { - 'host': '127.0.0.1', - 'port': 9042, + 'host': 'localhost', 'keyspaces': ['test'], - 'tags': ['foo', 'bar'], - 'connect_timeout': 1 + 'tags': ['foo', 'bar'] } ] } - def test_check(self): - # Create a keyspace with replication factor 2 - cluster = Cluster(connect_timeout=1) - session = cluster.connect() - session.execute("CREATE KEYSPACE IF NOT EXISTS test WITH " - "REPLICATION = {'class': 'SimpleStrategy', 'replication_factor': 2}") - cluster.shutdown() - # Run check with both, we should get the value 0 (on mac this will fail since - # there is no docker0 bridge so the connection to the second container cannot be made) - self.run_check(self.config) - self.assertMetric('cassandra.replication_failures', - value=0, - tags=['keyspace:test', 'cluster:Test Cluster', 'foo', 'bar']) - self.assertServiceCheckOK('cassandra.can_connect', tags=['foo', 'bar']) - - self.coverage_report() + @patch('_cassandra_check.get_subprocess_output', + return_value=Fixtures.read_file('nodetool_output', sdk_dir=FIXTURE_DIR)) + def test_check(self, 
mock_output): - @patch.object(TokenMap, 'get_replicas', mock_get_replicas) - def test_1_replica_down(self): - # We should have the value 1 since the driver won't be able to connect to the second container self.run_check(self.config) - self.assertMetric('cassandra.replication_failures', - value=1, - tags=['keyspace:test', 'cluster:Test Cluster', 'foo', 'bar']) - self.assertServiceCheckOK('cassandra.can_connect', tags=['foo', 'bar']) + + mock_output.assertCalledWith(['/usr/bin/nodetool', '-h', 'localhost', '-p', '7199', '--', 'test']) + self.assertMetric('cassandra.replication_availability', value=64.5, + tags=['keyspace:test', 'datacenter:dc1', 'foo', 'bar']) + self.assertMetric('cassandra.replication_availability', value=100, + tags=['keyspace:test', 'datacenter:dc2', 'foo', 'bar']) + self.assertMetric('cassandra.replication_factor', value=1, + tags=['keyspace:test', 'datacenter:dc1', 'foo', 'bar']) + self.assertMetric('cassandra.replication_factor', value=2, + tags=['keyspace:test', 'datacenter:dc2', 'foo', 'bar']) self.coverage_report() From 1c634a8fa5226f6cdcbfb6c94a24994fd71dcc97 Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Tue, 18 Jul 2017 19:42:51 -0400 Subject: [PATCH 06/12] Add integration test --- cassandra_check/ci/cassandra_check.rake | 48 +++++++++++++++++++++---- cassandra_check/ci/jmxremote.password | 1 + cassandra_check/test_cassandra_check.py | 25 ++++++++++--- 3 files changed, 63 insertions(+), 11 deletions(-) create mode 100644 cassandra_check/ci/jmxremote.password diff --git a/cassandra_check/ci/cassandra_check.rake b/cassandra_check/ci/cassandra_check.rake index 7bf960bbff835..6a961670b24eb 100644 --- a/cassandra_check/ci/cassandra_check.rake +++ b/cassandra_check/ci/cassandra_check.rake @@ -4,19 +4,49 @@ def cassandra_check_version ENV['FLAVOR_VERSION'] || '2.1.14' # '2.0.17' end -def cassandra_check_rootdir - "#{ENV['INTEGRATIONS_DIR']}/cassandra_check_#{cassandra_check_version}" -end +container_name = 'dd-test-cassandra' 
+container_name2 = 'dd-test-cassandra2' + +container_port = 7199 +cassandra_jmx_options = "-Dcom.sun.management.jmxremote.port=#{container_port} + -Dcom.sun.management.jmxremote.rmi.port=#{container_port} + -Dcom.sun.management.jmxremote.ssl=false + -Dcom.sun.management.jmxremote.authenticate=true + -Dcom.sun.management.jmxremote.password.file=/etc/cassandra/jmxremote.password + -Djava.rmi.server.hostname=localhost" namespace :ci do namespace :cassandra_check do |flavor| - task before_install: ['ci:common:before_install'] + task before_install: ['ci:common:before_install'] do + sh %(docker kill #{container_name} 2>/dev/null || true) + sh %(docker rm #{container_name} 2>/dev/null || true) + sh %(docker kill #{container_name2} 2>/dev/null || true) + sh %(docker rm #{container_name2} 2>/dev/null || true) + sh %(rm -f #{__dir__}/jmxremote.password.tmp) + end task :install do Rake::Task['ci:common:install'].invoke('cassandra_check') + sh %(docker create --expose #{container_port} \ + -p #{container_port}:#{container_port} -e JMX_PORT=#{container_port} \ + -e LOCAL_JMX=no -e JVM_EXTRA_OPTS="#{cassandra_jmx_options}" --name #{container_name} cassandra:#{cassandra_check_version}) + sh %(cp #{__dir__}/jmxremote.password #{__dir__}/jmxremote.password.tmp) + sh %(chmod 400 #{__dir__}/jmxremote.password.tmp) + sh %(docker cp #{__dir__}/jmxremote.password.tmp #{container_name}:/etc/cassandra/jmxremote.password) + sh %(rm -f #{__dir__}/jmxremote.password.tmp) + sh %(docker start #{container_name}) + + sh %(docker create --name #{container_name2} \ + -e CASSANDRA_SEEDS="$(docker inspect --format='{{ .NetworkSettings.IPAddress }}' #{container_name})" cassandra:#{cassandra_check_version}) + sh %(docker start #{container_name2}) end - task before_script: ['ci:common:before_script'] + task before_script: ['ci:common:before_script'] do + # Wait.for container_port + wait_on_docker_logs(container_name, 20, 'Listening for thrift clients', "Created default superuser role 'cassandra'") + 
wait_on_docker_logs(container_name2, 40, 'Listening for thrift clients', 'Not starting RPC server as requested') + sh %(docker exec #{container_name} cqlsh -e "CREATE KEYSPACE test WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor':2}") + end task script: ['ci:common:script'] do this_provides = [ @@ -27,7 +57,13 @@ namespace :ci do task before_cache: ['ci:common:before_cache'] - task cleanup: ['ci:common:cleanup'] + task cleanup: ['ci:common:cleanup'] do + sh %(docker kill #{container_name} 2>/dev/null || true) + sh %(docker rm #{container_name} 2>/dev/null || true) + sh %(docker kill #{container_name2} 2>/dev/null || true) + sh %(docker rm #{container_name2} 2>/dev/null || true) + sh %(rm -f #{__dir__}/jmxremote.password.tmp) + end task :execute do exception = nil diff --git a/cassandra_check/ci/jmxremote.password b/cassandra_check/ci/jmxremote.password new file mode 100644 index 0000000000000..239d0f318fa6b --- /dev/null +++ b/cassandra_check/ci/jmxremote.password @@ -0,0 +1 @@ +controlRole QED \ No newline at end of file diff --git a/cassandra_check/test_cassandra_check.py b/cassandra_check/test_cassandra_check.py index 17dabb6ce872a..36b068f58ae9e 100644 --- a/cassandra_check/test_cassandra_check.py +++ b/cassandra_check/test_cassandra_check.py @@ -12,7 +12,9 @@ FIXTURE_DIR = join(dirname(__file__), 'ci') -@attr(requires='cassandra_check') +def mock_output(*args): + return Fixtures.read_file('nodetool_output', sdk_dir=FIXTURE_DIR), "", 0 + class TestCassandraCheck(AgentCheckTest): """Basic Test for cassandra_check integration.""" CHECK_NAME = 'cassandra_check' @@ -22,21 +24,24 @@ class TestCassandraCheck(AgentCheckTest): { 'host': 'localhost', 'keyspaces': ['test'], + 'username': 'controlRole', + 'password': 'QED', 'tags': ['foo', 'bar'] } ] } - @patch('_cassandra_check.get_subprocess_output', - return_value=Fixtures.read_file('nodetool_output', sdk_dir=FIXTURE_DIR)) + @patch('utils.subprocess_output.get_subprocess_output', 
side_effect=mock_output) def test_check(self, mock_output): self.run_check(self.config) - mock_output.assertCalledWith(['/usr/bin/nodetool', '-h', 'localhost', '-p', '7199', '--', 'test']) + self.assertEquals(mock_output.call_args[0][0], + ['/usr/bin/nodetool', '-h', 'localhost', '-p', '7199', '-u', + 'controlRole', '-pw', 'QED', 'status', '--', 'test']) self.assertMetric('cassandra.replication_availability', value=64.5, tags=['keyspace:test', 'datacenter:dc1', 'foo', 'bar']) - self.assertMetric('cassandra.replication_availability', value=100, + self.assertMetric('cassandra.replication_availability', value=200, tags=['keyspace:test', 'datacenter:dc2', 'foo', 'bar']) self.assertMetric('cassandra.replication_factor', value=1, tags=['keyspace:test', 'datacenter:dc1', 'foo', 'bar']) @@ -44,3 +49,13 @@ def test_check(self, mock_output): tags=['keyspace:test', 'datacenter:dc2', 'foo', 'bar']) self.coverage_report() + + @attr(requires='cassandra_check') + def test_integration(self): + self.run_check(self.config) + + self.assertMetric('cassandra.replication_availability', value=200, + tags=['keyspace:test', 'datacenter:datacenter1', 'foo', 'bar']) + self.assertMetric('cassandra.replication_factor', value=2, + tags=['keyspace:test', 'datacenter:datacenter1', 'foo', 'bar']) + self.coverage_report() From cb28feb9e9125c518d36485e542bc91aacb99026 Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Wed, 19 Jul 2017 10:40:55 -0400 Subject: [PATCH 07/12] Fix tests by allowing to specify a docker command for nodetool --- cassandra_check/check.py | 6 ++++-- cassandra_check/conf.yaml.example | 2 ++ cassandra_check/test_cassandra_check.py | 3 ++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/cassandra_check/check.py b/cassandra_check/check.py index 745b99b5033ba..9144b36bd92c6 100644 --- a/cassandra_check/check.py +++ b/cassandra_check/check.py @@ -4,6 +4,7 @@ # stdlib import re +import shlex # project from checks import AgentCheck @@ -23,7 +24,8 @@ def __init__(self, 
name, init_config, agentConfig, instances=None): AgentCheck.__init__(self, name, init_config, agentConfig, instances) def check(self, instance): - nodetool_path = instance.get("nodetool", "/usr/bin/nodetool") + # Allow to specify a complete command for nodetool such as `docker exec container nodetool` + nodetool_cmd = shlex.split(instance.get("nodetool", "/usr/bin/nodetool")) host = instance.get("host", DEFAULT_HOST) port = instance.get("port", DEFAULT_PORT) keyspaces = instance.get("keyspaces", []) @@ -33,7 +35,7 @@ def check(self, instance): for keyspace in keyspaces: # Build the nodetool command - cmd = [nodetool_path, '-h', host, '-p', port] + cmd = nodetool_cmd + ['-h', host, '-p', port] if username and password: cmd += ['-u', username, '-pw', password] cmd += ['status', '--', keyspace] diff --git a/cassandra_check/conf.yaml.example b/cassandra_check/conf.yaml.example index 096cd527c6115..c268db2e6bad3 100644 --- a/cassandra_check/conf.yaml.example +++ b/cassandra_check/conf.yaml.example @@ -2,6 +2,8 @@ init_config: instances: # Configuration options: + # nodetool: a command or path to nodetool (e.g. /usr/bin/nodetool or docker exec container nodetool) + # Default to /usr/bin/nodetool # keyspaces: a list of keyspaces to monitor # host: host that nodetool will connect to. # Default to localhost. 
diff --git a/cassandra_check/test_cassandra_check.py b/cassandra_check/test_cassandra_check.py index 36b068f58ae9e..2a9cac60185b8 100644 --- a/cassandra_check/test_cassandra_check.py +++ b/cassandra_check/test_cassandra_check.py @@ -11,6 +11,7 @@ from tests.checks.common import AgentCheckTest, Fixtures FIXTURE_DIR = join(dirname(__file__), 'ci') +CASSANDRA_CONTAINER_NAME = 'dd-test-cassandra' def mock_output(*args): return Fixtures.read_file('nodetool_output', sdk_dir=FIXTURE_DIR), "", 0 @@ -22,7 +23,7 @@ class TestCassandraCheck(AgentCheckTest): config = { 'instances': [ { - 'host': 'localhost', + 'nodetool': 'docker exec %s nodetool' % CASSANDRA_CONTAINER_NAME, 'keyspaces': ['test'], 'username': 'controlRole', 'password': 'QED', From 5fe62643588d051a4495e3480619b994a25fc5ab Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Wed, 19 Jul 2017 11:13:46 -0400 Subject: [PATCH 08/12] Fix mock test --- cassandra_check/test_cassandra_check.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cassandra_check/test_cassandra_check.py b/cassandra_check/test_cassandra_check.py index 2a9cac60185b8..d87cee34b5668 100644 --- a/cassandra_check/test_cassandra_check.py +++ b/cassandra_check/test_cassandra_check.py @@ -38,8 +38,8 @@ def test_check(self, mock_output): self.run_check(self.config) self.assertEquals(mock_output.call_args[0][0], - ['/usr/bin/nodetool', '-h', 'localhost', '-p', '7199', '-u', - 'controlRole', '-pw', 'QED', 'status', '--', 'test']) + ['docker', 'exec', CASSANDRA_CONTAINER_NAME, 'nodetool', '-h', 'localhost', '-p', + '7199', '-u', 'controlRole', '-pw', 'QED', 'status', '--', 'test']) self.assertMetric('cassandra.replication_availability', value=64.5, tags=['keyspace:test', 'datacenter:dc1', 'foo', 'bar']) self.assertMetric('cassandra.replication_availability', value=200, From 5110f7f71a44b85dafdd127e7d4b66ba68d0bfac Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Thu, 27 Jul 2017 20:16:34 -0400 Subject: [PATCH 09/12] Address review 
comments --- .travis.yml | 6 +- cassandra_check/CHANGELOG.md | 8 -- cassandra_check/README.md | 29 ----- cassandra_check/check.py | 78 ------------ cassandra_check/conf.yaml.example | 16 --- cassandra_check/metadata.csv | 3 - cassandra_nodetool/CHANGELOG.md | 8 ++ cassandra_nodetool/README.md | 56 +++++++++ cassandra_nodetool/check.py | 117 ++++++++++++++++++ .../ci/cassandra_nodetool.rake | 12 +- .../ci/fixtures/nodetool_output | 0 .../ci/jmxremote.password | 0 cassandra_nodetool/conf.yaml.example | 23 ++++ .../manifest.json | 4 +- cassandra_nodetool/metadata.csv | 3 + .../requirements.txt | 0 .../test_cassandra_nodetool.py | 30 +++-- circle.yml | 2 +- 18 files changed, 237 insertions(+), 158 deletions(-) delete mode 100644 cassandra_check/CHANGELOG.md delete mode 100644 cassandra_check/README.md delete mode 100644 cassandra_check/check.py delete mode 100644 cassandra_check/conf.yaml.example delete mode 100644 cassandra_check/metadata.csv create mode 100644 cassandra_nodetool/CHANGELOG.md create mode 100644 cassandra_nodetool/README.md create mode 100644 cassandra_nodetool/check.py rename cassandra_check/ci/cassandra_check.rake => cassandra_nodetool/ci/cassandra_nodetool.rake (92%) rename {cassandra_check => cassandra_nodetool}/ci/fixtures/nodetool_output (100%) rename {cassandra_check => cassandra_nodetool}/ci/jmxremote.password (100%) create mode 100644 cassandra_nodetool/conf.yaml.example rename {cassandra_check => cassandra_nodetool}/manifest.json (73%) create mode 100644 cassandra_nodetool/metadata.csv rename {cassandra_check => cassandra_nodetool}/requirements.txt (100%) rename cassandra_check/test_cassandra_check.py => cassandra_nodetool/test_cassandra_nodetool.py (56%) diff --git a/.travis.yml b/.travis.yml index bbd93b12281bf..a9d23b6ea7f25 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,9 +43,9 @@ env: - TRAVIS_FLAVOR=cassandra FLAVOR_VERSION=2.0.17 - TRAVIS_FLAVOR=cassandra FLAVOR_VERSION=2.1.14 - TRAVIS_FLAVOR=cassandra FLAVOR_VERSION=2.2.10 - - 
TRAVIS_FLAVOR=cassandra_check FLAVOR_VERSION=2.0.17 - - TRAVIS_FLAVOR=cassandra_check FLAVOR_VERSION=2.1.14 - - TRAVIS_FLAVOR=cassandra_check FLAVOR_VERSION=2.2.10 + - TRAVIS_FLAVOR=cassandra_nodetool FLAVOR_VERSION=2.0.17 + - TRAVIS_FLAVOR=cassandra_nodetool FLAVOR_VERSION=2.1.14 + - TRAVIS_FLAVOR=cassandra_nodetool FLAVOR_VERSION=2.2.10 - TRAVIS_FLAVOR=couch FLAVOR_VERSION=1.6.1 - TRAVIS_FLAVOR=consul FLAVOR_VERSION=v0.6.4 - TRAVIS_FLAVOR=consul FLAVOR_VERSION=0.7.2 diff --git a/cassandra_check/CHANGELOG.md b/cassandra_check/CHANGELOG.md deleted file mode 100644 index b009f3a6dffde..0000000000000 --- a/cassandra_check/CHANGELOG.md +++ /dev/null @@ -1,8 +0,0 @@ -# CHANGELOG - Cassandra_check - -0.1.0/ Unreleased -================== - -### Changes - -* [FEATURE] adds cassandra_check integration. diff --git a/cassandra_check/README.md b/cassandra_check/README.md deleted file mode 100644 index 8023c6c3c538b..0000000000000 --- a/cassandra_check/README.md +++ /dev/null @@ -1,29 +0,0 @@ -# Cassandra Check - -## Overview - -Get metrics from cassandra databases that are not available through the [jmx integration](https://github.com/DataDog/integrations-core/tree/master/cassandra) - -## Installation - -Install the `dd-check-cassandra_check` package manually or with your favorite configuration manager - -## Configuration - -Edit the `cassandra_check.yaml` file to point to your server and port and set the keyspaces to monitor - -## Validation - -When you run `datadog-agent info` you should see something like the following: - - Checks - ====== - - cassandra_check - ----------- - - instance #0 [OK] - - Collected 39 metrics, 0 events & 7 service checks - -## Compatibility - -The cassandra_check check is compatible with all major platforms diff --git a/cassandra_check/check.py b/cassandra_check/check.py deleted file mode 100644 index 9144b36bd92c6..0000000000000 --- a/cassandra_check/check.py +++ /dev/null @@ -1,78 +0,0 @@ -# (C) Datadog, Inc. 
2010-2016 -# All rights reserved -# Licensed under Simplified BSD License (see LICENSE) - -# stdlib -import re -import shlex - -# project -from checks import AgentCheck -from utils.subprocess_output import get_subprocess_output -from collections import defaultdict - -EVENT_TYPE = SOURCE_TYPE_NAME = 'cassandra_check' -DEFAULT_HOST = 'localhost' -DEFAULT_PORT = '7199' - -class CassandraCheck(AgentCheck): - - datacenter_name_re = re.compile('^Datacenter: (.*)') - host_status_re = re.compile('^(?P<status>[UD])[NLJM].* (?P<owns>(\d+\.\d+%)|\?).*') - - def __init__(self, name, init_config, agentConfig, instances=None): - AgentCheck.__init__(self, name, init_config, agentConfig, instances) - - def check(self, instance): - # Allow to specify a complete command for nodetool such as `docker exec container nodetool` - nodetool_cmd = shlex.split(instance.get("nodetool", "/usr/bin/nodetool")) - host = instance.get("host", DEFAULT_HOST) - port = instance.get("port", DEFAULT_PORT) - keyspaces = instance.get("keyspaces", []) - username = instance.get("username", "") - password = instance.get("password", "") - tags = instance.get("tags", []) - - for keyspace in keyspaces: - # Build the nodetool command - cmd = nodetool_cmd + ['-h', host, '-p', port] - if username and password: - cmd += ['-u', username, '-pw', password] - cmd += ['status', '--', keyspace] - - # Execute the command - out, err, _ = get_subprocess_output(cmd, self.log, False) - if err or 'Error:' in out: - self.log.error('Error executing nodetool status: %s', err or out) - percent_up_by_dc, percent_total_by_dc = self._process_nodetool_output(out) - for datacenter, percent_up in percent_up_by_dc.items(): - self.gauge('cassandra.replication_availability', percent_up, - tags=tags + ['keyspace:%s' % keyspace, 'datacenter:%s' % datacenter]) - for datacenter, percent_total in percent_total_by_dc.items(): - self.gauge('cassandra.replication_factor', int(round(percent_total / 100)), - tags=tags + ['keyspace:%s' % keyspace, 'datacenter:%s' %
datacenter]) - - def _process_nodetool_output(self, output): - percent_up_by_datacenter = defaultdict(float) - percent_total_by_datacenter = defaultdict(float) - for line in output.splitlines(): - # Ouput of nodetool - # Datacenter: dc1 - # =============== - # Status=Up/Down - # |/ State=Normal/Leaving/Joining/Moving - # -- Address Load Tokens Owns (effective) Host ID Rack - # UN 172.21.0.3 184.8 KB 256 38.4% 7501ef03-eb63-4db0-95e6-20bfeb7cdd87 RAC1 - # UN 172.21.0.4 223.34 KB 256 39.5% e521a2a4-39d3-4311-a195-667bf56450f4 RAC1 - match = self.datacenter_name_re.search(line) - if match: - datacenter_name = match.group(1) - match = self.host_status_re.search(line) - if match: - host_status = match.group('status') - host_owns = match.group('owns') - if host_status == 'U' and host_owns != '?': - percent_up_by_datacenter[datacenter_name] += float(host_owns[:-1]) - percent_total_by_datacenter[datacenter_name] += float(host_owns[:-1]) - - return percent_up_by_datacenter, percent_total_by_datacenter diff --git a/cassandra_check/conf.yaml.example b/cassandra_check/conf.yaml.example deleted file mode 100644 index c268db2e6bad3..0000000000000 --- a/cassandra_check/conf.yaml.example +++ /dev/null @@ -1,16 +0,0 @@ -init_config: - -instances: - # Configuration options: - # nodetool: a command or path to nodetool (e.g. /usr/bin/nodetool or docker exec container nodetool) - # Default to /usr/bin/nodetool - # keyspaces: a list of keyspaces to monitor - # host: host that nodetool will connect to. - # Default to localhost. - # port: the port JMX is listening for connections. - # Default to 7199 - # username/password: a set of credentials to connect to the host. These are the credentials for the JMX server. 
- # For the check to work, this user must have a read/write access so that nodetool can execute the `status` command - # tags: optional, a list of additionnal tags to be sent with the metrics - - - keyspaces: ["foo"] diff --git a/cassandra_check/metadata.csv b/cassandra_check/metadata.csv deleted file mode 100644 index 7d853027d5ab0..0000000000000 --- a/cassandra_check/metadata.csv +++ /dev/null @@ -1,3 +0,0 @@ -metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name -cassandra.replication_availability,gauge,,,,Percentage of data available per keyspace times replication factor,+1,cassandra_check,available data -cassandra.replication_factor,gauge,,,,Replication factor per keyspace,0,cassandra_check,replication factor \ No newline at end of file diff --git a/cassandra_nodetool/CHANGELOG.md b/cassandra_nodetool/CHANGELOG.md new file mode 100644 index 0000000000000..8b810cb97a04d --- /dev/null +++ b/cassandra_nodetool/CHANGELOG.md @@ -0,0 +1,8 @@ +# CHANGELOG - Cassandra Nodetool Check + +0.1.0/ Unreleased +================== + +### Changes + +* [FEATURE] adds cassandra_nodetool integration. diff --git a/cassandra_nodetool/README.md b/cassandra_nodetool/README.md new file mode 100644 index 0000000000000..524b8461820fb --- /dev/null +++ b/cassandra_nodetool/README.md @@ -0,0 +1,56 @@ +# Agent Check: Cassandra Nodetool + +# Overview + +This check collects metrics for your Cassandra cluster that are not available through [jmx integration](https://github.com/DataDog/integrations-core/tree/master/cassandra). +It uses the `nodetool` utility to collect them. + +# Installation + +The cassandra_nodetool check is packaged with the Agent, so simply [install the Agent](https://app.datadoghq.com/account/settings#agent) on your cassandra nodes. +If you need the newest version of the check, install the `dd-check-cassandra_nodetool` package.
+ +# Configuration + +Create a file `cassandra_nodetool.yaml` in the Agent's `conf.d` directory: +``` +init_config: + # command or path to nodetool (e.g. /usr/bin/nodetool or docker exec container nodetool) + # can be overwritten on an instance + # nodetool: /usr/bin/nodetool + +instances: + + # the list of keyspaces to monitor + - keyspaces: [] + + # host that nodetool will connect to. + # host: localhost + + # the port JMX is listening to for connections. + # port: 7199 + + # a set of credentials to connect to the host. These are the credentials for the JMX server. + # For the check to work, this user must have a read/write access so that nodetool can execute the `status` command + # username: + # password: + + # a list of additional tags to be sent with the metrics + # tags: [] +``` + +# Validation + +When you run `datadog-agent info` you should see something like the following: + + Checks + ====== + + cassandra_nodetool + ----------- + - instance #0 [OK] + - Collected 39 metrics, 0 events & 7 service checks + +# Compatibility + +The `cassandra_nodetool` check is compatible with all major platforms diff --git a/cassandra_nodetool/check.py b/cassandra_nodetool/check.py new file mode 100644 index 0000000000000..cb77742f6c548 --- /dev/null +++ b/cassandra_nodetool/check.py @@ -0,0 +1,117 @@ +# (C) Datadog, Inc. 2010-2016 +# All rights reserved +# Licensed under Simplified BSD License (see LICENSE) + +# stdlib +import re +import shlex + +# project +from checks import AgentCheck +from utils.subprocess_output import get_subprocess_output +from collections import defaultdict + +EVENT_TYPE = SOURCE_TYPE_NAME = 'cassandra_nodetool' +DEFAULT_HOST = 'localhost' +DEFAULT_PORT = '7199' +TO_BYTES = { + 'B': 1, + 'KB': 1e3, + 'MB': 1e6, + 'GB': 1e9, + 'TB': 1e12, +} + +class CassandraNodetoolCheck(AgentCheck): + + datacenter_name_re = re.compile('^Datacenter: (.*)') + node_status_re = re.compile('^(?P<status>[UD])[NLJM] +(?P<address>\d+\.\d+\.\d+\.\d+) +' + '(?P<load>\d+\.\d*) (?P<load_unit>(K|M|G|T)?B) +\d+ +' + '(?P<owns>(\d+\.\d+%)|\?) +(?P<id>[a-fA-F0-9-]*) +(?P<rack>.*)') + + def __init__(self, name, init_config, agentConfig, instances=None): + AgentCheck.__init__(self, name, init_config, agentConfig, instances) + self.nodetool_cmd = init_config.get("nodetool", "/usr/bin/nodetool") + + def check(self, instance): + # Allow to specify a complete command for nodetool such as `docker exec container nodetool` + nodetool_cmd = shlex.split(instance.get("nodetool", self.nodetool_cmd)) + host = instance.get("host", DEFAULT_HOST) + port = instance.get("port", DEFAULT_PORT) + keyspaces = instance.get("keyspaces", []) + username = instance.get("username", "") + password = instance.get("password", "") + tags = instance.get("tags", []) + + for keyspace in keyspaces: + # Build the nodetool command + cmd = nodetool_cmd + ['-h', host, '-p', port] + if username and password: + cmd += ['-u', username, '-pw', password] + cmd += ['status', '--', keyspace] + + # Execute the command + out, err, _ = get_subprocess_output(cmd, self.log, False) + if err or 'Error:' in out: + self.log.error('Error executing nodetool status: %s', err or out) + nodes = self._process_nodetool_output(out) + + percent_up_by_dc = defaultdict(float) + percent_total_by_dc = defaultdict(float) + for node in nodes: + if node['status'] == 'U' and node['owns'] != '?': + percent_up_by_dc[node['datacenter']] += float(node['owns'][:-1]) + percent_total_by_dc[node['datacenter']] += float(node['owns'][:-1]) + + node_tags = ['node_address:%s' % node['address'], + 'node_id:%s' % node['id'], + 'datacenter:%s' % node['datacenter'], + 'rack:%s' % node['rack']] + + self.gauge('cassandra.nodetool.status.status', 1 if node['status'] == 'U' else 0, + tags=tags + node_tags) + self.gauge('cassandra.nodetool.status.load', float(node['load']) * TO_BYTES[node['load_unit']], + tags=tags + node_tags) + self.gauge('cassandra.nodetool.status.owns', float(node['owns'][:-1]), + tags=tags + node_tags)
+ + for datacenter, percent_up in percent_up_by_dc.items(): + self.gauge('cassandra.nodetool.status.replication_availability', percent_up, + tags=tags + ['keyspace:%s' % keyspace, 'datacenter:%s' % datacenter]) + for datacenter, percent_total in percent_total_by_dc.items(): + self.gauge('cassandra.nodetool.status.replication_factor', int(round(percent_total / 100)), + tags=tags + ['keyspace:%s' % keyspace, 'datacenter:%s' % datacenter]) + + def _process_nodetool_output(self, output): + nodes = [] + datacenter_name = "" + for line in output.splitlines(): + # Output of nodetool + # Datacenter: dc1 + # =============== + # Status=Up/Down + # |/ State=Normal/Leaving/Joining/Moving + # -- Address Load Tokens Owns (effective) Host ID Rack + # UN 172.21.0.3 184.8 KB 256 38.4% 7501ef03-eb63-4db0-95e6-20bfeb7cdd87 RAC1 + # UN 172.21.0.4 223.34 KB 256 39.5% e521a2a4-39d3-4311-a195-667bf56450f4 RAC1 + + match = self.datacenter_name_re.search(line) + if match: + datacenter_name = match.group(1) + continue + + match = self.node_status_re.search(line) + if match: + node = { + 'status': match.group('status'), + 'address': match.group('address'), + 'load': match.group('load'), + 'load_unit': match.group('load_unit'), + 'owns': match.group('owns'), + 'id': match.group('id'), + 'rack': match.group('rack'), + 'datacenter': datacenter_name + } + nodes.append(node) + + return nodes diff --git a/cassandra_check/ci/cassandra_check.rake b/cassandra_nodetool/ci/cassandra_nodetool.rake similarity index 92% rename from cassandra_check/ci/cassandra_check.rake rename to cassandra_nodetool/ci/cassandra_nodetool.rake index 6a961670b24eb..1b622c9a7ac65 100644 --- a/cassandra_check/ci/cassandra_check.rake +++ b/cassandra_nodetool/ci/cassandra_nodetool.rake @@ -1,6 +1,6 @@ require 'ci/common' -def cassandra_check_version +def cassandra_nodetool_version ENV['FLAVOR_VERSION'] || '2.1.14' # '2.0.17' end @@ -16,7 +16,7 @@ cassandra_jmx_options = "-Dcom.sun.management.jmxremote.port=#{container_port}
-Djava.rmi.server.hostname=localhost" namespace :ci do - namespace :cassandra_check do |flavor| + namespace :cassandra_nodetool do |flavor| task before_install: ['ci:common:before_install'] do sh %(docker kill #{container_name} 2>/dev/null || true) sh %(docker rm #{container_name} 2>/dev/null || true) @@ -26,10 +26,10 @@ namespace :ci do end task :install do - Rake::Task['ci:common:install'].invoke('cassandra_check') + Rake::Task['ci:common:install'].invoke('cassandra_nodetool') sh %(docker create --expose #{container_port} \ -p #{container_port}:#{container_port} -e JMX_PORT=#{container_port} \ - -e LOCAL_JMX=no -e JVM_EXTRA_OPTS="#{cassandra_jmx_options}" --name #{container_name} cassandra:#{cassandra_check_version}) + -e LOCAL_JMX=no -e JVM_EXTRA_OPTS="#{cassandra_jmx_options}" --name #{container_name} cassandra:#{cassandra_nodetool_version}) sh %(cp #{__dir__}/jmxremote.password #{__dir__}/jmxremote.password.tmp) sh %(chmod 400 #{__dir__}/jmxremote.password.tmp) sh %(docker cp #{__dir__}/jmxremote.password.tmp #{container_name}:/etc/cassandra/jmxremote.password) @@ -37,7 +37,7 @@ namespace :ci do sh %(docker start #{container_name}) sh %(docker create --name #{container_name2} \ - -e CASSANDRA_SEEDS="$(docker inspect --format='{{ .NetworkSettings.IPAddress }}' #{container_name})" cassandra:#{cassandra_check_version}) + -e CASSANDRA_SEEDS="$(docker inspect --format='{{ .NetworkSettings.IPAddress }}' #{container_name})" cassandra:#{cassandra_nodetool_version}) sh %(docker start #{container_name2}) end @@ -50,7 +50,7 @@ namespace :ci do task script: ['ci:common:script'] do this_provides = [ - 'cassandra_check' + 'cassandra_nodetool' ] Rake::Task['ci:common:run_tests'].invoke(this_provides) end diff --git a/cassandra_check/ci/fixtures/nodetool_output b/cassandra_nodetool/ci/fixtures/nodetool_output similarity index 100% rename from cassandra_check/ci/fixtures/nodetool_output rename to cassandra_nodetool/ci/fixtures/nodetool_output diff --git 
a/cassandra_check/ci/jmxremote.password b/cassandra_nodetool/ci/jmxremote.password similarity index 100% rename from cassandra_check/ci/jmxremote.password rename to cassandra_nodetool/ci/jmxremote.password diff --git a/cassandra_nodetool/conf.yaml.example b/cassandra_nodetool/conf.yaml.example new file mode 100644 index 0000000000000..f5dcd81e115d5 --- /dev/null +++ b/cassandra_nodetool/conf.yaml.example @@ -0,0 +1,23 @@ +init_config: + # command or path to nodetool (e.g. /usr/bin/nodetool or docker exec container nodetool) + # can be overwritten on an instance + # nodetool: /usr/bin/nodetool + +instances: + + # the list of keyspaces to monitor + - keyspaces: [] + + # host that nodetool will connect to. + # host: localhost + + # the port JMX is listening to for connections. + # port: 7199 + + # a set of credentials to connect to the host. These are the credentials for the JMX server. + # For the check to work, this user must have a read/write access so that nodetool can execute the `status` command + # username: + # password: + + # a list of additional tags to be sent with the metrics + # tags: [] \ No newline at end of file diff --git a/cassandra_check/manifest.json b/cassandra_nodetool/manifest.json similarity index 73% rename from cassandra_check/manifest.json rename to cassandra_nodetool/manifest.json index 31589db3e0149..c49cc0b3d0bc2 100644 --- a/cassandra_check/manifest.json +++ b/cassandra_nodetool/manifest.json @@ -3,8 +3,8 @@ "manifest_version": "0.1.0", "max_agent_version": "6.0.0", "min_agent_version": "5.6.3", - "name": "cassandra_check", - "short_description": "cassandra_check description.", + "name": "cassandra_nodetool", + "short_description": "monitor cassandra using the nodetool utility", "guid": "00e4a8bd-8ec2-4bb4-b725-6aaa91618d13", "support": "contrib", "supported_os": ["linux","mac_os","windows"], diff --git a/cassandra_nodetool/metadata.csv b/cassandra_nodetool/metadata.csv new file mode 100644 index 0000000000000..2ec496904ec3d ---
/dev/null +++ b/cassandra_nodetool/metadata.csv @@ -0,0 +1,3 @@ +metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name +cassandra.nodetool.status.replication_availability,gauge,,,,Percentage of data available per keyspace times replication factor,+1,cassandra_nodetool,available data +cassandra.nodetool.status.replication_factor,gauge,,,,Replication factor per keyspace,0,cassandra_nodetool,replication factor \ No newline at end of file diff --git a/cassandra_check/requirements.txt b/cassandra_nodetool/requirements.txt similarity index 100% rename from cassandra_check/requirements.txt rename to cassandra_nodetool/requirements.txt diff --git a/cassandra_check/test_cassandra_check.py b/cassandra_nodetool/test_cassandra_nodetool.py similarity index 56% rename from cassandra_check/test_cassandra_check.py rename to cassandra_nodetool/test_cassandra_nodetool.py index d87cee34b5668..8cfafedd1ec88 100644 --- a/cassandra_check/test_cassandra_check.py +++ b/cassandra_nodetool/test_cassandra_nodetool.py @@ -16,9 +16,9 @@ def mock_output(*args): return Fixtures.read_file('nodetool_output', sdk_dir=FIXTURE_DIR), "", 0 -class TestCassandraCheck(AgentCheckTest): +class TestCassandraNodetoolCheck(AgentCheckTest): """Basic Test for cassandra_check integration.""" - CHECK_NAME = 'cassandra_check' + CHECK_NAME = 'cassandra_nodetool' config = { 'instances': [ @@ -40,23 +40,29 @@ def test_check(self, mock_output): self.assertEquals(mock_output.call_args[0][0], ['docker', 'exec', CASSANDRA_CONTAINER_NAME, 'nodetool', '-h', 'localhost', '-p', '7199', '-u', 'controlRole', '-pw', 'QED', 'status', '--', 'test']) - self.assertMetric('cassandra.replication_availability', value=64.5, + self.assertMetric('cassandra.nodetool.status.replication_availability', value=64.5, tags=['keyspace:test', 'datacenter:dc1', 'foo', 'bar']) - self.assertMetric('cassandra.replication_availability', value=200, + 
self.assertMetric('cassandra.nodetool.status.replication_availability', value=200, tags=['keyspace:test', 'datacenter:dc2', 'foo', 'bar']) - self.assertMetric('cassandra.replication_factor', value=1, + self.assertMetric('cassandra.nodetool.status.replication_factor', value=1, tags=['keyspace:test', 'datacenter:dc1', 'foo', 'bar']) - self.assertMetric('cassandra.replication_factor', value=2, + self.assertMetric('cassandra.nodetool.status.replication_factor', value=2, tags=['keyspace:test', 'datacenter:dc2', 'foo', 'bar']) + self.assertMetric('cassandra.nodetool.status.status', value=1, + tags=['datacenter:dc2', 'node_id:e521a2a4-39d3-4311-a195-667bf56450f4', + 'node_address:172.21.0.4', 'rack:RAC1', 'foo', 'bar']) + self.assertMetric('cassandra.nodetool.status.owns', value=100, + tags=['datacenter:dc2', 'node_id:e521a2a4-39d3-4311-a195-667bf56450f4', + 'node_address:172.21.0.4', 'rack:RAC1', 'foo', 'bar']) + self.assertMetric('cassandra.nodetool.status.load', value=223340, + tags=['datacenter:dc2', 'node_id:e521a2a4-39d3-4311-a195-667bf56450f4', + 'node_address:172.21.0.4', 'rack:RAC1', 'foo', 'bar']) - self.coverage_report() - - @attr(requires='cassandra_check') + @attr(requires='cassandra_nodetool') def test_integration(self): self.run_check(self.config) - self.assertMetric('cassandra.replication_availability', value=200, + self.assertMetric('cassandra.nodetool.status.replication_availability', value=200, tags=['keyspace:test', 'datacenter:datacenter1', 'foo', 'bar']) - self.assertMetric('cassandra.replication_factor', value=2, + self.assertMetric('cassandra.nodetool.status.replication_factor', value=2, tags=['keyspace:test', 'datacenter:datacenter1', 'foo', 'bar']) - self.coverage_report() diff --git a/circle.yml b/circle.yml index e4bcf0e878e99..7417959b6c188 100644 --- a/circle.yml +++ b/circle.yml @@ -80,7 +80,7 @@ test: - rake ci:run[kafka] - rake ci:run[docker_daemon] - rake ci:run[kubernetes] - - rake ci:run[cassandra_check] + - rake 
ci:run[cassandra_nodetool] - bundle exec rake requirements post: - if [[ $(docker ps -a -q) ]]; then docker stop $(docker ps -a -q); fi From db4aa03d4c113dfe8b9e73e18e9ffb5d23106824 Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Fri, 28 Jul 2017 14:10:48 -0400 Subject: [PATCH 10/12] More addressing --- cassandra_nodetool/check.py | 30 +++++++++++++++---- cassandra_nodetool/ci/cassandra_nodetool.rake | 3 +- cassandra_nodetool/metadata.csv | 7 +++-- cassandra_nodetool/test_cassandra_nodetool.py | 20 ++++++------- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/cassandra_nodetool/check.py b/cassandra_nodetool/check.py index cb77742f6c548..1019d2403364e 100644 --- a/cassandra_nodetool/check.py +++ b/cassandra_nodetool/check.py @@ -27,7 +27,7 @@ class CassandraNodetoolCheck(AgentCheck): datacenter_name_re = re.compile('^Datacenter: (.*)') node_status_re = re.compile('^(?P<status>[UD])[NLJM] +(?P<address>\d+\.\d+\.\d+\.\d+) +' '(?P<load>\d+\.\d*) (?P<load_unit>(K|M|G|T)?B) +\d+ +' - '(?P<owns>(\d+\.\d+%)|\?) +(?P<id>[a-fA-F0-9-]*) +(?P<rack>.*)') + '(?P<owns>(\d+\.\d+)|\?)%? +(?P<id>[a-fA-F0-9-]*) +(?P<rack>.*)') def __init__(self, name, init_config, agentConfig, instances=None): AgentCheck.__init__(self, name, init_config, agentConfig, instances) @@ -43,6 +43,9 @@ def check(self, instance): password = instance.get("password", "") tags = instance.get("tags", []) + # Flag to send service checks only once and not for every keyspace + send_service_checks = True + for keyspace in keyspaces: # Build the nodetool command cmd = nodetool_cmd + ['-h', host, '-p', port] @@ -58,23 +61,38 @@ def check(self, instance): percent_up_by_dc = defaultdict(float) percent_total_by_dc = defaultdict(float) + # Send the stats per node and compute the stats per datacenter for node in nodes: - if node['status'] == 'U' and node['owns'] != '?': - percent_up_by_dc[node['datacenter']] += float(node['owns'][:-1]) - percent_total_by_dc[node['datacenter']] += float(node['owns'][:-1]) node_tags = ['node_address:%s' % node['address'], 'node_id:%s' % node['id'], 'datacenter:%s' % node['datacenter'], 'rack:%s' % node['rack']] + # nodetool prints `?` when it can't compute the value of `owns` for certain keyspaces (e.g.
system) + # don't send metric in this case + if node['owns'] != '?': + owns = float(node['owns']) + if node['status'] == 'U': + percent_up_by_dc[node['datacenter']] += owns + percent_total_by_dc[node['datacenter']] += owns + self.gauge('cassandra.nodetool.status.owns', owns, + tags=tags + node_tags + ['keyspace:%s' % keyspace]) + + # Send service check only once for each node + if send_service_checks: + status = AgentCheck.OK if node['status'] == 'U' else AgentCheck.CRITICAL + self.service_check('cassandra.nodetool.node_up', status, tags + node_tags) + self.gauge('cassandra.nodetool.status.status', 1 if node['status'] == 'U' else 0, tags=tags + node_tags) self.gauge('cassandra.nodetool.status.load', float(node['load']) * TO_BYTES[node['load_unit']], tags=tags + node_tags) - self.gauge('cassandra.nodetool.status.owns', float(node['owns'][:-1]), - tags=tags + node_tags) + # All service checks have been sent, don't resend + send_service_checks = False + + # Send the stats per datacenter for datacenter, percent_up in percent_up_by_dc.items(): self.gauge('cassandra.nodetool.status.replication_availability', percent_up, tags=tags + ['keyspace:%s' % keyspace, 'datacenter:%s' % datacenter]) diff --git a/cassandra_nodetool/ci/cassandra_nodetool.rake b/cassandra_nodetool/ci/cassandra_nodetool.rake index 1b622c9a7ac65..39f072691f042 100644 --- a/cassandra_nodetool/ci/cassandra_nodetool.rake +++ b/cassandra_nodetool/ci/cassandra_nodetool.rake @@ -37,7 +37,8 @@ namespace :ci do sh %(docker start #{container_name}) sh %(docker create --name #{container_name2} \ - -e CASSANDRA_SEEDS="$(docker inspect --format='{{ .NetworkSettings.IPAddress }}' #{container_name})" cassandra:#{cassandra_nodetool_version}) + -e CASSANDRA_SEEDS="$(docker inspect --format='{{ .NetworkSettings.IPAddress }}' #{container_name})" \ + cassandra:#{cassandra_nodetool_version}) sh %(docker start #{container_name2}) end diff --git a/cassandra_nodetool/metadata.csv b/cassandra_nodetool/metadata.csv index 
2ec496904ec3d..2632f34248bcd 100644 --- a/cassandra_nodetool/metadata.csv +++ b/cassandra_nodetool/metadata.csv @@ -1,3 +1,6 @@ metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name -cassandra.nodetool.status.replication_availability,gauge,,,,Percentage of data available per keyspace times replication factor,+1,cassandra_nodetool,available data -cassandra.nodetool.status.replication_factor,gauge,,,,Replication factor per keyspace,0,cassandra_nodetool,replication factor \ No newline at end of file +cassandra.nodetool.status.replication_availability,gauge,,percent,,Percentage of data available per keyspace times replication factor,1,cassandra_nodetool,available data +cassandra.nodetool.status.replication_factor,gauge,,,,Replication factor per keyspace,0,cassandra_nodetool,replication factor +cassandra.nodetool.status.status,gauge,,,,Node status: up (1) or down (0),1,cassandra_nodetool,node status +cassandra.nodetool.status.owns,gauge,,percent,,Percentage of the data owned by the node per datacenter times the replication factor,0,cassandra_nodetool,owns +cassandra.nodetool.status.load,gauge,,byte,,Amount of file system data under the cassandra data directory without snapshot content,0,cassandra_nodetool,load \ No newline at end of file diff --git a/cassandra_nodetool/test_cassandra_nodetool.py b/cassandra_nodetool/test_cassandra_nodetool.py index 8cfafedd1ec88..61d11102d96d2 100644 --- a/cassandra_nodetool/test_cassandra_nodetool.py +++ b/cassandra_nodetool/test_cassandra_nodetool.py @@ -24,7 +24,7 @@ class TestCassandraNodetoolCheck(AgentCheckTest): 'instances': [ { 'nodetool': 'docker exec %s nodetool' % CASSANDRA_CONTAINER_NAME, - 'keyspaces': ['test'], + 'keyspaces': ['system', 'test'], 'username': 'controlRole', 'password': 'QED', 'tags': ['foo', 'bar'] @@ -37,6 +37,7 @@ def test_check(self, mock_output): self.run_check(self.config) + # test per datacenter metrics self.assertEquals(mock_output.call_args[0][0], 
['docker', 'exec', CASSANDRA_CONTAINER_NAME, 'nodetool', '-h', 'localhost', '-p', '7199', '-u', 'controlRole', '-pw', 'QED', 'status', '--', 'test']) @@ -48,15 +49,14 @@ def test_check(self, mock_output): tags=['keyspace:test', 'datacenter:dc1', 'foo', 'bar']) self.assertMetric('cassandra.nodetool.status.replication_factor', value=2, tags=['keyspace:test', 'datacenter:dc2', 'foo', 'bar']) - self.assertMetric('cassandra.nodetool.status.status', value=1, - tags=['datacenter:dc2', 'node_id:e521a2a4-39d3-4311-a195-667bf56450f4', - 'node_address:172.21.0.4', 'rack:RAC1', 'foo', 'bar']) - self.assertMetric('cassandra.nodetool.status.owns', value=100, - tags=['datacenter:dc2', 'node_id:e521a2a4-39d3-4311-a195-667bf56450f4', - 'node_address:172.21.0.4', 'rack:RAC1', 'foo', 'bar']) - self.assertMetric('cassandra.nodetool.status.load', value=223340, - tags=['datacenter:dc2', 'node_id:e521a2a4-39d3-4311-a195-667bf56450f4', - 'node_address:172.21.0.4', 'rack:RAC1', 'foo', 'bar']) + # test per node metrics + tags = ['datacenter:dc2', 'node_id:e521a2a4-39d3-4311-a195-667bf56450f4', + 'node_address:172.21.0.4', 'rack:RAC1', 'foo', 'bar'] + self.assertMetric('cassandra.nodetool.status.status', value=1, tags=tags) + self.assertMetric('cassandra.nodetool.status.owns', value=100, tags=tags + ['keyspace:test']) + self.assertMetric('cassandra.nodetool.status.load', value=223340, tags=tags) + self.assertServiceCheckOK('cassandra.nodetool.node_up', count=4) + self.assertServiceCheckCritical('cassandra.nodetool.node_up', count=1) @attr(requires='cassandra_nodetool') def test_integration(self): From 4ac71c7fc4103531eb172c410c3bf8cc34a61303 Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Mon, 31 Jul 2017 13:37:51 -0400 Subject: [PATCH 11/12] Add service check section in readme --- cassandra_nodetool/README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cassandra_nodetool/README.md b/cassandra_nodetool/README.md index 524b8461820fb..c5706395e26b1 100644 --- 
a/cassandra_nodetool/README.md +++ b/cassandra_nodetool/README.md @@ -54,3 +54,9 @@ When you run `datadog-agent info` you should see something like the following: # Compatibility The `cassandra_nodetool` check is compatible with all major platforms + +# Service Checks + +**cassandra.nodetool.node_up**: + +The agent sends this service check for each node of the monitored cluster. Returns CRITICAL if the node is down, otherwise OK. From 9e53421dbd7aca5ac431db417c58ead76a08881e Mon Sep 17 00:00:00 2001 From: Hippolyte HENRY Date: Tue, 1 Aug 2017 17:22:32 -0400 Subject: [PATCH 12/12] Continue loop if error on calling nodetool --- cassandra_nodetool/check.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cassandra_nodetool/check.py b/cassandra_nodetool/check.py index 1019d2403364e..13c8e113709c4 100644 --- a/cassandra_nodetool/check.py +++ b/cassandra_nodetool/check.py @@ -57,6 +57,7 @@ def check(self, instance): out, err, _ = get_subprocess_output(cmd, self.log, False) if err or 'Error:' in out: self.log.error('Error executing nodetool status: %s', err or out) + continue nodes = self._process_nodetool_output(out) percent_up_by_dc = defaultdict(float)